Code example #1
    def make_classifiers(self):
        """Build raw Bayesian-classifier counts from the stored senses.

        Tallies, for every thesaurus class, how many senses map to it and
        how often each feature co-occurs with it, then hands the counts to
        write_classifiers() for serialization.
        """
        thesaurus_ids = options_list()

        # Dictionary of all the features we'll be using
        features = self._load_feature_list()

        # Per-class sense counts (used later to calculate prior probabilities)
        number_of_senses = dict.fromkeys(thesaurus_ids, 0)

        # Walk every stored sense that has thesaurus branches, accumulating
        # counts for each keyword.
        total_senses = 0
        loader = PickleLoader(self.senses_dir)
        for sense in loader.iterate():
            if not sense.branches:
                continue
            total_senses += 1

            # Thesaurus IDs relevant to this sense
            relevant_ids = [tid for tid in sense.branches
                            if tid in thesaurus_ids]
            # Credit each relevant class with one more sense
            for tid in relevant_ids:
                number_of_senses[tid] += 1

            # Credit each known feature of this sense to every relevant class
            for word in sense.lemma_words:
                if word in features:
                    for tid in relevant_ids:
                        features[word][tid] += 1

        write_classifiers(self.classifiers_dir, thesaurus_ids, features,
                          number_of_senses, total_senses)
Code example #2
    def classify_new_senses(self, **kwargs):
        """Bayes-classify every unbranched, componentized sense.

        Loads the stored classifiers, optionally re-weights '..._FIRST' /
        '..._LAST' features, then classifies senses letter by letter,
        writing both pickled result sets and a human-readable report.

        Keyword args:
            bias_first: multiplier applied to features containing 'FIRST'
                (default 1, i.e. no adjustment).
            bias_last: multiplier applied to features containing 'LAST'
                (default 1).
            dir: name of the output subdirectory (default 'default').
        """
        bias_first = kwargs.get('bias_first', 1)
        bias_last = kwargs.get('bias_last', 1)
        dirname = kwargs.get('dir', 'default')

        # Set up the directory that output will be sent to.
        # makedirs(exist_ok=True) avoids the isdir/mkdir TOCTOU race and
        # also creates any missing parent directories, which the original
        # isdir() + mkdir() pair did not.
        outdir = os.path.join(self.output_dir, dirname)
        os.makedirs(outdir, exist_ok=True)

        # Load the classifiers into memory
        self.prior_probabilities, self.classifiers =\
            load_classifiers(self.classifiers_dir)

        # Adjust the values for '..._FIRST' or '..._LAST' features,
        #  so that these carry more or less weight than other features.
        # (Values are log-weights, so scaling is a straight multiply;
        #  a weighting of 1 is a no-op and is skipped.)
        for feature, values in self.classifiers.items():
            for marker, weighting in (
                ('FIRST', bias_first),
                ('LAST', bias_last)
            ):
                if marker in feature and weighting != 1:
                    for id, old_log in values.items():
                        # Overwrite with the new log value (assigning to an
                        # existing key is safe while iterating items()).
                        self.classifiers[feature][id] = old_log * weighting

        for letter in string.ascii_uppercase:
            print('Bayes-classifying in %s (%s)...' % (letter, dirname))
            output = []
            output_readable = []

            pl = PickleLoader(self.senses_dir, letters=letter)
            # Only classify senses that are not yet branched but have
            # usable components.
            for sense in [s for s in pl.iterate() if not s.branches and
                          is_componentized(s)]:
                # Compute the top 20 results
                raw_results = self._classifyengine(sense)[0:20]
                # Package this into a result-set object
                result_set = BayesSense(sense=sense, results=raw_results,)
                output.append(result_set)

                output_readable.append('\n--------------------------------')
                output_readable.append('%s\t%d#eid%d' % (sense.lemma,
                    sense.refentry, sense.refid))
                for r in raw_results:
                    output_readable.append('\t%s\t%0.4g' % (
                        r.breadcrumb(), r.posterior))
                output_readable.append(result_set.display_features())

            # Output file for pickled result-set objects (one dump per
            # result set; the reader must unpickle repeatedly until EOF)
            file1 = os.path.join(outdir, letter)
            with open(file1, 'wb') as filehandle:
                for o in output:
                    pickle.dump(o, filehandle)

            # Human-readable output file
            file2 = os.path.join(outdir, letter + '_readable.txt')
            with open(file2, 'w') as filehandle:
                for line in output_readable:
                    filehandle.write(line + '\n')