def make_classifiers(self):
    """Build per-class feature counts from classified senses and store them.

    Every stored sense that already has thesaurus branches contributes to
    two tallies:

    * the number of senses seen for each thesaurus class (used later to
      calculate prior probabilities), and
    * for each known feature found among the sense's lemma words, the
      number of times it co-occurs with each of the sense's classes.

    The tallies, the list of thesaurus IDs and the total number of senses
    counted are passed to ``write_classifiers`` along with
    ``self.classifiers_dir``.
    """
    # Set up data structures
    thesaurus_ids = options_list()
    # Dictionary of all the features we'll be using
    features = self._load_feature_list()
    # Number of senses for each thesaurus class (used later to calculate
    # prior probabilities)
    number_of_senses = {class_id: 0 for class_id in thesaurus_ids}

    # Run through all the stored senses, building counts for each keyword.
    # Senses are consumed one at a time from the loader rather than being
    # collected into a list first, so only the current sense is held here.
    total_senses = 0
    loader = PickleLoader(self.senses_dir)
    for sense in loader.iterate():
        if not sense.branches:
            continue
        total_senses += 1

        # Get the relevant thesaurus IDs for this sense. number_of_senses
        # has exactly the thesaurus IDs as its keys, so testing against it
        # is a constant-time lookup whatever type options_list() returns.
        class_ids = [
            class_id for class_id in sense.branches
            if class_id in number_of_senses
        ]

        # Increment the sense count for each of these IDs
        for class_id in class_ids:
            number_of_senses[class_id] += 1

        # Apply all the features to this set of IDs
        for feature in sense.lemma_words:
            if feature in features:
                feature_counts = features[feature]
                for class_id in class_ids:
                    feature_counts[class_id] += 1

    write_classifiers(
        self.classifiers_dir,
        thesaurus_ids,
        features,
        number_of_senses,
        total_senses,
    )
def classify_new_senses(self, **kwargs):
    """Classify all unclassified, componentized senses and write the results.

    Keyword Args:
        bias_first: Multiplier applied to the stored log values of every
            feature whose name contains 'FIRST' (default 1: no change).
        bias_last: Multiplier applied to the stored log values of every
            feature whose name contains 'LAST' (default 1: no change).
        dir: Name of the subdirectory of ``self.output_dir`` that receives
            the output (default 'default').

    Side effects:
        Sets ``self.prior_probabilities`` and ``self.classifiers`` from
        ``load_classifiers(self.classifiers_dir)``, then rescales the
        classifier values in place according to the bias arguments.

        For each uppercase ASCII letter, two files are written to the
        output directory: ``<letter>`` holds the pickled ``BayesSense``
        objects (one ``pickle.dump`` per sense), and
        ``<letter>_readable.txt`` holds a tab-separated text rendering of
        the same results.
    """
    bias_first = kwargs.get('bias_first', 1)
    bias_last = kwargs.get('bias_last', 1)
    dirname = kwargs.get('dir', 'default')

    # Set up the directory that output will be sent to. makedirs with
    # exist_ok also creates a missing self.output_dir and avoids the
    # check-then-create race of isdir() followed by mkdir().
    outdir = os.path.join(self.output_dir, dirname)
    os.makedirs(outdir, exist_ok=True)

    # Load the classifiers into memory
    self.prior_probabilities, self.classifiers = \
        load_classifiers(self.classifiers_dir)

    # Adjust the values for '..._FIRST' or '..._LAST' features,
    # so that these carry more or less weight than other features.
    # A weighting of 1 leaves values untouched, so such markers are
    # dropped up front and the pass is skipped if none remain.
    active_weightings = [
        (marker, weighting)
        for marker, weighting in (('FIRST', bias_first), ('LAST', bias_last))
        if weighting != 1
    ]
    if active_weightings:
        for feature, values in self.classifiers.items():
            for marker, weighting in active_weightings:
                if marker in feature:
                    for class_id, old_log in values.items():
                        # Overwrite with the new log value (existing key,
                        # so the dict size is unchanged while iterating)
                        self.classifiers[feature][class_id] = \
                            old_log * weighting

    for letter in string.ascii_uppercase:
        print('Bayes-classifying in %s (%s)...' % (letter, dirname))
        output = []
        output_readable = []
        loader = PickleLoader(self.senses_dir, letters=letter)
        for sense in loader.iterate():
            # Only senses with no thesaurus branches yet, and which
            # is_componentized() accepts, are classified.
            if sense.branches or not is_componentized(sense):
                continue

            # Compute the top 20 results
            raw_results = self._classifyengine(sense)[0:20]

            # Package this into a result-set object
            result_set = BayesSense(sense=sense, results=raw_results)
            output.append(result_set)

            output_readable.append('\n--------------------------------')
            output_readable.append('%s\t%d#eid%d' % (
                sense.lemma, sense.refentry, sense.refid))
            for result in raw_results:
                output_readable.append('\t%s\t%0.4g' % (
                    result.breadcrumb(), result.posterior))
            output_readable.append(result_set.display_features())

        # Output file for pickled result-set objects
        pickle_path = os.path.join(outdir, letter)
        with open(pickle_path, 'wb') as filehandle:
            for result_set in output:
                pickle.dump(result_set, filehandle)

        # Human-readable output file
        # NOTE(review): opened with the platform's default text encoding;
        # if lemmas can contain non-ASCII characters an explicit encoding
        # may be wanted here - confirm before changing.
        readable_path = os.path.join(outdir, letter + '_readable.txt')
        with open(readable_path, 'w') as filehandle:
            filehandle.writelines(line + '\n' for line in output_readable)