# Relies on module-level imports defined elsewhere in this file:
# sys, scipy.sparse as sps, MVectorizer, NGramModel, Parser, Syntactic_features.
def prepare_examples(self, egs, for_training=True, verbose=False):
    # Prepares the examples into training data, applying features etc.
    dictionary = Parser.load_dictionary()
    if verbose: print "Preparing %d examples" % len(egs),
    data, labels, pos, ngram, nsenses, syntactic, lesky, lesky_words = {}, {}, {}, {}, {}, {}, {}, {}
    if self.use_syntactic_features and for_training:
        word_list = Syntactic_features.prepare_file(self.training_file)
        syn_train = Syntactic_features.parse_stanford_output(self.training_file, word_list)
        syn_index = 0
    for eg in egs:
        if verbose:
            sys.stdout.write(".")
            sys.stdout.flush()
        eg.word = eg.word.lower()
        if eg.word not in data:
            data[eg.word] = []
            labels[eg.word] = []
            pos[eg.word] = []
            lesky[eg.word] = []
            lesky_words[eg.word] = []
            if self.use_syntactic_features and for_training:
                syntactic[eg.word] = []
        # Earlier variants of the context feature:
        # text = eg.context_before + " " + eg.target + " " + eg.pos + " " + eg.context_after
        # text = eg.context_before + " " + eg.target + " " + eg.context_after
        pre_words = eg.context_before.lower().split()[-self.window_size:]
        post_words = eg.context_after.lower().split()[:self.window_size]
        text = ' '.join(pre_words) + ' ' + eg.target + ' ' + ' '.join(post_words)  # TODO worsens our F1!
        data[eg.word].append(text)
        # A training example may carry several correct senses; collect their indices
        label = [idx for idx, val in enumerate(eg.senses) if val == 1]
        labels[eg.word].append(label)
        pos[eg.word].append(eg.pos_positions(window=self.pos_window_size))
        if self.use_lesk:
            lesky[eg.word].append(eg.lesk(dictionary, window_size=self.lesk_window_size))
        if self.use_lesk_words:
            lesky_words[eg.word].append(' '.join(eg.lesk_words(dictionary, window_size=self.lesk_words_window_size)))
        if self.use_syntactic_features and for_training:
            syntactic[eg.word].append(syn_train[syn_index])
            syn_index += 1
        if for_training and self.ngram_size > 0:
            if eg.word not in nsenses:
                # First time we see this word: create one n-gram model per sense
                nsenses[eg.word] = len(eg.senses)
                for idx in range(0, len(eg.senses)):
                    ngram[eg.word + str(idx)] = NGramModel(self.ngram_size, smooth_type="lap",
                                                           unknown_type=None, gram_type="n")
            for idx in label:
                key = eg.word + str(idx)
                assert key in ngram
                # Only Laplacian smoothing and no unknowns allows incremental training
                ngram[key].train([text])
    if for_training:
        return (data, labels, pos, lesky, lesky_words, ngram, nsenses, syntactic)
    else:
        return (data, labels, pos, lesky, lesky_words)
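# A minimal training-side sketch, not part of the original class: it shows one
# way the per-word dicts returned by prepare_examples() could be fitted into
# the self.vectorizers / self.pos_vectorizers lookups that predict() expects.
# The method name is hypothetical, CountVectorizer and DictVectorizer are
# illustrative stand-ins (the real vectorizers may come from MVectorizer),
# and this assumes pos_positions() yields feature dicts.
def _fit_vectorizers_sketch(self, egs):
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction import DictVectorizer
    data, labels, pos, lesky, lesky_words, ngram, nsenses, syntactic = \
        self.prepare_examples(egs, for_training=True)
    for word in data:
        # One vectorizer per ambiguous word, fitted on that word's contexts
        self.vectorizers[word] = CountVectorizer()
        self.vectorizers[word].fit(data[word])
        self.pos_vectorizers[word] = DictVectorizer()
        self.pos_vectorizers[word].fit(pos[word])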
def predict(self, egs):
    # Given a list of examples, predict their word senses
    res = []
    if self.use_syntactic_features:
        word_list = Syntactic_features.prepare_file(self.test_file)
        syntactic = Syntactic_features.parse_stanford_output(self.test_file, word_list)
        syn_index = 0
    for eg in egs:
        eg.word = eg.word.lower()
        data, labels, pos, lesky, lesky_words = self.prepare_examples([eg], for_training=False)
        # Add context words
        X = self.vectorizers[eg.word].transform(data[eg.word])
        # Add parts of speech
        X_pos = self.pos_vectorizers[eg.word].transform(pos[eg.word])
        X = sps.hstack((X, X_pos))
        # Add Lesk words
        if self.use_lesk_words:
            X_leskywords = self.lesky_words_vectorizers[eg.word].transform(lesky_words[eg.word])
            X = sps.hstack((X, X_leskywords))
        # Add Lesk overlap scores
        if self.use_lesk:
            X_lesk = MVectorizer.rectangularize(lesky[eg.word])
            X = sps.hstack((X, X_lesk))
        # Add syntactic dependencies
        if self.use_syntactic_features:
            if all(synfeat == [] for synfeat in syntactic[syn_index]):
                pass  # no dependencies were extracted for this example
            elif eg.word not in self.syn_vectorizers:
                pass  # no syntactic vectorizer was fitted for this word
            else:
                X_syn = self.syn_vectorizers[eg.word].transform([syntactic[syn_index]])
                (x_rows, x_cols) = X.shape
                (xsyn_rows, xsyn_cols) = X_syn.shape
                if x_rows != xsyn_rows:
                    # Pad with empty rows so the blocks stack to the same height
                    X_filler = sps.coo_matrix((x_rows - xsyn_rows, xsyn_cols))
                    X_syn = sps.vstack((X_syn, X_filler))
                X = sps.hstack((X, X_syn))
            # Advance even when a branch above was skipped, so the parsed
            # features stay aligned with the examples
            syn_index += 1
        # Add n-gram model perplexities, one feature per candidate sense
        if self.ngram_size > 0:
            num_senses = self.nsenses[eg.word]
            assert num_senses == len(eg.senses)
            ngram_list = []
            for sentence in data[eg.word]:
                ngram_list.append(dict([(idx, self.ngram[eg.word + str(idx)].get_perplexity(sentence, True))
                                        for idx in range(0, num_senses)]))
            X_ngram = MVectorizer.DictsVectorizer().fit_transform(ngram_list)
            X = sps.hstack((X, X_ngram))
        Y = self.classifiers[eg.word].predict(X)
        # Y[0] holds the predicted sense indices for this single example;
        # convert them to one 0/1 flag per candidate sense
        senses = [0] * len(eg.senses)
        for y in list(Y[0]):
            senses[y] = 1
        res.extend(senses)
    return res
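# A minimal usage sketch under assumed names: WSDClassifier (for this class),
# load_examples(), and train() are hypothetical; only prepare_examples() and
# predict() above are real. predict() returns a single flat list with one
# 0/1 flag per candidate sense of each example, in example order.
#
#   clf = WSDClassifier(...)                  # hypothetical constructor
#   train_egs = load_examples("train.data")   # hypothetical loader
#   test_egs = load_examples("test.data")
#   clf.train(train_egs)                      # assumed to call prepare_examples(for_training=True)
#   flags = clf.predict(test_egs)             # e.g. [0, 1, 0, 0, 1, ...]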