コード例 #1
0
ファイル: scikit_classifier.py プロジェクト: jcccf/cs4740
 def prepare_examples(self, egs, for_training=True, verbose=False):
     """Group examples by target word and build the raw inputs for each feature.

     Every example's ``word`` attribute is lower-cased IN PLACE, and all
     returned dictionaries are keyed by that lower-cased word.

     Args:
         egs: iterable of example objects. Each must expose ``word``,
             ``context_before``, ``target``, ``context_after``, ``senses`` and
             ``pos_positions(window=...)``; ``lesk`` / ``lesk_words`` are only
             called when the corresponding feature flag is enabled.
         for_training: when true, also collect syntactic features (if
             ``self.use_syntactic_features``) and train per-sense n-gram
             models (if ``self.ngram_size > 0``).
         verbose: when true, write a progress line to stdout (one dot per
             example).

     Returns:
         ``(data, labels, pos, lesky, lesky_words)`` when ``for_training`` is
         false, otherwise
         ``(data, labels, pos, lesky, lesky_words, ngram, nsenses, syntactic)``.

         * ``data[word]``: list of context-window strings, one per example.
         * ``labels[word]``: list of lists holding the indices ``i`` for which
           ``eg.senses[i] == 1``.
         * ``pos[word]``: list of ``eg.pos_positions(...)`` results.
         * ``lesky[word]`` / ``lesky_words[word]``: Lesk outputs; these stay
           empty lists when the matching feature flag is off.
         * ``ngram[word + str(sense_index)]``: ``NGramModel`` trained
           incrementally on the texts labelled with that sense.
         * ``nsenses[word]``: ``len(eg.senses)`` of the first example seen
           for that word.
         * ``syntactic[word]``: entries of the parsed training file, consumed
           in the order the examples are iterated.
     """
     # Only the Lesk features consult the dictionary, so do not pay for
     # loading it when both are disabled (predict() calls this per example).
     if self.use_lesk or self.use_lesk_words:
         dictionary = Parser.load_dictionary()
     else:
         dictionary = None

     with_syntactic = self.use_syntactic_features and for_training
     with_ngrams = for_training and self.ngram_size > 0

     # Prepares the examples into training data, applying features etc.
     if verbose:
         # Same bytes as the old ``print "...",`` (no newline), but written
         # through the same channel as the progress dots below.
         sys.stdout.write("Preparing %d examples" % len(egs))

     data, labels, pos, ngram = {}, {}, {}, {}
     nsenses, syntactic, lesky, lesky_words = {}, {}, {}, {}

     if with_syntactic:
         word_list = Syntactic_features.prepare_file(self.training_file)
         syn_train = Syntactic_features.parse_stanford_output(self.training_file, word_list)
         syn_index = 0

     for eg in egs:
         if verbose:
             sys.stdout.write(".")
             sys.stdout.flush()

         eg.word = eg.word.lower()
         word = eg.word
         if word not in data:
             data[word] = []
             labels[word] = []
             pos[word] = []
             lesky[word] = []
             lesky_words[word] = []
             if with_syntactic:
                 syntactic[word] = []

         # Context window around the target word.
         window = self.window_size
         before = eg.context_before.lower().split()
         after = eg.context_after.lower().split()
         # ``before[-0:]`` is ``before[0:]`` (the whole list), so a zero-width
         # window has to be special-cased to stay symmetric with ``after[:0]``.
         pre_words = [] if window == 0 else before[-window:]
         post_words = after[:window]
         text = ' '.join(pre_words) + ' ' + eg.target + ' ' + ' '.join(post_words)  # TODO worsens our F1!
         data[word].append(text)

         # Indices of the senses marked with 1 for this example.
         label = [idx for idx, val in enumerate(eg.senses) if val == 1]
         labels[word].append(label)
         pos[word].append(eg.pos_positions(window=self.pos_window_size))

         if self.use_lesk:
             lesky[word].append(eg.lesk(dictionary, window_size=self.lesk_window_size))

         if self.use_lesk_words:
             lesk_tokens = eg.lesk_words(dictionary, window_size=self.lesk_words_window_size)
             lesky_words[word].append(' '.join(lesk_tokens))

         if with_syntactic:
             # Parsed entries are consumed in iteration order, one per example.
             syntactic[word].append(syn_train[syn_index])
             syn_index += 1

         if with_ngrams:
             if word not in nsenses:
                 # First time this word is seen: one model per possible sense.
                 nsenses[word] = len(eg.senses)
                 for idx in range(len(eg.senses)):
                     ngram[word + str(idx)] = NGramModel(self.ngram_size, smooth_type="lap", unknown_type=None, gram_type="n")
             for idx in label:
                 key = word + str(idx)
                 assert key in ngram
                 # Only laplacian smoothing and no unknowns allows incremental training
                 ngram[key].train([text])

     if for_training:
         return (data, labels, pos, lesky, lesky_words, ngram, nsenses, syntactic)
     return (data, labels, pos, lesky, lesky_words)
コード例 #2
0
ファイル: scikit_classifier.py プロジェクト: jcccf/cs4740
 def predict(self, egs):
     """Predict word senses for every example in ``egs``.

     For each example the feature matrix is assembled column-wise in this
     order: context words, parts of speech, Lesk words (optional), Lesk
     (optional), syntactic dependencies (optional) and n-gram perplexities
     (optional). The per-word classifier is then applied to that matrix.

     Each example's ``word`` attribute is lower-cased in place.

     Returns:
         A single flat list: for each example, a 0/1 vector of length
         ``len(eg.senses)`` with ones at the predicted sense indices, all
         concatenated in example order.
     """
     predictions = []

     if self.use_syntactic_features:
         test_words = Syntactic_features.prepare_file(self.test_file)
         syn_feats = Syntactic_features.parse_stanford_output(self.test_file, test_words)

     # ``position`` doubles as the index into the parsed syntactic features,
     # which are consumed one per example in iteration order.
     for position, eg in enumerate(egs):
         eg.word = eg.word.lower()
         data, labels, pos, lesky, lesky_words = self.prepare_examples([eg], for_training=False)
         word = eg.word

         # Context words
         X = self.vectorizers[word].transform(data[word])

         # Parts of speech
         X_pos = self.pos_vectorizers[word].transform(pos[word])
         X = sps.hstack((X, X_pos))

         # Lesk words
         if self.use_lesk_words:
             X_leskywords = self.lesky_words_vectorizers[word].transform(lesky_words[word])
             X = sps.hstack((X, X_leskywords))

         # Lesk
         if self.use_lesk:
             X_lesk = MVectorizer.rectangularize(lesky[word])
             X = sps.hstack((X, X_lesk))

         # Syntactic dependencies: skipped when the example has no such
         # features at all or when no vectorizer exists for this word.
         if self.use_syntactic_features:
             eg_syn = syn_feats[position]
             has_features = not all(feat == [] for feat in eg_syn)
             if has_features and word in self.syn_vectorizers:
                 X_syn = self.syn_vectorizers[word].transform([eg_syn])
                 n_rows = X.shape[0]
                 syn_rows, syn_cols = X_syn.shape
                 if n_rows != syn_rows:
                     # Pad with empty rows so both blocks have equal height.
                     padding = sps.coo_matrix((n_rows - syn_rows, syn_cols))
                     X_syn = sps.vstack((X_syn, padding))
                 X = sps.hstack((X, X_syn))

         # N-gram model: one perplexity per sense for each sentence
         if self.ngram_size > 0:
             num_senses = self.nsenses[word]
             assert num_senses == len(eg.senses)
             perplexities = [
                 dict(
                     (idx, self.ngram[word + str(idx)].get_perplexity(sentence, True))
                     for idx in range(num_senses)
                 )
                 for sentence in data[word]
             ]
             X_ngram = MVectorizer.DictsVectorizer().fit_transform(perplexities)
             X = sps.hstack((X, X_ngram))

         Y = self.classifiers[word].predict(X)

         # Turn the predicted sense indices into a 0/1 indicator vector.
         senses = [0] * len(eg.senses)
         for sense_idx in Y[0]:
             senses[sense_idx] = 1
         predictions.extend(senses)

     return predictions