def train(labeled_featuresets, C=1e5):
        """
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
        """
        feat = [featureset for featureset, label in labeled_featuresets]
        feature_vectorizer = MVectorizer.DictsVectorizer()
        X = feature_vectorizer.fit_transform(feat)
        X = Normalizer().fit_transform(X)
        label_set = set( [label for featureset, label in labeled_featuresets] )
        label_vectorizer = dict( [(label,num) for num,label in enumerate(label_set)] )
        y = numpy.array([label_vectorizer[label] for featureset, label in labeled_featuresets])
        print "Training on %d examples with %d features..."%(X.shape[0],X.shape[1]),
        classifier = OneVsRestClassifier(LinearSVC(loss='l2', penalty='l2', dual=True, tol=1e-5, C=C, scale_C=True))
        classifier.fit(X,y)
        print "done"

        return scikit_classifier(feature_vectorizer,label_vectorizer,classifier)
Example #2
0
 def predict(self, egs):
     """
     Predict word senses for a list of examples.

     For each example the feature blocks are stacked horizontally in this
     order: context words, parts of speech, then (each only when the
     matching option is enabled) Lesk words, Lesk values, syntactic
     dependencies and n-gram perplexities.  The classifier stored for the
     example's word is then applied to the stacked matrix.

     Note that ``eg.word`` is lower-cased in place on every example.

     :param egs: Examples to classify; each one is read through its
         ``word`` and ``senses`` attributes.
     :return: A flat list of 0/1 flags, ``len(eg.senses)`` entries per
         example, with 1 at every index found in the first prediction row.
     """
     if self.use_syntactic_features:
         word_list = Syntactic_features.prepare_file(self.test_file)
         syntactic = Syntactic_features.parse_stanford_output(self.test_file, word_list)
         # Position of the current example inside ``syntactic``.
         syn_index = 0

     res = []
     for eg in egs:
         eg.word = eg.word.lower()
         data, _labels, pos, lesky, lesky_words = self.prepare_examples(
             [eg], for_training=False)
         word = eg.word

         # Context words followed by parts of speech are always present.
         X = sps.hstack((
             self.vectorizers[word].transform(data[word]),
             self.pos_vectorizers[word].transform(pos[word]),
         ))

         # Optional Lesk word features.
         if self.use_lesk_words:
             lesk_word_block = self.lesky_words_vectorizers[word].transform(lesky_words[word])
             X = sps.hstack((X, lesk_word_block))

         # Optional Lesk values.
         if self.use_lesk:
             X = sps.hstack((X, MVectorizer.rectangularize(lesky[word])))

         # Optional syntactic dependencies.  Nothing is appended when the
         # example has only empty feature lists or the word has no fitted
         # syntactic vectorizer.
         # NOTE(review): in that case X has no syntactic columns -- confirm
         # the classifier for this word was fit on a matching width.
         if self.use_syntactic_features:
             syn_feats = syntactic[syn_index]
             has_syn_feats = not all(synfeat == [] for synfeat in syn_feats)
             if has_syn_feats and word in self.syn_vectorizers:
                 X_syn = self.syn_vectorizers[word].transform([syn_feats])
                 missing_rows = X.shape[0] - X_syn.shape[0]
                 if missing_rows != 0:
                     # Append empty rows so both blocks have equal height.
                     padding = sps.coo_matrix((missing_rows, X_syn.shape[1]))
                     X_syn = sps.vstack((X_syn, padding))
                 X = sps.hstack((X, X_syn))
             syn_index += 1

         # Optional n-gram block: one perplexity per sense and sentence.
         if self.ngram_size > 0:
             num_senses = self.nsenses[word]
             assert num_senses == len(eg.senses)
             ngram_list = [
                 dict(
                     (idx, self.ngram[word + str(idx)].get_perplexity(sentence, True))
                     for idx in range(num_senses)
                 )
                 for sentence in data[word]
             ]
             X_ngram = MVectorizer.DictsVectorizer().fit_transform(ngram_list)
             X = sps.hstack((X, X_ngram))

         Y = self.classifiers[word].predict(X)

         # Flag every sense index present in the first prediction row.
         flags = [0] * len(eg.senses)
         for sense_idx in Y[0]:
             flags[sense_idx] = 1
         res.extend(flags)
     return res
Example #3
0
    def train(self,egs):
        # Trains a classifier for each word sense
        data,labels,pos,lesky,lesky_words,ngram,nsenses,syntactic = self.prepare_examples(egs,verbose=True)
        self.ngram = ngram
        self.nsenses = nsenses
        print "\nTraining on %d words"%len(data),
        for word in labels.iterkeys():
            sys.stdout.write(".")
            sys.stdout.flush()
            
            # Extract context features
            self.vectorizers[word] = Vectorizer()
            X = self.vectorizers[word].fit_transform(data[word])
            
            # Add Parts of Speech
            self.pos_vectorizers[word] = Vectorizer()
            X_pos = self.pos_vectorizers[word].fit_transform(pos[word])
            X = sps.hstack((X, X_pos))
              
            # Add Lesky Words
            if self.use_lesk_words:
              self.lesky_words_vectorizers[word] = Vectorizer()
              X_leskwords = self.lesky_words_vectorizers[word].fit_transform(lesky_words[word])
              X = sps.hstack((X, X_leskwords))
            
            # Add Lesky
            if self.use_lesk:
              X_lesk = MVectorizer.rectangularize(lesky[word])
              X = sps.hstack((X, X_lesk))
         
            # Add Syntactic dependencies
            if (self.use_syntactic_features):
                if all(synfeat == [] for synfeat in syntactic[word]):
                    pass
                else:
                    self.syn_vectorizers[word] = MVectorizer.ListsVectorizer()
                    X_syn = self.syn_vectorizers[word].fit_transform(syntactic[word])
                    (x_rows,x_cols) = X.shape
                    (xsyn_rows,xsyn_cols) = X_syn.shape
                    if x_rows != xsyn_rows:
                        X_filler = sps.coo_matrix((x_rows-xsyn_rows,xsyn_cols))
                        X_syn = sps.vstack((X_syn,X_filler))
                    X = sps.hstack((X, X_syn))
            
            # Add NGram model
            if self.ngram_size > 0:
                num_senses = self.nsenses[word]
                ngram_list = []
                for sentence in data[word]:
                    ngram_list.append( dict([ ( idx, self.ngram[word+str(idx)].get_perplexity(sentence,True) ) for idx in range(0,num_senses) ]) )
                X_ngram = MVectorizer.DictsVectorizer().fit_transform(ngram_list)
                X = sps.hstack((X, X_ngram))
                
            Y = labels[word]
            
            # Learn classifier
            # self.classifiers[word] = OneVsRestClassifier(SVC(kernel='linear',scale_C=True)) #Doesn't work
            self.classifiers[word] = OneVsRestClassifier(LinearSVC(loss='l2', penalty='l2', dual=False, tol=1e-3))
            self.classifiers[word].fit(X,Y)

        print "\nDone"