np.random.seed(0)


class LabeledLineSentence(object):
    """Iterable corpus yielding one ``TaggedDocument`` per text line.

    Each line is split on single spaces and tagged ``"S_<position>"``,
    where the position is the line's index in ``data``.
    """

    def __init__(self, data):
        # Raw text lines; enumerated afresh on every iteration pass.
        self.data = data

    def __iter__(self):
        for uid, line in enumerate(self.data):
            yield TaggedDocument(line.split(" "), ["S_%s" % uid])


# alpha == min_alpha at construction; the rate is lowered by hand in the
# epoch loop further down.
doc2vec = Doc2Vec(alpha=0.025, min_alpha=0.025)

# Train and test texts share one corpus, training texts first, so the
# document-vector lookups below can address them by position.
sentences = LabeledLineSentence(train_texts + test_texts)
doc2vec.build_vocab(sentences)
doc2vec.train(sentences)

# Best-effort: try to overwrite each vocabulary word's vector with its
# pretrained embedding; any word for which that fails is skipped.
# NOTE(review): every failure is skipped silently -- including an
# unsupported item assignment on the model, if that is the case -- so
# confirm that vectors are actually being replaced here.
for word in doc2vec.vocab:
    try:
        doc2vec[word] = embeddings[word]
    except Exception:
        # Deliberately broad (the type of ``embeddings`` is not visible
        # here), but unlike a bare ``except:`` it lets KeyboardInterrupt
        # and SystemExit propagate.
        continue

# Ten further passes; after each one the learning rate drops by 0.002 and
# min_alpha is pinned to the same value.
for epoch in range(10):
    doc2vec.train(sentences)
    doc2vec.alpha -= 0.002
    doc2vec.min_alpha = doc2vec.alpha

# Assumes train_matrix / test_matrix have one row per entry of
# train_texts / test_texts respectively -- TODO confirm against the caller.
n_train = train_matrix.shape[0]
n_test = test_matrix.shape[0]
train_rep = np.array([doc2vec.docvecs[i] for i in range(n_train)])
test_rep = np.array([doc2vec.docvecs[n_train + i] for i in range(n_test)])

# ``model`` ends up bound to the classifier, as in the original script.
model = LinearSVC(random_state=0)
model.fit(train_rep, train_labels)
results = model.predict(test_rep)

print("Accuracy = " + repr(sklearn.metrics.accuracy_score(test_labels, results)))
print(sklearn.metrics.classification_report(test_labels, results))