def doc_vect(filename):
    documents = []
    from gensim.models.doc2vec import TaggedDocument
    with open(filename, 'r') as fo:
        for line in fo.readlines():
            tokens = line.strip().split('\t')
            if tokens[2] != '0':
                sentence = TaggedDocument(tokens[1].split(), [tokens[0]])
                documents.append(sentence)
    print len(documents)
    from gensim.models.doc2vec import Doc2Vec
    import multiprocessing
    cores = multiprocessing.cpu_count()
    simple_models = [
        # PV-DM w/ concatenation - window=5 (both sides) approximates paper's 10-word total window size
        Doc2Vec(documents, dm=1, dm_concat=1, size=100, window=5, negative=5,
                hs=0, min_count=1, workers=cores),
        # PV-DBOW
        Doc2Vec(documents, dm=0, size=100, negative=5, hs=0, min_count=1,
                workers=cores),
        # PV-DM w/ average
        Doc2Vec(documents, dm=1, dm_mean=1, size=100, window=5, negative=5,
                hs=0, min_count=1, workers=cores),
    ]
    '''Inspect model'''
    # for model in simple_models[:1]:
    #     print model
    #     for label in range(1, 11):
    #         inferred_docvec = model.docvecs[str(label)]
    #         print label
    #         print('%s:\n %s' % (model, model.most_similar(str(label))))
    from collections import OrderedDict
    models_by_name = OrderedDict((str(model), model) for model in simple_models)
    from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
    models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])
    models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])
    return models_by_name
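
# Hedged usage sketch for the function above (the file path is illustrative,
# not from the original): the input is expected to be tab-separated lines of
# <tag>\t<text>\t<label>, and rows whose label is '0' are skipped.
models_by_name = doc_vect('data/documents.tsv')
for name in models_by_name:
    print name
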
def doc_vect(alldocs):
    print 'Doc2Vec: each tag is an ID'
    train_docs = [doc for doc in alldocs if doc.split == 'train']
    test_docs = [doc for doc in alldocs if doc.split == 'test']
    print('%d docs: %d train-sentiment, %d test-sentiment'
          % (len(alldocs), len(train_docs), len(test_docs)))
    documents = []
    for doc in train_docs:
        sentence = TaggedDocument(doc.words, doc.tags)
        documents.append(sentence)
    print len(documents)
    cores = multiprocessing.cpu_count()
    simple_models = [
        # PV-DM w/ concatenation - window=5 (both sides) approximates paper's 10-word total window size
        Doc2Vec(documents, dm=1, dm_concat=1, size=400, window=5, negative=5,
                hs=1, sample=1e-3, iter=20, min_count=1, workers=cores),
        # PV-DBOW
        Doc2Vec(documents, dm=0, size=400, window=5, negative=5, hs=1,
                sample=1e-3, iter=20, min_count=1, workers=cores),
        # PV-DM w/ average
        Doc2Vec(documents, dm=1, dm_mean=1, size=400, window=5, negative=5,
                hs=1, sample=1e-3, iter=20, min_count=1, workers=cores),
    ]
    models_by_name = OrderedDict((str(model), model) for model in simple_models)
    models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])
    models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])
    for name, model in models_by_name.items():
        print name
        train_targets, train_regressors = zip(
            *[(doc.sentiment, model.docvecs[doc.tags[0]]) for doc in train_docs])
        test_targets, test_regressors = zip(
            *[(doc.sentiment, model.infer_vector(doc.words)) for doc in test_docs])
        util.logit(train_regressors, train_targets, test_regressors, test_targets)
        util.svm(train_regressors, train_targets, test_regressors, test_targets)
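
# Hypothetical sketch of the util.logit and util.svm helpers assumed above
# (the original util module is not shown), using scikit-learn classifiers
# on the doc2vec regressors.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

def logit(train_X, train_y, test_X, test_y):
    clf = LogisticRegression()
    clf.fit(np.asarray(train_X), np.asarray(train_y))
    print('logit accuracy: %.4f' % accuracy_score(test_y, clf.predict(np.asarray(test_X))))

def svm(train_X, train_y, test_X, test_y):
    clf = LinearSVC()
    clf.fit(np.asarray(train_X), np.asarray(train_y))
    print('svm accuracy: %.4f' % accuracy_score(test_y, clf.predict(np.asarray(test_X))))
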
def doc_vect(filename): """Using ground-truth labels or predicted labels in Word2Vec""" df = label_generate(filename) names = list(df.columns.values) # print names textid = names.index('Text') predid = names.index('Label') documents = [] from gensim.models.doc2vec import TaggedDocument for line in df.itertuples(): # print line # print line[textid+1] # print line[predid+1] sentence = TaggedDocument(line[textid + 1].split(), [str(line[predid + 1])]) documents.append(sentence) print len(documents) from gensim.models.doc2vec import Doc2Vec import multiprocessing cores = multiprocessing.cpu_count() simple_models = [ # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size Doc2Vec(documents, dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=1, workers=cores), # PV-DBOW Doc2Vec(documents, dm=0, size=100, negative=5, hs=0, min_count=1, workers=cores), # PV-DM w/average Doc2Vec(documents, dm=1, dm_mean=1, size=100, window=5, negative=5, hs=0, min_count=1, workers=cores), ] models_by_name = OrderedDict( (str(model), model) for model in simple_models) from gensim.test.test_doc2vec import ConcatenatedDoc2Vec models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec( [simple_models[1], simple_models[2]]) models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec( [simple_models[1], simple_models[0]]) pickle.dump(models_by_name, open('data/doc2vec.pick', 'w'))
def vect_all(X_train, y_train, X_test, y_predict):
    documents = []
    from gensim.models.doc2vec import TaggedDocument
    for line in zip(X_train, y_train):
        sentence = TaggedDocument(line[0].split(), [str(line[1])])
        documents.append(sentence)
    for line in zip(X_test, y_predict):
        sentence = TaggedDocument(line[0].split(), [str(line[1])])
        documents.append(sentence)
    print len(documents)
    from gensim.models.doc2vec import Doc2Vec
    import multiprocessing
    cores = multiprocessing.cpu_count()
    simple_models = [
        # PV-DM w/ concatenation - window=5 (both sides) approximates paper's 10-word total window size
        Doc2Vec(documents, dm=1, dm_concat=1, size=100, window=5, negative=5,
                hs=0, min_count=1, workers=cores),
        # PV-DBOW
        Doc2Vec(documents, dm=0, size=100, negative=5, hs=0, min_count=1,
                workers=cores),
        # PV-DM w/ average
        Doc2Vec(documents, dm=1, dm_mean=1, size=100, window=5, negative=5,
                hs=0, min_count=1, workers=cores),
    ]
    from collections import OrderedDict
    models_by_name = OrderedDict((str(model), model) for model in simple_models)
    from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
    models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])
    models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])
    return models_by_name
def train_models(docs, passes):
    cores = multiprocessing.cpu_count()
    models = [
        # PV-DM w/ concatenation - window=5 (both sides) approximates paper's 10-word total window size
        gensim.models.Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5,
                              hs=0, min_count=2, workers=cores),
        # PV-DBOW
        gensim.models.Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2,
                              workers=cores),
        # PV-DM w/ average
        gensim.models.Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5,
                              hs=0, min_count=2, workers=cores),
        # default Doc2Vec (PV-DM)
        gensim.models.Doc2Vec(size=100, window=8, min_count=5, workers=4),
    ]
    print('Initializing all models, building vocabularies...')
    # Alternatively, share the first model's vocabulary scan:
    # models[0].build_vocab(docs)
    # for model in models[1:]:
    #     model.reset_from(models[0])
    for model in models:
        print('---', model)
        model.build_vocab(docs)
    # models_by_name = OrderedDict((str(model), model) for model in models)
    models_by_name = OrderedDict()
    models_by_name['default'] = models[3]
    models_by_name['dmc'] = models[0]
    models_by_name['dbow'] = models[1]
    models_by_name['dmm'] = models[2]
    models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([models[1], models[2]])
    models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([models[1], models[0]])
    # manually decrease the learning rate over passes
    alpha, min_alpha = (0.025, 0.001)
    alpha_delta = (alpha - min_alpha) / passes
    for epoch in range(passes):
        print('==================== pass', epoch)
        # shuffle(docs)
        for name, model in models_by_name.items():
            model.alpha, model.min_alpha = alpha, alpha  # fix the learning rate for this pass
            print('--- training model', name, 'at alpha', alpha)
            with elapsed_timer() as elapsed:
                model.train(docs)
            print('------ finished after %0.1fs' % elapsed())
            if epoch % 3 == 0:
                visualize(model.docvecs, docs, name + '_' + str(epoch))
        alpha -= alpha_delta
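
# The elapsed_timer and visualize helpers above are assumed but not shown.
# A common elapsed_timer definition, in the style of the gensim doc2vec-IMDB
# tutorial (visualize is left to the reader):
from contextlib import contextmanager
from timeit import default_timer

@contextmanager
def elapsed_timer():
    start = default_timer()
    elapsed = lambda: default_timer() - start
    yield lambda: elapsed()
    end = default_timer()
    elapsed = lambda: end - start
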
def load_model(self, model_dir):
    """Load models"""
    for name in self.name_to_models.keys():
        self.name_to_models[name] = Doc2Vec.load(model_dir + name)
    loaded_model = ConcatenatedDoc2Vec(list(self.name_to_models.values()))
    return loaded_model
def fit(self, documents, y=None): if self.model_name: print("Loading Book2Vec") model_data = os.path.join(self.model_dir, 'model_%s.doc2vec' % self.model_name) if self.model_name == 'dbow_dmm' or self.model_name == 'dbow_dmc': m1 = os.path.join( self.model_dir, 'model_%s.doc2vec' % self.model_name.split('_')[0]) m2 = os.path.join( self.model_dir, 'model_%s.doc2vec' % self.model_name.split('_')[1]) model1 = Doc2Vec.load(m1) model2 = Doc2Vec.load(m2) self.model_ = ConcatenatedDoc2Vec([model1, model2]) self.num_features_ = model1.syn0.shape[1] + model2.syn0.shape[1] else: self.model_ = Doc2Vec.load(model_data) self.num_features_ = self.model_.syn0.shape[1] print(self.num_features_) print("Done Loading vectors") else: raise OSError("Model does not exit") return self
def fit(self, documents, y=None): print("Here===> {}".format(self.model_name)) if self.model_name: print("Loading Vectors") model_data = os.path.join(self.model_dir, "model_%s.doc2vec" % self.model_name) if self.model_name == "dbow_dmm" or self.model_name == "dbow_dmc": m1 = os.path.join( self.model_dir, "model_%s.doc2vec" % self.model_name.split("_")[0]) m2 = os.path.join( self.model_dir, "model_%s.doc2vec" % self.model_name.split("_")[1]) model1 = Doc2Vec.load(m1) model2 = Doc2Vec.load(m2) self.model_ = ConcatenatedDoc2Vec([model1, model2]) self.num_features_ = model1.wv.syn0.shape[ 1] + model2.wv.syn0.shape[1] else: self.model_ = Doc2Vec.load(model_data) self.num_features_ = self.model_.wv.syn0.shape[1] print(self.num_features_) print("Done Loading vectors") else: print("Hereeeeee ", self.model_name) raise OSError("Model does not exit") return self
def infer_embedding(model_no, reviews, reviews_size, concatenate):
    """Infer doc2vec embeddings for reviews.

    :param model_no: which Doc2Vec model to load, or a pair of model ids if concatenate
    :param reviews: all reviews (training or test), as list(list(str))
    :param reviews_size: int; reviews before this index are labeled 0., the rest 1.
    :param concatenate: whether to concatenate two doc2vec models
    """
    if not concatenate:
        model = load_model(model_no)
        print("description of the doc2vec model\t", str(model))
    else:
        model_1 = load_model(model_no[0])
        model_2 = load_model(model_no[1])
        print("description of the 1st doc2vec model\t", str(model_1))
        print("description of the 2nd doc2vec model\t", str(model_2))
        model = ConcatenatedDoc2Vec([model_1, model_2])
    sentiment_review = namedtuple('Sentiment_Review', 'words tags sentiment')
    allreviews = []
    bar = progressbar.ProgressBar()
    for i in bar(range(len(reviews))):
        tags = [i]
        if i < reviews_size:
            allreviews.append(sentiment_review(reviews[i], tags, 0.))
        else:
            allreviews.append(sentiment_review(reviews[i], tags, 1.))
    # infer the doc2vec embeddings with the given model
    vectors, labels = vector_4_learning(model, allreviews)
    return vectors, labels
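
# Hypothetical sketch of the vector_4_learning helper used above (the original
# is not shown): infer one vector per review and collect the sentiment labels.
# ConcatenatedDoc2Vec.infer_vector concatenates the sub-models' inferred vectors.
def vector_4_learning(model, tagged_reviews):
    vectors = [model.infer_vector(review.words) for review in tagged_reviews]
    labels = [review.sentiment for review in tagged_reviews]
    return vectors, labels
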
def build_doc2vec(all_docs, size_in):
    cores = multiprocessing.cpu_count()
    assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"
    models_list = [
        # Doc2Vec(dm=1, dm_mean=1, size=400, window=10, negative=11, hs=0, min_count=1, workers=cores),
        # Doc2Vec(dm=1, dm_mean=1, size=500, window=10, negative=11, hs=0, min_count=1, workers=cores),
        Doc2Vec(dm=1, dm_mean=1, size=size_in, window=3, negative=11, hs=0,
                min_count=1, workers=cores),
        # Doc2Vec(dm=0, size=400, negative=11, hs=0, min_count=1, workers=cores),
        # Doc2Vec(dm=0, size=500, negative=11, hs=0, min_count=1, workers=cores),
        Doc2Vec(dm=0, size=500, negative=11, hs=0, min_count=1, workers=cores),
    ]
    # all_docs must already be loaded before running this
    models_list[0].build_vocab(all_docs)
    print models_list[0]
    for model in models_list[1:]:
        model.reset_from(models_list[0])
        print model
    models_by_name = OrderedDict((str(model), model) for model in models_list)
    models_by_name['dbow+dmm_500'] = ConcatenatedDoc2Vec([models_list[0], models_list[1]])
    # models_by_name['dbow+dmm_500'] = ConcatenatedDoc2Vec([models_list[1], models_list[4]])
    # models_by_name['dbow+dmm_600'] = ConcatenatedDoc2Vec([models_list[2], models_list[5]])
    return models_by_name
def get_model(self):
    # recreate the models if either saved file is missing
    if not os.path.isfile(self.dbow_path) or not os.path.isfile(self.dm_path):
        self.create_models()
    dbow_model = Doc2Vec.load(self.dbow_path)
    dm_model = Doc2Vec.load(self.dm_path)
    return ConcatenatedDoc2Vec([dbow_model, dm_model])
def get_text_feature(texts):
    model_dir = os.path.dirname(os.path.abspath(__file__))
    model_dbow = Doc2Vec.load(model_dir + '/doc2vector_dbow.doc2vec')
    model_dm = Doc2Vec.load(model_dir + '/doc2vector_dm.doc2vec')
    model = ConcatenatedDoc2Vec([model_dm, model_dbow])
    result = []
    for i, text in enumerate(texts, start=1):
        if i % 1000 == 0:
            print(i, "texts of", len(texts), "texts have been loaded")
        result.append(model.infer_vector(document=Preprocessing(text),
                                         alpha=0.025, min_alpha=0.025, steps=100))
    return torch.tensor(result)
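
# Hypothetical sketch of the Preprocessing helper assumed above (the original
# is not shown): lowercase and tokenize a raw string into the token list that
# infer_vector expects.
from gensim.utils import simple_preprocess

def Preprocessing(text):
    return simple_preprocess(text)
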
def create_models(corpus, vector_size):
    models = [
        # PV-DM w/ concatenation
        Doc2Vec(dm=1, dm_concat=1, vector_size=vector_size, window=5, sample=1e-4,
                negative=5, hs=0, min_count=3, workers=WORKERS),
        # PV-DBOW
        Doc2Vec(dm=0, vector_size=vector_size, negative=5, hs=0, sample=1e-4,
                min_count=3, workers=WORKERS),
        # PV-DM w/ average
        Doc2Vec(dm=1, dm_mean=1, vector_size=vector_size, window=10, sample=1e-4,
                negative=5, hs=0, min_count=3, workers=WORKERS)
    ]
    build_vocabulary(models, corpus)
    models_by_name = OrderedDict((str(model), model) for model in models)
    models_by_name["dbow + dmm"] = ConcatenatedDoc2Vec([models[1], models[2]])
    models_by_name["dbow + dmc"] = ConcatenatedDoc2Vec([models[1], models[0]])
    return models_by_name
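
# Hypothetical sketch of the build_vocabulary helper used above (not shown in
# the original): build each model's vocabulary from the same corpus.
def build_vocabulary(models, corpus):
    for model in models:
        model.build_vocab(corpus)
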
def __init__(self, model_dm_path, model_dbow_path):
    """
    Load DM and DBOW trained models, create concatenated model
    :param model_dm_path: path to DM model
    :param model_dbow_path: path to DBOW model
    """
    logger.info('Combining DM and DBOW models')
    self.tokenizer = RegexpTokenizer(r'\w+')
    self.model_dm = Doc2Vec.load(model_dm_path)
    self.model_dbow = Doc2Vec.load(model_dbow_path)
    self.model_dbow.delete_temporary_training_data(keep_doctags_vectors=True,
                                                   keep_inference=True)
    self.model_dm.delete_temporary_training_data(keep_doctags_vectors=True,
                                                 keep_inference=True)
    self.concatenated_model = ConcatenatedDoc2Vec([self.model_dbow, self.model_dm])
def doc2vec_feats(train, test, dim1, dim2):
    train_corpus = train['Text'].values
    test_corpus = test['Text'].values
    corpus = pd.concat((train, test), axis=0)
    cores = multiprocessing.cpu_count()
    train_doc2vec_dm(corpus['Text'].values, dim1, cores)
    train_doc2vec_dbow(corpus['Text'].values, dim2, cores)
    print('Computing doc2vec (PV-DM + PV-DBOW) features...')
    model_dm = gensim.models.doc2vec.Doc2Vec.load(
        'data/models/doc2vec/' + str(dim1) + '_dim/neurips_d2v_dm')
    model_dbow = gensim.models.doc2vec.Doc2Vec.load(
        'data/models/doc2vec/' + str(dim2) + '_dim/neurips_d2v_dbow')
    model = ConcatenatedDoc2Vec([model_dm, model_dbow])
    cor = list(read_corpus(train_corpus))
    train_feats = np.array([model.infer_vector(doc.words) for doc in cor])
    del cor
    pd.DataFrame(data=train_feats).to_csv(
        'data/features/doc2vec_dm' + str(dim1) + '_dbow' + str(dim2) + '_train.csv',
        header=False, index=False)
    cor = list(read_corpus(test_corpus))
    test_feats = np.array([model.infer_vector(doc.words) for doc in cor])
    pd.DataFrame(data=test_feats).to_csv(
        'data/features/doc2vec_dm' + str(dim1) + '_dbow' + str(dim2) + '_test.csv',
        header=False, index=False)
# Testing F1 score: 0.10642747269276188
print(logreg.predict([model_dmm.infer_vector(tokenize_text('fractura proximal'), steps=20)]))
# ['s52.102']

# Free up RAM
model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

# Model pairing: concatenate the two models
new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

def get_vectors(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20))
                                for doc in sents])
    return targets, regressors

# Train logistic regression on the concatenated vectors
y_train, X_train = get_vectors(new_model, train_tagged)
y_test, X_test = get_vectors(new_model, test_tagged)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
def labeldoc_vect(alldocs):
    print 'LabelDoc2Vec with line number as ID'
    train_docs = [doc for doc in alldocs if doc.split == 'train']
    test_docs = [doc for doc in alldocs if doc.split == 'test']
    # unlabeled_docs = [doc for doc in alldocs if doc.split == 'extra']
    print('%d docs: %d train, %d test'
          % (len(alldocs), len(train_docs), len(test_docs)))
    documents = []
    for doc in train_docs:
        slabel = []
        if doc.split == 'train':
            slabel = ['s' + str(doc.label)]
        sentence = LabeledTaggedDocument(doc.words, doc.tags, slabel)
        documents.append(sentence)
    # for doc in unlabeled_docs:
    #     slabel = []
    #     if doc.split == 'train':
    #         slabel = ['s' + str(doc.label)]
    #     sentence = LabeledTaggedDocument(doc.words, doc.tags, slabel)
    #     documents.append(sentence)
    print len(documents)
    cores = multiprocessing.cpu_count()
    simple_models = [
        # PV-DM w/ concatenation
        LabelDoc2Vec(documents, dm=1, dm_concat=1, size=size, window=window,
                     negative=5, hs=1, sample=1e-3, iter=iter, min_count=1,
                     workers=cores),
        # PV-DBOW
        LabelDoc2Vec(documents, dm=0, size=size, window=window, negative=5,
                     hs=1, sample=1e-3, iter=iter, min_count=1, workers=cores),
        # PV-DM w/ average
        LabelDoc2Vec(documents, dm=1, dm_mean=1, size=size, window=window,
                     negative=5, hs=1, sample=1e-3, iter=iter, min_count=1,
                     workers=cores),
    ]
    models_by_name = OrderedDict((str(model), model) for model in simple_models)
    models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])
    models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])
    for name, model in models_by_name.items():
        print name
        train_targets, train_regressors = zip(
            *[(doc.label, model.docvecs[doc.tags[0]]) for doc in train_docs])
        test_targets, test_regressors = zip(
            *[(doc.label, model.infer_vector_label(doc.words)) for doc in test_docs])
        data_util.logit(train_regressors, train_targets, test_regressors, test_targets)
dm_model_concat = Doc2Vec(dm=1, dm_concat=1, vector_size=50, workers=cores,
                          epochs=100, min_count=2, negative=5, hs=0, sample=0)
dm_model_concat.build_vocab(train_tagged_c.tolist())
dm_model_concat.train(train_tagged_c, total_examples=len(train_tagged_c),
                      epochs=dm_model_concat.epochs)

# mixed_model = ConcatenatedDoc2Vec([skip_gram_model, dm_model])
mixed_model_with_concat = ConcatenatedDoc2Vec([skip_gram_model, dm_model_concat])

doc_vectors = []
for ind_ in order_level_one_month.index:
    doc_vectors.append(mixed_model_with_concat.docvecs[ind_])
doc_vec_df = pd.DataFrame(doc_vectors)
# 100 = 50 (skip-gram model) + 50 (DM/concat model) dimensions
doc_vec_df.columns = [f"custvec_{col}" for col in range(100)]
order_level_one_month = pd.concat([order_level_one_month, doc_vec_df], axis=1)

skip_gram_model.delete_temporary_training_data(False, False)  # free up memory
dm_model_concat.delete_temporary_training_data(False, False)  # free up memory

##########################
skip_gram_model = Doc2Vec(dm=0, vector_size=50,
from gensim import utils
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import BayesianRidge
from sklearn.svm import LinearSVR

OPTIMAL_INFER_STEPS_FOR_TEST = 9000
INFER_STEPS = OPTIMAL_INFER_STEPS_FOR_TEST

# Load models
model_dir = "./reviews/models/"
model_dbow = Doc2Vec.load(model_dir + 'model.dbow')
model_dm_mean = Doc2Vec.load(model_dir + 'model.dm_mean')
model_dm_concat = Doc2Vec.load(model_dir + "model.dm_concat")
model = ConcatenatedDoc2Vec([model_dbow, model_dm_concat])
# model = model_dm_mean
# model = model_dm_concat
# model = model_dbow

# Training set
label_to_num = OrderedDict()  # label -> number of that label's sentences
label_to_num[0] = 50000
label_to_num[1] = 50000
label_to_num[2] = 50000
label_to_num[3] = 50000
label_to_train_tag = {
    0: "TRAIN_ONE_STAR_",
    1: "TRAIN_TWO_STAR_",
    2: "TRAIN_THREE_STAR_",
    3: "TRAIN_FOUR_STAR_"
              epochs=5, batch_size=10, verbose=2)
model_dmm.save('/Users/ganfeng/Documents/Data Science/Projects/Natural Language Processing Project/Neural_Nets/model_dmm_01')
prediction_model_dmm = np.argmax(model_dmm.predict(test_vecs_dmm), axis=-1)
np.save('/Users/ganfeng/Documents/Data Science/Projects/Natural Language Processing Project/Neural_Nets/prediction_model_dmm.npy',
        prediction_model_dmm)
print(accuracy_score(prediction_model_dmm, label_test))
print(classification_report(prediction_model_dmm, label_test))

##########################################
# Concatenate the above two models
model_merge = ConcatenatedDoc2Vec([gensim_model_d2v_dbow_load, gensim_model_d2v_dmm_load])
train_vecs_merge = get_vectors(model_merge, df_review_train_loaded['text'][0:600000], 200)
validation_vecs_merge = get_vectors(model_merge, df_review_train_loaded['text'][600000:800000], 200)
test_vecs_merge = get_vectors(model_merge, df_review_test_loaded['text'], 200)

# Build the neural net (note: model_merge is reused here as the Keras classifier,
# shadowing the ConcatenatedDoc2Vec above, which is no longer needed)
seed = 100
np.random.seed(seed)
model_merge = Sequential()
model_merge.add(Dense(50, activation='relu', input_dim=200))
model_merge.add(Dense(6, activation='softmax'))
opt = keras.optimizers.Adam(learning_rate=0.0002)
model_merge.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_merge.fit(train_vecs_merge, y_train_final,
                validation_data=(validation_vecs_merge, y_validation_final),
def d2v(train_tagged, valid_tagged, d2v_model, d2v_alpha, d2v_min_alpha,
        vector_size, min_count, max_vocab_size):
    model_dm = Doc2Vec(
        dm=1,                     # 1: PV-DM; 0: PV-DBOW
        dm_mean=1,                # 0: sum of the context word vectors; 1: mean
        epochs=20,
        seed=1234,
        workers=1,
        alpha=d2v_alpha,
        min_alpha=d2v_min_alpha,  # initial learning rate; linearly drops to min_alpha as training progresses
        vector_size=vector_size,  # feature vector dimensionality
        min_count=min_count,
        max_vocab_size=max_vocab_size,
        hs=0,                     # 1: hierarchical softmax is used
        negative=5)               # negative sampling: how many "noise words" are drawn
    model_dbow = Doc2Vec(
        dm=0,                     # 1: PV-DM; 0: PV-DBOW
        epochs=20,
        seed=1234,
        workers=1,
        alpha=d2v_alpha,
        min_alpha=d2v_min_alpha,  # initial learning rate; linearly drops to min_alpha as training progresses
        vector_size=vector_size,  # feature vector dimensionality
        min_count=min_count,
        max_vocab_size=max_vocab_size,
        hs=0,                     # 1: hierarchical softmax is used
        negative=5)               # negative sampling: how many "noise words" are drawn

    if d2v_model == "dm":
        model = model_dm
        model.build_vocab(train_tagged)
        model.train(train_tagged, total_examples=len(train_tagged.values),
                    epochs=model.epochs)
    elif d2v_model == "dbow":
        model = model_dbow
        model.build_vocab(train_tagged)
        model.train(train_tagged, total_examples=len(train_tagged.values),
                    epochs=model.epochs)
    else:
        # Train DM
        model_dm.build_vocab(train_tagged)
        model_dm.train(train_tagged, total_examples=len(train_tagged.values),
                       epochs=model_dm.epochs)
        # Train DBOW
        model_dbow.build_vocab(train_tagged)
        model_dbow.train(train_tagged, total_examples=len(train_tagged.values),
                         epochs=model_dbow.epochs)
        # Delete temporary training data to free up RAM
        model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
        model_dm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
        # Combine DM and DBOW
        model = ConcatenatedDoc2Vec([model_dbow, model_dm])

    train_docs = train_tagged.values
    Y_train, X_train_vec = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20))
                                 for doc in train_docs])
    valid_docs = valid_tagged.values
    Y_valid, X_valid_vec = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20))
                                 for doc in valid_docs])
    return X_train_vec, Y_train, X_valid_vec, Y_valid
    # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0,
            min_count=2, workers=cores),
    # PV-DBOW
    Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DM w/average
    Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0,
            min_count=2, workers=cores),
]

# Speed setup: share the results of the 1st model's vocabulary scan.
# PV-DM/concat requires one special NULL word, so it serves as the template.
simple_models[0].build_vocab(doc_list)
for model in simple_models[1:]:
    model.reset_from(simple_models[0])
print("got model")

from gensim.test.test_doc2vec import ConcatenatedDoc2Vec

# Create dbow+dmm model (concatenation of models 2 and 3)
train_model = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])

# Load XLNet features
xlnet_train = np.load("bert_features.npy").tolist()
xlnet_features = np.load("bert_test_features.npy").tolist()

alpha, min_alpha, passes = (0.025, 0.001, 20)
alpha_delta = (alpha - min_alpha) / passes

print("begin evaluation")
accs = []
filterwarnings('ignore')
for epoch in range(passes):
    # Shuffle data
    # random.seed(epoch)
    docs_features = random.sample(list(zip(doc_list, xlnet_train)), len(doc_list))
    random.seed(epoch)
    doc_list = random.sample(doc_list, len(doc_list))
def all_techniques(data1, target, text, test_percent, min_df, max_df,
                   ngram_range1, ngram_range2, vector_size1, window1,
                   negative1, min_count1):
    # Doc2Vec parameters: vector_size, window, negative, min_count

    # Train/test split
    X_train, X_test, y_train, y_test = data_prep(data1, target, text, test_percent)

    # Simple BOW ********************************************************************
    # min_df removes terms that appear too infrequently, e.g.:
    #   min_df = 0.01 -> ignore terms that appear in less than 1% of the documents
    #   min_df = 5    -> ignore terms that appear in fewer than 5 documents
    # max_df removes terms that appear too frequently ("corpus-specific stop words"), e.g.:
    #   max_df = 0.50 -> ignore terms that appear in more than 50% of the documents
    #   max_df = 25   -> ignore terms that appear in more than 25 documents

    # With plain lyrics
    TEXT = text
    # Tokenization
    vect = CountVectorizer(max_df=max_df, min_df=min_df).fit(X_train[TEXT])
    # Transform the training documents into a document-term matrix
    X_train_vectorized = vect.transform(X_train[TEXT])
    X_test_vectorized = vect.transform(X_test[TEXT])
    # Models with only lyrics
    output1_bow_plain_lyrics = log_reg_and_svm(X_train_vectorized, y_train,
                                               X_test_vectorized, y_test)

    # Tf-Idf ************************************************************************
    vect_tf = TfidfVectorizer(min_df=min_df, max_df=max_df).fit(X_train[TEXT])
    X_train_vectorized_tf = vect_tf.transform(X_train[TEXT])
    X_test_vectorized_tf = vect_tf.transform(X_test[TEXT])
    output3_tfidf_plain_lyrics = log_reg_and_svm(X_train_vectorized_tf, y_train,
                                                 X_test_vectorized_tf, y_test)

    # N-grams ***********************************************************************
    # Document frequency threshold of min_df, extracting n-grams in the given range
    vect3 = CountVectorizer(min_df=min_df,
                            ngram_range=(ngram_range1, ngram_range2)).fit(X_train[TEXT])
    X_train_vectorized_ng = vect3.transform(X_train[TEXT])
    X_test_vectorized_ng = vect3.transform(X_test[TEXT])
    output5_ngram_plain_lyrics = log_reg_and_svm(X_train_vectorized_ng, y_train,
                                                 X_test_vectorized_ng, y_test)

    # Doc2Vec ***********************************************************************
    train_doc = pd.DataFrame(pd.concat([X_train[TEXT], y_train], axis=1))
    test_doc = pd.DataFrame(pd.concat([X_test[TEXT], y_test], axis=1))
    train_doc2 = train_doc.apply(lambda x: TaggedDocument(
        words=tokenize_text(x[TEXT]), tags=[x[target]]), axis=1)
    test_doc2 = test_doc.apply(lambda x: TaggedDocument(
        words=tokenize_text(x[TEXT]), tags=[x[target]]), axis=1)
    # DBOW is the Doc2Vec model analogous to the skip-gram model in Word2Vec.
    # The paragraph vectors are obtained by training a neural network to predict
    # a probability distribution of words in a paragraph, given a randomly
    # sampled word from the paragraph.
    # min_count discards words with very few occurrences.
    cores = multiprocessing.cpu_count()
    model_dbow = Doc2Vec(dm=0, vector_size=vector_size1, window=window1,
                         negative=negative1, min_count=min_count1, hs=0,
                         workers=cores, epochs=200)
    train_corpus = [x for x in train_doc2.values]
    model_dbow.build_vocab(train_corpus)
    model_dbow.train(train_corpus, total_examples=model_dbow.corpus_count,
                     epochs=model_dbow.epochs)
    y_train_doc, X_train_doc = get_vectors(model_dbow, train_doc2)
    y_test_doc, X_test_doc = get_vectors(model_dbow, test_doc2)
    # Models
    output7_doc2vec_dbow_plain_lyrics = log_reg_and_svm(
        X_train_doc, y_train_doc, X_test_doc, y_test_doc)

    # dm=1 model
    model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=vector_size1, window=window1,
                        negative=negative1, min_count=min_count1, workers=cores,
                        epochs=200)
    model_dmm.build_vocab(train_corpus)
    model_dmm.train(train_corpus, total_examples=model_dmm.corpus_count,
                    epochs=model_dmm.epochs)
    y_train_doc, X_train_doc = get_vectors(model_dmm, train_doc2)
    y_test_doc, X_test_doc = get_vectors(model_dmm, test_doc2)
    output8_doc2vec_dm_plain_lyrics = log_reg_and_svm(X_train_doc, y_train_doc,
                                                      X_test_doc, y_test_doc)

    # Mix of dbow and dmm
    model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
    model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
    new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])
    y_train_doc, X_train_doc = get_vectors(new_model, train_doc2)
    y_test_doc, X_test_doc = get_vectors(new_model, test_doc2)
    output9_doc2vec_dbowdb_plain_lyrics = log_reg_and_svm(
        X_train_doc, y_train_doc, X_test_doc, y_test_doc)

    xxd11 = pd.DataFrame(output1_bow_plain_lyrics, index=['bow_plain_lyrics'])
    xxd11 = xxd11.append(pd.DataFrame(output3_tfidf_plain_lyrics, index=['tfidf_plain_lyrics']))
    xxd11 = xxd11.append(pd.DataFrame(output5_ngram_plain_lyrics, index=['ngram_plain_lyrics']))
    xxd11 = xxd11.append(pd.DataFrame(output7_doc2vec_dbow_plain_lyrics, index=['doc2vec_dbow_plain_lyrics']))
    xxd11 = xxd11.append(pd.DataFrame(output8_doc2vec_dm_plain_lyrics, index=['doc2vec_dm_plain_lyrics']))
    xxd11 = xxd11.append(pd.DataFrame(output9_doc2vec_dbowdb_plain_lyrics, index=['doc2vec_dbowdb_plain_lyrics']))
    return xxd11
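
# Hypothetical sketches of the tokenize_text and log_reg_and_svm helpers
# assumed by all_techniques above (the originals are not shown), using gensim
# and scikit-learn; the returned dict shape is an assumption that fits the
# pd.DataFrame(output, index=[...]) calls.
from gensim.utils import simple_preprocess
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score

def tokenize_text(text):
    return simple_preprocess(text)

def log_reg_and_svm(X_train, y_train, X_test, y_test):
    scores = {}
    for name, clf in [('logreg', LogisticRegression(max_iter=1000)),
                      ('svm', LinearSVC())]:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        scores[name + '_acc'] = accuracy_score(y_test, y_pred)
        scores[name + '_f1'] = f1_score(y_test, y_pred, average='weighted')
    return scores
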
def main():
    parser = argparse.ArgumentParser(description="")
    # Options
    parser.add_argument("-v", "--verbosity", action="count", default=0,
                        help="increase output verbosity")
    # Arguments
    parser.add_argument("input_file", help="The input file to be projected")
    # parser.add_argument("speech_feats_file", help="The input file to be projected")
    # parser.add_argument("out_path_file", help="The input file to be projected")
    args = parser.parse_args()
    transcription_data_file = args.input_file

    df_ = pd.read_csv(transcription_data_file, sep='|')
    df_.columns = ['utterance', 'text']
    df_.index = range(df_.shape[0])
    # df_['text'] = df_['text'].apply(nltk.word_tokenize)
    print(df_.head())

    train_tagged = df_.apply(lambda r: TaggedDocument(
        words=tokenize_text(r['text']), tags=r.utterance), axis=1)

    model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2,
                         sample=0, workers=cores)
    model_dbow.build_vocab(train_tagged)
    for epoch in range(30):
        model_dbow.train(utils.shuffle(train_tagged),
                         total_examples=len(train_tagged.values), epochs=1)
        model_dbow.alpha -= 0.002
        model_dbow.min_alpha = model_dbow.alpha

    n_dim = 300
    model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5,
                        min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)
    model_dmm.build_vocab(train_tagged)
    for epoch in range(30):
        model_dmm.train(utils.shuffle(train_tagged),
                        total_examples=len(train_tagged.values), epochs=1)
        model_dmm.alpha -= 0.002
        model_dmm.min_alpha = model_dmm.alpha

    model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
    model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

    from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
    new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

    # Get per-utterance vectors from the DM model (300 dims, matching n_dim below);
    # infer_vector expects a token list, so tokenize first
    x_doc2vec = OrderedDict()
    for utt, text in zip(df_['utterance'].to_list(), df_['text'].to_list()):
        x_doc2vec[utt] = model_dmm.infer_vector(tokenize_text(text))
    df_doc2vec = pd.DataFrame(x_doc2vec).T
    df_doc2vec.columns = ['doc2vec_{}'.format(str(i).zfill(3)) for i in range(n_dim)]
    df_doc2vec['utterance'] = df_doc2vec.index
    df_doc2vec.to_csv('output_doc2vec_features.csv', index=False)

    fname = get_tmpfile("my_doc2vec_model")
    model_dmm.save(fname)
    model = Doc2Vec.load(fname)  # you can continue training with the loaded model!
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

def get_vectors(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20))
                                for doc in sents])
    return targets, regressors

y_train, X_train = get_vectors(new_model, train_tagged)
y_test, X_test = get_vectors(new_model, test_tagged)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
def label_vect(alldocs):
    print 'Label2Vec with pre-classification'
    train_docs = [doc for doc in alldocs if doc.split == 'train']
    test_docs = [doc for doc in alldocs if doc.split == 'test']
    non_docs = [doc for doc in alldocs if doc.split == 'extra']
    print('%d docs: %d train-sentiment, %d test-sentiment'
          % (len(alldocs), len(train_docs), len(test_docs)))
    ylin = pre_class(train_docs, test_docs, non_docs)
    documents = []
    for doc in train_docs:
        sentence = TaggedDocument(doc.words, [str(doc.sentiment)])
        documents.append(sentence)
    i = 0
    for doc in test_docs + non_docs:
        sentence = TaggedDocument(doc.words, [str(ylin[i])])
        documents.append(sentence)
        i += 1
    print len(documents)
    cores = multiprocessing.cpu_count()
    simple_models = [
        # PV-DM w/ concatenation
        Doc2Vec(documents, dm=1, dm_concat=1, size=100, window=10, negative=5,
                hs=1, sample=1e-3, min_count=1, workers=cores),
        # PV-DBOW
        Doc2Vec(documents, dm=0, size=100, window=10, negative=5, hs=1,
                sample=1e-3, min_count=1, workers=cores),
        # PV-DM w/ average
        Doc2Vec(documents, dm=1, dm_mean=1, size=100, window=10, negative=5,
                hs=1, sample=1e-3, min_count=1, workers=cores),
    ]
    models_by_name = OrderedDict((str(model), model) for model in simple_models)
    models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])
    models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])
    for name, model in models_by_name.items():
        print name
        train_targets, train_regressors = zip(
            *[(doc.sentiment, model.infer_vector(doc.words)) for doc in train_docs])
        test_targets, test_regressors = zip(
            *[(doc.sentiment, model.infer_vector(doc.words)) for doc in test_docs])
        util.logit(train_regressors, train_targets, test_regressors, test_targets)
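
# Hypothetical sketch of the pre_class helper assumed above (not shown in the
# original): fit a linear classifier on bag-of-words features of the training
# docs and predict sentiment labels for the test and extra docs.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

def pre_class(train_docs, test_docs, non_docs):
    to_text = lambda docs: [' '.join(doc.words) for doc in docs]
    vect = TfidfVectorizer().fit(to_text(train_docs))
    clf = LogisticRegression()
    clf.fit(vect.transform(to_text(train_docs)),
            [doc.sentiment for doc in train_docs])
    return clf.predict(vect.transform(to_text(test_docs + non_docs)))
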
y_test, X_test = vector_for_learning(dmm_model, test_tagged)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
logreg.score(X_train, y_train)
logreg.score(X_test, y_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))

wdv_model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
dmm_model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

# Concatenated Model (Doc2Vec)
new_model = ConcatenatedDoc2Vec([wdv_model, dmm_model])

def get_vectors(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20))
                                for doc in sents])
    return targets, regressors

y_train, X_train = get_vectors(new_model, train_tagged)
y_test, X_test = get_vectors(new_model, test_tagged)

from imblearn.over_sampling import SMOTE
def concatenate_models(model1, model2):
    model = ConcatenatedDoc2Vec([model1, model2])
    return model
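
# A minimal, self-contained usage sketch (toy corpus, modern gensim API; real
# corpora would be much larger). The concatenated model's inferred vector has
# the sum of the two sub-models' dimensionalities.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec

docs = [TaggedDocument(['hello', 'world'], [0]),
        TaggedDocument(['goodbye', 'world'], [1])]
dbow = Doc2Vec(docs, dm=0, vector_size=10, min_count=1, epochs=5)
dm = Doc2Vec(docs, dm=1, vector_size=10, min_count=1, epochs=5)
combined = concatenate_models(dbow, dm)
print(len(combined.infer_vector(['hello', 'world'])))  # 20 = 10 + 10
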
    model.reset_from(simple_models[0])
    print(model)

# simple_models[0].intersect_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
# simple_models[1].intersect_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
# simple_models[2].intersect_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

models_by_name = OrderedDict((str(model), model) for model in simple_models)
# for model in simple_models:
#     model.syn0_lockf[:] = 1

#########################################################################################################
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec

models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])

##########################################################################################################
import numpy as np
from random import sample

def logistic_predictor_from_data(train_targets, train_regressors):
    classifier = LogisticRegression()
    classifier.fit(np.asarray(train_regressors), np.asarray(train_targets))
    return classifier
    Doc2Vec(dm=1, dm_mean=1, vector_size=200, window=8, min_count=10, epochs=50),
]

## Concatenated Model
## Train both PV-DBOW and PV-DM and combine the two
documents = train_corpus
models[0].build_vocab(documents)
models[1].reset_from(models[0])

for model in models:
    model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
new_model = ConcatenatedDoc2Vec((models[0], models[1]))

inferred_vector = model.infer_vector(train_corpus[0].words)
sims = model.docvecs.most_similar([inferred_vector])
print(sims)

:::{note}
A thread on how to use `most_similar()` with `ConcatenatedDoc2Vec`: [link](https://stackoverflow.com/questions/54186233/doc2vec-infer-most-similar-vector-from-concatenateddocvecs)
:::

# model 1
inferred_vector = new_model.models[0].infer_vector(train_corpus[0].words)
sims2 = new_model.models[0].docvecs.most_similar([inferred_vector])
print(sims2)

# model 2
inferred_vector = new_model.models[1].infer_vector(train_corpus[0].words)