def tagger(data, tag):
    train_tagged = data.apply(
        lambda r: TaggedDocument(words=tokenize_text(r['text']), tags=[tag]),
        axis=1)
    return train_tagged
def __iter__(self):
    for (id, sentence) in enumerate(self.sentences):
        yield TaggedDocument(sentence, tags=['SENT_%s' % str(id)])
def _transform(self, document):
    words = self._clean(document).split()
    tag = [self.k]
    return TaggedDocument(words, tag)
def __iter__(self):
    with open('../temp_data/news_dataset/processed_doc_texts.csv', 'r', encoding='latin-1') as f:
        for i, line in enumerate(f):
            if line and len(line) > 5:
                yield TaggedDocument(words=line.split(), tags=[str(i)])
JS = json.dumps(SONG_DATA)
FP = open('SONG_DATA.json', 'a')  # open the JSON file; it is created if it does not exist
FP.write(JS)   # write to the JSON file
FP.close()     # close the file

with open('data/SONG_DATA.json') as json_file:
    SONG_DATA = json.load(json_file)

SONG_LYRICS = []
for item in SONG_DATA:
    SONG_LYRICS.append(item['lyrics'])

TAGGED_DATA = [
    TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
    for i, _d in enumerate(SONG_LYRICS)
]

MAX_EPOCHS = 50
VEC_SIZE = 20
ALPHA = 0.025

MODEL = Doc2Vec(size=VEC_SIZE, alpha=ALPHA, min_alpha=0.00025, min_count=1, dm=1)
MODEL.build_vocab(TAGGED_DATA)
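# Hedged sketch (not in the original snippet): the usual next step consumes the
# MAX_EPOCHS / ALPHA constants declared above with a manual training loop that
# decays the learning rate after each pass. Assumes the same pre-4.0 gensim API
# the snippet already uses.
for epoch in range(MAX_EPOCHS):
    MODEL.train(TAGGED_DATA, total_examples=MODEL.corpus_count, epochs=1)
    MODEL.alpha -= 0.0002            # decay the learning rate
    MODEL.min_alpha = MODEL.alpha    # no further decay within the next pass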
arrOutPseudocode = f1.read().split('\n')
f1.close()
f1 = open(fpCachedCode, 'r')
arrOutCode = f1.read().split('\n')
f1.close()
f1 = open(fpCachedAST, 'r')
arrOutAST = f1.read().split('\n')
f1.close()
f1 = open(fpCachedPOS, 'r')
arrOutPOS = f1.read().split('\n')
f1.close()

lstAllInputTexts = arrOutPseudocode + arrOutCode + arrOutAST + arrOutPOS
print('len all text {}'.format(len(lstAllInputTexts)))

tagged_data = [
    TaggedDocument(words=word_tokenize(_d), tags=[str(i)])
    for i, _d in enumerate(lstAllInputTexts)
]

max_epochs = 20
vec_size = 100
alpha = 0.025

model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm=0)
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
# coding: UTF-8
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import collections
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import matplotlib.pyplot as plt

with open(r'C:\Users\81903\OneDrive\デスクトップ\松本_WORK\novel_ana.txt', 'r', encoding="utf-8_sig") as f2:
    # Split each document into words and collect the results in a list.
    # Note: there is one entry per document: [doc1, doc2, doc3, doc4, ...]
    trainings = [
        TaggedDocument(words=data.split(), tags=[i])
        for i, data in enumerate(f2)
    ]

# Run the training
# dm: 1 trains PV-DM, 0 trains PV-DBOW
# vector_size: dimensionality of the document embeddings
# window: how many words are used to predict the next word (PV-DM),
#         or how many words are predicted from the document id (PV-DBOW)
# min_count: ignore words that appear fewer times than this
# workers: number of threads used for training
# m = Doc2Vec(documents=trainings, dm=1, size=5, window=5, min_count=3, workers=1)
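# Hedged sketch (not in the original snippet): enabling the commented-out call above
# and querying the trained model. Assumes the older gensim API this snippet uses
# (size=, docvecs); the example tokens are placeholders.
m = Doc2Vec(documents=trainings, dm=1, size=5, window=5, min_count=3, workers=1)
print(m.docvecs.most_similar(0, topn=3))        # documents most similar to document tag 0
print(m.infer_vector(['単語', 'の', 'リスト']))  # vector for an unseen, pre-tokenized document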
def __tag_tweet(self, tweets):
    for i in range(len(tweets)):
        yield TaggedDocument(simple_preprocess(tweets[i]), [i])
def create_tagged_document(tags, words):
    if use_tags:
        return TaggedDocument(words=words, tags=tags)
    else:
        return LabeledSentence(words=words, labels=tags)
tidyData = pd.concat([tidyData, chunk])
tidyData = pd.DataFrame(tidyData)

# More NLP to do a 2nd sentiment analysis and build doc2vec features
# (did not work very well, don't use)
newComments = tidyData
sid = SentimentIntensityAnalyzer()
newComments["sentiments"] = tidyData["comments"].apply(lambda x: sid.polarity_scores(x))
newComments = pd.concat([newComments.drop(['sentiments'], axis=1),
                         newComments['sentiments'].apply(pd.Series)], axis=1)
tidyData = newComments

newNLP = tidyData.comments
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(newNLP)]

# try different vector sizes!
model = Doc2Vec(documents, vector_size=50, window=5, min_count=1, workers=4)
# model.docvecs[0]
newNLP2 = pd.DataFrame([model.docvecs[i] for i in range(len(newNLP))])
# newNLP2

# Clean data more: strip the "k" suffix from active-user counts, fix the datetime format, remove NaNs
def removeKs(dfNum):
    dfNum = str(dfNum).lstrip('0')
    dfNum = str(dfNum).lstrip()
    if len(dfNum) == 0:
        return 0
# In[Practice no. 3]
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

## Example document (list of sentences)
doc = [
    "I love pdf",
    "I love u",
    "I love sleep",
    "This is a good mouse",
    "This is a good house",
    "This is a good pause"
]
tokenized_doc = [d.lower().split() for d in doc]
tokenized_doc
print(doc)

# In[]
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]
tagged_data

## Train the doc2vec model
model = Doc2Vec(tagged_data, vector_size=20, window=2, min_count=1, workers=4, epochs=100)

# Save the trained doc2vec model
model.save("test_doc2vec.model")

## Load the saved doc2vec model
model = Doc2Vec.load("test_doc2vec.model")

## Print the model vocabulary
model.wv.vocab
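# Hedged usage sketch (not in the original): infer a vector for an unseen sentence and
# look up the most similar training document. Assumes the gensim 3.x API already used
# above (model.wv.vocab / docvecs).
new_vec = model.infer_vector("I love paper".split())
print(model.docvecs.most_similar([new_vec], topn=3))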
def list_to_tagdoc(ls):
    for i, line in enumerate(ls):
        yield TaggedDocument(line, [i])
def feature_vecs_DOC_W2V(train_pos, train_neg, test_pos, test_neg):
    """
    Returns the feature vectors for all text in the train and test datasets.
    """
    # Load the pre-trained word2vec model
    word2vec_model = word2vec.Word2Vec.load(path_to_pretrained_w2v)

    # Doc2Vec requires TaggedDocument objects as input.
    # Turn the datasets from lists of words into lists of TaggedDocument objects.
    labeled_train_pos = [TaggedDocument(words, ["TRAIN_POS_" + str(i)])
                         for i, words in enumerate(train_pos)]
    labeled_train_neg = [TaggedDocument(words, ["TRAIN_NEG_" + str(i)])
                         for i, words in enumerate(train_neg)]
    labeled_test_pos = [TaggedDocument(words, ["TEST_POS_" + str(i)])
                        for i, words in enumerate(test_pos)]
    labeled_test_neg = [TaggedDocument(words, ["TEST_NEG_" + str(i)])
                        for i, words in enumerate(test_neg)]
    sentences = labeled_train_pos + labeled_train_neg + labeled_test_pos + labeled_test_neg

    # Use the modified doc2vec code to apply the pre-trained word2vec model
    model = doc2vec_modified.Doc2Vec(dm=0, dm_mean=1, alpha=0.025, min_alpha=0.0001,
                                     min_count=1, size=1000, hs=1, workers=4,
                                     train_words=False, train_lbls=True)
    model.reset_weights()

    # Copy the wiki word2vec model into the doc2vec model
    model.vocab = word2vec_model.vocab
    model.syn0 = word2vec_model.syn0
    model.syn1 = word2vec_model.syn1
    model.index2word = word2vec_model.index2word
    print("# of pre-trained vocab = " + str(len(model.vocab)))

    # Extract sentence labels for the training and test data
    train_pos_labels = ["TRAIN_POS_" + str(i) for i in range(len(labeled_train_pos))]
    train_neg_labels = ["TRAIN_NEG_" + str(i) for i in range(len(labeled_train_neg))]
    test_pos_labels = ["TEST_POS_" + str(i) for i in range(len(labeled_test_pos))]
    test_neg_labels = ["TEST_NEG_" + str(i) for i in range(len(labeled_test_neg))]
    sentence_labels = train_pos_labels + train_neg_labels + test_pos_labels + test_neg_labels

    new_syn0 = empty((len(sentences), model.layer1_size), dtype=REAL)
    new_syn1 = empty((len(sentences), model.layer1_size), dtype=REAL)
    syn_index = 0

    # Initialize and append a syn0 (input) vector and a syn1 (output) vector for each label
    for label in sentence_labels:
        v = model.append_label_into_vocab(label)  # helper function added to the doc2vec code
        random.seed(uint32(model.hashfxn(model.index2word[v.index] + str(model.seed))))
        new_syn0[syn_index] = (random.rand(model.layer1_size) - 0.5) / model.layer1_size
        new_syn1[syn_index] = zeros((1, model.layer1_size), dtype=REAL)
        syn_index += 1

    model.syn0 = vstack([model.syn0, new_syn0])
    model.syn1 = vstack([model.syn1, new_syn1])
    model.precalc_sampling()

    # Train the model
    # This may take a while to run
    for i in range(5):
        start_time = time.time()
        print("Training iteration %d" % (i))
        random.shuffle(sentences)
        model.train(sentences)
        print("Done - Training")
        print("--- %s minutes ---" % ((time.time() - start_time) / 60))

    start_time = time.time()

    # Convert "nan" values into "0" in the vectors
    indices_nan = isnan(model.syn0)
    model.syn0[indices_nan] = 0.0
    indices_nan = isnan(model.syn1)
    model.syn1[indices_nan] = 0.0

    # Extract the feature vectors for the training and test data
    train_pos_vec = [model.syn0[model.vocab["TRAIN_POS_" + str(i)].index]
                     for i in range(len(labeled_train_pos))]
    train_neg_vec = [model.syn0[model.vocab["TRAIN_NEG_" + str(i)].index]
                     for i in range(len(labeled_train_neg))]
    test_pos_vec = [model.syn0[model.vocab["TEST_POS_" + str(i)].index]
                    for i in range(len(labeled_test_pos))]
    test_neg_vec = [model.syn0[model.vocab["TEST_NEG_" + str(i)].index]
                    for i in range(len(labeled_test_neg))]
    print("Done - Extracting the feature vectors")
    print("--- %s minutes ---" % ((time.time() - start_time) / 60))

    # Return the four feature vectors
    return train_pos_vec, train_neg_vec, test_pos_vec, test_neg_vec
def main():
    parser = argparse.ArgumentParser(description="")
    # Add options
    parser.add_argument("-v", "--verbosity", action="count", default=0,
                        help="increase output verbosity")
    # Add arguments
    parser.add_argument("input_file", help="The input file to be projected")
    # parser.add_argument("speech_feats_file", help="The input file to be projected")
    # parser.add_argument("out_path_file", help="The input file to be projected")
    args = parser.parse_args()

    transcription_data_file = args.input_file
    df_ = pd.read_csv(transcription_data_file, sep='|')
    df_.columns = ['utterance', 'text']
    df_.index = range(df_.shape[0])
    print(df_.head())
    # df_['text'] = df_['text'].apply(nltk.word_tokenize)
    print(df_.head())

    train_tagged = df_.apply(lambda r: TaggedDocument(
        words=tokenize_text(r['text']), tags=[r.utterance]), axis=1)
    # print(unsup_reviews.head())
    # print(X_clean.shape)

    model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2,
                         sample=0, workers=cores)
    model_dbow.build_vocab(train_tagged)
    # %%time
    for epoch in range(30):
        model_dbow.train(utils.shuffle(train_tagged),
                         total_examples=len(train_tagged.values), epochs=1)
        model_dbow.alpha -= 0.002
        model_dbow.min_alpha = model_dbow.alpha

    n_dim = 300
    model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5,
                        min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)
    model_dmm.build_vocab(train_tagged)
    # %%time
    for epoch in range(30):
        model_dmm.train(utils.shuffle(train_tagged),
                        total_examples=len(train_tagged.values), epochs=1)
        model_dmm.alpha -= 0.002
        model_dmm.min_alpha = model_dmm.alpha

    model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
    model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

    from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
    new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

    # Get training set vectors from our models
    x_doc2vec = OrderedDict()
    for utt, text in zip(df_['utterance'].to_list(), df_['text'].to_list()):
        tokens = model_dmm.infer_vector(text)
        x_doc2vec[utt] = tokens
    df_doc2vec = pd.DataFrame(x_doc2vec).T
    df_doc2vec.columns = ['doc2vec_{}'.format(str(i).zfill(3)) for i in range(n_dim)]
    df_doc2vec['utterance'] = df_doc2vec.index
    df_doc2vec.to_csv('output_doc2vec_features.csv', index=False)

    fname = get_tmpfile("my_doc2vec_model")
    model_dmm.save(fname)
    model = Doc2Vec.load(fname)  # you can continue training with the loaded model!
def labelize_tweets_ug(tweets, label):
    result = []
    prefix = label
    for i, t in zip(tweets.index, tweets):
        result.append(TaggedDocument(t.split(), [prefix + '_%s' % i]))
    return result
    return questions, answers, topics

questions, answers, topics = load_data()

documents = []
n = 0
for question, topic in zip(questions, topics):
    if topic not in all_topics:
        all_topics[topic] = len(all_topics) + 1
    if topic not in topic_count:
        topic_count[topic] = 1
    else:
        topic_count[topic] += 1
    documents.append(TaggedDocument(question, [n]))
    n += 1

# topics = enumerate(set(topics))
# print(list(topics))
# print(common_texts)
# documents = [TaggedDocument(doc, [i%3]) for i, doc in enumerate(common_texts)]

# %%
print('topics\n', all_topics)
print('topic counts')
for x in topic_count:
    print(x, topic_count[x])
def __iter__(self):
    df = pd.read_csv(self.fileName)
    text = df['text'].values
    for idx, doc in tqdm(enumerate(text)):
        doc = self.preprocess(doc)
        yield TaggedDocument(words=doc.split(), tags=[idx])
    return Q, code

"""
soup = BeautifulSoup(s, "lxml")
for text in soup.find_all(text=True):
    if text.strip():
        print(text)
while (1):
    pass
"""

Q, code = splitQuestion(s)
flatQ = '\n'.join(Q)
flatCode = '\n'.join(code)

# Load the training data
train_data = [flatQ]
# assert train_data != ...
train_corpus = [flatQ]
train_corpus = [
    TaggedDocument(preprocess(doc), [i])
    for i, doc in enumerate(train_data)
]

# Build the model
model = Doc2Vec(size=200)
model.build_vocab(train_corpus)

# Train
model.train(train_corpus, total_examples=model.corpus_count, epochs=10)

print(model.infer_vector(preprocess("This is a true.")))
    (str) -> (dict)
    """
    file_path = os.path.join(os.pardir, "outFinal", path)
    relations_file = open(file_path, 'rb')
    relations = pickle.load(relations_file)
    relations_file.close()
    return relations

all_sentences = read_relation("all_sentences.pkl")
sentences = []
for key in all_sentences.keys():
    for d in all_sentences[key]:
        te = d[-1].replace("\n", "")
        sentences.append(te)
sentences = list(set(sentences))
final_sentences = [s.split(" ") for s in sentences]

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(final_sentences)]
model = Doc2Vec(documents, size=5, window=2, min_count=1, workers=4)
fname = os.path.join(os.pardir, "outnew", "doc2vecModel")
model.save(fname)
fold_number = fold_number + 1
training_corpus = build_corpus(train)
train_labels = build_labels(train)
test_corpus = build_corpus(test)
test_labels = build_labels(test)

dummy_clf.fit(training_corpus, train_labels)
dummy_accuracies.append(dummy_clf.score(test_corpus, test_labels) * 100)

# Assign hyperpartisan (true or false) tags to each document.
print("Creating the tagged version of the training_corpus")
tagged_data = []
j = 0
for i in tqdm(training_corpus):
    tagged_data.append(TaggedDocument(i.lower(), tags=[train_labels[j]]))
    j = j + 1

# I'll make these command-line args later
vec_size = 100
alpha = 0.025
model = Doc2Vec(tagged_data, vector_size=vec_size, alpha=alpha, min_alpha=0.00025,
                min_count=5, dm=1, epochs=100)

predictions = []
# -*- coding: utf-8 -*-
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess as preprocess
from gensim.models import Doc2Vec

file = open('text/data_neologd_indention_notbyte_2_10.txt', 'r', encoding='utf-8')
trainings = [TaggedDocument(words=data.split(), tags=[i]) for i, data in enumerate(file)]

model = Doc2Vec(documents=trainings, size=400, min_count=10, iter=100)
model.save("model/doc2vec_2_10_iter100.model")
def labelize(data, tag):
    dataTag = [TaggedDocument(words=data[i], tags=['%s %s' % (tag, i)])
               for i in range(len(data))]
    return dataTag
# from gensim.test.utils import get_tmpfile
# embds = preprocess.Embeddings()
# embds.load()

path2data = '/Users/tomoki/NLP_data/sentiment-analysis-twitter/tweet-texts-segmented.txt'
texts = []
with open(path2data, 'r') as f:
    for line in f.readlines():
        tokens = line.split('\t')
        texts.append((tokens[-1].strip('\n').split(), int(tokens[1])))

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(texts)]
fname = 'twitter_doc2vec_model_win5_d100'
# model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)
# model = Doc2Vec(documents, window=5, min_count=3, workers=4, vector_size=100)
# fname = get_tmpfile("my_doc2vec_model")
# model.save(fname)
model = Doc2Vec.load(fname)

data = [[], []]
data_size = len(texts)
data[0], data[1] = texts[:-data_size // 10], texts[(data_size // 10) * 9:]
def __iter__(self):
    for idx, doc in enumerate(self.docs):
        yield TaggedDocument(doc.split(), [self.labels[idx]])  # clean doc
    # jpype.attachThreadToJVM()
    token_doc = [
        '/'.join(word) for word in mecab.pos(doc) if word[1] in filter_mecab
    ]
    return token_doc

# Tokenize each sentence in the list
index_questions = []
for i in range(1, len(df2) + 1):
    # df2 is indexed from 1, so iterate up to len+1 to cover every row.
    index_questions.append([tokenize_mecab_noun(df2['질문'][i]), i])  # extract nouns only

# Convert to the tagged-document format used by Doc2Vec
tagged_questions = [TaggedDocument(d, [int(c)]) for d, c in index_questions]
# Reference: https://cholol.tistory.com/469?category=803480

# Load the model
d2v_faqs = doc2vec.Doc2Vec.load(
    os.path.join('./model/d2v_faqs_size200_min5_epoch20_naver_physics_qna.model'))

# Take consecutive questions, chatbot-style
while True:
    test_string = input("질문을 입력하세요: \n\t")
    tokened_test_string = tokenize_mecab_noun(test_string)
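    # Hedged sketch (not in the original snippet): the typical retrieval step for a
    # Doc2Vec FAQ bot is to infer a vector for the tokenized question and look up the
    # nearest tagged document. Names follow the snippet above; `docvecs` assumes a
    # pre-4.0 gensim API.
    inferred = d2v_faqs.infer_vector(tokened_test_string)
    best_tag, score = d2v_faqs.docvecs.most_similar([inferred], topn=1)[0]
    print(df2['질문'][best_tag], score)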
# trainingX.head()
# trainingY.head()
# testingMerged = pd.concat([trainingX, trainingY], axis=1)
# testingMerged.head()

# In[18]:
from gensim.models.doc2vec import TaggedDocument
from gensim import utils

excerpts = []
for index, row in trainingX['Text'].iteritems():
    concatText = " ".join(row)
    excerpts.append(TaggedDocument(utils.to_unicode(concatText).split(),
                                   ['Text' + '_%s' % str(index)]))
for index, row in testingX['Text'].iteritems():
    concatText = " ".join(row)
    excerpts.append(TaggedDocument(utils.to_unicode(concatText).split(),
                                   ['Text' + '_%s' % str(index)]))

# In[39]:
from gensim.models import Doc2Vec
import os

Text_INPUT_DIM = 50
filename = 'preprocessedText50.d2v'
def to_array(self):
    for (id, sentence) in enumerate(self.sentences):
        self.tagged_sentences.append(
            TaggedDocument(words=sentence, tags=['SENT_%s' % str(id)]))
    return self.tagged_sentences
def loadData(self):
    read = ReadXML()
    self.data = read.transformData()
    self.tagged_data = [
        TaggedDocument(words=word_tokenize(_d.lower(), language='french'), tags=[str(i)])
        for i, _d in enumerate(self.data['_source']['content'])
    ]
# Divide into train, validation, and test sets
desc_train, desc_temp, desc_idx_train, desc_idx_temp = train_test_split(
    desc_token, range(total_len), test_size=0.20, random_state=0)
desc_val, desc_test, desc_idx_val, desc_idx_test = train_test_split(
    desc_temp, desc_idx_temp, test_size=0.5, random_state=0)
print(len(desc_train))
print(len(desc_val))
print(len(desc_test))

tagged_data = []
for idx, entry in zip(desc_idx_train, desc_train):
    print(idx)
    if np.mod(idx, 500) == 0:
        print(idx)
    tagged_data.append(TaggedDocument(entry, tags=[str(idx)]))

rw_mod_desc.columns
rw_mod_desc['Cate_attached']
rw_mod_desc['Description'][0]
rw_mod_desc['Desc_lemmatized'][0]
rw_mod_desc['Cate_attached'][0]
desc_train[0]

max_epochs = 50
vec_size = 25
alpha = 0.025
window_size = 2
num_workers = 4
minimun_count = 1
dm_select = 1  # 1: PV-DM; 0: PV-DBOW
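# Hedged sketch (not in the original snippet): wiring the hyperparameters declared above
# into a Doc2Vec model. The epoch loop is the common manual learning-rate-decay pattern,
# not necessarily what the original author used.
model = Doc2Vec(vector_size=vec_size, window=window_size, alpha=alpha,
                min_alpha=0.00025, min_count=minimun_count, dm=dm_select,
                workers=num_workers)
model.build_vocab(tagged_data)
for epoch in range(max_epochs):
    model.train(tagged_data, total_examples=model.corpus_count, epochs=1)
    model.alpha -= 0.0002
    model.min_alpha = model.alpha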
def __iter__(self):
    for idx, doc in enumerate(self.doc_list):
        yield TaggedDocument(doc, [self.labels_list[idx]])