def tagger(data, tag):
    train_tagged = data.apply(
        lambda r: TaggedDocument(words=tokenize_text(r['text']), tags=[tag]),
        axis=1)
    return train_tagged
Example #2
 def __iter__(self):
     for (id, sentence) in enumerate(self.sentences):
         yield TaggedDocument(sentence, tags=['SENT_%s' % str(id)])
Example #3
 def _transform(self, document):
     words = self._clean(document).split()
     tag = [self.k]
     return TaggedDocument(words, tag)
Example #4
 def __iter__(self):
     with open('../temp_data/news_dataset/processed_doc_texts.csv', 'r', encoding='latin-1') as f:
         for i, line in enumerate(f):
             if line:
                 if len(line)>5:
                     yield TaggedDocument(words=line.split(), tags=[str(i)])
Example #5
JS = json.dumps(SONG_DATA)
# Write the JSON to disk; mode 'a' creates the file if it does not exist,
# but note that re-running the script will append a second copy.
with open('SONG_DATA.json', 'a') as FP:
    FP.write(JS)

with open('data/SONG_DATA.json') as json_file:
    SONG_DATA = json.load(json_file)
    SONG_LYRICS = []

    for item in SONG_DATA:
        SONG_LYRICS.append(item['lyrics'])

    TAGGED_DATA = [
        TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
        for i, _d in enumerate(SONG_LYRICS)
    ]

    MAX_EPOCHS = 50
    VEC_SIZE = 20
    ALPHA = 0.025

    MODEL = Doc2Vec(vector_size=VEC_SIZE,  # 'size' in gensim < 4.0
                    alpha=ALPHA,
                    min_alpha=0.00025,
                    min_count=1,
                    dm=1)

    MODEL.build_vocab(TAGGED_DATA)
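    # The example is cut off here; a minimal sketch (an assumption, not the
    # original code) of how MAX_EPOCHS is typically used with the
    # manual-epoch training pattern:
    for epoch in range(MAX_EPOCHS):
        MODEL.train(TAGGED_DATA,
                    total_examples=MODEL.corpus_count,
                    epochs=MODEL.epochs)
        MODEL.alpha -= 0.0002          # decay the learning rate each pass
        MODEL.min_alpha = MODEL.alpha  # keep the floor at the new rate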
arrOutPseudocode = f1.read().split('\n')
f1.close()
f1 = open(fpCachedCode, 'r')
arrOutCode = f1.read().split('\n')
f1.close()
f1 = open(fpCachedAST, 'r')
arrOutAST = f1.read().split('\n')
f1.close()
f1 = open(fpCachedPOS, 'r')
arrOutPOS = f1.read().split('\n')
f1.close()

lstAllInputTexts = arrOutPseudocode + arrOutCode + arrOutAST + arrOutPOS
print('len all texts: {}'.format(len(lstAllInputTexts)))
tagged_data = [
    TaggedDocument(words=word_tokenize(_d), tags=[str(i)])
    for i, _d in enumerate(lstAllInputTexts)
]
max_epochs = 20
vec_size = 100
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,
                dm=0)

model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    # (loop body cut off in the original; a typical step trains one pass and
    # decays the learning rate)
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    model.alpha -= 0.0002
    model.min_alpha = model.alpha
Example #7
#coding: UTF-8
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import collections
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import matplotlib.pyplot as plt

with open(r'C:\Users\81903\OneDrive\デスクトップ\松本_WORK\novel_ana.txt',
          'r',
          encoding="utf-8_sig") as f2:
    # Split each document into words and collect them in a list.
    # Point: one entry per document -> [doc1, doc2, doc3, doc4, ...]
    trainings = [
        TaggedDocument(words=data.split(), tags=[i])
        for i, data in enumerate(f2)
    ]
# Run the training
# dm: 1 trains PV-DM, 0 trains PV-DBOW
# vector_size: dimensionality of the learned document vectors
# window: how many words are used to predict the next word (PV-DM),
#         or how many words are predicted from the document id (PV-DBOW)
# min_count: ignore words that appear fewer than this many times
# workers: number of training threads
#
m = Doc2Vec(documents=trainings,
            dm=1,
            vector_size=5,  # 'size' in gensim < 4.0
            window=5,
            min_count=3,
            workers=1)
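# A brief follow-up sketch (not in the original): read a learned document
# vector back out and infer one for new, pre-tokenized text (the tokens should
# come from the same tokenizer as the training file).
# `m.dv` is the gensim 4.x accessor (m.docvecs in gensim < 4.0).
print(m.dv[0])                        # vector of the first training document
print(m.infer_vector(['新しい', '文書']))  # vector for an unseen document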
Example #8
 def __tag_tweet(self, tweets):
     for i in range(len(tweets)):
         yield TaggedDocument(simple_preprocess(tweets[i]), [i])
def create_tagged_document(tags, words):
    # `use_tags` is a module-level flag; LabeledSentence is the older gensim
    # class that TaggedDocument replaced
    if use_tags:
        return TaggedDocument(words=words, tags=tags)
    else:
        return LabeledSentence(words=words, labels=tags)
Example #10
    tidyData = pd.concat([tidyData, chunk])

tidyData = pd.DataFrame(tidyData)

# More NLP to do a 2nd sentiment analysis and build doc2vec features (did not work very well, don't use)

newComments = tidyData

sid = SentimentIntensityAnalyzer()
newComments["sentiments"] = tidyData["comments"].apply(lambda x: sid.polarity_scores(x))
newComments = pd.concat([newComments.drop(['sentiments'], axis=1), newComments['sentiments'].apply(pd.Series)], axis=1)

tidyData = newComments
newNLP = tidyData.comments

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(newNLP)]
# try different vector sizes!
model = Doc2Vec(documents, vector_size=50, window=5, min_count=1, workers=4)

# model.docvecs[0]
newNLP2 = pd.DataFrame([model.docvecs[i] for i in range(len(newNLP))])
# newNLP2

# Clean the data more: active-user counts (remove the "k" suffix), fix the date/time format, remove NaNs

def removeKs(dfNum):
    # strip surrounding whitespace first, then any leading zeros
    dfNum = str(dfNum).lstrip()
    dfNum = dfNum.lstrip('0')
    if len(dfNum) == 0:
        return 0
Example #11
# In[Praktek no. 3]

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
## Example document (list of sentences)
doc = [
    "I love pdf", "I love u", "I love sleep", "This is a good mouse",
    "This is a good house", "This is a good pause"
]

# Tokenize each sentence (a simple lowercase whitespace split is assumed here)
tokenized_doc = [d.lower().split() for d in doc]
tokenized_doc

print(doc)
# In[]
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]
tagged_data
## Train doc2vec model
model = Doc2Vec(tagged_data,
                vector_size=20,
                window=2,
                min_count=1,
                workers=4,
                epochs=100)
# Save trained doc2vec model
model.save("test_doc2vec.model")
## Load saved doc2vec model
model = Doc2Vec.load("test_doc2vec.model")
## Print model vocabulary
model.wv.key_to_index  # the model vocabulary (model.wv.vocab in gensim < 4.0)
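# A hedged usage sketch (not in the original): infer a vector for a new
# sentence and look up the most similar training documents by tag.
new_vec = model.infer_vector("I love a good house".lower().split())
print(model.dv.most_similar([new_vec], topn=3))  # model.docvecs in gensim < 4.0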
Example #12
def list_to_tagdoc(ls):
    for i, line in enumerate(ls):
        yield TaggedDocument(line, [i])
Example #13
def feature_vecs_DOC_W2V(train_pos, train_neg, test_pos, test_neg):
    """
    Returns the feature vectors for all text in the train and test datasets.
    """
    # Load the pre-trained word2vec model
    word2vec_model = word2vec.Word2Vec.load(path_to_pretrained_w2v)

    # Doc2Vec requires TaggedDocument objects as input.
    # Turn the datasets from lists of words to lists of TaggedDocument objects.
    labeled_train_pos = [
        TaggedDocument(words, ["TRAIN_POS_" + str(i)])
        for i, words in enumerate(train_pos)
    ]
    labeled_train_neg = [
        TaggedDocument(words, ["TRAIN_NEG_" + str(i)])
        for i, words in enumerate(train_neg)
    ]
    labeled_test_pos = [
        TaggedDocument(words, ["TEST_POS_" + str(i)])
        for i, words in enumerate(test_pos)
    ]
    labeled_test_neg = [
        TaggedDocument(words, ["TEST_NEG_" + str(i)])
        for i, words in enumerate(test_neg)
    ]

    sentences = labeled_train_pos + labeled_train_neg + labeled_test_pos + labeled_test_neg

    # Use modified doc2vec codes for applying the pre-trained word2vec model
    model = doc2vec_modified.Doc2Vec(dm=0,
                                     dm_mean=1,
                                     alpha=0.025,
                                     min_alpha=0.0001,
                                     min_count=1,
                                     size=1000,
                                     hs=1,
                                     workers=4,
                                     train_words=False,
                                     train_lbls=True)
    model.reset_weights()

    # Copy wiki word2vec model into doc2vec model
    model.vocab = word2vec_model.vocab
    model.syn0 = word2vec_model.syn0
    model.syn1 = word2vec_model.syn1
    model.index2word = word2vec_model.index2word

    print("# of pre-trained vocab = " + str(len(model.vocab)))

    # Extract sentence labels for the training and test data
    train_pos_labels = [
        "TRAIN_POS_" + str(i) for i in range(len(labeled_train_pos))
    ]
    train_neg_labels = [
        "TRAIN_NEG_" + str(i) for i in range(len(labeled_train_neg))
    ]
    test_pos_labels = [
        "TEST_POS_" + str(i) for i in range(len(labeled_test_pos))
    ]
    test_neg_labels = [
        "TEST_NEG_" + str(i) for i in range(len(labeled_test_neg))
    ]

    sentence_labels = train_pos_labels + train_neg_labels + test_pos_labels + test_neg_labels

    # empty/zeros/vstack/isnan come from numpy; REAL is gensim's float32 dtype alias
    new_syn0 = empty((len(sentences), model.layer1_size), dtype=REAL)
    new_syn1 = empty((len(sentences), model.layer1_size), dtype=REAL)

    syn_index = 0

    # Initialize and add a vector of syn0 (i.e. input vector) and syn1 (i.e. output vector) for a vector of a label
    for label in sentence_labels:
        v = model.append_label_into_vocab(
            label)  # I made this function in the doc2vec code

        random.seed(
            uint32(model.hashfxn(model.index2word[v.index] + str(model.seed))))

        new_syn0[syn_index] = (random.rand(model.layer1_size) -
                               0.5) / model.layer1_size
        new_syn1[syn_index] = zeros((1, model.layer1_size), dtype=REAL)

        syn_index += 1

    model.syn0 = vstack([model.syn0, new_syn0])
    model.syn1 = vstack([model.syn1, new_syn1])

    model.precalc_sampling()

    # Train the model
    # This may take a bit to run
    for i in range(5):
        start_time = time.time()

        print("Training iteration %d" % (i))
        random.shuffle(sentences)
        model.train(sentences)

        print("Done - Training")
        print("--- %s minutes ---" % ((time.time() - start_time) / 60))
        start_time = time.time()

        # Convert "nan" values into "0" in vectors
        indices_nan = isnan(model.syn0)
        model.syn0[indices_nan] = 0.0

        indices_nan = isnan(model.syn1)
        model.syn1[indices_nan] = 0.0

        # Extract the feature vectors for the training and test data
        train_pos_vec = [
            model.syn0[model.vocab["TRAIN_POS_" + str(i)].index]
            for i in range(len(labeled_train_pos))
        ]
        train_neg_vec = [
            model.syn0[model.vocab["TRAIN_NEG_" + str(i)].index]
            for i in range(len(labeled_train_neg))
        ]
        test_pos_vec = [
            model.syn0[model.vocab["TEST_POS_" + str(i)].index]
            for i in range(len(labeled_test_pos))
        ]
        test_neg_vec = [
            model.syn0[model.vocab["TEST_NEG_" + str(i)].index]
            for i in range(len(labeled_test_neg))
        ]

        print("Done - Extracting the feature vectors")
        print("--- %s minutes ---" % ((time.time() - start_time) / 60))

    # Return the four feature vectors
    return train_pos_vec, train_neg_vec, test_pos_vec, test_neg_vec
def main():
    parser = argparse.ArgumentParser(description="")

    # Add options
    parser.add_argument("-v",
                        "--verbosity",
                        action="count",
                        default=0,
                        help="increase output verbosity")

    # Add arguments

    parser.add_argument("input_file", help="The input file to be projected")
    # parser.add_argument("speech_feats_file", help="The input file to be projected")
    # parser.add_argument("out_path_file", help="The input file to be projected")
    args = parser.parse_args()
    transcription_data_file = args.input_file
    df_ = pd.read_csv(transcription_data_file, sep='|')
    df_.columns = ['utterance', 'text']

    df_.index = range(df_.shape[0])

    print(df_.head())

    # df_['text']=df_['text'].apply(nltk.word_tokenize)
    print(df_.head())
    # tags must be a list; here each row's utterance id becomes the document tag
    train_tagged = df_.apply(lambda r: TaggedDocument(
        words=tokenize_text(r['text']), tags=[r.utterance]),
                             axis=1)

    # # print(X_clean.shape)

    model_dbow = Doc2Vec(dm=0,
                         vector_size=300,
                         negative=5,
                         hs=0,
                         min_count=2,
                         sample=0,
                         workers=cores)
    model_dbow.build_vocab(train_tagged)

    # %%time
    for epoch in range(30):
        model_dbow.train(utils.shuffle(train_tagged),
                         total_examples=len(train_tagged.values),
                         epochs=1)
        model_dbow.alpha -= 0.002
        model_dbow.min_alpha = model_dbow.alpha

    n_dim = 300

    model_dmm = Doc2Vec(dm=1,
                        dm_mean=1,
                        vector_size=300,
                        window=10,
                        negative=5,
                        min_count=1,
                        workers=cores,
                        alpha=0.065,
                        min_alpha=0.065)
    model_dmm.build_vocab(train_tagged)

    # %%time
    for epoch in range(30):
        model_dmm.train(utils.shuffle(train_tagged),
                        total_examples=len(train_tagged.values),
                        epochs=1)
        model_dmm.alpha -= 0.002
        model_dmm.min_alpha = model_dmm.alpha

    model_dbow.delete_temporary_training_data(keep_doctags_vectors=True,
                                              keep_inference=True)
    model_dmm.delete_temporary_training_data(keep_doctags_vectors=True,
                                             keep_inference=True)

    from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
    new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])
    #Get training set vectors from our models
    x_doc2vec = OrderedDict()
    for utt, text in zip(df_['utterance'].to_list(), df_['text'].to_list()):
        # infer_vector expects a token list; the 300-dimensional DM model is
        # assumed here so the vectors match n_dim
        x_doc2vec[utt] = model_dmm.infer_vector(tokenize_text(text))

    df_doc2vec = pd.DataFrame(x_doc2vec).T
    df_doc2vec.columns = [
        'doc2vec_{}'.format(str(i).zfill(3)) for i in range(n_dim)
    ]
    df_doc2vec['utterance'] = df_doc2vec.index
    df_doc2vec.to_csv('output_doc2vec_features.csv', index=False)
    fname = get_tmpfile("my_doc2vec_model")
    model_dmm.save(fname)  # assumption: persist the DM model
    model_dmm = Doc2Vec.load(
        fname)  # you can continue training with the loaded model!
def labelize_tweets_ug(tweets,label):
    result = []
    prefix = label
    for i, t in zip(tweets.index, tweets):
        result.append(TaggedDocument(t.split(), [prefix + '_%s' % i]))
    return result
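# A minimal usage sketch (the sample Series below is made up for illustration;
# pandas is assumed to be imported as pd):
sample_tweets = pd.Series(["good morning twitter", "worst service ever"])
print(labelize_tweets_ug(sample_tweets, 'TRAIN'))
# -> [TaggedDocument(words=['good', 'morning', 'twitter'], tags=['TRAIN_0']), ...]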
Example #16
    return questions, answers, topics


questions, answers, topics = load_data()

documents = []
all_topics = {}   # topic -> integer id (initialization assumed; not shown in the excerpt)
topic_count = {}  # topic -> number of questions seen
n = 0
for question, topic in zip(questions, topics):
    if topic not in all_topics:
        all_topics[topic] = len(all_topics) + 1
    if topic not in topic_count:
        topic_count[topic] = 1
    else:
        topic_count[topic] += 1

    documents.append(TaggedDocument(question, [n]))
    n += 1

# topics = enumerate(set(topics))
# print(list(topics))

# print(common_texts)

# documents = [TaggedDocument(doc, [i%3]) for i, doc in enumerate(common_texts)]
# %%

print('topics\n', all_topics)
print('topic counts')
for x in topic_count:
    print(x, topic_count[x])
Example #17
 def __iter__(self):
     df = pd.read_csv(self.fileName)
     text = df['text'].values
     for idx, doc in tqdm(enumerate(text)):
         doc = self.preprocess(doc)
         yield TaggedDocument(words=doc.split(), tags=[idx])
Example #18
    return Q, code
"""

soup = BeautifulSoup(s, "lxml")
for text in soup.find_all(text=True):
    if text.strip():
        print(text)

# NOTE: this loop never exits, so nothing below it in this script is reached
while (1):
    pass

Q, code = splitQuestion(s)

flatQ = '\n'.join(Q)
flatCode = '\n'.join(code)
# Load the training data
train_data = [flatQ]
#assert train_data != ...
train_corpus = [flatQ]

train_corpus = [
    TaggedDocument(preprocess(doc), [i]) for i, doc in enumerate(train_data)
]
# Build the model
model = Doc2Vec(vector_size=200)  # 'size' in gensim < 4.0
model.build_vocab(train_corpus)
# Train the model
model.train(train_corpus, total_examples=model.corpus_count, epochs=10)

print(model.infer_vector(preprocess("This is a true.")))
        (str) -> (dict)
    """

    file_path = os.path.join(os.pardir, "outFinal", path)
    relations_file = open(file_path, 'rb')
    relations = pickle.load(relations_file)
    relations_file.close()
    return relations

all_sentences = read_relation("all_sentences.pkl")

sentences = []
for key in all_sentences.keys():
    for d in all_sentences[key]:
        te = d[-1].replace("\n", "")

        sentences.append(te)


sentences = list(set(sentences))
final_sentences = [s.split(" ") for s in sentences]

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(final_sentences)]

model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)  # 'size' in gensim < 4.0

fname = os.path.join(os.pardir,"outnew","doc2vecModel")
model.save(fname)
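# Hedged follow-up sketch (not in the original): reload the saved model and
# query for the sentences most similar to the first one.
loaded = Doc2Vec.load(fname)
print(loaded.dv.most_similar([loaded.dv[0]], topn=5))  # loaded.docvecs in gensim < 4.0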


    fold_number = fold_number + 1

    training_corpus = build_corpus(train)
    train_labels = build_labels(train)
    test_corpus = build_corpus(test)
    test_labels = build_labels(test)

    dummy_clf.fit(training_corpus, train_labels)
    dummy_accuracies.append(dummy_clf.score(test_corpus, test_labels) * 100)

    #Assigning hyperpartisan (true or false) tags to each document.
    print "Creating the Tagged version of the training_corpus"
    tagged_data = []
    j = 0
    for i in tqdm(training_corpus):
        tagged_data.append(TaggedDocument(i.lower(), tags=[train_labels[j]]))
        j = j + 1

    #I'll make these command line args later
    vec_size = 100
    alpha = 0.025

    model = Doc2Vec(tagged_data,
                    vector_size=vec_size,
                    alpha=alpha,
                    min_alpha=0.00025,
                    min_count=5,
                    dm=1,
                    epochs=100)

    predictions = []
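    # Hedged sketch of one plausible continuation (the original is cut off
    # here): infer a vector for each test document and take the tag of its
    # nearest training document, i.e. the hyperpartisan label, as the prediction.
    for doc in test_corpus:
        inferred = model.infer_vector(doc.lower().split())
        best_tag, _ = model.dv.most_similar([inferred], topn=1)[0]  # model.docvecs in gensim < 4.0
        predictions.append(best_tag)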
Example #21
# -*- coding: utf-8 -*-

from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess as preprocess
from gensim.models import Doc2Vec

file = open('text/data_neologd_indention_notbyte_2_10.txt', 'r', encoding='utf-8')

trainings = [TaggedDocument(words=data.split(), tags=[i]) for i, data in enumerate(file)]

model = Doc2Vec(documents=trainings, vector_size=400, min_count=10, epochs=100)  # 'size'/'iter' in gensim < 4.0

model.save("model/doc2vec_2_10_iter100.model")
Example #22
def labelize(data, tag):
    # each document gets one tag like "TAG 0"; TaggedDocument expects tags as a list
    dataTag = [TaggedDocument(words=data[i], tags=['%s %s' % (tag, i)]) for i in range(len(data))]
    return dataTag
Example #23
# from gensim.test.utils import get_tmpfile

# embds = preprocess.Embeddings()
# embds.load()

path2data = '/Users/tomoki/NLP_data/sentiment-analysis-twitter/tweet-texts-segmented.txt'

texts = []
with open(path2data, 'r') as f:
    for line in f.readlines():
        tokens = line.split('\t')
        texts.append((tokens[-1].strip('\n').split(), int(tokens[1])))

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(texts)]

fname = 'twitter_doc2vec_model_win5_d100'
# model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)
# model = Doc2Vec(documents, window=5, min_count=3, workers=4, vector_size=100)

# fname = get_tmpfile("my_doc2vec_model")

# model.save(fname)
model = Doc2Vec.load(fname)


data = [[], []]
data_size = len(texts)
data[0], data[1] = texts[:-data_size // 10], texts[(data_size // 10) * 9:]
Example #24
 def __iter__(self):
     for idx, doc in enumerate(self.docs):
         yield TaggedDocument(doc.split(), [self.labels[idx]]) # clean doc
Example #25
    # jpype.attachThreadToJVM()
    token_doc = [
        '/'.join(word) for word in mecab.pos(doc) if word[1] in filter_mecab
    ]
    return token_doc


# Tokenize each sentence in the list

index_questions = []
for i in range(1,
               len(df2) + 1):  # df2 is indexed from 1, so iterate to len + 1 to cover every row
    index_questions.append([tokenize_mecab_noun(df2['질문'][i]), i])  # extract nouns only

# Convert to the TaggedDocument form that Doc2Vec uses
tagged_questions = [TaggedDocument(d, [int(c)]) for d, c in index_questions]

# Reference: https://cholol.tistory.com/469?category=803480

# Load the trained model

d2v_faqs = doc2vec.Doc2Vec.load(
    os.path.join(
        './model/d2v_faqs_size200_min5_epoch20_naver_physics_qna.model'))

# Accept successive questions, chatbot-style

while True:
    test_string = input("질문을 입력하세요: \n\t")

    tokened_test_string = tokenize_mecab_noun(test_string)
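    # Hedged sketch of the (truncated) lookup step: infer a vector for the
    # question's noun tokens and print the closest FAQ entries by tag.
    inferred = d2v_faqs.infer_vector(tokened_test_string)
    for faq_id, score in d2v_faqs.dv.most_similar([inferred], topn=3):  # .docvecs in gensim < 4.0
        print(faq_id, score)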
Example #26
# trainingX.head()
# trainingY.head()
# testingMerged = pd.concat([trainingX,trainingY], axis=1)
# testingMerged.head()


# In[18]:


from gensim.models.doc2vec import TaggedDocument
from gensim import utils

excerpts=[]
for index, row in trainingX['Text'].items():  # .iteritems() was removed in pandas 2.0
    concatText = " ".join(row)
    excerpts.append(TaggedDocument(utils.to_unicode(concatText).split(), ['Text' + '_%s' % str(index)]))

for index, row in testingX['Text'].items():  # .iteritems() was removed in pandas 2.0
    concatText = " ".join(row)
    excerpts.append(TaggedDocument(utils.to_unicode(concatText).split(), ['Text' + '_%s' % str(index)]))


# In[39]:


from gensim.models import Doc2Vec
import os

Text_INPUT_DIM=50
filename='preprocessedText50.d2v'
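# Hedged sketch (the original is cut off here): train a Doc2Vec model of
# dimensionality Text_INPUT_DIM on the tagged excerpts and cache it under
# `filename`, reloading the cache when it already exists. The variable name
# `text_model` and the epoch/min_count values are assumptions.
if os.path.isfile(filename):
    text_model = Doc2Vec.load(filename)
else:
    text_model = Doc2Vec(excerpts, vector_size=Text_INPUT_DIM,
                         window=5, min_count=2, workers=4, epochs=20)
    text_model.save(filename)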
Example #27
 def to_array(self):
     for (id, sentence) in enumerate(self.sentences):
         self.tagged_sentences.append(
             TaggedDocument(words=sentence, tags=['SENT_%s' % str(id)]))
     return self.tagged_sentences
Example #28
 def loadData(self):
     read = ReadXML()
     self.data = read.transformData()
     self.tagged_data = [
         TaggedDocument(words=word_tokenize(_d.lower(), language='french'),
                        tags=[str(i)])
         for i, _d in enumerate(self.data['_source']['content'])
     ]
# Divide into train, validation, and test sets
desc_train, desc_temp, desc_idx_train, desc_idx_temp = train_test_split(
    desc_token, range(total_len), test_size=0.20, random_state=0)
desc_val, desc_test, desc_idx_val, desc_idx_test = train_test_split(
    desc_temp, desc_idx_temp, test_size=0.5, random_state=0)

print(len(desc_train))
print(len(desc_val))
print(len(desc_test))

tagged_data = []
for idx, entry in zip(desc_idx_train, desc_train):
    if np.mod(idx, 500) == 0:  # progress indicator every 500 documents
        print(idx)
    tagged_data.append(TaggedDocument(entry, tags=[str(idx)]))

rw_mod_desc.columns
rw_mod_desc['Cate_attached']
rw_mod_desc['Description'][0]
rw_mod_desc['Desc_lemmatized'][0]
rw_mod_desc['Cate_attached'][0]
desc_train[0]

max_epochs = 50
vec_size = 25
alpha = 0.025
window_size = 2
num_workers = 4
minimun_count = 1
dm_select = 1  # 1: PV-DM; 0:PV-DBOW
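# Hedged sketch of how these hyperparameters would typically be wired into a
# model (the original is cut off before the training code); the variable name
# `d2v_model` is an assumption:
d2v_model = Doc2Vec(vector_size=vec_size,
                    alpha=alpha,
                    window=window_size,
                    workers=num_workers,
                    min_count=minimun_count,
                    dm=dm_select)
d2v_model.build_vocab(tagged_data)
d2v_model.train(tagged_data,
                total_examples=d2v_model.corpus_count,
                epochs=max_epochs)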
Example #30
 def __iter__(self):
     for idx, doc in enumerate(self.doc_list):
         yield TaggedDocument(doc, [self.labels_list[idx]])