def stem(CONFIGURATION, sents):
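    # Write one stemmed sentence per line to CONFIGURATION.rundir/w2v_training_material.csv,
    # then stream the file back, yielding one TaggedLineDocument record per line.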

    with open(CONFIGURATION.rundir + "w2v_training_material.csv",
              mode="w+",
              encoding="UTF-8") as f:
        for sent in sents:
            tmp = list()
            for expression in sent:
                if not 'http://' in expression:
                    expression_new = list()
                    for word in expression.split(' '):
                        expression_new.append(
                            ps.stem(re.sub('[^A-Za-z0-9<>]', '', word.lower())))
                    expression = expression_new
                else:
                    expression = [re.sub('[\r\n]', '', expression)]
                if not " ".join(expression) == '' and len(
                        " ".join(expression)) > 1:
                    tmp = tmp + expression
            if len(tmp) > 1:
                for x in tmp:
                    f.write(str(x) + " ")
                f.write("\n")
        #f.write("<> <>\n")

    from gensim.test.utils import datapath
    from gensim.models.doc2vec import TaggedLineDocument
    for document in TaggedLineDocument(
            datapath(CONFIGURATION.rundir + "w2v_training_material.csv")):
        yield document
def main():
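    # Parse the requested Doc2Vec variant (dbow / dm_mean / dm_sum / dm_concat),
    # train it on TRAIN_TEXTS_FILE and save the model to the given destination.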
    import argparse

    parser = argparse.ArgumentParser()
    models = ['dbow', 'dm_mean', 'dm_sum', 'dm_concat']
    parser.add_argument('model', choices=models)
    parser.add_argument('dest')
    parser.add_argument('--size', default=300, type=int)
    args = parser.parse_args()

    common_params = {
        'min_count': 14,
        'iter': 10,
        'workers': mp.cpu_count(),
        'size': args.size
    }

    if args.model == 'dbow':
        model = Doc2Vec(dm=0, **common_params)
    elif args.model == 'dm_concat':
        model = Doc2Vec(dm=1, dm_concat=1, **common_params)
    elif args.model == 'dm_mean':
        model = Doc2Vec(dm=1, dm_mean=1, **common_params)
    elif args.model == 'dm_sum':
        model = Doc2Vec(dm=1, dm_mean=0, **common_params)

    texts = TaggedLineDocument(TRAIN_TEXTS_FILE)

    model.build_vocab(texts)
    model.train(texts, epochs=model.iter, total_examples=model.corpus_count)

    model.save(args.dest)
Example #3
def trainDoc2Vector(splitedword_path, sentence_count, vector_dimension,
                    train_count):
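    # Train a PV-DBOW Doc2Vec model on the pre-tokenised corpus and dump one
    # document vector per line alongside the saved model.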
    logging.info("start doc2vector training data")
    sentences = TaggedLineDocument(splitedword_path)
    model = Doc2Vec(sentences,
                    size=vector_dimension,
                    dm=0,
                    window=8,
                    min_count=2,
                    workers=multiprocessing.cpu_count())

    for i in range(train_count):
        model.train(sentences,
                    total_examples=sentence_count,
                    epochs=model.iter)

    model.save('word2vec/PV-DBOW_doc2vec.model')
    # save vectors
    out = open('word2vec/PV-DBOW_doc2vec.vector', mode='w+', encoding='utf-8')
    for index in range(0, sentence_count, 1):
        docvec = model.docvecs[index]
        out.write(' '.join(str(f) for f in docvec) + "\n")
    out.close()
    # with open('', mode='w', encoding='utf-8') as f:
    #     f.write(model.docvecs)
    logging.info("end doc2vector")
def build_doc2vec_model(dataset, vec_length, save_folder, name="Movie"):
    """
    build doc2vec model
    """
    # use the helper function write_all_in_txt, which creates a txt file in the format required by the doc2vec model
    if name == "Movie":
        txt_file = make_doc2vec_inputfile(
            dataset=dataset, save_file="data/doc_2_vec/movie_d2v_input.txt")
    elif name == "Financial":
        txt_file = make_doc2vec_inputfile(
            dataset=dataset,
            save_file="data/doc_2_vec/financial_d2v_input.txt")

    doc = open(txt_file, "r", encoding="utf-8")
    documents = TaggedLineDocument(doc)

    model = gensim.models.Doc2Vec(documents,
                                  dm=0,
                                  dbow_words=0,
                                  size=vec_length,
                                  window=10,
                                  hs=0,
                                  negative=5,
                                  sample=1e-4,
                                  iter=20,
                                  min_count=10,
                                  workers=4,
                                  alpha=0.1)
    doc.close()
    model.save(fname_or_handle=save_folder)
    return model
Example #5
    def crawl_page(thread_name, page_url):
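        # Fetch the page, queue its outbound links, fit a small Doc2Vec model on the
        # downloaded HTML (written to temp.txt), and mark the URL as crawled.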
        if page_url not in Spider.crawled:
            print(thread_name + ' now crawling ' + page_url)
            print('Queue ' + str(len(Spider.queue)) + ' | Crawled  ' +
                  str(len(Spider.crawled)))
            Spider.add_links_to_queue(Spider.gather_links(page_url))
            Spider.queue.remove(page_url)

            #Building the Doc2vec Model
            f = urlopen(page_url)
            html = f.read()
            f = open('temp.txt', 'wb')
            f.write(html)
            f.close()
            html = TaggedLineDocument('temp.txt')
            model = Doc2Vec(html, size=100, window=8, min_count=5, workers=4)
            model.train(html, total_examples=100, epochs=5)
            #print model.docvecs[0]

            #saving data for building and testing the svm
            # if len(Spider.data_train)<50:
            #    Spider.data_train.add(model.docvecs[0])
            #else:
            #   Spider.data_test.add(model.docvecs[0])

            #set_to_file(Spider.data_train,'data_train.txt')
            #set_to_file(Spider.data_test,'data_test.txt')

            Spider.crawled.add(page_url)
            Spider.update_files()
Example #6
def file2model(df, ofile):
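    # Flatten each grouped record (id, title, abstract, keywords) into one lowercase
    # line, train Doc2Vec on the resulting file and return the model, vectors and ids.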
    print("columns:", df.columns.tolist())

    lines = []
    for cur_id, row in df.groupby('id_df'):
        title = row['title_df'].values[0].replace("\n", ' ').replace(
            "\r\n", ' ').replace("\r", ' ')
        abstract = row['abstract'].values[0].replace("\n", ' ').replace(
            "\r\n", ' ').replace("\r", ' ')
        keywords = row['keywords'].values[0]
        keywords = keywords.replace("\n", ' ') if isinstance(keywords, str) else ""
        line = " ".join(map(str, [cur_id, "\t", title, abstract, keywords]))
        lines.append(line.lower().replace(".", " "))

    open(ofile, 'w').writelines(l + "\n" for l in lines)

    sentences = TaggedLineDocument(ofile)
    model = Doc2Vec(sentences, size=100, window=300, min_count=10, workers=4)
    id_list = [s[0][0] for s in sentences]
    vect_list = []

    need_vect_list = True
    if need_vect_list:
        for s in sentences:
            vect_list.append(model.infer_vector(s.words))

    return model, vect_list, id_list
Example #7
    def fit(self,
            corpus,
            vector_size=300,
            window=10,
            min_count=1,
            dm=1,
            hs=0,
            negative=5,
            epochs=10,
            workers=8):
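        # A path to an existing file is streamed with TaggedLineDocument; any other
        # iterable is wrapped into TaggedDocument objects tagged by their position.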

        if isinstance(corpus, str) and Path(corpus).is_file():
            corpus = TaggedLineDocument(corpus)
        else:
            corpus = [
                TaggedDocument(line, [idx]) for idx, line in enumerate(corpus)
            ]

        model = Doc2Vec(documents=tqdm(corpus),
                        vector_size=vector_size,
                        window=window,
                        min_count=min_count,
                        dm=dm,
                        hs=hs,
                        negative=negative,
                        epochs=epochs,
                        workers=workers)
        return model
def train(args):
    documents = TaggedLineDocument(LYRIC)
    return Doc2Vec(documents,
                   size=args.size,
                   window=args.window,
                   min_count=args.min_count,
                   workers=args.workers,
                   dm=args.dm)
Example #9
def create_doc2vec_model_v2(train_d2v_data_file_path, vector_size, lexicon,
                            files_prefix, model_path):
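    # Train a Doc2Vec model on the prepared training file and persist it to model_path.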
    print('Training d2v model')
    docs = TaggedLineDocument(train_d2v_data_file_path)
    model = Doc2Vec(docs, size=vector_size, window=10, min_count=1, workers=8)
    model.save(model_path)

    return model
Example #10
def check_for_doc2vecmodel(doc2vec_fname, docs_fname, corpus, dictionary):
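    # Load a previously saved Doc2Vec model if it exists; otherwise train a new one
    # from docs_fname and save it for later runs.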
    try:
        doc2vec = models.Doc2Vec.load(doc2vec_fname)
    except IOError:
        print('Training Doc2Vec model, this may take a long time')
        documents = TaggedLineDocument(docs_fname)
        doc2vec = models.doc2vec.Doc2Vec(documents=documents, workers=4)
        doc2vec.save(doc2vec_fname)
    return doc2vec
Example #11
def pre_train():
    if os.path.exists('data/dataset/data.txt'):
        # documents = TaggedLineDocument('data/dataset/data.txt')
        print("data.txt 文件存在")
        pass
    else:
        data_txt()
    documents = TaggedLineDocument('data/dataset/data.txt')
    return documents
Example #12
def dataset():
    df = open('data/blog_notopic.txt', 'r', encoding='utf-8')
    blogs = TaggedLineDocument(df)

    # count how many x_train lines there are
    '''count=-1
    for count, line in enumerate(open('data/blog_notopic.txt', 'r',encoding='utf-8')):
        pass
    count += 1'''
    return blogs
Example #13
 def train(self, source_corpus_path, update=False):
     """
     Train an uninitialized model using corpus.
     Each line in the corpus should be words of a sentence separated by space.
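     For example, a single corpus line could be: "the quick brown fox jumps over the lazy dog"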
     
     :param source_corpus_path: Path to corpus.
     :param update: Update vocab.
     :return: Nothing.
     """
     documents = TaggedLineDocument(source_corpus_path)
     self.model.build_vocab(documents, update=update)
     self.model.train(documents, total_examples=self.model.corpus_count, epochs=self.model.iter)
def prepare_training_data(sentences, CONFIGURATION):
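    # Stem and clean every sentence, write one sentence per line to
    # w2v_training_material.csv, then read the file back as TaggedLineDocument records.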

    #sentences = stem(CONFIGURATION, sentences)
    ctr = 0
    with open(CONFIGURATION.rundir + "w2v_training_material.csv",
              mode="w+",
              encoding="UTF-8") as f:
        for sent in sentences:
            tmp = list()
            for expression in sent:
                if not 'http://' in expression:
                    expression_new = list()
                    for word in expression.split(' '):
                        word = (ps.stem(re.sub('[^A-Za-z0-9<>]', '',
                                               word.lower())))
                        #if len(word)>2:
                        #    words = #[word[i:i+3] for i in range(len(word)-3+1)]
                        #else:
                        words = [word]
                        expression_new = expression_new + words
                    expression = expression_new
                else:
                    expression = [re.sub('[\r\n]', '', expression)]
                #if not " ".join(expression) == '' and len(" ".join(expression))>1:
                tmp = tmp + expression
            if len(tmp) > 0:
                for x in tmp:
                    f.write(str(x) + " ")
                f.write("\n")
                ctr += 1
        #f.write("<> <>\n")

    from gensim.test.utils import datapath
    from gensim.models.doc2vec import TaggedLineDocument
    sentences = [
        document for document in TaggedLineDocument(
            datapath(CONFIGURATION.rundir + "w2v_training_material.csv"))
    ]

    #x = tuplize(sentences, CONFIGURATION)

    #x = eliminate_rare_and_frequent_terms(x)

    #documents = list()
    #for index, row in x.iterrows():
    #    documents.append([str(row[0])] + [str(row[1])])

    #documents = literalize(documents)

    documents = sentences

    return documents
def trainDoc2Vector(sentence_count, vector_dimension):
    # train and save the model
    sentences = TaggedLineDocument('sources/splited_words.txt')
    model = Doc2Vec(sentences, size=vector_dimension, window=8, min_count=2, workers=multiprocessing.cpu_count())
    model.train(sentences, total_examples=sentence_count, epochs=model.iter)
    model.save('result/doc2vec.model')
    # save vectors
    out = open('result/doc2vec.vector', mode='w+', encoding='utf-8')
    for index in range(0, sentence_count, 1):
        docvec = model.docvecs[index]
        out.write(' '.join(str(f) for f in docvec) + "\n")

    out.close()
def build_doc2vec_model():
    # Creating labeled sentences from training data
    sentences = TaggedLineDocument('bulk-total.txt')
    model = Doc2Vec(alpha=0.1,
                    size=30,
                    window=10,
                    min_count=5,
                    dm=0,
                    dbow_words=1,
                    iter=10)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=81863, epochs=10)
    model.save('../models/clpsych-30dim-large.d2v')
    def process(self):

        log.info("Commencing execution")

        tagged_docs = TaggedLineDocument(self.labeled_articles_file_path)

        log.info("Training Doc2Vec model")
        doc2vec_model = doc2vec_helper.init_model(tagged_docs)
        doc2vec_model.save(self.doc2vec_model_file_path)
        log.info("Learnt vocab from training set and saved doc2vec model")

        x_train = list()
        with open(self.labeled_articles_file_path) as training_set:
            for line in training_set:
                x_train.append(doc2vec_model.infer_vector(line.split()))

        y_train = [0] * self.samples_per_class_train
        y_train.extend([1] * self.samples_per_class_train)

        x_test = list()
        with open(self.articles_source_file_path) as test_set:
            for line in test_set:
                x_test.append(doc2vec_model.infer_vector(line.split()))

        y_true = [1] * self.samples_per_class_test
        y_true.extend([0] * self.samples_per_class_test)

        ml_model_logreg = scikit_ml_helper.train_logistic_reg_classifier(x_train, y_train)
        scikit_ml_helper.persist_model_to_disk(ml_model_logreg, self.ml_model_file_path)
        y_pred = ml_model_logreg.predict(x_test)
        log.info("Logistic Regression")
        log.info("Precision: " + str(metrics.precision_score(y_pred=y_pred, y_true=y_true)))
        log.info("Recall: " + str(metrics.recall_score(y_pred=y_pred, y_true=y_true)))
        log.info("Accuracy: " + str(metrics.accuracy_score(y_pred=y_pred, y_true=y_true)))

        ml_model_svm = scikit_ml_helper.train_svm_classifier(x_train, y_train)
        y_pred = ml_model_svm.predict(x_test)
        log.info("SVM")
        log.info("Precision: " + str(metrics.precision_score(y_pred=y_pred, y_true=y_true)))
        log.info("Recall: " + str(metrics.recall_score(y_pred=y_pred, y_true=y_true)))
        log.info("Accuracy: " + str(metrics.accuracy_score(y_pred=y_pred, y_true=y_true)))

        ml_model_nb = scikit_ml_helper.train_gnb_classifier(x_train, y_train)
        y_pred = ml_model_nb.predict(x_test)
        log.info("Naive Bayes")
        log.info("Precision: " + str(metrics.precision_score(y_pred=y_pred, y_true=y_true)))
        log.info("Recall: " + str(metrics.recall_score(y_pred=y_pred, y_true=y_true)))
        log.info("Accuracy: " + str(metrics.accuracy_score(y_pred=y_pred, y_true=y_true)))

        log.info("Completed execution")
Example #18
def calculate_and_save_word2vec_dict(words_list, files):
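    # Write the tokenised documents to dataset.txt, train Doc2Vec on it, save the
    # model, and record which source file each tagged line index corresponds to.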
    dataset_file = os.path.join(word2vec_taget_dir, 'dataset.txt')
    FileProcessor(dataset_file).file_write(
        'utf8', u''.join([words + u'\n' for words in words_list][:-1]))
    doces = TaggedLineDocument(dataset_file)

    doc2Vec_model = doc2vec.Doc2Vec(doces, size=200, window=10, workers=4)
    doc2Vec_model.train(doces,
                        total_examples=doc2Vec_model.corpus_count,
                        epochs=200)
    doc2Vec_model.save(os.path.join(word2vec_taget_dir, 'doc2vec_model.txt'))
    FileProcessor(os.path.join(word2vec_taget_dir, 'tagged_map.txt'))\
        .file_write('utf8', u''.join([u'{0} {1} \n'.format(index, value.decode('utf8')) for index, value in enumerate(files)]))

    return doc2Vec_model
Example #19
def train():
    tagged = TaggedLineDocument(filetgge)
    model = Doc2Vec(alpha=0.025,
                     min_alpha=0.025,
                     size=50,
                     window=5,
                     min_count=5,
                     workers=8)
    model.build_vocab(tagged)
    for i in range(10):
        model.train(tagged)
        model.alpha -= 0.0002  # decrease the learning rate
        model.min_alpha = model.alpha  # fix the learning rate, no decay

    # model.save_word2vec_format(filetgge+'.model')
    model.save(filetgge + '.model')
Example #20
def main():
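    # Tokenise the articles once, cache them one document per line, then train a
    # Doc2Vec model over the cached file and save it (skipped if the model already exists).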
    model_file = datahub.get_full_path("doc2vec.model")
    if not Path(model_file).exists():
        tokenized_file = datahub.get_full_path("articles1_token.txt")
        if not Path(tokenized_file).exists():
            print("data is not yet tokenized and saved. Doing now.")
            print("loading data")
            data_list = datahub.load_data(datahub.get_full_path("articles1.csv"))
            print("tokenizing data")
            tokenized_content_list = datahub.tokenize_content(data_list)
            print("saving tokenizing data in txt file")
            datahub.save_tagged_data(tokenized_content_list, tokenized_file)

        vector_size = 50
        epochs = 100
        print("training model")
        trained_model = v_model.train_model(TaggedLineDocument(tokenized_file), vector_size, epochs)
        print("saving model")
        v_model.save_model(trained_model, model_file)
        print("finish")
    else:
        print("{} already exists. Exiting.".format(model_file))
Example #21
    user_dict[line.split()[0]] = ''
u_f.close()

for line in f_b_t.readlines():
    if utils.key_in_dic(line.replace('\n', '').split('\t')[0], user_dict):
        p_f.write(line)
    else:
        pass
f_b_t.close()
p_f.close()

#
from gensim.models.doc2vec import TaggedLineDocument, Doc2Vec
user_tranj_vec = '../data/user_tranj_vec.txt'

documents = TaggedLineDocument(doc_file)
model = Doc2Vec(documents,
                size=128,
                negative=10,
                window=8,
                hs=0,
                min_count=0,
                workers=15,
                iter=30)

user_id_list = []
u_f = open(user_file)
for line in u_f:
    user_id_list.append(line.split('\n')[0])
u_f.close()
Example #22
# docLabels = ["input.txt"]
# data = []
# for doc in docLabels:
#     data.append(open(doc, 'r'))
#     print(data)
#
# it = LabeledLineSentence(data, [1])
# # print(it)= ["input.txt"]
# data = []
questions_path = "train_questionsq.txt"
answers_path = "train_answersq.txt"

questions = open(questions_path, 'r')
answers = open(answers_path, 'r')

doc = TaggedLineDocument("input.txt")

model = gensim.models.Doc2Vec(size=100,
                              window=10,
                              min_count=1,
                              workers=11,
                              alpha=0.025,
                              min_alpha=0.025)  # use fixed learning rate

model.build_vocab(doc)

model.iter = 300

model.train(doc, total_examples=model.corpus_count, epochs=model.iter)
#
# for epoch in range(10):
Example #23
import logging
import os.path
import sys
import multiprocessing

from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedLineDocument

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 4:
        print globals()['__doc__'] % locals()
        sys.exit(1)
    inp, outp1, outp2 = sys.argv[1:4]

    model = Doc2Vec(TaggedLineDocument(inp),
                    size=200,
                    window=5,
                    min_count=5,
                    workers=multiprocessing.cpu_count())

    # trim unneeded model memory = use(much) less RAM
    #model.init_sims(replace=True)
    model.save(outp1)  #save dov2vec
    model.save_word2vec_format(outp2, binary=False)  #save word2vec
Example #24
 def train(cls):
     model = Doc2Vec(documents=TaggedLineDocument(cls.corpus_path), vector_size=300, window=5, min_count=1, workers=4)
     model.save(config.model_path.format('d2v.model'))
Example #25
if not os.path.exists(base_output_path+'docs_artist_blocks.txt.gz'):
    with gzip.open(base_output_path+'docs_artist_blocks.txt.gz','w') as fout, gzip.open(base_output_path+'indices.txt.gz','w') as indices:
        files = sorted(glob.glob(scrobble_path+'*.txt'))
        for fi in tq(files):
            artists = [line.split('\t')[1] for line in open(fi)]
            last = None
            blocks = []
            for a in tq(artists):
                if a != last:
                    blocks.append(a)
                last = a
            doc = ' '.join(blocks)
            fout.write(doc+'\n')
            userid = fi[fi.rfind('\\')+1:-4]
            indices.write(userid+'\n')
documents = [doc for doc in tq(TaggedLineDocument(base_output_path+'docs_artist_blocks.txt.gz'))]



%time model = Doc2Vec(documents, size=dim, window=win, min_count=min_count,workers=workers)

dpath = 'P:/Projects/BigMusic/jared.data/d2v/artist_dict.pkl'
if not os.path.exists(dpath):
    artist_dict = {}
    for line in tq(open('P:/Projects/BigMusic/jared.rawdata/lastfm_itemlist.txt')):
        line = line.split('\t')
        if line[1]=='0':
            artist_dict[line[2]] = line[0]
    cPickle.dump(artist_dict,open(dpath,'wb'))
else:
    artist_dict = cPickle.load(open(dpath))
Example #26
#-*-coding:utf-8-*-
from __future__ import division
from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument
# import numpy.linalg
import numpy as np
import math
import scipy
from PIL import Image, ImageDraw
input_file = r"H:\network_diagnosis_data\test\GTPC_TUNNEL_PATH_BROKEN.3054.txt"
sentences = TaggedLineDocument(input_file)
dim = 1000
model = Doc2Vec(alpha=0.025, min_alpha=0.025, size=dim)  # default size is 300 dimensions
model.build_vocab(sentences)

for epoch in range(10):
    model.train(sentences)
    model.alpha -= 0.002
    model.min_alpha = model.alpha
model.save(r'.\data\test_d2v')
# print model.infer_vector([u'people', u'like', u'words'])

total_num = model.docvecs.count
# print total_num
# print len( model.docvecs[0] )
para_vec = []
for i in xrange(total_num):
    if i == 0:
        para_vec = model.docvecs[i]
        continue
    para_vec = np.vstack((para_vec, model.docvecs[i]))
print para_vec
Example #27
def d2ctest():
    documents = TaggedLineDocument("new_text2.txt")
    model = Doc2Vec(documents, size=10, window=2, min_count=1, workers=1)
    print(model)
Example #28
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedLineDocument
from time import localtime, strftime

# document for training
doc_path = 'dota_picks'
corpus = TaggedLineDocument(doc_path)

# Doc2Vec parameters; self explanatory
vector_size = 50
window_size = 5
min_count = 0
sampling_threshold = 1e-4
negative_size = 5
train_epoch = 100
dm = 0  #0 = dbow; 1 = dmpv
worker_count = 8  #number of parallel processes

model = Doc2Vec(size=vector_size,
                window=window_size,
                min_count=min_count,
                sample=sampling_threshold,
                workers=worker_count,
                hs=0,
                dm=dm,
                negative=negative_size,
                dbow_words=1,
                dm_concat=1)
print("Building Vocab:", strftime("%a, %d %b %Y, %H:%M:%S", localtime()))
model.build_vocab(corpus)
print("Built Vocab:", strftime("%a, %d %b %Y, %H:%M:%S", localtime()))
Example #29
import sys
from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument

file = sys.argv[1]
epochs = int(sys.argv[2])
words = sys.argv[3].split(' ')

steps = 50

docs = TaggedLineDocument(file)

model = Doc2Vec(docs, min_count=1, epochs=epochs)

docs_list = list(docs)

to_docstr = lambda x: ' '.join(docs_list[x].words)

print(f'--- similar : {to_docstr(0)} ---')

for i, p in model.docvecs.most_similar(0):
    print(f'{p}, {to_docstr(i)}')

print('')
print(f'--- similar : {words} ---')

x = model.infer_vector(words, steps=steps)

for tag, p in model.docvecs.most_similar([x]):
    print(f'{p}, {to_docstr(tag)}')
Example #30
import sys

import numpy as np

from gensim.models import Word2Vec
from gensim.models.doc2vec import TaggedLineDocument

from keras.layers import GRU, Dropout
from keras.models import Sequential, Model
from keras.optimizers import Adam

data_file = sys.argv[1]
dest_file_prefix = sys.argv[2]

epoch = int(sys.argv[3])
batch = int(sys.argv[4])

wv_size = 200
wv_epoch = 2000

num_unit = 512
input_size = (10, )

docs = TaggedLineDocument(data_file)

words = [d.words for d in docs]

wv_model = Word2Vec(words, wv_size, min_count=1, iter=wv_epoch)

input_size += (wv_model.vector_size, )

word_maxlen = np.max([len(w) for w in words])


def discriminator(input_shape):
    model = Sequential()

    model.add(GRU(num_unit, input_shape=input_shape))
    model.add(Dropout(0.3))