Code Example #1
 def label(self):
     for i in range(len(self.sents) // 2):
         self.labelledSents.append(
             doc2vec.LabeledSentence(words=self.sents[2 * i].split(),
                                     tags=['title_%s' % self.articleId[i]]))
         self.labelledSents.append(
             doc2vec.LabeledSentence(
                 words=self.sents[2 * i + 1].split(),
                 tags=['question_%s' % self.articleId[i]]))
Code Example #2
def iter_docs_queries():
    df = pd.read_csv('./queries_norm.tsv', sep='\t', header=None, index_col=0)
    for idx, row in tqdm(df.iterrows()):
        yield doc2vec.LabeledSentence(str(row[1]).split(), ['QUERY_%d' % idx])

    for filename in tqdm(os.listdir('./docs/')):
        path = os.path.join('./docs/', filename)
        df = pd.read_csv(path, sep='\t', index_col=0, header=None)
        for idx, row in df.iterrows():
            yield doc2vec.LabeledSentence((str(row[1]) + ' ' + str(row[2])).split(), ['DOC_%d' % idx])
Code Example #3
 def label(self):
     for i in range(len(self.sents) // 3):
         self.labelledSents.append(
             doc2vec.LabeledSentence(
                 words=self.sents[3 * i].split(),
                 tags=['postText_%s' % self.articleId[i]]))
         self.labelledSents.append(
             doc2vec.LabeledSentence(
                 words=self.sents[3 * i + 1].split(),
                 tags=['targetTitle_%s' % self.articleId[i]]))
         self.labelledSents.append(
             doc2vec.LabeledSentence(
                 words=self.sents[3 * i + 2].split(),
                 tags=['targetDescription_%s' % self.articleId[i]]))
Code Example #4
File: doc_to_vec.py Project: yejiachen/nlp_2018
def train_doc2vec(vec_size, min_count_of_each_word, window_size, n_epoch):
    # note: the function must not be named doc2vec, or it would shadow the
    # gensim doc2vec module referenced below

    # load 'article_cutted'
    with open('article_cutted', 'rb') as file:
        data = pickle.load(file)

    # create a document id map
    sentence_list = []
    for i, l in enumerate(data):
        sentence_list.append(doc2vec.LabeledSentence(words=l, tags=[str(i)]))

    # define doc2vec model
    model = Doc2Vec(size=vec_size, min_count=min_count_of_each_word, window=window_size)
    # build vocabulary
    model.build_vocab(sentence_list)

    # train doc2vec model; shuffle data every epoch
    for i in range(n_epoch):
        random.shuffle(sentence_list)
        model.train(sentence_list, total_examples=len(data), epochs=1)

    # print the learned vector for document id '0'
    print(model.docvecs['0'])
    # save result
    model.save('word2vec_model/doc2vec.wv.syn0.npy')
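For reference, a hypothetical call to the function above might look like the line below; the hyperparameter values are purely illustrative, and an 'article_cutted' pickle is assumed to exist in the working directory:

# hypothetical usage of the training function above (illustrative values only)
train_doc2vec(vec_size=100, min_count_of_each_word=5, window_size=5, n_epoch=10)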
Code Example #5
def label_sentences(corpus, label_type):

    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.LabeledSentence(v.split(), [label]))
    return labeled
Code Example #6
File: nlp.py Project: Sandy4321/Word2Vec-Doc2Vec
def doc_to_labeled_sentences(doc,
                             tokenizer,
                             sent_num_start,
                             remove_stopwords=False):
    # Function to split a document into parsed sentences. Returns a list of
    # LabeledSentence objects plus the next sentence number to use

    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(doc.strip())

    # 2. Loop over each sentence
    sentences = []
    sent_num = sent_num_start
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call doc_to_wordlist to get a list of words
            words = doc_to_wordlist(raw_sentence, remove_stopwords)
            labeled_sentence = doc2vec.LabeledSentence(
                words=words, labels=['SENT_%s' % sent_num])
            sentences.append(labeled_sentence)
            sent_num = sent_num + 1

    # Return the list of labeled sentences together with the updated
    # sentence counter
    return (sentences, sent_num)
Code Example #7
 def __iter__(self):
     for source, prefix in self.sources.items():
         with utils.smart_open(source) as fin:
             for item_no, line in enumerate(fin):
                 yield d2v.LabeledSentence(
                     utils.to_unicode(line).split(),
                     [prefix + '_%s' % item_no])
Code Example #8
File: d2v_model.py Project: nvohra1/code
 def label_text(corpus, label_type):
     func_name = sys._getframe().f_code.co_name
     logging.info("d2vModel :: " + str(func_name))
     labeled = []
     for i, v in enumerate(corpus):
         label = label_type + '_' + str(i)
         labeled.append(doc2vec.LabeledSentence(v.split(), [label]))
     return labeled
Code Example #9
File: doc2vec_embedding.py Project: zhenv5/PyStack
def get_sentenses(posts, type_prefix):
    sentences = []
    #analyzedDocument = namedtuple('AnalyzedDocument', 'ID')
    for k, v in posts.iteritems():
        words = v.lower().split()
        tags = ["".join((type_prefix, str(k)))]
        sentences.append(doc2vec.LabeledSentence(words=words, tags=tags))
    #print sentences
    return sentences
Code Example #10
 def __iter__(self):
     for content, (page_id, title) in self.wiki.get_texts():
         yield doc2vec.LabeledSentence(
             # 1. for each element c in content,
             # 2. convert it to Simplified Chinese and segment it with jieba,
             # 3. collect the resulting tokens into the words list
             words=[w for c in content
                    for w in jieba.cut(Converter('zh-hans').convert(c))],
             tags=[title])
Code Example #11
 def __iter__(self):
     for content, (page_id, title) in self.wiki.get_texts():
         yield doc2vec.LabeledSentence(
             # 1. for each element c in content,
             # 2. convert it to Simplified Chinese and segment it with jieba,
             # 3. collect the resulting tokens into the words list
             words=[w for c in content
                    for w in jieba.cut(HanziConv.toSimplified(c))],
             tags=[title])
Code Example #12
 def to_array(self):
     self.sentences = []
     for source, prefix in self.sources.items():
         with utils.smart_open(source) as fin:
             for item_no, line in enumerate(fin):
                 self.sentences.append(
                     d2v.LabeledSentence(
                         utils.to_unicode(line).split(),
                         [prefix + '_%s' % item_no]))
     return self.sentences
Code Example #13
File: d2v.py Project: tunnelshade/nightfury
 def __init__(self):
     sens = self._unpickle_doc()
     if sens:
         data = [
             doc2vec.LabeledSentence(words=words, tags=["SENT_%d" % i])
             for i, words in enumerate(sens)
         ]
         self._d2v = Doc2Vec(data, size=config.STATE_D2V_DIM, min_count=1)
     else:
         self._d2v = Doc2Vec(size=config.STATE_D2V_DIM, min_count=1)
Code Example #14
File: agent.py Project: sampr0/nightfury
 def _load_d2v(self):
     sens = self._unpickle_doc()
     if sens:
         data = [
             doc2vec.LabeledSentence(words=words, tags=["SENT_%d" % i])
             for i, words in enumerate(sens)
         ]
         self._d2v = Doc2Vec(data, size=2, min_count=1)
     else:
         self._d2v = Doc2Vec(size=2, min_count=1)
Code Example #15
File: doc2vec.py Project: timothytiet/projectlhp
    def initParagraph(self, sentences):
        count = -1
        sen = []
        for sentence in sentences:
            count += 1
            temp = doc2vec.LabeledSentence(words=sentence,
                                           tags=['id' + str(count)])
            sen.append(temp)

        return sen
Code Example #16
File: model.py Project: nirupam1sharma/doc2vec
def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the LabeledSentence method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the review.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.LabeledSentence(v.split(), [label]))
    return labeled
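As a usage sketch for the helper above (illustrative only: X_train and X_test are hypothetical lists of raw text strings, and the size keyword follows the older gensim releases that still ship LabeledSentence; in gensim 4.x the equivalent class is doc2vec.TaggedDocument):

# hypothetical usage of label_sentences, assuming an older gensim where
# LabeledSentence and the size= keyword are still available
train_docs = label_sentences(X_train, 'TRAIN')
test_docs = label_sentences(X_test, 'TEST')

# passing the documents to the constructor builds the vocabulary and trains in one step
model = doc2vec.Doc2Vec(train_docs + test_docs, size=300, min_count=2)

# vectors can then be looked up by the labels assigned above
first_train_vector = model.docvecs['TRAIN_0']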
Code Example #17
def label_sentences(corpus, label_type):
    """
    Gensim'in Doc2Vec uygulaması, her belgenin / paragrafın kendisiyle ilişkilendirilmiş bir etiketi olmasını gerektirir.
     Bunu LabeledSentence yöntemini kullanarak yapıyoruz. Biçim "TRAIN_i" veya "TEST_i" olacaktır; burada "i"
     gözden geçirmenin dummy endeksi.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.LabeledSentence(v.split(), [label]))
    return labeled
Code Example #18
def Doc2Vec(dataframe):
    '''INPUT: Pandas Dataframe
       OUTPUT: Trained Doc2Vec model'''

    #Clean all the posts in the dataframe and create a LabeledSentence object from each post and its tag data.
    docs = []
    for i in range(len(dataframe)):
        post = cleanText(dataframe['ttl_ctxt'].values[i])
        tags = dataframe['tags'].values[i]
        labeledsent = doc2vec.LabeledSentence(words=post, tags=tags)
        docs.append(labeledsent)
    #Train the Doc2Vec model on the list of LabeledSentence objects.
    model = doc2vec.Doc2Vec(docs)
    return model
Code Example #19
def df2labeled_sentence(df):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the LabeledSentence method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the review.
    """
    logging.info("Create labeled sentence")

    labeled = []
    for index, row in df.iterrows():
        label = row.Class + '_' + str(row['ID'])
        text = row.Text.split()
        labeled.append(doc2vec.LabeledSentence(text, [label]))

    return labeled
Code Example #20
File: Extract.py Project: camilothorne/tree-kernel
    def buildRawSents(self, myfile):

        for txtfile in glob.glob(devdata + myfile):
            xmldoc = minidom.parse(txtfile)
            itemlist0 = xmldoc.getElementsByTagName('document')
            count = 0
            for it0 in itemlist0:
                parag = ""
                itemlist = it0.getElementsByTagName('text')
                for item in itemlist:
                    if '.' in item.firstChild.data:
                        parag = parag + " " + item.firstChild.data
                toks = self.tokenizeAbs(parag.encode("utf-8").decode('utf-8'))
                lab = [txtfile + '_' + str(count)]
                self.pars.append(doc2vec.LabeledSentence(words=toks, tags=lab))
                count = count + 1
Code Example #21
File: doc2vec_model.py Project: aakgun/DMPrj
 def label_sentences2(corpus, label_type):
     """
     Gensim's Doc2Vec implementation requires each
      document/paragraph to have a label associated with it.
     We do this by using the LabeledSentence method.
     The format will be "TRAIN_i" or "TEST_i" where "i" is
     a dummy index of the review.
     """
     labeled = []
     for i, v in enumerate(corpus):
         label = label_type + '_' + str(i)
         #labeled.append(doc2vec.LabeledSentence(v.split(), [label]))
         labeled.append(
             doc2vec.LabeledSentence(words=Tokenization(
                 v, concept=False, stem=True, removeStopwords=True),
                                     tags=[label]))
         #LabeledSentence(words=Tokenization(corpus, concept=False, stem, removeStopwords), tags=[fnames[i]])
     return labeled
Code Example #22
File: similarity.py Project: zhaoqinghai/harvester
def train():
    documents = []

    with open('/home/ycw/tax_data.csv', 'r') as f:
        reader = csv.reader(f, dialect='excel', delimiter=',')
        for line in reader:
            print(line)
            word_list = transform_text(line[1].strip(), strip=False)
            # word_list = eval(line[2])
            documents.append(doc2vec.LabeledSentence(word_list, [line[0]]))

    model = Doc2Vec(documents,
                    dm=1,
                    size=DIMENSION,
                    window=5,
                    negative=5,
                    min_count=2,
                    workers=4)
    model.save('../models/doc2vec.model')

    indexer = AnnoyIndexer(model, 2)
    # _, tem_fn = mkstemp()
    indexer.save('../models/dv_index')
Code Example #23
print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

#%% Embed the paragraphs
# create word2vec model
print("starting paragraph vectorization")
from gensim.models import word2vec, doc2vec
list_all_paragraphe_split = [sentence.split()
                             for sentence in list_all_paragraphe_filtered]
labels = ["paragraph_"+str(i) for i in range(len(list_all_paragraphe_split))]
model = word2vec.Word2Vec(list_all_paragraphe_split, size=100)
print("done with word2vec")
# create doc2vec model
sentences = [doc2vec.LabeledSentence(words=sentence, tags=[labels[i]])
             for i, sentence in enumerate(list_all_paragraphe_split)]
model_doc = doc2vec.Doc2Vec(sentences, size=100)
print("done with doc2vec")
# store the model to a file

model_doc.save('my_model_complete_data.doc2vec')

#%%Do a dimension reduction on the data to see if it influences the clustering
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import normalize
sentences_vec = [model_doc.docvecs[label] for label in labels]
sentences_vec = normalize(sentences_vec)
pca = PCA(n_components="mle", svd_solver='full')
pca.fit(sentences_vec)
Code Example #24
def flatten(lol):
    # flatten a list of lists into a single list
    output = []
    for sublist in lol:
        output.extend(sublist)
    return output


if __name__ == '__main__':
    assert doc2vec.FAST_VERSION > -1  # Apparently it is unusably slow otherwise.
    sentences = []
    for city in args.city:
        print "%s\tLoading tweets from city: %s" % (time.asctime(), city)
        nghd_tweets = ujson.load(open('data/%s/%s' % (city, NGHD_TWEETS_FILE)))
        print "%s\tDone loading tweets for city: %s" % (time.asctime(), city)
        for nghd, tweet_words in nghd_tweets.iteritems():
            words_flattened = flatten(tweet_words)
            sentences.append(
                doc2vec.LabeledSentence(words=words_flattened,
                                        tags=['CITY_' + city, 'NGHD_' + nghd]))

    # This below is all from https://rare-technologies.com/doc2vec-tutorial/
    print "%s\tbuilding vocab" % time.asctime()
    model = doc2vec.Doc2Vec(size=100,
                            min_count=2,
                            alpha=0.025,
                            min_alpha=0.025,
                            max_vocab_size=30000,
                            workers=4)
    model.build_vocab(sentences)

    # print "%s\tdone building vocab, this many words: %s" % (time.asctime(), model.
    for epoch in range(10):
        print "%s\ttraining epoch %s" % (time.asctime(), epoch)
        model.train(sentences)
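Older doc2vec write-ups, including the kind of tutorial linked in the comment above, often pair this manual epoch loop with a decaying learning rate. A minimal sketch of that pattern, assuming the same pre-1.0 gensim API used here, where train() takes only the corpus:

    # hypothetical alternative training loop with manual learning-rate decay
    import random
    model.alpha = 0.025
    for epoch in range(10):
        random.shuffle(sentences)          # present the documents in a new order each pass
        model.train(sentences)             # one pass over the corpus
        model.alpha -= 0.002               # decay the learning rate
        model.min_alpha = model.alpha      # pin the in-pass minimum to the new rate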
Code Example #25
def labelizeReviews(reviews):
    labelized = []
    for i, v in enumerate(reviews):
        label = '%s' % (i)
        labelized.append(doc2vec.LabeledSentence(v, [label]))
    return labelized
Code Example #26
def sentences(wakati_list):
    return [
        doc2vec.LabeledSentence(tokens, tags=[category])
        for category, tokens in wakati_list
    ]
Code Example #27
	def __iter__(self):
		for i, text in enumerate(self.doc1): 
			yield doc2vec.LabeledSentence(words=split_sentence(text), tags=['%s' % i]) 
Code Example #28
 def __iter__(self):
     for idx, doc in enumerate(self.doc_list):
         labels = [self.labels_list[idx]]
         words = doc.split()
         yield Doc2Vec.LabeledSentence(words, labels)
Code Example #29
File: veooz_embed.py Project: libin19861023/RARE
 def label(self):
     for uid, line in enumerate(self.sents):
         self.labelledSents.append(
             doc2vec.LabeledSentence(words=line.split(),
                                     tags=['SENT_%s' % uid]))
Code Example #30
    print(str(round(num * 100 / len(sentences2), 3)) + '%', end='\r')
    futures.append(executor.submit(label_Sentences, item))
concurrent.futures.wait(futures)

tags = []
for i in futures:
    tags.append(i.result())

#Pickle/save list of tags for sentences
f = open('/home/lanna/Dropbox/Insight/tags', 'wb')
pickle.dump(tags, f)

LabeledSentences = []
for i in range(0, len(sentences2)):
    LabeledSentences.append(
        doc2vec.LabeledSentence(sentences2[i].split(), tags[i]))

#https://linanqiu.github.io/2015/05/20/word2vec-sentiment/

nfeatures = 300
model = gensim.models.doc2vec.Doc2Vec(workers=10,
                                      size=nfeatures,
                                      window=10,
                                      min_count=1,
                                      alpha=0.025,
                                      min_alpha=0.025)
#Build the vocabulary table: digesting all the words and filtering out the unique words, and doing some basic counts on them
model.build_vocab(LabeledSentences)

#Train Doc2Vec
from random import shuffle