Example No. 1
0
from gensim.models import word2vec
data = word2vec.Text8Corpus("wiki_wakati.txt")
model = word2vec.Word2Vec(data, size=100)
model.save("wiki.model")
print("ok")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
作者:fuli.shen
时间:2017年6月27日 
"""

from gensim.models import word2vec
import logging

# main program
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
sentences = word2vec.Text8Corpus(
    u"E:\\work_document\\part-r-00000-news_w2v_sampling.text")  # load the corpus
model = word2vec.Word2Vec(sentences, size=200)  # train the word2vec model (CBOW unless sg=1); window defaults to 5
model.save(u"./word2vec_model/news_word2vec.custompy")
print("save news_word2vec custompy successfully")
if __name__ == "__main__":
    pass
    def train_val(self):
        result = {}

        for n,name in tqdm(enumerate(self.val_author_data)):

            pubs = []
            # get all of the author's papers
            for clusters in self.val_author_data[name]:

                pubs.append(clusters)
            #print(pubs)

            name_pubs_raw = {}
            for i,pid in enumerate(pubs):
                name_pubs_raw[pid] = self.val_pub_data[pid]
            # load the author's features
            save_relation(name_pubs_raw, name)

            mpg = MetaPathGenerator()
            mpg.read_data("gene")

            all_embs = []
            rw_num = 10
            cp = set()
            # start the random walks
            for k in range(rw_num):
                mpg.generate_WMRW("gene/RW.txt", 5, 20)
                sentences = word2vec.Text8Corpus(r'gene/RW.txt')
                # use word2vec to train the paper embeddings
                model = word2vec.Word2Vec(sentences, size=128, negative=25, min_count=1, window=10)
                embs = []
                for i, pid in enumerate(pubs):
                    if pid in model:
                        embs.append(model[pid])
                    else:
                        cp.add(i)
                        embs.append(np.zeros(128))
                all_embs.append(embs)
            all_embs = np.array(all_embs)

            # load the semantic features
            ptext_emb = load_data('gene', 'ptext_emb.pkl')
            tcp = load_data('gene', 'tcp.pkl')

            tembs = []
            for i, pid in enumerate(pubs):
                tembs.append(ptext_emb[pid])

            # cosine-distance matrix of the papers' relational (random-walk) embeddings
            sk_sim = np.zeros((len(pubs), len(pubs)))
            for k in range(rw_num):
                sk_sim = sk_sim + pairwise_distances(all_embs[k], metric="cosine")
            sk_sim = sk_sim / rw_num

            # cosine-distance matrix of the papers' semantic embeddings
            tembs = pairwise_distances(tembs, metric="cosine")

            w = 1
            sim = (np.array(sk_sim) + w * np.array(tembs)) / (1 + w)

            pre = DBSCAN(eps=0.2, min_samples=4, metric="precomputed").fit_predict(sim)
            pre = np.array(pre)

            ## set of outlier papers
            outlier = set()
            for i in range(len(pre)):
                if pre[i] == -1:
                    outlier.add(i)
            for i in cp:
                outlier.add(i)
            for i in tcp:
                outlier.add(i)

            ## threshold-based similarity matching
            paper_pair = generate_pair(pubs, outlier)
            paper_pair1 = paper_pair.copy()
            K = len(set(pre))
            for i in range(len(pre)):
                if i not in outlier:
                    continue
                j = np.argmax(paper_pair[i])
                while j in outlier:
                    paper_pair[i][j] = -1
                    j = np.argmax(paper_pair[i])
                if paper_pair[i][j] >= 1.5:
                    pre[i] = pre[j]
                else:
                    pre[i] = K
                    K = K + 1

            for ii, i in enumerate(outlier):
                for jj, j in enumerate(outlier):
                    if jj <= ii:
                        continue
                    else:
                        if paper_pair1[i][j] >= 1.5:
                            pre[j] = pre[i]

            # print(pre, len(set(pre)))

            result[name] = []
            for i in set(pre):
                oneauthor = []
                for idx, j in enumerate(pre):
                    if i == j:
                        oneauthor.append(pubs[idx])
                result[name].append(oneauthor)

            json.dump(result, open(self.args['val_result'], 'w', encoding='utf-8'), indent=4)
        f1 = f1_score(result,self.args)
        print("f1:",f1)
Example No. 4
0
def train(path = Dir.res+"/sen_data/604_corpus.txt",save_path = Dir.res+"/w2v/w2v.model"):
    sentences = word2vec.Text8Corpus(path)  # load the corpus
    model = word2vec.Word2Vec(sentences, size=10,window=3,min_count=1)
    model.save(save_path)
    return save_path
Example No. 5
0
                              header=0,
                              delimiter="\t",
                              quoting=3)  # read in our pre-set unlabeled data (fairly small for now)
        pat = re.compile(r'[A-Za-z]+')
        # extract all the words (extracting Chinese characters would be another story); to take the effect of
        # punctuation on meaning into account, add symbols like [!@#$%^&*] that are common in spam messages
        with open('imdb_text', 'a', encoding='utf-8') as f:
            for rev in data_un.review:  # iterate over the reviews in the corpus
                str_list = pat.findall(rev)  # first extract all the words
                # str_list = [x.lower() for x in str_list]
                # lowercase every word; in practice case also carries meaning, so decide per use case whether to lowercase
                string = ' '.join(str_list)
                f.write(string + '\n')
                # after the steps above we get a file full of the processed strings
            del data_un
    sentences = word2vec.Text8Corpus("imdb_text")  # load the corpus
    model = word2vec.Word2Vec(sentences,
                              size=50)  # train the word2vec model with 50-dimensional vectors (possibly a bit large);
    # window defaults to 5, i.e. the 5 words before and after are used for prediction; a word that appears
    # fewer than 5 times is treated as rare and ignored (min_count=5 by default), and this number can be changed in practice
    model.save('mymodel')
    # save it so it can be reused for the next run
else:
    model = word2vec.Word2Vec.load('mymodel')
    # if a previously trained model already exists, just load it
    # In[3] is mainly for training our word vectors
# In[4]:
word_vectors = model.wv  # the word-to-vector mapping lives in wv; take it from the trained model
del model
# In[5]:
data_t['vec'] = data_t.review.apply(
    lambda x: [word_vectors[w] for w in x.split() if w in word_vectors])
Example No. 6
0
"""
Word2Vec 模型:
* Word2Vec 通过训练,可以把对文本内容的处理简化为K维向量空间中的向量运算.(而向量空间上的相似度可以用来表示文本语义上的相似度)
    * 采用的模型有CBOW(Continuous Bag-Of-Words,即连续的词袋模型)和 Skip-Gram 两种.
    * 因此,Word2Vec 输出的词向量可以被用来做很多NLP相关的工作,比如聚类、找同义词、词性分析等等.
* CBOW 模型: 能够根据输入周围n-1个词来预测出这个词本身.
    * 也就是说,CBOW模型的输入是某个词A周围的n个单词的词向量之和,输出是词A本身的词向量.
* Skip-gram 模型: 能够根据词本身来预测周围有哪些词.
    * 也就是说,Skip-gram模型的输入是词A本身,输出是词A周围的n个单词的词向量.
"""

import pandas as pd
from gensim.models import word2vec

# load the corpus
sentences = word2vec.Text8Corpus(u"/opt/data/NLP/4.word2vec/text8.txt")
model = word2vec.Word2Vec(sentences, size=200)  # train the model (CBOW unless sg=1); window defaults to 5

# compute the similarity/relatedness of two words
y1 = model.similarity("woman", "man")
print(u"similarity between woman and man:", y1)
print("--------\n")

# list the words most related to a given word
y2 = model.most_similar("good", topn=20)  # the 20 most related
print(pd.Series(y2))
# print(u"words most related to good:\n")
# for item in y2:
#     print(item[0], item[1])
print("--------\n")
Example No. 7
0
from gensim.models import word2vec
sentences = word2vec.Text8Corpus(u'分词后的爽肤水评论.txt')
model = word2vec.Word2Vec(sentences, size=50)

# y2 = model.similarity(u"好", u"还行")
# print(y2)

for i in model.most_similar(u"滋润"):
    print(i[0], i[1])
Example No. 8
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File     : word2vec
# @Author   : 张志毅
# @Time     : 2020/9/10 9:41

from gensim.models import word2vec
import logging

# main program
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
sentences = word2vec.Text8Corpus(
    u"D:\\Python\\WorkSpace\\word2vec\\Data\\text8")  # load the corpus
model = word2vec.Word2Vec(sentences, size=32)  # train the model (CBOW unless sg=1); window defaults to 5

# save the model so it can be reused
model.save("text8.model")
# corresponding way to load it
# model_2 = word2vec.Word2Vec.load("text8.model")

# store the word vectors in a format the original C word2vec tool can parse
model.wv.save_word2vec_format('embedding1.txt', binary=False)
# corresponding way to load it
# model_3 = KeyedVectors.load_word2vec_format('embedding1.txt', binary=False)

if __name__ == "__main__":
    pass
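
A small usage sketch for the two save formats above, assuming the files written by this example exist ("king" is just an illustrative query):

from gensim.models import word2vec, KeyedVectors

full_model = word2vec.Word2Vec.load("text8.model")                            # full model; training can continue
vectors = KeyedVectors.load_word2vec_format("embedding1.txt", binary=False)   # vectors only, read-only
print(full_model.wv.most_similar("king", topn=3))
print(vectors.most_similar("king", topn=3))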
Example No. 9
0
# -*- coding: utf-8 -*-
import gensim

from gensim.models import word2vec
sentences = word2vec.Text8Corpus(
    'D:/gitHubRes/python/词向量/text-classification-cnn-rnn/data/baike_triples.txt'
)
model = word2vec.Word2Vec(sentences, min_count=5, size=100)

model.save('D:/baike.model')
print("训练完成")
Example No. 10
0
#-*- encoding:utf-8 -*-
__author__ = ''
from gensim.models import word2vec
import logging
import numpy as np
filename = "D:\\chinese\\word2vec_corpus\\merged_ehr_2_segdone.txt"
sentence = word2vec.Text8Corpus(filename)
n_dim = 100
model = word2vec.Word2Vec(sentence, size=n_dim)
model.save(u"abc.model")


#test
from gensim.models import word2vec
model_2 = word2vec.Word2Vec.load("gensim_train.model")

y1 = model_2.most_similar(u'肝脏',topn = 6)
y2 = model_2.wv[u'肝脏']



#import word2vec

# segment word part
import jieba
import struct
filePath='merged.txt'
fileSegWordDonePath ='corpusSegDone.txt'
# read the file by line
fileTrainRead = []
#fileTestRead = []
Example No. 11
0
from gensim.models import word2vec
data = word2vec.Text8Corpus("wiki.gubun")
model = word2vec.Word2Vec(data)
model.save("wiki.model")
print("ok")
Example No. 12
0
#     print(seg_lists)
#
# # after segmentation, save the result to a new txt file
# with open('fenci_0225.txt','w',encoding='utf-8') as f:
#     for i in seg_lists:
#         if i =='':
#             pass
#         else:
#             f.write(i)
#             # f.write('\n')
# print("分词结果保存成功")

#------------------------------------------

# train with word2vec
sentences = word2vec.Text8Corpus('slurm-13014726.out')
#52776
# # used for the corpus that has been word-segmented per text
print(sentences)
model = word2vec.Word2Vec(sentences,
                          size=100,
                          window=5,
                          min_count=5,
                          workers=5,
                          sg=1,
                          hs=1)  # this one call trains the model; words occurring fewer than min_count (5 here) times are dropped
# model = word2vec.Word2Vec(sentences,sg=1,size=100,window=5,min_count=5,negative=3,sample=0.001,hs=1,workers=4)
# http://blog.csdn.net/szlcw1/article/details/52751314  trains a skip-gram model; the first argument is the training corpus,
# min_count drops words occurring fewer than that many times (default 5), size is the number of hidden units, i.e. the saved
# model.txt holds size-dimensional vectors (default 100); window defaults to 5
# # the first argument is the training corpus, the second drops words below that count (default 5),
# # the third is the number of hidden-layer units (default 100)
# model=word2vec.Word2Vec(sentences,min_count=3, size=50, window=5, workers=4)
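
As a quick illustration of the min_count comments above (assuming the gensim 3.x vocab attribute and the model trained in this example; 'some_rare_token' is only a placeholder), words seen fewer than min_count times never enter the vocabulary:

print(len(model.wv.vocab))                  # vocabulary size after min_count pruning
print('some_rare_token' in model.wv.vocab)  # False unless the token occurred at least min_count times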
Example No. 13
0
# -*- coding: utf-8 -*-
"""
use jieba and gensim to create word2vec
生成词向量
"""
import jieba
from gensim.models import word2vec
import time
import os

dir_path = "/Users/luheng/dureader/data/preprocessed/trainset/"
file_path = dir_path + 'train.conll'

vocabfile = dir_path + 'vocab.txt'
word2vecfile = dir_path + 'vec.txt'
sentences = word2vec.Text8Corpus(file_path)
model = word2vec.Word2Vec(sentences,
                          size=50,
                          min_count=5,
                          max_vocab_size=100000)
model.save(dir_path + 'mymodel')
vocab = open(vocabfile, 'w+')
vec = open(word2vecfile, 'w+')
'''
the vocab here still needs to be deduplicated (made unique)
'''

model = word2vec.Word2Vec.load(dir_path + 'mymodel')
all_words = set()

for line in open(file_path):
Example No. 14
0
flags.DEFINE_string('save_path', '../model/Word2Vec/', 'path for saving data')
flags.DEFINE_integer('min_count', 2, 'term occurs less than this is ignored')
flags.DEFINE_integer('size', 50, 'embedding dimensions')
flags.DEFINE_integer('window', 4, 'terms occur within a window-neighborhood of a term')
flags.DEFINE_integer('sg', 1, 'sg=1:skip-gram model; sg=other:CBoW model')
# flags.DEFINE_float()
# flags.DEFINE_boolean()
FLAGS = flags.FLAGS

# the major part
if __name__ == '__main__':
    # logging information
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # load-in training sentences
    sentences = word2vec.Text8Corpus(FLAGS.data_path)

    # training step:
    model = word2vec.Word2Vec(sentences,
                              min_count=FLAGS.min_count,
                              size=FLAGS.size,
                              window=FLAGS.window,
                              sg=FLAGS.sg)

    # save the trained model
    if not os.path.exists(FLAGS.save_path):
        os.makedirs(FLAGS.save_path)
    cur_time = datetime.datetime.now().strftime("%m-%d_%H:%M:%S")
    model.save(os.path.join(FLAGS.save_path, 'luru_news_'+cur_time+'.model'))
    model.wv.save_word2vec_format(os.path.join(FLAGS.save_path, 'luru_news_'+cur_time+'.model.bin'), binary=True)
Example No. 15
0
    comment = comment.replace('~', '')
    comment = comment.replace('{"error_message": "EMPTY SENTENCE"}', '')
    comment = comment.replace('…', '')
    comment = comment.replace('\r', '')
    comment = comment.replace('\t', ' ')
    comment = comment.replace('\f', ' ')
    comment = comment.replace('/', '')
    comment = comment.replace('、', ' ')
    comment = comment.replace('/', '')
    comment = comment.replace(' ', '')
    comment = comment.replace(' ', '')
    comment = comment.replace('_', '')
    comment = comment.replace('?', ' ')
    comment = comment.replace('?', ' ')
    comment = comment.replace('了', '')
    comment = comment.replace('➕', '')
    return comment


comment = open('test.txt').read()
comment = ' '.join(jieba.cut(comment))

fo = open("afterSeg.txt", "w")
fo.write(comment)
print("finished!")
fo.close()

sentences = word2vec.Text8Corpus(u'afterSeg.txt')

model = word2vec.Word2Vec(sentences, min_count=3, size=50, window=5, workers=1)
Example No. 16
0
1) Radim Řehůřek (author of gensim) -
    http://rare-technologies.com/performance-shootout-of-nearest-neighbours-intro
2) Erik Bernhardsson (author of annoy) - 
    https://github.com/erikbern/ann-benchmarks
"""

import time, random
import numpy as np
from gensim.models import word2vec
from sklearn.neighbors import KDTree

# Download text8 dataset from:
# http://mattmahoney.net/dc/text8.zip
# and unzip the file

sentences = word2vec.Text8Corpus('text8')
model = word2vec.Word2Vec(sentences, size=200, workers=8)
model.init_sims(replace=True)  # normalize the vectors

words = random.sample(model.vocab.keys(), 100)


class ANNSearch:
    word2idx = {}
    idx2word = {}
    data = []

    def __init__(self, model):
        for counter, key in enumerate(model.vocab.keys()):
            self.data.append(model[key])
            self.word2idx[key] = counter
Example No. 17
0
'''
90. Training with word2vec
Apply word2vec to the corpus created in exercise 81 and learn word vectors.
Then convert the format of the learned word vectors and run the programs from exercises 86-89.
'''
from gensim.models import word2vec
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
sentences = word2vec.Text8Corpus('../chapter09/corpus81')

model = word2vec.Word2Vec(sentences, size=300, window=5)
model.save('w2v')
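
The docstring asks for the learned vectors to be converted to another format; a minimal sketch of that step, assuming the 'w2v' model saved above (the 'w2v.txt' output name is only an example):

model = word2vec.Word2Vec.load('w2v')
model.wv.save_word2vec_format('w2v.txt', binary=False)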
Example No. 18
0
def my_word2vec(cut_filename):
    mysetence = word2vec.Text8Corpus(cut_filename)
    #model = word2vec.Word2Vec(mysetence, size=100, min_count=1, window=5, hs=5)
    model = word2vec.Word2Vec(mysetence, size=100, min_count=1, window=5, hs=5)
    model.save('./model/zh_wiki_global.model')
                    else:
                        avgword2vec = avgword2vec + word2idf[word] * model[word]
            # if at least one word in the sentence has a word embeddings :
            if avgword2vec is not None:
                avgword2vec = avgword2vec / sumidf  # normalize sum
                array_sentences.append(line)
                array_embeddings.append(avgword2vec)
    print('avg_word2vec_idf: Generated embeddings for {0} sentences from {1} dataset.'.format(
        len(array_sentences), dataset))
    return array_sentences, array_embeddings


if __name__ == "__main__":

    if False:  # FIRST PART
        sentences = word2vec.Text8Corpus('data/text8')

        # Train a word2vec model
        embedding_size = 200
        model = word2vec.Word2Vec(sentences, size=embedding_size)

        # Train a word2vec model with phrases
        bigram_transformer = gensim.models.Phrases(sentences)
        model_phrase = word2vec.Word2Vec(bigram_transformer[sentences], size=200)
    else:
        # Loading model trained on words
        model = word2vec.Word2Vec.load('models/text8.model')

        # Loading model enhanced with phrases (2-grams)
        model_phrase = word2vec.Word2Vec.load('models/text8.phrase.model')
    """
Example No. 20
0


if __name__ == '__main__':
    # Read data from files
    train = pd.read_csv( os.path.join(os.path.dirname(__file__), 'electronics', 'trainData.tsv'), header=0, delimiter="\t", quoting=3 )
    test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'electronics', 'testData.tsv'), header=0, delimiter="\t", quoting=3 )
    #unlabeled_train = pd.read_csv( os.path.join(os.path.dirname(__file__), 'data', "unlabeledTrainData.tsv"), header=0,  delimiter="\t", quoting=3 )
    print "Read %d labeled train reviews, %d labeled test reviews, " % (train["review"].size,test["review"].size )

    # Load the punkt tokenizer
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    # ****** Split the labeled and unlabeled training sets into clean sentences
    #
    #sentences=cPickle.load(open('sentences.p', 'rb'))
    sentences = word2vec.Text8Corpus('electronics/alldata.txt')
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)
    # Set values for various parameters
    num_features = 200    # Word vector dimensionality
    min_word_count = 10   # Minimum word count
    num_workers = 16       # Number of threads to run in parallel
    context = 10         # Context window size
    downsampling = 1e-3   # Downsample setting for frequent words
    # Initialize and train the model (this will take some time)
    print "Training Word2Vec model..."
    #model = word2vec.Word2Vec(sentences, workers=num_workers,size=num_features, min_count = min_word_count,window = context, sample = downsampling, seed=1)
    #model.init_sims(replace=True)
    #model_name = "200features_10minwords_10context_electronics"
    #model.save(model_name)
    model = word2vec.Word2Vec.load("200features_10minwords_10context_electronics")
    #
Example No. 21
0
# -*- coding: utf-8 -*-

from gensim.models import word2vec
import logging
import os
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
sentences = word2vec.Text8Corpus('DATASET/merge.txt')
model = word2vec.Word2Vec(sentences, size=5, min_count=0, window=1)
# size: dimensionality of each word vector;
# window: context window scanned during training; a window of 5 considers the 5 words before and after;
# min_count: minimum frequency, 5 by default; words occurring fewer times in the corpus are dropped;
# workers: number of worker threads used for training

model.save('text.model')  # save the model
model.wv.save_word2vec_format('text.model.bin')  # save in the word2vec text format

model['單位']  # vector representation of a single word
model.most_similar(['上班'])  # most similar words
model.similarity('單位', '上班')  # similarity between two words
Example No. 22
0
def train_w2v():
    data = pd.read_csv('../data/full_tobe_classify_180316.csv',encoding='GBK')
    data['cutted_Dis'].to_csv('../data/lg_all_data.txt',index=False,encoding='utf-8')
    sentences=word2vec.Text8Corpus('../data/lg_all_data.txt')
    model=word2vec.Word2Vec(sentences,min_count=2,size=256)
    model.save('./word2vec/lg_data_model_comment_256dim')
Example No. 23
0
from gensim.models import word2vec
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
sentences = word2vec.Text8Corpus("../input/text8")

model = word2vec.Word2Vec(sentences, size=200)
model.save("text8.model")

from gensim.models.keyedvectors import KeyedVectors
from gensim.models import word2vec

model = word2vec.Word2Vec.load("text8.model").wv  # the .npy file only holds raw arrays; load the saved model and use its KeyedVectors

print(model['word'])
Example No. 24
0
from gensim.models import word2vec

sentences = word2vec.Text8Corpus(r'gene/all_text.txt')
model = word2vec.Word2Vec(sentences,
                          size=100,
                          negative=5,
                          min_count=2,
                          window=5)
model.save('word2vec/Aword2vec.model')
Example No. 25
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
功能:测试gensim使用,处理中文语料
时间:2016年5月21日 20:49:07
"""

from gensim.models import word2vec
import logging

# 主程序
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = word2vec.Text8Corpus(u"C:\\Users\\lenovo\\Desktop\\word2vec实验\\亚马逊中文书评语料.txt")  # 加载语料
model = word2vec.Word2Vec(sentences, size=200)  # 默认window=5

# compute the similarity/relatedness of two words
y1 = model.similarity(u"不错", u"好")
print(u"similarity between 不错 and 好:", y1)
print("--------\n")

# list the words most related to a given word
y2 = model.most_similar(u"书", topn=20)  # the 20 most related
print(u"words most related to 书:\n")
for item in y2:
    print(item[0], item[1])
print("--------\n")

# look for analogy relations
print(u"书-不错, 质量-")
y3 = model.most_similar([u'质量', u'不错'], [u'书'], topn=3)
    print('Segmenting {}.'.format(novel))
    with codecs.open('novel/{}.txt'.format(novel), encoding='UTF-8') as f:
        sentences += [list(jieba.cut(line.strip())) for line in f]
print('Segmentation finished.')

print('Saving segmentation results to sentences.txt.')
f = open('sentences.txt', 'w', encoding='UTF-8')
text = ''
for line in sentences:
    text += ' '.join(line)
    text += '\n'
f.write(text)
f.close()
print('Saving finished.')

print("Training...")
# Load file
sentence = word2vec.Text8Corpus("sentences.txt")
# Set parameters and train the model
model = word2vec.Word2Vec(sentence)  # size=500, window=5, min_count=5, workers=4, sg=1, max_vocab_size=120000000
try:
    # delete a previously saved model with the same name
    os.remove("model/{}.model.bin".format(query.replace(",", "-")))
except:
    pass
else:
    print('Detected a model with the same name; it will be deleted automatically')
# Save model
model.wv.save_word2vec_format("model/{}.model.bin".format(query.replace(",", "-")), binary=True)
print("The trained model has been saved")
Example No. 27
0
from gensim.models import word2vec

# load the sentence-split file
sentences = word2vec.Text8Corpus('NewsCar_new_after_process/1/2.txt')

# convert into individual characters
tokens = []
for sen in sentences:
    print(type(sen))
    for j in sen:
        for token in j:
            tokens.append(token)

# size is the vector dimensionality; min_count is the minimum occurrence count
model = word2vec.Word2Vec(tokens, size=100, min_count=1)

# the 5 characters most similar to 车
x = model.most_similar("车", topn=5)
print(x)

# print the word vector for '车'
print(model['车'])

# save the model
model.save("res.model")
# corresponding way to load it
# model_2 = word2vec.Word2Vec.load("res.model")
Example No. 28
0
# # save seg file and load it as Text8Corpus

# In[41]:


with open('seg/allseg.txt','w',encoding='utf8') as output:
    for line in seg_train:
        for word in line:
            output.write(word+' ')
        output.write('\n')


# In[42]:


sentences = word2vec.Text8Corpus('seg/allseg.txt')


# # train w2v model

# In[43]:


dim = 64
min_count = 1
window = 20
iteration = 150
sg = 1
neg = 5
note = 'all'
fname = str(dim)+'m'+str(min_count)+'w'+str(window)+'it'+str(iteration)+'sg'+str(1)+'neg'+str(neg)+note
from gensim.models import word2vec
import logging
sentences = word2vec.Text8Corpus('/tmp/text8')
model = word2vec.Word2Vec(sentences, size=200)
model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
Example No. 30
0
def main(
        limit=30,  # sentence length limit
        x_limit=1,
        y_limit=2):
    from word_sequence import WordSequence

    print('extract lines')
    """dgk语料"""
    # fp = open("raw_data/dgk_shooter_min.conv", 'r', errors='ignore', encoding='utf-8')
    """xiaohuangji语料"""
    fp = open("raw_data/xiaohaungji50w_test.conv",
              'r',
              errors='ignore',
              encoding='utf-8')

    # list holding all sentence groups
    groups = []
    # holds one group (one dialogue)
    group = []

    for line in tqdm(fp):  # show a progress bar

        if line.startswith('M '):  # utterance lines start with 'M '
            line = line.replace('\n', '')  # strip the newline

            if '/' in line:
                line = line[2:].split('/')  # drop the slashes -> returns a <list>
                line = list(regular(''.join(line)))  # clean out unwanted tokens

                line = jieba.lcut(''.join(line))
            else:
                line = list(line[2:])

            group.append(line)
            # print(group)

        else:  # 'E ' lines separate dialogues --- line.startswith('E ')
            if group:
                groups.append(group)
                group = []

    if group:
        groups.append(group)
        group = []

    print('\nextract group')
    """定义问答对"""
    x_data = []
    y_data = []

    for group in tqdm(groups):
        # print(group)
        for index, line in enumerate(group):
            if index == 0 and good_line(line): x_data.append(line)
            if index == 1 and good_line(line): y_data.append(line)

    print(x_data)
    print(y_data)

    # number of question-answer pairs
    print('\nnumber of questions: ' + str(len(x_data)), 'number of answers: ' + str(len(y_data)))

    # put the question-answer pairs into a zip object (preview the first 30)
    for ask, answer in zip(x_data[:30], y_data[:30]):
        print(''.join(ask))
        print(''.join(answer))
        print('-' * 20)
    """组装数据"""
    data = list(zip(x_data, y_data))

    # 组装规则:
    data = [(x, y) for x, y in data if len(x) < limit and len(y) < limit
            and len(y) >= y_limit and len(x) >= x_limit]
    x_data, y_data = zip(*data)

    # train the word_sequence model
    print('fit word_sequence')
    from gensim.models import word2vec
    import gensim
    sentences = word2vec.Text8Corpus(train_file_name)  # load the corpus
    model = gensim.models.Word2Vec(sentences,
                                   size=200)  # train the model (CBOW unless sg=1); window defaults to 5