Example #1
def get_or_create_cut_word(input_path, output_path):
    """
    Get (or create) the word-segmented file and return it as sentences.

    :param input_path: path of the raw text file
    :param output_path: path of the segmented text file
    :return: a word2vec.PathLineSentences iterator over output_path
    """
    if os.path.exists(output_path):
        return word2vec.PathLineSentences(output_path)
    else:
        # Segment each line and write it out; the files are closed before
        # the segmented text is read back.
        with open(input_path, 'r') as input_data, open(output_path, 'w') as output_data:
            for line in input_data:
                output_data.write(segment_depart(line) + '\n')
        return word2vec.PathLineSentences(output_path)
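A minimal usage sketch for the helper above; the paths are placeholders and the keyword arguments assume the gensim 3.x-style API used throughout these examples.

from gensim.models import word2vec

# Hypothetical paths; get_or_create_cut_word and segment_depart come from the example above.
sentences = get_or_create_cut_word('corpus/raw.txt', 'corpus/segmented.txt')
# PathLineSentences is restartable, so Word2Vec can make its multiple passes over it.
model = word2vec.Word2Vec(sentences, size=128, window=5, min_count=5, iter=10)
model.save('models/word2vec.model')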
Example #2
 def testPathLineSentencesOneFile(self):
     """Does PathLineSentences work with a single file argument?"""
     test_file = os.path.join(datapath('PathLineSentences'), '1.txt')
     with utils.smart_open(test_file) as orig:
         sentences = word2vec.PathLineSentences(test_file)
         for words in sentences:
             self.assertEqual(words, utils.to_unicode(orig.readline()).split())
Example #3
def train_THUCNews(segment_dir, out_word2vec_path):
    sentences = word2vec.PathLineSentences(segment_dir)
    model = train_wordVectors(sentences,
                              embedding_size=128,
                              window=5,
                              min_count=5)
    save_wordVectors(model, out_word2vec_path)
Example #4
    def extract_sentences(self):
        """Extract sentences from data set for Word2Vec model.
        See https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec for details.

        :return: sentences, a list of lists of words.
        """
        pickle_path = os.path.join(self.root, self.pickled_folder)
        pickle_file = 'sentences.pickle'

        if self.test_mode:
            try:
                os.remove(os.path.join(pickle_path, pickle_file))
                os.rmdir(pickle_path)
            except FileNotFoundError:
                pass

        try:
            with open(os.path.join(pickle_path, pickle_file), 'rb') as f:
                print("Sentences will be loaded from pickled file: " +
                      pickle_file)
                return pickle.load(f)
        except FileNotFoundError:
            print("Cannot find pickled file to load sentences.")
            pass
        except Exception as error:
            raise error

        print("Extracting...")
        sentences = []
        for mode in ['train', 'test']:
            for classification in ['pos', 'neg', 'unsup']:
                if mode == 'test' and classification == 'unsup':
                    # There is no test/unsup in our data.
                    continue
                path = os.path.join(self.root, mode, classification)
                # sentences collects the 12,500 review word lists from this directory.
                test_index = 0
                for sentence in word2vec.PathLineSentences(path):
                    test_index += 1
                    if self.test_mode and test_index > TEST_DATA_SIZE:
                        break

                    alphabetic_words = list(
                        map(lambda x: to_alphabetic(x), sentence))
                    words = list(
                        filter(lambda x: len(x) != 0, alphabetic_words))
                    # Keep each review sentence as its own word list instead of
                    # flattening the whole corpus into one long list.
                    sentences.append(words)
        # sentences looks like [['word', ...], ['word', ...], ...].
        try:
            os.mkdir(pickle_path)
        except FileExistsError:
            # 'processed' folder already exists.
            pass

        with open(os.path.join(pickle_path, pickle_file), 'wb') as f:
            pickle.dump(sentences, f, pickle.HIGHEST_PROTOCOL)

        print("Done.")
        return sentences
Example #5
def train_w2v():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    # 1. format: specifies what each log record contains; format can carry a lot of useful information:
    # %(asctime)s: time the log record was emitted
    # %(levelname)s: name of the log level
    # %(message)s: the log message
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    # emit an informational log line
    logger.info("running %s" % ' '.join(sys.argv))
    # [1] If there is only one file, use LineSentence to read it
    # segment_path='./data/segment/segment_0.txt'
    # sentences = word2vec.LineSentence(segment_path)

    # [2] If there are multiple files, use PathLineSentences to read the file list

    segment_dir = './train_data'
    sentences = word2vec.PathLineSentences(segment_dir)
    # For typical training, setting the following parameters is enough:
    word2vec_path = './models/train/word2vec.model'
    model = train_wordVectors(sentences,
                              embedding_size=256,
                              window=5,
                              min_count=5)
    print(model.alpha)
    save_wordVectors(model, word2vec_path)
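A side-by-side sketch of the single-file and multi-file loaders mentioned in the comments above; the paths are placeholders and the Word2Vec keywords assume the same gensim 3.x API.

from gensim.models import word2vec

# Single segmented file: LineSentence streams it line by line.
single = word2vec.LineSentence('./data/segment/segment_0.txt')

# Directory of segmented files: PathLineSentences iterates every file in it
# in sorted filename order, so a large corpus never has to fit in memory at once.
multi = word2vec.PathLineSentences('./data/segment')

model = word2vec.Word2Vec(multi, size=256, window=5, min_count=5)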
Example #6
    def train(self,
              corpus_path,
              size=100,
              min_count=1,
              window=5,
              iter=20,
              out_path='./model/word2vec/word2vec.model'):
        logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s",
                            level=logging.INFO)
        try:
            sentences = word2vec.PathLineSentences(corpus_path)
        except Exception as e:
            print(e)
            return

        if self.__model is None:
            self.__model = word2vec.Word2Vec(sentences,
                                             size=size,
                                             min_count=min_count,
                                             window=window,
                                             iter=iter)
        else:
            self.__model.build_vocab(sentences, update=True)
            self.__model.train(sentences,
                               total_examples=self.__model.corpus_count,
                               epochs=self.__model.iter)
        self.__vocab_index = self.__model.wv.index2word
        self.__model.save(out_path)
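For reference, the incremental-update pattern used in the train method above can be sketched directly against the gensim 3.x API it assumes; the directory and file names below are placeholders.

from gensim.models import word2vec

# Initial training pass over the first corpus directory.
model = word2vec.Word2Vec(word2vec.PathLineSentences('./corpus_part1'),
                          size=100, min_count=1, window=5, iter=20)

# Later, fold in new text: grow the vocabulary, then continue training.
new_sentences = word2vec.PathLineSentences('./corpus_part2')
model.build_vocab(new_sentences, update=True)
model.train(new_sentences,
            total_examples=model.corpus_count,
            epochs=model.iter)
model.save('./model/word2vec/word2vec.model')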
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--corpus-dir",
        "-i",
        default="/app/workspace/data",
        help="Location of pre-training text files.",
    )
    parser.add_argument('--output', '-o')
    parser.add_argument('--dimension', '-d', type=int, default=256)
    parser.add_argument('--window', '-w', type=int, default=16)
    parser.add_argument('--min-count', type=int, default=10)
    parser.add_argument('--max-vocab-size', type=int, default=30000)
    parser.add_argument('--max-sentence-length', type=int, default=30000)
    parser.add_argument('--workers', type=int, default=-1)
    parser.add_argument('--sg', type=int, default=1)
    args = parser.parse_args()
    outputpath = args.output
    mc = multiprocessing.cpu_count() // 2
    workers = mc if args.workers == -1 else args.workers
    sentences = word2vec.PathLineSentences(args.corpus_dir, max_sentence_length=args.max_sentence_length)
    model = word2vec.Word2Vec(sentences,
                              size=args.dimension,
                              window=args.window,
                              min_count=args.min_count,
                              max_vocab_size=args.max_vocab_size,
                              workers=workers,
                              sg=args.sg)
    # not saving temporary data
    model.delete_temporary_training_data()
    model.save(outputpath)
    model.wv.save_word2vec_format(f'{outputpath}.txt')
Example #8
def pre_train(segmented_dir):
    sys.path.append('..')

    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    model = word2vec.Word2Vec(word2vec.PathLineSentences(segmented_dir),
                              size=300,
                              min_count=2,
                              workers=8,
                              iter=10)
    with open(os.path.join(segmented_dir, 'w2v_dic.data'),
              'w',
              encoding='utf-8') as f:
        for word in model.wv.vocab:
            f.write(word + ' ')
            f.write(' '.join(list(map(str, model[word]))))
            f.write('\n')

    model.wv.save_word2vec_format(os.path.join(segmented_dir, 'w2v_model.bin'),
                                  binary=True)
def w2v_train(segment_dir='./data/segment/oil.txt',
              word2vec_path='./models/w2v/oil.model'):
    sentences = word2vec.PathLineSentences(segment_dir)
    model2 = train_wordVectors(sentences,
                               embedding_size=300,
                               window=5,
                               min_count=1)
    save_wordVectors(model2, word2vec_path)
Example #10
 def testPathLineSentences(self):
     """Does PathLineSentences work with a path argument?"""
     with utils.smart_open(os.path.join(datapath('PathLineSentences'), '1.txt')) as orig1,\
     utils.smart_open(os.path.join(datapath('PathLineSentences'), '2.txt.bz2')) as orig2:
         sentences = word2vec.PathLineSentences(datapath('PathLineSentences'))
         orig = orig1.readlines() + orig2.readlines()
         orig_counter = 0  # to go through orig while matching PathLineSentences
         for words in sentences:
             self.assertEqual(words, utils.to_unicode(orig[orig_counter]).split())
             orig_counter += 1
Example #11
 def word2vec(self):
     sentences = word2vec.PathLineSentences("nlp/cut_words.txt")
     model = Word2Vec(sentences, size=20, window=5, min_count=1, workers=4)
     model.save("nlp/word2vec.model")
     model = Word2Vec.load("nlp/word2vec.model")
     # a= model.train([["吸收公众存款", "吸收公众存款"]], total_examples=1, epochs=1)
     vector = model.wv['新材料']
     a = model.similar_by_vector(vector)
     print(a)
     print(vector)
def w2v_training(seg_corpus_dir, embedding_size):
    w2v_model_file = 'w2v_embed_' + str(embedding_size) + '.model'
    w2v_vector_file = 'w2v_embed_' + str(embedding_size) + '.txt'
    sentences = word2vec.PathLineSentences(seg_corpus_dir)
    workers = multiprocessing.cpu_count()
    # basic setting in w2v
    w2v_model = word2vec.Word2Vec(sentences=sentences, size=embedding_size, window=5, min_count=5, workers=workers,\
        sg=1, hs=0, negative=10, ns_exponent=0.75, iter=10, sorted_vocab=1)
    w2v_model.save(config.params_dir + w2v_model_file)
    w2v_model.wv.save_word2vec_format(config.params_dir + w2v_vector_file, binary=False)
    logging.info('Word2Vec training is done and data are saved..')
def word2vec_vectorizer(rst1,
                        rst2,
                        embedding_size=1024,
                        in_window=20,
                        in_min_count=5):
    sentences = word2vec.PathLineSentences('./segwords')
    w2vModel = word2vec.Word2Vec(sentences,
                                 sg=1,
                                 size=embedding_size,
                                 window=in_window,
                                 min_count=in_min_count)
    return w2vModel
 def w2v_train(self):  # train the word2vec model
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                         level=logging.INFO)
     sentences = word2vec.PathLineSentences('./warrior-data-wakati')
     model = word2vec.Word2Vec(sentences,
                               sg=1,
                               size=100,
                               min_count=1,
                               window=10,
                               hs=1,
                               negative=0)
     model.save(self.modelpath)
 def __init__(self, file_name, dataset, vocab, vec_dim, epoch):
     super(FastEmbedding, self).__init__(file_name, dataset, vocab, vec_dim,
                                         epoch)
     file_list = word2vec.PathLineSentences(self.file_name).input_files
     res = []
     for file_name in file_list:
         with open(file_name, 'r') as f:
             res.append(f.read())
     if not os.path.isdir('../tmp_res'):
         os.mkdir('../tmp_res')
     with open('../tmp_res/tmp_file', 'w') as f:
         f.writelines(res)
Example #16
 def __init__(self, file_name, dataset, vocab, vec_dim, epoch):
     super(Doc2VecEmbedding, self).__init__(file_name, dataset, vocab,
                                            vec_dim, epoch)
     sentences = word2vec.PathLineSentences(self.file_name)
     docLabels = sentences.input_files
     data = []
     for doc in docLabels:
         try:
             with open(doc) as f:
                 doc_data = f.read()
                 data.append(doc_data)
         except:
             pass
     self.it = LabeledLineSentence(data, docLabels)
 def train(self, **kwargs):
     arg_string = '_'.join( key + '=' + str(value) 
         for key, value in kwargs.items())
     print(self.algorithm + '_' + arg_string)
     
     if self.algorithm == 'word2vec':
         sentences = word2vec.PathLineSentences(self.save_dir.joinpath('line_sentences'))
         self.model = word2vec.Word2Vec(sentences, **kwargs)
     if self.algorithm == 'doc2vec':
         self.model = doc2vec.Doc2Vec(self, **kwargs)
         
     savepath = self.save_dir.joinpath(self.algorithm + '_' + arg_string)
     self.model.save(str(savepath))
     return self.model
Example #18
def train():
    logging.basicConfig(format="%(asctime)s:%(levelname)s:%(message)s", level=logging.INFO)
    log = 'Loading corpus...\n'
    sentences = word2vec.PathLineSentences(common.CORPUS_PATH)
    log = log + ' \n'.join(sentences.input_files)

    log = log + '\nStart training..., word2vec.Word2Vec(sentences, min_count=1)\n'
    model = word2vec.Word2Vec(sentences, min_count=1)

    # save the model
    if not os.path.exists(common.MODEL_PATH):
        os.mkdir(common.MODEL_PATH)
    model.save(common.MODEL_PATH + '/' + common.MODEL_FILE)
    log = log + 'Training finished..., the model is saved as ' + common.MODEL_FILE
    return log
Example #19
def word_vec(path):
    logging.basicConfig(format="%(asctime)s:%(levelname)s:%(message)s",
                        level=logging.INFO)
    # sentences = word2vec.LineSentence(r"D:\pycharm_project\knowledge_rule\datas\wiki\AA\wiki_corpus")
    sentences = word2vec.PathLineSentences(path)
    model = word2vec.Word2Vec(sentences,
                              size=200,
                              window=5,
                              min_count=5,
                              workers=multiprocessing.cpu_count())
    # save the model
    model.save("../model/20200928/corpus00.model")
    # save the word vectors
    model.wv.save_word2vec_format("../model/20200928/corpus00.vector",
                                  binary=False)
Example #20
 def doc2vec(self):
     sentences = word2vec.PathLineSentences("nlp/cut_words.txt")
     documents = [
         TaggedDocument(doc, [i]) for i, doc in enumerate(sentences)
     ]
     # for  i ,doc in  enumerate(sentences):
     #     ddd = TaggedDocument(doc,[i])
     #     print(ddd)
     model = Doc2Vec(documents,
                     vector_size=20,
                     window=2,
                     min_count=1,
                     workers=4)
     model.save("nlp/doc2vec.model")
     model = Doc2Vec.load("nlp/doc2vec.model")
     vector = model.infer_vector(["电器开关零部件及附件制造"])
     model.similar_by_vector(vector)
     pass
Example #21
def build_word2vec():
    tuples = [
        (train_path, 'train.txt'),
        (test_path, 'test.txt'),
        (dev_path, 'dev.txt'),
    ]
    for (path, name) in tuples:
        transform_only_sentences(path, name)
    sentences = word2vec.PathLineSentences(sentences_path)
    model = word2vec.Word2Vec(sentences,
                              size=embedding_size,
                              hs=1,
                              min_count=5)
    print(len(model.wv.vocab))
    model.wv.add(padding_letter, np.zeros(model.wv.vector_size))
    print(len(model.wv.vocab))
    model.wv.save_word2vec_format(word2vec_path)
    return model.wv
Example #22
    def generate_embedding(self, model_type):
        sentences = word2vec.PathLineSentences(self.file_name)

        # Training the corpus to generate the co-occurance matrix which is used in GloVe
        corpus = Corpus()  # Creating a corpus object
        corpus.fit(sentences, window=self.window) 

        # Training GloVe model
        glove = Glove(
            no_components=self.vec_dim, 
            learning_rate=self.learning_rate
        ) 
        glove.fit(
            corpus.matrix, epochs=self.epoch, 
            no_threads=self.no_threads, verbose=self.verbose
        )
        glove.add_dictionary(corpus.dictionary)

        return trans_vocab(glove.dictionary, glove.word_vectors)
 def embedding_train(self):
     print("word embedding training start")
     starttime = datetime.datetime.now()
     # collect every file in the folder
     sent = word2vec.PathLineSentences(self.save_path)
     # the concrete parameters are set in self.parms
     model = word2vec.Word2Vec(sentences=tqdm(sent), **self.parms)
     endtime = datetime.datetime.now()
     print('seconds:', (endtime - starttime).seconds)
     # save the model --- reload with word2vec.Word2Vec.load("\\name.model")
     model.save(self.embedding_path + '\\' + "word_embedding1.model")
     model.wv.save_word2vec_format(self.embedding_path + '\\' +
                                   "word_embedding1.txt",
                                   binary=False)
     # load the bin/txt file later with gensim.models.KeyedVectors.load_word2vec_format('/ .txt/bin', binary=False)
     model.wv.save_word2vec_format(self.embedding_path + '\\' +
                                   "word_embedding1.bin",
                                   binary=True)  # binary format to match the .bin extension
     print("## word embedding training finished ## the word embedding and model are saved in " +
           self.embedding_path)
     return model
Example #24
File: SIF.py Project: baiyigali/mrc
    def train_embeddings(self):
        sys.path.append('..')
        logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
        logging.root.setLevel(level=logging.INFO)
        self.logger.info("running %s" % ' '.join(sys.argv))

        model = word2vec.Word2Vec(word2vec.PathLineSentences(
            self.segmented_dir),
                                  size=300,
                                  min_count=2,
                                  workers=8,
                                  iter=15)
        w2v_dict = {}
        for word in model.wv.vocab:
            w2v_dict[word] = model[word]
        with open(os.path.join(self.prepared_dir, 'w2v_dic.pkl'), 'wb') as f:
            pkl.dump(w2v_dict, f)
        model.wv.save_word2vec_format(os.path.join(self.prepared_dir,
                                                   'w2v_model.bin'),
                                      binary=True)
def pre_train(brc_data, segmented_dir):
    # parser = argparse.ArgumentParser('Reading Comprehension on BaiduRC dataset')
    # path_settings = parser.add_argument_group('path settings')
    # path_settings.add_argument('--train_files', nargs='+',
    #                            default=['../data/trainset/search.train.json'],
    #                            help='list of files that contain the preprocessed train data')
    # path_settings.add_argument('--dev_files', nargs='+',
    #                            default=['../data/devset/search.dev.json'],
    #                            help='list of files that contain the preprocessed dev data')
    # path_settings.add_argument('--test_files', nargs='+',
    #                            default=['../data/testset/search.test.json'],
    #                            help='list of files that contain the preprocessed test data')
    # path_settings.add_argument('--segmented_dir', default='../data/segmented',
    #                            help='the dir to store segmented sentences')

    sys.path.append('..')
    # args = parser.parse_args()
    # for files in args.train_files + args.dev_files + args.test_files:
    #     json_to_sentence.load_data(files, args.segmented_dir)
    load_data(brc_data, segmented_dir)

    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    model = word2vec.Word2Vec(word2vec.PathLineSentences(segmented_dir),
                              size=300,
                              min_count=2,
                              workers=8,
                              iter=10)
    with open(os.path.join(segmented_dir, 'w2v_dic.data'),
              'w',
              encoding='utf-8') as f:
        for word in model.wv.vocab:
            f.write(word + ' ')
            f.write(' '.join(list(map(str, model[word]))))
            f.write('\n')
def Word2vec_train(file_path,
                   save_path,
                   dir_path=None,
                   save_name='word2vec_model',
                   replace_old=False,
                   model_size=300,
                   model_window=10,
                   model_min_count=5,
                   **kw):
    """
    batch train usage: set dir_path、save_name, file_path = None, save_path = None
    if Multiple files using dir_path
    """
    from gensim.models import word2vec
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # https://radimrehurek.com/gensim/models/word2vec.html
    if file_path is not None:
        # single file
        sentences = word2vec.LineSentence(file_path)
        model = word2vec.Word2Vec(sentences,
                                  size=model_size,
                                  window=model_window,
                                  min_count=model_min_count,
                                  **kw)
        # save the model for later use
        model.save(save_path)
    if dir_path is not None and file_path is None:
        # multiple files
        sentences = word2vec.PathLineSentences(dir_path)
        model = word2vec.Word2Vec(sentences,
                                  size=model_size,
                                  window=model_window,
                                  min_count=model_min_count,
                                  **kw)
        # save the model for later use
        model.save(os.path.join(dir_path, save_name))
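A usage sketch for Word2vec_train that follows its docstring; the paths and model name are placeholders.

# Single file: pass file_path and save_path, leave dir_path unset.
Word2vec_train(file_path='./data/segment/corpus.txt',
               save_path='./models/word2vec_single.model')

# Batch / multi-file: pass dir_path and save_name, and set file_path and
# save_path to None; the model is saved inside dir_path under save_name.
Word2vec_train(file_path=None,
               save_path=None,
               dir_path='./data/segment',
               save_name='word2vec_model')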
Example #27
def pre_train(segmented_dir, embed_size):
    """
    根据训语料训练词向量。或者可以考虑全部语料加上百度知道的数据集??
    :param brc_data:
    :param segmented_dir:
    :return:
    """

    sys.path.append('..')
    # Save the segmentation results of the raw data
    # save_seg_data(brc_data, segmented_dir)

    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # The corpus here has been preprocessed into multiple files; PathLineSentences supports multiple large files and is memory-friendly.
    # If the corpus is a single large file, use the LineSentence(file) class to load it instead; it is equally memory-friendly.
    # Default embed_size=300
    model = word2vec.Word2Vec(word2vec.PathLineSentences(segmented_dir),
                              size=embed_size,
                              min_count=2,
                              workers=12,
                              iter=10)
    # save the model
    model.save(os.path.join(segmented_dir, 'w2v_dic.data'))
    with open(os.path.join(segmented_dir, 'w2v_dic.data'),
              'w',
              encoding='utf-8') as f:
        for word in model.wv.vocab:
            f.write(word + ' ')
            f.write(' '.join(list(map(str, model[word]))))
            f.write('\n')
Example #28
def train_test():
    # [1] If there is only one file, use LineSentence to read it
    # segment_path='./data/segment/segment_0.txt'
    # sentences = word2vec.LineSentence(segment_path)

    # [2] If there are multiple files, use PathLineSentences to read the file list

    segment_dir = './data/segment'
    sentences = word2vec.PathLineSentences(segment_dir)

    # simple training run
    model = word2vec.Word2Vec(sentences, hs=1, min_count=1, window=3, size=100)
    print(model.wv.similarity('沙瑞金', '高育良'))
    # print(model.wv.similarity('李达康'.encode('utf-8'), '王大路'.encode('utf-8')))

    # For typical training, setting the following parameters is enough:
    word2vec_path = './models/word2Vec.model'
    model2 = train_wordVectors(sentences,
                               embedding_size=128,
                               window=5,
                               min_count=5)
    save_wordVectors(model2, word2vec_path)
    model2 = load_wordVectors(word2vec_path)
    print(model2.wv.similarity('沙瑞金', '高育良'))
Example #29
from gensim.models import word2vec

segment_folder = 'word2vec/three_kingdoms/segment'
sentences = word2vec.PathLineSentences(segment_folder)

model = word2vec.Word2Vec(sentences, size=100, window=3, min_count=3)
#model.wv.save_word2vec_format('file1.txt', binary=False)
#model.wv.similarity('刘备', '关羽')
print(model.wv.most_similar(positive=['曹操']))
print(model.wv.most_similar(positive=['曹操','刘备'],negative=['张飞']))
#[('孙权', 0.986218273639679), ('荆州', 0.9801917672157288), ('夫人', 0.9764574766159058), ('周瑜', 0.9756923913955688), ('今反', 0.9745445847511292), ('孔明', 0.9739490747451782), ('已', 0.9734069108963013), ('拜', 0.9730291366577148), ('拜谢', 0.9727320671081543), ('袁绍', 0.9722797870635986)]
#[('今', 0.9847639799118042), ('臣', 0.9846991300582886), ('吾', 0.9833989143371582), ('主公', 0.9833654165267944), ('丞相', 0.9818264842033386), ('某', 0.9800719022750854), ('问', 0.9799109697341919), ('此', 0.9775131940841675), ('告', 0.9753938317298889), ('卿', 0.9734485149383545)]
Example #30
from gensim.models import word2vec

word_file = './three_kingdoms/segment/seg_threekingdoms.txt'
sentences = word2vec.PathLineSentences(word_file)

model1 = word2vec.Word2Vec(sentences, size=128, window=3, min_count=2)

print(model1.wv.most_similar('曹操'))
print(model1.wv.most_similar(positive=['曹操', '刘备'], negative=['张飞']))
model1.save('./models/word2Vec_threekingdim.model')
'''
[('孙权', 0.9883049726486206), ('先主', 0.9877791404724121), ('回报', 0.9873332977294922), 
('夫人', 0.9860264658927917), ('关公', 0.9857215881347656), ('孔明', 0.9843080043792725), 
('荆州', 0.983728289604187), ('周瑜', 0.9833334684371948), ('往', 0.9825193285942078), ('又', 0.9818975329399109)]

[('丞相', 0.9887984395027161), ('臣', 0.9875719547271729), ('某', 0.9866517782211304), 
('此', 0.9865485429763794), ('大叫', 0.9859899282455444), ('皆曰', 0.9858393669128418), 
('朕', 0.9830409288406372), ('书略', 0.9822883605957031), ('乃曰', 0.9815787076950073), ('既', 0.9811386466026306)]
'''