Example #1
0
 def __iter__(self):
     """Yield one processed text per record loaded for ``self.dirname``.

     Records come from ``json_dict_from_file(self.dirname, "content")``.
     For each record, its ``'content'`` value is passed through
     ``clean_comment`` and then through
     ``delete_stop_words(..., return_list=True)``; whatever that call
     returns is yielded unchanged (presumably a list of tokens, given
     ``return_list=True`` -- confirm against the helper).
     """
     for record in json_dict_from_file(self.dirname, "content"):
         cleaned_text = clean_comment(record['content'])
         yield delete_stop_words(cleaned_text, return_list=True)
Example #2
0
 def __iter__(self):
     """Lazily produce the stop-word-filtered content of every loaded record.

     The records are obtained from
     ``json_dict_from_file(self.dirname, "content")``. Each record is
     indexed by ``'content'``, cleaned with ``clean_comment`` and filtered
     with ``delete_stop_words(..., return_list=True)``. The filtered value
     is yielded as-is (presumably a list of words because of
     ``return_list=True`` -- verify against the helper).
     """
     records = json_dict_from_file(self.dirname, "content")
     for entry in records:
         raw_text = entry['content']
         tokens = delete_stop_words(clean_comment(raw_text), return_list=True)
         yield tokens
Example #3
0
    # Hyper-parameters forwarded to the Word2Vec constructor in the training
    # branch below (presumably gensim's Word2Vec -- the import is not visible here).
    feature_size = 500  # passed as size=
    content_window = 10  # passed as window=
    freq_min_count = 4  # passed as min_count=
    # Not referenced in the visible lines: workers= is set from
    # multiprocessing.cpu_count() instead.
    threads_num = 8
    # Original note (translated from Chinese): "best: sampling uses the
    # hierarchical softmax method (negative sampling, good for common words),
    # not the negative sampling method (good for rare words)."
    # NOTE(review): this value is passed as negative= below; per the gensim
    # docs a value > 0 turns negative sampling ON, so the note appears to
    # contradict the code -- confirm which behaviour is intended.
    negative = 6
    t_iter = 60  # passed as iter=

    print("word2vec...")
    # The timer starts before the load-or-train branch, but the elapsed time
    # is only reported in the training branch.
    tic = time.time()
    # Reuse a previously saved model when a file exists at save_model
    # (save_model, save_model2 and file_name are defined outside this view).
    if os.path.isfile(save_model):
        model = Word2Vec.load(save_model)
        print(model.vocab)
        print("Loaded word2vec model")
    else:
        # NOTE(review): the loader's return value is handed to Word2Vec as-is;
        # presumably it is an iterable of token lists -- confirm.
        s_list = json_dict_from_file(file_name,"content")
        model = Word2Vec(s_list, size=feature_size, window=content_window, iter=t_iter, min_count=freq_min_count,negative=negative, workers=multiprocessing.cpu_count())
        toc = time.time()
        print("Word2vec completed! Elapsed time is %s." % (toc-tic))
        # Persist the trained model twice: with save() to save_model, and with
        # save_word2vec_format(binary=False) to save_model2.
        model.save(save_model)
        model.save_word2vec_format(save_model2, binary=False)
        print("Word2vec Saved!")





    """
    品牌维度
    """
    # brand =[u'性能',
Example #4
0
    # Hyper-parameters forwarded to the Word2Vec constructor in the training
    # branch below (presumably gensim's Word2Vec -- the import is not visible here).
    feature_size = 500  # passed as size=
    content_window = 10  # passed as window=
    freq_min_count = 4  # passed as min_count=
    # Not referenced in the visible lines: workers= is set from
    # multiprocessing.cpu_count() instead.
    threads_num = 8
    # Original note (translated from Chinese): "best: sampling uses the
    # hierarchical softmax method (negative sampling, good for common words),
    # not the negative sampling method (good for rare words)."
    # NOTE(review): this value is passed as negative= below; per the gensim
    # docs a value > 0 turns negative sampling ON, so the note appears to
    # contradict the code -- confirm which behaviour is intended.
    negative = 6
    t_iter = 60  # passed as iter=

    print("word2vec...")
    # The timer starts before the load-or-train branch, but the elapsed time
    # is only reported in the training branch.
    tic = time.time()
    # Reuse a previously saved model when a file exists at save_model
    # (save_model, save_model2 and file_name are defined outside this view).
    if os.path.isfile(save_model):
        model = Word2Vec.load(save_model)
        print(model.vocab)
        print("Loaded word2vec model")
    else:
        # NOTE(review): the loader's return value is handed to Word2Vec as-is;
        # presumably it is an iterable of token lists -- confirm.
        s_list = json_dict_from_file(file_name, "content")
        model = Word2Vec(s_list,
                         size=feature_size,
                         window=content_window,
                         iter=t_iter,
                         min_count=freq_min_count,
                         negative=negative,
                         workers=multiprocessing.cpu_count())
        toc = time.time()
        print("Word2vec completed! Elapsed time is %s." % (toc - tic))
        # Persist the trained model twice: with save() to save_model, and with
        # save_word2vec_format(binary=False) to save_model2.
        model.save(save_model)
        model.save_word2vec_format(save_model2, binary=False)
        print("Word2vec Saved!")
    """
    品牌维度
    """