Example #1
 def train_static(self):
     if os.path.isfile(os.path.join(self.static_folder, "static.model")):
         self.compass = Word2Vec.load(os.path.join(self.static_folder, "static.model"))
         print("Static model loaded.")
     else:
         files = PathLineSentences(self.slices_folder + '/' + self.slice_type)
         files.input_files = [file for file in files.input_files if not os.path.basename(file).startswith('.')]
         print("Training static embeddings.")
         self.compass = self.train_model(files)
         self.compass.save(os.path.join(self.static_folder, "static.model"))
     global gvocab
     gvocab = self.compass.wv.vocab
Example #2
def main():
    """
    Preprocess corpus (remove low-frequency words, etc.).
    """

    # Get the arguments
    args = docopt("""Preprocess corpus (remove low-frequency words, etc.).

    Usage:
        preprocess.py <corpDir> <outPath> <minFreq>
        
    Arguments:
       
        <corpDir> = path to corpus or corpus directory (iterates through files)
        <outPath> = output path
        <minFreq> = minimum frequency threshold
        
    """)

    corpDir = args['<corpDir>']
    outPath = args['<outPath>']
    minFreq = int(args['<minFreq>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Get sentence iterator
    sentences = PathLineSentences(corpDir)

    # Initialize frequency dictionary
    freqs = defaultdict(int)

    # Iterate over sentences and words
    for sentence in sentences:
        for word in sentence:
            freqs[word] = freqs[word] + 1

    # Get sentence iterator
    sentences = PathLineSentences(corpDir)

    # Write output
    with open(outPath, 'w', encoding='utf-8') as f_out:
        for sentence in sentences:
            out_sentence = [
                word for word in sentence if freqs[word] >= minFreq
            ]
            if len(out_sentence) > 1:
                f_out.write(' '.join(out_sentence) + '\n')

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Example #3
def train(in_dir,
          out_file,
          negative=5,
          n_workers=4,
          window=5,
          size=128,
          min_count=10,
          nr_iter=2):
    w2v_model = Word2Vec(size=size,
                         window=window,
                         min_count=min_count,
                         workers=n_workers,
                         sample=1e-5,
                         negative=negative,
                         iter=nr_iter)
    sentences = PathLineSentences(in_dir)
    print("Building the vocabulary...")
    w2v_model.build_vocab(sentences)
    print("Training the model...")
    w2v_model.train(sentences,
                    total_examples=w2v_model.corpus_count,
                    epochs=w2v_model.iter)
    print("Creating the sense2vec model...")
    vector_map = VectorMap(size)
    for string in w2v_model.wv.vocab:
        vocab = w2v_model.wv.vocab[string]
        freq, idx = vocab.count, vocab.index
        if freq < min_count:
            continue
        vector = w2v_model.wv.vectors[idx]
        vector_map.borrow(string, freq, vector)
    print("Saving the model...")
    vector_map.save(out_file)
    print("Saved model to file: ", out_file)
Example #4
def train_wd2vec(config):
    logging.info('Starting word vector training...')
    t1 = time.time()
    word2vec_model = Word2Vec(PathLineSentences(config.corpus_path),
                              size=config.embedding_size,
                              window=config.win_size,
                              min_count=config.min_count,
                              sg=config.sg,
                              workers=config.cpu_count,
                              iter=config.n_iter)
    '''
    Training method 2:
        word2vec_model = Word2Vec(size=EMBEDDING_SIZE,
                                  window=WIN_SIZE,
                                  min_count=MIN_COUNT,
                                  sg=0,
                                  workers=CPU_COUNT,
                                  iter=N_ITER)
        word2vec_model.build_vocab(sentences)
        word2vec_model.train(sentences, total_examples=word2vec_model.corpus_count, epochs=word2vec_model.iter)        
    '''

    t2 = time.time()
    logging.info('Word vector training finished! Total time: {} min'.format((t2 - t1) / 60.0))

    word2vec_model.save(config.model_save_path)  # save the word vector model
    logging.info('Word vector model saved.')
Example #5
def get_poi_model(path1,path2,model_name):

    sentences = PathLineSentences(path1)
    #model = Word2Vec(sentences, size=100, iter=10, min_count=20)
    model = Word2Vec(sentences, sg=1, size=10, window=5, min_count=0, hs=1, negative=3, sample=0.001)
    '''
    · sentences: can be a list; for large corpora it is recommended to build it with BrownCorpus, Text8Corpus or LineSentence.
    · sg: training algorithm; the default 0 means CBOW, sg=1 means skip-gram.
    · size: dimensionality of the feature vectors, default 100. Larger sizes need more training data but give better results; values from tens to a few hundred are recommended.
    · window: maximum distance between the current and the predicted word within a sentence; 3 means looking at 3-b words before the target word and b words after it (b is drawn randomly from 0-3).
    · alpha: the learning rate.
    · seed: seed for the random number generator, related to the initialisation of the word vectors.
    · min_count: truncates the dictionary; words appearing fewer than min_count times are discarded. Default is 5.
    · max_vocab_size: RAM limit while building the vocabulary. If the number of unique words exceeds this, the least frequent ones are pruned. Roughly 1 GB of RAM per ten million words. None means no limit.
    · sample: threshold for random down-sampling of high-frequency words, default 1e-3, useful range (0, 1e-5).
    · workers: number of worker threads used for training. Only effective with Cython installed; otherwise only a single core is used.
    · hs: if 1, hierarchical softmax is used; if 0 (default), negative sampling is used.
    · negative: if > 0, negative sampling is used, and the value sets how many noise words are drawn.
    · cbow_mean: if 0, use the sum of the context word vectors; if 1 (default), use the mean. Only applies when CBOW is used.
    · hashfxn: hash function used to initialise the weights. Defaults to Python's hash function.
    · iter: number of iterations (epochs), default 5.
    · trim_rule: vocabulary trimming rule specifying which words to keep and which to discard. Can be None (min_count is used) or a callable returning utils.RULE_DISCARD, utils.RULE_KEEP or utils.RULE_DEFAULT.
    · sorted_vocab: if 1 (default), sort the vocabulary by descending frequency before assigning word indices.
    · batch_words: number of words passed to worker threads per batch, default 10000.

    '''
    if not os.path.exists(path2):  # check whether the given path exists
        os.makedirs(path2)
    save_model_name = path2 + os.path.sep + model_name + '.model'
    model.save(save_model_name)
    model.wv.save_word2vec_format(save_model_name + ".bin", binary=True)
    print('model done')
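The example above saves the trained model twice: once in gensim's native format (a full model that can be trained further) and once in word2vec binary format (vectors only). A minimal reloading sketch, with hypothetical paths built the same way as save_model_name above:

from gensim.models import Word2Vec, KeyedVectors

# Native format: the full model, usable for further training.
model = Word2Vec.load('poi_models/poi.model')  # hypothetical path

# word2vec binary format: query-only keyed vectors.
wv = KeyedVectors.load_word2vec_format('poi_models/poi.model.bin', binary=True)
print(wv.most_similar('park', topn=5))  # hypothetical POI token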
Example #6
def train_model(training_dir_path):
    logger = logging.getLogger("w2v_logger")
    sentences = PathLineSentences(training_dir_path)

    #logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)

    logging.basicConfig(format="%(asctime)s: %(levelname)s: %(message)s")
    logging.root.setLevel(level=logging.INFO)

    params = {
        "size": 100,
        "window": 10,
        "min_count": 10,
        "workers": max(1,
                       multiprocessing.cpu_count() - 2),
        "sample": 1E-3
    }

    logger.info("training the model")

    word2vec = Word2Vec(sentences, **params)

    logger.info("saving the model")
    #Specify folder name
    word2vec.save("../models/model_n/result.model")
Example #7
def create_initial_model():
    """
    Does the (very slow) initial work of reading over the entire corpus, generating the vocabulary,
    importing relevant word vectors from the GoogleNews embedding, and then saving this initial
    model so we don't have to do it again.
    """
    sentences = PathLineSentences(HN_MONTHLY_CORPUS_DIR)
    model = Word2Vec(size=300, sg=1, hs=1,
                     negative=0)  # Hey! Is no negative sampling a problem?
    # Hopefully this just means a bad non-optimization
    # rather than invalid results. Check the gensim code.
    # I believe that not using negative sampling just means
    # I'm using the (very) inefficient approach of computing
    # the softmax over all non-context words at each iteration.
    # So not invalid, just inefficient, probably contributing to
    # worse vectors than I would have otherwise.
    print("building vocabulary...")
    model.build_vocab(sentences)
    print("intersecting pretrained word vectors...")
    model.intersect_word2vec_format(GOOGLE_NEWS_EMBEDDING_FILE,
                                    lockf=1.0,
                                    binary=True)
    print("saving...")
    model.save(INITIAL_MODEL)
    return model
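The comment above weighs hierarchical softmax (hs=1, negative=0) against negative sampling. A minimal side-by-side sketch of the two configurations, using the same pre-4.0 gensim keyword names as this example:

from gensim.models import Word2Vec

# Hierarchical softmax only, as in create_initial_model(): no noise words, slower updates.
hs_model = Word2Vec(size=300, sg=1, hs=1, negative=0)

# Negative sampling instead: approximates the objective with a few noise words per update,
# usually much faster on large vocabularies.
ns_model = Word2Vec(size=300, sg=1, hs=0, negative=5)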
Example #8
def node2vec(corpus_dir, save_wv):
    print("load corpus")
    walks = PathLineSentences(corpus_dir)
    print("train word2vec model")
    model = Word2Vec(walks, size=64)
    print("save model.wv")
    model.wv.save(save_wv)
Example #9
def train_model():
    # logging setup
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    # check and process input arguments
    # if len(sys.argv) < 4:
    #     print(globals()['__doc__'] % locals())
    #     sys.exit(1)
    # input_dir, outp1, outp2 = sys.argv[1:4]
    # input is the input corpus, outp1 the output model, outp2 the model in word2vec vector format
    input_dir = 'data/douluo_cut_word.txt'
    outp1 = 'model/douluo.model'
    outp2 = 'model/douluo.vector'
    # train the model
    # input corpus directory: PathLineSentences(input_dir)
    # embedding size: 256, window size: 10, drop words occurring fewer than 5 times, multi-threaded, 10 iterations
    model = Word2Vec(PathLineSentences(input_dir),
                     size=256,
                     window=10,
                     min_count=5,
                     workers=multiprocessing.cpu_count(),
                     iter=10)
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)
Example #10
 def __len__(self):
     doc_count = 0
     for sub_directory in self.wiki_path.iterdir():
         if sub_directory.is_dir():
             for _ in PathLineSentences(str(sub_directory)):
                 doc_count += 1
     return doc_count
Example #11
def train_embeddings(): 
    inpath = LOGS + 'word2vec_train_data/'
    sentences = PathLineSentences(inpath)
    epoch_logger = EpochLogger()
    print("Starting word2vec....")
    model = Word2Vec(sentences, size=100, window=5, min_count=5, 
          workers=multiprocessing.cpu_count(), seed=0, callbacks=[epoch_logger])
    model.save(LOGS + 'fiction_word2vec_model')
Example #12
def main():

    parser = argparse.ArgumentParser(
        description='Get frequencies from corpus.')
    parser.add_argument(
        '--data_path',
        type=str,
        required=True,
        help='Path to corpus or corpus directory (iterates through files).')
    parser.add_argument('--output_path',
                        type=str,
                        required=True,
                        help='Output path for result file.')
    parser.add_argument('--norm',
                        action='store_true',
                        help='Normalize frequency by total corpus frequency.')
    parser.add_argument('--lower',
                        action='store_true',
                        help='Apply lowercasing to the corpus.')
    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Get sentence iterator
    sentences = PathLineSentences(args.data_path)

    # Initialize frequency dictionary
    freqs = defaultdict(int)

    # Iterate over sentences and words
    corpusSize = 0
    for sentence in sentences:
        for word in sentence:
            corpusSize += 1
            if args.lower:
                freqs[word.lower()] = freqs[word.lower()] + 1
            else:
                freqs[word] = freqs[word] + 1

    freqs = dict(sorted(freqs.items(), key=lambda item: item[1], reverse=True))

    # Write frequency scores
    with open(args.output_path, 'w', encoding='utf-8') as f_out:
        for i, word in enumerate(freqs, start=1):
            if args.norm:
                # Normalize by total corpus frequency
                freqs[word] = float(freqs[word]) / corpusSize
            f_out.write('\t'.join((word, str(freqs[word]), str(i))) + '\n')

    logging.info('tokens: %d' % (corpusSize))
    logging.info('types: %d' % (len(freqs.keys())))
    logging.info("--- %s seconds ---" % (time.time() - start_time))
Example #13
def trainVector(inputpath="news_articles_cut.txt",
                outpath="word2vec_news2.model"):
    model = word2vec.Word2Vec(PathLineSentences(inputpath),
                              size=100,
                              window=5,
                              min_count=2,
                              workers=10)
    # tsne_plot(model)
    model.save(outpath)
Example #14
def main():
    # output directory for preprocessed files
    data_dir, meta_file = make_data_dir(args.target)

    # read and process corpus (args.raw)
    for orig_file in tqdm(os.listdir(args.raw)):

        # create filename for preprocessed orig_file
        prep_file = data_dir + str(
            os.path.splitext(orig_file)[0]) + "_prep.txt"

        # read orig_file and preprocess
        with open(os.path.join(args.raw, orig_file), 'r') as inputfile:
            # sents = list of preprocessed sentences
            sents = process_text(
                inputfile,
                args.lower,
                args.longs,  # args.lemmatization,
                args.ocr,
                args.umlauts,
                args.punctuation,
                args.numbers,
                args.stopwords)

            # save in outputfile, one sentence per line
            with open(prep_file, 'w+') as outputfile:
                for s in sents:
                    outputfile.write(s + '\n')
                # logging.info('preprocessing of {} finished'.format(orig_file))

    if args.bigrams:
        sentences = PathLineSentences(data_dir)
        phrases = Phrases(sentences, min_count=5, threshold=10)
        # print(list(phrases[sentences]))

        filenames = os.listdir(data_dir)
        for file in filenames:
            path = os.path.join(data_dir, file)
            with open(path, "r+") as f:
                tokenized_sentences = phrases[LineSentence(path)]
                f.seek(0)
                for s in tokenized_sentences:
                    f.write('{}\n'.format(' '.join(s)))
                f.truncate()

    # save args to csv as metadata for preprocessed corpus
    with open(meta_file, 'w+') as metadata:
        args_dict = args.__dict__
        args_dict['corpus_dir'] = data_dir
        writer = csv.writer(metadata)
        writer.writerow(args_dict.keys())
        writer.writerow(args_dict.values())
        logging.info(
            'params used for preprocessing saved at {:s}'.format(meta_file))

    logging.info('preprocessed files saved at {:s}'.format(data_dir))
Example #15
 def train(corpus, modelpath):
     if not os.path.isdir(corpus):
         raise ValueError('input should be a directory path')
     sentences = PathLineSentences(corpus)
     model = Word2Vec(iter=3)
     model.build_vocab(sentences)
     model.train(sentences,
                 total_examples=model.corpus_count,
                 epochs=model.iter)
     model.save(modelpath)
Example #16
def word_vec():
    logger.info('Word to vec')
    model = word2vec.Word2Vec(PathLineSentences(seg_file),
                              sg=1,
                              size=300,
                              window=5,
                              min_count=10,
                              sample=1e-4,
                              workers=multiprocessing.cpu_count())
    model.wv.save_word2vec_format(word_vec_file, binary=False)
Example #17
    def train_compass(self, compass_text, overwrite=False):
        compass_exists = os.path.isfile(
            os.path.join(self.opath, "compass.model"))
        if compass_exists and overwrite is False:
            self.compass = Word2Vec.load(
                os.path.join(self.opath, "compass.model"))
            print("Compass loaded from file.")
        else:
            sentences = PathLineSentences(compass_text)
            sentences.input_files = [
                s for s in sentences.input_files
                if not os.path.basename(s).startswith('.')
            ]
            print("Training the compass.")
            if compass_exists:
                print("Compass will be overwritten after training")
            self.compass = self.train_model(sentences)
            self.compass.save(os.path.join(self.opath, "compass.model"))

        self.gvocab = self.compass.wv.vocab
Example #18
 def train_w2v(self):
     logger.info('Training w2v model...')
     # skip-gram
     model = Word2Vec(PathLineSentences(self.w2v_corpus_path),
                      sg=1,
                      size=self.emb_dim,
                      window=5,
                      min_count=5,
                      workers=max(1, multiprocessing.cpu_count() // 12))
     model.save(self.w2v_model_path)
     model.wv.save_word2vec_format(self.w2v_vector_path, binary=False)
Example #19
 def fit(self, input_dir):
     '''
     input_dir is the directory containing all the tokenized word files
     '''
     # embedding size: 256, window size: 10, drop words occurring fewer than 5 times, multi-threaded, 10 iterations
     model = Word2Vec(PathLineSentences(input_dir),
                      size=256,
                      window=10,
                      min_count=5,
                      workers=multiprocessing.cpu_count(),
                      iter=10)
     self.model = model
Example #20
def train_word_embedding_model():
    """
    Train the WordEmbeddingModel
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    sentences = PathLineSentences(PATH_LINE_SENTENCES_DIR_PATH)
    model = word2vec.Word2Vec(sentences,
                              size=embedding_size,
                              window=embedding_window,
                              min_count=embedding_min_count,
                              sg=embedding_sg)
    model.save(embedding_model_path)
Example #21
def extract_documents(corpus_path):
    sentence_corpus = PathLineSentences(corpus_path)
    
    documents = [
        TaggedDocument(doc, [Path(tag).name])
        for tag, doc in zip(sentence_corpus.input_files, sentence_corpus)
    ]

    ## shuffle modifies in place
    # return shuffle(documents)
    ## shuffle a copy using sample
    return sample(documents, len(documents))
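The shuffled TaggedDocument list produced here is the usual input for gensim's Doc2Vec. A minimal downstream sketch, with hypothetical paths and parameters:

from gensim.models.doc2vec import Doc2Vec

documents = extract_documents('corpus_dir')  # hypothetical corpus directory
model = Doc2Vec(documents, vector_size=100, min_count=2, epochs=20, workers=4)
model.save('doc2vec.model')  # hypothetical output path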
Example #22
def word2vec():
    print('Start...')
    model = Word2Vec(PathLineSentences('word.txt'),
                     size=50,
                     window=5,
                     min_count=5,
                     workers=multiprocessing.cpu_count(),
                     negative=5)
    model.save('gensim_wv_test.model')
    model.wv.save_word2vec_format('vector_gensim', binary=False)
    print("Finished!")
    return model
Example #23
def train(input_dir):
    gc.collect()
    #data = PathLineSentences(input_dir)
    model = gensim.models.Word2Vec(PathLineSentences(input_dir),workers=multiprocessing.cpu_count() * 2, sg=1)
    '''
    for i in range(len(data))[::100]:
        if i==0:
            continue
        tte = model.corpus_count + len(data[i:i+100])
        model.train(data[i:i+100],total_examples=tte,epochs=model.epochs)
    '''

    model.save("word2Vector.model")
    print('ok')
Example #24
def create_model():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    input_dir = '/data/token'
    outp1 = '/model/w2v/word level/GeoW2V.model'
    outp2 = '/model/w2v/word level/word2vec.bin'
    fileNames = os.listdir(input_dir)
    model = Word2Vec(PathLineSentences(input_dir),
                     size=256, window=10, min_count=5,
                     workers=multiprocessing.cpu_count(), iter=10)
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)
Example #25
def train_word2vec_model(path):
    """
    Train word vectors
    """
    if os.path.exists(path):
        print('Train Word2Vec model using all files under ', path)
        model = Word2Vec(PathLineSentences(path),
                         min_count=10,
                         sg=0,
                         size=128,
                         window=20,
                         workers=4)  # CBOW model
        print(model.most_similar('说', topn=20))
    else:
        print(path, ' does not exist')
    return model
Example #26
def main(args):
    # load a config file
    with open(args.param_path) as f:
        param = yaml.safe_load(f)['w2v']

    sentences = PathLineSentences(param['input'])

    model = Word2Vec(sentences,
                     size=param['size'],
                     window=param['window'],
                     min_count=param['min_count'],
                     sg=param['sg'],
                     negative=param['negative'],
                     iter=param['iter'],
                     workers=param['workers'])

    model.save(param['output'])
Example #27
    def train(self, *args, **kwargs):
        super(FastTextModel, self).train(
            # Path to a corpus file in LineSentence format
            PathLineSentences(args[0]),

            # Count of sentences.
            total_examples=self.corpus_count,

            # Count of raw words in sentences.
            total_words=self.corpus_total_words,

            # Count of words already trained. Set this to 0 for
            # the usual case of training on all words in sentences.
            word_count=0,

            # Number of iterations (epochs) over the corpus.
            epochs=self.epochs)
Example #28
def test_gensim():
    sents = [
        'I am a good student'.split(), 'Good good study day day up'.split()
    ]
    print(sents)
    sents2 = "I am a good student Good good study day day up"
    # model = word2vec.Word2Vec(sents2, size=100, window=5, min_count=2, workers=10)
    model = word2vec.Word2Vec(PathLineSentences("news_articles_cut.txt"),
                              size=100,
                              window=5,
                              min_count=5,
                              workers=10)
    # print the word vector for '此前'
    print(model.wv.word_vec('此前'))
    # print the 2 words most similar to '此前'
    print(model.wv.most_similar('此前', topn=2))
    # save the model to a file
    model.save('w2v.model')
Example #29
    def init(args: Namespace):
        """Convenient factory method for initialising the model
        using the command-line arguments. This class acts as a
        shallow wrapper around the
        :class:`~gensim.models.fasttext.FastText` class.

        Arguments:
            args: Parsed command-line arguments.
        Returns:
            model: The initialised model.
        """
        init_kwargs = {
            # Dimensionality of the word vectors.
            'size': args.dim,

            # The maximum distance between the current
            # and predicted word within a sentence.
            'window': args.window_size,

            # The model ignores all words with total
            # frequency lower than this.
            'min_count': args.min_count,

            # Train using skip-gram if `sg=1`, else use CBoW.
            'sg': 1 if args.sg else 0,

            # Number of iterations (epochs) over the corpus.
            'iter': args.epochs,

            # Use these many worker threads to train the model
            # (=faster training with multicore machines).
            'workers': min(32, cpu_count()),

            # sort the vocabulary by descending frequency
            # before assigning word indices
            'sorted_vocab': 1,

            # Target size (in words) for batches of examples passed to worker threads.
            'batch_words': 6000
        }
        model = FastTextModel(**init_kwargs)
        model.build_vocab(PathLineSentences(args.source))

        return model
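A minimal sketch of how this factory might be called, assuming init is exposed as a static or class method on FastTextModel and using a hypothetical Namespace that carries only the attributes read above:

from argparse import Namespace

args = Namespace(dim=300, window_size=5, min_count=5, sg=True,
                 epochs=5, source='corpus_dir')  # hypothetical values
model = FastTextModel.init(args)  # vocabulary is built from args.source during init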
Example #30
def train_model_on_top(model_path, more_sentences_path):
    logger = logging.getLogger("w2v_logger")
    logging.basicConfig(format="%(asctime)s: %(levelname)s: %(message)s")
    logging.root.setLevel(level=logging.INFO)

    model = gensim.models.Word2Vec.load(model_path)
    logger.info("loaded the model")

    more_sentences = PathLineSentences(more_sentences_path)
    logger.info("loaded the sentences")

    # setting update to True allows the vocabulary to accept new words
    model.build_vocab(more_sentences, update=True)

    model.train(more_sentences,
                total_examples=model.corpus_count,
                epochs=model.iter)

    model.save(model_path + "-retrained_train_test")