def run(args):
    logger.info('loading data from disk')
    processing = Preprocessor(min_len=2,
                              stopwords_path=config['stopword_path'])
    examples = []
    with open(config['data_path'], 'r') as fr:
        for i, line in enumerate(fr):
            # the first line of the data file is the header row
            if i == 0:
                continue
            line = line.strip("\n")
            line = processing(line)
            if line:
                examples.append(line.split())
    logger.info("initializing emnedding model")
    word2vec_model = gensim_word2vec.Word2Vec(
        sg=1,
        iter=10,
        size=args.embedd_dim,
        window=args.window_size,
        min_count=args.min_freq,
        save_path=config['gensim_embedding_path'],
        num_workers=args.num_workers,
        seed=args.seed)
    word2vec_model.train_w2v(examples)
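
Note: gensim_word2vec.Word2Vec above appears to be a project-specific wrapper (train_w2v and save_path are not part of gensim's public API). For reference, a minimal sketch of the equivalent training with stock gensim >= 4.0, where the size and iter parameters were renamed to vector_size and epochs; examples is the list of token lists built above:

from gensim.models import Word2Vec

model = Word2Vec(sentences=examples,
                 sg=1,              # 1 = skip-gram, 0 = CBOW
                 vector_size=300,   # embedding dimension
                 window=5,
                 min_count=5,
                 workers=10,
                 seed=2018,
                 epochs=10)
model.save('word2vec.model')                  # full model; training can resume
model.wv.save_word2vec_format('vectors.txt')  # plain-text word vectors only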
Example 2
def main():
    print('Model is saved at ' + str(config['pytorch_embedding_path']))
    parser = argparse.ArgumentParser(
        description='PyTorch Word2Vec model training')
    parser.add_argument("--model", type=str, default='skip_gram')
    parser.add_argument("--task", type=str, default='training word vector')
    parser.add_argument('--seed',
                        default=2018,
                        type=int,
                        help='Seed for initializing training.')
    parser.add_argument('--resume',
                        action='store_true',
                        help='Resume training from a saved checkpoint.')
    parser.add_argument('--embedd_dim', default=300, type=int)
    parser.add_argument('--epochs', default=6, type=int)
    parser.add_argument('--window_size', default=5, type=int)
    parser.add_argument('--n_gpus', default='0', type=str)
    parser.add_argument('--min_freq', default=20, type=int)
    parser.add_argument('--sample', default=1e-3, type=float)
    parser.add_argument('--negative_sample_num', default=5, type=int)
    parser.add_argument('--learning_rate', default=0.025, type=float)
    parser.add_argument('--weight_decay', default=5e-4, type=float)
    parser.add_argument('--vocab_size', default=30000000, type=int)
    args = parser.parse_args()
    init_logger(log_file=config['log_dir'] / (args.model + ".log"))
    logger.info(f"seed is {args.seed}")
    seed_everything(seed=args.seed)
    run(args)
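
A note on the --resume flag above: argparse's type=bool does not parse booleans, because bool('False') is True (any non-empty string is truthy), which is why action='store_true' is the idiomatic boolean flag. A quick self-contained demonstration:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--broken', type=bool, default=False)  # bug: strings are truthy
parser.add_argument('--resume', action='store_true')       # proper boolean flag
args = parser.parse_args(['--broken', 'False', '--resume'])
print(args.broken)  # True -- bool('False') is True
print(args.resume)  # True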
def main():
    parser = argparse.ArgumentParser(
        description='Gensim Word2Vec model training')
    parser.add_argument("--model", type=str, default='gensim_word2vec')
    parser.add_argument("--task", type=str, default='training word vector')
    parser.add_argument('--seed',
                        default=2018,
                        type=int,
                        help='Seed for initializing training.')
    parser.add_argument('--resume',
                        action='store_true',
                        help='Resume training from a saved checkpoint.')
    parser.add_argument('--embedd_dim', default=300, type=int)
    parser.add_argument('--epochs', default=6, type=int)
    parser.add_argument('--window_size', default=5, type=int)
    parser.add_argument('--n_gpu', default='0', type=str)
    parser.add_argument('--min_freq', default=5, type=int)
    parser.add_argument('--sample', default=1e-3, type=float)
    parser.add_argument('--negative_sample_num', default=5, type=int)
    parser.add_argument('--learning_rate', default=0.025, type=float)
    parser.add_argument('--weight_decay', default=5e-4, type=float)
    parser.add_argument('--vocab_size', default=30000000, type=int)
    parser.add_argument('--num_workers', default=10, type=int)
    args = parser.parse_args()
    init_logger(log_file=config['log_dir'] / (args.model + ".log"))
    logger.info("seed is %d" % args['seed'])
    seed_everything(seed=args['seed'])
    run(args)
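
seed_everything is a project helper; a typical implementation of the pattern (a sketch, not necessarily this project's exact version) seeds every RNG source in one call:

import os
import random

import numpy as np
import torch

def seed_everything(seed=2018):
    # Seed Python's RNG, the hash function, NumPy, and PyTorch (CPU and all GPUs)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)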
Example 4
def run(args):
    # **************************** Load dataset ****************************
    logger.info('loading training data from disk')
    train_dataset = DataLoader(skip_header=False,
                               negative_num=args.negative_sample_num,
                               window_size=args.window_size,
                               data_path=config['data_path'],
                               vocab_path=config['vocab_path'],
                               vocab_size=args.vocab_size,
                               min_freq=args.min_freq,
                               shuffle=True,
                               seed=args.seed,
                               sample=args.sample)

    # **************************** Model and optimizer ***********************
    logger.info("initializing model")
    model = SkipGram(embedding_dim=args.embedd_dim,
                     vocab_size=len(train_dataset.vocab))
    optimizer = optim.SGD(params=model.parameters(), lr=args.learning_rate)

    # **************************** callbacks ***********************
    logger.info("initializing callbacks")
    train_monitor = TrainingMonitor(file_dir=config['figure_dir'],
                                    arch=args.model)
    lr_scheduler = StepLR(optimizer=optimizer,
                          lr=args.learning_rate,
                          epochs=args.epochs)

    # **************************** training model ***********************
    logger.info('training model...')
    trainer = Trainer(model=model,
                      vocab=train_dataset.vocab,
                      optimizer=optimizer,
                      epochs=args.epochs,
                      logger=logger,
                      training_monitor=train_monitor,
                      lr_scheduler=lr_scheduler,
                      n_gpu=args.n_gpus,
                      model_save_path=config['model_save_path'],
                      vector_save_path=config['pytorch_embedding_path'])
    trainer.train(train_data=train_dataset)
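
SkipGram, DataLoader, TrainingMonitor, StepLR, and Trainer here are project modules. For orientation, a minimal sketch of a skip-gram model with negative sampling in the conventional formulation (an assumption about what SkipGram computes, not this project's exact code):

import torch
import torch.nn as nn
import torch.nn.functional as F

class SkipGramNS(nn.Module):
    # Skip-gram with negative sampling: separate center and context tables.
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)   # center words
        self.out_embed = nn.Embedding(vocab_size, embedding_dim)  # context words

    def forward(self, center, context, negatives):
        # center: (batch,)  context: (batch,)  negatives: (batch, k)
        v = self.in_embed(center)                                # (batch, dim)
        u_pos = self.out_embed(context)                          # (batch, dim)
        u_neg = self.out_embed(negatives)                        # (batch, k, dim)
        pos_score = (v * u_pos).sum(dim=-1)                      # (batch,)
        neg_score = torch.bmm(u_neg, v.unsqueeze(2)).squeeze(2)  # (batch, k)
        # maximize log sigmoid(v.u_pos) + sum_k log sigmoid(-v.u_neg_k)
        loss = -(F.logsigmoid(pos_score) + F.logsigmoid(-neg_score).sum(dim=1))
        return loss.mean()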