parser.add_argument('--vocab_size',
                        default=Word2VecVocab.MAX_VOCAB,
                        type=int,
                        help="maximum number of vocab (default:1e5)")
    parser.add_argument(
        '--token',
        default=Word2VecVocab.TOKEN,
        choices=['word', 'morph', 'character', 'jaso'],
        help="token is word or morph or character (default: 'word')")
    parser.add_argument('--min_count',
                        default=Word2VecVocab.MIN_COUNT,
                        type=int)
    args = parser.parse_args()

    try:
        if not os.path.exists(args.text_file):
            log.error(f'text file does not exists. {args.text_file}')
            exit(-1)

        vocab = Word2VecVocab.build(text_file=args.text_file,
                                    vocab_size=args.vocab_size,
                                    token=args.token,
                                    min_count=args.min_count,
                                    data_dir=args.data_dir)
        log.info(f'vocab: {vocab.filepath} {NumUtil.comma_str(len(vocab))}')
        log.info(f'vocab.idx2word: {vocab.idx2word[:10]}')
        log.info(f'vocab.idx2freq: {vocab.idx2freq[:10]}')
    except:
        log.error(traceback.format_exc())
Exemple #2
0
                                  neg_weight=args.neg_weight,
                                  subsample=args.subsample,
                                  learning_rate=args.learning_rate,
                                  learning_decay=args.learning_decay)
        log.info(f'Word2VecTrainer() OK. (elapsed: {watch.elapsed_string()})')
        log.info(trainer)
        log.info(f'trainer.train(epoch={args.epoch}, batch={args.batch}) ...')
        watch.start()
        embedding = Word2VecEmbedding(filepath=embedding_file,
                                      vocab=corpus.vocab)
        embedding_file = trainer.train(iterations=args.epoch,
                                       batch=args.batch,
                                       embedding=embedding,
                                       args=args)
        log.info(
            f'embedding_file: {embedding_file} train OK. (elapsed: {watch.elapsed_string()})'
        )

        if is_server():
            SlackUtil.send_message(
                f'embedding_file: {embedding_file} train OK. (elapsed: {watch.elapsed_string()})'
            )
            SlackUtil.send_message(
                f'[{hostname()}][{args.device_no}] {sys.argv} OK.')
    except:
        log.error(traceback.format_exc())
        if is_server():
            SlackUtil.send_message(
                f'[{hostname()}][{args.device_no}] {sys.argv} ERROR.')
            SlackUtil.send_message(traceback.format_exc())
    @property
    def data2text(self):
        """Lazily decode ``self.data`` from vocabulary indices to words.

        Each entry of ``self.data`` is unpacked as an ``(index, indices)``
        pair. Every index is looked up in ``self.vocab.idx2word``.

        Yields:
            A ``(word, words)`` tuple per entry: the decoded first index and
            the list of decoded indices from the second element.
        """
        for in_idx, out_idxs in self.data:
            in_word = self.vocab.idx2word[in_idx]
            out_words = [self.vocab.idx2word[out_idx] for out_idx in out_idxs]
            yield in_word, out_words


if __name__ == '__main__':
    # Script entry point: load a saved vocabulary, build a Word2VecCorpus from
    # a text file with it, and log the resulting corpus path and size.
    parser = argparse.ArgumentParser()
    parser.add_argument('--text_file', default=WIKIPEDIA_SENTENCE_FILE, type=str, help="corpus file path")
    parser.add_argument('--data_dir', default=WORD2VEC_DATA_DIR, type=str, help="data directory path (default:'./data')")

    parser.add_argument('--vocab_file', default=Word2VecVocab.DEFAULT_FILE, type=str)
    parser.add_argument('--window', default=Word2VecCorpus.WINDOW, type=int, help="window size")
    parser.add_argument('--side', default=Word2VecCorpus.SIDE, type=str, choices=['both', 'front', 'back'], help="target words in front or back or both (default: both)")
    args = parser.parse_args()

    log.info(f'vocab_file {args.vocab_file}')

    # Fail fast when the vocabulary file is missing instead of falling through
    # to the load call. This guard sits outside the try block, and SystemExit
    # needs no extra import.
    if not os.path.exists(args.vocab_file):
        log.error(f'vocab file does not exists. {args.vocab_file}')
        raise SystemExit(-1)

    try:
        vocab = Word2VecVocab.load(args.vocab_file)
        log.info(vocab)
        # Single-element lists: replace them with several values
        # (e.g. [1, 2, 3, 4, 5] / ['both', 'front', 'back']) to build a corpus
        # for every window/side combination in one run.
        for window in [args.window]:
            for side in [args.side]:
                log.info(f'window: {window} side: {side}')
                corpus = Word2VecCorpus.build(text_file=args.text_file, vocab=vocab, window=window, side=side, data_dir=args.data_dir)
                log.info(f'corpus: {corpus.filepath} {NumUtil.comma_str(len(corpus))}')
    except Exception:
        # Top-level boundary: log the full traceback. Catching Exception
        # (not a bare except) lets SystemExit and KeyboardInterrupt propagate.
        log.error(traceback.format_exc())