Example #1
def train_main(args):
    """
    Trains model with specified args.
    """
    # Load text
    with open(args.text_path) as f:
        text = f.read()
    logger.info("corpus length: %s.", len(text))

    # Restore model from checkpoint or build model
    if args.restore:
        load_path = args.checkpoint_path if args.restore is True else args.restore
        model = load_model(load_path)
        logger.info("model restored: %s.", load_path)
    else:
        model = build_model(batch_size=args.batch_size,
                            seq_len=args.seq_len,
                            vocab_size=VOCAB_SIZE,
                            embedding_size=args.embedding_size,
                            rnn_size=args.rnn_size,
                            num_layers=args.num_layers,
                            drop_rate=args.drop_rate,
                            learning_rate=args.learning_rate,
                            clip_norm=args.clip_norm)

    # Save the initial model checkpoint
    model.save(args.checkpoint_path)
    logger.info("model saved: %s.", args.checkpoint_path)

    callbacks = [
        ModelCheckpoint(args.checkpoint_path, verbose=1, save_best_only=False),
        LoggerCallback(text, model)
    ]

    # Split data into training and validation
    training_fraction = 0.95
    split = int(round(len(text) * training_fraction))

    text_train = text[:split]
    text_validation = text[split:]

    # Start training
    num_batches = (len(text_train) - 1) // (args.batch_size * args.seq_len)
    val_batches = (len(text_validation) - 1) // (args.batch_size *
                                                 args.seq_len)
    model.reset_states()
    model.fit_generator(batch_generator(encode_text(text_train),
                                        args.batch_size,
                                        args.seq_len,
                                        one_hot_labels=True),
                        num_batches,
                        args.num_epochs,
                        callbacks=callbacks,
                        validation_data=batch_generator(
                            encode_text(text_validation),
                            args.batch_size,
                            args.seq_len,
                            one_hot_labels=True),
                        validation_steps=val_batches)
    return model
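Note: train_main above calls encode_text and batch_generator without defining them. The sketch below is a minimal, hypothetical reconstruction of such helpers for character-level training; the CHAR2ID vocabulary and both signatures are assumptions, not this project's actual utilities.

import numpy as np

# hypothetical character vocabulary; the real mapping is built from the corpus
CHAR2ID = {c: i for i, c in enumerate("\nabcdefghijklmnopqrstuvwxyz .,!?")}

def encode_text(text, char2id=None):
    # map each character to its integer id (0 for unknown characters)
    char2id = char2id or CHAR2ID
    return np.fromiter((char2id.get(c, 0) for c in text), dtype=np.int64)

def batch_generator(sequence, batch_size, seq_len, one_hot_labels=False):
    # yield (x, y) batches where y is x shifted one step ahead; each batch row
    # continues where the previous batch left off, as stateful RNNs require
    num_batches = (len(sequence) - 1) // (batch_size * seq_len)
    rounded = num_batches * batch_size * seq_len
    x = sequence[:rounded].reshape(batch_size, -1)
    y = sequence[1:rounded + 1].reshape(batch_size, -1)
    while True:
        for i in range(0, x.shape[1], seq_len):
            xb, yb = x[:, i:i + seq_len], y[:, i:i + seq_len]
            if one_hot_labels:
                yb = np.eye(len(CHAR2ID), dtype=np.float32)[yb]
            yield xb, yb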
Example #2
def get_data(in_file, out_file):
    '''
    Load the parallel corpus.
    :param in_file: path to the Chinese dataset
    :param out_file: path to the English dataset
    :return: list of encoded sample dicts
    '''
    print('getting data {}->{}...'.format(in_file, out_file))
    with open(in_file, 'r', encoding='utf-8') as file:
        in_lines = file.readlines()
    with open(out_file, 'r', encoding='utf-8') as file:
        out_lines = file.readlines()

    samples = []

    for i in tqdm(range(len(in_lines))):
        sentence_zh = in_lines[i].strip()
        tokens = jieba.cut(sentence_zh.strip())
        in_data = encode_text(src_char2idx,
                              tokens)  # map the tokens to an id sequence

        sentence_en = out_lines[i].strip().lower()
        tokens = [
            normalizeString(s.strip()) for s in nltk.word_tokenize(sentence_en)
        ]  # preprocess the English words
        out_data = [Config.sos_id] + encode_text(
            tgt_char2idx, tokens) + [Config.eos_id]  # convert to ids, add start/end markers

        # maxlen_in=50 and maxlen_out=100 here are also given as hyperparameters
        if (len(in_data) < Config.maxlen_in and len(out_data) < Config.maxlen_out
                and Config.unk_id not in in_data
                and Config.unk_id not in out_data):
            samples.append({'in': in_data, 'out': out_data})
    return samples
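Examples #2, #4 and #5 call a two-argument encode_text(char2idx, tokens) that is not shown here. A minimal sketch, assuming unknown tokens fall back to an <unk> id such as Config.unk_id:

def encode_text(word2idx, tokens, unk_id=1):
    # look each token up in the vocabulary, mapping misses to <unk>
    return [word2idx.get(token, unk_id) for token in tokens]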
Example #3
def main():
    print('Load raw data')
    data = utils.load_dumped('../data/raw/dump.txt')

    print('Filter text')
    content = [utils.filter_text(_[1]) for _ in tqdm(data)]

    idx = np.arange(len(content))
    np.random.seed(19)
    np.random.shuffle(idx)

    test_len = int(0.1 * len(idx))

    print('Split into train/test')
    test = "".join(content[_] for _ in tqdm(idx[:test_len]))
    train = "".join(content[_] for _ in tqdm(idx[test_len:]))

    vocab = utils.generate_vocab()
    with open('../data/processed/vocab.json', 'w') as fout:
        json.dump(vocab, fout)

    print('Encoding test')
    test = utils.encode_text(test, vocab)
    np.save('../data/processed/test', test)

    print('Encoding train')
    train = utils.encode_text(train, vocab)
    np.save('../data/processed/train', train)
Example #4
def get_data(in_file):
    '''
    Load the data and split each line into an input/output pair.
    :param in_file: path to the '|'-separated dataset
    :return: list of encoded sample dicts
    '''
    print('getting data {}...'.format(in_file))
    with open(in_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    samples = []
    for line in lines:
        sentences = line.split('|')
        in_sentence = sentences[0].strip()
        out_sentence = sentences[1].strip()

        in_data = encode_text(char2idx, in_sentence)
        out_data = [Config.sos_id] + encode_text(
            char2idx, out_sentence) + [Config.eos_id]

        if (len(in_data) < Config.maxlen_in and len(out_data) < Config.maxlen_out
                and Config.unk_id not in in_data
                and Config.unk_id not in out_data):
            samples.append({'in': in_data, 'out': out_data})
    return samples
Example #5
def get_data(in_file, out_file):
    print('getting data {}->{}...'.format(in_file, out_file))
    with open(in_file, 'r', encoding='utf-8') as file:
        in_lines = file.readlines()
    with open(out_file, 'r', encoding='utf-8') as file:
        out_lines = file.readlines()

    samples = []

    for i in tqdm(range(len(in_lines))):
        sentence_zh = in_lines[i].strip()
        tokens = jieba.cut(sentence_zh.strip())
        in_data = encode_text(src_char2idx, tokens)

        sentence_en = out_lines[i].strip().lower()
        tokens = [
            normalizeString(s.strip()) for s in nltk.word_tokenize(sentence_en)
        ]
        out_data = [sos_id] + encode_text(tgt_char2idx, tokens) + [eos_id]

        if (len(in_data) < maxlen_in and len(out_data) < maxlen_out
                and unk_id not in in_data and unk_id not in out_data):
            samples.append({'in': in_data, 'out': out_data})
    return samples
Example #6
def build_samples():
    word_map_zh = json.load(open('data/WORDMAP_zh.json', 'r'))
    word_map_en = json.load(open('data/WORDMAP_en.json', 'r'))

    for usage in ['train', 'valid']:
        if usage == 'train':
            translation_path_en = os.path.join(train_translation_folder,
                                               train_translation_en_filename)
            translation_path_zh = os.path.join(train_translation_folder,
                                               train_translation_zh_filename)
            filename = 'data/samples_train.json'
        else:
            translation_path_en = os.path.join(valid_translation_folder,
                                               valid_translation_en_filename)
            translation_path_zh = os.path.join(valid_translation_folder,
                                               valid_translation_zh_filename)
            filename = 'data/samples_valid.json'

        print('loading {} texts and vocab'.format(usage))
        with open(translation_path_en, 'r') as f:
            data_en = f.readlines()

        with open(translation_path_zh, 'r') as f:
            data_zh = f.readlines()

        print('building {} samples'.format(usage))
        samples = []
        for idx in tqdm(range(len(data_en))):
            sentence_zh = data_zh[idx].strip()
            seg_list = jieba.cut(sentence_zh)
            input_zh = encode_text(word_map_zh, list(seg_list))

            sentence_en = data_en[idx].strip().lower()
            tokens = [
                normalizeString(s) for s in nltk.word_tokenize(sentence_en)
                if len(normalizeString(s)) > 0
            ]
            output_en = encode_text(word_map_en, tokens)

            if (len(input_zh) <= max_len and len(output_en) <= max_len
                    and UNK_token not in input_zh
                    and UNK_token not in output_en):
                samples.append({
                    'input': list(input_zh),
                    'output': list(output_en)
                })

        with open(filename, 'w') as f:
            json.dump(samples, f, indent=4)

        print('{} {} samples created at: {}.'.format(len(samples), usage,
                                                     filename))
Example #7
def generate_text(model, seed, length=512, top_n=10):
    """
    generates text of specified length from trained model
    with given seed character sequence.
    """
    print("generating {} characters from top {} choices.".format(length, top_n), file=sys.stderr)
    print('generating with seed: "{}".'.format(seed), file=sys.stderr)
    generated = seed
    encoded = utils.encode_text(seed)
    model.reset_states()

    for idx in encoded[:-1]:
        x = np.array([[idx]])
        # input shape: (1, 1)
        # set internal states
        model.predict(x)

    next_index = encoded[-1]
    for i in range(length):
        x = np.array([[next_index]])
        # input shape: (1, 1)
        probs = model.predict(x)
        # output shape: (1, 1, vocab_size)
        next_index = sample_from_probs(probs.squeeze(), top_n)
        # append to sequence
        generated += utils.ID2CHAR[next_index]

    return generated
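All of the generate_text variants rely on sample_from_probs, which none of the snippets define. A plausible top-n implementation (an assumption, not necessarily this project's code) keeps only the n most likely ids and renormalizes before sampling:

import numpy as np

def sample_from_probs(probs, top_n=10):
    # zero out everything below the top-n probabilities, renormalize,
    # then draw one index from the truncated distribution
    probs = np.asarray(probs, dtype=np.float64)
    if top_n is not None:
        cutoff = np.sort(probs)[-top_n]
        probs = np.where(probs >= cutoff, probs, 0.0)
    probs /= probs.sum()
    return np.random.choice(len(probs), p=probs)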
Example #8
def generate_text(model, sess, seed, length=512, top_n=10):
    """
    generates text of specified length from trained model
    with given seed character sequence.
    """
    logger.info("generating %s characters from top %s choices.", length, top_n)
    logger.info('generating with seed: "%s".', seed)
    generated = seed
    encoded = encode_text(seed)

    x = np.expand_dims(encoded[:-1], 0)
    # input shape: [1, seq_len]
    # get rnn state due to seed sequence
    state = sess.run(model["output_state"], feed_dict={model["X"]: x})

    next_index = encoded[-1]
    for i in range(length):
        x = np.array([[next_index]])
        # input shape: [1, 1]
        feed_dict = {model["X"]: x, model["input_state"]: state}
        probs, state = sess.run([model["probs"], model["output_state"]],
                                feed_dict=feed_dict)
        # output shape: [1, 1, vocab_size]
        next_index = sample_from_probs(probs.squeeze(), top_n)
        # append to sequence
        generated += ID2CHAR[next_index]

    logger.info("generated text: \n%s\n", generated)
    return generated
Example #9
def generate_text(model, seed, length=512, top_n=10):
    """
    generates text of specified length from trained model
    with given seed character sequence.
    """
    logger.info("generating %s characters from top %s choices.", length, top_n)
    logger.info('generating with seed: "%s".', seed)
    generated = seed
    encoded = encode_text(seed).astype(np.int32)
    model.predictor.reset_state()

    with chainer.using_config("train", False), chainer.no_backprop_mode():
        for idx in encoded[:-1]:
            x = Variable(np.array([idx]))
            # input shape: [1]
            # set internal states
            model.predictor(x)

        next_index = encoded[-1]
        for i in range(length):
            x = Variable(np.array([next_index], dtype=np.int32))
            # input shape: [1]
            probs = F.softmax(model.predictor(x))
            # output shape: [1, vocab_size]
            next_index = sample_from_probs(probs.data.squeeze(), top_n)
            # append to sequence
            generated += ID2CHAR[next_index]

    logger.info("generated text: \n%s\n", generated)
    return generated
Example #10
def generate_text(model, seed, length=512, top_n=2):
    """
    Generates text of specified length from trained model with given seed (e.g. the prefix string).
    """
    logger.info("generating %s characters from top %s choices.", length, top_n)
    logger.info('generating with seed: "%s".', seed)
    generated = seed
    encoded = encode_text(seed)
    model.reset_states()

    for idx in encoded[:-1]:
        x = np.array([[idx]])
        # Input shape: (1, 1)
        # Set internal states
        model.predict(x)

    next_index = encoded[-1]
    for i in range(length):
        x = np.array([[next_index]])
        # Input shape: (1, 1)
        probs = model.predict(x)
        # Output shape: (1, 1, vocab_size)
        next_index = sample_from_probs(probs.squeeze(), top_n)
        # Append to sequence
        if ID2CHAR[next_index] in [".", "!", "?"]:
            generated += ID2CHAR[next_index]
            break
        elif ID2CHAR[next_index] == "\n":
            break
        generated += ID2CHAR[next_index]

    logger.info("generated text: \n%s\n", generated)
    return generated
Example #11
def analyze_after_init(comment, encoder, voc):
    # Build the object to analyze
    sample = {'content': comment, 'label_tensor': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
    pair_batch = []
    result = []
    content = sample['content']
    result.append({'content': content})
    content = content.strip()

    # Word segmentation
    seg_list = jieba.cut(content)
    input_indexes = encode_text(voc.word2index, list(seg_list))
    label_tensor = sample['label_tensor']
    pair_batch.append((input_indexes, label_tensor))

    # Analysis
    # Collate the data into tensors
    input_variable, lengths, _ = batch2TrainData(pair_batch)
    # Move the data to the chosen device
    input_variable = input_variable.to(device)
    lengths = lengths.to(device)
    # Run the encoder on the input to get the outputs
    outputs = encoder(input_variable, lengths)
    # Take the max (argmax) along dimension 1
    _, outputs = torch.max(outputs, 1)
    print('outputs.size(): ' + str(outputs.size()))
    # Move the outputs to CPU memory and convert to numpy
    outputs = outputs.cpu().numpy()

    # Assemble the result
    result[0]['labels'] = (outputs[0] - 2).tolist()
    return result[0]
Example #12
def generate_text(model, seed, length=512, top_n=10):
    """
    generates text of specified length from trained model
    with given seed character sequence.
    """
    logger.info("generating %s characters from top %s choices.", length, top_n)
    logger.info('generating with seed: "%s".', seed)
    generated = seed
    encoded = mx.nd.array(encode_text(seed))
    seq_len = encoded.shape[0]

    x = F.expand_dims(encoded[:seq_len - 1], 1)
    # input shape: [seq_len, 1]
    state = model.begin_state()
    # get rnn state due to seed sequence
    _, state = model(x, state)

    next_index = encoded[seq_len - 1].asscalar()
    for i in range(length):
        x = mx.nd.array([[next_index]])
        # input shape: [1, 1]
        logit, state = model(x, state)
        # output shape: [1, vocab_size]
        probs = F.softmax(logit)
        next_index = sample_from_probs(probs.asnumpy().squeeze(), top_n)
        # append to sequence
        generated += ID2CHAR[next_index]

    logger.info("generated text: \n%s\n", generated)
    return generated
Example #13
def __init__(self, text, batch_size=64, seq_len=64):
    self.data_iterator = batch_generator(
        encode_text(text).astype(np.int32), batch_size, seq_len)
    self.num_batches = (len(text) - 1) // (batch_size * seq_len)
    self.iteration = 0
    self.epoch = 0
    self.is_new_epoch = True
Example #14
def generate_text(model, seed, length=512, top_n=None):
    """
    generates text of specified length from trained model
    with given seed character sequence.
    """
    logger.info("generating %s characters from top %s choices.", length, top_n)
    logger.info('generating with seed: "%s".', seed)
    generated = seed
    encoded = encode_text(seed, get_CHAR2ID())
    model.reset_states()

    for idx in encoded[:-1]:
        x = np.array([[idx]])
        # input shape: (1, 1)
        # set internal states
        model.predict(x)

    next_index = encoded[-1]
    for i in range(length):
        x = np.array([[next_index]])
        # input shape: (1, 1)
        probs = model.predict(x)
        # output shape: (1, 1, vocab_size)
        next_index = sample_from_probs(probs.squeeze(), top_n)
        # append to sequence
        generated += get_ID2CHAR()[next_index]

    logger.info("generated text: \n%s\n", generated)
    return generated
Example #15
def generate_text(model, seed, length=512, top_n=10):
    """
    generates text of specified length from trained model
    with given seed character sequence.
    """
    logger.info("generating %s characters from top %s choices.", length, top_n)
    logger.info('generating with seed: "%s".', seed)
    generated = seed
    encoded = encode_text(seed)
    encoded = Variable(torch.from_numpy(encoded), volatile=True)
    model.eval()

    x = encoded[:-1].unsqueeze(1)
    # input shape: [seq_len, 1]
    state = model.init_state()
    # get rnn state due to seed sequence
    _, state = model.predict(x, state)

    next_index = encoded[-1:]
    for i in range(length):
        x = next_index.unsqueeze(1)
        # input shape: [1, 1]
        probs, state = model.predict(x, state)
        # output shape: [1, 1, vocab_size]
        next_index = sample_from_probs(probs.squeeze(), top_n)
        # append to sequence
        generated += ID2CHAR[next_index.data[0]]

    logger.info("generated text: \n%s\n", generated)
    return generated
Example #16
def get_data(in_file):
    contexts, questions = getCQpair(in_file)

    samples = []

    for i in tqdm(range(len(contexts))):

        tokens = [s.strip() for s in nltk.word_tokenize(contexts[i])]
        in_data = encode_text(word2idx_dict, tokens)

        q_tokens = [s.strip() for s in nltk.word_tokenize(questions[i])]
        out_data = [sos_id] + encode_text(word2idx_dict, q_tokens) + [eos_id]

        if len(in_data) < maxlen_in and len(out_data) < maxlen_out:
            samples.append({'in': in_data, 'out': out_data})

    return samples
Example #17
def train_main(args):
    """
    trains model specified in args.
    main method for train subcommand.
    """
    # load text
    text = load_text(args.text_path)

    if args.test_path:
        test_text = load_text(args.test_path)
    else:
        test_text = None

    # load or build model
    if args.restore:
        load_path = args.checkpoint_path if args.restore is True else args.restore
        model = load_model(load_path)
        logger.info("model restored: %s.", load_path)
    else:
        model = build_model(batch_size=args.batch_size,
                            seq_len=args.seq_len,
                            vocab_size=get_VOCAB_SIZE(),
                            embedding_size=args.embedding_size,
                            rnn_size=args.rnn_size,
                            num_layers=args.num_layers,
                            drop_rate=args.drop_rate,
                            learning_rate=args.learning_rate,
                            clip_norm=args.clip_norm)

    # make and clear checkpoint directory
    log_dir = make_dirs(args.checkpoint_path, empty=True)
    model.save(args.checkpoint_path)
    logger.info("model saved: %s.", args.checkpoint_path)
    # callbacks
    callbacks = [
        ModelCheckpoint(args.checkpoint_path, verbose=1, save_best_only=False),
        TensorBoard(log_dir,
                    write_graph=True,
                    embeddings_freq=1,
                    embeddings_metadata={
                        "embedding_1":
                        os.path.abspath(os.path.join("data", "id2char.tsv"))
                    }),
        LoggerCallback(text, test_text, model, args.checkpoint_path)
    ]

    # training start
    num_batches = (len(text) - 1) // (args.batch_size * args.seq_len)
    model.reset_states()
    model.fit_generator(batch_generator(encode_text(text, get_CHAR2ID()),
                                        args.batch_size,
                                        args.seq_len,
                                        one_hot_labels=True),
                        num_batches,
                        args.num_epochs,
                        callbacks=callbacks)
    return model
Example #18
        def mp_func(data_en, data_zh, manager_d, index):
            samples = []
            for idx in tqdm(range(len(data_en))):
                sentence_zh = data_zh[idx].strip()
                seg_list = jieba.cut(sentence_zh)
                input_zh = encode_text(word_map_zh, list(seg_list))

                sentence_en = data_en[idx].strip().lower()
                tokens = [
                    normalizeString(s) for s in nltk.word_tokenize(sentence_en)
                    if len(normalizeString(s)) > 0
                ]
                output_en = encode_text(word_map_en, tokens)

                if (len(input_zh) <= max_len and len(output_en) <= max_len
                        and UNK_token not in input_zh
                        and UNK_token not in output_en):
                    samples.append({
                        'input': list(input_zh),
                        'output': list(output_en)
                    })

            manager_d[index] = samples
            return manager_d
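mp_func above is written for multiprocessing, with a Manager dict collecting each worker's shard of samples. The driver is not shown; a sketch of how it might look, assuming mp_func is visible in scope (build_samples_parallel and num_workers are hypothetical names):

from multiprocessing import Manager, Process

def build_samples_parallel(data_en, data_zh, num_workers=4):
    manager_d = Manager().dict()
    shard = len(data_en) // num_workers + 1
    workers = []
    for index in range(num_workers):
        lo, hi = index * shard, (index + 1) * shard
        p = Process(target=mp_func,
                    args=(data_en[lo:hi], data_zh[lo:hi], manager_d, index))
        p.start()
        workers.append(p)
    for p in workers:
        p.join()
    # flatten the per-worker shards back into one list
    return [s for index in range(num_workers) for s in manager_d[index]]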
Example #19
def train_main(args):
    """
    trains model specified in args.
    main method for train subcommand.
    """
    # load text
    with open(args.text_path) as f:
        text = f.read()
    logger.info("corpus length: %s.", len(text))

    # restore or build model
    if args.restore:
        load_path = args.checkpoint_path if args.restore is True else args.restore
        with open("{}.json".format(args.checkpoint_path)) as f:
            model_args = json.load(f)
        logger.info("model restored: %s.", load_path)
    else:
        load_path = None
        model_args = {
            "batch_size": args.batch_size,
            "vocab_size": VOCAB_SIZE,
            "embedding_size": args.embedding_size,
            "rnn_size": args.rnn_size,
            "num_layers": args.num_layers,
            "p_keep": 1 - args.drop_rate,
            "learning_rate": args.learning_rate,
            "clip_norm": args.clip_norm
        }

    # build train model
    train_graph = tf.Graph()
    with train_graph.as_default():
        train_model = build_model(**model_args)

    with tf.Session(graph=train_graph) as train_sess:
        # restore or initialise model weights
        if load_path is not None:
            train_model["saver"].restore(train_sess, load_path)
            logger.info("model weights restored: %s.", load_path)
        else:
            train_sess.run(train_model["init_op"])

        # clear checkpoint directory
        log_dir = make_dirs(args.checkpoint_path, empty=True)
        # save model
        with open("{}.json".format(args.checkpoint_path), "w") as f:
            json.dump(train_model["args"], f, indent=2)
        checkpoint_path = train_model["saver"].save(train_sess,
                                                    args.checkpoint_path)
        logger.info("model saved: %s.", checkpoint_path)
        # tensorboard logger
        summary_writer = tf.summary.FileWriter(log_dir, train_sess.graph)
        # embeddings visualisation
        config = projector.ProjectorConfig()
        embedding = config.embeddings.add()
        embedding.tensor_name = "EmbedSequence/embeddings"
        embedding.metadata_path = os.path.abspath(
            os.path.join("data", "id2char.tsv"))
        projector.visualize_embeddings(summary_writer, config)
        logger.info("tensorboard set up.")

        # build infer model
        inference_graph = tf.Graph()
        with inference_graph.as_default():
            inference_model = load_inference_model(args.checkpoint_path)

        # training start
        num_batches = (len(text) - 1) // (args.batch_size * args.seq_len)
        data_iter = batch_generator(encode_text(text), args.batch_size,
                                    args.seq_len)
        fetches = [
            train_model["train_op"], train_model["output_state"],
            train_model["loss"], train_model["summary"]
        ]
        state = train_sess.run(train_model["input_state"])
        logger.info("start of training.")
        time_train = time.time()

        for i in range(args.num_epochs):
            epoch_losses = np.empty(num_batches)
            time_epoch = time.time()
            # training epoch
            for j in tqdm(range(num_batches),
                          desc="epoch {}/{}".format(i + 1, args.num_epochs)):
                x, y = next(data_iter)
                feed_dict = {
                    train_model["X"]: x,
                    train_model["Y"]: y,
                    train_model["input_state"]: state
                }
                _, state, loss, summary_log = train_sess.run(
                    fetches, feed_dict)
                epoch_losses[j] = loss

            # logs
            duration_epoch = time.time() - time_epoch
            logger.info("epoch: %s, duration: %ds, loss: %.6g.", i + 1,
                        duration_epoch, epoch_losses.mean())
            # tensorboard logs
            summary_writer.add_summary(summary_log, i + 1)
            summary_writer.flush()
            # checkpoint
            checkpoint_path = train_model["saver"].save(
                train_sess, args.checkpoint_path)
            logger.info("model saved: %s.", checkpoint_path)

            # generate text
            seed = generate_seed(text)
            with tf.Session(graph=inference_graph) as infer_sess:
                # restore weights
                inference_model["saver"].restore(infer_sess, checkpoint_path)
                generate_text(inference_model, infer_sess, seed)

        # training end
        duration_train = time.time() - time_train
        logger.info("end of training, duration: %ds.", duration_train)
        # generate text
        seed = generate_seed(text)
        with tf.Session(graph=inference_graph) as infer_sess:
            # restore weights
            inference_model["saver"].restore(infer_sess, checkpoint_path)
            generate_text(inference_model, infer_sess, seed, 1024, 3)

    return train_model
Example #20
def train_main(args):
    """
    trains model specified in args.
    main method for train subcommand.
    """
    # load text
    with open(args.text_path) as f:
        text = f.read()
    logger.info("corpus length: %s.", len(text))

    # load or build model
    if args.restore:
        logger.info("restoring model.")
        load_path = args.checkpoint_path if args.restore is True else args.restore
        model = Model.load(load_path)
    else:
        model = Model(vocab_size=VOCAB_SIZE,
                      embedding_size=args.embedding_size,
                      rnn_size=args.rnn_size,
                      num_layers=args.num_layers,
                      drop_rate=args.drop_rate)

    # make checkpoint directory
    make_dirs(args.checkpoint_path)
    model.save(args.checkpoint_path)

    # loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

    # training start
    num_batches = (len(text) - 1) // (args.batch_size * args.seq_len)
    data_iter = batch_generator(encode_text(text), args.batch_size,
                                args.seq_len)
    state = model.init_state(args.batch_size)
    logger.info("start of training.")
    time_train = time.time()

    for i in range(args.num_epochs):
        epoch_losses = torch.Tensor(num_batches)
        time_epoch = time.time()
        # training epoch
        for j in tqdm(range(num_batches),
                      desc="epoch {}/{}".format(i + 1, args.num_epochs)):
            # prepare inputs
            x, y = next(data_iter)
            x = Variable(torch.from_numpy(x)).t()
            y = Variable(torch.from_numpy(y)).t().contiguous()
            # reset state variables to remove their history
            state = tuple([Variable(var.data) for var in state])
            # prepare model
            model.train()
            model.zero_grad()
            # calculate loss
            logits, state = model.forward(x, state)
            loss = criterion(logits, y.view(-1))
            epoch_losses[j] = loss.data[0]
            # calculate gradients
            loss.backward()
            # clip gradient norm
            nn.utils.clip_grad_norm(model.parameters(), args.clip_norm)
            # apply gradient update
            optimizer.step()

        # logs
        duration_epoch = time.time() - time_epoch
        logger.info("epoch: %s, duration: %ds, loss: %.6g.", i + 1,
                    duration_epoch, epoch_losses.mean())
        # checkpoint
        model.save(args.checkpoint_path)
        # generate text
        seed = generate_seed(text)
        generate_text(model, seed)

    # training end
    duration_train = time.time() - time_train
    logger.info("end of training, duration: %ds.", duration_train)
    # generate text
    seed = generate_seed(text)
    generate_text(model, seed, 1024, 3)
    return model
Example #21
def equal_file(path1, path2):
    # c extension module can only handle encoded path
    path1 = utils.encode_text(path1)
    path2 = utils.encode_text(path2)
    ret = compare_file(path1, path2)
    return ret == 0
Example #22
def copy_file(src, dst):
    # c extension module can only handle encoded path
    src = utils.encode_text(src)
    dst = utils.encode_text(dst)
    ret, errmsg = cp_file(src, dst)
    return ret == 0, errmsg
Example #23
def hex_middle_md5(path, full=False):
    # c extension module can only handle encoded path
    path = utils.encode_text(path)
    digest = calc_middle_md5(path, 1 if full else 0)
    return ''.join('{:02x}'.format(ord(x)) for x in digest)
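Examples #21-#23 use utils.encode_text for a different job: turning path strings into byte strings for a C extension. A minimal Python 3 sketch, assuming the filesystem encoding is the intended target:

import sys

def encode_text(path):
    # the C extension expects encoded byte strings, not unicode paths
    if isinstance(path, str):
        return path.encode(sys.getfilesystemencoding() or "utf-8")
    return path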
Example #24
def train_main(args):
    """
    trains model specified in args.
    main method for train subcommand.
    """
    # load text
    with open(args.text_path) as f:
        text = f.read()
    logger.info("corpus length: %s.", len(text))

    # restore or build model
    if args.restore:
        logger.info("restoring model.")
        load_path = args.checkpoint_path if args.restore is True else args.restore
        model = Model.load(load_path)
    else:
        model = Model(vocab_size=VOCAB_SIZE,
                      embedding_size=args.embedding_size,
                      rnn_size=args.rnn_size,
                      num_layers=args.num_layers,
                      drop_rate=args.drop_rate)
        model.initialize(mx.init.Xavier())
    model.hybridize()

    # make checkpoint directory
    make_dirs(args.checkpoint_path)
    model.save(args.checkpoint_path)

    # loss function
    loss = gluon.loss.SoftmaxCrossEntropyLoss(batch_axis=1)
    # optimizer
    optimizer = mx.optimizer.Adam(learning_rate=args.learning_rate,
                                  clip_gradient=args.clip_norm)
    # trainer
    trainer = gluon.Trainer(model.collect_params(), optimizer)

    # training start
    num_batches = (len(text) - 1) // (args.batch_size * args.seq_len)
    data_iter = batch_generator(encode_text(text), args.batch_size,
                                args.seq_len)
    state = model.begin_state(args.batch_size)
    logger.info("start of training.")
    time_train = time.time()
    for i in range(args.num_epochs):
        epoch_losses = mx.nd.empty(num_batches)
        time_epoch = time.time()
        # training epoch
        for j in tqdm(range(num_batches),
                      desc="epoch {}/{}".format(i + 1, args.num_epochs)):
            # prepare inputs
            x, y = next(data_iter)
            x = mx.nd.array(x.T)
            y = mx.nd.array(y.T)
            # reset state variables to remove their history
            state = [arr.detach() for arr in state]

            with autograd.record():
                logits, state = model(x, state)
                # calculate loss
                L = loss(logits, y)
                L = F.mean(L)
                epoch_losses[j] = L.asscalar()
                # calculate gradient
                L.backward()
            # apply gradient update
            trainer.step(1)

        # logs
        duration_epoch = time.time() - time_epoch
        logger.info("epoch: %s, duration: %ds, loss: %.6g.", i + 1,
                    duration_epoch,
                    F.mean(epoch_losses).asscalar())
        # checkpoint
        model.save_params(args.checkpoint_path)
        logger.info("model saved: %s.", args.checkpoint_path)
        # generate text
        seed = generate_seed(text)
        generate_text(model, seed)

    # training end
    duration_train = time.time() - time_train
    logger.info("end of training, duration: %ds.", duration_train)
    # generate text
    seed = generate_seed(text)
    generate_text(model, seed, 1024, 3)
    return model
Example #25
def train_model(config, is_wandb=False):
    """
    Build and train a classifier based on a configuration object.
    Runs a stratified K-fold cross-validation process balancing both labels and languages in
    each fold.

    Arguments:
    - config: A configuration object for the run.
    - is_wandb: a flag for sweeps, adding the Weights and Biases callback to the model.

    Returns:
    - model: The best trained classifier out of all folds.
    - preds_oof: Out of fold predictions on the training set.
    - preds_test: Predictions on the test set.
    """

    if config.VERBOSE:
        print("--- Reading Data ---")

    df_train = pd.read_csv(config.PATH_TRAIN)
    df_test = pd.read_csv(config.PATH_TEST)

    if config.VERBOSE:
        print("Done!")

    if is_wandb:
        wb = wandb.keras.WandbCallback()

    if config.TRANSLATION:

        if config.VERBOSE:
            print("--- Translating Premises ---")

        df_train.loc[df_train.language != "English", "premise"] = df_train[
            df_train.language != "English"].premise.apply(
                lambda x: translate_text(x))
        df_test.loc[df_test.language != "English", "premise"] = df_test[
            df_test.language != "English"].premise.apply(
                lambda x: translate_text(x))

        if config.VERBOSE:
            print("Done!")
            print("--- Translating Hypotheses ---")

        df_train.loc[df_train.language != "English", "hypothesis"] = df_train[
            df_train.language != "English"].hypothesis.apply(
                lambda x: translate_text(x))
        df_test.loc[df_test.language != "English", "hypothesis"] = df_test[
            df_test.language != "English"].hypothesis.apply(
                lambda x: translate_text(x))

        if config.VERBOSE:
            print("Done!")

    if config.VERBOSE:
        print("--- Preprocessing ---")

    # adding language column for stratified splitting
    df_train["language_label"] = df_train.language.astype(
        str) + "_" + df_train.label.astype(str)

    # stratified K-fold on language and label for balance
    skf = StratifiedKFold(n_splits=config.TRAIN_SPLITS,
                          shuffle=True,
                          random_state=config.SEED)

    preds_oof = np.zeros((df_train.shape[0], 3))
    preds_test = np.zeros((df_test.shape[0], 3))
    acc_oof = []

    if config.VERBOSE:
        print("Done!")

    for (fold, (train_index, valid_index)) in enumerate(
            skf.split(df_train, df_train.language_label)):

        if config.VERBOSE:
            print(f"--- Fold {fold+1} ---")

        # Initializing TPU
        if config.ACCELERATOR == "TPU":
            if config.tpu:
                config.initialize_accelerator()

        if config.VERBOSE:
            print("Building Model...")

        tf.keras.backend.clear_session()
        with config.strategy.scope():
            model = build_classifier(config.MODEL_NAME, config.MAX_LENGTH,
                                     config.LEARNING_RATE, config.METRICS)
            if fold == 0:
                print(model.summary())

        X_train = df_train.iloc[train_index]
        X_valid = df_train.iloc[valid_index]

        y_train = X_train.label.values
        y_valid = X_valid.label.values

        if config.VERBOSE:
            print("Tokenizing...")

        # Encoding text data using tokenizer
        X_train_encoded = encode_text(df=X_train,
                                      tokenizer=config.TOKENIZER,
                                      max_len=config.MAX_LENGTH,
                                      padding=config.PAD_TO_MAX_LENGTH)
        X_valid_encoded = encode_text(df=X_valid,
                                      tokenizer=config.TOKENIZER,
                                      max_len=config.MAX_LENGTH,
                                      padding=config.PAD_TO_MAX_LENGTH)

        # Creating TF Datasets
        ds_train = to_tfds(X_train_encoded,
                           y_train,
                           config.AUTO,
                           repeat=True,
                           shuffle=True,
                           batch_size=config.BATCH_SIZE * config.REPLICAS)
        ds_valid = to_tfds(X_valid_encoded,
                           y_valid,
                           config.AUTO,
                           batch_size=config.BATCH_SIZE * config.REPLICAS * 4)

        n_train = X_train.shape[0]

        # Only need to encode test data once
        if fold == 0:
            X_test_encoded = encode_text(df=df_test,
                                         tokenizer=config.TOKENIZER,
                                         max_len=config.MAX_LENGTH,
                                         padding=config.PAD_TO_MAX_LENGTH)

        # Defining checkpoint callback
        sv = tf.keras.callbacks.ModelCheckpoint(
            "models\model.h5",
            monitor="val_sparse_categorical_accuracy",
            verbose=0,
            save_best_only=True,
            save_weights_only=True,
            mode="max",
            save_freq="epoch")

        # Adding wandb callback
        cbs = [sv]
        if is_wandb:
            cbs.append(wb)

        if config.VERBOSE:
            print("Training...")

        model_history = model.fit(ds_train,
                                  epochs=config.EPOCHS,
                                  callbacks=cbs,
                                  steps_per_epoch=n_train //
                                  (config.BATCH_SIZE * config.REPLICAS),
                                  validation_data=ds_valid,
                                  verbose=config.VERBOSE)

        if config.VERBOSE:
            print("Validating...")

        # Scoring validation data
        model.load_weights("models/model.h5")
        ds_valid = to_tfds(X_valid_encoded,
                           -1,
                           config.AUTO,
                           labelled=False,
                           batch_size=config.BATCH_SIZE * config.REPLICAS * 4)

        preds_valid = model.predict(ds_valid, verbose=config.VERBOSE)
        acc = accuracy_score(y_valid, np.argmax(preds_valid, axis=1))

        preds_oof[valid_index] = preds_valid
        acc_oof.append(acc)

        if config.VERBOSE:
            print("Testing...")

        # Scoring test data
        ds_test = to_tfds(X_test_encoded,
                          -1,
                          config.AUTO,
                          labelled=False,
                          batch_size=config.BATCH_SIZE * config.REPLICAS * 4)
        preds_test += model.predict(
            ds_test, verbose=config.VERBOSE) / config.TRAIN_SPLITS

        print(f"Fold {fold + 1} Accuracy: {round(acc, 4)}")

        g = gc.collect()

    # overall CV score and standard deviation
    print(f"CV Mean Accuracy: {round(np.mean(acc_oof), 4)}")
    print(f"CV StdDev Accuracy: {round(np.std(acc_oof), 4)}")

    return model, preds_oof, preds_test
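Example #25 also depends on a to_tfds helper that wraps the tokenizer output in a tf.data pipeline. A rough sketch matching the call sites above (the shuffle buffer size and the internals are assumptions):

import tensorflow as tf

def to_tfds(x, y, auto, labelled=True, repeat=False, shuffle=False,
            batch_size=32):
    # wrap encoded features (and labels, when present) in a batched,
    # prefetched tf.data pipeline
    ds = (tf.data.Dataset.from_tensor_slices((x, y)) if labelled
          else tf.data.Dataset.from_tensor_slices(x))
    if repeat:
        ds = ds.repeat()
    if shuffle:
        ds = ds.shuffle(2048)
    return ds.batch(batch_size).prefetch(auto)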
Example #26
    encoder.eval()

    filename = os.path.join(valid_folder, valid_filename)
    user_reviews = pd.read_csv(filename)
    samples = parse_user_reviews(user_reviews)

    samples = random.sample(samples, 10)
    pair_batch = []
    result = []
    for i, sample in enumerate(samples):
        content = sample['content']
        # print(content)
        result.append({'content': content})
        content = content.strip()
        seg_list = jieba.cut(content)
        input_indexes = encode_text(voc.word2index, list(seg_list))
        label_tensor = sample['label_tensor']
        pair_batch.append((input_indexes, label_tensor))

    test_data = batch2TrainData(pair_batch)
    input_variable, lengths, _ = test_data
    input_variable = input_variable.to(device)
    lengths = lengths.to(device)
    outputs = encoder(input_variable, lengths)
    _, outputs = torch.max(outputs, 1)
    print('outputs.size(): ' + str(outputs.size()))
    outputs = outputs.cpu().numpy()

    for i in range(10):
        result[i]['labels'] = (outputs[i] - 2).tolist()