Example #1

import argparse

import torch


def main(args):
    tokenizer = load_tokenizer(args.tokenizer_path)
    num_vocab = tokenizer.get_vocab_size()
    logger.info("build model")
    model = setup_model(num_vocab, args.emb_size,
                        args.hid_size, args.num_class)
    logger.info("load model")
    model = load_model(model, args.model_path)
    x = encode_input(tokenizer, args.q1, args.q2)
    logger.info("predict the label")
    y_pred = model(x)
    y_pred = y_pred.argmax().unsqueeze(dim=0)
    print(decode_label(y_pred.detach().numpy()))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", help="model path", required=True)
    parser.add_argument("--tokenizer-path",
                        help="tokenizer path saved from train", required=True)
    parser.add_argument("--q1", help="question 1", required=True)
    parser.add_argument("--q2", help="question 2", required=True)
    parser.add_argument(
        "--emb-size", help="embedding size for embedding layer", default=512, type=int)
    parser.add_argument(
        "--hid-size", help="hidden size in lstm", default=512, type=int)
    parser.add_argument(
        "--num-class", help="number of class target", default=2, type=int)
    args = parser.parse_args()
    logger = log(path="logs/", file="lstm.log")
    main(args)
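
Example #1 leans on repo helpers (`load_tokenizer`, `setup_model`, `load_model`, `encode_input`, `decode_label`, `log`) whose definitions are not shown here. As a rough sketch only, and assuming the tokenizer is a HuggingFace `tokenizers.Tokenizer` and the task is binary duplicate detection, the two inference-side helpers could look like this:

import torch


# Hypothetical sketches; the repo's real helpers may differ.
def encode_input(tokenizer, q1, q2):
    # Encode the question pair into token ids and add a batch dimension.
    ids = tokenizer.encode(q1, q2).ids
    return torch.tensor(ids, dtype=torch.long).unsqueeze(dim=0)


def decode_label(y_pred):
    # Map the predicted class index back to a human-readable label
    # (assumes 1 = duplicate, 0 = not duplicate).
    return "duplicate" if int(y_pred[0]) == 1 else "not duplicate"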

Example #2

import argparse
import os
import pickle


def main(clean_data, model_path):
    # ... (corpus loading into `sent` and Word2Vec model setup elided) ...
    w2v_model.train(
        sentences=sent,
        total_examples=w2v_model.corpus_count,
        epochs=20,
        report_delay=1,
        compute_loss=True,      # track the training loss
        callbacks=[callback()])  # per-epoch loss logging (see sketch below)

    logger.info("Save word embedding pickle")
    # gensim < 4.0 attribute; on gensim >= 4.0 use w2v_model.wv.index_to_key
    w2v = dict(zip(w2v_model.wv.index2word, w2v_model.wv.vectors))
    with open(os.path.join(model_path, 'w2v_embed.pkl'), 'wb') as file:
        pickle.dump(w2v, file)

    logger.info("Save Word2Vec model")
    w2v_model.save(os.path.join(model_path, 'word2vec.model'))

    # sanity check: log the nearest neighbours of a common word
    logger.info(w2v_model.wv.most_similar(['dog']))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--clean_data',
                        type=str,
                        default='../data/clean_quora_duplicate_questions.csv')
    parser.add_argument('--model_path', type=str, default='model/')
    opt = parser.parse_args()

    logger = log(path="logs/", file="word2vec.log")

    main(opt.clean_data, opt.model_path)
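
The `callback()` passed to `train` above is defined elsewhere in the repo. gensim provides `CallbackAny2Vec` for exactly this pattern; a minimal loss-logging version, assuming that is all the repo's callback does, looks like:

from gensim.models.callbacks import CallbackAny2Vec


class callback(CallbackAny2Vec):
    """Print the training loss at the end of every epoch (sketch)."""

    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        # Note: get_latest_training_loss() reports a cumulative value.
        loss = model.get_latest_training_loss()
        print("Epoch {}: cumulative loss {}".format(self.epoch, loss))
        self.epoch += 1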

Example #3

import argparse
import os

import numpy as np


def load_model(model_path):
    if os.path.exists(model_path):
        # `load` is assumed to return the two fitted vectorizers and the
        # classifier together, matching the unpacking in main() below.
        model = load(model_path)
        return model
    else:
        raise ValueError("model not found")


def main(model_path, q1, q2):
    logger.info("load model")
    vectorizer_1, vectorizer_2, model = load_model(model_path)
    logger.info("text cleansing")
    q1 = clean_text(q1)
    q2 = clean_text(q2)
    logger.info("text transformation")
    vec_q1 = vectorizer_1.transform(np.array([q1]))
    vec_q2 = vectorizer_2.transform(np.array([q2]))
    questions = concat(vec_q1, vec_q2)
    logger.info("predict the label")
    y_pred = model.predict(questions)
    print(decode_label(y_pred))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", help="model path", required=True)
    parser.add_argument("--q1", help="question 1", required=True)
    parser.add_argument("--q2", help="question 2", required=True)
    args = parser.parse_args()
    logger = log(path="logs/", file="ensemble.log")
    main(args.model_path, args.q1, args.q2)
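
`concat` above merges the two question vectors into one feature matrix for the classifier. Its implementation is not shown; since the `transform` calls suggest scikit-learn vectorizers, whose output is sparse, a plausible sketch uses `scipy.sparse.hstack`:

from scipy.sparse import hstack


def concat(vec_q1, vec_q2):
    # Stack the two question representations side by side:
    # (n_samples, f1) + (n_samples, f2) -> (n_samples, f1 + f2)
    return hstack([vec_q1, vec_q2]).tocsr()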

Example #4

from sklearn.metrics import f1_score, precision_score


def main(clean_data, kfold_data, word_embed):
    total_acc, total_f1, total_prec = [], [], []
    for kf in range(5):
        # ... (per-fold training, prediction, and accuracy computation
        #      elided; see the sketch after this script) ...
        f1 = f1_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred)

        logger.info('KFold -%s Accuracy: %s', kf, accuracy)
        logger.info('KFold -%s F1: %s', kf, f1)
        logger.info('KFold -%s Precision: %s', kf, prec)
        total_acc.append(accuracy)
        total_f1.append(f1)
        total_prec.append(prec)
        logger.info('--------------------------------')

    logger.info('==================================')
    logger.info('Performance')
    logger.info('Accuracy: %s', sum(total_acc) / len(total_acc))
    logger.info('F1: %s', sum(total_f1) / len(total_f1))
    logger.info('Precision: %s', sum(total_prec) / len(total_prec))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--clean_data', type=str,
                        default='../data/clean_quora_duplicate_questions.csv')
    parser.add_argument('--kfold_data', type=str,
                        default='../data/cross_validation_data')
    parser.add_argument('--word_embed', type=str,
                        default='model/w2v_embed.pkl')
    opt = parser.parse_args()

    logger = log(path="logs/", file="word2vec_cosine.log")
    main(opt.clean_data, opt.kfold_data, opt.word_embed)
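
The per-fold training and the `accuracy`, `y_val`, and `y_pred` values consumed above are produced in the elided part of the loop. A sketch of the standard scikit-learn pattern it presumably follows, with `X`, `y`, and `build_model()` as placeholders:

from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
for kf, (train_idx, val_idx) in enumerate(kfold.split(X)):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    model = build_model()          # placeholder for the repo's model setup
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)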

Example #5

import argparse

import numpy as np
import torch


def single_infer(model_path, q1, q2):
    # ... (model/tokenizer loading and question-pair encoding elided;
    #      see the encoding sketch after this script) ...
    model.eval()
    with torch.no_grad():
        logits = model(input_id,
                       token_type_ids=token_type_id,
                       attention_mask=attention_mask)

    # the first element of the model output is the classification logits
    logits = logits[0].detach().cpu().numpy()
    pred = np.argmax(logits, axis=1)

    pred_label = decode_label(pred)
    print("Result Prediction: ", pred_label)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', type=str, default='model/')
    parser.add_argument('--q1', type=str, default='')
    parser.add_argument('--q2', type=str, default='')
    parser.add_argument('--test_data', type=str, default='../data/test.tsv')
    parser.add_argument('--batch_size', type=int, default=32)
    opt = parser.parse_args()

    logger = log(path="logs/", file="bert_inference.log")

    device_type = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_type)
    logger.debug(device)

    single_infer(opt.model_path, opt.q1, opt.q2)
    # main(opt.model_path, opt.test_data, opt.batch_size)
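
The `input_id`, `token_type_id`, and `attention_mask` tensors used in `single_infer` come from encoding the question pair; that step is elided above. Assuming a HuggingFace `transformers` tokenizer (the repo's actual loading code is not shown), the encoding looks like:

from transformers import BertTokenizer

# Sketch of the assumed encoding step for a single question pair;
# `device` is the torch.device created in the __main__ block above.
tokenizer = BertTokenizer.from_pretrained(model_path)  # or a pretrained name
encoded = tokenizer(q1, q2, truncation=True, max_length=128,
                    return_tensors="pt")
input_id = encoded["input_ids"].to(device)
token_type_id = encoded["token_type_ids"].to(device)
attention_mask = encoded["attention_mask"].to(device)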