def load_data(src, tgt, src_emb, tgt_emb, src_lng, tgt_lng, center_lng):
    print(f"Loading src: {src} ... ", file=sys.stderr, end="", flush=True)
    src_embeddings = load_word_embeddings(src_emb)
    print(" embeddings ... ", end="", file=sys.stderr)
    src_repr = word_embeddings_for_file(src,
                                        src_embeddings,
                                        src_lng,
                                        mean_pool=False,
                                        skip_tokenization=True)
    print("Done", file=sys.stderr)

    print(f"Loading tgt: {tgt} ... ", file=sys.stderr, end="", flush=True)
    tgt_embeddings = load_word_embeddings(tgt_emb)
    print(" embeddings ... ", end="", file=sys.stderr)
    tgt_repr = word_embeddings_for_file(tgt,
                                        tgt_embeddings,
                                        tgt_lng,
                                        mean_pool=False,
                                        skip_tokenization=True)
    print("Done", file=sys.stderr)

    if center_lng:
        print("Centering data.", file=sys.stderr)
        src_repr, tgt_repr = center(src_repr), center(tgt_repr)

    return src_repr, tgt_repr
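
The center() helper is not defined in any of the snippets on this page. A minimal NumPy sketch consistent with its call sites here, and with the --center-lng help text in Example #14 ("Center languages to be around coordinate origin"), might be:

import numpy as np

def center(repr_matrix):
    # Hypothetical reconstruction, not the original implementation:
    # subtract the per-dimension mean so the representations are
    # centered around the coordinate origin.
    return repr_matrix - repr_matrix.mean(axis=0, keepdims=True)

Example #15 applies the same operation to torch tensors, where mean(dim=0, keepdim=True) plays the same role.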
Example #2
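Fits a scikit-learn LinearRegression that maps sentence representations of the source language onto the target language, then saves the fitted model with joblib.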
def main():
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("data_lng1",
                        type=str,
                        help="Training sentences in the source language.")
    parser.add_argument("data_lng2",
                        type=str,
                        help="Training sentences in the target language.")
    parser.add_argument("lng1_emb",
                        type=str,
                        help="Source language word embeddings.")
    parser.add_argument("lng2_emb",
                        type=str,
                        help="Target language word embeddings.")
    parser.add_argument("lng1", type=str, help="Source language code.")
    parser.add_argument("lng2", type=str, help="Target language code.")
    parser.add_argument("save_model",
                        type=str,
                        help="Path to the saved model.")
    parser.add_argument("--src-proj",
                        default=None,
                        type=str,
                        help="Sklearn projection of the source language.")
    parser.add_argument("--mt-proj",
                        default=None,
                        type=str,
                        help="Sklearn projection of the target language.")
    parser.add_argument("--num-threads", type=int, default=4)
    args = parser.parse_args()

    print(f"Loading {args.lng1} embeddings.", file=sys.stderr)
    lng1_embeddings = load_word_embeddings(args.lng1_emb)
    print(f"Loading {args.lng2} embeddings.", file=sys.stderr)
    lng2_embeddings = load_word_embeddings(args.lng2_emb)

    print(f"Loading representation for {args.data_lng1}", file=sys.stderr)
    lng1_repr = np.stack(
        word_embeddings_for_file(args.data_lng1, lng1_embeddings, args.lng1))
    print(f"Loading representation for {args.data_lng2}", file=sys.stderr)
    lng2_repr = np.stack(
        word_embeddings_for_file(args.data_lng2, lng2_embeddings, args.lng2))
    print("Representations loaded.", file=sys.stderr)

    if args.src_proj is not None:
        lng1_repr = apply_sklearn_proj(lng1_repr, args.src_proj)
    if args.mt_proj is not None:
        lng2_repr = apply_sklearn_proj(lng2_repr, args.mt_proj)

    print("Fitting the projection.", file=sys.stderr)
    model = LinearRegression()
    model.fit(lng1_repr, lng2_repr)
    print("Done, saving model.", file=sys.stderr)

    joblib.dump(model, args.save_model)
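
apply_sklearn_proj() is likewise not shown on this page. Since this example saves its fitted LinearRegression with joblib and other snippets pass the function a file path, a plausible minimal sketch is:

import joblib
import numpy as np

def apply_sklearn_proj(repr_matrix, model_path):
    # Hypothetical reconstruction: load a previously saved sklearn
    # model (such as the LinearRegression dumped above) and project
    # the representations through it.
    model = joblib.load(model_path)
    return model.predict(np.asarray(repr_matrix))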
Example #3
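Trains and evaluates the IAN (Interactive Attention Network) aspect-level sentiment model in TensorFlow 1.x, reporting the total running time.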
def main(_):
    start_time = time.time()

    print('Loading data info ...')
    word2id, FLAGS.max_aspect_len, FLAGS.max_context_len = get_data_info(
        FLAGS.train_file_name, FLAGS.test_file_name, FLAGS.data_info,
        FLAGS.pre_processed)

    print('Loading training data and testing data ...')
    train_data = read_data(FLAGS.train_file_name, word2id,
                           FLAGS.max_aspect_len, FLAGS.max_context_len,
                           FLAGS.train_data, FLAGS.pre_processed)
    test_data = read_data(FLAGS.test_file_name, word2id, FLAGS.max_aspect_len,
                          FLAGS.max_context_len, FLAGS.test_data,
                          FLAGS.pre_processed)

    print('Loading pre-trained word vectors ...')
    FLAGS.embedding_matrix = load_word_embeddings(FLAGS.embedding_file_name,
                                                  FLAGS.embedding_dim, word2id)

    with tf.Session() as sess:
        model = IAN(FLAGS, sess)
        model.build_model()
        model.run(train_data, test_data)

    end_time = time.time()
    print('Time Costing: %s' % (end_time - start_time))
Example #4
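A variant of the previous example that caps per-process GPU memory at 50% through tf.ConfigProto.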
def main(_):
    start_time = time.time()

    print('Loading data info ...')
    word2id, FLAGS.max_aspect_len, FLAGS.max_context_len = get_data_info(
        FLAGS.dataset, FLAGS.pre_processed)

    print('Loading training data and testing data ...')
    train_data = read_data(word2id, FLAGS.max_aspect_len,
                           FLAGS.max_context_len, FLAGS.dataset + 'train',
                           FLAGS.pre_processed)
    test_data = read_data(word2id, FLAGS.max_aspect_len, FLAGS.max_context_len,
                          FLAGS.dataset + 'test', FLAGS.pre_processed)

    print('Loading pre-trained word vectors ...')
    FLAGS.embedding_matrix = load_word_embeddings(FLAGS.embedding_file_name,
                                                  FLAGS.embedding_dim, word2id)

    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    with tf.Session(config=config) as sess:
        model = IAN(FLAGS, sess)
        model.build_model(train_data, test_data)
        model.run()

    end_time = time.time()
    print('Time Costing: %s' % (end_time - start_time))
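Example #5
Another IAN variant that also reads a validation split ('val') alongside the train and test splits and drives training through a standalone run() function.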
def main(_):
    start_time = time.time()

    print('Loading data info ...')
    word2id, FLAGS.max_aspect_len, FLAGS.max_context_len = get_data_info(
        dataset, pre_processed)

    print('Loading training, validation and testing data ...')
    train_data = read_data(word2id, FLAGS.max_aspect_len,
                           FLAGS.max_context_len, dataset + 'train',
                           pre_processed)
    test_data = read_data(word2id, FLAGS.max_aspect_len, FLAGS.max_context_len,
                          dataset + 'val', pre_processed)
    test_new_data = read_data(word2id, FLAGS.max_aspect_len,
                              FLAGS.max_context_len, dataset + 'test',
                              pre_processed)

    print('Loading pre-trained word vectors ...')
    FLAGS.embedding_matrix = load_word_embeddings(embedding_file_name,
                                                  FLAGS.embedding_dim, word2id)

    model = IAN(FLAGS)
    run(model, train_data, test_data, test_new_data)

    end_time = time.time()
    print('Time Costing: %s' % (end_time - start_time))
Example #6
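Converts a plain-text embedding file into a binary joblib dump for faster loading.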
def main():
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("vec_file", type=str, help="File with embeddings")
    args = parser.parse_args()

    embeddings = load_word_embeddings(args.vec_file)

    joblib.dump(embeddings, args.vec_file + ".bin")
Example #7
File: main.py Project: haozijie/RAM
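Trains the RAM aspect-level sentiment model, with all paths and hyperparameters carried on FLAGS.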
def main(_):
    print('Loading data info ...')
    FLAGS.word2id, FLAGS.max_sentence_len, FLAGS.max_aspect_len = get_data_info(FLAGS.train_fname, FLAGS.test_fname, FLAGS.data_info, FLAGS.pre_processed)

    print('Loading training data and testing data ...')
    train_data = read_data(FLAGS.train_fname, FLAGS.word2id, FLAGS.max_sentence_len, FLAGS.max_aspect_len, FLAGS.train_data, FLAGS.pre_processed)
    test_data = read_data(FLAGS.test_fname, FLAGS.word2id, FLAGS.max_sentence_len,  FLAGS.max_aspect_len, FLAGS.test_data, FLAGS.pre_processed)

    print('Loading pre-trained word vectors ...')
    FLAGS.word2vec = load_word_embeddings(FLAGS.embedding_fname, FLAGS.embedding_dim, FLAGS.word2id)

    with tf.Session() as sess:
        model = RAM(FLAGS, sess)
        model.build_model()
        model.run(train_data, test_data)
Example #8
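A RAM variant that registers the computed maximum lengths as new TensorFlow flags and passes word2id and word2vec into the model constructor.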
def main(_):
    print('Loading data info ...')
    #FLAGS.word2id, FLAGS.max_sentence_len, FLAGS.max_aspect_len = get_data_info(FLAGS.train_fname, FLAGS.test_fname, FLAGS.data_info, FLAGS.pre_processed)
    print('Step 1: get basic information about the train and test data ...')
    word2id, max_sentence_len, max_aspect_len = get_data_info(
        FLAGS.train_fname, FLAGS.test_fname, FLAGS.data_info,
        FLAGS.pre_processed)
    #sys.exit()

    #tf.app.flags.DEFINE_string('word2id', word2id, 'word2id')
    tf.app.flags.DEFINE_integer('max_sentence_len', max_sentence_len,
                                'max sentence len')
    tf.app.flags.DEFINE_integer('max_aspect_len', max_aspect_len,
                                'max aspect len')

    print('Step 2: loading training and testing data ...')
    print('Step 2.1: reading training data ...')
    train_data = read_data(FLAGS.train_fname, word2id, max_sentence_len,
                           max_aspect_len, FLAGS.train_data,
                           FLAGS.pre_processed, FLAGS.sentiment_data)
    #sys.exit()
    print('Step 2.2: reading testing data ...')
    test_data = read_data(FLAGS.test_fname, word2id, max_sentence_len,
                          max_aspect_len, FLAGS.test_data, FLAGS.pre_processed,
                          FLAGS.sentiment_data)

    print('Loading pre-trained word vectors ...')
    word2vec = load_word_embeddings(FLAGS.embedding_fname, FLAGS.embedding_dim,
                                    word2id)

    with tf.Session() as sess:
        model = RAM(FLAGS, word2id, word2vec, sess)

        print('Build model ...')
        model.build_model()

        print('Run model ...')
        model.run(train_data, test_data)
Example #9
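An IAN training script equivalent to Example #3, with data info, datasets, and word vectors all stored on FLAGS.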
def main(_):
    print('Loading data info ...')
    FLAGS.word2id, FLAGS.max_aspect_len, FLAGS.max_context_len = get_data_info(
        FLAGS.train_fname, FLAGS.test_fname, FLAGS.data_info,
        FLAGS.pre_processed)

    print('Loading training data and testing data ...')
    train_data = read_data(FLAGS.train_fname, FLAGS.word2id,
                           FLAGS.max_aspect_len, FLAGS.max_context_len,
                           FLAGS.train_data, FLAGS.pre_processed)
    test_data = read_data(FLAGS.test_fname, FLAGS.word2id,
                          FLAGS.max_aspect_len, FLAGS.max_context_len,
                          FLAGS.test_data, FLAGS.pre_processed)

    print('Loading pre-trained word vectors ...')
    FLAGS.word2vec = load_word_embeddings(FLAGS.embedding_fname,
                                          FLAGS.embedding_dim, FLAGS.word2id)

    with tf.Session() as sess:
        model = IAN(FLAGS, sess)
        model.build_model()
        model.run(train_data, test_data)
Example #10
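The constructor of a PyTorch LSTM classifier whose embedding layer is initialized from pre-trained vectors; freeze=False keeps them trainable.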
    def __init__(self, config):
        super(LSTMClassifier, self).__init__()
        self.dropout = config['dropout']
        self.n_layers = config['n_layers']
        self.hidden_dim = config['hidden_dim']
        self.output_dim = config['output_dim']
        self.vocab_size = config['vocab_size']
        self.embedding_dim = config['embedding_dim']
        self.bidirectional = config['bidirectional']

        self.embedding = nn.Embedding.from_pretrained(load_word_embeddings(),
                                                      freeze=False)

        self.rnn = nn.LSTM(self.embedding_dim,
                           self.hidden_dim,
                           bias=True,
                           num_layers=self.n_layers,
                           dropout=self.dropout,
                           bidirectional=self.bidirectional)
        self.n_directions = 2 if self.bidirectional else 1
        self.out = nn.Linear(self.n_directions * self.hidden_dim,
                             self.output_dim)
        self.softmax = F.softmax
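
Only the constructor is shown above. A usage sketch, assuming the class and the project's load_word_embeddings() are importable; every config value below is an illustrative assumption:

config = {
    'dropout': 0.2,          # applied between stacked LSTM layers
    'n_layers': 2,
    'hidden_dim': 128,
    'output_dim': 5,
    'vocab_size': 10000,     # must match the pretrained matrix rows
    'embedding_dim': 300,    # must match the pretrained matrix columns
    'bidirectional': True,
}
model = LSTMClassifier(config)

Note that nn.Embedding.from_pretrained infers the vocabulary size and embedding width from the loaded matrix, so vocab_size and embedding_dim must agree with whatever load_word_embeddings() returns.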
Example #11
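A 10-fold cross-validation training loop for the CRF-based MATEPC model, with early stopping on validation F1 and per-fold checkpointing.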
def train_model(data_name="laptops",
                task_name="ATEPC",
                params_str="w2v,150,200,20,0.0010,20,0.001"):
    DATA_ROOT = os.getcwd() + '/data'
    SAVE_ROOT = os.getcwd() + '/models'  # trained models
    LOG_ROOT = os.getcwd() + '/logs'

    print("-----{0}-----{1}-----{2}-----".format(task_name, data_name,
                                                 params_str))

    # ----- create save directory -----
    save_path = SAVE_ROOT + "/{0}/{1}".format(data_name, task_name)
    if not os.path.exists(SAVE_ROOT):
        os.makedirs(SAVE_ROOT)
    if not os.path.exists(LOG_ROOT):
        os.makedirs(LOG_ROOT)
    if not os.path.exists(SAVE_ROOT + "/{0}".format(data_name)):
        os.makedirs(SAVE_ROOT + "/{0}".format(data_name))
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # ----- load raw data -----
    train_path = os.path.join(DATA_ROOT,
                              '{0}.{1}.train.tsv'.format(data_name, task_name))
    test_path = os.path.join(DATA_ROOT,
                             '{0}.{1}.test.tsv'.format(data_name, task_name))
    # train set
    if task_name == "ATE":
        sents1, _, _, _, labels1, preds1 = collect_data_infor_from_tsv(
            train_path, keep_conflict=True)
    else:
        sents1, _, _, _, labels1, preds1 = collect_data_infor_from_tsv(
            train_path, keep_conflict=False)
    X1_train_valid = sents1
    Y_train_valid = labels1
    # test set
    sents2, _, _, _, labels2, preds2 = collect_data_infor_from_tsv(
        test_path, keep_conflict=True)
    X1_test = sents2
    Y_test_origin = labels2
    # train + test for counting vocab size
    X1_train_test = np.concatenate((X1_train_valid, X1_test), axis=0)
    Y_train_test = np.concatenate((Y_train_valid, Y_test_origin), axis=0)

    # ----- Model Config
    model_config = ModelConfig()
    model_config.adjust_params_follow_paramstr(params_str)
    p = WordPreprocessor()
    p.fit(X1=X1_train_test, Y=Y_train_test)
    model_config.adjust_params_follow_preprocessor(p)
    print(p.vocab_tag)

    # ----- Embedding loading -----
    w_embedding_path = 'models/{0}.word.{1}.txt'.format(
        model_config.embedding_name, model_config.word_embedding_size)
    W_embedding = load_word_embeddings(p.vocab_word, w_embedding_path,
                                       model_config.word_embedding_size)
    print(W_embedding.shape)

    # for evaluation 2 tasks
    atepc_evaluator = ATEPCNewEvaluator()

    kf = KFold(n_splits=10, shuffle=True)
    i_fold = 0
    model_name = params_str

    results = []
    X_test, Y_test = p.transform(X1=X1_test, Y=Y_test_origin)
    for train_index, valid_index in kf.split(X1_train_valid):
        model_name_ifold = model_name + "." + str(i_fold)
        # create data
        X1_train_ori = X1_train_valid[train_index]
        X1_valid_ori = X1_train_valid[valid_index]
        Y_train_ori = Y_train_valid[train_index]
        Y_valid_ori = Y_train_valid[valid_index]

        X_train, Y_train = p.transform(X1=X1_train_ori, Y=Y_train_ori)
        X_valid, Y_valid = p.transform(X1=X1_valid_ori, Y=Y_valid_ori)
        data = create_data_object(X_train, Y_train, X_valid, Y_valid, X_test,
                                  Y_test)
        # data = create_data_object(copy.deepcopy(X_valid), copy.deepcopy(Y_valid), X_valid , Y_valid, X_test, Y_test)
        f1_valid_best = -1.0
        patient_i = model_config.patience

        sess = tf.Session()
        with sess.as_default():
            # tensorflow model
            model = MATEPC(config=model_config)
            sess.run(tf.global_variables_initializer())
            model.load_word_embedding(sess, initial_weights=W_embedding)

            for epoch_i in range(model_config.max_epoch):
                train_start = int(time.time())
                crf_transition_parameters, loss_train = train_step(
                    sess, model, model_config, data, "train")
                train_end = int(time.time())
                valid_start = int(time.time())
                f1_valid, ys_pred_valid, ys_true_valid, loss_valid = predict_step(
                    sess, model, p, data, "valid", crf_transition_parameters)
                f1_test, ys_pred_test, ys_true_test, loss_test = predict_step(
                    sess, model, p, data, "test", crf_transition_parameters)
                ate_f1_valid, apc_acc_valid = atepc_evaluator.evaluate(
                    ys_true_valid, ys_pred_valid, verbose=False)
                ate_f1_test, apc_acc_test = atepc_evaluator.evaluate(
                    ys_true_test, ys_pred_test, verbose=False)
                valid_end = int(time.time())
                if f1_valid > f1_valid_best:
                    patient_i = model_config.patience
                    f1_valid_best = f1_valid
                    model.saver.save(sess,
                                     save_path=os.path.join(
                                         save_path, model_name_ifold))
                    p.save(file_path=os.path.join(save_path, model_name_ifold))
                    print(
                        "Epoch {0}. Training/valid loss: {1:.4f}/{6:.4f}. Validation f1: {2:.2f}. Time(train/valid): ({4}/{5})s .Patience: {3}. __BEST__, ({7},{8}), ({9}/{10})"
                        .format(epoch_i, loss_train, f1_valid * 100, patient_i,
                                train_end - train_start,
                                valid_end - valid_start, loss_valid,
                                ate_f1_valid, apc_acc_valid, ate_f1_test,
                                apc_acc_test))
                else:
                    print(
                        "Epoch {0}. Training/valid loss: {1:.4f}/{6:.4f}. Validation f1: {2:.2f}. Time(train/valid): ({4}/{5})s .Patience: {3}.         , ({7},{8}), ({9}/{10})"
                        .format(epoch_i, loss_train, f1_valid * 100, patient_i,
                                train_end - train_start,
                                valid_end - valid_start, loss_valid,
                                ate_f1_valid, apc_acc_valid, ate_f1_test,
                                apc_acc_test))
                    patient_i -= 1
                    if patient_i < 0:
                        break

            model.saver.restore(sess,
                                save_path=os.path.join(save_path,
                                                       model_name_ifold))
            crf_transition_parameters = sess.run(
                model.crf_transition_parameters)
            f1_valid, _, _, loss_valid = predict_step(
                sess, model, p, data, "valid", crf_transition_parameters)
            f1_test, ys_pred, ys_true, loss_test = predict_step(
                sess, model, p, data, "test", crf_transition_parameters)
            print("F1 test, ATEPC task: ", f1_test)
            f1, acc = atepc_evaluator.evaluate(ys_true, ys_pred, verbose=True)
            results.append([f1_valid, f1, acc])
            write_result(os.path.join(LOG_ROOT, model_name_ifold + ".txt"),
                         sents2, ys_true, ys_pred)

        tf.reset_default_graph()
        i_fold += 1
        print("-----", i_fold, "-----")
Example #12
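Training script for an attention network (AN_model); the snippet begins partway through its tf.app.flags definitions.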
                               'the file saving data information')
    tf.app.flags.DEFINE_string('train_data', './data/train_data.txt',
                               'the file saving training data')
    tf.app.flags.DEFINE_string('test_data', './data/test_data.txt',
                               'the file saving testing data')

    print('Loading data info ...')
    FLAGS.word2id, FLAGS.max_aspect_len, FLAGS.max_context_len = get_data_info(
        FLAGS.train_fname, FLAGS.test_fname, FLAGS.data_info,
        FLAGS.pre_processed)

    print('Loading training data and testing data ...')
    train_data = read_data(FLAGS.train_fname, FLAGS.word2id,
                           FLAGS.max_aspect_len, FLAGS.max_context_len,
                           FLAGS.train_data, FLAGS.pre_processed)
    test_data = read_data(FLAGS.test_fname, FLAGS.word2id,
                          FLAGS.max_aspect_len, FLAGS.max_context_len,
                          FLAGS.test_data, FLAGS.pre_processed)

    print('Loading pre-trained word vectors ...')
    FLAGS.word2vec = load_word_embeddings(FLAGS.embedding_fname,
                                          FLAGS.embedding_dim, FLAGS.word2id)

    with tf.Session() as sess:
        model = AN_model(FLAGS, sess)
        model.build_model()
        model.train(train_data, test_data)

    print("model=AN, embedding=%s, batch-size=%s, n_epoch=%s, n_hidden=%s, data=%s" %
          (FLAGS.embedding, FLAGS.batch_size, FLAGS.n_epoch, FLAGS.n_hidden,
           FLAGS.train_fname))
Example #13
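Fits a WordPreprocessor on the concatenated train and test sets and loads 50-dimensional embeddings for its POS-tag feature dictionary; like Example #12, the snippet is truncated at the top.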
    X2_train_valid = np.asarray(list(zip(poses1, dep_idxs1, dep_relations1)))
    Y_train_valid = labels1

    # test set
    sents2, poses2, dep_idxs2, dep_relations2, labels2, preds2 = collect_data_infor_from_tsv(
        test_path, keep_conflict=True)
    X1_test = sents2
    X2_test = np.asarray(list(zip(poses2, dep_idxs2, dep_relations2)))
    Y_test = labels2

    # train + test
    X1_train_test = np.concatenate((X1_train_valid, X1_test), axis=0)
    X2_train_test = np.concatenate((X2_train_valid, X2_test), axis=0)
    Y_train_test = np.concatenate((Y_train_valid, Y_test), axis=0)

    p = WordPreprocessor()
    p.fit(X1=X1_train_test, X2=X2_train_test, Y=Y_train_test)
    A, B = p.transform(X1_train_test, Y=Y_train_test)
    # # preprocessor
    # print(p.max_length)
    # print(A[0].shape)
    # print(A[1].shape)
    # print(A[2].shape)
    # print(A[3].shape)
    # print(B.shape)

    POS_embeddings = load_word_embeddings(p.pos_extractor.features_dict,
                                          pos_embedding_path, 50)
    print(POS_embeddings)
    p.save("logs/p")
Example #14
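Trains a logistic-regression language-identification classifier on sentence representations, optionally subtracting each language's centroid first.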
def main():
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("embeddings_prefix",
                        type=str,
                        help="Directory with word embeddings.")
    parser.add_argument("languages",
                        type=str,
                        help="File with a list of languages.")
    parser.add_argument("train_data_txt", type=str, help="Training sentences.")
    parser.add_argument("train_data_lng",
                        type=str,
                        help="Language codes for training sentences.")
    parser.add_argument("val_data_txt", type=str, help="Validation sentences.")
    parser.add_argument("val_data_lng",
                        type=str,
                        help="Language codes for validation sentences.")
    parser.add_argument("test_data_txt", type=str, help="Test sentences.")
    parser.add_argument("test_data_lng",
                        type=str,
                        help="Language codes for test sentences.")
    parser.add_argument("--num-threads", type=int, default=4)
    parser.add_argument("--save-model",
                        type=str,
                        help="Path where to save the best model.")
    parser.add_argument("--save-centroids",
                        type=str,
                        help="Path to save language centroids.")
    parser.add_argument("--test-output",
                        type=str,
                        default=None,
                        help="Output for example classification.")
    parser.add_argument(
        "--center-lng",
        default=False,
        action="store_true",
        help="Center languages to be around coordinate origin.")
    args = parser.parse_args()

    with open(args.languages) as f_lang:
        languages = [line.strip() for line in f_lang]
    lng2idx = {lng: i for i, lng in enumerate(languages)}

    print("Loading embeddings.")
    all_embeddings = {
        lng: load_word_embeddings(f"{args.embeddings_prefix}/{lng}.vec")
        for lng in languages
    }

    print("Loading training data.")
    train_repr, train_tgt = load_dataset(args.train_data_txt,
                                         args.train_data_lng, all_embeddings,
                                         lng2idx)
    print("Loading test data.")
    test_repr, test_tgt = load_dataset(args.test_data_txt, args.test_data_lng,
                                       all_embeddings, lng2idx)

    if args.center_lng:
        centroids = np.stack([
            np.mean(train_repr[train_tgt == i], axis=0)
            for i in range(len(all_embeddings))
        ])
        train_repr = train_repr - centroids[train_tgt]
        test_repr = test_repr - centroids[test_tgt]

    model = LogisticRegression()
    model.fit(train_repr, train_tgt)

    test_prediction = model.predict(test_repr)

    accuracy = np.mean(test_prediction == test_tgt)
    print(accuracy)
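Example #15
Prints a per-sentence cosine similarity between source and machine-translated sentence representations; centering and sklearn projections are supported but mutually exclusive.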
def main():
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("src", type=str, help="Sentences in source language.")
    parser.add_argument("mt",
                        type=str,
                        help="Sentences in the target language.")
    parser.add_argument("src_emb",
                        type=str,
                        help="Source language word embeddings.")
    parser.add_argument("mt_emb",
                        type=str,
                        help="Target language word embeddings.")
    parser.add_argument("src_lng", type=str, help="Source language code.")
    parser.add_argument("mt_lng", type=str, help="Target language code.")
    parser.add_argument(
        "--mean-pool",
        default=False,
        action="store_true",
        help="If true, use mean-pooling instead of the [CLS] vector.")
    parser.add_argument("--center-lng",
                        default=False,
                        action="store_true",
                        help="If true, center representations first.")
    parser.add_argument("--batch-size", type=int, default=32)
    parser.add_argument("--src-proj",
                        default=None,
                        type=str,
                        help="Sklearn projection of the source language.")
    parser.add_argument("--mt-proj",
                        default=None,
                        type=str,
                        help="Sklearn projection of the target language.")
    parser.add_argument("--num-threads", type=int, default=4)
    args = parser.parse_args()

    if args.center_lng and (args.src_proj is not None
                            or args.mt_proj is not None):
        print(
            "You can either project or center "
            "the representations, not both.",
            file=sys.stderr)
        exit(1)

    torch.set_num_threads(args.num_threads)

    src_embeddings = load_word_embeddings(args.src_emb)
    mt_embeddings = load_word_embeddings(args.mt_emb)

    src_repr = torch.from_numpy(
        np.stack(
            word_embeddings_for_file(args.src, src_embeddings, args.src_lng)))
    mt_repr = torch.from_numpy(
        np.stack(word_embeddings_for_file(args.mt, mt_embeddings,
                                          args.mt_lng)))

    if args.center_lng:
        src_repr = center(src_repr)
        mt_repr = center(mt_repr)

    if args.src_proj is not None:
        src_repr = apply_sklearn_proj(src_repr, args.src_proj)
    if args.mt_proj is not None:
        mt_repr = apply_sklearn_proj(mt_repr, args.mt_proj)

    src_norm = (src_repr * src_repr).sum(1).sqrt()
    mt_norm = (mt_repr * mt_repr).sum(1).sqrt()

    cosine = (src_repr * mt_repr).sum(1) / src_norm / mt_norm

    for num in cosine.cpu().detach().numpy():
        print(num)
Example #16
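Evaluates a reflection-based word attribute transfer model, reporting accuracy on the train, validation, and test splits and stability on words that should stay unchanged.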
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch Reflection-based Word Attribute Transfer Example')
    parser.add_argument('--model-dir',
                        type=str,
                        required=True,
                        help='model directory')
    parser.add_argument('--attr',
                        type=str,
                        choices=["MF", "SP", "CC", "AN"],
                        required=True,
                        help='target attribute {MF, SP, CC, AN}')
    parser.add_argument('--gpu',
                        type=int,
                        default=-1,
                        help='gpu device id. -1 indicates cpu (default: -1)')

    args = parser.parse_args()
    with open(args.model_dir + '/args.json') as f:
        config = json.load(f)
        config.update(args.__dict__)
        args.__dict__ = config
    print(json.dumps(args.__dict__, indent=2))

    torch.manual_seed(args.seed)
    device, use_cuda = utils.get_device(args.gpu)

    # Load model
    print('loading model...')
    if args.weight_sharing:
        model = Ref_PM_Share(args.dim_x, args.dim_h).to(device)
    else:
        model = Ref_PM(args.dim_x, args.dim_h).to(device)
    model_path = args.model_dir + '/model.pt'
    model.load_state_dict(torch.load(model_path, map_location=device))
    print('loaded.')

    # Load word embeddings
    print('loading word embeddings...')
    word_embedding = utils.load_word_embeddings(args.emb)
    print('loaded.')

    # Calculate accuracy and stability
    attributes = [args.attr]
    include_one_to_many_data = args.attr == "AN"
    dataset = utils.load_dataset(0, attributes, args.seed, args.emb,
                                 include_one_to_many_data,
                                 args.invariant_word_type)
    print('calculating accuracy...')
    X_train = [d[3] for d in dataset if d[1] == 'train' and d[0] == 'A']
    T_train = [d[4] for d in dataset if d[1] == 'train' and d[0] == 'A']
    Z_train = [
        ATTR2ID[d[2]] for d in dataset if d[1] == 'train' and d[0] == 'A'
    ]
    Y_train = trasfer(model, device, X_train, Z_train, word_embedding)
    accuracy = mean([1 if y == t else 0 for y, t in zip(Y_train, T_train)])
    print('train accuracy: %f' % accuracy)

    X_valid = [d[3] for d in dataset if d[1] == 'valid' and d[0] == 'A']
    T_valid = [d[4] for d in dataset if d[1] == 'valid' and d[0] == 'A']
    Z_valid = [
        ATTR2ID[d[2]] for d in dataset if d[1] == 'valid' and d[0] == 'A'
    ]
    Y_valid = trasfer(model, device, X_valid, Z_valid, word_embedding)
    if args.attr == "AN":
        accuracy = mean([1 if y in t else 0 for y, t in zip(Y_valid, T_valid)])
    else:
        accuracy = mean([1 if y == t else 0 for y, t in zip(Y_valid, T_valid)])
    print('validation accuracy: %f' % accuracy)

    X_test = [d[3] for d in dataset if d[1] == 'test' and d[0] == 'A']
    T_test = [d[4] for d in dataset if d[1] == 'test' and d[0] == 'A']
    Z_test = [ATTR2ID[d[2]] for d in dataset if d[1] == 'test' and d[0] == 'A']
    Y_test = trasfer(model, device, X_test, Z_test, word_embedding)
    if args.attr == "AN":
        accuracy = mean([1 if y in t else 0 for y, t in zip(Y_test, T_test)])
    else:
        accuracy = mean([1 if y == t else 0 for y, t in zip(Y_test, T_test)])
    print('test accuracy: %f' % accuracy)

    print('calculating stability...')
    X_test = [d[3] for d in dataset if d[1] == 'test' and d[0] == 'N']
    T_test = [d[4] for d in dataset if d[1] == 'test' and d[0] == 'N']
    Z_test = [ATTR2ID[d[2]] for d in dataset if d[1] == 'test' and d[0] == 'N']
    Y_test = trasfer(model, device, X_test, Z_test, word_embedding)
    stability = mean([1 if y == t else 0 for y, t in zip(Y_test, T_test)])
    print('stability: %f' % stability)
Example #17
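Runs the attribute-transfer model either interactively (demo mode) or over a source text file, saving the transferred text next to the model directory.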
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch Reflection-based Word Attribute Transfer Example')
    parser.add_argument('--model-dir',
                        type=str,
                        required=True,
                        help='model directory')
    parser.add_argument('--attr',
                        type=str,
                        choices=["MF", "SP", "CC", "AN"],
                        required=True,
                        help='target attribute to transfer {MF, SP, CC, AN}')
    parser.add_argument(
        '--src',
        type=str,
        default='',
        help='path of source file. demo mode if no value is set')
    parser.add_argument('--no-tokenize',
                        action='store_true',
                        default=False,
                        help='disables tokenization (default: False)')
    parser.add_argument('--gpu',
                        type=int,
                        default=-1,
                        help='gpu device id. -1 indicates cpu (default: -1)')

    args = parser.parse_args()
    with open(args.model_dir + '/args.json') as f:
        config = json.load(f)
        config.update(args.__dict__)
        args.__dict__ = config
    print(json.dumps(args.__dict__, indent=2))

    torch.manual_seed(args.seed)
    device, use_cuda = utils.get_device(args.gpu)

    # Load model
    print('loading model...')
    if args.weight_sharing:
        model = Ref_PM_Share(args.dim_x, args.dim_h).to(device)
    else:
        model = Ref_PM(args.dim_x, args.dim_h).to(device)
    model_path = args.model_dir + '/model.pt'
    model.load_state_dict(torch.load(model_path, map_location=device))
    print('loaded.')

    # Load word embeddings
    print('loading word embeddings...')
    word_embedding = utils.load_word_embeddings(args.emb)
    print('loaded.')

    # Transfer
    demo_mode = not args.src
    if demo_mode:
        print('\n[demo mode]')
        while True:
            sentence = input('input:  ')
            tokens = [nltk.word_tokenize(sentence)]
            z = ATTR2ID[args.attr]
            result = trasfer_from_tokens(model, device, tokens, z,
                                         word_embedding, demo_mode)
            print('output: ' + ' '.join(result[0]))
    else:
        # Transfer text file
        with open(args.src) as f:
            src = f.read().split('\n')
        if args.emb == 'glove':
            src = list(map(str.lower, src))
        if args.no_tokenize:
            tokens = [sentence.split(' ') for sentence in src]
        else:
            tokens = [nltk.word_tokenize(sentence) for sentence in src]
        z = ATTR2ID[args.attr]
        result = trasfer_from_tokens(model, device, tokens, z, word_embedding,
                                     demo_mode)

        # Save result
        filename = args.src.split('/')[-1].split('.')[0]
        path = args.model_dir + '/result_' + args.attr + '_' + filename + '.txt'
        with open(path, 'w') as f:
            r = '\n'.join([' '.join(tokens) for tokens in result])
            f.write(r)
        print('results are saved at ', path)