Example #1
import json  # needed for json.dump below

def dump_word_dict():
  # train_data_path, valid_data_path, word_dict_path and build_word_dict are
  # defined elsewhere in the project.
  # Build the vocabulary from the training data, then merge in the validation
  # vocabulary, starting its ids above the training range and the reserved
  # special-token ids added below.
  word_dict = build_word_dict(train_data_path)
  start_index = len(word_dict) + 4 + 1
  word_dict.update(build_word_dict(valid_data_path, start_index))
  word_dict["<pad>"] = 0
  word_dict["<bos>"] = 1
  word_dict["<eos>"] = 2
  word_dict["<unk>"] = 3
  print('dumping word_dict...')
  with open(word_dict_path, 'w') as f:
    json.dump(word_dict, f)
  print('word_dict dumped')
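As a quick illustration of how the dumped dictionary is typically consumed, here is a minimal sketch that maps a tokenized sentence to ids with <unk> fallback and <pad> filling. The `encode` helper and the toy dictionary are assumptions for illustration; in the project the dictionary would be loaded back from the JSON file written above.

def encode(tokens, word_dict, max_len):
    # Map tokens to ids, fall back to <unk> for unseen words,
    # then pad (or truncate) to a fixed length.
    ids = [word_dict.get(t, word_dict["<unk>"]) for t in tokens]
    return ids[:max_len] + [word_dict["<pad>"]] * max(0, max_len - len(ids))

# Toy dictionary; in the project: word_dict = json.load(open(word_dict_path))
word_dict = {"<pad>": 0, "<bos>": 1, "<eos>": 2, "<unk>": 3, "the": 4, "cat": 5}
print(encode("the cat sat".split(), word_dict, max_len=6))  # -> [4, 5, 3, 0, 0, 0]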
Example #2
            # Inside the training loop of train(): run one optimization step,
            # then report test-set accuracy every 200 global steps.
            train_step(batch_x, batch_y)
            step = tf.train.global_step(sess, global_step)

            if step % 200 == 0:
                test_acc = test_accuracy(test_x, test_y)
                print("test_accuracy = {0}\n".format(test_acc))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--pre_trained",
                        type=str,
                        default="none",
                        help="none | auto_encoder | language_model")
    parser.add_argument("--summary_dir",
                        type=str,
                        default="classifier",
                        help="summary dir.")
    args = parser.parse_args()

    if not os.path.exists("dbpedia_csv"):
        print("Downloading dbpedia dataset...")
        download_dbpedia()

    print("\nBuilding dictionary..")
    word_dict = build_word_dict()
    print("Preprocessing dataset..")
    train_x, train_y = build_word_dataset("train", word_dict, MAX_DOCUMENT_LEN)
    test_x, test_y = build_word_dataset("test", word_dict, MAX_DOCUMENT_LEN)
    train(train_x, train_y, test_x, test_y, len(word_dict), args)
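The training loop in this example consumes (batch_x, batch_y) pairs produced elsewhere in the project. A minimal sketch of the kind of batch iterator that could feed it is shown below; the name `batch_iter`, the shuffling, and the epoch handling are assumptions, not code from the example.

import numpy as np

def batch_iter(inputs, outputs, batch_size, num_epochs, shuffle=True):
    # Yield (batch_x, batch_y) slices over the dataset for num_epochs passes.
    inputs, outputs = np.array(inputs), np.array(outputs)
    num_batches = (len(inputs) - 1) // batch_size + 1
    for _ in range(num_epochs):
        order = np.random.permutation(len(inputs)) if shuffle else np.arange(len(inputs))
        for b in range(num_batches):
            idx = order[b * batch_size:(b + 1) * batch_size]
            yield inputs[idx], outputs[idx]

# Toy usage:
for batch_x, batch_y in batch_iter([[1, 2], [3, 4], [5, 6]], [0, 1, 0], batch_size=2, num_epochs=1):
    print(batch_x.shape, batch_y.shape)  # -> (2, 2) (2,) then (1, 2) (1,)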
Example #3
    args.summary_dir = path
    args.model_dir = model_dir

    write_csv_files(train_text_dirs, test_text_dirs, args.labels, args.labels, path, 'train.csv', 'test.csv',
                    args.labeled_data_num, args.test_data_num)
    train_path = os.path.join(path, 'train.csv')
    test_path = os.path.join(path, 'test.csv')
    print("\nBuilding dictionary..")
    if args.pre_trained == 'none':
        unlabeled_csv_file = 'unlabeled_150000.csv'
        unlabeled_csv_path = os.path.join(model_dir, unlabeled_csv_file)
        if not os.path.exists(unlabeled_csv_path):
            write_csv_file([os.path.join(dataset_dir, args.data_type + '.txt')], [-1], model_dir, unlabeled_csv_file,
                           150000)
        print("\nBuilding dictionary..")
        word_dict = build_word_dict(model_dir, 20000, unlabeled_csv_path)
        print("Preprocessing dataset..")
        # word_dict = build_word_dict(model_dir, None, train_path)
    else:
        word_dict = build_word_dict(model_dir, None)
    print("Preprocessing dataset..")
    # Map each label string to a consecutive integer id.
    label_map = {label: k for k, label in enumerate(args.labels)}
    train_x, train_y = build_word_dataset(train_path, test_path, "train", word_dict, args.max_document_len, label_map,
                                          up_sample=args.up_sample)
    test_x, test_y = build_word_dataset(train_path, test_path, "test", word_dict, args.max_document_len, label_map)
    logout_config(args, train_y, test_y)
    train(train_x, train_y, test_x, test_y, len(word_dict), args)
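Examples #3 and #5 pass an `up_sample` flag into `build_word_dataset`, whose implementation is not shown here. Assuming it balances classes by oversampling the smaller ones, a minimal sketch of that idea could look like this (the `up_sample` helper below is hypothetical):

import random

def up_sample(xs, ys):
    # Replicate examples of minority classes until every class has as many
    # samples as the largest one.
    by_label = {}
    for x, y in zip(xs, ys):
        by_label.setdefault(y, []).append(x)
    target = max(len(items) for items in by_label.values())
    out_x, out_y = [], []
    for y, items in by_label.items():
        out_x.extend(items + [random.choice(items) for _ in range(target - len(items))])
        out_y.extend([y] * target)
    return out_x, out_y

xs, ys = up_sample(["a", "b", "c"], [0, 0, 1])
print(sorted(ys))  # -> [0, 0, 1, 1]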
Example #4
            # Write the train/test text splits once; skip files that already exist.
            if not os.path.exists(train_text_dir):
                with open(train_text_dir, 'w', encoding='utf8') as f_train:
                    f_train.writelines(all_lines[:train_sample])
            if not os.path.exists(test_text_dir):
                with open(test_text_dir, 'w', encoding='utf8') as f_test:
                    f_test.writelines(all_lines[-test_sample:])
        model_dir = get_train_path(args)
        args.model_dir = model_dir

        write_csv_files(train_text_dirs, test_text_dirs, args.labels,
                        args.labels, model_dir, 'train.csv', 'test.csv')
        # args.labeled_data_num, args.test_data_num)
        train_path = os.path.join(model_dir, 'train.csv')
        test_path = os.path.join(model_dir, 'test.csv')
        print("\nBuilding dictionary..")
        word_dict = build_word_dict(dataset_dir)
        embed_dict = build_embedding(word_dict, dataset_dir)
        print("Preprocessing dataset..")
        # Map each label string to a consecutive integer id.
        label_map = {label: k for k, label in enumerate(args.labels)}
        train_x, train_y, valid_x, valid_y = build_word_dataset(
            train_path,
            test_path,
            "train",
            word_dict,
            args.max_document_len,
            label_map,
            up_sample=args.up_sample)
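Example #4 additionally loads pre-trained vectors with `build_embedding(word_dict, dataset_dir)`. Assuming that call returns a word-to-vector mapping, a sketch of turning it into an index-aligned matrix for an embedding layer might look like this; `make_embedding_matrix`, the random init, and the dimensions are assumptions.

import numpy as np

def make_embedding_matrix(word_dict, embed_dict, dim):
    # Row i holds the vector of the word whose id is i; words without a
    # pre-trained vector keep a small random vector.
    matrix = np.random.uniform(-0.1, 0.1, size=(len(word_dict), dim)).astype(np.float32)
    for word, idx in word_dict.items():
        if word in embed_dict:
            matrix[idx] = embed_dict[word]
    return matrix

# Toy usage:
wd = {"<pad>": 0, "<unk>": 1, "news": 2}
ed = {"news": np.ones(4, dtype=np.float32)}
print(make_embedding_matrix(wd, ed, dim=4).shape)  # -> (3, 4)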
Example #5
            str(args.num_hidden),
            str(args.hidden_layers)
        ]))
    if not os.path.exists(path):
        os.makedirs(path)
    args.summary_dir = path
    args.model_dir = model_dir

    write_csv_files(train_text_dirs, test_text_dirs, args.labels, args.labels,
                    path, 'train.csv', 'test.csv', args.labeled_data_num,
                    args.test_data_num)
    train_path = os.path.join(path, 'train.csv')
    test_path = os.path.join(path, 'test.csv')
    print("\nBuilding dictionary..")
    if args.pre_trained == 'none':
        word_dict = build_word_dict(model_dir, None, train_path)
    else:
        word_dict = build_word_dict(model_dir, None)
    print("Preprocessing dataset..")
    # Map each label string to a consecutive integer id.
    label_map = {label: k for k, label in enumerate(args.labels)}
    train_x, train_y = build_word_dataset(train_path,
                                          test_path,
                                          "train",
                                          word_dict,
                                          args.max_document_len,
                                          label_map,
                                          up_sample=args.up_sample)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default="language_model", help="auto_encoder | language_model")
    # parser.add_argument("--model_name", type=str, default="model", help="the folder name of the model")
    parser.add_argument("--dict_size", type=int, default=20000, help="the max size of word dictionary")
    parser.add_argument("--data_folder", type=str, default="ACL", help="ACL | Markov | huffman_tree | two_tree")
    parser.add_argument("--data_type", type=str, default="news", help="movie | news | tweet")
    parser.add_argument("--unlabeled_data_num", type=int, default=50000, help="how many unlabeled data samples to use")
    parser.add_argument("--batch_size", type=int, default=128, help="batch size")
    parser.add_argument("--lr", type=float, default=0.001, help="learning rate")
    parser.add_argument("--num_epochs", type=int, default=10, help="epoch num")
    parser.add_argument("--max_document_len", type=int, default=30, help="max length of sentence")
    args = parser.parse_args()

    dataset_dir = os.path.join("dataset", args.data_folder, args.data_type)
    unlabeled_text_dirs = [os.path.join(dataset_dir, args.data_type + '.txt')]
    model_dir = os.path.join(args.model, args.data_folder, args.data_type,
                             str(args.unlabeled_data_num))
    unlabeled_csv_file = 'unlabeled_' + str(args.unlabeled_data_num) + '.csv'
    unlabeled_csv_path = os.path.join(model_dir, unlabeled_csv_file)
    # Dump the unlabeled samples to a CSV once; later runs reuse the file.
    if not os.path.exists(unlabeled_csv_path):
        write_csv_file(unlabeled_text_dirs, [-1], model_dir, unlabeled_csv_file, args.unlabeled_data_num)
    print("\nBuilding dictionary..")
    word_dict = build_word_dict(model_dir, args.dict_size, unlabeled_csv_path)
    print("Preprocessing dataset..")
    train_x, train_y = build_word_dataset(unlabeled_csv_path, None, "train",
                                          word_dict,
                                          args.max_document_len)
    logout_config(args, model_dir, len(word_dict))
    train(train_x, train_y, word_dict, args, model_dir)
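Example #5 pre-trains a language model on the unlabeled CSV. For such a model the target sequence is usually the input shifted by one position, framed with the <bos>/<eos>/<pad> ids from Example #1; the `lm_pairs` helper below is a hypothetical sketch of that pairing, not code from the project.

def lm_pairs(ids, bos, eos, pad, max_len):
    # Input:  <bos> w1 w2 ...  padded to max_len
    # Target: w1 w2 ... <eos>  padded to max_len (input shifted left by one)
    x = [bos] + ids
    y = ids + [eos]
    x = x[:max_len] + [pad] * max(0, max_len - len(x))
    y = y[:max_len] + [pad] * max(0, max_len - len(y))
    return x, y

print(lm_pairs([4, 5, 6], bos=1, eos=2, pad=0, max_len=6))
# -> ([1, 4, 5, 6, 0, 0], [4, 5, 6, 2, 0, 0])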