import os

import numpy as np

# Helper functions (build_vocab_list, build_vocabulary, load_glove_vocab, filter_glove_emb,
# build_dataset, write_json, ...) and constants (UNK, NUM, END, PUNCTUATION_VOCABULARY) are
# assumed to come from the project's own data-utility modules.


def process_data(config):
    train_file = os.path.join(config["raw_path"], "2014_train.txt")
    dev_file = os.path.join(config["raw_path"], "2014_dev.txt")
    #ref_file = os.path.join(config["raw_path"], "2014_test.txt")

    if not os.path.exists(config["save_path"]):
        os.makedirs(config["save_path"])
    # build vocabulary
    word_vocab, char_vocab = build_vocab_list([train_file], config["min_word_count"], config["min_char_count"],
                                              config["max_vocab_size"])
    if not config["use_pretrained"]:
        word_dict, char_dict = build_vocabulary(word_vocab, char_vocab)
    else:
        #glove_path = config["glove_path"].format(config["glove_name"], config["emb_dim"])
        glove_path = config["glove_path"]
        glove_vocab = load_glove_vocab(glove_path, config["glove_name"])
        glove_vocab = glove_vocab & {word.lower() for word in glove_vocab}
        word_vocab = [word for word in word_vocab if word in glove_vocab]
        word_dict, char_dict = build_vocabulary(word_vocab, char_vocab)
        tmp_word_dict = word_dict.copy()
        del tmp_word_dict[UNK], tmp_word_dict[NUM], tmp_word_dict[END]
        vectors = filter_glove_emb(tmp_word_dict, glove_path, config["glove_name"], config["emb_dim"])
        np.savez_compressed(config["pretrained_emb"], embeddings=vectors)
    # create indices dataset
    punct_dict = {punct: idx for idx, punct in enumerate(PUNCTUATION_VOCABULARY)}
    train_set = build_dataset([train_file], word_dict, char_dict, punct_dict, config["max_sequence_len"])
    dev_set = build_dataset([dev_file], word_dict, char_dict, punct_dict, config["max_sequence_len"])
    #ref_set = build_dataset([ref_file], word_dict, char_dict, punct_dict, config["max_sequence_len"])

    vocab = {"word_dict": word_dict, "char_dict": char_dict, "tag_dict": punct_dict}
    # write to file
    #write_json(config["vocab"], vocab)
    write_json(config["train_set"], train_set)
    write_json(config["dev_set"], dev_set)
def process_data(config):
    train_data = load_dataset(os.path.join(config["raw_path"], "train.txt"), config["task_name"])
    dev_data = load_dataset(os.path.join(config["raw_path"], "valid.txt"), config["task_name"])
    test_data = load_dataset(os.path.join(config["raw_path"], "test.txt"), config["task_name"])
    if not os.path.exists(config["save_path"]):
        os.makedirs(config["save_path"])
    # build vocabulary
    if not config["use_pretrained"]:
        word_dict = build_word_vocab([train_data, dev_data, test_data])
    else:
        glove_path = config["glove_path"].format(config["glove_name"], config["emb_dim"])
        glove_vocab = load_glove_vocab(glove_path, config["glove_name"])
        word_dict = build_word_vocab_pretrained([train_data, dev_data, test_data], glove_vocab)
        vectors = filter_glove_emb(word_dict, glove_path, config["glove_name"], config["emb_dim"])
        np.savez_compressed(config["pretrained_emb"], embeddings=vectors)
    tag_dict = build_tag_vocab([train_data, dev_data, test_data], config["task_name"])
    # build char dict
    train_data = load_dataset(os.path.join(config["raw_path"], "train.txt"), config["task_name"], keep_number=True,
                              lowercase=config["char_lowercase"])
    dev_data = load_dataset(os.path.join(config["raw_path"], "valid.txt"), config["task_name"], keep_number=True,
                            lowercase=config["char_lowercase"])
    test_data = load_dataset(os.path.join(config["raw_path"], "test.txt"), config["task_name"], keep_number=True,
                             lowercase=config["char_lowercase"])
    char_dict = build_char_vocab([train_data, dev_data, test_data])
    # create indices dataset
    train_set = build_dataset(train_data, word_dict, char_dict, tag_dict)
    dev_set = build_dataset(dev_data, word_dict, char_dict, tag_dict)
    test_set = build_dataset(test_data, word_dict, char_dict, tag_dict)
    vocab = {"word_dict": word_dict, "char_dict": char_dict, "tag_dict": tag_dict}
    # write to file
    write_json(os.path.join(config["save_path"], "vocab.json"), vocab)
    write_json(os.path.join(config["save_path"], "train.json"), train_set)
    write_json(os.path.join(config["save_path"], "dev.json"), dev_set)
    write_json(os.path.join(config["save_path"], "test.json"), test_set)
def process_data(config):
    # load raw data
    train_data = load_dataset(os.path.join(config["raw_path"], "train.crf"),
                              encoding="cp1252")
    dev_data = load_dataset(os.path.join(config["raw_path"], "dev.crf"),
                            encoding="cp1252")
    test_data = load_dataset(os.path.join(config["raw_path"], "test.crf"),
                             encoding="cp1252")
    # build vocabulary
    word_dict, char_dict, _ = build_vocab([train_data, dev_data])
    *_, tag_dict = build_vocab([train_data, dev_data, test_data])
    # create indices dataset
    train_set = build_dataset(train_data, word_dict, char_dict, tag_dict)
    dev_set = build_dataset(dev_data, word_dict, char_dict, tag_dict)
    test_set = build_dataset(test_data, word_dict, char_dict, tag_dict)
    vocab = {
        "word_dict": word_dict,
        "char_dict": char_dict,
        "tag_dict": tag_dict
    }
    # write to file
    if not os.path.exists(config["save_path"]):
        os.makedirs(config["save_path"])
    write_json(os.path.join(config["save_path"], "vocab.json"), vocab)
    write_json(os.path.join(config["save_path"], "train.json"), train_set)
    write_json(os.path.join(config["save_path"], "dev.json"), dev_set)
    write_json(os.path.join(config["save_path"], "test.json"), test_set)