def create_vocab(vocab_file_eng, vocab_file_fra):
    """Build English and French vocabularies from the eng-fra corpus.

    If either cached vocab JSON file is missing, the corpus at
    ./data/eng-fra.txt is scanned (tab-separated sentence pairs), simple
    pairs are added to fresh vocabs, and both vocabs are written to disk.
    Otherwise the cached vocabs are loaded.

    Args:
        vocab_file_eng: Path of the cached English vocab JSON file.
        vocab_file_fra: Path of the cached French vocab JSON file.

    Returns:
        Tuple (vocab_eng, vocab_fra) of Vocab instances.
    """
    if not os.path.exists(vocab_file_eng) or not os.path.exists(vocab_file_fra):
        vocab_eng = Vocab("eng")
        vocab_fra = Vocab("fra")
        logger.info('Creating vocab.')
        with open("./data/eng-fra.txt", "r", encoding="utf-8") as f:
            print("Create Vocab")
            # Iterate the file object directly instead of f.readlines():
            # avoids materializing the whole corpus in memory at once.
            for line in tqdm(f):
                pair = normalizePair(line.split('\t'))
                # Skip pairs the filter deems too complex for training.
                if not pair_is_simple(pair):
                    continue
                vocab_eng.add_sentence(pair[0], to_lower=True, remove_punc=True)
                vocab_fra.add_sentence(pair[1], to_lower=True, remove_punc=True)
        logger.info(vocab_eng)
        logger.info(vocab_fra)
        logger.info('Storing vocab.')
        vocab_eng.to_json(vocab_file_eng)
        vocab_fra.to_json(vocab_file_fra)
        return vocab_eng, vocab_fra
    # BUG FIX: the original fell through and implicitly returned None when
    # both cache files already existed; load and return the cached vocabs.
    logger.info('Loading vocab...')
    return Vocab.from_json(vocab_file_eng), Vocab.from_json(vocab_file_fra)
def create_vocab(vocab_file_eng):
    """Build a single English vocabulary from the Cornell movie-dialogs corpus.

    If the cached vocab JSON file is missing, the formatted movie lines
    (tab-separated utterance pairs) are scanned, both sides of each simple
    pair are added to one vocab, and the vocab is written to disk.
    Otherwise the cached vocab is loaded.

    Args:
        vocab_file_eng: Path of the cached vocab JSON file.

    Returns:
        A Vocab instance covering both sides of the dialog pairs.
    """
    if not os.path.exists(vocab_file_eng):
        vocab_eng = Vocab("eng")
        logger.info('Creating vocab.')
        with open(
                "./data/cornell movie-dialogs corpus/formatted_movie_lines.txt",
                "r",
                encoding="utf-8") as f:
            print("Create Vocab")
            # Iterate the file object directly instead of f.readlines():
            # avoids materializing the whole corpus in memory at once.
            for line in tqdm(f):
                pair = normalizePair(line.split('\t'))
                # Skip pairs the filter deems too complex for training.
                if not pair_is_simple(pair):
                    continue
                # Both utterances of a dialog pair feed the same vocab;
                # punctuation is kept (remove_punc=False) for chatbot output.
                vocab_eng.add_sentence(pair[0],
                                       to_lower=True,
                                       remove_punc=False)
                vocab_eng.add_sentence(pair[1],
                                       to_lower=True,
                                       remove_punc=False)
        logger.info(vocab_eng)
        logger.info('Storing vocab.')
        vocab_eng.to_json(vocab_file_eng)
        return vocab_eng
    # BUG FIX: the original fell through and implicitly returned None when
    # the cache file already existed; load and return the cached vocab.
    logger.info('Loading vocab...')
    return Vocab.from_json(vocab_file_eng)
# Example #3
# Preprocessed IMDB binary-sentiment splits; each line is "<label> <text>".
datalist = [
    "./imdb-binary_sentiment_classification-preprocessed_data/imdb.binary_sentiment_classification.train.txt",
    "./imdb-binary_sentiment_classification-preprocessed_data/imdb.binary_sentiment_classification.valid.txt",
    "./imdb-binary_sentiment_classification-preprocessed_data/imdb.binary_sentiment_classification.test.txt"
]

vocab_file = "my_vocab.json"
if not os.path.exists(vocab_file):
    my_vocab = Vocab("my_vocab")
    for data in datalist:
        with open(data, "r", encoding="utf-8") as f:
            for line in f:
                # Split off the leading label; only the text feeds the vocab.
                tgt, inp = line.split(' ', 1)
                my_vocab.add_sentence(inp, to_lower=True, remove_punc=False)
    logger.info("vocab size: {}".format(len(my_vocab)))
    # Cap vocab at the 50k most frequent tokens before persisting.
    my_vocab.keep_most_frequent_k(k=50000)
    # BUG FIX: write to vocab_file rather than a second hard-coded literal,
    # so renaming the cache file only requires one change.
    my_vocab.to_json(vocab_file)
else:
    logger.info('Loading vocab...')
    my_vocab = Vocab.from_json(vocab_file)

# Restrict pre-trained GloVe (100-dim) embeddings to the vocab's tokens.
pretrain_embedding = my_vocab.extract_pretrain_embedding(
    "./glove.6B.100d.txt", 100)


def prepare_data(data_path, vocab):
    data_ids = []
    with open(data_path, "r", encoding="utf-8") as f:
        print("Prepare Data")