def get_vocab():
    """Return the (english, french) vocab pair, building it if not cached.

    Looks for the two JSON cache files; if either is missing, delegates to
    ``create_vocab`` to build and persist both.  Otherwise loads both caches.
    """
    eng_path = "vocab_eng.json"
    fra_path = "vocab_fra.json"
    # Guard clause: rebuild both vocabs when either cache file is absent.
    if not (os.path.exists(eng_path) and os.path.exists(fra_path)):
        return create_vocab(eng_path, fra_path)
    logger.info('Loading vocab.')
    vocab_eng = Vocab.from_json(eng_path)
    vocab_fra = Vocab.from_json(fra_path)
    logger.info(vocab_eng)
    logger.info(vocab_fra)
    return vocab_eng, vocab_fra
def create_vocab(vocab_file_eng):
    """Build (or load) the English vocabulary for the Cornell movie corpus.

    Parameters
    ----------
    vocab_file_eng : str
        Path of the JSON cache file for the vocabulary.

    Returns
    -------
    Vocab
        Freshly built from the corpus when the cache is missing, otherwise
        loaded from the cache (previously this path fell through and
        returned None, surprising callers).
    """
    if os.path.exists(vocab_file_eng):
        logger.info('Loading vocab.')
        return Vocab.from_json(vocab_file_eng)

    vocab_eng = Vocab("eng")
    logger.info('Creating vocab.')
    with open(
            "./data/cornell movie-dialogs corpus/formatted_movie_lines.txt",
            "r",
            encoding="utf-8") as f:
        print("Create Vocab")
        # Iterate the file object directly instead of f.readlines() so the
        # whole corpus is never held in memory at once; tqdm wraps any
        # iterable, so the progress bar is preserved.
        for line in tqdm(f):
            pair = normalizePair(line.split('\t'))
            # Skip pairs the project-level filter rejects (too long/complex).
            if not pair_is_simple(pair):
                continue
            # Both sides of a dialog pair feed the same English vocab.
            vocab_eng.add_sentence(pair[0], to_lower=True, remove_punc=False)
            vocab_eng.add_sentence(pair[1], to_lower=True, remove_punc=False)
    logger.info(vocab_eng)
    logger.info('Storing vocab.')
    vocab_eng.to_json(vocab_file_eng)
    return vocab_eng
# --- Example #3 ---
def create_vocab(vocab_file_eng, vocab_file_fra):
    """Build (or load) the English and French vocabularies from eng-fra.txt.

    Parameters
    ----------
    vocab_file_eng : str
        Path of the JSON cache for the English vocab.
    vocab_file_fra : str
        Path of the JSON cache for the French vocab.

    Returns
    -------
    tuple of Vocab
        ``(vocab_eng, vocab_fra)``.  When both cache files already exist
        they are loaded instead of rebuilt (previously this path fell
        through and returned None, surprising callers such as get_vocab).
    """
    if os.path.exists(vocab_file_eng) and os.path.exists(vocab_file_fra):
        logger.info('Loading vocab.')
        return (Vocab.from_json(vocab_file_eng),
                Vocab.from_json(vocab_file_fra))

    vocab_eng = Vocab("eng")
    vocab_fra = Vocab("fra")
    logger.info('Creating vocab.')
    with open("./data/eng-fra.txt", "r", encoding="utf-8") as f:
        # Each line is a tab-separated "english<TAB>french" pair.
        for line in f:
            pair = normalizePair(line.split('\t'))
            # Skip pairs the project-level filter rejects.
            if not pair_is_simple(pair):
                continue
            vocab_eng.add_sentence(pair[0], to_lower=True, remove_punc=True)
            vocab_fra.add_sentence(pair[1], to_lower=True, remove_punc=True)
    logger.info(vocab_eng)
    logger.info(vocab_fra)
    logger.info('Storing vocab.')
    vocab_eng.to_json(vocab_file_eng)
    vocab_fra.to_json(vocab_file_fra)
    return vocab_eng, vocab_fra
# --- Example #4 ---
# Evaluation-script setup: load config, vocab caches, validation data, checkpoint.
# NOTE(review): `args`, `logger` and `use_gpu` are defined earlier in the file
# (argparse / logging / CUDA setup) — confirm against the full script.
config_file = args.config
with open(config_file, "r", encoding="utf-8") as f:
    config = json.load(f)

logger.info("Use GPU: {}.".format(use_gpu))
logger.info("Configurations:\n{}".format(str(config)))

# Both vocab caches must already exist; this script never rebuilds them and
# fails fast if either is missing.
vocab_file_eng = "vocab_eng.json"
vocab_file_fra = "vocab_fra.json"
if not os.path.exists(vocab_file_eng):
    raise FileNotFoundError(vocab_file_eng)
elif not os.path.exists(vocab_file_fra):
    raise FileNotFoundError(vocab_file_fra)
else:
    logger.info('Loading vocab.')
    vocab_eng = Vocab.from_json(vocab_file_eng)
    vocab_fra = Vocab.from_json(vocab_file_fra)
    logger.info(vocab_eng)
    logger.info(vocab_fra)

logger.info('Preparing data.')
# Pre-built validation data is likewise required up front.
val_data_path = 'val_data.json'
if not os.path.exists(val_data_path):
    raise FileNotFoundError(val_data_path)
else:
    with open(val_data_path, "r", encoding="utf-8") as f:
        val_data = json.load(f)

# Load model if a args.checkpoint is provided
if args.checkpoint is not None:
    logger.info('Loading checkpoint file [{}].'.format(args.checkpoint))
# --- Example #5 ---
# Command-line interface: JSON config path plus an optional checkpoint to resume.
parser = argparse.ArgumentParser()
parser.add_argument("--config", default="./config.json", type=str)
parser.add_argument("--checkpoint", default=None, type=str)
args = parser.parse_args()

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
config_file = args.config
with open(config_file) as f:
    config = json.load(f)

logger.info("Use GPU: {}.".format(use_gpu))
logger.info("Configurations:\n{}".format(str(config)))

# The vocab cache must already exist on disk; no fallback build here.
my_vocab = Vocab.from_json("my_vocab.json")

def prepare_data(data_path, vocab):
    data_ids = []
    with open(data_path, "r", encoding="utf-8") as f:
        print("Prepare Data")
        for line in f:
            line = line.split(' ', 1)
            tgt, inp = line[0], line[1]
            tgt = int(tgt)
            inp = inp.lower()
            data_ids.append({
                "input":
                vocab.indexes_from_sentence(inp, add_eos=False),
                "target":
# --- Example #6 ---
# IMDB sentiment-classification setup: config, vocab build/load, embeddings.
# NOTE(review): `args`, `logger` and `use_gpu` come from earlier in the file.
config_file = args.config
with open(config_file) as f:
    config = json.load(f)

logger.info("Use GPU: {}.".format(use_gpu))
logger.info("Configurations:\n{}".format(str(config)))

# Preprocessed IMDB splits; the split(' ', 1) below implies each line is
# "<label> <text>" — confirm against the preprocessing step.
datalist = [
    "./imdb-binary_sentiment_classification-preprocessed_data/imdb.binary_sentiment_classification.train.txt",
    "./imdb-binary_sentiment_classification-preprocessed_data/imdb.binary_sentiment_classification.valid.txt",
    "./imdb-binary_sentiment_classification-preprocessed_data/imdb.binary_sentiment_classification.test.txt"
]

# Build the vocab from all three splits once and cache it; otherwise reload it.
vocab_file = "my_vocab.json"
if not os.path.exists(vocab_file):
    my_vocab = Vocab("my_vocab")
    for data in datalist:
        with open(data, "r", encoding="utf-8") as f:
            for line in f:
                line = line.split(' ', 1)
                # `tgt` (the label) is unused here; only the text feeds the vocab.
                tgt, inp = line[0], line[1]
                my_vocab.add_sentence(inp, to_lower=True, remove_punc=False)
    logger.info("vocab size: {}".format(len(my_vocab)))
    # Cap the vocabulary at the 50k most frequent tokens before caching.
    my_vocab.keep_most_frequent_k(k=50000)
    my_vocab.to_json("my_vocab.json")
else:
    logger.info('Loading vocab...')
    my_vocab = Vocab.from_json(vocab_file)

# Pull 100-dim GloVe vectors for the vocab's tokens into an embedding matrix.
pretrain_embedding = my_vocab.extract_pretrain_embedding(
    "./glove.6B.100d.txt", 100)
# --- Example #7 ---
# Single-vocab evaluation setup: device, config, vocab, validation data, checkpoint.
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
config_file = args.config
with open(config_file, "r", encoding="utf-8") as f:
    config = json.load(f)

logger.info("Use GPU: {}.".format(use_gpu))
logger.info("Configurations:\n{}".format(str(config)))

# The English vocab cache must already exist; fail fast otherwise.
vocab_file_eng = "vocab_eng.json"
if not os.path.exists(vocab_file_eng):
    raise FileNotFoundError(vocab_file_eng)
else:
    logger.info('Loading vocab.')
    vocab_eng = Vocab.from_json(vocab_file_eng)
    logger.info(vocab_eng)

logger.info('Preparing data.')
# Pre-built validation data is likewise required up front.
val_data_path = 'val_data.json'
if not os.path.exists(val_data_path):
    raise FileNotFoundError(val_data_path)
else:
    with open(val_data_path, "r", encoding="utf-8") as f:
        val_data = json.load(f)

# Load model if a args.checkpoint is provided
if args.checkpoint is not None:
    logger.info('Loading checkpoint file [{}].'.format(args.checkpoint))
    # If loading on same machine the model was trained on
    checkpoint = torch.load(args.checkpoint)