def get_vocab(vocab_file_eng="vocab_eng.json", vocab_file_fra="vocab_fra.json"):
    """Return the English and French vocabularies, building them if needed.

    If either cache file is missing, both vocabularies are (re)built via
    ``create_vocab``; otherwise they are loaded from their JSON caches.

    Args:
        vocab_file_eng: Path of the English vocab JSON cache (new optional
            parameter; defaults to the previously hard-coded filename).
        vocab_file_fra: Path of the French vocab JSON cache.

    Returns:
        Tuple ``(vocab_eng, vocab_fra)`` of ``Vocab`` instances.
    """
    # Rebuild both when either cache is absent so the two stay in sync.
    if not os.path.exists(vocab_file_eng) or not os.path.exists(vocab_file_fra):
        return create_vocab(vocab_file_eng, vocab_file_fra)

    logger.info('Loading vocab.')
    vocab_eng = Vocab.from_json(vocab_file_eng)
    vocab_fra = Vocab.from_json(vocab_file_fra)
    logger.info(vocab_eng)
    logger.info(vocab_fra)
    return vocab_eng, vocab_fra
def create_vocab(vocab_file_eng):
    """Build an English ``Vocab`` from the formatted Cornell movie-dialogs
    corpus and cache it as JSON.

    Fix over the original: when the cache file already existed, the original
    skipped the build but still executed ``return vocab_eng`` with the name
    never bound (UnboundLocalError / undefined result). The cached vocab is
    now loaded and returned in that case.

    Args:
        vocab_file_eng: Path of the vocab JSON cache.

    Returns:
        The newly built (or cache-loaded) ``Vocab`` instance.
    """
    if os.path.exists(vocab_file_eng):
        # Cache hit: reuse the previously stored vocabulary.
        logger.info('Loading vocab.')
        return Vocab.from_json(vocab_file_eng)

    vocab_eng = Vocab("eng")
    logger.info('Creating vocab.')
    with open(
            "./data/cornell movie-dialogs corpus/formatted_movie_lines.txt",
            "r", encoding="utf-8") as f:
        print("Create Vocab")
        # readlines() materializes the file so tqdm can show a total count.
        for line in tqdm(f.readlines()):
            line = line.split('\t')
            line = normalizePair(line)
            # Skip pairs rejected by the project's filter — presumably
            # length/complexity based; see pair_is_simple.
            if not pair_is_simple(line):
                continue
            vocab_eng.add_sentence(line[0], to_lower=True, remove_punc=False)
            vocab_eng.add_sentence(line[1], to_lower=True, remove_punc=False)
    logger.info(vocab_eng)
    logger.info('Storing vocab.')
    vocab_eng.to_json(vocab_file_eng)
    return vocab_eng
def create_vocab(vocab_file_eng, vocab_file_fra):
    """Build English and French ``Vocab`` objects from the eng-fra parallel
    corpus and cache both as JSON.

    Fix over the original: when both cache files already existed, the
    original skipped the build but still executed
    ``return vocab_eng, vocab_fra`` with neither name bound
    (UnboundLocalError / undefined result). Both cached vocabs are now
    loaded and returned in that case.

    Args:
        vocab_file_eng: Path of the English vocab JSON cache.
        vocab_file_fra: Path of the French vocab JSON cache.

    Returns:
        Tuple ``(vocab_eng, vocab_fra)``.
    """
    if os.path.exists(vocab_file_eng) and os.path.exists(vocab_file_fra):
        # Both caches present: reuse them instead of falling through.
        logger.info('Loading vocab.')
        return Vocab.from_json(vocab_file_eng), Vocab.from_json(vocab_file_fra)

    vocab_eng = Vocab("eng")
    vocab_fra = Vocab("fra")
    logger.info('Creating vocab.')
    with open("./data/eng-fra.txt", "r", encoding="utf-8") as f:
        for line in f:
            # Each line is "<english>\t<french>".
            line = line.split('\t')
            line = normalizePair(line)
            # Skip pairs rejected by the project's filter — presumably
            # length/complexity based; see pair_is_simple.
            if not pair_is_simple(line):
                continue
            vocab_eng.add_sentence(line[0], to_lower=True, remove_punc=True)
            vocab_fra.add_sentence(line[1], to_lower=True, remove_punc=True)
    logger.info(vocab_eng)
    logger.info(vocab_fra)
    logger.info('Storing vocab.')
    vocab_eng.to_json(vocab_file_eng)
    vocab_fra.to_json(vocab_file_fra)
    return vocab_eng, vocab_fra
# Evaluation setup: read the JSON config, require the cached vocabs and
# validation data to exist, then optionally note a checkpoint to load.
config_file = args.config
with open(config_file, "r", encoding="utf-8") as f:
    config = json.load(f)
logger.info("Use GPU: {}.".format(use_gpu))
logger.info("Configurations:\n{}".format(str(config)))

vocab_file_eng = "vocab_eng.json"
vocab_file_fra = "vocab_fra.json"
# Fail fast if either cached vocab is missing (English is checked first,
# matching the original raise order).
for _vocab_path in (vocab_file_eng, vocab_file_fra):
    if not os.path.exists(_vocab_path):
        raise FileNotFoundError(_vocab_path)
logger.info('Loading vocab.')
vocab_eng = Vocab.from_json(vocab_file_eng)
vocab_fra = Vocab.from_json(vocab_file_fra)
logger.info(vocab_eng)
logger.info(vocab_fra)

logger.info('Preparing data.')
val_data_path = 'val_data.json'
if not os.path.exists(val_data_path):
    raise FileNotFoundError(val_data_path)
with open(val_data_path, "r", encoding="utf-8") as f:
    val_data = json.load(f)

# Load model if a args.checkpoint is provided
if args.checkpoint is not None:
    logger.info('Loading checkpoint file [{}].'.format(args.checkpoint))
# Script setup: parse CLI args, pick CPU/GPU, load the JSON config, and
# load the pre-built vocabulary used by prepare_data below.
parser = argparse.ArgumentParser()
parser.add_argument("--config", default="./config.json", type=str)
parser.add_argument("--checkpoint", default=None, type=str)
args = parser.parse_args()
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
config_file = args.config
with open(config_file) as f:
    config = json.load(f)
logger.info("Use GPU: {}.".format(use_gpu))
logger.info("Configurations:\n{}".format(str(config)))
my_vocab = Vocab.from_json("my_vocab.json")


def prepare_data(data_path, vocab):
    """Read a labeled text file and encode each example with *vocab*.

    Each line is expected to be "<integer label> <text>" — split once so
    the text itself may contain spaces. Text is lowercased before index
    lookup.
    """
    data_ids = []
    with open(data_path, "r", encoding="utf-8") as f:
        print("Prepare Data")
        for line in f:
            line = line.split(' ', 1)
            tgt, inp = line[0], line[1]
            tgt = int(tgt)
            inp = inp.lower()
            data_ids.append({
                "input": vocab.indexes_from_sentence(inp, add_eos=False),
                "target":
                # NOTE(review): SOURCE is truncated here, mid dict literal —
                # the "target" value and the remainder of prepare_data are
                # not visible in this chunk; confirm against the full file.
# Setup for the IMDB binary sentiment task: load config, build (or load)
# a 50k-token vocabulary over all three data splits, then extract GloVe
# embeddings for it.
config_file = args.config
with open(config_file) as f:
    config = json.load(f)
logger.info("Use GPU: {}.".format(use_gpu))
logger.info("Configurations:\n{}".format(str(config)))

# All three splits feed the vocabulary so no split yields unseen tokens.
datalist = [
    "./imdb-binary_sentiment_classification-preprocessed_data/imdb.binary_sentiment_classification.train.txt",
    "./imdb-binary_sentiment_classification-preprocessed_data/imdb.binary_sentiment_classification.valid.txt",
    "./imdb-binary_sentiment_classification-preprocessed_data/imdb.binary_sentiment_classification.test.txt"
]
vocab_file = "my_vocab.json"
if not os.path.exists(vocab_file):
    my_vocab = Vocab("my_vocab")
    for data in datalist:
        with open(data, "r", encoding="utf-8") as f:
            for line in f:
                # Lines are "<label> <text>"; only the text feeds the vocab
                # (the label was previously bound to a local but never used).
                inp = line.split(' ', 1)[1]
                my_vocab.add_sentence(inp, to_lower=True, remove_punc=False)
    logger.info("vocab size: {}".format(len(my_vocab)))
    # Keep only the 50k most frequent tokens.
    my_vocab.keep_most_frequent_k(k=50000)
    # Use the variable instead of repeating the "my_vocab.json" literal.
    my_vocab.to_json(vocab_file)
else:
    logger.info('Loading vocab...')
    my_vocab = Vocab.from_json(vocab_file)
pretrain_embedding = my_vocab.extract_pretrain_embedding(
    "./glove.6B.100d.txt", 100)
# Evaluation setup: choose device, read the JSON config, require the cached
# English vocab and validation data, then optionally load a checkpoint.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
config_file = args.config
with open(config_file, "r", encoding="utf-8") as f:
    config = json.load(f)
logger.info("Use GPU: {}.".format(use_gpu))
logger.info("Configurations:\n{}".format(str(config)))

vocab_file_eng = "vocab_eng.json"
if not os.path.exists(vocab_file_eng):
    raise FileNotFoundError(vocab_file_eng)
logger.info('Loading vocab.')
vocab_eng = Vocab.from_json(vocab_file_eng)
logger.info(vocab_eng)

logger.info('Preparing data.')
val_data_path = 'val_data.json'
if not os.path.exists(val_data_path):
    raise FileNotFoundError(val_data_path)
with open(val_data_path, "r", encoding="utf-8") as f:
    val_data = json.load(f)

# Load model if a args.checkpoint is provided
if args.checkpoint is not None:
    logger.info('Loading checkpoint file [{}].'.format(args.checkpoint))
    # If loading on same machine the model was trained on
    checkpoint = torch.load(args.checkpoint)