Example #1
###############################################################################
# Load data
###############################################################################

# Torch
import os

import numpy as np
import torch

# util, FastGBWDataset, GBWDataset, and (in the disabled block below)
# Vocabulary and StreamGBWDataset are this project's own modules;
# `args` is the script's argparse namespace.
word_freq = torch.load(os.path.join(args.data, 'word_freq.pt')).numpy()
mapto = torch.from_numpy(util.reverse(np.argsort(-word_freq))).long()
print("load word frequency mapping - complete")

ntokens = len(word_freq)
nsampled = 16384

train_corpus = FastGBWDataset(args.data, 'train_data.pt', 'train_data.sid', mapto, seq_length=args.bptt, batch_size=args.batch_size)
print("load train data - complete")

test_corpus = GBWDataset(args.data, 'test_data.pt', mapto)
print("load test data - complete")

# Streaming alternative (disabled): read the raw shuffled text shards directly
'''
vocabulary = Vocabulary.from_file(os.path.join(args.data, "1b_word_vocab.txt"))

ntokens = len(vocabulary)
nsampled = 16384

train_corpus = StreamGBWDataset(vocabulary, os.path.join(args.data, "training-monolingual.tokenized.shuffled/*"))
test_corpus = StreamGBWDataset(vocabulary, os.path.join(args.data, "heldout-monolingual.tokenized.shuffled/*"), deterministic=True)
print("load dataset - complete")
'''

###############################################################################
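
The step all three examples share is the `mapto` tensor: `np.argsort(-word_freq)` lists word ids from most to least frequent, and the project's `util.reverse` helper turns that ordering into a per-id lookup. A minimal standalone sketch of that remapping, assuming `util.reverse` computes the inverse permutation:

import numpy as np
import torch

# Toy frequencies: word id 2 is the most frequent, then id 0, then id 1.
word_freq = np.array([10, 3, 50])

# Ids from most to least frequent: [2, 0, 1]
order = np.argsort(-word_freq)

# Inverse permutation: original id -> frequency rank
# (the role util.reverse is assumed to play above).
rank = np.empty_like(order)
rank[order] = np.arange(len(order))

mapto = torch.from_numpy(rank).long()
print(mapto)  # tensor([1, 2, 0]): id 0 -> rank 1, id 1 -> rank 2, id 2 -> rank 0

Remapping ids to frequency rank is what lets the adaptive and sampled softmax setups below concentrate the cheapest computation on the most frequent words.
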
Example #2
###############################################################################
# Load data
###############################################################################

# Torch
import logging
import os

import numpy as np
import torch
from torch.utils.serialization import load_lua  # legacy loader (PyTorch < 1.0) for Torch7 .th7 files

# util, FastGBWDataset, and GBWDataset are this project's own modules;
# `args` is the script's argparse namespace.
word_freq = load_lua(os.path.join(args.data, 'word_freq.th7')).numpy()
mapto = torch.from_numpy(util.reverse(np.argsort(-word_freq))).long()
print("load word frequency mapping - complete")

ntokens = len(word_freq)
nsampled = 8192

train_corpus = FastGBWDataset(args.data, 'train_data.th7', 'train_data.sid', mapto)
print("load train data - complete")

test_corpus = GBWDataset(args.data, 'test_data.th7', mapto)
print("load test data - complete")

cutoffs = args.cutoffs + [ntokens]


# with doing('Constructing model'):
    # if not args.lm1b:
    #     criterion = AdaptiveLoss(cutoffs)
    # else:
    #     criterion = SplitCrossEntropyLoss(args.emsize, args.cutoffs, verbose=False)
    #     criterion.cuda()
logging.info("Constructing model")
criterion = AdaptiveLoss(cutoffs).cuda()
if args.old is None:
    logging.info("building model")
Example #3
###############################################################################
# Load data
###############################################################################

# Torch
import os

import numpy as np
import torch

# util, FastGBWDataset, GBWDataset, and (in the disabled block below)
# Vocabulary and StreamGBWDataset are this project's own modules;
# `args` is the script's argparse namespace.
word_freq = np.load(os.path.join(args.data, args.freq_file))
mapto = torch.from_numpy(util.reverse(np.argsort(-word_freq))).long()
print("load word frequency mapping - complete")

ntokens = len(word_freq)
nsampled = 8192

train_corpus = FastGBWDataset(args.data, args.train_file, args.sid_file, mapto)
print("load train data - complete")

test_corpus = GBWDataset(args.data, args.validation_file, mapto)
print("load test data - complete")

# Streaming alternative (disabled): read the raw shuffled text shards directly
'''
vocabulary = Vocabulary.from_file(os.path.join(args.data, "1b_word_vocab.txt"))

ntokens = len(vocabulary)
nsampled = 8192

train_corpus = StreamGBWDataset(vocabulary, os.path.join(args.data, "training-monolingual.tokenized.shuffled/*"))
test_corpus = StreamGBWDataset(vocabulary, os.path.join(args.data, "heldout-monolingual.tokenized.shuffled/*"), deterministic=True)
print("load dataset - complete")
'''

###############################################################################
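
Example #3 differs from the first two only in reading the data file names from command-line flags rather than hard-coding them. A sketch of the matching argparse setup (the flag names and defaults are assumptions inferred from the attributes used above):

import argparse

parser = argparse.ArgumentParser(description='GBW language model data options')
parser.add_argument('--data', type=str, default='./data/gbw',
                    help='location of the preprocessed GBW corpus')
parser.add_argument('--freq_file', type=str, default='word_freq.npy')
parser.add_argument('--train_file', type=str, default='train_data.pt')
parser.add_argument('--sid_file', type=str, default='train_data.sid')
parser.add_argument('--validation_file', type=str, default='test_data.pt')
args = parser.parse_args()
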