###############################################################################
# Load data
###############################################################################

# Torch
word_freq = torch.load(os.path.join(args.data, 'word_freq.pt')).numpy()
mapto = torch.from_numpy(util.reverse(np.argsort(-word_freq))).long()
print("load word frequency mapping - complete")

ntokens = len(word_freq)
nsampled = 16384

train_corpus = FastGBWDataset(args.data, 'train_data.pt', 'train_data.sid', mapto,
                              seq_length=args.bptt, batch_size=args.batch_size)
print("load train data - complete")

test_corpus = GBWDataset(args.data, 'test_data.pt', mapto)
print("load test data - complete")

# Streaming
'''
vocabulary = Vocabulary.from_file(os.path.join(args.data, "1b_word_vocab.txt"))
ntokens = len(vocabulary)
nsampled = 16384

train_corpus = StreamGBWDataset(vocabulary, os.path.join(args.data, "training-monolingual.tokenized.shuffled/*"))
test_corpus = StreamGBWDataset(vocabulary, os.path.join(args.data, "heldout-monolingual.tokenized.shuffled/*"), deterministic=True)
print("load dataset - complete")
'''
###############################################################################
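# The mapto construction above is the inverse-permutation trick:
# np.argsort(-word_freq) lists word ids from most to least frequent, and
# inverting that permutation gives a table from original id to frequency
# rank, so frequent words end up with the smallest ids. A minimal sketch,
# assuming util.reverse inverts a permutation (toy counts; the helper name
# inverse_permutation is illustrative, not from this repo):
'''
import numpy as np

def inverse_permutation(perm):
    # Stand-in for util.reverse, assuming it inverts a permutation.
    inv = np.empty_like(perm)
    inv[perm] = np.arange(perm.size)
    return inv

toy_freq = np.array([5, 100, 20, 1])    # counts for word ids 0..3
order = np.argsort(-toy_freq)           # ids by descending frequency: [1, 2, 0, 3]
toy_mapto = inverse_permutation(order)  # original id -> rank: [2, 0, 1, 3]
assert toy_mapto.tolist() == [2, 0, 1, 3]
'''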
###############################################################################
# Load data
###############################################################################

# Torch
word_freq = load_lua(os.path.join(args.data, 'word_freq.th7')).numpy()
mapto = torch.from_numpy(util.reverse(np.argsort(-word_freq))).long()
print("load word frequency mapping - complete")

ntokens = len(word_freq)
nsampled = 8192

train_corpus = FastGBWDataset(args.data, 'train_data.th7', 'train_data.sid', mapto)
print("load train data - complete")

test_corpus = GBWDataset(args.data, 'test_data.th7', mapto)
print("load test data - complete")

cutoffs = args.cutoffs + [ntokens]

# with doing('Constructing model'):
#     if not args.lm1b:
#         criterion = AdaptiveLoss(cutoffs)
#     else:
#         criterion = SplitCrossEntropyLoss(args.emsize, args.cutoffs, verbose=False)
#     criterion.cuda()
logging.info("Constructing model")
criterion = AdaptiveLoss(cutoffs).cuda()

if args.old is None:
    logging.info("building model")
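# The cutoffs partition the frequency-ranked vocabulary into a small, exact
# head and progressively larger, cheaper tail clusters, which is why mapto
# must place frequent words at low ids. A sketch of the same idea using the
# built-in torch.nn.AdaptiveLogSoftmaxWithLoss (an analogue of this repo's
# AdaptiveLoss, not the class used above; the sizes are illustrative):
'''
import torch
import torch.nn as nn

ntokens, emsize = 100000, 256
adaptive = nn.AdaptiveLogSoftmaxWithLoss(
    emsize, ntokens, cutoffs=[4000, 40000])  # head [0, 4000), two tail clusters

hidden = torch.randn(32, emsize)             # one hidden state per target token
targets = torch.randint(0, ntokens, (32,))   # frequency-ranked word ids
out = adaptive(hidden, targets)              # out.loss is the mean NLL
out.loss.backward()
'''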
###############################################################################
# Load data
###############################################################################

# Torch
word_freq = np.load(os.path.join(args.data, args.freq_file))
mapto = torch.from_numpy(util.reverse(np.argsort(-word_freq))).long()
print("load word frequency mapping - complete")

ntokens = len(word_freq)
nsampled = 8192

train_corpus = FastGBWDataset(args.data, args.train_file, args.sid_file, mapto)
print("load train data - complete")

test_corpus = GBWDataset(args.data, args.validation_file, mapto)
print("load test data - complete")

# Streaming
'''
vocabulary = Vocabulary.from_file(os.path.join(args.data, "1b_word_vocab.txt"))
ntokens = len(vocabulary)
nsampled = 8192

train_corpus = StreamGBWDataset(vocabulary, os.path.join(args.data, "training-monolingual.tokenized.shuffled/*"))
test_corpus = StreamGBWDataset(vocabulary, os.path.join(args.data, "heldout-monolingual.tokenized.shuffled/*"), deterministic=True)
print("load dataset - complete")
'''
###############################################################################
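# nsampled is the number of negative classes drawn per batch when the loss is
# a sampled softmax. Frequency-ranked ids make a Zipfian ("log-uniform")
# proposal distribution cheap to sample by inverse CDF. A sketch under the
# assumption P(k) = (log(k + 2) - log(k + 1)) / log(ntokens + 1) over ranks k
# (the function name is illustrative, not from this repo):
'''
import numpy as np

def log_uniform_sample(ntokens, nsampled, rng=np.random):
    # Inverse-CDF sampling: low ranks (frequent words) are proposed
    # far more often than rare ones.
    u = rng.uniform(size=nsampled)
    return np.floor(np.exp(u * np.log(ntokens + 1.0))).astype(np.int64) - 1

negatives = log_uniform_sample(ntokens=100000, nsampled=8192)
assert negatives.min() >= 0 and negatives.max() < 100000
'''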