import io

import torch

import util


def main(args):
    # Load a trained model and pull out its word-embedding matrix.
    model = torch.load(args.model)
    emb_weights = model.E.weight
    print('Loaded Embeddings: ', emb_weights, '\nShape:', emb_weights.shape)

    # Rebuild the same vocabulary the model was trained with so that row i of
    # the embedding matrix lines up with vocab[i].
    tokenizer = util.Tokenizer(tokenize_type='nltk', lowercase=True)
    train_toks = tokenizer.tokenize(open('data/gigaword_subset.val').read())
    train_ngram_counts = tokenizer.count_ngrams(train_toks)
    vocab = [tup[0] for tup, _ in
             train_ngram_counts[0].most_common(args.vocab)]
    print('Tokenized vocabulary')

    # Write one embedding vector per line (vecs.tsv) and the matching word
    # labels (meta.tsv) for embedding visualization.
    out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
    out_m = io.open('meta.tsv', 'w', encoding='utf-8')
    for num, word in enumerate(vocab):
        vecs = emb_weights[num]
        out_m.write(word + '\n')
        out_v.write('\t'.join([str(x.item()) for x in vecs]) + '\n')
    out_v.close()
    out_m.close()
    print('...Created files for embedding visualization')
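

# Hedged sketch (not part of the assignment code): the two TSVs written above
# are meant to be consumed as paired files, one tab-separated vector per row
# in vecs.tsv and one word per row in meta.tsv. The helper below is a quick
# sanity check that the files line up; only the file names come from the
# script above, everything else is illustrative.
def check_projector_files(vecs_path='vecs.tsv', meta_path='meta.tsv'):
    rows = [line.split('\t') for line in
            open(vecs_path, encoding='utf-8').read().splitlines()]
    words = open(meta_path, encoding='utf-8').read().splitlines()
    assert len(rows) == len(words), 'expected one vector per word'
    dims = {len(r) for r in rows}
    assert len(dims) == 1, 'all vectors should share one dimension'
    print('%d vectors of dimension %d' % (len(rows), dims.pop()))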


import random

import torch

import util
# FFLM and batchify are assumed to be provided by the project's model module.


def main(args):
    tokenizer = util.Tokenizer(tokenize_type='nltk', lowercase=True)

    # Tokenize the training data and keep only the requested fraction.
    train_toks = tokenizer.tokenize(open(args.train_file).read())
    num_train_toks = int(args.train_fraction * len(train_toks))
    print('-' * 79)
    print('Using %d tokens for training (%g%% of %d)' %
          (num_train_toks, 100 * args.train_fraction, len(train_toks)))
    train_toks = train_toks[:num_train_toks]

    # Same for the validation data.
    val_toks = tokenizer.tokenize(open(args.val_file).read())
    num_val_toks = int(args.val_fraction * len(val_toks))
    print('Using %d tokens for validation (%g%% of %d)' %
          (num_val_toks, 100 * args.val_fraction, len(val_toks)))
    val_toks = val_toks[:num_val_toks]

    train_ngram_counts = tokenizer.count_ngrams(train_toks)

    # Get vocab and threshold.
    print('Using vocab size %d (excluding UNK) (original %d)' %
          (min(args.vocab, len(train_ngram_counts[0])),
           len(train_ngram_counts[0])))
    vocab = [tup[0] for tup, _ in
             train_ngram_counts[0].most_common(args.vocab)]
    train_toks = tokenizer.threshold(train_toks, vocab, args.unk)
    val_toks = tokenizer.threshold(val_toks, vocab, args.unk)

    # Feedforward language model over the thresholded vocab.
    lm = FFLM(args.model, vocab, args.unk, args.init, args.lr,
              args.check_interval, args.seed, args.nhis, args.wdim,
              args.hdim, args.nlayers, args.B)
    print(lm)

    if not args.test:
        # Estimate parameters.
        lm.train_epochs(train_toks, val_toks, args.epochs)

    # Reload the saved model and report validation perplexity.
    lm = torch.load(args.model)
    val_ppl = lm.test(batchify([lm.token_to_idx[x] for x in val_toks],
                               args.B))
    print('Optimized Perplexity: %f' % (val_ppl))

    # Print nearest neighbors (excluding the word itself) for K random words.
    nns = lm.nearest_neighbors(args.K)
    for x in random.choices(list(nns.keys()), k=args.K):
        print('%s:' % x, end=' ')
        for z, _ in nns[x][1:]:
            print('%s' % z, end=' ')
        print()
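

import torch


# Hedged sketch of what a `batchify` helper along these lines typically does
# (the project's real helper may differ): trim the flat list of token ids so
# its length divides evenly by the batch size B, then reshape it into B
# parallel streams, which is the layout lm.test() above consumes.
def batchify_sketch(ids, B):
    n_kept = (len(ids) // B) * B                        # drop the ragged tail
    data = torch.tensor(ids[:n_kept], dtype=torch.long)
    return data.view(B, -1)                             # one row per stream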


import util


def main(args):
    tokenizer = util.Tokenizer(tokenize_type=args.tok, lowercase=True)

    # TODO: you have to pass this test.
    util.test_ngram_counts(tokenizer)

    # Tokenize the training data and keep only the requested fraction.
    train_toks = tokenizer.tokenize(open(args.train_file).read())
    num_train_toks = int(args.train_fraction * len(train_toks))
    print('-' * 79)
    print('Using %d tokens for training (%g%% of %d)' %
          (num_train_toks, 100 * args.train_fraction, len(train_toks)))
    train_toks = train_toks[:num_train_toks]

    val_toks = tokenizer.tokenize(open(args.val_file).read())
    train_ngram_counts = tokenizer.count_ngrams(train_toks)

    # Explore n-grams in the training corpus before preprocessing.
    util.show_ngram_information(train_ngram_counts, args.k, args.figure_file,
                                args.quiet)

    # Get vocab and threshold.
    print('Using vocab size %d (excluding UNK) (original %d)' %
          (min(args.vocab, len(train_ngram_counts[0])),
           len(train_ngram_counts[0])))
    vocab = [tup[0] for tup, _ in
             train_ngram_counts[0].most_common(args.vocab)]
    train_toks = tokenizer.threshold(train_toks, vocab, args.unk)
    val_toks = tokenizer.threshold(val_toks, vocab, args.unk)

    # The language model assumes a thresholded vocab.
    lm = util.BigramLanguageModel(vocab, args.unk, args.smoothing,
                                  alpha=args.alpha, beta=args.beta)

    # Estimate parameters and report perplexity on both splits.
    lm.train(train_toks)
    train_ppl = lm.test(train_toks)
    val_ppl = lm.test(val_toks)
    print('Train perplexity: %f\nVal Perplexity: %f' % (train_ppl, val_ppl))
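

# Hedged sketch (illustrative only, not util.BigramLanguageModel itself) of
# the kind of estimate an add-alpha smoothed bigram model computes:
#
#     p(w | w_prev) = (count(w_prev, w) + alpha) / (count(w_prev) + alpha * V)
#
# where V is the vocabulary size including UNK. Perplexity is then the
# exponential of the average negative log probability over evaluation tokens.
def add_alpha_bigram_prob(bigram_counts, unigram_counts, w_prev, w, alpha, V):
    return ((bigram_counts.get((w_prev, w), 0) + alpha) /
            (unigram_counts.get(w_prev, 0) + alpha * V))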


import util


def main(args):
    tokenizer = util.Tokenizer(tokenize_type=args.tok, lowercase=True)

    # Tokenize the training data and keep only the requested fraction.
    train_toks = tokenizer.tokenize(open(args.train_file).read())
    num_train_toks = int(args.train_fraction * len(train_toks))
    print('-' * 79)
    print('Using %d tokens for training (%g%% of %d)' %
          (num_train_toks, 100 * args.train_fraction, len(train_toks)))
    train_toks = train_toks[:num_train_toks]

    # Same for the validation data.
    val_toks = tokenizer.tokenize(open(args.val_file).read())
    num_val_toks = int(args.val_fraction * len(val_toks))
    print('Using %d tokens for validation (%g%% of %d)' %
          (num_val_toks, 100 * args.val_fraction, len(val_toks)))
    val_toks = val_toks[:num_val_toks]

    train_ngram_counts = tokenizer.count_ngrams(train_toks)

    # Get vocab and threshold.
    print('Using vocab size %d (excluding UNK) (original %d)' %
          (min(args.vocab, len(train_ngram_counts[0])),
           len(train_ngram_counts[0])))
    vocab = [tup[0] for tup, _ in
             train_ngram_counts[0].most_common(args.vocab)]
    train_toks = tokenizer.threshold(train_toks, vocab, args.unk)
    val_toks = tokenizer.threshold(val_toks, vocab, args.unk)

    # Pick the feature definition.
    if args.features == 'basic1':
        feature_extractor = util.basic_features1
    elif args.features == 'basic1suffix3':
        feature_extractor = util.basic_features1_suffix3  # TODO: Implement
    elif args.features == 'basic2':
        feature_extractor = util.basic_features2
    else:
        raise ValueError('Unknown feature extractor type.')

    # for feature_extractor in [util.basic_features1, util.basic_features2,
    #                           util.basic_features1_suffix3,
    #                           util.basic_features2_prefix3,
    #                           util.basic_features2_suffix3]:

    # We'll cheat and cache features for validation data to make things faster
    # for this assignment. The correct thing to do here would be
    #
    #   f2i, fcache, num_feats_cached, x2ys \
    #       = util.extract_features(train_toks, feature_extractor)
    #
    f2i, fcache, num_feats_cached, x2ys \
        = util.extract_features(train_toks + val_toks, feature_extractor)
    print('%d feature types extracted' % len(f2i))
    print('%d feature values cached for %d window types' %
          (num_feats_cached, len(fcache)))

    # Train with several random seeds and log validation perplexity for each.
    for seed in [82, 95, 11, 29, 49, 8, 42, 36, 71, 65]:
        # The language model assumes a truncated vocab and a feature
        # definition.
        lm = util.LogLinearLanguageModel(args.model, vocab, args.unk,
                                         feature_extractor, f2i, fcache, x2ys,
                                         init=args.init, lr=args.lr,
                                         check_interval=args.check_interval,
                                         seed=seed)
        # lm = util.LogLinearLanguageModel(args.model, vocab, args.unk,
        #                                  feature_extractor, f2i, fcache,
        #                                  x2ys, init=args.init, lr=args.lr,
        #                                  check_interval=args.check_interval,
        #                                  seed=args.seed)

        if args.test:
            # Load trained parameters.
            lm.load()
        else:
            # Estimate parameters.
            lm.train(train_toks, val_toks, args.epochs)

        val_ppl = lm.test(val_toks)
        print('Optimized Perplexity: %f' % (val_ppl))

        # Append (seed, validation perplexity) to the results file.
        sample = open('results/1.8.txt', 'a')
        print('%d,%f' % (seed, val_ppl), file=sample)
        sample.close()

        print('-' * 79)
        for (i, f, w) in lm.topK_feats(args.K):
            print('{:10d}: {:40s} ({:8.4f})'.format(i, f, w))
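

# Hedged sketch of a suffix-3 feature extractor in the spirit of the TODO
# above. It assumes an extractor maps a (previous word, candidate word) pair
# to a list of feature-name strings; the actual signature expected by
# util.extract_features may differ, so treat this purely as an illustration.
def basic_features1_suffix3_sketch(w_prev, w):
    feats = ['prev=%s^cur=%s' % (w_prev, w)]                # bigram feature
    feats.append('prev_suf3=%s^cur=%s' % (w_prev[-3:], w))  # last 3 chars
    return feats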


import util


def main(args):
    tokenizer = util.Tokenizer(tokenize_type=args.tok, lowercase=True)

    # TODO: you have to pass this test.
    util.test_ngram_counts(tokenizer)

    train_toks = tokenizer.tokenize(open(args.train_file).read())

    # Best-setting trackers, used when the hyperparameter sweeps below are
    # uncommented.
    minValidation = 10000
    minA = 10000
    minB = 10000
    minTF = 10000

    ### Problem 6 ###
    # for frac in range(1, 11):
    #     args.train_fraction = float(frac / 10)

    num_train_toks = int(args.train_fraction * len(train_toks))
    print('-' * 79)
    print('Using %d tokens for training (%g%% of %d)' %
          (num_train_toks, 100 * args.train_fraction, len(train_toks)))
    train_toks = train_toks[:num_train_toks]

    val_toks = tokenizer.tokenize(open(args.val_file).read())
    train_ngram_counts = tokenizer.count_ngrams(train_toks)

    # Explore n-grams in the training corpus before preprocessing.
    util.show_ngram_information(train_ngram_counts, args.k, args.figure_file,
                                args.quiet)

    # Get vocab and threshold.
    print('Using vocab size %d (excluding UNK) (original %d)' %
          (min(args.vocab, len(train_ngram_counts[0])),
           len(train_ngram_counts[0])))
    vocab = [tup[0] for tup, _ in
             train_ngram_counts[0].most_common(args.vocab)]
    train_toks = tokenizer.threshold(train_toks, vocab, args.unk)
    val_toks = tokenizer.threshold(val_toks, vocab, args.unk)

    ### Problem 5 ###
    # for alpha in range(-5, 3):
    #     args.alpha = 10 ** alpha

    ### Problem 7 ###
    # for beta in range(1, 10):
    #     args.beta = 0.1 * beta

    # The language model assumes a thresholded vocab.
    lm = util.BigramLanguageModel(vocab, args.unk, args.smoothing,
                                  alpha=args.alpha, beta=args.beta)

    # Estimate parameters.
    lm.train(train_toks)
    train_ppl = lm.test(train_toks)
    val_ppl = lm.test(val_toks)

    # if val_ppl < minValidation:
    #     minValidation = val_ppl
    #     minA = args.alpha
    #     minB = args.beta
    #     minTF = args.train_fraction

    print('Train perplexity: %f\nVal Perplexity: %f' % (train_ppl, val_ppl))

    # print('\tAlpha: ' + str(minA))
    # print('\tBeta: ' + str(minB))
    # print('\tTrain Fraction: ' + str(minTF))

    # Append one (beta, train perplexity, val perplexity) row per run.
    f = open('TrainFraction2.csv', 'a')
    f.write(str(args.beta) + ',' + str(train_ppl) + ',' + str(val_ppl) + '\n')
    f.close()
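

# Hedged sketch of the Problem 5 sweep that the comments above gate: evaluate
# the bigram model once per alpha on a log scale and keep the setting with the
# lowest validation perplexity. `build_and_eval` stands in for the train/test
# steps inside main() and is hypothetical.
def sweep_alpha_sketch(build_and_eval,
                       alphas=tuple(10 ** e for e in range(-5, 3))):
    best_alpha, best_val_ppl = None, float('inf')
    for alpha in alphas:
        val_ppl = build_and_eval(alpha)   # returns validation perplexity
        if val_ppl < best_val_ppl:
            best_alpha, best_val_ppl = alpha, val_ppl
    return best_alpha, best_val_ppl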