def main():
    print('Invoked as:', ' '.join(sys.argv), file=sys.stderr)

    parser = argparse.ArgumentParser()
    parser.add_argument('model')
    parser.add_argument('train_corpus')
    parser.add_argument('test_corpus')
    parser.add_argument('--layers', type=int, default=1)
    parser.add_argument('--hidden_dim', type=int, default=128)
    parser.add_argument('--minibatch_size', type=int, default=1)
    parser.add_argument('--autobatch', action='store_true')
    parser.add_argument('--tied', action='store_true')
    parser.add_argument('--residual', action='store_true')
    parser.add_argument('--sent_level', action='store_true')
    args = parser.parse_args()

    # Build the vocabulary from the training corpus, then freeze it so the
    # test corpus cannot add new entries.
    vocab = Vocabulary()
    train_corpus = read_corpus(args.train_corpus, vocab)
    vocab.frozen = True
    test_corpus = read_corpus(args.test_corpus, vocab)
    print('Vocabulary size:', len(vocab), file=sys.stderr)

    pc = dy.ParameterCollection()
    model = TopDownDepLM(pc, vocab, args.layers, args.hidden_dim,
                         args.hidden_dim, args.tied, args.residual)
    pc.populate_from_textfile(args.model)
    print('Total parameters:', pc.parameter_count(), file=sys.stderr)
    run_test_set(model, test_corpus, args)

def load_dataset(path):
    charset = Charset()
    vocab = Vocabulary()
    vocab.load(f"{path}/vocab.txt")

    measure_type = get_measure_type(path)
    tag_set = Index()
    if measure_type == "relations":
        tag_set.load(f"{path}/tag2id.txt")
    elif measure_type == "entities":
        tag_set.load(f"{path}/entity_labels.txt")
    helper = Helper(vocab, tag_set, charset, measure_type=measure_type)
    # relation_labels = Index()
    # relation_labels.load(f"{path}/relation_labels.txt")

    train_data = load(f"{path}/train.pk")[:1000]
    test_data = load(f"{path}/test.pk")
    word_embeddings = np.load(f"{path}/word2vec.vectors.npy")
    return helper, word_embeddings, train_data, test_data, tag_set

def main(_):
    hps = CLSTMDNN.get_default_hparams().parse(FLAGS.hpconfig)
    hps.num_gpus = FLAGS.num_gpus
    word_vocab = Vocabulary.from_file(os.path.join(FLAGS.vocabdir, "1b_word_vocab.txt"))
    char_vocab = Vocabulary.from_file(os.path.join(FLAGS.vocabdir, "1b_char_vocab.txt"))
    if FLAGS.mode == "train":
        hps.batch_size = 256
        data_dir = FLAGS.datadir + "/training-monolingual.tokenized.shuffled/*"
        eval_dataset = DatasetCharWord(word_vocab, char_vocab, data_dir,
                                       max_word_length=hps.word_length,
                                       deterministic=True)
        dataset = DatasetCharWord(word_vocab, char_vocab,
                                  FLAGS.datadir + "/training-monolingual.tokenized.shuffled/*",
                                  max_word_length=hps.word_length)
        run_train(dataset, eval_dataset, hps, FLAGS.logdir + "/train", ps_device="/gpu:0")
    elif FLAGS.mode.startswith("eval_"):
        hps.batch_size = 32
        if FLAGS.mode.startswith("eval_train"):
            data_dir = FLAGS.datadir + "/training-monolingual.tokenized.shuffled/*"
        else:
            data_dir = FLAGS.datadir + "/heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050"
        dataset = DatasetCharWord(word_vocab, char_vocab, data_dir, deterministic=True)
        run_eval(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps)

def main():
    print('Invoked as:', ' '.join(sys.argv), file=sys.stderr)

    parser = argparse.ArgumentParser()
    parser.add_argument('train_corpus')
    parser.add_argument('dev_corpus')
    parser.add_argument('--layers', type=int, default=1)
    parser.add_argument('--hidden_dim', type=int, default=128)
    parser.add_argument('--minibatch_size', type=int, default=1)
    parser.add_argument('--autobatch', action='store_true')
    parser.add_argument('--tied', action='store_true')
    parser.add_argument('--residual', action='store_true')
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--output', type=str, default='')
    harness.add_optimizer_args(parser)
    args = parser.parse_args()

    if args.output == '':
        args.output = '/tmp/model%d' % random.randint(0, 0xFFFF)
    print('Output file:', args.output, file=sys.stderr)

    vocab = Vocabulary()
    train_corpus = read_corpus(args.train_corpus, vocab)
    vocab.frozen = True
    dev_corpus = read_corpus(args.dev_corpus, vocab)
    print('Vocabulary size:', len(vocab), file=sys.stderr)

    pc = dy.ParameterCollection()
    optimizer = harness.make_optimizer(args, pc)
    model = TopDownDepLM(pc, vocab, args.layers, args.hidden_dim,
                         args.hidden_dim, args.tied, args.residual)
    print('Total parameters:', pc.parameter_count(), file=sys.stderr)
    harness.train(model, train_corpus, dev_corpus, optimizer, args)

def __init__(self, scene_images_dir, object_images_dir, dataset, vocab):
    self.last_index = 0
    self.vocab = Vocabulary()
    self.vocab.load(vocab)
    self.lemmatizer = WordNetLemmatizer()
    self.dataset = File(dataset, 'r')
    self.questions = self.dataset['questions']
    self.answers = self.dataset['answers']
    self.image_indices = self.dataset['image_indices']
    self.images = self.dataset['images']
    # self.dataset_size = 100000
    self.dataset_size = self.questions.shape[0]
    self.object_images = dict()
    self.object_classes = []
    self.scene_images = dict()
    self.scene_classes = []
    for object_class in listdir(object_images_dir):
        self.object_classes.append(self.lemmatizer.lemmatize(object_class))
    for scene_class in listdir(scene_images_dir):
        self.scene_classes.append(self.lemmatizer.lemmatize(scene_class))
    for object_class in listdir(object_images_dir):
        object_class_dir = join(object_images_dir, object_class)
        self.object_images[object_class] = [
            join(object_class_dir, f) for f in listdir(object_class_dir)
        ]
    for scene_class in listdir(scene_images_dir):
        scene_class_dir = join(scene_images_dir, scene_class)
        self.scene_images[scene_class] = [
            join(scene_class_dir, f) for f in listdir(scene_class_dir)
        ]

def init_word_embedding(embedding_paths):
    print('Init word embedding from: ', ', '.join(embedding_paths))
    lines = []
    for embedding_path in embedding_paths:
        embedding_path = os.path.join(DIR_PATH, embedding_path)
        with open(embedding_path, encoding='utf-8') as file:
            lines += file.read().strip().split('\n')
    tokens_of_lines = [l.strip().split(' ') for l in lines]
    words = [l[0] for l in tokens_of_lines]
    weight = [[float(str_emb) for str_emb in l[1:]] for l in tokens_of_lines]
    voc = Vocabulary(words)
    print('Vocabulary size:', voc.size())
    # also init the embedding for special tokens
    while len(weight) < voc.size():
        embedding_len = len(weight[0])
        weight.append([0] * embedding_len)
    weight = torch.FloatTensor(weight)
    return voc, weight

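# Illustrative note (not from the original source): init_word_embedding above
# expects word2vec/GloVe-style text files, one token per line followed by its
# float components, e.g. "the 0.1 0.2 0.3". A hypothetical call, assuming
# DIR_PATH resolves the relative path as in the function:
#
#     voc, weight = init_word_embedding(['embeddings/toy_vectors.txt'])
#     # weight gains one zero row per extra special token in `voc`
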
def main():
    args = parse_arguments()
    n_vocab = params.n_vocab
    n_layer = params.n_layer
    n_hidden = params.n_hidden
    n_embed = params.n_embed
    n_batch = args.n_batch
    temperature = params.temperature
    train_path = params.train_path
    assert torch.cuda.is_available()

    print("loading_data...")
    # At training time, reuse the preprocessed vocabulary if one exists;
    # otherwise build it from the training corpus and cache it to disk.
    if os.path.exists("vocab.json"):
        vocab = Vocabulary()
        with open('vocab.json', 'r') as fp:
            vocab.stoi = json.load(fp)
        for key, value in vocab.stoi.items():
            vocab.itos.append(key)
    else:
        vocab = build_vocab(train_path, n_vocab)
        # save vocab
        with open('vocab.json', 'w') as fp:
            json.dump(vocab.stoi, fp)

    train_X, train_y, train_K = load_data(train_path, vocab)
    train_loader = get_data_loader(train_X, train_y, train_K, n_batch)
    print("successfully loaded")

    encoder = Encoder(n_vocab, n_embed, n_hidden, n_layer, vocab).cuda()
    Kencoder = KnowledgeEncoder(n_vocab, n_embed, n_hidden, n_layer, vocab).cuda()
    manager = Manager(n_hidden, n_vocab, temperature).cuda()
    decoder = Decoder(n_vocab, n_embed, n_hidden, n_layer, vocab).cuda()

    if args.restore:
        encoder = init_model(encoder, restore=params.encoder_restore)
        Kencoder = init_model(Kencoder, restore=params.Kencoder_restore)
        manager = init_model(manager, restore=params.manager_restore)
        decoder = init_model(decoder, restore=params.decoder_restore)

    # TODO: all embeddings are currently independent; parameters could be
    # shared by direct assignment, as in the Transformer reference code:
    # if emb_src_trg_weight_sharing:
    #     self.encoder.src_word_emb.weight = self.decoder.trg_word_emb.weight

    model = [encoder, Kencoder, manager, decoder]
    parameters = list(encoder.parameters()) + list(Kencoder.parameters()) + \
        list(manager.parameters()) + list(decoder.parameters())
    optimizer = optim.Adam(parameters, lr=args.lr)

    # pre-train the knowledge manager
    print("start pre-training")
    pre_train(model, optimizer, train_loader, args)

    print("start training")
    train(model, optimizer, train_loader, args)

    # save final model
    save_models(model, params.all_restore)

def create_vocab(words):
    vocab = Vocabulary()
    for w in words:
        vocab.add_word(w)
    return vocab

def __init__(self): self.vocab = Vocabulary() self.ans = {} for line in open("../data/train_answer.csv"): line = line.strip().split(',') self.ans[line[0]] = int(line[1]) print("*** Finish building vocabulary")
def __init__(self): self.vocab = Vocabulary() self.ans = {} for line in open( "/home/share/liyongqi/ChID/raw_data/train_answer.csv"): line = line.strip().split(',') self.ans[line[0]] = int(line[1]) print("*** Finish building vocabulary")
def load_vocab(vocab_path):
    """Load a Vocabulary object from a pickle file.

    Args:
        vocab_path: The location of the vocab pickle file.

    Returns:
        A Vocabulary object.
    """
    vocab = Vocabulary()
    vocab.load(vocab_path)
    return vocab

def __init__(self):
    if not os.path.exists('cache'):
        os.makedirs('cache')
    if os.path.exists("cache/vocab.pkl"):
        self.vocab = pickle.load(open("cache/vocab.pkl", "rb"))
    else:
        self.vocab = Vocabulary()
        pickle.dump(self.vocab, open("cache/vocab.pkl", "wb"), protocol=2)
    print("*** Finish building vocabulary")

def build_vocab(word_lst, size=10000):
    vocab = Vocabulary()
    vocab.add_word("<pad>")
    vocab.add_word("<start>")
    vocab.add_word("<end>")
    vocab.add_word("<unk>")
    # Four slots are reserved for the special tokens above, so keep at most
    # size - 4 corpus words.
    for word in word_lst[:size - 4]:
        vocab.add_word(word)
    return vocab

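# Minimal usage sketch for the list-based build_vocab just above (illustrative
# only): the word list is made up, and the id layout assumes the Vocabulary
# class assigns ids in insertion order, which is how these helpers use add_word.
def _build_vocab_example():
    vocab = build_vocab(['the', 'cat', 'sat', 'on', 'the', 'mat'], size=100)
    # <pad>, <start>, <end>, <unk> are inserted first, so corpus words follow
    # them; at most `size` entries end up in the vocabulary.
    return vocab
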
def __init__(self): self.vocab = Vocabulary() self.ans = {} for line in open("../data/train_data.txt"): #line = line.strip().split(',') #self.ans[line[0]] = int(line[1]) line_json = json.loads(line) self.ans[line_json['content']] = int(line_json['realCount']) print("*** Finish building vocabulary")
def build_vocab(f_corpus: str, f_vocab: str, min_frequency: int, max_len: int):
    """
    Build a count-based vocabulary

    Args:
        f_corpus (str): Corpus file used to extract the vocabulary
        f_vocab (str): Text file to store the extracted vocabulary
        min_frequency (int): Minimum word frequency
        max_len (int): Maximum sentence length used for zero padding

    Returns:
        (list, Vocabulary): the corpus lines and a vocabulary class
        instantiated from the 'f_vocab' file
    """
    vocab = defaultdict(int)
    with open(f_corpus, 'r', encoding='utf-8') as corpus:
        lines = corpus.readlines()
    for line in lines:
        line = re.sub('[^A-Za-z0-9 ]+', '', line)
        for word in line.lower().split(' '):
            vocab[word] += 1
    vocab = sorted(vocab.items(), key=(lambda x: x[1]), reverse=True)[1:]
    with open(f_vocab, 'w', encoding='utf-8') as f:
        for word in vocab:
            if word[1] >= min_frequency:
                print(f'{word[0]}\t{word[1]}', file=f)
    return lines, Vocabulary(f_vocab, max_len)

def get_vocab_imdb(data):
    '''
    Return a text.vocab.Vocabulary containing only words that appear at least 5 times.
    '''
    tokenized = tokenize_imdb(data)
    counter = collections.Counter([tk for st in tokenized for tk in st])
    return Vocabulary(counter, min_freq=5)

def main(unused_argv):
    # extract the vocabulary from the training sentences
    vocabulary = Vocabulary()
    vocabulary.load_file(FLAGS.train_file_path)

    # load training data
    train_loader = DataLoader(FLAGS.train_file_path, FLAGS.data_location,
                              vocabulary, do_shuffle=True)
    batches_train = train_loader.batch_iterator(FLAGS.num_epochs, FLAGS.batch_size)

    # loop over training batches
    for data_train in batches_train:
        pass

def update_model(model):
    d = model.dimensionality
    if not isinstance(model.vocabulary, Vocabulary):
        print("Updating vocabulary and We")
        model.vocabulary = Vocabulary(model.vocabulary)
        # Append an embedding row for the unknown word to We
        unkn_word_embedding = np.random.uniform(-1, 1, size=(1, d))
        model.We = np.append(model.We, unkn_word_embedding, axis=0)
    if not isinstance(model.dependency_dict, Vocabulary):
        print("Updating dependency_dict and Wr")
        model.dependency_dict = Vocabulary(model.dependency_dict)
        unkn_relation_embedding = np.random.uniform(-1, 1, size=(1, d, d))
        model.Wr = np.append(model.Wr, unkn_relation_embedding, axis=0)
    return model

def create_vocab(qas, threshold=4):
    counter = Counter()
    for qa in qas:
        # Tokenize the raw question/answer strings (no byte encoding needed).
        qtokens = nltk.tokenize.word_tokenize(qa['question'].lower())
        atokens = nltk.tokenize.word_tokenize(qa['answer'].lower())
        counter.update(qtokens)
        counter.update(atokens)

    # If a word's frequency is less than 'threshold', the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Add the words to the vocabulary.
    vocab = Vocabulary()
    for word in words:
        vocab.add_word(word)
    return vocab

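# Illustrative input for the QA-based create_vocab above: each entry needs
# 'question' and 'answer' string fields. The example pairs and threshold are
# made up; a threshold of 1 keeps every token.
def _qa_vocab_example():
    qas = [
        {'question': 'What color is the cat?', 'answer': 'black'},
        {'question': 'Where is the black cat sitting?', 'answer': 'on the mat'},
    ]
    return create_vocab(qas, threshold=1)
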
def main():
    args = parse_arguments()
    n_vocab = params.n_vocab
    n_layer = params.n_layer
    n_hidden = params.n_hidden
    n_embed = params.n_embed
    n_batch = args.n_batch
    temperature = params.temperature
    test_path = params.test_path
    vocab_path = params.vocab_path
    assert torch.cuda.is_available()

    print("loading the vocab...")
    vocab = Vocabulary()
    with open(vocab_path, 'r', encoding='utf-8') as fp:
        vocab.stoi = json.load(fp)
    for key, value in vocab.stoi.items():
        vocab.itos.append(key)

    # load the data and convert it to ids
    print("loading_data...")
    test_X, test_y, test_K = load_data(test_path, vocab)
    test_loader = get_data_loader(test_X, test_y, test_K, n_batch, False)
    print("successfully loaded test data")

    encoder = Encoder(n_vocab, n_embed, n_hidden, n_layer).cuda()
    Kencoder = KnowledgeEncoder(n_vocab, n_embed, n_hidden, n_layer).cuda()
    manager = Manager(n_hidden, n_vocab, temperature).cuda()
    decoder = Decoder(n_vocab, n_embed, n_hidden, n_layer).cuda()

    encoder = init_model(encoder, restore=params.encoder_restore)
    Kencoder = init_model(Kencoder, restore=params.Kencoder_restore)
    manager = init_model(manager, restore=params.manager_restore)
    decoder = init_model(decoder, restore=params.decoder_restore)
    print("models successfully loaded!\n")

    model = [encoder, Kencoder, manager, decoder]
    # evaluate_loss(model, 0, test_loader)
    evaluate_sample(model, vocab, test_X, test_y, test_K, test_loader)

def _main(): parser = argparse.ArgumentParser(description="Start a ALPR demo server.") parser.add_argument("--dims", help="set the sample dimentions (default: 208)", type=int, default=208) parser.add_argument("--threshold", help="set the positive threshold (default: 0.9)", type=float, default=0.9) parser.add_argument("--plt_w", help="set the max width of output plate images (default: 144)", type=int, default=144) parser.add_argument("--plt_h", help="set the max height of output plate images (default: 48)", type=int, default=48) parser.add_argument("--seq_len", help="set the max length of output sequences (default: 8)", type=int, default=8) parser.add_argument("--beam_size", help="set the size of beam (default: 5)", type=int, default=5) parser.add_argument("--addr", help="set address of ALPR server (default: 0.0.0.0)", type=str, default="0.0.0.0") parser.add_argument("--port", help="set port of ALPR server (default: 80)", type=int, default=80) parser.add_argument("--device_id", help="select device that the model using (default: 0)", type=int, default=0) parser.add_argument("--gpu", help="using gpu acceleration", action="store_true") args = parser.parse_args() if args.gpu: context = mx.gpu(args.device_id) else: context = mx.cpu(args.device_id) print("This is ALPR demo server", flush=True) wpod = WpodNet() wpod.load_parameters("model/wpod_net.params", ctx=context) vocab = Vocabulary() vocab.load("model/vocabulary.json") ocr = OcrNet((args.plt_h, args.plt_w), vocab.size(), args.seq_len) ocr.load_parameters("model/ocr_net.params", ctx=context) yolo = model_zoo.get_model('yolo3_darknet53_voc', pretrained=True, ctx=context) handler = config_handler( context = context, dims = args.dims, threshold = args.threshold, plt_hw = (args.plt_h, args.plt_w), seq_len = args.seq_len, beam_size = args.beam_size, wpod = wpod, vocab = vocab, ocr = ocr, yolo = yolo ) httpd = http.server.HTTPServer((args.addr, args.port), handler) httpd.serve_forever()
def main():
    args = parse_arguments()
    n_vocab = params.n_vocab
    n_layer = params.n_layer
    n_hidden = params.n_hidden
    n_embed = params.n_embed
    n_batch = args.n_batch
    temperature = params.temperature
    test_path = params.test_path
    assert torch.cuda.is_available()

    print("loading_data...")
    if os.path.exists("vocab.json"):
        vocab = Vocabulary()
        with open('vocab.json', 'r') as fp:
            vocab.stoi = json.load(fp)
        for key, value in vocab.stoi.items():
            vocab.itos.append(key)
    else:
        train_path = params.train_path
        vocab = build_vocab(train_path, n_vocab)

    test_X, test_y, test_K = load_data(test_path, vocab)
    test_loader = get_data_loader(test_X, test_y, test_K, n_batch)
    print("successfully loaded")

    encoder = Encoder(n_vocab, n_embed, n_hidden, n_layer).cuda()
    Kencoder = KnowledgeEncoder(n_vocab, n_embed, n_hidden, n_layer).cuda()
    manager = Manager(n_hidden, n_vocab, temperature).cuda()
    decoder = Decoder(n_vocab, n_embed, n_hidden, n_layer).cuda()

    encoder = init_model(encoder, restore=params.encoder_restore)
    Kencoder = init_model(Kencoder, restore=params.Kencoder_restore)
    manager = init_model(manager, restore=params.manager_restore)
    decoder = init_model(decoder, restore=params.decoder_restore)

    model = [encoder, Kencoder, manager, decoder]
    print("start evaluating")
    evaluate(model, test_loader)

class DataLoaderTest(unittest.TestCase):

    def setUp(self):
        self.voc = Vocabulary()
        self.voc.load_file("./test_sentences.txt")
        self.loader = DataLoader("./test_sentences.txt", self.voc, do_shuffle=False)

    def test_loadFileIntoMemory(self):
        # loading data is done in the constructor
        assert self.loader.data_num is not None
        assert self.loader.data_num.shape == (6, 30)

    def test_iterate_over_all_epochs_and_batches(self):
        batches = self.loader.batch_iterator(3, 3)
        count = 0
        for i in batches:
            count += 1
            assert i.shape == (3, 30)
        assert count == 6

    def test_each_sentence_has_bos_eos(self):
        assert np.sum(np.equal(self.loader.data, Vocabulary.END_SEQ)) == self.loader.data.shape[0]
        assert np.sum(np.equal(self.loader.data, Vocabulary.INIT_SEQ)) == self.loader.data.shape[0]

    def test_load_partial_sentence_no_eos(self):
        self.loader = DataLoader("./test_sentences.txt", self.voc, do_shuffle=False, is_partial=True)
        assert np.sum(np.equal(self.loader.data, Vocabulary.END_SEQ)) == 0
        assert np.sum(np.equal(self.loader.data, Vocabulary.INIT_SEQ)) == self.loader.data.shape[0]


if __name__ == '__main__':
    unittest.main()

def create_mpqa():
    mpqa = read_file('raw_datasets/mpqa.all')

    # build matrices
    X, y = [], []
    for line in mpqa:
        words = line.split(' ')
        label = [0, 0]
        label[int(line[0])] = 1
        sent = clean_str(line[1:])
        X.append(sent)
        y.append(label)

    # build vocab
    mpqa_vocab = Vocabulary(X)
    print('vocab', len(mpqa_vocab.vocab))

    # encode sents
    max_len = compute_avg_len(X)
    for i in range(len(X)):
        X[i] = encode_sent(X[i].split(' '), mpqa_vocab.encoding, max_len)

    # build embeddings
    embeddings = []
    for name, (emb_vocab, emb_vectors) in embeddings_map.items():
        embedding, found = create_embeddings(
            mpqa_vocab, emb_vocab, emb_vectors, 300
        )
        embeddings.append(embedding)
        print('{} - {}'.format(name, found))
    w2v_embeddings, glove_embeddings, nb_embeddings = embeddings

    # shuffle
    X, y = np.array(X), np.array(y)
    indices = np.random.permutation(len(X))
    X, y = X[indices], y[indices]

    split_idx = int(len(X) * 0.9)
    X_train, X_valid = X[:split_idx], X[split_idx:]
    y_train, y_valid = y[:split_idx], y[split_idx:]
    print('train', X_train.shape, y_train.shape)
    print('valid', X_valid.shape, y_valid.shape)

    # save objects
    save_object('datasets/mpqa_train', (X_train, y_train))
    save_object('datasets/mpqa_valid', (X_valid, y_valid))
    save_object('datasets/mpqa_vocab', mpqa_vocab)
    save_object('datasets/mpqa_w2v_embs', w2v_embeddings)
    save_object('datasets/mpqa_glove_embs', glove_embeddings)
    save_object('datasets/mpqa_nb_embs', nb_embeddings)

def create_mr():
    pos = read_file('raw_datasets/rt-polarity.pos')
    neg = read_file('raw_datasets/rt-polarity.neg')

    # build matrices
    X, y = [], []
    for sent in pos:
        X.append(clean_str(sent))
        y.append([0, 1])
    for sent in neg:
        X.append(clean_str(sent))
        y.append([1, 0])

    # build vocab
    mr_vocab = Vocabulary(X)
    print('vocab', len(mr_vocab.vocab))

    # encode sents
    max_seq_len = compute_avg_len(X)
    for i in range(len(X)):
        X[i] = encode_sent(X[i].split(' '), mr_vocab.encoding, max_seq_len)

    # build embeddings
    embeddings = []
    for name, (emb_vocab, emb_vectors) in embeddings_map.items():
        embedding, found = create_embeddings(
            mr_vocab, emb_vocab, emb_vectors, 300
        )
        embeddings.append(embedding)
        print('{} - {}'.format(name, found))
    w2v_embeddings, glove_embeddings, nb_embeddings = embeddings

    # shuffle
    X, y = np.array(X), np.array(y)
    indices = np.random.permutation(len(X))
    X, y = X[indices], y[indices]

    split_idx = int(len(X) * 0.9)
    X_train, X_valid = X[:split_idx], X[split_idx:]
    y_train, y_valid = y[:split_idx], y[split_idx:]
    print('train', X_train.shape, y_train.shape)
    print('valid', X_valid.shape, y_valid.shape)

    # save objects
    save_object('datasets/mr_train', (X_train, y_train))
    save_object('datasets/mr_valid', (X_valid, y_valid))
    save_object('datasets/mr_vocab', mr_vocab)
    save_object('datasets/mr_w2v_embs', w2v_embeddings)
    save_object('datasets/mr_glove_embs', glove_embeddings)
    save_object('datasets/mr_nb_embs', nb_embeddings)

def test(images, dims, threshold, plt_hw, seq_len, no_yolo, beam, beam_size, context):
    print("Loading model...")
    if not no_yolo:
        yolo = model_zoo.get_model('yolo3_darknet53_voc', pretrained=True, ctx=context)
    wpod = WpodNet()
    wpod.load_parameters("model/wpod_net.params", ctx=context)
    vocab = Vocabulary()
    vocab.load("model/vocabulary.json")
    ocr = OcrNet(plt_hw, vocab.size(), seq_len)
    ocr.load_parameters("model/ocr_net.params", ctx=context)
    for path in images:
        print(path)
        raw = load_image(path)
        if no_yolo:
            detect_plate(wpod, vocab, ocr, raw, dims, threshold, plt_hw, beam, beam_size, context)
        else:
            ts = time.time()
            x, _ = data.transforms.presets.yolo.transform_test(raw, short=512)
            classes, scores, bboxes = yolo(x.as_in_context(context))
            # rescale bounding boxes from the network input size back to the raw image
            bboxes[0, :, 0::2] = bboxes[0, :, 0::2] / x.shape[3] * raw.shape[1]
            bboxes[0, :, 1::2] = bboxes[0, :, 1::2] / x.shape[2] * raw.shape[0]
            vehicles = [
                fixed_crop(raw, bboxes[0, i]) for i in range(classes.shape[1])
                if (yolo.classes[int(classes[0, i].asscalar())] == 'car'
                    or yolo.classes[int(classes[0, i].asscalar())] == 'bus')
                and scores[0, i].asscalar() > 0.5
            ]
            print("yolo profiling: %f" % (time.time() - ts))
            for i, raw in enumerate(vehicles):
                print("vehicle[%d]:" % i)
                detect_plate(wpod, vocab, ocr, raw, dims, threshold, plt_hw, beam, beam_size, context)

def process_corpus(lines: List[str], vocab: Vocabulary, f_output: str):
    """
    Encode each corpus line with the vocabulary and write the result to a file

    Args:
        lines (list): List of strings containing the lines of the corpus
        vocab (Vocabulary): Vocabulary class instantiated using the 'vocab' file
        f_output (str): Text file to store the processed corpus

    Returns:
        None
    """
    with open(f_output, 'w', encoding='utf-8') as f:
        for line in lines:
            line = re.sub('[^A-Za-z0-9 ]+', '', line)
            print(vocab.encode(line), file=f)

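# A minimal end-to-end sketch tying the count-based build_vocab (the variant
# taking f_corpus/f_vocab above) and process_corpus together. The file names
# are placeholders; Vocabulary(f_vocab, max_len) and vocab.encode() come from
# the surrounding project and are assumed available.
def _preprocess_example():
    lines, vocab = build_vocab('corpus.txt', 'vocab.txt',
                               min_frequency=2, max_len=50)
    process_corpus(lines, vocab, 'corpus.ids.txt')
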
def main(): train_dataframe, valid_dataframe = make_train_valid_dfs() train_loader = make_loaders( dataframe=train_dataframe, vocabulary=Vocabulary(freq_threshold=config.FREQ_THRESHOLD), transforms=get_transforms(mode="train"), mode="train", ) vocab = train_loader.dataset.vocab valid_loader = make_loaders( dataframe=valid_dataframe, vocabulary=vocab, transforms=get_transforms(mode="valid"), mode="valid", ) encoder, decoder, encoder_optimizer, decoder_optimizer = build_model( vocab_size=vocab.vocab_size) criterion = nn.CrossEntropyLoss(ignore_index=vocab.stoi["<PAD>"]) encoder_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( encoder_optimizer, factor=config.FACTOR, patience=config.PATIENCE, verbose=True) decoder_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( decoder_optimizer, factor=config.FACTOR, patience=config.PATIENCE, verbose=True) for epoch in range(config.EPOCHS): train_loss = train_one_epoch( train_loader, encoder, decoder, criterion, encoder_optimizer, decoder_optimizer, config.DEVICE, ) # encoder_scheduler.step(valid_loss.avg) # decoder_scheduler.step(valid_loss.avg) predict(valid_loader, encoder, decoder, config.DEVICE)
def main():
    print('Invoked as:', ' '.join(sys.argv), file=sys.stderr)

    parser = argparse.ArgumentParser()
    parser.add_argument('corpus')
    parser.add_argument('dev_corpus')
    parser.add_argument('--layers', type=int, default=1)
    parser.add_argument('--emb_dim', type=int, default=128)
    parser.add_argument('--hidden_dim', type=int, default=128)
    parser.add_argument('--minibatch_size', type=int, default=1)
    parser.add_argument('--tied', action='store_true')
    parser.add_argument('--autobatch', action='store_true')
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--output', type=str, default='')
    harness.add_optimizer_args(parser)
    args = parser.parse_args()

    if args.output == '':
        args.output = '/tmp/model%d' % random.randint(0, 0xFFFF)
    print('Output file:', args.output, file=sys.stderr)

    vocab = Vocabulary()
    train_corpus = read_corpus(args.corpus, vocab)
    dev_corpus = read_corpus(args.dev_corpus, vocab)
    print('Vocab size:', len(vocab), file=sys.stderr)

    with open(args.output + '.vocab', 'w') as f:
        for word in vocab.i2w:
            print(word, file=f)

    pc = dy.ParameterCollection()
    optimizer = harness.make_optimizer(args, pc)
    model = RNNLM(pc, args.layers, args.emb_dim, args.hidden_dim, len(vocab), args.tied)
    print('Total parameters:', pc.parameter_count(), file=sys.stderr)
    harness.train(model, train_corpus, dev_corpus, optimizer, args)

def main(): train_dataframe, valid_dataframe = make_train_valid_dfs() train_loader = make_loaders( dataframe=train_dataframe, vocabulary=Vocabulary(freq_threshold=config.FREQ_THRESHOLD), transforms=get_transforms(mode="train"), mode="train", ) vocab = train_loader.dataset.vocab valid_loader = make_loaders( dataframe=valid_dataframe, vocabulary=vocab, transforms=get_transforms(mode="valid"), mode="valid", ) # model = CaptioningTransformer(vocab_size=vocab.vocab_size, d_model=config.D_MODEL).to(config.DEVICE) model = TransformerCaptioning(vocab_size=vocab.vocab_size).to( config.DEVICE) optimizer = torch.optim.Adam(model.parameters(), lr=config.LR) lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.8, patience=3) criterion = nn.CrossEntropyLoss(ignore_index=vocab.stoi["<PAD>"]) train_eval( config.EPOCHS, model, train_loader, valid_loader, criterion, optimizer, config.DEVICE, config, lr_scheduler, )
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('model')
    parser.add_argument('vocab')
    parser.add_argument('corpus')
    parser.add_argument('--layers', type=int, default=1)
    parser.add_argument('--emb_dim', type=int, default=128)
    parser.add_argument('--hidden_dim', type=int, default=128)
    parser.add_argument('--minibatch_size', type=int, default=1)
    parser.add_argument('--tied', action='store_true')
    parser.add_argument('--autobatch', action='store_true')
    parser.add_argument('--sent_level', action='store_true')
    args = parser.parse_args()

    vocab = Vocabulary()
    with open(args.vocab) as f:
        for line in f:
            word = line.strip()
            vocab.convert(word)
    print('Loaded a vocabulary of size %d' % (len(vocab)))
    eos = vocab.convert('</s>')

    pc = dy.ParameterCollection()
    rnnlm = RNNLM(pc, args.layers, args.emb_dim, args.hidden_dim, len(vocab), args.tied)
    pc.populate_from_textfile(args.model)
    # rnnlm, = dy.load(args.model, pc)
    print('Total parameters:', pc.parameter_count())

    # Sampling code, kept for reference:
    # for i in range(100):
    #     rnnlm.new_graph()
    #     sampled_sent = rnnlm.sample(eos, 100)
    #     sampled_sent = [vocab.to_word(word_id) for word_id in sampled_sent]
    #     print(' '.join(sampled_sent))
    #     sys.stdout.flush()
    # sys.exit(0)

    rnnlm.set_dropout(0.0)
    vocab.frozen = True
    corpus = read_corpus(args.corpus, vocab)
    run_test_set(rnnlm, corpus, args)