def dump_vocab_tsv(self, filepath=None):
    embeddings = {}
    self.eval()
    self.train_feed.reset_offset()
    for j in tqdm(range(self.train_feed.size), desc='dump emb'):
        input_ = self.train_feed.next_batch(batch_size=1)
        idxs, pair, targets = input_
        w1_state, w2_state = self.encode_pair(pair)

        sample = self.dataset.trainset_dict[idxs[0]]  # since batch_size == 1, idxs[0] works
        w1, w2 = sample.pair
        w1_text = ' '.join(w1).replace('@@ ', '')
        w2_text = ' '.join(w2).replace('@@ ', '')  # was joining w1 twice, which made w2 overwrite w1's embedding
        embeddings[w1_text] = w1_state.tolist()[0]
        embeddings[w2_text] = w2_state.tolist()[0]

    if not filepath:
        filepath = self.config.ROOT_DIR + '/vocab.tsv'

    vector_filepath = filepath.replace('.tsv', '.vector.tsv')
    token_filepath = filepath.replace('.tsv', '.token.tsv')

    vector_file = open(vector_filepath, 'w')
    token_file = open(token_filepath, 'w')
    for word, vector in tqdm(embeddings.items(), desc='writing to file'):
        vector_file.write('\t'.join([str(v) for v in vector]) + '\n')
        token_file.write(word + '\n')

    vector_file.close()
    token_file.close()
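# Usage sketch (not part of the trainer): reading back the two files written by
# dump_vocab_tsv above. Assumes numpy is available and the paths point at the files
# produced with the default filepath; adjust the names to your ROOT_DIR.
import numpy as np

vectors = np.loadtxt('vocab.vector.tsv', delimiter='\t')   # one embedding per row
tokens = open('vocab.token.tsv').read().splitlines()       # one token per row, same order
assert len(tokens) == vectors.shape[0]
print(tokens[0], vectors[0][:5])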
def train_on_feed(feed):
    # note: `self` and `teacher_force_count` come from the enclosing scope (closure inside a trainer method)
    losses = []
    feed.reset_offset()
    for j in tqdm(range(feed.num_batch), desc='Trainer.{}'.format(self.name())):
        self.optimizer.zero_grad()
        input_ = feed.next_batch()
        idxs, (gender, sequence), targets = input_
        sequence = sequence.transpose(0, 1)
        seq_size, batch_size = sequence.size()
        state = self.initial_hidden(batch_size)

        loss = 0
        output = sequence[0]
        positions = LongVar(self.config, np.linspace(0, 1, seq_size))
        for ti in range(1, sequence.size(0) - 1):
            output = self.forward(gender, positions[ti], output, state)
            loss += self.loss_function(ti, output, input_)
            output, state = output

            # scheduled teacher forcing: feed back either the prediction or the ground truth
            if random.random() > self.teacher_forcing_ratio:
                output = output.max(1)[1]
                teacher_force_count[0] += 1
            else:
                output = sequence[ti + 1]
                teacher_force_count[1] += 1

        losses.append(loss)
        loss.backward()
        self.optimizer.step()

    return torch.stack(losses).mean()
def do_validate(self):
    self.eval()
    if self.test_feed.num_batch > 0:
        losses, accuracies = [], []
        for j in tqdm(range(self.test_feed.num_batch), desc='Tester.{}'.format(self.name())):
            input_ = self.test_feed.next_batch()
            idxs, pair, targets = input_

            output = self.__(self.forward(pair), 'output')
            loss = self.loss_function(output, targets)
            losses.append(loss)

        epoch_loss = torch.stack(losses).mean()
        self.test_loss.append(epoch_loss.data.item())
        self.log.info('= {} =loss:{}'.format(self.epoch, epoch_loss))

    if len(self.best_model_criteria) > 1:
        if self.best_model_criteria[-2] > self.best_model_criteria[-1]:
            self.log.info('beat best ..')
            self.best_model = (self.best_model_criteria[-1], self.cpu().state_dict())
            self.save_best_model()
            #self.dump_vocab_tsv()
            if self.config.CONFIG.cuda:
                self.cuda()

    for m in self.metrics:
        m.write_to_file()

    if self.early_stopping:
        return self.loss_trend()
def build_samples(raw_samples):
    # note: `skipped`, `max_sample_size` and `log` come from the enclosing scope
    samples = []
    for i, (gender, name) in enumerate(tqdm(raw_samples, desc='processing names')):
        try:
            #name = remove_punct_symbols(name)
            name = tamil.utf8.get_letters(name.strip())
            if len(name) < 2:
                continue

            log.debug('===')
            log.debug(pformat(name))

            samples.append(Sample('{}.{}'.format(gender, i), gender, name))

            if max_sample_size and len(samples) > max_sample_size:
                break
        except:
            skipped += 1
            log.exception('{}'.format(name))

    return samples
def build_samples(raw_samples):
    # note: `skipped`, `max_sample_size`, `NULL_CHAR` and `log` come from the enclosing scope
    samples = []
    for i, (gender, name) in enumerate(tqdm(raw_samples, desc='processing names')):
        try:
            #name = remove_punct_symbols(name)
            name = tamil.utf8.get_letters(name.strip())
            if len(name) < 2:
                continue

            log.debug('===')
            log.debug(pformat(name))

            # keep each pair of adjacent letters of the name and mask the rest
            for a, b in zip(range(len(name)), range(1, len(name) - 1)):
                template = list(NULL_CHAR * len(name))
                template[a] = name[a]
                template[b] = name[b]
                samples.append(
                    Sample('{}.{}'.format(gender, i), gender, template, name))

            if max_sample_size and len(samples) > max_sample_size:
                break
        except:
            skipped += 1
            log.exception('{}'.format(name))

    return samples
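# A minimal sketch of the template construction above, assuming NULL_CHAR is a single
# placeholder character such as '_' (the actual value is defined elsewhere in the repo).
name = ['k', 'a', 'm', 'l']
NULL_CHAR = '_'
for a, b in zip(range(len(name)), range(1, len(name) - 1)):
    template = list(NULL_CHAR * len(name))
    template[a] = name[a]
    template[b] = name[b]
    print(''.join(template))   # prints 'ka__' then '_am_': adjacent letter pairs kept, rest masked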
def load_data(set_='train'):
    skipped = 0
    samples = []
    for i, line in enumerate(tqdm(
            open('{}/{}.tsv'.format(dataset_path, set_)).readlines())):
        try:
            #print(line.split('\t'))
            pid, sid, line, label = line.strip().split('\t')
            samples.append(
                Sample(
                    id='{}.{}.{}.{}'.format(pid, sid, i, label),
                    sequence=line,
                    label=label,
                )
            )
        except KeyboardInterrupt:
            raise KeyboardInterrupt
        except:
            skipped += 1
            log.exception(dataset_path)

    print('skipped {} samples'.format(skipped))
    return samples
def do_train2(self):
    if not hasattr(self, 'batch_cache'):
        self.build_cache_for_train2()

    for epoch in range(self.epochs):
        self.log.critical('memory consumed : {}'.format(memory_consumed()))
        self.epoch = epoch
        if epoch and epoch % max(1, (self.checkpoint - 1)) == 0:
            #self.do_predict()
            if self.do_validate() == FLAGS.STOP_TRAINING:
                self.log.info('loss trend suggests to stop training')
                return

        self.train()
        losses = []
        for input_ in tqdm(self.batch_cache, desc='Trainer.{}'.format(self.name())):
            self.optimizer.zero_grad()
            idxs, word, targets = input_

            output = self.__(self.forward(word), 'output')
            loss = self.loss_function(output, targets)
            losses.append(loss)

            loss.backward()
            self.optimizer.step()

        epoch_loss = torch.stack(losses).mean()
        self.train_loss.append(epoch_loss.data.item())

        self.log.info('-- {} -- loss: {}\n'.format(epoch, epoch_loss))
        for m in self.metrics:
            m.write_to_file()

    return True
def do_train(self):
    for epoch in range(self.epochs):
        self.log.critical('memory consumed : {}'.format(memory_consumed()))
        self.epoch = epoch
        if epoch % max(1, (self.checkpoint - 1)) == 0:
            #self.do_predict()
            if self.do_validate() == FLAGS.STOP_TRAINING:
                self.log.info('loss trend suggests to stop training')
                return

        self.train()
        losses = []
        for j in tqdm(range(self.train_feed.num_batch), desc='Trainer.{}'.format(self.name())):
            self.optimizer.zero_grad()
            input_ = self.train_feed.next_batch()
            idxs, inputs, targets = input_

            output = self.forward(input_)
            loss = self.loss_function(output, input_)
            #print(loss.data.cpu().numpy())
            losses.append(loss)

            loss.backward()
            self.optimizer.step()

        epoch_loss = torch.stack(losses).mean()
        self.train_loss.append(epoch_loss.data.item())

        self.log.info('-- {} -- loss: {}\n'.format(epoch, epoch_loss))
        for m in self.metrics:
            m.write_to_file()

    return True
def load_filmreviews_data(config,
                          filename=('../dataset/filmreviews/reviews.subword_nmt.csv',
                                    '../dataset/filmreviews/ratings.csv'),
                          max_sample_size=None):
    samples = []
    skipped = 0

    input_vocab = Counter()
    output_vocab = Counter()
    try:
        log.info('processing file: {}'.format(filename))
        text_file, label_file = [open(f).readlines() for f in filename]
        for i, (s, l) in tqdm(enumerate(zip(text_file, label_file)),
                              desc='processing {}'.format(filename)):
            s, l = s.strip(), l.strip()
            label = float(l.strip().lower())
            if label >= 2.75:
                label = 'positive'
            else:
                label = 'negative'

            samples.append(Sample(i, s.strip().split(), label))

            if max_sample_size and len(samples) > max_sample_size:
                break
    except:
        skipped += 1
        log.exception('{}'.format(filename))  # was logging undefined `line`

    print('skipped {} samples'.format(skipped))

    if max_sample_size:
        samples = samples[:max_sample_size]

    log.info('building input_vocabulary...')
    for sample in samples:
        input_vocab.update(sample.sequence)
        output_vocab.update([sample.label])

    pivot = int(len(samples) * config.CONFIG.split_ratio)
    train_samples, test_samples = samples[:pivot], samples[pivot:]

    train_samples = sorted(train_samples, key=lambda x: len(x.sequence), reverse=True)
    test_samples = sorted(test_samples, key=lambda x: len(x.sequence), reverse=True)

    return Dataset(filename,
                   (train_samples, test_samples),
                   Vocab(input_vocab, special_tokens=VOCAB),
                   Vocab(output_vocab))
def load_squad_data(data_path, ids, max_para_len=600, max_ans_len=10):
    dataset = json.load(open(data_path, 'r'))

    samples = []
    qn, an = 0, 0
    skipped = 0

    vocabulary = defaultdict(int)

    def __(s):
        import unicodedata
        s = ''.join(c for c in unicodedata.normalize('NFKD', s)
                    if unicodedata.category(c) != 'Mn')
        return s.replace("``", '"').replace("''", '"')

    try:
        for aid, article in enumerate(tqdm(dataset['data'])):
            for pid, paragraph in enumerate(article['paragraphs']):
                context = TokenString(__(paragraph['context']), word_tokenize).delete_whitespace()
                questions = paragraph['qas']

                for token in context:
                    vocabulary[token] += 1

                for qid, qa in enumerate(questions):
                    log.debug('processing: {}.{}.{}'.format(aid, pid, qid))
                    q = TokenString(__(qa['question']), word_tokenize).delete_whitespace()
                    a = TokenString(__(qa['answers'][0]['text']),
                                    word_tokenize).delete_whitespace()  # simply ignore other answers
                    squad_id = qa['id']

                    for token in q:
                        vocabulary[token] += 1

                    indices = context.index(a)
                    if not indices:
                        log.debug(pformat(paragraph['context']))
                        log.debug(pformat(paragraph['qas'][qid]))
                        log.error('{}.{}.{} - "{}" not found in \n"{}"'.format(
                            aid, pid, qid, a.tokenized_string, context.tokenized_string))
                        skipped += 1
                        continue

                    a_start, a_end = indices
                    fields = (aid, pid, qid, squad_id,
                              context, q, a, list(range(a_start, a_end)))

                    _id = tuple(fields[i - 1] for i in ids)
                    samples.append(Sample(_id, *fields))
    except:
        skipped += 1
        log.exception('{}'.format(aid))

    print('skipped {} samples'.format(skipped))
    return samples, vocabulary
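# Standalone sketch of the __() normalization helper above: NFKD-decompose the string,
# drop combining marks, and normalize LaTeX-style quotes. Uses only the standard library.
import unicodedata

def strip_accents(s):
    s = ''.join(c for c in unicodedata.normalize('NFKD', s)
                if unicodedata.category(c) != 'Mn')
    return s.replace("``", '"').replace("''", '"')

print(strip_accents("café ``quoted'' text"))   # -> 'cafe "quoted" text'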
def load_task_data(task=1, type_='train', max_sample_size=None):
    samples = []
    qn, an = 0, 0
    skipped = 0

    input_vocabulary = Counter()
    output_vocabulary = Counter()

    try:
        filename = glob.glob('../dataset/en-10k/qa{}_*_{}.txt'.format(task, type_))[0]

        task_name = re.search(r'qa\d+_(.*)_.*.txt', filename)
        if task_name:
            task_name = task_name.group(1)

        log.info('processing file: {}'.format(filename))
        dataset = open(filename).readlines()
        prev_linenum = 1000000
        for line in tqdm(dataset):
            questions, answers = [], []
            linenum, line = line.split(' ', 1)

            linenum = int(linenum)
            if prev_linenum > linenum:
                story = ''

            if '?' in line:
                q, a, _ = line.split('\t')

                samples.append(
                    Sample('{}.{}'.format(task, linenum),
                           task, linenum, task_name,
                           TokenString(story.lower(), word_tokenize),
                           TokenString(q.lower(), word_tokenize),
                           a.lower()))
            else:
                story += ' ' + line

            prev_linenum = linenum
    except:
        skipped += 1
        log.exception('{}.{}'.format(task, linenum))  # format string was missing the slot for linenum

    print('skipped {} samples'.format(skipped))

    samples = sorted(samples, key=lambda x: len(x.story), reverse=True)
    if max_sample_size:
        samples = samples[:max_sample_size]

    log.info('building input_vocabulary...')
    for sample in samples:
        input_vocabulary.update(sample.story + sample.q)
        output_vocabulary.update([sample.a])

    return task_name, samples, input_vocabulary, output_vocabulary
def read_words(filename=config.HPCONFIG.lm_dataset_path):
    samples = []
    for line in tqdm(
            open(filename).readlines()[:config.HPCONFIG.lm_samples_count],
            'reading lm file for words'):
        s = line.split()
        s = [('neutral', n) for n in s]
        samples.extend(s)

    return list(set(samples))
def load_tawiki_data(config, dataset_name='tawiki', max_sample_size=None):
    samples = []
    skipped = 0

    vocab = Counter()
    try:
        filename = glob.glob('../dataset/tawiki_lines.txt')[0]
        log.info('processing file: {}'.format(filename))
        dataset = open(filename).readlines()
        for i, line in enumerate(tqdm(dataset, desc='processing {}'.format(filename))):
            try:
                line = line.strip()
                if len(line) > 20:
                    # one sample per sentence-like segment; skip very short segments
                    for j, segment in enumerate(line.split('. ')):
                        if len(segment) < 20:
                            continue

                        samples.append(
                            Sample(
                                id='{}.{}.{}'.format(dataset_name, i, j),
                                sequence=[str(c) for c in utf8_to_tace16(segment)]
                            )
                        )
                    # alternative (disabled): one sample per full line instead of per segment
            except:
                log.exception('{}.{} - {}'.format(dataset_name, i, line))  # was logging undefined `word`
    except:
        skipped += 1
        log.exception('{}.{} - {}'.format(dataset_name, i, line))

    print('skipped {} samples'.format(skipped))

    samples = sorted(samples, key=lambda x: len(x.sequence), reverse=True)
    if max_sample_size:
        samples = samples[:max_sample_size]

    log.info('building vocab...')
    for sample in samples:
        vocab.update(sample.sequence)

    return os.path.basename(filename), samples, vocab
def do_validate(self):
    self.eval()
    for j in tqdm(range(self.test_feed.num_batch), desc='Tester.{}'.format(self.name())):
        input_ = self.test_feed.next_batch()
        idxs, inputs, targets = input_
        sequence = inputs[0].transpose(0, 1)
        _, batch_size = sequence.size()

        state = self.initial_hidden(batch_size)
        loss, accuracy = Var(self.config, [0]), Var(self.config, [0])
        output = sequence[0]
        outputs = []
        ti = 0
        for ti in range(1, sequence.size(0) - 1):
            output = self.forward(output, state)
            loss += self.loss_function(ti, output, input_)
            accuracy += self.accuracy_function(ti, output, input_)
            output, state = output
            output = output.max(1)[1]
            outputs.append(output)

        self.test_loss.cache(loss.item())
        if ti == 0:
            ti = 1
        self.accuracy.cache(accuracy.item() / ti)
        #print('====', self.test_loss, self.accuracy)

    self.log.info('= {} =loss:{}'.format(self.epoch, self.test_loss.epoch_cache))
    self.log.info('- {} -accuracy:{}'.format(self.epoch, self.accuracy.epoch_cache))

    if self.best_model[0] < self.accuracy.epoch_cache.avg:
        self.log.info('beat best ..')
        last_acc = self.best_model[0]
        self.best_model = (self.accuracy.epoch_cache.avg, self.state_dict())
        self.save_best_model()

        if self.config.CONFIG.cuda:
            self.cuda()

    self.test_loss.clear_cache()
    self.accuracy.clear_cache()

    for m in self.metrics:
        m.write_to_file()

    if self.early_stopping:
        return self.loss_trend()
def load_data(config, filename='../dataset/lm_lengthsorted.txt', max_sample_size=None):
    samples = []
    skipped = 0

    input_vocab = Counter()
    output_vocab = Counter()
    try:
        log.info('processing file: {}'.format(filename))
        text_file = open(filename).readlines()[:config.HPCONFIG.max_samples]
        for i, l in tqdm(enumerate(text_file), desc='processing {}'.format(filename)):
            sentence = l.strip().split()
            if len(sentence) > 3:
                # last token is the target word, the rest is the context sequence
                samples.append(Sample(i, sentence[:-1], sentence[-1]))

            if max_sample_size and len(samples) > max_sample_size:
                break
    except:
        skipped += 1
        log.exception('{}'.format(filename))  # was logging undefined `line`

    print('skipped {} samples'.format(skipped))

    samples = sorted(samples, key=lambda x: len(x.sequence), reverse=True)
    if max_sample_size:
        samples = samples[:max_sample_size]

    log.info('building input_vocabulary...')
    for sample in samples:
        input_vocab.update(sample.sequence + [sample.label])

    pivot = int(len(samples) * config.CONFIG.split_ratio)
    train_samples, test_samples = samples[:pivot], samples[pivot:]

    vocab = Vocab(input_vocab, special_tokens=VOCAB)
    return Dataset(filename,
                   (train_samples, test_samples),
                   input_vocab=vocab,
                   output_vocab=vocab)
def do_train(self):
    for epoch in range(self.epochs):
        self.log.critical('memory consumed : {}'.format(memory_consumed()))
        self.epoch = epoch
        if epoch % max(1, (self.checkpoint - 1)) == 0:
            if self.do_validate() == FLAGS.STOP_TRAINING:
                self.log.info('loss trend suggests to stop training')
                return

        self.train()
        teacher_force_count = [0, 0]
        for j in tqdm(range(self.train_feed.num_batch), desc='Trainer.{}'.format(self.name())):
            self.optimizer.zero_grad()
            input_ = self.train_feed.next_batch()
            idxs, inputs, targets = input_
            sequence = inputs[0].transpose(0, 1)
            _, batch_size = sequence.size()

            state = self.initial_hidden(batch_size)
            loss = 0
            output = sequence[0]
            for ti in range(1, sequence.size(0) - 1):
                output = self.forward(output, state)
                loss += self.loss_function(ti, output, input_)
                output, state = output

                if random.random() > 0.5:
                    output = output.max(1)[1]
                    teacher_force_count[0] += 1
                else:
                    output = sequence[ti + 1]
                    teacher_force_count[1] += 1

            loss.backward()
            self.train_loss.cache(loss.data.item())
            self.optimizer.step()

        self.log.info('teacher_force_count: {}'.format(teacher_force_count))
        self.log.info('-- {} -- loss: {}\n'.format(epoch, self.train_loss.epoch_cache))
        self.train_loss.clear_cache()

        for m in self.metrics:
            m.write_to_file()

    return True
def do_validate(self):
    self.eval()
    if self.test_feed.num_batch > 0:
        for j in tqdm(range(self.test_feed.num_batch), desc='Tester.{}'.format(self.name())):
            input_ = self.test_feed.next_batch()
            idxs, (gender, sequence), targets = input_
            sequence = sequence.transpose(0, 1)
            seq_size, batch_size = sequence.size()

            state = self.initial_hidden(batch_size)
            loss, accuracy = Var(self.config, [0]), Var(self.config, [0])
            output = sequence[0]
            outputs = []
            ti = 0
            positions = LongVar(self.config, np.linspace(0, 1, seq_size))
            for ti in range(1, sequence.size(0) - 1):
                output = self.forward(gender, positions[ti], output, state)
                loss += self.loss_function(ti, output, input_)
                accuracy += self.accuracy_function(ti, output, input_)
                output, state = output
                output = output.max(1)[1]
                outputs.append(output)

            self.test_loss.append(loss.item())
            if ti == 0:
                ti = 1
            self.accuracy.append(accuracy.item() / ti)
            #print('====', self.test_loss, self.accuracy)

        self.log.info('= {} =loss:{}'.format(self.epoch, self.test_loss))
        self.log.info('- {} -accuracy:{}'.format(self.epoch, self.accuracy))

    if len(self.best_model_criteria) > 1 and self.best_model[0] > self.best_model_criteria[-1]:
        self.log.info('beat best ..')
        self.best_model = (self.best_model_criteria[-1], self.cpu().state_dict())
        self.save_best_model()

        if self.config.CONFIG.cuda:
            self.cuda()

    for m in self.metrics:
        m.write_to_file()

    if self.early_stopping:
        return self.loss_trend()
def do_validate(self):
    self.eval()
    if self.test_feed.num_batch > 0:
        losses, accuracies = [], []
        for j in tqdm(range(self.test_feed.num_batch), desc='Tester.{}'.format(self.name())):
            input_ = self.test_feed.next_batch()
            idxs, word, targets = input_

            loss = 0
            encoded_info = self.__(self.encode(word), 'output')

            state = self.init_hidden(targets.size(1))
            state = encoded_info[-1], state[1]
            prev_output = self.initial_token
            for i in range(targets.size(0)):
                output = self.decode(prev_output, state)   # fixed typo: was `prev_ouptut`
                loss += self.loss_function(output, targets[i])
                prev_output = output.max(1)[1].long()

            losses.append(loss)

        epoch_loss = torch.stack(losses).mean()
        self.test_loss.append(epoch_loss.data.item())
        self.log.info('= {} =loss:{}'.format(self.epoch, epoch_loss))

    if len(self.best_model_criteria) > 1:
        if self.best_model[0] > self.best_model_criteria[-1]:
            self.log.info('beat best ..')
            self.best_model = (self.best_model_criteria[-1], self.cpu().state_dict())
            self.save_best_model()

            """
            dump_vocab_tsv(self.config,
                           self.dataset.input_vocab,
                           self.embed.weight.data.cpu().numpy(),
                           self.config.ROOT_DIR + '/vocab.tsv')
            """
            if self.config.CONFIG.cuda:
                self.cuda()

    for m in self.metrics:
        m.write_to_file()

    if self.early_stopping:
        return self.loss_trend()
def do_validate(self):
    self.eval()
    if self.test_feed.num_batch > 0:
        losses, accuracies = [], []
        for j in tqdm(range(self.test_feed.num_batch), desc='Tester.{}'.format(self.name())):
            input_ = self.test_feed.next_batch()
            idxs, seq, targets = input_
            seq_size, batch_size = seq.size()
            pad_mask = (seq > 0).float()

            loss = 0
            outputs = []
            output = self.__(seq[0], 'output')
            state = self.__(self.init_hidden(batch_size), 'init_hidden')
            for index in range(seq_size - 1):
                output, state = self.__(self.forward(output, state), 'output, state')
                loss += self.loss_function(output, targets[index + 1])
                output = self.__(output.max(1)[1], 'output')
                outputs.append(output)

            losses.append(loss)

        epoch_loss = torch.stack(losses).mean()
        self.test_loss.append(epoch_loss.data.item())
        self.log.info('= {} =loss:{}'.format(self.epoch, epoch_loss))

    if len(self.best_model_criteria) > 1:
        if self.best_model_criteria[-2] > self.best_model_criteria[-1]:
            self.log.info('beat best ..')
            self.best_model = (self.best_model_criteria[-1], self.cpu().state_dict())
            self.save_best_model()

            if self.config.CONFIG.cuda:
                self.cuda()

    for m in self.metrics:
        m.write_to_file()

    if self.early_stopping:
        return self.loss_trend()
def do_train(self):
    for epoch in range(self.epochs):
        self.log.critical('memory consumed : {}'.format(memory_consumed()))
        self.epoch = epoch
        if epoch and epoch % max(1, (self.checkpoint - 1)) == 0:
            #self.do_predict()
            if self.do_validate() == FLAGS.STOP_TRAINING:
                self.log.info('loss trend suggests to stop training')
                return

        self.train()
        losses = []
        for j in tqdm(range(self.train_feed.num_batch), desc='Trainer.{}'.format(self.name())):
            self.optimizer.zero_grad()
            input_ = self.train_feed.next_batch()
            idxs, seq, targets = input_
            seq_size, batch_size = seq.size()
            pad_mask = (seq > 0).float()

            loss = 0
            outputs = []
            output = self.__(seq[0], 'output')
            state = self.__(self.init_hidden(batch_size), 'init_hidden')
            for index in range(seq_size - 1):
                output, state = self.__(self.forward(output, state), 'output, state')
                loss += self.loss_function(output, targets[index + 1])
                output = self.__(output.max(1)[1], 'output')
                outputs.append(output)

            losses.append(loss)
            loss.backward()
            self.optimizer.step()

        epoch_loss = torch.stack(losses).mean()
        self.train_loss.append(epoch_loss.data.item())

        self.log.info('-- {} -- loss: {}\n'.format(epoch, epoch_loss))
        for m in self.metrics:
            m.write_to_file()

    return True
def do_validate(self):
    self.eval()
    if self.test_feed.num_batch > 0:
        losses, accuracies = [], []
        for j in tqdm(range(self.test_feed.num_batch), desc='Tester.{}'.format(self.name())):
            input_ = self.test_feed.next_batch()
            idxs, inputs, targets = input_

            output = self.forward(input_)
            loss = self.loss_function(output, input_)
            accuracy = self.accuracy_function(output, input_)
            losses.append(loss)
            accuracies.append(accuracy)

        epoch_loss = torch.stack(losses).mean()
        epoch_accuracy = torch.stack(accuracies).mean()

        self.test_loss.append(epoch_loss.data.item())
        self.accuracy.append(epoch_accuracy.data.item())
        #print('====', self.test_loss, self.accuracy)

        self.log.info('= {} =loss:{}'.format(self.epoch, epoch_loss))
        self.log.info('- {} -accuracy:{}'.format(self.epoch, epoch_accuracy))

    if len(self.best_model_criteria) > 1 and self.best_model[0] < self.best_model_criteria[-1]:
        self.log.info('beat best ..')
        self.best_model = (self.best_model_criteria[-1], self.cpu().state_dict())
        self.save_best_model()

        if self.config.CONFIG.cuda:
            self.cuda()

    for m in self.metrics:
        m.write_to_file()

    if self.early_stopping:
        return self.loss_trend()
def prep_samples(dataset):
    ret = []
    vocabulary = defaultdict(int)
    labels = defaultdict(int)
    for i, sample in tqdm(enumerate(dataset)):
        try:
            sample = build_sample(sample)
            if not sample.label in LABELS:
                continue

            for token in sample.sentence:
                vocabulary[token] += 1
            labels[sample.label] += 1
            ret.append(sample)
        except KeyboardInterrupt:
            return
        except:
            log.exception('at id: {}'.format(i))

    return ret, vocabulary, labels
def load_all_data():
    skipped = 0
    samples = []
    for i, line in enumerate(tqdm(open(dataset_path).readlines())):
        try:
            _, line, label, *__ = line.split('|')
            samples.append(
                Sample(
                    id='{}.{}'.format(label, i),
                    sequence=line,
                    label=label,
                )
            )
        except KeyboardInterrupt:
            raise KeyboardInterrupt
        except:
            skipped += 1
            log.exception(dataset_path)

    print('skipped {} samples'.format(skipped))
    return samples
def train_on_feed(feed):
    losses = []
    feed.reset_offset()
    for j in tqdm(range(feed.num_batch), desc='Trainer.{}'.format(self.name())):
        self.optimizer.zero_grad()
        input_ = feed.next_batch()
        idxs, (gender, seq), target = input_
        seq_size, batch_size = seq.size()
        pad_mask = (seq > 0).float()

        hidden_states, (hidden, cell_state) = self.__(
            self.encode_sequence(seq), 'encoded_outputs')

        loss = 0
        outputs = []
        target_size, batch_size = target.size()

        #TODO: target[0] should not be used. will throw error when used without GO token from batchop
        output = self.__(target[0], 'hidden')
        state = self.__((hidden, cell_state), 'init_hidden')
        gender_embedding = self.gender_embed(gender)
        for index in range(target_size - 1):
            output, state = self.__(
                self.decode(hidden_states, output, state, gender_embedding),
                'output, state')
            loss += self.loss_function(output, target[index + 1])
            output = self.__(output.max(1)[1], 'output')
            outputs.append(output)

        losses.append(loss)
        loss.backward()
        self.optimizer.step()

    return torch.stack(losses).mean()
def experiment(VOCAB, raw_samples, datapoints=[[], []], eons=1000, epochs=10, checkpoint=5):
    try:
        encoder = Encoder(Config(), 'encoder', len(VOCAB))
        decoder = PtrDecoder(Config(), 'decoder', encoder.embed, VOCAB['GO'], len(VOCAB))
        try:
            encoder.load_state_dict(torch.load('{}.{}.{}'.format(SELF_NAME, 'encoder', 'pth')))
            decoder.load_state_dict(torch.load('{}.{}.{}'.format(SELF_NAME, 'decoder', 'pth')))
            log.info('loaded the old image for the model')
        except:
            log.exception('failed to load the model')

        if Config().cuda:
            log.info('cuda the model...')
            encoder.cuda()
            decoder.cuda()

        model = (encoder, decoder)
        print('**** the model', model)

        name = os.path.basename(__file__).replace('.py', '')

        _batchop = partial(batchop, WORD2INDEX=VOCAB)
        train_feed = DataFeed(name, datapoints[0], batchop=_batchop, batch_size=100)
        test_feed = DataFeed(name, datapoints[1], batchop=_batchop, batch_size=100)
        predictor_feed = DataFeed(name, datapoints[1], batchop=_batchop, batch_size=100)

        _loss = partial(loss, loss_function=nn.NLLLoss(), UNK=VOCAB['UNK'])
        _accuracy = partial(accuracy, UNK=VOCAB['UNK'])
        trainer = Trainer(name=name,
                          model=(encoder, decoder),
                          loss_function=_loss,
                          accuracy_function=_accuracy,
                          f1score_function=f1score,
                          checkpoint=checkpoint,
                          epochs=epochs,
                          feeder=Feeder(train_feed, test_feed))

        _repr_function = partial(repr_function, VOCAB=VOCAB, raw_samples=raw_samples)
        _process_predictor_output = partial(process_predictor_output, UNK=VOCAB['UNK'])
        predictor = Predictor(model=(encoder, decoder),
                              feed=predictor_feed,
                              repr_function=_repr_function,
                              process_output=_process_predictor_output)

        dump = open('results/experiment_attn.csv', 'w')
        for e in range(eons):
            log.info('on {}th eon'.format(e))

            dump.write('#========================after eon: {}\n'.format(e))
            results = ListTable()
            for ri in tqdm(range(predictor_feed.num_batch // 10)):
                output, _results = predictor.predict(predictor_feed.num_batch - ri, 3)
                results.extend(_results)
            dump.write(repr(results))
            dump.flush()

            if not trainer.train():
                raise Exception
    except:
        log.exception('####################')

    trainer.save_best_model()
    return locals()
def load_data(config, dirname='../dataset/', max_sample_size=None):
    samples = []
    skipped = 0

    input_vocab = Counter()
    gender_vocab = Counter()

    #########################################################
    # Read names
    #########################################################
    def read_data(filename='names.csv'):
        data = open(filename).readlines()
        samples = []
        for datum in data:
            name = datum.split(',')[1]
            name = ''.join(name.split())
            samples.append(remove_punct_symbols(name))

        return samples

    def read_dirs(dirs=['boy', 'girl']):
        samples = []
        for d in dirs:
            for filename in os.listdir('{}/{}'.format(dirname, d)):
                s = read_data('{}/{}/{}'.format(dirname, d, filename))
                s = [(d, n) for n in s]
                samples.extend(s)

        return list(set(samples))

    raw_samples = read_dirs()
    log.info('read {} names'.format(len(raw_samples)))

    #########################################################
    # Read tamil words
    #########################################################
    def read_words(filename=config.HPCONFIG.lm_dataset_path):
        samples = []
        for line in tqdm(
                open(filename).readlines()[:config.HPCONFIG.lm_samples_count],
                'reading lm file for words'):
            s = line.split()
            s = [('neutral', n) for n in s]
            samples.extend(s)

        return list(set(samples))

    pretrain_samples = read_words()

    #########################################################
    # build vocab
    #########################################################
    all_samples = raw_samples + pretrain_samples
    log.info('building input_vocabulary...')
    for gender, name in tqdm(all_samples, desc='building vocab'):
        name = remove_punct_symbols(name)
        name = tamil.utf8.get_letters(name.strip())
        if len(name):
            input_vocab.update(name)
            gender_vocab.update([gender])

    vocab = Vocab(input_vocab, special_tokens=VOCAB, freq_threshold=50)
    print(gender_vocab)
    gender_vocab = Vocab(gender_vocab, special_tokens=[])

    if config.CONFIG.write_vocab_to_file:
        vocab.write_to_file(config.ROOT_DIR + '/input_vocab.csv')
        gender_vocab.write_to_file(config.ROOT_DIR + '/gender_vocab.csv')

    def build_samples(raw_samples):
        samples = []
        for i, (gender, name) in enumerate(tqdm(raw_samples, desc='processing names')):
            try:
                #name = remove_punct_symbols(name)
                name = tamil.utf8.get_letters(name.strip())
                if len(name) < 2:
                    continue

                log.debug('===')
                log.debug(pformat(name))

                for a, b in zip(range(len(name)), range(1, len(name) - 1)):
                    template = list(NULL_CHAR * len(name))
                    template[a] = name[a]
                    template[b] = name[b]
                    samples.append(
                        Sample('{}.{}'.format(gender, i), gender, template, name))

                if max_sample_size and len(samples) > max_sample_size:
                    break
            except:
                skipped += 1
                log.exception('{}'.format(name))

        return samples

    pretrain_samples = build_samples(pretrain_samples)
    samples = build_samples(raw_samples)
    print('skipped {} samples'.format(skipped))

    pivot = int(len(samples) * config.CONFIG.split_ratio)
    train_samples, test_samples = samples[:pivot], samples[pivot:]
    #train_samples, test_samples = samples, []
    #train_samples = sorted(train_samples, key=lambda x: len(x.sequence), reverse=True)

    return NameDataset('names',
                       (train_samples, test_samples),
                       pretrain_samples=pretrain_samples,
                       input_vocab=vocab,
                       gender_vocab=gender_vocab)
def do_train(self):
    for epoch in range(self.epochs):
        self.log.critical('memory consumed : {}'.format(memory_consumed()))
        self.epoch = epoch
        if epoch and epoch % max(1, (self.checkpoint - 1)) == 0:
            #self.do_predict()
            if self.do_validate() == FLAGS.STOP_TRAINING:
                self.log.info('loss trend suggests to stop training')
                return

        self.train()
        losses = []
        tracemalloc.start()
        for j in tqdm(range(self.train_feed.num_batch), desc='Trainer.{}'.format(self.name())):
            self.optimizer.zero_grad()
            input_ = self.train_feed.next_batch()
            idxs, word, targets = input_

            loss = 0
            encoded_info = self.__(self.encode(word), 'encoded_info')

            keys = self.__(self.keys.transpose(0, 1), 'keys')
            keys = self.__(keys.expand([encoded_info.size(0), *keys.size()]), 'keys')

            inner_product = self.__(
                torch.bmm(
                    encoded_info.unsqueeze(1),  # final state
                    keys),
                'inner_product')

            values = self.__(self.values, 'values')
            values = self.__(values.expand([inner_product.size(0), *values.size()]), 'values')

            weighted_sum = self.__(torch.bmm(inner_product, values), 'weighted_sum')
            weighted_sum = self.__(weighted_sum.squeeze(1), 'weighted_sum')

            #make the same change in do_[predict|validate]
            tseq_len, batch_size = targets.size()
            state = self.__(
                (weighted_sum, self.init_hidden(batch_size).squeeze(0)),
                'decoder initial state')
            #state = self.__( (encoded_info, state[1].squeeze(0)), 'decoder initial state')

            prev_output = self.__(self.sos_token.expand([encoded_info.size(0)]), 'sos_token')
            for i in range(targets.size(0)):
                output = self.decode(prev_output, state)
                loss += self.loss_function(output, targets[i])
                prev_output = output.max(1)[1].long()

            losses.append(loss)
            loss.backward()
            self.optimizer.step()

            del input_  #, keys, values
            if j and not j % 100000:
                malloc_snap = tracemalloc.take_snapshot()
                display_tracemalloc_top(malloc_snap, limit=100)

        epoch_loss = torch.stack(losses).mean()
        self.train_loss.append(epoch_loss.data.item())

        self.log.info('-- {} -- loss: {}\n'.format(epoch, epoch_loss))
        for m in self.metrics:
            m.write_to_file()

    return True
def build_cache_for_train2(self):
    self.batch_cache = []
    for j in tqdm(range(self.train_feed.num_batch), desc='building cache'):
        input_ = self.train_feed.next_batch()
        self.batch_cache.append(input_)
def multiplexed_train(config, argv, name, ROOT_DIR, model, dataset):
    _batchop = partial(batchop, VOCAB=dataset.input_vocab, LABELS=dataset.output_vocab)
    predictor_feed = DataFeed(name, dataset.testset, batchop=_batchop, batch_size=1)
    predictor = Predictor(name,
                          model=model,
                          directory=ROOT_DIR,
                          feed=predictor_feed,
                          repr_function=partial(repr_function,
                                                VOCAB=dataset.input_vocab,
                                                LABELS=dataset.output_vocab,
                                                dataset=dataset.testset_dict))

    loss_ = partial(loss, loss_function=nn.NLLLoss())

    test_feed, tester = {}, {}
    train_feed = {}
    for subset in dataset.datasets:
        test_feed[subset.name] = DataFeed(subset.name,
                                          subset.testset,
                                          batchop=_batchop,
                                          batch_size=config.CONFIG.batch_size)

        train_feed[subset.name] = DataFeed(subset.name,
                                           portion(subset.trainset, config.HPCONFIG.trainset_size),
                                           batchop=_batchop,
                                           batch_size=config.CONFIG.batch_size)

        tester[subset.name] = Tester(name=subset.name,
                                     config=config,
                                     model=model,
                                     directory=ROOT_DIR,
                                     loss_function=loss_,
                                     accuracy_function=accuracy,
                                     feed=test_feed[subset.name],
                                     save_model_weights=False)

    test_feed[name] = DataFeed(name, dataset.testset, batchop=_batchop,
                               batch_size=config.CONFIG.batch_size)

    tester[name] = Tester(name=name,
                          config=config,
                          model=model,
                          directory=ROOT_DIR,
                          loss_function=loss_,
                          accuracy_function=accuracy,
                          feed=test_feed[name],
                          predictor=predictor)

    train_feed_muxed = MultiplexedDataFeed(name, train_feed, _batchop, config.CONFIG.batch_size)
    trainer = MultiplexedTrainer(name=name,
                                 config=config,
                                 model=model,
                                 directory=ROOT_DIR,
                                 optimizer=optim.Adam(model.parameters()),
                                 loss_function=loss_,
                                 testers=tester,
                                 checkpoint=config.CONFIG.CHECKPOINT,
                                 epochs=config.CONFIG.EPOCHS,
                                 feed=train_feed_muxed)

    for e in range(config.CONFIG.EONS):
        if not trainer.train():
            raise Exception

        dump = open('{}/results/eon_{}.csv'.format(ROOT_DIR, e), 'w')
        log.info('on {}th eon'.format(e))
        results = ListTable()
        for ri in tqdm(range(predictor_feed.num_batch),
                       desc='\nrunning prediction on eon: {}'.format(e)):
            output, _results = predictor.predict(ri)
            results.extend(_results)
        dump.write(repr(results))
        dump.close()
if args.task == 'train':
    net.do_train()

if args.task == 'drop-words-and-validate':
    net.drop_words_and_validate(args.epoch)

if args.task == 'dump-vocab':
    from collections import Counter
    from utilz import Sample
    counter = Counter()
    for s in dataset.trainset:
        counter.update([s.word, s.context])

    embedding = []
    words = sorted(counter.keys())
    for w in tqdm(words):
        ids, word, context = _batchop([Sample('0', w, '')], for_prediction=True)
        emb = net.__(net.embed(word), 'emb')
        embedding.append(emb)

    embedding = torch.stack(embedding).squeeze()
    dump_vocab_tsv(config,
                   words,
                   embedding.cpu().detach().numpy(),
                   config.ROOT_DIR + '/vocab.tsv')

if args.task == 'dump-cosine-similarity':
    dump_cosine_similarity_tsv(config,
                               dataset.input_vocab,
                               net.embed.weight.data.cpu(),