def dump_vocab_tsv(self, filepath=None):

        embeddings = {}
        self.eval()
        self.train_feed.reset_offset()
        for j in tqdm(range(self.train_feed.size), desc='dump emb'):
            input_ = self.train_feed.next_batch(batch_size=1)
            idxs, pair, targets = input_
            w1_state, w2_state = self.encode_pair(pair)

            sample = self.dataset.trainset_dict[idxs[0]] #since batch size = 1 idxs[0] works
            w1, w2 = sample.pair
            w1_text = ' '.join(w1).replace('@@ ', '')
            w2_text = ' '.join(w2).replace('@@ ', '')

            embeddings[w1_text] = w1_state.tolist()[0]
            embeddings[w2_text] = w2_state.tolist()[0]


        if not filepath:
            filepath = self.config.ROOT_DIR + '/vocab.tsv'
            
        vector_filepath = filepath.replace('.tsv', '.vector.tsv')
        token_filepath  = filepath.replace('.tsv', '.token.tsv')
        
        vector_file = open(vector_filepath, 'w')
        token_file  = open(token_filepath,  'w')    

        for word, vector in tqdm(embeddings.items(), desc='writing to file'):
            vector_file.write('\t'.join([str(v) for v in vector]) + '\n')
            token_file.write(word + '\n')
            
        vector_file.close()
        token_file.close()
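# Illustration (added, not part of the original source): dump_vocab_tsv writes
# one tab-separated embedding per line of the .vector.tsv file and the matching
# token on the same line of the .token.tsv file, a layout that embedding
# projectors such as TensorBoard's accept. A minimal sketch of reading the pair
# back, assuming numpy is available and the default file names from above:

import numpy as np

vectors = np.loadtxt('vocab.vector.tsv', delimiter='\t')  # (num_tokens, emb_dim)
tokens = open('vocab.token.tsv').read().splitlines()      # one token per matching line
restored = dict(zip(tokens, vectors.tolist()))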
Example #2
            def train_on_feed(feed):

                losses = []
                feed.reset_offset()
                for j in tqdm(range(feed.num_batch), desc='Trainer.{}'.format(self.name())):
                    self.optimizer.zero_grad()
                    input_ = feed.next_batch()
                    idxs, (gender, sequence), targets = input_
                    sequence = sequence.transpose(0,1)
                    seq_size, batch_size = sequence.size()

                    state = self.initial_hidden(batch_size)
                    loss = 0
                    output = sequence[0]
                    positions = LongVar(self.config, np.linspace(0, 1, seq_size))
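                    # Decode the name one step at a time, conditioning each step
                    # on the gender and a per-step position value. The next input
                    # is chosen by the teacher-forcing coin flip below: with
                    # probability teacher_forcing_ratio the ground-truth next
                    # character is fed back, otherwise the model's own argmax
                    # prediction. teacher_force_count is assumed to be defined in
                    # the enclosing scope and only tallies the two branches.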
                    for ti in range(1, sequence.size(0) - 1):
                        output = self.forward(gender, positions[ti], output, state)
                        loss += self.loss_function(ti, output, input_)
                        output, state = output

                        if random.random() > self.teacher_forcing_ratio:
                            output = output.max(1)[1]
                            teacher_force_count[0] += 1
                        else:
                            output = sequence[ti+1]
                            teacher_force_count[1] += 1

                    losses.append(loss)
                    loss.backward()
                    self.optimizer.step()
                    
                return torch.stack(losses).mean()
    def do_validate(self):
        self.eval()
        if self.test_feed.num_batch > 0:
            losses, accuracies = [], []
            for j in tqdm(range(self.test_feed.num_batch), desc='Tester.{}'.format(self.name())):
                input_ = self.test_feed.next_batch()
                idxs, pair, targets = input_
                output = self.__(self.forward(pair), 'output')
                loss   = self.loss_function(output, targets)
                
                losses.append(loss)

            epoch_loss = torch.stack(losses).mean()

            self.test_loss.append(epoch_loss.data.item())

            self.log.info('= {} =loss:{}'.format(self.epoch, epoch_loss))
            
        if len(self.best_model_criteria) > 1:
            if self.best_model_criteria[-2] > self.best_model_criteria[-1]:
                self.log.info('beat best ..')
                self.best_model = (self.best_model_criteria[-1],
                                   self.cpu().state_dict())                             
                
                self.save_best_model()
                #self.dump_vocab_tsv()
            
                if self.config.CONFIG.cuda:
                    self.cuda()
        
        for m in self.metrics:
            m.write_to_file()
            
        if self.early_stopping:
            return self.loss_trend()
Example #4
    def build_samples(raw_samples):
        samples = []
        for i, (gender,
                name) in enumerate(tqdm(raw_samples, desc='processing names')):
            try:

                #name = remove_punct_symbols(name)
                name = tamil.utf8.get_letters(name.strip())

                if len(name) < 2:
                    continue

                log.debug('===')
                log.debug(pformat(name))

                samples.append(Sample('{}.{}'.format(gender, i), gender, name))

                if max_sample_size and len(samples) > max_sample_size:
                    break

            except:
                skipped += 1
                log.exception('{}'.format(name))

        return samples
Example #5
    def build_samples(raw_samples):
        samples = []
        for i, (gender,
                name) in enumerate(tqdm(raw_samples, desc='processing names')):
            try:

                #name = remove_punct_symbols(name)
                name = tamil.utf8.get_letters(name.strip())

                if len(name) < 2:
                    continue

                log.debug('===')
                log.debug(pformat(name))

                for a, b in zip(range(len(name)), range(1, len(name) - 1)):
                    template = list(NULL_CHAR * len(name))
                    template[a] = name[a]
                    template[b] = name[b]
                    samples.append(
                        Sample('{}.{}'.format(gender, i), gender, template,
                               name))

                if max_sample_size and len(samples) > max_sample_size:
                    break

            except:
                skipped += 1
                log.exception('{}'.format(name))

        return samples
Example #6
    def load_data(set_='train'):
        skipped = 0
        samples = []

        for i, line in enumerate(tqdm(
                open(
                    '{}/{}.tsv'.format(dataset_path, set_)
                ).readlines())):
            
            try:
                #print(line.split('\t'))
                pid, sid, line, label = line.strip().split('\t')
                samples.append(
                    Sample(
                    id = '{}.{}.{}.{}'.format(pid, sid, i, label),
                        sequence = line,
                        label    = label,
                    )
                )

            except KeyboardInterrupt:
                raise KeyboardInterrupt
            except:
                skipped += 1
                log.exception(dataset_path)
                            
        print('skipped {} samples'.format(skipped))
        return samples
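# Illustration (added, not part of the original source): the loader above expects
# four tab-separated fields per line: paragraph id, sentence id, the sentence
# text, and its label; malformed lines are counted and skipped. The field values
# below are made up.

example_line = 'p1\ts2\tsome sentence text\tLABEL\n'
pid, sid, text, label = example_line.strip().split('\t')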
Example #7
    def do_train2(self):
        if not hasattr(self, 'batch_cache'):
            self.build_cache_for_train2()
        
        for epoch in range(self.epochs):
            self.log.critical('memory consumed : {}'.format(memory_consumed()))            
            self.epoch = epoch
            if epoch and epoch % max(1, (self.checkpoint - 1)) == 0:
                #self.do_predict()
                if self.do_validate() == FLAGS.STOP_TRAINING:
                    self.log.info('loss trend suggests to stop training')
                    return
                           
            self.train()
            losses = []
            for input_ in tqdm(self.batch_cache, desc='Trainer.{}'.format(self.name())):
                self.optimizer.zero_grad()
                idxs, word, targets = input_
                output = self.__(self.forward(word), 'output')
                loss   = self.loss_function(output, targets)
                    
                losses.append(loss)
                loss.backward()
                self.optimizer.step()

            epoch_loss = torch.stack(losses).mean()
            self.train_loss.append(epoch_loss.data.item())

            self.log.info('-- {} -- loss: {}\n'.format(epoch, epoch_loss))
            for m in self.metrics:
                m.write_to_file()

        return True
    def do_train(self):
        for epoch in range(self.epochs):
            self.log.critical('memory consumed : {}'.format(memory_consumed()))
            self.epoch = epoch
            if epoch % max(1, (self.checkpoint - 1)) == 0:
                #self.do_predict()
                if self.do_validate() == FLAGS.STOP_TRAINING:
                    self.log.info('loss trend suggests to stop training')
                    return

            self.train()
            losses = []
            for j in tqdm(range(self.train_feed.num_batch),
                          desc='Trainer.{}'.format(self.name())):
                self.optimizer.zero_grad()
                input_ = self.train_feed.next_batch()
                idxs, inputs, targets = input_

                output = self.forward(input_)
                loss = self.loss_function(output, input_)
                #print(loss.data.cpu().numpy())
                losses.append(loss)
                loss.backward()
                self.optimizer.step()

            epoch_loss = torch.stack(losses).mean()
            self.train_loss.append(epoch_loss.data.item())

            self.log.info('-- {} -- loss: {}\n'.format(epoch, epoch_loss))
            for m in self.metrics:
                m.write_to_file()

        return True
Example #9
def load_filmreviews_data(config,
                          filename=('../dataset/filmreviews/reviews.subword_nmt.csv',
                                    '../dataset/filmreviews/ratings.csv'),
                          max_sample_size=None):
    
    samples = []
    skipped = 0

    input_vocab = Counter()
    output_vocab = Counter()
    
    try:
        log.info('processing file: {}'.format(filename))
        text_file, label_file = [open(f).readlines() for f in filename]
        for i, (s, l) in tqdm(enumerate(zip(text_file, label_file)),
                            desc='processing {}'.format(filename)):

            s, l = s.strip(), l.strip()
            label = float(l.strip().lower())
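            # Binarise the numeric rating: 2.75 and above counts as 'positive',
            # anything lower as 'negative'.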
            if label >= 2.75:
                label = 'positive'
            else:
                label = 'negative'
            samples.append(
                Sample(i,
                       s.strip().split(),
                       label
                )
            )
            
            
            if  max_sample_size and len(samples) > max_sample_size:
                break

    except:
        skipped += 1
        log.exception('{}'.format(filename))

    print('skipped {} samples'.format(skipped))
    

    if max_sample_size:
        samples = samples[:max_sample_size]

    log.info('building input_vocabulary...')
    for sample in samples:
        input_vocab.update(sample.sequence)            
        output_vocab.update([sample.label])

    pivot = int(len(samples) * config.CONFIG.split_ratio)
    train_samples, test_samples = samples[:pivot], samples[pivot:]
    train_samples = sorted(train_samples, key=lambda x: len(x.sequence), reverse=True)
    test_samples  = sorted(test_samples, key=lambda x: len(x.sequence), reverse=True)
    return Dataset(filename,
                   (train_samples, test_samples),
                   Vocab(input_vocab, special_tokens=VOCAB),
                   Vocab(output_vocab))
Example #10
def load_squad_data(data_path, ids, max_para_len=600, max_ans_len=10):
    dataset = json.load(open(data_path, 'r'))
    samples = []
    qn, an = 0, 0
    skipped = 0

    vocabulary = defaultdict(int)

    def __(s):
        import unicodedata
        s = ''.join(c for c in unicodedata.normalize('NFKD', s)
                    if unicodedata.category(c) != 'Mn')
        return s.replace("``", '"').replace("''", '"')

    try:
        for aid, article in enumerate(tqdm(dataset['data'])):
            for pid, paragraph in enumerate(article['paragraphs']):

                context = TokenString(__(paragraph['context']),
                                      word_tokenize).delete_whitespace()
                questions = paragraph['qas']

                for token in context:
                    vocabulary[token] += 1

                for qid, qa in enumerate(questions):
                    log.debug('processing: {}.{}.{}'.format(aid, pid, qid))
                    q = TokenString(__(qa['question']),
                                    word_tokenize).delete_whitespace()
                    a = TokenString(__(qa['answers'][0]['text']),
                                    word_tokenize).delete_whitespace(
                                    )  #simply ignore other answers
                    squad_id = qa['id']
                    for token in q:
                        vocabulary[token] += 1

                    indices = context.index(a)
                    if not indices:
                        log.debug(pformat(paragraph['context']))
                        log.debug(pformat(paragraph['qas'][qid]))
                        log.error('{}.{}.{} - "{}" not found in \n"{}"'.format(
                            aid, pid, qid, a.tokenized_string,
                            context.tokenized_string))
                        skipped += 1
                        continue

                    a_start, a_end = indices
                    fields = (aid, pid, qid, squad_id, context, q, a,
                              list(range(a_start, a_end)))
                    _id = tuple(fields[i - 1] for i in ids)
                    samples.append(Sample(_id, *fields))
    except:
        skipped += 1
        log.exception('{}'.format(aid))

    print('skipped {} samples'.format(skipped))
    return samples, vocabulary
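# Illustration (added, not part of the original source): a minimal sketch of the
# SQuAD-style JSON layout the loader above walks: data -> paragraphs ->
# (context, qas), where each qa carries an id, a question and a list of answers,
# of which only the first is used.

squad_like = {
    'data': [{
        'paragraphs': [{
            'context': 'An example context sentence.',
            'qas': [{
                'id': 'q0',
                'question': 'What is this?',
                'answers': [{'text': 'example context sentence'}],
            }],
        }],
    }],
}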
Example #11
def load_task_data(task=1, type_='train', max_sample_size=None):
    samples = []
    qn, an = 0, 0
    skipped = 0

    input_vocabulary = Counter()
    output_vocabulary = Counter()

    try:
        filename = glob.glob('../dataset/en-10k/qa{}_*_{}.txt'.format(
            task, type_))[0]

        task_name = re.search(r'qa\d+_(.*)_.*.txt', filename)
        if task_name:
            task_name = task_name.group(1)

        log.info('processing file: {}'.format(filename))
        dataset = open(filename).readlines()
        prev_linenum = 1000000
        for line in tqdm(dataset):
            questions, answers = [], []
            linenum, line = line.split(' ', 1)

            linenum = int(linenum)
            if prev_linenum > linenum:
                story = ''

            if '?' in line:
                q, a, _ = line.split('\t')

                samples.append(
                    Sample('{}.{}'.format(task, linenum), task,
                           linenum, task_name,
                           TokenString(story.lower(), word_tokenize),
                           TokenString(q.lower(), word_tokenize), a.lower()))

            else:
                story += ' ' + line

            prev_linenum = linenum

    except:
        skipped += 1
        log.exception('{}.{}'.format(task, linenum))

    print('skipped {} samples'.format(skipped))

    samples = sorted(samples, key=lambda x: len(x.story), reverse=True)
    if max_sample_size:
        samples = samples[:max_sample_size]

    log.info('building input_vocabulary...')
    for sample in samples:
        input_vocabulary.update(sample.story + sample.q)
        output_vocabulary.update([sample.a])

    return task_name, samples, input_vocabulary, output_vocabulary
Example #12
    def read_words(filename=config.HPCONFIG.lm_dataset_path):
        samples = []
        for line in tqdm(
                open(filename).readlines()[:config.HPCONFIG.lm_samples_count],
                'reading lm file for words'):
            s = line.split()
            s = [('neutral', n) for n in s]
            samples.extend(s)

        return list(set(samples))
Example #13
def load_tawiki_data(config, dataset_name='tawiki', max_sample_size=None):
    samples = []
    skipped = 0

    vocab = Counter()
    
    try:
        filename = glob.glob('../dataset/tawiki_lines.txt')[0]
              
        log.info('processing file: {}'.format(filename))
        dataset = open(filename).readlines()
        for i, line in enumerate(tqdm(dataset, desc='processing {}'.format(filename))):
            import string

            #print(line)
            try:
                line = line.strip()
                
                if len(line) > 20:

                    for j, segment in enumerate(line.split('. ')):
                        if len(segment) < 20:
                            continue
                        
                        samples.append(
                            Sample(
                                id = '{}.{}.{}'.format(dataset_name, i, j),
                                sequence = [str(i) for i in utf8_to_tace16(segment)]
                            )
                        )
                    """
                    samples.append(
                            Sample(
                                id = '{}.{}'.format(dataset_name, i),
                                sequence = [str(i) for i in utf8_to_tace16(line)]
                            )
                        )
                    """
            except:
                log.exception('{}.{}.{} -  {}'.format(dataset_name, i, j, segment))
    except:
        skipped += 1
        log.exception('{}.{} -  {}'.format(dataset_name, i, line))

    print('skipped {} samples'.format(skipped))
    
    samples = sorted(samples, key=lambda x: len(x.sequence), reverse=True)
    if max_sample_size:
        samples = samples[:max_sample_size]

    log.info('building vocab...')
    for sample in samples:
        vocab.update(sample.sequence)

    return os.path.basename(filename), samples, vocab
Example #14
    def do_validate(self):
        self.eval()
        for j in tqdm(range(self.test_feed.num_batch),
                      desc='Tester.{}'.format(self.name())):
            input_ = self.test_feed.next_batch()
            idxs, inputs, targets = input_
            sequence = inputs[0].transpose(0, 1)
            _, batch_size = sequence.size()

            state = self.initial_hidden(batch_size)
            loss, accuracy = Var(self.config, [0]), Var(self.config, [0])
            output = sequence[0]
            outputs = []
            ti = 0
            for ti in range(1, sequence.size(0) - 1):
                output = self.forward(output, state)
                loss += self.loss_function(ti, output, input_)
                accuracy += self.accuracy_function(ti, output, input_)
                output, state = output
                output = output.max(1)[1]
                outputs.append(output)

            self.test_loss.cache(loss.item())
            if ti == 0: ti = 1
            self.accuracy.cache(accuracy.item() / ti)
            #print('====', self.test_loss, self.accuracy)

        self.log.info('= {} =loss:{}'.format(self.epoch,
                                             self.test_loss.epoch_cache))
        self.log.info('- {} -accuracy:{}'.format(self.epoch,
                                                 self.accuracy.epoch_cache))

        if self.best_model[0] < self.accuracy.epoch_cache.avg:
            self.log.info('beat best ..')
            last_acc = self.best_model[0]
            self.best_model = (self.accuracy.epoch_cache.avg,
                               self.state_dict())

            self.save_best_model()

            if self.config.CONFIG.cuda:
                self.cuda()

        self.test_loss.clear_cache()
        self.accuracy.clear_cache()

        for m in self.metrics:
            m.write_to_file()

        if self.early_stopping:
            return self.loss_trend()
Example #15
def load_data(config,
               filename='../dataset/lm_lengthsorted.txt',
               max_sample_size=None):
    
    samples = []
    skipped = 0

    input_vocab = Counter()
    output_vocab = Counter()
    
    try:
        log.info('processing file: {}'.format(filename))
        text_file = open(filename).readlines()[:config.HPCONFIG.max_samples]
        for i, l in tqdm(enumerate(text_file),
                            desc='processing {}'.format(filename)):

            sentence = l.strip().split()

            if len(sentence) > 3:
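                # Keep sentences longer than three tokens; all but the final
                # token become the sequence and the final token becomes the
                # label, i.e. a last-word prediction sample.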
                samples.append(
                    Sample(i,
                           sentence[:-1],
                           sentence[-1]
                    )
                )
            
            if  max_sample_size and len(samples) > max_sample_size:
                break

    except:
        skipped += 1
        log.exception('{}'.format(filename))

    print('skipped {} samples'.format(skipped))
    
    samples = sorted(samples, key=lambda x: len(x.sequence), reverse=True)
    if max_sample_size:
        samples = samples[:max_sample_size]

    log.info('building input_vocabulary...')
    for sample in samples:
        input_vocab.update(sample.sequence + [sample.label])            

    pivot = int(len(samples) * config.CONFIG.split_ratio)
    train_samples, test_samples = samples[:pivot], samples[pivot:]
    vocab = Vocab(input_vocab, special_tokens=VOCAB)
    return Dataset(filename,
                   (train_samples, test_samples),
                   input_vocab = vocab,
                   output_vocab = vocab)
Example #16
    def do_train(self):
        for epoch in range(self.epochs):
            self.log.critical('memory consumed : {}'.format(memory_consumed()))
            self.epoch = epoch
            if epoch % max(1, (self.checkpoint - 1)) == 0:
                if self.do_validate() == FLAGS.STOP_TRAINING:
                    self.log.info('loss trend suggests to stop training')
                    return

            self.train()
            teacher_force_count = [0, 0]
            for j in tqdm(range(self.train_feed.num_batch),
                          desc='Trainer.{}'.format(self.name())):
                self.optimizer.zero_grad()
                input_ = self.train_feed.next_batch()
                idxs, inputs, targets = input_
                sequence = inputs[0].transpose(0, 1)
                _, batch_size = sequence.size()

                state = self.initial_hidden(batch_size)
                loss = 0
                output = sequence[0]
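                # Step-by-step decoding with a fixed 50/50 teacher-forcing split:
                # each step feeds back either the model's argmax prediction or
                # the ground-truth next token.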
                for ti in range(1, sequence.size(0) - 1):
                    output = self.forward(output, state)
                    loss += self.loss_function(ti, output, input_)
                    output, state = output

                    if random.random() > 0.5:
                        output = output.max(1)[1]
                        teacher_force_count[0] += 1
                    else:
                        output = sequence[ti + 1]
                        teacher_force_count[1] += 1

                loss.backward()
                self.train_loss.cache(loss.data.item())
                self.optimizer.step()

            self.log.info(
                'teacher_force_count: {}'.format(teacher_force_count))

            self.log.info('-- {} -- loss: {}\n'.format(
                epoch, self.train_loss.epoch_cache))
            self.train_loss.clear_cache()

            for m in self.metrics:
                m.write_to_file()

        return True
Example #17
    def do_validate(self):
        self.eval()
        if self.test_feed.num_batch > 0:
            for j in tqdm(range(self.test_feed.num_batch), desc='Tester.{}'.format(self.name())):
                input_ = self.test_feed.next_batch()
                idxs, (gender, sequence), targets = input_
                sequence = sequence.transpose(0,1)
                seq_size, batch_size = sequence.size()

                state = self.initial_hidden(batch_size)
                loss, accuracy = Var(self.config, [0]), Var(self.config, [0])
                output = sequence[0]
                outputs = []
                ti = 0
                positions = LongVar(self.config, np.linspace(0, 1, seq_size))
                for ti in range(1, sequence.size(0) - 1):
                    output = self.forward(gender, positions[ti], output, state)
                    loss += self.loss_function(ti, output, input_)
                    accuracy += self.accuracy_function(ti, output, input_)
                    output, state = output
                    output = output.max(1)[1]
                    outputs.append(output)

                self.test_loss.append(loss.item())
                if ti == 0: ti = 1
                self.accuracy.append(accuracy.item()/ti)
                #print('====', self.test_loss, self.accuracy)

            self.log.info('= {} =loss:{}'.format(self.epoch, self.test_loss))
            self.log.info('- {} -accuracy:{}'.format(self.epoch, self.accuracy))

            
        if len(self.best_model_criteria) > 1 and self.best_model[0] > self.best_model_criteria[-1]:
            self.log.info('beat best ..')
            self.best_model = (self.best_model_criteria[-1],
                               self.cpu().state_dict())                             

            self.save_best_model()
            
            if self.config.CONFIG.cuda:
                self.cuda()

        
        for m in self.metrics:
            m.write_to_file()
            
        if self.early_stopping:
            return self.loss_trend()
Example #18
    def do_validate(self):
        self.eval()
        if self.test_feed.num_batch > 0:
            losses, accuracies = [], []
            for j in tqdm(range(self.test_feed.num_batch),
                          desc='Tester.{}'.format(self.name())):
                input_ = self.test_feed.next_batch()
                idxs, word, targets = input_
                loss = 0
                encoded_info = self.__(self.encode(word), 'output')
                state = self.init_hidden(targets.size(1))
                state = encoded_info[-1], state[1]
                prev_output = self.initial_token
                for i in range(targets.size(0)):
                    output = self.decode(prev_output, state)
                    loss += self.loss_function(output, targets[i])
                    prev_output = output.max(1)[1].long()

                losses.append(loss)

            epoch_loss = torch.stack(losses).mean()

            self.test_loss.append(epoch_loss.data.item())

            self.log.info('= {} =loss:{}'.format(self.epoch, epoch_loss))

        if len(self.best_model_criteria) > 1:
            if self.best_model[0] > self.best_model_criteria[-1]:
                self.log.info('beat best ..')
                self.best_model = (self.best_model_criteria[-1],
                                   self.cpu().state_dict())

                self.save_best_model()
                """
                dump_vocab_tsv(self.config,
                               self.dataset.input_vocab,
                               self.embed.weight.data.cpu().numpy(),
                               self.config.ROOT_DIR + '/vocab.tsv')
                """
                if self.config.CONFIG.cuda:
                    self.cuda()

        for m in self.metrics:
            m.write_to_file()

        if self.early_stopping:
            return self.loss_trend()
Example #19
    def do_validate(self):
        self.eval()
        if self.test_feed.num_batch > 0:
            losses, accuracies = [], []
            for j in tqdm(range(self.test_feed.num_batch),
                          desc='Tester.{}'.format(self.name())):
                input_ = self.test_feed.next_batch()
                idxs, seq, targets = input_

                seq_size, batch_size = seq.size()
                pad_mask = (seq > 0).float()

                loss = 0
                outputs = []
                output = self.__(seq[0], 'output')
                state = self.__(self.init_hidden(batch_size), 'init_hidden')
                for index in range(seq_size - 1):
                    output, state = self.__(self.forward(output, state),
                                            'output, state')
                    loss += self.loss_function(output, targets[index + 1])
                    output = self.__(output.max(1)[1], 'output')
                    outputs.append(output)

                losses.append(loss)

            epoch_loss = torch.stack(losses).mean()
            self.test_loss.append(epoch_loss.data.item())

            self.log.info('= {} =loss:{}'.format(self.epoch, epoch_loss))

        if len(self.best_model_criteria) > 1:
            if self.best_model_criteria[-2] > self.best_model_criteria[-1]:
                self.log.info('beat best ..')
                self.best_model = (self.best_model_criteria[-1],
                                   self.cpu().state_dict())

                self.save_best_model()

                if self.config.CONFIG.cuda:
                    self.cuda()

        for m in self.metrics:
            m.write_to_file()

        if self.early_stopping:
            return self.loss_trend()
Example #20
    def do_train(self):
        for epoch in range(self.epochs):
            self.log.critical('memory consumed : {}'.format(memory_consumed()))
            self.epoch = epoch
            if epoch and epoch % max(1, (self.checkpoint - 1)) == 0:
                #self.do_predict()
                if self.do_validate() == FLAGS.STOP_TRAINING:
                    self.log.info('loss trend suggests to stop training')
                    return

            self.train()
            losses = []
            for j in tqdm(range(self.train_feed.num_batch),
                          desc='Trainer.{}'.format(self.name())):
                self.optimizer.zero_grad()
                input_ = self.train_feed.next_batch()
                idxs, seq, targets = input_

                seq_size, batch_size = seq.size()
                pad_mask = (seq > 0).float()

                loss = 0
                outputs = []
                output = self.__(seq[0], 'output')
                state = self.__(self.init_hidden(batch_size), 'init_hidden')
                for index in range(seq_size - 1):
                    output, state = self.__(self.forward(output, state),
                                            'output, state')
                    loss += self.loss_function(output, targets[index + 1])
                    output = self.__(output.max(1)[1], 'output')
                    outputs.append(output)

                losses.append(loss)
                loss.backward()
                self.optimizer.step()

            epoch_loss = torch.stack(losses).mean()
            self.train_loss.append(epoch_loss.data.item())

            self.log.info('-- {} -- loss: {}\n'.format(epoch, epoch_loss))
            for m in self.metrics:
                m.write_to_file()

        return True
    def do_validate(self):
        self.eval()
        if self.test_feed.num_batch > 0:
            losses, accuracies = [], []
            for j in tqdm(range(self.test_feed.num_batch),
                          desc='Tester.{}'.format(self.name())):
                input_ = self.test_feed.next_batch()
                idxs, inputs, targets = input_

                output = self.forward(input_)
                loss = self.loss_function(output, input_)
                accuracy = self.accuracy_function(output, input_)

                losses.append(loss)
                accuracies.append(accuracy)

            epoch_loss = torch.stack(losses).mean()
            epoch_accuracy = torch.stack(accuracies).mean()

            self.test_loss.append(epoch_loss.data.item())
            self.accuracy.append(epoch_accuracy.data.item())
            #print('====', self.test_loss, self.accuracy)

            self.log.info('= {} =loss:{}'.format(self.epoch, epoch_loss))
            self.log.info('- {} -accuracy:{}'.format(self.epoch,
                                                     epoch_accuracy))

        if len(self.best_model_criteria
               ) > 1 and self.best_model[0] < self.best_model_criteria[-1]:
            self.log.info('beat best ..')
            self.best_model = (self.best_model_criteria[-1],
                               self.cpu().state_dict())

            self.save_best_model()

            if self.config.CONFIG.cuda:
                self.cuda()

        for m in self.metrics:
            m.write_to_file()

        if self.early_stopping:
            return self.loss_trend()
def prep_samples(dataset):
    ret = []
    vocabulary = defaultdict(int)
    labels = defaultdict(int)

    for i, sample in tqdm(enumerate(dataset)):
        try:
            sample = build_sample(sample)
            if sample.label not in LABELS:
                continue
            for token in sample.sentence:
                vocabulary[token] += 1
            labels[sample.label] += 1
            ret.append(sample)
        except KeyboardInterrupt:
            return
        except:
            log.exception('at id: {}'.format(i))

    return ret, vocabulary, labels
Example #23
    def load_all_data():
        skipped = 0
        samples = []

        for i, line in enumerate(tqdm(open(dataset_path).readlines())):
            try:
                _, line, label, *__ = line.split('|')
                samples.append(
                    Sample(
                    id = '{}.{}'.format(label, i),
                        sequence = line,
                        label    = label,
                    )
                )

            except KeyboardInterrupt:
                raise KeyboardInterrupt
            except:
                skipped += 1
                log.exception(dataset_path)
                            
        print('skipped {} samples'.format(skipped))
        return samples
Example #24
            def train_on_feed(feed):
                losses = []
                feed.reset_offset()

                for j in tqdm(range(feed.num_batch),
                              desc='Trainer.{}'.format(self.name())):
                    self.optimizer.zero_grad()
                    input_ = feed.next_batch()
                    idxs, (gender, seq), target = input_

                    seq_size, batch_size = seq.size()
                    pad_mask = (seq > 0).float()

                    hidden_states, (hidden, cell_state) = self.__(
                        self.encode_sequence(seq), 'encoded_outputs')

                    loss = 0
                    outputs = []
                    target_size, batch_size = target.size()
                    #TODO: target[0] should not be used; it will throw an error when batchop does not prepend a GO token
                    output = self.__(target[0], 'hidden')
                    state = self.__((hidden, cell_state), 'init_hidden')
                    gender_embedding = self.gender_embed(gender)
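                    # Decode the target one position at a time, attending over
                    # the encoder states and conditioning on the gender
                    # embedding; the loss is accumulated against the next target
                    # token and the argmax prediction is fed back as the next
                    # input (no teacher forcing here).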
                    for index in range(target_size - 1):
                        output, state = self.__(
                            self.decode(hidden_states, output, state,
                                        gender_embedding), 'output, state')
                        loss += self.loss_function(output, target[index + 1])
                        output = self.__(output.max(1)[1], 'output')
                        outputs.append(output)

                    losses.append(loss)
                    loss.backward()
                    self.optimizer.step()

                return torch.stack(losses).mean()
Example #25
def experiment(VOCAB, raw_samples, datapoints=[[], []], eons=1000, epochs=10, checkpoint=5):
    try:
        encoder =  Encoder(Config(), 'encoder', len(VOCAB))
        decoder =  PtrDecoder(Config(), 'decoder', encoder.embed, VOCAB['GO'], len(VOCAB))
        try:
            encoder.load_state_dict(torch.load('{}.{}.{}'.format(SELF_NAME, 'encoder', 'pth')))
            decoder.load_state_dict(torch.load('{}.{}.{}'.format(SELF_NAME, 'decoder', 'pth')))
            log.info('loaded the old image for the model')
        except:
            log.exception('failed to load the model')

        if Config().cuda:
            log.info('cuda the model...')
            encoder.cuda()
            decoder.cuda()

        model = (encoder, decoder)
        print('**** the model', model)

        name = os.path.basename(__file__).replace('.py', '')
        
        _batchop = partial(batchop, WORD2INDEX=VOCAB)
        train_feed     = DataFeed(name, datapoints[0], batchop=_batchop, batch_size=100)
        test_feed      = DataFeed(name, datapoints[1], batchop=_batchop, batch_size=100)
        predictor_feed = DataFeed(name, datapoints[1], batchop=_batchop, batch_size=100)

        _loss = partial(loss, loss_function=nn.NLLLoss(), UNK=VOCAB['UNK'])
        _accuracy = partial(accuracy,  UNK=VOCAB['UNK'])
        trainer = Trainer(name=name,
                          model=(encoder, decoder),
                          loss_function=_loss, accuracy_function=_accuracy, f1score_function=f1score,
                          checkpoint=checkpoint, epochs=epochs,
                          feeder = Feeder(train_feed, test_feed))

        _repr_function=partial(repr_function, VOCAB=VOCAB, raw_samples=raw_samples)
        _process_predictor_output = partial(process_predictor_output, UNK=VOCAB['UNK'])
        predictor = Predictor(model = (encoder, decoder),
                              feed  = predictor_feed,
                              repr_function  = _repr_function,
                              process_output = _process_predictor_output)

        dump = open('results/experiment_attn.csv', 'w')        
        for e in range(eons):
            log.info('on {}th eon'.format(e))

            dump.write('#========================after eon: {}\n'.format(e))
            results = ListTable()
            for ri in tqdm(range(predictor_feed.num_batch//10)):
                output, _results = predictor.predict(predictor_feed.num_batch - ri, 3)
                results.extend(_results)
                
            dump.write(repr(results))
            dump.flush()

            if not trainer.train():
                raise Exception
    except:
        log.exception('####################')
        trainer.save_best_model()

        return locals()
Example #26
def load_data(config, dirname='../dataset/', max_sample_size=None):

    samples = []
    skipped = 0

    input_vocab = Counter()
    gender_vocab = Counter()

    #########################################################
    # Read names
    #########################################################
    def read_data(filename='names.csv'):
        data = open(filename).readlines()
        samples = []
        for datum in data:
            name = datum.split(',')[1]
            name = ''.join(name.split())
            samples.append(remove_punct_symbols(name))

        return samples

    def read_dirs(dirs=['boy', 'girl']):
        samples = []
        for d in dirs:
            for filename in os.listdir('{}/{}'.format(dirname, d)):
                s = read_data('{}/{}/{}'.format(dirname, d, filename))
                s = [(d, n) for n in s]
                samples.extend(s)

        return list(set(samples))

    raw_samples = read_dirs()
    log.info('read {} names'.format(len(raw_samples)))

    #########################################################
    # Read tamil words
    #########################################################
    def read_words(filename=config.HPCONFIG.lm_dataset_path):
        samples = []
        for line in tqdm(
                open(filename).readlines()[:config.HPCONFIG.lm_samples_count],
                'reading lm file for words'):
            s = line.split()
            s = [('neutral', n) for n in s]
            samples.extend(s)

        return list(set(samples))

    pretrain_samples = read_words()

    #########################################################
    # build vocab
    #########################################################
    all_samples = raw_samples + pretrain_samples
    log.info('building input_vocabulary...')

    for gender, name in tqdm(all_samples, desc='building vocab'):
        name = remove_punct_symbols(name)
        name = tamil.utf8.get_letters(name.strip())
        if len(name):
            input_vocab.update(name)
            gender_vocab.update([gender])

    vocab = Vocab(input_vocab, special_tokens=VOCAB, freq_threshold=50)

    print(gender_vocab)
    gender_vocab = Vocab(gender_vocab, special_tokens=[])

    if config.CONFIG.write_vocab_to_file:
        vocab.write_to_file(config.ROOT_DIR + '/input_vocab.csv')
        gender_vocab.write_to_file(config.ROOT_DIR + '/gender_vocab.csv')

    def build_samples(raw_samples):
        samples = []
        for i, (gender,
                name) in enumerate(tqdm(raw_samples, desc='processing names')):
            try:

                #name = remove_punct_symbols(name)
                name = tamil.utf8.get_letters(name.strip())

                if len(name) < 2:
                    continue

                log.debug('===')
                log.debug(pformat(name))

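                # Build partially-masked samples: each template keeps only two
                # adjacent letters of the name and blanks out the rest with
                # NULL_CHAR, pairing the masked template with the full name as
                # the target.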
                for a, b in zip(range(len(name)), range(1, len(name) - 1)):
                    template = list(NULL_CHAR * len(name))
                    template[a] = name[a]
                    template[b] = name[b]
                    samples.append(
                        Sample('{}.{}'.format(gender, i), gender, template,
                               name))

                if max_sample_size and len(samples) > max_sample_size:
                    break

            except:
                skipped += 1
                log.exception('{}'.format(name))

        return samples

    pretrain_samples = build_samples(pretrain_samples)
    samples = build_samples(raw_samples)
    print('skipped {} samples'.format(skipped))

    pivot = int(len(samples) * config.CONFIG.split_ratio)
    train_samples, test_samples = samples[:pivot], samples[pivot:]
    #train_samples, test_samples = samples, []

    #train_samples = sorted(train_samples, key=lambda x: len(x.sequence), reverse=True)

    return NameDataset('names', (train_samples, test_samples),
                       pretrain_samples=pretrain_samples,
                       input_vocab=vocab,
                       gender_vocab=gender_vocab)
Example #27
    def do_train(self):
        for epoch in range(self.epochs):

            self.log.critical('memory consumed : {}'.format(memory_consumed()))
            self.epoch = epoch
            if epoch and epoch % max(1, (self.checkpoint - 1)) == 0:
                #self.do_predict()
                if self.do_validate() == FLAGS.STOP_TRAINING:
                    self.log.info('loss trend suggests to stop training')
                    return

            self.train()
            losses = []
            tracemalloc.start()
            for j in tqdm(range(self.train_feed.num_batch),
                          desc='Trainer.{}'.format(self.name())):
                self.optimizer.zero_grad()
                input_ = self.train_feed.next_batch()
                idxs, word, targets = input_

                loss = 0
                encoded_info = self.__(self.encode(word), 'encoded_info')

                keys = self.__(self.keys.transpose(0, 1), 'keys')
                keys = self.__(
                    keys.expand([encoded_info.size(0), *keys.size()]), 'keys')
                inner_product = self.__(
                    torch.bmm(
                        encoded_info.unsqueeze(1),  #final state
                        keys),
                    'inner_product')

                values = self.__(self.values, 'values')
                values = self.__(
                    values.expand([inner_product.size(0), *values.size()]),
                    'values')

                weighted_sum = self.__(torch.bmm(inner_product, values),
                                       'weighted_sum')
                weighted_sum = self.__(weighted_sum.squeeze(1), 'weighted_sum')

                #make the same change in do_[predict|validate]
                tseq_len, batch_size = targets.size()
                state = self.__(
                    (weighted_sum, self.init_hidden(batch_size).squeeze(0)),
                    'decoder initial state')
                #state = self.__( (encoded_info, state[1].squeeze(0)), 'decoder initial state')
                prev_output = self.__(
                    self.sos_token.expand([encoded_info.size(0)]), 'sos_token')

                for i in range(targets.size(0)):
                    output = self.decode(prev_output, state)
                    loss += self.loss_function(output, targets[i])
                    prev_output = output.max(1)[1].long()

                losses.append(loss)
                loss.backward()
                self.optimizer.step()

                del input_  #, keys, values

                if j and not j % 100000:
                    malloc_snap = tracemalloc.take_snapshot()
                    display_tracemalloc_top(malloc_snap, limit=100)

            epoch_loss = torch.stack(losses).mean()
            self.train_loss.append(epoch_loss.data.item())

            self.log.info('-- {} -- loss: {}\n'.format(epoch, epoch_loss))
            for m in self.metrics:
                m.write_to_file()

        return True
Example #28
 def build_cache_for_train2(self):
     self.batch_cache = []
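     # Materialise every batch from the training feed once so that do_train2
     # can iterate over the cached list each epoch instead of re-batching.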
     for j in tqdm(range(self.train_feed.num_batch), desc='building cache'):
         input_ = self.train_feed.next_batch()
         self.batch_cache.append(input_)
Example #29
def multiplexed_train(config, argv, name, ROOT_DIR, model, dataset):
    _batchop = partial(batchop,
                       VOCAB=dataset.input_vocab,
                       LABELS=dataset.output_vocab)
    predictor_feed = DataFeed(name,
                              dataset.testset,
                              batchop=_batchop,
                              batch_size=1)
    predictor = Predictor(name,
                          model=model,
                          directory=ROOT_DIR,
                          feed=predictor_feed,
                          repr_function=partial(repr_function,
                                                VOCAB=dataset.input_vocab,
                                                LABELS=dataset.output_vocab,
                                                dataset=dataset.testset_dict))

    loss_ = partial(loss, loss_function=nn.NLLLoss())
    test_feed, tester = {}, {}
    train_feed = {}
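    # One test feed and Tester per sub-dataset, plus a combined Tester over the
    # full test set; the per-subset train feeds are later wrapped in a
    # MultiplexedDataFeed that drives the MultiplexedTrainer.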
    for subset in dataset.datasets:
        test_feed[subset.name] = DataFeed(subset.name,
                                          subset.testset,
                                          batchop=_batchop,
                                          batch_size=config.CONFIG.batch_size)
        train_feed[subset.name] = DataFeed(subset.name,
                                           portion(
                                               subset.trainset,
                                               config.HPCONFIG.trainset_size),
                                           batchop=_batchop,
                                           batch_size=config.CONFIG.batch_size)

        tester[subset.name] = Tester(name=subset.name,
                                     config=config,
                                     model=model,
                                     directory=ROOT_DIR,
                                     loss_function=loss_,
                                     accuracy_function=accuracy,
                                     feed=test_feed[subset.name],
                                     save_model_weights=False)

    test_feed[name] = DataFeed(name,
                               dataset.testset,
                               batchop=_batchop,
                               batch_size=config.CONFIG.batch_size)

    tester[name] = Tester(name=name,
                          config=config,
                          model=model,
                          directory=ROOT_DIR,
                          loss_function=loss_,
                          accuracy_function=accuracy,
                          feed=test_feed[name],
                          predictor=predictor)

    train_feed_muxed = MultiplexedDataFeed(name, train_feed, _batchop,
                                           config.CONFIG.batch_size)
    trainer = MultiplexedTrainer(
        name=name,
        config=config,
        model=model,
        directory=ROOT_DIR,
        optimizer=optim.Adam(model.parameters()),
        loss_function=loss_,
        testers=tester,
        checkpoint=config.CONFIG.CHECKPOINT,
        epochs=config.CONFIG.EPOCHS,
        feed=train_feed_muxed,
    )

    for e in range(config.CONFIG.EONS):

        if not trainer.train():
            raise Exception

        dump = open('{}/results/eon_{}.csv'.format(ROOT_DIR, e), 'w')
        log.info('on {}th eon'.format(e))
        results = ListTable()
        for ri in tqdm(range(predictor_feed.num_batch),
                       desc='\nrunning prediction on eon: {}'.format(e)):
            output, _results = predictor.predict(ri)
            results.extend(_results)
        dump.write(repr(results))
        dump.close()
Example #30
    if args.task == 'train':
        net.do_train()

    if args.task == 'drop-words-and-validate':
        net.drop_words_and_validate(args.epoch)
        
    if args.task == 'dump-vocab':
        from collections import Counter
        from utilz import Sample
        counter = Counter()
        for s in dataset.trainset:
            counter.update([s.word, s.context])

        embedding = []
        words = sorted(counter.keys())
        for w in tqdm(words):
            ids, word, context = _batchop([Sample('0', w, '')], for_prediction=True)
            emb = net.__(net.embed(word), 'emb')
            embedding.append(emb)

        embedding = torch.stack(embedding).squeeze()
        dump_vocab_tsv(config,
                       words,
                       embedding.cpu().detach().numpy(),
                       config.ROOT_DIR + '/vocab.tsv')

        
    if args.task == 'dump-cosine-similarity':
        dump_cosine_similarity_tsv(config,
                   dataset.input_vocab,
                   net.embed.weight.data.cpu(),