class Trainer:
  def __init__(self, is_many_to_one=True, max_epoch=5000, batch_size=10,
               learning_rate=.01, hidden_size=128, num_hidden_layer=1,
               drop_rate=0., embedding_len=100, use_tensorboard=False,
               early_stopping_history_len=7, early_stopping_allowance=3,
               verbose=1, save_best_model=False, use_cuda=False,
               data_file_count=-1, identity=None, early_stopping=False,
               pre_train=None):
    self.logger = Logger(verbose_level=verbose)
    self.is_many_to_one = is_many_to_one
    self.max_epoch = max_epoch
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.hidden_size = hidden_size
    self.num_hidden_layer = num_hidden_layer
    self.drop_rate = drop_rate
    self.embedding_len = embedding_len
    self.use_cuda = use_cuda
    self.use_tensorboard = use_tensorboard
    self.early_stopping_history_len = early_stopping_history_len
    self.early_stopping_allowance = early_stopping_allowance
    self.verbose = verbose
    self.save_best_model = save_best_model
    self.data_file_count = data_file_count
    self.identity = identity
    self.early_stopping = early_stopping
    self.pre_train = pre_train
  def train(self):
    data_manager = DataManager(self.batch_size, logger=self.logger,
                               is_many_to_one=self.is_many_to_one,
                               data_file_count=self.data_file_count,
                               pretrained_file=self.pre_train)
    if self.is_many_to_one:
      net = RNN_M2O(len(data_manager.word_list), self.embedding_len,
                    self.hidden_size, self.learning_rate, self.num_hidden_layer,
                    self.drop_rate, use_adam=True, use_cuda=self.use_cuda,
                    pretrained_emb=data_manager.pretrained_embeddings())
    else:
      net = RNN_M2M(len(data_manager.word_list), self.embedding_len,
                    self.hidden_size, self.learning_rate, self.num_hidden_layer,
                    self.drop_rate, use_adam=True, use_cuda=self.use_cuda,
                    pretrained_emb=data_manager.pretrained_embeddings())
    self._train(net, data_manager)
  def _train(self, net, data_manager):
    if self.identity is None:
      identity = 'M2O' if self.is_many_to_one else 'M2M'
      identity += '_'+str(self.learning_rate).replace('.', '')
      identity += '_'+str(self.hidden_size)
      identity += '_'+str(self.num_hidden_layer)
    else:
      identity = self.identity
    if self.use_tensorboard:
      from tensorboardX import SummaryWriter
      if os.path.exists(identity+'_logs'):
        if self.verbose > 0:
          should_rm = input(' - Log dir exists. Remove (Y/n)?')
          if should_rm.lower() == 'y' or should_rm == '':
            shutil.rmtree(identity+'_logs')
      self.writer = SummaryWriter(identity+'_logs')

    train_data_loader = data_manager.train_loader()
    valid_data_loader = data_manager.valid_loader()

    optimizer = net.get_optimizer()
    loss_fn = net.get_loss()
    self.logger.i('Start training %s...'%(identity), True)
    try:
      total_batch_per_epoch = len(train_data_loader)
      perplexity_history = deque(maxlen=self.early_stopping_history_len)
      min_perplexity = 999.
      early_stopping_violate_counter = 0
      status, _epoch_index, _perplexity_history, _min_perplexity = self._load(net, identity)
      if status:
        perplexity_history = _perplexity_history
        min_perplexity = _min_perplexity
      else:
        _epoch_index = 0
      epoch_index = 0
      for epoch_index in range(_epoch_index, self.max_epoch):
        losses = 0.
        acc = 0.
        counter = 0
        self.logger.i('[ %d / %d ] epoch:'%(epoch_index + 1, self.max_epoch), True)
        # Training
        net.train()
        for batch_index, (data, label) in enumerate(train_data_loader):
          data = T.autograd.Variable(data)
          label = T.autograd.Variable(label)
          if self.use_cuda:
            data = data.cuda()
            label = label.cuda()
          output, predicted = net(data)
          acc += (label.squeeze() == predicted).float().mean().data * data.size(0)
          loss = loss_fn(output.view(-1, len(data_manager.word_list)), label.view(-1))
          optimizer.zero_grad()
          loss.backward()
          T.nn.utils.clip_grad_norm(net.parameters(), .25)
          optimizer.step()
          losses += loss.data.cpu()[0] * data.size(0)
          counter += data.size(0)
          progress = min((batch_index + 1) / total_batch_per_epoch * 20., 20.)
          self.logger.d('[%s] (%3.f%%) loss: %.4f, acc: %.4f'%
                        ('>'*int(progress)+'-'*(20-int(progress)), progress * 5.,
                         losses / counter, acc / counter))
        mean_loss = losses / counter
        valid_losses = 0.
        valid_counter = 0
        valid_acc = 0.
        # Validation
        net.eval()
        for data, label in valid_data_loader:
          data = T.autograd.Variable(T.LongTensor(data))
          label = T.autograd.Variable(T.LongTensor(label))
          if self.use_cuda:
            data = data.cuda()
            label = label.cuda()
          output, predicted = net(data)
          valid_losses += loss_fn(output.view(-1, len(data_manager.word_list)), label.view(-1)) \
                                 .data.cpu()[0] * data.size(0)
          valid_acc += (label.squeeze() == predicted).float().mean().data * data.size(0)
          valid_counter += data.size(0)
        mean_val_loss = valid_losses/valid_counter
        mean_val_acc = valid_acc/valid_counter
        perplexity = np.exp(mean_val_loss)
        self.logger.d(' -- val_loss: %.4f, val_acc: %.4f, perplexity: %.4f'%
                      (mean_val_loss, mean_val_acc, perplexity), reset_cursor=False)
        # Log with tensorboard
        if self.use_tensorboard:
          self.writer.add_scalar('train_loss', mean_loss, epoch_index)
          self.writer.add_scalar('train_acc', acc / counter, epoch_index)
          self.writer.add_scalar('val_loss', mean_val_loss, epoch_index)
          self.writer.add_scalar('val_acc', mean_val_acc, epoch_index)
          self.writer.add_scalar('val_perp', perplexity, epoch_index)
        # Early stopping
        if self.early_stopping and perplexity > np.mean(perplexity_history):
          early_stopping_violate_counter += 1
          if early_stopping_violate_counter >= self.early_stopping_allowance:
            self.logger.i('Early stopping...', True)
            break
        else:
          early_stopping_violate_counter = 0
        # Save best model
        if self.save_best_model and perplexity < min_perplexity:
          self._save(epoch_index, net, perplexity_history, perplexity, identity)
          min_perplexity = perplexity
        perplexity_history.append(perplexity)
        self.logger.d('', True, False)
    except KeyboardInterrupt:
      self.logger.i('\n\nInterrupted', True)
    if self.use_tensorboard:
      self.writer.close()
    self.logger.i('Finish', True)
    return np.mean(perplexity_history)
  def test(self, id):
    _, lr, hs, nh = re.search(r'M2(M|O)_([0-9]+)_([0-9]+)_([0-9]+)_?', id).groups()
    lr, hs, nh = float('0.'+lr[1:]), int(hs), int(nh)

    data_manager = DataManager(self.batch_size, logger=self.logger,
                               is_many_to_one=self.is_many_to_one,
                               data_file_count=self.data_file_count,
                               pretrained_file=self.pre_train, is_test=True)
    if self.is_many_to_one:
      model = RNN_M2O
    else:
      model = RNN_M2M
    net = model(len(data_manager.word_list), self.embedding_len,
                hs, lr, nh, self.drop_rate, use_adam=True, use_cuda=self.use_cuda,
                pretrained_emb=data_manager.pretrained_embeddings())
    status, _epoch_index, _perplexity_history, _min_perplexity = self._load(net, id)
    if status:
      loss_fn = net.get_loss()

      # Testing
      test_losses = 0.
      test_acc = 0.
      test_counter = 0

      net.eval()
      for data, label in data_manager.test_loader():
        data = T.autograd.Variable(T.LongTensor(data))
        label = T.autograd.Variable(T.LongTensor(label))
        if self.use_cuda:
          data = data.cuda()
          label = label.cuda()
        output, predicted = net(data)
        test_losses += loss_fn(output.view(-1, len(data_manager.word_list)), label.view(-1)) \
                                .data.cpu()[0] * data.size(0)
        test_acc += (label.squeeze() == predicted).float().mean().data * data.size(0)
        test_counter += data.size(0)
      mean_test_loss = test_losses/test_counter
      mean_test_acc = test_acc/test_counter
      perplexity = np.exp(mean_test_loss)
      self.logger.i('Loss: %.4f, Acc: %.4f, Perp: %.4f'%(mean_test_loss, mean_test_acc, perplexity))
      return mean_test_loss, mean_test_acc, perplexity
    else:
      raise AssertionError('Model file not found!')
  def text_generate(self, given_words, id, max_len=150):
    if os.path.exists('data/word_list'):
      word_list = pickle.load(open('data/word_list', 'rb'))
    else:
      raise AssertionError('word_list not found')

    _, lr, hs, nh = re.search(r'M2(M|O)_([0-9]+)_([0-9]+)_([0-9]+)_?', id).groups()
    lr, hs, nh = float('0.'+lr[1:]), int(hs), int(nh)

    if self.is_many_to_one:
      net = RNN_M2O(len(word_list), self.embedding_len,
                    hs, lr, nh, self.drop_rate, use_adam=True, use_cuda=self.use_cuda)
    else:
      net = RNN_M2M(len(word_list), self.embedding_len,
                    hs, lr, nh, self.drop_rate, use_adam=True, use_cuda=self.use_cuda)
    status, _, _, _ = self._load(net, id)
    if status:
      word_index_dict = {w: i for i, w in enumerate(word_list)}
      given_words = given_words.lower().strip().split()
      # Index 1 is <sos> and index 2 is <unk> in the word list built by DataManager
      given_words = [1]+[word_index_dict[word] if word in word_index_dict else 2
                         for word in given_words]

      state = None
      for i in range(max_len):
        if i < len(given_words):
          cur_var = T.autograd.Variable(T.LongTensor([[given_words[i]]]))
          if self.use_cuda:
            cur_var = cur_var.cuda()
          _, predicted, state = net(cur_var, state, return_states=True)
          if i >= len(given_words)-1:
            if predicted[0].cpu().data[0] > 0:
              given_words.append(predicted[0].cpu().data[0])
            else:
              break
      print('Text generated: %s'%(' '.join([word_list[word] for word in given_words[1:]])))
      print('Finished')
    else:
      raise AssertionError('Save not found!')
  def _save(self, global_step, net, perplexity_history, min_perplexity, identity):
    T.save({
        'epoch': global_step+1,
        'state_dict': net.state_dict(),
        'perplexity_history': perplexity_history,
        'min_perplexity': min_perplexity,
        'optimizer': net.optimizer.state_dict()
    }, identity+'_best')
  def _load(self, net, identity):
    if os.path.exists(identity+'_best'):
      checkpoint = T.load(identity+'_best')
    elif os.path.exists(identity):
      checkpoint = T.load(identity)
    else:
      return False, None, None, None
    net.load_state_dict(checkpoint['state_dict'])
    net.get_optimizer().load_state_dict(checkpoint['optimizer'])
    return True, checkpoint['epoch'], checkpoint['perplexity_history'], \
           checkpoint['min_perplexity']
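The validation metric in _train above is word-level perplexity, i.e. the exponential of the mean cross-entropy, and early stopping counts how often the current perplexity exceeds the moving average of recent epochs. Below is a minimal, self-contained sketch of that logic; the loss values are made up, the window and allowance are shortened versions of early_stopping_history_len and early_stopping_allowance, and only numpy and collections.deque are assumed.

import numpy as np
from collections import deque

# hypothetical per-epoch mean validation losses
val_losses = [3.0, 2.5, 2.4, 2.6, 2.7, 2.8, 2.9]

history = deque(maxlen=3)      # plays the role of early_stopping_history_len
allowance, violations = 3, 0   # plays the role of early_stopping_allowance

for epoch, mean_val_loss in enumerate(val_losses):
    perplexity = np.exp(mean_val_loss)              # perplexity = exp(mean cross-entropy)
    if len(history) > 0 and perplexity > np.mean(history):
        violations += 1                             # worse than the recent average
        if violations >= allowance:
            print('Early stopping at epoch', epoch + 1)
            break
    else:
        violations = 0
    history.append(perplexity)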
class Trainer:
    def __init__(self,
                 model_generator,
                 train_dataset,
                 valid_dataset,
                 test_dataset,
                 batch_size=50,
                 max_epoch=1000,
                 use_cuda=True,
                 use_tensorboard=False,
                 early_stopping_history_len=50,
                 early_stopping_patience=5,
                 collate_fn=None,
                 verbose=1,
                 save_best_model=False):
        self.logger = Logger(verbose_level=verbose)
        self.model_generator = model_generator
        self.train_dataset = train_dataset
        self.valid_dataset = valid_dataset
        self.test_dataset = test_dataset
        self.batch_size = batch_size
        self.max_epoch = max_epoch
        self.use_cuda = use_cuda
        self.use_tensorboard = use_tensorboard
        self.early_stopping_history_len = early_stopping_history_len
        self.early_stopping_patience = early_stopping_patience
        self.collate_fn = collate_fn
        self.save_best_model = save_best_model
        self.counter = 0

    def train(self):
        emotions = self.train_dataset.EMOTIONS
        best_valid_corrcoef = {}
        best_test_corrcoef = {}
        for emotion in emotions:
            self.train_dataset.set_emotion(emotion)
            self.valid_dataset.set_emotion(emotion)
            self.test_dataset.set_emotion(emotion)
            train_loader = T.utils.data.DataLoader(self.train_dataset,
                                                   batch_size=self.batch_size,
                                                   shuffle=True)
            valid_loader = T.utils.data.DataLoader(self.valid_dataset,
                                                   batch_size=self.batch_size,
                                                   shuffle=True)
            test_loader = T.utils.data.DataLoader(self.test_dataset,
                                                  batch_size=self.batch_size,
                                                  shuffle=True)
            model = self.model_generator(self.train_dataset.wordict_size,
                                         self.train_dataset.weight)
            best_valid_corrcoef[emotion], \
            best_test_corrcoef[emotion] = self._train(model, train_loader, valid_loader, test_loader,
                                                      identity=emotion)
            del model, train_loader, valid_loader
        best_valid_corrcoef['avg'] = np.mean(
            [best_valid_corrcoef[emotion] for emotion in emotions])
        best_test_corrcoef['avg'] = np.mean(
            [best_test_corrcoef[emotion] for emotion in emotions])
        # self.logger.i('\n'+str(best_valid_corrcoef), True, True)
        # self.logger.i('\n'+str(best_test_corrcoef), True, True)
        return best_valid_corrcoef, best_test_corrcoef

    def _train(self,
               model,
               train_loader,
               valid_loader,
               test_loader,
               identity=None):
        if identity is None:
            identity = 'Net' + str(self.counter)
            self.counter += 1
        if self.use_tensorboard:
            from tensorboardX import SummaryWriter
            self.writer = SummaryWriter(identity + '_logs')
        self.logger.i('Start training %s...' % (identity), True)
        try:
            total_batch_per_epoch = len(train_loader)
            loss_history = deque(maxlen=self.early_stopping_history_len)
            best_corrcoef = -1.
            last_test_corrcoef = -1.
            # early_stopping_violate_counter = 0
            epoch_index = 0
            for epoch_index in range(self.max_epoch):
                losses = 0.
                # acc = 0.
                counter = 0
                self.logger.i(
                    '[ %d / %d ] epoch:' % (epoch_index + 1, self.max_epoch),
                    True)
                # Training
                model.train()
                for batch_index, entry in enumerate(train_loader):
                    if self.collate_fn is not None:
                        data, label = self.collate_fn(entry)
                    else:
                        data, label = entry
                    data = T.autograd.Variable(data)
                    label = T.autograd.Variable(label)
                    if self.use_cuda:
                        data = data.cuda()
                        label = label.cuda()
                    output, predicted = model(data)
                    # acc += (label.squeeze() == predicted).float().mean().data * data.size(0)
                    loss = model.loss_fn(output, label.view(-1))
                    model.optimizer.zero_grad()
                    loss.backward()
                    T.nn.utils.clip_grad_norm(model.parameters(), .25)
                    model.optimizer.step()
                    losses += loss.data.cpu()[0] * data.size(0)
                    counter += data.size(0)
                    progress = min(
                        (batch_index + 1) / total_batch_per_epoch * 20., 20.)
                    self.logger.d('[%s] (%3.f%%) loss: %.4f, ' %
                                  ('>' * int(progress) + '-' *
                                   (20 - int(progress)), progress * 5.,
                                   losses / counter))
                mean_loss = losses / counter
                valid_losses = 0.
                valid_counter = 0
                # valid_acc = 0.
                # Validation
                model.eval()
                valid_prediction = []
                valid_labels = []
                for entry in valid_loader:
                    if self.collate_fn is not None:
                        data, label = self.collate_fn(entry)
                    else:
                        data, label = entry
                    valid_labels += list(label.view(-1))
                    data = T.autograd.Variable(data)
                    label = T.autograd.Variable(label)
                    if self.use_cuda:
                        data = data.cuda()
                        label = label.cuda()
                    output, predicted = model(data)
                    valid_losses += model.loss_fn(
                        output, label.view(-1)).data.cpu()[0] * data.size(0)
                    valid_prediction += list(predicted.view(-1).data.tolist())
                    # valid_acc += (label.squeeze() == predicted).float().mean().data * data.size(0)
                    valid_counter += data.size(0)
                mean_val_loss = valid_losses / valid_counter
                # mean_val_acc = valid_acc/valid_counter
                corrcoef = np.corrcoef(valid_prediction, valid_labels)[0, 1]
                self.logger.d(' -- val_loss: %.4f, corrcoef: %.4f' %
                              (mean_val_loss, corrcoef),
                              reset_cursor=False)
                # Log with tensorboard
                if self.use_tensorboard:
                    self.writer.add_scalar('train_loss', mean_loss,
                                           epoch_index)
                    # self.writer.add_scalar('train_acc', acc / counter, epoch_index)
                    self.writer.add_scalar('val_loss', mean_val_loss,
                                           epoch_index)
                    # self.writer.add_scalar('val_acc', mean_val_acc, epoch_index)
                    self.writer.add_scalar('val_corrcoef', corrcoef,
                                           epoch_index)
                loss_history.append(mean_val_loss)
                # # Early stopping
                # if mean_val_loss > np.mean(loss_history):
                #   early_stopping_violate_counter += 1
                #   if early_stopping_violate_counter >= self.early_stopping_patience:
                #     self.logger.i('Early stopping...', True)
                #     break
                # else:
                #   early_stopping_violate_counter = 0
                # Save best model
                if corrcoef > best_corrcoef:
                    best_corrcoef = corrcoef
                    # last_test_corrcoef = self._test(model, test_loader)
                    # self.logger.d(' -- test_corrcoef: %.4f'%(last_test_corrcoef),
                    #               reset_cursor=False)
                    if self.save_best_model:
                        self._save(model, epoch_index, loss_history,
                                   best_corrcoef, identity)
                self.logger.d('', True, False)
        except KeyboardInterrupt:
            self.logger.i('\n\nInterrupted', True)
        if self.use_tensorboard:
            self.writer.close()
        self.logger.i('Finish', True)
        return best_corrcoef, last_test_corrcoef

    def _test(self, model, test_loader):
        model.eval()
        test_prediction = []
        test_labels = []
        for entry in test_loader:
            if self.collate_fn is not None:
                data, label = self.collate_fn(entry)
            else:
                data, label = entry
            test_labels += list(label.view(-1))
            data = T.autograd.Variable(data)
            label = T.autograd.Variable(label)
            if self.use_cuda:
                data = data.cuda()
                label = label.cuda()
            _, predicted = model(data)
            test_prediction += list(predicted.view(-1).data.tolist())
        return np.corrcoef(test_prediction, test_labels)[0, 1]

    def _save(self, model, global_step, loss_history, best_corrcoef, identity):
        T.save(
            {
                'epoch': global_step + 1,
                'state_dict': model.state_dict(),
                'loss_history': loss_history,
                'best_corrcoef': best_corrcoef,
                'optimizer': model.optimizer.state_dict()
            }, identity + '_best')
    class Data(Dataset):
        '''Data loading'''
        def __init__(self,
                     filename='twitter_sentiment.csv.gz',
                     window_size=2,
                     for_embedding=False,
                     logger=None,
                     wordlist_file=None):
            self.window_size = window_size
            self.for_embedding = for_embedding
            if logger is None:
                self.logger = Logger(1)
            else:
                self.logger = logger
            self.logger.i('Initializing Loader....')

            x = []  # Input
            self.x = []
            self.y_ = []  # Ground truth
            self.context_vec = []
            self.target_word = []
            self.max_sentence_len = 0
            word_set = set()
            word_counter = Counter()

            # Read labelled data from file
            with gzip.open(filename, 'rt') as dfile:
                lines = dfile.readlines()[1:]
                num_line = len(lines)
                for index, line in enumerate(lines):
                    _, sentiment, sentence = line.split('\t')
                    words = self._clean_str(sentence).split()
                    self.max_sentence_len = np.max(
                        [self.max_sentence_len,
                         len(words)])
                    word_set = word_set.union(words)
                    word_counter += Counter(words)
                    x.append(words)
                    self.y_.append(
                        T.LongTensor([1 if sentiment == 'pos' else 0]))
                    self.logger.d('Loader: Read %6d / %6d line' %
                                  (index + 1, num_line))

            # Build word dictionary
            filter_words = [
                key for key, count in dict(word_counter).items() if count > 3
            ]
            self.word_dict = {
                word: index + 1
                for index, word in enumerate(filter_words)
            }
            # self.word_dict = {word: index+1 for index, word in enumerate(dict(word_counter))}
            self.word_dict['<unk>'] = 0
            self.word_counter = word_counter
            self.word_count = len(self.word_dict)

            if for_embedding:
                for word_seq in x:
                    words, target = self._to_context_vec(word_seq)
                    self.context_vec.extend(words)
                    self.target_word.extend(target)
            else:
                for word_seq in x:
                    self.x.append([self._to_index(word) for word in word_seq])
                del x
            self.len = len(self.x)
            if wordlist_file is not None:
                with open(wordlist_file, 'w+') as wlfile:
                    for key, _ in sorted(self.word_dict.items(),
                                         key=lambda x: x[1]):
                        wlfile.write(key + '\n')
            self.logger.i('Loader initialized', True)
            self.logger.i('Word Count: %d' % (self.word_count), True)
            self.logger.i(
                'Number of unknown words: %d' %
                (len(self.word_counter) - len(self.word_dict) + 1), True)

        def _to_index(self, word):
            if word in self.word_dict.keys():
                return self.word_dict[word]
            else:
                return self.word_dict['<unk>']

        def _to_word(self, index):
            # dict views are not indexable in Python 3, so materialize them first
            values = list(self.word_dict.values())
            return list(self.word_dict.keys())[values.index(index)]

        def __getitem__(self, index):
            if self.for_embedding:
                return self.context_vec[index], self.target_word[index]
            else:
                return self.x[index], self.y_[index]

        def _get_max_sentence_len(self):
            return self.max_sentence_len

        def __len__(self):
            if self.for_embedding:
                return len(self.context_vec)
            else:
                return self.len

        def _clean_str(self, string):
            '''Remove noise from the input string'''
            string = re.sub(r'&[a-zA-Z];', ' ', string)
            string = re.sub(r'[^A-Za-z0-9,!?\(\)\.\'\`]', ' ', string)
            string = re.sub(r'[0-9]+', ' <num> ', string)
            string = re.sub(r'( \' ?)|( ?\' )', ' ', string)
            string = re.sub(r'(\'s|\'ve|n\'t|\'re|\'d|\'ll|\.|,|!|\?|\(|\))',
                            r' \1 ', string)
            string = re.sub(r'\s{2,}', ' ', string)
            return string.strip().lower()

        def _to_context_vec(self, word_seq):
            '''Convert a sentence to context vectors'''
            input_words = []
            target_word = []
            buffer_len = self.window_size * 2 + 1
            window = deque(maxlen=buffer_len)
            for word in word_seq:
                window.append(word)
                if len(window) == buffer_len:
                    tmp_window = [
                        self._to_index(w) for w in list(window.copy())
                    ]
                    target = tmp_window[self.window_size]
                    del tmp_window[self.window_size]
                    input_words.append(T.LongTensor(tmp_window))
                    target_word.append(T.LongTensor([target]))
            return input_words, target_word

        def _get_nce_weight(self):
            '''Get weights for generating noise samples'''
            power = .75
            denominator = sum(np.power(list(self.word_counter.values()), power))
            freq_vec = [0.]
            for word, count in self.word_counter.items():
                if word in self.word_dict:
                    freq_vec.append(math.pow(count, power) / denominator)
            freq_vec[0] = np.mean(freq_vec)
            exp_x = np.exp(freq_vec - np.max(freq_vec))
            return exp_x / exp_x.sum()
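_get_nce_weight above derives a noise distribution for negative sampling by raising unigram counts to the 3/4 power and normalizing. A small stand-alone sketch of that weighting, with hypothetical counts and only numpy assumed:

import numpy as np
from collections import Counter

# hypothetical word counts
word_counter = Counter({'the': 120, 'movie': 40, 'great': 15, 'boring': 5})

power = .75
counts = np.array(list(word_counter.values()), dtype=float)
weights = np.power(counts, power)
noise_dist = weights / weights.sum()   # frequent words lose relative weight compared to raw counts

for word, p in zip(word_counter, noise_dist):
    print('%-8s %.3f' % (word, p))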
class CBOW(T.nn.Module):
  def __init__(self, embedding_len, lr=1., momentum=.9, batch_size=50,
               window_size=2, epoch=1, use_cuda=False, embedding_path=None,
               verbose=1, tensorboard=False, wordlist_path=None, log_folder='runs',
               use_nce=False):
    super(CBOW, self).__init__()
    self.embedding_len = embedding_len
    self.lr = lr
    self.momentum = momentum
    self.epoch = epoch
    self.use_cuda = use_cuda
    self.embedding_path = embedding_path
    self.logger = Logger(verbose)
    self.tensorboard = tensorboard
    self.use_nce = use_nce
    if self.tensorboard:
      from tensorboardX import SummaryWriter
      self.writer = SummaryWriter(log_folder)
    self.loader = Loader(for_embedding=True, window_size=window_size,
                         batch_size=batch_size, logger=self.logger,
                         wordlist_file=wordlist_path)
    self.vocab_size = self.loader.get_vocab_size()
    self._build_model()
  def __del__(self):
    if self.tensorboard:
      self.writer.close()
  def _build_model(self):
    def init_weight(m):
      m.weight.data.normal_().mul_(T.FloatTensor([2/m.weight.data.size()[0]]).sqrt_())
    self.embeddings = T.nn.Embedding(self.vocab_size, self.embedding_len)
    if self.embedding_path is None:
      self.embeddings.apply(init_weight)
    elif Path.exists(self.embedding_path):
      self.embeddings.weight.data.copy_(T.from_numpy(np.loadtxt(self.embedding_path)))
    if self.use_nce:
      self.loss_fn = NCELoss(self.vocab_size, self.embedding_len, self.use_cuda,
                             self.loader.get_nce_weight())
    else:
      self.fc = T.nn.Linear(self.embedding_len, self.vocab_size)
      self.fc.apply(init_weight)
      self.loss_fn = T.nn.CrossEntropyLoss()
    if self.use_cuda:
      self.cuda()
    if self.momentum > 0.:
      self.optimizer = T.optim.SGD(self.parameters(), lr=self.lr,
                                   momentum=self.momentum, nesterov=True)
    else:
      self.optimizer = T.optim.SGD(self.parameters(), lr=self.lr,
                                   momentum=0., nesterov=False)
    # self.optimizer = T.optim.Adam(self.parameters(), lr=.01)
  def forward(self, inputs):
    embeddings = self.embeddings(inputs)
    if self.use_nce:
      return embeddings
    else:
      sum_vector = embeddings.mean(dim=1)
      output = self.fc(sum_vector)
      # output = T.nn.functional.softmax(output, dim=1)
      _, max_indice = T.max(output, dim=1)
      return output, max_indice
  def fit(self):
    self.logger.i('Start training network...', True)
    try:
      total_batch_per_epoch = len(self.loader)
      loss_history = deque(maxlen=50)
      epoch_index = 0
      for epoch_index in range(self.epoch):
        losses = 0.
        acc = 0.
        counter = 0
        self.logger.i('[ %d / %d ] epoch:'%(epoch_index + 1, self.epoch), True)
        for batch_index, (context, target) in enumerate(self.loader):
          context = T.autograd.Variable(context)
          target = T.autograd.Variable(target)
          if self.use_cuda:
            context, target = context.cuda(), target.cuda()
          if self.use_nce:
            output = self(context)
            acc = math.nan
            loss = self.loss_fn(output, target, 5)
          else:
            output, predicted = self(context)
            acc += (target.squeeze() == predicted).float().mean().data
            loss = self.loss_fn(output, target.view(-1))
          self.optimizer.zero_grad()
          loss.backward()
          self.optimizer.step()
          losses += loss.data.cpu()[0]
          counter += 1
          progress = min((batch_index + 1) / total_batch_per_epoch * 20., 20.)
          self.logger.d('[%s] (%3.f%%) loss: %.4f, acc: %.4f'%
                        ('>'*int(progress)+'-'*(20-int(progress)), progress * 5.,
                         losses / counter, acc / counter))
        mean_loss = losses / counter
        if self.tensorboard:
          self.writer.add_scalar('train_loss', mean_loss, epoch_index)
          self.writer.add_scalar('train_acc', acc / counter, epoch_index)
        loss_history.append(mean_loss)
        if mean_loss > np.mean(loss_history):
          self.logger.i('Early stopping...', True)
          break
        self.logger.d('', True, False)
    except KeyboardInterrupt:
      self.logger.i('\n\nInterrupted', True)
    self.logger.i('Saving word embeddings...')
    self._save_embeddings(epoch_index+1)
    self.logger.i('Word embeddings saved', True)
    self.logger.i('Finish', True)
  def _save_embeddings(self, global_step=0):
    embeds = self.embeddings.weight.data.cpu().numpy()
    np.savetxt(self.embedding_path, embeds)
    if self.tensorboard:
      self.writer.add_embedding(self.embeddings.weight.data,
                                [key for key, value in \
                                 sorted(self.loader.dataset.word_dict.items(),
                                 key=lambda x: x[1])],
                                global_step=global_step)
  def get_word_embedding(self, word):
    return self.embeddings.weight.data[self.loader.to_index(word)]
  def get_similarity(self, w1, w2):
    w1, w2 = self.get_word_embedding(w1), self.get_word_embedding(w2)
    w1, w2 = T.nn.functional.normalize(w1, dim=0), T.nn.functional.normalize(w2, dim=0)
    return (w1 * w2).sum(dim=0)
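get_similarity above computes the cosine similarity of two embedding vectors by normalizing them and taking a dot product. A tiny stand-alone check of the same computation; the vectors here are random placeholders rather than trained embeddings:

import torch as T

w1, w2 = T.randn(100), T.randn(100)            # placeholder embedding vectors
w1 = T.nn.functional.normalize(w1, dim=0)      # scale to unit length
w2 = T.nn.functional.normalize(w2, dim=0)
similarity = (w1 * w2).sum(dim=0)              # dot product of unit vectors = cosine
print(float(similarity))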
    def __init__(self,
                 raw_data,
                 train_valid_ratio=.2,
                 do_cleaning=True,
                 **args):
        if '_empty' not in args or not args['_empty']:
            self.label = []
            self.data = []
            self.valid_data = []
            self.valid_label = []

            sentiments = set()
            self.word_counter = Counter()
            self.max_len = 0

            Log = Logger()
            Log.i('Start loading dataset...')

            # Load lists if saved previously
            has_lists = Path.exists('sentiment_list') and Path.exists(
                'word_list')
            if has_lists:
                Log.d('Sentiment and word list found!')
                with open('sentiment_list', 'r') as sf:
                    self.sentiments = sf.read().strip().split('\n')
                tmp_dict = {}
                with open('word_list', 'r') as wf:
                    for line in wf.readlines():
                        word, freq = line.strip().split()
                        tmp_dict[word] = int(freq)
                    self.word_counter = Counter(tmp_dict)
                    self.word_list = ['<pad>', '<unk>'] + \
                                     [key for key, value in self.word_counter.items() if value >= 3]
                    del tmp_dict
                if len(self.sentiments) == 0 or len(self.word_list) == 0:
                    raise AssertionError(
                        'either the sentiment list or the word list is empty')
                self.sentiments = {
                    word: index
                    for index, word in enumerate(self.sentiments)
                }
                self.word_list = {
                    word: index
                    for index, word in enumerate(self.word_list)
                }

            if isinstance(raw_data, str):
                raw_data = raw_data.strip().split('\n')[1:]
            raw_data, valid_raw_data = train_test_split(
                raw_data, test_size=train_valid_ratio, random_state=0)
            data_len = len(raw_data)
            valid_data_len = len(valid_raw_data)
            # Add data and label to array
            for index, line in enumerate(raw_data):
                cols = line.split(',', 3)
                if do_cleaning:
                    words = _clean_str(cols[3].strip('"')).split()
                else:
                    words = cols[3].strip('"').split()
                self.max_len = max(self.max_len, len(words))
                # tweet_id and author columns are ignored
                if not has_lists:
                    sentiments.add(cols[1])
                    self.label.append([cols[1]])
                    self.word_counter += Counter(words)
                    self.data.append(words)
                else:
                    self.label.append([self.sentiments[cols[1]]])
                    self.data.append([
                        self.word_list[word]
                        if word in self.word_list else self.word_list['<unk>']
                        for word in words
                    ])
                Log.i('Loading %6d / %6d' % (index, data_len + valid_data_len))

            for index, line in enumerate(valid_raw_data):
                cols = line.split(',', 3)
                if do_cleaning:
                    words = _clean_str(cols[3].strip('"')).split()
                else:
                    words = cols[3].strip('"').split()
                self.max_len = max(self.max_len, len(words))
                # tweet_id and author columns are ignored
                if not has_lists:
                    self.valid_label.append([cols[1]])
                    self.valid_data.append(words)
                else:
                    self.valid_label.append([self.sentiments[cols[1]]])
                    self.valid_data.append([
                        self.word_list[word]
                        if word in self.word_list else self.word_list['<unk>']
                        for word in words
                    ])
                Log.i('Loading %6d / %6d' %
                      (index + data_len, data_len + valid_data_len))

            Log.i('Finish loading', True)

            Log.i('Start preprocessing...')

            if not has_lists:
                # Denoise by setting minimum freq
                self.word_list = ['<pad>', '<unk>'] + \
                                 [key for key, value in self.word_counter.items() if value >= 3]

                # Save sentiment and word list
                self.sentiments = list(sentiments)
                if len(self.sentiments) > 0 and len(self.word_list) > 0:
                    with open('sentiment_list', 'w+') as sf:
                        for sentiment in self.sentiments:
                            sf.write(sentiment + '\n')
                    with open('word_list', 'w+') as wf:
                        for word, freq in dict(self.word_counter).items():
                            wf.write(word + ' ' + str(freq) + '\n')
                else:
                    raise AssertionError(
                        'either the sentiment list or the word list is empty')

                # Convert to dict for fast searching
                self.sentiments = {
                    word: index
                    for index, word in enumerate(self.sentiments)
                }
                self.word_list = {
                    word: index
                    for index, word in enumerate(self.word_list)
                }

                # Convert text to index
                for index, [data_ent,
                            label_ent] in enumerate(zip(self.data,
                                                        self.label)):
                    # <unk> (index 0) if word not found
                    self.data[index] = [
                        self.word_list[word]
                        if word in self.word_list else self.word_list['<unk>']
                        for word in data_ent
                    ]
                    self.label[index] = [
                        self.sentiments[word] for word in label_ent
                    ]

                # Convert text to index
                for index, [data_ent, label_ent] in enumerate(
                        zip(self.valid_data, self.valid_label)):
                    # <unk> (index 0) if word not found
                    self.valid_data[index] = [
                        self.word_list[word]
                        if word in self.word_list else self.word_list['<unk>']
                        for word in data_ent
                    ]
                    self.valid_label[index] = [
                        self.sentiments[word] for word in label_ent
                    ]

            data_len_list = [len(line) for line in self.data]
            self.data_len_mean = np.mean(data_len_list)
            self.data_len_std = np.std(data_len_list)
            self.data = [
                entry + [0] * (self.max_len - len(entry))
                for entry in self.data
            ]
            self.valid_data = [
                entry + [0] * (self.max_len - len(entry))
                for entry in self.valid_data
            ]

            Log.i('Finish preprocessing', True)
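The constructor above finishes by padding every index sequence with zeros (the <pad> index) up to the longest sentence so that batches can be stacked into a fixed-size tensor. A minimal sketch of that padding step with toy data:

import torch as T

# toy index sequences of unequal length; index 0 is assumed to be <pad>
data = [[5, 8, 2], [7, 3], [9, 4, 6, 1]]
max_len = max(len(entry) for entry in data)

padded = [entry + [0] * (max_len - len(entry)) for entry in data]
batch = T.LongTensor(padded)        # shape (3, 4), ready for an embedding layer
print(batch)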
class DataManager:
    def __init__(self,
                 batch_size=50,
                 max_seq=7,
                 logger=None,
                 is_many_to_one=False,
                 train_valid_ratio=.2,
                 is_test=False,
                 data_split_mode='window',
                 data_file_count=-1,
                 pretrained_file=None):
        self.batch_size = batch_size
        self.max_seq = max_seq
        if logger is None:
            self.logger = Logger(0)
        else:
            self.logger = logger
        self.is_test = is_test
        self.data_split_mode = data_split_mode
        if self.data_split_mode not in ['window', 'sentence']:
            raise AssertionError('unknown split mode')
        self.data_file_count = data_file_count
        self.pretrained_file = pretrained_file

        # Reserve for <sos>
        self.max_seq += 1

        # mkdir data folder
        if not os.path.exists('data'):
            os.mkdir('data')

        # File path
        self.file_path_subfix = '_M2O' if is_many_to_one else '_M2M'
        self.file_path_prefix = 'Data/Test' if self.is_test else 'Data/Train'

        self.data = ''
        self.dataset = None
        self.word_list = None
        self.train_dataset, self.valid_dataset = None, None
        self.word_counter = None

        self.tensors = None
        if self.pretrained_file is not None:
            self.tensors = T.Tensor(self._load_from_pretrain())
            self.word_index_dict = {w: i for i, w in enumerate(self.word_list)}
            is_wordlist_loaded = True
        else:
            is_wordlist_loaded = self._load_wordlist()

        if self.is_test:
            if not self._load_dataset():
                self._read_files()
                self.dataset = Dataset(self.data, self.word_index_dict,
                                       is_many_to_one, self.max_seq - 1)
                pickle.dump(
                    self.dataset,
                    open('data/test_data' + self.file_path_subfix, 'wb+'))
        else:
            if not self._load_dataset():
                # Load previous split data
                status, train_data, valid_data = self._load_data()
                if not status:  # No split data found
                    self._read_files()
                    train_data, valid_data = train_test_split(
                        self.data, test_size=train_valid_ratio, random_state=0)
                    pickle.dump(train_data, open('data/train_data', 'wb+'))
                    pickle.dump(valid_data, open('data/valid_data', 'wb+'))
            if (not is_wordlist_loaded or not os.path.exists('data/word_counter')) \
                  and self.pretrained_file is None:
                # Generate word list
                self.logger.i(
                    'Start counting words because word list or word counter not found...'
                )
                self.word_counter = Counter()
                flatten_train_data = [
                    x for sublist in train_data for x in sublist
                ]
                self.word_counter += Counter(flatten_train_data)
                # Only keep words in training data
                del self.word_counter['<sos>']
                # Set min freq
                # filtered_word_list = [k for k, v in self.word_counter.items() if v >= 3]
                # self.word_list = ['<pad>', '<sos>', '<unk>']+filtered_word_list
                self.word_list = ['<pad>', '<sos>', '<unk>'] + list(
                    self.word_counter.keys())
                self.word_index_dict = {
                    w: i
                    for i, w in enumerate(self.word_list)
                }
                # Save word list
                pickle.dump(self.word_list, open('data/word_list', 'wb+'))

                # Count words for statistics
                flatten_valid_data = [
                    x for sublist in valid_data for x in sublist
                ]
                self.word_counter += Counter(flatten_valid_data)
                # Update unknown words for statistics
                self.word_counter += Counter({'<unk>': 0})
                self.logger.i('Getting unknown word list...')
                unk_word_list = list(
                    filter(lambda p: p[0] not in self.word_list,
                           self.word_counter.items()))
                self.logger.i(
                    'Start deleting words that appear in the validation set '
                    'but not in the training set...'
                )
                unk_word_list_len = len(unk_word_list)
                for index, [k, v] in enumerate(unk_word_list):
                    del self.word_counter[k]
                    self.word_counter['<unk>'] += v
                    self.logger.i('Deleting... %5d / %5d' %
                                  (index + 1, unk_word_list_len))
                del self.word_counter['<sos>']
                # Save word counter
                pickle.dump(self.word_counter, open('data/word_counter',
                                                    'wb+'))

                self.logger.i('Finish building word list and word counter')

            if self.train_dataset is None and self.valid_dataset is None:
                # Save training and validation dataset
                self.train_dataset = Dataset(train_data, self.word_index_dict,
                                             is_many_to_one, self.max_seq - 1)
                self.valid_dataset = Dataset(valid_data, self.word_index_dict,
                                             is_many_to_one, self.max_seq - 1)
                pickle.dump(
                    self.train_dataset,
                    open('data/train_data' + self.file_path_subfix, 'wb+'))
                pickle.dump(
                    self.valid_dataset,
                    open('data/valid_data' + self.file_path_subfix, 'wb+'))

            self.logger.i('Finish Generating training set and validation set')

    def _load_dataset(self):
        if self.is_test and os.path.exists('data/test_data' +
                                           self.file_path_subfix):
            self.dataset = pickle.load(
                open('data/test_data' + self.file_path_subfix, 'rb'))
        elif not self.is_test and os.path.exists('data/train_data'+self.file_path_subfix) \
             and os.path.exists('data/valid_data'+self.file_path_subfix):
            self.train_dataset = pickle.load(
                open('data/train_data' + self.file_path_subfix, 'rb'))
            self.valid_dataset = pickle.load(
                open('data/valid_data' + self.file_path_subfix, 'rb'))
        else:
            return False
        self.logger.i('Dataset found!')
        return True

    def _load_wordlist(self):
        if os.path.exists('data/word_list'):
            self.logger.i('Word list found!')
            self.word_list = pickle.load(open('data/word_list', 'rb'))
            self.word_index_dict = {w: i for i, w in enumerate(self.word_list)}
        elif self.is_test:
            raise AssertionError('word_list not found')
        else:
            return False
        return True

    def _load_data(self):
        if os.path.exists('data/train_data') and os.path.exists(
                'data/valid_data'):
            train_data = pickle.load(open('data/train_data', 'rb'))
            valid_data = pickle.load(open('data/valid_data', 'rb'))
            self.logger.i('Training dataset and validation dataset found!')
            return True, train_data, valid_data
        return False, None, None

    def _read_files(self):
        if os.path.exists('hw4_dataset.zip'):
            with ZipFile('hw4_dataset.zip', 'r') as zf:
                if self.data_file_count < 0:
                    file_count = len(zf.filelist)
                else:
                    file_count = self.data_file_count
                self.logger.i('Start loading dataset...')
                valid_file_counter = 0
                file_list = []
                for f in zf.filelist:
                    if f.file_size > 0:
                        if f.filename.startswith(self.file_path_prefix):
                            text = zf.read(f.filename).decode('utf-8').lower()
                            text = text[text.rindex('*end*') +
                                        len('*end*'):text.rindex('end')]
                            self.data += clean_str(text) + ' \n '
                            valid_file_counter += 1
                            file_list.append(f.filename)
                            self.logger.i('Loading %3d docs' %
                                          (valid_file_counter))
                            if valid_file_counter >= file_count:
                                break
                with open('files_used', 'w+') as fu:
                    for file_name in file_list:
                        fu.write(file_name + '\n')
            if self.data_split_mode == 'window':
                tmp_data = self.data
                self.data = []
                window = deque(maxlen=self.max_seq)
                window.append('<sos>')
                for word in tmp_data.strip().split(' '):
                    if word == '\n':
                        word = '<sos>'
                    window.append(word)
                    if len(window) == self.max_seq:
                        self.data.append(window.copy())
            else:
                self.data = [['<sos>'] + entry.split(' ')
                             for entry in self.data.split('\n')]

                # Limit sentence length
                def splitter(d):
                    for i in range(math.ceil(len(d) / self.max_seq)):
                        yield d[self.max_seq * i:self.max_seq * (i + 1)]

                for index, entry in enumerate(self.data):
                    if len(entry) > self.max_seq:
                        splits = list(splitter(entry))
                        self.data[index] = splits[0]
                        self.data.extend(splits[1:])
            self.data = list(filter(lambda x: len(x) > 2, self.data))
        else:
            raise AssertionError('hw4_dataset.zip not found')

    def _load_from_pretrain(self):
        self.logger.i('Loading pre-trained embeddings...')
        if not (os.path.exists('data/pre_trained_word_list') \
                and os.path.exists('data/pre_trained_embeddings')):
            self.word_list = ['<pad>', '<sos>', '<unk>', '<num>']
            tensors = [[], [], [], []]
            special_word_dict = {
                '<pad>': 0,
                '<sos>': 1,
                '<unk>': 2,
                '<unknown>': 2,
                '<num>': 3,
                '<number>': 3
            }
            is_digit = re.compile(r'^[0-9e\.\-\+]+$')
            is_in_limited_char_set = re.compile(
                r'^[A-Za-z0-9,!?\(\)\.\'\`\"\-]+$')
            with open(self.pretrained_file, 'r') as pt:
                lines = pt.readlines()
                num_line = len(lines)
                for index, line in enumerate(lines):
                    word, *embedding = line.strip().split()
                    embedding = [float(value) for value in embedding]
                    if len(embedding) < 100:  # may be caused by emojis / rare words
                        continue
                    if word in special_word_dict.keys():
                        tensors[special_word_dict[word]] = embedding
                    elif (is_digit.search(word) is not None or \
                          is_in_limited_char_set.search(word) is not None) \
                            and not word.startswith('<'):
                        self.word_list.append(word)
                        tensors.append(embedding)
                    self.logger.d(
                        'Loading pre-trained embeddings %6d / %6d...' %
                        (index, num_line))
            # Check if any special symbol has empty embedding
            for i in range(4):
                if len(tensors[i]) == 0:
                    tensors[i] = [0.] * len(tensors[4])
            pickle.dump(self.word_list,
                        open('data/pre_trained_word_list', 'wb+'))
            pickle.dump(tensors, open('data/pre_trained_embeddings', 'wb+'))
        else:
            self.logger.i('Pre-trained word list and embeddings found!')
            self.word_list = pickle.load(
                open('data/pre_trained_word_list', 'rb'))
            tensors = pickle.load(open('data/pre_trained_embeddings', 'rb'))
        return tensors

    def pretrained_embeddings(self):
        return self.tensors

    def test_loader(self):
        return Data.DataLoader(self.dataset, self.batch_size, False)

    def train_loader(self):
        return Data.DataLoader(self.train_dataset, self.batch_size, True)

    def valid_loader(self):
        return Data.DataLoader(self.valid_dataset, self.batch_size, False)
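In 'window' mode, _read_files slides a fixed-length deque over the token stream and emits one training sequence per step, substituting <sos> at line breaks. A small stand-alone illustration of that windowing with toy tokens:

from collections import deque

tokens = 'the cat sat \n the dog ran away'.split(' ')
max_seq = 4

data = []
window = deque(maxlen=max_seq)
window.append('<sos>')
for word in tokens:
    if word == '\n':
        word = '<sos>'      # sentence boundary marker
    window.append(word)
    if len(window) == max_seq:
        data.append(list(window))   # one fixed-length training sequence per step

for seq in data:
    print(seq)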
class SentimentClassification(T.nn.Module):
    def __init__(self,
                 embedding_path,
                 lr=.01,
                 momentum=.9,
                 batch_size=50,
                 epoch=1000,
                 use_cuda=False,
                 verbose=1,
                 tensorboard=False):
        super(SentimentClassification, self).__init__()
        self.embedding_path = embedding_path
        self.lr = lr
        self.momentum = momentum
        self.batch_size = batch_size
        self.epoch = epoch
        self.use_cuda = use_cuda
        self.logger = Logger(verbose)
        self.tensorboard = tensorboard
        self._build_model()
        self.loader = Loader(for_embedding=False, logger=self.logger)
        if self.tensorboard:
            from tensorboardX import SummaryWriter
            self.writer = SummaryWriter('logs')

    def __del__(self):
        if self.tensorboard:
            self.writer.close()

    def _build_model(self):
        def init_weight(m):
            m.weight.data.normal_().mul_(
                T.FloatTensor([2 / m.weight.data.size()[0]]).sqrt_())

        embeddings = np.loadtxt(self.embedding_path)
        vocab_size, embedding_len = np.shape(embeddings)
        self.embedding = T.nn.Embedding(vocab_size, embedding_len)
        self.embedding.weight.data.copy_(T.from_numpy(embeddings))
        self.embedding.weight.requires_grad = False
        self.fc = T.nn.Linear(embedding_len, 2)
        self.fc.apply(init_weight)
        self.loss_fn = T.nn.CrossEntropyLoss()
        self.optimizer = T.optim.SGD(filter(lambda p: p.requires_grad,
                                            self.parameters()),
                                     lr=self.lr,
                                     momentum=self.momentum)

    def forward(self, inputs):
        embeddings = [self.embedding(doc).mean(dim=0) for doc in inputs]
        embeddings = T.stack(embeddings)
        output = self.fc(embeddings)
        # output = T.nn.functional.softmax(output, dim=1)
        _, max_indice = T.max(output, dim=1)
        return output, max_indice

    def fit(self):
        self.logger.i('Start training network...', True)
        try:
            total_batch_per_epoch = len(self.loader)
            loss_history = deque(maxlen=50)
            epoch_index = 0
            for epoch_index in range(self.epoch):
                losses = 0.
                acc = 0.
                counter = 0
                self.logger.i(
                    '[ %d / %d ] epoch:' % (epoch_index + 1, self.epoch), True)
                for batch_index, (docs, sentiment) in enumerate(self.loader):
                    docs = [T.autograd.Variable(doc) for doc in docs]
                    sentiment = T.autograd.Variable(sentiment)
                    if self.use_cuda:
                        docs = [doc.cuda() for doc in docs]
                        sentiment = sentiment.cuda()
                    output, predicted = self(docs)
                    acc += (
                        sentiment.squeeze() == predicted).float().mean().data
                    loss = self.loss_fn(output, sentiment.view(-1))
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                    losses += loss.data.cpu()[0]
                    counter += 1
                    progress = min(
                        (batch_index + 1) / total_batch_per_epoch * 20., 20.)
                    self.logger.d('[%s] (%3.f%%) loss: %.4f, acc: %.4f' %
                                  ('>' * int(progress) + '-' *
                                   (20 - int(progress)), progress * 5.,
                                   losses / counter, acc / counter))
                mean_loss = losses / counter
                if self.tensorboard:
                    self.writer.add_scalar('train_loss', mean_loss,
                                           epoch_index)
                    self.writer.add_scalar('train_acc', acc / counter,
                                           epoch_index)
                loss_history.append(mean_loss)
                if mean_loss > np.mean(loss_history):
                    self.logger.i('Early stopping...', True)
                    break
                self.logger.d('', True, False)
        except KeyboardInterrupt:
            self.logger.i('\n\nInterrupted', True)
        self.logger.i('Finish', True)
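SentimentClassification freezes a pre-trained embedding table, averages the word vectors of each document, and feeds the pooled vector through a single linear layer. A minimal forward-pass sketch with random weights; the vocabulary size and embedding length are arbitrary and no pre-trained embedding file is needed:

import torch as T

vocab_size, embedding_len = 1000, 50
embedding = T.nn.Embedding(vocab_size, embedding_len)
embedding.weight.requires_grad = False          # keep the embeddings fixed
fc = T.nn.Linear(embedding_len, 2)              # binary sentiment head

# two documents of different length, given as word indices
docs = [T.LongTensor([1, 7, 42, 9]), T.LongTensor([3, 5])]
pooled = T.stack([embedding(doc).mean(dim=0) for doc in docs])
output = fc(pooled)                             # unnormalized class scores
_, predicted = T.max(output, dim=1)
print(predicted)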