Example #1
    def __init__(self, args, num_class=2, seed=1111, file_name=''):
        logging.info("Init the dataset...")
        self.seed = seed
        self.args = args
        self.num_class = num_class
        self.max_len = 0  # will be updated when the train and test files are loaded

        self.train_x, self.train_y, self.valid_x, self.valid_y, self.test_x, self.test_y = self.load_data(
            file_name)

        train_max, train_mean, train_min = self.statistic_len(self.train_x)
        valid_max, valid_mean, valid_min = self.statistic_len(self.valid_x)
        self.max_len = max(train_max, valid_max)
        logger("Train data max len:%d, mean len:%d, min len:%d " %
               (train_max, train_mean, train_min))
        logger("Test data max len:%d, mean len:%d, min len:%d " %
               (valid_max, valid_mean, valid_min))
        self.valid_nums = len(self.valid_x)
        self.train_nums = len(self.train_x)
        self.test_nums = 0

        # load the data word dict
        self.word2id, self.word_file = self.get_word_index(
            os.path.join(args.tmp_dir, self.__class__.__name__,
                         file_name + args.word_file),
            exclude_n=self.args.skip_top,
            max_size=self.args.num_words)
        self.word2id_size = len(self.word2id)
        self.train_idx = np.random.permutation(self.train_nums //
                                               self.args.batch_size)
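
The `statistic_len` helper called above is not shown in these examples; a minimal sketch consistent with how it is used here (returning integer max, mean, and min token counts for a list of tokenized samples) might look like this hypothetical reconstruction:

    def statistic_len(self, data):
        # Hypothetical sketch: summarize sequence lengths of tokenized samples.
        lens = [len(sample) for sample in data]
        if not lens:
            return 0, 0, 0
        return max(lens), sum(lens) // len(lens), min(lens)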
Example #2
 def get_word_index(self, path=None):
     if not path:
         # Assumes args.tmp_dir ends with a path separator.
         path = self.args.tmp_dir + self.__class__.__name__ + self.args.word_file
     word2id = dict()
     # The vocabulary file stores one token per line; line order gives the id.
     with open(path, mode='r', encoding='utf-8') as f:
         for line in f:
             word2id.setdefault(line.strip(), len(word2id))
     logger('Word2id size : %d' % len(word2id))
     return word2id
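
A small usage sketch of the format this expects (hypothetical file path and `dataset` instance; the vocabulary file stores one token per line):

# Hypothetical usage: write a tiny vocabulary file and reload it.
with open('/tmp/vocab.txt', mode='w', encoding='utf-8') as f:
    f.write('the\nmovie\ngreat\n')
word2id = dataset.get_word_index('/tmp/vocab.txt')
# word2id == {'the': 0, 'movie': 1, 'great': 2}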
Example #3
 def prepare_dict(self, file_name):
     logger("Prepare the dictionary for the {}...".format(
         self.__class__.__name__))
     word2id = prepare_dictionary(data=self.data_x,
                                  dict_path=file_name,
                                  exclude_n=self.args.skip_top,
                                  max_size=self.args.num_words)
     logger("Word2id size : %d" % len(word2id))
     return word2id
Example #4
 def prepare_dict(self, file_name, exclude_n=10, max_size=10000):
     logger("Prepare the dictionary for the {}...".format(
         self.__class__.__name__))
     word2id = prepare_dictionary(data=self.train_x + self.valid_x,
                                  dict_path=file_name,
                                  exclude_n=exclude_n,
                                  max_size=max_size)
     logger("Word2id size : %d" % len(word2id))
     return word2id
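
Neither this snippet nor Example #3 shows `prepare_dictionary` itself. A minimal sketch consistent with its call signature, assuming it counts tokens, drops the `exclude_n` most frequent, keeps at most `max_size` entries, and writes one token per line to `dict_path` (a hypothetical reconstruction, not the project's actual utility):

from collections import Counter

def prepare_dictionary(data, dict_path, exclude_n=10, max_size=10000):
    # Hypothetical sketch: build a frequency-ranked vocabulary and persist it.
    counts = Counter(token for sample in data for token in sample)
    ranked = [w for w, _ in counts.most_common()]
    kept = ranked[exclude_n:exclude_n + max_size]  # skip top-n, cap the size
    with open(dict_path, mode='w', encoding='utf-8') as f:
        f.write('\n'.join(kept) + '\n')
    return {w: i for i, w in enumerate(kept)}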
Example #5
 def get_word_index(self, path=None, exclude_n=10, max_size=10000):
     if not path:
         # Assumes args.tmp_dir ends with a path separator.
         path = self.args.tmp_dir + self.__class__.__name__ + self.args.word_file
     if os.path.isfile(path) and os.path.getsize(path) > 0:
         word2id = dict()
         # Reload an existing vocabulary: one token per line, id = line order.
         with open(path, mode='r', encoding='utf-8') as f:
             for line in f:
                 word2id.setdefault(line.strip(), len(word2id))
     else:
         word2id = self.prepare_dict(path,
                                     exclude_n=exclude_n,
                                     max_size=max_size)
     logger('Word2id size : %d' % len(word2id))
     return word2id, path
Example #6
 def load_file(self, fpath):
     max_len = 0
     with io.open(fpath, 'r', encoding='utf-8') as f:
         data_x = list()
         data_y = list()
         for line in f.read().splitlines():
             line = line.strip().split(' ')
             if len(line) <= 3:  # skip samples that are too short
                 continue
             data_x.append(line[:-1])      # all fields but the last are tokens
             data_y.append(int(line[-1]))  # the last field is the label
             max_len = max(max_len, len(line) - 1)
     logger("Loaded the data, size: %d, max length: %d" %
            (len(data_x), max_len))
     return data_x, data_y
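
The file format follows from the parsing above: each line is a space-separated token sequence whose last field is the integer label, and lines with three or fewer fields are skipped. A hypothetical round trip:

# Hypothetical usage: write a tiny data file and load it back.
with open('/tmp/train.txt', 'w', encoding='utf-8') as f:
    f.write('this movie was great 1\n')
    f.write('total waste of time 0\n')
data_x, data_y = dataset.load_file('/tmp/train.txt')
# data_x == [['this', 'movie', 'was', 'great'], ['total', 'waste', 'of', 'time']]
# data_y == [1, 0]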
Example #7
def train(args):
    train_dataloader, test_dataloader, model = init_from_scrach(args)
    best_acc = 0.0
    best_epoch = 0
    global_step = 0  # renamed from `iter`, which shadows the builtin
    logger('Begin training...')

    # FIXME: could be modified for your model
    if args.log_dir:
        logger_path = '../logs/log-av%s-%s-model%s-emb%d-id%s' % (
            args.activation, args.dataset, model.__class__.__name__,
            args.embedding_dim, str(datetime.datetime.now()))
        logger('Save log to %s' % logger_path)
        writer = SummaryWriter(log_dir=logger_path)
    for i in range(args.num_epoches):
        loss_sum = 0
        acc_sum = 0.0
        samples_num = 0
        for j, a_data in enumerate(train_dataloader):
            global_step += 1  # recorded for tensorboard

            # forward and loss
            model.optimizer.zero_grad()
            model.zero_grad()
            out, feature = model(
                *a_data
            )  # model should return the output not only predict result.
            loss = model.loss(out, a_data[-1])

            # backward
            loss.backward()

            # grad clip if args.grad_clipping != 0
            if args.grad_clipping != 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.grad_clipping)

            # optimize
            model.optimizer.step()

            # record
            loss_sum += loss.item()
            samples_num += len(a_data[0])
            acc_sum += accuracy(out=out.data.cpu().numpy(), label=a_data[-1])

            if (j + 1) % args.print_every_n == 0:
                logging.info('train: Epoch = %d | iter = %d/%d | ' %
                             (i, j, len(train_dataloader)) +
                             'loss sum = %.2f | accuracy : %.4f' %
                             (loss_sum / (j + 1), acc_sum / samples_num))

                # for tensorboard
                if args.log_dir:
                    writer.add_scalar('loss', loss_sum / (j + 1), global_step)
                    writer.add_scalar('accuracy', acc_sum / samples_num,
                                      global_step)

                    # Use the global step so later epochs do not overwrite
                    # earlier histograms.
                    for name, param in model.named_parameters():
                        if param.grad is not None:
                            writer.add_histogram(
                                name,
                                param.clone().cpu().data.numpy(), global_step)
                            writer.add_histogram(
                                name + '/grad',
                                param.grad.clone().cpu().data.numpy(),
                                global_step)
        # Test
        logging.info("Testing...... | Model : {0} | Task : {1}".format(
            model.__class__.__name__,
            train_dataloader.dataset.__class__.__name__))
        testacc, _ = evaluation(args, model, test_dataloader)
        if best_acc < testacc:
            best_acc, best_epoch = testacc, i
        logging.error(
            'Test result acc1: %.4f | best acc: %.4f | best epoch : %d' %
            (testacc, best_acc, best_epoch))
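
The `accuracy` helper is not shown in these examples. Because its running sum is divided by `samples_num`, it must return the number of correct predictions in the batch rather than a ratio; a hypothetical sketch:

import numpy as np

def accuracy(out, label):
    # Hypothetical sketch: count correct predictions in one batch.
    preds = np.argmax(out, axis=-1)  # highest-scoring class per sample
    return int((preds == np.asarray(label)).sum())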
Example #8
def init_from_scrach(args):
    """
    init the model and load the datasets
    :param args:
    :return:
    """
    logger('No trained model provided. init model from scratch...')

    logger('Load the train dataset...')
    if args.dataset.lower() == 'cr':
        train_dataset = CR(args, filename=args.train_file)
        valid_dataset = CR(args, filename=args.valid_file)
    else:
        raise ("No dataset named {}, please check".format(
            args.dataset.lower()))

    train_dataloader = DataLoader(
        dataset=train_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        collate_fn=train_dataset.__class__.batchfy_fn,
        pin_memory=True,
        drop_last=False)
    logger('Train data max length : %d' % train_dataset.max_len)

    logger('Load the valid dataset...')
    valid_dataloader = DataLoader(
        dataset=valid_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        collate_fn=valid_dataset.__class__.batchfy_fn,
        pin_memory=True,
        drop_last=False)
    logger('Valid data max length : %d' % valid_dataset.max_len)

    logger('Initiating the model...')
    model = BaseLineRNN(args=args,
                        hidden_size=args.hidden_size,
                        embedding_size=args.embedding_dim,
                        vocabulary_size=len(train_dataset.word2id),
                        rnn_layers=args.num_layers,
                        bidirection=args.bidirectional,
                        num_class=train_dataset.num_class)

    if USE_CUDA:
        model.cuda()
    model.init_optimizer()
    logger('Model {} initialized...'.format(model.__class__.__name__))
    logger(model)
    return train_dataloader, valid_dataloader, model
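
`batchfy_fn`, the collate function handed to both DataLoaders, is not shown. A hypothetical sketch of what it plausibly does here: pad each batch of token-id sequences to the longest one and stack the labels, so that `model(*a_data)` and `a_data[-1]` in the training loop line up:

import torch

def batchfy_fn(batch):
    # Hypothetical sketch: `batch` is a list of (token_ids, label) pairs.
    seqs, labels = zip(*batch)
    max_len = max(len(s) for s in seqs)
    padded = [list(s) + [0] * (max_len - len(s)) for s in seqs]  # 0 = pad id
    return torch.LongTensor(padded), torch.LongTensor(labels)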
Example #9
  def __init__(self, log):
    self.log = log
    self.log.log("employedetails:  init")
    self.__emp_dets = EmpDetails(log)
   

  def add_emp_details(self):
    # Work hours for the previous four days: {date string: hours}.
    d = {}
    cnt = 1
    while cnt < 5:
      y = (datetime.date.today() -
           datetime.timedelta(days=cnt)).strftime("%Y-%m-%d")  # portable form of "%F"
      d[y] = cnt * 10
      cnt += 1
    st = {
          1: {"name" : "name_1", "password" : "name_1", "phone" : "4697735274", "mail" : "name_1@name_1.com", "Vendor" : "*****@*****.**", "sec_q1" : "degree passout", "sec_q1_a" : "2003", "sec_q2" : "school name", "sec_q2_a" : "abcd"},
          2: {"name" : "name_2", "password":"******", "phone" : "4697735273", "mail" : "name_2@name_2.com", "Vendor": "*****@*****.**", "sec_q1" :"degree passout", "sec_q1_a" : "2003", "sec_q2" : "school name", "sec_q2_a" : "abcd"},
          3: {"name" : "name_3", "password":"******", "phone" : "4697735272", "mail" : "name_3@name_3.com", "Vendor": "*****@*****.**", "sec_q1" :"degree passout", "sec_q1_a" : "2003", "sec_q2" : "school name", "sec_q2_a" : "abcd"}
       }
    for k, s in st.items():  # dict.iteritems() is Python 2 only
      s['emp_id'] = self.__emp_dets.add_emp_record(s['name'], s['password'], s['phone'], s['mail'], s['Vendor'], s['sec_q1'], s['sec_q1_a'], s['sec_q2'], s['sec_q2_a'])
      self.__emp_dets.add_work_hours(s['emp_id'], s['password'], d)

    print("WORK HOURS")
    self.__emp_dets.get_total_work_hours("1", "name_1", [7])


log = logger()
e = Employe_details(log)
e.add_emp_details()
Example #10
 def shuffle(self):
     logger("Shuffle the dataset.")
     np.random.shuffle(self.train_idx)
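
`train_idx` (built in Example #1 as a permutation of batch indices) is what this shuffles; a hypothetical sketch of how a batch might then be drawn from it:

 def get_batch(self, k):
     # Hypothetical sketch: map the k-th shuffled batch index to a data slice.
     idx = self.train_idx[k] * self.args.batch_size
     return (self.train_x[idx:idx + self.args.batch_size],
             self.train_y[idx:idx + self.args.batch_size])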
Example #11
def train(args):
    train_dataloader, test_dataloader, model = init_from_scrach(args)
    best_acc = 0.0
    best_epoch = 0
    global_step = 0  # renamed from `iter`, which shadows the builtin
    logger('Begin training...')

    # FIXME: could be modified to fit your model and algorithm
    if args.log_dir:
        logger_path = '../logs/log-av%s-%s-model%s-emb%d-id%s' % (
            args.activation, args.dataset, model.__class__.__name__,
            args.embedding_dim, str(datetime.datetime.now()))
        logger('Save log to %s' % logger_path)
        writer = SummaryWriter(log_dir=logger_path)
    for i in range(args.num_epoches):
        loss_sum = 0
        samples_num = 0
        metrics_value_sum = {}
        for j, data in enumerate(train_dataloader):
            global_step += 1  # recorded for tensorboard

            # forward and loss
            model.optimizer.zero_grad()
            model.zero_grad()
            # TODO: you can modify this part
            out, feature = model(
                *data
            )  # model should return the output not only predict result.
            loss = model.loss(out, data[-1])

            # backward
            loss.backward()

            # grad clip if args.grad_clipping != 0
            if args.grad_clipping != 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.grad_clipping)

            # optimize
            model.optimizer.step()

            # record
            loss_sum += loss.item()
            samples_num += len(data[0])
            metrics_value = metric(out=out.data.cpu().numpy(), label=data[-1])
            for k, v in metrics_value.items():
                # Accumulate per-batch counts per metric name.
                metrics_value_sum[k] = metrics_value_sum.get(k, 0) + v

            if (j + 1) % args.print_every_n == 0:
                info_add = ''
                for k, v in metrics_value_sum.items():
                    info_add += '| {} : {}'.format(k, v / samples_num)
                info = 'train: Epoch = %d | iter = %d/%d | loss sum = %.2f ' % (
                    i, j, len(train_dataloader), loss_sum / (j + 1)) + info_add
                logging.info(info)

                # for tensorboard
                if args.log_dir:
                    writer.add_scalar('loss', loss_sum / (j + 1), global_step)
                    for k, v in metrics_value_sum.items():
                        writer.add_scalar(k, v / samples_num, global_step)

                    # Use the global step so later epochs do not overwrite
                    # earlier histograms.
                    for name, param in model.named_parameters():
                        if param.grad is not None:
                            writer.add_histogram(
                                name,
                                param.clone().cpu().data.numpy(), global_step)
                            writer.add_histogram(
                                name + '/grad',
                                param.grad.clone().cpu().data.numpy(),
                                global_step)
        # Test
        logging.info("Testing...... | Model : {0} | Task : {1}".format(
            model.__class__.__name__,
            train_dataloader.dataset.__class__.__name__))
        testacc, _ = evaluation(args, model, test_dataloader)
        if best_acc < testacc:
            model.save(datetime=datetime.datetime.now())
            best_acc, best_epoch = testacc, i
        logging.error(
            'Test result acc1: %.4f | best acc: %.4f | best epoch : %d' %
            (testacc, best_acc, best_epoch))
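
In this variant `metric` must return a dict mapping metric names to per-batch counts, since each value is summed and later divided by `samples_num`; a hypothetical sketch along the lines of the `accuracy` helper after Example #7:

import numpy as np

def metric(out, label):
    # Hypothetical sketch: per-batch counts keyed by metric name.
    preds = np.argmax(out, axis=-1)
    return {'accuracy': int((preds == np.asarray(label)).sum())}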
Example #12
from utils.util import logging, logger

logging("hello world")

log = logger()
log.log("class logging")
Example #13
    def metric(self, preds, label):
        logger('Starting to evaluate the custom metric...')
        case_true, case_recall, case_precision = 0, 0, 0
        assert len(label) == len(
            preds
        ), "prediction and gold files should have the same length. Received: %d, expected: %d" % (
            len(preds), len(label))
        for gold, pred in zip(label, preds):
            lastname = ''
            keys_gold, keys_pred = {}, {}
            for item in gold:
                # Each item looks like "word/FLAG-NAME", e.g. "Paris/S-LOC".
                # `tag` avoids shadowing the `label` parameter.
                word, tag = item.split('/')[0], item.split('/')[-1]
                if tag == 'O':  # test the raw tag: 'O' has no '-' to split on
                    continue
                flag, name = tag[:tag.find('-')], tag[tag.find('-') + 1:]
                if flag == 'S':
                    if name not in keys_gold:
                        keys_gold[name] = [word]
                    else:
                        keys_gold[name].append(word)
                else:
                    if flag == 'B':
                        if name not in keys_gold:
                            keys_gold[name] = [word]
                        else:
                            keys_gold[name].append(word)
                        lastname = name
                    elif flag == 'I' or flag == 'E':
                        assert name == lastname, "the I-/E- labels are inconsistent with B- labels in gold file."
                        keys_gold[name][-1] += ' ' + word
            lastname = ''  # reset so gold spans cannot leak into the pred check
            for item in pred:
                word, tag = item.split('/')[0], item.split('/')[-1]
                if tag == 'O':
                    continue
                flag, name = tag[:tag.find('-')], tag[tag.find('-') + 1:]
                if flag == 'S':
                    if name not in keys_pred:
                        keys_pred[name] = [word]
                    else:
                        keys_pred[name].append(word)
                else:
                    if flag == 'B':
                        if name not in keys_pred:
                            keys_pred[name] = [word]
                        else:
                            keys_pred[name].append(word)
                        lastname = name
                    elif flag == 'I' or flag == 'E':
                        assert name == lastname, "the I-/E- labels are inconsistent with B- labels in pred file."
                        keys_pred[name][-1] += ' ' + word

            for key in keys_gold:
                case_recall += len(keys_gold[key])
            for key in keys_pred:
                case_precision += len(keys_pred[key])

            for key in keys_pred:
                if key in keys_gold:
                    for word in keys_pred[key]:
                        if word in keys_gold[key]:
                            case_true += 1
                            keys_gold[key].remove(
                                word)  # avoid double-counting duplicate words
        assert case_recall != 0, "no labels in gold files!"
        assert case_precision != 0, "no labels in pred files!"
        recall = 1.0 * case_true / case_recall
        precision = 1.0 * case_true / case_precision
        f1 = (2.0 * recall * precision / (recall + precision)
              if (recall + precision) > 0 else 0.0)  # guard: no true positives
        result = "recall: %s  precision: %s  F: %s" % (str(recall),
                                                       str(precision), str(f1))
        logger(result)
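
A tiny hypothetical invocation (assuming `evaluator` is an instance of the class this method belongs to); a perfect match over two spans yields recall = precision = F = 1.0:

# Hypothetical usage: one sentence, gold == pred, two entity spans.
gold = [["John/B-PER", "Smith/E-PER", "lives/O", "in/O", "Paris/S-LOC"]]
pred = [["John/B-PER", "Smith/E-PER", "lives/O", "in/O", "Paris/S-LOC"]]
evaluator.metric(pred, gold)  # logs: recall: 1.0  precision: 1.0  F: 1.0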