def __init__(self, args, num_class=2, seed=1111, file_name=''):
    """
    Load the data splits, measure their lengths and obtain the word index.

    :param args: option object; ``tmp_dir``, ``word_file``, ``skip_top``,
        ``num_words`` and ``batch_size`` are read here
    :param num_class: stored on the instance as ``num_class``
    :param seed: stored on the instance as ``seed`` (not used in this method)
    :param file_name: passed to ``load_data`` and used as the prefix of the
        word-file name
    """
    logging.info("Init the dataset...")
    self.seed, self.args, self.num_class = seed, args, num_class
    # Placeholder value; replaced below once both splits have been measured.
    self.max_len = 0

    loaded = self.load_data(file_name)
    (self.train_x, self.train_y,
     self.valid_x, self.valid_y,
     self.test_x, self.test_y) = loaded

    longest_train, mean_train, shortest_train = self.statistic_len(
        self.train_x)
    longest_valid, mean_valid, shortest_valid = self.statistic_len(
        self.valid_x)
    self.max_len = max(longest_train, longest_valid)
    logger("Train data max len:%d, mean len:%d, min len:%d " %
           (longest_train, mean_train, shortest_train))
    logger("Test data max len:%d, mean len:%d, min len:%d " %
           (longest_valid, mean_valid, shortest_valid))

    self.valid_nums = len(self.valid_x)
    self.train_nums = len(self.train_x)
    self.test_nums = 0

    # Word index: <tmp_dir>/<class name>/<file_name + word_file>
    dict_path = os.path.join(args.tmp_dir,
                             self.__class__.__name__,
                             file_name + args.word_file)
    self.word2id, self.word_file = self.get_word_index(
        dict_path,
        exclude_n=self.args.skip_top,
        max_size=self.args.num_words)
    self.word2id_size = len(self.word2id)

    # Random order over the number of complete batches in the train split.
    full_batches = self.train_nums // self.args.batch_size
    self.train_idx = np.random.permutation(full_batches)
def get_word_index(self, path=None):
    """
    Read a word file (one entry per line) into a ``word -> index`` dict.

    Each stripped line receives the next free index the first time it is
    seen; repeated lines keep their first index.

    :param path: file to read; when falsy, the path is built by concatenating
        ``args.tmp_dir``, the class name and ``args.word_file``
    :return: dict mapping each distinct stripped line to its index
    """
    if not path:
        path = (self.args.tmp_dir + self.__class__.__name__ +
                self.args.word_file)

    word2id = {}
    with open(path, mode='r', encoding='utf-8') as vocab_file:
        for raw_line in vocab_file:
            token = raw_line.strip()
            if token not in word2id:
                word2id[token] = len(word2id)

    logger('Word2id size : %d' % len(word2id))
    return word2id
def prepare_dict(self, file_name):
    """
    Build the dictionary for this dataset through ``prepare_dictionary``.

    ``self.data_x`` is used as the data, and ``args.skip_top`` /
    ``args.num_words`` are forwarded as ``exclude_n`` / ``max_size``.

    :param file_name: forwarded to ``prepare_dictionary`` as ``dict_path``
    :return: whatever ``prepare_dictionary`` returns (logged via ``len``)
    """
    owner = self.__class__.__name__
    logger("Prepare the dictionary for the {}...".format(owner))

    options = dict(data=self.data_x,
                   dict_path=file_name,
                   exclude_n=self.args.skip_top,
                   max_size=self.args.num_words)
    word2id = prepare_dictionary(**options)

    logger("Word2id size : %d" % len(word2id))
    return word2id
def prepare_dict(self, file_name, exclude_n=10, max_size=10000):
    """
    Build the dictionary from the train and validation inputs combined.

    :param file_name: forwarded to ``prepare_dictionary`` as ``dict_path``
    :param exclude_n: forwarded unchanged to ``prepare_dictionary``
    :param max_size: forwarded unchanged to ``prepare_dictionary``
    :return: whatever ``prepare_dictionary`` returns (logged via ``len``)
    """
    owner = self.__class__.__name__
    logger("Prepare the dictionary for the {}...".format(owner))

    corpus = self.train_x + self.valid_x
    options = dict(data=corpus,
                   dict_path=file_name,
                   exclude_n=exclude_n,
                   max_size=max_size)
    word2id = prepare_dictionary(**options)

    logger("Word2id size : %d" % len(word2id))
    return word2id
def get_word_index(self, path=None, exclude_n=10, max_size=10000):
    """
    Return the word index, reading it from disk or building it on demand.

    When ``path`` is an existing, non-empty file, it is read line by line and
    each distinct stripped line gets the next free index. Otherwise the work
    is delegated to ``self.prepare_dict``.

    :param path: word file; when falsy, the path is built by concatenating
        ``args.tmp_dir``, the class name and ``args.word_file``
    :param exclude_n: forwarded to ``prepare_dict`` when the file is unusable
    :param max_size: forwarded to ``prepare_dict`` when the file is unusable
    :return: tuple ``(word2id, path)``
    """
    if not path:
        path = (self.args.tmp_dir + self.__class__.__name__ +
                self.args.word_file)

    usable_file = os.path.isfile(path) and os.path.getsize(path) > 0
    if not usable_file:
        word2id = self.prepare_dict(path,
                                    exclude_n=exclude_n,
                                    max_size=max_size)
    else:
        word2id = {}
        with open(path, mode='r', encoding='utf-8') as vocab_file:
            for raw_line in vocab_file:
                token = raw_line.strip()
                if token not in word2id:
                    word2id[token] = len(word2id)

    logger('Word2id size : %d' % len(word2id))
    return word2id, path
def load_file(self, fpath):
    """
    Parse a space-separated file whose last field on each line is an integer.

    Every line is stripped and split on single spaces. Lines with three or
    fewer fields are skipped; for the rest, all fields but the last become
    one input row and the last field is converted with ``int``.

    :param fpath: path of the UTF-8 text file to read
    :return: tuple ``(data_x, data_y)`` of field lists and integer values
    """
    data_x, data_y = [], []
    longest = 0

    with io.open(fpath, 'r', encoding='utf-8') as handle:
        rows = handle.read().splitlines()

    for row in rows:
        fields = row.strip().split(' ')
        if len(fields) <= 3:
            continue
        tokens, target = fields[:-1], fields[-1]
        data_x.append(tokens)
        data_y.append(int(target))
        if len(tokens) > longest:
            longest = len(tokens)

    logger("Load the data over , size: %d. max length :%d" %
           (len(data_x), longest))
    return data_x, data_y
def train(args):
    """
    Train the model built by ``init_from_scrach`` and evaluate after each epoch.

    For every batch the model output is computed, ``model.loss`` is
    back-propagated, gradients are optionally clipped and
    ``model.optimizer`` is stepped. Every ``args.print_every_n`` batches the
    running loss/accuracy are logged and, when ``args.log_dir`` is truthy,
    written to TensorBoard together with parameter and gradient histograms.

    :param args: option object; ``log_dir``, ``activation``, ``dataset``,
        ``embedding_dim``, ``num_epoches``, ``grad_clipping`` and
        ``print_every_n`` are read here
    :return: None
    """
    train_dataloader, test_dataloader, model = init_from_scrach(args)
    best_acc = 0.0
    best_epoch = 0
    # Batch counter across all epochs; used as the TensorBoard step.
    # (Named ``global_step`` rather than ``iter`` to avoid shadowing the builtin.)
    global_step = 0
    logger('Begin training...')

    # FIXME : could modified for your model
    writer = None
    if args.log_dir:
        # NOTE: args.log_dir only switches logging on; the directory itself
        # is built from the fixed '../logs/' prefix below.
        logger_path = '../logs/log-av%s-%s-model%s-emb%d-id%s' % (
            args.activation, args.dataset, model.__class__.__name__,
            args.embedding_dim, str(datetime.datetime.now()))
        logger('Save log to %s' % logger_path)
        writer = SummaryWriter(log_dir=logger_path)

    try:
        for i in range(args.num_epoches):
            loss_sum = 0
            acc_sum = 0.0
            samples_num = 0
            for j, a_data in enumerate(train_dataloader):
                global_step += 1

                # forward and loss
                model.optimizer.zero_grad()
                model.zero_grad()
                # The model must return the raw output, not only the
                # predicted result; the second element is unused here.
                out, _ = model(*a_data)
                loss = model.loss(out, a_data[-1])

                # backward
                loss.backward()

                # grad clip if args.grad_clipping != 0
                if args.grad_clipping != 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.grad_clipping)

                # optimize
                model.optimizer.step()

                # record
                loss_sum += loss.item()
                samples_num += len(a_data[0])
                acc_sum += accuracy(out=out.data.cpu().numpy(),
                                    label=a_data[-1])

                if (j + 1) % args.print_every_n == 0:
                    # j is zero-based, so j + 1 batches have been seen.
                    # Dividing by j crashed on the first batch and disagreed
                    # with the TensorBoard value.
                    batches_seen = j + 1
                    logging.info(
                        'train: Epoch = %d | iter = %d/%d | ' %
                        (i, j, len(train_dataloader)) +
                        'loss sum = %.2f | accuracy : %.4f' %
                        (loss_sum / batches_seen, acc_sum / samples_num))

                    # for tensorboard
                    if writer is not None:
                        writer.add_scalar('loss', loss_sum / batches_seen,
                                          global_step)
                        writer.add_scalar('accuracy', acc_sum / samples_num,
                                          global_step)
                        for name, param in model.named_parameters():
                            if param.grad is None:
                                continue
                            # Use the global step so histograms from
                            # different epochs do not land on the same step.
                            writer.add_histogram(
                                name,
                                param.clone().cpu().data.numpy(),
                                global_step)
                            writer.add_histogram(
                                name + '/grad',
                                param.grad.clone().cpu().data.numpy(),
                                global_step)

            # Test
            logging.info("Testing...... | Model : {0} | Task : {1}".format(
                model.__class__.__name__,
                train_dataloader.dataset.__class__.__name__))
            testacc, _ = evaluation(args, model, test_dataloader)
            # The former one-line conditional built a 3-tuple because of
            # operator precedence and failed to unpack into two names.
            if best_acc < testacc:
                best_acc, best_epoch = testacc, i
            logging.error(
                'Test result acc1: %.4f | best acc: %.4f | best epoch : %d' %
                (testacc, best_acc, best_epoch))
    finally:
        # Flush and release the event file even if training is interrupted.
        if writer is not None:
            writer.close()
def init_from_scrach(args):
    """
    Init the model and load the datasets.

    :param args: option object; ``dataset``, ``train_file``, ``valid_file``,
        ``batch_size``, ``hidden_size``, ``embedding_dim``, ``num_layers``
        and ``bidirectional`` are read here
    :return: tuple ``(train_dataloader, valid_dataloader, model)``
    :raises ValueError: when ``args.dataset`` does not name a known dataset
    """
    logger('No trained model provided. init model from scratch...')

    logger('Load the train dataset...')
    dataset_name = args.dataset.lower()
    if dataset_name == 'cr':
        train_dataset = CR(args, filename=args.train_file)
        valid_dataset = CR(args, filename=args.valid_file)
    else:
        # Raising a plain string is itself a TypeError ("exceptions must
        # derive from BaseException") and hides the message.
        raise ValueError(
            "No dataset named {}, please check".format(dataset_name))

    # Both loaders share every option except the dataset and its collate_fn.
    loader_options = dict(batch_size=args.batch_size,
                          shuffle=False,
                          pin_memory=True,
                          drop_last=False)

    train_dataloader = DataLoader(
        dataset=train_dataset,
        collate_fn=train_dataset.__class__.batchfy_fn,
        **loader_options)
    logger('Train data max length : %d' % train_dataset.max_len)

    logger('Load the test dataset...')
    valid_dataloader = DataLoader(
        dataset=valid_dataset,
        collate_fn=valid_dataset.__class__.batchfy_fn,
        **loader_options)
    logger('Valid data max length : %d' % valid_dataset.max_len)

    logger('Initiating the model...')
    model = BaseLineRNN(args=args,
                        hidden_size=args.hidden_size,
                        embedding_size=args.embedding_dim,
                        vocabulary_size=len(train_dataset.word2id),
                        rnn_layers=args.num_layers,
                        bidirection=args.bidirectional,
                        num_class=train_dataset.num_class)
    if USE_CUDA:
        model.cuda()
    model.init_optimizer()
    logger('Model {} initiate over...'.format(model.__class__.__name__))
    logger(model)

    return train_dataloader, valid_dataloader, model
# NOTE(review): the two functions below read as methods of ``Employe_details``
# (the driver at the bottom instantiates it); the class header is not visible
# in this chunk -- confirm the enclosing indentation when applying.
def __init__(self, log):
    """
    Keep the logger and create the ``EmpDetails`` object used by this class.

    :param log: logger object; its ``log(message)`` method is called here and
        the object is passed on to ``EmpDetails``
    """
    self.log = log
    self.log.log("employedetails: init")
    self.__emp_dets = EmpDetails(log)


def add_emp_details(self):
    """
    Register three sample employees and record sample work hours for each.

    The work-hours dict maps each of the previous four days (formatted as
    ``YYYY-MM-DD``) to ``10 * days_back``. After all records are added,
    ``get_total_work_hours`` is called once for employee "1".
    """
    today = datetime.date.today()
    d = {}
    for days_back in range(1, 5):
        day = today - datetime.timedelta(days=days_back)
        # "%F" is a platform-specific directive; spell the format out so the
        # key looks the same everywhere.
        d[day.strftime("%Y-%m-%d")] = days_back * 10

    st = {
        1: {"name": "name_1", "password": "name_1", "phone": "4697735274",
            "mail": "name_1@name_1.com", "Vendor": "*****@*****.**",
            "sec_q1": "degree passout", "sec_q1_a": "2003",
            "sec_q2": "school name", "sec_q2_a": "abcd"},
        2: {"name": "name_2", "password": "******", "phone": "4697735273",
            "mail": "name_2@name_2.com", "Vendor": "*****@*****.**",
            "sec_q1": "degree passout", "sec_q1_a": "2003",
            "sec_q2": "school name", "sec_q2_a": "abcd"},
        3: {"name": "name_3", "password": "******", "phone": "4697735272",
            "mail": "name_3@name_3.com", "Vendor": "*****@*****.**",
            "sec_q1": "degree passout", "sec_q1_a": "2003",
            "sec_q2": "school name", "sec_q2_a": "abcd"},
    }

    # dict.iteritems() no longer exists on Python 3; the key was unused, so
    # iterate over the records directly.
    for s in st.values():
        s['emp_id'] = self.__emp_dets.add_emp_record(
            s['name'], s['password'], s['phone'], s['mail'], s['Vendor'],
            s['sec_q1'], s['sec_q1_a'], s['sec_q2'], s['sec_q2_a'])
        self.__emp_dets.add_work_hours(s['emp_id'], s['password'], d)

    print("WORK HOURS")
    self.__emp_dets.get_total_work_hours("1", "name_1", [7])


l = logger()
e = Employe_details(l)
e.add_emp_details()
def shuffle(self):
    """Log the action, then reorder ``self.train_idx`` randomly in place."""
    logger("Shuffle the dataset.")
    order = self.train_idx
    # np.random.shuffle mutates its argument, so the attribute is updated.
    np.random.shuffle(order)
def train(args):
    """
    Train the model built by ``init_from_scrach`` and evaluate after each epoch.

    For every batch the model output is computed, ``model.loss`` is
    back-propagated, gradients are optionally clipped and
    ``model.optimizer`` is stepped. The values returned by ``metric`` are
    summed per name over the epoch. Every ``args.print_every_n`` batches the
    running loss and metrics are logged and, when ``args.log_dir`` is truthy,
    written to TensorBoard together with parameter and gradient histograms.
    ``model.save`` is called whenever the evaluation accuracy improves.

    :param args: option object; ``log_dir``, ``activation``, ``dataset``,
        ``embedding_dim``, ``num_epoches``, ``grad_clipping`` and
        ``print_every_n`` are read here
    :return: None
    """
    train_dataloader, test_dataloader, model = init_from_scrach(args)
    best_acc = 0.0
    best_epoch = 0
    # Batch counter across all epochs; used as the TensorBoard step.
    # (Named ``global_step`` rather than ``iter`` to avoid shadowing the builtin.)
    global_step = 0
    logger('Begin training...')

    # FIXME : could modified to fit your model and algo
    writer = None
    if args.log_dir:
        # NOTE: args.log_dir only switches logging on; the directory itself
        # is built from the fixed '../logs/' prefix below.
        logger_path = '../logs/log-av%s-%s-model%s-emb%d-id%s' % (
            args.activation, args.dataset, model.__class__.__name__,
            args.embedding_dim, str(datetime.datetime.now()))
        logger('Save log to %s' % logger_path)
        writer = SummaryWriter(log_dir=logger_path)

    try:
        for i in range(args.num_epoches):
            loss_sum = 0
            samples_num = 0
            matrics_value_sum = {}
            for j, data in enumerate(train_dataloader):
                global_step += 1

                # forward and loss
                model.optimizer.zero_grad()
                model.zero_grad()
                # TODO: you can modified here
                # The model must return the raw output, not only the
                # predicted result; the second element is unused here.
                out, _ = model(*data)
                loss = model.loss(out, data[-1])

                # backward
                loss.backward()

                # grad clip if args.grad_clipping != 0
                if args.grad_clipping != 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.grad_clipping)

                # optimize
                model.optimizer.step()

                # record
                loss_sum += loss.item()
                samples_num += len(data[0])
                matrics_value = metric(out=out.data.cpu().numpy(),
                                       label=data[-1])
                # dict() accepts a mapping or an iterable of (name, value)
                # pairs. Iterating a mapping directly yields only keys, and
                # the former ``setdefault(v)`` stored the value as a key, so
                # nothing was ever accumulated.
                for k, v in dict(matrics_value).items():
                    matrics_value_sum[k] = matrics_value_sum.get(k, 0) + v

                if (j + 1) % args.print_every_n == 0:
                    # j is zero-based, so j + 1 batches have been seen.
                    # Dividing by j crashed on the first batch and disagreed
                    # with the TensorBoard value.
                    batches_seen = j + 1
                    info_add = ''.join(
                        '| {} : {}'.format(k, v / samples_num)
                        for k, v in matrics_value_sum.items())
                    info = (
                        'train: Epoch = %d | iter = %d/%d | loss sum = %.2f '
                        % (i, j, len(train_dataloader),
                           loss_sum / batches_seen)) + info_add
                    logging.info(info)

                    # for tensorboard
                    if writer is not None:
                        writer.add_scalar('loss', loss_sum / batches_seen,
                                          global_step)
                        for k, v in matrics_value_sum.items():
                            writer.add_scalar(k, v / samples_num, global_step)
                        for name, param in model.named_parameters():
                            if param.grad is None:
                                continue
                            # Use the global step so histograms from
                            # different epochs do not land on the same step.
                            writer.add_histogram(
                                name,
                                param.clone().cpu().data.numpy(),
                                global_step)
                            writer.add_histogram(
                                name + '/grad',
                                param.grad.clone().cpu().data.numpy(),
                                global_step)

            # Test
            logging.info("Testing...... | Model : {0} | Task : {1}".format(
                model.__class__.__name__,
                train_dataloader.dataset.__class__.__name__))
            testacc, _ = evaluation(args, model, test_dataloader)
            # The former one-line conditional built a 3-tuple because of
            # operator precedence and failed to unpack into two names.
            if best_acc < testacc:
                model.save(datetime=datetime.datetime.now())
                best_acc, best_epoch = testacc, i
            logging.error(
                'Test result acc1: %.4f | best acc: %.4f | best epoch : %d' %
                (testacc, best_acc, best_epoch))
    finally:
        # Flush and release the event file even if training is interrupted.
        if writer is not None:
            writer.close()
from utils.util import logging, logger

# Function-style helper: called directly with the message.
logging("hello world")

# Class-style helper: create an instance, then call its log() method.
l = logger()
l.log("class logging")
def metric(self, preds, label):
    """
    Compute and log recall, precision and F1 over tagged sequences.

    Each sequence is an iterable of ``word/TAG`` strings. The text before the
    first ``-`` of TAG is the flag (``O``, ``S``, ``B``, ``I`` or ``E``) and
    the rest is the category name. ``S`` and ``B`` start a new entry for
    their category; ``I`` and ``E`` append the word to the most recent entry
    and must carry the category of the last ``B`` in the same sequence.
    A predicted entry counts as correct when the same text is still
    unmatched under the same category in the gold sequence.

    :param preds: predicted sequences, aligned one-to-one with ``label``
    :param label: gold sequences
    :return: None; the result line is passed to ``logger``
    :raises AssertionError: on length mismatch, on an ``I``/``E`` tag whose
        category differs from the last ``B``, or when either side contains
        no entries at all
    """
    logger('Starting evaluate custom metric...')
    assert len(label) == len(preds), (
        "length of prediction file and gold file should be the same. "
        "Receive:%d, should %d" % (len(label), len(preds)))

    def collect_entries(items, source):
        """Group the entry texts of one sequence by category name."""
        entries = {}
        # Tracked per sequence, so a ``B`` seen in the gold sequence can no
        # longer validate an ``I``/``E`` in the predicted one.
        last_name = ''
        for item in items:
            pieces = item.split('/')
            word, tag = pieces[0], pieces[-1]
            dash = tag.find('-')
            # Same slicing as before: a tag without '-' yields an
            # unrecognised flag and is ignored.
            flag, name = tag[:dash], tag[dash + 1:]
            if flag in ('S', 'B'):
                entries.setdefault(name, []).append(word)
                if flag == 'B':
                    last_name = name
            elif flag in ('I', 'E'):
                assert name == last_name, (
                    "the I-/E- labels are inconsistent with B- labels in "
                    "%s file." % source)
                entries[name][-1] += ' ' + word
        return entries

    case_true, case_recall, case_precision = 0, 0, 0
    for gold, pred in zip(label, preds):
        keys_gold = collect_entries(gold, 'gold')
        keys_pred = collect_entries(pred, 'pred')

        case_recall += sum(len(words) for words in keys_gold.values())
        case_precision += sum(len(words) for words in keys_pred.values())

        for name, words in keys_pred.items():
            unmatched = keys_gold.get(name)
            if unmatched is None:
                continue
            for word in words:
                if word in unmatched:
                    case_true += 1
                    # avoid matching the same gold entry twice
                    unmatched.remove(word)

    assert case_recall != 0, "no labels in gold files!"
    assert case_precision != 0, "no labels in pred files!"

    recall = 1.0 * case_true / case_recall
    precision = 1.0 * case_true / case_precision
    # With no correct prediction both terms are 0; report F1 as 0 instead of
    # dividing by zero.
    total = recall + precision
    f1 = 2.0 * recall * precision / total if total else 0.0
    result = "recall: %s precision: %s F: %s" % (str(recall), str(precision),
                                                  str(f1))
    logger(result)