def __init__(self, input_dir, limit=0, from_one_file=True):
    self.data = []
    inputs = []
    log = bu.get_logger()
    if os.path.isfile(input_dir):
        inputs.append(input_dir)
    else:
        for input_file in os.listdir(input_dir):
            file_path = input_dir + '/' + input_file
            if os.path.isfile(file_path):
                inputs.append(file_path)
    if from_one_file:
        one_file_limit = limit
    else:
        one_file_limit = limit // len(inputs)
    for input_file in inputs:
        if one_file_limit > 0 and len(self.data) >= limit:
            break
        with open(input_file, 'rb') as wfd:
            log.info(input_file)
            if one_file_limit > 0:
                self.data.extend(pickle.load(wfd)[:one_file_limit])
            else:
                self.data.extend(pickle.load(wfd))
    if limit > 0 and len(self.data) > limit:
        self.data = self.data[:limit]
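# Hedged usage sketch (added, not from the original repository):
# assuming the __init__ above belongs to a torch-style dataset such as
# data_provider.BBNDatasetCombine used by the training script further below,
# it would typically be constructed and batched like this. The module path,
# data path, and batch size are illustrative assumptions; in the real
# training loop a custom collate_fn is also passed to the DataLoader.
import torch.utils.data as tud
from competitions311 import data_provider  # hypothetical import path

dataset = data_provider.BBNDatasetCombine('data/pickled_examples', limit=1000)
loader = tud.DataLoader(dataset, batch_size=32, shuffle=True, drop_last=True)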
def __init__(self, args, label2index_map, input_size, paths, config):
    self.batch_size = args.batch_size
    self.epoch_num = args.epoch
    self.hidden_dim1 = args.hidden_dim1
    self.hidden_dim2 = args.hidden_dim2
    self.hidden_dim3 = args.hidden_dim3
    self.dropout_keep_prob = args.dropout
    self.beta = args.beta
    self.lr = args.lr
    self.clip_grad = args.clip
    self.optimizer = args.optimizer
    self.test_data_path = args.test_data
    self.tag2label = label2index_map
    self.num_tags = len(label2index_map)
    self.input_size = input_size
    self.config = config
    self.model_path = paths['model_path']
    self.summary_path = paths['summary_path']
    self.logger = base_util.get_logger(paths['log_path'])
    self.result_path = paths['result_path']
timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
output_path = os.path.join(args.train_data + "_save", timestamp)
if not os.path.exists(output_path):
    os.makedirs(output_path)
summary_path = os.path.join(output_path, "summaries")
paths['summary_path'] = summary_path
if not os.path.exists(summary_path):
    os.makedirs(summary_path)
model_path = os.path.join(output_path, "checkpoints/")
if not os.path.exists(model_path):
    os.makedirs(model_path)
ckpt_prefix = os.path.join(model_path, "model")
paths['model_path'] = ckpt_prefix
result_path = os.path.join(output_path, "results")
paths['result_path'] = result_path
if not os.path.exists(result_path):
    os.makedirs(result_path)
log_path = os.path.join(result_path, "log.txt")
paths['log_path'] = log_path
get_logger(log_path).info(str(args))

os.environ['CUDA_VISIBLE_DEVICES'] = '1'
label2index_map, _ = load_label2index()
print(label2index_map)

# training model
train_path = os.path.join(args.train_data, 'train_modified.csv')
test_path = os.path.join(args.test_data, 'test_modified.csv')
if args.mode == 'train':
    ids, train_data = read_corpus(train_path)
    print("train data: {}".format(len(train_data)))
    train = train_data[:650000]
    val = train_data[650000:]
    input_size = len(train.columns) - 1
    print('input_size', input_size)
from util.base_util import timer
from util.base_util import get_logger
import lightgbm as lgb
import numpy as np
import pickle
from sklearn.model_selection import KFold, StratifiedKFold
import pandas as pd
from sklearn.metrics import f1_score
from hyperopt import hp
from hyperopt import tpe
from hyperopt import Trials
from hyperopt import fmin

ITERATION = 0
log = get_logger()


def cross_validation(train, params, ID_COLUMN_NAME, LABEL_COLUMN_NAME,
                     N_FOLD=5):
    '''
    :return: loss
    '''
    NUM_BOOST_ROUND = 1000
    EARLY_STOPPING_ROUNDS = 50
    # Cross validation model
    folds = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=1001)
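# Hedged sketch (added, not from the original repository): the body of
# cross_validation above is truncated after the StratifiedKFold is created.
# One plausible continuation of such a fold loop, using only the imported
# pieces (lgb, f1_score, np), is shown below. The multiclass objective,
# integer-encoded labels, feature selection by dropping the ID/label columns,
# and the older LightGBM (<4.0) lgb.train keywords are assumptions, not the
# repository's confirmed implementation.
def _cv_sketch(train, params, ID_COLUMN_NAME, LABEL_COLUMN_NAME, N_FOLD=5):
    NUM_BOOST_ROUND = 1000
    EARLY_STOPPING_ROUNDS = 50
    folds = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=1001)
    feats = [c for c in train.columns
             if c not in (ID_COLUMN_NAME, LABEL_COLUMN_NAME)]
    scores = []
    for trn_idx, val_idx in folds.split(train[feats],
                                        train[LABEL_COLUMN_NAME]):
        trn_set = lgb.Dataset(train[feats].iloc[trn_idx],
                              label=train[LABEL_COLUMN_NAME].iloc[trn_idx])
        val_set = lgb.Dataset(train[feats].iloc[val_idx],
                              label=train[LABEL_COLUMN_NAME].iloc[val_idx])
        booster = lgb.train(params, trn_set,
                            num_boost_round=NUM_BOOST_ROUND,
                            valid_sets=[val_set],
                            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                            verbose_eval=False)
        # multiclass predict returns per-class probabilities; take the argmax
        pred = np.argmax(booster.predict(train[feats].iloc[val_idx]), axis=1)
        scores.append(f1_score(train[LABEL_COLUMN_NAME].iloc[val_idx], pred,
                               average='macro'))
    # hyperopt's fmin minimizes, so report 1 - mean macro-F1 as the loss
    return 1 - np.mean(scores)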
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, StratifiedKFold
from util.base_util import timer
import os
from competitions311 import data_process
import tensorflow as tf
from util import base_util

log = base_util.get_logger()
ID_COLUMN_NAME = 'user_id'
LABEL_COLUMN_NAME = 'current_service'


def nn_model(df_train, df_test):
    pass


class FeatureNN():
    def __init__(self, x_train, y_train, x_val, y_val, epoch=10,
                 batch_size=1500):
        self.epoch = epoch
def main():
    params = {
        'output_dir': str(Path(RESULT_DIR, 'res_torch')),
        'checkpoint': str(Path(RESULT_DIR, 'res_torch/model')),
        'glove_dim': 300,
        'vocab_tags': str(Path(DATA_DIR, 'processed/vocab.tags.txt')),
        'glove': str(Path(DATA_DIR, 'embedding/glove.npz')),
        'words': str(Path(DATA_DIR, 'processed/vocab.words.txt')),
        'tags': str(Path(DATA_DIR, 'processed/vocab.tags.txt')),
    }
    parser = argparse.ArgumentParser()
    parser.add_argument('--undo_train_valid',
                        help="do not evaluate on sampled train data",
                        action='store_true', default=False)
    parser.add_argument('--input', help="input dir or file", type=str,
                        required=True)
    parser.add_argument('--valid_input', help="valid data input dir or file",
                        type=str, required=True)
    parser.add_argument('--output', help="output dir for writing result",
                        type=str, default=params['output_dir'])
    parser.add_argument('--limit', help="data limit (0 means no limit)",
                        type=int, default=0)
    parser.add_argument('--gpu_index',
                        help="gpu index, must be > -1 to use gpu",
                        type=int, default=0)
    parser.add_argument('--dropout',
                        help="dropout rate in embedding and linear layers",
                        type=float, default=0.2)
    parser.add_argument('--batch_size', help="batch size of data", type=int,
                        default=32)
    parser.add_argument('--hidden_size', help="set the hidden size", type=int,
                        default=128)
    parser.add_argument('--epochs', help="number of training epochs",
                        type=int, default=100)
    parser.add_argument('--monitor',
                        help="monitor f1, acc, precision or recall, "
                             "value like ORG:f1 or PER:acc or LOC:recall",
                        type=str, default='ORG:f1')
    parser.add_argument('--use_glove', help="whether to use glove embeddings",
                        type=bool, default=False)
    parser.add_argument('--model_name', help="file name of model file",
                        type=str, default='ner_model_crf')
    parser.add_argument('--mode_type',
                        help="choose transformer(t) or biLstm(b) or only crf(c)",
                        choices=['b', 't', 'c', 'bt', 'cnn'], type=str,
                        default='b')
    parser.add_argument('--bert_dim', help="bert dim", type=int, default=768)
    parser.add_argument('--te_dropout', help="transformer encoder dropout",
                        type=float, default=0.1)
    parser.add_argument('--lr', help="learning rate", type=float, default=3e-4)
    parser.add_argument('--lr_times', help="learning rate decay times",
                        type=int, default=0)
    parser.add_argument('--wd', help="weight decay", type=float, default=1e-3)
    parser.add_argument('--head_num', help="set the head num", type=int,
                        default=8)
    parser.add_argument('--vip', help="the ip or domain of visdom server",
                        type=str, default='')
    parser.add_argument('--env', help="the name of env of visdom", type=str,
                        default='ner')
    parser.add_argument('--pre_model_path', help="the pre-trained model path",
                        type=str, default='')
    parser.add_argument('--use_cross_entropy', help="use cross entropy loss",
                        action='store_true', default=False)
    args = parser.parse_args()

    params['dropout'] = args.dropout
    params['use_glove'] = args.use_glove
    params['bert_dim'] = args.bert_dim
    params['mode_type'] = args.mode_type
    params['hidden_size'] = args.hidden_size
    # just for transformer
    params['te_dropout'] = args.te_dropout
    params['head_num'] = args.head_num
    params['use_cross_entropy'] = args.use_cross_entropy

    model_time_str = args.model_name + '_' + bu.get_time_str()
    log = bu.get_logger(model_time_str)
    if args.vip:
        vis = visdom.Visdom(args.vip, env=args.env)
    else:
        vis = None

    word_to_ix = {'<pad>': 0}
    if params['use_glove']:
        with open(params['words']) as wvf:
            for word in wvf:
                word = word.strip()
                if word not in word_to_ix:
                    word_to_ix[word] = len(word_to_ix)
    tag_to_ix = {'O': 0}
    with open(params['tags']) as wvf:
        for tag in wvf:
            tag = tag.strip()
            if tag not in tag_to_ix:
                tag_to_ix[tag] = len(tag_to_ix)
    idx_to_tag = {tag_to_ix[key]: key for key in tag_to_ix}
    if args.gpu_index > -1:
        device = torch.device(f'cuda:{args.gpu_index}')
    else:
        device = torch.device('cpu')
    model = Bert_CRF(tag_to_ix, params, device)
    model.to(device)
    if args.pre_model_path:
        with Path(args.pre_model_path).open('rb') as mp:
            if args.gpu_index < 0:
                ml = 'cpu'
            else:
                ml = None
            best_state_dict = torch.load(mp, map_location=ml)
            model.load_state_dict(best_state_dict, False)
    optimizer = optim.Adam(model.parameters(), lr=args.lr,
                           weight_decay=args.wd)

    # begin to train model
    step_index = 0
    # model, bert_dim, tag_to_ix, word_to_ix, rw, batch
    collate_fn = functools.partial(data_provider.collect_fn, model,
                                   params['bert_dim'], tag_to_ix, None, False)
    with bu.timer('load train data'):
        dataset = data_provider.BBNDatasetCombine(args.input, args.limit)
    data_loader = tud.DataLoader(dataset, args.batch_size, shuffle=True,
                                 collate_fn=collate_fn, drop_last=True)
    if not args.undo_train_valid:
        sampler = tud.RandomSampler(data_source=dataset, replacement=True,
                                    num_samples=5000)
    else:
        sampler = None

    log.info('begin to train')
    Path(params['checkpoint']).mkdir(parents=True, exist_ok=True)
    monitor_best = 0
    wait = 0
    loss_train_epoch = []
    loss_valid_epoch = []
    loss_train_t = []
    loss_train_valid = []
    criterion_key = ['f1', 'precision', 'recall']
    criterion_map = {}
    lr_times = args.lr_times
    lr = args.lr
    for epoch in range(args.epochs):
        loss_train = []
        # index_batch, words_batch, words_ids_batch, len_w_batch, tags_batch,
        # sentence_batch
        for i, w, wi, l, t, _ in data_loader:
            # Step 1. Remember that Pytorch accumulates gradients.
            model.zero_grad()
            # Step 2. Run the forward pass.
            # words, words_ids, len_w, tags
            loss = model.neg_log_likelihood(w, wi, l, t)
            # Step 3. Compute the loss, gradients, and update the parameters
            # by calling optimizer.step()
            ls = loss.mean()
            ls.backward()
            optimizer.step()
            step_index += 1
            step_loss = ls.item()
            log.info(
                f'global step:{step_index} epoch:{epoch} loss:{step_loss}')
            loss_train.append(step_loss)
            loss_train_t.append(step_loss)
            plot(vis, loss_train_t, args.model_name, ['train_loss'])
        if sampler:
            # collate_fn, model, args, tag_to_ix=None, idx_to_tag=None,
            # fpr=True, get_loss=False, input_dir=None, dataset_in=None,
            # sampler=None
            criterion, loss_valid_ = evaluate(collate_fn, model, args,
                                              tag_to_ix, idx_to_tag, True,
                                              True, dataset_in=dataset,
                                              sampler=sampler)
            for k in criterion:
                # ['f1', 'precision', 'recall']
                for ck in criterion_key:
                    key = f'train_{k}_{ck}'
                    if key not in criterion_map:
                        criterion_map[key] = []
                    criterion_map[key].append(criterion[k][ck])
            loss_train_valid.append(np.mean(loss_valid_))
        criterion, loss_valid = evaluate(collate_fn, model, args, tag_to_ix,
                                         idx_to_tag, True, True,
                                         input_dir=args.valid_input)
        loss_train_epoch.append(np.mean(loss_train))
        loss_valid_epoch.append(np.mean(loss_valid))
        for k in criterion:
            # ['f1', 'precision', 'recall']
            for ck in criterion_key:
                key = f'valid_{k}_{ck}'
                if key not in criterion_map:
                    criterion_map[key] = []
                criterion_map[key].append(criterion[k][ck])
        plot_data = []
        keys = list(criterion_map.keys())
        for k in criterion_map:
            plot_data.append(criterion_map[k])
        if sampler:
            legend = ['train_loss', 'valid_loss', 'train_loss_t'] + keys
            x_in = zip(loss_train_epoch, loss_valid_epoch, loss_train_valid,
                       *plot_data)
        else:
            legend = ['train_loss', 'valid_loss'] + keys
            x_in = zip(loss_train_epoch, loss_valid_epoch, *plot_data)
        plot(vis, x_in, args.model_name, legend)
        log.info(f'valid:{criterion}')
        tag_type, monitor_type = args.monitor.split(':')
        if (criterion[tag_type][monitor_type] > monitor_best
                or monitor_best == 0):
            monitor_best = criterion[tag_type][monitor_type]
            wait = 0
            best_state_dict = model.state_dict()
            if monitor_best:
                save_mode(best_state_dict, params, tag_to_ix, args.model_name)
        else:
            wait += 1
        if (epoch + 1) % 5 == 0:
            temp_name = f't_{args.model_name}_{epoch+1}'
            save_mode(model.state_dict(), params, tag_to_ix, temp_name)
        if wait > 8:
            if lr_times:
                lr_times -= 1
                wait = 3
                lr /= 3
                optimizer = optim.Adam(model.parameters(), lr=lr,
                                       weight_decay=args.wd)
            else:
                log.warn(f'meet early stopping! best score is {monitor_best}')
                break
    log.info('finish train')
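# Hedged sketch (added): the fragment above does not show how main() is
# invoked; a conventional entry-point guard would be the following.
if __name__ == '__main__':
    main()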
def main():
    params = {
        'output_dir': str(Path(RESULT_DIR, 'res_torch')),
        'checkpoint': str(Path(RESULT_DIR, 'res_torch/model')),
        'glove_dim': 300,
        'vocab_tags': str(Path(DATA_DIR, 'processed/vocab.tags.txt')),
        'glove': str(Path(DATA_DIR, 'embedding/glove.npz')),
        'words': str(Path(DATA_DIR, 'processed/vocab.words.txt')),
        'tags': str(Path(DATA_DIR, 'processed/vocab.tags.txt')),
    }
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', help="input dir or file", type=str,
                        required=True)
    parser.add_argument('--output', help="output dir for writing result",
                        type=str, default=params['output_dir'])
    parser.add_argument('--limit', help="data limit (0 means no limit)",
                        type=int, default=0)
    parser.add_argument('--gpu_index',
                        help="gpu index, must be > -1 to use gpu",
                        type=int, default=0)
    parser.add_argument('--model_name', help="file name of model file",
                        type=str, default='ner_model_crf')
    args = parser.parse_args()

    model_time_str = args.model_name + '_' + bu.get_time_str()
    log = bu.get_logger(model_time_str)
    log.info('begin predict')

    fn_model = params['checkpoint'] + f'/{args.model_name}_torch.pkl'
    fn_config = params['checkpoint'] + f'/{args.model_name}_config.pkl'
    with Path(fn_model).open('rb') as mp:
        if args.gpu_index < 0:
            ml = 'cpu'
        else:
            ml = None
        best_state_dict = torch.load(mp, map_location=ml)
    with Path(fn_config).open('rb') as mp:
        params, tag_to_ix = pickle.load(mp)
    print(tag_to_ix)
    idx_to_tag = {tag_to_ix[key]: key for key in tag_to_ix}
    if args.gpu_index > -1:
        device = torch.device(f'cuda:{args.gpu_index}')
    else:
        device = torch.device('cpu')
    model = Bert_CRF(tag_to_ix, params, device)
    model.to(device)
    model.load_state_dict(best_state_dict, strict=False)

    with bu.timer('load data'):
        dataset = data_provider.BBNDatasetCombine(args.input, args.limit)
    # change batch_size to 1
    args.batch_size = 1
    # model, bert_dim, tag_to_ix, word_to_ix, rw, batch
    collate_fn = functools.partial(data_provider.collect_fn, model,
                                   params['bert_dim'], tag_to_ix, None, True)
    log.warn(f"{'-'*25}test_valid{'-'*25}")
    evaluate(collate_fn, model, args, tag_to_ix, idx_to_tag, True, False,
             f"{args.output}/{args.model_name}.txt", dataset_in=dataset)
import pickle
import re

import numpy as np
import torch
import pathlib
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertModel

import util.base_util as bu

logger = bu.get_logger(__name__)
CUDA_ID_PATTERN = re.compile(r'(\d+,)*\d+')
MODEL_PATH = '../config/bert/bert-base-chinese.tar.gz'
VOCAB_PATH = '../config/bert/bert-base-chinese-vocab.txt'
DATA_SET = set()
LIMITED = 0


class InputExample(object):
    def __init__(self, unique_id, text, entity_map):
        self.unique_id = unique_id
        self.text = text
        self.entity_map = entity_map
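# Hedged usage sketch (added, not from the original repository): shows how
# the constants and class above are typically wired together with the
# pytorch_pretrained_bert API. The sample sentence and the entity_map format
# ({surface form: label}) are illustrative assumptions.
tokenizer = BertTokenizer.from_pretrained(VOCAB_PATH)
bert_model = BertModel.from_pretrained(MODEL_PATH)

example = InputExample(unique_id=0,
                       text='欧几里得是古希腊数学家。',
                       entity_map={'欧几里得': 'PER'})
tokens = tokenizer.tokenize(example.text)
input_ids = tokenizer.convert_tokens_to_ids(['[CLS]'] + tokens + ['[SEP]'])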