def test_logger(self):
    """Tests the logger instantiation"""
    logger = log.get_logger(__file__)
    assert len(logger.handlers) == 2

    logger2 = log.get_logger(__file__)
    assert logger == logger2

    logger.handlers = list()
    logger = log.get_logger(__file__, "simple")
    assert len(logger.handlers) == 2

    logger.handlers = list()
    with pytest.raises(ValueError):
        log.get_logger(__file__, "wrong_formatter")
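# For context, a minimal get_logger that would satisfy the test above could
# look like the sketch below. This is an assumption about src.utils.logger's
# contract (two handlers per logger, per-name reuse, ValueError on an unknown
# formatter), not the project's actual implementation; the formatter strings
# and the file target are hypothetical.
import logging

_FORMATTERS = {
    "default": "%(asctime)s %(name)s %(levelname)s: %(message)s",
    "simple": "%(levelname)s: %(message)s",
}


def get_logger(name, formatter="default"):
    """Return a logger with one stream and one file handler, reused per name."""
    if formatter not in _FORMATTERS:
        raise ValueError("Unknown formatter: {}".format(formatter))
    log_instance = logging.getLogger(name)
    if not log_instance.handlers:  # attach the two handlers only once per name
        fmt = logging.Formatter(_FORMATTERS[formatter])
        for handler in (logging.StreamHandler(), logging.FileHandler(name + ".log")):
            handler.setFormatter(fmt)
            log_instance.addHandler(handler)
    return log_instance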
def __init__(self, config=None) -> None:
    self.__dict__.update(config or {})
    self.config = config
    self.logger = logger.get_logger('proto_{}'.format(self.global_id))
    super().__init__(self.epochs, self.eval_epoch, self.patience,
                     self.eval_tasks, self.batch_size, self.first_eval,
                     self.logger)
    self.device = torch.device(self.device)
    # Pass config as a lazy %-style argument; a second positional arg without
    # a %s placeholder in the message breaks formatting at emit time.
    self.logger.info('current hp: %s', config)
    self.logger_performance = logger.get_logger(
        'proto_{}'.format(self.global_id), 'valid.txt')
def check_config():
    """Checks if the default values are changed in the config and if some
    important requirements are satisfied."""
    logger = log.get_logger(__file__)
    correct_config = True

    if (
        "INSERT" in conf.JIRA_URL
        or "INSERT" in conf.JIRA_USER
        or "INSERT" in conf.JIRA_PASSWORD
    ):
        correct_config = False
        logger.critical(
            "Some of your Jira information is not yet configured, "
            "please change."
        )

    if not os.path.ismount(conf.REPO_PATH):
        correct_config = False
        logger.critical("Your munki repository is not mounted, please mount.")

    if not os.path.exists(conf.MAKECATALOGS):
        correct_config = False
        logger.critical("Your makecatalogs path is wrong, please correct.")

    config_file_path = os.path.join(conf.LOG_DIR, conf.LOG_FILENAME)
    if not os.path.exists(config_file_path):
        correct_config = False
        logger.critical(
            f"The config file {config_file_path} does not exist, please "
            f"create it."
        )

    if conf.DRY_RUN:
        logger.warning(
            "The program is executed in dry run mode, no changes will be "
            "committed."
        )

    if not correct_config and not isinstance(conf, MunkiPromoterTestConfig):
        # If we are testing we do not want to supply a complete
        # configuration; therefore we only raise the exception when running
        # in non-testing mode.
        raise ImproperlyConfigured()
def __init__(self, meta_epoch, valid_check_epoch, patience, valid_tasks,
             batch_size, first_eval=1,
             logger=logger.get_logger('base')) -> None:
    super().__init__()
    self.logger = logger
    self.timer = timer()
    self.timer.initialize(time.time(), 60 * 1000)
    self.meta_epoch = meta_epoch
    self.valid_check_epoch = valid_check_epoch
    self.patience = patience
    self.valid_tasks = valid_tasks
    self.batch_size = batch_size
    self.first_eval = first_eval
    self.data_augmentor = DataArgumentor() if self.use_data_augmentation else None
    self.turn_on_data_augmentor = False
def __init__(self, meta_epoch, valid_check_epoch, patience, valid_tasks,
             batch_size, first_eval=1,
             logger=logger.get_logger('base')) -> None:
    super().__init__()
    self.logger = logger
    self.timer = timer()
    self.timer.initialize(time.time(), 60 * 100)
    self.meta_epoch = meta_epoch
    self.valid_check_epoch = valid_check_epoch
    self.patience = patience
    self.valid_tasks = valid_tasks
    self.batch_size = batch_size
    self.first_eval = first_eval
    self.training_mode = 0
    self.training_stage = 0
    self.saving = False
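# Caveat for both constructors above: logger=logger.get_logger('base') is
# evaluated once, at definition time, so every instance that omits the
# argument shares that single 'base' logger, and the logger module must be
# importable before the class body runs. A minimal sketch of the usual
# lazy-default idiom, assuming only the standard library:
import logging


class BaseTrainerSketch:
    def __init__(self, logger=None):
        # Resolve the default at call time instead of definition time.
        self.logger = logger if logger is not None else logging.getLogger('base')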
        type=str,
        default='info',
        help='set logging level to store and print statements')
parser.add_argument('--seed', type=int, default=1234,
                    help='random seed to set for reproducibility')
args = parser.parse_args()

# setting up
set_seeds(args.seed)
set_warnings()
log_save_path = f'{args.out_folder}/{args.model_name}/{args.log_file}'.lower()
make_dir(log_save_path)
logger = get_logger(log_save_path, no_stdout=False, set_level=args.log_level)
device = get_device(args.use_cpu, args.cuda_device)

# suppress warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# functions
def plot(results, plot_save_path):
    if 'train_acc' in results.keys() and 'val_acc' in results.keys():
        plt.plot(results['train_acc'], label='train accuracy')
        plt.plot(results['val_acc'], label='validation accuracy')
        plt.title('Training results')
        plt.ylabel('Accuracy')
        plt.xlabel('Epoch')
def main():
    '''Parse Arguments'''
    parser = build_parser()
    args = parser.parse_args()

    '''Specify Seeds for reproducibility'''
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    random.seed(args.seed)

    '''Configs'''
    device = gpu_init_pytorch(args.gpu)

    mode = args.mode
    if mode == 'train':
        is_train = True
    else:
        is_train = False

    # ckpt = args.ckpt
    run_name = args.run_name
    args.log_path = os.path.join(log_folder, run_name)
    args.model_path = os.path.join(model_folder, run_name)
    args.board_path = os.path.join(board_path, run_name)
    args.outputs_path = os.path.join(outputs_folder, run_name)

    args_file = os.path.join(args.model_path, 'args.p')
    log_file = os.path.join(args.log_path, 'log.txt')

    if args.results:
        args.result_path = os.path.join(
            result_folder, 'val_results_{}.json'.format(args.dataset))

    logging_var = bool(args.logging)

    if is_train:
        create_save_directories(args.log_path)
        create_save_directories(args.model_path)
        create_save_directories(args.outputs_path)
    else:
        create_save_directories(args.log_path)
        create_save_directories(args.result_path)

    logger = get_logger(run_name, log_file, logging.DEBUG)
    logger.debug('Created Relevant Directories')
    logger.info('Experiment Name: {}'.format(args.run_name))

    if args.mt:
        vocab1_path = os.path.join(args.model_path, 'vocab1.p')
        vocab2_path = os.path.join(args.model_path, 'vocab2.p')

        if is_train:
            # pdb.set_trace()
            train_dataloader, val_dataloader = load_data(args, logger)

            logger.debug('Creating Vocab...')
            voc1 = Voc()
            voc1.create_vocab_dict(args, 'src', train_dataloader)
            # TODO: remove later
            voc1.add_to_vocab_dict(args, 'src', val_dataloader)
            voc2 = Voc()
            voc2.create_vocab_dict(args, 'trg', train_dataloader)
            # TODO: remove later
            voc2.add_to_vocab_dict(args, 'trg', val_dataloader)
            logger.info('Vocab Created with number of words: {}'.format(voc1.nwords))

            with open(vocab1_path, 'wb') as f:
                pickle.dump(voc1, f, protocol=pickle.HIGHEST_PROTOCOL)
            with open(vocab2_path, 'wb') as f:
                pickle.dump(voc2, f, protocol=pickle.HIGHEST_PROTOCOL)
            logger.info('Vocab saved at {}'.format(vocab1_path))
        else:
            test_dataloader = load_data(args, logger)
            logger.info('Loading Vocab File...')
            with open(vocab1_path, 'rb') as f:
                voc1 = pickle.load(f)
            with open(vocab2_path, 'rb') as f:
                voc2 = pickle.load(f)
            logger.info('Vocab Files loaded from {}\nNumber of Words: {}'.format(
                vocab1_path, voc1.nwords))

        # print('Done')
        # TODO: load existing checkpoints here
        checkpoint = get_latest_checkpoint(args.model_path, logger)

        '''Param Specs'''
        layers = args.layers
        heads = args.heads
        d_model = args.d_model
        d_ff = args.d_ff
        max_len = args.max_length
        dropout = args.dropout
        BATCH_SIZE = args.batch_size
        epochs = args.epochs

        if logging_var:
            meta_fname = os.path.join(args.log_path, 'meta.txt')
            loss_fname = os.path.join(args.log_path, 'loss.txt')
            meta_fh = open(meta_fname, 'w')
            loss_fh = open(loss_fname, 'w')
            print('Log Files created at: {}'.format(args.log_path))
            write_meta(args, meta_fh)

        """
        stime = time.time()
        print('Loading Data...')
        train, val, test, SRC, TGT = build_data()
        etime = (time.time() - stime) / 60
        print('Data Loaded\nTime Taken: {}'.format(etime))
        """

        pad_idx = voc1.w2id['PAD']
        model = make_model(voc1.nwords, voc2.nwords, N=layers, h=heads,
                           d_model=d_model, d_ff=d_ff, dropout=dropout)
        model.to(device)
        criterion = LabelSmoothing(size=voc2.nwords, padding_idx=pad_idx,
                                   smoothing=0.1)
        criterion.to(device)

        # train_iter = MyIterator(train, batch_size=BATCH_SIZE, device=device,
        #                         repeat=False,
        #                         sort_key=lambda x: (len(x.src), len(x.trg)),
        #                         batch_size_fn=batch_size_fn, train=True)
        # valid_iter = MyIterator(val, batch_size=BATCH_SIZE, device=device,
        #                         repeat=False,
        #                         sort_key=lambda x: (len(x.src), len(x.trg)),
        #                         batch_size_fn=batch_size_fn, train=False)

        if mode == 'train':
            model_opt = NoamOpt(
                model.src_embed[0].d_model, 1, 2000,
                torch.optim.Adam(model.parameters(), lr=0,
                                 betas=(0.9, 0.98), eps=1e-9))

            max_val_score = 0.0
            min_error_score = 100.0
            epoch_offset = 0

            for epoch in range(epochs):
                # pdb.set_trace()
                # if epoch % 3 == 0:
                print('Training Epoch: ', epoch)
                model.train()
                run_epoch((rebatch(args, device, voc1, voc2, pad_idx, b)
                           for b in train_dataloader),
                          model,
                          LossCompute(model.generator, criterion,
                                      device=device, opt=model_opt))

                model.eval()
                # loss = run_epoch((rebatch(args, device, voc1, voc2, pad_idx, b) for b in val_dataloader),
                #                  model,
                #                  LossCompute(model.generator, criterion, device=device, opt=None))
                # loss_str = "Epoch: {} \t Val Loss: {}\n".format(epoch, loss)
                # print(loss_str)

                refs = []
                hyps = []
                error_score = 0
                for i, batch in enumerate(val_dataloader):
                    sent1s = sents_to_idx(voc1, batch['src'], args.max_length)
                    sent2s = sents_to_idx(voc2, batch['trg'], args.max_length)
                    sent1_var, sent2_var, input_len1, input_len2 = process_batch(
                        sent1s, sent2s, voc1, voc2, device, voc1.id2w[pad_idx])

                    sent1s = idx_to_sents(voc1, sent1_var, no_eos=True)
                    sent2s = idx_to_sents(voc2, sent2_var, no_eos=True)

                    # pdb.set_trace()
                    # for l in range(len(batch['src'])):
                    #     if len(batch['src'][l].split()) != 9:
                    #         print(l)
                    # for eg in range(sent1_var.size(0)):
                    src = sent1_var.transpose(0, 1)
                    src_mask = (src != voc1.w2id['PAD']).unsqueeze(-2)
                    # refs.append([' '.join(sent2s[eg])])
                    refs += [[' '.join(sent2s[i])]
                             for i in range(sent2_var.size(1))]
                    # pdb.set_trace()
                    out = greedy_decode(model, src, src_mask, max_len=60,
                                        start_symbol=voc2.w2id['<s>'],
                                        pad=pad_idx)

                    words = []
                    decoded_words = [[] for i in range(out.size(0))]
                    ends = []
                    # pdb.set_trace()
                    # print("Translation:", end="\t")
                    for z in range(1, out.size(1)):
                        for b in range(len(decoded_words)):
                            sym = voc2.id2w[out[b, z].item()]
                            if b not in ends:
                                if sym == "</s>":
                                    ends.append(b)
                                    continue
                                # print(sym, end=" ")
                                decoded_words[b].append(sym)

                    with open(args.outputs_path + '/outputs.txt', 'a') as f_out:
                        f_out.write('Batch: ' + str(i) + '\n')
                        f_out.write('---------------------------------------\n')
                        for z in range(len(decoded_words)):
                            try:
                                f_out.write('Example: ' + str(z) + '\n')
                                f_out.write('Source: ' + batch['src'][z] + '\n')
                                f_out.write('Target: ' + batch['trg'][z] + '\n')
                                f_out.write('Generated: ' +
                                            stack_to_string(decoded_words[z]) +
                                            '\n' + '\n')
                            except Exception:
                                logger.warning('Exception: Failed to generate')
                                pdb.set_trace()
                                break
                        f_out.write('---------------------------------------\n')

                    hyps += [' '.join(decoded_words[z])
                             for z in range(len(decoded_words))]
                    # hyps.append(stack_to_string(words))
                    error_score += cal_score(decoded_words, batch['trg'])

                    # print()
                    # print("Target:", end="\t")
                    for z in range(1, sent2_var.size(0)):
                        sym = voc2.id2w[sent2_var[z, 0].item()]
                        if sym == "</s>":
                            break
                        # print(sym, end=" ")
                    # print()
                    # break

                val_bleu_epoch = bleu_scorer(refs, hyps)
                print('Epoch: {} Val bleu: {}'.format(epoch, val_bleu_epoch[0]))
                print('Epoch: {} Val Error: {}'.format(
                    epoch, error_score / len(val_dataloader)))

                # if logging_var:
                #     loss_fh.write(loss_str)

                if epoch % 10 == 0:
                    ckpt_path = os.path.join(args.model_path, 'model.pt')
                    logger.info('Saving Checkpoint at: {}'.format(ckpt_path))
                    torch.save(model.state_dict(), ckpt_path)
                    print('Model saved at: {}'.format(ckpt_path))
        else:
            model.load_state_dict(torch.load(args.model_path))
            model.eval()
            # pdb.set_trace()
            # for i, batch in enumerate(val_dataloader):
            #     sent1s = sents_to_idx(voc1, batch['src'], args.max_length)
            #     sent2s = sents_to_idx(voc2, batch['trg'], args.max_length)
            #     sent1_var, sent2_var, input_len1, input_len2 = process_batch(sent1s, sent2s, voc1, voc2, device)
            #     src = sent1_var.transpose(0, 1)[:1]
            #     src_mask = (src != voc1.w2id['PAD']).unsqueeze(-2)
            #     out = greedy_decode(model, src, src_mask, max_len=max_len, start_symbol=voc2.w2id['<s>'])
            #     print("Translation:", end="\t")
            #     for i in range(1, out.size(1)):
            #         sym = voc2.id2w[out[0, i].item()]
            #         if sym == "</s>":
            #             break
            #         print(sym, end=" ")
            #     print()
            #     print("Target:", end="\t")
            #     for i in range(1, sent2_var.size(0)):
            #         sym = voc2.id2w[sent2_var[i, 0].item()]
            #         if sym == "</s>":
            #             break
            #         print(sym, end=" ")
            #     print()
            #     break
    else:
        '''Code for Synthetic Data'''
        vocab_path = os.path.join(args.model_path, 'vocab.p')

        if is_train:
            # pdb.set_trace()
            train_dataloader, val_dataloader = load_data(args, logger)

            logger.debug('Creating Vocab...')
            voc = Syn_Voc()
            voc.create_vocab_dict(args, train_dataloader)
            # TODO: remove later
            voc.add_to_vocab_dict(args, val_dataloader)
            logger.info('Vocab Created with number of words: {}'.format(voc.nwords))

            with open(vocab_path, 'wb') as f:
                pickle.dump(voc, f, protocol=pickle.HIGHEST_PROTOCOL)
            logger.info('Vocab saved at {}'.format(vocab_path))
        else:
            test_dataloader = load_data(args, logger)
            logger.info('Loading Vocab File...')
            with open(vocab_path, 'rb') as f:
                voc = pickle.load(f)
            logger.info('Vocab Files loaded from {}\nNumber of Words: {}'.format(
                vocab_path, voc.nwords))

        # print('Done')
        # TODO: load existing checkpoints here
        # checkpoint = get_latest_checkpoint(args.model_path, logger)

        '''Param Specs'''
        layers = args.layers
        heads = args.heads
        d_model = args.d_model
        d_ff = args.d_ff
        max_len = args.max_length
        dropout = args.dropout
        BATCH_SIZE = args.batch_size
        epochs = args.epochs

        if logging_var:
            meta_fname = os.path.join(args.log_path, 'meta.txt')
            loss_fname = os.path.join(args.log_path, 'loss.txt')
            meta_fh = open(meta_fname, 'w')
            loss_fh = open(loss_fname, 'w')
            print('Log Files created at: {}'.format(args.log_path))
            write_meta(args, meta_fh)

        """
        stime = time.time()
        print('Loading Data...')
        train, val, test, SRC, TGT = build_data()
        etime = (time.time() - stime) / 60
        print('Data Loaded\nTime Taken: {}'.format(etime))
        """

        pad_idx = voc.w2id['PAD']
        model = make_model(voc.nwords, voc.nwords, N=layers, h=heads,
                           d_model=d_model, d_ff=d_ff, dropout=dropout)
        model.to(device)
        logger.info('Initialized Model')
        criterion = LabelSmoothing(size=voc.nwords, padding_idx=pad_idx,
                                   smoothing=0.1)
        criterion.to(device)

        # train_iter = MyIterator(train, batch_size=BATCH_SIZE, device=device,
        #                         repeat=False,
        #                         sort_key=lambda x: (len(x.src), len(x.trg)),
        #                         batch_size_fn=batch_size_fn, train=True)
        # valid_iter = MyIterator(val, batch_size=BATCH_SIZE, device=device,
        #                         repeat=False,
        #                         sort_key=lambda x: (len(x.src), len(x.trg)),
        #                         batch_size_fn=batch_size_fn, train=False)

        if mode == 'train':
            model_opt = NoamOpt(
                model.src_embed[0].d_model, 1, 3000,
                torch.optim.Adam(model.parameters(), lr=0,
                                 betas=(0.9, 0.98), eps=1e-9))

            max_bleu_score = 0.0
            min_error_score = 100.0
            epoch_offset = 0

            logger.info('Starting Training Procedure')
            for epoch in range(epochs):
                # pdb.set_trace()
                # if epoch % 3 == 0:
                print('Training Epoch: ', epoch)
                model.train()
                start_time = time.time()
                run_epoch((rebatch(args, device, voc, voc, pad_idx, b)
                           for b in train_dataloader),
                          model,
                          LossCompute(model.generator, criterion,
                                      device=device, opt=model_opt))
                time_taken = (time.time() - start_time) / 60.0
                logger.debug('Training for epoch {} completed...\nTime Taken: {}'.format(
                    epoch, time_taken))
                logger.debug('Starting Validation')

                model.eval()
                # loss = run_epoch((rebatch(args, device, voc1, voc2, pad_idx, b) for b in val_dataloader),
                #                  model,
                #                  LossCompute(model.generator, criterion, device=device, opt=None))
                # loss_str = "Epoch: {} \t Val Loss: {}\n".format(epoch, loss)
                # print(loss_str)

                refs = []
                hyps = []
                error_score = 0
                for i, batch in enumerate(val_dataloader):
                    sent1s = sents_to_idx(voc, batch['src'], args.max_length)
                    sent2s = sents_to_idx(voc, batch['trg'], args.max_length)
                    sent1_var, sent2_var, input_len1, input_len2 = process_batch(
                        sent1s, sent2s, voc, voc, device, voc.id2w[pad_idx])

                    sent1s = idx_to_sents(voc, sent1_var, no_eos=True)
                    sent2s = idx_to_sents(voc, sent2_var, no_eos=True)

                    # pdb.set_trace()
                    # for l in range(len(batch['src'])):
                    #     if len(batch['src'][l].split()) != 9:
                    #         print(l)
                    # for eg in range(sent1_var.size(0)):
                    src = sent1_var.transpose(0, 1)

                    ### FOR NON-DIRECTIONAL ###
                    # src_mask = (src != voc.w2id['PAD']).unsqueeze(-2)
                    ### FOR DIRECTIONAL ###
                    src_mask = make_std_mask(src, pad_idx)
                    src_mask_bi = make_bi_std_mask(src, pad_idx)
                    src_mask_dec = (src != voc.w2id['PAD']).unsqueeze(-2)

                    # refs.append([' '.join(sent2s[eg])])
                    # refs += [[' '.join(sent2s[i])] for i in range(sent2_var.size(1))]
                    refs += [[x] for x in batch['trg']]

                    out = greedy_decode(model, src, src_mask, max_len=max_len,
                                        start_symbol=voc.w2id['<s>'],
                                        pad=pad_idx,
                                        src_mask_dec=src_mask_dec,
                                        src_mask_bi=src_mask_bi)

                    words = []
                    decoded_words = [[] for i in range(out.size(0))]
                    ends = []
                    # pdb.set_trace()
                    # print("Translation:", end="\t")
                    for z in range(1, out.size(1)):
                        for b in range(len(decoded_words)):
                            sym = voc.id2w[out[b, z].item()]
                            if b not in ends:
                                if sym == "</s>":
                                    ends.append(b)
                                    continue
                                # print(sym, end=" ")
                                decoded_words[b].append(sym)

                    with open(args.outputs_path + '/outputs.txt', 'a') as f_out:
                        f_out.write('Batch: ' + str(i) + '\n')
                        f_out.write('---------------------------------------\n')
                        for z in range(len(decoded_words)):
                            try:
                                f_out.write('Example: ' + str(z) + '\n')
                                f_out.write('Source: ' + batch['src'][z] + '\n')
                                f_out.write('Target: ' + batch['trg'][z] + '\n')
                                f_out.write('Generated: ' +
                                            stack_to_string(decoded_words[z]) +
                                            '\n' + '\n')
                            except Exception:
                                logger.warning('Exception: Failed to generate')
                                pdb.set_trace()
                                break
                        f_out.write('---------------------------------------\n')

                    hyps += [' '.join(decoded_words[z])
                             for z in range(len(decoded_words))]
                    # hyps.append(stack_to_string(words))
                    if args.ap:
                        error_score += cal_score_AP(decoded_words, batch['trg'])
                    else:
                        error_score += cal_score(decoded_words, batch['trg'])

                    # print()
                    # print("Target:", end="\t")
                    for z in range(1, sent2_var.size(0)):
                        sym = voc.id2w[sent2_var[z, 0].item()]
                        if sym == "</s>":
                            break
                        # print(sym, end=" ")
                    # print()
                    # break

                if (error_score / len(val_dataloader)) < min_error_score:
                    min_error_score = error_score / len(val_dataloader)

                val_bleu_epoch = bleu_scorer(refs, hyps)
                if max_bleu_score < val_bleu_epoch[0]:
                    max_bleu_score = val_bleu_epoch[0]

                logger.info('Epoch: {} Val bleu: {}'.format(epoch, val_bleu_epoch[0]))
                logger.info('Maximum Bleu: {}'.format(max_bleu_score))
                logger.info('Epoch: {} Val Error: {}'.format(
                    epoch, error_score / len(val_dataloader)))
                logger.info('Minimum Error: {}'.format(min_error_score))

                # if logging_var:
                #     loss_fh.write(loss_str)

                if epoch % 5 == 0:
                    ckpt_path = os.path.join(args.model_path, 'model.pt')
                    logger.info('Saving Checkpoint at: {}'.format(ckpt_path))
                    torch.save(model.state_dict(), ckpt_path)
                    print('Model saved at: {}'.format(ckpt_path))

            store_results(args, max_bleu_score, min_error_score)
            logger.info('Scores saved at {}'.format(args.result_path))
        else:
            model.load_state_dict(torch.load(args.model_path))
            model.eval()
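# For reference: the NoamOpt wrapper used above implements the schedule from
# "Attention Is All You Need", lr = d_model**-0.5 * min(step**-0.5,
# step * warmup**-1.5), with warmup set to 2000 and 3000 steps in the two
# branches; the lr=0 passed to Adam is a placeholder that the schedule
# overwrites on every step.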
# Made with ❤️ in Basel
#
# Copyright (c) 2019 University of Basel
# Last modified 16/07/2019, 12:55.
#
# Developed by Tom Cinbis and Tim Königl on 16/07/2019, 13:03

from datetime import datetime
from typing import Dict

from src.core.base_classes import Package
from src.utils import logger as log
from src.utils.config import Catalog, PackageState, JiraLane, Present
from src.utils.config import conf

logger = log.get_logger(__file__)


class Promoter:
    """
    The main class for the promotion logic of the program. The munki and jira
    packages are compared, and the munki packages are updated according to
    the state of the jira packages. Additionally, automatic catalog
    transitions are performed if the right criteria are fulfilled.
    """

    def __init__(self, munki_packages: Dict, jira_packages: Dict):
        """
        Initializes a promoter object which contains the munki and the jira
        packages.

        :param munki_packages: `Dict` the munki packages
#!/usr/bin/env python
import faust

from src.config import Config
from src.utils.logger import get_logger

logger = get_logger('app')
config = Config()

app = faust.App('compute',
                broker=config.KAFKA_BROKER_URL,
                debug=True,
                web_port=config.WEB_PORT,
                autodiscover=True,
                origin='src')
config.init_app(app)

if __name__ == '__main__':
    app.main()
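# A faust app defined this way is normally started as a worker process; a
# typical invocation (assuming the module is importable as src.app) would be:
#
#     faust -A src.app worker -l info
#
# Running the file directly instead defers to app.main(), which parses the
# same command-line interface.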
def _get_logger(log_file_name, file_dir, config):
    if log_file_name:
        return get_logger(log_file_name, file_dir, config)
    return None
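# Usage sketch for the helper above (paths and the config object are
# hypothetical): a falsy log_file_name disables logging, so call sites can
# guard with `if logger is not None:` instead of re-checking configuration.
train_logger = _get_logger("train.log", "./logs", config)  # real logger
debug_logger = _get_logger(None, "./logs", config)         # None: logging disabled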
def main():
    # =========================================
    # === Settings
    # =========================================
    # get logger
    logger = get_logger(out_file="ensemble.log")
    logger.info("=== file path ===")

    # set model
    oof_1_path = "./data/output/20190930_hmdhmd/20190930_hmdhmd_oof.csv"
    pred_1_path = "./data/output/20190930_hmdhmd/20190930_hmdhmd_pred.csv"
    logger.info(f"hmd model - oof: {oof_1_path}")
    logger.info(f"hmd model - pred: {pred_1_path}")

    oof_2_path = "./data/output/20191001_ML_Bear/OOF_20190930_ModelAvg_based_on_LB09578_20190930_03_full_model_01_oof09565_pub09886_pri09882.csv"
    pred_2_path = "./data/output/20191001_ML_Bear/PRED_20190930_ModelAvg_based_on_LB09578_20190930_03_full_model_01_oof09565_pub09886_pri09882.csv"
    logger.info(f"bear model - oof: {oof_2_path}")
    logger.info(f"bear model - pred: {pred_2_path}")

    oof_3_path = "./data/output/model_25/oof_preds.npy"
    pred_3_path = "./data/output/model_25/submission.csv"
    logger.info(f"hakubishin model - oof: {oof_3_path}")
    logger.info(f"hakubishin model - pred: {pred_3_path}")

    oof_4_path = "./data/output/20190930_holygo/20190930_2032__train_oof_holygo_CV0-9592479__LB0.9594.csv"
    pred_4_path = "./data/output/20190930_holygo/20190930_2032__test_pred_holygo_CV0-9592479__LB0.9594.csv"
    logger.info(f"holygo model - oof: {oof_4_path}")
    logger.info(f"holygo model - pred: {pred_4_path}")

    # load data
    oof_1 = pd.read_csv(oof_1_path).sort_values("TransactionID")["isFraud"].values
    oof_2 = pd.read_csv(oof_2_path).sort_values("TransactionID")["isFraud"].values
    oof_3 = np.load(oof_3_path)
    oof_4 = pd.read_csv(oof_4_path).sort_values("TransactionID").iloc[:len(oof_3)]["isFraud"].values
    pred_1 = pd.read_csv(pred_1_path).sort_values("TransactionID").reset_index(drop=True)
    pred_2 = pd.read_csv(pred_2_path).sort_values("TransactionID").reset_index(drop=True)
    pred_3 = pd.read_csv(pred_3_path).sort_values("TransactionID").reset_index(drop=True)
    pred_4 = pd.read_csv(pred_4_path).sort_values("TransactionID").reset_index(drop=True)

    # =========================================
    # === data loading
    # =========================================
    train = pd.read_csv('./data/input/train.csv')
    # test = pd.read_csv('./data/input/test.csv')
    y_train = train["isFraud"].values

    # =========================================
    # === check score
    # =========================================
    logger.info("=== check score ===")

    def calc_bear_score(df):
        df_probing = pd.read_csv('data/interim/probing_toolbox/old/probing.csv').loc[:, ['TransactionID', 'data_type', 'Probing_isFraud']]
        df = pd.merge(df_probing, df, on='TransactionID', how='left')

        # test public score
        public_score = roc_auc_score(
            df[df.data_type == "test_public"]['Probing_isFraud'],
            df[df.data_type == "test_public"]['isFraud']
        )
        # test private score
        private_score = roc_auc_score(
            df[df.data_type == "test_private"]['Probing_isFraud'],
            df[df.data_type == "test_private"]['isFraud']
        )
        return public_score, private_score

    cv = roc_auc_score(y_train, oof_1)
    pub, prv = calc_bear_score(pred_1)
    logger.info(f"hmd model: cv{cv}, pub{pub}, prv{prv}")

    cv = roc_auc_score(y_train, oof_2)
    pub, prv = calc_bear_score(pred_2)
    logger.info(f"bear model: cv{cv}, pub{pub}, prv{prv}")

    cv = roc_auc_score(y_train, oof_3)
    pub, prv = calc_bear_score(pred_3)
    logger.info(f"hakubishin model: cv{cv}, pub{pub}, prv{prv}")

    cv = roc_auc_score(y_train, oof_4)
    pub, prv = calc_bear_score(pred_4)
    logger.info(f"holygo model: cv{cv}, pub{pub}, prv{prv}")

    # =========================================
    # === user info
    # =========================================
    logger.info("=== user info ===")
    thres = 2
    logger.info(f"user count thres: {thres}")

    predicted_user = pd.read_csv('./data/interim/20190901_user_ids_share.csv').sort_values("TransactionID").reset_index(drop=True)
    user_count = predicted_user["predicted_user_id"].value_counts()
    target_user_id = user_count[user_count <= thres].index.tolist()

    train_predicted_user = predicted_user.iloc[:len(oof_3)]
    train_target_df = train_predicted_user.query("predicted_user_id in @target_user_id")
    train_target_index = train_target_df.index

    cv = roc_auc_score(y_train[train_target_index], oof_1[train_target_index])
    logger.info(f"hmd model: cv{cv}")
    cv = roc_auc_score(y_train[train_target_index], oof_2[train_target_index])
    logger.info(f"bear model: cv{cv}")
    cv = roc_auc_score(y_train[train_target_index], oof_3[train_target_index])
    logger.info(f"hakubishin model: cv{cv}")
    cv = roc_auc_score(y_train[train_target_index], oof_4[train_target_index])
    logger.info(f"holygo model: cv{cv}")

    # =========================================
    # === hand made
    # =========================================
    logger.info("=== hand made ===")
    sub = pred_3.copy()
    # x_opt = [0.10, 0.25, 0.55, 0.10]
    x_opt = [0.050, 0.226, 0.6725, 0.0515]
    logger.info(f"rate: {x_opt}")

    oof = oof_1 * x_opt[0] + oof_2 * x_opt[1] + oof_3 * x_opt[2] + oof_4 * x_opt[3]
    cv = roc_auc_score(y_train[train_target_index], oof[train_target_index])
    logger.info(f"ensemble model: cv{cv}")

    sub["isFraud"] = pred_1["isFraud"] * x_opt[0] + pred_2["isFraud"] * x_opt[1] \
        + pred_3["isFraud"] * x_opt[2] + pred_4["isFraud"] * x_opt[3]
    pub, prv = calc_bear_score(sub)
    logger.info(f"ensemble model: pub{pub}, prv{prv}")
    sub.to_csv("sub_avg.csv", header=True, index=False)

    import pdb; pdb.set_trace()

    # override probing value and save
    df_probing = pd.read_csv('data/interim/probing_toolbox/20190929_probing.csv').loc[:, ['TransactionID', 'data_type', 'Probing_isFraud']]
    sub = pd.merge(sub, df_probing, on="TransactionID", how="left")
    # override only probing_isfraud = 1
    sub.loc[sub.Probing_isFraud == 1, "isFraud"] = 1
    sub = sub[["TransactionID", "isFraud"]]
    pub, prv = calc_bear_score(sub)
    logger.info(f"ensemble model after overriding probing value: pub{pub}, prv{prv}")
    sub.to_csv("sub_avg.csv", header=True, index=False)

    # =========================================
    # === optimize
    # =========================================
    logger.info("=== optimize ===")
    sub = pred_3.copy()

    def f(x):
        x0 = x[:, 0]
        x1 = x[:, 1]
        x2 = x[:, 2]
        x3 = x[:, 3]
        sub["isFraud"] = pred_1["isFraud"] * x0 + pred_2["isFraud"] * x1 \
            + pred_3["isFraud"] * x2 + pred_4["isFraud"] * x3
        public_score, private_score = calc_bear_score(sub)
        oof = oof_1 * x0 + oof_2 * x1 + oof_3 * x2 + oof_4 * x3
        cv = roc_auc_score(y_train[train_target_index], oof[train_target_index])
        opt_value = -1 * private_score
        # opt_value = -1 * (private_score + public_score + cv)
        return opt_value

    bounds = [
        {'name': 'x0', 'type': 'continuous', 'domain': (0.05, 1)},
        {'name': 'x1', 'type': 'continuous', 'domain': (0.05, 1)},
        {'name': 'x2', 'type': 'continuous', 'domain': (0.05, 1)},
        {'name': 'x3', 'type': 'continuous', 'domain': (0.05, 1)},
    ]
    constraints = [
        {
            'name': 'constr_1',
            'constraint': '(x[:,0] + x[:,1] + x[:,2] + x[:,3]) - 1 - 0.001'
        },
        {
            'name': 'constr_2',
            'constraint': '1 - (x[:,0] + x[:,1] + x[:,2] + x[:,3]) - 0.001'
        }
    ]
    myBopt = GPyOpt.methods.BayesianOptimization(f=f, domain=bounds,
                                                 constraints=constraints)
    myBopt.run_optimization(max_iter=30)
    logger.info(f"rate: {myBopt.x_opt}")
    logger.info(f"value: {myBopt.fx_opt}")

    # check oof
    oof = oof_1 * myBopt.x_opt[0] + oof_2 * myBopt.x_opt[1] \
        + oof_3 * myBopt.x_opt[2] + oof_4 * myBopt.x_opt[3]
    cv = roc_auc_score(y_train[train_target_index], oof[train_target_index])
    logger.info(f"ensemble model: cv{cv}")

    # make submission file
    sub = pred_3.copy()
    sub["isFraud"] = pred_1["isFraud"] * myBopt.x_opt[0] + pred_2["isFraud"] * myBopt.x_opt[1] \
        + pred_3["isFraud"] * myBopt.x_opt[2] + pred_4["isFraud"] * myBopt.x_opt[3]
    pub, prv = calc_bear_score(sub)
    logger.info(f"ensemble model: pub{pub}, prv{prv}")

    import pdb; pdb.set_trace()

    # override probing value and save
    df_probing = pd.read_csv('data/interim/probing_toolbox/20190929_probing.csv').loc[:, ['TransactionID', 'data_type', 'Probing_isFraud']]
    sub = pd.merge(sub, df_probing, on="TransactionID", how="left")
    # override only probing_isfraud = 1
    sub.loc[sub.Probing_isFraud == 1, "isFraud"] = 1
    sub = sub[["TransactionID", "isFraud"]]
    pub, prv = calc_bear_score(sub)
    logger.info(f"ensemble model after overriding probing value: pub{pub}, prv{prv}")
    sub.to_csv("sub_avg.csv", header=True, index=False)
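# Note on the GPyOpt constraints above: GPyOpt treats each constraint string
# as a function c(x) that must satisfy c(x) <= 0, so the pair encodes
# sum(x) - 1 <= 0.001 and 1 - sum(x) <= 0.001, i.e. the four blend weights
# are forced to sum to 1 within a 0.001 tolerance.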
def parse_args():
    parser = configargparse.ArgumentParser(
        description='Training Wikinet 2',
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter)

    # General
    general = parser.add_argument_group('General Settings.')
    general.add_argument('--my-config', required=True, is_config_file=True,
                         help='config file path')
    general.add_argument('--exp_name', type=str, default="debug",
                         help="Experiment name")
    general.add_argument("--debug", type=str2bool, default=True,
                         help="whether to debug")

    # Data
    data = parser.add_argument_group('Data Settings.')
    data.add_argument('--data_path', required=True, type=str,
                      help='location of data dir')
    data.add_argument(
        '--data_type', type=str,
        help='name of train dataset, a directory of this name should contain '
             'generated training data using gen_train_data.py')
    data.add_argument('--train_size', type=int,
                      help='number of training abstracts')
    data.add_argument('--data_types', type=str,
                      help='name of datasets separated by comma')

    # Max Padding
    padding = parser.add_argument_group('Max Padding for batch.')
    padding.add_argument('--max_context_size', type=int,
                         help='max number of context')
    padding.add_argument('--max_ent_size', type=int,
                         help='max number of entities considered in abstract')

    # Model Type
    model_selection = parser.add_argument_group('Type of model to train.')
    model_selection.add_argument(
        '--pre_train', type=str,
        help='if specified, model will load state dict, must be ckpt')

    # Model params
    model_params = parser.add_argument_group("Parameters for chosen model.")
    model_params.add_argument('--dp', type=float, help='drop out')
    model_params.add_argument('--hidden_size', type=int,
                              help='size of hidden layer in yamada model')

    # Candidate Generation
    candidate = parser.add_argument_group('Candidate generation.')
    candidate.add_argument("--num_candidates", type=int, default=32,
                           help="Total number of candidates")
    candidate.add_argument("--prop_gen_candidates", type=float, default=0.5,
                           help="Proportion of candidates generated")

    # Training
    training = parser.add_argument_group("Training parameters.")
    training.add_argument("--num_epochs", type=int, default=5,
                          help="Number of epochs")
    training.add_argument("--save_every", type=int, default=5,
                          help="how often to checkpoint")
    training.add_argument("--patience", type=int, default=5,
                          help="Patience for early stopping")
    training.add_argument("--batch_size", type=int, default=32,
                          help="Batch size")
    training.add_argument("--num_workers", type=int, default=4,
                          help="number of workers for data loader")
    training.add_argument('--lr', type=float, help='learning rate')
    training.add_argument('--wd', type=float, help='weight decay')
    training.add_argument('--embs_optim', type=str,
                          choices=['adagrad', 'adam', 'rmsprop', 'sparseadam'],
                          help='optimizer for embeddings')
    training.add_argument(
        '--other_optim', type=str, choices=['adagrad', 'adam', 'rmsprop'],
        help='optimizer for parameters that are not embeddings')
    training.add_argument('--sparse', type=str2bool, help='sparse gradients')

    # cuda
    parser.add_argument("--device", type=str, help="cuda device")
    parser.add_argument("--use_cuda", type=str2bool, help="use gpu or not")
    parser.add_argument("--profile", type=str2bool,
                        help="if set will run profiler on dataloader and exit")

    args = parser.parse_args()
    logger = get_logger(args)

    if args.wd > 0:
        assert not args.sparse

    if args.use_cuda:
        devices = args.device.split(",")
        if len(devices) > 1:
            devices = tuple([int(device) for device in devices])
        else:
            devices = int(devices[0])
        args.__dict__['device'] = devices

    logger.info("Experiment Parameters:")
    print()
    for arg in sorted(vars(args)):
        logger.info('{:<15}\t{}'.format(arg, getattr(args, arg)))

    model_date_dir = join(args.data_path, 'models',
                          '{}'.format(datetime.now().strftime("%Y_%m_%d")))
    if not os.path.exists(model_date_dir):
        os.makedirs(model_date_dir)
    model_dir = join(model_date_dir, args.exp_name)
    args.__dict__['model_dir'] = model_dir
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    return args, logger, model_dir
import json
import os

import aiohttp
import numpy as np

from src.app import app, config
from src.models.faust_dao import State, CalculationItem
from src.utils.logger import get_logger

logger = get_logger('zz-compute-agents')


@app.agent(config.topics['model-tasks-do'])
async def compute_agent(events):
    async for event in events:
        task = event.task
        print(f'MYAGENT RECEIVED -- {task!r}')
        task.state = State.IN_PROGRESS.value
        await config.topics['model-tasks-done'].send(value=task)
        try:
            async with aiohttp.ClientSession() as session:
                compute_closure = compute(session, task)
                if not event.runner_code:
                    outputs = await compute_closure(task.data)
                else:
                    _locals = {'klass': None}
                    exec(event.runner_code, {
                        '__builtins__': __builtins__,
                        'np': np
import signal

SIGNALS_TO_NAMES_DICT = dict((getattr(signal, n), n)
                             for n in dir(signal)
                             if n.startswith('SIG') and '_' not in n)


def receive_signal(signum, stack):
    if signum in [1, 2, 3, 15]:
        print('Caught signal %s (%s), exiting.'
              % (SIGNALS_TO_NAMES_DICT[signum], str(signum)))
    else:
        print('Caught signal %s (%s), ignoring.'
              % (SIGNALS_TO_NAMES_DICT[signum], str(signum)))


LOGGER = logger.get_logger('main-thread')
GLOBAL_CONFIG = {}


def predict(learner, episode_queue, total_task=200, res='cpu', kargs={}):
    time1 = time.time()
    if 'time_fired' in kargs:
        # Use %-style lazy formatting; passing extra positional args without
        # placeholders in the message breaks logging's formatting.
        LOGGER.debug('%s fired time %s', kargs['taskid'],
                     time1 - kargs['time_fired'])
    # LOGGER.info('task id', kargs['taskid'], 'device', res)
    device = torch.device(res)
    learner.to(device)
    result = []
    for i in range(total_task):
        if isinstance(episode_queue, list):
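# The handler above is only defined; the excerpt never installs it. Wiring it
# up would typically look like this (assumed usage, not shown in the source);
# signals 1, 2, 3 and 15 are SIGHUP, SIGINT, SIGQUIT and SIGTERM on POSIX.
for sig in (signal.SIGHUP, signal.SIGINT, signal.SIGQUIT, signal.SIGTERM):
    signal.signal(sig, receive_signal)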
def main():
    '''read arguments'''
    parser = build_parser()
    args = parser.parse_args()
    config = args

    mode = config.mode
    if mode == 'train':
        is_train = True
    else:
        is_train = False

    '''Set seed for reproducibility'''
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    random.seed(config.seed)

    '''GPU initialization'''
    device = gpu_init_pytorch(config.gpu)
    # device = 'cpu'

    '''Run Config files/paths'''
    run_name = config.run_name
    config.log_path = os.path.join(log_folder, run_name)
    config.model_path = os.path.join(model_folder, run_name)
    config.board_path = os.path.join(board_path, run_name)

    vocab_path = os.path.join(config.model_path, 'vocab.p')
    config_file = os.path.join(config.model_path, 'config.p')
    log_file = os.path.join(config.log_path, 'log.txt')

    if config.results:
        config.result_path = os.path.join(
            result_folder, 'val_results_{}.json'.format(config.dataset))

    if is_train:
        create_save_directories(config.log_path, config.model_path)
    else:
        create_save_directories(config.log_path, config.result_path)

    logger = get_logger(run_name, log_file, logging.DEBUG)
    writer = SummaryWriter(config.board_path)

    logger.debug('Created Relevant Directories')
    logger.info('Experiment Name: {}'.format(config.run_name))

    '''Read Files and create/load Vocab'''
    if is_train:
        logger.debug('Creating Vocab and loading Data ...')
        train_loader, val_loader_bins, voc = load_data(config, logger)
        logger.info('Vocab Created with number of words: {}'.format(voc.nwords))
        with open(vocab_path, 'wb') as f:
            pickle.dump(voc, f, protocol=pickle.HIGHEST_PROTOCOL)
        logger.info('Vocab saved at {}'.format(vocab_path))
    else:
        logger.info('Loading Vocab File...')
        with open(vocab_path, 'rb') as f:
            voc = pickle.load(f)
        logger.info('Vocab Files loaded from {}'.format(vocab_path))
        logger.info("Loading Test Dataloaders...")
        config.batch_size = 1
        test_loader_bins = load_data(config, logger, voc)
        logger.info("Done loading test dataloaders")

    # print('Done')
    # TODO: load existing checkpoints here

    if is_train:
        max_val_acc = 0.0
        epoch_offset = 0
        if config.load_model:
            checkpoint = get_latest_checkpoint(config.model_path, logger)
            if checkpoint:
                ckpt = torch.load(checkpoint,
                                  map_location=lambda storage, loc: storage)
                # config.lr = checkpoint['lr']
                model = build_model(config=config, voc=voc, device=device,
                                    logger=logger)
                model.load_state_dict(ckpt['model_state_dict'])
                model.optimizer.load_state_dict(ckpt['optimizer_state_dict'])
        else:
            model = build_model(config=config, voc=voc, device=device,
                                logger=logger)

        # pdb.set_trace()
        logger.info('Initialized Model')

        with open(config_file, 'wb') as f:
            pickle.dump(vars(config), f, protocol=pickle.HIGHEST_PROTOCOL)
        logger.debug('Config File Saved')

        logger.info('Starting Training Procedure')
        train_model(model, train_loader, val_loader_bins, voc, device,
                    config, logger, epoch_offset, max_val_acc, writer)
    else:
        gpu = config.gpu
        with open(config_file, 'rb') as f:
            bias = config.bias
            extraffn = config.extraffn
            config = AttrDict(pickle.load(f))
            config.gpu = gpu
            config.bins = len(test_loader_bins)
            config.batch_size = 1
            config.bias = bias
            config.extraffn = extraffn
            # TODO: remove it later
            # config.num_labels = 2

        model = build_model(config=config, voc=voc, device=device,
                            logger=logger)
        checkpoint = get_latest_checkpoint(config.model_path, logger)
        ep_offset, train_loss, score, voc = load_checkpoint(
            model, config.mode, checkpoint, logger, device, bins=config.bins)

        logger.info('Prediction from')
        od = OrderedDict()
        od['epoch'] = ep_offset
        od['train_loss'] = train_loss
        if config.bins != -1:
            for i in range(config.bins):
                od['max_val_acc_bin{}'.format(i)] = score[i]
        else:
            od['max_val_acc'] = score
        print_log(logger, od)

        pdb.set_trace()
        # test_acc_epoch, test_loss_epoch = run_validation(config, model, test_loader, voc, device, logger)
        # test_analysis_dfs = []
        for i in range(config.bins):
            test_acc_epoch, test_analysis_df = run_test(
                config, model, test_loader_bins[i], voc, device, logger)
            logger.info('Bin {} Accuracy: {}'.format(i, test_acc_epoch))
            # test_analysis_dfs.append(test_analysis_df)
            test_analysis_df.to_csv(os.path.join(
                result_folder,
                '{}_{}_test_analysis_bin{}.csv'.format(
                    config.dataset, config.model_type, i)))
        logger.info("Analysis results written to {}...".format(result_folder))
from os.path import join

import numpy as np
import pandas as pd
from sklearn import clone
from sklearn.model_selection import GridSearchCV, GroupKFold

from src.evaluation.classification import evaluate_fold
from src.models.base import ModelBase
from src.utils.logger import get_logger
from src.utils.misc import randomised_order

logger = get_logger(__name__)

__all__ = [
    'sklearn_model',
]


def select_fold(key, folds, fold_name):
    assert fold_name in folds.columns
    fold_def = folds[fold_name]
    fold_vals = set(np.unique(fold_def.values))
    assert fold_vals.issubset({'train', 'val', 'test'})
    return fold_def


def learn_sklearn_model(key, index, features, targets, fold_def, model, n_splits):
    assert index.shape[0] == features.shape[0]
    assert index.shape[0] == targets.shape[0]
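# Shape assumption behind select_fold above: `folds` is a DataFrame with one
# column per fold definition and one row per sample, each cell being 'train',
# 'val' or 'test'. A tiny illustrative call on hypothetical data:
folds = pd.DataFrame({
    'fold_0': ['train', 'train', 'val', 'test'],
    'fold_1': ['val', 'train', 'test', 'train'],
})
fold_def = select_fold(key=None, folds=folds, fold_name='fold_0')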
import os

from src.models.faust_dao import ModelTask, ModelTaskDoEvent
from src.utils.logger import get_logger

logger = get_logger('config')


class Config:
    KAFKA_BROKER_URL = None
    WEB_PORT = None

    def __init__(self):
        """
        Initiates microservice configuration

        :return: configuration object
        """
        self.KAFKA_BROKER_URL = self._set_kafka_url()
        self.WEB_PORT = self._set_web_port()
        self.topics = {
            'model-tasks-do': None,
            'model-tasks-done': None,
            'model-metadata-updates': None
        }
        self.debug_models = {
            'mod-dummy': 'https://mod-dummy-501-zz-test.22ad.bi-x.openshiftapps.com'
                         '/v1/models/mod-dummy:predict',
            'mod-text-class': 'https://mod-text-class-501-zz-test.22ad.bi-x.'
                              'openshiftapps.com/v1/models/mod-text-class:predict'