def main(args):
    """Build the dataloaders and a VDCNN from ``args`` and pass them to ``run_model``.

    The torch RNG is seeded with 0 first. The train/test CSV paths are formed
    by plain string concatenation of ``args.dataset_path`` with ``train.csv``
    and ``test.csv``, so the path must already end with a separator.
    The network is built from ``args.depth`` and ``args.num_class``, and
    ``args.num_epochs`` is forwarded to ``run_model``.
    """
    torch.manual_seed(0)

    train_csv, test_csv = (
        args.dataset_path + csv_name for csv_name in ('train.csv', 'test.csv')
    )
    loaders = make_dataloader(train_csv, test_csv)
    network = VDCNN(depth=args.depth, num_class=args.num_class)
    run_model(network, loaders, args.num_epochs)
def run_reranking(new_index, sentence_in, qid, specific_ws, ref_doc, out_index,
                  texts, new_trectext_name, ranked_lists, new_feature_file,
                  feature_dir, trec_file, score_file, options):
    """Rebuild one document, re-run the ranking pipeline and return the ordered result.

    Steps, in order:
      1. ``update_text_doc`` produces a new text for ``texts[ref_doc]`` from
         ``sentence_in`` and ``out_index``; ``create_new_trectext`` and
         ``create_specifi_ws`` are then called with it.
      2. ``create_index`` is called for ``new_index`` (split into directory
         and base name).
      3. ``create_features_file_diff`` produces the features file, from which
         ``create_index_to_doc_name_dict`` builds the docname index.
      4. ``run_model`` is called with ``score_file`` and ``options.model``;
         its return value is what ``retrieve_scores`` reads.
      5. ``create_trec_eval_file`` then ``order_trec_file`` produce the
         returned value.

    Paths and tool locations come from attributes of ``options``.
    """
    # Step 1: rewrite the reference document and the per-query working set.
    updated_doc = update_text_doc(texts[ref_doc], sentence_in, out_index)
    create_new_trectext(ref_doc, texts, updated_doc, new_trectext_name)
    create_specifi_ws(qid, ranked_lists, specific_ws)

    # Step 2: index the new trectext file.
    logger.info("creating features")
    index_dir = os.path.dirname(new_index)
    index_name = os.path.basename(new_index)
    create_index(
        new_trectext_name,
        index_dir,
        index_name,
        options.home_path,
        options.indri_path,
    )

    # Step 3: features and the docname lookup derived from them.
    feature_path = create_features_file_diff(
        feature_dir,
        options.index_path,
        new_index,
        new_feature_file,
        specific_ws,
        options.scripts_path,
        options.java_path,
        options.swig_path,
        options.stopwords_file,
        options.queries_text_file,
        options.home_path,
    )
    logger.info("creating docname index")
    name_by_index = create_index_to_doc_name_dict(feature_path)
    logger.info("docname index creation is completed")
    logger.info("features creation completed")

    # Step 4: score the features with the ranking model.
    logger.info("running ranking model on features file")
    produced_scores = run_model(
        feature_path,
        options.home_path,
        options.java_path,
        options.jar_path,
        score_file,
        options.model,
    )
    logger.info("ranking completed")

    logger.info("retrieving scores")
    doc_scores = retrieve_scores(name_by_index, produced_scores)
    logger.info("scores retrieval completed")

    # Step 5: write and order the trec_eval file.
    logger.info("creating trec_eval file")
    unordered_trec = create_trec_eval_file(doc_scores, trec_file)
    logger.info("trec file creation is completed")

    logger.info("ordering trec file")
    ordered_trec = order_trec_file(unordered_trec)
    logger.info("ranking procedure completed")
    return ordered_trec
def ith_nn_strat(state):
    """Pick a move for ``state`` using the ``i``-th entry of ``self.past_models``.

    ``self`` and ``i`` are not parameters; they are read from the surrounding
    scope. When the selected slot is ``None`` the result of
    ``TTT_random(state)`` is returned instead; otherwise the model is run on
    ``self.process_state(state)`` with ``stochastic=True``.
    """
    past_model = self.past_models[i]
    if past_model is not None:
        processed = self.process_state(state)
        return utils.run_model(past_model, processed, stochastic=True)
    return TTT_random(state)
# main(): run one training/validation/inference experiment described by the
# YAML file at args.config.
#
# What the visible code does, in order:
#   * Reads task, epoch, batchsize, image_size, model, optimizer, loss, lr,
#     n_class and (optionally) lr_scheduler from the loaded config.
#   * args.debug forces EPOCHS = 1 and N_FOLDS = 2, prefixes the result
#     directory name with 'debug-', truncates the training frame to its first
#     1000 rows and suppresses the slack notifications.
#   * Starts an Azure ML run on the 'kaggle-aptos2019' experiment and logs the
#     hyper-parameters to it.
#   * Creates result_dir with os.mkdir and saves a copy of the config there.
#   * Rebinds `config` to the keyword-argument dict passed to
#     utils.run_model(**config); per fold it adds 'df', 'train_index',
#     'valid_index', 'model_path' and, when an Azure run exists, 'azure_run'
#     (the parent run for fold 0, a child run for later folds).
#   * Folds come from StratifiedKFold(n_splits=N_FOLDS, random_state=41,
#     shuffle=True) on the 'diagnosis' column; without args.cv only the first
#     split is used.
#   * task == 'class': predictions are rounded with np.argmax(axis=1).
#     task == 'reg': utils.OptimizedRounder is fitted on the validation
#     predictions and its coefficients are reused for the test predictions.
#   * Validation quality is cohen_kappa_score(..., weights='quadratic').
#   * Test predictions are the average of utils.predict over one saved model
#     per fold, written to result_dir / 'submission.csv'.
#   * KeyboardInterrupt: slack failure notice (non-debug only), not re-raised.
#     Any other Exception: azure_run.fail(e), slack failure notice, re-raised.
#     finally: azure_run.complete() when a run was started.
#
# NOTE(review): the original line breaks and indentation of this function are
#   lost, so it cannot be confirmed from here whether the Workspace /
#   Experiment / azure_run.log(...) statements run unconditionally or only in
#   the `else:` (non-debug) branch of the result_dir selection. The later
#   `if azure_run:` guards suggest the latter — verify before refactoring.
# NOTE(review): tb_writer is never assigned after `tb_writer = None` (the
#   SummaryWriter creation is commented out), so the tb_writer clean-up in
#   `finally` never executes as written.
# NOTE(review): there is no fallback for a task other than 'class' or 'reg';
#   round_valid_preds / round_test_preds would be unbound in that case.
# NOTE(review): the except handlers pass `config` to slack.notify_fail; which
#   object that is (the loaded YAML config or the run-parameter dict) depends
#   on where the failure happened.
def main(): config = utils.load_yaml(args.config) task = config['task'] EPOCHS = config['epoch'] N_FOLDS = 5 BATCH_SIZE = config['batchsize'] IMAGE_SIZE = config['image_size'] model_name = config['model'] optimizer_name = config['optimizer'] loss = config['loss'] lr = float(config['lr']) n_class = config['n_class'] lr_scheduler = config.get('lr_scheduler') azure_run = None tb_writer = None num_workers = 64 experiment_name = datetime.strftime(datetime.now(), '%Y%m%d%H%M%S') print(f'found {torch.cuda.device_count()} gpus !!') try: if args.debug: print('running in debug mode') EPOCHS = 1 N_FOLDS = 2 if args.debug: result_dir = Path(utils.RESULT_DIR) / ('debug-' + experiment_name) else: result_dir = Path(utils.RESULT_DIR) / experiment_name ws = Workspace.from_config('.aml_config/config.json') exp = Experiment(workspace=ws, name='kaggle-aptos2019') azure_run = exp.start_logging() azure_run.log('experiment name', experiment_name) azure_run.log('epoch', EPOCHS) azure_run.log('batch size', BATCH_SIZE) azure_run.log('image size', IMAGE_SIZE) azure_run.log('model', model_name) azure_run.log('optimizer', optimizer_name) azure_run.log('loss_name', loss['name']) azure_run.log('lr', lr) azure_run.log('lr_scheduler', lr_scheduler) azure_run.log('task', task) if args.cv: azure_run.log('cv', N_FOLDS) else: azure_run.log('cv', 0) if args.multi: print('use multi gpu !!') os.mkdir(result_dir) print(f'created: {result_dir}') utils.save_yaml(result_dir / Path(args.config).name, config) # if not args.debug: # tb_writer = SummaryWriter(log_dir=result_dir) device = torch.device("cuda:0") config = { 'epochs': EPOCHS, 'multi': args.multi, 'batch_size': BATCH_SIZE, 'image_size': IMAGE_SIZE, 'model_name': model_name, 'n_class': n_class, 'optimizer_name': optimizer_name, 'loss': loss, 'lr': lr, 'lr_scheduler': lr_scheduler, 'task': task, 'device': device, 'num_workers': num_workers, } print(config) if not args.debug: slack.notify_start(experiment_name, config) train_df = 
pd.read_csv(utils.TRAIN_CSV_PATH) if args.debug: train_df = train_df[:1000] config['df'] = train_df skf = StratifiedKFold(n_splits=N_FOLDS, random_state=41, shuffle=True) indices = list(skf.split(train_df, train_df['diagnosis'])) if not args.cv: print('do not use cross validation') indices = [indices[0]] # cross validation oof_preds = np.zeros((len(train_df), n_class)) for i_fold, (train_index, valid_index) in tqdm(enumerate(indices)): model_path = result_dir / f'model_fold{i_fold}' config['train_index'] = train_index config['valid_index'] = valid_index config['model_path'] = str(model_path) if azure_run: if i_fold == 0: config['azure_run'] = azure_run y_pred, y_true = utils.run_model(**config) else: with azure_run.child_run() as child: config['azure_run'] = child y_pred, y_true = utils.run_model(**config) else: y_pred, y_true = utils.run_model(**config) if args.cv: oof_preds[valid_index] = y_pred if args.cv: valid_preds = oof_preds valid_true = train_df['diagnosis'] else: valid_preds = y_pred valid_true = y_true if task == 'class': round_valid_preds = np.argmax(valid_preds, axis=1) elif task == 'reg': print('optimizing threshold ...') optR = utils.OptimizedRounder() optR.fit(valid_preds, valid_true) coef = optR.coefficients() print(f'best coef: {coef}') if azure_run: azure_run.log('coef', coef) round_valid_preds = optR.predict(valid_preds, coef) val_kappa = cohen_kappa_score(round_valid_preds, valid_true, weights='quadratic') print(f'best val kappa: {val_kappa}') if azure_run: azure_run.log('best val kappa', val_kappa) test_csv = pd.read_csv(utils.TEST_CSV_PATH) #test_tfms = utils.build_transform(size=IMAGE_SIZE, mode='test') test_tfms = utils.build_transform(size=IMAGE_SIZE, mode='val') test_dataset = RetinopathyDataset(df=test_csv, mode='test', transform=test_tfms, auto_crop=True, add_blur=True) test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True, num_workers=num_workers) test_preds = 
np.zeros((len(test_csv), n_class)) for i in range(len(indices)): model = utils.load_pytorch_model(model_name, result_dir / f'model_fold{i}', n_class) test_preds += utils.predict(model, test_loader, n_class=n_class, device=device, tta=1) test_preds /= len(indices) if task == 'class': round_test_preds = np.argmax(test_preds, axis=1) elif task == 'reg': round_test_preds = optR.predict(test_preds, coef) submission_csv = pd.read_csv(utils.SAMPLE_SUBMISSION_PATH) submission_csv['diagnosis'] = round_test_preds submission_csv.to_csv(result_dir / 'submission.csv', index=False) print('finish!!!') if not args.debug: slack.notify_finish(experiment_name, config, val_kappa) except KeyboardInterrupt as e: if not args.debug: slack.notify_fail(experiment_name, config, e.__class__.__name__, str(e)) except Exception as e: if azure_run: azure_run.fail(e) if not args.debug: slack.notify_fail(experiment_name, config, e.__class__.__name__, str(e)) raise finally: if azure_run: azure_run.complete() print('close azure_run') if tb_writer: tb_writer.export_scalars_to_json( os.path.join(result_dir, 'all_scalars.json')) tb_writer.close() print('close tb_writer')
- row: pandas dataframe row Return: A modified pandas dataframe row with calculated columns """ row['approve'] = '0' row['limit'] = '%s' % np.nan return row if __name__ == '__main__': # max client aproving max_approved = 1500 # checking for fraud fraud_pred = run_model(FraudEnsemble, model_suffix='*_fraud_ensemble.bin') fraud_pred.set_index(['ids'], inplace=True) fraud_pred['fraud'] = fraud_pred['fraud'].apply(pd.to_numeric) # checking for default default_pred = run_model(DefaultEnsemble, model_suffix='*_default_ensemble.bin') default_pred.set_index(['ids'], inplace=True) default_pred['default'].astype(float, inplace=True) default_pred['default'] = default_pred['default'].apply(pd.to_numeric) # checking for spend spend_pred = run_model(SpendDTR, model_suffix='*_spend_dtr.bin') spend_pred.set_index(['ids'], inplace=True) spend_pred['spend_score'].astype(int, inplace=True) spend_pred['spend_score'] = spend_pred['spend_score'].apply(pd.to_numeric)
def nn_strategy(state):
    """Return ``utils.run_model`` applied to ``state`` with ``stochastic=False``.

    ``model`` is not a parameter; it is read from the surrounding scope.
    """
    outcome = utils.run_model(model, state, stochastic=False)
    return outcome
# Fragment of a larger if/else whose header lies outside this excerpt, and
# whose last loop continues past the end of it.
#
# First (visible) branch: for each dataset index 0, 1, 2 it reads the 'seq'
# column of Xtr{name}.csv and Xte{name}.csv under args.data_folder, computes
# Mismatch_kernel(X, X) and Mismatch_kernel(X, X_ev) with default_params['k']
# and default_params['m'], and appends {"train": ..., "eval": ...} to K.
# run_model('ksvm', ...) is called with that K, and its first return value is
# written with write_csv to args.savefile.
#
# else branch (the inline comment labels it "If MKL is used"): rebinds
# default_params to lamb=25 / step=.05, sets kernel_params to
# [(7, 3), (8, 3)], pre-fills K with three {"train": [], "eval": []} dicts and
# starts reading the same CSV files again.
#
# NOTE(review): t0 = time.time() is assigned but never read in this excerpt.
# NOTE(review): original line breaks are lost; run_model/write_csv presumably
#   run once after the loop rather than per dataset — confirm against the
#   full file.
K = [] print("Computing kernels...") for name in [0, 1, 2]: X = np.array(pd.read_csv(f'{args.data_folder}/Xtr{name}.csv')['seq']) X_ev = np.array(pd.read_csv(f'{args.data_folder}/Xte{name}.csv')['seq']) t0 = time.time() K_tr = Mismatch_kernel(X, X, k=default_params['k'], m=default_params['m']) K_te = Mismatch_kernel(X, X_ev, k=default_params['k'], m=default_params['m']) print(f"Finished computing mismatch kernel for dataset {name}.") K.append({"train": K_tr, "eval": K_te}) preds, _ = run_model('ksvm', kernel='', K=K, sequence=True, prop_test=0.2, default_params=default_params) write_csv(np.arange(preds.shape[0]), preds, args.savefile) # If MKL is used else: default_params = {"lamb": 25, "step": .05} kernel_params = [(7, 3), (8, 3)] K = [{"train": [], "eval": []} for _ in range(3)] print("Computing kernels...") for name in [0, 1, 2]: X = np.array(pd.read_csv(f'{args.data_folder}/Xtr{name}.csv')['seq']) X_ev = np.array(pd.read_csv(f'{args.data_folder}/Xte{name}.csv')['seq'])
# Default parameters #
weights = args.weights
# Numeric options fall back to a fixed default when the argument is falsy
# (missing, empty string, 0).
hidden_units = int(args.hidden_units) if args.hidden_units else 60
batch_size = int(args.batch_size) if args.batch_size else 32
# NOTE(review): this expression is True for every input (a truthy value
# becomes True, a falsy one takes the default True), so bidirectional cannot
# be switched off from here — confirm whether that is intended.
bidirectional = bool(args.bidirectional) if args.bidirectional else True
# For the flags below, `bool(x) if x else False` is exactly `bool(x)`.
highway = bool(args.highway)
self_attention = bool(args.self_attention)
max_pooling = bool(args.max_pooling)
alignment = bool(args.alignment)
# Fixed: this used to read the not-yet-assigned name `shortcut` instead of
# `args.shortcut`, which failed whenever the option was set.
shortcut = bool(args.shortcut)
######################

# Load character list #
# NOTE: pickle.load can execute arbitrary code; only use a characters.pkl
# that comes from a trusted source.
with open("characters.pkl", 'rb') as char:
    characters = pickle.load(char)
#########################

model = load_model(hidden_units, bidirectional, highway, self_attention,
                   max_pooling, alignment, shortcut)

# When train and test point at the same dataset, the flag is passed to
# load_dataset in a different position for each split, and on to run_model.
two_fold = args.dataset_train == args.dataset_test
print("Loading train dataset...")
dataset_train = load_dataset(args.dataset_train, characters, two_fold, False)
print("Loading test dataset...")
dataset_test = load_dataset(args.dataset_test, characters, False, two_fold)

run_model(model, weights, batch_size, dataset_train, dataset_test, two_fold)