# Builds the module-level registries (projects, nodes, groups) at import time
# and exposes small lookup helpers that return None for unknown names.
import json
import os
import pprint
import sys

# The ccetc_py package is located through $CCETC_ROOT, so the search path has
# to be extended before it is imported. Raises KeyError if CCETC_ROOT is unset.
sys.path.append(os.path.join(os.environ['CCETC_ROOT']))

import ccetc_py.info
from lib.project import project

#Init projects
# Maps each entry's "name" field to the object built from that JSON entry.
# The path is relative, so it is resolved against the current working directory.
projects = {}
with open("res/project_list.json") as f:
    projects_json = json.load(f)
for project_json in projects_json:
    p = project(project_json)
    projects[project_json["name"]] = p

#Init nodes and groups
nodes = ccetc_py.info.nodes()
groups = ccetc_py.info.groups()


def getProject(project):
    '''Look up a project by name.

    project -- key to look up in the module-level ``projects`` dict (the
               parameter shadows the imported ``project`` class inside
               this function only).

    Returns the registered project object, or None if the name is unknown.
    '''
    return projects.get(project)


def getNode(node):
    '''Look up a node by name.

    node -- key to look up in the module-level ``nodes`` container.

    Returns the node object, or None if the name is unknown.
    '''
    # ``nodes`` comes from ccetc_py.info.nodes(); only ``in`` and indexing are
    # known to be supported, so no dict-specific method is used here.
    if node in nodes:
        return nodes[node]
    return None
# NOTE(review): the line below is a whitespace-collapsed fragment: the tail of
# a function whose `def` line is not visible here, followed by the script
# entry point. Line breaks and indentation must be restored before it can run.
# `conf`, `vectorizer`, `VT`, `Sinv`, `quality` and `dump_dir` are defined
# outside the visible part.
#
# Visible steps, in order:
#   1. Load the test frame with load_test_df(conf['svdff.dataset']).
#   2. Build X with compute_feature_matrix(test_df, vectorizer, combine='stack').
#   3. Project it: U = X.dot(VT.transpose().dot(Sinv)).
#   4. Split U into two vertical halves (np.vsplit(U, 2)) and replace U with
#      the hstack of their mean and their half-difference.
#      (presumably the two halves are the two stacked inputs of each row;
#      TODO confirm against compute_feature_matrix)
#   5. Zero-initialise test_df['svdff'], then for each entry of
#      quality['folds'] load the model at q['dump'] and add
#      logit(predict_proba(U).flatten()) to that column; finally divide by
#      len(quality['folds']).
#      (presumably the division sits after the loop, giving a per-fold
#      average; indentation is lost, TODO confirm)
#   6. Write the FieldsTest.test_id and 'svdff' columns to 'test.csv' under
#      dump_dir, without the index.
# The __main__ guard calls main(project().conf).
logging.info('Loading test dataset') test_df = load_test_df(conf['svdff.dataset']) logging.info('Computing test features') X = compute_feature_matrix(test_df, vectorizer, combine='stack') logging.info('Computing test SVD features') U = X.dot(VT.transpose().dot(Sinv)) logging.info('Symmetrizing input features') Uq1, Uq2 = np.vsplit(U, 2) U = np.hstack([(Uq1 + Uq2) / 2.0, (Uq1 - Uq2) / 2.0]) logging.info('Applying models to test dataset') test_df['svdff'] = np.zeros(U.shape[0]) for q in quality['folds']: f = load_model(q['dump']) p = f.predict_proba(U).flatten() test_df['svdff'] = test_df['svdff'] + logit(p) test_df['svdff'] = test_df['svdff'] / len(quality['folds']) logging.info('Writing test dataset') test_df[[ FieldsTest.test_id, 'svdff', ]].to_csv(join_path(dump_dir, 'test.csv'), index=False) if __name__ == '__main__': main(project().conf)
# NOTE(review): the line below is a whitespace-collapsed fragment: the tail of
# a function whose `def` line is not visible here, followed by the start of
# the __main__ block, which may continue past the end of this view. Line
# breaks and indentation must be restored before it can run. `trials` and
# `opt` are defined outside the visible part.
#
# Function tail, in order:
#   1. Dump trials.results to 'hyperopt_trials.json' (indent=4).
#   2. Log `opt` as the best parameters.
#   3. Pick the (index, result) pair of trials.results with the smallest
#      result['loss'].
#   4. Log that trial's index, result['quality']['valid']['auc'] and
#      result['model']['file'].
#   5. Load result['model']['file'] into a fresh CatBoostClassifier.
#   6. Return (result['quality']['train'], result['quality']['valid'], model).
#
# Visible part of the __main__ block:
#   - Read project().conf and turn conf['catboost']['dump']['dir'] into an
#     absolute path via abspath(); call makedirs() on it.
#   - Write the config to that directory twice: 'application.conf' (format
#     'hocon') and 'application.json' (format 'json').
#   - Attach a logging.FileHandler for 'application.log' in that directory to
#     the root logger, then log three banner lines.
#   - Read 'target', 'features' and 'categorical_features' from
#     conf['catboost'].
with open('hyperopt_trials.json', 'w') as f: json.dump(trials.results, f, indent=4) logging.info('Best parameters: %s', opt) best_trial, best_trial_result = min(enumerate(trials.results), key=lambda r: r[1]['loss']) logging.info('Best model %d: AUC=%s, model=%s' % ( best_trial, best_trial_result['quality']['valid']['auc'], best_trial_result['model']['file'])) best_model = CatBoostClassifier() best_model.load_model(best_trial_result['model']['file']) return best_trial_result['quality']['train'], best_trial_result['quality']['valid'], best_model if __name__ == '__main__': conf = project().conf dump_dir = abspath(conf['catboost']['dump']['dir']) makedirs(dump_dir) write_config(conf, join_path(dump_dir, 'application.conf'), 'hocon') write_config(conf, join_path(dump_dir, 'application.json'), 'json') logging.getLogger().addHandler(logging.FileHandler(join_path(dump_dir, 'application.log'))) logging.info('Kaggle Talking Data') logging.info('Train Catboost') logging.info('Dump: %s', dump_dir) target = conf['catboost']['target'] features = conf['catboost']['features'] categorical_features = conf['catboost']['categorical_features']