def main():
    """Create a CSV file with the ubigeo code of every centro poblado in Peru.

    Reads the modified ubigeo table, selects the district-level rows, and
    expands each district into one 10-digit code per centro poblado.
    """
    logger = config.config_logger(__name__, 10)
    t0 = time.time()
    logger.info('Beginning execution')  # fixed typo: "execuation"

    raw = pd.read_csv('./data/ubigeo/Ubigeo_mod.csv', header=0)
    # .isna() replaces apply(np.isnan): same result on numeric columns but it
    # does not raise if the column ever holds non-float values.
    # Rows with a missing 'district' value are treated as the district rows;
    # rows with a non-missing 'region' value as the department rows
    # (presumably how Ubigeo_mod.csv encodes its hierarchy — verify there).
    districts = raw.loc[raw['district'].isna()]
    print(districts.shape)
    departments = raw.loc[~raw['region'].isna()]
    n_centro_pob = sum(departments['centro_pob'])
    logger.info('There are {0:,} centros poblados in Peru'.format(n_centro_pob))

    output = []
    for index, row in districts.iterrows():
        temp_centros = row['centro_pob']
        temp_code = row['code']
        # Enumerate centros poblados 1..N within the district and build the
        # full code: district code + zero-padded 4-digit sequence, then the
        # whole string left-padded to 10 characters.
        for centro in range(1, temp_centros + 1):
            centro_code = add_zero_left(centro, 4)
            centro_code = str(temp_code) + centro_code
            centro_code = add_zero_left(centro_code, 10)
            output.append(centro_code)

    print(len(output))  # This should be equal to the number of centros poblados.
    print(raw.head().to_string())
    print(districts.head().to_string())
    print(departments.head().to_string())

    output = pd.DataFrame({'ubigeo': output})
    output.to_csv('./data/ubigeo/ubigeo_final.csv')
    config.time_taken_display(t0)
def main():
    """Query the Sportmonks API and save one CSV of fixture stats per league.

    Lists all available leagues, then (when GET_STATS is enabled) downloads
    seasons/fixtures/stats for each id in league_id_list and writes a CSV
    named '<id><league_name>.csv' under OUTPUT_PATH.
    """
    LOGGER_LEVEL = 20
    cfg_name = './config/_credentials.cfg'
    OUTPUT_PATH = './data/sportmonks/'
    GET_STATS = False  # toggle: actually download per-league fixture stats
    league_id_list = [2, 5, 72, 74, 78, 82, 85, 208, 462, 564, 570, 384, 390,
                      8, 9, 24, 12, 600, 301, 304, 453, 1114, 292, 654, 651,
                      444, 573, 579]
    t0 = time.time()
    pd.set_option('display.float_format', lambda x: '{0:.2f}'.format(x))
    logger = config.config_logger(__name__, level=LOGGER_LEVEL)

    # API key is read from the credentials file, section [Sportmonks].
    cfg_parser = configparser.ConfigParser()
    cfg_parser.read(cfg_name)
    my_key = str(cfg_parser.get('Sportmonks', 'key'))

    logger.info('Beginning execution')
    sportmonks.init(my_key)
    logger.info('Available leagues:')
    leagues_dict = {}
    # Renamed ambiguous loop variable "l" (easily confused with 1/I).
    for league in sportmonks.leagues():
        print(league['id'], league['name'], league['country_id'])
        leagues_dict[league['id']] = league['name']

    for league_id in league_id_list:
        if GET_STATS:
            league_name = leagues_dict[league_id]
            logger.info('Sending query - Stats')
            logger.info('League selected: {0} - {1}'.format(
                league_id, league_name))
            league_json = sportmonks.league(
                league_id,
                include='seasons.fixtures.stats,seasons.fixtures.localTeam,'
                        'seasons.fixtures.visitorTeam')
            logger.info('Processing package')
            league_df = sportmonks.league_into_dataframe(league_json)
            logger.info('Dimensions of the dataframe: {0}'.format(
                league_df.shape))
            save_name = '{0}{1}'.format(league_id,
                                        league_name.replace(' ', '_'))
            logger.info('Saving CSV: {0}'.format(OUTPUT_PATH + save_name))
            league_df.to_csv(OUTPUT_PATH + save_name + '.csv')

    config.time_taken_display(t0)
    print(' ')
def main():
    """Sample random start points inside the Lima polygon and plot them.

    Loads the Lima map (UTM coordinates), renders the base map, draws N
    random start points on it, and converts those points to lat-lon for
    later Google Maps traffic queries.
    """
    t0 = time.time()
    LIMA_MAP = './data/lima_map/limaPolyUTM.geojson'
    cfg_name = './config/_credentials.cfg'
    LOGGER_LEVEL = 10
    N = 2000

    # Read the Google Maps API key from the credentials file.
    cfg_parser = configparser.ConfigParser()
    cfg_parser.read(cfg_name)
    my_key = str(cfg_parser.get('key', 'key'))

    logger = config.config_logger(__name__, LOGGER_LEVEL)
    logger.info('Beginning execution: GOOGLE URBAN')
    logger.info('Logger configured - level {0}'.format(LOGGER_LEVEL))

    logger.info('Logging in Google Maps API')
    gmaps = googlemaps.Client(key=my_key)

    logger.info('Opening Lima map: {0}'.format(LIMA_MAP))
    lima_gpd = maps.load_map(LIMA_MAP)
    logger.info('Getting Lima limits')
    lima_limits = maps.boundaries(lima_gpd)

    logger.info('Plotting Lima map')
    maps.plot_map(lima_gpd, './data/lima_map/lima.png', lima_limits)

    logger.info('Getting {0} random starts'.format(N))
    start_points = maps.get_list_of_random_starts(lima_gpd, lima_limits, n=N)
    logger.info('Plotting points in Lima map')
    maps.plot_map_and_points(lima_gpd,
                             './data/lima_map/lima_start_points.png',
                             start_points)

    logger.info('Converting UTM points into lat-lon points')
    start_points = maps.UTM_to_latlon(start_points)
    # TODO: generate traffic queries for each hour.
    # TODO: automate the code to send queries each hour.
    config.time_taken_display(t0)
def main():
    """Build the Sportmonks match dataset and benchmark several classifiers.

    Phase 1 (create_data): load each league's raw stats CSV, clean it, pool
    all leagues, build rolling-window averages, and save the final dataset.
    Phase 2: reload that dataset and cross-validate the model families
    selected by the train_* flags.
    """
    t0 = time.time()
    logger = config.config_logger(__name__, 10)
    stats_path = './data/sportmonks/with_data/'
    save_path = './data/sportmonks/'
    create_data = False  # toggle: rebuild sportmonks_final.csv from raw files

    logger.info('Create dataset : {0}'.format(create_data))
    if create_data:
        league_names = work_data.get_data_list(stats_path)
        logger.info('Leagues found: {0}'.format(len(league_names)))
        league_frames = []
        for league in league_names:
            logger.info('Opening {0}'.format(league))
            match_data_raw = work_data.load_data(stats_path, selection=league,
                                                 date_filter='2016-07-13')
            logger.info('Dimensions of raw data: {0}'.format(
                match_data_raw.shape))
            match_data_raw = work_data.get_selection(match_data_raw)
            match_data_raw = work_data.fill_selected_vars(match_data_raw)
            match_data_raw = work_data.drop_rows_NA(match_data_raw)
            logger.info('Dimensions after cleaning: {0}'.format(
                match_data_raw.shape))
            league_frames.append(match_data_raw)
        # Single pd.concat replaces the deprecated per-iteration
        # DataFrame.append accumulation (removed in pandas 2.0).
        if league_frames:
            match_data_tot = pd.concat(league_frames)
        else:
            match_data_tot = pd.DataFrame({})
        window = 4
        match_data_tot.reset_index(drop=True, inplace=True)
        logger.info('Dimension of all leagues DB: {0}'.format(
            match_data_tot.shape))
        logger.info('Duplicating vars')
        match_data = work_data.duplicate_stats(match_data_tot)
        logger.info('Finding average with window {0}'.format(window))
        match_data = work_data.get_averages(match_data, window=window)
        logger.info('Dimensions after preprocessing: {0}'.format(
            match_data.shape))
        logger.info('Stat variables: {0}'.format(
            len(work_data.stats_variables())))
        match_data.to_csv(save_path + 'sportmonks_final.csv')
        logger.info('Final DB saved')

    logger.info('Opening sportmonks DB')
    match_data = pd.read_csv(save_path + 'sportmonks_final.csv', index_col=1)
    logger.info('Beginning analysis section')
    #predict.descriptive_stats(match_data)

    keep_draws = True
    logger.info(
        'Generating attributes and standardizing - Keep draws: {0}'.format(
            keep_draws))
    y_data, x_data = predict.preprocess_data(match_data, draws=keep_draws)
    x_data = predict.standardize(x_data)
    logger.info('Number of attributes included: {0}'.format(x_data.shape[1]))
    logger.info('Number of obs: {0}'.format(x_data.shape[0]))
    #print(x_data.describe().transpose().to_string())

    # Model-selection flags: each enables one family of models below.
    train_nn = False
    train_logit = False
    train_logit_lasso = False
    train_svm = False
    train_tree = False
    train_adaBoost = True
    train_gbm = True
    train_treeBoost = True
    train_staged = False

    n_cv = 10
    lam_list = [0.001, 0.01, 0.05, 0.1, 0.2, 1, 10, 100]
    layers = [(10, 5), (20, 10), (30, 10), (50, 10), (100, 10), (100, 20),
              (100, 30), (100, 50, 10)]

    if train_nn:
        logger.info('Training models: Neural Network')
        cv_nn = []
        for layer in layers:
            cv_nn.append(predict.neural_network_cv(x_data, y_data,
                                                   n_cv=n_cv, layers=layer))
            print('.', end='')
        print(' ')
        # zip pairs each result with its configuration (was range(len(...))).
        for layer, result in zip(layers, cv_nn):
            predict.report_model_output(result, 'NN_{0}'.format(layer))

    if train_logit:
        logger.info('Training models: Logit')
        cv_logit = []
        for lam in lam_list:
            cv_logit.append(predict.logistic_cv(x_data, y_data,
                                                n_cv=n_cv, lam=lam))
            print('.', end='')
        print(' ')
        for lam, result in zip(lam_list, cv_logit):
            predict.report_model_output(result, 'Logit_{0}'.format(lam))

    if train_logit_lasso:
        logger.info('Training models: Logit_Lasso')
        cv_logit_lasso = []
        for lam in lam_list:
            cv_logit_lasso.append(predict.logistic_lasso_cv(
                x_data, y_data, n_cv=n_cv, lam=lam))
            print('.', end='')
        print(' ')
        for lam, result in zip(lam_list, cv_logit_lasso):
            predict.report_model_output(result,
                                        'Logit_Lasso_{0}'.format(lam))

    if train_svm:
        logger.info('Training models: SVM')
        cv_svm = []
        for lam in lam_list:
            cv_svm.append(predict.svm_cv(x_data, y_data, n_cv=n_cv, lam=lam))
            print('.', end='')
        print(' ')
        for lam, result in zip(lam_list, cv_svm):
            predict.report_model_output(result, 'SVM_{0}'.format(lam))

    if train_tree:
        logger.info('Training models: Decision Tree')
        cv_tree = [predict.tree_cv(x_data, y_data, n_cv=n_cv)]
        predict.report_model_output(cv_tree, 'Tree')
        print(' ')

    if train_adaBoost:
        logger.info('Training models: AdaBoost')
        cv_adaBoost = [predict.adaBoost_cv(x_data, y_data, n_cv=n_cv)]
        predict.report_model_output(cv_adaBoost, 'AdaBoost')
        print(' ')

    if train_gbm:
        logger.info('Training models: gbm')
        cv_gbm = [predict.gbm_cv(x_data, y_data, n_cv=n_cv)]
        predict.report_model_output(cv_gbm, 'GBM')
        print(' ')
        logger.info('Grid: gbm')
        grid_gbm = predict.gbm_grid(x_data, y_data)
        print(grid_gbm.best_params_)
        print(grid_gbm.best_score_)

    if train_treeBoost:
        logger.info('Training models: TreeBoost')
        cv_treeBoost = [predict.treeBoost_cv(x_data, y_data, n_cv=n_cv)]
        predict.report_model_output(cv_treeBoost, 'TreeBoost')
        print(' ')

    if train_staged:
        x_train, x_test, y_train, y_test = predict.split(x_data, y_data,
                                                         size=0.25)
        adaBoost_model = predict.adaBoost(x_train, y_train, n_iter=300)
        adaBoost_model.fit(x_train, y_train)
        xgBoost_model = predict.xgBoost(x_train, y_train, n_iter=300)
        xgBoost_model.fit(x_train, y_train)
        adaBoost_test_errors = []
        xgBoost_test_errors = []
        # Track test error as trees are added to each boosted ensemble.
        for adaBoost_test_predict, xgBoost_test_predict in zip(
                adaBoost_model.staged_predict(x_test),
                xgBoost_model.staged_predict(x_test)):
            adaBoost_test_errors.append(
                1. - accuracy_score(adaBoost_test_predict, y_test))
            xgBoost_test_errors.append(
                1. - accuracy_score(xgBoost_test_predict, y_test))
        n_trees_ada = len(adaBoost_test_errors)
        n_trees_xg = len(xgBoost_test_errors)
        print(n_trees_ada, n_trees_xg)
        logger.info('Best accuracy - AdaBoost: {0:.3f}'.format(
            1 - min(adaBoost_test_errors)))
        logger.info('Best accuracy - xgBoost: {0:.3f}'.format(
            1 - min(xgBoost_test_errors)))
        #plt.figure(figsize=(15, 5))
        #plt.subplot(131)
        plt.figure()
        plt.plot(range(1, n_trees_ada + 1), adaBoost_test_errors,
                 c='black', label='adaBoost')
        plt.plot(range(1, n_trees_xg + 1), xgBoost_test_errors,
                 c='red', label='xgBoost')
        plt.legend()
        plt.ylim(0.2, 0.8)
        plt.ylabel('Test Error')
        plt.xlabel('Number of Trees')
        plt.show()

    # TODO: exclude first and last 4 matches
    # TODO: redo CV using GridSearchCV
    # TODO: use PCA to preprocess the data
    # TODO: implement SoftVoting
    config.time_taken_display(t0)
def main(): logger = config.config_logger(__name__, 10) t0 = time.time() pdf_path = './data/pdf/' txt_path = './data/txt/' dict_path = './data/dict/' output_path = './output/' convert_files = False logger.info('Begin execution') if convert_files: logger.info('Coonvert files: {0}'.format(convert_files)) logging.getLogger().setLevel(30) all_docs = work_data.convert_pdf_to_txt(pdf_path, txt_path) logging.getLogger().setLevel(10) else: logger.info('Import testimonials') all_docs = work_data.open_testimonies(txt_path) logger.info('Create wordcloud') wordcloud_words = work_data.generate_wordcloud(all_docs, output_path) wordcloud_words.to_csv('./output/word_count.csv') logger.info('Remove protocol paragraphs') filter_docs = [doc.filter_protocol() for doc in all_docs] filter_docs = [doc for doc in filter_docs if len(doc) > 5] logger.info('Load dictionaries') dict1 = pd.read_csv(dict_path + 'dict_ale.csv', index_col=0, header=0) dict2 = pd.read_csv(dict_path + 'dict_erika.csv', index_col=0, header=0) dict3 = pd.read_csv(dict_path + 'dict_macla.csv', index_col=0, header=0) dict_agents = pd.read_csv(dict_path + 'dict_agentes.csv', index_col=0, header=0) filter_docs = [work_data.input_sentiment(doc, dict1) for doc in filter_docs] filter_docs = [work_data.input_sentiment(doc, dict2) for doc in filter_docs] filter_docs = [work_data.input_sentiment(doc, dict3) for doc in filter_docs] #clean_docs = [work_data.input_agent(doc, dict_agents) for doc in clean_docs] tagged = not_tagged = 0 for i in filter_docs: for par in i: if par.sentiment != 'none': tagged += 1 else: not_tagged += 1 logger.info('Paragraphs tagged {0} - not tagged {1}'.format(tagged, not_tagged)) logger.info('Number of testimonials: {0}'.format(len(all_docs))) logger.info('Clean testimonials') clean_docs = [[parag.clean_data() for parag in doc] for doc in filter_docs] clean_docs, quechua = work_data.extract_quechua(clean_docs) logger.info('Testimonials in spanish {0} - quechua {1}'.format(len(clean_docs), 
len(quechua))) print([doc[0].name for doc in quechua]) tagged = not_tagged = 0 for i in clean_docs: for par in i: if par.sentiment != 'none': tagged += 1 else: not_tagged += 1 logger.info('Paragraphs tagged {0} - not tagged {1}'.format(tagged, not_tagged)) logger.info('Train model') parag_trained1 = work_data.input_sentiment_posneg(filter_docs) parag_trained2 = work_data.train_sentiment(clean_docs) logger.info('Save preditions') parag_trained1.to_csv(output_path + 'reg_database1.csv') parag_trained2.to_csv(output_path + 'reg_database2.csv') # LDA implementation mat_docs, dictionary = work_data.list_to_matrix(clean_docs) print(mat_docs[0]) pprint.pprint(dictionary.dfs) lda_model = work_data.lda_model(dictionary, mat_docs, 10) print(lda_model) pprint.pprint(lda_model.print_topics(num_topics=10, num_words=10)) config.time_taken_display(t0)
def main():
    """BBVA attrition pipeline: build features, train level-1 meta-models,
    then stack them with a level-2 XgBoost model and write predictions.

    The do_merge / write_impute_test / add_variables flags control whether
    each intermediate dataset is rebuilt or loaded from ./data/mod/.
    """
    np.random.seed(42)
    logger = config.config_logger(__name__, 10)
    t0 = time.time()
    train_client_path = './data/raw/csv/train_clientes.csv'
    train_reque_path = './data/raw/csv/train_requerimientos.csv'
    test_client_path = './data/raw/csv/test_clientes.csv'
    test_reque_path = './data/raw/csv/test_requerimientos.csv'
    output_path = './output/'
    do_merge = False
    write_impute_test = False
    write_output = False
    add_variables = False
    version = 6

    logger.info('Beginning execution')
    logger.info('Load dataframes')
    test_client = pd.read_csv(test_client_path, header=0)
    test_reque = pd.read_csv(test_reque_path, header=0)
    main_client = pd.read_csv(train_client_path, header=0)
    main_reque = pd.read_csv(train_reque_path, header=0)
    work_data.basic_descriptive(main_client)
    work_data.basic_descriptive(main_reque)
    id_variables = work_data.id_variables()
    index_client = test_client['ID_CORRELATIVO']

    if write_impute_test:
        logger.info('Creating new test database')
        logger.info('Cleaning test reque database')
        test_reque = work_data.preprocess_reque(test_reque)
        print(test_reque.head().to_string())
        logger.info('Cleaning test client database - Imputing missing values')
        test_client = work_data.count_missings_column(test_client)
        test_client = work_data.preprocess_client(test_client)
        print(test_client.head().to_string())
        logger.info('Merging test databases')
        # NOTE(review): join_axes was removed in pandas 1.0 — confirm the
        # pinned pandas version before re-running this branch.
        temp = pd.concat([test_client, test_reque], axis=1,
                         join_axes=[test_client.index])
        temp.fillna(0, inplace=True)
        test_df = temp
        print(test_df.head().to_string())
        print(test_df.describe().transpose().to_string())
        logger.info('Saving test database')
        test_df.to_csv('./data/mod/test_imputed.csv', index=False)
    else:
        logger.info('Opening test database')
        test_df = pd.read_csv('./data/mod/test_imputed.csv', header=0)
        print(test_df.head().to_string())

    if do_merge:
        logger.info('Creating new merge')
        logger.info('Cleaning reque database')
        main_reque = work_data.preprocess_reque(main_reque)
        print(main_reque.head().to_string())
        #main_reque = pd.pivot_table(main_reque, index=['ID_CORRELATIVO'],
        #                            columns=['CODMES'], aggfunc=np.sum)
        #main_reque.columns = main_reque.columns.map('{0[0]}|{0[1]}'.format)
        #main_reque.fillna(0, inplace=True)
        logger.info('Cleaning client database - Imputing missing values')
        main_client = work_data.count_missings_column(main_client)
        # Keep the target aside (indexed by client id) while the features
        # are preprocessed, then reattach it.
        target = main_client.pop('ATTRITION')
        target.index = main_client['ID_CORRELATIVO']
        main_client = work_data.preprocess_client(main_client)
        main_client['ATTRITION'] = target
        print(main_client.head().to_string())
        logger.info('Merging databases')
        # NOTE(review): join_axes was removed in pandas 1.0 (see above).
        temp = pd.concat([main_client, main_reque], axis=1,
                         join_axes=[main_client.index])
        temp.fillna(0, inplace=True)
        main_df = temp
        print(main_df.shape)
        print(main_df.head().to_string())
        print(main_df.describe().transpose().to_string())
        work_data.basic_descriptive(main_df)
        logger.info('Saving merged database')  # fixed typo "marges"
        main_df.to_csv('./data/mod/merge1.csv', index=False)
    else:
        logger.info('Opening merged database')
        main_df = pd.read_csv('./data/mod/merge1.csv', header=0)
        print(main_df.head().to_string())
        print(main_df.shape)

    y = main_df.pop('ATTRITION')
    # Stack train rows on top of test rows so features are engineered
    # consistently; pd.concat replaces the deprecated DataFrame.append.
    main_df = pd.concat([main_df, test_df]).reset_index(drop=True)

    if False:  # disabled: T-SNE embedding already saved to disk
        logger.info('Creating T-SNE database')
        temp_tsne = pd.DataFrame(models.tnse(main_df))
        temp_tsne.to_csv('./data/mod/merge1_tsne.csv', index=False)
    else:
        logger.info('Loading T-SNE database')
        temp_tsne = pd.read_csv('./data/mod/merge1_tsne.csv')

    if add_variables:
        logger.info('Beginning feature engineering')
        logger.info('Interactions')
        main_df_feat = models.create_interactions(main_df,
                                                  models.inter_vars())
        logger.info('Row sums 1-3')
        main_df_feat['ext1'] = main_df.apply(lambda row: (row == 0).sum(),
                                             axis=1)
        temp = models.standard_scale_df(main_df)
        main_df_feat['ext2'] = temp.apply(lambda row: (row > 0.5).sum(),
                                          axis=1)
        main_df_feat['ext3'] = temp.apply(lambda row: (row < -0.5).sum(),
                                          axis=1)
        logger.info('K-means 4-7')
        main_df_feat['ext4'] = pd.Series(models.kmeans(main_df, 5)).apply(str)
        main_df_feat['ext5'] = pd.Series(models.kmeans(main_df, 10)).apply(str)
        main_df_feat['ext6'] = pd.Series(models.kmeans(main_df, 15)).apply(str)
        main_df_feat['ext7'] = pd.Series(models.kmeans(main_df, 20)).apply(str)
        logger.info('KNN 8-11')
        main_df_feat['ext8'] = models.knn_distance(main_df, 2)
        main_df_feat['ext9'] = models.knn_distance(main_df, 3)
        main_df_feat['ext10'] = models.knn_distance(main_df, 5)
        main_df_feat['ext11'] = models.knn_distance(temp_tsne, 2)
        main_df_feat = pd.get_dummies(main_df_feat, drop_first=True)
        print(main_df_feat.head().to_string())
        print(main_df_feat.shape)
        config.time_taken_display(t0)
        logger.info('Saving features database')
        main_df_feat.to_csv('./data/mod/merge1_features.csv', index=False)
    else:
        logger.info('Opening feature engineered database')
        main_df_feat = pd.read_csv('./data/mod/merge1_features.csv',
                                   header=0)
        print(main_df_feat.head().to_string())
        print(main_df_feat.shape)

    logger.info('Split data into train and test')
    # First 70,000 rows are the train clients, the remainder the test set.
    x, test_df = main_df_feat.iloc[:70000, :], main_df_feat.iloc[70000:, :]
    print(main_df_feat.shape)
    print(x.shape)
    print(test_df.shape)
    x_train, x_test, y_train, y_test = models.split_data(x, y)
    work_data.basic_descriptive(x_train)

    logger.info('Level 1 - Create metafeatures')
    if False:  # disabled: level-1 meta-features already written to disk
        logger.info('1. Ridge logit')
        ridge_model = models.logit_grid(x, y, 'l2', StandardScaler())
        models.write_prediction(ridge_model, main_df_feat, index_client,
                                'ridge_standard')
        print(ridge_model.score(x_test, y_test))
        logger.info('2. Lasso logit')
        lasso_model = models.logit_grid(x, y, 'l1', StandardScaler())
        models.write_prediction(lasso_model, main_df_feat, index_client,
                                'lasso_standard')
        print(lasso_model.score(x_test, y_test))
        logger.info('3. Random Forrest')
        RF_model = models.random_forrest_grid(x, y, StandardScaler())
        models.write_prediction(RF_model, main_df_feat, index_client,
                                'RF_standard')
        print(RF_model.score(x_test, y_test))
        logger.info('4. Extra Trees')
        ET_model = models.extra_trees_grid(x, y, StandardScaler())
        models.write_prediction(ET_model, main_df_feat, index_client,
                                'ET_standard')
        print(ET_model.score(x_test, y_test))
        # Steps 5-14: one KNN meta-feature per neighbour count; the loop
        # reproduces the original per-step log labels exactly.
        for step, k in enumerate((2, 4, 8, 16, 32, 64, 128, 256, 512, 1024),
                                 start=5):
            logger.info('{0}. {1}-KNN'.format(step, k))
            KNN_model = models.knn_grid(x, y, StandardScaler(), k)
            models.write_prediction(KNN_model, main_df_feat, index_client,
                                    'KNN{0}_standard'.format(k))
            print(KNN_model.score(x_test, y_test))
        logger.info('15. Naive Bayes')
        NB_model = models.naive_bayes_grid(x, y, StandardScaler())
        models.write_prediction(NB_model, main_df_feat, index_client,
                                'NB_standard')
        print(NB_model.score(x_test, y_test))
        logger.info('16. MPL')
        MLP_model = models.MLP_grid(x, y, StandardScaler())
        models.write_prediction(MLP_model, main_df_feat, index_client,
                                'MLP_standard')
        print(MLP_model.score(x_test, y_test))
        logger.info('17. AdaBoost')
        adaboost_model = models.adaboost_grid(x, y, StandardScaler())
        models.write_prediction(adaboost_model, main_df_feat, index_client,
                                'adaboost_standard')
        print(adaboost_model.score(x_test, y_test))
        logger.info('18. GBM')
        gbm_model = models.gbm_grid(x, y, StandardScaler())
        models.write_prediction(gbm_model, main_df_feat, index_client,
                                'gbm_standard')
        print(gbm_model.score(x_test, y_test))
        logger.info('18. LightGBM')
        lgbm_model = models.lgbm_grid(x, y, None)
        models.write_prediction(lgbm_model, main_df_feat, index_client,
                                'lgbm_none')
        print(lgbm_model.score(x_test, y_test))
        logger.info('19. XgBoost')
        test_final = main_df_feat.iloc[70000:, :]
        id_test = test_client['ID_CORRELATIVO']
        xgboost_model = models.xgboost_grid(x, y, StandardScaler())
        models.write_prediction(xgboost_model, main_df_feat, index_client,
                                'xgboost_standard')
        print(xgboost_model.score(x_test, y_test))
        models.write_prediction(xgboost_model, test_final, id_test,
                                'ATTRITION')
        # Removed a stray 'hi' statement (NameError crash marker) that
        # originally ended this block.

    # Stage 2:
    logger.info('Level 2')
    logger.info('Creating meta-features database')
    meta_features_list = os.listdir('./data/mod/meta_features')
    temp = {}
    for feature in meta_features_list:
        temp_df = pd.read_csv(
            './data/mod/meta_features/{0}'.format(feature), header=0)
        temp[feature] = temp_df.iloc[:, 1]
    meta_features = pd.DataFrame(temp)
    meta_features = pd.concat([meta_features, main_df_feat], axis=1,
                              ignore_index=True)
    x = meta_features.iloc[:70000, :]
    test_final = meta_features.iloc[70000:, :]
    x_train, x_test, y_train, y_test = models.split_data(x, y)
    print(x_train.shape)
    print(test_final.shape)
    print(x.shape)

    logger.info('Estimating second level model with XgBoost')
    xgboost_final = models.xgboost_full_mod(x_train, y_train)
    print(xgboost_final.score(x_test, y_test))
    print(models.get_logloss(y_test,
                             xgboost_final.predict_proba(x_test)[:, 1]))
    models.write_final_prediction(xgboost_final, test_final,
                                  test_client['ID_CORRELATIVO'], 'results8')
    models.write_final_prediction(xgboost_final, x, main_client['ATTRITION'],
                                  'train')
    config.time_taken_display(t0)

    # Replaced a bare 'hi' (NameError used as a crash marker) with an
    # explicit return; the legacy grid-search experiment below is kept for
    # reference but intentionally never executed.
    return

    logger.info('XgBoost')
    xgboost_result = models.xgboost_grid(x_train, y_train, x_test, y_test)
    print('Test grid: {0}'.format(xgboost_result))
    # Test: -0.322
    xgboost_full = models.xgboost_full_mod(x_train, y_train, x_test, y_test)
    print(xgboost_full)
    xgbfir.saveXgbFI(xgboost_full, feature_names=main_df.columns,
                     OutputXlsxFile='./data/mod/bbva.xlsx')
def main():
    """Fit and compare classifiers on the zika dataset, then soft-vote them.

    Loads the raw CSV, builds the dengue target, grid-searches GBM, Logit
    and AdaBoost, and combines the three fitted grids in a soft
    VotingClassifier.
    """
    LOGGER_LEVEL = 10
    RAW_DATA_PATH = './data/raw/'
    RAW_CSV_NAME = 'raw_data.csv'
    t0 = time.time()
    logger = config.config_logger(__name__, LOGGER_LEVEL)
    pd.set_option('display.float_format', lambda x: '{0:.2f}'.format(x))
    logger.info('Beginning execution: zika dataset')
    logger.info('Logger configured - level {0}'.format(LOGGER_LEVEL))

    logger.info('Opening CSV: {0}{1}'.format(RAW_DATA_PATH, RAW_CSV_NAME))
    raw_data = pd.read_csv(RAW_DATA_PATH + RAW_CSV_NAME)
    logger.info('Raw dataset description:')
    process.basic_descriptives(raw_data)
    raw_data = process.preprocess(raw_data)

    diseases = [raw_data['dengue_pcr'], raw_data['zika_pcr'],
                raw_data['chik_pcr']]
    # Check process code for further explanation of select_disease.
    # code: 1. Dengue, 2. Zika, 3. Chik, 4. Any
    # only_one: if True, input np.nan to patients with another disease.
    y = process.select_disease(diseases, code=1, only_one=False)
    logger.info('Target var frequency: \n{0}'.format(y.value_counts()))
    logger.info('Total obs: {0}'.format(y.value_counts().sum()))

    remove_list = ['id', 'centro_pob', 'name', 'dep', 'prov', 'dist',
                   'serotipo1', 'serotipo2', 'serotipo3', 'serotipo4',
                   'dengue_pcr', 'zika_pcr', 'chik_pcr']
    X = process.remove_vars(raw_data, remove_list)
    X = process.keep_non_nan(X, y)
    y = y.dropna()
    logger.info('Features dataset')
    process.basic_descriptives(X)

    logger.info('Split train test')
    X_train, X_test, y_train, y_test = models.split_data(X, y,
                                                         proportion=0.4)

    logger.info('Estimating models')
    # Grid-search each model family in turn; log labels and ordering match
    # the original per-model sections.
    fitted = {}
    for label, grid_fn in (('GBM', models.gbm_grid),
                           ('Logit', models.logit_grid),
                           ('AdaBoost', models.adaboost_grid)):
        logger.info(label)
        grid = grid_fn(X_train, y_train, n_cv=5)
        logger.info(grid.best_params_)
        logger.info('Train score: {0}'.format(grid.best_score_))
        logger.info('Test score: {0}'.format(grid.score(X_test, y_test)))
        fitted[label] = grid

    logger.info('Soft Voting')
    eclf = VotingClassifier(estimators=[('gbm', fitted['GBM']),
                                        ('logit', fitted['Logit']),
                                        ('ada', fitted['AdaBoost'])],
                            voting='soft')
    eclf.fit(X_train, y_train)
    y_pred = eclf.predict_proba(X_test)
    print(y_pred[:5, :])
    logger.info('Train score: {0}'.format(eclf.score(X_train, y_train)))
    logger.info('Test score: {0}'.format(eclf.score(X_test, y_test)))
    config.time_taken_display(t0)