def __init__(self, experiment_name):
    """Set up the experiment: search grid, feature-flag map, logging, resume state."""
    self.experiment_name = experiment_name
    # Hyper-parameter grid explored by this experiment.
    self.params_search = {
        "dv": [2],
        "dd": [20],
        "include_edges": [False, True],
        "include_weighted_mean": [False, True],
        "alpha": [2],
    }
    # Each key is a feature flag; the listed parameter names are only
    # meaningful when that flag is turned ON.
    self.features_params = {
        "include_edges": ["edges_max_pixels"],
        "include_weighted_mean": ["center_weight", "corners_weight"],
    }
    self._build_checkpoints_dir()
    self.logger = config_logger(self.experiment_name)
    saved_state = self.load_experiment()
    if not saved_state:
        # Fresh run: build the execution plan from scratch.
        self.execution_order = self._build_execution_order()
        self.execution_counter = 0
        self.best_experiment_index = None
        self.min_loss = None
        self.logger.info("Starting experiment %s from scratch", self.experiment_name)
        return
    # Resume from previously persisted metadata.
    self.execution_order = saved_state[EXECUTION_ORDER]
    self.execution_counter = saved_state[EXECUTION_COUNTER]
    self.best_experiment_index = saved_state[BEST_EXPERIMENT_INDEX]
    self.min_loss = saved_state[MIN_LOSS]
    self.logger.info("Starting experiment %s from %s out of %s",
                     self.experiment_name, self.execution_counter,
                     len(self.execution_order))
def main():
    """Create a CSV with the ubigeo code of every centro poblado in Peru."""
    logger = config.config_logger(__name__, 10)
    t0 = time.time()
    logger.info('Beginning execution')  # fixed typo ('execuation')

    raw = pd.read_csv('./data/ubigeo/Ubigeo_mod.csv', header=0)
    # NOTE(review): 'districts' keeps rows whose 'district' field is NaN and
    # 'departments' keeps rows whose 'region' is present — confirm this matches
    # the layout of Ubigeo_mod.csv.
    districts = raw.loc[raw['district'].apply(np.isnan)]
    print(districts.shape)
    departments = raw.loc[~raw['region'].apply(np.isnan)]
    n_centro_pob = sum(departments['centro_pob'])
    logger.info('There are {0:,} centros poblados in Peru'.format(n_centro_pob))

    output = []
    for _, row in districts.iterrows():
        # Columns containing NaNs are float64 in pandas, so cast before range():
        # range() rejects float arguments.
        temp_centros = int(row['centro_pob'])
        temp_code = row['code']
        for centro in range(1, temp_centros + 1):
            centro_code = add_zero_left(centro, 4)
            centro_code = str(temp_code) + centro_code
            centro_code = add_zero_left(centro_code, 10)
            output.append(centro_code)
    print(len(output))  # This should be equal to the number of centros poblados.
    print(raw.head().to_string())
    print(districts.head().to_string())
    print(departments.head().to_string())

    output = pd.DataFrame({'ubigeo': output})
    output.to_csv('./data/ubigeo/ubigeo_final.csv')
    config.time_taken_display(t0)
def run(current_experiment, currentEpoch, data_path, labels_path, ids_path):
    """Run the trained feature extractor over the dataset and visualize each sample."""
    ds = CostumeDataset(ids_path, data_path, labels_path, img_h=224, img_w=224)
    loader = DataLoader(ds)
    # Set up an experiment logger and restore the feature-extraction model.
    exp_logger = config_logger(current_experiment)
    extractor = getFeatureExtractionModel(
        current_experiment, exp_logger, currentEpoch=currentEpoch)[0]
    extractor.eval()
    for idx, sample in enumerate(loader):
        net_input = Variable(sample['image'].type(float_type))
        gt = sample['label'].cpu().numpy().squeeze()
        embedding = extractor(net_input, None, None)[0]
        img = net_input.cpu().numpy().squeeze()
        emb = embedding.cpu().numpy().squeeze()
        visualize(img, gt, emb, current_experiment, idx)
    return
def main():
    """Download per-league fixture statistics from Sportmonks and dump them to CSV."""
    LOGGER_LEVEL = 20
    cfg_name = './config/_credentials.cfg'
    OUTPUT_PATH = './data/sportmonks/'
    GET_STATS = False
    league_id_list = [
        2, 5, 72, 74, 78, 82, 85, 208, 462, 564, 570, 384, 390, 8, 9, 24, 12,
        600, 301, 304, 453, 1114, 292, 654, 651, 444, 573, 579
    ]
    t0 = time.time()
    pd.set_option('display.float_format', lambda x: '{0:.2f}'.format(x))
    logger = config.config_logger(__name__, level=LOGGER_LEVEL)

    # Read the API key from the credentials file.
    parser = configparser.ConfigParser()
    parser.read(cfg_name)
    api_key = str(parser.get('Sportmonks', 'key'))
    logger.info('Beginning execution')
    sportmonks.init(api_key)

    logger.info('Available leagues:')
    id_to_name = {}
    for league in sportmonks.leagues():
        print(league['id'], league['name'], league['country_id'])
        id_to_name[league['id']] = league['name']

    for league_id in league_id_list:
        if not GET_STATS:
            # Stats download disabled; listing above still ran.
            continue
        league_name = id_to_name[league_id]
        logger.info('Sending query - Stats')
        logger.info('League selected: {0} - {1}'.format(league_id, league_name))
        league_json = sportmonks.league(
            league_id,
            include='seasons.fixtures.stats,seasons.fixtures.localTeam,seasons.fixtures.visitorTeam'
        )
        logger.info('Processing package')
        league_df = sportmonks.league_into_dataframe(league_json)
        #print(league_df.head().to_string())
        logger.info('Dimensions of the dataframe: {0}'.format(league_df.shape))
        save_name = '{0}{1}'.format(league_id, league_name.replace(' ', '_'))
        logger.info('Saving CSV: {0}'.format(OUTPUT_PATH + save_name))
        league_df.to_csv(OUTPUT_PATH + save_name + '.csv')

    config.time_taken_display(t0)
    print(' ')
def run(current_experiment, currentEpoch, data_path, labels_path, ids_path):
    """Project per-pixel embeddings to 2-D with t-SNE and save one scatter plot per image.

    A PNG per batch is written to cluster_visualizations/<experiment>/<i>.png.
    Batches that fail are skipped with a logged warning instead of being
    silently dropped (the original used a bare ``except: continue``).
    """
    # exist_ok replaces the old try/except-pass around makedirs.
    os.makedirs(os.path.join('cluster_visualizations', current_experiment), exist_ok=True)
    dataset = CostumeDataset(ids_path, data_path, labels_path, img_h=224, img_w=224)
    dataloader = DataLoader(dataset)
    # Set up an experiment
    exp_logger = config_logger(current_experiment)
    fe = getFeatureExtractionModel(current_experiment, exp_logger,
                                   currentEpoch=currentEpoch)[0]
    fe.eval()
    for i, batch in enumerate(dataloader):
        try:
            inputs = Variable(batch['image'].type(float_type))
            features, _ = fe(inputs, None, None)
            features = features.cpu().numpy().squeeze()
            features = np.transpose(features, [1, 2, 0])  # transpose to (h,w,c)
            labels = batch['label'].cpu().numpy()
            labels = labels.squeeze()
            flat_labels = labels.flatten()
            h = features.shape[0]
            w = features.shape[1]
            c = features.shape[2]
            flat_features = np.reshape(features, [h * w, c])
            # find tsne coords for 2 dimensions
            tsne = TSNE(n_components=2, random_state=0)
            np.set_printoptions(suppress=True)
            features_2d = tsne.fit_transform(flat_features)
            instances = np.unique(flat_labels)
            plt.figure(figsize=(6, 5))
            colors = 'r', 'g', 'b', 'c', 'm', 'y', 'k', 'w', 'orange', 'purple'
            for idx, instance in enumerate(instances):
                # Cycle colors so more than len(colors) instances no longer
                # raises IndexError (previously swallowed by the bare except).
                plt.scatter(features_2d[flat_labels == instance, 0],
                            features_2d[flat_labels == instance, 1],
                            c=colors[idx % len(colors)], label=instance)
            plt.legend()
            plt.savefig(os.path.join('cluster_visualizations', current_experiment,
                                     "%s.png" % i))
            plt.close()
            print("Done %s" % i)
        except Exception:
            # Narrowed from a bare except: keep the best-effort behaviour but
            # surface the failure instead of hiding it completely.
            exp_logger.exception("Cluster visualization failed for batch %s", i)
            continue
    return
def main():
    """Build random start points over the Lima map for Google-Maps traffic queries."""
    t0 = time.time()
    LIMA_MAP = './data/lima_map/limaPolyUTM.geojson'
    cfg_name = './config/_credentials.cfg'
    LOGGER_LEVEL = 10
    N = 2000

    # Pull the Google Maps API key from the credentials file.
    credentials = configparser.ConfigParser()
    credentials.read(cfg_name)
    api_key = str(credentials.get('key', 'key'))

    logger = config.config_logger(__name__, LOGGER_LEVEL)
    logger.info('Beginning execution: GOOGLE URBAN')
    logger.info('Logger configured - level {0}'.format(LOGGER_LEVEL))
    logger.info('Logging in Google Maps API')
    gmaps = googlemaps.Client(key=api_key)

    logger.info('Opening Lima map: {0}'.format(LIMA_MAP))
    lima_gdf = maps.load_map(LIMA_MAP)
    logger.info('Getting Lima limits')
    lima_limits = maps.boundaries(lima_gdf)

    logger.info('Plotting Lima map')
    save_path_lima_map = './data/lima_map/lima.png'
    maps.plot_map(lima_gdf, save_path_lima_map, lima_limits)

    logger.info('Getting {0} random starts'.format(N))
    start_points = maps.get_list_of_random_starts(lima_gdf, lima_limits, n=N)
    logger.info('Plotting points in Lima map')
    save_path_lima_point = './data/lima_map/lima_start_points.png'
    maps.plot_map_and_points(lima_gdf, save_path_lima_point, start_points)

    logger.info('Converting UTM points into lat-lon points')
    start_points = maps.UTM_to_latlon(start_points)
    # TODO generate queries for trafic each hour
    # TODO automatize the code to send queries each hour.
    config.time_taken_display(t0)
import numpy as np
import config
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, make_scorer, accuracy_score
from sklearn.model_selection import train_test_split

# Module-level logger and a fixed RNG seed for reproducible splits/CV.
logger = config.config_logger(__name__, 10)
np.random.seed(42)


def split_data(x, y, proportion=0.3):
    """Split x/y into train and test sets; `proportion` is the test share."""
    return train_test_split(x, y, test_size=proportion)


def report_model_output(model_output, label):
    """Log mean and std of an array of CV scores under the given label."""
    logger.info('{2} -- Mean: {0:.3g} -- std: {1:.3g}'.format(
        np.mean(model_output), np.std(model_output), label))
    return


def classification_report_with_accuracy_score(y_true, y_pred):
    """Print the sklearn classification report and return the accuracy score.

    Presumably wrapped via make_scorer (imported above) so the report is
    printed during cross-validation — confirm against callers.
    """
    print(classification_report(y_true, y_pred))  # print classification report
    return accuracy_score(y_true, y_pred)  # return accuracy score


def logistic_cv(x, y, n_cv=10, lam=1):
def main():
    """Build the sportmonks match dataset (optional) and benchmark classifiers on it."""
    t0 = time.time()
    logger = config.config_logger(__name__, 10)
    stats_path = './data/sportmonks/with_data/'
    save_path = './data/sportmonks/'
    create_data = False  # when True, rebuild sportmonks_final.csv from raw league files
    logger.info('Create dataset : {0}'.format(create_data))
    if create_data:
        league_names = work_data.get_data_list(stats_path)
        logger.info('Leagues found: {0}'.format(len(league_names)))
        match_data_tot = pd.DataFrame({})
        # Load, filter and clean each league, then stack them into one frame.
        for league in league_names:
            logger.info('Opening {0}'.format(league))
            match_data_raw = work_data.load_data(stats_path, selection=league,
                                                 date_filter='2016-07-13')
            logger.info('Dimensions of raw data: {0}'.format(match_data_raw.shape))
            match_data_raw = work_data.get_selection(match_data_raw)
            match_data_raw = work_data.fill_selected_vars(match_data_raw)
            match_data_raw = work_data.drop_rows_NA(match_data_raw)
            logger.info('Dimensions after cleaning: {0}'.format(match_data_raw.shape))
            match_data_tot = match_data_tot.append(match_data_raw)
        window = 4  # rolling-average window used by get_averages
        match_data_tot.reset_index(drop=True, inplace=True)
        logger.info('Dimension of all leagues DB: {0}'.format(match_data_tot.shape))
        logger.info('Duplicating vars')
        match_data = work_data.duplicate_stats(match_data_tot)
        logger.info('Finding average with window {0}'.format(window))
        match_data = work_data.get_averages(match_data, window=window)
        logger.info('Dimensions after preprocessing: {0}'.format(match_data.shape))
        logger.info('Stat variables: {0}'.format(len(work_data.stats_variables())))
        match_data.to_csv(save_path + 'sportmonks_final.csv')
        logger.info('Final DB saved')
    # Always (re)load the persisted dataset so both branches converge here.
    logger.info('Opening sportmonks DB')
    match_data = pd.read_csv(save_path + 'sportmonks_final.csv', index_col=1)
    logger.info('Beginning analysis section')
    #predict.descriptive_stats(match_data)
    keep_draws = True
    logger.info('Generating attributes and standardizing - Keep draws: {0}'.format(
        keep_draws))
    y_data, x_data = predict.preprocess_data(match_data, draws=keep_draws)
    x_data = predict.standardize(x_data)
    logger.info('Number of attributes included: {0}'.format(x_data.shape[1]))
    logger.info('Number of obs: {0}'.format(x_data.shape[0]))
    #print(x_data.describe().transpose().to_string())
    # Flags selecting which model families to cross-validate below.
    train_nn = False
    train_logit = False
    train_logit_lasso = False
    train_svm = False
    train_tree = False
    train_adaBoost = True
    train_gbm = True
    train_treeBoost = True
    train_staged = False
    n_cv = 10  # CV folds
    lam_list = [0.001, 0.01, 0.05, 0.1, 0.2, 1, 10, 100]  # regularization grid
    layers = [(10, 5), (20, 10), (30, 10), (50, 10), (100, 10), (100, 20),
              (100, 30), (100, 50, 10)]  # NN architectures to try
    if train_nn:
        logger.info('Training models: Neural Network')
        cv_nn = list()
        for layer in layers:
            temp_nn = predict.neural_network_cv(x_data, y_data, n_cv=n_cv,
                                                layers=layer)
            cv_nn.append(temp_nn)
            print('.', end='')
        print(' ')
        for i in range(len(layers)):
            predict.report_model_output(cv_nn[i], 'NN_{0}'.format(layers[i]))
    if train_logit:
        logger.info('Training models: Logit')
        cv_logit = list()
        for lam in lam_list:
            temp_logit = predict.logistic_cv(x_data, y_data, n_cv=n_cv, lam=lam)
            cv_logit.append(temp_logit)
            print('.', end='')
        print(' ')
        for i in range(len(lam_list)):
            predict.report_model_output(cv_logit[i], 'Logit_{0}'.format(lam_list[i]))
    if train_logit_lasso:
        logger.info('Training models: Logit_Lasso')
        cv_logit_lasso = list()
        for lam in lam_list:
            temp_logit_lasso = predict.logistic_lasso_cv(x_data, y_data, n_cv=n_cv,
                                                         lam=lam)
            cv_logit_lasso.append(temp_logit_lasso)
            print('.', end='')
        print(' ')
        for i in range(len(lam_list)):
            predict.report_model_output(cv_logit_lasso[i],
                                        'Logit_Lasso_{0}'.format(lam_list[i]))
    if train_svm:
        logger.info('Training models: SVM')
        cv_svm = list()
        for lam in lam_list:
            temp_svm = predict.svm_cv(x_data, y_data, n_cv=n_cv, lam=lam)
            cv_svm.append(temp_svm)
            print('.', end='')
        print(' ')
        for i in range(len(lam_list)):
            predict.report_model_output(cv_svm[i], 'SVM_{0}'.format(lam_list[i]))
    if train_tree:
        logger.info('Training models: Decision Tree')
        cv_tree = list()
        temp_tree = predict.tree_cv(x_data, y_data, n_cv=n_cv)
        cv_tree.append(temp_tree)
        predict.report_model_output(cv_tree, 'Tree')
        print(' ')
    if train_adaBoost:
        logger.info('Training models: AdaBoost')
        cv_adaBoost = list()
        temp_adaBoost = predict.adaBoost_cv(x_data, y_data, n_cv=n_cv)
        cv_adaBoost.append(temp_adaBoost)
        predict.report_model_output(cv_adaBoost, 'AdaBoost')
        print(' ')
    if train_gbm:
        logger.info('Training models: gbm')
        cv_gbm = list()
        temp_gbm = predict.gbm_cv(x_data, y_data, n_cv=n_cv)
        cv_gbm.append(temp_gbm)
        predict.report_model_output(cv_gbm, 'GBM')
        print(' ')
        # Additional hyper-parameter grid search for the GBM.
        logger.info('Grid: gbm')
        grid_gbm = predict.gbm_grid(x_data, y_data)
        print(grid_gbm.best_params_)
        print(grid_gbm.best_score_)
    if train_treeBoost:
        logger.info('Training models: TreeBoost')
        cv_treeBoost = list()
        temp_treeBoost = predict.treeBoost_cv(x_data, y_data, n_cv=n_cv)
        cv_treeBoost.append(temp_treeBoost)
        predict.report_model_output(cv_treeBoost, 'TreeBoost')
        print(' ')
    if train_staged:
        # Compare AdaBoost vs xgBoost test error as a function of boosting rounds.
        x_train, x_test, y_train, y_test = predict.split(x_data, y_data, size=0.25)
        adaBoost_model = predict.adaBoost(x_train, y_train, n_iter=300)
        adaBoost_model.fit(x_train, y_train)
        xgBoost_model = predict.xgBoost(x_train, y_train, n_iter=300)
        xgBoost_model.fit(x_train, y_train)
        adaBoost_test_errors = []
        xgBoost_test_errors = []
        for adaBoost_test_predict, xgBoost_test_predict in zip(
                adaBoost_model.staged_predict(x_test),
                xgBoost_model.staged_predict(x_test)):
            adaBoost_test_errors.append(
                1. - accuracy_score(adaBoost_test_predict, y_test))
            xgBoost_test_errors.append(
                1. - accuracy_score(xgBoost_test_predict, y_test))
        n_trees_ada = len(adaBoost_test_errors)
        n_trees_xg = len(xgBoost_test_errors)
        print(n_trees_ada, n_trees_xg)
        logger.info('Best accuracy - AdaBoost: {0:.3f}'.format(
            1 - min(adaBoost_test_errors)))
        logger.info('Best accuracy - xgBoost: {0:.3f}'.format(
            1 - min(xgBoost_test_errors)))
        #plt.figure(figsize=(15, 5))
        #plt.subplot(131)
        plt.figure()
        plt.plot(range(1, n_trees_ada + 1), adaBoost_test_errors, c='black',
                 label='adaBoost')
        plt.plot(range(1, n_trees_xg + 1), xgBoost_test_errors, c='red',
                 label='xgBoost')
        plt.legend()
        plt.ylim(0.2, 0.8)
        plt.ylabel('Test Error')
        plt.xlabel('Number of Trees')
        plt.show()
    #TODO exclude first and last 4 matches
    #TODO redo CV using GridSearchCV
    #TODO use PCA to preprocess the data
    #TODO implement SoftVoting
    config.time_taken_display(t0)
def main():
    """Tag testimony paragraphs with sentiment dictionaries and fit an LDA topic model."""
    logger = config.config_logger(__name__, 10)
    t0 = time.time()
    pdf_path = './data/pdf/'
    txt_path = './data/txt/'
    dict_path = './data/dict/'
    output_path = './output/'
    convert_files = False  # when True, re-extract text from the source PDFs
    logger.info('Begin execution')
    if convert_files:
        logger.info('Coonvert files: {0}'.format(convert_files))  # NOTE(review): typo in message ('Coonvert')
        # Temporarily raise the root logger level to silence the PDF converter.
        logging.getLogger().setLevel(30)
        all_docs = work_data.convert_pdf_to_txt(pdf_path, txt_path)
        logging.getLogger().setLevel(10)
    else:
        logger.info('Import testimonials')
        all_docs = work_data.open_testimonies(txt_path)
    logger.info('Create wordcloud')
    wordcloud_words = work_data.generate_wordcloud(all_docs, output_path)
    wordcloud_words.to_csv('./output/word_count.csv')
    logger.info('Remove protocol paragraphs')
    # Drop protocol boilerplate, then keep only documents with > 5 paragraphs.
    filter_docs = [doc.filter_protocol() for doc in all_docs]
    filter_docs = [doc for doc in filter_docs if len(doc) > 5]
    logger.info('Load dictionaries')
    # Three annotator sentiment dictionaries plus an (unused) agents dictionary.
    dict1 = pd.read_csv(dict_path + 'dict_ale.csv', index_col=0, header=0)
    dict2 = pd.read_csv(dict_path + 'dict_erika.csv', index_col=0, header=0)
    dict3 = pd.read_csv(dict_path + 'dict_macla.csv', index_col=0, header=0)
    dict_agents = pd.read_csv(dict_path + 'dict_agentes.csv', index_col=0, header=0)
    filter_docs = [work_data.input_sentiment(doc, dict1) for doc in filter_docs]
    filter_docs = [work_data.input_sentiment(doc, dict2) for doc in filter_docs]
    filter_docs = [work_data.input_sentiment(doc, dict3) for doc in filter_docs]
    #clean_docs = [work_data.input_agent(doc, dict_agents) for doc in clean_docs]
    # Count how many paragraphs received a sentiment tag.
    tagged = not_tagged = 0
    for i in filter_docs:
        for par in i:
            if par.sentiment != 'none':
                tagged += 1
            else:
                not_tagged += 1
    logger.info('Paragraphs tagged {0} - not tagged {1}'.format(tagged, not_tagged))
    logger.info('Number of testimonials: {0}'.format(len(all_docs)))
    logger.info('Clean testimonials')
    clean_docs = [[parag.clean_data() for parag in doc] for doc in filter_docs]
    # Separate Quechua-language testimonies from the Spanish ones.
    clean_docs, quechua = work_data.extract_quechua(clean_docs)
    logger.info('Testimonials in spanish {0} - quechua {1}'.format(len(clean_docs),
                                                                   len(quechua)))
    print([doc[0].name for doc in quechua])
    # Re-count tags on the cleaned Spanish subset.
    tagged = not_tagged = 0
    for i in clean_docs:
        for par in i:
            if par.sentiment != 'none':
                tagged += 1
            else:
                not_tagged += 1
    logger.info('Paragraphs tagged {0} - not tagged {1}'.format(tagged, not_tagged))
    logger.info('Train model')
    parag_trained1 = work_data.input_sentiment_posneg(filter_docs)
    parag_trained2 = work_data.train_sentiment(clean_docs)
    logger.info('Save preditions')  # NOTE(review): typo in message ('preditions')
    parag_trained1.to_csv(output_path + 'reg_database1.csv')
    parag_trained2.to_csv(output_path + 'reg_database2.csv')
    # LDA implementation
    mat_docs, dictionary = work_data.list_to_matrix(clean_docs)
    print(mat_docs[0])
    pprint.pprint(dictionary.dfs)
    lda_model = work_data.lda_model(dictionary, mat_docs, 10)
    print(lda_model)
    pprint.pprint(lda_model.print_topics(num_topics=10, num_words=10))
    config.time_taken_display(t0)
def runSingleClusterSize(feExpName, feSubName, clExpName, clSubName, feEpoch,
                         clEpoch, dataPath, labelsPath, idsFilePath, outputPath,
                         clusterMinSize, mergeSmallerFirst):
    """Evaluate segmentation pipelines (HDBSCAN / MRF / ClusterNet combinations).

    Runs the feature extractor over the dataset, clusters each embedding map
    with HDBSCAN at the given `clusterMinSize`, optionally post-processes with
    an MRF denoiser and/or a clustering network, saves every intermediate label
    image, and writes per-image plus average scores to statistics.txt.

    Relies on module-level flags/constants (useClusteringNet,
    useMrfAfterHdbScan, useMrfAfterClusteringNet, RESCALE_FACTOR,
    FILE_NAME_ID_LENGTH, float_type) — assumed defined elsewhere in this module.
    """
    makedirs(outputPath, exist_ok=True)
    dataset = CostumeDataset(idsFilePath, dataPath, labelsPath)
    dataLoader = DataLoader(dataset, batch_size=1, shuffle=False)
    # Logger name encodes which experiments/sub-experiments are being evaluated.
    if useClusteringNet:
        loggerExpName = 'evaluation_' + feExpName + '_' + feSubName + '_' + \
            clExpName + '_' + clSubName
    else:
        loggerExpName = 'evaluation_' + feExpName + '_' + feSubName
    logger = config_logger(loggerExpName)
    featureExtractorModel = \
        getFeatureExtractionModel(feExpName, logger, sub_experiment_name=feSubName,
                                  currentEpoch=feEpoch)[0]
    featureExtractorModel.eval()
    if useClusteringNet:
        clusteringModel = getClusterModel(clExpName, logger,
                                          sub_experiment_name=clSubName,
                                          currentEpoch=clEpoch)[0]
        clusteringModel.eval()
    # One Evaluator + result list per pipeline variant.
    hdbEval = Evaluator()
    hdbEvalResults = []
    hdbMrfEval = Evaluator()
    hdbMrfEvalResults = []
    hdbClusterNetEval = Evaluator()
    hdbClusterNetEvalResults = []
    gtClusterNetEval = Evaluator()
    gtClusterNetEvalResults = []
    hdbClusterNetMrfEval = Evaluator()
    hdbClusterNetMrfEvalResults = []
    hdbMrfClusterNetEval = Evaluator()
    hdbMrfClusterNetEvalResults = []
    hdbMrfClusterNetMrfEval = Evaluator()
    hdbMrfClusterNetMrfEvalResults = []
    for i, batch in enumerate(dataLoader):
        inputs = batch['image'].type(float_type)
        labels = batch['label'].cpu().numpy()
        labels = labels[0]  # batch_size is 1; take the single label map
        saveImage(outputPath, 'image', i, batch['originalImage'].cpu().numpy())
        saveLabel(outputPath, 'ground_truth', i, labels)
        features = featureExtractorModel(inputs, None, None)[0]
        # Baseline: HDBSCAN on the raw embeddings.
        clustered = getClusters(features.cpu().numpy(), clusterMinSize)
        saveLabel(outputPath, 'hdbscan', i, clustered)
        hdbEvalResults.append(hdbEval.evaluate(clustered, labels))
        if useMrfAfterHdbScan:
            # MRF denoising at reduced resolution, then upsample back.
            clusteredAndMRF = upsample(
                denoise_colored_image(downsample(clustered, RESCALE_FACTOR)),
                RESCALE_FACTOR)
            saveLabel(outputPath, 'hdbscan_mrf', i, clusteredAndMRF)
            hdbMrfEvalResults.append(
                hdbMrfEval.evaluate(clusteredAndMRF, labels))
        if useClusteringNet:
            # ClusterNet refinement of the HDBSCAN segmentation.
            clusteredInput = convertToClusterNetInput(features, clustered,
                                                      mergeSmallerFirst)
            if clusteredInput.shape[0] > 0:
                noMrfOnInput = clusteringModel(clusteredInput, None)[0]
            else:
                # No segments found: fall back to an all-zero map.
                noMrfOnInput = np.zeros(
                    (1, 1, clusteredInput.shape[2], clusteredInput.shape[3]))
            hdbClusterNetOut = convertIndividualSegmentsToSingleImage(
                noMrfOnInput, mergeSmallerFirst)
            saveLabel(outputPath, 'hdbscan_clusternet', i, hdbClusterNetOut)
            hdbClusterNetEvalResults.append(
                hdbClusterNetEval.evaluate(hdbClusterNetOut, labels))
            # Upper bound: ClusterNet fed with ground-truth segments.
            gtInput = convertToClusterNetInput(features, labels, mergeSmallerFirst)
            if gtInput.shape[0] > 0:
                clusteredGt = clusteringModel(gtInput, None)[0]
            else:
                clusteredGt = np.zeros(
                    (1, 1, gtInput.shape[2], gtInput.shape[3]))
            gtClusterNetOut = convertIndividualSegmentsToSingleImage(
                clusteredGt, mergeSmallerFirst)
            saveLabel(outputPath, 'calc_mean_by_GT_clusternet', i, gtClusterNetOut)
            gtClusterNetEvalResults.append(
                gtClusterNetEval.evaluate(gtClusterNetOut, labels))
        if useMrfAfterHdbScan and useClusteringNet:
            # ClusterNet on the MRF-denoised HDBSCAN output.
            clusteredMrfInput = convertToClusterNetInput(
                features, clusteredAndMRF, mergeSmallerFirst)
            if clusteredMrfInput.shape[0] > 0:
                mrfOnInput = clusteringModel(clusteredMrfInput, None)[0]
            else:
                mrfOnInput = np.zeros((1, 1, clusteredMrfInput.shape[2],
                                       clusteredMrfInput.shape[3]))
            hdbMrfClusterNetOut = convertIndividualSegmentsToSingleImage(
                mrfOnInput, mergeSmallerFirst)
            saveLabel(outputPath, 'hdbscan_mrf_clusternet', i, hdbMrfClusterNetOut)
            hdbMrfClusterNetEvalResults.append(
                hdbMrfClusterNetEval.evaluate(hdbMrfClusterNetOut, labels))
        if useClusteringNet and useMrfAfterClusteringNet:
            # MRF after ClusterNet.
            hdbClusterNetMrfOut = upsample(
                denoise_colored_image(
                    downsample(hdbClusterNetOut, RESCALE_FACTOR)),
                RESCALE_FACTOR)
            saveLabel(outputPath, 'hdbscan_clusternet_mrf', i, hdbClusterNetMrfOut)
            hdbClusterNetMrfEvalResults.append(
                hdbClusterNetMrfEval.evaluate(hdbClusterNetMrfOut, labels))
        if useMrfAfterHdbScan and useClusteringNet and useMrfAfterClusteringNet:
            # Full pipeline: MRF -> ClusterNet -> MRF.
            hdbMrfClusterNetMrfOut = upsample(
                denoise_colored_image(
                    downsample(hdbMrfClusterNetOut, RESCALE_FACTOR)),
                RESCALE_FACTOR)
            saveLabel(outputPath, 'hdbscan_mrf_clusternet_mrf', i,
                      hdbMrfClusterNetMrfOut)
            hdbMrfClusterNetMrfEvalResults.append(
                hdbMrfClusterNetMrfEval.evaluate(hdbMrfClusterNetMrfOut, labels))
    # Append the per-pipeline averages as the last element of each result list.
    hdbEvalResults.append(hdbEval.get_average_results())
    if useMrfAfterHdbScan:
        hdbMrfEvalResults.append(hdbMrfEval.get_average_results())
    if useClusteringNet:
        hdbClusterNetEvalResults.append(
            hdbClusterNetEval.get_average_results())
        gtClusterNetEvalResults.append(gtClusterNetEval.get_average_results())
    if useMrfAfterHdbScan and useClusteringNet:
        hdbMrfClusterNetEvalResults.append(
            hdbMrfClusterNetEval.get_average_results())
    if useClusteringNet and useMrfAfterClusteringNet:
        hdbClusterNetMrfEvalResults.append(
            hdbClusterNetMrfEval.get_average_results())
    if useMrfAfterHdbScan and useClusteringNet and useMrfAfterClusteringNet:
        hdbMrfClusterNetMrfEvalResults.append(
            hdbMrfClusterNetMrfEval.get_average_results())
    # Dump per-image scores followed by averages (last list element).
    with open(join(outputPath, 'statistics.txt'), mode='w') as file:
        for i in range(len(hdbEvalResults) - 1):
            file.write('hdbscan only image ' +
                       str(i).zfill(FILE_NAME_ID_LENGTH) + ': ' +
                       str(hdbEvalResults[i]))
            file.write('\n')
            if useMrfAfterHdbScan:
                file.write('hdbscan and MRF image ' +
                           str(i).zfill(FILE_NAME_ID_LENGTH) + ': ' +
                           str(hdbMrfEvalResults[i]))
                file.write('\n')
            if useClusteringNet:
                file.write('hdbscan and ClusterNet image ' +
                           str(i).zfill(FILE_NAME_ID_LENGTH) + ': ' +
                           str(hdbClusterNetEvalResults[i]))
                file.write('\n')
                file.write('ClusterNet with actual embedding mean image ' +
                           str(i).zfill(FILE_NAME_ID_LENGTH) + ': ' +
                           str(gtClusterNetEvalResults[i]))
                file.write('\n')
            if useMrfAfterHdbScan and useClusteringNet:
                file.write('hdbscan and MRF and ClusterNet image ' +
                           str(i).zfill(FILE_NAME_ID_LENGTH) + ': ' +
                           str(hdbMrfClusterNetEvalResults[i]))
                file.write('\n')
            if useClusteringNet and useMrfAfterClusteringNet:
                file.write('hdbscan and ClusterNet and MRF image ' +
                           str(i).zfill(FILE_NAME_ID_LENGTH) + ': ' +
                           str(hdbClusterNetMrfEvalResults[i]))
                file.write('\n')
            if useMrfAfterHdbScan and useClusteringNet and useMrfAfterClusteringNet:
                file.write('hdbscan and MRF and ClusterNet and MRF image ' +
                           str(i).zfill(FILE_NAME_ID_LENGTH) + ': ' +
                           str(hdbMrfClusterNetMrfEvalResults[i]))
                file.write('\n')
            file.write('\n')
        lastLoc = len(hdbEvalResults) - 1
        file.write('hdbscan only, average score: ' + str(hdbEvalResults[lastLoc]))
        file.write('\n')
        if useMrfAfterHdbScan:
            file.write('hdbscan and MRF, average score: ' +
                       str(hdbMrfEvalResults[lastLoc]))
            file.write('\n')
        if useClusteringNet:
            file.write('hdbscan and ClusterNet, average score: ' +
                       str(hdbClusterNetEvalResults[lastLoc]))
            file.write('\n')
            file.write(
                'ClusterNet using actual embedding mean, average score: ' +
                str(gtClusterNetEvalResults[lastLoc]))
            file.write('\n')
        if useMrfAfterHdbScan and useClusteringNet:
            file.write('hdbscan and MRF and ClusterNet, average score: ' +
                       str(hdbMrfClusterNetEvalResults[lastLoc]))
            file.write('\n')
        if useClusteringNet and useMrfAfterClusteringNet:
            file.write('hdbscan and ClusterNet and MRF, average score: ' +
                       str(hdbClusterNetMrfEvalResults[lastLoc]))
            file.write('\n')
        if useMrfAfterHdbScan and useClusteringNet and useMrfAfterClusteringNet:
            file.write(
                'hdbscan and MRF and ClusterNet and MRF, average score: ' +
                str(hdbMrfClusterNetMrfEvalResults[lastLoc]))
            file.write('\n')
def main():
    """Two-level stacking pipeline for the BBVA client-attrition prediction task."""
    np.random.seed(42)
    logger = config.config_logger(__name__, 10)
    t0 = time.time()
    train_client_path = './data/raw/csv/train_clientes.csv'
    train_reque_path = './data/raw/csv/train_requerimientos.csv'
    test_client_path = './data/raw/csv/test_clientes.csv'
    test_reque_path = './data/raw/csv/test_requerimientos.csv'
    output_path = './output/'
    # Pipeline-stage switches: False reuses the cached CSVs written earlier.
    do_merge = False
    write_impute_test = False
    write_output = False
    add_variables = False
    version = 6
    logger.info('Beginning execution')
    logger.info('Load dataframes')
    test_client = pd.read_csv(test_client_path, header=0)
    test_reque = pd.read_csv(test_reque_path, header=0)
    main_client = pd.read_csv(train_client_path, header=0)
    main_reque = pd.read_csv(train_reque_path, header=0)
    work_data.basic_descriptive(main_client)
    work_data.basic_descriptive(main_reque)
    id_variables = work_data.id_variables()
    index_client = test_client['ID_CORRELATIVO']
    if write_impute_test:
        # Clean + impute the test set and cache it to disk.
        logger.info('Creating new test database')
        logger.info('Cleaning test reque database')
        test_reque = work_data.preprocess_reque(test_reque)
        print(test_reque.head().to_string())
        logger.info('Cleaning test client database - Imputing missing values')
        test_client = work_data.count_missings_column(test_client)
        test_client = work_data.preprocess_client(test_client)
        print(test_client.head().to_string())
        logger.info('Merging test databases')
        temp = pd.concat([test_client, test_reque], axis=1,
                         join_axes=[test_client.index])
        temp.fillna(0, inplace=True)
        test_df = temp
        print(test_df.head().to_string())
        print(test_df.describe().transpose().to_string())
        logger.info('Saving test database')
        test_df.to_csv('./data/mod/test_imputed.csv', index=False)
    else:
        logger.info('Opening test database')
        test_df = pd.read_csv('./data/mod/test_imputed.csv', header=0)
        print(test_df.head().to_string())
    if do_merge:
        # Clean + impute the training set and cache the merged frame.
        logger.info('Creating new merge')
        logger.info('Cleaning reque database')
        main_reque = work_data.preprocess_reque(main_reque)
        print(main_reque.head().to_string())
        #main_reque = pd.pivot_table(main_reque, index=['ID_CORRELATIVO'], columns=['CODMES'], aggfunc=np.sum)
        #main_reque.columns = main_reque.columns.map('{0[0]}|{0[1]}'.format)
        #main_reque.fillna(0, inplace=True)
        logger.info('Cleaning client database - Imputing missing values')
        main_client = work_data.count_missings_column(main_client)
        # Pop the target so preprocessing does not touch it, then re-attach.
        target = main_client.pop('ATTRITION')
        target.index = main_client['ID_CORRELATIVO']
        main_client = work_data.preprocess_client(main_client)
        main_client['ATTRITION'] = target
        print(main_client.head().to_string())
        logger.info('Merging databases')
        temp = pd.concat([main_client, main_reque], axis=1,
                         join_axes=[main_client.index])
        temp.fillna(0, inplace=True)
        main_df = temp
        print(main_df.shape)
        print(main_df.head().to_string())
        print(main_df.describe().transpose().to_string())
        work_data.basic_descriptive(main_df)
        logger.info('Saving marges database')
        main_df.to_csv('./data/mod/merge1.csv', index=False)
    else:
        logger.info('Opening merged database')
        main_df = pd.read_csv('./data/mod/merge1.csv', header=0)
        print(main_df.head().to_string())
        print(main_df.shape)
    y = main_df.pop('ATTRITION')
    # Stack train + test so feature engineering sees both consistently.
    main_df = main_df.append(test_df).reset_index(drop=True)
    if False:
        # Disabled: T-SNE projection is expensive; the cached CSV is used below.
        logger.info('Creating T-SNE database')
        temp_tsne = pd.DataFrame(models.tnse(main_df))
        temp_tsne.to_csv('./data/mod/merge1_tsne.csv', index=False)
    else:
        logger.info('Loading T-SNE database')
        temp_tsne = pd.read_csv('./data/mod/merge1_tsne.csv')
    if add_variables:
        # Engineered features ext1..ext11 (row stats, k-means labels, knn distances).
        logger.info('Beginning feature engineering')
        logger.info('Interactions')
        main_df_feat = models.create_interactions(main_df, models.inter_vars())
        logger.info('Row sums 1-3')
        main_df_feat['ext1'] = main_df.apply(lambda row: (row == 0).sum(), axis=1)
        temp = models.standard_scale_df(main_df)
        main_df_feat['ext2'] = temp.apply(lambda row: (row > 0.5).sum(), axis=1)
        main_df_feat['ext3'] = temp.apply(lambda row: (row < -0.5).sum(), axis=1)
        logger.info('K-means 4-7')
        main_df_feat['ext4'] = pd.Series(models.kmeans(main_df, 5)).apply(str)
        main_df_feat['ext5'] = pd.Series(models.kmeans(main_df, 10)).apply(str)
        main_df_feat['ext6'] = pd.Series(models.kmeans(main_df, 15)).apply(str)
        main_df_feat['ext7'] = pd.Series(models.kmeans(main_df, 20)).apply(str)
        logger.info('KNN 8-11')
        main_df_feat['ext8'] = models.knn_distance(main_df, 2)
        main_df_feat['ext9'] = models.knn_distance(main_df, 3)
        main_df_feat['ext10'] = models.knn_distance(main_df, 5)
        main_df_feat['ext11'] = models.knn_distance(temp_tsne, 2)
        main_df_feat = pd.get_dummies(main_df_feat, drop_first=True)
        print(main_df_feat.head().to_string())
        print(main_df_feat.shape)
        config.time_taken_display(t0)
        logger.info('Saving features database')
        main_df_feat.to_csv('./data/mod/merge1_features.csv', index=False)
    else:
        logger.info('Opening feature engineered database')
        main_df_feat = pd.read_csv('./data/mod/merge1_features.csv', header=0)
        print(main_df_feat.head().to_string())
        print(main_df_feat.shape)
    logger.info('Split data into train and test')
    # First 70000 rows are the labelled training set; the rest is the test set.
    x, test_df = main_df_feat.iloc[:70000, :], main_df_feat.iloc[70000:, :]
    print(main_df_feat.shape)
    print(x.shape)
    print(test_df.shape)
    x_train, x_test, y_train, y_test = models.split_data(x, y)
    work_data.basic_descriptive(x_train)
    logger.info('Level 1 - Create metafeatures')
    if False:
        # Disabled: level-1 base models whose out-of-fold predictions were
        # already written to ./data/mod/meta_features by previous runs.
        logger.info('1. Ridge logit')
        ridge_model = models.logit_grid(x, y, 'l2', StandardScaler())
        models.write_prediction(ridge_model, main_df_feat, index_client,
                                'ridge_standard')
        print(ridge_model.score(x_test, y_test))
        logger.info('2. Lasso logit')
        lasso_model = models.logit_grid(x, y, 'l1', StandardScaler())
        models.write_prediction(lasso_model, main_df_feat, index_client,
                                'lasso_standard')
        print(lasso_model.score(x_test, y_test))
        logger.info('3. Random Forrest')
        RF_model = models.random_forrest_grid(x, y, StandardScaler())
        models.write_prediction(RF_model, main_df_feat, index_client,
                                'RF_standard')
        print(RF_model.score(x_test, y_test))
        logger.info('4. Extra Trees')
        ET_model = models.extra_trees_grid(x, y, StandardScaler())
        models.write_prediction(ET_model, main_df_feat, index_client,
                                'ET_standard')
        print(ET_model.score(x_test, y_test))
        logger.info('5. 2-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 2)
        models.write_prediction(KNN_model, main_df_feat, index_client,
                                'KNN2_standard')
        print(KNN_model.score(x_test, y_test))
        logger.info('6. 4-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 4)
        models.write_prediction(KNN_model, main_df_feat, index_client,
                                'KNN4_standard')
        print(KNN_model.score(x_test, y_test))
        logger.info('7. 8-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 8)
        models.write_prediction(KNN_model, main_df_feat, index_client,
                                'KNN8_standard')
        print(KNN_model.score(x_test, y_test))
        logger.info('8. 16-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 16)
        models.write_prediction(KNN_model, main_df_feat, index_client,
                                'KNN16_standard')
        print(KNN_model.score(x_test, y_test))
        logger.info('9. 32-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 32)
        models.write_prediction(KNN_model, main_df_feat, index_client,
                                'KNN32_standard')
        print(KNN_model.score(x_test, y_test))
        logger.info('10. 64-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 64)
        models.write_prediction(KNN_model, main_df_feat, index_client,
                                'KNN64_standard')
        print(KNN_model.score(x_test, y_test))
        logger.info('11. 128-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 128)
        models.write_prediction(KNN_model, main_df_feat, index_client,
                                'KNN128_standard')
        print(KNN_model.score(x_test, y_test))
        logger.info('12. 256-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 256)
        models.write_prediction(KNN_model, main_df_feat, index_client,
                                'KNN256_standard')
        print(KNN_model.score(x_test, y_test))
        logger.info('13. 512-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 512)
        models.write_prediction(KNN_model, main_df_feat, index_client,
                                'KNN512_standard')
        print(KNN_model.score(x_test, y_test))
        logger.info('14. 1024-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 1024)
        models.write_prediction(KNN_model, main_df_feat, index_client,
                                'KNN1024_standard')
        print(KNN_model.score(x_test, y_test))
        logger.info('15. Naive Bayes')
        NB_model = models.naive_bayes_grid(x, y, StandardScaler())
        models.write_prediction(NB_model, main_df_feat, index_client,
                                'NB_standard')
        print(NB_model.score(x_test, y_test))
        logger.info('16. MPL')
        MLP_model = models.MLP_grid(x, y, StandardScaler())
        models.write_prediction(MLP_model, main_df_feat, index_client,
                                'MLP_standard')
        print(MLP_model.score(x_test, y_test))
        logger.info('17. AdaBoost')
        adaboost_model = models.adaboost_grid(x, y, StandardScaler())
        models.write_prediction(adaboost_model, main_df_feat, index_client,
                                'adaboost_standard')
        print(adaboost_model.score(x_test, y_test))
        logger.info('18. GBM')
        gbm_model = models.gbm_grid(x, y, StandardScaler())
        models.write_prediction(gbm_model, main_df_feat, index_client,
                                'gbm_standard')
        print(gbm_model.score(x_test, y_test))
        logger.info('18. LightGBM')
        lgbm_model = models.lgbm_grid(x, y, None)
        models.write_prediction(lgbm_model, main_df_feat, index_client,
                                'lgbm_none')
        print(lgbm_model.score(x_test, y_test))
        logger.info('19. XgBoost')
        test_final = main_df_feat.iloc[70000:, :]
        id_test = test_client['ID_CORRELATIVO']
        xgboost_model = models.xgboost_grid(x, y, StandardScaler())
        models.write_prediction(xgboost_model, main_df_feat, index_client,
                                'xgboost_standard')
        print(xgboost_model.score(x_test, y_test))
        models.write_prediction(xgboost_model, test_final, id_test, 'ATTRITION')
    hi  # NOTE(review): undefined name — raises NameError, apparently a deliberate execution stopper; remove to run stage 2
    # Stage 2:
    logger.info('Level 2')
    logger.info('Creating meta-features database')
    # Assemble the level-1 out-of-fold predictions plus the original features.
    meta_features_list = os.listdir('./data/mod/meta_features')
    temp = {}
    for feature in meta_features_list:
        temp_df = pd.read_csv('./data/mod/meta_features/{0}'.format(feature),
                              header=0)
        temp[feature] = temp_df.iloc[:, 1]
    meta_features = pd.DataFrame(temp)
    meta_features = pd.concat([meta_features, main_df_feat], axis=1,
                              ignore_index=True)
    x = meta_features.iloc[:70000, :]
    test_final = meta_features.iloc[70000:, :]
    x_train, x_test, y_train, y_test = models.split_data(x, y)
    print(x_train.shape)
    print(test_final.shape)
    print(x.shape)
    logger.info('Estimating second level model with XgBoost')
    xgboost_final = models.xgboost_full_mod(x_train, y_train)
    print(xgboost_final.score(x_test, y_test))
    print(models.get_logloss(y_test, xgboost_final.predict_proba(x_test)[:, 1]))
    models.write_final_prediction(xgboost_final, test_final,
                                  test_client['ID_CORRELATIVO'], 'results8')
    models.write_final_prediction(xgboost_final, x, main_client['ATTRITION'],
                                  'train')
    config.time_taken_display(t0)
    hi  # NOTE(review): second execution stopper — code below never runs
    logger.info('XgBoost')
    xgboost_result = models.xgboost_grid(x_train, y_train, x_test, y_test)
    print('Test grid: {0}'.format(xgboost_result))
    #Test: -0.322
    xgboost_full = models.xgboost_full_mod(x_train, y_train, x_test, y_test)
    print(xgboost_full)
    xgbfir.saveXgbFI(xgboost_full, feature_names=main_df.columns,
                     OutputXlsxFile='./data/mod/bbva.xlsx')
def main():
    """Zika-dataset pipeline: load the raw CSV, build the dengue target,
    grid-search GBM / Logit / AdaBoost, then soft-vote the three grids.

    Relies on the project-local ``config``, ``process`` and ``models`` modules
    for logging, preprocessing and estimation; writes nothing to disk itself.
    """
    LOGGER_LEVEL = 10
    RAW_DATA_PATH = './data/raw/'
    RAW_CSV_NAME = 'raw_data.csv'

    t0 = time.time()
    logger = config.config_logger(__name__, LOGGER_LEVEL)
    pd.set_option('display.float_format', lambda x: '{0:.2f}'.format(x))

    logger.info('Beginning execution: zika dataset')
    logger.info('Logger configured - level {0}'.format(LOGGER_LEVEL))
    logger.info('Opening CSV: {0}{1}'.format(RAW_DATA_PATH, RAW_CSV_NAME))

    df = pd.read_csv(RAW_DATA_PATH + RAW_CSV_NAME)
    logger.info('Raw dataset description:')
    process.basic_descriptives(df)
    df = process.preprocess(df)

    # One PCR column per disease; select_disease collapses them into one target.
    # code: 1. Dengue, 2. Zika, 3. Chik, 4. Any
    # only_one: if True, input np.nan to patients with another disease.
    disease_targets = [df['dengue_pcr'], df['zika_pcr'], df['chik_pcr']]
    y = process.select_disease(disease_targets, code=1, only_one=False)
    logger.info('Target var frequency: \n{0}'.format(y.value_counts()))
    logger.info('Total obs: {0}'.format(y.value_counts().sum()))

    # Identifiers, geography and the label-leaking PCR/serotype columns go.
    drop_cols = ['id', 'centro_pob', 'name', 'dep', 'prov', 'dist',
                 'serotipo1', 'serotipo2', 'serotipo3', 'serotipo4',
                 'dengue_pcr', 'zika_pcr', 'chik_pcr']
    X = process.remove_vars(df, drop_cols)
    X = process.keep_non_nan(X, y)
    y = y.dropna()

    logger.info('Features dataset')
    process.basic_descriptives(X)

    logger.info('Split train test')
    X_train, X_test, y_train, y_test = models.split_data(X, y, proportion=0.4)

    logger.info('Estimating models')

    def fit_and_report(label, grid_fn):
        # Fit one grid search and log its winning params and scores.
        logger.info(label)
        grid = grid_fn(X_train, y_train, n_cv=5)
        logger.info(grid.best_params_)
        logger.info('Train score: {0}'.format(grid.best_score_))
        logger.info('Test score: {0}'.format(grid.score(X_test, y_test)))
        return grid

    grid_gbm = fit_and_report('GBM', models.gbm_grid)
    grid_logit = fit_and_report('Logit', models.logit_grid)
    grid_adaboost = fit_and_report('AdaBoost', models.adaboost_grid)

    logger.info('Soft Voting')
    voter = VotingClassifier(
        estimators=[('gbm', grid_gbm),
                    ('logit', grid_logit),
                    ('ada', grid_adaboost)],
        voting='soft')
    voter.fit(X_train, y_train)
    proba = voter.predict_proba(X_test)
    print(proba[:5,:])
    logger.info('Train score: {0}'.format(voter.score(X_train, y_train)))
    logger.info('Test score: {0}'.format(voter.score(X_test, y_test)))

    config.time_taken_display(t0)
import argparse from config import config_logger from controller.executor import Executor if __name__ == '__main__': config_logger() parser = argparse.ArgumentParser() parser.add_argument('-c', '--cash', type=int, help='initial capital of cash', default=10000) parser.add_argument('-v', '--volume', type=int, help='initial stock volume', default=100) parser.add_argument('-sc', '--stock_code', type=str, help='stock code', default='000001') parser.add_argument( '-s', '--strategy', type=str, help='strategy type -"rsi_hf" and "rsi_lf" are supported', default='rsi_hf') parser.add_argument('-ts', '--time_span',