Beispiel #1
0
    def __init__(self, experiment_name):
        """Initialize the experiment: parameter grid, checkpoint dir, logger,
        and resume state (restored from saved metadata when available)."""
        self.experiment_name = experiment_name
        # Hyper-parameter grid explored by this experiment.
        self.params_search = {
            "dv": [2],
            "dd": [20],
            "include_edges": [False, True],
            "include_weighted_mean": [False, True],
            "alpha": [2],
        }
        # Each key is a feature flag; its listed parameters are only
        # meaningful when that flag is turned on.
        self.features_params = {
            "include_edges": ["edges_max_pixels"],
            "include_weighted_mean": ["center_weight", "corners_weight"],
        }
        self._build_checkpoints_dir()
        self.logger = config_logger(self.experiment_name)
        metadata = self.load_experiment()

        if not metadata:
            # Fresh run: build the execution order and reset all progress state.
            self.execution_order = self._build_execution_order()
            self.execution_counter = 0
            self.best_experiment_index = None
            self.min_loss = None
            self.logger.info("Starting experiment %s from scratch", self.experiment_name)
        else:
            # Resume exactly where the previous run left off.
            self.execution_order = metadata[EXECUTION_ORDER]
            self.execution_counter = metadata[EXECUTION_COUNTER]
            self.best_experiment_index = metadata[BEST_EXPERIMENT_INDEX]
            self.min_loss = metadata[MIN_LOSS]
            self.logger.info("Starting experiment %s from %s out of %s", self.experiment_name,
                             self.execution_counter, len(self.execution_order))
Beispiel #2
0
def main():
    # Objective: create a csv file with the ubigeo code of all centro poblados in Peru.
    logger = config.config_logger(__name__, 10)
    t0 = time.time()

    logger.info('Beginning execution')
    raw = pd.read_csv('./data/ubigeo/Ubigeo_mod.csv', header=0)
    # NOTE(review): 'districts' keeps rows where 'district' IS NaN while
    # 'departments' keeps rows where 'region' is NOT NaN (note the ~ only on
    # the second filter) -- confirm this asymmetry is intended.
    districts = raw.loc[raw['district'].apply(np.isnan)]
    print(districts.shape)
    departments = raw.loc[~raw['region'].apply(np.isnan)]
    n_centro_pob = sum(departments['centro_pob'])
    logger.info('There are {0:,} centros poblados in Peru'.format(n_centro_pob))

    output = []
    for index, row in districts.iterrows():
        # 'centro_pob' is loaded as float when the column contains NaNs;
        # cast so range() accepts it.
        temp_centros = int(row['centro_pob'])
        temp_code = row['code']
        for centro in range(1, temp_centros + 1):
            # Build the 10-digit ubigeo: district code followed by a
            # 4-digit centro id, zero-padded on the left.
            centro_code = add_zero_left(centro, 4)
            centro_code = str(temp_code) + centro_code
            centro_code = add_zero_left(centro_code, 10)
            output.append(centro_code)

    print(len(output))  # This should be equal to the number of centros poblados.

    print(raw.head().to_string())
    print(districts.head().to_string())
    print(departments.head().to_string())

    output = pd.DataFrame({'ubigeo': output})
    output.to_csv('./data/ubigeo/ubigeo_final.csv')

    config.time_taken_display(t0)
Beispiel #3
0
def run(current_experiment, currentEpoch, data_path, labels_path, ids_path):
    """Run the feature extractor over every batch and visualize the results."""
    dataset = CostumeDataset(ids_path, data_path, labels_path, img_h=224, img_w=224)
    loader = DataLoader(dataset)

    # Set up an experiment
    logger = config_logger(current_experiment)
    fe = getFeatureExtractionModel(current_experiment, logger,
                                   currentEpoch=currentEpoch)[0]

    fe.eval()
    for batch_idx, sample in enumerate(loader):
        inputs = Variable(sample['image'].type(float_type))
        labels = sample['label'].cpu().numpy()
        features = fe(inputs, None, None)[0]
        # Move everything to flat numpy arrays for plotting.
        inputs_np = inputs.cpu().numpy().squeeze()
        features_np = features.cpu().numpy().squeeze()
        visualize(inputs_np, labels.squeeze(), features_np, current_experiment, batch_idx)

    return
Beispiel #4
0
def main():
    """Download Sportmonks stats for a fixed list of leagues and save one CSV
    per league (downloads only happen when GET_STATS is enabled)."""
    LOGGER_LEVEL = 20
    cfg_name = './config/_credentials.cfg'
    OUTPUT_PATH = './data/sportmonks/'
    GET_STATS = False
    league_id_list = [
        2, 5, 72, 74, 78, 82, 85, 208, 462, 564, 570, 384, 390, 8, 9, 24, 12,
        600, 301, 304, 453, 1114, 292, 654, 651, 444, 573, 579
    ]

    t0 = time.time()
    pd.set_option('display.float_format', lambda x: '{0:.2f}'.format(x))
    logger = config.config_logger(__name__, level=LOGGER_LEVEL)

    # API key lives in a local credentials file, outside version control.
    parser = configparser.ConfigParser()
    parser.read(cfg_name)
    my_key = str(parser.get('Sportmonks', 'key'))

    logger.info('Beginning execution')
    sportmonks.init(my_key)

    logger.info('Available leagues:')
    leagues_dict = {}
    for league in sportmonks.leagues():
        print(league['id'], league['name'], league['country_id'])
        leagues_dict[league['id']] = league['name']

    for league_id in league_id_list:
        if not GET_STATS:
            continue
        league_name = leagues_dict[league_id]
        logger.info('Sending query - Stats')
        logger.info('League selected: {0} - {1}'.format(league_id, league_name))
        league_json = sportmonks.league(
            league_id,
            include='seasons.fixtures.stats,seasons.fixtures.localTeam,seasons.fixtures.visitorTeam')

        logger.info('Processing package')
        league_df = sportmonks.league_into_dataframe(league_json)
        #print(league_df.head().to_string())
        logger.info('Dimensions of the dataframe: {0}'.format(league_df.shape))

        save_name = '{0}{1}'.format(league_id, league_name.replace(' ', '_'))
        logger.info('Saving CSV: {0}'.format(OUTPUT_PATH + save_name))
        league_df.to_csv(OUTPUT_PATH + save_name + '.csv')

    config.time_taken_display(t0)
    print(' ')
def run(current_experiment, currentEpoch, data_path, labels_path, ids_path):
    """Project per-pixel embeddings to 2-D with t-SNE and save one scatter
    plot per image, colored by instance label.

    Plots are written to cluster_visualizations/<current_experiment>/<i>.png.
    """
    # exist_ok replaces the old bare try/except around makedirs, which also
    # hid real failures such as permission errors.
    os.makedirs(os.path.join('cluster_visualizations', current_experiment), exist_ok=True)

    dataset = CostumeDataset(ids_path, data_path, labels_path, img_h=224, img_w=224)
    dataloader = DataLoader(dataset)

    # Set up an experiment
    exp_logger = config_logger(current_experiment)
    fe = getFeatureExtractionModel(current_experiment, exp_logger, currentEpoch=currentEpoch)[0]

    fe.eval()
    for i, batch in enumerate(dataloader):
        try:
            inputs = Variable(batch['image'].type(float_type))
            features, _ = fe(inputs, None, None)
            features = features.cpu().numpy().squeeze()
            features = np.transpose(features, [1, 2, 0])  # transpose to (h,w,c)

            labels = batch['label'].cpu().numpy()
            labels = labels.squeeze()
            flat_labels = labels.flatten()

            h = features.shape[0]
            w = features.shape[1]
            c = features.shape[2]

            flat_features = np.reshape(features, [h * w, c])

            # find tsne coords for 2 dimensions
            tsne = TSNE(n_components=2, random_state=0)
            np.set_printoptions(suppress=True)
            features_2d = tsne.fit_transform(flat_features)

            instances = np.unique(flat_labels)

            plt.figure(figsize=(6, 5))
            colors = 'r', 'g', 'b', 'c', 'm', 'y', 'k', 'w', 'orange', 'purple'
            for idx, instance in enumerate(instances):
                # Wrap around the palette: previously >10 instances raised
                # IndexError, which the bare except silently turned into a
                # skipped image.
                plt.scatter(features_2d[flat_labels == instance, 0],
                            features_2d[flat_labels == instance, 1],
                            c=colors[idx % len(colors)], label=instance)
            plt.legend()
            plt.savefig(os.path.join('cluster_visualizations', current_experiment, "%s.png" % i))
            plt.close()
            print("Done %s" % i)
        except Exception:
            # Keep the best-effort per-image behavior, but record the failure
            # instead of swallowing it silently.
            exp_logger.exception('Failed to visualize batch %s', i)
            continue
    return
Beispiel #6
0
def main():
    """Load the Lima map, sample random start points inside it, plot both,
    and convert the sampled UTM points to lat/lon."""
    t0 = time.time()
    LIMA_MAP = './data/lima_map/limaPolyUTM.geojson'
    cfg_name = './config/_credentials.cfg'
    LOGGER_LEVEL = 10
    N = 2000

    # Google Maps API key comes from a local credentials file.
    parser = configparser.ConfigParser()
    parser.read(cfg_name)
    my_key = str(parser.get('key', 'key'))

    logger = config.config_logger(__name__, LOGGER_LEVEL)
    logger.info('Beginning execution: GOOGLE URBAN')
    logger.info('Logger configured - level {0}'.format(LOGGER_LEVEL))

    logger.info('Logging in Google Maps API')
    gmaps = googlemaps.Client(key=my_key)

    logger.info('Opening Lima map: {0}'.format(LIMA_MAP))
    lima_gpd = maps.load_map(LIMA_MAP)

    logger.info('Getting Lima limits')
    lima_limits = maps.boundaries(lima_gpd)

    logger.info('Plotting Lima map')
    maps.plot_map(lima_gpd, './data/lima_map/lima.png', lima_limits)

    logger.info('Getting {0} random starts'.format(N))
    start_points = maps.get_list_of_random_starts(lima_gpd, lima_limits, n=N)

    logger.info('Plotting points in Lima map')
    maps.plot_map_and_points(lima_gpd, './data/lima_map/lima_start_points.png', start_points)

    logger.info('Converting UTM points into lat-lon points')
    start_points = maps.UTM_to_latlon(start_points)

    #TODO generate queries for trafic each hour
    #TODO automatize the code to send queries each hour.

    config.time_taken_display(t0)
Beispiel #7
0
import numpy as np
import config
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, make_scorer, accuracy_score
from sklearn.model_selection import train_test_split

# Module-level logger (DEBUG-ish level 10) shared by the helpers below.
logger = config.config_logger(__name__, 10)
# Fix the RNG seed so train/test splits and model runs are reproducible.
np.random.seed(42)


def split_data(x, y, proportion=0.3):
    """Split (x, y) into train/test sets, holding out `proportion` for test.

    Returns the (x_train, x_test, y_train, y_test) tuple from sklearn.
    """
    holdout = proportion
    return train_test_split(x, y, test_size=holdout)


def report_model_output(model_output, label):
    """Log the mean and standard deviation of a CV score array under `label`."""
    mean_score = np.mean(model_output)
    std_score = np.std(model_output)
    logger.info('{2} -- Mean: {0:.3g} -- std: {1:.3g}'.format(mean_score, std_score, label))
    return


def classification_report_with_accuracy_score(y_true, y_pred):
    """Print the full classification report, then return the plain accuracy.

    Intended for use as a scorer callback (e.g. via make_scorer) so the
    report is printed as a side effect of scoring.
    """
    report = classification_report(y_true, y_pred)
    print(report)  # print classification report
    return accuracy_score(y_true, y_pred)  # return accuracy score


def logistic_cv(x, y, n_cv=10, lam=1):
Beispiel #8
0
def main():
    """End-to-end Sportmonks pipeline: optionally rebuild the consolidated
    match dataset from per-league CSVs, then run a battery of cross-validated
    classifiers (NN, logit, lasso, SVM, tree, boosting variants) on it.

    Which models run is controlled by the train_* flags below; `create_data`
    controls whether sportmonks_final.csv is regenerated first.
    """
    t0 = time.time()
    logger = config.config_logger(__name__, 10)

    stats_path = './data/sportmonks/with_data/'
    save_path = './data/sportmonks/'
    create_data = False

    logger.info('Create dataset : {0}'.format(create_data))
    if create_data:
        # --- Dataset construction: load, clean and concatenate every league.
        league_names = work_data.get_data_list(stats_path)
        logger.info('Leagues found: {0}'.format(len(league_names)))

        match_data_tot = pd.DataFrame({})
        for league in league_names:
            logger.info('Opening {0}'.format(league))
            match_data_raw = work_data.load_data(stats_path,
                                                 selection=league,
                                                 date_filter='2016-07-13')
            logger.info('Dimensions of raw data: {0}'.format(
                match_data_raw.shape))
            match_data_raw = work_data.get_selection(match_data_raw)
            match_data_raw = work_data.fill_selected_vars(match_data_raw)
            match_data_raw = work_data.drop_rows_NA(match_data_raw)
            logger.info('Dimensions after cleaning: {0}'.format(
                match_data_raw.shape))
            # NOTE(review): DataFrame.append is deprecated in newer pandas;
            # fine for the pinned version this project uses.
            match_data_tot = match_data_tot.append(match_data_raw)

        # Rolling window (in matches) used when averaging team statistics.
        window = 4
        match_data_tot.reset_index(drop=True, inplace=True)
        logger.info('Dimension of all leagues DB: {0}'.format(
            match_data_tot.shape))
        logger.info('Duplicating vars')
        match_data = work_data.duplicate_stats(match_data_tot)
        logger.info('Finding average with window {0}'.format(window))
        match_data = work_data.get_averages(match_data, window=window)
        logger.info('Dimensions after preprocessing: {0}'.format(
            match_data.shape))
        logger.info('Stat variables: {0}'.format(
            len(work_data.stats_variables())))
        match_data.to_csv(save_path + 'sportmonks_final.csv')
        logger.info('Final DB saved')

    logger.info('Opening sportmonks DB')
    # NOTE(review): index_col=1 — presumably the second column holds the match
    # identifier; confirm against the CSV written above.
    match_data = pd.read_csv(save_path + 'sportmonks_final.csv', index_col=1)

    logger.info('Beginning analysis section')
    #predict.descriptive_stats(match_data)

    keep_draws = True
    logger.info(
        'Generating attributes and standardizing - Keep draws: {0}'.format(
            keep_draws))
    y_data, x_data = predict.preprocess_data(match_data, draws=keep_draws)
    x_data = predict.standardize(x_data)
    logger.info('Number of attributes included: {0}'.format(x_data.shape[1]))
    logger.info('Number of obs: {0}'.format(x_data.shape[0]))
    #print(x_data.describe().transpose().to_string())

    # Model-selection switches: only the flagged models are trained below.
    train_nn = False
    train_logit = False
    train_logit_lasso = False
    train_svm = False
    train_tree = False
    train_adaBoost = True
    train_gbm = True
    train_treeBoost = True
    train_staged = False
    n_cv = 10
    # Regularization strengths tried for logit/lasso/SVM.
    lam_list = [0.001, 0.01, 0.05, 0.1, 0.2, 1, 10, 100]
    # Hidden-layer architectures tried for the neural network.
    layers = [(10, 5), (20, 10), (30, 10), (50, 10), (100, 10), (100, 20),
              (100, 30), (100, 50, 10)]

    if train_nn:
        logger.info('Training models: Neural Network')
        cv_nn = list()
        for layer in layers:
            temp_nn = predict.neural_network_cv(x_data,
                                                y_data,
                                                n_cv=n_cv,
                                                layers=layer)
            cv_nn.append(temp_nn)
            print('.', end='')
        print(' ')
        for i in range(len(layers)):
            predict.report_model_output(cv_nn[i], 'NN_{0}'.format(layers[i]))

    if train_logit:
        logger.info('Training models: Logit')
        cv_logit = list()
        for lam in lam_list:
            temp_logit = predict.logistic_cv(x_data,
                                             y_data,
                                             n_cv=n_cv,
                                             lam=lam)
            cv_logit.append(temp_logit)
            print('.', end='')
        print(' ')
        for i in range(len(lam_list)):
            predict.report_model_output(cv_logit[i],
                                        'Logit_{0}'.format(lam_list[i]))

    if train_logit_lasso:
        logger.info('Training models: Logit_Lasso')
        cv_logit_lasso = list()
        for lam in lam_list:
            temp_logit_lasso = predict.logistic_lasso_cv(x_data,
                                                         y_data,
                                                         n_cv=n_cv,
                                                         lam=lam)
            cv_logit_lasso.append(temp_logit_lasso)
            print('.', end='')
        print(' ')
        for i in range(len(lam_list)):
            predict.report_model_output(cv_logit_lasso[i],
                                        'Logit_Lasso_{0}'.format(lam_list[i]))

    if train_svm:
        logger.info('Training models: SVM')
        cv_svm = list()
        for lam in lam_list:
            temp_svm = predict.svm_cv(x_data, y_data, n_cv=n_cv, lam=lam)
            cv_svm.append(temp_svm)
            print('.', end='')
        print(' ')
        for i in range(len(lam_list)):
            predict.report_model_output(cv_svm[i],
                                        'SVM_{0}'.format(lam_list[i]))

    if train_tree:
        logger.info('Training models: Decision Tree')
        cv_tree = list()
        temp_tree = predict.tree_cv(x_data, y_data, n_cv=n_cv)
        cv_tree.append(temp_tree)
        predict.report_model_output(cv_tree, 'Tree')
        print(' ')

    if train_adaBoost:
        logger.info('Training models: AdaBoost')
        cv_adaBoost = list()
        temp_adaBoost = predict.adaBoost_cv(x_data, y_data, n_cv=n_cv)
        cv_adaBoost.append(temp_adaBoost)
        predict.report_model_output(cv_adaBoost, 'AdaBoost')
        print(' ')

    if train_gbm:
        logger.info('Training models: gbm')
        cv_gbm = list()
        temp_gbm = predict.gbm_cv(x_data, y_data, n_cv=n_cv)
        cv_gbm.append(temp_gbm)
        predict.report_model_output(cv_gbm, 'GBM')
        print(' ')

        logger.info('Grid: gbm')
        grid_gbm = predict.gbm_grid(x_data, y_data)
        print(grid_gbm.best_params_)
        print(grid_gbm.best_score_)

    if train_treeBoost:
        logger.info('Training models: TreeBoost')
        cv_treeBoost = list()
        temp_treeBoost = predict.treeBoost_cv(x_data, y_data, n_cv=n_cv)
        cv_treeBoost.append(temp_treeBoost)
        predict.report_model_output(cv_treeBoost, 'TreeBoost')
        print(' ')

    if train_staged:
        # Compare AdaBoost vs xgBoost test error as a function of the number
        # of boosting stages, on a held-out 25% split.
        x_train, x_test, y_train, y_test = predict.split(x_data,
                                                         y_data,
                                                         size=0.25)
        adaBoost_model = predict.adaBoost(x_train, y_train, n_iter=300)
        adaBoost_model.fit(x_train, y_train)

        xgBoost_model = predict.xgBoost(x_train, y_train, n_iter=300)
        xgBoost_model.fit(x_train, y_train)

        adaBoost_test_errors = []
        xgBoost_test_errors = []

        for adaBoost_test_predict, xgBoost_test_predict in zip(
                adaBoost_model.staged_predict(x_test),
                xgBoost_model.staged_predict(x_test)):
            adaBoost_test_errors.append(
                1. - accuracy_score(adaBoost_test_predict, y_test))
            xgBoost_test_errors.append(
                1. - accuracy_score(xgBoost_test_predict, y_test))

        n_trees_ada = len(adaBoost_test_errors)
        n_trees_xg = len(xgBoost_test_errors)
        print(n_trees_ada, n_trees_xg)
        logger.info('Best accuracy - AdaBoost: {0:.3f}'.format(
            1 - min(adaBoost_test_errors)))
        logger.info('Best accuracy - xgBoost: {0:.3f}'.format(
            1 - min(xgBoost_test_errors)))

        #plt.figure(figsize=(15, 5))
        #plt.subplot(131)
        plt.figure()
        plt.plot(range(1, n_trees_ada + 1),
                 adaBoost_test_errors,
                 c='black',
                 label='adaBoost')
        plt.plot(range(1, n_trees_xg + 1),
                 xgBoost_test_errors,
                 c='red',
                 label='xgBoost')
        plt.legend()
        plt.ylim(0.2, 0.8)
        plt.ylabel('Test Error')
        plt.xlabel('Number of Trees')
        plt.show()

    #TODO exclude first and last 4 matches
    #TODO redo CV using GridSearchCV
    #TODO use PCA to preprocess the data
    #TODO implement SoftVoting

    config.time_taken_display(t0)
Beispiel #9
0
def _count_tagged(docs):
    """Return (tagged, not_tagged) paragraph counts over a list of documents.

    A paragraph counts as tagged when its `sentiment` attribute differs from
    the string 'none'.
    """
    tagged = not_tagged = 0
    for doc in docs:
        for par in doc:
            if par.sentiment != 'none':
                tagged += 1
            else:
                not_tagged += 1
    return tagged, not_tagged


def main():
    """Testimonies pipeline: convert/load documents, build a wordcloud, tag
    paragraph sentiment from three dictionaries, clean the text, train the
    sentiment models and an LDA topic model, and save all outputs."""
    logger = config.config_logger(__name__, 10)
    t0 = time.time()

    pdf_path = './data/pdf/'
    txt_path = './data/txt/'
    dict_path = './data/dict/'
    output_path = './output/'

    convert_files = False

    logger.info('Begin execution')
    if convert_files:
        # Typo fixed in log message: 'Coonvert' -> 'Convert'.
        logger.info('Convert files: {0}'.format(convert_files))
        # Silence pdf-conversion chatter (WARNING level), then restore DEBUG.
        logging.getLogger().setLevel(30)
        all_docs = work_data.convert_pdf_to_txt(pdf_path, txt_path)
        logging.getLogger().setLevel(10)
    else:
        logger.info('Import testimonials')
        all_docs = work_data.open_testimonies(txt_path)

    logger.info('Create wordcloud')
    wordcloud_words = work_data.generate_wordcloud(all_docs, output_path)
    wordcloud_words.to_csv('./output/word_count.csv')

    logger.info('Remove protocol paragraphs')
    filter_docs = [doc.filter_protocol() for doc in all_docs]
    # Drop documents that end up too short after the protocol filter.
    filter_docs = [doc for doc in filter_docs if len(doc) > 5]

    logger.info('Load dictionaries')
    dict1 = pd.read_csv(dict_path + 'dict_ale.csv', index_col=0, header=0)
    dict2 = pd.read_csv(dict_path + 'dict_erika.csv', index_col=0, header=0)
    dict3 = pd.read_csv(dict_path + 'dict_macla.csv', index_col=0, header=0)
    dict_agents = pd.read_csv(dict_path + 'dict_agentes.csv', index_col=0, header=0)

    filter_docs = [work_data.input_sentiment(doc, dict1) for doc in filter_docs]
    filter_docs = [work_data.input_sentiment(doc, dict2) for doc in filter_docs]
    filter_docs = [work_data.input_sentiment(doc, dict3) for doc in filter_docs]
    #clean_docs = [work_data.input_agent(doc, dict_agents) for doc in clean_docs]

    tagged, not_tagged = _count_tagged(filter_docs)
    logger.info('Paragraphs tagged {0} - not tagged {1}'.format(tagged, not_tagged))

    logger.info('Number of testimonials: {0}'.format(len(all_docs)))
    logger.info('Clean testimonials')
    clean_docs = [[parag.clean_data() for parag in doc] for doc in filter_docs]

    clean_docs, quechua = work_data.extract_quechua(clean_docs)
    logger.info('Testimonials in spanish {0} - quechua {1}'.format(len(clean_docs), len(quechua)))
    print([doc[0].name for doc in quechua])

    tagged, not_tagged = _count_tagged(clean_docs)
    logger.info('Paragraphs tagged {0} - not tagged {1}'.format(tagged, not_tagged))

    logger.info('Train model')
    parag_trained1 = work_data.input_sentiment_posneg(filter_docs)
    parag_trained2 = work_data.train_sentiment(clean_docs)

    # Typo fixed in log message: 'preditions' -> 'predictions'.
    logger.info('Save predictions')
    parag_trained1.to_csv(output_path + 'reg_database1.csv')
    parag_trained2.to_csv(output_path + 'reg_database2.csv')

    # LDA implementation
    mat_docs, dictionary = work_data.list_to_matrix(clean_docs)
    print(mat_docs[0])
    pprint.pprint(dictionary.dfs)

    lda_model = work_data.lda_model(dictionary, mat_docs, 10)
    print(lda_model)
    pprint.pprint(lda_model.print_topics(num_topics=10, num_words=10))

    config.time_taken_display(t0)
Beispiel #10
0
def runSingleClusterSize(feExpName, feSubName, clExpName, clSubName, feEpoch,
                         clEpoch, dataPath, labelsPath, idsFilePath,
                         outputPath, clusterMinSize, mergeSmallerFirst):
    makedirs(outputPath, exist_ok=True)
    dataset = CostumeDataset(idsFilePath, dataPath, labelsPath)
    dataLoader = DataLoader(dataset, batch_size=1, shuffle=False)
    if useClusteringNet:
        loggerExpName = 'evaluation_' + feExpName + '_' + feSubName + '_' + clExpName + '_' + clSubName
    else:
        loggerExpName = 'evaluation_' + feExpName + '_' + feSubName
    logger = config_logger(loggerExpName)
    featureExtractorModel = \
        getFeatureExtractionModel(feExpName, logger, sub_experiment_name=feSubName, currentEpoch=feEpoch)[0]
    featureExtractorModel.eval()
    if useClusteringNet:
        clusteringModel = getClusterModel(clExpName,
                                          logger,
                                          sub_experiment_name=clSubName,
                                          currentEpoch=clEpoch)[0]
        clusteringModel.eval()
    hdbEval = Evaluator()
    hdbEvalResults = []
    hdbMrfEval = Evaluator()
    hdbMrfEvalResults = []
    hdbClusterNetEval = Evaluator()
    hdbClusterNetEvalResults = []
    gtClusterNetEval = Evaluator()
    gtClusterNetEvalResults = []
    hdbClusterNetMrfEval = Evaluator()
    hdbClusterNetMrfEvalResults = []
    hdbMrfClusterNetEval = Evaluator()
    hdbMrfClusterNetEvalResults = []
    hdbMrfClusterNetMrfEval = Evaluator()
    hdbMrfClusterNetMrfEvalResults = []
    for i, batch in enumerate(dataLoader):
        inputs = batch['image'].type(float_type)
        labels = batch['label'].cpu().numpy()
        labels = labels[0]

        saveImage(outputPath, 'image', i, batch['originalImage'].cpu().numpy())
        saveLabel(outputPath, 'ground_truth', i, labels)

        features = featureExtractorModel(inputs, None, None)[0]

        clustered = getClusters(features.cpu().numpy(), clusterMinSize)
        saveLabel(outputPath, 'hdbscan', i, clustered)
        hdbEvalResults.append(hdbEval.evaluate(clustered, labels))

        if useMrfAfterHdbScan:
            clusteredAndMRF = upsample(
                denoise_colored_image(downsample(clustered, RESCALE_FACTOR)),
                RESCALE_FACTOR)
            saveLabel(outputPath, 'hdbscan_mrf', i, clusteredAndMRF)
            hdbMrfEvalResults.append(
                hdbMrfEval.evaluate(clusteredAndMRF, labels))

        if useClusteringNet:
            clusteredInput = convertToClusterNetInput(features, clustered,
                                                      mergeSmallerFirst)
            if clusteredInput.shape[0] > 0:
                noMrfOnInput = clusteringModel(clusteredInput, None)[0]
            else:
                noMrfOnInput = np.zeros(
                    (1, 1, clusteredInput.shape[2], clusteredInput.shape[3]))
            hdbClusterNetOut = convertIndividualSegmentsToSingleImage(
                noMrfOnInput, mergeSmallerFirst)
            saveLabel(outputPath, 'hdbscan_clusternet', i, hdbClusterNetOut)
            hdbClusterNetEvalResults.append(
                hdbClusterNetEval.evaluate(hdbClusterNetOut, labels))

            gtInput = convertToClusterNetInput(features, labels,
                                               mergeSmallerFirst)
            if gtInput.shape[0] > 0:
                clusteredGt = clusteringModel(gtInput, None)[0]
            else:
                clusteredGt = np.zeros(
                    (1, 1, gtInput.shape[2], gtInput.shape[3]))
            gtClusterNetOut = convertIndividualSegmentsToSingleImage(
                clusteredGt, mergeSmallerFirst)
            saveLabel(outputPath, 'calc_mean_by_GT_clusternet', i,
                      gtClusterNetOut)
            gtClusterNetEvalResults.append(
                gtClusterNetEval.evaluate(gtClusterNetOut, labels))

        if useMrfAfterHdbScan and useClusteringNet:
            clusteredMrfInput = convertToClusterNetInput(
                features, clusteredAndMRF, mergeSmallerFirst)
            if clusteredMrfInput.shape[0] > 0:
                mrfOnInput = clusteringModel(clusteredMrfInput, None)[0]
            else:
                mrfOnInput = np.zeros((1, 1, clusteredMrfInput.shape[2],
                                       clusteredMrfInput.shape[3]))
            hdbMrfClusterNetOut = convertIndividualSegmentsToSingleImage(
                mrfOnInput, mergeSmallerFirst)
            saveLabel(outputPath, 'hdbscan_mrf_clusternet', i,
                      hdbMrfClusterNetOut)
            hdbMrfClusterNetEvalResults.append(
                hdbMrfClusterNetEval.evaluate(hdbMrfClusterNetOut, labels))

        if useClusteringNet and useMrfAfterClusteringNet:
            hdbClusterNetMrfOut = upsample(
                denoise_colored_image(
                    downsample(hdbClusterNetOut, RESCALE_FACTOR)),
                RESCALE_FACTOR)
            saveLabel(outputPath, 'hdbscan_clusternet_mrf', i,
                      hdbClusterNetMrfOut)
            hdbClusterNetMrfEvalResults.append(
                hdbClusterNetMrfEval.evaluate(hdbClusterNetMrfOut, labels))

        if useMrfAfterHdbScan and useClusteringNet and useMrfAfterClusteringNet:
            hdbMrfClusterNetMrfOut = upsample(
                denoise_colored_image(
                    downsample(hdbMrfClusterNetOut, RESCALE_FACTOR)),
                RESCALE_FACTOR)
            saveLabel(outputPath, 'hdbscan_mrf_clusternet_mrf', i,
                      hdbMrfClusterNetMrfOut)
            hdbMrfClusterNetMrfEvalResults.append(
                hdbMrfClusterNetMrfEval.evaluate(hdbMrfClusterNetMrfOut,
                                                 labels))

    hdbEvalResults.append(hdbEval.get_average_results())

    if useMrfAfterHdbScan:
        hdbMrfEvalResults.append(hdbMrfEval.get_average_results())

    if useClusteringNet:
        hdbClusterNetEvalResults.append(
            hdbClusterNetEval.get_average_results())
        gtClusterNetEvalResults.append(gtClusterNetEval.get_average_results())

    if useMrfAfterHdbScan and useClusteringNet:
        hdbMrfClusterNetEvalResults.append(
            hdbMrfClusterNetEval.get_average_results())

    if useClusteringNet and useMrfAfterClusteringNet:
        hdbClusterNetMrfEvalResults.append(
            hdbClusterNetMrfEval.get_average_results())

    if useMrfAfterHdbScan and useClusteringNet and useMrfAfterClusteringNet:
        hdbMrfClusterNetMrfEvalResults.append(
            hdbMrfClusterNetMrfEval.get_average_results())

    with open(join(outputPath, 'statistics.txt'), mode='w') as file:
        for i in range(len(hdbEvalResults) - 1):
            file.write('hdbscan only image ' +
                       str(i).zfill(FILE_NAME_ID_LENGTH) + ': ' +
                       str(hdbEvalResults[i]))
            file.write('\n')
            if useMrfAfterHdbScan:
                file.write('hdbscan and MRF image ' +
                           str(i).zfill(FILE_NAME_ID_LENGTH) + ': ' +
                           str(hdbMrfEvalResults[i]))
                file.write('\n')
            if useClusteringNet:
                file.write('hdbscan and ClusterNet image ' +
                           str(i).zfill(FILE_NAME_ID_LENGTH) + ': ' +
                           str(hdbClusterNetEvalResults[i]))
                file.write('\n')
                file.write('ClusterNet with actual embedding mean image ' +
                           str(i).zfill(FILE_NAME_ID_LENGTH) + ': ' +
                           str(gtClusterNetEvalResults[i]))
                file.write('\n')
            if useMrfAfterHdbScan and useClusteringNet:
                file.write('hdbscan and MRF and ClusterNet image ' +
                           str(i).zfill(FILE_NAME_ID_LENGTH) + ': ' +
                           str(hdbMrfClusterNetEvalResults[i]))
                file.write('\n')
            if useClusteringNet and useMrfAfterClusteringNet:
                file.write('hdbscan and ClusterNet and MRF image ' +
                           str(i).zfill(FILE_NAME_ID_LENGTH) + ': ' +
                           str(hdbClusterNetMrfEvalResults[i]))
                file.write('\n')
            if useMrfAfterHdbScan and useClusteringNet and useMrfAfterClusteringNet:
                file.write('hdbscan and MRF and ClusterNet and MRF image ' +
                           str(i).zfill(FILE_NAME_ID_LENGTH) + ': ' +
                           str(hdbMrfClusterNetMrfEvalResults[i]))
                file.write('\n')
            file.write('\n')
        lastLoc = len(hdbEvalResults) - 1

        file.write('hdbscan only, average score: ' +
                   str(hdbEvalResults[lastLoc]))
        file.write('\n')
        if useMrfAfterHdbScan:
            file.write('hdbscan and MRF, average score: ' +
                       str(hdbMrfEvalResults[lastLoc]))
            file.write('\n')
        if useClusteringNet:
            file.write('hdbscan and ClusterNet, average score: ' +
                       str(hdbClusterNetEvalResults[lastLoc]))
            file.write('\n')
            file.write(
                'ClusterNet using actual embedding mean, average score: ' +
                str(gtClusterNetEvalResults[lastLoc]))
            file.write('\n')
        if useMrfAfterHdbScan and useClusteringNet:
            file.write('hdbscan and MRF and ClusterNet, average score: ' +
                       str(hdbMrfClusterNetEvalResults[lastLoc]))
            file.write('\n')
        if useClusteringNet and useMrfAfterClusteringNet:
            file.write('hdbscan and ClusterNet and MRF, average score: ' +
                       str(hdbClusterNetMrfEvalResults[lastLoc]))
            file.write('\n')
        if useMrfAfterHdbScan and useClusteringNet and useMrfAfterClusteringNet:
            file.write(
                'hdbscan and MRF and ClusterNet and MRF, average score: ' +
                str(hdbMrfClusterNetMrfEvalResults[lastLoc]))
            file.write('\n')
Beispiel #11
0
def main():
    """Train the client-attrition stacked model and write prediction files.

    Pipeline: load the raw client/requirement CSVs, clean and merge them
    (or load cached copies from ./data/mod), add engineered features, fit
    the level-1 meta-feature models, then fit the final XgBoost and write
    the predictions.

    Each expensive stage is gated by a boolean flag (or an ``if False:``
    block) so the cached CSV from a previous run is reused instead of
    recomputing the stage.
    """
    np.random.seed(42)  # fix the RNG so sampling/CV splits are reproducible
    logger = config.config_logger(__name__, 10)
    t0 = time.time()

    # Input locations and stage toggles: with a flag set to False the
    # stage's cached output CSV is loaded instead of being recomputed.
    train_client_path = './data/raw/csv/train_clientes.csv'
    train_reque_path = './data/raw/csv/train_requerimientos.csv'
    test_client_path = './data/raw/csv/test_clientes.csv'
    test_reque_path = './data/raw/csv/test_requerimientos.csv'
    output_path = './output/'
    do_merge = False
    write_impute_test = False
    write_output = False
    add_variables = False
    version = 6

    logger.info('Beginning execution')
    logger.info('Load dataframes')
    test_client = pd.read_csv(test_client_path, header=0)
    test_reque = pd.read_csv(test_reque_path, header=0)
    main_client = pd.read_csv(train_client_path, header=0)
    main_reque = pd.read_csv(train_reque_path, header=0)

    work_data.basic_descriptive(main_client)
    work_data.basic_descriptive(main_reque)

    # NOTE(review): id_variables is never used below — confirm the call has
    # no side effects before removing it.
    id_variables = work_data.id_variables()
    index_client = test_client['ID_CORRELATIVO']

    if write_impute_test:
        logger.info('Creating new test database')
        logger.info('Cleaning test reque database')
        test_reque = work_data.preprocess_reque(test_reque)
        print(test_reque.head().to_string())

        logger.info('Cleaning test client database - Imputing missing values')
        test_client = work_data.count_missings_column(test_client)
        test_client = work_data.preprocess_client(test_client)
        print(test_client.head().to_string())

        logger.info('Merging test databases')
        # concat + reindex keeps exactly the client rows; this replaces the
        # `join_axes=[test_client.index]` argument removed in pandas 1.0.
        temp = pd.concat([test_client, test_reque], axis=1).reindex(test_client.index)
        temp.fillna(0, inplace=True)
        test_df = temp
        print(test_df.head().to_string())
        print(test_df.describe().transpose().to_string())

        logger.info('Saving test database')
        test_df.to_csv('./data/mod/test_imputed.csv', index=False)
    else:
        logger.info('Opening test database')
        test_df = pd.read_csv('./data/mod/test_imputed.csv', header=0)
        print(test_df.head().to_string())

    if do_merge:
        logger.info('Creating new merge')
        logger.info('Cleaning reque database')
        main_reque = work_data.preprocess_reque(main_reque)
        print(main_reque.head().to_string())

        #main_reque = pd.pivot_table(main_reque, index=['ID_CORRELATIVO'], columns=['CODMES'], aggfunc=np.sum)
        #main_reque.columns = main_reque.columns.map('{0[0]}|{0[1]}'.format)
        #main_reque.fillna(0, inplace=True)

        logger.info('Cleaning client database - Imputing missing values')
        main_client = work_data.count_missings_column(main_client)
        # Pop the target before preprocessing, then restore it keyed by the
        # client id so the cleaned rows stay aligned with their labels.
        target = main_client.pop('ATTRITION')
        target.index = main_client['ID_CORRELATIVO']
        main_client = work_data.preprocess_client(main_client)
        main_client['ATTRITION'] = target
        print(main_client.head().to_string())

        logger.info('Merging databases')
        # Same join_axes replacement as for the test merge above.
        temp = pd.concat([main_client, main_reque], axis=1).reindex(main_client.index)
        temp.fillna(0, inplace=True)
        main_df = temp

        print(main_df.shape)
        print(main_df.head().to_string())
        print(main_df.describe().transpose().to_string())
        work_data.basic_descriptive(main_df)

        logger.info('Saving merged database')
        main_df.to_csv('./data/mod/merge1.csv', index=False)
    else:
        logger.info('Opening merged database')
        main_df = pd.read_csv('./data/mod/merge1.csv', header=0)
        print(main_df.head().to_string())
        print(main_df.shape)

    # Stack train and test so feature engineering sees both; the first
    # 70000 rows remain the training set (see the split further down).
    y = main_df.pop('ATTRITION')
    main_df = main_df.append(test_df).reset_index(drop=True)

    if False:  # expensive: recompute the T-SNE projection
        logger.info('Creating T-SNE database')
        temp_tsne = pd.DataFrame(models.tnse(main_df))
        temp_tsne.to_csv('./data/mod/merge1_tsne.csv', index=False)
    else:
        logger.info('Loading T-SNE database')
        temp_tsne = pd.read_csv('./data/mod/merge1_tsne.csv')

    if add_variables:
        logger.info('Beginning feature engineering')
        logger.info('Interactions')
        main_df_feat = models.create_interactions(main_df, models.inter_vars())

        logger.info('Row sums 1-3')
        # ext1: zeros per row; ext2/ext3: counts of standardized values
        # beyond +/-0.5 per row.
        main_df_feat['ext1'] = main_df.apply(lambda row: (row == 0).sum(), axis=1)
        temp = models.standard_scale_df(main_df)
        main_df_feat['ext2'] = temp.apply(lambda row: (row > 0.5).sum(), axis=1)
        main_df_feat['ext3'] = temp.apply(lambda row: (row < -0.5).sum(), axis=1)

        logger.info('K-means 4-7')
        # Cluster labels as strings so get_dummies one-hot encodes them.
        main_df_feat['ext4'] = pd.Series(models.kmeans(main_df, 5)).apply(str)
        main_df_feat['ext5'] = pd.Series(models.kmeans(main_df, 10)).apply(str)
        main_df_feat['ext6'] = pd.Series(models.kmeans(main_df, 15)).apply(str)
        main_df_feat['ext7'] = pd.Series(models.kmeans(main_df, 20)).apply(str)

        logger.info('KNN 8-11')
        main_df_feat['ext8'] = models.knn_distance(main_df, 2)
        main_df_feat['ext9'] = models.knn_distance(main_df, 3)
        main_df_feat['ext10'] = models.knn_distance(main_df, 5)
        main_df_feat['ext11'] = models.knn_distance(temp_tsne, 2)

        main_df_feat = pd.get_dummies(main_df_feat, drop_first=True)
        print(main_df_feat.head().to_string())
        print(main_df_feat.shape)
        config.time_taken_display(t0)
        logger.info('Saving features database')
        main_df_feat.to_csv('./data/mod/merge1_features.csv', index=False)
    else:
        logger.info('Opening feature engineered database')
        main_df_feat = pd.read_csv('./data/mod/merge1_features.csv', header=0)
        print(main_df_feat.head().to_string())
        print(main_df_feat.shape)

    logger.info('Split data into train and test')
    x, test_df = main_df_feat.iloc[:70000, :], main_df_feat.iloc[70000:, :]
    print(main_df_feat.shape)
    print(x.shape)
    print(test_df.shape)
    x_train, x_test, y_train, y_test = models.split_data(x, y)
    work_data.basic_descriptive(x_train)

    logger.info('Level 1 - Create metafeatures')

    if False:  # disabled: full level-1 meta-feature model sweep
        logger.info('1. Ridge logit')
        ridge_model = models.logit_grid(x, y, 'l2', StandardScaler())
        models.write_prediction(ridge_model, main_df_feat, index_client, 'ridge_standard')
        print(ridge_model.score(x_test, y_test))

        logger.info('2. Lasso logit')
        lasso_model = models.logit_grid(x, y, 'l1', StandardScaler())
        models.write_prediction(lasso_model, main_df_feat, index_client, 'lasso_standard')
        print(lasso_model.score(x_test, y_test))

        logger.info('3. Random Forrest')
        RF_model = models.random_forrest_grid(x, y, StandardScaler())
        models.write_prediction(RF_model, main_df_feat, index_client, 'RF_standard')
        print(RF_model.score(x_test, y_test))

        logger.info('4. Extra Trees')
        ET_model = models.extra_trees_grid(x, y, StandardScaler())
        models.write_prediction(ET_model, main_df_feat, index_client, 'ET_standard')
        print(ET_model.score(x_test, y_test))

        # KNN meta-features over a doubling range of neighbour counts.
        for knn_step, knn_k in enumerate((2, 4, 8, 16, 32, 64, 128, 256, 512, 1024), start=5):
            logger.info('{0}. {1}-KNN'.format(knn_step, knn_k))
            KNN_model = models.knn_grid(x, y, StandardScaler(), knn_k)
            models.write_prediction(KNN_model, main_df_feat, index_client,
                                    'KNN{0}_standard'.format(knn_k))
            print(KNN_model.score(x_test, y_test))

        logger.info('15. Naive Bayes')
        NB_model = models.naive_bayes_grid(x, y, StandardScaler())
        models.write_prediction(NB_model, main_df_feat, index_client, 'NB_standard')
        print(NB_model.score(x_test, y_test))

        logger.info('16. MPL')
        MLP_model = models.MLP_grid(x, y, StandardScaler())
        models.write_prediction(MLP_model, main_df_feat, index_client, 'MLP_standard')
        print(MLP_model.score(x_test, y_test))

        logger.info('17. AdaBoost')
        adaboost_model = models.adaboost_grid(x, y, StandardScaler())
        models.write_prediction(adaboost_model, main_df_feat, index_client, 'adaboost_standard')
        print(adaboost_model.score(x_test, y_test))

        logger.info('18. GBM')
        gbm_model = models.gbm_grid(x, y, StandardScaler())
        models.write_prediction(gbm_model, main_df_feat, index_client, 'gbm_standard')
        print(gbm_model.score(x_test, y_test))

        logger.info('18. LightGBM')
        lgbm_model = models.lgbm_grid(x, y, None)
        models.write_prediction(lgbm_model, main_df_feat, index_client, 'lgbm_none')
        print(lgbm_model.score(x_test, y_test))

    logger.info('19. XgBoost')
    test_final = main_df_feat.iloc[70000:, :]
    id_test = test_client['ID_CORRELATIVO']
    xgboost_model = models.xgboost_grid(x, y, StandardScaler())
    models.write_prediction(xgboost_model, main_df_feat, index_client, 'xgboost_standard')
    print(xgboost_model.score(x_test, y_test))
    models.write_prediction(xgboost_model, test_final, id_test, 'ATTRITION')
    # BUG FIX: the original had a bare `hi` here — an undefined name that
    # aborted the run with NameError. The evident intent (stop before the
    # stage-2 code below) is made explicit with a return.
    return

    # ------------------------------------------------------------------
    # Unreachable stage-2 stacking code, retained as in the original for
    # when the early return above is removed.
    # ------------------------------------------------------------------
    logger.info('Level 2')
    logger.info('Creating meta-features database')
    meta_features_list = os.listdir('./data/mod/meta_features')
    temp = {}
    for feature in meta_features_list:
        temp_df = pd.read_csv('./data/mod/meta_features/{0}'.format(feature), header=0)
        temp[feature] = temp_df.iloc[:, 1]  # column 1 holds the predicted probability
    meta_features = pd.DataFrame(temp)
    meta_features = pd.concat([meta_features, main_df_feat], axis=1, ignore_index=True)
    x = meta_features.iloc[:70000, :]
    test_final = meta_features.iloc[70000:, :]
    x_train, x_test, y_train, y_test = models.split_data(x, y)

    print(x_train.shape)
    print(test_final.shape)
    print(x.shape)

    logger.info('Estimating second level model with XgBoost')
    xgboost_final = models.xgboost_full_mod(x_train, y_train)
    print(xgboost_final.score(x_test, y_test))
    print(models.get_logloss(y_test, xgboost_final.predict_proba(x_test)[:, 1]))
    models.write_final_prediction(xgboost_final, test_final, test_client['ID_CORRELATIVO'], 'results8')
    models.write_final_prediction(xgboost_final, x, main_client['ATTRITION'], 'train')

    config.time_taken_display(t0)
    # (a second debug-stop `hi` was here in the original; removed — the
    # return above already prevents execution from reaching this point)

    logger.info('XgBoost')
    xgboost_result = models.xgboost_grid(x_train, y_train, x_test, y_test)
    print('Test grid: {0}'.format(xgboost_result))
    #Test: -0.322

    xgboost_full = models.xgboost_full_mod(x_train, y_train, x_test, y_test)
    print(xgboost_full)
    xgbfir.saveXgbFI(xgboost_full, feature_names=main_df.columns, OutputXlsxFile='./data/mod/bbva.xlsx')
# Beispiel #12
# 0
def main():
    """Explore the zika dataset.

    Loads the raw CSV, builds the binary target from the PCR columns,
    fits GBM / logit / AdaBoost grid searches plus a soft-voting
    ensemble, and logs train/test scores throughout.
    """
    VERBOSITY = 10
    data_dir = './data/raw/'
    csv_name = 'raw_data.csv'

    start = time.time()
    logger = config.config_logger(__name__, VERBOSITY)
    pd.set_option('display.float_format', lambda v: '{0:.2f}'.format(v))
    logger.info('Beginning execution: zika dataset')
    logger.info('Logger configured - level {0}'.format(VERBOSITY))

    logger.info('Opening CSV: {0}{1}'.format(data_dir, csv_name))
    raw_data = pd.read_csv(data_dir + csv_name)

    logger.info('Raw dataset description:')
    process.basic_descriptives(raw_data)
    raw_data = process.preprocess(raw_data)
    #print(raw_data.describe().transpose().to_string())
    #print(raw_data.head().to_string())
    #print(raw_data.info().to_string())

    # Build the target. See process.select_disease for details:
    # code: 1. Dengue, 2. Zika, 3. Chik, 4. Any;
    # only_one=True would set np.nan for patients with another disease.
    diseases = [raw_data[col] for col in ('dengue_pcr', 'zika_pcr', 'chik_pcr')]
    y = process.select_disease(diseases, code=1, only_one=False)
    logger.info('Target var frequency: \n{0}'.format(y.value_counts()))
    logger.info('Total obs: {0}'.format(y.value_counts().sum()))

    # Drop identifiers, location fields, serotypes and the PCR columns
    # the target was derived from.
    remove_list = ['id', 'centro_pob', 'name', 'dep', 'prov', 'dist',
                   'serotipo1', 'serotipo2', 'serotipo3', 'serotipo4',
                   'dengue_pcr', 'zika_pcr', 'chik_pcr']
    X = process.remove_vars(raw_data, remove_list)
    X = process.keep_non_nan(X, y)
    y = y.dropna()

    logger.info('Features dataset')
    process.basic_descriptives(X)

    logger.info('Split train test')
    X_train, X_test, y_train, y_test = models.split_data(X, y, proportion=0.4)

    logger.info('Estimating models')
    # Fit each grid search in turn, logging best params and scores.
    fitted = {}
    for label, grid_fn in (('GBM', models.gbm_grid),
                           ('Logit', models.logit_grid),
                           ('AdaBoost', models.adaboost_grid)):
        logger.info(label)
        grid = grid_fn(X_train, y_train, n_cv=5)
        logger.info(grid.best_params_)
        logger.info('Train score: {0}'.format(grid.best_score_))
        logger.info('Test score: {0}'.format(grid.score(X_test, y_test)))
        fitted[label] = grid

    logger.info('Soft Voting')
    eclf = VotingClassifier(estimators=[('gbm', fitted['GBM']),
                                        ('logit', fitted['Logit']),
                                        ('ada', fitted['AdaBoost'])],
                            voting='soft')
    eclf.fit(X_train, y_train)
    y_pred = eclf.predict_proba(X_test)
    print(y_pred[:5, :])
    logger.info('Train score: {0}'.format(eclf.score(X_train, y_train)))
    logger.info('Test score: {0}'.format(eclf.score(X_test, y_test)))

    config.time_taken_display(start)
# Beispiel #13
# 0
import argparse

from config import config_logger
from controller.executor import Executor

if __name__ == '__main__':
    config_logger()
    parser = argparse.ArgumentParser()
    parser.add_argument('-c',
                        '--cash',
                        type=int,
                        help='initial capital of cash',
                        default=10000)
    parser.add_argument('-v',
                        '--volume',
                        type=int,
                        help='initial stock volume',
                        default=100)
    parser.add_argument('-sc',
                        '--stock_code',
                        type=str,
                        help='stock code',
                        default='000001')
    parser.add_argument(
        '-s',
        '--strategy',
        type=str,
        help='strategy type -"rsi_hf" and "rsi_lf" are supported',
        default='rsi_hf')
    parser.add_argument('-ts',
                        '--time_span',