def main():
    """Create a CSV file with the ubigeo code of every centro poblado in Peru.

    Reads the modified ubigeo table, selects the district-level rows, and
    expands each district into one 10-digit code per centro poblado.
    """
    logger = config.config_logger(__name__, 10)
    t0 = time.time()
    logger.info('Beginning execution')  # fixed typo: "execuation"

    raw = pd.read_csv('./data/ubigeo/Ubigeo_mod.csv', header=0)
    # .isna() replaces apply(np.isnan): same result on numeric columns but it
    # does not raise if the column ever holds non-float values.
    # Rows with a missing 'district' value are treated as the district rows;
    # rows with a non-missing 'region' value as the department rows
    # (presumably how Ubigeo_mod.csv encodes its hierarchy — verify there).
    districts = raw.loc[raw['district'].isna()]
    print(districts.shape)
    departments = raw.loc[~raw['region'].isna()]
    n_centro_pob = sum(departments['centro_pob'])
    logger.info('There are {0:,} centros poblados in Peru'.format(n_centro_pob))

    output = []
    for index, row in districts.iterrows():
        temp_centros = row['centro_pob']
        temp_code = row['code']
        # Enumerate centros poblados 1..N within the district and build the
        # full code: district code + zero-padded 4-digit sequence, then the
        # whole string left-padded to 10 characters.
        for centro in range(1, temp_centros + 1):
            centro_code = add_zero_left(centro, 4)
            centro_code = str(temp_code) + centro_code
            centro_code = add_zero_left(centro_code, 10)
            output.append(centro_code)

    print(len(output))  # This should be equal to the number of centros poblados.
    print(raw.head().to_string())
    print(districts.head().to_string())
    print(departments.head().to_string())

    output = pd.DataFrame({'ubigeo': output})
    output.to_csv('./data/ubigeo/ubigeo_final.csv')
    config.time_taken_display(t0)
def main():
    """Query the Sportmonks API and save one CSV of fixture stats per league.

    Lists all available leagues, then (when GET_STATS is enabled) downloads
    seasons/fixtures/stats for each id in league_id_list and writes a CSV
    named '<id><league_name>.csv' under OUTPUT_PATH.
    """
    LOGGER_LEVEL = 20
    cfg_name = './config/_credentials.cfg'
    OUTPUT_PATH = './data/sportmonks/'
    GET_STATS = False  # toggle: actually download per-league fixture stats
    league_id_list = [2, 5, 72, 74, 78, 82, 85, 208, 462, 564, 570, 384, 390,
                      8, 9, 24, 12, 600, 301, 304, 453, 1114, 292, 654, 651,
                      444, 573, 579]
    t0 = time.time()
    pd.set_option('display.float_format', lambda x: '{0:.2f}'.format(x))
    logger = config.config_logger(__name__, level=LOGGER_LEVEL)

    # API key is read from the credentials file, section [Sportmonks].
    cfg_parser = configparser.ConfigParser()
    cfg_parser.read(cfg_name)
    my_key = str(cfg_parser.get('Sportmonks', 'key'))

    logger.info('Beginning execution')
    sportmonks.init(my_key)
    logger.info('Available leagues:')
    leagues_dict = {}
    # Renamed ambiguous loop variable "l" (easily confused with 1/I).
    for league in sportmonks.leagues():
        print(league['id'], league['name'], league['country_id'])
        leagues_dict[league['id']] = league['name']

    for league_id in league_id_list:
        if GET_STATS:
            league_name = leagues_dict[league_id]
            logger.info('Sending query - Stats')
            logger.info('League selected: {0} - {1}'.format(
                league_id, league_name))
            league_json = sportmonks.league(
                league_id,
                include='seasons.fixtures.stats,seasons.fixtures.localTeam,'
                        'seasons.fixtures.visitorTeam')
            logger.info('Processing package')
            league_df = sportmonks.league_into_dataframe(league_json)
            logger.info('Dimensions of the dataframe: {0}'.format(
                league_df.shape))
            save_name = '{0}{1}'.format(league_id,
                                        league_name.replace(' ', '_'))
            logger.info('Saving CSV: {0}'.format(OUTPUT_PATH + save_name))
            league_df.to_csv(OUTPUT_PATH + save_name + '.csv')

    config.time_taken_display(t0)
    print(' ')
def main():
    """Sample random start points inside the Lima polygon and plot them.

    Loads the Lima map (UTM coordinates), renders the base map, draws N
    random start points on it, and converts those points to lat-lon for
    later Google Maps traffic queries.
    """
    t0 = time.time()
    LIMA_MAP = './data/lima_map/limaPolyUTM.geojson'
    cfg_name = './config/_credentials.cfg'
    LOGGER_LEVEL = 10
    N = 2000

    # Read the Google Maps API key from the credentials file.
    cfg_parser = configparser.ConfigParser()
    cfg_parser.read(cfg_name)
    my_key = str(cfg_parser.get('key', 'key'))

    logger = config.config_logger(__name__, LOGGER_LEVEL)
    logger.info('Beginning execution: GOOGLE URBAN')
    logger.info('Logger configured - level {0}'.format(LOGGER_LEVEL))

    logger.info('Logging in Google Maps API')
    gmaps = googlemaps.Client(key=my_key)

    logger.info('Opening Lima map: {0}'.format(LIMA_MAP))
    lima_gpd = maps.load_map(LIMA_MAP)
    logger.info('Getting Lima limits')
    lima_limits = maps.boundaries(lima_gpd)

    logger.info('Plotting Lima map')
    maps.plot_map(lima_gpd, './data/lima_map/lima.png', lima_limits)

    logger.info('Getting {0} random starts'.format(N))
    start_points = maps.get_list_of_random_starts(lima_gpd, lima_limits, n=N)
    logger.info('Plotting points in Lima map')
    maps.plot_map_and_points(lima_gpd,
                             './data/lima_map/lima_start_points.png',
                             start_points)

    logger.info('Converting UTM points into lat-lon points')
    start_points = maps.UTM_to_latlon(start_points)
    # TODO: generate traffic queries for each hour.
    # TODO: automate the code to send queries each hour.
    config.time_taken_display(t0)
def main():
    """Build the Sportmonks match dataset and benchmark several classifiers.

    Phase 1 (create_data): load each league's raw stats CSV, clean it, pool
    all leagues, build rolling-window averages, and save the final dataset.
    Phase 2: reload that dataset and cross-validate the model families
    selected by the train_* flags.
    """
    t0 = time.time()
    logger = config.config_logger(__name__, 10)
    stats_path = './data/sportmonks/with_data/'
    save_path = './data/sportmonks/'
    create_data = False  # toggle: rebuild sportmonks_final.csv from raw files

    logger.info('Create dataset : {0}'.format(create_data))
    if create_data:
        league_names = work_data.get_data_list(stats_path)
        logger.info('Leagues found: {0}'.format(len(league_names)))
        league_frames = []
        for league in league_names:
            logger.info('Opening {0}'.format(league))
            match_data_raw = work_data.load_data(stats_path, selection=league,
                                                 date_filter='2016-07-13')
            logger.info('Dimensions of raw data: {0}'.format(
                match_data_raw.shape))
            match_data_raw = work_data.get_selection(match_data_raw)
            match_data_raw = work_data.fill_selected_vars(match_data_raw)
            match_data_raw = work_data.drop_rows_NA(match_data_raw)
            logger.info('Dimensions after cleaning: {0}'.format(
                match_data_raw.shape))
            league_frames.append(match_data_raw)
        # Single pd.concat replaces the deprecated per-iteration
        # DataFrame.append accumulation (removed in pandas 2.0).
        if league_frames:
            match_data_tot = pd.concat(league_frames)
        else:
            match_data_tot = pd.DataFrame({})
        window = 4
        match_data_tot.reset_index(drop=True, inplace=True)
        logger.info('Dimension of all leagues DB: {0}'.format(
            match_data_tot.shape))
        logger.info('Duplicating vars')
        match_data = work_data.duplicate_stats(match_data_tot)
        logger.info('Finding average with window {0}'.format(window))
        match_data = work_data.get_averages(match_data, window=window)
        logger.info('Dimensions after preprocessing: {0}'.format(
            match_data.shape))
        logger.info('Stat variables: {0}'.format(
            len(work_data.stats_variables())))
        match_data.to_csv(save_path + 'sportmonks_final.csv')
        logger.info('Final DB saved')

    logger.info('Opening sportmonks DB')
    match_data = pd.read_csv(save_path + 'sportmonks_final.csv', index_col=1)
    logger.info('Beginning analysis section')
    #predict.descriptive_stats(match_data)

    keep_draws = True
    logger.info(
        'Generating attributes and standardizing - Keep draws: {0}'.format(
            keep_draws))
    y_data, x_data = predict.preprocess_data(match_data, draws=keep_draws)
    x_data = predict.standardize(x_data)
    logger.info('Number of attributes included: {0}'.format(x_data.shape[1]))
    logger.info('Number of obs: {0}'.format(x_data.shape[0]))
    #print(x_data.describe().transpose().to_string())

    # Model-selection flags: each enables one family of models below.
    train_nn = False
    train_logit = False
    train_logit_lasso = False
    train_svm = False
    train_tree = False
    train_adaBoost = True
    train_gbm = True
    train_treeBoost = True
    train_staged = False

    n_cv = 10
    lam_list = [0.001, 0.01, 0.05, 0.1, 0.2, 1, 10, 100]
    layers = [(10, 5), (20, 10), (30, 10), (50, 10), (100, 10), (100, 20),
              (100, 30), (100, 50, 10)]

    if train_nn:
        logger.info('Training models: Neural Network')
        cv_nn = []
        for layer in layers:
            cv_nn.append(predict.neural_network_cv(x_data, y_data,
                                                   n_cv=n_cv, layers=layer))
            print('.', end='')
        print(' ')
        # zip pairs each result with its configuration (was range(len(...))).
        for layer, result in zip(layers, cv_nn):
            predict.report_model_output(result, 'NN_{0}'.format(layer))

    if train_logit:
        logger.info('Training models: Logit')
        cv_logit = []
        for lam in lam_list:
            cv_logit.append(predict.logistic_cv(x_data, y_data,
                                                n_cv=n_cv, lam=lam))
            print('.', end='')
        print(' ')
        for lam, result in zip(lam_list, cv_logit):
            predict.report_model_output(result, 'Logit_{0}'.format(lam))

    if train_logit_lasso:
        logger.info('Training models: Logit_Lasso')
        cv_logit_lasso = []
        for lam in lam_list:
            cv_logit_lasso.append(predict.logistic_lasso_cv(
                x_data, y_data, n_cv=n_cv, lam=lam))
            print('.', end='')
        print(' ')
        for lam, result in zip(lam_list, cv_logit_lasso):
            predict.report_model_output(result,
                                        'Logit_Lasso_{0}'.format(lam))

    if train_svm:
        logger.info('Training models: SVM')
        cv_svm = []
        for lam in lam_list:
            cv_svm.append(predict.svm_cv(x_data, y_data, n_cv=n_cv, lam=lam))
            print('.', end='')
        print(' ')
        for lam, result in zip(lam_list, cv_svm):
            predict.report_model_output(result, 'SVM_{0}'.format(lam))

    if train_tree:
        logger.info('Training models: Decision Tree')
        cv_tree = [predict.tree_cv(x_data, y_data, n_cv=n_cv)]
        predict.report_model_output(cv_tree, 'Tree')
        print(' ')

    if train_adaBoost:
        logger.info('Training models: AdaBoost')
        cv_adaBoost = [predict.adaBoost_cv(x_data, y_data, n_cv=n_cv)]
        predict.report_model_output(cv_adaBoost, 'AdaBoost')
        print(' ')

    if train_gbm:
        logger.info('Training models: gbm')
        cv_gbm = [predict.gbm_cv(x_data, y_data, n_cv=n_cv)]
        predict.report_model_output(cv_gbm, 'GBM')
        print(' ')
        logger.info('Grid: gbm')
        grid_gbm = predict.gbm_grid(x_data, y_data)
        print(grid_gbm.best_params_)
        print(grid_gbm.best_score_)

    if train_treeBoost:
        logger.info('Training models: TreeBoost')
        cv_treeBoost = [predict.treeBoost_cv(x_data, y_data, n_cv=n_cv)]
        predict.report_model_output(cv_treeBoost, 'TreeBoost')
        print(' ')

    if train_staged:
        x_train, x_test, y_train, y_test = predict.split(x_data, y_data,
                                                         size=0.25)
        adaBoost_model = predict.adaBoost(x_train, y_train, n_iter=300)
        adaBoost_model.fit(x_train, y_train)
        xgBoost_model = predict.xgBoost(x_train, y_train, n_iter=300)
        xgBoost_model.fit(x_train, y_train)
        adaBoost_test_errors = []
        xgBoost_test_errors = []
        # Track test error as trees are added to each boosted ensemble.
        for adaBoost_test_predict, xgBoost_test_predict in zip(
                adaBoost_model.staged_predict(x_test),
                xgBoost_model.staged_predict(x_test)):
            adaBoost_test_errors.append(
                1. - accuracy_score(adaBoost_test_predict, y_test))
            xgBoost_test_errors.append(
                1. - accuracy_score(xgBoost_test_predict, y_test))
        n_trees_ada = len(adaBoost_test_errors)
        n_trees_xg = len(xgBoost_test_errors)
        print(n_trees_ada, n_trees_xg)
        logger.info('Best accuracy - AdaBoost: {0:.3f}'.format(
            1 - min(adaBoost_test_errors)))
        logger.info('Best accuracy - xgBoost: {0:.3f}'.format(
            1 - min(xgBoost_test_errors)))
        #plt.figure(figsize=(15, 5))
        #plt.subplot(131)
        plt.figure()
        plt.plot(range(1, n_trees_ada + 1), adaBoost_test_errors,
                 c='black', label='adaBoost')
        plt.plot(range(1, n_trees_xg + 1), xgBoost_test_errors,
                 c='red', label='xgBoost')
        plt.legend()
        plt.ylim(0.2, 0.8)
        plt.ylabel('Test Error')
        plt.xlabel('Number of Trees')
        plt.show()

    # TODO: exclude first and last 4 matches
    # TODO: redo CV using GridSearchCV
    # TODO: use PCA to preprocess the data
    # TODO: implement SoftVoting
    config.time_taken_display(t0)
def main(): logger = config.config_logger(__name__, 10) t0 = time.time() pdf_path = './data/pdf/' txt_path = './data/txt/' dict_path = './data/dict/' output_path = './output/' convert_files = False logger.info('Begin execution') if convert_files: logger.info('Coonvert files: {0}'.format(convert_files)) logging.getLogger().setLevel(30) all_docs = work_data.convert_pdf_to_txt(pdf_path, txt_path) logging.getLogger().setLevel(10) else: logger.info('Import testimonials') all_docs = work_data.open_testimonies(txt_path) logger.info('Create wordcloud') wordcloud_words = work_data.generate_wordcloud(all_docs, output_path) wordcloud_words.to_csv('./output/word_count.csv') logger.info('Remove protocol paragraphs') filter_docs = [doc.filter_protocol() for doc in all_docs] filter_docs = [doc for doc in filter_docs if len(doc) > 5] logger.info('Load dictionaries') dict1 = pd.read_csv(dict_path + 'dict_ale.csv', index_col=0, header=0) dict2 = pd.read_csv(dict_path + 'dict_erika.csv', index_col=0, header=0) dict3 = pd.read_csv(dict_path + 'dict_macla.csv', index_col=0, header=0) dict_agents = pd.read_csv(dict_path + 'dict_agentes.csv', index_col=0, header=0) filter_docs = [work_data.input_sentiment(doc, dict1) for doc in filter_docs] filter_docs = [work_data.input_sentiment(doc, dict2) for doc in filter_docs] filter_docs = [work_data.input_sentiment(doc, dict3) for doc in filter_docs] #clean_docs = [work_data.input_agent(doc, dict_agents) for doc in clean_docs] tagged = not_tagged = 0 for i in filter_docs: for par in i: if par.sentiment != 'none': tagged += 1 else: not_tagged += 1 logger.info('Paragraphs tagged {0} - not tagged {1}'.format(tagged, not_tagged)) logger.info('Number of testimonials: {0}'.format(len(all_docs))) logger.info('Clean testimonials') clean_docs = [[parag.clean_data() for parag in doc] for doc in filter_docs] clean_docs, quechua = work_data.extract_quechua(clean_docs) logger.info('Testimonials in spanish {0} - quechua {1}'.format(len(clean_docs), 
len(quechua))) print([doc[0].name for doc in quechua]) tagged = not_tagged = 0 for i in clean_docs: for par in i: if par.sentiment != 'none': tagged += 1 else: not_tagged += 1 logger.info('Paragraphs tagged {0} - not tagged {1}'.format(tagged, not_tagged)) logger.info('Train model') parag_trained1 = work_data.input_sentiment_posneg(filter_docs) parag_trained2 = work_data.train_sentiment(clean_docs) logger.info('Save preditions') parag_trained1.to_csv(output_path + 'reg_database1.csv') parag_trained2.to_csv(output_path + 'reg_database2.csv') # LDA implementation mat_docs, dictionary = work_data.list_to_matrix(clean_docs) print(mat_docs[0]) pprint.pprint(dictionary.dfs) lda_model = work_data.lda_model(dictionary, mat_docs, 10) print(lda_model) pprint.pprint(lda_model.print_topics(num_topics=10, num_words=10)) config.time_taken_display(t0)
def main():
    """BBVA attrition pipeline: build features, train level-1 meta-models,
    then stack them with a level-2 XgBoost model and write predictions.

    The do_merge / write_impute_test / add_variables flags control whether
    each intermediate dataset is rebuilt or loaded from ./data/mod/.
    """
    np.random.seed(42)
    logger = config.config_logger(__name__, 10)
    t0 = time.time()
    train_client_path = './data/raw/csv/train_clientes.csv'
    train_reque_path = './data/raw/csv/train_requerimientos.csv'
    test_client_path = './data/raw/csv/test_clientes.csv'
    test_reque_path = './data/raw/csv/test_requerimientos.csv'
    output_path = './output/'
    do_merge = False
    write_impute_test = False
    write_output = False
    add_variables = False
    version = 6

    logger.info('Beginning execution')
    logger.info('Load dataframes')
    test_client = pd.read_csv(test_client_path, header=0)
    test_reque = pd.read_csv(test_reque_path, header=0)
    main_client = pd.read_csv(train_client_path, header=0)
    main_reque = pd.read_csv(train_reque_path, header=0)
    work_data.basic_descriptive(main_client)
    work_data.basic_descriptive(main_reque)
    id_variables = work_data.id_variables()
    index_client = test_client['ID_CORRELATIVO']

    if write_impute_test:
        logger.info('Creating new test database')
        logger.info('Cleaning test reque database')
        test_reque = work_data.preprocess_reque(test_reque)
        print(test_reque.head().to_string())
        logger.info('Cleaning test client database - Imputing missing values')
        test_client = work_data.count_missings_column(test_client)
        test_client = work_data.preprocess_client(test_client)
        print(test_client.head().to_string())
        logger.info('Merging test databases')
        # NOTE(review): join_axes was removed in pandas 1.0 — confirm the
        # pinned pandas version before re-running this branch.
        temp = pd.concat([test_client, test_reque], axis=1,
                         join_axes=[test_client.index])
        temp.fillna(0, inplace=True)
        test_df = temp
        print(test_df.head().to_string())
        print(test_df.describe().transpose().to_string())
        logger.info('Saving test database')
        test_df.to_csv('./data/mod/test_imputed.csv', index=False)
    else:
        logger.info('Opening test database')
        test_df = pd.read_csv('./data/mod/test_imputed.csv', header=0)
        print(test_df.head().to_string())

    if do_merge:
        logger.info('Creating new merge')
        logger.info('Cleaning reque database')
        main_reque = work_data.preprocess_reque(main_reque)
        print(main_reque.head().to_string())
        #main_reque = pd.pivot_table(main_reque, index=['ID_CORRELATIVO'],
        #                            columns=['CODMES'], aggfunc=np.sum)
        #main_reque.columns = main_reque.columns.map('{0[0]}|{0[1]}'.format)
        #main_reque.fillna(0, inplace=True)
        logger.info('Cleaning client database - Imputing missing values')
        main_client = work_data.count_missings_column(main_client)
        # Keep the target aside (indexed by client id) while the features
        # are preprocessed, then reattach it.
        target = main_client.pop('ATTRITION')
        target.index = main_client['ID_CORRELATIVO']
        main_client = work_data.preprocess_client(main_client)
        main_client['ATTRITION'] = target
        print(main_client.head().to_string())
        logger.info('Merging databases')
        # NOTE(review): join_axes was removed in pandas 1.0 (see above).
        temp = pd.concat([main_client, main_reque], axis=1,
                         join_axes=[main_client.index])
        temp.fillna(0, inplace=True)
        main_df = temp
        print(main_df.shape)
        print(main_df.head().to_string())
        print(main_df.describe().transpose().to_string())
        work_data.basic_descriptive(main_df)
        logger.info('Saving merged database')  # fixed typo "marges"
        main_df.to_csv('./data/mod/merge1.csv', index=False)
    else:
        logger.info('Opening merged database')
        main_df = pd.read_csv('./data/mod/merge1.csv', header=0)
        print(main_df.head().to_string())
        print(main_df.shape)

    y = main_df.pop('ATTRITION')
    # Stack train rows on top of test rows so features are engineered
    # consistently; pd.concat replaces the deprecated DataFrame.append.
    main_df = pd.concat([main_df, test_df]).reset_index(drop=True)

    if False:  # disabled: T-SNE embedding already saved to disk
        logger.info('Creating T-SNE database')
        temp_tsne = pd.DataFrame(models.tnse(main_df))
        temp_tsne.to_csv('./data/mod/merge1_tsne.csv', index=False)
    else:
        logger.info('Loading T-SNE database')
        temp_tsne = pd.read_csv('./data/mod/merge1_tsne.csv')

    if add_variables:
        logger.info('Beginning feature engineering')
        logger.info('Interactions')
        main_df_feat = models.create_interactions(main_df,
                                                  models.inter_vars())
        logger.info('Row sums 1-3')
        main_df_feat['ext1'] = main_df.apply(lambda row: (row == 0).sum(),
                                             axis=1)
        temp = models.standard_scale_df(main_df)
        main_df_feat['ext2'] = temp.apply(lambda row: (row > 0.5).sum(),
                                          axis=1)
        main_df_feat['ext3'] = temp.apply(lambda row: (row < -0.5).sum(),
                                          axis=1)
        logger.info('K-means 4-7')
        main_df_feat['ext4'] = pd.Series(models.kmeans(main_df, 5)).apply(str)
        main_df_feat['ext5'] = pd.Series(models.kmeans(main_df, 10)).apply(str)
        main_df_feat['ext6'] = pd.Series(models.kmeans(main_df, 15)).apply(str)
        main_df_feat['ext7'] = pd.Series(models.kmeans(main_df, 20)).apply(str)
        logger.info('KNN 8-11')
        main_df_feat['ext8'] = models.knn_distance(main_df, 2)
        main_df_feat['ext9'] = models.knn_distance(main_df, 3)
        main_df_feat['ext10'] = models.knn_distance(main_df, 5)
        main_df_feat['ext11'] = models.knn_distance(temp_tsne, 2)
        main_df_feat = pd.get_dummies(main_df_feat, drop_first=True)
        print(main_df_feat.head().to_string())
        print(main_df_feat.shape)
        config.time_taken_display(t0)
        logger.info('Saving features database')
        main_df_feat.to_csv('./data/mod/merge1_features.csv', index=False)
    else:
        logger.info('Opening feature engineered database')
        main_df_feat = pd.read_csv('./data/mod/merge1_features.csv',
                                   header=0)
        print(main_df_feat.head().to_string())
        print(main_df_feat.shape)

    logger.info('Split data into train and test')
    # First 70,000 rows are the train clients, the remainder the test set.
    x, test_df = main_df_feat.iloc[:70000, :], main_df_feat.iloc[70000:, :]
    print(main_df_feat.shape)
    print(x.shape)
    print(test_df.shape)
    x_train, x_test, y_train, y_test = models.split_data(x, y)
    work_data.basic_descriptive(x_train)

    logger.info('Level 1 - Create metafeatures')
    if False:  # disabled: level-1 meta-features already written to disk
        logger.info('1. Ridge logit')
        ridge_model = models.logit_grid(x, y, 'l2', StandardScaler())
        models.write_prediction(ridge_model, main_df_feat, index_client,
                                'ridge_standard')
        print(ridge_model.score(x_test, y_test))
        logger.info('2. Lasso logit')
        lasso_model = models.logit_grid(x, y, 'l1', StandardScaler())
        models.write_prediction(lasso_model, main_df_feat, index_client,
                                'lasso_standard')
        print(lasso_model.score(x_test, y_test))
        logger.info('3. Random Forrest')
        RF_model = models.random_forrest_grid(x, y, StandardScaler())
        models.write_prediction(RF_model, main_df_feat, index_client,
                                'RF_standard')
        print(RF_model.score(x_test, y_test))
        logger.info('4. Extra Trees')
        ET_model = models.extra_trees_grid(x, y, StandardScaler())
        models.write_prediction(ET_model, main_df_feat, index_client,
                                'ET_standard')
        print(ET_model.score(x_test, y_test))
        # Steps 5-14: one KNN meta-feature per neighbour count; the loop
        # reproduces the original per-step log labels exactly.
        for step, k in enumerate((2, 4, 8, 16, 32, 64, 128, 256, 512, 1024),
                                 start=5):
            logger.info('{0}. {1}-KNN'.format(step, k))
            KNN_model = models.knn_grid(x, y, StandardScaler(), k)
            models.write_prediction(KNN_model, main_df_feat, index_client,
                                    'KNN{0}_standard'.format(k))
            print(KNN_model.score(x_test, y_test))
        logger.info('15. Naive Bayes')
        NB_model = models.naive_bayes_grid(x, y, StandardScaler())
        models.write_prediction(NB_model, main_df_feat, index_client,
                                'NB_standard')
        print(NB_model.score(x_test, y_test))
        logger.info('16. MPL')
        MLP_model = models.MLP_grid(x, y, StandardScaler())
        models.write_prediction(MLP_model, main_df_feat, index_client,
                                'MLP_standard')
        print(MLP_model.score(x_test, y_test))
        logger.info('17. AdaBoost')
        adaboost_model = models.adaboost_grid(x, y, StandardScaler())
        models.write_prediction(adaboost_model, main_df_feat, index_client,
                                'adaboost_standard')
        print(adaboost_model.score(x_test, y_test))
        logger.info('18. GBM')
        gbm_model = models.gbm_grid(x, y, StandardScaler())
        models.write_prediction(gbm_model, main_df_feat, index_client,
                                'gbm_standard')
        print(gbm_model.score(x_test, y_test))
        logger.info('18. LightGBM')
        lgbm_model = models.lgbm_grid(x, y, None)
        models.write_prediction(lgbm_model, main_df_feat, index_client,
                                'lgbm_none')
        print(lgbm_model.score(x_test, y_test))
        logger.info('19. XgBoost')
        test_final = main_df_feat.iloc[70000:, :]
        id_test = test_client['ID_CORRELATIVO']
        xgboost_model = models.xgboost_grid(x, y, StandardScaler())
        models.write_prediction(xgboost_model, main_df_feat, index_client,
                                'xgboost_standard')
        print(xgboost_model.score(x_test, y_test))
        models.write_prediction(xgboost_model, test_final, id_test,
                                'ATTRITION')
        # Removed a stray 'hi' statement (NameError crash marker) that
        # originally ended this block.

    # Stage 2:
    logger.info('Level 2')
    logger.info('Creating meta-features database')
    meta_features_list = os.listdir('./data/mod/meta_features')
    temp = {}
    for feature in meta_features_list:
        temp_df = pd.read_csv(
            './data/mod/meta_features/{0}'.format(feature), header=0)
        temp[feature] = temp_df.iloc[:, 1]
    meta_features = pd.DataFrame(temp)
    meta_features = pd.concat([meta_features, main_df_feat], axis=1,
                              ignore_index=True)
    x = meta_features.iloc[:70000, :]
    test_final = meta_features.iloc[70000:, :]
    x_train, x_test, y_train, y_test = models.split_data(x, y)
    print(x_train.shape)
    print(test_final.shape)
    print(x.shape)

    logger.info('Estimating second level model with XgBoost')
    xgboost_final = models.xgboost_full_mod(x_train, y_train)
    print(xgboost_final.score(x_test, y_test))
    print(models.get_logloss(y_test,
                             xgboost_final.predict_proba(x_test)[:, 1]))
    models.write_final_prediction(xgboost_final, test_final,
                                  test_client['ID_CORRELATIVO'], 'results8')
    models.write_final_prediction(xgboost_final, x, main_client['ATTRITION'],
                                  'train')
    config.time_taken_display(t0)

    # Replaced a bare 'hi' (NameError used as a crash marker) with an
    # explicit return; the legacy grid-search experiment below is kept for
    # reference but intentionally never executed.
    return

    logger.info('XgBoost')
    xgboost_result = models.xgboost_grid(x_train, y_train, x_test, y_test)
    print('Test grid: {0}'.format(xgboost_result))
    # Test: -0.322
    xgboost_full = models.xgboost_full_mod(x_train, y_train, x_test, y_test)
    print(xgboost_full)
    xgbfir.saveXgbFI(xgboost_full, feature_names=main_df.columns,
                     OutputXlsxFile='./data/mod/bbva.xlsx')
def main():
    """Fit and compare classifiers on the zika dataset, then soft-vote them.

    Loads the raw CSV, builds the dengue target, grid-searches GBM, Logit
    and AdaBoost, and combines the three fitted grids in a soft
    VotingClassifier.
    """
    LOGGER_LEVEL = 10
    RAW_DATA_PATH = './data/raw/'
    RAW_CSV_NAME = 'raw_data.csv'
    t0 = time.time()
    logger = config.config_logger(__name__, LOGGER_LEVEL)
    pd.set_option('display.float_format', lambda x: '{0:.2f}'.format(x))
    logger.info('Beginning execution: zika dataset')
    logger.info('Logger configured - level {0}'.format(LOGGER_LEVEL))

    logger.info('Opening CSV: {0}{1}'.format(RAW_DATA_PATH, RAW_CSV_NAME))
    raw_data = pd.read_csv(RAW_DATA_PATH + RAW_CSV_NAME)
    logger.info('Raw dataset description:')
    process.basic_descriptives(raw_data)
    raw_data = process.preprocess(raw_data)

    diseases = [raw_data['dengue_pcr'], raw_data['zika_pcr'],
                raw_data['chik_pcr']]
    # Check process code for further explanation of select_disease.
    # code: 1. Dengue, 2. Zika, 3. Chik, 4. Any
    # only_one: if True, input np.nan to patients with another disease.
    y = process.select_disease(diseases, code=1, only_one=False)
    logger.info('Target var frequency: \n{0}'.format(y.value_counts()))
    logger.info('Total obs: {0}'.format(y.value_counts().sum()))

    remove_list = ['id', 'centro_pob', 'name', 'dep', 'prov', 'dist',
                   'serotipo1', 'serotipo2', 'serotipo3', 'serotipo4',
                   'dengue_pcr', 'zika_pcr', 'chik_pcr']
    X = process.remove_vars(raw_data, remove_list)
    X = process.keep_non_nan(X, y)
    y = y.dropna()
    logger.info('Features dataset')
    process.basic_descriptives(X)

    logger.info('Split train test')
    X_train, X_test, y_train, y_test = models.split_data(X, y,
                                                         proportion=0.4)

    logger.info('Estimating models')
    # Grid-search each model family in turn; log labels and ordering match
    # the original per-model sections.
    fitted = {}
    for label, grid_fn in (('GBM', models.gbm_grid),
                           ('Logit', models.logit_grid),
                           ('AdaBoost', models.adaboost_grid)):
        logger.info(label)
        grid = grid_fn(X_train, y_train, n_cv=5)
        logger.info(grid.best_params_)
        logger.info('Train score: {0}'.format(grid.best_score_))
        logger.info('Test score: {0}'.format(grid.score(X_test, y_test)))
        fitted[label] = grid

    logger.info('Soft Voting')
    eclf = VotingClassifier(estimators=[('gbm', fitted['GBM']),
                                        ('logit', fitted['Logit']),
                                        ('ada', fitted['AdaBoost'])],
                            voting='soft')
    eclf.fit(X_train, y_train)
    y_pred = eclf.predict_proba(X_test)
    print(y_pred[:5, :])
    logger.info('Train score: {0}'.format(eclf.score(X_train, y_train)))
    logger.info('Test score: {0}'.format(eclf.score(X_test, y_test)))
    config.time_taken_display(t0)