import os
import logging

import numpy as np
import pandas as pd

import common
# helpers used below (preprocessing_data, processing_data,
# get_user_prod_cat_data, get_extra_cat_data, ts) are defined elsewhere in
# this repository


def pnone_processing(path="../data/",
                     train_data_name="train_data",
                     test_data_name="test_data"):
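    """Compute each user's probability of an order with no reordered items
    (pnone) from prior orders and merge it into the saved train/test
    feature frames."""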
    # load the data
    df_train = common.load_df(path, train_data_name)
    df_test = common.load_df(path, test_data_name)

    print df_train.shape
    print df_test.shape

    # user prod add_to_cart_order mean, std
    priors, train, orders, products, aisles, departments = preprocessing_data()

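    # pnone = fraction of the user's prior orders in which no item was a reorder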
    df_user_pnone = pd.DataFrame(
        priors.groupby('user_id').apply(
            lambda user_orders: sum(
                user_orders.groupby('order_id').reordered.sum() == 0) /
            float(user_orders.order_id.unique().shape[0])),
        columns=['pnone'])

    df_train = df_train.merge(df_user_pnone, how='left',
                              left_on=['user_id'], right_index=True)
    df_test = df_test.merge(df_user_pnone, how='left',
                            left_on=['user_id'], right_index=True)

    # save the df_train, df_test
    print "save the processed data"
    common.save_df(df_train, "../data/", train_data_name, index=False)
    common.save_df(df_test, "../data/", test_data_name, index=False)
def extra_feature_processing(path="../data/",
                             train_data_name="train_data",
                             test_data_name="test_data"):
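    """Add the per-(user, product) average add-to-cart position from prior
    orders to the saved train/test feature frames."""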
    # load the data
    df_train = common.load_df(path, train_data_name)
    df_test = common.load_df(path, test_data_name)

    print df_train.shape
    print df_test.shape

    # user prod add_to_cart_order mean, std
    priors, train, orders, products, aisles, departments = preprocessing_data()

    # get mean of add to cart order
    user_prod_avg_add_to_cart_order = pd.DataFrame(
        priors.groupby(['user_id', 'product_id']).add_to_cart_order.agg(np.mean))
    user_prod_avg_add_to_cart_order.rename(
        columns={'add_to_cart_order': 'user_prod_avg_add_to_cart_order'},
        inplace=True)
    df_train = df_train.merge(user_prod_avg_add_to_cart_order, how='left',
                              left_on=['user_id', 'product_id'],
                              right_index=True)
    df_test = df_test.merge(user_prod_avg_add_to_cart_order, how='left',
                            left_on=['user_id', 'product_id'],
                            right_index=True)

    # order_dow processing
    # priors.groupby(['user_id','product_id']).apply(get_p_dow)

    # load timeline related data
    # timeline_data = get_timeline_data()

    # save the df_train, df_test
    print "save the processed data"
    common.save_df(df_train, "../data/", train_data_name, index=False)
    common.save_df(df_test, "../data/", test_data_name, index=False)
def load_user_prod_cat_data(num_topics=20, rename=True):
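    """Load the cached user-category, product-category and user-product
    match frames for the given number of topics."""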
    ### load cat data
    user_cat_name = "prior_user_cat_" + str(num_topics)
    df_user_cat = common.load_df("../data/", user_cat_name)
    df_user_cat = df_user_cat.set_index('user_id', drop=False)

    prod_cat_name = "prior_prod_cat_" + str(num_topics)
    df_prod_cat = common.load_df("../data/", prod_cat_name)
    df_prod_cat = df_prod_cat.set_index('prod_id', drop=False)

    user_prod_match_name = "prior_user_prod_match_" + str(num_topics)
    df_user_prod_match = common.load_df("../data/", user_prod_match_name)

    # reset dtype to category
    df_user_cat['user_cat'] = df_user_cat['user_cat'].astype('category')
    df_prod_cat['prod_cat'] = df_prod_cat['prod_cat'].astype('category')

    if rename:
        df_user_cat.rename(columns={"user_cat": "user_cat_" + str(num_topics)},
                           inplace=True)
        df_prod_cat.rename(columns={"prod_cat": "prod_cat_" + str(num_topics)},
                           inplace=True)
        df_prod_cat.rename(columns={"prod_id": "product_id"}, inplace=True)
        df_user_prod_match.rename(
            columns={"user_prod_match": "user_prod_match_" + str(num_topics)},
            inplace=True)
        df_user_prod_match.rename(columns={'prod_id': 'product_id'},
                                  inplace=True)

    return df_user_cat, df_prod_cat, df_user_prod_match
def get_processed_data(path="../data/",
                       train_data_name="train_data",
                       test_data_name="test_data"):
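    """Return the train/test feature frames, loading them from the cached
    gzipped CSVs when present, otherwise building and saving them."""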
    train_path = path + train_data_name + ".csv.gz"
    test_path = path + test_data_name + ".csv.gz"
    if os.path.isfile(train_path) and os.path.isfile(test_path):
        # load the data
        df_train = common.load_df(path, train_data_name)
        df_test = common.load_df(path, test_data_name)
    else:
        print "no data, start processing"
        df_train, df_test = processing_data()
        # save the df_train, df_test
        common.save_df(df_train, "../data/", train_data_name, index=False)
        common.save_df(df_test, "../data/", test_data_name, index=False)
    return df_train, df_test
def extra_reorder_rate_processing(path="../data/",
                                  train_data_name="train_data",
                                  test_data_name="test_data"):
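    """Recompute the aisle/department/category reorder-rate features on the
    combined train+test frame, then re-split and save it."""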
    # load the data
    df_train = common.load_df(path, train_data_name)
    df_test = common.load_df(path, test_data_name)

    df_all = pd.concat([df_train, df_test])
    assert (df_all.shape[0] == df_train.shape[0] + df_test.shape[0])

    print df_all.shape
    print df_all.keys()

    # the cat 50 and 100 features were computed incorrectly earlier,
    # so rebuild them here: first drop the stale cat columns
    drop_cols = [
        'aisle_reorder_rate', 'department_reorder_rate', 'prod_cat_20',
        'prod_cat_50', 'prod_cat_100', 'user_cat_20', 'user_cat_50',
        'user_cat_100', 'user_prod_match_20', 'user_prod_match_50',
        'user_prod_match_100', 'prod_cat_20_reorder_rate',
        'user_cat_20_prod_reorder_rate',
        'user_cat_20_prod_cat_20_reorder_rate', 'prod_cat_50_reorder_rate',
        'user_cat_50_prod_reorder_rate',
        'user_cat_50_prod_cat_50_reorder_rate', 'prod_cat_100_reorder_rate',
        'user_cat_100_prod_reorder_rate',
        'user_cat_100_prod_cat_100_reorder_rate'
    ]
    df_all.drop(drop_cols, axis=1, inplace=True, errors='ignore')
    print df_all.shape

    # load user and product category data
    user_cat_data, prod_cat_data, user_prod_cat_match_data = \
        get_user_prod_cat_data()

    # merge category data
    df_all = df_all.merge(user_cat_data, how='left', on='user_id')
    df_all = df_all.merge(prod_cat_data, how='left', on='product_id')
    df_all = df_all.merge(user_prod_cat_match_data,
                          how='left',
                          on=['user_id', 'product_id'])

    # reorder_rate processing
    # aisle reorder rate
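    # a group's reorder rate = total reorders / total reorder opportunities:
    # each (user, product) pair contributes user_prod_no_of_orders - 1
    # reorders out of user_prod_orders_since_first_ordered chances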
    aisle_rate = pd.DataFrame(
        df_all.groupby('aisle_id').apply(
            lambda orders: sum(orders.user_prod_no_of_orders - 1) /
            float(sum(orders.user_prod_orders_since_first_ordered))
            if sum(orders.user_prod_orders_since_first_ordered) > 0 else 0.0),
        columns=['aisle_reorder_rate'])
    df_all = df_all.merge(aisle_rate, how='left',
                          left_on='aisle_id', right_index=True)
    assert (df_all.groupby('aisle_id').aisle_reorder_rate.apply(
        lambda x: x.unique().shape[0] == 1).sum() ==
            df_all.aisle_id.unique().shape[0])

    # department reorder rate
    department_rate = pd.DataFrame(
        df_all.groupby('department_id').apply(
            lambda orders: sum(orders.user_prod_no_of_orders - 1) /
            float(sum(orders.user_prod_orders_since_first_ordered))
            if sum(orders.user_prod_orders_since_first_ordered) > 0 else 0.0),
        columns=['department_reorder_rate'])
    df_all = df_all.merge(department_rate, how='left',
                          left_on='department_id', right_index=True)
    assert (df_all.groupby('department_id').department_reorder_rate.apply(
        lambda x: x.unique().shape[0] == 1).sum() ==
            df_all.department_id.unique().shape[0])

    nt_list = [20, 50, 100]
    for num_topics in nt_list:
        # prod_cat_reorder_rate
        prod_cat_str = 'prod_cat_' + str(num_topics)
        prod_cat_rate = pd.DataFrame(
            df_all.groupby(prod_cat_str).apply(
                lambda orders: sum(orders.user_prod_no_of_orders - 1) /
                float(sum(orders.user_prod_orders_since_first_ordered))
                if sum(orders.user_prod_orders_since_first_ordered) > 0
                else 0.0),
            columns=[prod_cat_str + '_reorder_rate'])
        df_all = df_all.merge(prod_cat_rate, how='left',
                              left_on=prod_cat_str, right_index=True)
        assert (df_all.groupby(prod_cat_str)
                [prod_cat_str + '_reorder_rate'].apply(
                    lambda x: x.unique().shape[0] == 1).sum() ==
                df_all[prod_cat_str].unique().shape[0])

        # user_cat_prod_reorder_rate
        # for a given product, the reorder rate over all users who belong to
        # a particular user category
        user_cat_str = 'user_cat_' + str(num_topics)
        user_cat_prod_rate = pd.DataFrame(
            df_all.groupby([user_cat_str, 'product_id']).apply(
                lambda orders: sum(orders.user_prod_no_of_orders - 1) /
                float(sum(orders.user_prod_orders_since_first_ordered))
                if sum(orders.user_prod_orders_since_first_ordered) > 0
                else 0.0),
            columns=[user_cat_str + '_prod_reorder_rate'])
        df_all = df_all.merge(user_cat_prod_rate, how='left',
                              left_on=[user_cat_str, 'product_id'],
                              right_index=True)
        assert (df_all.groupby([user_cat_str, 'product_id'])
                [user_cat_str + '_prod_reorder_rate'].apply(
                    lambda x: x.unique().shape[0] == 1).sum() ==
                len(df_all.groupby([user_cat_str, 'product_id']).groups.keys()))

        # user_cat_prod_cat_reorder_rate
        user_cat_prod_cat_rate = pd.DataFrame(
            df_all.groupby([user_cat_str, prod_cat_str]).apply(
                lambda orders: sum(orders.user_prod_no_of_orders - 1) /
                float(sum(orders.user_prod_orders_since_first_ordered))
                if sum(orders.user_prod_orders_since_first_ordered) > 0
                else 0.0),
            columns=[user_cat_str + '_' + prod_cat_str + '_reorder_rate'])
        df_all = df_all.merge(user_cat_prod_cat_rate, how='left',
                              left_on=[user_cat_str, prod_cat_str],
                              right_index=True)
        assert (df_all.groupby([user_cat_str, prod_cat_str])
                [user_cat_str + '_' + prod_cat_str + '_reorder_rate'].apply(
                    lambda x: x.unique().shape[0] == 1).sum() ==
                len(df_all.groupby([user_cat_str, prod_cat_str]).groups.keys()))
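
    # note: the five rate computations above share one formula; a possible
    # refactor (sketch only, the helper name is hypothetical):
    # def grouped_reorder_rate(df, keys, col):
    #     rate = df.groupby(keys).apply(
    #         lambda g: sum(g.user_prod_no_of_orders - 1) /
    #         float(sum(g.user_prod_orders_since_first_ordered))
    #         if sum(g.user_prod_orders_since_first_ordered) > 0 else 0.0)
    #     return pd.DataFrame(rate, columns=[col])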

    # set dtypes
    category_cols = [
        'eval_set', 'prod_cat_20', 'prod_cat_50', 'prod_cat_100',
        'user_cat_20', 'user_cat_50', 'user_cat_100'
    ]
    for col in category_cols:
        df_all[col] = df_all[col].astype('category')

    rate_cols = [
        'aisle_reorder_rate', 'department_reorder_rate',
        'prod_cat_20_reorder_rate', 'user_cat_20_prod_reorder_rate',
        'user_cat_20_prod_cat_20_reorder_rate', 'prod_cat_50_reorder_rate',
        'user_cat_50_prod_reorder_rate',
        'user_cat_50_prod_cat_50_reorder_rate', 'prod_cat_100_reorder_rate',
        'user_cat_100_prod_reorder_rate',
        'user_cat_100_prod_cat_100_reorder_rate'
    ]
    for col in rate_cols:
        df_all[col] = df_all[col].astype('float32')

    print df_all.shape
    # split df_all back into train and test
    # df_train=df_all[df_all.eval_set=='train'].drop(['add_to_cart_order'],axis=1)
    # df_test=(df_all[df_all.eval_set=='test']).drop(['add_to_cart_order','reordered'],axis=1)
    # .copy() so the astype assignment below works on an independent frame
    df_train = df_all[df_all.eval_set == 'train'].copy()
    df_train['reordered'] = df_train['reordered'].astype(np.uint8)
    df_test = df_all[df_all.eval_set == 'test'].drop(['reordered'], axis=1)

    print df_train.shape
    print df_test.shape

    # save the df_train, df_test
    print "save the processed data"
    common.save_df(df_train, "../data/", train_data_name, index=False)
    common.save_df(df_test, "../data/", test_data_name, index=False)
logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s',
                    level=logging.INFO,
                    filename=__file__[:-3] + ts() + '.log',
                    filemode='w+')

log = logging.getLogger(__name__)

if __name__ == "__main__":

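    # cap TensorFlow at half of the GPU memory and let the allocation grow
    # on demand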
    config = tf.ConfigProto()  # device_count={'GPU': 0} would force CPU-only
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    config.gpu_options.allow_growth = True
    keras.backend.tensorflow_backend.set_session(tf.Session(config=config))

    X_train, X_test, y_train, y_test, y_train_argmax, y_test_argmax = df_to_ML_data(
        load_df())

    node_subset_change_interval = 3
    population_size = 50
    num_parents_mating = 8
    num_generations = 5000
    mutation_chance = 0.01
    mutation_rate = 3
    stuck_multiplier = 1
    stuck_evasion_rate = 1.25
    stuck_multiplier_max = 5
    stuck_check_length = 30
    save_interval = 5
    plot_interval = 150000
    federated_population_fitness = federated_population_fitness_model_based
    individual_fitness = individual_fitness_nmse


if __name__ == '__main__':
    df_train, df_test = get_processed_data()
    extra_reorder_rate_processing()
    pnone_processing()

    # debug for extra cat data
    # load the data
    df_train = common.load_df('../data/', 'train_data')
    df_test = common.load_df('../data/', 'test_data')

    df_train = get_extra_cat_data(df_train)
    df_test = get_extra_cat_data(df_test)
Example #8
    if use_prior_orders:  # hypothetical flag; the original condition was cut off
        user_corpus_name = "prior_user_corpus"
        model_fn_prefix = "../models/prior_up_"
        user_cat_name_prefix = "prior_user_cat_"
        prod_cat_name_prefix = "prior_prod_cat_"
    else:
        user_corpus_name = "train_user_corpus"
        model_fn_prefix = "../models/train_up_"
        user_cat_name_prefix = "train_user_cat_"
        prod_cat_name_prefix = "train_prod_cat_"

    print user_corpus_name
    print model_fn_prefix
    print user_cat_name_prefix
    print prod_cat_name_prefix

    # load and build user products
    # products correspond to corpus
    df_user_prods = common.load_df("../data/",
                                   user_corpus_name,
                                   converters={"user_corpus": literal_eval})
    df_user_prods = df_user_prods.set_index('user_id', drop=False)
    user_prods = list(df_user_prods.user_corpus)

    # build lda model
    if __debug__:
        print "In debug mode, non-debug mode command: python -O " + __file__ + "\n\n"
        print "Processing the first 1000 users, 20 topics"
        up_lda = LdaMulticore(corpus=user_prods[0:1000],
                              id2word=id2prod,
                              workers=3,
                              num_topics=20)
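        # round-trip sanity check: save the model, reload it and show its topics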
        up_lda.save('/tmp/up.lda')
        loaded_up_lda = LdaModel.load('/tmp/up.lda')
        loaded_up_lda.show_topics()
Example #9
if __debug__:  # original guard was cut off; __debug__ matches the self-test pattern above
    # the following is self-test code for 10 users
    # load the data
    ten_user_orders = pd.read_csv("/tmp/ten_user_orders.csv.gz",
                                  compression='gzip')

    # test get_user_corpus
    ten_user_corpus = get_user_corpus(ten_user_orders)

    # check the get_user_corpus result
    print "compare user_corpus"
    print ten_user_corpus[ten_user_corpus.user_id == 202277].user_corpus.apply(
        lambda row: list(zip(*row)[0]))
    print ten_user_orders[ten_user_orders.user_id ==
                          202277].product_id.sort_values().tolist()

    # save the corpus
    common.save_df(ten_user_corpus, "/tmp/", "ten_user_corpus", index=False)
    # load the corpus back
    load_ten_user_corpus = common.load_df("/tmp/", "ten_user_corpus")
    print load_ten_user_corpus
else:
    ### formal code
    IDIR = "../input/"
    priors, train, orders, products, aisles, departments = \
        common.load_raw_data(IDIR)

    # only build the corpus for priors, which is used for cross-validation
    print('add order info to priors')
    # set_index with inplace=True returns None, so do not reassign the result
    orders.set_index('order_id', inplace=True, drop=False)
    priors = priors.join(orders, on='order_id', rsuffix='_')
    priors.drop('order_id_', inplace=True, axis=1)

    if "priors" in objects:
        priors_user_corpus = get_user_corpus(priors)
        print "compare user_corpus"
Example #10
    return pd.Series([
        np.mean(df_x[0]),  # implied by the 'user_cat_mean' index label below
        np.std(df_x[0]),
        sp.stats.skew(df_x[0]),
        sp.stats.kurtosis(df_x[0])
    ],
                     index=[
                         'user_cat_mean', 'user_cat_std', 'user_cat_skew',
                         'user_cat_kur'
                     ])


if __name__ == '__main__':
    # get data
    # df_train,df_test = get_processed_data()

    #  try to use extra data
    df_train = common.load_df('../data/', 'df_imba_train')
    df_train['aisle_id'] = df_train['aisle_id'].astype('category')
    df_train['department_id'] = df_train['department_id'].astype('category')

    # load extra cat data 150, 300
    df_train = get_extra_cat_data(df_train)
    print(df_train.dtypes['user_cat_150'])
    print(df_train.dtypes['prod_cat_150'])

    # find features that could be used to predict pnone
    pne = pNoneEstimator()

    find_features = False
    if find_features:
        df_features = df_train[df_train.user_id < 10000]
        order_numbers = df_features.order_id.unique().shape[0]
        #'bagging_freq': 5,
        'early_stopping_rounds':
        100,  # important: best_iteration is only set when early stopping actually triggers
        'num_boost_round':
        3000,  # maximum number of boosted trees to fit
        'decay': 0.995,
        'min_learning_rate': 0.02,
        'verbose_eval': True
    }

    print(trainParams)
    common.logging_dict(logger, trainParams, 'test logging')
    logger.debug('lgb_version=%s' % lgb.__version__)

    # load the data
    df_train = common.load_df('../data/', 'df_imba_train')
    df_train['aisle_id'] = df_train['aisle_id'].astype('category')
    df_train['department_id'] = df_train['department_id'].astype('category')

    # load extra cat data 150, 300
    df_train = get_extra_cat_data(df_train)
    print(df_train.dtypes['user_cat_150'])
    print(df_train.dtypes['prod_cat_150'])

    # bst_model_id decides whether to load an existing model or run training
    bst_model_id = -1

    if bst_model_id < 0:
        print("execute the training")
        # run the training
        model = run_training(df_train, copy.deepcopy(trainParams))
Example #12
from common import load_df, df_to_ML_data
from myneat import LoggingReporter
from sklearn.metrics import accuracy_score
import os
import pickle
import neat
import neat_visualization as visualize
import logging
import sys
logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s',level=logging.INFO, 
                    filename='genetic_neat.log', filemode='w+')


log = logging.getLogger(__name__)

X_train, X_test, y_train, y_test = df_to_ML_data(load_df())


def eval_genomes(genomes, config):
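    # a genome's fitness is the accuracy of its phenotype network on the
    # training set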
    for _, genome in genomes:
        net = neat.nn.FeedForwardNetwork.create(genome, config)
        y_pred = [net.activate(Xi) for Xi in X_train]
        genome.fitness = accuracy_score(y_train, y_pred)


def run(config_file):
    # Load configuration.
    config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                         neat.DefaultSpeciesSet, neat.DefaultStagnation,
                         config_file)