def pnone_processing(path="../data/", train_data_name="train_data",
                     test_data_name="test_data"):
    # load the data
    df_train = common.load_df(path, train_data_name)
    df_test = common.load_df(path, test_data_name)
    print df_train.shape
    print df_test.shape

    priors, train, orders, products, aisles, departments = preprocessing_data()

    # pnone: fraction of a user's prior orders that contain no reordered item
    df_user_pnone = pd.DataFrame(
        priors.groupby('user_id').apply(
            lambda user_orders:
            sum(user_orders.groupby('order_id').reordered.sum() == 0)
            / float(user_orders.order_id.unique().shape[0])),
        columns=['pnone'])
    df_train = df_train.merge(df_user_pnone, how='left',
                              left_on=['user_id'], right_index=True)
    df_test = df_test.merge(df_user_pnone, how='left',
                            left_on=['user_id'], right_index=True)

    # save the df_train, df_test
    print "save the processed data"
    common.save_df(df_train, path, train_data_name, index=False)
    common.save_df(df_test, path, test_data_name, index=False)
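# A hedged toy check of the pnone computation above (not from the repo; the
# numbers are made up). For each user, pnone is the share of prior orders
# whose reordered flags sum to zero.
def _pnone_toy_check():
    toy = pd.DataFrame({'user_id': [1, 1, 1, 2, 2],
                        'order_id': [10, 10, 11, 20, 21],
                        'reordered': [0, 0, 1, 0, 0]})
    pnone = toy.groupby('user_id').apply(
        lambda u: sum(u.groupby('order_id').reordered.sum() == 0)
        / float(u.order_id.unique().shape[0]))
    # user 1: order 10 has no reorder, order 11 has one -> 0.5
    # user 2: neither order has a reorder -> 1.0
    assert pnone[1] == 0.5 and pnone[2] == 1.0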
def extra_feature_processing(path="../data/", train_data_name="train_data",
                             test_data_name="test_data"):
    # load the data
    df_train = common.load_df(path, train_data_name)
    df_test = common.load_df(path, test_data_name)
    print df_train.shape
    print df_test.shape

    priors, train, orders, products, aisles, departments = preprocessing_data()

    # user prod add_to_cart_order mean, std
    # (only the mean is computed here; see the sketch after this function)
    user_prod_avg_add_to_cart_order = pd.DataFrame(
        priors.groupby(['user_id', 'product_id']).add_to_cart_order.agg(np.mean))
    user_prod_avg_add_to_cart_order.rename(
        columns={'add_to_cart_order': 'user_prod_avg_add_to_cart_order'},
        inplace=True)
    df_train = df_train.merge(user_prod_avg_add_to_cart_order, how='left',
                              left_on=['user_id', 'product_id'],
                              right_index=True)
    df_test = df_test.merge(user_prod_avg_add_to_cart_order, how='left',
                            left_on=['user_id', 'product_id'],
                            right_index=True)

    # order_dow processing (not implemented yet)
    # priors.groupby(['user_id','product_id']).apply(get_p_dow)

    # load timeline related data (not used yet)
    # timeline_data = get_timeline_data()

    # save the df_train, df_test
    print "save the processed data"
    common.save_df(df_train, path, train_data_name, index=False)
    common.save_df(df_test, path, test_data_name, index=False)
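# The comment above mentions mean and std but only the mean is computed. A
# hedged sketch (helper and std column names are my own, not from the repo)
# that returns both statistics in one pass:
def user_prod_cart_order_stats(priors):
    stats = priors.groupby(['user_id', 'product_id']).add_to_cart_order.agg(
        [np.mean, np.std])
    stats.columns = ['user_prod_avg_add_to_cart_order',
                     'user_prod_std_add_to_cart_order']
    return stats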
def load_user_prod_cat_data(num_topics=20, rename=True):
    ### load cat data
    user_cat_name = "prior_user_cat_" + str(num_topics)
    df_user_cat = common.load_df("../data/", user_cat_name)
    df_user_cat = df_user_cat.set_index('user_id', drop=False)

    prod_cat_name = "prior_prod_cat_" + str(num_topics)
    df_prod_cat = common.load_df("../data/", prod_cat_name)
    df_prod_cat = df_prod_cat.set_index('prod_id', drop=False)

    user_prod_match_name = "prior_user_prod_match_" + str(num_topics)
    df_user_prod_match = common.load_df("../data/", user_prod_match_name)

    # reset dtype to category
    df_user_cat['user_cat'] = df_user_cat['user_cat'].astype('category')
    df_prod_cat['prod_cat'] = df_prod_cat['prod_cat'].astype('category')

    if rename:
        df_user_cat.rename(columns={"user_cat": "user_cat_" + str(num_topics)},
                           inplace=True)
        df_prod_cat.rename(columns={"prod_cat": "prod_cat_" + str(num_topics)},
                           inplace=True)
        df_prod_cat.rename(columns={"prod_id": "product_id"}, inplace=True)
        df_user_prod_match.rename(
            columns={"user_prod_match": "user_prod_match_" + str(num_topics)},
            inplace=True)
        df_user_prod_match.rename(columns={'prod_id': 'product_id'},
                                  inplace=True)

    return df_user_cat, df_prod_cat, df_user_prod_match
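# Hedged usage sketch (not from the repo): merging the three frames returned
# by load_user_prod_cat_data onto a feature table keyed by user/product.
def merge_cat_features(df, num_topics=20):
    df_user_cat, df_prod_cat, df_user_prod_match = \
        load_user_prod_cat_data(num_topics=num_topics)
    # drop the user_id index so the merge key is unambiguous
    df = df.merge(df_user_cat.reset_index(drop=True), how='left',
                  on='user_id')
    df = df.merge(df_prod_cat.reset_index(drop=True), how='left',
                  on='product_id')
    df = df.merge(df_user_prod_match, how='left',
                  on=['user_id', 'product_id'])
    return df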
def get_processed_data(path="../data/", train_data_name="train_data",
                       test_data_name="test_data"):
    if (os.path.isfile(path + train_data_name + ".csv.gz")
            and os.path.isfile(path + test_data_name + ".csv.gz")):
        # load the cached data
        df_train = common.load_df(path, train_data_name)
        df_test = common.load_df(path, test_data_name)
    else:
        print "no data, start processing"
        df_train, df_test = processing_data()
        # save the df_train, df_test
        common.save_df(df_train, path, train_data_name, index=False)
        common.save_df(df_test, path, test_data_name, index=False)
    return df_train, df_test
def extra_reorder_rate_processing(path="../data/", train_data_name="train_data",
                                  test_data_name="test_data"):
    # load the data
    df_train = common.load_df(path, train_data_name)
    df_test = common.load_df(path, test_data_name)
    df_all = pd.concat([df_train, df_test])
    assert df_all.shape[0] == df_train.shape[0] + df_test.shape[0]
    print df_all.shape
    print df_all.keys()

    # the old cat 50/100 columns are wrong, so drop them and re-process here
    drop_cols = [
        'aisle_reorder_rate', 'department_reorder_rate', 'prod_cat_20',
        'prod_cat_50', 'prod_cat_100', 'user_cat_20', 'user_cat_50',
        'user_cat_100', 'user_prod_match_20', 'user_prod_match_50',
        'user_prod_match_100', 'prod_cat_20_reorder_rate',
        'user_cat_20_prod_reorder_rate',
        'user_cat_20_prod_cat_20_reorder_rate', 'prod_cat_50_reorder_rate',
        'user_cat_50_prod_reorder_rate',
        'user_cat_50_prod_cat_50_reorder_rate', 'prod_cat_100_reorder_rate',
        'user_cat_100_prod_reorder_rate',
        'user_cat_100_prod_cat_100_reorder_rate'
    ]
    df_all.drop(drop_cols, axis=1, inplace=True, errors='ignore')
    print df_all.shape

    # load user and product category data
    user_cat_data, prod_cat_data, user_prod_cat_match_data = \
        get_user_prod_cat_data()

    # merge category data
    df_all = df_all.merge(user_cat_data, how='left', on='user_id')
    df_all = df_all.merge(prod_cat_data, how='left', on='product_id')
    df_all = df_all.merge(user_prod_cat_match_data, how='left',
                          on=['user_id', 'product_id'])

    # reorder rate of a group = sum(no_of_orders - 1) / sum(orders_since_first)
    # aisle reorder rate
    df_all = df_all.merge(
        pd.DataFrame(
            df_all.groupby('aisle_id').apply(
                lambda orders: sum(orders.user_prod_no_of_orders - 1)
                / float(sum(orders.user_prod_orders_since_first_ordered))
                if sum(orders.user_prod_orders_since_first_ordered) > 0
                else 0.0),
            columns=['aisle_reorder_rate']),
        how='left', left_on='aisle_id', right_index=True)
    assert (df_all.groupby('aisle_id').aisle_reorder_rate
            .apply(lambda x: x.unique().shape[0] == 1).sum()
            == df_all.aisle_id.unique().shape[0])

    # department reorder rate
    df_all = df_all.merge(
        pd.DataFrame(
            df_all.groupby('department_id').apply(
                lambda orders: sum(orders.user_prod_no_of_orders - 1)
                / float(sum(orders.user_prod_orders_since_first_ordered))
                if sum(orders.user_prod_orders_since_first_ordered) > 0
                else 0.0),
            columns=['department_reorder_rate']),
        how='left', left_on='department_id', right_index=True)
    assert (df_all.groupby('department_id').department_reorder_rate
            .apply(lambda x: x.unique().shape[0] == 1).sum()
            == df_all.department_id.unique().shape[0])

    nt_list = [20, 50, 100]
    for num_topics in nt_list:
        # prod_cat reorder rate
        prod_cat_str = 'prod_cat_' + str(num_topics)
        df_all = df_all.merge(
            pd.DataFrame(
                df_all.groupby(prod_cat_str).apply(
                    lambda orders: sum(orders.user_prod_no_of_orders - 1)
                    / float(sum(orders.user_prod_orders_since_first_ordered))
                    if sum(orders.user_prod_orders_since_first_ordered) > 0
                    else 0.0),
                columns=[prod_cat_str + '_reorder_rate']),
            how='left', left_on=prod_cat_str, right_index=True)
        assert (df_all.groupby(prod_cat_str)[prod_cat_str + '_reorder_rate']
                .apply(lambda x: x.unique().shape[0] == 1).sum()
                == df_all[prod_cat_str].unique().shape[0])

        # user_cat_prod reorder rate: for a given product, the reorder rate
        # over all users who belong to a particular user category
        user_cat_str = 'user_cat_' + str(num_topics)
        df_all = df_all.merge(
            pd.DataFrame(
                df_all.groupby([user_cat_str, 'product_id']).apply(
                    lambda orders: sum(orders.user_prod_no_of_orders - 1)
                    / float(sum(orders.user_prod_orders_since_first_ordered))
                    if sum(orders.user_prod_orders_since_first_ordered) > 0
                    else 0.0),
                columns=[user_cat_str + '_prod_reorder_rate']),
            how='left', left_on=[user_cat_str, 'product_id'],
            right_index=True)
        assert (df_all.groupby([user_cat_str, 'product_id'])
                [user_cat_str + '_prod_reorder_rate']
                .apply(lambda x: x.unique().shape[0] == 1).sum()
                == len(df_all.groupby([user_cat_str,
                                       'product_id']).groups.keys()))

        # user_cat x prod_cat reorder rate
        df_all = df_all.merge(
            pd.DataFrame(
                df_all.groupby([user_cat_str, prod_cat_str]).apply(
                    lambda orders: sum(orders.user_prod_no_of_orders - 1)
                    / float(sum(orders.user_prod_orders_since_first_ordered))
                    if sum(orders.user_prod_orders_since_first_ordered) > 0
                    else 0.0),
                columns=[user_cat_str + '_' + prod_cat_str + '_reorder_rate']),
            how='left', left_on=[user_cat_str, prod_cat_str],
            right_index=True)
        assert (df_all.groupby([user_cat_str, prod_cat_str])
                [user_cat_str + '_' + prod_cat_str + '_reorder_rate']
                .apply(lambda x: x.unique().shape[0] == 1).sum()
                == len(df_all.groupby([user_cat_str,
                                       prod_cat_str]).groups.keys()))

    # set dtypes
    category_cols = [
        'eval_set', 'prod_cat_20', 'prod_cat_50', 'prod_cat_100',
        'user_cat_20', 'user_cat_50', 'user_cat_100'
    ]
    for col in category_cols:
        df_all[col] = df_all[col].astype('category')
    rate_cols = [
        'aisle_reorder_rate', 'department_reorder_rate',
        'prod_cat_20_reorder_rate', 'user_cat_20_prod_reorder_rate',
        'user_cat_20_prod_cat_20_reorder_rate', 'prod_cat_50_reorder_rate',
        'user_cat_50_prod_reorder_rate',
        'user_cat_50_prod_cat_50_reorder_rate', 'prod_cat_100_reorder_rate',
        'user_cat_100_prod_reorder_rate',
        'user_cat_100_prod_cat_100_reorder_rate'
    ]
    # use float32 to save memory
    for col in rate_cols:
        df_all[col] = df_all[col].astype('float32')
    print df_all.shape

    # split df_all back into train and test
    # df_train = df_all[df_all.eval_set == 'train'].drop(['add_to_cart_order'], axis=1)
    # df_test = df_all[df_all.eval_set == 'test'].drop(['add_to_cart_order', 'reordered'], axis=1)
    df_train = df_all[df_all.eval_set == 'train'].copy()
    df_train['reordered'] = df_train['reordered'].astype(np.uint8)
    df_test = df_all[df_all.eval_set == 'test'].drop(['reordered'], axis=1)
    print df_train.shape
    print df_test.shape

    # save the df_train, df_test
    print "save the processed data"
    common.save_df(df_train, path, train_data_name, index=False)
    common.save_df(df_test, path, test_data_name, index=False)
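# The same reorder-rate formula is applied to six different groupings above.
# A hedged refactor sketch (the helper name is my own, not from the repo):
def _group_reorder_rate(df, by, col_name):
    # reorder rate of a group = sum(no_of_orders - 1) / sum(orders_since_first)
    rate = df.groupby(by).apply(
        lambda orders: sum(orders.user_prod_no_of_orders - 1)
        / float(sum(orders.user_prod_orders_since_first_ordered))
        if sum(orders.user_prod_orders_since_first_ordered) > 0 else 0.0)
    return df.merge(pd.DataFrame(rate, columns=[col_name]),
                    how='left', left_on=by, right_index=True)

# e.g. df_all = _group_reorder_rate(df_all, 'aisle_id', 'aisle_reorder_rate')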
logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s',
                    level=logging.INFO,
                    filename=__file__[:-3] + ts() + '.log',
                    filemode='w+')
log = logging.getLogger(__name__)

if __name__ == "__main__":
    # limit TensorFlow GPU memory usage
    config = tf.ConfigProto()  # device_count={'GPU': 0}
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    config.gpu_options.allow_growth = True
    keras.backend.tensorflow_backend.set_session(tf.Session(config=config))

    X_train, X_test, y_train, y_test, y_train_argmax, y_test_argmax = \
        df_to_ML_data(load_df())

    # genetic algorithm hyper-parameters
    node_subset_change_interval = 3
    population_size = 50
    num_parents_mating = 8
    num_generations = 5000
    mutation_chance = 0.01
    mutation_rate = 3
    stuck_multiplier = 1
    stuck_evasion_rate = 1.25
    stuck_multiplier_max = 5
    stuck_check_length = 30
    save_interval = 5
    plot_interval = 150000
    federated_population_fitness = federated_population_fitness_model_based
    individual_fitness = individual_fitness_nmse
if __name__ == '__main__':
    df_train, df_test = get_processed_data()
    extra_reorder_rate_processing()
    pnone_processing()

    # debug for extra cat data
    # load the data
    df_train = common.load_df('../data/', 'train_data')
    df_test = common.load_df('../data/', 'test_data')
    df_train = get_extra_cat_data(df_train)
    df_test = get_extra_cat_data(df_test)
    prod_cat_name_prefix = "prior_prod_cat_"
else:
    user_corpus_name = "train_user_corpus"
    model_fn_prefix = "../models/train_up_"
    user_cat_name_prefix = "train_user_cat_"
    prod_cat_name_prefix = "train_prod_cat_"
print user_corpus_name
print model_fn_prefix
print user_cat_name_prefix
print prod_cat_name_prefix

# load and build user products; the products correspond to the corpus
df_user_prods = common.load_df("../data/", user_corpus_name,
                               converters={"user_corpus": literal_eval})
df_user_prods = df_user_prods.set_index('user_id', drop=False)
user_prods = list(df_user_prods.user_corpus)

# build the LDA model
if __debug__:
    print "In debug mode, non-debug mode command: python -O " + __file__ + "\n\n"
    print "Processing 1000 users, 20 topics"
    up_lda = LdaMulticore(corpus=user_prods[0:1000], id2word=id2prod,
                          workers=3, num_topics=20)
    up_lda.save('/tmp/up.lda')
    loaded_up_lda = LdaModel.load('/tmp/up.lda')
    loaded_up_lda.show_topics()
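# Hedged sketch (not from the repo): reading a per-user category off the
# trained model, i.e. the dominant LDA topic of a user's bag-of-products.
def dominant_topic(lda_model, user_bow):
    # get_document_topics returns a list of (topic_id, probability) pairs
    topics = lda_model.get_document_topics(user_bow, minimum_probability=0.0)
    return max(topics, key=lambda tp: tp[1])[0]

# e.g. dominant_topic(loaded_up_lda, user_prods[0])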
    # the following is self-test code for 10 users
    # load the data
    ten_user_orders = pd.read_csv("/tmp/ten_user_orders.csv.gz",
                                  compression='gzip')
    # test get_user_corpus
    ten_user_corpus = get_user_corpus(ten_user_orders)
    # check the get_user_corpus result
    print "compare user_corpus"
    print ten_user_corpus[ten_user_corpus.user_id == 202277].user_corpus \
        .apply(lambda row: list(zip(*row)[0]))
    print ten_user_orders[ten_user_orders.user_id == 202277] \
        .product_id.sort_values().tolist()
    # save the corpus
    common.save_df(ten_user_corpus, "/tmp/", "ten_user_corpus", index=False)
    # load the corpus back
    load_ten_user_corpus = common.load_df("/tmp/", "ten_user_corpus")
    print load_ten_user_corpus
else:
    ### formal code
    IDIR = "../input/"
    priors, train, orders, products, aisles, departments = \
        common.load_raw_data(IDIR)

    # only build the corpus for priors, which is used for cross-validation
    print('add order info to priors')
    # set_index with inplace=True returns None, so do not assign it back
    orders.set_index('order_id', inplace=True, drop=False)
    priors = priors.join(orders, on='order_id', rsuffix='_')
    priors.drop('order_id_', inplace=True, axis=1)

    if "priors" in objects:
        priors_user_corpus = get_user_corpus(priors)
        print "compare user_corpus"
            np.std(df_x[0]),
            sp.stats.skew(df_x[0]),
            sp.stats.kurtosis(df_x[0])
        ],
        index=[
            'user_cat_mean', 'user_cat_std', 'user_cat_skew', 'user_cat_kur'
        ])


if __name__ == '__main__':
    # get data
    # df_train, df_test = get_processed_data()
    # use the extra (imba) training data instead
    df_train = common.load_df('../data/', 'df_imba_train')
    df_train['aisle_id'] = df_train['aisle_id'].astype('category')
    df_train['department_id'] = df_train['department_id'].astype('category')

    # load extra cat data (150 and 300 topics)
    df_train = get_extra_cat_data(df_train)
    print(df_train.dtypes['user_cat_150'])
    print(df_train.dtypes['prod_cat_150'])

    # find the features which could be used to predict pnone
    pne = pNoneEstimator()
    find_features = False
    if find_features:
        df_features = df_train[df_train.user_id < 10000]
        order_numbers = df_features.order_id.unique().shape[0]
    # 'bagging_freq': 5,
    # early_stopping_rounds matters: best_iteration is only returned when
    # early stopping actually happens
    'early_stopping_rounds': 100,
    'num_boost_round': 3000,  # number of boosted trees to fit
    'decay': 0.995,
    'min_learning_rate': 0.02,
    'verbose_eval': True
}
print(trainParams)
common.logging_dict(logger, trainParams, 'test logging')
logger.debug('lgb_version=%s' % lgb.__version__)

# load the data
df_train = common.load_df('../data/', 'df_imba_train')
df_train['aisle_id'] = df_train['aisle_id'].astype('category')
df_train['department_id'] = df_train['department_id'].astype('category')

# load extra cat data (150 and 300 topics)
df_train = get_extra_cat_data(df_train)
print(df_train.dtypes['user_cat_150'])
print(df_train.dtypes['prod_cat_150'])

# bst_model_id decides whether to load a saved model or to train
bst_model_id = -1
if bst_model_id < 0:
    print("execute the training")
    # run the training
    model = run_training(df_train, copy.deepcopy(trainParams))
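# Hedged sketch (not from the repo): one way the 'decay' and
# 'min_learning_rate' entries above could drive a per-iteration
# learning-rate schedule, using LightGBM's reset_parameter callback.
def make_lr_schedule(base_lr, decay, min_lr):
    return lgb.reset_parameter(
        learning_rate=lambda it: max(base_lr * decay ** it, min_lr))

# e.g. lgb.train(params, dtrain,
#                callbacks=[make_lr_schedule(0.1, trainParams['decay'],
#                                            trainParams['min_learning_rate'])])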
from common import load_df, df_to_ML_data
from myneat import LoggingReporter
from sklearn.metrics import accuracy_score
import os
import pickle
import neat
import neat_visualization as visualize
import logging
import sys

logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s',
                    level=logging.INFO,
                    filename='genetic_neat.log',
                    filemode='w+')
log = logging.getLogger(__name__)

X_train, X_test, y_train, y_test = df_to_ML_data(load_df())


def eval_genomes(genomes, config):
    # fitness of a genome = training accuracy of the network it encodes
    for _, genome in genomes:
        net = neat.nn.FeedForwardNetwork.create(genome, config)
        y_pred = [net.activate(Xi) for Xi in X_train]
        genome.fitness = accuracy_score(y_train, y_pred)


def run(config_file):
    # Load configuration.
    config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                         neat.DefaultSpeciesSet, neat.DefaultStagnation,
                         config_file)