def main():
    """Load the test log and examples, then run ``meta_features`` on them.

    ``meta_features`` is called with ``latent=None`` and ``redo=False``;
    its return value is discarded.
    """
    log, examples = (
        load_hdfs(TEST_FOLDER + file_name)
        for file_name in ('data_log.hd5', 'data_examples.hd5')
    )
    meta_features(
        TEST_FOLDER,
        META_FOLDER,
        log,
        examples,
        latent=None,
        redo=False,
    )
def main():
    """Run the pop, crawl, geo and latent-sim steps, then ``rank_features``.

    Each step receives the examples returned by the previous one. The final
    ``rank_features`` call is made with ``redo=True`` and its result is
    discarded.
    """
    log, features = (
        load_hdfs(TEST_FOLDER + file_name)
        for file_name in ('data_log.hd5', 'data_examples.hd5')
    )

    # Order matters: every step consumes the output of the one before it.
    features = pop_features(TEST_FOLDER, log, features)
    features = crawl_features(TEST_FOLDER, CRAWL_FOLDER, log, features)
    features = geo_features(TEST_FOLDER, CRAWL_FOLDER, log, features)
    features = latent_sim_features(TEST_FOLDER, log, features, LATENT_FOLDER)

    rank_features(TEST_FOLDER, log, features, redo=True)
def main():
    """Load the test log and examples and run ``latent_sim_features``.

    The step is invoked with ``LATENT_FOLDER``, ``KEYS`` and ``SIZES`` and
    ``redo=True``; its return value is discarded.
    """
    log_path = TEST_FOLDER + 'data_log.hd5'
    examples_path = TEST_FOLDER + 'data_examples.hd5'

    log = load_hdfs(log_path)
    examples = load_hdfs(examples_path)

    options = {
        'latent_path': LATENT_FOLDER,
        'keys': KEYS,
        'sizes': SIZES,
        'redo': True,
    }
    latent_sim_features(TEST_FOLDER, log, examples, **options)
def main():
    """Load the test data, clean the log and run ``price_features``.

    The log is passed through ``clean_test`` after both files are loaded.
    ``price_features`` is called with ``redo=True`` and the module-level
    ``MIN_OCCURENCES``, ``HIDDEN`` and ``FILLNA_MEAN`` settings; its return
    value is discarded.
    """
    raw_log, examples = (
        load_hdfs(TEST_FOLDER + file_name)
        for file_name in ('data_log.hd5', 'data_examples.hd5')
    )
    cleaned_log = clean_test(raw_log)

    price_features(
        TEST_FOLDER,
        cleaned_log,
        examples,
        min_occurences=MIN_OCCURENCES,
        hidden=HIDDEN,
        fillna_mean=FILLNA_MEAN,
        redo=True,
    )
def main():
    """Create latent factors from the log for the ITEM and CLICK settings.

    Only the log file is loaded; the examples file is not read. Each
    ``create_latent_factors`` call is followed by ``gc.collect()``.
    """
    log = load_hdfs(DATA_FOLDER + 'data_log.hd5')

    item_settings = {
        'size': SIZEA,
        'actions': ACTIONS_ITEM,
        'values': VALUE_ITEM,
        'key': KEY_ITEM,
        'method': METHOD,
    }
    click_settings = {
        'size': SIZEB,
        'actions': ACTIONS_CLICK,
        'values': VALUE_CLICK,
        'key': KEY_CLICK,
        'method': METHOD,
    }

    # NOTE(review): each configuration is run twice with identical arguments,
    # exactly as before -- confirm whether the repetition is intentional.
    for settings in (item_settings, item_settings,
                     click_settings, click_settings):
        create_latent_factors(log, **settings)
        gc.collect()
def main():
    """Load the test log and examples and run ``properties_features``.

    The step is called with ``redo=True``; its return value is discarded.
    """
    log, examples = (
        load_hdfs(TEST_FOLDER + file_name)
        for file_name in ('data_log.hd5', 'data_examples.hd5')
    )
    properties_features(
        TEST_FOLDER,
        META_FOLDER,
        log,
        examples,
        redo=True,
    )
def main():
    """Load the test log and examples and run ``combine_features``.

    The step is called with ``redo=True``; its return value is discarded.
    """
    log_path = TEST_FOLDER + 'data_log.hd5'
    examples_path = TEST_FOLDER + 'data_examples.hd5'

    log = load_hdfs(log_path)
    examples = load_hdfs(examples_path)

    combine_features(TEST_FOLDER, log, examples, redo=True)
def main():
    """Load the test data, clean the log and run ``position_features``.

    The log is passed through ``clean_test`` after both files are loaded.
    The step is called with ``redo=True``; its return value is discarded.
    """
    raw_log, examples = (
        load_hdfs(TEST_FOLDER + file_name)
        for file_name in ('data_log.hd5', 'data_examples.hd5')
    )
    cleaned_log = clean_test(raw_log)

    position_features(TEST_FOLDER, cleaned_log, examples, redo=True)
def create_set(base_path=SET, key='dataset', conf=None, redo=False):
    """Build the feature set, or load it from its feather cache.

    The cache file is ``base_path + 'sets/' + key + '.fthr'``. If that file
    exists and ``redo`` is false, it is loaded and returned. Otherwise the
    log and examples are loaded from ``base_path``, every feature step is
    applied in a fixed order, and the result is written to the cache file.

    Args:
        base_path: String prefix (concatenated as-is, so it is expected to
            end with a path separator) under which ``data_log.hd5``,
            ``data_examples.hd5`` and the ``sets/`` folder are located.
        key: Name of the set; used as the cache file name.
        conf: Mapping with the settings for the feature steps. Only read
            when the set has to be built. Keys read: ``path_pop``,
            ``pop_hidden``, ``min_pop``, ``train_only``, ``path_price``,
            ``min_occurences``, ``price_hidden``, ``fillna_mean``,
            ``path_session``, ``path_crawl``, ``path_meta``,
            ``meta_latent``, ``path_poi``, ``path_latent``. ``None`` is
            treated as an empty mapping.
        redo: If true, ignore an existing cache file; the flag is also
            forwarded to every feature step.

    Returns:
        The cached examples, or the examples returned by the last feature
        step.

    Raises:
        KeyError: If the set has to be built and ``conf`` is a dict that
            lacks one of the keys listed above.
    """
    # Avoid a shared mutable default; an omitted conf behaves like {}.
    if conf is None:
        conf = {}

    path = Path(base_path + 'sets/' + key + '.fthr')

    if path.is_file() and not redo:
        print('loaded')
        examples = load_feather(path)
        gc.collect()
        return examples

    print('create')
    log = load_hdfs(base_path + 'data_log.hd5')
    examples = load_hdfs(base_path + 'data_examples.hd5')

    # Drop helper columns if present. Membership is checked against a single
    # snapshot of the column names; the two deletions are independent.
    present_columns = set(examples.columns)
    for column in ('current_filters', 'session_id_pre'):
        if column in present_columns:
            print(column)
            del examples[column]

    # The steps below run in a fixed order; each one receives the examples
    # returned by the previous step.
    examples = pop_features(
        conf['path_pop'], log, examples,
        hidden=conf['pop_hidden'],
        min_pop=conf['min_pop'],
        train_only=conf['train_only'],
        redo=redo,
    )
    examples = price_features(
        conf['path_price'], log, examples,
        min_occurences=conf['min_occurences'],
        hidden=conf['price_hidden'],
        train_only=conf['train_only'],
        fillna_mean=conf['fillna_mean'],
        redo=redo,
    )
    examples = session_features(
        conf['path_session'], log, examples,
        crawl_path=conf['path_crawl'],
        redo=redo,
    )
    examples = crawl_features(
        base_path, conf['path_crawl'], log, examples, redo=redo)
    examples = geo_features(
        base_path, conf['path_crawl'], log, examples, redo=redo)
    examples = meta_features(
        base_path, conf['path_meta'], log, examples,
        latent=conf['meta_latent'],
        redo=redo,
    )
    examples = user_features(
        conf['path_session'], log, examples,
        crawl_path=conf['path_crawl'],
        poi_path=conf['path_poi'],
        redo=redo,
    )
    examples = position_features(base_path, log, examples, redo=redo)
    examples = properties_features(
        base_path, conf['path_meta'], log, examples, redo=redo)
    # Disabled step: latent_features(base_path, log, examples,
    #                                latent_path=conf['path_latent'], redo=redo)
    examples = latent_sim_features(
        base_path, log, examples,
        latent_path=conf['path_latent'],
        redo=redo,
    )
    examples = combine_features(base_path, log, examples, redo=redo)
    examples = rank_features(base_path, log, examples, redo=redo)
    examples = time_features(base_path, log, examples, redo=redo)
    examples = list_context_features(base_path, log, examples, redo=redo)
    examples = stars_features(
        base_path, conf['path_meta'], log, examples, redo=redo)
    # Disabled step: prediction_features(base_path, log, examples, redo=redo)

    write_feather(examples, path)

    # Release the log before returning to keep peak memory down.
    del log
    gc.collect()

    return examples
def main():
    """Create latent factors from the log for the ITEM and CLICK settings.

    Only the log file is loaded; the examples file is not read.
    ``create_latent_factors`` is called once per ``(actions, key)`` pair,
    both times with ``size=SIZE``.
    """
    log = load_hdfs(DATA_FOLDER + 'data_log.hd5')

    runs = (
        (ACTIONS_ITEM, KEY_ITEM),
        (ACTIONS_CLICK, KEY_CLICK),
    )
    for actions, key in runs:
        create_latent_factors(log, size=SIZE, actions=actions, key=key)
def main():
    """Create latent factors from the extended item metadata.

    Loads ``meta_extended.hd5`` from ``BASE_PATH + PREP``, selects the
    ``item_id`` and ``properties_code`` columns and passes that selection
    to ``create_latent_factors``.
    """
    meta_path = BASE_PATH + PREP + 'meta_extended.hd5'
    wanted_columns = ['item_id', 'properties_code']

    meta = load_hdfs(meta_path)
    create_latent_factors(meta[wanted_columns])