def latent_sim_features(base_path, log, examples, latent_path=None, keys=KEYS, sizes=SIZES, redo=False):
    """Add the ``latent_sim_features`` columns to ``examples``, backed by a feather cache.

    The cache file is ``base_path + 'features/latent_sim_features.fthr'``.
    If it exists and ``redo`` is false, it is loaded, restricted to the
    session ids found in ``examples`` and handed to ``copy_features``.
    Otherwise the features are computed by ``create_features``, passed
    through ``reduce_mem_usage`` and the new columns are written to the
    cache together with ``session_id`` and ``impressions``.

    Args:
        base_path: String prefix containing the ``features/`` folder; it is
            joined by plain concatenation, so it must bring its own
            trailing separator.
        log: Forwarded unchanged to ``create_features``.
        examples: Object exposing a ``session_id`` column (and
            ``impressions`` when the cache is written).
        latent_path: Forwarded to ``create_features``; defaults to
            ``base_path`` when ``None``.
        keys: Forwarded to ``create_features``.
        sizes: Forwarded to ``create_features``.
        redo: When true, ignore an existing cache file and recompute.

    Returns:
        The result of ``copy_features`` on a cache hit, otherwise the
        result of ``reduce_mem_usage``.
    """
    feature_name = 'latent_sim_features'
    if latent_path is None:
        latent_path = base_path

    cache_file = Path(base_path + 'features/' + feature_name + '.fthr')

    if cache_file.is_file() and not redo:
        # NOTE(review): the cache name does not encode latent_path, keys or
        # sizes, so a cache built with other settings is reused unless
        # redo=True.
        cached = load_feather(cache_file)
        wanted_sessions = examples.session_id.unique()
        cached = cached[cached.session_id.isin(wanted_sessions)]
        return copy_features(examples, cached)

    examples, new_cols = create_features(
        log, examples, latent_path=latent_path, keys=keys, sizes=sizes
    )
    examples = reduce_mem_usage(examples, cols=new_cols)

    cache_columns = ['session_id', 'impressions'] + list(new_cols)
    write_feather(examples[cache_columns], cache_file)
    print_col_list(new_cols)
    return examples
def time_features(base_path, log, examples, preprocessed_path=PREPROCESSED_FOLDER, redo=False):
    """Add the ``time_features`` columns to ``examples``, backed by a feather cache.

    The cache file is ``base_path + 'features/time_features.fthr'``. On a
    cache hit (file exists and ``redo`` is false) the stored rows are
    limited to the session ids in ``examples`` and passed to
    ``copy_features``. On a miss the features come from
    ``create_features``, go through ``reduce_mem_usage`` and the new
    columns are written back with ``session_id`` and ``impressions``.

    Args:
        base_path: String prefix containing the ``features/`` folder
            (concatenated as-is, so include the trailing separator).
        log: Forwarded unchanged to ``create_features``.
        examples: Object exposing a ``session_id`` column.
        preprocessed_path: Forwarded to ``create_features``.
        redo: When true, recompute even if the cache file exists.

    Returns:
        The result of ``copy_features`` on a cache hit, otherwise the
        result of ``reduce_mem_usage``.
    """
    feature_name = 'time_features'
    cache_file = Path(base_path + 'features/' + feature_name + '.fthr')

    if cache_file.is_file() and not redo:
        cached = load_feather(cache_file)
        wanted_sessions = examples.session_id.unique()
        cached = cached[cached.session_id.isin(wanted_sessions)]
        return copy_features(examples, cached)

    examples, new_cols = create_features(log, examples, preprocessed_path=preprocessed_path)
    examples = reduce_mem_usage(examples, cols=new_cols)

    cache_columns = ['session_id', 'impressions'] + list(new_cols)
    write_feather(examples[cache_columns], cache_file)
    print_col_list(new_cols)
    return examples
def pop_features(base_path, log, examples, hidden=False, min_pop=None, train_only=False, redo=False):
    """Add the popularity feature columns to ``examples``, backed by a feather cache.

    The cache file name starts with ``pop_features`` and gains one suffix
    per active option (``_hidden``, ``_mp<min_pop>``, ``_trainonly``), so
    each option combination has its own file under
    ``base_path + 'features/'``.

    Args:
        base_path: String prefix containing the ``features/`` folder
            (concatenated as-is, so include the trailing separator).
        log: Forwarded unchanged to ``create_features``.
        examples: Object exposing a ``session_id`` column.
        hidden: Forwarded to ``create_features``; adds ``_hidden`` to the
            cache name when truthy.
        min_pop: Forwarded to ``create_features``; adds ``_mp<min_pop>`` to
            the cache name when not ``None``.
        train_only: Forwarded to ``create_features``; adds ``_trainonly``
            to the cache name when truthy.
        redo: When true, recompute even if the cache file exists.

    Returns:
        The result of ``copy_features`` on a cache hit, otherwise the
        result of ``reduce_mem_usage``.
    """
    # Collect the name fragments first; joined they give the cache file stem.
    name_parts = ['pop_features']
    if hidden:
        name_parts.append('_hidden')
    if min_pop is not None:
        name_parts.append('_mp' + str(min_pop))
    if train_only:
        name_parts.append('_trainonly')
    feature_name = ''.join(name_parts)

    cache_file = Path(base_path + 'features/' + feature_name + '.fthr')

    if cache_file.is_file() and not redo:
        cached = load_feather(cache_file)
        wanted_sessions = examples.session_id.unique()
        cached = cached[cached.session_id.isin(wanted_sessions)]
        return copy_features(examples, cached)

    examples, new_cols = create_features(
        log, examples, hidden=hidden, min_pop=min_pop, train_only=train_only
    )
    # No ``cols`` restriction is passed here; the whole frame is handed over.
    examples = reduce_mem_usage(examples)

    cache_columns = ['session_id', 'impressions'] + list(new_cols)
    write_feather(examples[cache_columns], cache_file)
    print_col_list(new_cols)
    return examples
def meta_features(base_path, meta_path, log, examples, latent='d2v', latent_size=16, redo=False):
    """Add the item-metadata feature columns to ``examples``, backed by a feather cache.

    The cache file stem is ``meta_features_all`` when ``latent`` is ``None``
    and ``meta_features_<latent_size>`` otherwise; the file lives under
    ``base_path + 'features/'`` with a ``.fthr`` extension. On a cache hit
    (file exists and ``redo`` is false) the stored rows are limited to the
    session ids in ``examples`` and passed to ``copy_features``. On a miss
    the features come from ``create_features``, go through
    ``reduce_mem_usage`` and the new columns are written back with
    ``session_id`` and ``impressions``.

    Args:
        base_path: String prefix containing the ``features/`` folder
            (concatenated as-is, so include the trailing separator).
        meta_path: Forwarded to ``create_features`` as first argument.
        log: Forwarded unchanged to ``create_features``.
        examples: Object exposing a ``session_id`` column.
        latent: Forwarded to ``create_features`` as ``latent_prefix``;
            ``None`` selects the ``_all`` cache name.
        latent_size: Forwarded to ``create_features`` and used in the cache
            name when ``latent`` is not ``None``.
        redo: When true, recompute even if the cache file exists.

    Returns:
        The result of ``copy_features`` on a cache hit, otherwise the
        result of ``reduce_mem_usage``.
    """
    name = 'meta_features'
    # Identity check: ``== None`` would defer to a custom ``__eq__``.
    if latent is None:
        name += '_all'
    else:
        # NOTE(review): only the size, not the latent prefix itself, ends up
        # in the cache name, so two prefixes of equal size share one file.
        name += '_' + str(latent_size)

    path = Path(base_path + 'features/' + name + '.fthr')

    if path.is_file() and not redo:
        features = load_feather(path)
        features = features[features.session_id.isin(examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(
            meta_path, log, examples, latent_prefix=latent, latent_size=latent_size
        )
        examples = reduce_mem_usage(examples)
        write_feather(examples[['session_id', 'impressions'] + list(cols)], path)
        print_col_list(cols)

    return examples
def crawl_features(base_path, crawl_path, log, examples, redo=False):
    """Add the ``crawl_features`` columns to ``examples``, backed by a feather cache.

    The cache file is ``base_path + 'features/crawl_features.fthr'``. On a
    cache hit (file exists and ``redo`` is false) the stored rows are
    limited to the session ids in ``examples`` and passed to
    ``copy_features``. On a miss the features come from
    ``create_features``, go through ``reduce_mem_usage`` and the new
    columns are written back with ``session_id`` and ``impressions``.

    Args:
        base_path: String prefix containing the ``features/`` folder
            (concatenated as-is, so include the trailing separator).
        crawl_path: Forwarded to ``create_features`` as first argument.
        log: Forwarded unchanged to ``create_features``.
        examples: Object exposing a ``session_id`` column.
        redo: When true, recompute even if the cache file exists.

    Returns:
        The result of ``copy_features`` on a cache hit, otherwise the
        result of ``reduce_mem_usage``.
    """
    feature_name = 'crawl_features'
    cache_file = Path(base_path + 'features/' + feature_name + '.fthr')

    if cache_file.is_file() and not redo:
        cached = load_feather(cache_file)
        wanted_sessions = examples.session_id.unique()
        cached = cached[cached.session_id.isin(wanted_sessions)]
        return copy_features(examples, cached)

    examples, new_cols = create_features(crawl_path, log, examples)
    examples = reduce_mem_usage(examples)

    cache_columns = ['session_id', 'impressions'] + list(new_cols)
    write_feather(examples[cache_columns], cache_file)
    print_col_list(new_cols)
    return examples
def session_features(base_path, log, examples, price_path=None, crawl_path=CRAWL_FOLDER, poi_path=POI_FOLDER, redo=False):
    """Add the ``session_features`` columns to ``examples``, backed by a feather cache.

    The cache file is ``base_path + 'features/session_features.fthr'``. On
    a cache hit (file exists and ``redo`` is false) the stored rows are
    limited to the session ids in ``examples`` and passed to
    ``copy_features``. On a miss the features come from
    ``create_features``, go through ``reduce_mem_usage`` and the new
    columns are written back with ``session_id`` and ``impressions``.

    Args:
        base_path: String prefix containing the ``features/`` folder
            (concatenated as-is, so include the trailing separator).
        log: Forwarded unchanged to ``create_features``.
        examples: Object exposing a ``session_id`` column.
        price_path: Forwarded to ``create_features``; defaults to
            ``base_path`` when ``None``.
        crawl_path: Forwarded to ``create_features``.
        poi_path: Forwarded to ``create_features``.
        redo: When true, recompute even if the cache file exists.

    Returns:
        The result of ``copy_features`` on a cache hit, otherwise the
        result of ``reduce_mem_usage``.
    """
    feature_name = 'session_features'
    if price_path is None:
        price_path = base_path

    cache_file = Path(base_path + 'features/' + feature_name + '.fthr')

    if cache_file.is_file() and not redo:
        cached = load_feather(cache_file)
        wanted_sessions = examples.session_id.unique()
        cached = cached[cached.session_id.isin(wanted_sessions)]
        return copy_features(examples, cached)

    examples, new_cols = create_features(
        log, examples, price_path=price_path, crawl_path=crawl_path, poi_path=poi_path
    )
    examples = reduce_mem_usage(examples, cols=new_cols)

    cache_columns = ['session_id', 'impressions'] + list(new_cols)
    write_feather(examples[cache_columns], cache_file)
    print_col_list(new_cols)
    return examples
def list_context_features(base_path, log, examples, shifts=SHIFTS, redo=False):
    """Add the list-context feature columns to ``examples``, backed by a feather cache.

    The cache file stem is ``'list_context_features_' + str(shifts)``, so
    the textual form of ``shifts`` is part of the file name under
    ``base_path + 'features/'``. On a cache hit (file exists and ``redo``
    is false) the stored rows are limited to the session ids in
    ``examples`` and passed to ``copy_features``. On a miss the features
    come from ``create_features``, go through ``reduce_mem_usage`` and the
    new columns are written back with ``session_id`` and ``impressions``.

    Args:
        base_path: String prefix containing the ``features/`` folder
            (concatenated as-is, so include the trailing separator).
        log: Forwarded unchanged to ``create_features``.
        examples: Object exposing a ``session_id`` column.
        shifts: Forwarded to ``create_features`` and stringified into the
            cache file name.
        redo: When true, recompute even if the cache file exists.

    Returns:
        The result of ``copy_features`` on a cache hit, otherwise the
        result of ``reduce_mem_usage``.
    """
    feature_name = 'list_context_features_' + str(shifts)
    cache_file = Path(base_path + 'features/' + feature_name + '.fthr')

    if cache_file.is_file() and not redo:
        cached = load_feather(cache_file)
        wanted_sessions = examples.session_id.unique()
        cached = cached[cached.session_id.isin(wanted_sessions)]
        return copy_features(examples, cached)

    examples, new_cols = create_features(log, examples, shifts=shifts)
    examples = reduce_mem_usage(examples)

    cache_columns = ['session_id', 'impressions'] + list(new_cols)
    write_feather(examples[cache_columns], cache_file)
    print_col_list(new_cols)
    return examples
def add_last_poi(poi_path, log):
    """Attach a ``last_poi`` column to ``log``, building its cache on first use.

    If ``poi_path + 'last_poi.fthr'`` does not exist, the full log is read
    from ``poi_path + 'data_log.hd5'``, a running "last POI" value is
    computed row by row through ``apply`` and the ``session_id`` /
    ``last_poi`` pair is written to that feather file. The file is then
    loaded, limited to the session ids present in ``log`` and its
    ``last_poi`` values are assigned to ``log``.

    Args:
        poi_path: String prefix (concatenated as-is) holding
            ``data_log.hd5`` and the ``last_poi.fthr`` cache.
        log: Object exposing ``session_id``; it receives the new column.

    Returns:
        ``log`` with the added ``last_poi`` column.
    """

    def _add_last_poi(row, save=None):
        # Field order mirrors the column list handed to ``apply`` below.
        # ``save`` is the state carried between rows; presumably ``apply``
        # supplies a dict for it - TODO confirm.
        session, action, ref, city = row[0], row[1], row[2], row[3]

        # First row or a different session: forget the remembered POI.
        if 'session' not in save or save['session'] != session:
            save['session'] = session
            save['last_poi'] = -1

        # First row or a different city: forget the remembered POI as well.
        if 'city' not in save or save['city'] != city:
            save['city'] = city
            save['last_poi'] = -1

        # A POI action with a non-NaN reference becomes the new last POI.
        if action == POI and not np.isnan(ref):
            save['last_poi'] = ref

        return save['last_poi']

    cache_file = poi_path + 'last_poi.fthr'

    if not Path(cache_file).is_file():
        full_log = load_hdfs(poi_path + 'data_log.hd5')
        full_log['last_poi'] = apply(
            full_log,
            ['session_id', 'action_type', 'reference', 'city'],
            _add_last_poi,
            verbose=100000,
        )
        write_feather(full_log[['session_id', 'last_poi']], cache_file)

    poi_frame = load_feather(cache_file)
    print(len(poi_frame))
    poi_frame = poi_frame[poi_frame.session_id.isin(log.session_id.unique())]
    print(len(poi_frame))
    print(len(log))

    # Assignment goes through ``.values``, i.e. by position: the filtered
    # cache rows must line up one-to-one with the rows of ``log``. The
    # row counts printed above let that be checked by eye.
    log['last_poi'] = poi_frame['last_poi'].values
    del poi_frame
    return log
def create_set(base_path=SET, key='dataset', conf=None, redo=False):
    """Load or build the complete feature set for one dataset.

    The finished set is cached at ``base_path + 'sets/' + key + '.fthr'``.
    If that file exists and ``redo`` is false it is loaded and returned.
    Otherwise ``data_log.hd5`` and ``data_examples.hd5`` are read from
    ``base_path``, the columns ``current_filters`` and ``session_id_pre``
    are dropped from the examples when present, every feature step is
    applied in sequence and the result is written to the cache.

    Args:
        base_path: String prefix (concatenated as-is, so include the
            trailing separator) holding the hd5 inputs and the ``sets/``
            folder.
        key: Stem of the cached set file.
        conf: Mapping of settings, only read when the set is built. Keys
            read: ``path_pop``, ``pop_hidden``, ``min_pop``, ``train_only``,
            ``path_price``, ``min_occurences``, ``price_hidden``,
            ``fillna_mean``, ``path_session``, ``path_crawl``,
            ``path_meta``, ``meta_latent``, ``path_poi``, ``path_latent``.
            ``None`` (the default) is treated as an empty mapping.
        redo: When true, rebuild the set; the flag is also forwarded to
            every feature step.

    Returns:
        The examples frame with all feature columns.

    Raises:
        KeyError: If the set has to be built and ``conf`` lacks one of the
            keys listed above.
    """
    # ``None`` replaces the former ``{}`` default so that no dict object is
    # shared between calls.
    if conf is None:
        conf = {}

    set_path = Path(base_path + 'sets/' + key + '.fthr')

    if set_path.is_file() and not redo:
        print('loaded')
        examples = load_feather(set_path)
        gc.collect()
        return examples

    print('create')
    log = load_hdfs(base_path + 'data_log.hd5')
    examples = load_hdfs(base_path + 'data_examples.hd5')

    # Drop helper columns that are not wanted in the final set.
    for unwanted in ('current_filters', 'session_id_pre'):
        if unwanted in set(examples.columns):
            print(unwanted)
            del examples[unwanted]

    examples = pop_features(conf['path_pop'], log, examples,
                            hidden=conf['pop_hidden'], min_pop=conf['min_pop'],
                            train_only=conf['train_only'], redo=redo)
    examples = price_features(conf['path_price'], log, examples,
                              min_occurences=conf['min_occurences'],
                              hidden=conf['price_hidden'],
                              train_only=conf['train_only'],
                              fillna_mean=conf['fillna_mean'], redo=redo)
    examples = session_features(conf['path_session'], log, examples,
                                crawl_path=conf['path_crawl'], redo=redo)
    examples = crawl_features(base_path, conf['path_crawl'], log, examples, redo=redo)
    examples = geo_features(base_path, conf['path_crawl'], log, examples, redo=redo)
    examples = meta_features(base_path, conf['path_meta'], log, examples,
                             latent=conf['meta_latent'], redo=redo)
    examples = user_features(conf['path_session'], log, examples,
                             crawl_path=conf['path_crawl'],
                             poi_path=conf['path_poi'], redo=redo)
    examples = position_features(base_path, log, examples, redo=redo)
    examples = properties_features(base_path, conf['path_meta'], log, examples, redo=redo)
    # NOTE: a ``latent_features`` step was commented out at this point in
    # the pipeline and stays disabled.
    examples = latent_sim_features(base_path, log, examples,
                                   latent_path=conf['path_latent'], redo=redo)
    examples = combine_features(base_path, log, examples, redo=redo)
    examples = rank_features(base_path, log, examples, redo=redo)
    examples = time_features(base_path, log, examples, redo=redo)
    examples = list_context_features(base_path, log, examples, redo=redo)
    examples = stars_features(base_path, conf['path_meta'], log, examples, redo=redo)
    # NOTE: a ``prediction_features`` step was commented out here as well
    # and stays disabled.

    write_feather(examples, set_path)

    # Release the raw log before handing the set back.
    del log
    gc.collect()

    return examples