Esempio n. 1
0
def latent_sim_features(base_path,
                        log,
                        examples,
                        latent_path=None,
                        keys=KEYS,
                        sizes=SIZES,
                        redo=False):
    """Add the 'latent_sim_features' columns to ``examples``, with a feather cache.

    The cache lives at ``base_path + 'features/latent_sim_features.fthr'``.
    If that file exists and ``redo`` is false, the cached rows whose
    ``session_id`` occurs in ``examples`` are handed to ``copy_features``.
    Otherwise the features are recomputed with ``create_features``, passed
    through ``reduce_mem_usage`` and written back to the cache file.

    Args:
        base_path: Directory prefix (string) under which ``features/`` lives.
            It is concatenated as-is, so it needs its own trailing separator.
        log: Interaction log; only forwarded to ``create_features``.
        examples: Frame with at least ``session_id`` and ``impressions``.
        latent_path: Forwarded to ``create_features``; falls back to
            ``base_path`` when ``None``.
        keys: Forwarded to ``create_features``.
        sizes: Forwarded to ``create_features``.
        redo: Force a recomputation even if the cache file exists.

    Returns:
        ``examples`` with the feature columns attached.
    """
    feature_set = 'latent_sim_features'
    latent_dir = base_path if latent_path is None else latent_path

    cache_file = Path(base_path + 'features/' + feature_set + '.fthr')

    if not cache_file.is_file() or redo:
        # Cache miss (or forced rebuild): compute, shrink dtypes, persist.
        examples, new_cols = create_features(log,
                                             examples,
                                             latent_path=latent_dir,
                                             keys=keys,
                                             sizes=sizes)
        examples = reduce_mem_usage(examples, cols=new_cols)
        to_store = examples[['session_id', 'impressions'] + list(new_cols)]
        write_feather(to_store, cache_file)
        print_col_list(new_cols)
        return examples

    # Cache hit: keep only the sessions present in the requested examples.
    cached = load_feather(cache_file)
    session_mask = cached.session_id.isin(examples.session_id.unique())
    cached = cached[session_mask]
    return copy_features(examples, cached)
Esempio n. 2
0
File: time.py Progetto: rn5l/rsc19
def time_features(base_path,
                  log,
                  examples,
                  preprocessed_path=PREPROCESSED_FOLDER,
                  redo=False):
    """Add the 'time_features' columns to ``examples``, with a feather cache.

    The cache lives at ``base_path + 'features/time_features.fthr'``. When it
    exists and ``redo`` is false, the cached rows for the sessions found in
    ``examples`` are handed to ``copy_features``. Otherwise the features are
    rebuilt via ``create_features`` and the cache file is (re)written.

    Args:
        base_path: Directory prefix (string, concatenated as-is).
        log: Interaction log; only forwarded to ``create_features``.
        examples: Frame with at least ``session_id`` and ``impressions``.
        preprocessed_path: Forwarded to ``create_features`` on a rebuild.
        redo: Force a recomputation even if the cache file exists.

    Returns:
        ``examples`` with the feature columns attached.
    """
    feature_set = 'time_features'
    cache_file = Path(base_path + 'features/' + feature_set + '.fthr')

    if not cache_file.is_file() or redo:
        # Cache miss (or forced rebuild): compute, shrink dtypes, persist.
        examples, new_cols = create_features(
            log, examples, preprocessed_path=preprocessed_path)
        examples = reduce_mem_usage(examples, cols=new_cols)
        to_store = examples[['session_id', 'impressions'] + list(new_cols)]
        write_feather(to_store, cache_file)
        print_col_list(new_cols)
        return examples

    # Cache hit: restrict the stored features to the requested sessions.
    cached = load_feather(cache_file)
    session_mask = cached.session_id.isin(examples.session_id.unique())
    cached = cached[session_mask]
    return copy_features(examples, cached)
Esempio n. 3
0
def pop_features(base_path,
                 log,
                 examples,
                 hidden=False,
                 min_pop=None,
                 train_only=False,
                 redo=False):
    """Add the popularity feature columns to ``examples``, with a feather cache.

    The cache file name encodes the configuration: it starts with
    ``pop_features`` and gets ``_hidden``, ``_mp<min_pop>`` and ``_trainonly``
    appended (in that order) for each option that is set. The file is looked
    up under ``base_path + 'features/'``.

    Args:
        base_path: Directory prefix (string, concatenated as-is).
        log: Interaction log; only forwarded to ``create_features``.
        examples: Frame with at least ``session_id`` and ``impressions``.
        hidden: Forwarded to ``create_features``; adds ``_hidden`` to the name.
        min_pop: Forwarded to ``create_features``; adds ``_mp<value>`` to the
            name unless it is ``None``.
        train_only: Forwarded to ``create_features``; adds ``_trainonly``.
        redo: Force a recomputation even if the cache file exists.

    Returns:
        ``examples`` with the feature columns attached.
    """
    suffixes = []
    if hidden:
        suffixes.append('_hidden')
    if min_pop is not None:
        suffixes.append('_mp' + str(min_pop))
    if train_only:
        suffixes.append('_trainonly')
    feature_set = 'pop_features' + ''.join(suffixes)

    cache_file = Path(base_path + 'features/' + feature_set + '.fthr')

    if not cache_file.is_file() or redo:
        # Cache miss (or forced rebuild): compute, shrink dtypes, persist.
        examples, new_cols = create_features(log,
                                             examples,
                                             hidden=hidden,
                                             min_pop=min_pop,
                                             train_only=train_only)
        examples = reduce_mem_usage(examples)
        to_store = examples[['session_id', 'impressions'] + list(new_cols)]
        write_feather(to_store, cache_file)
        print_col_list(new_cols)
        return examples

    # Cache hit: restrict the stored features to the requested sessions.
    cached = load_feather(cache_file)
    session_mask = cached.session_id.isin(examples.session_id.unique())
    cached = cached[session_mask]
    return copy_features(examples, cached)
Esempio n. 4
0
def meta_features(base_path,
                  meta_path,
                  log,
                  examples,
                  latent='d2v',
                  latent_size=16,
                  redo=False):
    """Add the item-metadata feature columns to ``examples``, with a feather cache.

    The cache file is ``base_path + 'features/meta_features_all.fthr'`` when
    ``latent`` is ``None`` and ``.../meta_features_<latent_size>.fthr``
    otherwise. If the file exists and ``redo`` is false, the cached rows for
    the sessions found in ``examples`` are handed to ``copy_features``;
    otherwise the features are rebuilt via ``create_features`` and the cache
    file is (re)written.

    Args:
        base_path: Directory prefix (string, concatenated as-is).
        meta_path: Forwarded to ``create_features`` as first argument.
        log: Interaction log; only forwarded to ``create_features``.
        examples: Frame with at least ``session_id`` and ``impressions``.
        latent: Forwarded to ``create_features`` as ``latent_prefix``;
            ``None`` selects the ``_all`` cache name.
        latent_size: Forwarded to ``create_features``; part of the cache name
            when ``latent`` is not ``None``.
        redo: Force a recomputation even if the cache file exists.

    Returns:
        ``examples`` with the feature columns attached.
    """
    name = 'meta_features'
    # Identity check: None is a singleton, and `==` could be overridden by
    # the argument's type.
    if latent is None:
        name += '_all'
    else:
        # NOTE(review): only the size is encoded, not the latent prefix, so
        # two different prefixes with the same size share one cache file —
        # confirm this is intended.
        name += '_' + str(latent_size)

    path = Path(base_path + 'features/' + name + '.fthr')
    if path.is_file() and not redo:
        # Cache hit: restrict the stored features to the requested sessions.
        features = load_feather(path)
        features = features[features.session_id.isin(
            examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        # Cache miss (or forced rebuild): compute, shrink dtypes, persist.
        examples, cols = create_features(meta_path,
                                         log,
                                         examples,
                                         latent_prefix=latent,
                                         latent_size=latent_size)
        examples = reduce_mem_usage(examples)
        write_feather(examples[['session_id', 'impressions'] + list(cols)],
                      path)
        print_col_list(cols)

    return examples
Esempio n. 5
0
def crawl_features(base_path, crawl_path, log, examples, redo=False):
    """Add the 'crawl_features' columns to ``examples``, with a feather cache.

    The cache lives at ``base_path + 'features/crawl_features.fthr'``. When it
    exists and ``redo`` is false, the cached rows for the sessions found in
    ``examples`` are handed to ``copy_features``. Otherwise the features are
    rebuilt via ``create_features(crawl_path, log, examples)`` and the cache
    file is (re)written.

    Args:
        base_path: Directory prefix (string, concatenated as-is).
        crawl_path: Forwarded to ``create_features`` as first argument.
        log: Interaction log; only forwarded to ``create_features``.
        examples: Frame with at least ``session_id`` and ``impressions``.
        redo: Force a recomputation even if the cache file exists.

    Returns:
        ``examples`` with the feature columns attached.
    """
    feature_set = 'crawl_features'
    cache_file = Path(base_path + 'features/' + feature_set + '.fthr')

    if not cache_file.is_file() or redo:
        # Cache miss (or forced rebuild): compute, shrink dtypes, persist.
        examples, new_cols = create_features(crawl_path, log, examples)
        examples = reduce_mem_usage(examples)
        to_store = examples[['session_id', 'impressions'] + list(new_cols)]
        write_feather(to_store, cache_file)
        print_col_list(new_cols)
        return examples

    # Cache hit: restrict the stored features to the requested sessions.
    cached = load_feather(cache_file)
    session_mask = cached.session_id.isin(examples.session_id.unique())
    cached = cached[session_mask]
    return copy_features(examples, cached)
Esempio n. 6
0
def session_features(base_path, log, examples, price_path=None, crawl_path=CRAWL_FOLDER, poi_path=POI_FOLDER, redo=False):
    """Add the 'session_features' columns to ``examples``, with a feather cache.

    The cache lives at ``base_path + 'features/session_features.fthr'``. When
    it exists and ``redo`` is false, the cached rows for the sessions found in
    ``examples`` are handed to ``copy_features``. Otherwise the features are
    rebuilt via ``create_features`` and the cache file is (re)written.

    Args:
        base_path: Directory prefix (string, concatenated as-is).
        log: Interaction log; only forwarded to ``create_features``.
        examples: Frame with at least ``session_id`` and ``impressions``.
        price_path: Forwarded to ``create_features``; falls back to
            ``base_path`` when ``None``.
        crawl_path: Forwarded to ``create_features``.
        poi_path: Forwarded to ``create_features``.
        redo: Force a recomputation even if the cache file exists.

    Returns:
        ``examples`` with the feature columns attached.
    """
    feature_set = 'session_features'
    price_dir = base_path if price_path is None else price_path

    cache_file = Path(base_path + 'features/' + feature_set + '.fthr')

    if not cache_file.is_file() or redo:
        # Cache miss (or forced rebuild): compute, shrink dtypes, persist.
        examples, new_cols = create_features(log,
                                             examples,
                                             price_path=price_dir,
                                             crawl_path=crawl_path,
                                             poi_path=poi_path)
        examples = reduce_mem_usage(examples, cols=new_cols)
        to_store = examples[['session_id', 'impressions'] + list(new_cols)]
        write_feather(to_store, cache_file)
        print_col_list(new_cols)
        return examples

    # Cache hit: restrict the stored features to the requested sessions.
    cached = load_feather(cache_file)
    session_mask = cached.session_id.isin(examples.session_id.unique())
    cached = cached[session_mask]
    return copy_features(examples, cached)
Esempio n. 7
0
def list_context_features(base_path, log, examples, shifts=SHIFTS, redo=False):
    """Add the list-context feature columns to ``examples``, with a feather cache.

    The cache file name embeds ``str(shifts)``:
    ``base_path + 'features/list_context_features_<shifts>.fthr'``. When the
    file exists and ``redo`` is false, the cached rows for the sessions found
    in ``examples`` are handed to ``copy_features``. Otherwise the features
    are rebuilt via ``create_features`` and the cache file is (re)written.

    Args:
        base_path: Directory prefix (string, concatenated as-is).
        log: Interaction log; only forwarded to ``create_features``.
        examples: Frame with at least ``session_id`` and ``impressions``.
        shifts: Forwarded to ``create_features``; also part of the cache name.
        redo: Force a recomputation even if the cache file exists.

    Returns:
        ``examples`` with the feature columns attached.
    """
    feature_set = 'list_context_features_' + str(shifts)
    cache_file = Path(base_path + 'features/' + feature_set + '.fthr')

    if not cache_file.is_file() or redo:
        # Cache miss (or forced rebuild): compute, shrink dtypes, persist.
        examples, new_cols = create_features(log, examples, shifts=shifts)
        examples = reduce_mem_usage(examples)
        to_store = examples[['session_id', 'impressions'] + list(new_cols)]
        write_feather(to_store, cache_file)
        print_col_list(new_cols)
        return examples

    # Cache hit: restrict the stored features to the requested sessions.
    cached = load_feather(cache_file)
    session_mask = cached.session_id.isin(examples.session_id.unique())
    cached = cached[session_mask]
    return copy_features(examples, cached)
Esempio n. 8
0
def add_last_poi( poi_path, log ):
    """Attach a ``last_poi`` column to ``log`` using a cached per-row lookup.

    If ``poi_path + 'last_poi.fthr'`` does not exist yet, it is built from
    ``poi_path + 'data_log.hd5'``: rows are walked with the project ``apply``
    helper and, for each row, the most recent non-NaN ``reference`` of a
    ``POI`` action within the current session/city is recorded (``-1`` until
    one is seen). The cached frame is then filtered to the sessions present
    in ``log`` and its ``last_poi`` values are assigned to ``log``.

    NOTE(review): the final assignment uses ``.values``, i.e. it is purely
    positional. It therefore relies on the filtered cache having the same
    number of rows, in the same order, as ``log`` — confirm with the caller.

    Args:
        poi_path: Directory prefix (string, concatenated as-is) holding
            ``data_log.hd5`` and the ``last_poi.fthr`` cache.
        log: Frame with a ``session_id`` column; modified in place.

    Returns:
        The same ``log`` object with the ``last_poi`` column set.
    """

    def _add_last_poi(row, save=None):
        # `save` is the mutable state carried from row to row; it is expected
        # to be supplied by the `apply` helper (presumably a dict — verify).
        session = row[0]
        action = row[1]
        ref = row[2]
        city = row[3]

        if 'session' not in save or save['session'] != session:
            # first row or a new session: forget the previous POI
            save['session'] = session
            save['last_poi'] = -1

        if 'city' not in save or save['city'] != city:
            # first row or a new city: forget the previous POI
            save['city'] = city
            save['last_poi'] = -1

        if action == POI and not np.isnan( ref ):
            save['last_poi'] = ref

        return save['last_poi']

    cache_file = poi_path + 'last_poi.fthr'

    if not Path( cache_file ).is_file():
        # Build the cache once from the full log.
        full_log = load_hdfs( poi_path + 'data_log.hd5' )
        tracked_cols = ['session_id','action_type','reference','city']
        full_log['last_poi'] = apply(full_log, tracked_cols, _add_last_poi, verbose=100000)
        write_feather( full_log[['session_id','last_poi']], cache_file )

    poi_frame = load_feather( cache_file )
    print( len(poi_frame) )
    session_mask = poi_frame.session_id.isin( log.session_id.unique() )
    poi_frame = poi_frame[session_mask]

    # Row counts are printed so a mismatch with `log` is visible.
    print( len(poi_frame) )
    print( len(log) )
    log['last_poi'] = poi_frame['last_poi'].values
    del poi_frame

    return log
Esempio n. 9
0
def create_set(base_path=SET, key='dataset', conf=None, redo=False):
    """Load or build the full example set with all feature groups attached.

    The assembled set is cached at ``base_path + 'sets/<key>.fthr'``. If that
    file exists and ``redo`` is false it is loaded and returned. Otherwise
    ``data_log.hd5`` and ``data_examples.hd5`` are read from ``base_path``,
    every feature step below is applied in order, and the result is written
    to the cache file.

    Args:
        base_path: Directory prefix (string, concatenated as-is).
        key: Name of the cached set (file stem under ``sets/``).
        conf: Mapping with the settings read on a rebuild: ``path_pop``,
            ``pop_hidden``, ``min_pop``, ``train_only``, ``path_price``,
            ``min_occurences``, ``price_hidden``, ``fillna_mean``,
            ``path_session``, ``path_crawl``, ``path_meta``, ``meta_latent``,
            ``path_poi`` and ``path_latent``. Defaults to an empty mapping,
            which is only sufficient when the cached set is loaded.
        redo: Rebuild the set even if cached; also forwarded to every
            feature step.

    Returns:
        The example frame with all feature columns.

    Raises:
        KeyError: On a rebuild, if ``conf`` is a dict missing a required key.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if conf is None:
        conf = {}

    name = key

    path = Path(base_path + 'sets/' + name + '.fthr')
    if path.is_file() and not redo:
        print('loaded')
        examples = load_feather(path)
        gc.collect()
    else:
        print('create')
        log = load_hdfs(base_path + 'data_log.hd5')
        examples = load_hdfs(base_path + 'data_examples.hd5')

        # Drop helper columns that must not end up in the final set.
        for stale_col in ('current_filters', 'session_id_pre'):
            if stale_col in examples.columns:
                print(stale_col)
                del examples[stale_col]

        examples = pop_features(conf['path_pop'],
                                log,
                                examples,
                                hidden=conf['pop_hidden'],
                                min_pop=conf['min_pop'],
                                train_only=conf['train_only'],
                                redo=redo)
        examples = price_features(conf['path_price'],
                                  log,
                                  examples,
                                  min_occurences=conf['min_occurences'],
                                  hidden=conf['price_hidden'],
                                  train_only=conf['train_only'],
                                  fillna_mean=conf['fillna_mean'],
                                  redo=redo)
        examples = session_features(conf['path_session'],
                                    log,
                                    examples,
                                    crawl_path=conf['path_crawl'],
                                    redo=redo)
        examples = crawl_features(base_path,
                                  conf['path_crawl'],
                                  log,
                                  examples,
                                  redo=redo)
        examples = geo_features(base_path,
                                conf['path_crawl'],
                                log,
                                examples,
                                redo=redo)
        examples = meta_features(base_path,
                                 conf['path_meta'],
                                 log,
                                 examples,
                                 latent=conf['meta_latent'],
                                 redo=redo)
        examples = user_features(conf['path_session'],
                                 log,
                                 examples,
                                 crawl_path=conf['path_crawl'],
                                 poi_path=conf['path_poi'],
                                 redo=redo)
        examples = position_features(base_path, log, examples, redo=redo)
        examples = properties_features(base_path,
                                       conf['path_meta'],
                                       log,
                                       examples,
                                       redo=redo)
        # NOTE: the latent_features step is intentionally disabled here.
        examples = latent_sim_features(base_path,
                                       log,
                                       examples,
                                       latent_path=conf['path_latent'],
                                       redo=redo)
        examples = combine_features(base_path, log, examples, redo=redo)
        examples = rank_features(base_path, log, examples, redo=redo)
        examples = time_features(base_path, log, examples, redo=redo)
        examples = list_context_features(base_path, log, examples, redo=redo)
        examples = stars_features(base_path,
                                  conf['path_meta'],
                                  log,
                                  examples,
                                  redo=redo)
        # NOTE: the prediction_features step is intentionally disabled here.

        write_feather(examples, path)

        # Release the raw log before returning to keep peak memory down.
        del log
        gc.collect()

    return examples