from datetime import datetime, timedelta
from pathlib import Path
from random import shuffle

import numpy as np
import pandas as pd


def latent_sim_features(base_path, log, examples, latent_path=None, keys=KEYS, sizes=SIZES, redo=False):
    # Load cached features from feather if present; otherwise create, shrink and cache them.
    name = 'latent_sim_features'
    if latent_path is None:
        latent_path = base_path
    path = Path(base_path + 'features/' + name + '.fthr')
    if path.is_file() and not redo:
        features = load_feather(path)
        features = features[features.session_id.isin(examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(log, examples, latent_path=latent_path, keys=keys, sizes=sizes)
        examples = reduce_mem_usage(examples, cols=cols)
        write_feather(examples[['session_id', 'impressions'] + list(cols)], path)
        #examples[['session_id','impressions','label','step'] + list(cols)].to_csv( base_path + 'features/' + name + '.csv' )
        print_col_list(cols)
    return examples

def time_features(base_path, log, examples, preprocessed_path=PREPROCESSED_FOLDER, redo=False):
    name = 'time_features'
    path = Path(base_path + 'features/' + name + '.fthr')
    if path.is_file() and not redo:
        features = load_feather(path)
        features = features[features.session_id.isin(examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(log, examples, preprocessed_path=preprocessed_path)
        examples = reduce_mem_usage(examples, cols=cols)
        write_feather(examples[['session_id', 'impressions'] + list(cols)], path)
        #examples[['session_id','impressions','label','step'] + list(cols)].to_csv( base_path + 'features/' + name + '.csv' )
        print_col_list(cols)
    return examples

def pop_features(base_path, log, examples, hidden=False, min_pop=None, train_only=False, redo=False):
    name = 'pop_features'
    if hidden:
        name += '_hidden'
    if min_pop is not None:
        name += '_mp' + str(min_pop)
    if train_only:
        name += '_trainonly'
    path = Path(base_path + 'features/' + name + '.fthr')
    if path.is_file() and not redo:
        features = load_feather(path)
        features = features[features.session_id.isin(examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(log, examples, hidden=hidden, min_pop=min_pop, train_only=train_only)
        examples = reduce_mem_usage(examples)
        write_feather(examples[['session_id', 'impressions'] + list(cols)], path)
        #examples[['session_id','impressions','prices','label'] + list(cols)].to_csv( base_path + 'features/' + name + '.csv' )
        print_col_list(cols)
    return examples

def meta_features(base_path, meta_path, log, examples, latent='d2v', latent_size=16, redo=False):
    name = 'meta_features'
    if latent is None:
        name += '_all'
    else:
        name += '_' + str(latent_size)
    path = Path(base_path + 'features/' + name + '.fthr')
    if path.is_file() and not redo:
        features = load_feather(path)
        features = features[features.session_id.isin(examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(meta_path, log, examples, latent_prefix=latent, latent_size=latent_size)
        examples = reduce_mem_usage(examples)
        write_feather(examples[['session_id', 'impressions'] + list(cols)], path)
        #examples[['session_id','impressions','prices','label'] + list(cols)].to_csv( base_path + 'features/' + name + '.csv' )
        print_col_list(cols)
    return examples

def crawl_features(base_path, crawl_path, log, examples, redo=False):
    name = 'crawl_features'
    path = Path(base_path + 'features/' + name + '.fthr')
    if path.is_file() and not redo:
        features = load_feather(path)
        features = features[features.session_id.isin(examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(crawl_path, log, examples)
        examples = reduce_mem_usage(examples)
        write_feather(examples[['session_id', 'impressions'] + list(cols)], path)
        #examples[['session_id','impressions','prices','label'] + list(cols)].to_csv( base_path + 'features/' + name + '.csv' )
        print_col_list(cols)
    return examples

def session_features(base_path, log, examples, price_path=None, crawl_path=CRAWL_FOLDER, poi_path=POI_FOLDER, redo=False):
    name = 'session_features'
    if price_path is None:
        price_path = base_path
    path = Path(base_path + 'features/' + name + '.fthr')
    if path.is_file() and not redo:
        features = load_feather(path)
        features = features[features.session_id.isin(examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(log, examples, price_path=price_path, crawl_path=crawl_path, poi_path=poi_path)
        examples = reduce_mem_usage(examples, cols=cols)
        write_feather(examples[['session_id', 'impressions'] + list(cols)], path)
        #examples[['session_id','impressions','label','step'] + list(cols)].to_csv( base_path + 'features/' + name + '.csv' )
        print_col_list(cols)
    return examples

def list_context_features(base_path, log, examples, shifts=SHIFTS, redo=False):
    name = 'list_context_features_' + str(shifts)
    path = Path(base_path + 'features/' + name + '.fthr')
    if path.is_file() and not redo:
        features = load_feather(path)
        features = features[features.session_id.isin(examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(log, examples, shifts=shifts)
        examples = reduce_mem_usage(examples)
        write_feather(examples[['session_id', 'impressions'] + list(cols)], path)
        #examples[['session_id','impressions','prices','label','position'] + list(cols)].to_csv( base_path + 'features/' + name + '.csv' )
        print_col_list(cols)
    return examples

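# Usage sketch, not part of the original pipeline: every builder above follows
# the same load-or-build cache pattern and returns the enriched examples frame,
# so a full feature set can be assembled by chaining them. This driver function
# and the call order are assumptions for illustration only.
def build_all_features(base_path, meta_path, crawl_path, log, examples, redo=False):
    examples = session_features(base_path, log, examples, redo=redo)
    examples = time_features(base_path, log, examples, redo=redo)
    examples = pop_features(base_path, log, examples, redo=redo)
    examples = meta_features(base_path, meta_path, log, examples, redo=redo)
    examples = crawl_features(base_path, crawl_path, log, examples, redo=redo)
    examples = latent_sim_features(base_path, log, examples, redo=redo)
    examples = list_context_features(base_path, log, examples, redo=redo)
    return examples
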
def split_competition(data):
    if DAYS_TRAIN_COMPETITION is not None:
        # restrict training to the last DAYS_TRAIN_COMPETITION days,
        # or to a random session sample of equal size if RANDOM_TRAIN is set
        maxtr = datetime.fromtimestamp(data[data.train == 1].timestamp.max())
        mintr = datetime.fromtimestamp(data[data.train == 1].timestamp.min())
        start = maxtr - timedelta(days=DAYS_TRAIN_COMPETITION)
        data['mintimestamp'] = data.groupby('user_id').timestamp.transform(min)
        if RANDOM_TRAIN:
            keep = (data['mintimestamp'] >= start.timestamp()) & (data.train == 1)
            num_sess = data[keep].session_id.nunique()
            sess = list(data[data.train == 1].session_id.unique())
            shuffle(sess)
            keep = sess[:num_sess]
            keep = (data.train == 0) | data.session_id.isin(keep)
            data = data[keep]
        else:
            keep = (data['mintimestamp'] >= start.timestamp()) & (data.train == 1)
            keep = keep | (data.train == 0)
            data = data[keep]
        mintr = data[data.train == 1].timestamp.min()
        minva = data[data.train == 0].timestamp.min()
        maxva = data[data.train == 0].timestamp.max()
        print(datetime.fromtimestamp(mintr))
        print(datetime.fromtimestamp(minva))
        print(datetime.fromtimestamp(maxva))
        del data['mintimestamp']
    # mark the unlabeled test clickouts as hidden
    data['hidden'] = 0
    hide_test = data.reference.isnull() & (data.action_type == CLICK)
    data.loc[hide_test, 'reference'] = np.nan
    data.loc[hide_test, 'hidden'] = 1
    # hide the last clickout of every training user as the training target
    hide_train = data[(data.train == 1) & (data.action_type == CLICK)].copy()  # filter clickout
    hide_train = hide_train.drop_duplicates('user_id', keep='last')
    data.loc[hide_train.index, 'reference'] = np.nan
    data.loc[hide_train.index, 'hidden'] = 1
    # exclude every event that happens after the hidden clickout of a session
    tmp = pd.DataFrame()
    tmp['maxstamp'] = data[data.hidden == 1].groupby('session_id').timestamp.max()
    data = data.merge(tmp, right_index=True, left_on='session_id', how='left')
    data['maxstamp'] = data['maxstamp'].fillna(data.timestamp.max())
    data['exclude'] = 0
    data.loc[data.timestamp > data.maxstamp, 'exclude'] = 1
    del data['maxstamp'], tmp
    data = reduce_mem_usage(data)
    write_hdfs(data, TARGET + 'data_log.hd5')
    #data.to_csv( TARGET + 'data_log.csv', index=False )
    data[data.train == 0].to_csv(TARGET + 'data_log_test.csv')
    examples = expand_and_label(data)
    write_hdfs(examples, TARGET + 'data_examples.hd5')

def split_sample(data):
    # build a local train/test split from the labeled part of the log,
    # holding out the last DAYS_TEST days for evaluation
    data = data[data.train == 1].copy()
    data = reduce_mem_usage(data)
    maxtr = datetime.fromtimestamp(data.timestamp.max())
    mintr = datetime.fromtimestamp(data.timestamp.min())
    minva = maxtr - timedelta(days=DAYS_TEST)
    if DAYS_TRAIN is not None:
        mintr = maxtr - timedelta(days=DAYS_TEST + DAYS_TRAIN)
    print(mintr)
    print(maxtr)
    print(minva)
    data['mintimestamp'] = data.groupby('user_id').timestamp.transform(min)
    data['train'] = (data['mintimestamp'] >= mintr.timestamp()).astype(int)
    if RANDOM_TRAIN:
        # count the sessions in the time-based window, then draw a random
        # sample of the same size instead
        data.loc[data['mintimestamp'] >= minva.timestamp(), 'train'] = 0
        num_sess = data[data.train == 1].session_id.nunique()
        data['train'] = 1
        data.loc[data['mintimestamp'] >= minva.timestamp(), 'train'] = 0
        sess = list(data[data.train == 1].session_id.unique())
        shuffle(sess)
        keep = sess[:num_sess]
        keep = (data.train == 0) | data.session_id.isin(keep)
        data = data[keep]
    else:
        data = data[data.train == 1]
        data.loc[data['mintimestamp'] >= minva.timestamp(), 'train'] = 0
    print(data[['session_id', 'timestamp', 'mintimestamp', 'train']])
    mintr = data[data.train == 1].timestamp.min()
    minva = data[data.train == 0].timestamp.min()
    maxva = data[data.train == 0].timestamp.max()
    print(datetime.fromtimestamp(mintr))
    print(datetime.fromtimestamp(minva))
    print(datetime.fromtimestamp(maxva))
    print(len(data[data.train == 1]))
    print(len(data[data.train == 0]))
    data = data.reset_index(drop=True)
    del data['mintimestamp']
    #print( len( set(test.session_id.unique()) & set(train.session_id.unique()) ) )
    data['hidden'] = 0
    data['exclude'] = 0
    examples_log = data[data.action_type == CLICK].copy()  # filter clickout
    examples_log = examples_log.drop_duplicates('user_id', keep='last')
    truth = examples_log[examples_log.train == 0]
    # hide all last clickouts
    data.loc[examples_log.index.values, 'reference'] = np.nan
    data.loc[examples_log.index.values, 'hidden'] = 1
    print('hidden test sum ', data[(data.hidden == 1) & (data.train == 0)].hidden.sum())
    # exclude every event that happens after the hidden clickout of a session
    tmp = pd.DataFrame()
    tmp['maxstamp'] = data[data.hidden == 1].groupby('session_id').timestamp.max()
    data = data.merge(tmp, right_index=True, left_on='session_id', how='left')
    data['maxstamp'] = data['maxstamp'].fillna(data.timestamp.max())
    data.loc[data.timestamp > data.maxstamp, 'exclude'] = 1
    del data['maxstamp'], tmp
    print('hidden test sum ', data[(data.hidden == 1) & (data.train == 0)].hidden.sum())
    examples = expand_and_label(data)
    # hide test completely
    data.loc[examples_log[examples_log.train == 0].index.values, 'item_id'] = np.nan
    data.loc[examples_log[examples_log.train == 0].index.values, 'price_session'] = np.nan
    data = reduce_mem_usage(data)
    write_hdfs(data, TARGET + 'data_log.hd5')
    #data.to_csv( TARGET + 'data_log.csv' )
    data[data.train == 0].to_csv(TARGET + 'data_log_test.csv')
    write_hdfs(examples, TARGET + 'data_examples.hd5')
    #examples.to_csv( TARGET + 'data_examples.csv', index=False )
    truth.to_csv(TARGET + 'truth.csv', index=False)
    with open(TARGET + 'size.txt', 'w') as out:
        out.write('train_size: {}, test_size: {}'.format(DAYS_TRAIN, DAYS_TEST))
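
# Minimal toy sketch (assumption, for illustration only) of the labeling idea
# shared by both split functions above: the last clickout of every user is
# blanked out and flagged as hidden, so it can serve as the prediction target.
# The frame, its values, and this helper name are made up; only the CLICK
# constant is reused, and the rows are assumed to be sorted by timestamp.
def _demo_hide_last_clickout():
    toy = pd.DataFrame({
        'user_id':     [1, 1, 1, 2, 2],
        'session_id':  [10, 10, 10, 20, 20],
        'timestamp':   [1, 2, 3, 1, 2],
        'action_type': [CLICK, 'other', CLICK, 'other', CLICK],
        'reference':   [100.0, 101.0, 102.0, 103.0, 104.0],
    })
    toy['hidden'] = 0
    # keep='last' leaves exactly one clickout per user: the most recent one
    last = toy[toy.action_type == CLICK].drop_duplicates('user_id', keep='last')
    toy.loc[last.index, 'reference'] = np.nan
    toy.loc[last.index, 'hidden'] = 1
    print(toy)  # rows at index 2 and 4 now have reference=NaN and hidden=1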