def user_agg(si=None):
    '''
    Builds per-user search features and stores them as an artifact.

    The search data is taken from `si`, or loaded with load('search.gl') when
    `si` is None. Three helper columns (SQexists, SQhash, SPexists) are
    assigned onto that object, the rows are grouped by UserID, and the grouped
    result is converted with sframe_to_dict and handed to
    avito2_io.put_artifact under the name 'user_si.pkl'.

    NB: this did not help.

    Nothing is returned; the elapsed time is printed at the end.
    '''
    def _has_query(query):
        # True when the search query text is not the empty string.
        return query != ''

    def _query_bucket(query):
        # Folds the query's hash into one of n_buckets buckets.
        return abs(hash(query)) % n_buckets

    def _has_params(params):
        # True when search params are present at all.
        return params is not None

    def _dedupe(values):
        # Drops repeated values; resulting order is whatever set() yields.
        return list(set(values))

    t0 = datetime.now()
    if si is None:
        si = load('search.gl')
    n_buckets = 1 << 20

    # Helper columns are added to `si` itself, in this order.
    si['SQexists'] = si['SearchQuery'].apply(_has_query)
    si['SQhash'] = si['SearchQuery'].apply(_query_bucket)
    si['SPexists'] = si['SearchParams'].apply(_has_params)

    # Output column name -> aggregator applied within each UserID group.
    aggregations = {
        'pctSQE': agg.AVG('SQexists'),
        'pctSPE': agg.AVG('SPexists'),
        'numSearches': agg.COUNT(),
        'allCat': agg.CONCAT('CategoryID'),
        'allSQ': agg.CONCAT('SQhash'),
    }

    wanted_columns = ['UserID', 'CategoryID', 'SearchParams',
                      'SQexists', 'SPexists', 'SQhash']
    si = si[wanted_columns]
    per_user = si.groupby('UserID', aggregations)

    # Collapse the concatenated lists down to their distinct values.
    for list_column in ('allSQ', 'allCat'):
        per_user[list_column] = per_user[list_column].apply(_dedupe)

    features_by_user = sframe_to_dict('UserID', per_user)
    avito2_io.put_artifact(features_by_user, 'user_si.pkl')

    elapsed = datetime.now() - t0
    print('elapsed time: %s' % elapsed)
def user_agg(si=None):
    '''
    Loads search.gl and aggregates it by UserID to get some features.

    NB: this did not help.

    Args:
        si: search data providing the UserID, CategoryID, SearchQuery and
            SearchParams columns. When None, it is loaded with
            load('search.gl'). The helper columns SQexists, SQhash and
            SPexists are assigned onto the object that is passed in.

    Per-user output fields:
        pctSQE      -- agg.AVG over SQexists (query is not the empty string)
        pctSPE      -- agg.AVG over SPexists (search params are not None)
        numSearches -- agg.COUNT() of rows for the user
        allCat      -- distinct CategoryID values from agg.CONCAT
        allSQ       -- distinct hashed query buckets from agg.CONCAT

    Side effects:
        The grouped result is converted with sframe_to_dict('UserID', ...)
        and passed to avito2_io.put_artifact as 'user_si.pkl'. The elapsed
        time is printed. Nothing is returned.
    '''
    start = datetime.now()
    if si is None:
        si = load('search.gl')

    # Number of hash buckets used for the search-query text.
    D = 2**20
    si['SQexists'] = si['SearchQuery'].apply(lambda s: s != '')
    # NOTE: on Python 3, str hashes are salted per process unless
    # PYTHONHASHSEED is fixed, so these buckets are only comparable
    # within a single run.
    si['SQhash'] = si['SearchQuery'].apply(lambda s: abs(hash(s)) % D)
    si['SPexists'] = si['SearchParams'].apply(lambda d: d is not None)

    f = {'pctSQE': agg.AVG('SQexists'),
         'pctSPE': agg.AVG('SPexists'),
         'numSearches': agg.COUNT(),
         'allCat': agg.CONCAT('CategoryID'),
         'allSQ': agg.CONCAT('SQhash')}
    si = si[['UserID', 'CategoryID', 'SearchParams',
             'SQexists', 'SPexists', 'SQhash']]
    usr = si.groupby('UserID', f)

    # CONCAT keeps repeats; reduce each list to its distinct values.
    usr['allSQ'] = usr['allSQ'].apply(lambda l: list(set(l)))
    usr['allCat'] = usr['allCat'].apply(lambda l: list(set(l)))

    usr_dict = sframe_to_dict('UserID', usr)
    avito2_io.put_artifact(usr_dict, 'user_si.pkl')
    # print() with one parenthesised argument prints the same text on
    # Python 2 and Python 3; the bare print statement fails to parse on 3.
    print('elapsed time: %s' % (datetime.now() - start))
def make_user_dict():
    '''
    Turns user.gl into a dict-of-dicts keyed by UserID and stores it.

    The resulting mapping looks like:
        {UserID: {other_fields: other_values}}

    It is stored through avito2_io.put_artifact under the name
    'user_dict.pkl' and can be read back with avito2_io.get_artifact.
    Nothing is returned; the elapsed time is printed.
    '''
    began = datetime.now()

    users_by_id = sframe_to_dict('UserID', load('user.gl'))
    avito2_io.put_artifact(users_by_id, 'user_dict.pkl')

    elapsed = datetime.now() - began
    print('elapsed time: %s' % elapsed)
def make_user_dict():
    '''
    Loads user.gl and creates a dict-of-dicts {int: dict} like:
        {UserID: {other_fields: other_values}}

    The result is stored through avito2_io.put_artifact under the name
    'user_dict.pkl'. It can be loaded with avito2_io.get_artifact.
    Nothing is returned; the elapsed time is printed.
    '''
    start = datetime.now()
    user = load('user.gl')
    user_dict = sframe_to_dict('UserID', user)
    avito2_io.put_artifact(user_dict, 'user_dict.pkl')
    # print() with one parenthesised argument prints the same text on
    # Python 2 and Python 3; the bare print statement fails to parse on 3.
    print('elapsed time: %s' % (datetime.now() - start))
for (k, line) in enumerate(reader): user_id = int(line['UserID']) if user_id in out: vis_ct = out[user_id].setdefault('vis_ct', 0) out[user_id]['vis_ct'] = vis_ct + 1 if (k + 1) % 1000000 == 0: print 'read %d lines from VisitsStream.tsv.gz' % (k + 1) for k in out: out[k].setdefault('vis_ct', 0) return out if __name__ == '__main__': start = datetime.now() print 'running at: ' + str(start) parser = argparse.ArgumentParser(description = 'Collects counts of UserID from several data files.') parser.add_argument('--min_ct', type=int, default=None) parser.add_argument('--max_lines', type=int, default=None) args = parser.parse_args() user_counts = user_counts(args.min_ct) avito2_io.put_artifact(user_counts, 'user_counts.pkl') print 'elapsed time: %s' % (datetime.now() - start)
''' out = {} with open(avito2_io.ADS_INFO) as f: reader = csv.DictReader(f, delimiter='\t') for (k, line) in enumerate(reader): if k == maxlines: break if (int(line['IsContext']) == 1): if line['Price'] == '': line['Price'] = -1 if line['CategoryID'] == '': line['CategoryID'] = -1 values = [int(line['CategoryID']), int(float(line['Price'])), line['Title'], line['Params']] out[int(line['AdID'])] = values if (k + 1) % 1000000 == 0: print 'read %d lines' % (k + 1) return out if __name__=='__main__': start = datetime.now() print 'parsing AdsInfo.tsv' out = parse_ads() print 'saving context ads to ARTIFACTS/' avito2_io.put_artifact(out, 'context_ads.pkl') print 'Finished, elapsed time: %s' % (datetime.now() - start)
(k + 1)) for k in out: out[k].setdefault('ph_ct', 0) with gzip.open(avito2_io.VISIT) as f_vis: reader = csv.DictReader(f_vis, delimiter='\t') for (k, line) in enumerate(reader): user_id = int(line['UserID']) if user_id in out: vis_ct = out[user_id].setdefault('vis_ct', 0) out[user_id]['vis_ct'] = vis_ct + 1 if (k + 1) % 1000000 == 0: print('read %d lines from VisitsStream.tsv.gz' % (k + 1)) for k in out: out[k].setdefault('vis_ct', 0) return out if __name__ == '__main__': start = datetime.now() print('running at: ' + str(start)) parser = argparse.ArgumentParser( description='Collects counts of UserID from several data files.') parser.add_argument('--min_ct', type=int, default=None) parser.add_argument('--max_lines', type=int, default=None) args = parser.parse_args() user_counts = user_counts(args.min_ct) avito2_io.put_artifact(user_counts, 'user_counts.pkl') print('elapsed time: %s' % (datetime.now() - start))