def join(tss, si, delimiter=','):
    '''
    Generator performing a rolling merge-join of a train/test stream (tss)
    against the SearchInfo stream (si), enriching each row with ad-context,
    location and category features from pickled lookup maps.

    Both inputs must be gzipped, delimiter-separated files with a header,
    and both must be sorted so that every SearchID in tss appears in si at
    or after the current cursor position (the si cursor only moves forward).

    NB: SearchID in tss and si are strings of int.
        The IDs in ctx, loc and cat are ints.

    args:
      tss       - path to gzipped train/test stream file
      si        - path to gzipped SearchInfo file
      delimiter - field delimiter for both files (default ',')

    yields:
      dict per tss row, updated in-place with ctx, si, loc and cat fields
    '''
    ctx = avito2_io.get_artifact('context_ads_map.pkl')
    loc = avito2_io.get_artifact('location_map.pkl')
    cat = avito2_io.get_artifact('cat_map.pkl')
    # NOTE(review): under Python 3, gzip.open defaults to binary mode, which
    # csv.DictReader cannot parse — presumably this runs on Python 2, or the
    # files would need mode 'rt'. Confirm against the runtime environment.
    with gzip.open(si) as f_si:
        with gzip.open(tss) as f_t:
            read_t = csv.DictReader(f_t, delimiter=delimiter)
            read_si = csv.DictReader(f_si, delimiter=delimiter)
            si_line = next(read_si)
            for (k, t_line) in enumerate(read_t):
                search_id = t_line['SearchID']
                # Advance the si cursor until it reaches this row's SearchID;
                # relies on both streams being ordered consistently.
                while search_id != si_line['SearchID']:
                    si_line = next(read_si)
                # Now the SearchID's match
                # NB: ad before si overwrites ad.CategoryID
                ad_id = int(t_line['AdID'])
                t_line.update(ctx[ad_id])
                t_line.update(si_line)
                loc_id = int(si_line['LocationID'])
                t_line.update(loc[loc_id])
                cat_id = int(si_line['CategoryID'])
                t_line.update(cat[cat_id])
                yield t_line
def join(tss, si, delimiter=','):
    '''
    Generator performing a rolling merge-join of a train/test stream (tss)
    against the SearchInfo stream (si), enriching each row with ad-context,
    location and category features from pickled lookup maps.

    NB: SearchID in tss and si are strings of int.
        The IDs in ctx, loc and cat are ints.

    args:
      tss       - path to gzipped train/test stream file
      si        - path to gzipped SearchInfo file
      delimiter - field delimiter for both files (default ',')

    yields:
      dict per tss row, updated in-place with ctx, si, loc and cat fields
    '''
    ctx = avito2_io.get_artifact('context_ads_map.pkl')
    loc = avito2_io.get_artifact('location_map.pkl')
    cat = avito2_io.get_artifact('cat_map.pkl')
    with gzip.open(si) as f_si:
        with gzip.open(tss) as f_t:
            read_t = csv.DictReader(f_t, delimiter=delimiter)
            read_si = csv.DictReader(f_si, delimiter=delimiter)
            # Fix: use the builtin next() instead of the Python-2-only
            # .next() method; works on Python 2.6+ and 3.x, and matches
            # the other version of this function in the project.
            si_line = next(read_si)
            for (k, t_line) in enumerate(read_t):
                search_id = t_line['SearchID']
                # Advance the si cursor forward until the SearchIDs align;
                # both streams must be consistently ordered.
                while search_id != si_line['SearchID']:
                    si_line = next(read_si)
                # Now the SearchID's match
                # NB: ad before si overwrites ad.CategoryID
                ad_id = int(t_line['AdID'])
                t_line.update(ctx[ad_id])
                t_line.update(si_line)
                loc_id = int(si_line['LocationID'])
                t_line.update(loc[loc_id])
                cat_id = int(si_line['CategoryID'])
                t_line.update(cat[cat_id])
                yield t_line
def search_val():
    '''
    Restrict search.gl (the SFrame built from SearchInfo.tsv) to only the
    rows whose SearchID belongs to the validation set, and save the result
    as search_val.gl.
    '''
    t0 = datetime.now()
    validation_ids = avito2_io.get_artifact('full_val_set.pkl')
    search = load('search.gl')
    # Boolean mask selecting validation-set searches.
    in_val = search['SearchID'].apply(lambda sid: sid in validation_ids)
    filtered = search[in_val]
    out_path = os.path.join(GL_DATA, 'search_val.gl')
    filtered.save(out_path)
    print('elapsed time: %s' % (datetime.now() - t0))
def val_context():
    '''
    Restrict train_context.gl to only the rows that belong to the
    validation set, saving the result as val_context.gl.
    (train_context() has to be run first.)
    '''
    t0 = datetime.now()
    validation_ids = avito2_io.get_artifact('full_val_set.pkl')
    train = load('train_context.gl')
    # Boolean mask selecting validation-set rows.
    in_val = train['SearchID'].apply(lambda sid: sid in validation_ids)
    selected = train[in_val]
    out_path = os.path.join(GL_DATA, 'val_context.gl')
    selected.save(out_path)
    print('elapsed time: %s' % (datetime.now() - t0))
def search_val():
    '''
    This function filters the rows of search.gl (the SFrame containing
    SearchInfo.tsv) to just the rows used in the validation set, and saves
    the result as search_val.gl.
    '''
    start = datetime.now()
    val_ids = avito2_io.get_artifact('full_val_set.pkl')
    si = load('search.gl')
    idx = si['SearchID'].apply(lambda x : x in val_ids)
    si_val = si[idx]
    path = os.path.join(GL_DATA, 'search_val.gl')
    si_val.save(path)
    # Fix: print as a function call (single argument, so identical output
    # on Python 2) for consistency with the py3-style twin of this function.
    print('elapsed time: %s' % (datetime.now() - start))
def val_context():
    '''
    This function filters the rows of train_context.gl to just those rows
    that are in the validation set (train_context() has to be run first),
    and saves the result as val_context.gl.
    '''
    start = datetime.now()
    val_ids = avito2_io.get_artifact('full_val_set.pkl')
    tr = load('train_context.gl')
    idx = tr['SearchID'].apply(lambda sid : sid in val_ids)
    val = tr[idx]
    path = os.path.join(GL_DATA, 'val_context.gl')
    val.save(path)
    # Fix: print as a function call (single argument, so identical output
    # on Python 2) for consistency with the py3-style twin of this function.
    print('elapsed time: %s' % (datetime.now() - start))
def build_combo():
    '''
    Builds a combo SFrame, with test and train, joined to search, sorted
    by date, with some of the features added.

    Pipeline: concatenate train_context.gl and test_context.gl (tagging
    rows with isTest and filling the column each side lacks), bucket
    HistCTR on a log scale, derive membership/date/query features on
    search.gl, join the two, add position-interaction features, sort by
    search datetime, and save as combo.gl.
    '''
    start = datetime.now()
    print('concatenating train_context.gl and test_context.gl')
    tr = load('train_context.gl')
    test = load('test_context.gl')
    tr['isTest'] = 0
    test['isTest'] = 1
    # Fill the column each frame lacks with -1 so the schemas match
    # before append: train has no test ID, test has no click label.
    tr['ID'] = -1
    test['IsClick'] = -1
    both = tr.append(test)
    # Bucket HistCTR into coarse log-scale bins (1 decimal place).
    both['HistCTR'] = both['HistCTR'].apply(lambda x : round(log(x), 1))
    print('modifying search.gl')
    si = load('search.gl')
    ds = load('train_ds.gl')
    ds_ids = set(ds['SearchID'])
    val_ids = avito2_io.get_artifact('full_val_set.pkl')
    # Membership flags: row is in the downsampled train set / validation set.
    si['isDS'] = si['SearchID'].apply(lambda id : id in ds_ids)
    si['isVal'] = si['SearchID'].apply(lambda id : id in val_ids)
    print('converting datetimes')
    si['dt'] = si['SearchDate'].str_to_datetime()
    # produces a 0-based running day (0-25) from 4/25 to 5/20
    si['runDay'] = si['dt'].apply(lambda dt : (dt.month - 4) * 30 + dt.day - 25)
    del si['SearchDate']
    # Query/params presence flags and their interaction.
    si['sqe'] = si['SearchQuery'].apply(lambda sq : len(sq) > 0)
    si['spe'] = si['SearchParams'].apply(lambda sp : sp is not None)
    si['spsq'] = si['sqe'] * si['spe']
    # Encode (category, flag) pairs as a single numeric feature:
    # integer part = CategoryID, fractional part = the flag.
    si['spe_cat'] = si['CategoryID'] + 0.1 * si['spe']
    si['sqe_cat'] = si['CategoryID'] + 0.1 * si['sqe']
    # Query length bucketed in units of 3 characters.
    si['sq_len'] = si['SearchQuery'].apply(lambda x : len(x)/3)
    print('joining')
    combo = si.join(both)
    # Same integer-part/fractional-part encoding against ad Position.
    combo['cat_pos'] = combo['CategoryID'] + 0.1 * combo['Position']
    combo['sqe_pos'] = combo['sqe'] + 0.1 * combo['Position']
    combo['spe_pos'] = combo['spe'] + 0.1 * combo['Position']
    print('sorting')
    combo = combo.sort('dt')
    print('saving')
    path = os.path.join(GL_DATA, 'combo.gl')
    combo.save(path)
    print('elapsed time: %s' % (datetime.now() - start))
def build_combo():
    '''
    Builds a combo SFrame, with test and train, joined to search, sorted
    by date, with some of the features added.

    Pipeline: concatenate train_context.gl and test_context.gl (tagging
    rows with isTest and filling the column each side lacks), bucket
    HistCTR on a log scale, derive membership/date/query features on
    search.gl, join the two, add position-interaction features, sort by
    search datetime, and save as combo.gl.
    '''
    start = datetime.now()
    # Fix: all prints converted to function-call form (single argument, so
    # identical output on Python 2) for consistency with the py3-style twin
    # of this function elsewhere in the project.
    print('concatenating train_context.gl and test_context.gl')
    tr = load('train_context.gl')
    test = load('test_context.gl')
    tr['isTest'] = 0
    test['isTest'] = 1
    # Fill the column each frame lacks with -1 so the schemas match.
    tr['ID'] = -1
    test['IsClick'] = -1
    both = tr.append(test)
    # Bucket HistCTR into coarse log-scale bins (1 decimal place).
    both['HistCTR'] = both['HistCTR'].apply(lambda x : round(log(x), 1))
    print('modifying search.gl')
    si = load('search.gl')
    ds = load('train_ds.gl')
    ds_ids = set(ds['SearchID'])
    val_ids = avito2_io.get_artifact('full_val_set.pkl')
    # Fix: lambda parameter renamed so it no longer shadows builtin id().
    si['isDS'] = si['SearchID'].apply(lambda sid : sid in ds_ids)
    si['isVal'] = si['SearchID'].apply(lambda sid : sid in val_ids)
    print('converting datetimes')
    si['dt'] = si['SearchDate'].str_to_datetime()
    # produces a 0-based running day (0-25) from 4/25 to 5/20
    si['runDay'] = si['dt'].apply(lambda dt : (dt.month - 4) * 30 + dt.day - 25)
    del si['SearchDate']
    # Query/params presence flags and their interaction.
    si['sqe'] = si['SearchQuery'].apply(lambda sq : len(sq) > 0)
    si['spe'] = si['SearchParams'].apply(lambda sp : sp is not None)
    si['spsq'] = si['sqe'] * si['spe']
    # Encode (category, flag) pairs: integer part = CategoryID,
    # fractional part = the flag.
    si['spe_cat'] = si['CategoryID'] + 0.1 * si['spe']
    si['sqe_cat'] = si['CategoryID'] + 0.1 * si['sqe']
    # Query length bucketed in units of 3 characters.
    si['sq_len'] = si['SearchQuery'].apply(lambda x : len(x)/3)
    print('joining')
    combo = si.join(both)
    # Same encoding against ad Position.
    combo['cat_pos'] = combo['CategoryID'] + 0.1 * combo['Position']
    combo['sqe_pos'] = combo['sqe'] + 0.1 * combo['Position']
    combo['spe_pos'] = combo['spe'] + 0.1 * combo['Position']
    print('sorting')
    combo = combo.sort('dt')
    print('saving')
    path = os.path.join(GL_DATA, 'combo.gl')
    combo.save(path)
    print('elapsed time: %s' % (datetime.now() - start))
def train_ds(p=0.05):
    '''
    Downsample train_context.gl: every positive (clicked) row is kept,
    while each negative row survives with probability p. Rows belonging
    to the validation set are then removed. Saves the result as
    train_ds.gl.

    args:
      p - probability of keeping a negative row (default 0.05)
    '''
    t0 = datetime.now()
    validation_ids = avito2_io.get_artifact('full_val_set.pkl')
    full_train = load('train_context.gl')
    # Clicked rows map to 1 (truthy) unconditionally; non-clicked rows
    # become 1 only when the random draw falls below p.
    keep = full_train['IsClick'].apply(lambda click: 1 if random() < p else click)
    sampled = full_train[keep]
    # Drop anything reserved for validation.
    not_val = sampled['SearchID'].apply(lambda sid: sid not in validation_ids)
    downsampled = sampled[not_val]
    out_path = os.path.join(GL_DATA, 'train_ds.gl')
    downsampled.save(out_path)
    print('elapsed time: %s' % (datetime.now() - t0))
def train_ds(p=0.05):
    '''
    Filters train_context.gl such that all of the positive rows are kept,
    but the negatives are selected with probability p. Also removes any
    rows that are in the validation set. Saves the result as train_ds.gl.

    args:
      p - probability of keeping a negative row (default 0.05)
    '''
    start = datetime.now()
    val_ids = avito2_io.get_artifact('full_val_set.pkl')
    tr1 = load('train_context.gl')
    # Positives stay truthy unconditionally; negatives become 1 only when
    # the random draw falls below p.
    idx1 = tr1['IsClick'].apply(lambda x : 1 if random() < p else x)
    tr2 = tr1[idx1]
    idx2 = tr2['SearchID'].apply(lambda x : x not in val_ids)
    tr_ds = tr2[idx2]
    path = os.path.join(GL_DATA, 'train_ds.gl')
    tr_ds.save(path)
    # Fix: print as a function call (single argument, so identical output
    # on Python 2) for consistency with the py3-style twin of this function.
    print('elapsed time: %s' % (datetime.now() - start))
def run_val(alpha, l2, l1, maxlines, interact):
    '''
    Train an FTRL-proximal model on the raw train file, skipping rows in
    the validation set, then make a second pass scoring only the
    validation rows and print the mean logloss.

    NOTE(review): beta and D are read from enclosing/module scope — confirm
    they are defined globally. Also note the signature takes l2 before l1
    while ftrl_proximal is called with (l1, l2) — confirm that ordering is
    intentional.

    args:
      alpha    - FTRL learning-rate parameter
      l2       - L2 regularization strength
      l1       - L1 regularization strength
      maxlines - stop each pass after this many input lines
      interact - whether the model uses feature interactions
    '''
    val_ids = avito2_io.get_artifact('full_val_set.pkl')
    model = ftrl_proximal(alpha, beta, l1, l2, D, interact)
    train_path = os.path.join(avito2_io.PROCESSED, TRAIN_INFILE)
    # Training pass: every row NOT in the validation set.
    with open(train_path) as train_file:
        # Fix: renamed from `input`, which shadowed the builtin.
        reader = csv.DictReader(train_file)
        for (k, x) in enumerate(reader):
            if int(x['SearchID']) not in val_ids:
                y = float(x['IsClick'])
                # Drop label and non-feature columns before hashing.
                del x['IsClick']
                del x['SearchDate']
                del x['SearchID']
                f = hash_features(x, D)
                p = model.predict(f)
                model.update(f, p, y)
            if k == maxlines:
                break
            if (k + 1) % 1000000 == 0:
                # Fix: print as a function call (single argument, so
                # identical output on Python 2).
                print('processed %d lines' % (k + 1))
    print('finished training')
    count = 0
    loss = 0.0
    # Validation pass: re-read the file, scoring only validation rows.
    with open(train_path) as train_file:
        reader = csv.DictReader(train_file)
        for (k, x) in enumerate(reader):
            if int(x['SearchID']) in val_ids:
                count += 1
                y = float(x['IsClick'])
                del x['IsClick']
                del x['SearchDate']
                del x['SearchID']
                f = hash_features(x, D)
                p = model.predict(f)
                loss += logloss(p, y)
            if k == maxlines:
                break
            if (k + 1) % 1000000 == 0:
                print('processed %d lines of raw train on validation pass' % (k + 1))
    print('validation loss: %.5f on %d rows' % (loss/count, count))
''' This script gets log loss on the validation set from full_val_set.pkl, (generated by the full_validation_set.py script) for some simple, no-learning models like the HistCTR, all 0's, or mean-value benchmark. author: David Thaler date: July 2015 ''' import avito2_io from datetime import datetime from eval import logloss maxlines_val = None start = datetime.now() val_ids = avito2_io.get_artifact('full_val_set.pkl') print 'validation set ids read' train_etl = { 'ad': lambda l: l['AdID'], 'pos': lambda l: l['Position'], 'ctr': lambda l: l['HistCTR'] } search_etl = {'cat': lambda l: l['CategoryID']} # validation run input = avito2_io.rolling_join(True, train_etl, search_etl, do_validation=True, val_ids=val_ids) loss = 0.0 for (k, (x, y)) in enumerate(input):
def build_user_dict():
    '''
    Combine the user_counts.pkl and user_dict.pkl artifacts into a single
    mapping; entries from user_dict.pkl override any overlapping keys
    from user_counts.pkl.
    '''
    combined = avito2_io.get_artifact('user_counts.pkl')
    overrides = avito2_io.get_artifact('user_dict.pkl')
    combined.update(overrides)
    return combined
# Command-line options, user-data loading, and SFrame selection for the
# training script. `parser` is created earlier in the file; `build_user_dict`,
# `avito2_io` and `sframes` come from the surrounding module.
parser.add_argument('-n', '--maxlines_val', type=int, default=None,
                    help='A max # lines for validation, if none, all data is used.')
parser.add_argument('-s', '--sub', type=str,
                    help='Do test and write results at submissions/submission<sub>.csv')
parser.add_argument('-u', '--users', type=str, default=None,
                    help="None, 'counts' or 'full' - what user data to use")
parser.add_argument('-a', '--all', action='store_const', default=False, const=True,
                    help='Full training run; use all training data.')
parser.add_argument('-p', '--passes', type=int, default=1,
                    help='# of passes over training data.')
args = parser.parse_args()
# Fix: prints converted to function-call form (single argument, so identical
# output on Python 2) for consistency with the py3-style twin of this block.
if args.users == 'full':
    users = build_user_dict()
    print('loading full user data')
elif args.users == 'counts':
    users = avito2_io.get_artifact('user_counts.pkl')
    print('loading user counts only from user_counts.pkl')
elif args.users == 'si':
    users = avito2_io.get_artifact('user_si.pkl')
    print('loading user dict from user_si.pkl')
else:
    users = None
# Hash-space size for feature hashing.
D = 2**args.bits
if args.all:
    tr = sframes.load('train_context.gl')
    si = sframes.load('search.gl')
    # NOTE(review): raise Warning aborts here like any exception — if the
    # intent was a non-fatal warning, warnings.warn would be the usual tool.
    if not args.sub:
        raise Warning('--all without --sub is not sensible.')
else:
    tr = sframes.load('train_ds.gl')
    si = sframes.load('search_ds.gl')
search_etl = {'user' : (lambda l : l['UserID']), 'category': (lambda l : l['CategoryID']), 'location': (lambda l : l['LocationID']), 'logon' : (lambda l : l['IsUserLoggedOn']), 'SPexists': (lambda l : int(len(l['SearchParams']) > 0)), 'SQexists': (lambda l : int(len(l['SearchQuery']) > 0))} ads_etl ={'price' : lambda l : ceil(float(l[1])/100.), 'ad_cat' : lambda l : l[0]} # cut: # 'params' : lambda l : len(l[3]), # 'title' : lambda l : len(l[2]), # use_train = True val_ids = avito2_io.get_artifact('full_val_set.pkl') ads = avito2_io.get_artifact('context_ads.pkl') input = avito2_io.join_with_ads(True, ads, train_etl, search_etl, ads_etl, do_validation=False, val_ids=val_ids) model = ftrl_proximal(alpha, beta, L1, L2, D, interaction) for (k, (x, y)) in enumerate(input): f = hash_features(x, D) p = model.predict(f) model.update(f, p, y) if k == maxlines_train:
'logon': (lambda l: l['IsUserLoggedOn']), 'SPexists': (lambda l: int(len(l['SearchParams']) > 0)), 'SQexists': (lambda l: int(len(l['SearchQuery']) > 0)) } ads_etl = { 'price': lambda l: ceil(float(l[1]) / 100.), 'ad_cat': lambda l: l[0] } # cut: # 'params' : lambda l : len(l[3]), # 'title' : lambda l : len(l[2]), # use_train = True val_ids = avito2_io.get_artifact('full_val_set.pkl') ads = avito2_io.get_artifact('context_ads.pkl') print 'small objects loaded' input = avito2_io.join_with_ads(True, ads, train_etl, search_etl, ads_etl, do_validation=False, val_ids=val_ids) model = ftrl_proximal(alpha, beta, L1, L2, D, interaction) # total count is just k + 1 total_y = 0.0 sample_ct = 0.0
'''
This script gets log loss on the validation set from full_val_set.pkl,
(generated by the full_validation_set.py script) for some simple,
no-learning models like the HistCTR, all 0's, or mean-value benchmark.

author: David Thaler
date: July 2015
'''
import avito2_io
from datetime import datetime
from eval import logloss

maxlines_val = None
start = datetime.now()
val_ids = avito2_io.get_artifact('full_val_set.pkl')
# Fix: print as a function call (single argument, so identical output on
# Python 2), matching the project's py3-style files.
print('validation set ids read')
# Field extractors applied to each raw train / search row during the join.
train_etl = {'ad' : lambda l : l['AdID'],
             'pos' : lambda l : l['Position'],
             'ctr' : lambda l : l['HistCTR']}
search_etl = {'cat' : lambda l : l['CategoryID']}
# validation run
input = avito2_io.rolling_join(True, train_etl, search_etl,
                               do_validation=True, val_ids=val_ids)
loss = 0.0
for (k, (x, y)) in enumerate(input):
    # Constant-prediction benchmark; the commented line scores the
    # HistCTR benchmark instead.
    #loss += logloss(float(x['ctr']), y)
    loss += logloss(0.006, y)
'--all', action='store_const', default=False, const=True, help='Full training run; use all training data.') parser.add_argument('-p', '--passes', type=int, default=1, help='# of passes over training data.') args = parser.parse_args() if args.users == 'full': users = build_user_dict() print('loading full user data') elif args.users == 'counts': users = avito2_io.get_artifact('user_counts.pkl') print('loading user counts only from user_counts.pkl') elif args.users == 'si': users = avito2_io.get_artifact('user_si.pkl') print('loading user dict from user_si.pkl') else: users = None D = 2**args.bits if args.all: tr = sframes.load('train_context.gl') si = sframes.load('search.gl') if not args.sub: raise Warning('--all without --sub is not sensible.') else: tr = sframes.load('train_ds.gl') si = sframes.load('search_ds.gl')