def train(tr, si, alpha, beta, L1, L2, D, users=None, interaction=False, maxlines=None, iterations=1):
    """Fit an FTRL-proximal model over one or more passes of the joined train/search stream.

    tr, si      : inputs handed to gl_iter.basic_join (train and search-info sources)
    alpha, beta : FTRL learning-rate parameters
    L1, L2      : regularization strengths
    D           : hashed feature-space size
    users       : optional user filter forwarded to the join
    interaction : whether the model builds feature interactions
    maxlines    : stop each pass after this many rows (None = no cap)
    iterations  : number of full passes over the joined stream

    Returns the trained ftrl_proximal model.
    """
    model = ftrl_proximal(alpha, beta, L1, L2, D, interaction)
    for pass_num in range(iterations):
        # the join is re-created each pass so the stream can be consumed again
        rows = gl_iter.basic_join(tr, si, users)
        for row_num, row in enumerate(rows):
            target = row.pop('IsClick')
            process_line(row)
            hashed = hash_features(row, D)
            estimate = model.predict(hashed)
            model.update(hashed, estimate, target)
            if row_num == maxlines:
                break
            if (row_num + 1) % 250000 == 0:
                print('processed %d lines on training pass %d' % (row_num + 1, pass_num + 1))
    return model
def train(data, alpha=0.1, beta=1.0, L1=0.0, L2=0.1, D=2**26):
    """Run one FTRL-proximal training pass.

    data : iterable of dataframes; each row is consumed via itertuples()
    Remaining keyword arguments are the FTRL hyperparameters and the hashed
    feature-space size D. Interactions are disabled for this variant.

    Returns the trained ftrl_proximal model.
    """
    model = ftrl_proximal(alpha, beta, L1, L2, D, False)
    for frame in data:
        for record in frame.itertuples():
            raw, label = process_line(record, False)
            hashed = hash_features(raw, D)
            estimate = model.predict(hashed)
            model.update(hashed, estimate, label)
    return model
def train(tr, si, alpha, beta, L1, L2, D, users=None, interaction=False, maxlines=None, iterations=1):
    """Fit an FTRL-proximal model over one or more passes of the joined train/search stream.

    tr, si      : inputs handed to gl_iter.basic_join (train and search-info sources)
    alpha, beta : FTRL learning-rate parameters
    L1, L2      : regularization strengths
    D           : hashed feature-space size
    users       : optional user filter forwarded to the join
    interaction : whether the model builds feature interactions
    maxlines    : stop each pass after this many rows (None = no cap)
    iterations  : number of full passes over the joined stream

    Returns the trained ftrl_proximal model.
    """
    model = ftrl_proximal(alpha, beta, L1, L2, D, interaction)
    for j in range(iterations):
        # the join is re-created each pass so the stream can be consumed again
        it = gl_iter.basic_join(tr, si, users)
        for k, line in enumerate(it):
            y = line.pop('IsClick')
            process_line(line)
            f = hash_features(line, D)
            p = model.predict(f)
            model.update(f, p, y)
            if k == maxlines:
                break
            if (k + 1) % 250000 == 0:
                # fixed: was a Python 2 `print` statement, a SyntaxError under
                # Python 3 and inconsistent with the print() calls used elsewhere
                print('processed %d lines on training pass %d' % (k + 1, j + 1))
    return model
def run_val(alpha, l2, l1, maxlines, interact):
    """Train on non-validation rows of the raw train file, then score the validation rows.

    alpha    : FTRL learning rate
    l2, l1   : regularization strengths (note the l2-before-l1 parameter order)
    maxlines : stop each file pass after this many raw rows (None = no cap)
    interact : whether the model builds feature interactions

    Prints progress and the final mean logloss over the validation rows.

    NOTE(review): `beta`, `D` and `TRAIN_INFILE` are read from module globals,
    not parameters — confirm they are defined at module scope.
    """
    val_ids = avito2_io.get_artifact('full_val_set.pkl')
    model = ftrl_proximal(alpha, beta, l1, l2, D, interact)
    train_path = os.path.join(avito2_io.PROCESSED, TRAIN_INFILE)
    # --- training pass: every row whose SearchID is NOT in the validation set ---
    with open(train_path) as train_file:
        reader = csv.DictReader(train_file)  # renamed from `input`: don't shadow the builtin
        for k, x in enumerate(reader):
            if int(x['SearchID']) not in val_ids:
                y = float(x['IsClick'])
                # drop the label and non-feature columns before hashing
                del x['IsClick']
                del x['SearchDate']
                del x['SearchID']
                f = hash_features(x, D)
                p = model.predict(f)
                model.update(f, p, y)
            if k == maxlines:
                break
            if (k + 1) % 1000000 == 0:
                print('processed %d lines' % (k + 1))
    print('finished training')
    # --- validation pass: re-read the file, score only the held-out SearchIDs ---
    count = 0
    loss = 0.0
    with open(train_path) as train_file:
        reader = csv.DictReader(train_file)
        for k, x in enumerate(reader):
            if int(x['SearchID']) in val_ids:
                count += 1
                y = float(x['IsClick'])
                del x['IsClick']
                del x['SearchDate']
                del x['SearchID']
                f = hash_features(x, D)
                p = model.predict(f)
                loss += logloss(p, y)
            if k == maxlines:
                break
            if (k + 1) % 1000000 == 0:
                print('processed %d lines of raw train on validation pass' % (k + 1))
    if count:
        print('validation loss: %.5f on %d rows' % (loss / count, count))
    else:
        # guard added: avoids ZeroDivisionError when maxlines cuts the pass off
        # before any validation row is seen
        print('validation loss: no validation rows processed')
import csv
import pdb

# Train an FTRL-proximal model on the preprocessed gl_train1.csv and record
# where the submission file will be written.
# NOTE(review): `os`, `datetime`, `SUBMIT`, `PROCESSED`, `ftrl_proximal` and
# `hash_features` must come from earlier in the file — confirm.

SUBMIT_NUM = 2
submission = os.path.join(SUBMIT, 'submission%d.csv' % SUBMIT_NUM)

# hyperparameters
alpha = 0.1          # learning rate
beta = 1.0           # smoothing parameter, probably doesn't matter on big data
L1 = 0.0000          # l1-regularization
L2 = 0.1000          # l2-regularization
D = 2**26            # feature space size
interaction = False
maxlines = None      # None = train on the whole file

start = datetime.now()
model = ftrl_proximal(alpha, beta, L1, L2, D, interaction)
train_path = os.path.join(PROCESSED, 'gl_train1.csv')
with open(train_path) as train_file:
    reader = csv.DictReader(train_file)  # renamed from `input`: don't shadow the builtin
    for k, x in enumerate(reader):
        y = float(x['IsClick'])
        del x['IsClick']  # drop the label before hashing the feature dict
        f = hash_features(x, D)
        p = model.predict(f)
        model.update(f, p, y)
        if k == maxlines:
            break
        if (k + 1) % 1000000 == 0:
            print('processed %d lines' % (k + 1))
print('finished training')
# NOTE(review): mangled/truncated fragment — a training pass followed by the
# start of a validation pass; the final join_with_ads(...) call is cut off
# mid-argument-list in this view, and the fragment still uses Python 2 `print`
# statements. Left unchanged pending the missing remainder of the file.
# cut: # 'params' : lambda l : len(l[3]), # 'title' : lambda l : len(l[2]), # use_train = True val_ids = avito2_io.get_artifact('full_val_set.pkl') ads = avito2_io.get_artifact('context_ads.pkl') input = avito2_io.join_with_ads(True, ads, train_etl, search_etl, ads_etl, do_validation=False, val_ids=val_ids) model = ftrl_proximal(alpha, beta, L1, L2, D, interaction) for (k, (x, y)) in enumerate(input): f = hash_features(x, D) p = model.predict(f) model.update(f, p, y) if k == maxlines_train: break if (k + 1) % 1000000 == 0: print 'processed %d lines on training pass' % (k + 1) print 'finished training' # validation run input = avito2_io.join_with_ads(True, ads, train_etl,