def train(tr, si, alpha, beta, L1, L2, D, users=None, interaction=False,
          maxlines=None, iterations=1):
    ''' Trains an FTRL-proximal model on the joined train / search-info stream. '''
    model = ftrl_proximal(alpha, beta, L1, L2, D, interaction)
    for j in range(iterations):
        it = gl_iter.basic_join(tr, si, users)
        for (k, line) in enumerate(it):
            y = line.pop('IsClick')
            process_line(line)
            f = hash_features(line, D)
            p = model.predict(f)
            model.update(f, p, y)
            if k == maxlines:
                break
            if (k + 1) % 250000 == 0:
                print('processed %d lines on training pass %d' % (k + 1, j + 1))
    return model
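# ftrl_proximal is not defined in these snippets. The class below is a sketch
# of the interface assumed by the calls above: the constructor signature,
# predict(f) returning a probability, predict(f, False) returning the raw
# decision value, update(f, p, y), and the .D attribute. It follows the
# standard per-coordinate FTRL-proximal update (McMahan et al., "Ad Click
# Prediction: a View from the Trenches"); the repo's actual class may differ,
# e.g. in how interaction features are generated.
from math import exp, sqrt

class ftrl_proximal(object):
    def __init__(self, alpha, beta, L1, L2, D, interaction):
        self.alpha = alpha              # learning-rate scale
        self.beta = beta                # learning-rate smoothing
        self.L1 = L1                    # L1 regularization strength
        self.L2 = L2                    # L2 regularization strength
        self.D = D                      # hashed feature-space size
        self.interaction = interaction  # interaction-feature generation omitted in this sketch
        self.n = [0.0] * D              # per-coordinate squared-gradient sums (D floats, large for D = 2**26)
        self.z = [0.0] * D              # per-coordinate FTRL z values
        self.w = {}                     # weights materialized lazily by predict()

    def predict(self, x, sigmoid=True):
        ''' Return P(IsClick) for hashed indices x, or the raw score if sigmoid is False. '''
        wTx = 0.0
        w = {}
        for i in x:
            z = self.z[i]
            sign = -1.0 if z < 0 else 1.0
            if sign * z <= self.L1:
                w[i] = 0.0
            else:
                w[i] = (sign * self.L1 - z) / ((self.beta + sqrt(self.n[i])) / self.alpha + self.L2)
            wTx += w[i]
        self.w = w                      # cached for the following update()
        if not sigmoid:
            return wTx
        return 1.0 / (1.0 + exp(-max(min(wTx, 35.0), -35.0)))

    def update(self, x, p, y):
        ''' FTRL-proximal update for hashed indices x, prediction p, and label y. '''
        g = p - y                       # gradient of log loss w.r.t. the raw score
        for i in x:
            sigma = (sqrt(self.n[i] + g * g) - sqrt(self.n[i])) / self.alpha
            self.z[i] += g - sigma * self.w[i]
            self.n[i] += g * g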
def predict(data, model, offset=0.0):
    ''' Scores an iterable of DataFrames and returns (ID, IsClick) predictions. '''
    out = []
    for k, df in enumerate(data):
        for t in df.itertuples():
            x, id = process_line(t, True)
            f = hash_features(x, model.D)
            dv = model.predict(f, False)
            dv += offset
            p = 1.0 / (1.0 + exp(-dv))
            out.append((id, p))
    return pd.DataFrame(out, columns=['ID', 'IsClick'])
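# hash_features(x, D) is used throughout but not shown. The sketch below is an
# assumption: it applies the hashing trick to every (field, value) pair of a
# row and returns a list of indices in [0, D). The repo's real helper may hash
# differently (e.g. with a stable hash such as mmh3 instead of the built-in
# hash, which is only consistent within one process).
def hash_features(x, D):
    indices = [0]                       # index 0 reserved for the bias term
    for field, value in x.items():
        indices.append(abs(hash('%s_%s' % (field, value))) % D)
    return indices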
def train(data, alpha=0.1, beta=1.0, L1=0.0, L2=0.1, D=2**26):
    ''' Runs one training pass. '''
    model = ftrl_proximal(alpha, beta, L1, L2, D, False)
    for df in data:
        for t in df.itertuples():
            x, y = process_line(t, False)
            f = hash_features(x, D)
            p = model.predict(f)
            model.update(f, p, y)
    return model
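# Hypothetical driver for the chunked-DataFrame train() and predict() above.
# File names, chunk size, and hyperparameter values are placeholders, not
# paths or settings taken from the repo.
import pandas as pd

train_chunks = pd.read_csv('train_processed.csv', chunksize=500000)
model = train(train_chunks, alpha=0.1, beta=1.0, L1=0.0, L2=0.1, D=2**26)

test_chunks = pd.read_csv('test_processed.csv', chunksize=500000)
submission = predict(test_chunks, model)
submission.to_csv('submission.csv', index=False)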
def run_test(submission_file, test, si, users=None, offset=0):
    ''' Writes test-set predictions to an open submission file. '''
    # NOTE: relies on module-level model and D
    it = gl_iter.basic_join(test, si, users)
    for (k, line) in enumerate(it):
        id = line.pop('ID')
        process_line(line)
        f = hash_features(line, D)
        dv = model.predict(f, False)
        dv += offset
        p = 1.0 / (1.0 + exp(-dv))
        submission_file.write('%d,%s\n' % (id, str(p)))
        if (k + 1) % 250000 == 0:
            print('processed %d lines' % (k + 1))
def validate(data, model, offset=0.0):
    ''' Returns the mean log loss over an iterable of DataFrames. '''
    loss = 0.0
    count = 0
    for k, df in enumerate(data):
        for t in df.itertuples():
            count += 1
            x, y = process_line(t, False)
            f = hash_features(x, model.D)
            dv = model.predict(f, False)
            dv += offset
            p = 1.0 / (1.0 + exp(-dv))
            loss += logloss(p, y)
    return loss / count
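# logloss(p, y) is referenced but not defined in these snippets; a typical
# bounded log-loss helper looks like this (an assumption, not necessarily the
# repo's exact implementation).
from math import log

def logloss(p, y):
    p = max(min(p, 1.0 - 1e-15), 1e-15)   # clip so log() stays finite
    return -log(p) if y == 1.0 else -log(1.0 - p)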
def run_val(alpha, l2, l1, maxlines, interact):
    ''' Trains on the non-validation rows of raw train, then reports log loss on the held-out SearchIDs. '''
    # NOTE: beta, D, and TRAIN_INFILE come from module scope
    val_ids = avito2_io.get_artifact('full_val_set.pkl')
    model = ftrl_proximal(alpha, beta, l1, l2, D, interact)
    train_path = os.path.join(avito2_io.PROCESSED, TRAIN_INFILE)
    with open(train_path) as train_file:
        input = csv.DictReader(train_file)
        for (k, x) in enumerate(input):
            if int(x['SearchID']) not in val_ids:
                y = float(x['IsClick'])
                del x['IsClick']
                del x['SearchDate']
                del x['SearchID']
                f = hash_features(x, D)
                p = model.predict(f)
                model.update(f, p, y)
            if k == maxlines:
                break
            if (k + 1) % 1000000 == 0:
                print('processed %d lines' % (k + 1))
    print('finished training')
    count = 0
    loss = 0.0
    with open(train_path) as train_file:
        input = csv.DictReader(train_file)
        for (k, x) in enumerate(input):
            if int(x['SearchID']) in val_ids:
                count += 1
                y = float(x['IsClick'])
                del x['IsClick']
                del x['SearchDate']
                del x['SearchID']
                f = hash_features(x, D)
                p = model.predict(f)
                loss += logloss(p, y)
            if k == maxlines:
                break
            if (k + 1) % 1000000 == 0:
                print('processed %d lines of raw train on validation pass' % (k + 1))
    print('validation loss: %.5f on %d rows' % (loss / count, count))
def validate(val, si, users=None, offset=0, maxlines=None):
    ''' Returns (mean log loss, lines scored) over the joined validation stream. '''
    # NOTE: relies on module-level model and D
    it = gl_iter.basic_join(val, si, users)
    loss = 0.0
    for (k, line) in enumerate(it):
        y = line.pop('IsClick')
        process_line(line)
        f = hash_features(line, D)
        dv = model.predict(f, False)
        dv += offset
        p = 1.0 / (1.0 + exp(-dv))
        loss += logloss(p, y)
        if k == maxlines:
            break
        if (k + 1) % 250000 == 0:
            print('processed %d lines from validation set' % (k + 1))
    # k + 1 lines were actually scored (enumerate starts at 0)
    return loss / (k + 1), k + 1
beta = 1.0          # smoothing parameter, probably doesn't matter on big data
L1 = 0.0000         # l1-regularization
L2 = 0.1000         # l2-regularization
D = 2**26           # feature space size
interaction = False
maxlines = None

start = datetime.now()
model = ftrl_proximal(alpha, beta, L1, L2, D, interaction)
train_path = os.path.join(PROCESSED, 'gl_train1.csv')
with open(train_path) as train_file:
    input = csv.DictReader(train_file)
    for (k, x) in enumerate(input):
        y = float(x['IsClick'])
        del x['IsClick']
        f = hash_features(x, D)
        p = model.predict(f)
        model.update(f, p, y)
        if k == maxlines:
            break
        if (k + 1) % 1000000 == 0:
            print('processed %d lines' % (k + 1))
print('finished training')

outfile = open(submission, 'w')
outfile.write('ID,IsClick\n')
test_path = os.path.join(PROCESSED, 'gl_test1.csv')
with open(test_path) as test_file:
    input = csv.DictReader(test_file)
    for (k, x) in enumerate(input):
        id = x['ID']
# 'title' : lambda l : len(l[2]),
# use_train = True
val_ids = avito2_io.get_artifact('full_val_set.pkl')
ads = avito2_io.get_artifact('context_ads.pkl')
input = avito2_io.join_with_ads(True, ads, train_etl, search_etl, ads_etl,
                                do_validation=False, val_ids=val_ids)
model = ftrl_proximal(alpha, beta, L1, L2, D, interaction)
for (k, (x, y)) in enumerate(input):
    f = hash_features(x, D)
    p = model.predict(f)
    model.update(f, p, y)
    if k == maxlines_train:
        break
    if (k + 1) % 1000000 == 0:
        print('processed %d lines on training pass' % (k + 1))
print('finished training')

# validation run
input = avito2_io.join_with_ads(True, ads, train_etl, search_etl, ads_etl,
                                do_validation=True,