def validate(data, model, offset=0.0):
    """Return the mean log loss of *model* over *data*.

    data   : iterable of dataframes whose rows are scored one at a time
    model  : object exposing D (hash space size) and
             predict(features, bool) -> raw decision value
    offset : additive intercept correction applied to the decision value
             before the sigmoid (default 0.0, i.e. no correction)

    Relies on module-level process_line, hash_features, logloss and exp.
    """
    total = 0.0
    seen = 0
    for frame in data:
        for row in frame.itertuples():
            seen += 1
            features, label = process_line(row, False)
            hashed = hash_features(features, model.D)
            score = model.predict(hashed, False) + offset
            prob = 1.0 / (1.0 + exp(-score))
            total += logloss(prob, label)
    return total / seen
def validate(data, model, offset=0.0):
    """Mean log loss of *model* over the rows of every dataframe in *data*.

    NOTE(review): this is behaviorally identical to another validate()
    definition in this file; whichever is defined last wins at import
    time — consider removing one of the two.

    offset shifts the raw decision value (log-odds) before the sigmoid.
    """
    running_loss, n_rows = 0.0, 0
    for frame in data:
        for tup in frame.itertuples():
            n_rows += 1
            feats, target = process_line(tup, False)
            dv = model.predict(hash_features(feats, model.D), False) + offset
            running_loss += logloss(1.0 / (1.0 + exp(-dv)), target)
    return running_loss / n_rows
def validate(val, si, users=None, offset=0, maxlines=None): it = gl_iter.basic_join(val, si, users) loss = 0.0 for (k, line) in enumerate(it): y = line.pop('IsClick') process_line(line) f = hash_features(line, D) dv = model.predict(f, False) dv += offset p = 1.0/(1.0 + exp(-dv)) loss += logloss(p, y) if k == maxlines: break if (k + 1) % 250000 == 0: print 'processed %d lines from validation set' % (k + 1) return loss/k, k
def validate(val, si, users=None, offset=0, maxlines=None):
    """Score the joined validation stream; return (mean log loss, lines).

    val, si, users : inputs forwarded to gl_iter.basic_join
    offset         : additive correction to the raw decision value
    maxlines       : 0-based index at which to stop early (None = all)

    Relies on module-level `model`, `D`, `process_line`, `hash_features`,
    `logloss` and `exp`.
    """
    it = gl_iter.basic_join(val, si, users)
    loss = 0.0
    n = 0
    for (k, line) in enumerate(it):
        # true number of lines consumed so far (enumerate is 0-based)
        n = k + 1
        y = line.pop('IsClick')
        process_line(line)
        f = hash_features(line, D)
        dv = model.predict(f, False)
        dv += offset
        p = 1.0 / (1.0 + exp(-dv))
        loss += logloss(p, y)
        if k == maxlines:
            break
        if n % 250000 == 0:
            print('processed %d lines from validation set' % n)
    # BUG FIX: previously returned loss/k and k, i.e. divided by the last
    # 0-based index instead of the number of lines processed (off by one,
    # and a ZeroDivisionError after exactly one line).
    return loss / n, n
def run_val(alpha, l2, l1, maxlines, interact): val_ids = avito2_io.get_artifact('full_val_set.pkl') model = ftrl_proximal(alpha, beta, l1, l2, D, interact) train_path = os.path.join(avito2_io.PROCESSED, TRAIN_INFILE) with open(train_path) as train_file: input = csv.DictReader(train_file) for (k, x) in enumerate(input): if int(x['SearchID']) not in val_ids: y = float(x['IsClick']) del x['IsClick'] del x['SearchDate'] del x['SearchID'] f = hash_features(x, D) p = model.predict(f) model.update(f, p, y) if k == maxlines: break if (k + 1) % 1000000 == 0: print 'processed %d lines' % (k + 1) print 'finished training' count = 0 loss = 0.0 with open(train_path) as train_file: input = csv.DictReader(train_file) for (k, x) in enumerate(input): if int(x['SearchID']) in val_ids: count += 1 y = float(x['IsClick']) del x['IsClick'] del x['SearchDate'] del x['SearchID'] f = hash_features(x, D) p = model.predict(f) loss += logloss(p, y) if k == maxlines: break if (k + 1) % 1000000 == 0: print 'processed %d lines of raw train on validation pass' % (k + 1) print 'validation loss: %.5f on %d rows' % (loss/count, count)
p_total = total_y / (k + 1.0) p_sample = total_y / sample_ct offset = log(p_total / (1.0 - p_total)) - log(p_sample / (1.0 - p_sample)) print 'offset:' + str(offset) # validation run input = avito2_io.join_with_ads(True, ads, train_etl, search_etl, ads_etl, do_validation=True, val_ids=val_ids) loss = 0.0 count = 0 for (k, (x, y)) in enumerate(input): f = hash_features(x, D) dv = model.predict(f, False) dv += offset p = 1.0 / (1.0 + exp(-dv)) loss += logloss(p, y) count += 1 if k == maxlines_val: break if (k + 1) % 250000 == 0: print 'processed %d lines on validation pass' % (k + 1) print 'validation set log loss: %.5f' % (loss / count) print 'elapsed time: %s' % (datetime.now() - start)
print 'finished training' # validation run input = avito2_io.join_with_ads(True, ads, train_etl, search_etl, ads_etl, do_validation=True, val_ids=val_ids) loss = 0.0 count = 0 for (k, (x, y)) in enumerate(input): f = hash_features(x, D) p = model.predict(f) loss += logloss(p, y) count += 1 if k == maxlines_val: break if (k + 1) % 250000 == 0: print 'processed %d lines on validation pass' % (k + 1) print 'validation set log loss: %.5f' % (loss/count) print 'elapsed time: %s' % (datetime.now() - start)
from eval import logloss maxlines_val = None start = datetime.now() val_ids = avito2_io.get_artifact('full_val_set.pkl') print 'validation set ids read' train_etl = { 'ad': lambda l: l['AdID'], 'pos': lambda l: l['Position'], 'ctr': lambda l: l['HistCTR'] } search_etl = {'cat': lambda l: l['CategoryID']} # validation run input = avito2_io.rolling_join(True, train_etl, search_etl, do_validation=True, val_ids=val_ids) loss = 0.0 for (k, (x, y)) in enumerate(input): #loss += logloss(float(x['ctr']), y) loss += logloss(0.006, y) if k == maxlines_val: break if (k + 1) % 250000 == 0: print 'processed %d lines on validation pass' % (k + 1) print 'validation set log loss: %.5f' % (loss / (k + 1)) print 'elapsed time: %s' % (datetime.now() - start)
val_ids = avito2_io.get_artifact('full_val_set.pkl') print 'validation set ids read' train_etl = {'ad' : lambda l : l['AdID'], 'pos' : lambda l : l['Position'], 'ctr' : lambda l : l['HistCTR']} search_etl = {'cat' : lambda l : l['CategoryID']} # validation run input = avito2_io.rolling_join(True, train_etl, search_etl, do_validation=True, val_ids=val_ids) loss = 0.0 for (k, (x, y)) in enumerate(input): #loss += logloss(float(x['ctr']), y) loss += logloss(0.006, y) if k == maxlines_val: break if (k + 1) % 250000 == 0: print 'processed %d lines on validation pass' % (k + 1) print 'validation set log loss: %.5f' % (loss/(k + 1)) print 'elapsed time: %s' % (datetime.now() - start)