Example #1
def validate(data, model, offset=0.0):
    loss = 0.0
    count = 0
    for k, df in enumerate(data):
        for t in df.itertuples():
            count += 1
            x, y = process_line(t, False)
            f = hash_features(x, model.D)
            dv = model.predict(f, False)
            dv += offset
            p = 1.0 / (1.0 + exp(-dv))
            loss += logloss(p, y)
    return loss / count
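These snippets lean on a few helpers from the surrounding project (process_line, hash_features, and logloss from the eval module) plus math.exp, none of which are shown on this page. As a reference point only, a minimal bounded log loss of the usual form (the project's exact implementation may differ):

from math import log

def logloss(p, y):
    # clamp p away from 0 and 1 so log() stays finite, then apply
    # -(y*log(p) + (1-y)*log(1-p))
    p = max(min(p, 1.0 - 1e-15), 1e-15)
    return -log(p) if y == 1.0 else -log(1.0 - p)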
Example #4
def validate(val, si, users=None, offset=0, maxlines=None):
    it = gl_iter.basic_join(val, si, users)
    loss = 0.0
    for (k, line) in enumerate(it):
        y = line.pop('IsClick')
        process_line(line)
        f = hash_features(line, D)
        dv = model.predict(f, False)
        dv += offset
        p = 1.0 / (1.0 + exp(-dv))
        loss += logloss(p, y)
        if k == maxlines:
            break
        if (k + 1) % 250000 == 0:
            print('processed %d lines from validation set' % (k + 1))
    return loss / (k + 1), k + 1  # average over the k+1 lines actually scored
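For orientation, a hypothetical call site; val, si, and users come from the project's data-loading code, and offset from the sampling correction shown in Example #6:

# hypothetical usage; these inputs are built elsewhere in the script
avg_loss, n = validate(val, si, offset=offset, maxlines=1000000)
print('validation log loss: %.5f over %d lines' % (avg_loss, n))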
Example #5
def run_val(alpha, l2, l1, maxlines, interact):
  # beta, D, TRAIN_INFILE, avito2_io and ftrl_proximal are module-level
  # names in the original script
  val_ids = avito2_io.get_artifact('full_val_set.pkl')
  model = ftrl_proximal(alpha, beta, l1, l2, D, interact)
  train_path = os.path.join(avito2_io.PROCESSED, TRAIN_INFILE)
  with open(train_path) as train_file:
    input = csv.DictReader(train_file)
    for (k, x) in enumerate(input):
      if int(x['SearchID']) not in val_ids:
        y = float(x['IsClick'])
        del x['IsClick']
        del x['SearchDate']
        del x['SearchID']
        f = hash_features(x, D)
        p = model.predict(f)
        model.update(f, p, y)
      if k == maxlines:
        break
      if (k + 1) % 1000000 == 0:
        print('processed %d lines' % (k + 1))
  print('finished training')
  count = 0
  loss = 0.0
  with open(train_path) as train_file:
    input = csv.DictReader(train_file)
    for (k, x) in enumerate(input):
      if int(x['SearchID']) in val_ids:
        count += 1
        y = float(x['IsClick'])
        del x['IsClick']
        del x['SearchDate']
        del x['SearchID']
        f = hash_features(x, D)
        p = model.predict(f)
        loss += logloss(p, y)
      if k == maxlines:
        break
      if (k + 1) % 1000000 == 0:
        print('processed %d lines of raw train on validation pass' % (k + 1))
  print('validation loss: %.5f on %d rows' % (loss / count, count))
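hash_features(x, D) is the project's hashing-trick featurizer and is not shown on this page. A common shape for it, purely as an assumption about what it does (each field=value pair hashed into one of D buckets):

def hash_features(x, D):
    # hashing trick: map each field=value pair of a row dict to one of D
    # weight indices; index 0 is reserved for the bias term
    return [0] + [abs(hash('%s=%s' % (field, value))) % D
                  for field, value in x.items()]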
Example #6
# intercept correction for downsampled negatives: shift the decision value
# by the difference in log-odds between the full data and the sample
p_total = total_y / (k + 1.0)
p_sample = total_y / sample_ct
offset = log(p_total / (1.0 - p_total)) - log(p_sample / (1.0 - p_sample))
print('offset: ' + str(offset))

# validation run
input = avito2_io.join_with_ads(True,
                                ads,
                                train_etl,
                                search_etl,
                                ads_etl,
                                do_validation=True,
                                val_ids=val_ids)
loss = 0.0
count = 0
for (k, (x, y)) in enumerate(input):
    f = hash_features(x, D)
    dv = model.predict(f, False)
    dv += offset
    p = 1.0 / (1.0 + exp(-dv))
    loss += logloss(p, y)
    count += 1
    if k == maxlines_val:
        break
    if (k + 1) % 250000 == 0:
        print('processed %d lines on validation pass' % (k + 1))

print('validation set log loss: %.5f' % (loss / count))
print('elapsed time: %s' % (datetime.now() - start))
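The offset used above is the standard intercept correction for negative downsampling: shifting the decision value by logit(p_total) - logit(p_sample) maps probabilities calibrated on the sample back to the full-data rate. A quick numeric check with made-up rates:

from math import exp, log

def logit(p):
    return log(p / (1.0 - p))

p_total, p_sample = 0.007, 0.10   # hypothetical full-data vs. sampled CTR
offset = logit(p_total) - logit(p_sample)

# a model calibrated to the sample predicts about p_sample on an average
# row; adding the offset to its decision value recovers the full-data rate
dv = logit(p_sample) + offset
print('corrected p: %.4f' % (1.0 / (1.0 + exp(-dv))))   # -> 0.0070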
print('finished training')

# validation run
input = avito2_io.join_with_ads(True, 
                                ads,
                                train_etl, 
                                search_etl, 
                                ads_etl,
                                do_validation=True, 
                                val_ids=val_ids)
loss = 0.0
count = 0
for (k, (x, y)) in enumerate(input):
  f = hash_features(x, D)
  p = model.predict(f)
  loss += logloss(p, y)
  count += 1
  if k == maxlines_val:
    break
  if (k + 1) % 250000 == 0:
    print('processed %d lines on validation pass' % (k + 1))

print('validation set log loss: %.5f' % (loss / count))
print('elapsed time: %s' % (datetime.now() - start))
Example #8
from datetime import datetime

import avito2_io
from eval import logloss

maxlines_val = None

start = datetime.now()
val_ids = avito2_io.get_artifact('full_val_set.pkl')
print('validation set ids read')
train_etl = {
    'ad': lambda l: l['AdID'],
    'pos': lambda l: l['Position'],
    'ctr': lambda l: l['HistCTR']
}
search_etl = {'cat': lambda l: l['CategoryID']}
# validation run
input = avito2_io.rolling_join(True,
                               train_etl,
                               search_etl,
                               do_validation=True,
                               val_ids=val_ids)
loss = 0.0
for (k, (x, y)) in enumerate(input):
    # constant-prediction baseline in place of the per-ad HistCTR
    #loss += logloss(float(x['ctr']), y)
    loss += logloss(0.006, y)
    if k == maxlines_val:
        break
    if (k + 1) % 250000 == 0:
        print('processed %d lines on validation pass' % (k + 1))

print('validation set log loss: %.5f' % (loss / (k + 1)))
print('elapsed time: %s' % (datetime.now() - start))
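Scoring a constant p = 0.006 as above gives a baseline whose expected loss depends only on the click rate r of the validation slice: -(r*log(p) + (1-r)*log(1-p)). A quick check, with r purely hypothetical:

from math import log

p, r = 0.006, 0.007   # constant prediction; hypothetical validation click rate
baseline = -(r * log(p) + (1.0 - r) * log(1.0 - p))
print('expected log loss for constant p: %.5f' % baseline)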