Exemple #1
0
from eval import logloss

# Optional cap on the number of validation rows to score; None scores all.
maxlines_val = None

# Constant prediction that is scored against every validation label.
constant_prediction = 0.006

start = datetime.now()
val_ids = avito2_io.get_artifact('full_val_set.pkl')
print('validation set ids read')

# Field extractors handed to rolling_join: each maps an output key to a
# function that pulls one field out of a row.
train_etl = {
    'ad': lambda row: row['AdID'],
    'pos': lambda row: row['Position'],
    # Only needed when scoring the historical CTR (see the loop below).
    'ctr': lambda row: row['HistCTR'],
}
search_etl = {'cat': lambda row: row['CategoryID']}

# validation run
val_rows = avito2_io.rolling_join(True,
                                  train_etl,
                                  search_etl,
                                  do_validation=True,
                                  val_ids=val_ids)

loss = 0.0
n_scored = 0
for x, y in val_rows:
    # Test the cap before scoring so that exactly maxlines_val rows are
    # processed; comparing a zero-based index afterwards scored one extra.
    if maxlines_val is not None and n_scored >= maxlines_val:
        break
    # To score the historical CTR instead of the constant, use
    # logloss(float(x['ctr']), y) here.
    loss += logloss(constant_prediction, y)
    n_scored += 1
    if n_scored % 250000 == 0:
        print('processed %d lines on validation pass' % n_scored)

if n_scored:
    print('validation set log loss: %.5f' % (loss / n_scored))
else:
    # No row was scored, so there is no mean to report (and no row count
    # to divide by).
    print('validation set is empty; no log loss to report')
print('elapsed time: %s' % (datetime.now() - start))
Exemple #2
0
start = datetime.now()

# Field extractors handed to rolling_join: each maps a feature name to a
# function that pulls one field out of a row.
train_etl = {
    'ad': lambda row: row['AdID'],
    'pos': lambda row: row['Position'],
    # log of HistCTR rounded to one decimal place, then multiplied by -10.
    # NOTE(review): if `log` is math.log, a HistCTR of 0 raises ValueError --
    # confirm the data never contains a zero here.
    'log_ctr': lambda row: -10 * round(log(float(row['HistCTR'])), 1),
}
search_etl = {
    'user': lambda row: row['UserID'],
    'category': lambda row: row['CategoryID'],
    'location': lambda row: row['LocationID'],
    'logon': lambda row: row['IsUserLoggedOn'],
    # 1 when the field is non-empty, 0 otherwise.
    'SPexists': lambda row: int(len(row['SearchParams']) > 0),
    'SQexists': lambda row: int(len(row['SearchQuery']) > 0),
}

# use_train = True
train_rows = rolling_join(True, train_etl, search_etl)
model = ftrl_proximal(alpha, beta, L1, L2, D, interaction)

n_trained = 0
for x, y in train_rows:
    # Test the cap before training so that exactly `maxlines` rows are used;
    # comparing a zero-based index afterwards consumed one extra row.
    if maxlines is not None and n_trained >= maxlines:
        break
    f = hash_features(x, D)
    # Predict first, then pass that prediction to the update together with
    # the label.
    p = model.predict(f)
    model.update(f, p, y)
    n_trained += 1
    if n_trained % 1000000 == 0:
        print('processed %d lines' % n_trained)
print('finished training')

# testing: use_train=False
# Add an extractor for the 'ID' field to the existing ETL dict (mutated in
# place) before running the join a second time with False as first argument.
train_etl['id'] = (lambda l: l['ID'])
input = rolling_join(False, train_etl, search_etl)
# NOTE(review): the file is opened without a `with` block and nothing in the
# visible code closes it -- confirm the code that follows closes `outfile`.
outfile = open(submission, 'w')
from datetime import datetime
from eval import logloss

# Optional cap on the number of validation rows to score; None scores all.
maxlines_val = None

# Constant prediction that is scored against every validation label.
constant_prediction = 0.006

start = datetime.now()
val_ids = avito2_io.get_artifact('full_val_set.pkl')
print('validation set ids read')

# Field extractors handed to rolling_join: each maps an output key to a
# function that pulls one field out of a row.
train_etl = {
    'ad': lambda row: row['AdID'],
    'pos': lambda row: row['Position'],
    # Only needed when scoring the historical CTR (see the loop below).
    'ctr': lambda row: row['HistCTR'],
}
search_etl = {'cat': lambda row: row['CategoryID']}

# validation run
val_rows = avito2_io.rolling_join(True,
                                  train_etl,
                                  search_etl,
                                  do_validation=True,
                                  val_ids=val_ids)

loss = 0.0
n_scored = 0
for x, y in val_rows:
    # Test the cap before scoring so that exactly maxlines_val rows are
    # processed; comparing a zero-based index afterwards scored one extra.
    if maxlines_val is not None and n_scored >= maxlines_val:
        break
    # To score the historical CTR instead of the constant, use
    # logloss(float(x['ctr']), y) here.
    loss += logloss(constant_prediction, y)
    n_scored += 1
    if n_scored % 250000 == 0:
        print('processed %d lines on validation pass' % n_scored)

if n_scored:
    print('validation set log loss: %.5f' % (loss / n_scored))
else:
    # No row was scored, so there is no mean to report (and no row count
    # to divide by).
    print('validation set is empty; no log loss to report')
print('elapsed time: %s' % (datetime.now() - start))

# Flag forwarded as the last argument to ftrl_proximal.
interaction = False
# Optional cap on the number of training rows to consume; None uses them all.
maxlines = None


start = datetime.now()

# Field extractors handed to rolling_join: each maps a feature name to a
# function that pulls one field out of a row.
train_etl = {
    'ad': lambda row: row['AdID'],
    'pos': lambda row: row['Position'],
    # log of HistCTR rounded to one decimal place, then multiplied by -10.
    # NOTE(review): if `log` is math.log, a HistCTR of 0 raises ValueError --
    # confirm the data never contains a zero here.
    'log_ctr': lambda row: -10 * round(log(float(row['HistCTR'])), 1),
}
search_etl = {
    'user': lambda row: row['UserID'],
    'category': lambda row: row['CategoryID'],
    'location': lambda row: row['LocationID'],
    'logon': lambda row: row['IsUserLoggedOn'],
    # 1 when the field is non-empty, 0 otherwise.
    'SPexists': lambda row: int(len(row['SearchParams']) > 0),
    'SQexists': lambda row: int(len(row['SearchQuery']) > 0),
}

# use_train = True
train_rows = rolling_join(True, train_etl, search_etl)
model = ftrl_proximal(alpha, beta, L1, L2, D, interaction)

n_trained = 0
for x, y in train_rows:
    # Test the cap before training so that exactly `maxlines` rows are used;
    # comparing a zero-based index afterwards consumed one extra row.
    if maxlines is not None and n_trained >= maxlines:
        break
    f = hash_features(x, D)
    # Predict first, then pass that prediction to the update together with
    # the label.
    p = model.predict(f)
    model.update(f, p, y)
    n_trained += 1
    if n_trained % 1000000 == 0:
        print('processed %d lines' % n_trained)
print('finished training')

# testing: use_train=False
# Add an extractor for the 'ID' field to the existing ETL dict (mutated in
# place) before running the join a second time with False as first argument.
train_etl['id'] = (lambda l : l['ID'])
input = rolling_join(False, train_etl, search_etl)
# NOTE(review): the file is opened without a `with` block and nothing in the
# visible code closes it -- confirm the code that follows closes `outfile`.
outfile = open(submission, 'w')