Exemple #1
0
def train(tr,
          si,
          alpha,
          beta,
          L1,
          L2,
          D,
          users=None,
          interaction=False,
          maxlines=None,
          iterations=1):
    """Fit an FTRL-proximal model online over the joined training stream.

    The model is built with ``ftrl_proximal(alpha, beta, L1, L2, D,
    interaction)``.  On every pass a fresh row iterator is obtained from
    ``gl_iter.basic_join(tr, si, users)``; for each row the ``'IsClick'``
    entry is popped off as the label, ``process_line`` is applied to the
    row (its return value is ignored), the row is hashed with
    ``hash_features(row, D)`` and the model is updated with its own
    prediction and the label.

    Args:
        tr, si, users: forwarded unchanged to ``gl_iter.basic_join``.
        alpha, beta, L1, L2, D, interaction: forwarded to ``ftrl_proximal``;
            ``D`` is also passed to ``hash_features``.
        maxlines: if not None, a pass stops right after the row whose
            zero-based index equals ``maxlines`` has been used for an update
            (i.e. ``maxlines + 1`` rows are consumed per pass).
        iterations: number of passes over the joined stream.

    Returns:
        The trained model object.
    """
    learner = ftrl_proximal(alpha, beta, L1, L2, D, interaction)
    for pass_no in range(1, iterations + 1):
        rows = gl_iter.basic_join(tr, si, users)
        # ``seen`` counts rows from 1, so the zero-based index is ``seen - 1``.
        for seen, row in enumerate(rows, start=1):
            label = row.pop('IsClick')
            process_line(row)
            hashed = hash_features(row, D)
            learner.update(hashed, learner.predict(hashed), label)
            # The limit is checked after the update, before progress output.
            if seen - 1 == maxlines:
                break
            if seen % 250000 == 0:
                print('processed %d lines on training pass %d' %
                      (seen, pass_no))
    return learner
Exemple #2
0
def train(data, alpha=0.1, beta=1.0, L1=0.0, L2=0.1, D=2**26):
    """Run a single online training pass over ``data`` and return the model.

    ``data`` is iterated once; each element must expose ``itertuples()``
    (presumably pandas DataFrame chunks -- confirm with the caller).  Every
    tuple is split into features and label by ``process_line(t, False)``,
    the features are hashed with ``hash_features(x, D)`` and the model is
    updated with its own prediction and the label.

    The model is created as ``ftrl_proximal(alpha, beta, L1, L2, D, False)``.
    """
    learner = ftrl_proximal(alpha, beta, L1, L2, D, False)
    for frame in data:
        for record in frame.itertuples():
            features, label = process_line(record, False)
            hashed = hash_features(features, D)
            # Predict first, then learn from the same example.
            learner.update(hashed, learner.predict(hashed), label)
    return learner
def train(data, alpha=0.1, beta=1.0, L1=0.0, L2=0.1, D=2**26):
    """Train an FTRL-proximal model with one pass over all rows in ``data``.

    Each element of ``data`` must provide ``itertuples()``; the tuples of
    all elements are consumed in order.  ``process_line(t, False)`` yields
    the ``(features, label)`` pair for a tuple, and ``hash_features`` maps
    the features into the ``D``-sized space before the predict/update step.

    Returns the model built by ``ftrl_proximal(alpha, beta, L1, L2, D,
    False)`` after it has seen every row.
    """
    ftrl = ftrl_proximal(alpha, beta, L1, L2, D, False)
    # Lazily flatten the chunks into one row stream; nothing is materialized.
    row_stream = (row for chunk in data for row in chunk.itertuples())
    for row in row_stream:
        raw_features, target = process_line(row, False)
        indices = hash_features(raw_features, D)
        estimate = ftrl.predict(indices)
        ftrl.update(indices, estimate, target)
    return ftrl
def train(tr, si, alpha, beta, L1,
          L2, D, users=None,
          interaction=False, maxlines=None,
          iterations=1):
    """Fit an FTRL-proximal model online over the joined training stream.

    For each of ``iterations`` passes, rows are drawn from
    ``gl_iter.basic_join(tr, si, users)``.  The ``'IsClick'`` entry of each
    row is popped as the label, ``process_line`` is applied to the row (its
    return value is ignored), and the hashed features are used for a
    predict-then-update step on the model.

    Args:
        tr, si, users: forwarded unchanged to ``gl_iter.basic_join``.
        alpha, beta, L1, L2, D, interaction: forwarded to ``ftrl_proximal``;
            ``D`` is also passed to ``hash_features``.
        maxlines: if not None, a pass stops right after the row whose
            zero-based index equals ``maxlines`` has been used for an update.
        iterations: number of passes over the joined stream.

    Returns:
        The trained model object.
    """
    model = ftrl_proximal(alpha, beta, L1, L2, D, interaction)
    for j in range(iterations):
        it = gl_iter.basic_join(tr, si, users)
        for (k, line) in enumerate(it):
            y = line.pop('IsClick')
            process_line(line)
            f = hash_features(line, D)
            p = model.predict(f)
            model.update(f, p, y)
            if k == maxlines:
                break
            if (k + 1) % 250000 == 0:
                # Single-argument print(...) behaves identically under
                # Python 2 and Python 3; the bare print statement is a
                # SyntaxError on Python 3.
                print('processed %d lines on training pass %d' %
                      (k + 1, j + 1))
    return model
Exemple #5
0
def _pop_label_and_hash(row):
    """Strip label/bookkeeping columns from ``row`` and hash what is left.

    Removes ``'IsClick'``, ``'SearchDate'`` and ``'SearchID'`` from the row
    dict in place and returns ``(label, hashed_features)``.
    """
    y = float(row.pop('IsClick'))
    del row['SearchDate']
    del row['SearchID']
    return y, hash_features(row, D)


def run_val(alpha, l2, l1, maxlines, interact):
    """Train on non-validation rows, then print the mean validation log loss.

    The training file (``TRAIN_INFILE`` under ``avito2_io.PROCESSED``) is
    read twice with ``csv.DictReader``:

    1. Training pass: rows whose ``SearchID`` is NOT in the validation id
       set update the model.
    2. Validation pass: rows whose ``SearchID`` IS in the set are scored
       with ``logloss`` and averaged.

    Args:
        alpha, l1, l2, interact: forwarded to ``ftrl_proximal`` (together
            with the module-level ``beta`` and ``D``).  Note the parameter
            order here is ``l2`` before ``l1``.
        maxlines: if not None, each pass stops after reading the row whose
            zero-based index equals ``maxlines``.

    Returns:
        None; progress and the final loss are printed.
    """
    val_ids = avito2_io.get_artifact('full_val_set.pkl')
    model = ftrl_proximal(alpha, beta, l1, l2, D, interact)
    train_path = os.path.join(avito2_io.PROCESSED, TRAIN_INFILE)

    # Pass 1: learn from every row that is not held out for validation.
    with open(train_path) as train_file:
        for (k, row) in enumerate(csv.DictReader(train_file)):
            if int(row['SearchID']) not in val_ids:
                y, f = _pop_label_and_hash(row)
                p = model.predict(f)
                model.update(f, p, y)
            if k == maxlines:
                break
            if (k + 1) % 1000000 == 0:
                print('processed %d lines' % (k + 1))
    print('finished training')

    # Pass 2: score the held-out rows with the now-frozen model.
    count = 0
    loss = 0.0
    with open(train_path) as train_file:
        for (k, row) in enumerate(csv.DictReader(train_file)):
            if int(row['SearchID']) in val_ids:
                count += 1
                y, f = _pop_label_and_hash(row)
                p = model.predict(f)
                loss += logloss(p, y)
            if k == maxlines:
                break
            if (k + 1) % 1000000 == 0:
                print('processed %d lines of raw train on validation pass'
                      % (k + 1))

    # Avoid ZeroDivisionError when no validation row was seen (e.g. a small
    # maxlines); report NaN in the same output format instead.
    mean_loss = loss / count if count else float('nan')
    print('validation loss: %.5f on %d rows' % (mean_loss, count))
Exemple #6
0
import csv
import pdb

# Script section: configure a run and train an FTRL-proximal model with one
# pass over the processed training CSV.

# Sequence number embedded in the submission file name.
SUBMIT_NUM = 2
submission = os.path.join(SUBMIT, 'submission%d.csv' % SUBMIT_NUM)

# Hyper-parameters handed to ftrl_proximal below.
alpha = 0.1  # learning rate
beta = 1.0  # smoothing parameter, probably doesn't matter on big data
L1 = 0.0000  # l1-regularization
L2 = 0.1000  # l2-regularization
D = 2**26  # feature space size
interaction = False  # passed as the last argument to ftrl_proximal
maxlines = None  # None never equals k below, so the whole file is read

# Timestamp taken before training; not read in the visible part of the script.
start = datetime.now()
model = ftrl_proximal(alpha, beta, L1, L2, D, interaction)
train_path = os.path.join(PROCESSED, 'gl_train1.csv')
with open(train_path) as train_file:
    # NOTE(review): `input` shadows the builtin of the same name.
    input = csv.DictReader(train_file)
    for (k, x) in enumerate(input):
        # Pull the label out of the row so it is not hashed as a feature.
        y = float(x['IsClick'])
        del x['IsClick']
        f = hash_features(x, D)
        # Predict first, then update the model with the same example.
        p = model.predict(f)
        model.update(f, p, y)
        # Checked after the update: with maxlines = N, rows 0..N are used.
        if k == maxlines:
            break
        if (k + 1) % 1000000 == 0:
            print('processed %d lines' % (k + 1))
print('finished training')
# cut:
# 'params' : lambda l : len(l[3]),
# 'title'  : lambda l : len(l[2]),

# use_train = True
# Load the validation id set and the ads artifact, then build the joined
# (features, label) stream used for training.
val_ids = avito2_io.get_artifact('full_val_set.pkl')
ads = avito2_io.get_artifact('context_ads.pkl')
# NOTE(review): the first positional argument is presumably the
# "use training data" switch -- confirm against join_with_ads.
input = avito2_io.join_with_ads(True,
                                ads,
                                train_etl,
                                search_etl,
                                ads_etl,
                                do_validation=False,
                                val_ids=val_ids)
model = ftrl_proximal(alpha, beta, L1, L2, D, interaction)

# Single online pass: predict, then update on each (x, y) pair.
for (k, (x, y)) in enumerate(input):
    f = hash_features(x, D)
    p = model.predict(f)
    model.update(f, p, y)
    # Checked after the update, so the row with index maxlines_train is used.
    if k == maxlines_train:
        break
    if (k + 1) % 1000000 == 0:
        # Single-argument print(...) works identically on Python 2 and 3;
        # the bare print statement is a SyntaxError on Python 3.
        print('processed %d lines on training pass' % (k + 1))
print('finished training')

# validation run
input = avito2_io.join_with_ads(True, 
                                ads,
                                train_etl,