コード例 #1
0
ファイル: run_model.py プロジェクト: Keesiu/meta-kaggle
def train(tr,
          si,
          alpha,
          beta,
          L1,
          L2,
          D,
          users=None,
          interaction=False,
          maxlines=None,
          iterations=1):
    """Fit an FTRL-proximal model over `iterations` passes of the joined stream.

    Each pass re-joins `tr` with the search info `si` (optionally filtered
    by `users`), hashes every row into `D` buckets, and does one online
    predict/update step per row.  A pass stops early once the row index
    reaches `maxlines`.
    """
    model = ftrl_proximal(alpha, beta, L1, L2, D, interaction)
    for pass_no in range(iterations):
        stream = gl_iter.basic_join(tr, si, users)
        for row_no, row in enumerate(stream):
            label = row.pop('IsClick')
            process_line(row)
            feats = hash_features(row, D)
            pred = model.predict(feats)
            model.update(feats, pred, label)
            if row_no == maxlines:
                break
            if (row_no + 1) % 250000 == 0:
                print('processed %d lines on training pass %d' %
                      (row_no + 1, pass_no + 1))
    return model
コード例 #2
0
def predict(data, model, offset=0.0):
    """Score every row in an iterable of DataFrames with a trained model.

    Args:
        data: iterable of pandas DataFrames holding raw test rows.
        model: trained model exposing `.D` (hash-space size) and
            `.predict(features, False)` returning a raw decision value.
        offset: additive correction applied to the decision value before
            the logistic transform.

    Returns:
        pandas DataFrame with columns ['ID', 'IsClick'] of predicted
        click probabilities.
    """
    out = []
    for df in data:  # enumerate index was unused; iterate directly
        for t in df.itertuples():
            # renamed from `id`, which shadowed the builtin
            x, row_id = process_line(t, True)
            f = hash_features(x, model.D)
            dv = model.predict(f, False) + offset
            p = 1.0 / (1.0 + exp(-dv))  # logistic link
            out.append((row_id, p))
    return pd.DataFrame(out, columns=['ID', 'IsClick'])
コード例 #3
0
def predict(data, model, offset=0.0):
  """Predict click probabilities for all rows in an iterable of DataFrames.

  Args:
      data: iterable of pandas DataFrames of raw test rows.
      model: trained model with `.D` and `.predict(features, False)`.
      offset: additive correction to the raw decision value.

  Returns:
      pandas DataFrame with columns ['ID', 'IsClick'].
  """
  out = []
  for df in data:  # enumerate index was unused; iterate directly
    for t in df.itertuples():
      # renamed from `id`, which shadowed the builtin
      x, row_id = process_line(t, True)
      f = hash_features(x, model.D)
      dv = model.predict(f, False) + offset
      p = 1.0 / (1.0 + exp(-dv))  # logistic link
      out.append((row_id, p))
  return pd.DataFrame(out, columns=['ID', 'IsClick'])
コード例 #4
0
def run_test(submission_file, test, si, users=None, offset=0):
  """Write one 'ID,probability' submission row per joined test line.

  Relies on module-level `model` and `D` (they are not parameters here)
  -- NOTE(review): confirm both globals are defined at call time.

  Args:
      submission_file: open writable file object.
      test, si: inputs joined via gl_iter.basic_join.
      users: optional user filter forwarded to the join.
      offset: additive correction to the decision value.
  """
  it = gl_iter.basic_join(test, si, users)
  for (k, line) in enumerate(it):
    # renamed from `id` to avoid shadowing the builtin
    row_id = line.pop('ID')
    process_line(line)
    f = hash_features(line, D)
    dv = model.predict(f, False)
    dv += offset
    p = 1.0 / (1.0 + exp(-dv))
    submission_file.write('%d,%s\n' % (row_id, str(p)))
    if (k + 1) % 250000 == 0:
      # converted from a Python 2 print statement; this form is valid in
      # both Python 2 and 3 for a single argument
      print('processed %d lines' % (k + 1))
コード例 #5
0
def train(data, alpha=0.1, beta=1.0, L1=0.0, L2=0.1, D=2**26):
    """Run a single FTRL-proximal training pass over DataFrame chunks.

    Every row of every frame is hashed into `D` buckets and used for one
    online predict/update step; the fitted model is returned.
    """
    learner = ftrl_proximal(alpha, beta, L1, L2, D, False)
    for frame in data:
        for record in frame.itertuples():
            feats, target = process_line(record, False)
            hashed = hash_features(feats, D)
            estimate = learner.predict(hashed)
            learner.update(hashed, estimate, target)
    return learner
コード例 #6
0
def train(data, alpha=0.1, beta=1.0, L1=0.0, L2=0.1, D=2**26):
  """One online training pass of FTRL-proximal over an iterable of frames."""
  clf = ftrl_proximal(alpha, beta, L1, L2, D, False)
  for chunk in data:
    for row in chunk.itertuples():
      features, label = process_line(row, False)
      bucketed = hash_features(features, D)
      guess = clf.predict(bucketed)
      clf.update(bucketed, guess, label)
  return clf
コード例 #7
0
ファイル: run_model.py プロジェクト: Keesiu/meta-kaggle
def run_test(submission_file, test, si, users=None, offset=0):
    """Write one 'ID,probability' submission row per joined test line.

    Relies on module-level `model` and `D` (they are not parameters here)
    -- NOTE(review): confirm both globals are defined at call time.

    Args:
        submission_file: open writable file object.
        test, si: inputs joined via gl_iter.basic_join.
        users: optional user filter forwarded to the join.
        offset: additive correction to the decision value.
    """
    it = gl_iter.basic_join(test, si, users)
    for (k, line) in enumerate(it):
        # renamed from `id` to avoid shadowing the builtin
        row_id = line.pop('ID')
        process_line(line)
        f = hash_features(line, D)
        dv = model.predict(f, False)
        dv += offset
        p = 1.0 / (1.0 + exp(-dv))  # logistic link
        submission_file.write('%d,%s\n' % (row_id, str(p)))
        if (k + 1) % 250000 == 0:
            print('processed %d lines' % (k + 1))
コード例 #8
0
def validate(data, model, offset=0.0):
    """Compute the mean logloss of `model` over an iterable of DataFrames.

    Args:
        data: iterable of pandas DataFrames with labeled rows.
        model: trained model exposing `.D` and `.predict(features, False)`.
        offset: additive correction applied to the decision value.

    Returns:
        Average logloss per row.

    Raises:
        ValueError: if `data` yields no rows (previously this crashed with
            an opaque ZeroDivisionError).
    """
    loss = 0.0
    count = 0
    for df in data:  # enumerate index was unused; iterate directly
        for t in df.itertuples():
            count += 1
            x, y = process_line(t, False)
            f = hash_features(x, model.D)
            dv = model.predict(f, False) + offset
            p = 1.0 / (1.0 + exp(-dv))  # logistic link
            loss += logloss(p, y)
    if count == 0:
        raise ValueError('validate() received no rows to score')
    return loss / count
コード例 #9
0
def run_val(alpha, l2, l1, maxlines, interact):
  """Train on non-validation rows, then report logloss on the held-out set.

  Relies on module-level `beta`, `D`, `TRAIN_INFILE` and the project
  helpers (`avito2_io`, `ftrl_proximal`, `hash_features`, `logloss`) --
  NOTE(review): confirm these globals are defined at call time.

  Args:
      alpha, l2, l1: FTRL hyperparameters (module-level `beta` supplies
          the fourth).
      maxlines: stop each pass at this raw-line index (None = full pass).
      interact: whether to enable feature interactions.
  """
  val_ids = avito2_io.get_artifact('full_val_set.pkl')
  model = ftrl_proximal(alpha, beta, l1, l2, D, interact)
  train_path = os.path.join(avito2_io.PROCESSED, TRAIN_INFILE)
  # Pass 1: train on every row whose SearchID is NOT held out.
  with open(train_path) as train_file:
    reader = csv.DictReader(train_file)  # renamed from `input` (shadowed builtin)
    for (k, x) in enumerate(reader):
      if int(x['SearchID']) not in val_ids:
        y = float(x['IsClick'])
        # drop label and join keys so they are not hashed as features
        del x['IsClick']
        del x['SearchDate']
        del x['SearchID']
        f = hash_features(x, D)
        p = model.predict(f)
        model.update(f, p, y)
      if k == maxlines:
        break
      if (k + 1) % 1000000 == 0:
        # converted from Python 2 print statements throughout
        print('processed %d lines' % (k + 1))
  print('finished training')
  # Pass 2: re-read the file and score only the held-out rows.
  count = 0
  loss = 0.0
  with open(train_path) as train_file:
    reader = csv.DictReader(train_file)
    for (k, x) in enumerate(reader):
      if int(x['SearchID']) in val_ids:
        count += 1
        y = float(x['IsClick'])
        del x['IsClick']
        del x['SearchDate']
        del x['SearchID']
        f = hash_features(x, D)
        p = model.predict(f)
        loss += logloss(p, y)
      if k == maxlines:
        break
      if (k + 1) % 1000000 == 0:
        print('processed %d lines of raw train on validation pass' % (k + 1))
  print('validation loss: %.5f on %d rows' % (loss / count, count))
コード例 #10
0
def validate(data, model, offset=0.0):
  """Return the mean logloss of `model` over all rows in `data`."""
  total = 0.0
  n_rows = 0
  for frame_idx, frame in enumerate(data):
    for record in frame.itertuples():
      n_rows += 1
      feats, label = process_line(record, False)
      hashed = hash_features(feats, model.D)
      score = model.predict(hashed, False) + offset
      prob = 1.0 / (1.0 + exp(-score))
      total += logloss(prob, label)
  return total / n_rows
コード例 #11
0
def validate(val, si, users=None, offset=0, maxlines=None):
  """Mean logloss of the module-level `model` on the joined validation stream.

  Relies on module-level `model` and `D` -- NOTE(review): confirm both
  globals are defined at call time.

  Returns:
      (mean_logloss, n_rows) over the rows actually scored.
  """
  it = gl_iter.basic_join(val, si, users)
  loss = 0.0
  n = 0
  for (k, line) in enumerate(it):
    y = line.pop('IsClick')
    process_line(line)
    f = hash_features(line, D)
    dv = model.predict(f, False)
    dv += offset
    p = 1.0 / (1.0 + exp(-dv))
    loss += logloss(p, y)
    n = k + 1  # rows scored so far (loss is accumulated before any break)
    if k == maxlines:
      break
    if (k + 1) % 250000 == 0:
      # converted from a Python 2 print statement
      print('processed %d lines from validation set' % (k + 1))
  # BUG FIX: previously returned loss/k, dividing by the last zero-based
  # index -- one fewer than the number of rows scored -- which biased the
  # reported average upward.
  return loss / n, n
コード例 #12
0
ファイル: run_model.py プロジェクト: Keesiu/meta-kaggle
def validate(val, si, users=None, offset=0, maxlines=None):
    """Mean logloss of the module-level `model` on the joined validation stream.

    Relies on module-level `model` and `D` -- NOTE(review): confirm both
    globals are defined at call time.

    Returns:
        (mean_logloss, n_rows) over the rows actually scored.
    """
    it = gl_iter.basic_join(val, si, users)
    loss = 0.0
    n = 0
    for (k, line) in enumerate(it):
        y = line.pop('IsClick')
        process_line(line)
        f = hash_features(line, D)
        dv = model.predict(f, False)
        dv += offset
        p = 1.0 / (1.0 + exp(-dv))
        loss += logloss(p, y)
        n = k + 1  # rows scored so far (loss is accumulated before any break)
        if k == maxlines:
            break
        if (k + 1) % 250000 == 0:
            print('processed %d lines from validation set' % (k + 1))
    # BUG FIX: previously returned loss/k, dividing by the last zero-based
    # index -- one fewer than the number of rows scored -- which biased the
    # reported average upward.
    return loss / n, n
コード例 #13
0
def train(tr, si, alpha, beta, L1,
          L2, D, users=None,
          interaction=False, maxlines=None,
          iterations=1):
  """Train an FTRL-proximal model over `iterations` passes of the joined data.

  Args:
      tr, si: training data and search info joined via gl_iter.basic_join.
      alpha, beta, L1, L2: FTRL hyperparameters.
      D: size of the hashed feature space.
      users: optional user filter forwarded to the join.
      interaction: enable pairwise feature interactions.
      maxlines: stop each pass at this line index (None = full pass).
      iterations: number of passes over the data.

  Returns:
      The trained ftrl_proximal model.
  """
  model = ftrl_proximal(alpha, beta, L1, L2, D, interaction)
  for j in range(iterations):
    it = gl_iter.basic_join(tr, si, users)
    for (k, line) in enumerate(it):
      y = line.pop('IsClick')
      process_line(line)
      f = hash_features(line, D)
      p = model.predict(f)
      model.update(f, p, y)
      if k == maxlines:
        break
      if (k + 1) % 250000 == 0:
        # converted from a Python 2 print statement; this form is valid in
        # both Python 2 and 3 for a single argument
        print('processed %d lines on training pass %d' % (k + 1, j + 1))
  return model
コード例 #14
0
ファイル: run2.py プロジェクト: Keesiu/meta-kaggle
# FTRL hyperparameters.  `alpha` is expected to be defined earlier in the
# file (outside this excerpt) -- NOTE(review): confirm before running.
beta = 1.0  # smoothing parameter, probably doesn't matter on big data
L1 = 0.0000  # l1-regularization
L2 = 0.1000  # l2-regularization
D = 2**26  # feature space size
interaction = False
maxlines = None  # None = train on the whole file

start = datetime.now()
model = ftrl_proximal(alpha, beta, L1, L2, D, interaction)
train_path = os.path.join(PROCESSED, 'gl_train1.csv')
with open(train_path) as train_file:
    # NOTE(review): `input` shadows the builtin of the same name
    input = csv.DictReader(train_file)
    for (k, x) in enumerate(input):
        y = float(x['IsClick'])
        # drop the label so it is not hashed as a feature
        del x['IsClick']
        f = hash_features(x, D)
        p = model.predict(f)
        # one online FTRL step per row
        model.update(f, p, y)
        if k == maxlines:
            break
        if (k + 1) % 1000000 == 0:
            print('processed %d lines' % (k + 1))
print('finished training')

outfile = open(submission, 'w')
outfile.write('ID,IsClick\n')
test_path = os.path.join(PROCESSED, 'gl_test1.csv')
with open(test_path) as test_file:
    input = csv.DictReader(test_file)
    for (k, x) in enumerate(input):
        id = x['ID']
コード例 #15
0
# 'title'  : lambda l : len(l[2]),

# use_train = True
# Load the held-out SearchID set and the context-ads lookup table.
val_ids = avito2_io.get_artifact('full_val_set.pkl')
ads = avito2_io.get_artifact('context_ads.pkl')
# Joined (features, label) stream over the training data; validation rows
# are included here because do_validation=False.
# NOTE(review): `input` shadows the builtin of the same name.
input = avito2_io.join_with_ads(True, 
                                ads,
                                train_etl, 
                                search_etl, 
                                ads_etl,
                                do_validation=False, 
                                val_ids=val_ids)
# `alpha`, `beta`, `L1`, `L2`, `D`, `interaction`, `maxlines_train` are
# expected to be defined earlier in the file (outside this excerpt).
model = ftrl_proximal(alpha, beta, L1, L2, D, interaction)

# One online FTRL predict/update step per joined row.
for (k, (x, y)) in enumerate(input):
  f = hash_features(x, D)
  p = model.predict(f)
  model.update(f, p, y)
  if k == maxlines_train:
    break
  if (k + 1) % 1000000 == 0:
    print 'processed %d lines on training pass' % (k + 1)
print 'finished training'

# validation run
input = avito2_io.join_with_ads(True, 
                                ads,
                                train_etl, 
                                search_etl, 
                                ads_etl,
                                do_validation=True,