-
Notifications
You must be signed in to change notification settings - Fork 0
/
run1.py
77 lines (60 loc) · 2.18 KB
/
run1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
'''
This script records the generation of the first full-data run of the
ftrl-proximal model using data pulled by avito2_io.rolling_join().
This only runs one epoch (~200M rows).
author: David Thaler
date: July 2015
'''
from avito2_io import rolling_join
from avito2_io import SUBMIT
from hash_features import hash_features
from ftrl_proximal import ftrl_proximal
from datetime import datetime
from math import log
from eval import logloss
import os.path
import pdb
# Index of this run's submission; it is substituted into the output file name.
SUBMIT_NUM = 1
# Path of the predictions CSV, built under SUBMIT (imported from avito2_io).
_submission_name = 'submission%d.csv' % SUBMIT_NUM
submission = os.path.join(SUBMIT, _submission_name)
# Hyper-parameters handed to ftrl_proximal(); the values are unchanged.
alpha = 0.1          # learning rate
beta = 1.0           # smoothing parameter; probably doesn't matter on big data
L1 = 0.0             # L1 regularization strength
L2 = 0.1             # L2 regularization strength
D = 1 << 26          # feature space size (2**26)
interaction = False  # passed through to ftrl_proximal as-is
# Row cap checked by the training loop; None means no cap.
maxlines = None
# Wall-clock start, used for the elapsed-time report at the end of the run.
start = datetime.now()


def _column(name):
    '''Return an extractor that reads field `name` from a row unchanged.'''
    return lambda row: row[name]


def _is_nonempty(name):
    '''Return an extractor giving 1 if field `name` has length > 0, else 0.'''
    return lambda row: int(len(row[name]) > 0)


def _log_ctr_bucket(row):
    '''Bucket HistCTR: natural log, rounded to 1 decimal, multiplied by -10.'''
    return -10 * round(log(float(row['HistCTR'])), 1)


# Feature name -> extractor; passed as the second argument to rolling_join().
train_etl = {
    'ad': _column('AdID'),
    'pos': _column('Position'),
    'log_ctr': _log_ctr_bucket,
}

# Feature name -> extractor; passed as the third argument to rolling_join().
search_etl = {
    'user': _column('UserID'),
    'category': _column('CategoryID'),
    'location': _column('LocationID'),
    'logon': _column('IsUserLoggedOn'),
    'SPexists': _is_nonempty('SearchParams'),
    'SQexists': _is_nonempty('SearchQuery'),
}
# Training pass: stream the rows once, predicting on each row before the
# model is updated with its label.
# The first argument to rolling_join is True here (the original note calls
# it use_train=True).
train_rows = rolling_join(True, train_etl, search_etl)
model = ftrl_proximal(alpha, beta, L1, L2, D, interaction)
for k, (x, y) in enumerate(train_rows):
    # Stop before row number `maxlines` so at most `maxlines` rows are
    # trained on. The explicit None test is needed because on Python 2
    # any integer compares >= None.
    if maxlines is not None and k >= maxlines:
        break
    f = hash_features(x, D)
    p = model.predict(f)
    model.update(f, p, y)
    # Progress report every million rows; the parenthesised single-argument
    # form prints the same text on Python 2 and Python 3.
    if (k + 1) % 1000000 == 0:
        print('processed %d lines' % (k + 1))
print('finished training')
# Scoring pass: the first argument to rolling_join is False here (the
# original note calls it use_train=False).
# The row ID is added to the extracted fields so it can be written out.
train_etl['id'] = (lambda row: row['ID'])
test_rows = rolling_join(False, train_etl, search_etl)
# The with-block closes (and flushes) the submission file even if scoring
# raises part-way through.
with open(submission, 'w') as outfile:
    outfile.write('ID,IsClick\n')
    # The second element of each pair is just 0 here, and it isn't used.
    for k, (x, _) in enumerate(test_rows):
        # NOTE(review): x now also carries 'id' when it reaches
        # hash_features -- confirm that is intended or ignored there.
        f = hash_features(x, D)
        p = model.predict(f)
        outfile.write('%s,%s\n' % (x['id'], str(p)))
        if (k + 1) % 1000000 == 0:
            print('processed %d lines' % (k + 1))
print('elapsed time: %s' % (datetime.now() - start))