コード例 #1
0
ファイル: vw_io.py プロジェクト: Keesiu/meta-kaggle
def join(tss, si, delimiter=','):
    '''
    Stream-join two gzipped, delimiter-separated files on SearchID,
    yielding one merged dict per train/test row, enriched with the
    ad-context, location and category lookup maps.

    NB: SearchID in tss and si are strings of int.
        The IDs in ctx, loc and cat are ints.

    Parameters
    ----------
    tss : str
        Path to the gzipped train/test stream, sorted by SearchID.
    si : str
        Path to the gzipped SearchInfo file, sorted by SearchID.
    delimiter : str
        Field delimiter for both files (default ',').

    Yields
    ------
    dict
        The train/test row updated with context, search-info,
        location and category fields.
    '''
    ctx = avito2_io.get_artifact('context_ads_map.pkl')
    loc = avito2_io.get_artifact('location_map.pkl')
    cat = avito2_io.get_artifact('cat_map.pkl')
    # Open in text mode: gzip.open defaults to 'rb', which yields bytes,
    # but csv.DictReader requires str lines in Python 3.
    with gzip.open(si, 'rt') as f_si, gzip.open(tss, 'rt') as f_t:
        read_t = csv.DictReader(f_t, delimiter=delimiter)
        read_si = csv.DictReader(f_si, delimiter=delimiter)
        si_line = next(read_si)
        for (k, t_line) in enumerate(read_t):
            search_id = t_line['SearchID']
            # Both files are sorted by SearchID; advance the search-info
            # stream until it catches up with the current train row.
            # NOTE(review): if si is exhausted first, next() raises here.
            while search_id != si_line['SearchID']:
                si_line = next(read_si)
            # Now the SearchID's match.
            # NB: ad before si, so si overwrites ad.CategoryID
            ad_id = int(t_line['AdID'])
            t_line.update(ctx[ad_id])
            t_line.update(si_line)
            loc_id = int(si_line['LocationID'])
            t_line.update(loc[loc_id])
            cat_id = int(si_line['CategoryID'])
            t_line.update(cat[cat_id])
            yield t_line
コード例 #2
0
def join(tss, si, delimiter=','):
  '''
  Merge each row of the gzipped train/test stream (tss) with its
  matching SearchInfo row (si), plus the ad-context, location and
  category lookup maps, yielding one merged dict per tss row.

  NB: SearchID in tss and si are strings of int.
      The IDs in ctx, loc and cat are ints.
  '''
  ctx = avito2_io.get_artifact('context_ads_map.pkl')
  loc = avito2_io.get_artifact('location_map.pkl')
  cat = avito2_io.get_artifact('cat_map.pkl')
  with gzip.open(si) as f_si:
    with gzip.open(tss) as f_t:
      t_reader = csv.DictReader(f_t, delimiter=delimiter)
      si_reader = csv.DictReader(f_si, delimiter=delimiter)
      si_row = si_reader.next()
      for k, t_row in enumerate(t_reader):
        # advance the (sorted) search-info stream until it matches
        while t_row['SearchID'] != si_row['SearchID']:
          si_row = si_reader.next()
        # NB: ad before si overwrites ad.CategoryID
        t_row.update(ctx[int(t_row['AdID'])])
        t_row.update(si_row)
        t_row.update(loc[int(si_row['LocationID'])])
        t_row.update(cat[int(si_row['CategoryID'])])
        yield t_row
コード例 #3
0
ファイル: sframes.py プロジェクト: Keesiu/meta-kaggle
def search_val():
  '''
  Filter search.gl (the SFrame holding SearchInfo.tsv) down to the
  rows whose SearchID belongs to the validation set, and save the
  result as search_val.gl under GL_DATA.
  '''
  t0 = datetime.now()
  val_ids = avito2_io.get_artifact('full_val_set.pkl')
  search = load('search.gl')
  # boolean mask: True where the row belongs to the validation set
  mask = search['SearchID'].apply(lambda sid: sid in val_ids)
  filtered = search[mask]
  filtered.save(os.path.join(GL_DATA, 'search_val.gl'))
  print('elapsed time: %s' % (datetime.now() - t0))
コード例 #4
0
ファイル: sframes.py プロジェクト: Keesiu/meta-kaggle
def val_context():
  '''
  Filter the rows of train_context.gl to just those rows that are in
  the validation set, saving the result as val_context.gl under
  GL_DATA. (train_context() has to be run first.)
  '''
  start = datetime.now()
  val_ids = avito2_io.get_artifact('full_val_set.pkl')
  tr = load('train_context.gl')
  # renamed the lambda parameter: the original shadowed the builtin `id`
  idx = tr['SearchID'].apply(lambda sid : sid in val_ids)
  val = tr[idx]
  path = os.path.join(GL_DATA, 'val_context.gl')
  val.save(path)
  print('elapsed time: %s' % (datetime.now() - start))
コード例 #5
0
def search_val():
  '''
  This function filters the rows of search.gl (the SFrame containing
  SearchInfo.tsv) to just the rows used in the validation set.

  Saves the filtered SFrame to search_val.gl under GL_DATA.
  NOTE(review): Python 2 code (print statement).
  '''
  start = datetime.now()
  # set of validation SearchIDs
  val_ids = avito2_io.get_artifact('full_val_set.pkl')
  si = load('search.gl')
  # boolean mask: True where the row's SearchID is in the validation set
  idx = si['SearchID'].apply(lambda x : x in val_ids)
  si_val = si[idx]
  path = os.path.join(GL_DATA, 'search_val.gl')
  si_val.save(path)
  print 'elapsed time: %s' % (datetime.now() - start)
コード例 #6
0
def val_context():
  '''
  This function filters the rows of train_context.gl to just those rows 
  that are in the validation set(train_context() has to be run first).

  Saves the filtered SFrame to val_context.gl under GL_DATA.
  NOTE(review): Python 2 code (print statement).
  '''
  start = datetime.now()
  # set of validation SearchIDs
  val_ids = avito2_io.get_artifact('full_val_set.pkl')
  tr = load('train_context.gl')
  # boolean mask over rows whose SearchID is in the validation set
  idx = tr['SearchID'].apply(lambda id : id in val_ids)
  val = tr[idx]
  path = os.path.join(GL_DATA, 'val_context.gl')
  val.save(path)
  print 'elapsed time: %s' % (datetime.now() - start)
コード例 #7
0
ファイル: sframes.py プロジェクト: Keesiu/meta-kaggle
def build_combo():
  '''
  Builds a combo SFrame, with test and train, joined to search,
  sorted by date, with some of the features added.

  Reads train_context.gl, test_context.gl, search.gl and train_ds.gl;
  writes the result to combo.gl under GL_DATA.
  '''
  start = datetime.now()
  print('concatenating train_context.gl and test_context.gl')
  tr = load('train_context.gl')
  test = load('test_context.gl')
  # flag each row's origin and fill in the columns the other set lacks
  tr['isTest'] = 0
  test['isTest'] = 1
  tr['ID'] = -1
  test['IsClick'] = -1
  both = tr.append(test)
  # bucket HistCTR on a coarse log scale (1 decimal place)
  both['HistCTR'] = both['HistCTR'].apply(lambda x : round(log(x), 1))

  print('modifying search.gl')
  si = load('search.gl')
  ds = load('train_ds.gl')
  ds_ids = set(ds['SearchID'])
  val_ids = avito2_io.get_artifact('full_val_set.pkl')

  # membership flags for the downsampled-train and validation splits;
  # lambda parameter renamed so it no longer shadows the builtin `id`
  si['isDS'] = si['SearchID'].apply(lambda sid : sid in ds_ids)
  si['isVal'] = si['SearchID'].apply(lambda sid : sid in val_ids)
  print('converting datetimes')
  si['dt'] = si['SearchDate'].str_to_datetime()
  # produces a 0-based running day (0-25) from 4/25 to 5/20
  si['runDay'] = si['dt'].apply(lambda dt : (dt.month - 4) * 30 + dt.day - 25)
  del si['SearchDate']
  # indicator features: query non-empty, params present, and their product
  si['sqe'] = si['SearchQuery'].apply(lambda sq : len(sq) > 0)
  si['spe'] = si['SearchParams'].apply(lambda sp : sp is not None)
  si['spsq'] = si['sqe'] * si['spe']
  # category crossed with the indicators (0.1 offset encodes the flag)
  si['spe_cat'] = si['CategoryID'] + 0.1 * si['spe']
  si['sqe_cat'] = si['CategoryID'] + 0.1 * si['sqe']
  si['sq_len'] = si['SearchQuery'].apply(lambda x : len(x)/3)

  print('joining')
  combo = si.join(both)
  # indicators crossed with ad position
  combo['cat_pos'] = combo['CategoryID'] + 0.1 * combo['Position']
  combo['sqe_pos'] = combo['sqe'] + 0.1 * combo['Position']
  combo['spe_pos'] = combo['spe'] + 0.1 * combo['Position']

  print('sorting')
  combo = combo.sort('dt')
  print('saving')
  path = os.path.join(GL_DATA, 'combo.gl')
  combo.save(path)
  print('elapsed time: %s' % (datetime.now() - start))
コード例 #8
0
def build_combo():
  '''
  Builds a combo SFrame, with test and train, joined to search,
  sorted by date, with some of the features added.

  Reads train_context.gl, test_context.gl, search.gl and train_ds.gl;
  writes the result to combo.gl under GL_DATA.
  NOTE(review): Python 2 code (print statements).
  '''
  start = datetime.now()
  print 'concatenating train_context.gl and test_context.gl'
  tr = load('train_context.gl')
  test = load('test_context.gl')
  # flag each row's origin and fill in the columns the other set lacks
  tr['isTest'] = 0
  test['isTest'] = 1
  tr['ID'] = -1
  test['IsClick'] = -1
  both = tr.append(test)
  # bucket HistCTR on a coarse log scale (1 decimal place)
  both['HistCTR'] = both['HistCTR'].apply(lambda x : round(log(x), 1))
  
  print 'modifying search.gl'
  si = load('search.gl')
  ds = load('train_ds.gl')
  ds_ids = set(ds['SearchID'])
  val_ids = avito2_io.get_artifact('full_val_set.pkl')
  
  # membership flags for the downsampled-train and validation splits
  si['isDS'] = si['SearchID'].apply(lambda id : id in ds_ids)
  si['isVal'] = si['SearchID'].apply(lambda id : id in val_ids)
  print 'converting datetimes'
  si['dt'] = si['SearchDate'].str_to_datetime()
  # produces a 0-based running day (0-25) from 4/25 to 5/20
  si['runDay'] = si['dt'].apply(lambda dt : (dt.month - 4) * 30 + dt.day - 25)
  del si['SearchDate']
  # indicator features: query non-empty, params present, and their product
  si['sqe'] = si['SearchQuery'].apply(lambda sq : len(sq) > 0)
  si['spe'] = si['SearchParams'].apply(lambda sp : sp is not None)
  si['spsq'] = si['sqe'] * si['spe']
  # category crossed with the indicators (0.1 offset encodes the flag)
  si['spe_cat'] = si['CategoryID'] + 0.1 * si['spe']
  si['sqe_cat'] = si['CategoryID'] + 0.1 * si['sqe']
  si['sq_len'] = si['SearchQuery'].apply(lambda x : len(x)/3)
  
  print 'joining'
  combo = si.join(both)
  # indicators crossed with ad position
  combo['cat_pos'] = combo['CategoryID'] + 0.1 * combo['Position']
  combo['sqe_pos'] = combo['sqe'] + 0.1 * combo['Position']
  combo['spe_pos'] = combo['spe'] + 0.1 * combo['Position']
  
  print 'sorting'
  combo = combo.sort('dt')
  print 'saving'
  path = os.path.join(GL_DATA, 'combo.gl')
  combo.save(path)
  print 'elapsed time: %s' % (datetime.now() - start) 
コード例 #9
0
ファイル: sframes.py プロジェクト: Keesiu/meta-kaggle
def train_ds(p=0.05):
  '''
  Downsample train_context.gl: keep every positive (clicked) row,
  keep negatives with probability p, and drop any row whose SearchID
  is in the validation set. Saves the result as train_ds.gl.
  '''
  t0 = datetime.now()
  val_ids = avito2_io.get_artifact('full_val_set.pkl')
  full = load('train_context.gl')
  # mask is 1 when random() < p, else the label itself — so clicked
  # rows always pass, non-clicks survive with probability p
  keep = full['IsClick'].apply(lambda y : 1 if random() < p else y)
  sampled = full[keep]
  # then exclude anything belonging to the validation set
  not_val = sampled['SearchID'].apply(lambda sid : sid not in val_ids)
  downsampled = sampled[not_val]
  downsampled.save(os.path.join(GL_DATA, 'train_ds.gl'))
  print('elapsed time: %s' % (datetime.now() - t0))
コード例 #10
0
def train_ds(p=0.05):
  '''
  Filters train_context.gl such that all of the positive rows are kept,
  but the negatives are selected with probability p. Also removes any 
  rows that are in the validation set.

  p : probability of keeping a negative (non-click) row.
  Saves the result to train_ds.gl under GL_DATA.
  NOTE(review): Python 2 code (print statement).
  '''
  start = datetime.now()
  val_ids = avito2_io.get_artifact('full_val_set.pkl')
  tr1 = load('train_context.gl')
  # mask is 1 when random() < p, else the label itself — so clicked
  # rows always pass, non-clicks survive with probability p
  idx1 = tr1['IsClick'].apply(lambda x : 1 if random() < p else x)
  tr2 = tr1[idx1]
  # then exclude anything belonging to the validation set
  idx2 = tr2['SearchID'].apply(lambda x : x not in val_ids)
  tr_ds = tr2[idx2]
  path = os.path.join(GL_DATA, 'train_ds.gl')
  tr_ds.save(path)
  print 'elapsed time: %s' % (datetime.now() - start)
コード例 #11
0
def run_val(alpha, l2, l1, maxlines, interact):
  '''
  Train an FTRL-proximal model on the raw train file, skipping rows in
  the validation set, then make a second pass scoring only validation
  rows and print the mean log loss.

  alpha    : FTRL learning-rate parameter
  l2, l1   : regularization strengths
  maxlines : stop each pass after this many input lines
  interact : whether the model uses feature interactions

  NOTE(review): Python 2 code (print statements). `beta` and `D` are
  free variables here — presumably module-level constants; confirm.
  '''
  val_ids = avito2_io.get_artifact('full_val_set.pkl')
  model = ftrl_proximal(alpha, beta, l1, l2, D, interact)
  train_path = os.path.join(avito2_io.PROCESSED, TRAIN_INFILE)
  # pass 1: train on every row NOT in the validation set
  with open(train_path) as train_file:
    input = csv.DictReader(train_file)
    for (k, x) in enumerate(input):
      if int(x['SearchID']) not in val_ids:
        y = float(x['IsClick'])
        # strip the label and join keys before hashing the features
        del x['IsClick']
        del x['SearchDate']
        del x['SearchID']
        f = hash_features(x, D)
        p = model.predict(f)
        model.update(f, p, y)
      if k == maxlines:
        break
      if (k + 1) % 1000000 == 0:
        print 'processed %d lines' % (k + 1)
  print 'finished training'
  count = 0
  loss = 0.0
  # pass 2: score only the validation rows and accumulate log loss
  with open(train_path) as train_file:
    input = csv.DictReader(train_file)
    for (k, x) in enumerate(input):
      if int(x['SearchID']) in val_ids:
        count += 1
        y = float(x['IsClick'])
        del x['IsClick']
        del x['SearchDate']
        del x['SearchID']
        f = hash_features(x, D)
        p = model.predict(f)
        loss += logloss(p, y)
      if k == maxlines:
        break
      if (k + 1) % 1000000 == 0:
        print 'processed %d lines of raw train on validation pass' % (k + 1)
  print 'validation loss: %.5f on %d rows' % (loss/count, count)
コード例 #12
0
ファイル: val_run0.py プロジェクト: Keesiu/meta-kaggle
'''
This script gets log loss on the validation set from full_val_set.pkl, 
(generated by the full_validation_set.py script) for some simple, 
no-learning models like the HistCTR, all 0's, or mean-value benchmark.

author: David Thaler
date: July 2015
'''
import avito2_io
from datetime import datetime
from eval import logloss

maxlines_val = None  # presumably a cap on validation lines; None = use all — TODO confirm

start = datetime.now()
# set of validation SearchIDs (built by full_validation_set.py)
val_ids = avito2_io.get_artifact('full_val_set.pkl')
print 'validation set ids read'
# feature extractors applied to each train row ...
train_etl = {
    'ad': lambda l: l['AdID'],
    'pos': lambda l: l['Position'],
    'ctr': lambda l: l['HistCTR']
}
# ... and to each joined SearchInfo row
search_etl = {'cat': lambda l: l['CategoryID']}
# validation run
input = avito2_io.rolling_join(True,
                               train_etl,
                               search_etl,
                               do_validation=True,
                               val_ids=val_ids)
loss = 0.0
for (k, (x, y)) in enumerate(input):
コード例 #13
0
ファイル: run_model.py プロジェクト: Keesiu/meta-kaggle
def build_user_dict():
    """Return user counts merged with (and overridden by) user_dict.pkl."""
    merged = avito2_io.get_artifact('user_counts.pkl')
    merged.update(avito2_io.get_artifact('user_dict.pkl'))
    return merged
コード例 #14
0
 # command-line options for the training / validation run
 parser.add_argument('-n', '--maxlines_val',type=int, default=None,
       help='A max # lines for validation, if none, all data is used.')
 parser.add_argument('-s', '--sub', type=str,
       help='Do test and write results at submissions/submission<sub>.csv')
 parser.add_argument('-u', '--users', type=str, default=None,
       help="None, 'counts' or 'full' - what user data to use")
 parser.add_argument('-a','--all', action='store_const', default=False, 
       const=True, help='Full training run; use all training data.')
 parser.add_argument('-p', '--passes',type=int, default=1,
       help='# of passes over training data.')
 args = parser.parse_args()
 # choose which user-feature artifact to load (None if not requested)
 if args.users=='full':
   users = build_user_dict()
   print 'loading full user data'
 elif args.users=='counts':
   users = avito2_io.get_artifact('user_counts.pkl')
   print 'loading user counts only from user_counts.pkl'
 elif args.users == 'si':
   users = avito2_io.get_artifact('user_si.pkl')
   print 'loading user dict from user_si.pkl'
 else:
   users = None
 # hashed feature-space size (2**bits)
 D = 2**args.bits
 # --all trains on the full data; otherwise use the downsampled SFrames
 if args.all:
   tr = sframes.load('train_context.gl')
   si = sframes.load('search.gl')
   if not args.sub:
     raise Warning('--all without --sub is not sensible.')
 else:
   tr = sframes.load('train_ds.gl')
   si = sframes.load('search_ds.gl') 
コード例 #15
0
# feature extractors applied to each SearchInfo row (keyed by output name)
search_etl = {'user'    : (lambda l : l['UserID']),
              'category': (lambda l : l['CategoryID']),
              'location': (lambda l : l['LocationID']),
              'logon'   : (lambda l : l['IsUserLoggedOn']),
              'SPexists': (lambda l : int(len(l['SearchParams']) > 0)),
              'SQexists': (lambda l : int(len(l['SearchQuery']) > 0))}
              
# extractors over records from context_ads.pkl; l[1] appears to be the
# price and l[0] the ad category — TODO confirm against the artifact
ads_etl ={'price'  : lambda l : ceil(float(l[1])/100.),
          'ad_cat' : lambda l : l[0]}

# cut:
# 'params' : lambda l : len(l[3]),
# 'title'  : lambda l : len(l[2]),

# use_train = True
val_ids = avito2_io.get_artifact('full_val_set.pkl')
ads = avito2_io.get_artifact('context_ads.pkl')
# stream of (features, label) pairs joined with the ads artifact
input = avito2_io.join_with_ads(True, 
                                ads,
                                train_etl, 
                                search_etl, 
                                ads_etl,
                                do_validation=False, 
                                val_ids=val_ids)
model = ftrl_proximal(alpha, beta, L1, L2, D, interaction)

for (k, (x, y)) in enumerate(input):
  f = hash_features(x, D)
  p = model.predict(f)
  model.update(f, p, y)
  if k == maxlines_train:
コード例 #16
0
    'logon': (lambda l: l['IsUserLoggedOn']),
    'SPexists': (lambda l: int(len(l['SearchParams']) > 0)),
    'SQexists': (lambda l: int(len(l['SearchQuery']) > 0))
}

# extractors over records from context_ads.pkl; l[1] appears to be the
# price and l[0] the ad category — TODO confirm against the artifact
ads_etl = {
    'price': lambda l: ceil(float(l[1]) / 100.),
    'ad_cat': lambda l: l[0]
}

# cut:
# 'params' : lambda l : len(l[3]),
# 'title'  : lambda l : len(l[2]),

# use_train = True
val_ids = avito2_io.get_artifact('full_val_set.pkl')
ads = avito2_io.get_artifact('context_ads.pkl')
print 'small objects loaded'
# stream of (features, label) pairs joined with the ads artifact
input = avito2_io.join_with_ads(True,
                                ads,
                                train_etl,
                                search_etl,
                                ads_etl,
                                do_validation=False,
                                val_ids=val_ids)
model = ftrl_proximal(alpha, beta, L1, L2, D, interaction)

# total count is just k + 1
# accumulators for the training loop (continued below this excerpt)
total_y = 0.0
sample_ct = 0.0
コード例 #17
0
'''
This script gets log loss on the validation set from full_val_set.pkl, 
(generated by the full_validation_set.py script) for some simple, 
no-learning models like the HistCTR, all 0's, or mean-value benchmark.

author: David Thaler
date: July 2015
'''
import avito2_io
from datetime import datetime
from eval import logloss

maxlines_val = None  # presumably a cap on validation lines; None = use all — TODO confirm

start = datetime.now()
# set of validation SearchIDs (built by full_validation_set.py)
val_ids = avito2_io.get_artifact('full_val_set.pkl')
print 'validation set ids read'
# per-row feature extractors for the train and SearchInfo streams
train_etl = {'ad'      : lambda l : l['AdID'],
             'pos'     : lambda l : l['Position'],
             'ctr'     : lambda l : l['HistCTR']}
search_etl = {'cat'    : lambda l : l['CategoryID']}
# validation run
input = avito2_io.rolling_join(True, 
                               train_etl, 
                               search_etl, 
                               do_validation=True, 
                               val_ids=val_ids)
loss = 0.0
for (k, (x, y)) in enumerate(input):
  #loss += logloss(float(x['ctr']), y)
  loss += logloss(0.006, y)
コード例 #18
0
def build_user_dict():
  '''Return the user-counts dict updated (overridden) with user_dict.pkl.'''
  combined = avito2_io.get_artifact('user_counts.pkl')
  combined.update(avito2_io.get_artifact('user_dict.pkl'))
  return combined
コード例 #19
0
ファイル: run_model.py プロジェクト: Keesiu/meta-kaggle
                     '--all',
                     action='store_const',
                     default=False,
                     const=True,
                     help='Full training run; use all training data.')
 parser.add_argument('-p',
                     '--passes',
                     type=int,
                     default=1,
                     help='# of passes over training data.')
 args = parser.parse_args()
 # choose which user-feature artifact to load (None if not requested)
 if args.users == 'full':
     users = build_user_dict()
     print('loading full user data')
 elif args.users == 'counts':
     users = avito2_io.get_artifact('user_counts.pkl')
     print('loading user counts only from user_counts.pkl')
 elif args.users == 'si':
     users = avito2_io.get_artifact('user_si.pkl')
     print('loading user dict from user_si.pkl')
 else:
     users = None
 # hashed feature-space size (2**bits)
 D = 2**args.bits
 # --all trains on the full data; otherwise use the downsampled SFrames
 if args.all:
     tr = sframes.load('train_context.gl')
     si = sframes.load('search.gl')
     if not args.sub:
         raise Warning('--all without --sub is not sensible.')
 else:
     tr = sframes.load('train_ds.gl')
     si = sframes.load('search_ds.gl')