Ejemplo n.º 1
0
def full_train():
    '''
    Trains a model on the full combined training data and returns it.

    Loads combo.gl and extras.gl, appends the columns of extras to combo,
    runs prepareSFrame on the result, then feeds it chunk by chunk through
    select_train into train.

    returns:
      whatever train() returns for the selected training chunks
    '''
    combo = sframes.load('combo.gl')
    extras = sframes.load('extras.gl')
    # Assuming sframes.load returns a GraphLab SFrame: the method is spelled
    # add_columns; the previous 'add_columms' does not exist on SFrame.
    # Rebinding the result is correct whether add_columns mutates in place
    # and returns the same frame, or returns a new frame.
    combo = combo.add_columns(extras)
    prepareSFrame(combo)
    cit = chunk_iterator(combo)
    # NOTE(review): the meaning of the True flag is defined by select_train,
    # which is not visible here - confirm before changing it.
    sit = select_train(cit, True)
    model = train(sit)
    return model
Ejemplo n.º 2
0
def full_train():
  '''
  Trains a model on the full combined training data and returns it.

  Loads combo.gl and extras.gl, appends the columns of extras to combo,
  runs prepareSFrame on the result, then feeds it chunk by chunk through
  select_train into train.

  returns:
    whatever train() returns for the selected training chunks
  '''
  combo = sframes.load('combo.gl')
  extras = sframes.load('extras.gl')
  # Assuming sframes.load returns a GraphLab SFrame: the method is spelled
  # add_columns; the previous 'add_columms' does not exist on SFrame.
  # Rebinding the result is correct whether add_columns mutates in place
  # and returns the same frame, or returns a new frame.
  combo = combo.add_columns(extras)
  prepareSFrame(combo)
  cit = chunk_iterator(combo)
  # NOTE(review): the meaning of the True flag is defined by select_train,
  # which is not visible here - confirm before changing it.
  sit = select_train(cit, True)
  model = train(sit)
  return model
Ejemplo n.º 3
0
def get_context_ads():
  '''
  Returns the contextual ads from ads.gl (all of AdsInfo.tsv).

  Only rows whose IsContext value is truthy are kept, and the IsContext and
  LocationID columns are removed from the result.
  '''
  all_ads = sframes.load('ads.gl')
  context_only = all_ads[all_ads['IsContext']]
  # Drop the filter column itself along with LocationID.
  for column in ('IsContext', 'LocationID'):
    del context_only[column]
  return context_only
Ejemplo n.º 4
0
def basic_join(tss, si, user):
  '''
  A generator that performs a rolling join over Graphlab SFrames tss, which
  stores data from train/testSearchStream.tsv and si, which is from 
  SearchInfo.tsv. SFrame context_ads.gl, which has the contextual ads from 
  AdsInfo.tsv, is loaded and joined in. User features are joined in from the
  user argument; nothing user-related is loaded here.
  
  si is only ever scanned forward, so the rows of tss must appear in the same
  SearchID order as the rows of si.
  
  args:
    tss - an SFrame with data from trainSearchStream or testSearchStream, 
        including samples or validation sets
    si - an SFrame with data from SearchInfo. Must have all of the SearchIDs
        in tss, but it can be a sample
    user -  dict or None. A dict from UserID to a dict of features for 
        that user. Caller should construct this if used.
        
  generates:
    a dict that combines all of the fields from tss, si and ads for a row,
    plus the user features when user is given and has the row's UserID.
    Generation stops quietly if si runs out before a matching SearchID 
    is found (or if si is empty).
    
  raises:
    KeyError if a row's AdID is not in context_ads.gl
  '''
  ctx = sframes.load('context_ads.gl')
  ctx = sframes.sframe_to_dict('AdID', ctx)
  si_it = iter(si)
  # The next() builtin works on both Python 2.6+ and Python 3; the default
  # keeps the end-of-si behaviour (stop generating) explicit.
  si_line = next(si_it, None)
  if si_line is None:
    return
  for tss_line in tss:
    search_id = tss_line['SearchID']
    ad_id = tss_line['AdID']
    while search_id != si_line['SearchID']:
      si_line = next(si_it, None)
      if si_line is None:
        return
    # Now the SearchIDs match. UserID must be read only at this point:
    # reading it before the loop above picked up the user of the previous 
    # search whenever si_line had to advance.
    user_id = si_line['UserID']
    tss_line.update(ctx[ad_id])
    # SearchInfo.CategoryID overwrites AdInfo.CategoryID in this line
    tss_line.update(si_line)
    if user is not None and user_id in user:
      tss_line.update(user[user_id])
    yield tss_line
Ejemplo n.º 5
0
def basic_join(tss, si, user):
    '''
    A generator that performs a rolling join over Graphlab SFrames tss, which
    stores data from train/testSearchStream.tsv and si, which is from
    SearchInfo.tsv. SFrame context_ads.gl, which has the contextual ads from
    AdsInfo.tsv, is loaded and joined in. User features are joined in from
    the user argument; nothing user-related is loaded here.

    si is only ever scanned forward, so the rows of tss must appear in the
    same SearchID order as the rows of si.

    args:
      tss - an SFrame with data from trainSearchStream or testSearchStream,
          including samples or validation sets
      si - an SFrame with data from SearchInfo. Must have all of the
          SearchIDs in tss, but it can be a sample
      user -  dict or None. A dict from UserID to a dict of features for
          that user. Caller should construct this if used.

    generates:
      a dict that combines all of the fields from tss, si and ads for a row,
      plus the user features when user is given and has the row's UserID.
      Generation stops quietly if si runs out before a matching SearchID
      is found (or if si is empty).

    raises:
      KeyError if a row's AdID is not in context_ads.gl
    '''
    ctx = sframes.load('context_ads.gl')
    ctx = sframes.sframe_to_dict('AdID', ctx)
    si_it = iter(si)
    # The next() builtin works on both Python 2.6+ and Python 3; the default
    # keeps the end-of-si behaviour (stop generating) explicit.
    si_line = next(si_it, None)
    if si_line is None:
        return
    for tss_line in tss:
        search_id = tss_line['SearchID']
        ad_id = tss_line['AdID']
        while search_id != si_line['SearchID']:
            si_line = next(si_it, None)
            if si_line is None:
                return
        # Now the SearchIDs match. UserID must be read only at this point:
        # reading it before the loop above picked up the user of the
        # previous search whenever si_line had to advance.
        user_id = si_line['UserID']
        tss_line.update(ctx[ad_id])
        # SearchInfo.CategoryID overwrites AdInfo.CategoryID in this line
        tss_line.update(si_line)
        if user is not None and user_id in user:
            tss_line.update(user[user_id])
        yield tss_line
Ejemplo n.º 6
0
       help='# of passes over training data.')
 args = parser.parse_args()
 if args.users=='full':
   users = build_user_dict()
   print 'loading full user data'
 elif args.users=='counts':
   users = avito2_io.get_artifact('user_counts.pkl')
   print 'loading user counts only from user_counts.pkl'
 elif args.users == 'si':
   users = avito2_io.get_artifact('user_si.pkl')
   print 'loading user dict from user_si.pkl'
 else:
   users = None
 D = 2**args.bits
 if args.all:
   tr = sframes.load('train_context.gl')
   si = sframes.load('search.gl')
   if not args.sub:
     raise Warning('--all without --sub is not sensible.')
 else:
   tr = sframes.load('train_ds.gl')
   si = sframes.load('search_ds.gl') 
 # no interactions; it'd take days
 model = train(tr, 
               si, 
               args.alpha, 
               args.beta, 
               args.l1, 
               args.l2, 
               D, 
               users,
Ejemplo n.º 7
0
def features2():
  '''
  This function implements and records the construction of the second set of 
  graphlab sframe features. These include all of the integer-valued raw and
  lightly processed features from train/test, SearchInfo, Category, Location,
  AdsInfo and UserInfo. Only contextual ads are considered.
  
  Takes no arguments and returns nothing. It writes gl_train2.csv (sorted by
  SearchDate, SearchID, AdID) and gl_test2.csv (sorted by ID) under
  avito2_io.PROCESSED, printing progress and elapsed time along the way.
  
  NB: This leaves SearchID in the training output (to allow for validation).
      Run script must delete SearchID. SearchID is removed from the test
      output here. SearchDate is left in both outputs.
  '''
  # Every print below uses the single-argument parenthesised form, which
  # prints the same text under Python 2 and also parses under Python 3.
  start = datetime.now()
  print('loading context ads')
  ctx = get_context_ads()
  ctx['LogPrice'] = ctx['Price'].apply(lambda x : round(log(x+1), 1))
  ctx['ParamLen'] = ctx['Params'].apply(lambda d : len(d)).fillna(0)
  ctx['TitleLen'] = ctx['Title'].apply(lambda s : len(s)).fillna(0)
  del ctx['Price']
  del ctx['Title']
  del ctx['Params']
  print('loading users')
  users = sframes.load('user.gl')
  print('loading category and location')
  ctg = sframes.load('category.gl')
  # Admins said this field could be deleted.
  del ctg['SubcategoryID']
  loc = sframes.load('location.gl')
  print('small objects loaded, elapsed time: %s' % (datetime.now() - start))
  
  print('ingesting train.gl')
  tr = sframes.load('train.gl')
  # ObjectType 3 selects the rows kept for this feature set (the contextual 
  # ads, per the docstring above).
  tr = tr[tr['ObjectType'] == 3]
  del tr['ObjectType']
  tr['log_ctr'] = tr['HistCTR'].apply(lambda x : -10 * round(log(x), 1))
  del tr['HistCTR']
  print('train.gl ingested, elapsed time: %s' % (datetime.now() - start))
  
  # SearchDate: I can't decide what to do with it, so I'm leaving it in, 
  # as-is. The run script will have to remove it. This allows sorting by
  # date without doing this huge join again.
  
  print('ingesting search.gl')
  si = sframes.load('search.gl')  
  si['SQexists'] = si['SearchQuery'].apply(lambda x : len(x) > 0)
  del si['SearchQuery']
  si['SPexists'] = (si['SearchParams'].apply(lambda d : int(d is not None))
                                      .fillna(0))
  del si['SearchParams']
  print('search.gl ingested, elapsed time: %s' % (datetime.now() - start))
  
  print('joining user.gl into search.gl')
  si = si.join(users, how='left', on='UserID')
  print('user.gl joined in, elapsed time: %s' % (datetime.now() - start))
  
  print('joining location.gl to search.gl')
  si = si.join(loc, how='left', on='LocationID')
  print('location.gl joined in, elapsed time: %s' % (datetime.now() - start))
  
  print('joining category.gl to search.gl')
  si = si.join(ctg, how='left', on='CategoryID')
  print('category.gl joined in, elapsed time: %s' % (datetime.now() - start))
  
  # join category into context ads and rename to avoid name clash
  print('joining category into ads')
  ctx = ctx.join(ctg, how='left', on='CategoryID')
  ctx.rename({'CategoryID':'AdCat'})
  print('category.gl joined into ads, elapsed time: %s' % (datetime.now() - start))
  
  print('joining context ads into train')
  tr = tr.join(ctx, how='left', on='AdID')
  print('context ads joined into train, elapsed time: %s' % (datetime.now() - start))
  
  print('joining up training set (search and train)...')
  tr = tr.join(si, how='left', on='SearchID')
  print('join completed, elapsed time: %s' % (datetime.now() - start))
  
  # This message was missing its closing quote, which was a SyntaxError.
  print('sorting train by SearchDate, SearchID, AdID')
  tr = tr.sort(['SearchDate', 'SearchID', 'AdID'])
  
  path = os.path.join(avito2_io.PROCESSED, 'gl_train2.csv')
  print('saving training features to %s' % path)
  tr.save(path, format='csv')
  print('training features saved, elapsed time: %s' % (datetime.now() - start))
  
  # test
  
  print('ingesting test.gl')
  test = sframes.load('test.gl')
  test = test[test['ObjectType'] == 3]
  del test['ObjectType']
  test['log_ctr'] = test['HistCTR'].apply(lambda x : -10 * round(log(x), 1))
  del test['HistCTR']
  print('test.gl ingested, elapsed time: %s' % (datetime.now() - start))
  
  print('joining context ads into test')
  test = test.join(ctx, how='left', on='AdID')
  print('context ads joined into test, elapsed time: %s' % (datetime.now() - start))
  
  print('joining up test set...')
  ftest = test.join(si, how='left', on='SearchID')
  # Unlike the training output, the test output does not keep SearchID.
  del ftest['SearchID']                          
  print('join completed, elapsed time: %s' % (datetime.now() - start))
  
  print('sorting test...')
  ftest = ftest.sort('ID')
  
  path = os.path.join(avito2_io.PROCESSED, 'gl_test2.csv')
  print('saving test features to %s' % path)
  ftest.save(path, format='csv')
  print('finished, elapsed time: %s' % (datetime.now() - start))
Ejemplo n.º 8
0
def features1():
  '''
  Builds and saves the first sframe feature set, the same features used in
  the pure python/pypy run1.py. Only SearchInfo and trainSearchStream (plus
  the test stream) are used, restricted to context ads.
  
  Takes no arguments and returns nothing. It writes gl_train1.csv and
  gl_test1.csv under avito2_io.PROCESSED and prints progress with elapsed
  times. SearchID stays in the training output (the run script must delete
  it) and is removed from the test output.
  '''
  began = datetime.now()

  def elapsed():
    # Time since this function started.
    return datetime.now() - began

  def scaled_log_ctr(ctr):
    # -10 times log(HistCTR), with the log rounded to one decimal place.
    return -10 * round(log(ctr), 1)

  def load_stream(name):
    # Load a search stream, keep the ObjectType == 3 rows, and swap the
    # HistCTR column for log_ctr.
    rows = sframes.load(name)
    rows = rows[rows['ObjectType'] == 3]
    del rows['ObjectType']
    rows['log_ctr'] = rows['HistCTR'].apply(scaled_log_ctr)
    del rows['HistCTR']
    return rows

  # trainSearchStream
  print('ingesting train.gl')
  train_rows = load_stream('train.gl')
  print('train.gl ingested, elapsed time: %s' % elapsed())

  # SearchInfo; run1.py used neither the date nor IPID
  print('ingesting search.gl')
  search = sframes.load('search.gl')
  for unused in ('SearchDate', 'IPID'):
    del search[unused]
  search['SQexists'] = search['SearchQuery'].apply(lambda q: len(q) > 0)
  del search['SearchQuery']
  # NB: lambda d : 0 if d is None else len(d) doesn't seem to work
  has_params = search['SearchParams'].apply(lambda d: int(d is not None))
  search['SPexists'] = has_params.fillna(0)
  del search['SearchParams']
  print('search.gl ingested, elapsed time: %s' % elapsed())

  # Training join. NB: due to lazy evaluation, this might not time accurately
  print('joining up training set...')
  train_features = train_rows.join(search, how='left', on='SearchID')
  # SearchID is deliberately kept here: deleting it would make validation
  # impossible. The run script must delete SearchID.
  print('join completed, elapsed time: %s' % elapsed())

  train_path = os.path.join(avito2_io.PROCESSED, 'gl_train1.csv')
  print('saving training features to %s' % train_path)
  train_features.save(train_path, format='csv')
  print('training features saved, elapsed time: %s' % elapsed())

  # Test stream gets the same treatment as train
  print('ingesting test.gl')
  test_rows = load_stream('test.gl')
  print('test.gl ingested, elapsed time: %s' % elapsed())

  print('joining up test set...')
  test_features = test_rows.join(search, how='left', on='SearchID')
  del test_features['SearchID']
  print('join completed, elapsed time: %s' % elapsed())

  test_path = os.path.join(avito2_io.PROCESSED, 'gl_test1.csv')
  print('saving test features to %s' % test_path)
  test_features.save(test_path, format='csv')
  print('finished, elapsed time: %s' % elapsed())
Ejemplo n.º 9
0
def run_test(model):
    '''
    Runs model over the combo_test.gl data and returns the predictions.

    The frame is loaded, passed through prepareSFrame, and handed to
    predict chunk by chunk via chunk_iterator.

    args:
      model - the model passed through to predict()

    returns:
      whatever predict() returns
    '''
    test_frame = sframes.load('combo_test.gl')
    prepareSFrame(test_frame)
    return predict(chunk_iterator(test_frame), model)
Ejemplo n.º 10
0
def run_test(model):
  '''
  Runs model over the combo_test.gl data and returns the predictions.

  The frame is loaded, passed through prepareSFrame, and handed to
  predict chunk by chunk via chunk_iterator.

  args:
    model - the model passed through to predict()

  returns:
    whatever predict() returns
  '''
  test_frame = sframes.load('combo_test.gl')
  prepareSFrame(test_frame)
  chunks = chunk_iterator(test_frame)
  return predict(chunks, model)
Ejemplo n.º 11
0
                        help='# of passes over training data.')
    args = parser.parse_args()
    if args.users == 'full':
        users = build_user_dict()
        print('loading full user data')
    elif args.users == 'counts':
        users = avito2_io.get_artifact('user_counts.pkl')
        print('loading user counts only from user_counts.pkl')
    elif args.users == 'si':
        users = avito2_io.get_artifact('user_si.pkl')
        print('loading user dict from user_si.pkl')
    else:
        users = None
    D = 2**args.bits
    if args.all:
        tr = sframes.load('train_context.gl')
        si = sframes.load('search.gl')
        if not args.sub:
            raise Warning('--all without --sub is not sensible.')
    else:
        tr = sframes.load('train_ds.gl')
        si = sframes.load('search_ds.gl')
    # no interactions; it'd take days
    model = train(tr, si, args.alpha, args.beta, args.l1, args.l2, D, users,
                  False, args.maxlines, args.passes)
    print('finished training')

    if args.all:
        offset = 0.0
    else:
        offset = compute_offset(tr, args.maxlines)