Example #1
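Trains one classifier per spatial grid cell: the training frame is sliced by x/y ranges, optionally normalized and preprocessed, and each fitted model is pickled to disk through a multiprocessing pool because the full set of models is too large to keep in memory. Helpers such as conv, save_model, and pool_size are module-level names from the surrounding project.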
    def train(self, df_train, alg="skrf", mdl_config=None, norm=None):
        mdl_config = mdl_config or {}  # avoid the mutable-default-argument pitfall
        mdl_path = "%s/models/%s" % (self.root, self.stamp)
        os.mkdir(mdl_path)
        print("[Train] start with mdl_config=%s, write models to %s @ %s" %
              (mdl_config, mdl_path, conv.now('full')))

        processes = []
        for x_idx, (x_min, x_max) in enumerate(self.x_ranges):
            x_min, x_max = conv.trim_range(x_min, x_max, self.size)
            df_row = df_train[(df_train.x >= x_min) & (df_train.x < x_max)]
            mp_pool = mp.Pool(pool_size)
            for y_idx, (y_min, y_max) in enumerate(self.y_ranges):
                y_min, y_max = conv.trim_range(y_min, y_max, self.size)
                df_grid = df_row[(df_row.y >= y_min) & (df_row.y < y_max)]

                # normalize for scale-sensitive algorithms
                if norm:
                    df_grid = df_grid.copy()  # write on a copy, not the df_row slice
                    for k, v in norm.items():
                        df_grid[k] = df_grid[k] * v

                # preprocessing
                if self.en_preprocessing:
                    df_grid, cols_extra = conv.df_preprocess(
                        self.en_preprocessing, df_grid, x_idx, y_idx, LOCATION,
                        AVAIL_WDAYS, AVAIL_HOURS, POPULAR, GRID_CANDS)
                    X_train, y_train, _ = conv.df2sample(
                        df_grid, self.x_cols + cols_extra)
                else:
                    X_train, y_train, _ = conv.df2sample(df_grid, self.x_cols)

                # save model (can't stay in memory, too large)
                clf = self.get_alg(alg, mdl_config)
                mdl_name = "%s/models/%s/grid_model_x_%s_y_%s.pkl.gz" % (
                    self.root, self.stamp, x_idx, y_idx)
                p = mp_pool.apply_async(save_model,
                                        (alg, mdl_name, clf, X_train, y_train))
                processes.append(p)
                clf = None  # clear memory
                # cap pending jobs so queued results don't exhaust memory
                while len(processes) > 30:
                    processes.pop(0).get()
            print("[Train] grid(%i,%i): %i samples / %i classes @ %s" %
                  (x_idx, y_idx, len(y_train), len(
                      set(y_train)), conv.now('full')))
            mp_pool.close()
        while processes:
            processes.pop(0).get()
        print("[Train] done, rest processes=%i (should be 0!) @ %s" %
              (len(processes), conv.now('full')))
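The save_model worker handed to apply_async is defined elsewhere in the project; a hypothetical sketch of what it plausibly does, given the .pkl.gz model names (fit inside the child process, then gzip-pickle the model to disk):

import gzip
import pickle

def save_model(alg, mdl_name, clf, X_train, y_train):
    # 'alg' kept only for parity with the caller's argument tuple
    clf.fit(X_train, y_train)
    with gzip.open(mdl_name, 'wb') as f:
        pickle.dump(clf, f)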
Example #2
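Rolling time-based cross-validation for one grid cell: each fold trains on all events up to the cutoff ts and validates on the following 100000-tick window; the function returns the mean of the per-fold scores computed by drill_eva.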
def tree_cv(clf, df_grid, debug=False):
    scores = []
    x_cols = ['hour', 'qday', 'weekday', 'month', 'year', 'logacc', 'x', 'y']
    for ts in np.arange(100000, 700000, 100000):
        df_tr = df_grid[df_grid.time <= ts]
        df_va = df_grid[(df_grid.time > ts) & (df_grid.time < ts + 100000)]
        if debug:
            df_tr = df_tr[:100]
            df_va = df_va[:100]
        X_tr, y_tr, _ = conv.df2sample(df_tr, x_cols)
        X_va, y_va, _ = conv.df2sample(df_va, x_cols)
        clf.fit(X_tr, y_tr)
        # drill_eva performs prediction internally; no separate predict_proba needed
        score, _ = drill_eva(clf, X_va, y_va)
        # print(ts, score)
        scores.append(score)
    avg_score = np.mean(scores)
    return avg_score
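A minimal usage sketch, assuming df_grid carries the listed feature columns plus time and the target, and that conv.df2sample and drill_eva from this project are importable; the classifier choice is illustrative, not part of the original snippet:

from sklearn.ensemble import RandomForestClassifier

# any sklearn-style classifier with fit/predict works here
clf = RandomForestClassifier(n_estimators=500, max_depth=11, n_jobs=-1)
avg_map = tree_cv(clf, df_grid, debug=True)  # debug=True trims each fold to 100 rows
print("average validation score: %.4f" % avg_map)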
Example #3
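A blend-only variant of drill_grid: per-model validation, blend-weight selection, and the single-model fallback are all commented out, so it fits every configured model on the training split and writes only the blended test predictions. Note it builds just the 'tr' and 'te' samples, which is why the commented code referencing Xs['va'] cannot run as written.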
def drill_grid(df_grid, x_cols, xi, yi, grid_submit_path, do_blending=True):
  best_score = 0
  all_score = []
  Xs, ys = {}, {}
  for m in ['tr', 'te']:
    Xs[m], ys[m], row_id = conv.df2sample(df_grid[m], x_cols)
  
  # [grid search best models]
  mdl_configs = [
    {'alg': 'skrf', 'n_estimators': 500, 'max_depth': 11},
    {'alg': 'skrfp', 'n_estimators': 500, 'max_depth': 11},
    {'alg': 'sket', 'n_estimators': 500, 'max_depth': 11},
    {'alg': 'sketp', 'n_estimators': 500, 'max_depth': 11},

    # {'alg': 'skrf', 'n_estimators': 500, 'max_features': 0.35, 'max_depth': 15},
    # {'alg': 'skrfp', 'n_estimators': 500, 'max_features': 0.35, 'max_depth': 15},
    # {'alg': 'sket', 'n_estimators': 800, 'max_features': 0.5, 'max_depth': 15},
    # {'alg': 'sketp', 'n_estimators': 1000, 'max_features': 0.5, 'max_depth': 11},

    # {'alg': 'skrf', 'n_estimators': 1000, 'max_features': 0.4, 'max_depth': 9},
    # {'alg': 'skrfp', 'n_estimators': 1000, 'max_features': 0.4, 'max_depth': 9},
  ]

  # mdl_configs = []
  # for alg in ['skrf', 'skrfp', 'sket', 'sketp']:
  #   for n_estimators in [500]:
  #     for max_features in [0.5]:   # 0.4
  #       for max_depth in [15]:    # 11
  #         mdl_configs.append({'alg': alg, 'n_estimators': n_estimators, 'max_features': max_features, 'max_depth': max_depth}) 
  all_bests = []

  # for mdl_config in mdl_configs:
  #   # train
  #   clf = get_alg(mdl_config['alg'], mdl_config)
  #   clf.fit(Xs['tr'], ys['tr'])
  #   # valid
  #   score, bests = drill_eva(clf, Xs['va'], ys['va'])
  #   print("drill(%i,%i) va_score %.4f for model %s(%s) @ %s" % (xi, yi, score, mdl_config['alg'], mdl_config, conv.now('full')))
  #   if score > best_score:
  #     best_score = score
  #     best_config = mdl_config
  #   # for blending
  #   if do_blending:
  #     all_score.append(score)
  #     all_bests.append(bests)
  
  # # blending
  # if do_blending:
  #   best_bcnt = None
  #   best_blend_score = 0
  #   best_good_idxs = []
  #   for bcnt in [5, 10]:
  #     good_idxs = [k for k,v in sorted(enumerate(all_score), key=lambda v: v[1], reverse=True)][:bcnt]
  #     blended_bests = blending([m for idx, m in enumerate(all_bests) if idx in good_idxs])
  #     blended_match = [apk([ans], vals) for ans, vals in zip(ys['va'], blended_bests)]
  #     blended_score = sum(blended_match)/len(blended_match)
  #     if blended_score > best_blend_score:
  #       best_blend_score = blended_score
  #       best_good_idxs = good_idxs
  #       best_bcnt = bcnt
  #     print("drill(%i,%i) va_score %.4f for model 'blending_%i' @ %s" % (xi, yi, blended_score, bcnt, conv.now('full')))
  

  # train again with full training samples (unused in this blend-only variant)
  # Xs['tr_va'] = pd.concat([Xs['tr'], Xs['va']])
  # ys['tr_va'] = np.append(ys['tr'], ys['va'])

  # always write the blended predictions (in case the best single model overfits)
  all_bt_preds = []
  # for bcfg in [m for idx,m in enumerate(mdl_configs) if idx in best_good_idxs]:
  for bcfg in mdl_configs:
    bmdl = get_alg(bcfg['alg'], bcfg)
    bmdl.fit(Xs['tr'], ys['tr'])
    _, bt_preds = drill_eva(bmdl, Xs['te'], ys['te'])
    all_bt_preds.append(bt_preds)
  blending_test_preds = blending(all_bt_preds)
  blending_test_preds = pd.DataFrame(blending_test_preds)
  blending_test_preds['row_id'] = row_id
  df2submit(blending_test_preds, (grid_submit_path[:-4] + '_blend.csv'))


  # collect results: this variant always returns the blended predictions
  # if do_blending and (best_blend_score > best_score):
  best_score = 1.0  # placeholder; per-model validation is commented out above
  test_preds = blending_test_preds
  #   best_config = "blending_%i" % best_bcnt
  # else:
  #   best_model = get_alg(best_config['alg'], best_config)
  #   best_model.fit(Xs['tr_va'], ys['tr_va'])
  #   _, test_preds = drill_eva(best_model, Xs['te'], ys['te'])
  #   test_preds = pd.DataFrame(test_preds)
  #   test_preds['row_id'] = row_id

  # write partial submit  
  # df2submit(test_preds, grid_submit_path)
  # print("[drill_grid (%i,%i)] choose best_model %s, best_score=%.4f @ %s" % (xi, yi, best_config, best_score, datetime.now()))
  print("[drill_grid (%i,%i)] blended @ %s" % (xi, yi, datetime.now()))
  return best_score, test_preds
Example #4
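A collect-only variant: it trains each configured model, records validation scores and predictions, retrains on train plus valid, and pickles the raw per-grid results to grid_submit_path for offline blending instead of writing a submission directly.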
def drill_grid(df_grid, x_cols, xi, yi, grid_submit_path, do_blending=True):
  best_score = 0
  all_score = []
  Xs, ys = {}, {}
  for m in ['tr', 'va', 'te']:
    Xs[m], ys[m], row_id = conv.df2sample(df_grid[m], x_cols)
  
  # [grid search best models]
  mdl_configs = [
    {'alg': 'skrf', 'n_estimators': 500, 'max_features': 0.35, 'max_depth': 15},
    {'alg': 'skrfp', 'n_estimators': 500, 'max_features': 0.35, 'max_depth': 15},
    # {'alg': 'skrf', 'n_estimators': 500, 'max_features': 0.4, 'max_depth': 15},
    # {'alg': 'skrfp', 'n_estimators': 500, 'max_features': 0.4, 'max_depth': 15},
    # {'alg': 'sket', 'n_estimators': 500, 'max_features': 0.35, 'max_depth': 15},
    # {'alg': 'sketp', 'n_estimators': 500, 'max_features': 0.35, 'max_depth': 11},
    # {'alg': 'skrf', 'n_estimators': 800, 'max_features': 0.35, 'max_depth': 15},
    # {'alg': 'skrfp', 'n_estimators': 800, 'max_features': 0.35, 'max_depth': 15},
    # {'alg': 'sket', 'n_estimators': 1500, 'max_features': 0.35, 'max_depth': 15},
    # {'alg': 'sketp', 'n_estimators': 1500, 'max_features': 0.35, 'max_depth': 11},
  ]
  all_bests = []

  # train & collect valid preds for weight selection
  for mdl_config in mdl_configs:
    # train
    clf = get_alg(mdl_config['alg'], mdl_config)
    clf.fit(Xs['tr'], ys['tr'])
    # valid
    score, bests = drill_eva(clf, Xs['va'], ys['va'])
    all_score.append(score)
    all_bests.append(bests)
    clf = None
    gc.collect()
    print("drill(%i,%i) va_score %.4f for model %s(%s) @ %s" % (xi, yi, score, mdl_config['alg'], mdl_config, conv.now('full')))
    
  # train again with full training samples
  Xs['tr_va'] = pd.concat([Xs['tr'], Xs['va']])
  ys['tr_va'] = np.append(ys['tr'], ys['va'])
  
  # collect test preds
  all_bt_preds = []
  for bcfg in mdl_configs:
    bmdl = get_alg(bcfg['alg'], bcfg)
    bmdl.fit(Xs['tr_va'], ys['tr_va'])
    _, bt_preds = drill_eva(bmdl, Xs['te'], ys['te'])
    all_bt_preds.append(bt_preds)
    bmdl = None
    gc.collect()

  info = {
    'mdl_configs'   : mdl_configs,
    'all_va_score'  : all_score,
    'all_va_preds'  : all_bests,
    'y_va'          : ys['va'],
    'all_te_preds'  : all_bt_preds,
    'row_id'        : row_id,
  }
  with open(grid_submit_path, 'wb') as f:  # close the handle promptly
    pickle.dump(info, f)
  Xs, ys, all_score, all_bests, all_bt_preds = [None]*5
  print("cv raw collect for (%i,%i) in %s @ %s" % (xi, yi, grid_submit_path, datetime.now()))
  gc.collect()
  return None, None
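A hypothetical downstream step (not part of the original source) showing how such a dump could be reloaded to rank model configs by validation score; the file name is illustrative:

import pickle

with open('grid_0_0_submit.pkl', 'rb') as f:  # hypothetical dump path
  info = pickle.load(f)
# rank configs by their recorded validation score, best first
ranked = sorted(zip(info['all_va_score'], info['mdl_configs']),
                key=lambda t: t[0], reverse=True)
for score, cfg in ranked:
  print("%.4f  %s" % (score, cfg))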
Example #5
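This variant offloads per-grid training to AWS Lambda: it pickles the grid's samples and model configs, uploads the file to S3, polls for the solution file produced by the separately deployed Lambda worker, and keeps a local train-and-blend fallback branch. raw_mdl_configs, get_alg, drill_eva, and blending are module-level helpers from the project.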
def drill_grid(df_grid, x_cols, xi, yi, stamp, grid_submit_path, do_blending=True):
  # params init
  best_score = 0
  all_score = []
  Xs, ys = {}, {}
  for m in ['tr', 'te']:
    Xs[m], ys[m], row_id = conv.df2sample(df_grid[m], x_cols)
  scnt = len(ys['tr'])
  
  mdl_path = '/'.join(grid_submit_path.split('/')[:-1])
  dat_file = 'dat_%i_%i.pkl' % (xi, yi)
  sol_file = 'dat_%i_%i.sol.%s' % (xi, yi, stamp)

  # if scnt > 1500:
  #   mdl_configs = [
  #     {'alg': 'skrf', 'n_estimators': 500, 'max_features': 0.35, 'max_depth': 15},
  #     {'alg': 'skrfp', 'n_estimators': 500, 'max_features': 0.35, 'max_depth': 15},
  #     {'alg': 'sket', 'n_estimators': 500, 'max_features': 0.4, 'max_depth': 15},
  #     {'alg': 'sketp', 'n_estimators': 500, 'max_features': 0.4, 'max_depth': 11},
  #   ]
  # else:
  mdl_configs = raw_mdl_configs

  # prepare data
  cmds = {
    'mdl_configs': mdl_configs,
    'Xs': {k:v.values for k,v in Xs.items()},
    'ys': ys,
    'row_id': row_id.values,
    'sol_file': sol_file,
  }
  dat_path = "%s/%s" % (mdl_path, dat_file)
  try:
    # the lambda runtime is python2, which supports at most pickle protocol 2
    with open(dat_path, 'wb') as f:
      pickle.dump(cmds, f, protocol=2)
  except Exception as e:
    print(e)
    print("ERROR for (%i/%i), cmds: %s" % (xi, yi, cmds))

  # upload to s3 for lambda
  job_done = False

  # bucket = boto3.resource('s3', region_name='us-west-2').Bucket('recom.lambda.m1536')
  session = boto3.session.Session()
  bucket = session.resource('s3', region_name='us-west-2').Bucket('recom.lambda.m1536')

  #-----[use aws lambda]-------------------------------------------
  if True:  # scnt < 2500: the lambda path is currently forced on
    # if scnt > 1000:
    #   bucket = bucket1536
    # elif scnt > 500:
    #   bucket = bucket1024
    # elif scnt > 300:  
    #   bucket = bucket512
    # else:
    #   bucket = bucket256
    try:
      bucket.upload_file(dat_path, dat_file)
    except Exception as e:
      print(e)
      print("when bucket.upload_file", dat_path)
    print("upload dat_file %s of %i tr samples @ %s" % (dat_file, len(ys['tr']), datetime.now()))
    df_grid, Xs, ys, cmds = [None]*4  # release memory; keep row_id, it is reattached to sols below
    
    
    # print("try download %s to %s" % (sol_file, grid_submit_path))
    try_cnt, max_try = 0, 6
    while try_cnt <= max_try:
      try:
        bucket.download_file(sol_file, grid_submit_path)
        job_done = True
        break
      except Exception as e:
        # the .sol file appears on S3 only once the lambda job finishes; poll every 30s
        if try_cnt > 4:
          print("(%i/%i) scnt=%i, waiting %i ... @ %s" % (xi, yi, scnt, try_cnt, datetime.now()))
        try_cnt += 1
        sleep(30)

    # remove tmp files
    bucket.delete_objects(Delete={'Objects': [{'Key': sol_file}], 'Quiet': True,})

    # collect sols
    if job_done:
      sols = None
      try:
        with open(grid_submit_path, 'rt') as f:
          sols = json.load(f)
      except Exception as e:
        print(e)
        print("when json try load %s" % grid_submit_path)
      if sols is not None:
        sols = pd.DataFrame(sols)
        sols['row_id'] = row_id
        df2submit(sols, grid_submit_path)
        print("[drill_grid (%i,%i)] blended @ %s" % (xi, yi, datetime.now()))
    else:
      sols = None
      print("[TIMEOUT] job timeout: (%i/%i)" % (xi, yi))

  #-----[use local machine]-------------------------------------------
  else:
    all_bt_preds = []
    for bcfg in mdl_configs:
      bmdl = get_alg(bcfg['alg'], bcfg)
      bmdl.fit(Xs['tr'], ys['tr'])
      _, bt_preds = drill_eva(bmdl, Xs['te'], ys['te'])
      all_bt_preds.append(bt_preds)
    sols = blending(all_bt_preds)
    sols = pd.DataFrame(sols)
    sols['row_id'] = row_id
    df2submit(sols, grid_submit_path)
    print("[LOCAL] done (%i/%i) locally @ %s" % (xi, yi, datetime.now()))

  return 1.0, sols
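The Lambda worker itself does not appear on this page; the following is a hypothetical sketch inferred from the S3 protocol above (download dat_*.pkl, train each config, blend, upload the .sol file as JSON), assuming the project helpers are bundled with the function and the trigger payload carries the object key:

import json
import pickle

import boto3

def handler(event, context):
  bucket = boto3.resource('s3', region_name='us-west-2').Bucket('recom.lambda.m1536')
  dat_file = event['dat_file']  # assumed payload shape, e.g. 'dat_3_7.pkl'
  local_dat = '/tmp/' + dat_file
  bucket.download_file(dat_file, local_dat)
  with open(local_dat, 'rb') as f:
    cmds = pickle.load(f)
  # train every config on this grid's training samples, then blend test preds
  all_preds = []
  for cfg in cmds['mdl_configs']:
    mdl = get_alg(cfg['alg'], cfg)
    mdl.fit(cmds['Xs']['tr'], cmds['ys']['tr'])
    _, preds = drill_eva(mdl, cmds['Xs']['te'], cmds['ys']['te'])
    all_preds.append(preds)
  sols = blending(all_preds)
  sol_path = '/tmp/' + cmds['sol_file']
  with open(sol_path, 'wt') as f:
    json.dump(sols, f)
  bucket.upload_file(sol_path, cmds['sol_file'])  # the caller polls S3 for this key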
Example #6
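The full grid-search variant: it sweeps 48 configs (4 tree algorithms x 3 forest sizes x 2 feature fractions x 2 depths), scores each on the validation split, blends the top 5 and top 10 by validation score, retrains on train plus valid, and writes whichever of the best single model or the best blend validated higher.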
def drill_grid(df_grid, x_cols, xi, yi, grid_submit_path, do_blending=True):
    best_score = 0
    all_score = []
    Xs, ys = {}, {}
    for m in ['tr', 'va', 'te']:
        Xs[m], ys[m], row_id = conv.df2sample(df_grid[m], x_cols)

    # [grid search best models]
    mdl_configs = []
    for alg in ['skrf', 'skrfp', 'sket', 'sketp']:
        for n_estimators in [500, 1000, 1500]:
            for max_features in [0.3, 0.5]:
                for max_depth in [11, 15]:
                    mdl_configs.append({
                        'alg': alg,
                        'n_estimators': n_estimators,
                        'max_features': max_features,
                        'max_depth': max_depth
                    })
    all_bests = []

    for mdl_config in mdl_configs:
        # train
        clf = get_alg(mdl_config['alg'], mdl_config)
        clf.fit(Xs['tr'], ys['tr'])
        # valid
        score, bests = drill_eva(clf, Xs['va'], ys['va'])
        print("drill(%i,%i) va_score %.4f for model %s(%s) @ %s" %
              (xi, yi, score, mdl_config['alg'], mdl_config, conv.now('full')))
        if score > best_score:
            best_score = score
            best_config = mdl_config
        # for blending
        if do_blending:
            all_score.append(score)
            all_bests.append(bests)

    # blending
    if do_blending:
        best_bcnt = None
        best_blend_score = 0
        best_good_idxs = []
        for bcnt in [5, 10]:
            # rank model indices by validation score, keep the top bcnt
            good_idxs = [
                k for k, _ in sorted(
                    enumerate(all_score), key=lambda kv: kv[1], reverse=True)
            ][:bcnt]
            blended_bests = blending(
                [m for idx, m in enumerate(all_bests) if idx in good_idxs])
            blended_match = [
                apk([ans], vals) for ans, vals in zip(ys['va'], blended_bests)
            ]
            blended_score = sum(blended_match) / len(blended_match)
            if blended_score > best_blend_score:
                best_blend_score = blended_score
                best_good_idxs = good_idxs
                best_bcnt = bcnt
            print("drill(%i,%i) va_score %.4f for model 'blending_%i' @ %s" %
                  (xi, yi, blended_score, bcnt, conv.now('full')))

    # train again with full training samples
    Xs['tr_va'] = pd.concat([Xs['tr'], Xs['va']])
    ys['tr_va'] = np.append(ys['tr'], ys['va'])

    # always write best blending (in case best single model overfitting)
    all_bt_preds = []
    for bcfg in [
            m for idx, m in enumerate(mdl_configs) if idx in best_good_idxs
    ]:
        bmdl = get_alg(bcfg['alg'], bcfg)
        bmdl.fit(Xs['tr_va'], ys['tr_va'])
        _, bt_preds = drill_eva(bmdl, Xs['te'], ys['te'])
        all_bt_preds.append(bt_preds)
    blending_test_preds = blending(all_bt_preds)
    blending_test_preds = pd.DataFrame(blending_test_preds)
    blending_test_preds['row_id'] = row_id
    df2submit(blending_test_preds, (grid_submit_path[:-4] + '_blend.csv'))

    # collect results
    if do_blending and (best_blend_score > best_score):
        best_score = best_blend_score
        test_preds = blending_test_preds
        best_config = "blending_%i" % best_bcnt
    else:
        best_model = get_alg(best_config['alg'], best_config)
        best_model.fit(Xs['tr_va'], ys['tr_va'])
        _, test_preds = drill_eva(best_model, Xs['te'], ys['te'])
        test_preds = pd.DataFrame(test_preds)
        test_preds['row_id'] = row_id

    # write partial submit
    df2submit(test_preds, grid_submit_path)
    print("[drill_grid (%i,%i)] choose best_model %s, best_score=%.4f @ %s" %
          (xi, yi, best_config, best_score, datetime.now()))
    return best_score, test_preds
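The apk used for blended scoring is the standard average precision at k (the Kaggle MAP@3 building block, as in the ml_metrics package); a minimal reference implementation for intuition, not the project's actual import:

def apk(actual, predicted, k=3):
    # average precision at k: precision accumulated at each hit position
    score, hits = 0.0, 0
    for i, p in enumerate(predicted[:k]):
        if p in actual and p not in predicted[:i]:
            hits += 1
            score += hits / float(i + 1)
    return score / min(len(actual), k) if actual else 0.0

print(apk([42], [7, 42, 13]))  # 0.5: the true place appears at rank 2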
Example #7
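Grid-wise evaluation and submission writer: for each non-interleaved block it assembles the pickled model paths of the cell and its two x-neighbors, dispatches predict_clf jobs to a multiprocessing pool (capped at 30 pending results), logs per-row MAP and sample counts, and finally reports the sample-weighted MAP; with title 'Submit' it skips scoring and only concatenates predictions.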
  def evaluate(self, df, title, norm=None):
    preds_total = []
    score_total = []

    # check model exists
    if not os.path.exists("%s/models/%s" % (self.root, self.stamp)):
      print("[ERROR] evaluate: model %s does not exists!!" % self.stamp)
      return

    # launch mp jobs
    processes = []
    for x_idx, (x_min, x_max) in enumerate(self.x_ranges):
      if (x_idx % self.x_inter == 0):   # skip interleave blocks
        x_min, x_max = conv.trim_range(x_min, x_max, self.size)
        df_row = df[(df.x >= x_min) & (df.x < x_max)]
        row_scores, row_samples = [], 0
        mp_pool = mp.Pool(pool_size)
      for y_idx, (y_min, y_max) in enumerate(self.y_ranges):
        if (x_idx % self.x_inter == 0) and (y_idx % self.y_inter == 0): # skip interleave blocks
          y_min, y_max = conv.trim_range(y_min, y_max, self.size)
          w_ary = [(x_idx-1, y_idx), (x_idx, y_idx), (x_idx+1, y_idx)]
          mdl_names = ["%s/models/%s/grid_model_x_%s_y_%s.pkl.gz" % (self.root, self.stamp, xi, yi) for xi, yi in w_ary]
          df_grid = df_row[(df_row.y >= y_min) & (df_row.y < y_max)]
          
          # normalize for scale-sensitive algorithms
          if norm:
            df_grid = df_grid.copy()  # write on a copy, not the df_row slice
            for k, v in norm.items():
              df_grid[k] = df_grid[k] * v

          # preprocessing
          if self.en_preprocessing:
            df_grid, cols_extra = conv.df_preprocess(self.en_preprocessing, df_grid, x_idx, y_idx, LOCATION, AVAIL_WDAYS, AVAIL_HOURS, POPULAR, GRID_CANDS)
            X, y, row_id = conv.df2sample(df_grid, self.x_cols+cols_extra)
          else:
            X, y, row_id = conv.df2sample(df_grid, self.x_cols)
          
          # fail-safe
          if len(X) == 0:
            print("0 samples in x_idx=%i, y_idx=%i, skip evaluation." % (x_idx, y_idx))
            continue

          # parallel evaluation
          p = mp_pool.apply_async(predict_clf, (mdl_names, self.mdl_weights, X, y, row_id, x_idx//self.x_inter, y_idx//self.y_inter, self.popu_th, self.time_th_wd, self.time_th_hr))
          processes.append([x_idx, y_idx, p])
        # collect mp results
        last_block = (x_idx >= len(self.x_ranges)-1) and (y_idx >= len(self.y_ranges)-1)  
        while (len(processes) > 30) or (processes and last_block):
          xi, yi, p = processes.pop(0)
          preds, score = p.get() #predict_clf(mdl_name, X, y, row_id)
          score_total.append(score)
          preds_total.append(preds)
          # observation
          row_scores.append(score)
          row_samples += len(preds)
          if (yi == 0) and (xi > 0):
            if title == 'Submit':
              print("[Submit] row %i, %i samples @ %s" % (xi-1, row_samples, conv.now('full')))
            else:
              print("[%s] row %i, avg MAP=%.4f, %i samples @ %s" % (title, xi-1, np.average(row_scores), row_samples, conv.now('full')))          
      mp_pool.close()
      row_scores, row_samples = [], 0
      print("[%s] launching row %i processes @ %s ..." % (title, x_idx, conv.now('full')))
    print("[Evaluation] done, rest processes=%i (should be 0!)" % len(processes))

    # final stats
    if title == 'Submit':
      final_score = 'none'
      print("=====[Done submittion data]=====")
    else:
      # print("preds_total", preds_total)
      # print("score_total", score_total)
      scores, cnts = zip(*[(s*len(p), len(p)) for s, p in zip(score_total, preds_total)])
      final_score = sum(scores) / sum(cnts)
      print("=====[%s score] MAP=%.4f =====" % (title, final_score))
    preds_total = pd.concat(preds_total)
    return preds_total, final_score