def get_installment_payments(path, num_rows=None):
    """Preprocess and extract features from installments_payments.

    Arguments:
        path: Path to the folder where files are saved (string).
        num_rows: Number of rows to read; None reads all rows (int, default: None).

    Returns:
        df: DataFrame with processed data.
    """
    pay = pd.read_csv(os.path.join(path, 'installments_payments.csv'), nrows=num_rows)
    # Group payments and get payment difference
    pay = utils.do_sum(pay, ['SK_ID_PREV', 'NUM_INSTALMENT_NUMBER'], 'AMT_PAYMENT', 'AMT_PAYMENT_GROUPED')
    pay['PAYMENT_DIFFERENCE'] = pay['AMT_INSTALMENT'] - pay['AMT_PAYMENT_GROUPED']
    pay['PAYMENT_RATIO'] = pay['AMT_INSTALMENT'] / pay['AMT_PAYMENT_GROUPED']
    pay['PAID_OVER_AMOUNT'] = pay['AMT_PAYMENT'] - pay['AMT_INSTALMENT']
    pay['PAID_OVER'] = (pay['PAID_OVER_AMOUNT'] > 0).astype(int)

    # Payment entry: days past due and days before due
    pay['DPD'] = pay['DAYS_ENTRY_PAYMENT'] - pay['DAYS_INSTALMENT']
    pay['DPD'] = pay['DPD'].apply(lambda x: 0 if x <= 0 else x)
    pay['DBD'] = pay['DAYS_INSTALMENT'] - pay['DAYS_ENTRY_PAYMENT']
    pay['DBD'] = pay['DBD'].apply(lambda x: 0 if x <= 0 else x)

    # Flag late payment
    pay['LATE_PAYMENT'] = pay['DBD'].apply(lambda x: 1 if x > 0 else 0)
    # Percentage of payments that were late
    pay['INSTALMENT_PAYMENT_RATIO'] = pay['AMT_PAYMENT'] / pay['AMT_INSTALMENT']
    pay['LATE_PAYMENT_RATIO'] = pay.apply(
        lambda x: x['INSTALMENT_PAYMENT_RATIO'] if x['LATE_PAYMENT'] == 1 else 0, axis=1)
    # Flag late payments with a significant amount
    pay['SIGNIFICANT_LATE_PAYMENT'] = pay['LATE_PAYMENT_RATIO'].apply(lambda x: 1 if x > 0.05 else 0)
    # Flag k-threshold late payments
    pay['DPD_7'] = pay['DPD'].apply(lambda x: 1 if x > 7 else 0)
    pay['DPD_15'] = pay['DPD'].apply(lambda x: 1 if x > 15 else 0)

    # Aggregations by SK_ID_CURR
    pay_agg = utils.group(pay, 'INS_', config.INSTALLMENTS_AGG)

    # Installments in the last x months
    for months in [18, 36]:
        recent_prev_id = pay[pay['DAYS_INSTALMENT'] >= -30 * months]['SK_ID_PREV'].unique()
        pay_recent = pay[pay['SK_ID_PREV'].isin(recent_prev_id)]
        prefix = 'INS_{}M_'.format(months)
        pay_agg = utils.group_and_merge(pay_recent, pay_agg, prefix, config.INSTALLMENTS_TIME_AGG)

    # Last x periods trend features
    group_features = ['SK_ID_CURR', 'SK_ID_PREV', 'DPD', 'LATE_PAYMENT',
                      'PAID_OVER_AMOUNT', 'PAID_OVER', 'DAYS_INSTALMENT']
    group = pay[group_features].groupby('SK_ID_CURR')
    func = partial(_trend_in_last_k_instalment_features, periods=[12, 24, 60, 120])
    g = utils.parallel_apply(group, func, index_name='SK_ID_CURR', chunk_size=10000).reset_index()
    pay_agg = pay_agg.merge(g, on='SK_ID_CURR', how='left')

    # Last loan features
    g = utils.parallel_apply(group, _installments_last_loan_features,
                             index_name='SK_ID_CURR', chunk_size=10000).reset_index()
    pay_agg = pay_agg.merge(g, on='SK_ID_CURR', how='left')
    return pay_agg
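
# The per-customer trend helper is referenced above but not defined in this snippet.
# Below is a minimal sketch of what `_trend_in_last_k_instalment_features` might look
# like, assuming it receives one SK_ID_CURR group (the columns selected above) plus the
# `periods` list bound via `partial`; the feature names and the `_trend` helper are
# illustrative, not the repo's actual implementation.
import numpy as np
import pandas as pd


def _trend_in_last_k_instalment_features(gr, periods):
    gr_ = gr.copy().sort_values('DAYS_INSTALMENT')  # oldest installments first
    features = {}
    for period in periods:
        gr_period = gr_.iloc[-period:]  # last `period` installment rows
        features['DPD_TREND_{}'.format(period)] = _trend(gr_period['DPD'])
        features['LATE_PAYMENT_MEAN_{}'.format(period)] = gr_period['LATE_PAYMENT'].mean()
        features['PAID_OVER_AMOUNT_MEAN_{}'.format(period)] = gr_period['PAID_OVER_AMOUNT'].mean()
    return pd.Series(features)


def _trend(series):
    # Slope of a least-squares line through the values; 0 for degenerate inputs.
    y = series.values
    if len(y) < 2:
        return 0.0
    x = np.arange(len(y))
    try:
        return float(np.polyfit(x, y, 1)[0])
    except Exception:
        return 0.0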
def ensemble(args):
    global dfs, ens_dets, MAX_NUM, bg_time
    MAX_NUM = args.max_num

    df_test = pd.read_csv(os.path.join(DATA_DIR, 'sample_empty_submission.csv'))
    df_test.PredictionString = df_test.PredictionString.fillna('')
    print(df_test.head())

    print('loading {} ...'.format(csv_files))
    dfs = [pd.read_csv(fn) for fn in csv_files]
    for df in dfs:
        df.PredictionString = df.PredictionString.fillna('')
    #assert len(preds[0][1]) == len(classes)

    # Align every prediction frame to the submission's ImageID order
    for i in range(len(dfs)):
        dfs[i] = dfs[i].set_index('ImageID')
        dfs[i] = dfs[i].reindex(index=df_test['ImageID'])
        dfs[i] = dfs[i].reset_index()

    print('ensembling...')
    bg_time = time.time()
    counter = Value('i', 0)
    with Pool(24, initializer=init, initargs=(counter, )) as p:
        num_imgs = len(dfs[0])
        #ens_dets = list(tqdm(iterable=p.map(get_ens_det, list(range(num_imgs))), total=num_imgs))
        ens_dets = p.map(get_ens_det, range(num_imgs))

    print('creating submission...')
    df_test['img_index'] = df_test.index
    df_test = parallel_apply(df_test, set_pred_str)
    df_test = df_test.drop(columns=['img_index'])
    df_test.to_csv(args.out, index=False)
    print('done')
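
# `init` and the shared progress counter are referenced above but not shown. A minimal
# sketch of the usual initializer pattern, assuming `init` only stashes the
# multiprocessing.Value so each worker can report progress; `_tick` and its print
# interval are illustrative additions, not part of the original code.
from multiprocessing import Value

_counter = None


def init(counter):
    # Runs once in every worker started by Pool(..., initializer=init, initargs=(counter,)).
    global _counter
    _counter = counter


def _tick(every=1000):
    # Increment the shared counter and occasionally report progress from a worker.
    with _counter.get_lock():
        _counter.value += 1
        if _counter.value % every == 0:
            print('processed {} images'.format(_counter.value))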
def submit(args):
    global preds, classes
    classes, _ = get_top_classes(args.start_index, args.end_index, args.class_file)

    print('loading {}...'.format(args.pred_file))
    with open(args.pred_file, 'rb') as f:
        preds = pickle.load(f)
    print('len(preds):', len(preds))
    print('num classes of preds:', len(preds[0][1]))
    print('specified num classes:', len(classes))
    #assert len(preds[0][1]) == len(classes)

    print('creating submission...')
    df_test = pd.read_csv(osp.join(DATA_DIR, 'sample_empty_submission.csv'))
    # Fill in the image dimensions from the files on disk
    df_test.ImageWidth = df_test.ImageID.map(lambda x: get_image_size(get_fn(x))[0])
    df_test.ImageHeight = df_test.ImageID.map(lambda x: get_image_size(get_fn(x))[1])

    df_test['img_index'] = df_test.index
    df_test = parallel_apply(df_test, set_pred_str)
    df_test = df_test.drop(columns=['img_index'])
    df_test.to_csv(args.out, index=False)
    print('done')
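
# `parallel_apply` here takes a whole DataFrame plus a chunk-level function such as
# `set_pred_str`; neither is defined in this snippet. A minimal sketch of one common
# way to implement it, assuming `func` accepts a chunk of the frame and returns it with
# PredictionString filled in; the worker count is an arbitrary illustrative default.
import numpy as np
import pandas as pd
from multiprocessing import Pool


def parallel_apply(df, func, num_workers=8):
    # Split the frame into roughly equal chunks and process them in separate workers.
    chunks = np.array_split(df, num_workers)
    with Pool(num_workers) as pool:
        parts = pool.map(func, chunks)
    return pd.concat(parts)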
def _update(self, update_helper_func):
    self._current_resolution_level = parallel_apply(
        self._current_resolution_level,
        update_helper_func,
        self._n_threads,
        1,
        current_vector=self._current_resolution.values,
    )
def ensemble(args):
    global all_preds, classes, ens_dets, MAX_NUM, bg_time
    MAX_NUM = args.max_num

    #print('getting img size...')
    df_test = pd.read_csv(osp.join(DATA_DIR, 'sample_empty_submission.csv'))
    print(df_test.head())
    #df_test.ImageWidth = df_test.ImageID.map(lambda x: get_image_size(get_fn(x))[0])
    #df_test.ImageHeight = df_test.ImageID.map(lambda x: get_image_size(get_fn(x))[1])

    classes, _ = get_top_classes(args.start_index, args.end_index, args.class_file)

    # Load the per-model prediction pickles
    for fn in pred_files:
        print('loading {} ...'.format(fn))
        with open(fn, 'rb') as f:
            all_preds.append(pickle.load(f))
    print('len(preds):', len(all_preds[0]))
    print('num classes of preds:', len(all_preds[0][0][1]))
    print('specified num classes:', len(classes))
    #assert len(preds[0][1]) == len(classes)

    print('ensembling...')
    bg_time = time.time()
    counter = Value('i', 0)
    with Pool(24, initializer=init, initargs=(counter, )) as p:
        num_imgs = len(all_preds[0])
        #ens_dets = list(tqdm(iterable=p.map(get_ens_det, list(range(num_imgs))), total=num_imgs))
        ens_dets = p.map(get_ens_det, range(num_imgs))
    #num_imgs = len(preds1)
    #for idx in tqdm(range(num_imgs), total=num_imgs):
    #    ens_dets.append(get_ens_det(idx))

    print('creating submission...')
    df_test['img_index'] = df_test.index
    df_test = parallel_apply(df_test, set_pred_str)
    df_test = df_test.drop(columns=['img_index'])
    df_test.to_csv(args.out, index=False)
    print('done')
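
# `get_ens_det` merges detections for one image across all loaded models; it is not
# shown in this snippet. A minimal sketch under the assumption that all_preds[m][i] is
# a (image_id, per_class_dets) tuple where per_class_dets[c] is an (N, 5) array of
# [x1, y1, x2, y2, score] boxes (the usual mmdetection layout); a real ensemble would
# typically add NMS or box fusion rather than a plain score cut-off.
import numpy as np


def get_ens_det(idx):
    image_id = all_preds[0][idx][0]
    merged = []
    for c in range(len(classes)):
        # Stack this class's boxes from every model, then keep the highest-scoring ones.
        dets = np.concatenate([preds[idx][1][c] for preds in all_preds], axis=0)
        if len(dets) > MAX_NUM:
            dets = dets[np.argsort(-dets[:, 4])][:MAX_NUM]
        merged.append(dets)
    return image_id, merged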
historical_transactions = pd.read_csv(
    os.path.join(PATH, 'historical_transactions.csv'))
historical_transactions['purchase_date'] = pd.to_datetime(
    historical_transactions['purchase_date'])
historical_transactions['days'] = (
    datetime.date(2018, 2, 28) -
    historical_transactions['purchase_date'].dt.date).dt.days
historical_transactions = historical_transactions.query(
    '0 <= installments and installments <= 12')

# =============================================================================
#
# =============================================================================
groupby = historical_transactions.groupby('card_id')

func = partial(last_k_instalment_features_with_fractions,
               periods=[60, 180, 360, 540],
               fraction_periods=[(60, 180), (60, 360), (180, 540), (360, 540)])

g = utils.parallel_apply(groupby, func, index_name='card_id',
                         num_workers=4, chunk_size=10000).reset_index()
g.to_pickle(f'../feature/{PREF}.pkl')

#==============================================================================
utils.end(__file__)
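
# `last_k_instalment_features_with_fractions` is defined elsewhere in the repo. A
# minimal sketch of the usual windowed-aggregate pattern, assuming it receives one
# card_id's transactions and uses the `days` column computed above; the aggregated
# columns and feature names are illustrative, not the repo's exact feature set.
import pandas as pd


def last_k_instalment_features_with_fractions(gr, periods, fraction_periods):
    features = {}
    for period in periods:
        gr_period = gr[gr['days'] <= period]
        features['purchase_amount_sum_last_{}d'.format(period)] = gr_period['purchase_amount'].sum()
        features['purchase_count_last_{}d'.format(period)] = len(gr_period)
    for short_p, long_p in fraction_periods:
        denom = features['purchase_amount_sum_last_{}d'.format(long_p)]
        features['purchase_amount_fraction_{}d_{}d'.format(short_p, long_p)] = (
            features['purchase_amount_sum_last_{}d'.format(short_p)] / denom if denom else 0.0)
    return pd.Series(features)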
print("flag:", 7) data_link_split = data_link_split.reset_index(drop=True) data_link_split[[ 'link_id', 'link_time', 'link_ratio', 'link_current_status', 'link_arrival_status' ]] = data_link_split['link_info'].str.split(':|,', 5, expand=True) print("flag:", 8) data_link_split = data_link_split[['order_id', 'link_id']] data_link_split['link_id'] = data_link_split['link_id'].astype(int) features = pd.DataFrame( {'order_id': data_link_split['order_id'].unique()}) groupby = data_link_split.groupby(['order_id']) func = partial(link_id_find) g = parallel_apply(groupby, func, index_name='order_id', num_workers=5, chunk_size=10000) g = pd.DataFrame(g, columns=['from_id', 'to_id']) g = g.drop_duplicates() nextlinks_new.append(g) nextlinks_new = pd.concat(nextlinks_new, axis=0) nextlinks_new = nextlinks_new.drop_duplicates() nextlinks_new = nextlinks_new.sort_values(by='from_id').reset_index( drop=True) nextlinks = pd.concat([nextlinks, nextlinks_new], axis=0) nextlinks = nextlinks.drop_duplicates() nextlinks = nextlinks.sort_values(by='from_id').reset_index(drop=True) print('save all csv') nextlinks.to_csv(root_path + 'nextlinks_allday.csv', index=False) print('calcute weight')
def _score(self, score_helper_func):
    score = parallel_apply(
        self._current_resolution_level, score_helper_func, self._n_threads, 1)
    return score