def main():
    file_name = os.path.basename(__file__)[:-3]
    LOG = logger.Logger(name=f'{file_name}', filename=file_name)
    LOG.info('base line')
    LOG.info(f'{USE_COLS}')
    train = data_util.load_features(FEATURES_LIST, path=f'features/{path}', train_valid='train')
    valid = data_util.load_features(FEATURES_LIST, path=f'features/{path}', train_valid='valid')
    train = train.sample(15_000_000, random_state=SEED)
    # qs = pd.read_csv('./data/output/question_lsi50.csv')
    # qs = qs[['question_id', 'tags_lsi']]
    # qs = qs.rename(columns={'question_id': 'content_id'})
    # train = pd.merge(train, qs, on='content_id', how='left')
    # valid = pd.merge(valid, qs, on='content_id', how='left')
    # train = data_util.reduce_mem_usage(train)
    # valid = data_util.reduce_mem_usage(valid)
    LOG.info(f'train_size:{train[USE_COLS].shape} valid_size:{valid[USE_COLS].shape}')
    model, fi, valid['pred'] = run_lgb(train=train, valid=valid, LOG=LOG)
    data_util.seve_model(model, fi, file_name)
    valid[['row_id', 'pred']].to_feather(f'./data/oof/{file_name}.feather')
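# Each script persists the booster and its feature importances through
# data_util.seve_model (sic: the helper is spelled that way throughout the
# repo). Its implementation is not shown here; the sketch below is an
# assumption for illustration only -- the output paths are hypothetical.
def seve_model(model, fi, file_name):
    # LightGBM boosters serialize to a plain text file.
    model.save_model(f'./data/model/{file_name}.txt')
    # Keep the importance table alongside the model for later inspection.
    fi.to_csv(f'./data/importance/{file_name}.csv', index=False)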
def main():
    file_name = os.path.basename(__file__)[:-3]
    LOG = logger.Logger(name=f'{file_name}', filename=file_name)
    LOG.info('base line')
    LOG.info(f'{USE_COLS}')
    train = data_util.load_features(FEATURES_LIST, path=f'features/{path}', train_valid='train')
    valid = data_util.load_features(FEATURES_LIST, path=f'features/{path}', train_valid='valid')
    train = train.sample(15_000_000, random_state=SEED)
    # train = data_util.reduce_mem_usage(train)
    # valid = data_util.reduce_mem_usage(valid)
    LOG.info(f'train_size:{train[USE_COLS].shape} valid_size:{valid[USE_COLS].shape}')
    model, fi, valid['pred'] = run_lgb(train=train, valid=valid, LOG=LOG)
    data_util.seve_model(model, fi, file_name)
    valid[['row_id', 'pred']].to_feather(f'./data/oof/{file_name}.feather')
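# All of these scripts load their inputs through data_util.load_features.
# The real helper lives in data_util.py and is not shown here; the sketch
# below is an assumption about its shape, inferred from the call sites:
# one feather file per feature group, concatenated column-wise, with the
# file layout being hypothetical.
import pandas as pd

def load_features(features_list, path='features', train_valid='train'):
    # Read each feature group's feather file for the requested split and
    # stack the frames side by side.
    dfs = [
        pd.read_feather(f'./data/{path}/{name}_{train_valid}.feather')
        for name in features_list
    ]
    return pd.concat(dfs, axis=1)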
def main():
    file_name = os.path.basename(__file__)[:-3]
    LOG = logger.Logger(name=f'{file_name}', filename=file_name)
    LOG.info('base line')
    LOG.info(f'{USE_COLS}')
    train_df = data_util.load_features(feature_list)
    # Down-sample to 10% of all rows, then carve a 2% validation slice out of
    # that sample and drop those rows from the training set.
    train_df = train_df.sample(frac=0.10, random_state=127)
    valid_df = train_df.sample(frac=0.02, random_state=127)
    valid_id = valid_df['row_id']
    train_df = train_df[~train_df['row_id'].isin(valid_id)]
    train_df = train_df.reset_index(drop=True)
    valid_df = valid_df.reset_index(drop=True)
    LOG.info(f'train shape : {train_df.shape}')
    LOG.info(f'valid shape : {valid_df.shape}')
    train_x = train_df[USE_COLS]
    train_y = train_df[TARGET]
    valid_x = valid_df[USE_COLS]
    valid_y = valid_df[TARGET]
    lgb_model, fi, valid_df['pred'] = run_lgb(train_x=train_x, train_y=train_y,
                                              valid_x=valid_x, valid_y=valid_y, LOG=LOG)
    data_util.seve_model(lgb_model, fi, file_name)
    # Persist OOF predictions; the original ended in a bare expression with no
    # effect, so the destination here is assumed, mirroring the sibling scripts.
    valid_df[['user_id', 'pred']].to_feather(f'./data/oof/{file_name}.feather')
def main():
    file_name = os.path.basename(__file__)[:-3]
    LOG = logger.Logger(name=f'{file_name}', filename=file_name)
    LOG.info('base line')
    LOG.info(f'{USE_COLS}')
    train = data_util.load_features(FEATURES_LIST, path=f'features/{path}', train_valid='train')
    valid = data_util.load_features(FEATURES_LIST, path=f'features/{path}', train_valid='valid')
    # Attach the precomputed question-community feature to both splits.
    qs = pd.read_csv('./data/input/question_cmnts.csv')
    qs.columns = ['content_id', 'community']
    train = pd.merge(train, qs, on='content_id', how='left')
    valid = pd.merge(valid, qs, on='content_id', how='left')
    # train = data_util.reduce_mem_usage(train)
    # valid = data_util.reduce_mem_usage(valid)
    LOG.info(f'train_size:{train[USE_COLS].shape} valid_size:{valid[USE_COLS].shape}')
    model, fi, valid['pred'] = run_lgb(train=train, valid=valid, LOG=LOG)
    data_util.seve_model(model, fi, file_name)
    valid[['row_id', 'pred']].to_feather(f'./data/oof/{file_name}.feather')
def main():
    file_name = os.path.basename(__file__)[:-3]
    LOG = logger.Logger(name=f'{file_name}', filename=file_name)
    LOG.info('base line')
    LOG.info(f'{USE_COLS}')
    train = data_util.load_features(FEATURES_LIST, path='features/mini_data', train_valid='train')
    valid = data_util.load_features(FEATURES_LIST, path='features/mini_data', train_valid='valid')
    LOG.info(f'train_size:{train.shape} valid_size:{valid.shape}')
    model, fi, valid_pred = run_lgb(train=train, valid=valid, LOG=LOG)
    data_util.seve_model(model, fi, file_name)
def main():
    file_name = os.path.basename(__file__)[:-3]
    LOG = logger.Logger(name=f'{file_name}', filename=file_name)
    LOG.info('base line')
    LOG.info(f'{USE_COLS}')
    train = data_util.load_features(FEATURES_LIST, path=f'features/{path}', train_valid='train')
    valid = data_util.load_features(FEATURES_LIST, path=f'features/{path}', train_valid='valid')
    train = train.sample(15_000_000, random_state=SEED)
    # One-hot encode the space-separated question tags (188 distinct tags).
    questions = pd.read_csv('./data/input/questions.csv')
    lst = []
    for tags in questions['tags']:
        ohe = np.zeros(188)
        if str(tags) != 'nan':
            for tag in tags.split():
                ohe += np.eye(188)[int(tag)]
        lst.append(ohe)
    tags_df = pd.DataFrame(lst, columns=[f'tag_{i}' for i in range(188)]).astype(int)
    questions = pd.concat([questions, tags_df], axis=1)
    questions = questions.rename(columns={'question_id': 'content_id'})
    questions = questions[QS + ['content_id']]
    train = pd.merge(train, questions, on='content_id', how='left')
    valid = pd.merge(valid, questions, on='content_id', how='left')
    # qs = pd.read_csv('./data/output/question_lsi50.csv')
    # qs = qs[['question_id', 'tags_lsi']]
    # qs = qs.rename(columns={'question_id': 'content_id'})
    # train = pd.merge(train, qs, on='content_id', how='left')
    # valid = pd.merge(valid, qs, on='content_id', how='left')
    # train = data_util.reduce_mem_usage(train)
    # valid = data_util.reduce_mem_usage(valid)
    LOG.info(f'train_size:{train[USE_COLS].shape} valid_size:{valid[USE_COLS].shape}')
    model, fi, valid['pred'] = run_lgb(train=train, valid=valid, LOG=LOG)
    data_util.seve_model(model, fi, file_name)
    valid[['row_id', 'pred']].to_feather(f'./data/oof/{file_name}.feather')
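# The nested loop above rebuilds np.eye(188) for every tag, which works but is
# slow. A vectorized sketch of the same expansion using pandas only, assuming
# each tag appears at most once per question (so 0/1 indicators match the
# loop's output):
import pandas as pd

questions = pd.read_csv('./data/input/questions.csv')
tags_df = (
    questions['tags'].fillna('').str.get_dummies(sep=' ')
    # get_dummies only emits columns for tags that actually occur; reindex
    # pads missing tags with zeros and fixes the column order to 0..187.
    .reindex(columns=[str(i) for i in range(188)], fill_value=0)
)
tags_df.columns = [f'tag_{i}' for i in range(188)]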
def main():
    file_name = os.path.basename(__file__)[:-3]
    LOG = logger.Logger(name=f'{file_name}', filename=file_name)
    LOG.info('base line')
    LOG.info(f'{USE_COLS}')
    train_df = data_util.load_features(feature_list)
    # Split rows using the precomputed cv1 row_id lists.
    train_index = pd.read_feather('./data/train_valid/cv1_train.feather')
    valid_index = pd.read_feather('./data/train_valid/cv1_valid.feather')
    train = train_df[train_df['row_id'].isin(train_index['row_id'])]
    valid = train_df[train_df['row_id'].isin(valid_index['row_id'])]
    del train_df
    gc.collect()
    LOG.info(f'train_size:{train.shape} valid_size:{valid.shape}')
    model, fi, valid_pred = run_lgb(train=train, valid=valid, LOG=LOG)
    data_util.seve_model(model, fi, file_name)
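# Most of the main() variants above delegate training to
# run_lgb(train=..., valid=..., LOG=...) and expect a
# (model, feature_importance, predictions) triple back. The real
# implementation is not shown in this section; the sketch below is an
# assumption for illustration, presuming LightGBM, module-level USE_COLS /
# TARGET / PARAMS globals as used in the scripts above, and a binary target.
import lightgbm as lgb
import pandas as pd

def run_lgb(train, valid, LOG):
    dtrain = lgb.Dataset(train[USE_COLS], label=train[TARGET])
    dvalid = lgb.Dataset(valid[USE_COLS], label=valid[TARGET])
    model = lgb.train(
        PARAMS,
        dtrain,
        valid_sets=[dtrain, dvalid],
        num_boost_round=10000,
        # Stop when the validation metric hasn't improved for 100 rounds,
        # logging evaluation results every 100 rounds.
        callbacks=[lgb.early_stopping(stopping_rounds=100),
                   lgb.log_evaluation(period=100)],
    )
    pred = model.predict(valid[USE_COLS], num_iteration=model.best_iteration)
    # Gain-based importances, sorted so the strongest features come first.
    fi = pd.DataFrame({
        'feature': USE_COLS,
        'importance': model.feature_importance(importance_type='gain'),
    }).sort_values('importance', ascending=False)
    LOG.info(f'best_iteration:{model.best_iteration}')
    return model, fi, pred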