def convert(ori, des, feats):
    df_ori = utils.load_df(ori)
    for f in feats:
        tmp = utils.load_df(config.feat + 'm3_' + f)
        print(f)
        df_ori = pd.concat(
            [df_ori, tmp.drop(['session_id', 'impressions'], axis=1)], axis=1)
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
def convert(ori, des, feat):
    df_ori = utils.load_df(ori)
    print(df_ori.shape)
    df_feat = utils.load_df(config.feat + feat)
    df_ori = df_ori.merge(df_feat, on=['session_id', 'impressions'], how='left')
    print(df_ori.shape)
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
def convert(ori, des, prefix):
    df_ori = utils.load_df(ori)
    print(df_ori.shape)
    for feat in feats:
        df_feat = utils.load_df(config.model + prefix + '%s.csv' % feat).rename(
            columns={'target': feat})
        df_ori = df_ori.merge(df_feat[['session_id', 'impressions', feat]],
                              on=['session_id', 'impressions'],
                              how='left')
        print(df_ori.shape)
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
def convert(ori, des, sample):
    tr = utils.load_df(ori)
    print(tr.shape)
    tr_out = tr[['session_id', 'impressions']]
    dfs = utils.load_df(sample)
    dfs['impr_rank'] = dfs.groupby(['session_id', 'step']).cumcount().values
    print(dfs.head())
    tr_out = cate_encoding.cate_num_stat(dfs, tr_out,
                                         ['session_id', 'impressions'],
                                         'impr_rank', ['min', 'max', 'median'])
    tr_out.columns = tr_out.columns.astype(str)
    print(tr_out.head())
    utils.save_df(tr_out, des)
def convert(ori, des, feats):
    df_ori = utils.load_df(ori)
    print(df_ori.shape)
    for feat in feats:
        df_feat = utils.load_df(config.feat + feat)
        df_ori = df_ori.merge(df_feat, on=['session_id', 'impressions'], how='left')
        print(df_ori.shape)
        del df_feat
        gc.collect()
    df_ori = utils.reduce_mem(df_ori)
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
def build_sampler(model, dataset, split, dt):
    params = dict(sd_peds=0, sd_speed=0)
    traj_list = load_trajectories(
        ut.load_df(ut.get_dataset_split_path(dataset, split, is_train=True)))
    df = ut.load_df(
        ut.get_dataset_split_path(dataset, 'split_1.0_0', is_train=True))
    if model.is_sd_s:
        params['sd_speed'] = calculate_sd_speed(df, dt)
    mu_peds, sd_peds = calculate_mu_sd_n_peds(df)
    params['mu_peds'] = mu_peds
    if model.is_sd_p:
        params['sd_peds'] = sd_peds
    r = 4.0
    sampler = Sampler(traj_list, r=r, dt=dt, **params)
    return sampler
def get_data(adate):
    '''
    download data from the internet and save it to file

    params: adate    str - the update date
    return: df       pandas dataframe - as u.COLUMNS_WORLD['en'],
                     or None if adate is not present in df
    side effects: if successful, save df as a csv file
    '''
    # load dataframe from the internet
    url = SOURCE
    fn = FNAME.format(adate)
    df = u.load_df(url, pd.read_csv)
    # check that updated data is available
    check_date = datetime.strptime(adate, u.D_FMT).date().strftime(u.D_FMT2)
    bflag = df['dateRep'].str.contains(check_date).any()
    if bflag:
        df.to_csv(Path(u.DIR_DATA) / fn)
    else:
        df = None
    return df
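# Hedged usage sketch for get_data() above (illustrative only): the exact date
# string must match u.D_FMT from this project's utils module, so '2020-04-15'
# is just a guess at the expected format.
#
#   daily = get_data('2020-04-15')
#   if daily is None:
#       print('source not yet updated for that date')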
def main(afile):
    # national trend
    #df = pd.read_csv(Path(u.DIR_DATA) / afile)
    #df = shape_data(df)
    #make_plot(df, afile, 'it')
    #make_plot(df, afile, 'en')

    # regional trend
    #df = u.load_df(Path(u.DIR_DATA) / afile, pd.read_csv, u.COLUMNS_RITALY['it'], encoding='utf-8')
    #df = shape_data(df)
    #make_rplot(df, afile, xlabel='data', ylabel='numero totale di casi',
    #           title=f'Covid-19: andamento temporale per le {N_MOST_HITTED} regioni più colpite',
    #           lang='it')
    #make_rplot(df, afile, xlabel='date', ylabel='total number of cases',
    #           title=f'Covid-19: temporal trend for the {N_MOST_HITTED} most hitted regions',
    #           lang='en')

    df = u.load_df(Path(u.DIR_DATA) / afile, pd.read_csv,
                   u.COLUMNS_ITALY['it'], encoding='utf-8')
    df = shape_data(df)
    make_national(df,
                  'nuovi_positivi',
                  afile,
                  xlabel='data',
                  ylabel='numero giornaliero di nuovi casi',
                  title=f'Covid-19: andamento temporale giornaliero di nuovi positivi (nazionale)',
                  lang='it')
    make_national(df,
                  'nuovi_positivi',
                  afile,
                  xlabel='date',
                  ylabel='daily number of new cases',
                  title=f'Covid-19: time trend of daily (national) new cases',
                  lang='en')
def load_and_log_params(cli_params):
    cli_params = AttributeDict(cli_params)
    if cli_params.get('load_from'):
        p = load_df(cli_params.load_from, 'params').to_dict()[0]
        p = AttributeDict(p)
        for key in cli_params.keys():
            if key not in p:
                p[key] = None
        new_params = cli_params
        loaded = True
    else:
        p = cli_params
        new_params = {}
        loaded = False

    # Make dseed seed unless specified explicitly
    if p.get('dseed') is None and p.get('seed') is not None:
        p['dseed'] = p['seed']

    logger.info('== COMMAND LINE ==')
    logger.info(' '.join(sys.argv))

    logger.info('== PARAMETERS ==')
    for k, v in p.items():
        if new_params.get(k) is not None:
            p[k] = new_params[k]
            replace_str = "<- " + str(new_params.get(k))
        else:
            replace_str = ""
        logger.info(" {!s:20s}: {!s:<20s} {}".format(k, v, replace_str))

    return p, loaded
def load_and_log_params(cli_params):
    cli_params = AttributeDict(cli_params)
    if cli_params.get('load_from'):
        p = load_df(cli_params.load_from, 'params').to_dict()[0]
        p = AttributeDict(p)
        for key in cli_params.iterkeys():
            if key not in p:
                p[key] = None
        new_params = cli_params
        loaded = True
    else:
        p = cli_params
        new_params = {}
        loaded = False

    # Make dseed seed unless specified explicitly
    if p.get('dseed') is None and p.get('seed') is not None:
        p['dseed'] = p['seed']

    logger.info('== COMMAND LINE ==')
    logger.info(' '.join(sys.argv))

    logger.info('== PARAMETERS ==')
    for k, v in p.iteritems():
        if new_params.get(k) is not None:
            p[k] = new_params[k]
            replace_str = "<- " + str(new_params.get(k))
        else:
            replace_str = ""
        logger.info(" {:20}: {:<20} {}".format(k, v, replace_str))

    return p, loaded
def fit(self, dataset, dirty_train):
    dirty_raw_path = utils.get_dir(dataset, 'raw', 'raw.csv')
    clean_raw_path = utils.get_dir(dataset, 'raw', 'inconsistency_clean_raw.csv')
    if not os.path.exists(clean_raw_path):
        print("Must provide clean version of raw data for cleaning inconsistency")
        sys.exit(1)
    dirty_raw = utils.load_df(dataset, dirty_raw_path)
    clean_raw = utils.load_df(dataset, clean_raw_path)
    N, m = dirty_raw.shape
    dirty_raw = dirty_raw.values
    clean_raw = clean_raw.values
    mask = (dirty_raw != clean_raw)
    dirty = dirty_raw[mask]
    clean = clean_raw[mask]
    self.incon_dict = dict(zip(dirty, clean))
def fit(self, dataset, dirty_train):
    index_train_path = utils.get_dir(dataset, 'raw', 'idx_train.csv')
    index_test_path = utils.get_dir(dataset, 'raw', 'idx_test.csv')
    index_train = pd.read_csv(index_train_path).values.reshape(-1)
    index_test = pd.read_csv(index_test_path).values.reshape(-1)
    clean_path = utils.get_dir(dataset, 'raw', 'mislabel_clean_raw.csv')
    clean = utils.load_df(dataset, clean_path)
    self.clean_train = clean.loc[index_train, :]
    self.clean_test = clean.loc[index_test, :]
def get_n_tracks(self, n_steps, dataset, split):
    if self.is_large:
        n_tracks = 500 if dataset not in ['univ'] else 100
    else:
        df = ut.load_df(
            ut.get_dataset_split_path(dataset, split, is_train=True))
        n_frames = df['t'].unique().size
        n_tracks = n_frames // (n_steps + 1)  # n_steps -> n_steps+1 positions
    return n_tracks
def load_data(dataset, train_path, test_path_list):
    """Load and split data into features and label.

    Args:
        dataset (dict): dataset dict in config
        train_path (string): path for the training set
        test_path_list (list): a list of paths for the test sets (missing
            values and outliers have multiple test sets)
    """
    # load data
    train = utils.load_df(dataset, train_path)
    test_list = [
        utils.load_df(dataset, test_dir) for test_dir in test_path_list
    ]

    # split X, y
    label = dataset['label']
    features = [v for v in train.columns if not v == label]
    X_train, y_train = train.loc[:, features], train.loc[:, label]
    X_test_list = [test.loc[:, features] for test in test_list]
    y_test_list = [test.loc[:, label] for test in test_list]
    return X_train, y_train, X_test_list, y_test_list
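# Hedged usage sketch for load_data() above: the dataset dict and the CSV paths
# are hypothetical placeholders, not files from this repository.
#
#   dataset = {'label': 'target', ...}  # as defined in config
#   X_train, y_train, X_test_list, y_test_list = load_data(
#       dataset,
#       'data/example/train_clean.csv',
#       ['data/example/test_clean.csv', 'data/example/test_dirty.csv'])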
def load_features(path):
    """
    Load the pickle that was generated during the feature-engineering step.

    :param path: path where the pickle is located
    :return: the feature-engineering dataframe
    """
    print("Opening feature engineering pickle from output path")
    output_path = os.path.join(path, "output", "fe_df.pkl")
    # retrieve the pickle
    incidentes_pkl = load_df(output_path)
    print("Feature Engineering pickle successfully retrieved.")
    return incidentes_pkl
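# Hedged usage sketch for load_features() above: "/path/to/project" is a
# placeholder; the function expects <path>/output/fe_df.pkl to already exist.
#
#   fe_df = load_features("/path/to/project")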
def convert(ori, des, feat):
    df_ori = utils.load_df(ori)
    print(df_ori.shape)
    for c in cols:
        df_ori = cate_encoding.cate_num_rank(df_ori, ['session_id'], c,
                                             ascending=True, show_agg=True)
    df_ori = df_ori.reset_index(drop=True)
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
    utils.save_df(
        df_ori[[
            'session_id', 'impressions', 'session_id_by_prices_rank',
            'session_id_by_ctr_rank', 'session_id_by_last_ts_sub_max_rank'
        ]], feat)
def load_and_log_params(cli_params):
    cli_params = AttributeDict(cli_params)
    # if load_from is given, the parameters are read from a file (HDF format)
    if cli_params.get('load_from'):
        # load_from value + 'params' forms the full address
        # string => dict
        p = load_df(cli_params.load_from, 'params').to_dict()[0]
        # dict => AttributeDict
        p = AttributeDict(p)
        for key in cli_params.iterkeys():
            if key not in p:
                p[key] = None
        new_params = cli_params
        loaded = True
    # if there is no load_from parameter, just wrap cli_params
    else:
        p = cli_params
        new_params = {}
        loaded = False

    # Make dseed seed unless specified explicitly
    # when dseed is empty and seed is not, copy seed into dseed
    if p.get('dseed') is None and p.get('seed') is not None:
        p['dseed'] = p['seed']

    # logging
    logger.info('== COMMAND LINE ==')
    logger.info(' '.join(sys.argv))

    logger.info('== PARAMETERS ==')
    for k, v in p.iteritems():
        if new_params.get(k) is not None:
            p[k] = new_params[k]
            replace_str = "<- " + str(new_params.get(k))
        else:
            replace_str = ""
        logger.info(" {:20}: {:<20} {}".format(k, v, replace_str))

    return p, loaded
def inject(dataset):
    """Inject mislabels.

    Args:
        dataset (dict): dataset dict in config
    """
    # create saving folders
    major_save_dir = utils.makedirs(
        [config.data_dir, dataset["data_dir"] + "_major", 'raw'])
    minor_save_dir = utils.makedirs(
        [config.data_dir, dataset["data_dir"] + "_minor", 'raw'])
    uniform_save_dir = utils.makedirs(
        [config.data_dir, dataset["data_dir"] + "_uniform", 'raw'])

    # load clean data
    clean_path = utils.get_dir(dataset, 'raw', 'raw.csv')
    clean = utils.load_df(dataset, clean_path)
    clean = clean.dropna().reset_index(drop=True)
    major_clean_path = os.path.join(major_save_dir, 'mislabel_clean_raw.csv')
    minor_clean_path = os.path.join(minor_save_dir, 'mislabel_clean_raw.csv')
    uniform_clean_path = os.path.join(uniform_save_dir, 'mislabel_clean_raw.csv')
    clean.to_csv(major_clean_path, index=False)
    clean.to_csv(minor_clean_path, index=False)
    clean.to_csv(uniform_clean_path, index=False)

    label = dataset['label']

    # uniform flip
    uniform = uniform_class_noise(clean, label)

    # pairwise flip
    major, minor = pairwise_class_noise(clean, label)

    major_raw_path = os.path.join(major_save_dir, 'raw.csv')
    minor_raw_path = os.path.join(minor_save_dir, 'raw.csv')
    uniform_raw_path = os.path.join(uniform_save_dir, 'raw.csv')
    major.to_csv(major_raw_path, index=False)
    minor.to_csv(minor_raw_path, index=False)
    uniform.to_csv(uniform_raw_path, index=False)
def extract(sample, ori, feat):
    nrows = None
    df = pd.read_csv(sample,
                     nrows=nrows,
                     usecols=['session_id', 'step', 'reference', 'impressions'])
    print(df.head())
    df_ori = utils.load_df(ori)
    print(df_ori.head())
    df = df.merge(df_ori[['session_id', 'step']].drop_duplicates(),
                  on='session_id',
                  how='left')
    print(df.head())
    df = df[df.step_x < df.step_y]
    tmp = df.drop_duplicates(subset=['session_id', 'step_x'])
    df_clk = tmp.groupby(['session_id', 'reference'])['step_x'].agg('count').reset_index()
    print(df_clk.head())
    df_clk.rename(columns={
        'reference': 'impressions',
        'step_x': 'item_sid_clk_cnt'
    }, inplace=True)
    df_impr = df.groupby(['session_id', 'impressions'])['step_x'].agg('count').reset_index()
    print(df_impr.head())
    df_impr.rename(columns={'step_x': 'item_sid_impr_cnt'}, inplace=True)
    df_out = df_ori[['session_id', 'impressions']]
    df_out = df_out.merge(df_clk, on=['session_id', 'impressions'], how='left')
    df_out = df_out.merge(df_impr, on=['session_id', 'impressions'], how='left')
    print(df_out.head())
    df_out.columns = df_out.columns.astype(str)
    utils.save_df(df_out, feat)
import sys

import pandas as pd  # needed for pd.concat below; missing from the original imports

import utils
import cate_encoding
import config


def extract(df_ori, des):
    print(df_ori.shape)
    df_ori = df_ori.merge(df, on=['session_id'], how='left')
    df_ori['last_act_gap'] = df_ori['timestamp'] - df_ori['timestamp_x']
    df_ori.drop(['timestamp', 'timestamp_x'], axis=1, inplace=True)
    print(df_ori.head(10))
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)


nrows = None
tr = utils.load_df(config.data + 'train.csv', nrows=nrows)
te = utils.load_df(config.data + 'test.csv', nrows=nrows)
df = pd.concat([tr, te])
df = df[['session_id', 'timestamp', 'step', 'action_type', 'reference']]

trs = utils.load_df(config.feat + 'm3_tr_0.ftr')
tes = utils.load_df(config.feat + 'm3_te_0.ftr')
df_sample = pd.concat([trs, tes])
df_sample = df_sample[['session_id', 'timestamp']].drop_duplicates()

df = df.merge(df_sample, on='session_id', how='left')
print(df.head(10))
df = df[df.timestamp_x < df.timestamp_y]
df = df[['session_id', 'timestamp_x', 'action_type']].drop_duplicates(
    subset=['session_id'], keep='last')
df = cate_encoding.label_encode(df, 'action_type')
df.rename(columns={'action_type': 'sid_last_act'}, inplace=True)
type=str, default="none") parser.add_argument("-G", help="gravitational constant for the simulation", default=1, type=float) args = parser.parse_args() """/arguments""" """adjust spark settings""" spark = SparkSession.builder.getOrCreate() spark.conf.set("spark.sql.caseSensitive", "true") """load data""" df_t0 = utils.load_df(args.input, schema=schemas.clust, part="id", limit=args.limit) """setup simulation""" methods = { "eul1": IntergratorEuler(args.dt, args.G), "eul2": IntergratorEuler2(args.dt, args.G), "rk4": IntegratorRungeKutta4(args.dt, args.G), "vlf": IntegratorLeapfrog(args.dt, args.G), } nameStr = utils.clean_str( spark.conf.get("spark.app.name")) + "-" + spark.conf.get("spark.app.id") sopts = utils.SaveOptions(os.path.join(args.outputDir, nameStr), fformat=args.f, compression=args.comp, header="true")
# -*-coding:utf-8-*-
# @Time   :2020/3/6/9:58
# @Author :Lwf
# @Email  :S
# @File   :predict.py

# imports
from pandas import DataFrame
import joblib
import numpy as np

from utils import load_df


# build the submission file in the required format
def create_submission(ids, predictions, filename='submission.csv'):
    submissions = np.concatenate(
        (ids.reshape(len(ids), 1), predictions.reshape(len(predictions), 1)),
        axis=1)
    df = DataFrame(submissions)
    df.to_csv(filename, header=['id', 'click'], index=False)


classifier = joblib.load('classifier.pkl')
test_data_df = load_df('csv/test.csv', training=False)
print(test_data_df)
ids = test_data_df.values[0:, 0]
print(ids)
predictions = classifier.predict(test_data_df.values[0:, 1:])
print(predictions)
create_submission(ids, predictions)
from pandas import DataFrame
from sklearn.externals import joblib
import numpy as np

from utils import load_df


def create_submission(ids, predictions, filename='../out/submission_unit.csv'):
    submissions = np.concatenate(
        (ids.reshape(len(ids), 1), predictions.reshape(len(predictions), 1)),
        axis=1)
    df = DataFrame(submissions)
    df.to_csv(filename, header=['id', 'click'], index=False)


print("*******************test starting***************************")
classifier = joblib.load('../out/model/classifier_unit.pkl')
# test_data_df = load_df('csv/test', training=False)
test_data_df = load_df('../datasets/test/test', training=False)
ids = test_data_df.values[0:, 0]
print(ids)
predictions = classifier.predict(test_data_df.values[0:, 1:])
create_submission(ids, predictions)
print("*******************test end***************************")
def dump_feat(ori, des):
    df = utils.load_df(ori)
    df = df[cols + ['session_id', 'impressions']]
    df.columns = re_cols + ['session_id', 'impressions']
    print(df.shape)
    utils.save_df(df, des)
print("AUC: ", metrics.roc_auc_score(true_values, predicted_values)) print("Confusion Matrix: ", +metrics.confusion_matrix(true_values, predicted_values)) print(metrics.classification_report(true_values, predicted_values)) # 拟合分类器 def classify(classifier_class, train_input, train_targets): classifier_object = classifier_class() classifier_object.fit(train_input, train_targets) return classifier_object # 模型存储 def save_model(clf): joblib.dump(clf, 'classifier.pkl') train_data = load_df('csv/train_small.csv').values # print(train_data) X_train, X_test, y_train, y_test = train_test_split(train_data[0::, 1::], train_data[0::, 0], test_size=0.3, random_state=0) # print(X_train) # print(y_train) classifier = classify(LogisticRegression, X_train, y_train) predictions = classifier.predict(X_test) print_metrics(y_test, predictions) save_model(classifier)
def main(afile=p.FN):
    # ATTENTION: re-enable the following lines
    if BACKUP:
        backup()
    if DOWNLOAD:
        dt = datetime.now()
        rc = get_data(dt.date().strftime(DATEURL_FMT))
        if rc == -1:
            print("error downloading today's data")
            sys.exit(-1)

    #import pdb; pdb.set_trace()
    #df = pd.read_csv(Path(u.DIR_DATA) / afile)
    df = u.load_df(
        Path(u.DIR_DATA) / afile, pd.read_csv, u.COLUMNS_ITALY['it'])
    df = p.shape_data(df)
    p.make_plot(df, afile, 'it')
    p.make_plot(df, afile, 'en')
    # END ATTENTION

    p.make_national(
        df,
        'nuovi_positivi',
        afile,
        xlabel='data',
        ylabel='numero giornaliero di nuovi casi',
        title=f'Covid-19: andamento temporale giornaliero di nuovi positivi (nazionale)',
        lang='it')
    p.make_national(
        df,
        'nuovi_positivi',
        afile,
        xlabel='date',
        ylabel='daily number of new cases',
        title=f'Covid-19: time trend of daily (national) new cases',
        lang='en')

    rdf = u.load_df(
        Path(u.DIR_DATA) / h.FN, pd.read_csv, u.COLUMNS_RITALY['it'])
    rdf = h.shape_data(rdf)
    #p.make_rplot(rdf)
    p.make_rplot(
        rdf,
        h.FN,
        xlabel='data',
        ylabel='numero totale di casi',
        title=f'Covid-19: andamento temporale per le {p.N_MOST_HITTED} regioni più colpite',
        lang='it')
    p.make_rplot(
        rdf,
        h.FN,
        xlabel='date',
        ylabel='total number of cases',
        title=f'Covid-19: temporal trend for the {p.N_MOST_HITTED} most hitted regions',
        lang='en')
    h.make_histogram(rdf, h.FN, 'it')
    h.make_histogram(rdf, h.FN, 'en')

    article = make_article(FN_IT_TEMPLATE, df, rdf, 'it', TAB_TITLE)
    article = make_article(FN_EN_TEMPLATE, df, rdf, 'en', TAB_EN_TITLE)

    # ATTENTION: re-enable the following lines
    if u.ENABLE_LDFA:
        to_ldfa()
    if u.ENABLE_PRODUCTION:
        to_production()
from utils import load_df


def print_metrics(true_values, predicted_values):
    print "Accuracy: ", metrics.accuracy_score(true_values, predicted_values)
    print "AUC: ", metrics.roc_auc_score(true_values, predicted_values)
    print "Confusion Matrix: ", + metrics.confusion_matrix(true_values, predicted_values)
    print metrics.classification_report(true_values, predicted_values)


def classify(classifier_class, train_input, train_targets):
    classifier_object = classifier_class()
    classifier_object.fit(train_input, train_targets)
    return classifier_object


def save_model(clf):
    joblib.dump(clf, 'classifier.pkl')


train_data = load_df('csv/train_small.csv').values
X_train, X_test, y_train, y_test = train_test_split(train_data[0::, 1::],
                                                     train_data[0::, 0],
                                                     test_size=0.3,
                                                     random_state=0)
classifier = classify(LogisticRegression, X_train, y_train)
predictions = classifier.predict(X_test)
print_metrics(y_test, predictions)
save_model(classifier)
    tr.drop(['current_filters', 'reference', 'action_type'],
            axis=1,
            inplace=True)
    te.drop(['current_filters', 'reference', 'action_type', 'target'],
            axis=1,
            inplace=True)
    utils.save_df(te, config.data + 'm3_te.ftr')
    return tr, te


def gen_tr_click(df):
    df = df[['session_id', 'reference']].drop_duplicates(
        subset='session_id', keep='last').reset_index(drop=True)
    print(df.shape)
    df = df[pd.notnull(df.reference)].reset_index(drop=True)
    print(df.shape)
    utils.save_df(df, config.data + 'm3_tr_click.ftr')


if __name__ == '__main__':
    nrow = None
    train = utils.load_df(config.data + 'sample_train.csv', nrows=nrow)
    test = utils.load_df(config.data + 'sample_test.csv', nrows=nrow)
    df = pd.concat([train, test]).reset_index(drop=True)
    tr1 = gen_train_sample(train)
    tr2, te = get_test_sample(test)
    tr = pd.concat([tr1, tr2]).reset_index(drop=True)
    utils.save_df(tr1, config.data + 'm3_tr.ftr')
    gen_tr_click(df)
from pandas import DataFrame
from sklearn.externals import joblib
import numpy as np

from utils import load_df


def create_submission(ids, predictions, filename='submission.csv'):
    submissions = np.concatenate(
        (ids.reshape(len(ids), 1), predictions.reshape(len(predictions), 1)),
        axis=1)
    df = DataFrame(submissions)
    df.to_csv(filename, header=['id', 'click'], index=False)


classifier = joblib.load('classifier.pkl')
test_data_df = load_df('csv/test', training=False)
ids = test_data_df.values[0:, 0]
predictions = classifier.predict(test_data_df.values[0:, 1:])
create_submission(ids, predictions)
def main(adate):
    if u.ENABLE_DEBUG:
        breakpoint(header=f'world.py, main({adate}), break before download data')

    # get data
    if DOWNLOAD:
        df = get_data(adate)
        if type(df) == type(None):
            # we cannot do "df == None" because if df is a dataframe it raises ValueError
            print('data from Internet not updated. run interrupted')
            sys.exit(1)
    else:
        df = u.load_df(u.DIR_DATA + '/' + FNAME.format(adate),
                       pd.read_csv,
                       u.COLUMNS_WORLD['en'],
                       encoding='cp1250')

    df = world_shape(df)
    df_world = df.copy()
    df_eu = modify_by_area(df)

    # make three graphs:
    # the 10 hardest-hit countries
    make_spagetti(df,
                  datetime.strptime('2020-01-14', u.D_FMT).date(),
                  Path(u.DIR_WIMG) /
                  (os.path.splitext(FNAME.format(adate))[0] + '_1-10.png'),
                  title=TITLE10.format('2020-01-14'),
                  ss=0,
                  se=10)
    # from the 3rd to the 10th of the 10 hardest-hit countries
    make_spagetti(df,
                  datetime.strptime('2020-02-20', u.D_FMT).date(),
                  Path(u.DIR_WIMG) /
                  (os.path.splitext(FNAME.format(adate))[0] + '_3-10.png'),
                  title=TITLE09.format('2020-02-20'),
                  ss=2,
                  se=10)
    # from the 1st to the 10th of the 10 hardest-hit EU countries
    make_spagetti(df_eu,
                  datetime.strptime('2020-02-20', u.D_FMT).date(),
                  Path(u.DIR_WIMG) /
                  (os.path.splitext(FNAME.format(adate))[0] + '_eu_1-10.png'),
                  title=TITLEEU.format('2020-02-20'),
                  ss=0,
                  se=10)

    # make articles
    make_article(u.DIR_TEMPLATE + '/' + FN_IT_TEMPLATE, adate, df, df_world)
    make_article(u.DIR_TEMPLATE + '/' + FN_EN_TEMPLATE, adate, df, df_world)

    # make a summary for the whole world and save it to disk
    df2 = world_get_sum_on_last_day(df_world)
    columns = [
        'date', 'cases', 'death', 'death/cases', 'cases/population',
        'death/population', 'country'
    ]
    df2.to_csv(Path(u.DIR_DATA) / FSNAME.format(adate),
               columns=columns,
               index=False,
               float_format='%.6f')
    df.to_csv(Path(u.DIR_DATA) / FEUNAME, index=False)

    # copy results to the ldfa filesystem and to production
    if u.ENABLE_LDFA:
        to_ldfa(adate)
    if u.ENABLE_PRODUCTION:
        to_production(adate)
output_file_path = '/'.join(output_filepath.split('/')[:-1])
output_file_name = output_filepath.split('/')[-1]
try:
    os.mkdir(output_file_path)
except OSError:
    print('Failed to create output directory.')
else:
    print('Successfully created directory.')

# get the connection engine
engine = postgres_connector(
    database_host,
    5432,
    "intern_task",
    "candidate",
    "dcard-data-intern-2020"
)

df = load_df(engine, mode='test')
learn = load_learner(file_path, file_name, test=TabularList.from_df(df))
preds = learn.get_preds(ds_type=DatasetType.Test)[1].numpy()

final_df = pd.DataFrame({'post_key': df['post_key'], 'is_trending': preds})
final_df.to_csv('predictions.csv', header=True, index=False)

df['is_trending'] = df['like_count_36_hour'] >= 1000
df.is_trending = df.is_trending.astype(int)
y_true = df['is_trending'].to_numpy()
y_pred = preds
print('f1_score:')
print(f1_score(y_true, y_pred, average='macro'))
    print(df_ori.shape)
    df_ori = df_ori.merge(df_sid, on=['session_id'], how='left')
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)


rows = None
tr = pd.read_csv(config.data + 'train.csv',
                 usecols=['session_id', 'action_type', 'reference', 'impressions'],
                 nrows=rows)
print(tr.shape)
te = pd.read_csv(config.data + 'test.csv',
                 usecols=['session_id', 'action_type', 'reference', 'impressions'],
                 nrows=rows)
print(te.shape)
df = pd.concat([tr, te])
print(df.shape)
df = df[df.action_type == 'clickout item']
df_sid = extract_list_cnt(df)
print(df_sid.head())

trs = utils.load_df(config.feat + 'm3_tr_0.ftr')
tes = utils.load_df(config.feat + 'm3_te_0.ftr')
tr_out = trs[['session_id', 'impressions']]
te_out = tes[['session_id', 'impressions']]
extract(tr_out, config.feat + 'm3_tr_imprlist_feat2.ftr')
extract(te_out, config.feat + 'm3_te_imprlist_feat2.ftr')
def get_reciprocal_ranks(ps):
    """Calculate reciprocal ranks for recommendations."""
    mask = ps.reference == np.array(ps.impressions)
    if mask.sum() == 1:
        rranks = generate_rranks_range(0, len(ps.impressions))
        return np.array(rranks)[mask].min()
    else:
        return 0.0


def generate_rranks_range(start, end):
    """Generate reciprocal ranks for a given list length."""
    return 1.0 / (np.arange(start, end) + 1)


tr = pd.read_csv(config.model + '%s/tr_pred.csv' % sys.argv[1])
tr = tr.sort_values(by='target', ascending=False).reset_index(drop=True)
tr = tr.groupby(['session_id'])['impressions'].apply(list).reset_index()
print(tr.head())
print(tr.shape)

tr_click = utils.load_df(config.data + 'm3_tr_click.ftr')
tr = tr.merge(tr_click, on='session_id')
print(tr.shape)
print(tr.head())

tr['score'] = tr.apply(get_reciprocal_ranks, axis=1)
print(tr.score.mean())
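# Minimal sanity check of the reciprocal-rank logic above, on a made-up toy
# session (illustrative only, not real data): the clicked reference sits at
# position 3 of the ranked impression list, so its reciprocal rank is 1/3.
toy = pd.Series({'impressions': ['a', 'b', 'c', 'd'], 'reference': 'c'})
assert abs(get_reciprocal_ranks(toy) - 1.0 / 3) < 1e-9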
args = parser.parse_args()
"""/arguments"""

G = 1
TOLERANCE = 1e-04

res = []
data = [
    'c_0500.csv', 'c_0700.csv', 'c_0600.csv', 'c_1000.csv', 'c_0900.csv',
    'c_1200.csv', 'c_1100.csv', 'c_1500.csv', 'c_0300.csv', 'c_1800.csv',
    'c_1300.csv', 'c_0800.csv', 'c_1700.csv', 'c_0200.csv', 'c_0100.csv',
    'c_0400.csv', 'c_0000.csv', 'c_1600.csv', 'c_1400.csv'
]

for fname in data:
    df = utils.load_df(os.path.join(args.input, fname),
                       schema=schemas.clust,
                       part="id")
    e = cluster.calc_E(df)
    diff = abs(e - (-0.25))
    res.append([fname, e, -0.25, diff])

sc = SparkContext.getOrCreate()
res = sc.parallelize(res).toDF(schema=schemas.E_test_res)

utils.save_df(res, "E_TEST", args.outputDir, fformat="csv")