def main(api_base):
    stages_df = get_df('stages', api_base=api_base).rename(columns={
        'id': 'stage_id',
        'name': 'stage_name'
    }).drop(['parameters', 'script', 'script_md5', 'states'], axis=1)
    states_df = get_df('states', api_base=api_base).rename(columns={'id': 'state_id'})
    subjects_df = get_df('subjects', api_base=api_base).rename(columns={'id': 'state_id'})
    # note: the renamed 'state_id' column is immediately overwritten with the id nested in 'state'
    subjects_df['state_id'] = subjects_df['state'].map(lambda x: x['id'])
    subjects_df.drop(['state'], axis=1, inplace=True)
    stages_states_df = pd.merge(stages_df, states_df, on='stage_id')
    regimens_df = get_df('regimens', api_base=api_base).rename(columns={
        'id': 'regimen_id',
        'name': 'regimen_name'
    }).drop(['states', 'active'], axis=1)
    stages_states_regimens_df = pd.merge(stages_states_df, regimens_df, on='regimen_id')
    # print(stages_states_regimens_df)
    # print(subjects_df)
    subjects_df = (pd.merge(stages_states_regimens_df, subjects_df, on='state_id')
                   .sort_values('LabTracks_ID')
                   .set_index('LabTracks_ID'))[[
                       'regimen_name', 'stage_name', 'regimen_id', 'stage_id', 'state_id'
                   ]]
    print(subjects_df)

def fe7():
    prefix = '分支机构信息'
    df, train_num = utils.get_df(prefix, ['企业名称', '分支机构状态', '分支成立时间', '分支死亡时间'])
    df[prefix + '_分支机构状态'] = df[prefix + '_分支机构状态'].apply(lambda x: 1 if 1 == x else 0)
    df = pd.get_dummies(df, prefix=['dummy_' + prefix + '_分支机构状态'],
                        columns=[prefix + '_分支机构状态'])
    df[prefix + '_分支成立时间'] = df[prefix + '_分支成立时间'].astype('str').apply(
        lambda x: utils.get_date(x) if x != 'nan' and x != '1899/12/30' else np.nan)
    df[prefix + '_分支死亡时间'] = df[prefix + '_分支死亡时间'].astype('str').apply(
        lambda x: utils.get_date(x) if x != 'nan' else np.nan)
    df[prefix + '_分支成立时间_分支死亡时间_diff'] = \
        df[prefix + '_分支成立时间'] - df[prefix + '_分支死亡时间']
    raw_features = df.columns.values[1:]
    group = df.groupby('企业名称', as_index=False)
    for f in raw_features:
        if 'dummy' in f:
            df = pd.merge(df, utils.get_agg(group, f, ['sum', 'mean']), on='企业名称', how='left')
    df = pd.merge(df, group['企业名称'].agg({prefix + '_count': 'count'}), on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_分支成立时间', ['max', 'min', 'mean', 'diff_mean']),
                  on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_分支死亡时间', ['max', 'min', 'mean', 'diff_mean']),
                  on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_分支成立时间_分支死亡时间_diff', ['max', 'min', 'mean']),
                  on='企业名称', how='left')
    for f in raw_features:
        del df[f]
    train_df, test_df = df[:train_num], df[train_num:]
    train_df.drop_duplicates('企业名称', inplace=True)
    test_df.drop_duplicates('企业名称', inplace=True)
    return prefix, train_df, test_df

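# --- Illustrative sketch (not part of the original source) -------------------------------
# The fe* feature builders in this file lean on `utils.get_date` and `utils.get_agg`, whose
# implementations are not shown here. The two functions below are hypothetical stand-ins,
# written only to illustrate the assumed contract: get_date maps a date string to a numeric
# day offset, and get_agg returns per-公司 aggregates of one column named '<column>_<agg>'
# (with 'diff_mean' read as the mean gap between consecutive sorted values). The real
# helpers may differ in details such as the date origin or NaN handling.
import datetime

import numpy as np
import pandas as pd


def assumed_get_date(s, origin=datetime.date(2000, 1, 1)):
    """Hypothetical get_date: parse a date string into days since a fixed origin."""
    d = pd.to_datetime(s, errors='coerce')
    if pd.isna(d):
        return np.nan
    return (d.date() - origin).days


def assumed_get_agg(group, column, aggs):
    """Hypothetical get_agg: aggregate `column` per 企业名称, naming columns '<column>_<agg>'."""
    def diff_mean(s):
        s = s.dropna().sort_values()
        return s.diff().mean() if len(s) > 1 else np.nan

    funcs = [diff_mean if a == 'diff_mean' else a for a in aggs]
    out = group.obj.groupby('企业名称')[column].agg(funcs)  # .obj is the grouped DataFrame
    out.columns = [f'{column}_{a}' for a in aggs]
    return out.reset_index()
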
def fe4():
    prefix = '双公示-法人行政许可信息'
    df, train_num = utils.get_df(prefix, ['企业名称', '许可决定日期', '许可截止期'])
    df[prefix + '_许可决定日期'] = df[prefix + '_许可决定日期'].astype('str').apply(
        lambda x: utils.get_date(x) if x != 'nan' else np.nan)
    df[prefix + '_许可截止期'] = df[prefix + '_许可截止期'].astype('str').apply(
        lambda x: utils.get_date(x) if x != 'nan' else np.nan)
    df[prefix + '_许可决定日期_许可截止期_diff'] = df[prefix + '_许可决定日期'] - df[prefix + '_许可截止期']
    group = df.groupby('企业名称', as_index=False)
    df = pd.merge(df, utils.get_agg(group, prefix + '_许可决定日期', ['max', 'min', 'mean', 'diff_mean']),
                  on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_许可截止期', ['max', 'min', 'mean']),
                  on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_许可决定日期_许可截止期_diff', ['max', 'min', 'mean']),
                  on='企业名称', how='left')
    for f in ['许可决定日期', '许可截止期', '许可决定日期_许可截止期_diff']:
        del df[prefix + '_' + f]
    train_df, test_df = df[:train_num], df[train_num:]
    train_df.drop_duplicates('企业名称', inplace=True)
    test_df.drop_duplicates('企业名称', inplace=True)
    return prefix, train_df, test_df

def fe6():
    prefix = '许可资质年检信息'
    df, train_num = utils.get_df(prefix, ['企业名称', '年检结果', '年检事项名称', '年检日期'])
    df[prefix + '_年检结果'] = df[prefix + '_年检结果'].astype('str').apply(
        lambda x: '合格' if '合格' in x else '通过')
    df[prefix + '_年检事项名称'] = df[prefix + '_年检事项名称'].astype('str').apply(
        lambda x: x if '单位年检' == x or '对道路' in x else '其他')
    df = pd.get_dummies(df,
                        prefix=['dummy_' + prefix + '_年检事项名称', 'dummy_' + prefix + '_年检结果'],
                        columns=[prefix + '_年检事项名称', prefix + '_年检结果'])
    del df['dummy_' + prefix + '_年检事项名称_其他']
    df[prefix + '_年检日期'] = df[prefix + '_年检日期'].astype('str').apply(
        lambda x: utils.get_date(x) if x != 'nan' else np.nan)
    raw_features = df.columns.values[1:]
    group = df.groupby('企业名称', as_index=False)
    for f in raw_features:
        if 'dummy' in f:
            df = pd.merge(df, utils.get_agg(group, f, ['sum', 'mean']), on='企业名称', how='left')
    df = pd.merge(df, group['企业名称'].agg({prefix + '_count': 'count'}), on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_年检日期', ['max', 'min', 'mean', 'diff_mean']),
                  on='企业名称', how='left')
    for f in raw_features:
        del df[f]
    train_df, test_df = df[:train_num], df[train_num:]
    train_df.drop_duplicates('企业名称', inplace=True)
    test_df.drop_duplicates('企业名称', inplace=True)
    return prefix, train_df, test_df

def figs_rf_abc():
    df = get_df(path=f"{basepath_dfs}/supplement_rf_abc.csv")
    subfolder = "rf_abc"
    df.loc[df["algorithm"] == "RF-ABC", "algorithm"] = " RF-ABC"
    for tasks in tqdm(all_tasks):
        plot_task_metric(df, tasks, "C2ST", subfolder=subfolder)
        plot_task_metric(df, tasks, "C2ST", subfolder=subfolder, labels=False)

def fe2():
    def recruiting_numbers(x):
        if '若干' == x:
            return np.nan
        if '人' in x:
            return x[:-1]
        return x

    prefix = '招聘数据'
    df, train_num = utils.get_df(prefix, ['企业名称', '招聘人数', '招聘日期'])
    df[prefix + '_招聘人数'] = df[prefix + '_招聘人数'].fillna('若干').astype('str').apply(
        recruiting_numbers).astype('float32')
    df[prefix + '_招聘日期'] = df[prefix + '_招聘日期'].apply(utils.get_date)
    group = df.groupby('企业名称', as_index=False)
    df = pd.merge(df, group['企业名称'].agg({prefix + '_count': 'count'}), on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_招聘人数', ['mean', 'sum']),
                  on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_招聘日期', ['max', 'min', 'mean', 'diff_mean']),
                  on='企业名称', how='left')
    df[prefix + '_freq'] = df[prefix + '_count'] / (
        df[prefix + '_招聘日期_max'] - df[prefix + '_招聘日期_min'] + 1)
    for f in ['招聘人数', '招聘日期']:
        del df[prefix + '_' + f]
    train_df, test_df = df[:train_num], df[train_num:]
    train_df.drop_duplicates('企业名称', inplace=True)
    test_df.drop_duplicates('企业名称', inplace=True)
    return prefix, train_df, test_df

def train(task, size, data, shards, checkpoint):
    name = '{}/{}-{}/'.format(task, size, shards)
    model = getattr(models, task)[size]
    task = getattr(tasks, task)
    df = utils.get_df(data, shards)
    df = utils.normalize_df(df)
    df = df.sample(frac=1)
    dataset = utils.Data(df, task)
    callbacks = get_callbacks('logs/{}'.format(name))
    model.compile(optimizer='adam', loss=task['outputs'], metrics=task.get('metrics'))
    model.summary()
    model.fit(
        dataset,
        callbacks=callbacks,
        workers=2,
        max_queue_size=10,
        use_multiprocessing=True,
    )
    if checkpoint:
        model.save('checkpoints/{}'.format(name))

def fe9():
    def date_proc(x1, x2):
        if 'nan' == x1 and 'nan' == x2:
            return np.nan
        if x1 != 'nan':
            return utils.get_date(x1)
        return utils.get_date(x2)

    def cert_name(x):
        if '高新技术' in x:
            return '高新技术'
        if '建筑施工' in x:
            return '建筑施工'
        return '其他'

    prefix = '资质登记(变更)信息'
    df, train_num = utils.get_df(prefix, ['企业名称', '资质名称', '资质生效期', '资质截止期', '认定日期'])
    df[prefix + '_资质名称'] = df[prefix + '_资质名称'].astype('str').apply(cert_name)
    df = pd.get_dummies(df, prefix=['dummy_' + prefix + '_资质名称'], columns=[prefix + '_资质名称'])
    df[prefix + '_资质生效期'] = df[[prefix + '_资质生效期', prefix + '_认定日期']].astype('str').apply(
        lambda x: date_proc(x[prefix + '_资质生效期'], x[prefix + '_认定日期']), axis=1)
    df[prefix + '_资质截止期'] = df[prefix + '_资质截止期'].astype('str').apply(
        lambda x: utils.get_date(x) if x != 'nan' and x[:4] != '1950' else np.nan)
    df[prefix + '_资质生效期_资质截止期_diff'] = df[prefix + '_资质生效期'] - df[prefix + '_资质截止期']
    raw_features = df.columns.values[1:]
    group = df.groupby('企业名称', as_index=False)
    for f in raw_features:
        if 'dummy' in f:
            df = pd.merge(df, utils.get_agg(group, f, ['sum', 'mean']), on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_资质生效期', ['max', 'min', 'mean']),
                  on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_资质截止期', ['min']), on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_资质生效期_资质截止期_diff', ['min']),
                  on='企业名称', how='left')
    for f in raw_features:
        del df[f]
    train_df, test_df = df[:train_num], df[train_num:]
    train_df.drop_duplicates('企业名称', inplace=True)
    test_df.drop_duplicates('企业名称', inplace=True)
    return prefix, train_df, test_df

def update_data(session, dataset):
    print("Inserting data into table article")
    ospaths = get_paths()
    session = ospaths["datadir"] + session + "/"
    df = get_df(session + dataset, drop_nans=True)
    df.to_sql(name="article", con=db.engine, index=False, if_exists="replace")

def fe5():
    prefix = '企业非正常户认定'
    df, train_num = utils.get_df(prefix, ['企业名称', '认定日期'])
    df[prefix] = 1
    df[prefix + '_认定日期'] = df[prefix + '_认定日期'].apply(utils.get_date)
    train_df, test_df = df[:train_num], df[train_num:]
    train_df.drop_duplicates('企业名称', inplace=True)
    test_df.drop_duplicates('企业名称', inplace=True)
    return prefix, train_df, test_df

def figs_sl():
    df = get_df(path=f"{basepath_dfs}/supplement_sl.csv")
    subfolder = "sl"
    df.loc[df["algorithm"] == "SL", "algorithm"] = " SL"
    df.loc[df["algorithm"] == "NLE-MAF", "algorithm"] = "NLE"
    df.loc[df["algorithm"] == "SNLE-MAF", "algorithm"] = "SNLE"
    for tasks in tqdm(all_tasks):
        plot_task_metric(df, tasks, "C2ST", subfolder=subfolder)
        plot_task_metric(df, tasks, "C2ST", subfolder=subfolder, labels=False)

def figs_main_paper():
    df = get_df(path=f"{basepath_dfs}/main_paper.csv")
    df.loc[df["algorithm"] == "REJ-ABC", "algorithm"] = " REJ-ABC"
    for tasks in tqdm(all_tasks):
        for metric in all_metrics:
            plot_task_metric(df, tasks, metric, subfolder="main_paper")
            plot_task_metric(df, tasks, metric, subfolder="main_paper", labels=False)

def test_ping_h1_h5_and_h2_h5_balance_of_charges():
    utils.get_output("wireshark/s2-eth1-h1-ping-h5.pcap", "test/s2-ping-h1-h5.txt")
    utils.get_output("wireshark/s2-eth1-h2-ping-h5.pcap", "test/s2-ping-h2-h5.txt")
    utils.get_output("wireshark/s3-eth1-h1-ping-h5.pcap", "test/s3-ping-h1-h5.txt")
    utils.get_output("wireshark/s3-eth1-h2-ping-h5.pcap", "test/s3-ping-h2-h5.txt")

    def callback(x):
        return x['dst'] == '10.0.0.5' and x['protocol'] == 'ICMP'

    df_1 = utils.get_df("test/s2-ping-h1-h5.txt")
    df_1.columns = ['src', 'dst', 'protocol']
    df_1 = utils.filter_by(df_1, ['dst', 'protocol'], callback)
    size_1 = len(df_1.index)

    df_2 = utils.get_df("test/s2-ping-h2-h5.txt")
    df_2.columns = ['src', 'dst', 'protocol']
    df_2 = utils.filter_by(df_2, ['dst', 'protocol'], callback)
    size_2 = len(df_2.index)

    df_3 = utils.get_df("test/s3-ping-h1-h5.txt")
    df_3.columns = ['src', 'dst', 'protocol']
    df_3 = utils.filter_by(df_3, ['dst', 'protocol'], callback)
    size_3 = len(df_3.index)

    df_4 = utils.get_df("test/s3-ping-h2-h5.txt")
    df_4.columns = ['src', 'dst', 'protocol']
    df_4 = utils.filter_by(df_4, ['dst', 'protocol'], callback)
    size_4 = len(df_4.index)

    os.system("rm test/s2-ping-h1-h5.txt")
    os.system("rm test/s2-ping-h2-h5.txt")
    os.system("rm test/s3-ping-h1-h5.txt")
    os.system("rm test/s3-ping-h2-h5.txt")

    # each flow should show up on exactly one of the two captures (s2 xor s3)
    ecmp_h1_h5 = (size_1 == 0 and size_3 != 0) or (size_1 != 0 and size_3 == 0)
    ecmp_h2_h5 = (size_2 == 0 and size_4 != 0) or (size_2 != 0 and size_4 == 0)
    assert ecmp_h1_h5 and ecmp_h2_h5

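# --- Illustrative sketch (not part of the original source) -------------------------------
# The network tests in this section call `utils.filter_by(df, columns, callback)` without
# its implementation appearing here. The helper below is an assumption about that contract:
# keep only the rows for which the row-level callback, applied over the listed columns,
# returns True. The real utils.filter_by may be implemented differently.
def assumed_filter_by(df, columns, callback):
    """Hypothetical filter_by: boolean-mask `df` with a per-row callback over `columns`."""
    if df.empty:
        return df
    mask = df[columns].apply(callback, axis=1)
    return df[mask]
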
def run(session_name, target_size):
    print(f"Starting preprocessing of images for session {session_name}")
    global SESSION, IMAGE_SESSION
    SESSION = session_name
    IMAGE_SESSION = SESSION + "resized_images/"
    if not os.path.exists(IMAGE_SESSION):
        os.mkdir(IMAGE_SESSION)
        print(f"New directory: {IMAGE_SESSION}")
    f = utils.get_df(SESSION + "id_path.csv")
    f = f.path.values.tolist()
    copy_and_resize_sample_images(f, target_size)

def compute_distance(df, sp, feature, normalize):
    print(f"Computing distance similarity for {feature}")
    if not isinstance(df, pd.DataFrame):
        measures = utils.get_df(df)
    else:
        measures = df
    func = calculate_distance
    if normalize:
        func = normalize_and_calculate_distance
    results = for_pivot(measures[feature], measures, func)
    print()
    save_as_pivot(results, sp=sp)

def fe1():
    prefix = '企业基本信息&高管信息&投资信息'
    df, train_num = utils.get_df(prefix, [
        '企业名称', '注册资金', '注册资本(金)币种名称', '企业(机构)类型名称', '行业门类代码',
        '成立日期', '核准日期', '住所所在地省份', '姓名', '法定代表人标志', '首席代表标志',
        '职务', '投资人', '出资比例'])
    df[prefix + '_出资比例'] = df[prefix + '_出资比例'].apply(lambda x: x if x <= 1 else x / 100)
    df[prefix + '_注册资金'] = df[[prefix + '_注册资金', prefix + '_注册资本(金)币种名称']].apply(
        lambda x: x[prefix + '_注册资金']
        if x[prefix + '_注册资本(金)币种名称'] not in utils.exch_rate.keys()
        else x[prefix + '_注册资金'] * utils.exch_rate[x[prefix + '_注册资本(金)币种名称']],
        axis=1).fillna(0)
    df[prefix + '_注册资金_binning'] = df[prefix + '_注册资金'].apply(
        lambda x: utils.binning(x, [300, 500, 1000, 3000, 6000]))
    df[prefix + '_成立日期'] = df[prefix + '_成立日期'].astype('str').apply(
        lambda x: utils.get_date(x[:10]) if x != 'nan' else np.nan)
    df[prefix + '_核准日期'] = df[prefix + '_核准日期'].apply(lambda x: utils.get_date(x[:10]))
    df[prefix + '_成立日期_核准日期_diff'] = df[prefix + '_成立日期'] - df[prefix + '_核准日期']
    df[prefix + '_法定代表人职务'] = df[[prefix + '_法定代表人标志', prefix + '_职务']].apply(
        lambda x: x[prefix + '_职务'] if x[prefix + '_法定代表人标志'] == '是' else np.nan, axis=1)
    df[prefix + '_首席代表职务'] = df[[prefix + '_首席代表标志', prefix + '_职务']].apply(
        lambda x: x[prefix + '_职务'] if x[prefix + '_首席代表标志'] == '是' else np.nan, axis=1)
    df = pd.merge(df,
                  df.dropna(subset=[prefix + '_姓名']).groupby(prefix + '_姓名', as_index=False)[
                      '企业名称'].agg({prefix + '_姓名_企业名称_nunique': 'nunique'}),
                  on=prefix + '_姓名', how='left')
    df = pd.merge(df,
                  df.dropna(subset=[prefix + '_投资人']).groupby(prefix + '_投资人', as_index=False)[
                      '企业名称'].agg({prefix + '_投资人_企业名称_nunique': 'nunique'}),
                  on=prefix + '_投资人', how='left')
    group = df.groupby('企业名称', as_index=False)
    df = pd.merge(df, utils.get_agg(group, prefix + '_姓名', ['nunique']), on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_投资人', ['nunique']), on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_姓名_企业名称_nunique', ['max', 'mean', 'sum']),
                  on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_投资人_企业名称_nunique', ['max', 'mean', 'sum']),
                  on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_出资比例', ['max', 'min', 'mean']),
                  on='企业名称', how='left')
    f_pairs = [
        [prefix + '_住所所在地省份', prefix + '_企业(机构)类型名称'],
        [prefix + '_住所所在地省份', prefix + '_行业门类代码'],
        [prefix + '_注册资金_binning', prefix + '_企业(机构)类型名称'],
        [prefix + '_注册资金_binning', prefix + '_行业门类代码'],
        [prefix + '_企业(机构)类型名称', prefix + '_行业门类代码']
    ]
    df = utils.get_ratio(df, f_pairs)
    for f in ['注册资本(金)币种名称', '姓名', '法定代表人标志', '首席代表标志', '职务', '投资人',
              '出资比例', '姓名_企业名称_nunique', '投资人_企业名称_nunique']:
        del df[prefix + '_' + f]
    train_df, test_df = df[:train_num], df[train_num:]
    train_df.drop_duplicates('企业名称', inplace=True)
    train_df.dropna(subset=[prefix + '_成立日期'], inplace=True)
    test_df.drop_duplicates('企业名称', inplace=True)
    for f in ['法定代表人职务', '首席代表职务', '企业(机构)类型名称', '行业门类代码', '住所所在地省份']:
        label_dict = dict(zip(train_df[prefix + '_' + f].unique(),
                              range(train_df[prefix + '_' + f].nunique())))
        train_df[prefix + '_' + f] = train_df[prefix + '_' + f].map(label_dict).fillna(-1).astype('int16')
        test_df[prefix + '_' + f] = test_df[prefix + '_' + f].map(label_dict).fillna(-1).astype('int16')
    return prefix, train_df, test_df

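# --- Illustrative sketch (not part of the original source) -------------------------------
# fe1 also depends on `utils.exch_rate` and `utils.binning`, which are not shown in this
# section. A plausible reading of how they are used above: exch_rate maps a currency name to
# a factor for converting registered capital into a common unit, and binning maps a numeric
# value to the index of the first bin edge it does not exceed. Both the dictionary entries
# and the exact binning rule below are assumptions for illustration only.
exch_rate_sketch = {'美元': 7.0, '港元': 0.9}  # hypothetical entries and rates


def binning_sketch(x, edges):
    """Hypothetical binning: index of the first edge >= x; len(edges) if x exceeds them all."""
    for i, edge in enumerate(edges):
        if x <= edge:
            return i
    return len(edges)
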
def test_tcp_balance_of_charges():
    utils.get_output("wireshark/s2-eth1-iperf-h1-h4.pcap", "test/s2-h1-h4.txt")
    utils.get_output("wireshark/s3-eth1-iperf-h1-h4.pcap", "test/s3-h1-h4.txt")

    def callback(x):
        return x['dst'] == '10.0.0.4' and x['protocol'] == 'TCP'

    df_1 = utils.get_df("test/s2-h1-h4.txt")
    df_1.columns = ['src', 'dst', 'protocol']
    df_1 = utils.filter_by(df_1, ['dst', 'protocol'], callback)
    size_1 = len(df_1.index)

    df_2 = utils.get_df("test/s3-h1-h4.txt")
    df_2.columns = ['src', 'dst', 'protocol']
    df_2 = utils.filter_by(df_2, ['dst', 'protocol'], callback)
    size_2 = len(df_2.index)

    os.system("rm test/s2-h1-h4.txt")
    os.system("rm test/s3-h1-h4.txt")

    assert (size_1 == 0 and size_2 != 0) or (size_1 != 0 and size_2 == 0)

def fe12():
    prefix = '法人行政许可注(撤、吊)销信息'
    df, train_num = utils.get_df(prefix, ['企业名称', '注(撤、吊)销批准日期'])
    df[prefix + '_注(撤、吊)销批准日期'] = df[prefix + '_注(撤、吊)销批准日期'].apply(utils.get_date)
    group = df.groupby('企业名称', as_index=False)
    df = pd.merge(df, group['企业名称'].agg({prefix + '_count': 'count'}), on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_注(撤、吊)销批准日期', ['max']),
                  on='企业名称', how='left')
    for f in ['注(撤、吊)销批准日期']:
        del df[prefix + '_' + f]
    train_df, test_df = df[:train_num], df[train_num:]
    train_df.drop_duplicates('企业名称', inplace=True)
    test_df.drop_duplicates('企业名称', inplace=True)
    return prefix, train_df, test_df

def figs_hyperparameters_snre():
    df = get_df(path=f"{basepath_dfs}/supplement_hyperparameters_snre.csv")
    for tasks in tqdm(all_tasks):
        for metric in ["C2ST", "RT"]:
            plot_task_metric(df, tasks, metric, subfolder="hyperparameters_snre")
            plot_task_metric(df, tasks, metric, subfolder="hyperparameters_snre", labels=False)

def copy_images(src, dst):
    """Copy the images referenced by a sample CSV into a destination directory.

    :param src: path to the current sample CSV
    :param dst: destination directory for the copied images
    """
    df = utils.get_df(src, drop_nans=False, dt=True)
    idpath = utils.get_id_path_pairs(df, from_path="drive")
    images = list(idpath.values())
    ids = list(idpath.keys())
    for i in range(len(images)):
        file = images[i]
        ext = file[file.rfind("."):]
        shutil.copyfile(file, dst + ids[i] + ext)

def fe11():
    prefix = '双打办打击侵权假冒处罚案件信息'
    df, train_num = utils.get_df(prefix, ['企业名称', '提供日期'])
    df[prefix + '_提供日期'] = df[prefix + '_提供日期'].astype('str').apply(
        lambda x: utils.get_date(x) if x != 'nan' else np.nan)
    group = df.groupby('企业名称', as_index=False)
    df = pd.merge(df, group['企业名称'].agg({prefix + '_count': 'count'}), on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_提供日期', ['max']), on='企业名称', how='left')
    for f in ['提供日期']:
        del df[prefix + '_' + f]
    train_df, test_df = df[:train_num], df[train_num:]
    train_df.drop_duplicates('企业名称', inplace=True)
    test_df.drop_duplicates('企业名称', inplace=True)
    return prefix, train_df, test_df

def raw(self):
    if self._raw is not None:
        return self._raw
    df = get_df(
        self.host,
        self.path,
        self._refresh,
        self._verbose,
        errors='raise',
        parse_dates=['data'],
    )
    self._raw = df  # cache
    return df

def test_udp_denial_of_service():
    utils.get_output("wireshark/s5-eth1-iperf-udp-h1-h5.pcap", "test/s5-udp-h1-h5.txt")

    def callback(x):
        return x['dst'] == '10.0.0.5' and x['protocol'] == 'UDP'

    df_1 = utils.get_df("test/s5-udp-h1-h5.txt")
    df_1.columns = ['src', 'dst', 'protocol']
    df_1 = utils.filter_by(df_1, ['dst', 'protocol'], callback)
    size_1 = len(df_1.index)

    assert size_1 < 893
    os.system("rm test/s5-udp-h1-h5.txt")

def fe8():
    prefix = '企业税务登记信息'
    df, train_num = utils.get_df(prefix, ['企业名称', '审核时间', '登记注册类型', '审核结果'])
    df[prefix + '_审核时间'] = df[prefix + '_审核时间'].apply(utils.get_date)
    df[prefix + '_审核结果'] = df[prefix + '_审核结果'].apply(
        lambda x: x if '江苏省苏州地方税务局' == x or '开业' == x or '正常' == x else '其他')
    f_pairs = [[prefix + '_登记注册类型', prefix + '_审核结果']]
    df = utils.get_ratio(df, f_pairs)
    train_df, test_df = df[:train_num], df[train_num:]
    train_df.drop_duplicates('企业名称', keep='last', inplace=True)
    test_df.drop_duplicates('企业名称', keep='last', inplace=True)
    for f in [prefix + '_审核结果', prefix + '_登记注册类型']:
        label_dict = dict(zip(train_df[f].unique(), range(train_df[f].nunique())))
        train_df[f] = train_df[f].map(label_dict).fillna(-1).astype('int16')
        test_df[f] = test_df[f].map(label_dict).fillna(-1).astype('int16')
    return prefix, train_df, test_df

def create_image_paths(df_path, cat):
    df = utils.get_df(df_path, category=cat, drop_nans=True, drop_duplicates=True)
    file_paths = utils.get_id_path_pairs(df, from_path="drive", ignore_types="gif")
    file_paths = list(file_paths.keys())
    df = df[df.id.isin(file_paths)]
    df.to_csv(SESSION + "politics.csv", index=False)
    utils.save_clean_copy(df, SESSION + "politics.csv")
    utils.get_id_path_pairs(df, save_path=SESSION + "id_path_all.csv", from_path="drive")

def _get_data(self, refresh, verbose):
    """Download or fetch data from cache.

    Downloads 1 file per day, and unions files with the same schema.
    Unioning across schemas is taken care of by `self._consolidate`.

    Returns:
        Dict[str, DataFrame]: One DataFrame per schema version, keyed by schema name.
    """
    schemas = {
        '20200122': {
            'dates': pd.date_range('2020-01-22', '2020-02-29'),
            'kwargs': dict(parse_dates=['Last Update']),
        },
        '20200301': {
            'dates': pd.date_range('2020-03-01', '2020-03-21'),
            'kwargs': dict(parse_dates=['Last Update']),
        },
        '20200322': {
            'dates': pd.date_range('2020-03-22', dt.datetime.today()),
            'kwargs': dict(parse_dates=['Last_Update']),
        },
    }
    schemas_dfs = {}
    for schema_name, schema in schemas.items():
        files_dfs = []
        for date in schema['dates']:
            filename = f"{date.strftime('%m-%d-%Y')}.csv"
            df = get_df(
                self.host,
                f"{self.path}/{filename}",
                refresh,
                verbose,
                errors='ignore',
                **schema['kwargs']
            )
            if df is not None:
                files_dfs.append(df.assign(_filedate=date, _filename=filename))
        # validate schemas
        columns = list(map(lambda df: set(df.columns), files_dfs))
        assert all(x == columns[0] for x in columns), f"schemas differ for {schema_name}"
        schemas_dfs[schema_name] = pd.concat(files_dfs)
    return schemas_dfs

def raw(self):
    if self._raw is not None:
        return self._raw
    df = get_df(
        self.host,
        self.path,
        self._refresh,
        self._verbose,
        errors='raise',
        parse_dates=['dateRep'],
        dayfirst=True,
        keep_default_na=False,  # don't treat Namibia as 'NA'
    )
    self._raw = df  # cache
    return df

def figs_hyperparameters_rej_abc():
    df = get_df(path=f"{basepath_dfs}/supplement_hyperparameters_rej_abc.csv")
    for tasks in tqdm(all_tasks):
        for metric in ["C2ST"]:
            plot_task_metric(
                df,
                tasks,
                metric,
                subfolder="hyperparameters_rej_abc",
                default_color=get_colors(df=df, include_defaults=True)["REJ"],
            )
            plot_task_metric(
                df,
                tasks,
                metric,
                subfolder="hyperparameters_rej_abc",
                labels=False,
                default_color=get_colors(df=df, include_defaults=True)["REJ"],
            )

def figs_abc_lra_sass():
    df = get_df(path=f"{basepath_dfs}/supplement_abc_lra_sass.csv")
    for tasks in tqdm(all_tasks):
        for metric in ["C2ST"]:
            plot_task_metric(
                df,
                tasks,
                metric,
                subfolder="abc_lra_sass",
                default_color=get_colors(df=df),
            )
            plot_task_metric(
                df,
                tasks,
                metric,
                subfolder="abc_lra_sass",
                labels=False,
                default_color=get_colors(df=df),
            )

def get_data_loaders(bs=8, num_workers=0, shuffle=True, ts=0.2):
    train_df, img_2_ohe_vector = get_df()
    train_imgs, val_imgs = train_test_split(
        train_df['Image'].values,
        test_size=ts,
        stratify=train_df['Class'].map(lambda x: str(sorted(list(x)))),
        random_state=42)
    print(train_imgs)
    print(val_imgs)
    print(len(train_imgs))
    print(len(val_imgs))
    train_dataset = CloudDataset(img_2_ohe_vector, img_ids=train_imgs,
                                 transforms=get_training_augmentation())
    train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=shuffle,
                              num_workers=num_workers)
    val_dataset = CloudDataset(img_2_ohe_vector, img_ids=val_imgs,
                               transforms=get_validation_augmentation())
    val_loader = DataLoader(val_dataset, batch_size=bs, shuffle=shuffle,
                            num_workers=num_workers)
    return train_loader, val_loader

def setUp(self):
    df = get_df()
    self.check = Check(df)
    self.jvmCheck = self.check.jvmCheck

    # perform the fit
    res = optimize.minimize(func_to_optimize, x0,
                            args=(distance_array, time_array, fare_array),
                            method='TNC', bounds=bnds)
    grid_dist = utils.grid_distance(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)

    # get the predictions
    time_pred = utils.time_prediction(speed_array.mean(), grid_dist, dist_sf_array.mean())
    fare_pred = utils.fare_prediction(res.x[0], grid_dist, dist_sf_array.mean(),
                                      res.x[1], res.x[2], res.x[3])
    if res.success:
        return [fare_pred, time_pred]
    else:
        return [0, 0]


# read in dataframe and cache it
df1 = utils.get_df(sqlContext, 1, 1)
df1.cache()

# get random sample for prediction
test_sample = df1.sample(False, 0.1, seed=42).limit(500).toPandas()
test_sample.columns = df1.columns
test_fare = test_sample["total_notip"].tolist()
test_time = test_sample["trip_time_in_secs"].tolist()
pred_fare = []
pred_time = []

# get prediction for each event
for index, row in test_sample.iterrows():
    print('Processing event ' + str(index))
    event = [row['pickup'], utils.dayofweek(row['pickup']), row['pick_lat'], row['pick_lon'],
             row['drop_lat'], row['drop_lon'], row['pc']]
    prediction = make_prediction(event, df1)