Example 1
def main(api_base):
    stages_df = get_df('stages', api_base=api_base).rename(columns={
        'id': 'stage_id',
        'name': 'stage_name'
    }).drop(['parameters', 'script', 'script_md5', 'states'], axis=1)
    states_df = get_df('states',
                       api_base=api_base).rename(columns={'id': 'state_id'})
    subjects_df = get_df('subjects',
                         api_base=api_base).rename(columns={'id': 'state_id'})
    subjects_df['state_id'] = subjects_df['state'].map(lambda x: x['id'])
    subjects_df.drop(['state'], axis=1, inplace=True)
    stages_states_df = pd.merge(stages_df, states_df, on='stage_id')
    regimens_df = get_df('regimens',
                         api_base=api_base).rename(columns={
                             'id': 'regimen_id',
                             'name': 'regimen_name'
                         }).drop(['states', 'active'], axis=1)
    stages_states_regimens_df = pd.merge(stages_states_df,
                                         regimens_df,
                                         on='regimen_id')

    # print stages_states_regimens_df

    # print subjects_df
    subjects_df = (pd.merge(
        stages_states_regimens_df, subjects_df,
        on='state_id').sort_values('LabTracks_ID').set_index('LabTracks_ID'))[[
            'regimen_name', 'stage_name', 'regimen_id', 'stage_id', 'state_id'
        ]]

    print(subjects_df)
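The get_df helper called with api_base above is not part of this listing. A minimal sketch of what it might look like, assuming it simply fetches a JSON collection from a REST endpoint (the real helper may paginate or authenticate):

import pandas as pd
import requests

def get_df(endpoint, api_base):
    # Assumed behaviour: fetch a JSON collection such as 'stages' or
    # 'subjects' from the API and load it into a DataFrame.
    response = requests.get(f"{api_base}/{endpoint}")
    response.raise_for_status()
    return pd.DataFrame(response.json())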
Example 2
def fe7():
    prefix = '分支机构信息'
    df, train_num = utils.get_df(prefix, ['企业名称', '分支机构状态', '分支成立时间', '分支死亡时间'])
    df[prefix + '_分支机构状态'] = df[prefix + '_分支机构状态'].apply(lambda x: 1 if 1 == x else 0)
    df = pd.get_dummies(df, prefix=['dummy_' + prefix + '_分支机构状态'], columns=[prefix + '_分支机构状态'])
    df[prefix + '_分支成立时间'] = df[prefix + '_分支成立时间'].astype('str').apply(
        lambda x: utils.get_date(x) if x != 'nan' and x != '1899/12/30' else np.nan)
    df[prefix + '_分支死亡时间'] = df[prefix + '_分支死亡时间'].astype('str').apply(
        lambda x: utils.get_date(x) if x != 'nan' else np.nan)
    df[prefix + '_分支成立时间_分支死亡时间_diff']\
        = df[prefix + '_分支成立时间'] - df[prefix + '_分支死亡时间']
    raw_features = df.columns.values[1:]
    group = df.groupby('企业名称', as_index=False)
    for f in raw_features:
        if 'dummy' in f:
            df = pd.merge(df, utils.get_agg(group, f, ['sum', 'mean']), on='企业名称', how='left')
    df = pd.merge(df, group['企业名称'].agg({prefix + '_count': 'count'}), on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_分支成立时间', ['max', 'min', 'mean', 'diff_mean']),
                  on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_分支死亡时间', ['max', 'min', 'mean', 'diff_mean']),
                  on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_分支成立时间_分支死亡时间_diff', ['max', 'min', 'mean']),
                  on='企业名称', how='left')
    for f in raw_features:
        del df[f]
    train_df, test_df = df[:train_num], df[train_num:]
    train_df.drop_duplicates('企业名称', inplace=True)
    test_df.drop_duplicates('企业名称', inplace=True)
    return prefix, train_df, test_df
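Most of the fe*() examples lean on a utils.get_agg helper that is not included in this listing. The sketch below is only inferred from the call sites: `group` is assumed to be df.groupby('企业名称', as_index=False), output columns are assumed to be named '<feature>_<agg>', and 'diff_mean' is assumed to mean the average gap between consecutive sorted values within a company.

import pandas as pd

def get_agg(group, feature, aggs):
    # Hypothetical helper: one row per company, suffixed stat columns.
    result = None
    for agg in aggs:
        if agg == 'diff_mean':
            # Assumed meaning: mean gap between consecutive sorted values.
            part = group[feature].agg(lambda s: s.sort_values().diff().mean())
        else:
            part = group[feature].agg(agg)
        part = part.rename(columns={feature: feature + '_' + agg})
        result = part if result is None else pd.merge(result, part, on='企业名称')
    return result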
Example 3
def fe4():
    prefix = '双公示-法人行政许可信息'
    df, train_num = utils.get_df(prefix, ['企业名称', '许可决定日期', '许可截止期'])
    df[prefix + '_许可决定日期'] = df[prefix + '_许可决定日期'].astype('str').apply(
        lambda x: utils.get_date(x) if x != 'nan' else np.nan)
    df[prefix + '_许可截止期'] = df[prefix + '_许可截止期'].astype('str').apply(
        lambda x: utils.get_date(x) if x != 'nan' else np.nan)
    df[prefix +
       '_许可决定日期_许可截止期_diff'] = df[prefix + '_许可决定日期'] - df[prefix + '_许可截止期']
    group = df.groupby('企业名称', as_index=False)
    df = pd.merge(df,
                  utils.get_agg(group, prefix + '_许可决定日期',
                                ['max', 'min', 'mean', 'diff_mean']),
                  on='企业名称',
                  how='left')
    df = pd.merge(df,
                  utils.get_agg(group, prefix + '_许可截止期',
                                ['max', 'min', 'mean']),
                  on='企业名称',
                  how='left')
    df = pd.merge(df,
                  utils.get_agg(group, prefix + '_许可决定日期_许可截止期_diff',
                                ['max', 'min', 'mean']),
                  on='企业名称',
                  how='left')
    for f in ['许可决定日期', '许可截止期', '许可决定日期_许可截止期_diff']:
        del df[prefix + '_' + f]
    train_df, test_df = df[:train_num], df[train_num:]
    train_df.drop_duplicates('企业名称', inplace=True)
    test_df.drop_duplicates('企业名称', inplace=True)
    return prefix, train_df, test_df
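utils.get_date is likewise only visible through its call sites. Because its results are subtracted, averaged, and divided into counts (see fe2 below), it presumably maps a date string to a plain number of days; this is a hedged sketch, with the epoch chosen arbitrarily:

import datetime as dt

def get_date(x):
    # Assumed behaviour: parse 'YYYY/MM/DD'-style strings (also accepting
    # '-' separators) and return the day count since a fixed reference date,
    # so downstream code can treat dates as ordinary numbers.
    parsed = dt.datetime.strptime(x[:10].replace('-', '/'), '%Y/%m/%d')
    return (parsed - dt.datetime(1900, 1, 1)).days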
Example 4
def fe6():
    prefix = '许可资质年检信息'
    df, train_num = utils.get_df(prefix, ['企业名称', '年检结果', '年检事项名称', '年检日期'])
    df[prefix + '_年检结果'] = df[prefix + '_年检结果'].astype('str').apply(
        lambda x: '合格' if '合格' in x else '通过')
    df[prefix + '_年检事项名称'] = df[prefix + '_年检事项名称'].astype('str').apply(
        lambda x: x if '单位年检' == x or '对道路' in x else '其他')
    df = pd.get_dummies(df, prefix=['dummy_' + prefix + '_年检事项名称', 'dummy_' + prefix + '_年检结果'],
                        columns=[prefix + '_年检事项名称', prefix + '_年检结果'])
    del df['dummy_' + prefix + '_年检事项名称_其他']
    df[prefix + '_年检日期'] = df[prefix + '_年检日期'].astype('str').apply(
        lambda x: utils.get_date(x) if x != 'nan' else np.nan)
    raw_features = df.columns.values[1:]
    group = df.groupby('企业名称', as_index=False)
    for f in raw_features:
        if 'dummy' in f:
            df = pd.merge(df, utils.get_agg(group, f, ['sum', 'mean']), on='企业名称', how='left')
    df = pd.merge(df, group['企业名称'].agg({prefix + '_count': 'count'}), on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_年检日期', ['max', 'min', 'mean', 'diff_mean']),
                  on='企业名称', how='left')
    for f in raw_features:
        del df[f]
    train_df, test_df = df[:train_num], df[train_num:]
    train_df.drop_duplicates('企业名称', inplace=True)
    test_df.drop_duplicates('企业名称', inplace=True)
    return prefix, train_df, test_df
Example 5
def figs_rf_abc():
    df = get_df(path=f"{basepath_dfs}/supplement_rf_abc.csv", )
    subfolder = "rf_abc"
    df.loc[df["algorithm"] == "RF-ABC", "algorithm"] = " RF-ABC"
    for tasks in tqdm(all_tasks):
        plot_task_metric(df, tasks, "C2ST", subfolder=subfolder)
        plot_task_metric(df, tasks, "C2ST", subfolder=subfolder, labels=False)
Example 6
def fe2():
    def recruiting_numbers(x):
        if '若干' == x:
            return np.nan
        if '人' in x:
            return x[:-1]
        return x
    prefix = '招聘数据'
    df, train_num = utils.get_df(prefix, ['企业名称', '招聘人数', '招聘日期'])
    df[prefix + '_招聘人数'] = df[prefix + '_招聘人数'].fillna('若干').astype('str').apply(
        recruiting_numbers).astype('float32')
    df[prefix + '_招聘日期'] = df[prefix + '_招聘日期'].apply(utils.get_date)
    group = df.groupby('企业名称', as_index=False)
    df = pd.merge(df, group['企业名称'].agg({prefix + '_count': 'count'}), on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_招聘人数', ['mean', 'sum']),
                  on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_招聘日期', ['max', 'min', 'mean', 'diff_mean']),
                  on='企业名称', how='left')
    df[prefix + '_freq'] = df[prefix + '_count'] / (
            df[prefix + '_招聘日期_max'] - df[prefix + '_招聘日期_min'] + 1)
    for f in ['招聘人数', '招聘日期']:
        del df[prefix + '_' + f]
    train_df, test_df = df[:train_num], df[train_num:]
    train_df.drop_duplicates('企业名称', inplace=True)
    test_df.drop_duplicates('企业名称', inplace=True)
    return prefix, train_df, test_df
Example 7
def train(task, size, data, shards, checkpoint):
    name = '{}/{}-{}/'.format(task, size, shards)
    model = getattr(models, task)[size]
    task = getattr(tasks, task)

    df = utils.get_df(data, shards)
    df = utils.normalize_df(df)
    df = df.sample(frac=1)
    dataset = utils.Data(df, task)
    callbacks = get_callbacks('logs/{}'.format(name))

    model.compile(optimizer='adam',
                  loss=task['outputs'],
                  metrics=task.get('metrics'))
    model.summary()

    model.fit(
        dataset,
        callbacks=callbacks,
        workers=2,
        max_queue_size=10,
        use_multiprocessing=True,
    )

    if checkpoint:
        model.save('checkpoints/{}'.format(name))
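A hypothetical invocation of the training entry point above; the task and size names are placeholders and must match attributes of the project's models and tasks modules, which are not shown here:

# Placeholder arguments; real values depend on the `models`/`tasks` modules
# and on where the sharded data lives.
train(task='classification', size='small', data='data/records', shards=4, checkpoint=True)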
Example 8
def fe9():
    def date_proc(x1, x2):
        if 'nan' == x1 and 'nan' == x2:
            return np.nan
        if x1 != 'nan':
            return utils.get_date(x1)
        return utils.get_date(x2)

    def cert_name(x):
        if '高新技术' in x:
            return '高新技术'
        if '建筑施工' in x:
            return '建筑施工'
        return '其他'

    prefix = '资质登记(变更)信息'
    df, train_num = utils.get_df(prefix,
                                 ['企业名称', '资质名称', '资质生效期', '资质截止期', '认定日期'])
    df[prefix + '_资质名称'] = df[prefix + '_资质名称'].astype('str').apply(cert_name)
    df = pd.get_dummies(df,
                        prefix=['dummy_' + prefix + '_资质名称'],
                        columns=[prefix + '_资质名称'])
    df[prefix + '_资质生效期'] = df[[
        prefix + '_资质生效期', prefix + '_认定日期'
    ]].astype('str').apply(
        lambda x: date_proc(x[prefix + '_资质生效期'], x[prefix + '_认定日期']), axis=1)
    df[prefix +
       '_资质截止期'] = df[prefix +
                      '_资质截止期'].astype('str').apply(lambda x: utils.get_date(
                          x) if x != 'nan' and x[:4] != '1950' else np.nan)
    df[prefix +
       '_资质生效期_资质截止期_diff'] = df[prefix + '_资质生效期'] - df[prefix + '_资质截止期']
    raw_features = df.columns.values[1:]
    group = df.groupby('企业名称', as_index=False)
    for f in raw_features:
        if 'dummy' in f:
            df = pd.merge(df,
                          utils.get_agg(group, f, ['sum', 'mean']),
                          on='企业名称',
                          how='left')
    df = pd.merge(df,
                  utils.get_agg(group, prefix + '_资质生效期',
                                ['max', 'min', 'mean']),
                  on='企业名称',
                  how='left')
    df = pd.merge(df,
                  utils.get_agg(group, prefix + '_资质截止期', ['min']),
                  on='企业名称',
                  how='left')
    df = pd.merge(df,
                  utils.get_agg(group, prefix + '_资质生效期_资质截止期_diff', ['min']),
                  on='企业名称',
                  how='left')
    for f in raw_features:
        del df[f]
    train_df, test_df = df[:train_num], df[train_num:]
    train_df.drop_duplicates('企业名称', inplace=True)
    test_df.drop_duplicates('企业名称', inplace=True)
    return prefix, train_df, test_df
Example 9
def update_data(session, dataset):
    print("Inserting data into table article")
    ospaths = get_paths()
    session = ospaths["datadir"] + session + "/"
    df = get_df(session + dataset, drop_nans=True)
    df.to_sql(name="article",
              con=db.engine,
              index=False,
              if_exists="replace")
Example 10
def fe5():
    prefix = '企业非正常户认定'
    df, train_num = utils.get_df(prefix, ['企业名称', '认定日期'])
    df[prefix] = 1
    df[prefix + '_认定日期'] = df[prefix + '_认定日期'].apply(utils.get_date)
    train_df, test_df = df[:train_num], df[train_num:]
    train_df.drop_duplicates('企业名称', inplace=True)
    test_df.drop_duplicates('企业名称', inplace=True)
    return prefix, train_df, test_df
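Each fe*() function returns (prefix, train_df, test_df) with one row per 企业名称 after deduplication. A plausible driver that merges them onto base train/test frames is sketched below; it is not part of the listing, so the function name and arguments are hypothetical:

import pandas as pd

def build_features(fe_funcs, base_train, base_test):
    # Hypothetical combiner: left-join every per-table feature frame onto the
    # base train/test frames by company name.
    for fe in fe_funcs:
        prefix, train_part, test_part = fe()
        base_train = pd.merge(base_train, train_part, on='企业名称', how='left')
        base_test = pd.merge(base_test, test_part, on='企业名称', how='left')
    return base_train, base_test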
Example 11
def figs_sl():
    df = get_df(path=f"{basepath_dfs}/supplement_sl.csv", )
    subfolder = "sl"
    df.loc[df["algorithm"] == "SL", "algorithm"] = " SL"
    df.loc[df["algorithm"] == "NLE-MAF", "algorithm"] = "NLE"
    df.loc[df["algorithm"] == "SNLE-MAF", "algorithm"] = "SNLE"
    for tasks in tqdm(all_tasks):
        plot_task_metric(df, tasks, "C2ST", subfolder=subfolder)
        plot_task_metric(df, tasks, "C2ST", subfolder=subfolder, labels=False)
Example 12
def figs_main_paper():
    df = get_df(path=f"{basepath_dfs}/main_paper.csv", )
    df.loc[df["algorithm"] == "REJ-ABC", "algorithm"] = " REJ-ABC"
    for tasks in tqdm(all_tasks):
        for metric in all_metrics:
            plot_task_metric(df, tasks, metric, subfolder="main_paper")
            plot_task_metric(df,
                             tasks,
                             metric,
                             subfolder="main_paper",
                             labels=False)
Example 13
def test_ping_h1_h5_and_h2_h5_balance_of_charges():
    utils.get_output("wireshark/s2-eth1-h1-ping-h5.pcap",
                     "test/s2-ping-h1-h5.txt")
    utils.get_output("wireshark/s2-eth1-h2-ping-h5.pcap",
                     "test/s2-ping-h2-h5.txt")
    utils.get_output("wireshark/s3-eth1-h1-ping-h5.pcap",
                     "test/s3-ping-h1-h5.txt")
    utils.get_output("wireshark/s3-eth1-h2-ping-h5.pcap",
                     "test/s3-ping-h2-h5.txt")

    def callback(x):
        return x['dst'] == '10.0.0.5' and x['protocol'] == 'ICMP'

    df_1 = utils.get_df("test/s2-ping-h1-h5.txt")
    df_1.columns = ['src', 'dst', 'protocol']
    df_1 = utils.filter_by(df_1, ['dst', 'protocol'], callback)
    size_1 = len(df_1.index)

    df_2 = utils.get_df("test/s2-ping-h2-h5.txt")
    df_2.columns = ['src', 'dst', 'protocol']
    df_2 = utils.filter_by(df_2, ['dst', 'protocol'], callback)
    size_2 = len(df_2.index)

    df_3 = utils.get_df("test/s3-ping-h1-h5.txt")
    df_3.columns = ['src', 'dst', 'protocol']
    df_3 = utils.filter_by(df_3, ['dst', 'protocol'], callback)
    size_3 = len(df_3.index)

    df_4 = utils.get_df("test/s3-ping-h2-h5.txt")
    df_4.columns = ['src', 'dst', 'protocol']
    df_4 = utils.filter_by(df_4, ['dst', 'protocol'], callback)
    size_4 = len(df_4.index)

    os.system("rm test/s2-ping-h1-h5.txt")
    os.system("rm test/s2-ping-h2-h5.txt")
    os.system("rm test/s3-ping-h1-h5.txt")
    os.system("rm test/s3-ping-h2-h5.txt")

    ecmp_h1_h5 = (size_1 == 0 and size_3 != 0) or (size_1 != 0 and size_3 == 0)
    ecmp_h2_h5 = (size_2 == 0 and size_4 != 0) or (size_2 != 0 and size_4 == 0)
    assert ecmp_h1_h5 and ecmp_h2_h5
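The ECMP tests above rely on utils.filter_by, which is not included in this listing. A minimal sketch consistent with how it is called (select the named columns, keep the rows where the callback returns True):

def filter_by(df, columns, callback):
    # Assumed behaviour: apply `callback` row-wise to the selected columns
    # and keep only the rows for which it returns True.
    mask = df[columns].apply(callback, axis=1)
    return df[mask]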
Example 14
def run(session_name, target_size):
    print(f"Starting preprocessing of images for session {session_name}")
    global SESSION, IMAGE_SESSION
    SESSION = session_name
    IMAGE_SESSION = SESSION + "resized_images/"
    if not os.path.exists(IMAGE_SESSION):
        os.mkdir(IMAGE_SESSION)
        print(f"New directory: {IMAGE_SESSION}")

    f = utils.get_df(SESSION + "id_path.csv")
    f = f.path.values.tolist()
    copy_and_resize_sample_images(f, target_size)
Example 15
def compute_distance(df, sp, feature, normalize):
    print(f"Computing distance similarity for {feature}")
    if not isinstance(df, pd.DataFrame):
        measures = utils.get_df(df)
    else:
        measures = df
    func = calculate_distance
    if normalize:
        func = normalize_and_calculate_distance
    results = for_pivot(measures[feature], measures, func)
    print()
    save_as_pivot(results, sp=sp)
Example 16
def fe1():
    prefix = '企业基本信息&高管信息&投资信息'
    df, train_num = utils.get_df(prefix, ['企业名称', '注册资金', '注册资本(金)币种名称', '企业(机构)类型名称',
                                          '行业门类代码', '成立日期', '核准日期', '住所所在地省份', '姓名',
                                          '法定代表人标志', '首席代表标志', '职务', '投资人', '出资比例'])
    df[prefix + '_出资比例'] = df[prefix + '_出资比例'].apply(lambda x: x if x <= 1 else x / 100)
    df[prefix + '_注册资金'] = df[[prefix + '_注册资金', prefix + '_注册资本(金)币种名称']].apply(
        lambda x: x[prefix + '_注册资金'] if x[prefix + '_注册资本(金)币种名称'] not in utils.exch_rate.keys()
        else x[prefix + '_注册资金'] * utils.exch_rate[x[prefix + '_注册资本(金)币种名称']], axis=1).fillna(0)
    df[prefix + '_注册资金_binning'] = df[prefix + '_注册资金'].apply(
        lambda x: utils.binning(x, [300, 500, 1000, 3000, 6000]))
    df[prefix + '_成立日期'] = df[prefix + '_成立日期'].astype('str').apply(
        lambda x: utils.get_date(x[:10]) if x != 'nan' else np.nan)
    df[prefix + '_核准日期'] = df[prefix + '_核准日期'].apply(lambda x: utils.get_date(x[:10]))
    df[prefix + '_成立日期_核准日期_diff'] = df[prefix + '_成立日期'] - df[prefix + '_核准日期']
    df[prefix + '_法定代表人职务'] = df[[prefix + '_法定代表人标志', prefix + '_职务']].apply(
        lambda x: x[prefix + '_职务'] if x[prefix + '_法定代表人标志'] == '是' else np.nan, axis=1)
    df[prefix + '_首席代表职务'] = df[[prefix + '_首席代表标志', prefix + '_职务']].apply(
        lambda x: x[prefix + '_职务'] if x[prefix + '_首席代表标志'] == '是' else np.nan, axis=1)
    df = pd.merge(df, df.dropna(subset=[prefix + '_姓名']).groupby(
        prefix + '_姓名', as_index=False)['企业名称'].agg({prefix + '_姓名_企业名称_nunique': 'nunique'}),
                  on=prefix + '_姓名', how='left')
    df = pd.merge(df, df.dropna(subset=[prefix + '_投资人']).groupby(
        prefix + '_投资人', as_index=False)['企业名称'].agg({prefix + '_投资人_企业名称_nunique': 'nunique'}),
                  on=prefix + '_投资人', how='left')
    group = df.groupby('企业名称', as_index=False)
    df = pd.merge(df, utils.get_agg(group, prefix + '_姓名', ['nunique']), on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_投资人', ['nunique']), on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_姓名_企业名称_nunique', ['max', 'mean', 'sum']),
                  on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_投资人_企业名称_nunique', ['max', 'mean', 'sum']),
                  on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_出资比例', ['max', 'min', 'mean']), on='企业名称', how='left')
    f_pairs = [
        [prefix + '_住所所在地省份', prefix + '_企业(机构)类型名称'],
        [prefix + '_住所所在地省份', prefix + '_行业门类代码'],
        [prefix + '_注册资金_binning', prefix + '_企业(机构)类型名称'],
        [prefix + '_注册资金_binning', prefix + '_行业门类代码'],
        [prefix + '_企业(机构)类型名称', prefix + '_行业门类代码']
    ]
    df = utils.get_ratio(df, f_pairs)
    for f in ['注册资本(金)币种名称', '姓名', '法定代表人标志', '首席代表标志', '职务', '投资人', '出资比例',
              '姓名_企业名称_nunique', '投资人_企业名称_nunique']:
        del df[prefix + '_' + f]
    train_df, test_df = df[:train_num], df[train_num:]
    train_df.drop_duplicates('企业名称', inplace=True)
    train_df.dropna(subset=[prefix + '_成立日期'], inplace=True)
    test_df.drop_duplicates('企业名称', inplace=True)
    for f in ['法定代表人职务', '首席代表职务', '企业(机构)类型名称', '行业门类代码', '住所所在地省份']:
        label_dict = dict(zip(train_df[prefix + '_' + f].unique(), range(train_df[prefix + '_' + f].nunique())))
        train_df[prefix + '_' + f] = train_df[prefix + '_' + f].map(label_dict).fillna(-1).astype('int16')
        test_df[prefix + '_' + f] = test_df[prefix + '_' + f].map(label_dict).fillna(-1).astype('int16')
    return prefix, train_df, test_df
Example 17
def test_tcp_balance_of_charges():
    utils.get_output("wireshark/s2-eth1-iperf-h1-h4.pcap", "test/s2-h1-h4.txt")
    utils.get_output("wireshark/s3-eth1-iperf-h1-h4.pcap", "test/s3-h1-h4.txt")

    def callback(x):
        return x['dst'] == '10.0.0.4' and x['protocol'] == 'TCP'

    df_1 = utils.get_df("test/s2-h1-h4.txt")
    df_1.columns = ['src', 'dst', 'protocol']
    df_1 = utils.filter_by(df_1, ['dst', 'protocol'], callback)
    size_1 = len(df_1.index)

    df_2 = utils.get_df("test/s3-h1-h4.txt")
    df_2.columns = ['src', 'dst', 'protocol']
    df_2 = utils.filter_by(df_2, ['dst', 'protocol'], callback)
    size_2 = len(df_2.index)

    os.system("rm test/s2-h1-h4.txt")
    os.system("rm test/s3-h1-h4.txt")

    assert (size_1 == 0 and size_2 != 0) or (size_1 != 0 and size_2 == 0)
Example 18
def fe12():
    prefix = '法人行政许可注(撤、吊)销信息'
    df, train_num = utils.get_df(prefix, ['企业名称', '注(撤、吊)销批准日期'])
    df[prefix + '_注(撤、吊)销批准日期'] = df[prefix + '_注(撤、吊)销批准日期'].apply(utils.get_date)
    group = df.groupby('企业名称', as_index=False)
    df = pd.merge(df, group['企业名称'].agg({prefix + '_count': 'count'}), on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_注(撤、吊)销批准日期', ['max']), on='企业名称', how='left')
    for f in ['注(撤、吊)销批准日期']:
        del df[prefix + '_' + f]
    train_df, test_df = df[:train_num], df[train_num:]
    train_df.drop_duplicates('企业名称', inplace=True)
    test_df.drop_duplicates('企业名称', inplace=True)
    return prefix, train_df, test_df
Example 19
def figs_hyperparameters_snre():
    df = get_df(path=f"{basepath_dfs}/supplement_hyperparameters_snre.csv", )
    for tasks in tqdm(all_tasks):
        for metric in ["C2ST", "RT"]:
            plot_task_metric(df,
                             tasks,
                             metric,
                             subfolder="hyperparameters_snre")
            plot_task_metric(df,
                             tasks,
                             metric,
                             subfolder="hyperparameters_snre",
                             labels=False)
Example 20
def copy_images(src, dst):
    """"
        :param src -- path to the current sample CSV
    """
    df = utils.get_df(src, drop_nans=False, dt=True)
    idpath = utils.get_id_path_pairs(df, from_path="drive")
    images = list(idpath.values())
    ids = list(idpath.keys())

    for i in range(len(images)):
        file = images[i]
        ext = file[file.rfind("."):]
        shutil.copyfile(file, dst + ids[i] + ext)
Example 21
def fe11():
    prefix = '双打办打击侵权假冒处罚案件信息'
    df, train_num = utils.get_df(prefix, ['企业名称', '提供日期'])
    df[prefix + '_提供日期'] = df[prefix + '_提供日期'].astype('str').apply(
        lambda x: utils.get_date(x) if x != 'nan' else np.nan)
    group = df.groupby('企业名称', as_index=False)
    df = pd.merge(df, group['企业名称'].agg({prefix + '_count': 'count'}), on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_提供日期', ['max']), on='企业名称', how='left')
    for f in ['提供日期']:
        del df[prefix + '_' + f]
    train_df, test_df = df[:train_num], df[train_num:]
    train_df.drop_duplicates('企业名称', inplace=True)
    test_df.drop_duplicates('企业名称', inplace=True)
    return prefix, train_df, test_df
Example 22
    def raw(self):

        if self._raw is not None:
            return self._raw

        df = get_df(
            self.host,
            self.path,
            self._refresh,
            self._verbose,
            errors='raise',
            parse_dates=['data'],
        )
        self._raw = df  # cache
        return df
Example 23
def test_udp_denial_of_service():
    utils.get_output("wireshark/s5-eth1-iperf-udp-h1-h5.pcap",
                     "test/s5-udp-h1-h5.txt")

    def callback(x):
        return x['dst'] == '10.0.0.5' and x['protocol'] == 'UDP'

    df_1 = utils.get_df("test/s5-udp-h1-h5.txt")
    df_1.columns = ['src', 'dst', 'protocol']
    df_1 = utils.filter_by(df_1, ['dst', 'protocol'], callback)
    size_1 = len(df_1.index)

    assert size_1 < 893

    os.system("rm test/s5-udp-h1-h5.txt")
Example 24
def fe8():
    prefix = '企业税务登记信息'
    df, train_num = utils.get_df(prefix, ['企业名称', '审核时间', '登记注册类型', '审核结果'])
    df[prefix + '_审核时间'] = df[prefix + '_审核时间'].apply(utils.get_date)
    df[prefix + '_审核结果'] = df[prefix + '_审核结果'].apply(
        lambda x: x if '江苏省苏州地方税务局' == x or '开业' == x or '正常' == x else '其他')
    f_pairs = [[prefix + '_登记注册类型', prefix + '_审核结果']]
    df = utils.get_ratio(df, f_pairs)
    train_df, test_df = df[:train_num], df[train_num:]
    train_df.drop_duplicates('企业名称', keep='last', inplace=True)
    test_df.drop_duplicates('企业名称', keep='last', inplace=True)
    for f in [prefix + '_审核结果', prefix + '_登记注册类型']:
        label_dict = dict(zip(train_df[f].unique(), range(train_df[f].nunique())))
        train_df[f] = train_df[f].map(label_dict).fillna(-1).astype('int16')
        test_df[f] = test_df[f].map(label_dict).fillna(-1).astype('int16')
    return prefix, train_df, test_df
Example 25
def create_image_paths(df_path, cat):
    df = utils.get_df(df_path,
                      category=cat,
                      drop_nans=True,
                      drop_duplicates=True)
    file_paths = utils.get_id_path_pairs(df,
                                         from_path="drive",
                                         ignore_types="gif")
    file_paths = list(file_paths.keys())
    df = df[df.id.isin(file_paths)]

    df.to_csv(SESSION + "politics.csv", index=False)
    utils.save_clean_copy(df, SESSION + "politics.csv")
    utils.get_id_path_pairs(df,
                            save_path=SESSION + "id_path_all.csv",
                            from_path="drive")
Example 26
    def _get_data(self, refresh, verbose):
        """Download or fetch data from cache.
        
        Downloads one file per day and unions files that share the same schema.
        Unioning across schemas is handled by `self._consolidate`.

        Returns:
            Dict[str, DataFrame]: one DataFrame per schema version, keyed by
            schema name.
        """
        schemas = {
            '20200122': {
                'dates': pd.date_range('2020-01-22', '2020-02-29'),
                'kwargs': dict(parse_dates=['Last Update']),
            },
            '20200301': {
                'dates': pd.date_range('2020-03-01', '2020-03-21'),
                'kwargs': dict(parse_dates=['Last Update']),
            },
            '20200322': {
                'dates': pd.date_range('2020-03-22', dt.datetime.today()),
                'kwargs': dict(parse_dates=['Last_Update']),
            },
        }

        schemas_dfs = {}

        for schema_name, schema in schemas.items():
            files_dfs = []
            for date in schema['dates']:
                filename = f"{date.strftime('%m-%d-%Y')}.csv"
                df = get_df(
                    self.host,
                    f"{self.path}/{filename}",
                    refresh,
                    verbose,
                    errors='ignore',
                    **schema['kwargs']
                )
                if df is not None:
                    files_dfs.append(df.assign(_filedate=date, _filename=filename))
            # validate schemas
            columns = list(map(lambda df: set(df.columns), files_dfs))
            assert all(x == columns[0] for x in columns), f"schemas differ for {schema_name}"

            schemas_dfs[schema_name] = pd.concat(files_dfs)

        return schemas_dfs
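The docstring above defers cross-schema unioning to self._consolidate, which is not part of this listing. A hedged sketch of what such a step could look like, assuming for illustration that only the update-timestamp column name differs between schemas (the real schemas may differ in more columns):

import pandas as pd

def consolidate(schemas_dfs):
    # Hypothetical consolidation: harmonise the update-timestamp column name
    # and stack the per-schema frames into a single DataFrame.
    frames = [df.rename(columns={'Last_Update': 'Last Update'})
              for df in schemas_dfs.values()]
    return pd.concat(frames, ignore_index=True, sort=False)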
Example 27
    def raw(self):

        if self._raw is not None:
            return self._raw

        df = get_df(
            self.host,
            self.path,
            self._refresh,
            self._verbose,
            errors='raise',
            parse_dates=['dateRep'],
            dayfirst=True,
            keep_default_na=False,  # don't treat Namibia as 'NA'
        )
        self._raw = df  # cache
        return df
Example 28
def figs_hyperparameters_rej_abc():
    df = get_df(
        path=f"{basepath_dfs}/supplement_hyperparameters_rej_abc.csv", )
    for tasks in tqdm(all_tasks):
        for metric in ["C2ST"]:
            plot_task_metric(
                df,
                tasks,
                metric,
                subfolder="hyperparameters_rej_abc",
                default_color=get_colors(df=df, include_defaults=True)["REJ"],
            )
            plot_task_metric(
                df,
                tasks,
                metric,
                subfolder="hyperparameters_rej_abc",
                labels=False,
                default_color=get_colors(df=df, include_defaults=True)["REJ"],
            )
Example 29
def figs_abc_lra_sass():
    df = get_df(path=f"{basepath_dfs}/supplement_abc_lra_sass.csv", )

    for tasks in tqdm(all_tasks):
        for metric in ["C2ST"]:
            plot_task_metric(
                df,
                tasks,
                metric,
                subfolder="abc_lra_sass",
                default_color=get_colors(df=df),
            )
            plot_task_metric(
                df,
                tasks,
                metric,
                subfolder="abc_lra_sass",
                labels=False,
                default_color=get_colors(df=df),
            )
Example 30
def get_data_loaders(bs=8, num_workers=0, shuffle=True, ts=0.2):
    train_df, img_2_ohe_vector = get_df()
    train_imgs, val_imgs = train_test_split(
        train_df['Image'].values,
        test_size=ts,
        stratify=train_df['Class'].map(lambda x: str(sorted(list(x)))),
        random_state=42)
    print(train_imgs)
    print(val_imgs)
    print(len(train_imgs))
    print(len(val_imgs))
    train_dataset = CloudDataset(img_2_ohe_vector, img_ids=train_imgs,
                                 transforms=get_training_augmentation())
    train_loader = DataLoader(train_dataset, batch_size=bs,
                              shuffle=shuffle, num_workers=num_workers)

    val_dataset = CloudDataset(img_2_ohe_vector, img_ids=val_imgs,
                               transforms=get_validation_augmentation())
    val_loader = DataLoader(val_dataset, batch_size=bs,
                            shuffle=shuffle, num_workers=num_workers)

    return train_loader, val_loader
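A minimal usage sketch for the loaders above; the per-batch structure depends on CloudDataset, which is not shown, so the loop body stays generic:

train_loader, val_loader = get_data_loaders(bs=16, num_workers=2)
for batch in train_loader:
    pass  # hand each batch to the model's training step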
Example 31
def setUp(self):
    df = get_df()
    self.check = Check(df)
    self.jvmCheck = self.check.jvmCheck
Example 32
    
    #perform the fit
    res = optimize.minimize(func_to_optimize, x0, args=(distance_array, time_array, fare_array), method='TNC', bounds=bnds)
    grid_dist = utils.grid_distance(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)

    #get the predictions
    time_pred = utils.time_prediction(speed_array.mean(), grid_dist, dist_sf_array.mean())
    fare_pred = utils.fare_prediction(res.x[0], grid_dist, dist_sf_array.mean(), res.x[1], res.x[2], res.x[3])
    if res.success == True:
        return [fare_pred, time_pred]
    else:
        return [0,0]


#read in dataframe and cache it
df1 = utils.get_df(sqlContext, 1, 1)
df1.cache()

#get random sample for prediction
test_sample=df1.sample(False, 0.1, seed=42).limit(500).toPandas()
test_sample.columns = df1.columns
test_fare = test_sample["total_notip"].tolist()
test_time = test_sample["trip_time_in_secs"].tolist()
pred_fare = []
pred_time = []

#get prediction for each event
for index, row in test_sample.iterrows():
    print('Processing event ' + str(index))
    event = [row['pickup'],utils.dayofweek(row['pickup']),row['pick_lat'],row['pick_lon'],row['drop_lat'],row['drop_lon'],row['pc']]
    prediction = make_prediction(event, df1)