Example #1
0
def main():
    """Run the OKW -> Somatom migration script.

    Opens both data contexts with the configured command timeout and
    delegates the actual work to ``Script.procesa()``.
    """
    # BUGFIX: converted Python 2 ``print`` statements to ``print()`` calls —
    # the originals are syntax errors under Python 3, which the rest of
    # this file targets.
    print('Cargando... rate = %r...' % get_ratio())
    with DAOokwDataContext(_CS_OKW) as okw:
        okw.CommandTimeout = _DEFAULT_COMMAND_TIMEOUT
        # NOTE(review): the Somatom context is also opened with _CS_OKW —
        # confirm it should not use a Somatom-specific connection string.
        with DAOSomatomDataContext(_CS_OKW) as somatom:
            somatom.CommandTimeout = _DEFAULT_COMMAND_TIMEOUT

            script = Script(okw, somatom, get_ratio())
            script.procesa()
    print('listo...')
Example #2
0
def fe1():
    """Feature engineering for the 企业基本信息&高管信息&投资信息 tables.

    Builds capital/date/person features, per-entity aggregates and
    category-pair ratio features, then label-encodes the categorical
    columns using the train-set vocabulary only.

    Returns:
        (prefix, train_df, test_df): the column prefix used by this
        table plus the de-duplicated train and test feature frames.
    """
    prefix = '企业基本信息&高管信息&投资信息'
    df, train_num = utils.get_df(prefix, ['企业名称', '注册资金', '注册资本(金)币种名称', '企业(机构)类型名称',
                                          '行业门类代码', '成立日期', '核准日期', '住所所在地省份', '姓名',
                                          '法定代表人标志', '首席代表标志', '职务', '投资人', '出资比例'])
    # Normalise the investment ratio: values > 1 are assumed to be percentages.
    df[prefix + '_出资比例'] = df[prefix + '_出资比例'].apply(lambda x: x if x <= 1 else x / 100)
    # Convert registered capital to a common currency via utils.exch_rate.
    df[prefix + '_注册资金'] = df[[prefix + '_注册资金', prefix + '_注册资本(金)币种名称']].apply(
        lambda x: x[prefix + '_注册资金'] if x[prefix + '_注册资本(金)币种名称'] not in utils.exch_rate.keys()
        else x[prefix + '_注册资金'] * utils.exch_rate[x[prefix + '_注册资本(金)币种名称']], axis=1).fillna(0)
    df[prefix + '_注册资金_binning'] = df[prefix + '_注册资金'].apply(
        lambda x: utils.binning(x, [300, 500, 1000, 3000, 6000]))
    # Parse dates; 成立日期 may be missing ('nan' after astype('str')).
    df[prefix + '_成立日期'] = df[prefix + '_成立日期'].astype('str').apply(
        lambda x: utils.get_date(x[:10]) if x != 'nan' else np.nan)
    df[prefix + '_核准日期'] = df[prefix + '_核准日期'].apply(lambda x: utils.get_date(x[:10]))
    df[prefix + '_成立日期_核准日期_diff'] = df[prefix + '_成立日期'] - df[prefix + '_核准日期']
    # Job title of the legal / chief representative (NaN otherwise).
    df[prefix + '_法定代表人职务'] = df[[prefix + '_法定代表人标志', prefix + '_职务']].apply(
        lambda x: x[prefix + '_职务'] if x[prefix + '_法定代表人标志'] == '是' else np.nan, axis=1)
    df[prefix + '_首席代表职务'] = df[[prefix + '_首席代表标志', prefix + '_职务']].apply(
        lambda x: x[prefix + '_职务'] if x[prefix + '_首席代表标志'] == '是' else np.nan, axis=1)
    # Per-person / per-investor counts of distinct companies.
    df = pd.merge(df, df.dropna(subset=[prefix + '_姓名']).groupby(
        prefix + '_姓名', as_index=False)['企业名称'].agg({prefix + '_姓名_企业名称_nunique': 'nunique'}),
                  on=prefix + '_姓名', how='left')
    df = pd.merge(df, df.dropna(subset=[prefix + '_投资人']).groupby(
        prefix + '_投资人', as_index=False)['企业名称'].agg({prefix + '_投资人_企业名称_nunique': 'nunique'}),
                  on=prefix + '_投资人', how='left')
    # Per-company aggregates.
    group = df.groupby('企业名称', as_index=False)
    df = pd.merge(df, utils.get_agg(group, prefix + '_姓名', ['nunique']), on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_投资人', ['nunique']), on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_姓名_企业名称_nunique', ['max', 'mean', 'sum']),
                  on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_投资人_企业名称_nunique', ['max', 'mean', 'sum']),
                  on='企业名称', how='left')
    df = pd.merge(df, utils.get_agg(group, prefix + '_出资比例', ['max', 'min', 'mean']), on='企业名称', how='left')
    f_pairs = [
        [prefix + '_住所所在地省份', prefix + '_企业(机构)类型名称'],
        [prefix + '_住所所在地省份', prefix + '_行业门类代码'],
        [prefix + '_注册资金_binning', prefix + '_企业(机构)类型名称'],
        [prefix + '_注册资金_binning', prefix + '_行业门类代码'],
        [prefix + '_企业(机构)类型名称', prefix + '_行业门类代码']
    ]
    df = utils.get_ratio(df, f_pairs)
    # Drop intermediate columns that should not reach the model.
    for f in ['注册资本(金)币种名称', '姓名', '法定代表人标志', '首席代表标志', '职务', '投资人', '出资比例',
              '姓名_企业名称_nunique', '投资人_企业名称_nunique']:
        del df[prefix + '_' + f]
    # BUGFIX: take explicit copies — the original sliced views of `df`,
    # so the in-place drop/dropna/column assignments below raised
    # SettingWithCopyWarning and could silently fail to stick.
    train_df, test_df = df[:train_num].copy(), df[train_num:].copy()
    train_df.drop_duplicates('企业名称', inplace=True)
    train_df.dropna(subset=[prefix + '_成立日期'], inplace=True)
    test_df.drop_duplicates('企业名称', inplace=True)
    # Label-encode with a vocabulary built from train only; unseen test
    # categories map to -1.
    for f in ['法定代表人职务', '首席代表职务', '企业(机构)类型名称', '行业门类代码', '住所所在地省份']:
        label_dict = dict(zip(train_df[prefix + '_' + f].unique(), range(train_df[prefix + '_' + f].nunique())))
        train_df[prefix + '_' + f] = train_df[prefix + '_' + f].map(label_dict).fillna(-1).astype('int16')
        test_df[prefix + '_' + f] = test_df[prefix + '_' + f].map(label_dict).fillna(-1).astype('int16')
    return prefix, train_df, test_df
Example #3
0
def fe8():
    """Feature engineering for the 企业税务登记信息 table.

    Parses the audit date, collapses rare audit results into '其他',
    adds a ratio feature for (登记注册类型, 审核结果), and label-encodes
    both categoricals with the train-set vocabulary.

    Returns:
        (prefix, train_df, test_df)
    """
    prefix = '企业税务登记信息'
    df, train_num = utils.get_df(prefix, ['企业名称', '审核时间', '登记注册类型', '审核结果'])
    df[prefix + '_审核时间'] = df[prefix + '_审核时间'].apply(utils.get_date)
    # Keep only the three frequent audit outcomes; bucket the rest.
    df[prefix + '_审核结果'] = df[prefix + '_审核结果'].apply(
        lambda x: x if '江苏省苏州地方税务局' == x or '开业' == x or '正常' == x else '其他')
    f_pairs = [[prefix + '_登记注册类型', prefix + '_审核结果']]
    df = utils.get_ratio(df, f_pairs)
    # BUGFIX: take explicit copies — the original sliced views of `df`,
    # so the in-place operations below raised SettingWithCopyWarning and
    # could silently fail to stick.
    train_df, test_df = df[:train_num].copy(), df[train_num:].copy()
    train_df.drop_duplicates('企业名称', keep='last', inplace=True)
    test_df.drop_duplicates('企业名称', keep='last', inplace=True)
    # Label-encode using train vocabulary only; unseen values become -1.
    for f in [prefix + '_审核结果', prefix + '_登记注册类型']:
        label_dict = dict(zip(train_df[f].unique(), range(train_df[f].nunique())))
        train_df[f] = train_df[f].map(label_dict).fillna(-1).astype('int16')
        test_df[f] = test_df[f].map(label_dict).fillna(-1).astype('int16')
    return prefix, train_df, test_df
Example #4
0
def fe():
    """Assemble the full train/test feature matrices.

    Loads both label files, runs the twelve per-table feature builders
    (fe1..fe12), left-merges everything on 企业名称, derives one
    cross-table date-diff feature and one cross-table ratio feature,
    and splits back into train/test.

    Returns:
        (train_df, train_names, labels2, test_df, test_names)
    """
    # --- labels ------------------------------------------------------
    label_df11 = pd.read_csv(
        open('data/round1/train/失信被执行人名单.csv', encoding='utf-8'))
    label_df12 = pd.read_csv(
        open('data/round2/train/失信被执行人名单.csv', encoding='utf-8'))
    label_df1 = pd.concat([label_df11, label_df12], axis=0, ignore_index=True)
    label_df1['label1'] = 1
    label_df21 = pd.read_csv(
        open('data/round1/train/双公示-法人行政处罚信息.csv', encoding='utf-8'))
    label_df22 = pd.read_csv(
        open('data/round2/train/双公示-法人行政处罚信息.csv', encoding='utf-8'))
    # The round-2 file carries extra columns; keep only the key column
    # so the concat aligns with round 1.
    label_df22.drop(label_df22.columns.values[1:], axis=1, inplace=True)
    label_df2 = pd.concat([label_df21, label_df22], axis=0, ignore_index=True)
    label_df2['label2'] = 1

    # --- per-table feature builders ----------------------------------
    # Replaces twelve copy-pasted call blocks with one loop; call order
    # and printed progress markers are unchanged.
    prefixes, train_parts, test_parts = [], [], []
    for i, fe_func in enumerate((fe1, fe2, fe3, fe4, fe5, fe6, fe7, fe8,
                                 fe9, fe10, fe11, fe12), start=1):
        print('fe%d' % i)
        p, train_part, test_part = fe_func()
        prefixes.append(p)
        train_parts.append(train_part)
        test_parts.append(test_part)
    prefix1, prefix3, prefix5 = prefixes[0], prefixes[2], prefixes[4]

    # --- merge test features -----------------------------------------
    test_df = test_parts[0]
    for part in test_parts[1:]:
        test_df = pd.merge(test_df, part, on='企业名称', how='left')
    test_df[prefix5] = test_df[prefix5].fillna(0)

    # --- merge train features and labels -----------------------------
    df = train_parts[0]
    for part in train_parts[1:]:
        df = pd.merge(df, part, on='企业名称', how='left')
    df = pd.merge(df, label_df1, on='企业名称', how='left')
    df = pd.merge(df, label_df2, on='企业名称', how='left')
    df[prefix5] = df[prefix5].fillna(0)
    df['label1'] = df['label1'].fillna(0)
    df['label2'] = df['label2'].fillna(0)
    labels2 = df['label2'].values

    # Cross-table date difference: founding date vs. licence issue date.
    df[prefix1 + '_成立日期_' + prefix3 + '_发证日期_diff'] = (
        df[prefix1 + '_成立日期'] - df[prefix3 + '_发证日期'])
    test_df[prefix1 + '_成立日期_' + prefix3 + '_发证日期_diff'] = (
        test_df[prefix1 + '_成立日期'] - test_df[prefix3 + '_发证日期'])
    # Ratio feature is computed on train+test together, then re-split.
    train_num = df.shape[0]
    df = pd.concat([df, test_df], axis=0, ignore_index=True)
    f_pairs = [[prefix1 + '_住所所在地省份', prefix3 + '_行政区划']]
    df = utils.get_ratio(df, f_pairs)
    del df['label1'], df['label2']
    # BUGFIX: explicit copies — the original sliced views, so the `del`
    # below raised SettingWithCopyWarning and could silently fail.
    test_df = df[train_num:].copy()
    df = df[:train_num].copy()
    train_names = df['企业名称']
    test_names = test_df['企业名称']
    del df['企业名称'], test_df['企业名称']
    return df, train_names, labels2, test_df, test_names
Example #5
0
def fe3():
    """Feature engineering for the 机构设立(变更)登记信息 table.

    Cleans the business-scope text, maps it to TF-IDF key words,
    normalises capital and dates, builds category-pair ratio features,
    and label-encodes the categoricals with the train-set vocabulary.

    Returns:
        (prefix, train_df, test_df)
    """
    def business_scope(x):
        """Normalise one business-scope string."""
        # BUGFIX: the original pattern '(.*?)' was an unescaped regex
        # group that only ever matched the empty string, so nothing was
        # removed; the parentheses are now escaped so parenthesized
        # annotations are actually stripped.
        x = re.sub(r'\(.*?\)', '', x)
        # Drop a trailing full stop (guard added for empty strings).
        if x and '。' == x[-1]:
            x = x[:-1]
        # BUGFIX: str.replace returns a new string — the original calls
        # discarded the result, making all three branches no-ops.
        if '。' in x:
            if ';' in x:
                x = x.replace('。', ';')
            elif '、' in x:
                x = x.replace('。', '、')
            else:
                x = x.replace('。', ' ')
        return x

    def business_scope_to_key_words(x, key_words):
        """Return the first key word contained in x, else '其他'."""
        for w in key_words:
            if w in x:
                return w
        return '其他'

    prefix = '机构设立(变更)登记信息'
    df, train_num = utils.get_df(prefix, [
        '企业名称', '注册(开办)资金', '实收资金', '经营范围', '企业类型代码', '所属行业代码', '机构地址(住所)',
        '发证日期', '行政区划'
    ])

    df[prefix + '_经营范围'] = df[prefix +
                              '_经营范围'].astype('str').apply(business_scope)
    # Tokenise each scope string on its dominant delimiter.
    sentences = df[prefix + '_经营范围'].values
    docs = []
    for s in sentences:
        if ';' in s:
            docs.append(' '.join(s.split(';')))
        elif '、' in s:
            docs.append(' '.join(s.split('、')))
        else:
            docs.append(' '.join(s.split(' ')))
    v = TfidfVectorizer(max_df=0.7, min_df=323)
    v.fit_transform(docs)
    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2
    # in favour of get_feature_names_out(); update when the environment's
    # sklearn version is confirmed.
    key_words = v.get_feature_names()
    df[prefix + '_经营范围'] = df[prefix + '_经营范围'].apply(
        lambda x: business_scope_to_key_words(x, key_words))

    # Convert registered capital to a common currency via utils.exch_rate.
    df[prefix + '_注册(开办)资金'] = df[[
        prefix + '_注册(开办)资金', prefix + '_实收资金'
    ]].apply(lambda x: x[prefix + '_注册(开办)资金']
             if x[prefix + '_实收资金'] not in utils.exch_rate.keys() else x[
                 prefix + '_注册(开办)资金'] * utils.exch_rate[x[prefix + '_实收资金']],
             axis=1).fillna(0)
    df[prefix + '_注册(开办)资金_binning'] = df[prefix + '_注册(开办)资金'].apply(
        lambda x: utils.binning(x, [300, 500, 1000, 3000, 6000]))
    df[prefix + '_发证日期'] = df[prefix + '_发证日期'].apply(lambda x: utils.get_date(
        x) if '.' not in x else utils.get_date(x[:10]))
    # Keep only the province part of the address.
    df[prefix + '_机构地址(住所)'] = df[prefix + '_机构地址(住所)'].apply(lambda x: x[:2])
    # '999999' is a missing-value sentinel in 行政区划.
    df[prefix + '_行政区划'] = df[prefix + '_行政区划'].astype('str').apply(
        lambda x: x if x != '999999' else 'nan').astype('float32')
    f_pairs = [[prefix + '_行政区划', prefix + '_企业类型代码'],
               [prefix + '_行政区划', prefix + '_所属行业代码'],
               [prefix + '_经营范围', prefix + '_企业类型代码'],
               [prefix + '_经营范围', prefix + '_所属行业代码'],
               [prefix + '_经营范围', prefix + '_注册(开办)资金_binning'],
               [prefix + '_机构地址(住所)', prefix + '_企业类型代码'],
               [prefix + '_机构地址(住所)', prefix + '_所属行业代码'],
               [prefix + '_机构地址(住所)', prefix + '_行政区划'],
               [prefix + '_机构地址(住所)', prefix + '_注册(开办)资金_binning']]
    df = utils.get_ratio(df, f_pairs)
    del df[prefix + '_注册(开办)资金'], df[prefix + '_注册(开办)资金_binning'], df[prefix +
                                                                       '_实收资金']
    # BUGFIX: take explicit copies — the original sliced views of `df`,
    # so the in-place operations below raised SettingWithCopyWarning and
    # could silently fail to stick.
    train_df, test_df = df[:train_num].copy(), df[train_num:].copy()
    train_df.drop_duplicates('企业名称', keep='last', inplace=True)
    test_df.drop_duplicates('企业名称', keep='last', inplace=True)
    # Label-encode with a vocabulary built from train only.
    for f in ['企业类型代码', '所属行业代码', '行政区划', '经营范围', '机构地址(住所)']:
        label_dict = dict(
            zip(train_df[prefix + '_' + f].unique(),
                range(train_df[prefix + '_' + f].nunique())))
        train_df[prefix + '_' +
                 f] = train_df[prefix + '_' +
                               f].map(label_dict).fillna(-1).astype('int16')
        test_df[prefix + '_' +
                f] = test_df[prefix + '_' +
                             f].map(label_dict).fillna(-1).astype('int16')
    return prefix, train_df, test_df
    def process(self):
        """Engineer features on ``self.df_all`` in place.

        Fills missing values, derives IP-scope/zone features, feature-pair
        concatenations, time features, score sums and count ratios, then
        label-encodes every column in ``self.le_columns`` and one-hot
        encodes every column in ``self.oe_columns``.

        Mutates ``self.df_all`` and appends to ``self.le_columns``;
        returns nothing.  Assumes ``self.nanum_columns``,
        ``self.nastr_columns``, ``self.le_columns`` and
        ``self.oe_columns`` were populated by the constructor.
        """
        # fill na
        # Numeric columns get a -1 sentinel; string columns an empty string.
        for column in self.nanum_columns:
            print("Fill NA {}".format(column))
            self.df_all[column].fillna(-1, inplace=True)

        for column in self.nastr_columns:
            print("Fill NA {}".format(column))
            self.df_all[column].fillna("", inplace=True)

        # new features
        # Collapse the dominant src/dst IP categories into coarser "scope"
        # buckets (exact bucketing lives in utils.get_ip_scope).
        self.df_all["dstipscope_dominate"] = self.df_all.apply(
            lambda row: utils.get_ip_scope(row["dstipcategory_dominate"]),
            axis=1)
        self.df_all["srcipscope_dominate"] = self.df_all.apply(
            lambda row: utils.get_ip_scope(row["srcipcategory_dominate"]),
            axis=1)

        # ip zone features
        # One feature per IP octet position (1..4), extracted by
        # utils.get_ip_zone.
        self.df_all["ip_zone_1"] = self.df_all.apply(
            lambda row: utils.get_ip_zone(row["ip"], 1), axis=1)
        self.df_all["ip_zone_2"] = self.df_all.apply(
            lambda row: utils.get_ip_zone(row["ip"], 2), axis=1)
        self.df_all["ip_zone_3"] = self.df_all.apply(
            lambda row: utils.get_ip_zone(row["ip"], 3), axis=1)
        self.df_all["ip_zone_4"] = self.df_all.apply(
            lambda row: utils.get_ip_zone(row["ip"], 4), axis=1)

        # concatenation features
        # Composite zone keys built from adjacent zone columns; all are
        # registered below for label encoding.
        self.df_all["ip_zone_12"] = self.df_all.apply(
            lambda row: utils.concatenate_values(
                [row["ip_zone_1"], row["ip_zone_2"]]),
            axis=1)
        self.df_all["ip_zone_123"] = self.df_all.apply(
            lambda row: utils.concatenate_values(
                [row["ip_zone_1"], row["ip_zone_2"], row["ip_zone_3"]]),
            axis=1)
        self.df_all["ip_zone_34"] = self.df_all.apply(
            lambda row: utils.concatenate_values(
                [row["ip_zone_3"], row["ip_zone_4"]]),
            axis=1)
        self.df_all["ip_zone_234"] = self.df_all.apply(
            lambda row: utils.concatenate_values(
                [row["ip_zone_2"], row["ip_zone_3"], row["ip_zone_4"]]),
            axis=1)
        self.le_columns.append("ip_zone_12")
        self.le_columns.append("ip_zone_123")
        self.le_columns.append("ip_zone_34")
        self.le_columns.append("ip_zone_234")

        # Pairwise feature crosses; each new column is named "f1_f2" and
        # queued for label encoding.
        feature_pairs = [("categoryname", "ipcategory_scope"), \
                         ("categoryname", "overallseverity"), \
                         ("srcipscope_dominate", "dstipscope_dominate")]

        # The lambda closes over f1/f2, but apply() runs before the next
        # iteration rebinds them, so the usual late-binding pitfall does
        # not occur here.
        for item in feature_pairs:
            f1 = item[0]
            f2 = item[1]
            fn = f1 + "_" + f2
            self.df_all[fn] = self.df_all.apply(
                lambda row: utils.concatenate_values([row[f1], row[f2]]),
                axis=1)
            self.le_columns.append(fn)

        # timestamp_dist in hour and minute
        self.df_all["timestamp_hour"] = self.df_all.apply(
            lambda row: utils.get_duration(row["timestamp_dist"]), axis=1)

        # ending time features
        # End time = start (h, m, s) + timestamp_dist, projected to the
        # requested component by utils.get_end_time.
        self.df_all["end_hour"] = self.df_all.apply(
            lambda row: utils.get_end_time(row["start_hour"], row[
                "start_minute"], row["start_second"], row["timestamp_dist"],
                                           "hour"),
            axis=1)
        self.df_all["end_minute"] = self.df_all.apply(
            lambda row: utils.get_end_time(row["start_hour"], row[
                "start_minute"], row["start_second"], row["timestamp_dist"],
                                           "minute"),
            axis=1)
        self.df_all["end_second"] = self.df_all.apply(
            lambda row: utils.get_end_time(row["start_hour"], row[
                "start_minute"], row["start_second"], row["timestamp_dist"],
                                           "second"),
            axis=1)

        # sum score features
        self.df_all["sum_score"] = self.df_all.apply(
            lambda row: utils.get_sum([
                row["{}score".format(score)]
                for score in ["untrust", "flow", "trust", "enforcement"]
            ]),
            axis=1)
        # Sum of the n1..n10 counter columns.
        self.df_all["sum_n"] = self.df_all.apply(lambda row: utils.get_sum(
            [row["n{}".format(i)] for i in range(1, 11)]),
                                                 axis=1)
        # Sums over the monthly/weekly/daily p5* and p8* columns.
        self.df_all["sum_p5"] = self.df_all.apply(lambda row: utils.get_sum(
            [row["p5{}".format(p5)] for p5 in ["m", "w", "d"]]),
                                                  axis=1)
        self.df_all["sum_p8"] = self.df_all.apply(lambda row: utils.get_sum(
            [row["p8{}".format(p8)] for p8 in ["m", "w", "d"]]),
                                                   axis=1)
        #self.df_all["sum_p58"] = self.df_all.apply(lambda row: utils.get_sum([row["sum_p5"], row["sum_p8"]]), axis = 1)

        # get ratio features
        # self.df_all["thrcnt_month_week"] = self.df_all.apply(lambda row: utils.get_ratio(row["thrcnt_month"], row["thrcnt_week"]), axis = 1)
        self.df_all["thrcnt_month_day"] = self.df_all.apply(
            lambda row: utils.get_ratio(row["thrcnt_month"], row["thrcnt_day"]
                                        ),
            axis=1)
        self.df_all["thrcnt_week_day"] = self.df_all.apply(
            lambda row: utils.get_ratio(row["thrcnt_week"], row["thrcnt_day"]),
            axis=1)

        # encode features with label encoder
        # NOTE(review): the encoder is re-fit per column on the FULL frame
        # (train and test together, if df_all holds both) — confirm that
        # is intended.
        label_encoder = LabelEncoder()
        for column in self.le_columns:
            print("Label encoding {}".format(column))
            label_encoder.fit(self.df_all[column])
            self.df_all[column] = label_encoder.transform(self.df_all[column])

        # encode features with one-hot encoder
        # Dummy columns are renamed "<column>_<value>" (lower-cased,
        # whitespace joined with underscores); the source column is dropped.
        for column in self.oe_columns:
            print("One-hot encoding {}".format(column))
            pd_encoded = pd.get_dummies(self.df_all[column])
            pd_encoded.columns = [
                "{}_{}".format(column, "_".join(str(col).lower().split()))
                for col in pd_encoded.columns
            ]
            self.df_all.drop(column, axis=1, inplace=True)
            self.df_all = pd.concat([self.df_all, pd_encoded], axis=1)