Example #1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--rm', action='store_true')
    parser.add_argument('--out-dir')
    args = parser.parse_args()
    
    # Build editor schema.
    the_editor = aprinter_config_editor.editor()
    editor_schema = the_editor.json_schema()
    
    # Determine directories.
    src_dir = file_utils.file_dir(__file__)
    libs_dir = os.path.join(src_dir, 'libs')
    jsoneditor_dir = os.path.join(src_dir, '..', 'json-editor')
    if args.out_dir is not None:
        dist_dir = args.out_dir
    else:
        dist_dir = os.path.join(src_dir, 'dist')
    
    # Remove dist dir.
    if args.rm and os.path.isdir(dist_dir):
        shutil.rmtree(dist_dir)
    
    # The temp dir is where we will prepare contents before hashification.
    temp_dir = os.path.join(dist_dir, 'TEMP')

    # Create directories.
    os.mkdir(dist_dir)
    os.mkdir(temp_dir)
    
    # Copy JS libraries.
    shutil.copyfile(os.path.join(jsoneditor_dir, 'dist', 'jsoneditor.min.js'), os.path.join(temp_dir, 'jsoneditor.js'))
    shutil.copyfile(os.path.join(libs_dir, 'FileSaver.min.js'), os.path.join(temp_dir, 'FileSaver.js'))
    shutil.copyfile(os.path.join(libs_dir, 'jquery-1.11.2.min.js'), os.path.join(temp_dir, 'jquery.js'))
    
    # Copy Bootstrap.
    subprocess.call(['unzip', '-q', os.path.join(libs_dir, 'bootstrap-3.3.2-dist.zip'), '-d', temp_dir])
    os.rename(os.path.join(temp_dir, 'bootstrap-3.3.2-dist'), os.path.join(temp_dir, 'bootstrap'))

    # Copy files.
    for filename in ['index.html', 'Ajax-loader.gif']:
        shutil.copyfile(os.path.join(src_dir, filename), os.path.join(temp_dir, filename))
    
    # Read default configuration.
    default_config = json.loads(file_utils.read_file(os.path.join(src_dir, 'default_config.json')))
    
    # Build and write init.js.
    init_js_template = file_utils.read_file(os.path.join(src_dir, 'init.js'))
    init_js = rich_template.RichTemplate(init_js_template).substitute({
        'SCHEMA': json.dumps(editor_schema, separators=(',',':'), sort_keys=True),
        'DEFAULT': json.dumps(default_config, separators=(',',':'), sort_keys=True)
    })
    file_utils.write_file(os.path.join(temp_dir, 'init.js'), init_js)

    # Run hashify to produce the final contents.
    resource_hashifier.hashify(temp_dir, _HASHIFY_CONFIG, dist_dir)

    # Remove the temp dir.
    shutil.rmtree(temp_dir)
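
The script above relies on a small project module, file_utils, for file_dir, read_file and write_file. That module is not part of this example; a minimal sketch that matches how the script calls it (the bodies below are assumptions, only the call signatures are taken from the example) could be:

import os

def file_dir(path):
    # Directory containing the given file (used above to find the source dir).
    return os.path.dirname(os.path.abspath(path))

def read_file(path):
    # Read the whole file; binary mode so the same helper works for text and archives.
    with open(path, 'rb') as f:
        return f.read()

def write_file(path, data):
    # Write data to path, overwriting any existing file.
    with open(path, 'wb') as f:
        f.write(data)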
Example #2
 def test_output(self):
     """Check if we are still getting the same good dep tree output."""
     test_cmd = self._get_cmd(self.TEST_RULE)
     proc = _run_cmd(test_cmd)
     temp_file = fu.get_temp_file(suffix='dep_tree_output')
     fu.write_file(temp_file, proc.stdout.read())
     assert proc.returncode == 0
     assert _compare_files(self.TEST_RULE_EXPECTED_OUT_FILE, temp_file)
def _write_to_output(files, file_entries, root_files, out_dir_path):
    for file in files:
        file_entry = file_entries[file]

        rel_dst_path = file if file in root_files else \
            os.path.join(os.path.dirname(file), file_entry['new_name'])
        
        dst_path = os.path.join(out_dir_path, rel_dst_path)
        
        dir_path = os.path.dirname(dst_path)
        if not os.path.isdir(dir_path):
            os.makedirs(dir_path)
        
        file_utils.write_file(dst_path, file_entry['content'])
Example #4
def _compare_files(expected, generated, sort=False):
    """Compares file contents of generated file with expected file."""
    def _sort_lines(data):
        """Sort all lines of data using newline as delimiter."""
        return '\n'.join(sorted(data.split('\n')))

    expected_data = fu.read_file(expected)
    generated_data = _normalize(fu.read_file(generated))
    if sort:
        expected_data = _sort_lines(expected_data)
        generated_data = _sort_lines(generated_data)

    matches = _sort_lines(expected_data) == _sort_lines(generated_data)
    if not matches and UPDATE_TEST_DATA:
        fu.write_file(expected, generated_data)
    if not matches and not UPDATE_TEST_DATA:
        print 'Files {} and {} differ!'.format(expected, generated)
        print '{}\nExpected output: {}'.format('~' * 80, expected_data)
        print '{}\nGenerated output: {}'.format('~' * 80, generated_data)
    return matches
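
_compare_files also calls a _normalize helper that is not shown in this example, so its exact behavior is unknown. A plausible minimal version, assuming it only has to make line endings and trailing whitespace comparable, might be:

def _normalize(data):
    # Assumption: unify line endings and strip trailing whitespace per line,
    # so cosmetic differences do not make the comparison fail.
    lines = data.replace('\r\n', '\n').split('\n')
    return '\n'.join(line.rstrip() for line in lines)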
Example #5
def check_submitq_test(expected_tests, changed_files):
    """Check if submit queue outputs expected tests for given changed files."""
    LOGGER.debug('Running SUBMITQ tests on: [%s].', ', '.join(changed_files))
    try:
        # In debug mode, the tests are only listed. Otherwise the tests are
        # actually executed.
        os.environ['SUBMITQ_DEBUG_MODE'] = 'true'
        input_file = fu.get_temp_file()
        output_file = fu.get_temp_file()
        input_data = [os.path.join(BUILD_ROOT, f) for f in changed_files]
        fu.write_file(input_file, '\n'.join(input_data))
        with open(output_file, 'w') as stdout_obj:
            subprocess.check_call([BU_SCRIPT, 'do_test_changes', input_file],
                                  stdout=stdout_obj)
        actual_tests = [l for l in fu.read_file(output_file).split('\n') if l]
        expected_tests.sort()
        actual_tests.sort()
        if expected_tests != actual_tests:
            LOGGER.error('Expected tests: %s', ' '.join(expected_tests))
            LOGGER.error('  Actual tests: %s', ' '.join(actual_tests))
            raise Error('SUBMITQ tests failed!!')
    finally:
        os.remove(input_file)
        os.remove(output_file)
Example #6
def empty_value_handle_share_exchange_info():
    """
    Dirty value handling for table 年报-股东股权转让.xlsx.
    First we drop rows with too many empty values in
    ['变更前股权比例', '变更后股权比例', '年报年份', '股权变更日期']:
    once a row has more than 2 empties in these 4 columns, it is dropped.
    Then we check nulls column by column and decide how to process them.
    Next we convert all values to numeric form for later processing.
    Once that is done, we work out the features we can use in this table, which belongs
        to exploratory data analysis.

    -----------------------------
    变更前股权比例
    ------
    Empty percentage is 0.3939% (17 out of 4316). We replace empties with -1.
    The format is not uniform: some values are '.07%', some '0.07%' and some '0.07'. We drop the '%'
    and normalize everything to the '0.07' form. Numbers greater than 1 are marked as -1.

    -----------------------------
    变更后股权比例
    ------
    Empty percentage is 0.278% (12 out of 4316). We replace empties with -1.
    The format is not uniform: some values are '.07%', some '0.07%' and some '0.07'. We drop the '%'
    and normalize everything to the '0.07' form. Numbers greater than 1 are marked as -1.
    A more complicated problem is that some values actually belong to '股权变更日期'; those are copied over to
    the '股权变更日期' column.

    -----------------------------
    股权变更日期
    ------
    Empty percentage is 0.3939% (17 out of 4316). Empty values are replaced with the invalid value '1000-01-01'
    so we can handle them later.
    The rest are well formatted as yyyy-mm-dd.

    -----------------------------
    年报年份
    ------
    Empty percentage is 0.139% (6 out of 4316). Empty values are replaced with the invalid value '1000'
    so we can handle them later.
    The rest are well formatted.

    -----------------------------
    :return:
    """
    empty_check_list = [
        u'变更前股权比例'.encode('utf-8'), u'变更后股权比例'.encode('utf-8'),
        u'年报年份'.encode('utf-8'), u'股权变更日期'.encode('utf-8')
    ]
    dcu.drop_rows_too_many_empty(u'年报-股东股权转让.xlsx',
                                 columns=empty_check_list,
                                 thresh=2)
    panaly.list_category_columns_values([u'年报-股东股权转让'],
                                        u'年报-股东股权转让_empty_handled',
                                        file_url=clean_data_temp_file_url)

    df = file_utils.read_file_to_df(clean_data_temp_file_url, u'年报-股东股权转让')
    values = {
        u'变更前股权比例'.encode('utf-8'): -1,
        u'变更后股权比例'.encode('utf-8'): -1,
        u'股权变更日期'.encode('utf-8'): '1000-01-01',
        u'年报年份'.encode('utf-8'): '1000'
    }
    for index in range(0, len(df)):
        content = df.at[index, u'股权变更日期'.encode('utf-8')]
        content_b = df.at[index, u'变更后股权比例'.encode('utf-8')]
        if '-' in str(content_b) and (pandas.isnull(content)
                                      or pandas.isna(content)):
            df.set_value(index, u'股权变更日期'.encode('utf-8'), content_b)
            df.set_value(index, u'变更后股权比例'.encode('utf-8'), '')

    df = df.fillna(values)
    file_utils.write_file(df, clean_data_temp_file_url, u'年报-股东股权转让')

    dcu.drop_unit_with_float_format(u'年报-股东股权转让',
                                    u'变更前股权比例'.encode('utf-8'), ['%'],
                                    empty_mask=-1)
    dcu.drop_unit_with_float_format(u'年报-股东股权转让',
                                    u'变更后股权比例'.encode('utf-8'), ['%'],
                                    empty_mask=-1)

    dcu.mark_invalid_num_data(u'年报-股东股权转让', u'变更前股权比例'.encode('utf-8'), '>',
                              100)
    dcu.mark_invalid_num_data(u'年报-股东股权转让', u'变更后股权比例'.encode('utf-8'), '>',
                              100)

    return
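
The docstring above describes how the ratio columns (变更前股权比例, 变更后股权比例) are normalized: drop a trailing '%', convert to float, and mask empty or out-of-range values with -1. dcu.drop_unit_with_float_format and dcu.mark_invalid_num_data are project helpers not shown here; a rough pandas sketch of the same idea (the function name and the max_valid threshold are illustrative only) is:

import pandas as pd

def normalize_ratio_column(df, column, empty_mask=-1.0, max_valid=1.0):
    # Strip a trailing '%', coerce to float, and mask empties or invalid values.
    # Note: the example above later marks values greater than 100 (not 1) as invalid.
    def to_ratio(value):
        if pd.isnull(value) or value == '':
            return empty_mask
        text = str(value).strip()
        if text.endswith('%'):
            text = text[:-1]
        try:
            number = float(text)
        except ValueError:
            return empty_mask
        return number if number <= max_valid else empty_mask

    df[column] = df[column].apply(to_ratio)
    return df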
Example #7
def cross_section(file_name, vars, file_url=clean_data_temp_file_url, dst_file_url=corporation_index_file_url):
    """
    Indicator: cross terms of all variables with time (quarter), converted into cross-sectional data.
    :param file_name:
    :param vars: the set of variables to be crossed with time, given as a list
    :param file_url:
    :param dst_file_url:
    :return:
    """

    """
    For debugging this function:
    file_name=u'上市公司财务信息-每股指标'
    vars=[u'基本每股收益(元)', u'扣非每股收益(元)',u'稀释每股收益(元)',
        u'每股净资产(元)',u'每股公积金(元)',u'每股未分配利润(元)',u'每股经营现金流(元)']
    file_url = clean_data_temp_file_url
    dst_file_url = corporation_index_file_url
    """

    data_frame = file_utils.read_file_to_df(file_url, file_name)
    date = data_frame[u'日期'.encode('utf-8')]  # the date column
    unique_date = numpy.sort(list(set(date)))  # deduplicate and sort by time
    # keep only dates after 2016-09
    # for j in range(0, len(unique_date)):
    #     if unique_date[j][0:3]<2016 or unique_date[j][5:6]<09:
    #         unique_date[j]=[]

    # Column names of the new table: cross terms of variable name and date.
    var_date = []
    for i in range(0, len(vars)):
        for j in range(0, len(unique_date)):
            var_date.append(vars[i] + unique_date[j].encode('utf-8'))
    """
    Scratch test:
    a = []
    aa=[u'每股收益',2,3]
    a = pandas.DataFrame(index=[range(1001, 3001)], columns=aa)
#   when writing, the file name must be quoted; index=True means the index column is written too
    file_utils.write_file(a, file_utils.check_file_url(dst_file_url), 'a',ext='.xlsx',
                          sheet_name='Sheet', index='true')    

    ab=u'每股收益'
    ab in aa
    a.set_value(company,ab,this_number)
    
    column in var_date
    
    print data_frame.values[0][0]
    """

    # Create the empty result table.
    b = pandas.DataFrame(index=[range(test_start, test_end + 1)], columns=var_date)

    # Fill in the values.
    for i in range(0, len(vars)):
        for j in range(0, len(data_frame)):  # each row of the original table
            company = data_frame.iloc[j, 0]
            # .at needs the column name, not the column position
            # company = data_frame.at[j, u'企业总评分']
            this_season = data_frame.at[j, u'日期'.encode('utf-8')]
            this_number = data_frame.at[j, vars[i]]
            if this_number != 'Unknown':
                column = vars[i] + this_season.encode('utf-8')
                b.set_value(company, column, this_number)

    file_utils.write_file(b, file_utils.check_file_url(dst_file_url), file_name + '_index', ext='.xlsx',
                          sheet_name='Sheet', index=True)

    """
    Handling of empty values here is still a bit problematic:
    status_normal = [u'--']  # search for values matching this condition
    status_list = [status_normal]
    status_after = ['Unknown']  # replace them with this
    for i in range(0,len(var_date)):
        dcu.merge_status('b', u'基本每股收益(元)2010-12-31', status_list, status_after, empty_mask='-65535')
    var_date[i]
    """

    """
    Snippets to run:
        file_name=u'上市信息财务信息资产负债表'
        vars=[u'资产:货币资金(元)',u'资产:应收账款(元)',u'资产:其它应收款(元)',u'资产:存货(元)',
        u'资产:流动资产合计(元)',u'资产:长期股权投资(元)',u'资产:累计折旧(元)',u'资产:固定资产(元)',u'资产:无形资产(元)',u'资产:资产总计(元)',u'负债:应付账款(元)',
        u'负债:预收账款(元)',u'负债:存货跌价准备(元)',u'负债:流动负债合计(元)',u'负债:长期负债合计(元)',
        u'负债:负债合计(元)',u'权益:实收资本(或股本)(元)',u'权益:资本公积金(元)',u'权益:盈余公积金(元)',u'权益:股东权益合计(元)',u'流动比率']
    import exploratory_data_finance
    
    exploratory_data_finance.cross_section(u'上市公司财务信息-每股指标', [u'基本每股收益(元)'.encode('utf-8'), u'扣非每股收益(元)'.encode('utf-8'), u'稀释每股收益(元)'.encode('utf-8'),
                u'每股净资产(元)'.encode('utf-8'), u'每股公积金(元)'.encode('utf-8'), u'每股未分配利润(元)'.encode('utf-8'), u'每股经营现金流(元)'.encode('utf-8')])
    exploratory_data_finance.cross_section(u'上市信息财务信息-财务风险指标', [u'资产负债率(%)'.encode('utf-8'),u'流动负债/总负债(%)'.encode('utf-8'),u'流动比率'.encode('utf-8'),u'速动比率'.encode('utf-8')])
    exploratory_data_finance.cross_section(u'上市信息财务信息-成长能力指标', [u'营业总收入(元)'.encode('utf-8'),u'毛利润(元)'.encode('utf-8'),u'归属净利润(元)'.encode('utf-8'),
        u'扣非净利润(元)'.encode('utf-8'),u'营业总收入同比增长(元)'.encode('utf-8'),u'归属净利润同比增长(元)'.encode('utf-8'),u'扣非净利润同比增长(元)'.encode('utf-8'),
        u'营业总收入滚动环比增长(元)'.encode('utf-8'),u'归属净利润滚动环比增长(元)'.encode('utf-8'),u'扣非净利润滚动环比增长(元)'.encode('utf-8')])
    exploratory_data_finance.cross_section(u'上市信息财务信息-利润表', [u'营业收入(元)',u'营业成本(元)',u'销售费用(元)',u'财务费用(元)',
       u'管理费用(元)',u'资产减值损失(元)',u'投资收益(元)',u'营业利润(元)',u'利润总额(元)',u'所得税(元)',u'归属母公司所有者净利润(元)'])
    exploratory_data_finance.cross_section(u'上市信息财务信息-现金流量表', [u'经营:销售商品、提供劳务收到的现金(元)',u'经营:收到的税费返还(元)',
                u'经营:收到其他与经营活动有关的现金(元)', u'经营:经营活动现金流入小计(元)',u'经营:购买商品、接受劳务支付的现金(元)',u'经营:支付给职工以及为职工支付的现金(元)',
                u'经营:支付的各项税费(元)',u'经营:支付其他与经营活动有关的现金(元)',u'经营:经营活动现金流出小计(元)',u'经营:经营活动产生的现金流量净额(元)',
                u'投资:取得投资收益所收到的现金(元)',u'投资:处置固定资产、无形资产和其他长期资产收回的现金净额(元)',u'投资:投资活动现金流入小计(元)',
                u'投资:购建固定资产、无形资产和其他长期资产支付的现金(元)',u'投资:投资支付的现金(元)',u'投资:投资活动现金流出小计(元)',
                u'投资:投资活动产生的现金流量净额(元)',u'筹资:吸收投资收到的现金(元)',u'筹资:取得借款收到的现金(元)',u'筹资:筹资活动现金流入小计(元)',
                u'筹资:偿还债务支付的现金(元)',u'筹资:分配股利、利润或偿付利息支付的现金(元)',u'筹资:筹资活动现金流出小计(元)',u'筹资活动产生的现金流量净额(元)'])

    exploratory_data_finance.cross_section(u'上市信息财务信息盈利能力指标', [u'加权净资产收益率(%)',u'摊薄净资产收益率(%)',u'摊薄总资产收益率(%)',u'毛利率(%)',u'净利率(%)',u'实际税率(%)'])
    exploratory_data_finance.cross_section(u'上市信息财务信息运营能力指标', [u'总资产周转率(次)',u'应收账款周转天数(天)',u'存货周转天数(天)'])
    exploratory_data_finance.cross_section(u'上市信息财务信息资产负债表', [u'资产:货币资金(元)',u'资产:应收账款(元)',u'资产:其它应收款(元)',u'资产:存货(元)',
        u'资产:流动资产合计(元)',u'资产:长期股权投资(元)',u'资产:累计折旧(元)',u'资产:固定资产(元)',u'资产:无形资产(元)',u'资产:资产总计(元)',u'负债:应付账款(元)',
        u'负债:预收账款(元)',u'负债:存货跌价准备(元)',u'负债:流动负债合计(元)',u'负债:长期负债合计(元)',
        u'负债:负债合计(元)',u'权益:实收资本(或股本)(元)',u'权益:资本公积金(元)',u'权益:盈余公积金(元)',u'权益:股东权益合计(元)',u'流动比率'])

    """
    return
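
The nested loops above fill the company-by-(variable, date) table one cell at a time with set_value. Assuming there is at most one row per company and date in the source sheet, a rough pandas equivalent using pivot (the function and parameter names here are illustrative) would be:

import pandas as pd

def cross_section_pivot(data_frame, vars, company_col, date_col=u'日期'.encode('utf-8')):
    # Pivot each variable so that rows are companies and columns are dates,
    # then flatten the (variable, date) pairs into single column names.
    pieces = []
    for var in vars:
        wide = data_frame.pivot(index=company_col, columns=date_col, values=var)
        wide.columns = [var + str(d) for d in wide.columns]
        pieces.append(wide)
    return pd.concat(pieces, axis=1)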
Example #8
def drop_indexes_too_many_empty():  # drop columns with too many empty rows; re-running this on an already-processed table adds an extra index column
    for file_n in category_finance_files:
        df = file_utils.read_file_to_df(corporation_index_file_url, file_n + '_index')
        df = df.dropna(axis=1, thresh=200)
        df = df.fillna(-0.5)
        file_utils.write_file(df, corporation_index_file_url, file_n + '_index')
def generate_index_custom_credit(corporate_start, corporate_end):
    """
    ***海关进出口信用***
    Indicator 1: economic zone, 8 in total, int
    Indicator 2: business category, 6 in total, int
    Indicator 3: companies flagged as deregistered at customs, 1 in total, int
    Indicator 4: annual report status, 5 in total, int
    Indicator 5: credit level, 4 in total, int
    :return:
    """

    columns = [
        'kind_of_range_1', 'kind_of_range_2', 'kind_of_range_3',
        'kind_of_range_4', 'kind_of_range_5', 'kind_of_range_6',
        'kind_of_range_7', 'kind_of_range_8', 'kind_of_tax_company_1',
        'kind_of_tax_company_2', 'kind_of_tax_company_3',
        'kind_of_tax_company_4', 'kind_of_tax_company_5',
        'kind_of_tax_company_6', 'log_out_custom', 'status_of_annual_report_1',
        'status_of_annual_report_2', 'status_of_annual_report_3',
        'status_of_annual_report_4', 'status_of_annual_report_5',
        'level_of_credit_1', 'level_of_credit_2', 'level_of_credit_3',
        'level_of_credit_4'
    ]
    dis_df = pd.DataFrame(columns=columns)

    data_frame = fu.read_file_to_df(clean_data_temp_file_url, u'海关进出口信用')
    for corporate in range(corporate_start, corporate_end + 1):
        row_dict = {}
        row_list = []

        total_num1 = 0
        total_num2 = 0
        total_num3 = 0
        total_num4 = 0
        total_num5 = 0

        df_temp = data_frame[data_frame[u'企业编号'.encode('utf-8')] == corporate]

        # economic zone
        for i in range(1, 9):
            y_df = df_temp[df_temp[u'经济区划'.encode('utf-8')] == i]
            row_list.append(len(y_df))
            total_num1 += len(df_temp)

        # business category
        for i in range(1, 7):
            y_df = df_temp[df_temp[u'经营类别'.encode('utf-8')] == i]
            row_list.append(len(y_df))
            total_num2 += len(df_temp)

        # print(len(row_list))

        # rows with the customs deregistration flag set
        y_df = df_temp.loc[df_temp[u'海关注销标志'.encode('utf-8')] == 2,
                           u'海关注销标志'.encode('utf-8')]
        row_list.append(len(y_df))
        total_num3 += len(df_temp)

        # print(len(row_list))

        # annual report status (5 categories, matching status_of_annual_report_1..5)
        for i in range(1, 6):
            y_df = df_temp[df_temp[u'年报情况'.encode('utf-8')] == i]
            row_list.append(len(y_df))
            total_num4 += len(df_temp)

        # credit level (4 categories, matching level_of_credit_1..4)
        for i in range(1, 5):
            y_df = df_temp[df_temp[u'信用等级'.encode('utf-8')] == i]
            row_list.append(len(y_df))
            total_num5 += len(df_temp)

        row_dict[corporate] = row_list

        dis_df = dis_df.append(pd.DataFrame(row_dict, index=columns).T,
                               ignore_index=False)

    fu.write_file(dis_df,
                  corporation_index_file_url,
                  u'海关进出口信用_index',
                  index=True)
    return
def generate_index_financing(corporate_start, corporate_end):
    """
    ***融资信息***
    Indicator 1: number of financing events of the company
    Indicator 2: number of financings per round type, 29 in total, int
    Indicator 3: investment amount below 100 million, between 100 million and 500 million, above 500 million, 3 in total, int
    Indicator 4: financing year before 2009, between 2009 and 2013, after 2013, 3 in total, int
    :return:
    """

    columns = [
        'financing_count', 'round_1', 'round_2', 'round_3', 'round_4',
        'round_5', 'round_6', 'round_7', 'round_8', 'round_9', 'round_10',
        'round_11', 'round_12', 'round_13', 'round_14', 'round_15', 'round_16',
        'round_17', 'round_18', 'round_19', 'round_20', 'round_21', 'round_22',
        'round_23', 'round_24', 'round_25', 'round_26', 'round_27', 'round_28',
        'round_29', 'investment_amount_less_than_100_million',
        'investment_amount_between_100million_and_500_million',
        'investment_amount_more_than_500_million',
        'investment_year_before_2009', 'invest_year_between_2009_and_2013',
        'investment_year_after_2013'
    ]
    dis_df = pd.DataFrame(columns=columns)

    data_frame = fu.read_file_to_df(clean_data_temp_file_url, u'融资信息')
    for corporate in range(corporate_start, corporate_end + 1):
        row_dict = {}
        row_list = []

        total_num1 = 0
        total_num2 = 0
        total_num3 = 0
        total_num4 = 0
        total_num5 = 0
        total_num6 = 0
        total_num7 = 0
        total_num8 = 0

        df_temp = data_frame[data_frame[u'企业编号'.encode('utf-8')] == corporate]

        # number of financing events of the company
        row_list.append(len(df_temp))
        total_num1 += len(df_temp)

        # number of financings per round type
        for i in range(1, 30):
            y_df = df_temp[df_temp[u'轮次'.encode('utf-8')] == i]
            row_list.append(len(y_df))
            total_num2 += len(df_temp)

        # investment amount below 100 million
        y_df = df_temp[(df_temp[u'投资金额'.encode('utf-8')] > 0)
                       & (df_temp[u'投资金额'.encode('utf-8')] <= 100000000)]
        row_list.append(len(y_df))
        total_num3 += len(df_temp)

        # investment amount between 100 million and 500 million
        y_df = df_temp[(df_temp[u'投资金额'.encode('utf-8')] > 100000000)
                       & (df_temp[u'投资金额'.encode('utf-8')] <= 500000000)]
        row_list.append(len(y_df))
        total_num4 += len(df_temp)

        # investment amount above 500 million
        y_df = df_temp[df_temp[u'投资金额'.encode('utf-8')] > 500000000]
        row_list.append(len(y_df))
        total_num5 += len(df_temp)

        # financing dated before 2009 (valid years only)
        y_df = df_temp[(df_temp['year0'] > 1000) & (df_temp['year0'] < 2009)]
        row_list.append(len(y_df))
        total_num6 += len(df_temp)

        # financing dated between 2009 and 2013
        y_df = df_temp[(df_temp['year0'] >= 2009) & (df_temp['year0'] < 2013)]
        row_list.append(len(y_df))
        total_num7 += len(df_temp)

        # financing dated in 2013 or later
        y_df = df_temp[df_temp['year0'] >= 2013]
        row_list.append(len(y_df))
        total_num8 += len(df_temp)

        row_dict[corporate] = row_list

        dis_df = dis_df.append(pd.DataFrame(row_dict, index=columns).T,
                               ignore_index=False)

    fu.write_file(dis_df,
                  corporation_index_file_url,
                  u'融资信息_index',
                  index=True)
    return
def generate_index_bond(corporate_start, corporate_end):
    """
    ***债券信息***
    Indicator 1: number of bonds per credit rating, 7 in total, int
    Indicator 2: number of bonds with a term of at most / more than one year, 2 in total, int
    Indicator 3: number of bonds per bond type, 7 in total, int
    Indicator 4: number of bonds with a planned issuance of at most / more than 1 billion, 2 in total, int
    Indicator 5: number of bonds with a coupon rate of at most / more than 5%, 2 in total, int
    Indicator 6: number of bonds per interest payment method, 6 in total, int
    Indicator 7: number of bonds issued before / after 2013, 2 in total, int
    Indicator 8: number of bonds maturing before / after 2020, 2 in total, int
    :return:
    """

    columns = [
        'ranking_of_bond_1', 'ranking_of_bond_2', 'ranking_of_bond_3',
        'ranking_of_bond_4', 'ranking_of_bond_5', 'ranking_of_bond_6',
        'ranking_of_bond_7', 'bond_duration_less_than_1_year',
        'bond_duration_longer_than_1_year', 'kind_of_bond_1', 'kind_of_bond_2',
        'kind_of_bond_3', 'kind_of_bond_4', 'kind_of_bond_5', 'kind_of_bond_6',
        'kind_of_bond_7', 'total_planned_issuance_less_than_one_billion',
        'total_planned_issuance_more_than_one_billion',
        'interest_rate_less_than_5%', 'interest_rate_more_than_5%',
        'interest_pay_1', 'interest_pay_2', 'interest_pay_3', 'interest_pay_4',
        'interest_pay_5', 'interest_pay_6',
        'issuance_date_of_bonds_before_2013',
        'issuance_date_of_bonds_after_2013', 'bond_payment_date_before_2020',
        'bond_payment_date_after_2020'
    ]
    dis_df = pd.DataFrame(columns=columns)

    data_frame = fu.read_file_to_df(clean_data_temp_file_url, u'债券信息')
    for corporate in range(corporate_start, corporate_end + 1):
        row_dict = {}
        row_list = []

        total_num1 = 0
        total_num2 = 0
        total_num3 = 0
        total_num4 = 0
        total_num5 = 0
        total_num6 = 0
        total_num7 = 0
        total_num8 = 0
        total_num9 = 0
        total_num10 = 0
        total_num11 = 0
        total_num12 = 0
        total_num13 = 0

        df_temp = data_frame[data_frame[u'企业编号'.encode('utf-8')] == corporate]

        # bond credit rating
        for i in range(1, 8):
            y_df = df_temp[df_temp[u'债券信用评级'.encode('utf-8')] == i]
            row_list.append(len(y_df))
            total_num1 += len(df_temp)

        # bond term of at most one year
        y_df = df_temp[df_temp[u'债券期限'.encode('utf-8')] <= 1]
        row_list.append(len(y_df))
        total_num2 += len(df_temp)

        # bond term longer than one year
        y_df = df_temp[df_temp[u'债券期限'.encode('utf-8')] > 1]
        row_list.append(len(y_df))
        total_num3 += len(df_temp)

        # bond type
        for i in range(1, 8):
            y_df = df_temp[df_temp[u'债券品种'.encode('utf-8')] == i]
            row_list.append(len(y_df))
            total_num4 += len(df_temp)

        # planned issuance total of 1 billion or less
        y_df = df_temp[df_temp[u'计划发行总额(亿元)'.encode('utf-8')] <= 10]
        row_list.append(len(y_df))
        total_num5 += len(df_temp)

        # planned issuance total above 1 billion
        y_df = df_temp[df_temp[u'计划发行总额(亿元)'.encode('utf-8')] > 10]
        row_list.append(len(y_df))
        total_num6 += len(df_temp)

        # coupon rate of 5% or less
        y_df = df_temp[df_temp[u'票面利率(%)'.encode('utf-8')] <= 5]
        row_list.append(len(y_df))
        total_num7 += len(df_temp)

        # coupon rate above 5%
        y_df = df_temp[df_temp[u'票面利率(%)'.encode('utf-8')] > 5]
        row_list.append(len(y_df))
        total_num8 += len(df_temp)

        # interest payment method
        for i in range(1, 7):
            y_df = df_temp[df_temp[u'付息方式'.encode('utf-8')] == i]
            row_list.append(len(y_df))
            total_num9 += len(df_temp)

        # bonds issued in 2013 or earlier (valid years only)
        y_df = df_temp[(df_temp['year0'] <= 2013) & (df_temp['year0'] > 1000)]
        row_list.append(len(y_df))
        total_num11 += len(df_temp)

        # bonds issued in 2014 or later
        y_df = df_temp[df_temp['year0'] > 2013]
        row_list.append(len(y_df))
        total_num10 += len(df_temp)

        # bonds maturing in 2020 or earlier (valid years only)
        y_df = df_temp[(df_temp['year1'] <= 2020) & (df_temp['year1'] > 1000)]
        row_list.append(len(y_df))
        total_num13 += len(df_temp)

        # bonds maturing after 2020
        y_df = df_temp[df_temp['year1'] > 2020]
        row_list.append(len(y_df))
        total_num12 += len(df_temp)

        row_dict[corporate] = row_list

        dis_df = dis_df.append(pd.DataFrame(row_dict, index=columns).T,
                               ignore_index=False)

    fu.write_file(dis_df,
                  corporation_index_file_url,
                  u'债券信息_index',
                  index=True)
    return
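
The generators above all repeat the same pattern: for each company, count how many of its rows fall into each category of a column. A compact sketch of that pattern using value_counts (the helper name is illustrative, not part of the original code) is:

def count_categories(df_temp, column, categories):
    # For one company's rows, count how many fall into each expected category value.
    counts = df_temp[column].value_counts()
    return [int(counts.get(category, 0)) for category in categories]

# e.g. the economic-zone block in generate_index_custom_credit is equivalent to:
# row_list += count_categories(df_temp, u'经济区划'.encode('utf-8'), range(1, 9))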
Example #12
def time_split(file_name, column_name, i=0):
    df = fu.read_file_to_df(clean_data_temp_file_url, file_name, sheet_name='Sheet')  # read the worksheet
    df["year"+str(i)], df["month"+str(i)], df["day"+str(i)] = df[column_name].str.split("-", n=2).str  # split into year/month/day columns; n is the number of splits
    df.drop(column_name, axis=1, inplace=True)  # drop the original column
    fu.write_file(df, clean_data_temp_file_url, file_name, ext='.xlsx', sheet_name='Sheet', index=False)  # save
    return
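
For reference, time_split turns a yyyy-mm-dd column into yearN/monthN/dayN columns and drops the original. A hypothetical call (the file and column names are placeholders taken from the other examples) could look like:

# Split u'股权变更日期' of the cleaned sheet into year0/month0/day0.
time_split(u'年报-股东股权转让', u'股权变更日期'.encode('utf-8'), i=0)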
def generate_index_basic_info():
    """
    ***工商基本信息表***
    Indicator 1: registered capital (10k yuan), 3000 in total, int
    Indicator 2: operating status, 4 in total, int
    Indicator 3: industry category (code), 18 in total, int
    Indicator 4: industry subgroup (code), 80 in total, int
    Indicator 5: type, 2 in total, int
    Indicator 6: province code, 32 in total, int
    Indicator 7: listed or not, 2 in total, int
    Indicator 8: number of employees, 41 in total, int
    Indicator 9: company age (years in existence)
    Indicator 10: deregistered or not, 2 in total, int

    :return:
    """

    data_frame = fu.read_file_to_df(clean_data_temp_file_url, u'工商基本信息表')

    data_frame.rename(columns={
        u'企业编号'.encode('utf-8'): 'Unnamed: 0',
        u'注册资本币种(正则)'.encode('utf-8'): 'type_of_currency',
        u'注册资本(万元)'.encode('utf-8'): 'register_capital',
        u'经营状态'.encode('utf-8'): 'running_status',
        u'行业大类(代码)'.encode('utf-8'): 'industry category',
        u'行业小类(代码)'.encode('utf-8'): 'industry subgroup',
        u'类型'.encode('utf-8'): 'type',
        u'省份代码'.encode('utf-8'): 'province_code',
        u'是否上市'.encode('utf-8'): 'list_shares_or_not',
        u'员工人数'.encode('utf-8'): 'staff_number'
    },
                      inplace=True)

    # Convert registered capital to RMB.
    # Set column2 to value2 for the rows selected by value1 in column1.
    data_frame.loc[data_frame['type_of_currency'] == 2,
                   'register_capital'] = data_frame.register_capital.apply(
                       lambda x: x * 6.7)
    data_frame.drop('type_of_currency', axis=1, inplace=True)

    # Company age.
    data_frame["year_2019"] = 2019  # constant column holding the year 2019
    # Subtract the founding year (year0) from 2019 to get the company age; x is the current row.
    data_frame['exist_year'] = data_frame.apply(
        lambda x: x['year_2019'] - x['year0'], axis=1)

    # Whether the company has been deregistered.
    # Set column2 to value2 for the rows selected by value1 in column1.
    # A normal company gets 1, a deregistered company gets 0.
    data_frame['log_out_or_not'] = 0
    data_frame.loc[((data_frame[u'注销原因'.encode('utf-8')] == -1) &
                    (data_frame[u'注销时间'.encode('utf-8')] == -1)),
                   'log_out_or_not'] = 1
    data_frame.drop([u'注销原因'.encode('utf-8'), u'注销时间'.encode('utf-8')],
                    axis=1,
                    inplace=True)

    data_frame.drop(['month0', 'day0'], axis=1, inplace=True)

    fu.write_file(data_frame,
                  corporation_index_file_url,
                  u'工商基本信息表_index',
                  index=True)
    return
Example #14
 def serialize(self, path):
     file_utils.write_file(path, self.tostring())
Example #15
def empty_value_handle_share_holder_info():
    """
    Dirty value handling for table 年报-股东(发起人)及出资信息_rearranged.xlsx.
    First we drop rows with too many empty values in
    ['实缴出资额(万元)', '实缴出资方式', '实缴出资日期', '认缴出资方式', '认缴出资日期', '认缴出资额(万元)']:
    once a row has more than 2 empties in these 6 columns, it is dropped.
    Then we check nulls column by column and decide how to process them.
    Next we convert all values to numeric form for later processing.
    Once that is done, we work out the features we can use in this table, which belongs
        to exploratory data analysis.

    -----------------------------
    股东类型
    ------
    Empty percentage is 95.8587% (76547 out of 79854). We drop this column.

    -----------------------------
    股东所占比例
    ------
    Empty percentage is 98.7815263% (78881 out of 79854). We drop this column.

    -----------------------------
    认缴出资方式
    ------
    Empty percentage is 2.3418% (1870 out of 79854). We replace empties with -1.
    The values are too complicated to encode, so we only count the number of items (a feature that may be
    named '认缴出资方式种类数'). To do this we split on [',', '、'], after first dropping any trailing ',' or '、'.

    -----------------------------
    认缴出资额(万元)
    ------
    Empty percentage is 0.0288% (23 out of 79854). We replace empties with -1.
    We drop the units ['万', '万元', '万元人民币', '万人民币'], and convert ['万美元'] by multiplying the number
    by 6.7.

    -----------------------------
    认缴出资日期
    ------
    Empty percentage is 1.7344% (1385 out of 79854). We replace empties with '1000-01-01'.
    All values are formatted as yyyy-mm-dd.
    Some dates are later than 2019-03-01; we treat them as invalid and replace them the same way as empties.

    -----------------------------
    实缴出资方式
    ------
    Empty percentage is 5.9484% (4750 out of 79854). We replace empties with -1.
    The values are too complicated to encode, so we only count the number of items (a feature that may be
    named '实缴出资方式种类数'). To do this we split on [',', '、', ','], after first dropping any trailing
    ',', '、' or ','.

    -----------------------------
    实缴出资额(万元)
    ------
    Empty percentage is 3.2284% (2578 out of 79854). We replace empties with -1.
    We drop the units ['万', '万元', '万元人民币', '万人民币'], and convert ['万美元'] by multiplying the number
    by 6.7.

    -----------------------------
    实缴出资日期
    ------
    Empty percentage is 5.2558% (4197 out of 79854). We replace empties with '1000-01-01'.
    All values are formatted as yyyy-mm-dd.
    Some dates are later than 2019-03-01; we treat them as invalid and replace them the same way as empties.

    -----------------------------
    年报年份
    ------
    Empty percentage is 0.05009% (40 out of 79854). We replace empties with '1000'.

    -----------------------------
    :return:
    """
    empty_check_list = [
        u'实缴出资方式'.encode('utf-8'), u'实缴出资日期'.encode('utf-8'),
        u'实缴出资额(万元)'.encode('utf-8'), u'认缴出资方式'.encode('utf-8'),
        u'认缴出资日期'.encode('utf-8'), u'认缴出资额(万元)'.encode('utf-8')
    ]
    dcu.drop_rows_too_many_empty(u'年报-股东(发起人)及出资信息_rearranged.xlsx',
                                 columns=empty_check_list,
                                 thresh=2)
    panaly.list_category_columns_values(
        [u'年报-股东(发起人)及出资信息_rearranged'],
        u'年报-股东(发起人)及出资信息_rearranged_empty_handled',
        file_url=clean_data_temp_file_url)

    dcu.drop_columns(u'年报-股东(发起人)及出资信息_rearranged',
                     [u'股东类型'.encode('utf-8'), u'股东所占比例'.encode('utf-8')])

    df = file_utils.read_file_to_df(clean_data_temp_file_url,
                                    u'年报-股东(发起人)及出资信息_rearranged')
    values = {
        u'认缴出资方式'.encode('utf-8'): -1,
        u'实缴出资方式'.encode('utf-8'): -1,
        u'认缴出资日期'.encode('utf-8'): '1000-01-01',
        u'实缴出资日期'.encode('utf-8'): '1000-01-01',
        u'认缴出资额(万元)'.encode('utf-8'): -1,
        u'实缴出资额(万元)'.encode('utf-8'): -1,
        u'年报年份'.encode('utf-8'): '1000'
    }
    df = df.fillna(values)
    file_utils.write_file(df, clean_data_temp_file_url,
                          u'年报-股东(发起人)及出资信息_rearranged')

    # 认缴出资方式
    # 实缴出资方式
    splits = [',', u'、', u',']
    dcu.drop_unit(u'年报-股东(发起人)及出资信息_rearranged',
                  u'认缴出资方式'.encode('utf-8'),
                  splits,
                  empty_mask=-1)
    dcu.drop_unit(u'年报-股东(发起人)及出资信息_rearranged',
                  u'实缴出资方式'.encode('utf-8'),
                  splits,
                  empty_mask=-1)

    dcu.count_split(u'年报-股东(发起人)及出资信息_rearranged',
                    u'认缴出资方式'.encode('utf-8'),
                    splits,
                    empty_mask=-1)
    dcu.count_split(u'年报-股东(发起人)及出资信息_rearranged',
                    u'实缴出资方式'.encode('utf-8'),
                    splits,
                    empty_mask=-1)

    # 认缴出资额(万元)
    # 实缴出资额(万元)
    dcu.drop_unit_with_transfer(u'年报-股东(发起人)及出资信息_rearranged',
                                u'认缴出资额(万元)'.encode('utf-8'),
                                [u'万', u'万元', u'万元人民币', u'万人民币'], {
                                    u'万美元': 6.7,
                                    u'人民币': 0.0001
                                },
                                empty_mask=-1)
    dcu.drop_unit_with_transfer(u'年报-股东(发起人)及出资信息_rearranged',
                                u'实缴出资额(万元)'.encode('utf-8'),
                                [u'万', u'万元', u'万元人民币', u'万人民币'], {
                                    u'万美元': 6.7,
                                    u'人民币': 0.0001
                                },
                                empty_mask=-1)

    return
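
dcu.drop_unit_with_transfer is a project helper that strips amount units such as '万元' and rescales values carrying convertible units (e.g. '万美元' multiplied by 6.7), as the docstring describes. It is not shown here; a rough sketch of that behavior on a pandas Series (the name, signature and details are assumptions; the real helper works on a named sheet and column) is:

import pandas as pd

def strip_amount_units(series, drop_units, transfer_units, empty_mask=-1.0):
    # drop_units: unit suffixes to strip unchanged, e.g. [u'万元人民币', u'万人民币', u'万元', u'万']
    # transfer_units: unit suffixes that also rescale the number, e.g. {u'万美元': 6.7}
    def convert(value):
        if pd.isnull(value):
            return empty_mask
        if isinstance(value, (int, float)):
            return float(value)
        text = value.strip()
        for unit, factor in transfer_units.items():
            if text.endswith(unit):
                return float(text[:-len(unit)]) * factor
        for unit in sorted(drop_units, key=len, reverse=True):
            if text.endswith(unit):
                text = text[:-len(unit)]
                break
        try:
            return float(text)
        except ValueError:
            return empty_mask

    return series.apply(convert)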
def generate_index_tender(corporate_start, corporate_end):
    """
    ***招投标***
    Indicator 1: announcement type, 19 in total, int
    Indicator 2: province, 34 in total, int
    Indicator 3: bid won or tendered, 2 in total, int
    Indicator 4: announcement year before 2009, between 2009 and 2013, between 2013 and 2019, 3 in total, int
    :return:
    """

    columns = [
        'status_of_announcement_1', 'status_of_announcement_2',
        'status_of_announcement_3', 'status_of_announcement_4',
        'status_of_announcement_5', 'status_of_announcement_6',
        'status_of_announcement_7', 'status_of_announcement_8',
        'status_of_announcement_9', 'status_of_announcement_10',
        'status_of_announcement_11', 'status_of_announcement_12',
        'status_of_announcement_13', 'status_of_announcement_14',
        'status_of_announcement_15', 'status_of_announcement_16',
        'status_of_announcement_17', 'status_of_announcement_18',
        'status_of_announcement_19', 'province_11', 'province_12',
        'province_13', 'province_14', 'province_15', 'province_21',
        'province_22', 'province_23', 'province_31', 'province_32',
        'province_33', 'province_34', 'province_35', 'province_36',
        'province_37', 'province_41', 'province_42', 'province_43',
        'province_44', 'province_45', 'province_46', 'province_50',
        'province_51', 'province_52', 'province_53', 'province_54',
        'province_61', 'province_62', 'province_63', 'province_64',
        'province_65', 'province_71', 'province_81', 'province_82', 'bidding',
        'tendering', 'announcement_year_before_2009',
        'announcement_year_2009_2013', 'announcement_year_2013_2019'
    ]
    dis_df = pd.DataFrame(columns=columns)

    data_frame = fu.read_file_to_df(clean_data_temp_file_url, u'招投标')
    for corporate in range(corporate_start, corporate_end + 1):
        row_dict = {}
        row_list = []

        total_num1 = 0
        total_num2 = 0
        total_num3 = 0
        total_num4 = 0
        total_num5 = 0
        total_num6 = 0

        df_temp = data_frame[data_frame[u'企业编号'.encode('utf-8')] == corporate]

        # announcement type
        for i in range(1, 20):
            y_df = df_temp[df_temp[u'公告类型'.encode('utf-8')] == i]
            row_list.append(len(y_df))
            total_num1 += len(df_temp)

        # province
        for i in (11, 12, 13, 14, 15, 21, 22, 23, 31, 32, 33, 34, 35, 36, 37,
                  41, 42, 43, 44, 45, 46, 50, 51, 52, 53, 54, 61, 62, 63, 64,
                  65, 71, 81, 82):
            y_df = df_temp[df_temp[u'省份'.encode('utf-8')] == i]
            row_list.append(len(y_df))
            total_num2 += len(df_temp)

        # bid won or tendered
        for i in range(1, 3):
            y_df = df_temp[df_temp[u'中标或招标'.encode('utf-8')] == i]
            row_list.append(len(y_df))
            total_num3 += len(df_temp)

        y_df = df_temp[(df_temp['year0'] <= 2009) & (df_temp['year0'] > 1000)]
        row_list.append(len(y_df))
        total_num4 += len(df_temp)

        y_df = df_temp[(df_temp['year0'] > 2009) & (df_temp['year0'] <= 2013)]
        row_list.append(len(y_df))
        total_num5 += len(df_temp)

        y_df = df_temp[(df_temp['year0'] > 2013) & (df_temp['year0'] <= 2019)]
        row_list.append(len(y_df))
        total_num6 += len(df_temp)

        row_dict[corporate] = row_list

        dis_df = dis_df.append(pd.DataFrame(row_dict, index=columns).T,
                               ignore_index=False)

    fu.write_file(dis_df, corporation_index_file_url, u'招投标_index', index=True)
    return
Example #17
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--aprinter-src-dir')
    parser.add_argument('--request-file')
    parser.add_argument('--response-file')
    parser.add_argument('--temp-dir')
    parser.add_argument('--stderr-truncate-bytes')
    parser.add_argument('--python')
    parser.add_argument('--nix-build')
    parser.add_argument('--mkdir')
    parser.add_argument('--rsync')
    parser.add_argument('--p7za')
    parser.add_argument('--bash')
    parser.add_argument('--head')
    parser.add_argument('--cat')
    args = parser.parse_args()
    
    # Read the request.
    with file_utils.use_input_file(args.request_file) as input_stream:
        request = input_stream.read()
    
    # The response will be built from these variables.
    response_success = False
    response_message = ''
    response_error = None
    response_filename = None
    response_data = None

    try:
        # Create a subfolder which we will archive.
        build_path = os.path.join(args.temp_dir, 'aprinter-build')
        run_process_limited(args, [args.mkdir, build_path], '', 'The mkdir failed!?')
        
        # Write the configuration to the build folder.
        config_path = os.path.join(build_path, 'config.json')
        file_utils.write_file(config_path, request)
        
        # Do the build.
        result_path = os.path.join(args.temp_dir, 'result')
        nixbuild_cmd = [args.nix_build, args.aprinter_src_dir, '-A', 'aprinterBuild',
            '-o', result_path, '--argstr', 'aprinterConfigFile', config_path]
        run_process_limited(args, nixbuild_cmd, '', 'Failed to build APrinter.')
        
        # Copy the build to the build_path.
        run_process_limited(args, [args.rsync, '-rL', '--chmod=ugo=rwX', '{}/'.format(result_path), '{}/'.format(build_path)], '', 'The rsync failed!?')
        
        # Produce the archive.
        archive_filename = 'aprinter-build.zip'
        archive_path = os.path.join(args.temp_dir, archive_filename)
        archive_cmd = [args.p7za, 'a', archive_path, build_path]
        run_process_limited(args, archive_cmd, '', 'The p7za failed!?')
        
        # Read the archive contents.
        archive_contents = file_utils.read_file(archive_path)
        
        response_success = True
        response_message = 'Compilation successful.'
        response_filename = archive_filename
        response_data = archive_contents
        
    except ProcessError as e:
        response_message = str(e)
        response_error = e.stderr_output
    except Exception as e:
        response_message = str(e)
    
    # Build the response.
    response = collections.OrderedDict({})
    response['success'] = response_success
    response['message'] = response_message
    if response_error is not None:
        response['error'] = response_error
    if response_filename is not None:
        response['filename'] = response_filename
        response['data'] = base64.b64encode(response_data)
    
    # Write the response.
    with file_utils.use_output_file(args.response_file) as output_stream:
        json.dump(response, output_stream)
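
run_process_limited and ProcessError are helpers of this build service that are not included in the example. Judging from how they are called (a command list, stdin data and an error message, with --stderr-truncate-bytes limiting how much stderr is kept), a minimal sketch might look like this; the real helper presumably uses the --head/--cat/--bash tools passed on the command line, while this sketch just uses subprocess directly:

import subprocess

class ProcessError(Exception):
    def __init__(self, message, stderr_output):
        Exception.__init__(self, message)
        self.stderr_output = stderr_output

def run_process_limited(args, cmd, stdin_data, error_message):
    # Run cmd, feed it stdin_data, and raise ProcessError with (truncated) stderr on failure.
    proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.PIPE)
    _, stderr_data = proc.communicate(stdin_data)
    if proc.returncode != 0:
        limit = int(args.stderr_truncate_bytes) if args.stderr_truncate_bytes else 0
        if limit > 0:
            stderr_data = stderr_data[-limit:]
        raise ProcessError(error_message, stderr_data)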
Example #18
def generate_index_general_taxer(corporate_start, corporate_end):
    """
    ***一般纳税人***
    Indicator 1: number of taxpayer registrations the company holds, 1 in total, int
    Indicator 2: number of registrations per taxpayer qualification type, 4 in total, int
    Indicator 3: number of registrations whose taxpayer status is cancelled or under inspection, 1 in total, int
    Indicator 4: number of registrations per registration type, 6 in total, int
    Indicator 5: number of export tax rebate (exemption) registrations, 1 in total, int
    Indicator 6: number of registrations identified before 2000, between 2000 and 2010, after 2010, 3 in total, int
    :return:
    """

    columns = [
        'taxer_count',
        'kind_taxer_1',
        'kind_taxer_2',
        'kind_taxer_3',
        'kind_taxer_4',
        'status_cancel_or_inspire',
        'register_1',
        'register_2',
        'register_3',
        'register_4',
        'register_5',
        'register_6',
        'export_tax_rebate',
        'identify_year_before_2000',
        'identify_year_between_2000_and_2010',
        'identify_year_after_2010',
    ]
    dis_df = pd.DataFrame(columns=columns)

    data_frame = fu.read_file_to_df(clean_data_temp_file_url, u'一般纳税人')
    for corporate in range(corporate_start, corporate_end + 1):
        row_dict = {}
        row_list = []

        total_num1 = 0
        total_num2 = 0
        total_num3 = 0
        total_num4 = 0
        total_num5 = 0
        total_num6 = 0
        total_num7 = 0
        total_num8 = 0

        df_temp = data_frame[data_frame[u'企业编号'.encode('utf-8')] == corporate]

        # number of taxpayer registrations of the company
        row_list.append(len(df_temp))
        total_num1 += len(df_temp)

        # number of registrations per taxpayer qualification type
        for kind in range(1, 5):
            y_df = df_temp.loc[df_temp[u'纳税人资格'.encode('utf-8')] == kind,
                               u'纳税人资格'.encode('utf-8')]
            row_list.append(len(y_df))
            total_num2 += len(df_temp)

        # registrations whose taxpayer status is cancelled or under inspection
        y_df = df_temp.loc[df_temp[u'纳税人状态'.encode('utf-8')] >= 2,
                           u'纳税人状态'.encode('utf-8')]
        row_list.append(len(y_df))
        total_num3 += len(df_temp)

        # registrations per registration type
        for kind in range(1, 7):
            y_df = df_temp.loc[df_temp[u'登记注册类型'.encode('utf-8')] == kind,
                               u'登记注册类型'.encode('utf-8')]
            row_list.append(len(y_df))
            total_num4 += len(df_temp)

        # export tax rebate (exemption) registrations
        y_df = df_temp.loc[df_temp[u'出口状态备案状态'.encode('utf-8')] == 1,
                           u'出口状态备案状态'.encode('utf-8')]
        row_list.append(len(y_df))
        total_num5 += len(df_temp)

        # identification date before 2000 (valid years only)
        y_df = df_temp.loc[(df_temp['year0'] > 1000)
                           & (df_temp['year0'] <= 2000)]
        row_list.append(len(y_df))
        total_num6 += len(df_temp)

        # identification date between 2000 and 2010
        y_df = df_temp.loc[(df_temp['year0'] > 2000)
                           & (df_temp['year0'] <= 2010)]
        row_list.append(len(y_df))
        total_num7 += len(df_temp)

        # identification date after 2010
        y_df = df_temp.loc[df_temp['year0'] > 2010]
        row_list.append(len(y_df))
        total_num8 += len(df_temp)

        row_dict[corporate] = row_list

        dis_df = dis_df.append(pd.DataFrame(row_dict, index=columns).T,
                               ignore_index=False)

    fu.write_file(dis_df,
                  corporation_index_file_url,
                  u'一般纳税人_index',
                  index=True)
    return
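
All of these generate_index_* functions are driven by an inclusive company-id range. If the ids run from 1001 to 3000 (the range that appears in the scratch notes of Example #7; purely illustrative here), a driver could look like:

# Build the per-company index sheets for companies 1001..3000 (illustrative range).
generate_index_custom_credit(1001, 3000)
generate_index_financing(1001, 3000)
generate_index_bond(1001, 3000)
generate_index_tender(1001, 3000)
generate_index_general_taxer(1001, 3000)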