Example #1
def data_categorize():
    """
    categorize these tables
    :return:
    """
    file_name = u'数据初整理'
    new_file_name = u'categorize数据初整理.xlsx'
    data = file_utils.read_file_to_df('', file_name, sheet_name='data')
    categorize_info = file_utils.read_file_to_df('',
                                                 file_name,
                                                 sheet_name='categorize')
    writer = pandas.ExcelWriter(new_file_name)
    for column in categorize_info.columns:
        categorized_data = pandas.DataFrame(columns=data.columns.tolist())
        for row in data.itertuples():
            if row[1] in categorize_info[column].tolist():
                row_data = [list(row[1:])]
                categorized_data = pandas.concat(
                    [
                        categorized_data,
                        pandas.DataFrame(row_data,
                                         columns=data.columns.tolist())
                    ],
                    ignore_index=True)
        file_utils.write_file_without_save(categorized_data,
                                           writer,
                                           sheet_name=column,
                                           index=False)

    writer.save()
def extract_keyword(file_name,
                    column_name,
                    keywords,
                    empty_mask='Unknown',
                    others_mask='Others',
                    file_url=clean_data_temp_file_url,
                    dst_file_url=clean_data_temp_file_url):
    data_frame = file_utils.read_file_to_df(file_url, file_name)
    for index in range(0, len(data_frame)):
        content = data_frame.at[index, column_name]
        if pandas.isnull(content):
            data_frame.set_value(index, column_name, empty_mask)
            continue  # keep the empty mask; don't let the keyword loop overwrite it
        for j in range(0, len(keywords)):
            if keywords[j] in str(content):
                data_frame.set_value(index, column_name, keywords[j])
                break

    # anything that is neither a keyword nor the empty mask becomes the others mask
    for index in range(0, len(data_frame)):
        content = data_frame.at[index, column_name]
        if content not in keywords and content != empty_mask:
            data_frame.set_value(index, column_name, others_mask)

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)
    return
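A usage sketch (the file name, column name and keyword list below are hypothetical):

# collapse every cell onto the first keyword it contains; empty cells become
# 'Unknown' and anything left unmatched becomes 'Others'
extract_keyword(u'工商基本信息表', u'经营状态'.encode('utf-8'), [u'在业', u'注销'])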
def empty_value_handle_work():
    """

    :return:
    """
    df = file_utils.read_file_to_df(clean_data_temp_file_url, u'作品著作权')
    values = {
        u'作品著作权类别'.encode('utf-8'): 9,
        u'作品著作权登记日期'.encode('utf-8'): '1000-01-01',
        u'作品著作权创作完成日期'.encode('utf-8'): '1000-01-01',
        u'作品著作权首次发布日期'.encode('utf-8'): '1000-01-01'
    }
    df = df.fillna(values)
    file_utils.write_file(df, clean_data_temp_file_url, u'作品著作权')

    status_1 = [u'A 文字', u'文字', u'文字作品']
    status_2 = [u'B 音乐', u'音乐', u'音乐作品']
    status_3 = [u'F 美术', u'美术', u'美术作品']
    status_4 = [u'G 摄影', u'摄影', u'摄影作品']
    status_5 = [u'H 电影', u'电影', u'电影作品和类似摄制电影的方法创造的作品', u'电影和类似摄制电影方法创作的作品', u'I 类似摄制电影方法创作作品', u'类似摄制电影方法创作的作品']
    status_6 = [u'J 工程设计图、产品设计图', u'工程设计图、产品设计图', u'工程设计图、产品设计图作品', u'建筑']
    status_7 = [u'K 地图、示意图', u'地图、示意图', u'图形']
    status_8 = [9]
    status_list = [status_1, status_2, status_3, status_4, status_5, status_6, status_7, status_8]
    status_after = [1, 2, 3, 4, 5, 6, 7, 9]

    dcu.merge_status(u'作品著作权', u'作品著作权类别'.encode('utf-8'), status_list, status_after, others=8)

    # TODO Other columns
    return
def drop_useless_indexes(index_files,
                         ind_fil,
                         read_url=corporation_index_file_url,
                         write_url=corporation_index_file_url):
    """
    Drop indexes we think is useless from the image of scatter.
    :return:
    """
    print('total indexes: ' + str(len(ind_fil)))
    indexes_filter_temp = ind_fil
    counts = 0
    for file_n in index_files:
        print file_n

        data_frame = fu.read_file_to_df(read_url, file_n + '_index')
        for column in data_frame.columns:
            if column in [
                    'Unnamed: 0', u'企业总评分', 'int_score', 'int_score_root'
            ]:
                continue
            if column not in ind_fil:
                data_frame = data_frame.drop(column, axis=1)
            else:
                indexes_filter_temp.remove(column)
        counts += len(data_frame.columns) - 4  # minus the 4 bookkeeping columns skipped above
        fu.write_file(data_frame, fu.check_file_url(write_url),
                      file_n + '_index')
    print('set indexes: ' + str(counts))
    print(indexes_filter_temp)
def time_periods_format(file_name,
                        column_name,
                        file_url=clean_data_temp_file_url,
                        dst_file_url=clean_data_temp_file_url):
    data_frame = file_utils.read_file_to_df(file_url, file_name)
    for index in range(0, len(data_frame)):
        content = data_frame.at[index, column_name]
        if pandas.isnull(content):
            data_frame.set_value(index, column_name, '-')
            continue
        if u'年' in content:
            content = str(content).replace('-', '~').replace(
                u'年', '/').replace(u'月', '/').replace(u'日', '')
        elif '~' in content:
            content = str(content).replace('-', '/')
        elif u'至' in content:
            content = str(content).replace(u'至', '~').replace('-', '/')
        content = str(content).replace('/0', '/')  # strip leading zeros: '/09' -> '/9'

        data_frame.set_value(index, column_name, content)

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)
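A usage sketch against the trademark-period column handled later in this file; the comments show how two sample period strings would come out:

# u'2016年3月1日-2026年2月28日' -> '2016/3/1~2026/2/28'
# u'2016-03-01至2026-02-28'   -> '2016/3/1~2026/2/28'
time_periods_format(u'商标', u'商标使用期限时间段'.encode('utf-8'))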
def drop_unit_with_float_format(file_name,
                                column_name,
                                unit_strs,
                                empty_mask=-1,
                                file_url=clean_data_temp_file_url,
                                dst_file_url=clean_data_temp_file_url):
    """

    :type unit_strs: list
    """
    data_frame = file_utils.read_file_to_df(file_url, file_name)
    for index in range(0, len(data_frame)):
        content = data_frame.at[index, column_name]
        if pandas.isnull(content):
            data_frame.set_value(index, column_name, empty_mask)
            continue
        if str(content).startswith('.'):
            content = '0' + str(content)  # '.5' -> '0.5'
        for j in range(0, len(unit_strs)):
            if str(content).endswith(unit_strs[j]):
                data_frame.set_value(index, column_name,
                                     str(content).replace(unit_strs[j], ''))
                break

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)
    return
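A usage sketch mirroring the dcu.drop_unit call in clean_bond later in this file:

# '5年' -> '5', '.5年' -> '0.5'; empty cells become -1
drop_unit_with_float_format(u'债券信息', u'债券期限'.encode('utf-8'), [u'年'], empty_mask=-1)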
def count_split(file_name,
                column_name,
                splits,
                empty_mask=-1,
                file_url=clean_data_temp_file_url,
                dst_file_url=clean_data_temp_file_url):
    data_frame = file_utils.read_file_to_df(file_url, file_name)
    for index in range(0, len(data_frame)):
        content = data_frame.at[index, column_name]
        if pandas.isnull(content):
            data_frame.set_value(index, column_name, empty_mask)
            continue  # keep the empty mask instead of counting it as one item

        is_counted = False
        for j in range(0, len(splits)):
            if splits[j] in str(content):
                strs = str(content).split(splits[j])
                data_frame.set_value(index, column_name, len(strs))
                is_counted = True
        if not is_counted:
            data_frame.set_value(index, column_name, 1)

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)
    return
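A usage sketch (the file and column names are hypothetical):

# 'a;b;c' -> 3, 'a' -> 1, empty cells -> -1
count_split(u'经营范围表', u'经营范围'.encode('utf-8'), [u';'])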
def drop_invalid_data(file_name,
                      column_name,
                      operator,
                      thresh_value,
                      file_url=clean_data_temp_file_url,
                      dst_file_url=clean_data_temp_file_url):
    """
    Drop rows whose numeric value in the column satisfies the comparison,
    e.g. operator '<' with thresh_value 0 drops all rows whose value is below 0.
    """
    data_frame = file_utils.read_file_to_df(file_url, file_name)
    for index in range(0, len(data_frame)):
        content = data_frame.at[index, column_name]
        if not (isinstance(content, float) or isinstance(content, int)):
            continue

        isvalid = True
        if operator == '<':
            isvalid = not (content < thresh_value)
        elif operator == '>':
            isvalid = not (content > thresh_value)
        elif operator == '>=':
            isvalid = not (content >= thresh_value)
        elif operator == '<=':
            isvalid = not (content <= thresh_value)

        if not isvalid:
            data_frame = data_frame.drop(index=index)

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)
    return
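A usage sketch (the file and column names are hypothetical):

# drop every row whose value in the column is below 0
drop_invalid_data(u'工商基本信息表', u'注册资本'.encode('utf-8'), '<', 0)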
def pic_corr_heat_map(index_files,
                      category_name,
                      index_file_url=corporation_index_file_url,
                      heat_map_url=corporation_index_heat_map_file_url):
    """
    Draw correlation heat map pictures in order to filter features.
    :param index_files: names of the index files to draw
    :param category_name: sub-folder name for the output pictures
    :param index_file_url: where to read the index files from
    :param heat_map_url: where to store these pictures.
    :return:
    """
    fig = plt.figure(figsize=(26, 18))
    for file_n in index_files:
        print file_n
        data_frame = fu.read_file_to_df(index_file_url, file_n + '_index')
        data_frame = data_frame.drop(
            columns=['Unnamed: 0', u'企业总评分'.encode('utf-8')])
        corr_matrix = data_frame.corr()
        print(corr_matrix)
        sns.heatmap(corr_matrix,
                    annot=True,
                    vmax=1,
                    vmin=0,
                    xticklabels=True,
                    yticklabels=True,
                    square=True)
        plt.title(file_n)

        fig.savefig(
            fu.check_file_url(heat_map_url + '/' + category_name + '/') +
            file_n + '.png',
            dpi=75)
        plt.clf()
def adjust_time(file_name,
                column_name,
                empty_mask='Unknown',
                file_url=clean_data_temp_file_url,
                dst_file_url=clean_data_temp_file_url):
    """
    """
    data_frame = file_utils.read_file_to_df(file_url, file_name)
    for index in range(0, len(data_frame)):
        content = data_frame.at[index, column_name]
        if pandas.isnull(content):
            data_frame.set_value(index, column_name, empty_mask)
            continue
        if len(str(content)) > 2:
            if str(content)[2] == u'-':
                data_frame.set_value(index, column_name, '20' + str(content))

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)
    # 2014Q2-2018Q4: 19 quarters
    #     status_list = [[u'14-06-30'], [u'14-09-30'], [u'14-12-31'],
    #                    [u'15-03-31'], [u'15-06-30'], [u'15-09-30'], [u'15-12-31'],
    #                    ]
    #     status_after = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
    #     dcu.merge_status(file_name, column_name, status_list, status_after)
    return
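A usage sketch (the column name is hypothetical), matching the quarter labels in the commented block above:

# '14-06-30' -> '2014-06-30'; empty cells -> 'Unknown'
adjust_time(u'上市信息财务信息-利润表', u'日期'.encode('utf-8'))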
Example #11
def comments_generate():
    """
    generate comments for each table's dirty value handle
    :return:
    """
    comment_str_ori = u"\
    Dirty value handle for table {$$}.\n\
    First we'll drop rows that empty value is too many.\n\
    # ['主营业务收入','净利润','利润总额','所有者权益合计', '纳税总额','营业总收入','负债总额','资产总额']\n\
    # Once there are more than 3 empties in these 8 columns we will drop that row.\n\
    Then we check nulls column by column and decide how to process with it.\n\
    Next we should numeric all the value for future process.\n\
    After these are done, it's time to work out features we can use in this table which belongs\n\
        to exploratory data analysis. \n\
".encode('utf-8')
    column_str_ori = "\n\
    -----------------------------\n\
    {$$$}\n\
    ------\n"

    for file_name in os.listdir(working_file_url):
        comment_str = comment_str_ori.replace('{$$}',
                                              file_name.encode('utf-8'))
        df = file_utils.read_file_to_df(working_file_url, file_name)
        column_list = df.columns.tolist()
        for i in range(1, len(column_list)):
            comment_str += column_str_ori.replace(
                '{$$$}', column_list[i].encode('utf-8'))

        comment_str += '\n    -----------------------------'
        with open(
                file_utils.check_file_url('dirty_value_handle_comments/') +
                file_name + '_comments.txt', 'w+') as f:
            f.write(comment_str)
    return
def change_number(file_name,
                  column_name,
                  file_url=clean_data_temp_file_url,
                  dst_file_url=clean_data_temp_file_url):
    """

    :type status_names: list
    :type status: list
    """
    data_frame = file_utils.read_file_to_df(file_url, file_name)
    for index in range(0, len(data_frame)):
        content = data_frame.at[index, column_name]
        if str(content).endswith(u'万'):
            num = str(content).replace(u'万', '')  # strip the trailing '万' (ten thousand)
            numb = float(num)
            data_frame.set_value(index, column_name, numb * (10**4))
        elif str(content).endswith(u'万亿'):
            num = str(content).replace(u'万亿', '')  # strip the trailing '万亿' (trillion)
            numb = float(num)
            data_frame.set_value(index, column_name, numb * (10**12))
        elif str(content).endswith(u'亿'):
            num = str(content).replace(u'亿', '')  # strip the trailing '亿' (hundred million)
            numb = float(num)
            data_frame.set_value(index, column_name, numb * (10**8))

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)
    return
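A usage sketch (the file and column names are hypothetical):

# '3万' -> 30000.0, '2亿' -> 200000000.0, '1.5万亿' -> 1500000000000.0
change_number(u'上市信息财务信息-利润表', u'利润总额'.encode('utf-8'))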
def merge_status_new_column(file_name,
                            column_name,
                            new_column_name,
                            status,
                            status_names,
                            others='',
                            empty_mask='Unknown',
                            file_url=clean_data_temp_file_url,
                            dst_file_url=clean_data_temp_file_url):
    """

    :type status_names: list
    :type status: list
    """
    data_frame = file_utils.read_file_to_df(file_url, file_name)
    data_frame[new_column_name] = empty_mask
    for index in range(0, len(data_frame)):
        content = data_frame.at[index, column_name]
        if pandas.isnull(content):
            data_frame.set_value(index, new_column_name, empty_mask)
            continue  # keep the empty mask; don't fall through to `others`
        is_categorized = False
        for j in range(0, len(status)):
            if content in status[j]:
                data_frame.set_value(index, new_column_name, status_names[j])
                is_categorized = True
                break
        if (not is_categorized) and (not others == ''):
            data_frame.set_value(index, new_column_name, others)

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)
    return
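A usage sketch in the spirit of the status lists in empty_value_handle_work above; the new column name is hypothetical:

merge_status_new_column(u'作品著作权', u'作品著作权类别'.encode('utf-8'),
                        'category_merged',
                        [[u'文字', u'文字作品'], [u'音乐', u'音乐作品']],
                        [1, 2], others=8)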
Example #14
def generate_index_patent(corporate_start, corporate_end):
    """
    ***专利***
    指标1:专利总数,总计1个,int
    指标2:分年专利数,[pre_2001, pre_2010, 2010-2013, 2014, 2015, 2016, 2017, 2018, 2019]总计9个,int
    指标3:分专利类型专利数,总计3个,int
    指标4:专利2018年增长率(2018/2017-1),总计1个,int
    指标4:专利2017年增长率(2017/2016-1),总计1个,int
    指标4:专利2016年增长率(2016/2015-1),总计1个,int
    :return:
    """

    columns = [
        'patent_count_total', 'patent_count_pre_2001', 'patent_count_pre_2010',
        'patent_count_2010-13', 'patent_count_2014', 'patent_count_2015',
        'patent_count_2016', 'patent_count_2017', 'patent_count_2018',
        'patent_count_2019', 'fm_patent_count', 'wg_patent_count',
        'sy_patent_count'
    ]
    dis_df = pd.DataFrame(columns=columns)

    data_frame = fu.read_file_to_df(clean_data_temp_file_url, u'专利')

    for corporate in range(corporate_start, corporate_end + 1):
        row_dict = {}
        row_list = []

        df_temp = data_frame[data_frame[corporate_index_false] ==
                             corporate].copy()  # .copy() avoids SettingWithCopyWarning
        df_temp['year'] = df_temp[u'申请日'.encode('utf-8')].apply(
            lambda x: parser.parse(x).year)

        row_list.append(len(df_temp))

        for year in [2001, 2010, 2013, 2014, 2015, 2016, 2017, 2018, 2019]:
            if year == 2001:
                df_y_temp = df_temp[df_temp['year'] < 2001]
            elif year == 2010:
                df_y_temp = df_temp[df_temp['year'] < 2010]
            elif year == 2013:
                # the 2010-13 bucket, inclusive on both ends
                df_y_temp = df_temp[(df_temp['year'] >= 2010)
                                    & (df_temp['year'] <= 2013)]
            else:
                df_y_temp = df_temp[df_temp['year'] == year]
            row_list.append(len(df_y_temp))
            row_list.append(len(df_y_temp))

        for category in [0, 1, 2]:
            df_ca_temp = df_temp[df_temp[u'专利类型'.encode('utf-8')] == category]
            row_list.append(len(df_ca_temp))

        row_dict[corporate] = row_list
        dis_df = dis_df.append(pd.DataFrame(row_dict, index=columns).T,
                               ignore_index=False)
    #
    # dis_df['growth_rate_2018'] = dis_df.apply(lambda x: x['patent_count_2018'] / x['patent_count_2017'] - 1, axis=1)
    # dis_df['growth_rate_2017'] = dis_df.apply(lambda x: x['patent_count_2017'] / x['patent_count_2016'] - 1, axis=1)
    # dis_df['growth_rate_2016'] = dis_df.apply(lambda x: x['patent_count_2016'] / x['patent_count_2015'] - 1, axis=1)

    fu.write_file(dis_df, corporation_index_file_url, u'专利_index', index=True)
    return
Example #15
def clean_financing():
    file_name = u'融资信息'
    dcu.merge_status(file_name, u'融资日期'.encode('utf-8'), [], [], empty_mask='0000-00-00')
    dcu.merge_status(file_name, u'轮次'.encode('utf-8'), [], [], empty_mask='Unknown')

    time_rearranged(file_name, u'融资日期'.encode('utf-8'))

    round(file_name, u'轮次'.encode('utf-8'))

    column_name = u'投资金额'.encode('utf-8')
    wr1 = fu.read_file_to_df(clean_data_temp_file_url, file_name,
                             sheet_name='Sheet')
    wr1 = wr1.fillna({column_name: 'Unknown'})  # fill empty values so they can be indexed

    for index in range(0, len(wr1)):
        content = wr1.at[index, column_name]
        if str(content).startswith(u'数') or str(content).startswith(u'未披露'):
            # undisclosed amounts ('数...' / '未披露') are treated as 0
            wr1.set_value(index, column_name, '0')
    fu.write_file(wr1, clean_data_temp_file_url, u'融资信息', ext='.xlsx',
                  sheet_name='Sheet', index=False)

    for index in range(0, len(wr1)):
        content = wr1.at[index, column_name]
        if str(content).endswith(u'万美元'):
            # num = re.sub(u'万美元','', str(content))
            num = str(content).replace(u'万美元', u'')  # strip '万美元' and convert to CNY at the USD rate
            numb = float(num)
            numc = numb * (10 ** 4) * 6.72  # USD exchange rate on Mar 24
            wr1.set_value(index, column_name, numc)

        elif str(content).endswith(u'万港币'):
            # num = re.sub(u'万港币','', str(content))
            num = str(content).replace(u'万港币', '')  # strip '万港币' and convert to CNY at the HKD rate
            numb = float(num)
            numc = numb * (10 ** 4) * 0.856  # HKD exchange rate on Mar 24
            wr1.set_value(index, column_name, numc)

        elif str(content).endswith(u'万人民币'):
            num = str(content).replace(u'万人民币', '')  # strip '万人民币'
            numb = float(num)
            numc = numb * (10 ** 4)
            wr1.set_value(index, column_name, numc)

        elif str(content).endswith(u'万'):
            num = str(content).replace(u'万', '')  # strip '万'
            numb = float(num)
            numc = numb * (10 ** 4)
            wr1.set_value(index, column_name, numc)

    fu.write_file(wr1, clean_data_temp_file_url, u'融资信息', ext='.xlsx',
                  sheet_name='Sheet', index=False)

    return
def merge_rows_by_columns(file_name,
                          keys=None,
                          file_url=working_file_url,
                          dst_file_url=clean_data_temp_file_url):
    """
    merge a table's rows with the same unique keys.
    :param file_name:
    :param keys:
    :param file_url:
    :param dst_file_url: which file folder should store the result
    :return:
    """
    origin_df = file_utils.read_file_to_df(file_url, file_name)
    data_frame = origin_df
    data_frames = [data_frame]

    str_keys = []
    for index in range(1, len(origin_df)):
        anchor_row = origin_df[index - 1:index]

        temp_df = origin_df[index:]
        for key in keys:
            if index == 1:
                str_keys.append(key.encode('utf-8'))
            temp_df = temp_df.loc[temp_df[key.encode('utf-8')] ==
                                  anchor_row.loc[index - 1,
                                                 key.encode('utf-8')]]

        duplicated_num = len(temp_df)
        # drop the duplicated rows (they live at temp_df's indexes) from the base frame
        data_frames[0] = data_frames[0].drop(index=temp_df.index, errors='ignore')

        for frame_nums in range(1, duplicated_num + 1):
            if len(data_frames) > frame_nums:
                data_frames[frame_nums] = data_frames[frame_nums].append(
                    temp_df[frame_nums - 1:frame_nums])
            else:
                new_df = temp_df[frame_nums - 1:frame_nums]
                data_frames.append(new_df)

    data_frame = data_frames[0]
    for df in data_frames:
        data_frame = pandas.merge(data_frame,
                                  df,
                                  how='left',
                                  on=origin_df.columns.tolist())

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)

    return
def empty_value_handle_trademark():
    """
    Dirty value handle for table 商标.xlsx.
    First we'll drop rows with too many empty values.
    # ['主营业务收入','净利润','利润总额','所有者权益合计', '纳税总额','营业总收入','负债总额','资产总额']
    # Once there are more than 3 empties in these 8 columns we will drop that row.
    Then we check nulls column by column and decide how to process them.
    Next we should make all the values numeric for future processing.
    After these are done, it's time to work out features we can use in this table, which belongs
        to exploratory data analysis.

    -----------------------------
    商标状态
    ------
    Empty percentage is 0.2597%(367 out of 141312). We replace them as 'Unknown'.

    -----------------------------
    申请日期
    ------
    Empty percentage is 0.3637%(514 out of 141312). We replace with '1000-01-01'.
    Others are well formatted.

    -----------------------------
    专用权期限开始日期
    ------
    All empty, drop it.

    -----------------------------
    专用权期限结束日期
    ------
    Empty percentage is 21.4922%(30371 out of 141312). This column's value can be extracted from '商标使用期限时间段', so we
    drop it.
    -----------------------------
    商标使用期限时间段
    ------
    Empty percentage is 1.5915%(2249 out of 141312). We map them to '1000-01-01至1000-01-01'.
    Others are well formatted, except some are just '至'; these values we also change to '1000-01-01至1000-01-01'.

    -----------------------------
    :return:
    """
    df = file_utils.read_file_to_df(clean_data_temp_file_url, u'商标')
    values = {u'商标状态'.encode('utf-8'): 'Unknown', u'申请日期'.encode('utf-8'): '1000-01-01',
              u'商标使用期限时间段'.encode('utf-8'): u'1000-01-01至1000-01-01'}
    df = df.fillna(values)
    file_utils.write_file(df, clean_data_temp_file_url, u'商标')

    dcu.drop_columns(u'商标', [u'专用权期限开始日期'.encode('utf-8')])
    dcu.drop_columns(u'商标', [u'专用权期限结束日期'.encode('utf-8')])

    status_1 = [u'至']
    status_list = [status_1]
    status_after = [u'1000-01-01至1000-01-01']

    dcu.merge_status(u'商标', u'商标使用期限时间段'.encode('utf-8'), status_list, status_after)
    return
Example #18
def generate_index_work(corporate_start, corporate_end):
    """
    ***作品著作权***
    指标1:作品著作权个数,总计1个,int
    指标2:近1年作品著作权个数,总计1个,int
    指标3:近3年作品著作权个数,总计1个,int
    指标4:分类别作品著作权个数,总计9个,int

    总计12个
    :return:
    """
    columns = [
        'works_total', 'works_2018', 'works_2016_2019', 'works_1', 'works_2',
        'works_3', 'works_4', 'works_5', 'works_6', 'works_7', 'works_8',
        'works_9'
    ]
    dis_df = pd.DataFrame(columns=columns)

    data_frame = fu.read_file_to_df(clean_data_temp_file_url, u'作品著作权')
    data_frame['year'] = data_frame[u'作品著作权登记日期'.encode('utf-8')].apply(
        lambda x: edu.cal_year_in_work_copyright(x))

    for corporate in range(corporate_start, corporate_end + 1):
        row_dict = {}
        row_list = []

        df_temp = data_frame[data_frame[corporate_index_false] == corporate]

        # number of work copyrights
        row_list.append(len(df_temp))

        # work copyrights in 2018
        df_y_temp = df_temp[df_temp['year'] == 2018]
        row_list.append(len(df_y_temp))

        # work copyrights in 2016-2019
        df_y_temp = df_temp[df_temp['year'] >= 2016]
        row_list.append(len(df_y_temp))

        # work copyrights per category
        for category in range(1, 10):
            df_c_temp = df_temp[df_temp[u'作品著作权类别'.encode('utf-8')] ==
                                category]
            row_list.append(len(df_c_temp))

        row_dict[corporate] = row_list
        dis_df = dis_df.append(pd.DataFrame(row_dict, index=columns).T,
                               ignore_index=False)

    fu.write_file(dis_df,
                  corporation_index_file_url,
                  u'作品著作权_index',
                  index=True)
    return
Example #19
def time_rearranged(file_name, column_name, i=0):
    # split the date with str.split; note: later analysis expects numeric time headers
    df = fu.read_file_to_df(clean_data_temp_file_url, file_name, sheet_name='Sheet')  # read the sheet
    df["year"+str(i)], df["month"+str(i)], df["day"+str(i)] = df[column_name].str.split("-", n=2).str  # three new columns; n is the number of splits
    df.drop(column_name, axis=1, inplace=True)  # drop the original column
    fu.write_file(df, clean_data_temp_file_url, file_name, ext='.xlsx', sheet_name='Sheet', index=False)  # save

    # # alternative: split with expand=True and concat the new columns
    # table = fu.read_file_to_df(clean_data_temp_file_url, file_name, sheet_name='Sheet')
    # wr1 = pd.concat([table, table[column_name].str.split(r'-', expand=True)], axis=1, names=['year','month', 'day'])
    # fu.write_file(wr1, clean_data_temp_file_url, file_name, ext='.xlsx',sheet_name='Sheet', index=False)
    return
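A usage sketch matching the call in clean_financing above: a u'融资日期' value such as '2018-09-30' is split into string columns year0, month0 and day0:

time_rearranged(u'融资信息', u'融资日期'.encode('utf-8'))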
def merge_rows(file_name,
               keys=None,
               file_url=working_file_url,
               dst_file_url=clean_data_temp_file_url):
    """
    remove duplicated rows.
    :param file_name:
    :param keys:
    :param file_url:
    :param dst_file_url: which file folder should store the result
    :return:
    """

    data_frame = file_utils.read_file_to_df(file_url, file_name)
    data_frame = data_frame.drop_duplicates()

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)

    return
Example #21
def score_integerize():  # round scores to integers
    """
    scores are floats; we want to try whether integers help.
    :return:
    """
    for file_n in category_finance_files:
        print file_n

        data_frame = file_utils.read_file_to_df(corporation_index_file_url, file_n + '_index')
        data_frame['int_score'] = data_frame[u'企业总评分'.encode('utf-8')].apply(lambda x: round(x))

        file_utils.write_file(data_frame, corporation_index_file_url, file_n + '_index')
Example #22
def clean_bond():
    file_name = u'债券信息'
    dcu.merge_status(file_name, u'债券信用评级'.encode('utf-8'), [], [], empty_mask='Unknown')
    dcu.merge_status(file_name, u'付息日期'.encode('utf-8'), [], [], empty_mask='00-00')
    dcu.merge_status(file_name, u'兑付日期'.encode('utf-8'), [], [], empty_mask='0000-00-00')
    dcu.merge_status(file_name, u'主体信用评级'.encode('utf-8'), [], [], empty_mask='Unknown')  # empties become 'Unknown'
    dcu.merge_status(file_name, u'债券品种'.encode('utf-8'), [], [], empty_mask='Unknown')
    dcu.merge_status(file_name, u'付息方式'.encode('utf-8'), [], [], empty_mask='Unknown')

    dcu.drop_unit(file_name, u'债券期限'.encode('utf-8'), [u'年'], empty_mask=-1)

    wr1 = fu.read_file_to_df(clean_data_temp_file_url, file_name,
                             sheet_name='Sheet')
    wr1 = wr1.fillna({u'纳税人资格'.encode('utf-8'): 'unknown'})  # fill empty values so they can be indexed
    fu.write_file(wr1, clean_data_temp_file_url, file_name, ext='.xlsx',
                  sheet_name='Sheet', index=False)

    wr1 = fu.read_file_to_df(clean_data_temp_file_url, file_name,
                             sheet_name='Sheet')
    wr1 = wr1.fillna({u'票面利率(%)'.encode('utf-8'): 'unknown'})  # fill empty values so they can be indexed
    fu.write_file(wr1, clean_data_temp_file_url, file_name, ext='.xlsx',
                  sheet_name='Sheet', index=False)

    dcu.drop_columns(file_name, u'币种'.encode('utf-8'))
    dcu.drop_columns(file_name, u'流通场所'.encode('utf-8'))
    dcu.drop_columns(file_name, u'实际发行总额(亿元)'.encode('utf-8'))

    ranking_of_bond(file_name, u'债券信用评级'.encode('utf-8'))
    kind_of_bond(file_name, u'债券品种'.encode('utf-8'))
    ranking_of_co(file_name, u'主体信用评级'.encode('utf-8'))
    interest_pay(file_name, u'付息方式'.encode('utf-8'))

    time_rearranged(file_name, u'发行日期'.encode('utf-8'), i=0)
    time_rearranged(file_name, u'兑付日期'.encode('utf-8'), i=1)

    return
Example #23
def list_file_columns_values(file_name, file_url=working_file_url):
    """
    list a file's columns statistic info.
    :param file_name:
    :param file_url:
    :return:
    """
    columns_dict = {}
    data = file_utils.read_file_to_df(file_url, file_name)
    for column in data.columns:
        print('column:%s' % column)
        if list(data.columns).index(column) == 0:
            # ignore the first column -- the company number
            continue
        dropped_data = data.drop_duplicates(subset=[column], keep='first')
        # if dropped_data.size > 1000:
        #     column_dict = {column: ['varied']}
        # else:
        #     column_dict = {column: dropped_data[column].tolist()}
        sort_list = dropped_data[column].tolist()
        sort_list.sort()
        sort_list.insert(0, 'Nan Percent')
        sort_list.insert(0, 'Total Num')
        count_list = []
        for item in sort_list:
            if item == 'Total Num':
                count_list.append(len(data))
            elif item == 'Nan Percent':
                try:
                    count_list.append(
                        float(data[column].isna().sum()) /
                        float(len(data[column])))
                except AttributeError:
                    count_list.append(0)
            elif item is numpy.nan:
                try:
                    count_list.append(data[column].isna().sum())
                except AttributeError:
                    count_list.append(0)
            elif isinstance(item, unicode):
                counted_data = data[data[column] == item.encode('utf-8')]
                count_list.append(len(counted_data))
            else:
                count_list.append(len(data[data[column] == item]))
        column_dict = {column: sort_list}
        count_dict = {column + '_count': count_list}
        columns_dict.update(column_dict)
        columns_dict.update(count_dict)

    return columns_dict
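A usage sketch (the file name is one of the tables handled above): the returned dict pairs each column's distinct values with their counts, plus 'Total Num' and 'Nan Percent' entries:

columns_dict = list_file_columns_values(u'商标')
print(columns_dict.keys())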
def append_score():
    """
    append score to each index file.
    :return:
    """
    score_frame = fu.read_file_to_df(working_file_url, u'企业评分')
    score_frame = score_frame.set_index(u'企业编号'.encode('utf-8'))

    for file_n in category_landing_purchase:
        print file_n

        data_frame = fu.read_file_to_df(corporation_index_file_url,
                                        file_n + '_index')
        data_frame = data_frame.set_index('Unnamed: 0')

        data_frame = data_frame.join(score_frame)

        fu.write_file(data_frame,
                      corporation_index_file_url,
                      file_n + '_index',
                      index=True)
    return
Example #25
def append_score():  # append the score column
    """
    append score to each index file.
    :return:
    """
    score_frame = file_utils.read_file_to_df(working_file_url, u'企业评分')
    score_frame = score_frame.set_index(u'企业编号')

    for file_n in category_finance_files:
        print file_n
        data_frame = file_utils.read_file_to_df(corporation_index_file_url, file_n + '_index')
        # columns = data_frame.columns.values.tolist()
        # print data_frame
        try:
            data_frame = data_frame.set_index('Unnamed: 0')
        except KeyError as f:
            print f.message
        try:
            data_frame = data_frame.join(score_frame)
            file_utils.write_file(data_frame, corporation_index_file_url, file_n + '_index', index=True)
        except ValueError as e:
            print e.message
Example #26
def gen_growth_ratio():
    data_frame = file_utils.read_file_to_df(corporation_index_file_url, u'上市信息财务信息-利润表' + '_index')
    print data_frame.columns
    # data_frame['interest_growth_1709-1809'] = data_frame.apply(
    #     lambda x: edu.cal_growth_rate(x, u'利润总额(元)2018-09-30'.encode('utf-8'), u'利润总额(元)2017-09-30'.encode('utf-8'),
    #                                   default=0, jump_value=-65535), axis=1)
    data_frame['interest_growth_range_1709-1809'] = data_frame.apply(
        lambda x: edu.cal_growth_rate_range(x, u'利润总额(元)2018-09-30'.encode('utf-8'),
                                            u'利润总额(元)2017-09-30'.encode('utf-8'),
                                            [-100, -50, -10, -5, -3, -1, 0, 1, 3, 5, 10, 50, 100, 300],
                                            default=0, jump_value=-65535), axis=1)

    file_utils.write_file(data_frame, corporation_index_file_url, u'上市信息财务信息-利润表' + '_index')
Example #27
def generate_index_copyright(corporate_start, corporate_end):
    """
    ***软著著作权***
    指标1:软件著作权个数,总计1个,int
    指标2:软件著作权登记批准日期在2017-01-01(含)之后的个数,总计1个,int
    指标3:软件著作权登记批准日期在2013-01-01(不含)之前的个数,总计1个,int
    指标4:软件著作权登记批准日期在2006-01-01(不含)之前的个数,总计1个,int

    总计4个
    :return:
    """
    columns = [
        'copyright_total', 'copyright_after_2017', 'copyright_before_2013',
        'copyright_before_2006'
    ]
    dis_df = pd.DataFrame(columns=columns)

    data_frame = fu.read_file_to_df(clean_data_temp_file_url, u'软著著作权')
    data_frame['year'] = data_frame[u'软件著作权登记批准日期'.encode('utf-8')].apply(
        lambda x: edu.cal_year_in_common(x))

    for corporate in range(corporate_start, corporate_end + 1):
        row_dict = {}
        row_list = []

        df_temp = data_frame[data_frame[corporate_index_false] == corporate]

        # number of software copyrights
        row_list.append(len(df_temp))

        # number registered on or after 2017-01-01
        df_y_temp = df_temp[df_temp['year'] >= 2017]
        row_list.append(len(df_y_temp))

        # number registered before 2013-01-01 (year > 1000 excludes the empty-date placeholder)
        df_y_temp = df_temp[(df_temp['year'] < 2013) & (df_temp['year'] > 1000)]
        row_list.append(len(df_y_temp))

        # number registered before 2006-01-01 (reuses the < 2013 subset)
        df_y_temp = df_y_temp[df_y_temp['year'] < 2006]
        row_list.append(len(df_y_temp))

        row_dict[corporate] = row_list
        dis_df = dis_df.append(pd.DataFrame(row_dict, index=columns).T,
                               ignore_index=False)

    fu.write_file(dis_df,
                  corporation_index_file_url,
                  u'软著著作权_index',
                  index=True)
    return
Example #28
def list_single_column_values(file_name,
                              column_name,
                              file_url=working_file_url):
    """
    list a single column's all values
    :param file_name: the file name to be handled (all files should be stored in file_directions.working_file_url)
    :param column_name: the column name to be handled
    :param file_url:
    :return: a list of column values
    """
    data = file_utils.read_file_to_df(file_url, file_name)

    dropped_data = data.drop_duplicates(subset=[column_name], keep='first')
    return dropped_data[column_name].tolist()
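A usage sketch using the trademark table described above:

# distinct values of the trademark-status column
print(list_single_column_values(u'商标', u'商标状态'.encode('utf-8')))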
Example #29
def time_rearranged(file_name, column_name, i=0):

    # split the datetime apart; note: later analysis expects numeric time headers
    df = fu.read_file_to_df(clean_data_temp_file_url, file_name, sheet_name='Sheet')  # read the sheet
    df["time"+str(i)], df["minute"+str(i)] = df[column_name].str.split(" ", n=1).str  # two new columns; n is the number of splits
    df.drop(column_name, axis=1, inplace=True)  # drop the original column
    df.drop("minute"+str(i), axis=1, inplace=True)  # drop the time-of-day part
    fu.write_file(df, clean_data_temp_file_url, file_name, ext='.xlsx', sheet_name='Sheet', index=False)  # save

    # table = fu.read_file_to_df(clean_data_temp_file_url, file_name, sheet_name='Sheet')
    # wr1 = pd.concat([table, table[column_name].str.split(r' ', expand=True)], axis=1, names=['time','minute'])
    # fu.write_file(wr1, clean_data_temp_file_url, file_name, ext='.xlsx',sheet_name='Sheet', index=False)
    #
    # dcu.drop_columns(file_name, 1 )
    return
Example #30
def time_split(file_name, column_name, i=0):
    df = fu.read_file_to_df(clean_data_temp_file_url,
                            file_name,
                            sheet_name='Sheet')  # read the sheet
    df["year" + str(i)], df["month" + str(i)], df["day" + str(i)] = \
        df[column_name].str.split("-", n=2).str  # three new columns; n is the number of splits
    df.drop(column_name, axis=1, inplace=True)  # drop the original column
    fu.write_file(df,
                  clean_data_temp_file_url,
                  file_name,
                  ext='.xlsx',
                  sheet_name='Sheet',
                  index=False)  # save
    return