Code Example #1
def drop_invalid_data(file_name,
                      column_name,
                      operator,
                      thresh_value,
                      file_url=clean_data_temp_file_url,
                      dst_file_url=clean_data_temp_file_url):
    """
    Drop every row whose numeric value in column_name violates the given
    threshold, e.g. operator '<' with thresh_value 0 drops all rows whose
    value is below 0.
    """
    data_frame = file_utils.read_file_to_df(file_url, file_name)
    for index in range(0, len(data_frame)):
        content = data_frame.at[index, column_name]
        # non-numeric cells are left untouched
        if not (isinstance(content, float) or isinstance(content, int)):
            continue

        isvalid = True
        if operator == '<':
            isvalid = not (content < thresh_value)
        elif operator == '>':
            isvalid = not (content > thresh_value)
        elif operator == '>=':
            isvalid = not (content >= thresh_value)
        elif operator == '<=':
            isvalid = not (content <= thresh_value)

        if not isvalid:
            data_frame = data_frame.drop(index=index)

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)
    return
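A minimal usage sketch (the file and column names here are hypothetical; by default the function reads from and writes back to clean_data_temp_file_url):

# Hypothetical call: drop every row whose 'registered_capital' value is
# below 0, i.e. keep only rows where the column is non-negative.
drop_invalid_data('company_info', 'registered_capital', '<', 0)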
Code Example #2
def pic_corr_heat_map(index_files,
                      category_name,
                      index_file_url=corporation_index_file_url,
                      heat_map_url=corporation_index_heat_map_file_url):
    """
    Draw correlation heat map pictures in order to filter features.
    :param index_files: the index files we need to analyse.
    :param category_name: the category of the indexes; images are stored in this folder.
    :param index_file_url: file url of the index files.
    :param heat_map_url: where to store these pictures.
    :return:
    """
    fig = plt.figure(figsize=(26, 18))
    for file_n in index_files:
        print(file_n)
        data_frame = fu.read_file_to_df(index_file_url, file_n + '_index')
        data_frame = data_frame.drop(
            columns=['Unnamed: 0', u'企业总评分'.encode('utf-8')])
        corr_matrix = data_frame.corr()
        print(corr_matrix)
        sns.heatmap(corr_matrix,
                    annot=True,
                    vmax=1,
                    vmin=0,
                    xticklabels=True,
                    yticklabels=True,
                    square=True)
        plt.title(file_n)

        fig.savefig(
            fu.check_file_url(heat_map_url + '/' + category_name + '/') +
            file_n + '.png',
            dpi=75)
        plt.clf()
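A usage sketch (the file list is hypothetical): each entry in index_files must have a matching '<name>_index' file under index_file_url, and one heat-map PNG per file is written below heat_map_url/category_name/:

# Hypothetical call: one correlation heat map per finance index file.
pic_corr_heat_map(['finance_report'], 'finance')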
Code Example #3
def drop_unit_with_float_format(file_name,
                                column_name,
                                unit_strs,
                                empty_mask=-1,
                                file_url=clean_data_temp_file_url,
                                dst_file_url=clean_data_temp_file_url):
    """

    :type unit_strs: list
    """
    data_frame = file_utils.read_file_to_df(file_url, file_name)
    for index in range(0, len(data_frame)):
        content = data_frame.at[index, column_name]
        if pandas.isnull(content):
            data_frame.set_value(index, column_name, empty_mask)
            continue
        if str(content).startswith('.'):
            # '.35' style values become '0.35'; write the fix back so it
            # survives even when no unit suffix matches below
            content = str(content).replace('.', '0.')
            data_frame.set_value(index, column_name, content)
        for j in range(0, len(unit_strs)):
            if str(content).endswith(unit_strs[j]):
                data_frame.set_value(index, column_name,
                                     str(content).replace(unit_strs[j], ''))

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)
    return
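A hedged usage sketch (names are hypothetical): a cell like '.35%' is first normalized to '0.35%', then the '%' suffix is stripped, leaving '0.35'; empty cells are masked with -1:

# Hypothetical call: strip '%' and '元' suffixes from a rate column.
drop_unit_with_float_format('company_info', 'profit_rate', ['%', u'元'])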
Code Example #4
def extract_keyword(file_name,
                    column_name,
                    keywords,
                    empty_mask='Unknown',
                    others_mask='Others',
                    file_url=clean_data_temp_file_url,
                    dst_file_url=clean_data_temp_file_url):
    data_frame = file_utils.read_file_to_df(file_url, file_name)
    for index in range(0, len(data_frame)):
        content = data_frame.at[index, column_name]
        if pandas.isnull(content):
            data_frame.set_value(index, column_name, empty_mask)
            continue
        for j in range(0, len(keywords)):
            if keywords[j] in str(content):
                data_frame.set_value(index, column_name, keywords[j])
                break

    # anything that matched no keyword (and is not the empty mask) is
    # bucketed under others_mask
    for index in range(0, len(data_frame)):
        content = data_frame.at[index, column_name]
        if content not in keywords and content != empty_mask:
            data_frame.set_value(index, column_name, others_mask)

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)
    return
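A sketch of the intended bucketing (names are hypothetical): each cell is replaced by the first keyword it contains; empty cells become empty_mask and everything unmatched becomes others_mask:

# Hypothetical call: collapse free-text industry descriptions into buckets.
extract_keyword('company_info', 'industry', [u'制造', u'科技', u'贸易'])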
Code Example #5
def list_category_columns_values(category,
                                 category_name,
                                 file_url=working_file_url):
    """

    :param category:
    :param category_name:
    :param file_url:
    :return:
    """

    wb = xlsxwt.Workbook(
        file_utils.check_file_url(categorized_data_file_url) + category_name +
        '.xlsx', {'nan_inf_to_errors': True})
    # writer = pandas.ExcelWriter(unicode(file_utils.check_file_url(categorized_data_file_url) + category_name + '.xlsx'))
    for file_name in category:
        print(file_name)
        ws = wb.add_worksheet(unicode(file_name))
        cols_dict = list_file_columns_values(unicode(file_name),
                                             file_url=file_url)
        index_column = 0
        cols_sort_keys = sorted(cols_dict.keys())
        for key in cols_sort_keys:
            index_row = 0
            ws.write(index_row, index_column, key)
            for item in cols_dict.get(key):
                index_row += 1
                ws.write(index_row, index_column, item)

            index_column += 1

    wb.close()
    return
Code Example #6
def comments_generate():
    """
    generate comments for each table's dirty value handle
    :return:
    """
    comment_str_ori = u"\
    Dirty value handle for table {$$}.\n\
    First we'll drop rows that empty value is too many.\n\
    # ['主营业务收入','净利润','利润总额','所有者权益合计', '纳税总额','营业总收入','负债总额','资产总额']\n\
    # Once there are more than 3 empties in these 8 columns we will drop that row.\n\
    Then we check nulls column by column and decide how to process with it.\n\
    Next we should numeric all the value for future process.\n\
    After these are done, it's time to work out features we can use in this table which belongs\n\
        to exploratory data analysis. \n\
".encode('utf-8')
    column_str_ori = "\n\
    -----------------------------\n\
    {$$$}\n\
    ------\n"

    for file_name in os.listdir(working_file_url):
        comment_str = comment_str_ori.replace('{$$}',
                                              file_name.encode('utf-8'))
        df = file_utils.read_file_to_df(working_file_url, file_name)
        column_list = df.columns.tolist()
        for i in range(1, len(column_list)):
            comment_str += column_str_ori.replace(
                '{$$$}', column_list[i].encode('utf-8'))

        comment_str += '\n    -----------------------------'
        with open(
                file_utils.check_file_url('dirty_value_handle_comments/') +
                file_name + '_comments.txt', 'w+') as f:
            f.write(comment_str)
    return
Code Example #7
def change_number(file_name,
                  column_name,
                  file_url=clean_data_temp_file_url,
                  dst_file_url=clean_data_temp_file_url):
    """

    :type status_names: list
    :type status: list
    """
    data_frame = file_utils.read_file_to_df(file_url, file_name)
    for index in range(0, len(data_frame)):
        content = data_frame.at[index, column_name]
        if str(content).endswith(u'万'):
            num = str(content).replace(u'万', '')  # 把前面的改成后面的,此处是删去结尾的'万'
            numb = float(num)
            data_frame.set_value(index, column_name, numb * (10**4))
        elif str(content).endswith(u'万亿'):
            num = str(content).replace(u'万亿', '')  # 把前面的改成后面的,此处是删去结尾的'万'
            numb = float(num)
            data_frame.set_value(index, column_name, numb * (10**12))
        elif str(content).endswith(u'亿'):
            num = str(content).replace(u'亿', '')  # 把前面的改成后面的,此处是删去结尾的'万'
            numb = float(num)
            data_frame.set_value(index, column_name, numb * (10**8))

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)
    return
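Illustrative of the intended conversion (values made up): u'3.5万' becomes 35000.0, u'2万亿' becomes 2.0e12, and u'1.2亿' becomes 1.2e8:

# Hypothetical call: normalize a capital column that mixes 万/亿 suffixes.
change_number('company_info', u'注册资本')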
Code Example #8
def count_split(file_name,
                column_name,
                splits,
                empty_mask=-1,
                file_url=clean_data_temp_file_url,
                dst_file_url=clean_data_temp_file_url):
    data_frame = file_utils.read_file_to_df(file_url, file_name)
    for index in range(0, len(data_frame)):
        content = data_frame.at[index, column_name]
        if pandas.isnull(content):
            data_frame.set_value(index, column_name, empty_mask)
            # skip to the next row, otherwise the mask would be
            # overwritten with a count of 1 below
            continue

        is_counted = False
        for j in range(0, len(splits)):
            if splits[j] in str(content):
                strs = str(content).split(splits[j])
                data_frame.set_value(index, column_name, len(strs))
                is_counted = True
        if not is_counted:
            data_frame.set_value(index, column_name, 1)

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)
    return
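A usage sketch (names are hypothetical): a cell like 'a;b;c' is replaced by the count 3, cells without any separator count as 1, and empty cells are masked with empty_mask:

# Hypothetical call: count shareholders listed in one cell.
count_split('company_info', 'shareholders', [';', ','])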
Code Example #9
def adjust_time(file_name,
                column_name,
                empty_mask='Unknown',
                file_url=clean_data_temp_file_url,
                dst_file_url=clean_data_temp_file_url):
    """
    """
    data_frame = file_utils.read_file_to_df(file_url, file_name)
    for index in range(0, len(data_frame)):
        content = data_frame.at[index, column_name]
        if pandas.isnull(content):
            data_frame.set_value(index, column_name, empty_mask)
        if len(str(content)) > 2:
            if str(content)[2] == u'-':
                data_frame.set_value(index, column_name, '20' + str(content))

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)
    # 2014 Q2 through 2018 Q4: 19 quarters
    #     status_list = [[u'14-06-30'], [u'14-09-30'], [u'14-12-31'],
    #                    [u'15-03-31'], [u'15-06-30'], [u'15-09-30'], [u'15-12-31'],
    #                    ...]
    #     status_after = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
    #     dcu.merge_status(file_name, column_name, status_list, status_after)
    return
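Illustrative effect: a cell '14-06-30' (two-digit year, with '-' as the third character) becomes '2014-06-30'; empty cells become 'Unknown':

# Hypothetical call: expand two-digit years in a quarterly date column.
adjust_time('quarterly_report', u'日期')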
Code Example #10
def merge_status_new_column(file_name,
                            column_name,
                            new_column_name,
                            status,
                            status_names,
                            others='',
                            empty_mask='Unknown',
                            file_url=clean_data_temp_file_url,
                            dst_file_url=clean_data_temp_file_url):
    """

    :type status_names: list
    :type status: list
    """
    data_frame = file_utils.read_file_to_df(file_url, file_name)
    data_frame[new_column_name] = empty_mask
    for index in range(0, len(data_frame)):
        content = data_frame.at[index, column_name]
        if pandas.isnull(content):
            data_frame.set_value(index, new_column_name, empty_mask)
        is_categorized = False
        for j in range(0, len(status)):
            if content in status[j]:
                data_frame.set_value(index, new_column_name, status_names[j])
                is_categorized = True
        if not is_categorized and others != '':
            data_frame.set_value(index, new_column_name, others)

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)
    return
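A usage sketch (values are hypothetical): raw status strings are grouped into named buckets in a new column, leaving the original column untouched:

# Hypothetical call: collapse detailed statuses into 'active'/'closed';
# anything unmatched becomes 'other'.
merge_status_new_column('company_info', 'status', 'status_group',
                        [[u'在业', u'存续'], [u'注销', u'吊销']],
                        ['active', 'closed'], others='other')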
Code Example #11
def time_periods_format(file_name,
                        column_name,
                        file_url=clean_data_temp_file_url,
                        dst_file_url=clean_data_temp_file_url):
    data_frame = file_utils.read_file_to_df(file_url, file_name)
    for index in range(0, len(data_frame)):
        content = data_frame.at[index, column_name]
        if pandas.isnull(content):
            data_frame.set_value(index, column_name, '-')
            continue
        # normalize e.g. u'2014年3月5日-2015年3月4日' to '2014/3/5~2015/3/4'
        if u'年' in content:
            content = str(content).replace('-',
                                           '~').replace(u'年', '/').replace(
                                               u'月', '/').replace(u'日', '')
        elif '~' in content:
            content = str(content).replace('-', '/')
        elif u'至' in content:
            content = str(content).replace(u'至', '~').replace('-', '/')
        content = str(content).replace('/0', '/')  # strip zero-padding: '/05' -> '/5'

        data_frame.set_value(index, column_name, content)

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)
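The intended normalization, sketched with a made-up value: u'2014年3月5日-2015年3月4日' becomes '2014/3/5~2015/3/4' (the final '/0' replacement also strips zero-padding such as '/05' to '/5'); empty cells become '-':

# Hypothetical call: normalize mixed period formats in a term column.
time_periods_format('license_info', u'经营期限')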
Code Example #12
def drop_useless_indexes(index_files,
                         ind_fil,
                         read_url=corporation_index_file_url,
                         write_url=corporation_index_file_url):
    """
    Drop indexes we think is useless from the image of scatter.
    :return:
    """
    print('total indexes: ' + str(len(ind_fil)))
    indexes_filter_temp = ind_fil
    counts = 0
    for file_n in index_files:
        print file_n

        data_frame = fu.read_file_to_df(read_url, file_n + '_index')
        for column in data_frame.columns:
            if column in [
                    'Unnamed: 0', u'企业总评分', 'int_score', 'int_score_root'
            ]:
                continue
            if column not in ind_fil:
                data_frame = data_frame.drop(column, axis=1)
            else:
                indexes_filter_temp.remove(column)
        counts += len(data_frame.columns) - 4  # minus the 4 meta columns skipped above
        fu.write_file(data_frame, fu.check_file_url(write_url),
                      file_n + '_index')
    print('set indexes: ' + str(counts))
    print(indexes_filter_temp)
Code Example #13
def merge_rows_by_columns(file_name,
                          keys=None,
                          file_url=working_file_url,
                          dst_file_url=clean_data_temp_file_url):
    """
    merge a table's rows with the same unique keys.
    :param file_name:
    :param keys:
    :param file_url:
    :param dst_file_url: which file folder should store the result
    :return:
    """
    origin_df = file_utils.read_file_to_df(file_url, file_name)
    data_frames = [origin_df]

    str_keys = [key.encode('utf-8') for key in keys]
    index = 1
    while index < len(origin_df):
        anchor_row = origin_df[index - 1:index]

        # rows after the anchor that share the anchor's key values
        temp_df = origin_df[index:]
        for key in str_keys:
            temp_df = temp_df.loc[temp_df[key] == anchor_row.loc[index - 1,
                                                                 key]]

        # remove the duplicates from the base frame; they are kept aside in
        # data_frames[1:] and merged back as extra columns below
        duplicated_num = len(temp_df)
        if duplicated_num > 0:
            data_frames[0] = data_frames[0].drop(index=temp_df.index)

        for frame_nums in range(1, duplicated_num + 1):
            if len(data_frames) > frame_nums:
                data_frames[frame_nums] = data_frames[frame_nums].append(
                    temp_df[frame_nums - 1:frame_nums])
            else:
                new_df = temp_df[frame_nums - 1:frame_nums]
                data_frames.append(new_df)

        # skip past the duplicates we just merged (a 'for' loop would reset
        # the counter, so iterate manually)
        index += duplicated_num + 1

    data_frame = data_frames[0]
    for df in data_frames[1:]:
        data_frame = pandas.merge(data_frame,
                                  df,
                                  how='left',
                                  on=origin_df.columns.tolist())

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)

    return
Code Example #14
def merge_rows(file_name,
               keys=None,
               file_url=working_file_url,
               dst_file_url=clean_data_temp_file_url):
    """
    remove duplicated rows.
    :param file_name:
    :param keys:
    :param file_url:
    :param dst_file_url: which file folder should store the result
    :return:
    """
    # (an earlier hand-rolled implementation duplicated merge_rows_by_columns
    # above; pandas' drop_duplicates does the job directly)

    data_frame = file_utils.read_file_to_df(file_url, file_name)
    data_frame = data_frame.drop_duplicates()

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)

    return
Code Example #15
def drop_columns(file_name,
                 columns,
                 file_url=clean_data_temp_file_url,
                 dst_file_url=clean_data_temp_file_url):
    try:
        data_frame = file_utils.read_file_to_df(file_url, file_name)
        data_frame = data_frame.drop(columns, axis=1)

        file_utils.write_file(data_frame,
                              file_utils.check_file_url(dst_file_url),
                              file_name,
                              sheet_name='Sheet',
                              index=False)
    except ValueError as e:
        print('except:', e)
    return
Code Example #16
def drop_unit_with_transfer(file_name,
                            column_name,
                            unit_strs,
                            transfer_map,
                            empty_mask='Unknown',
                            file_url=clean_data_temp_file_url,
                            dst_file_url=clean_data_temp_file_url):
    """

    :type transfer_map: dict
    :type unit_strs: list
    """
    data_frame = file_utils.read_file_to_df(file_url, file_name)
    for index in range(0, len(data_frame)):
        content = data_frame.at[index, column_name]
        if pandas.isnull(content):
            data_frame.set_value(index, column_name, empty_mask)
        for j in range(0, len(unit_strs)):
            if str(content).endswith(unit_strs[j]):
                data_frame.set_value(index, column_name,
                                     str(content).replace(unit_strs[j], ''))

        for key in transfer_map.keys():
            if str(content).endswith(key):
                content = str(content).replace(key, '')
                if not (isinstance(content, float)
                        or isinstance(content, int)):
                    try:
                        content = float(str(content))
                    except AttributeError as ae:
                        print(ae)
                        continue
                    except ValueError as ve:
                        print(ve)
                        continue

                content = content * transfer_map.get(key)
                data_frame.set_value(index, column_name, content)

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)
    return
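A hedged sketch (names are hypothetical): units listed in unit_strs are stripped outright, while a value such as u'3万' matching the transfer map becomes 3.0 * 10**4 = 30000.0:

# Hypothetical call: strip a plain '元' suffix and rescale '万' values.
drop_unit_with_transfer('company_info', 'paid_in_capital',
                        [u'元'], {u'万': 10**4})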
Code Example #17
def data_excel_statistic_info(file_name,
                              init_file_dir=origin_file_url,
                              work_file_dir=working_file_url):
    """
    get the simple statistic info for one data file, the info will be stored under statistic_data_file_url with the
    same file name with file_name, every sheet is one column's info with sheet name be the column's name.
    :param work_file_dir:
    :param init_file_dir:
    :param file_name: the file name of the excel, e.g. data.xls
    :return:
    """
    init_file = init_file_dir + file_name

    if work_file_dir is not None:
        work_file = work_file_dir + file_name
        file_utils.copy_file(init_file, work_file)
        print('copy file: ' + file_name)

    print(init_file)
    data = file_utils.read_file_to_df(init_file_dir, file_name)

    writer = pandas.ExcelWriter(
        unicode(
            file_utils.check_file_url(statistic_data_file_url) + file_name))
    for column in data.columns:
        described_data = data[column].describe()
        print(described_data)
        # sheet names must follow Excel's naming rules, so sanitize them
        m = openpyxl_child.INVALID_TITLE_REGEX.search(column)
        if m:
            for item in m.group():
                scolumn = column.encode('utf-8').replace(
                    item.encode('utf-8'), '-')
                column = scolumn
        if len(unicode(column)) > 10:
            column = unicode(column)[0:10]
        file_utils.write_file_without_save(described_data,
                                           writer,
                                           sheet_name=column,
                                           index=True)
    writer.save()
Code Example #18
def pic_scatter(index_files,
                category_name,
                index_file_url=corporation_index_file_url,
                scatter_url=corporation_index_scatter_file_url):
    """
    scatter picture for each index and the score.
    :param index_files: the index file we need to analyse.
    :param category_name: the category of the index, the images will be stored at this file folder.
    :param index_file_url: file url to index files.
    :param scatter_url: scatter image url to be stored.
    :return:
    """
    fig = plt.figure(figsize=(14, 9))
    for file_n in index_files:
        print(file_n)
        data_frame = fu.read_file_to_df(index_file_url, file_n + '_index')
        for column in data_frame.columns:
            if column == 'Unnamed: 0':
                continue

            plt.title(column)
            plt.xlabel('score')
            plt.ylabel(column)

            x = data_frame['int_score'].tolist()
            y = data_frame[column].tolist()
            xy = list(zip(x, y))

            s = []
            c = np.random.rand(len(xy))
            for xy_item in xy:
                s.append(xy.count(xy_item) * 1.5)
            plt.scatter(x, y, s=s, c=c)

            # plt.show()

            fig.savefig(fu.check_file_url(scatter_url + '/' + category_name +
                                          '/' + file_n + '/') +
                        str(column).replace('/', '-') + '.png',
                        dpi=150)
            plt.clf()
Code Example #19
def merge_number_with_c(file_name,
                        column_name,
                        file_url=clean_data_temp_file_url,
                        dst_file_url=clean_data_temp_file_url):
    """

    :type status_names: list
    :type status: list
    """
    data_frame = file_utils.read_file_to_df(file_url, file_name)
    for index in range(0, len(data_frame)):
        content = data_frame.at[index, column_name]
        if str(content).endswith(u'万'):
            data_frame.set_value(index, column_name, 'Unknown')
        elif str(content).endswith(u'亿'):
            data_frame.set_value(index, column_name, 'Unknown')

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)
Code Example #20
def mark_invalid_num_data(file_name,
                          column_name,
                          operator,
                          thresh_value,
                          error_mask=-1,
                          file_url=clean_data_temp_file_url,
                          dst_file_url=clean_data_temp_file_url):
    """
    Like drop_invalid_data, but mask the offending value with error_mask
    instead of dropping the whole row.
    """
    data_frame = file_utils.read_file_to_df(file_url, file_name)
    for index in range(0, len(data_frame)):
        content = data_frame.at[index, column_name]
        if not (isinstance(content, float) or isinstance(content, int)):
            try:
                content = float(str(content))
            except AttributeError as ae:
                print(ae)
                continue
            except ValueError as ve:
                print(ve)
                continue

        isvalid = True
        if operator == '<':
            isvalid = not (content < thresh_value)
        elif operator == '>':
            isvalid = not (content > thresh_value)
        elif operator == '>=':
            isvalid = not (content >= thresh_value)
        elif operator == '<=':
            isvalid = not (content <= thresh_value)

        if not isvalid:
            data_frame.set_value(index, column_name, error_mask)

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)
    return
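Same threshold logic as drop_invalid_data above, but the offending value is masked in place instead of the whole row being dropped (names are hypothetical):

# Hypothetical call: scores above 100 are treated as errors and masked to -1.
mark_invalid_num_data('company_score', 'score', '>', 100)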
Code Example #21
def drop_rows_too_many_empty(file_name,
                             columns,
                             thresh=2,
                             file_url=clean_data_temp_file_url,
                             dst_file_url=clean_data_temp_file_url):
    """
    drop rows that too many values are empty.
    :param file_name:
    :param columns: the columns we need to check if it is empty
    :param thresh: how many empty is 'too many'
    :param file_url: input file url
    :param dst_file_url: where to store the result
    :return:
    """
    data_frame = file_utils.read_file_to_df(file_url, file_name)
    data_frame = data_frame.dropna(subset=columns, thresh=thresh)

    file_utils.write_file(data_frame,
                          file_utils.check_file_url(dst_file_url),
                          file_name,
                          sheet_name='Sheet',
                          index=False)
    return
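Note the pandas semantics: thresh is the minimum number of non-empty values a row must keep. The column names below come from the comment template in Code Example #6; the file name is hypothetical:

# Drop rows where fewer than 3 of these financial columns are filled.
drop_rows_too_many_empty('company_info',
                         [u'主营业务收入', u'净利润', u'利润总额', u'纳税总额'],
                         thresh=3)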
Code Example #22
def cross_section(file_name,
                  vars,
                  file_url=clean_data_temp_file_url,
                  dst_file_url=corporation_index_file_url):
    """
    指标:所有变量和时间(季度)的交叉项,转化为截面数据
    :param file_name:
    :param vars: 需要和时间交叉的变量集,写成向量
    :param empty_mask:
    :param file_url:
    :param dst_file_url:
    :return:
    """

    """
    调试函数用
    file_name=u'上市公司财务信息-每股指标'
    vars=[u'基本每股收益(元)', u'扣非每股收益(元)',u'稀释每股收益(元)',
        u'每股净资产(元)',u'每股公积金(元)',u'每股未分配利润(元)',u'每股经营现金流(元)']
    file_url = clean_data_temp_file_url
    dst_file_url = corporation_index_file_url
    """

    data_frame = file_utils.read_file_to_df(file_url, file_name)
    date = data_frame[u'日期'.encode('utf-8')]  # the date column
    unique_date = numpy.sort(list(set(date)))  # dedupe and sort chronologically
    # keep only dates from 2016-09 on:
    # for j in range(0, len(unique_date)):
    #     if unique_date[j] < '2016-09':
    #         unique_date[j] = []

    # column names of the new table: cross terms of variable names and dates
    var_date = []
    for i in range(0, len(vars)):
        for j in range(0, len(unique_date)):
            var_date.append(vars[i] + unique_date[j].encode('utf-8'))
    """
    尝试
    a = []
    aa=[u'每股收益',2,3]
    a = pandas.DataFrame(index=[range(1001, 3001)], columns=aa)
#   输出时文件名需要加上'',index=true表示包含第一列
    file_utils.write_file(a, file_utils.check_file_url(dst_file_url), 'a',ext='.xlsx',
                          sheet_name='Sheet', index='true')    

    ab=u'每股收益'
    ab in aa
    a.set_value(company,ab,this_number)
    
    column in var_date
    
    print data_frame.values[0][0]
    """

    # create the empty table
    b = pandas.DataFrame(index=[range(test_start, test_end + 1)],
                         columns=var_date)

    # fill in the values
    for i in range(0, len(vars)):
        for j in range(0, len(data_frame)):  # each row of the original table
            company = data_frame.iloc[j, 0]
            # .at takes the column name, not the column position
            # company = data_frame.at[j, u'企业总评分']
            this_season = data_frame.at[j, u'日期'.encode('utf-8')]
            this_number = data_frame.at[j, vars[i]]
            if this_number != 'Unknown':
                column = vars[i] + this_season.encode('utf-8')
                b.set_value(company, column, this_number)

    file_utils.write_file(b, file_utils.check_file_url(dst_file_url), file_name + '_index', ext='.xlsx',
                          sheet_name='Sheet', index=True)

    """
    空值的处理有点问题
    status_normal = [u'--']  # 搜索满足这个条件的
    status_list = [status_normal]
    status_after = ['Unknown']  # 改成这个
    for i in range(0,len(var_date)):
        dcu.merge_status('b', u'基本每股收益(元)2010-12-31', status_list, status_after, empty_mask='-65535')
    var_date[i]
    """

    """
    Example invocations:
        file_name=u'上市信息财务信息资产负债表'
        vars=[u'资产:货币资金(元)',u'资产:应收账款(元)',u'资产:其它应收款(元)',u'资产:存货(元)',
        u'资产:流动资产合计(元)',u'资产:长期股权投资(元)',u'资产:累计折旧(元)',u'资产:固定资产(元)',u'资产:无形资产(元)',u'资产:资产总计(元)',u'负债:应付账款(元)',
        u'负债:预收账款(元)',u'负债:存货跌价准备(元)',u'负债:流动负债合计(元)',u'负债:长期负债合计(元)',
        u'负债:负债合计(元)',u'权益:实收资本(或股本)(元)',u'权益:资本公积金(元)',u'权益:盈余公积金(元)',u'权益:股东权益合计(元)',u'流动比率']
    import exploratory_data_finance
    
    exploratory_data_finance.cross_section(u'上市公司财务信息-每股指标', [u'基本每股收益(元)'.encode('utf-8'), u'扣非每股收益(元)'.encode('utf-8'), u'稀释每股收益(元)'.encode('utf-8'),
                u'每股净资产(元)'.encode('utf-8'), u'每股公积金(元)'.encode('utf-8'), u'每股未分配利润(元)'.encode('utf-8'), u'每股经营现金流(元)'.encode('utf-8')])
    exploratory_data_finance.cross_section(u'上市信息财务信息-财务风险指标', [u'资产负债率(%)'.encode('utf-8'),u'流动负债/总负债(%)'.encode('utf-8'),u'流动比率'.encode('utf-8'),u'速动比率'.encode('utf-8')])
    exploratory_data_finance.cross_section(u'上市信息财务信息-成长能力指标', [u'营业总收入(元)'.encode('utf-8'),u'毛利润(元)'.encode('utf-8'),u'归属净利润(元)'.encode('utf-8'),
        u'扣非净利润(元)'.encode('utf-8'),u'营业总收入同比增长(元)'.encode('utf-8'),u'归属净利润同比增长(元)'.encode('utf-8'),u'扣非净利润同比增长(元)'.encode('utf-8'),
        u'营业总收入滚动环比增长(元)'.encode('utf-8'),u'归属净利润滚动环比增长(元)'.encode('utf-8'),u'扣非净利润滚动环比增长(元)'.encode('utf-8')])
    exploratory_data_finance.cross_section(u'上市信息财务信息-利润表', [u'营业收入(元)',u'营业成本(元)',u'销售费用(元)',u'财务费用(元)',
       u'管理费用(元)',u'资产减值损失(元)',u'投资收益(元)',u'营业利润(元)',u'利润总额(元)',u'所得税(元)',u'归属母公司所有者净利润(元)'])
    exploratory_data_finance.cross_section(u'上市信息财务信息-现金流量表', [u'经营:销售商品、提供劳务收到的现金(元)',u'经营:收到的税费返还(元)',
                u'经营:收到其他与经营活动有关的现金(元)', u'经营:经营活动现金流入小计(元)',u'经营:购买商品、接受劳务支付的现金(元)',u'经营:支付给职工以及为职工支付的现金(元)',
                u'经营:支付的各项税费(元)',u'经营:支付其他与经营活动有关的现金(元)',u'经营:经营活动现金流出小计(元)',u'经营:经营活动产生的现金流量净额(元)',
                u'投资:取得投资收益所收到的现金(元)',u'投资:处置固定资产、无形资产和其他长期资产收回的现金净额(元)',u'投资:投资活动现金流入小计(元)',
                u'投资:购建固定资产、无形资产和其他长期资产支付的现金(元)',u'投资:投资支付的现金(元)',u'投资:投资活动现金流出小计(元)',
                u'投资:投资活动产生的现金流量净额(元)',u'筹资:吸收投资收到的现金(元)',u'筹资:取得借款收到的现金(元)',u'筹资:筹资活动现金流入小计(元)',
                u'筹资:偿还债务支付的现金(元)',u'筹资:分配股利、利润或偿付利息支付的现金(元)',u'筹资:筹资活动现金流出小计(元)',u'筹资活动产生的现金流量净额(元)'])

    exploratory_data_finance.cross_section(u'上市信息财务信息盈利能力指标', [u'加权净资产收益率(%)',u'摊薄净资产收益率(%)',u'摊薄总资产收益率(%)',u'毛利率(%)',u'净利率(%)',u'实际税率(%)'])
    exploratory_data_finance.cross_section(u'上市信息财务信息运营能力指标', [u'总资产周转率(次)',u'应收账款周转天数(天)',u'存货周转天数(天)'])
    exploratory_data_finance.cross_section(u'上市信息财务信息资产负债表', [u'资产:货币资金(元)',u'资产:应收账款(元)',u'资产:其它应收款(元)',u'资产:存货(元)',
        u'资产:流动资产合计(元)',u'资产:长期股权投资(元)',u'资产:累计折旧(元)',u'资产:固定资产(元)',u'资产:无形资产(元)',u'资产:资产总计(元)',u'负债:应付账款(元)',
        u'负债:预收账款(元)',u'负债:存货跌价准备(元)',u'负债:流动负债合计(元)',u'负债:长期负债合计(元)',
        u'负债:负债合计(元)',u'权益:实收资本(或股本)(元)',u'权益:资本公积金(元)',u'权益:盈余公积金(元)',u'权益:股东权益合计(元)',u'流动比率'])

    """
    return