Example #1
                title_set.add(var)  # filter_title
            except KeyError:
                continue
            res_list.append(temp_list)
    res_list = [i for i in res_list if i[1] != i[3]]  # drop self-references (POLICY_ID == FATHER_ID)
    df_blood = pd.DataFrame(res_list, columns=['nums', 'POLICY_ID', 'POLICY_TITLE', 'FATHER_ID', 'FATHER_TITLE'])
    df_blood.to_csv('./test/policy_bj_blood.txt', index=False, header=None, encoding='utf-8', sep='\t')


if __name__ == '__main__':
    # Init LoadData Class
    Load = LoadData()
    FLAG_VERIFY = True   # whether to verify that updated data already exists in the current pool

    # Load Policy Content Data (after run_policy_detail)
    df_content = Load.load_data('./test/policy_bj_content.txt')

    # Extract Father Policy
    df_temp = df_content[['id', 'title', 'content', '政策背景', '支持内容']].copy()
    df_etc = extract_policy(df_temp)

    # Load all policy titles from the local file to match fathers
    # First match against the policy list, then match the remaining unmatched policies against the whole pool
    pool_policy_1 = Load.load_data('./file/policy_content_include.txt')  # policies whose content is included
    # pool_policy_2 = Load.load_data('./file/policy_content_exclude.txt')  # policies whose content is excluded

    if FLAG_VERIFY:
        # Based on the value returned under FLAG_VERIFY, decide whether to update policy_content_include.txt directly or by hand later
        df_etc, pool_policy_new = verify_title(df_etc, pool_policy_1)
        # verify_title(df_etc, pool_policy_2, father_dict)  # same decision for the content-excluded pool: update directly or by hand later
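verify_title is not defined anywhere in this listing. A minimal sketch of what it plausibly does, judging only from the call site: check the extracted father titles against the known policy pool and return the verified rows plus an updated pool. The column names 'title' and 'FATHER_TITLE' are assumptions.

import pandas as pd

def verify_title(df_etc, pool_policy):
    # Assumed behaviour: keep rows whose father title is already in the
    # pool; collect unseen titles so policy_content_include.txt can be
    # refreshed directly or by hand later (see FLAG_VERIFY above).
    known = set(pool_policy['title'])                      # assumed column
    mask = df_etc['FATHER_TITLE'].isin(known)              # assumed column
    new_rows = (df_etc.loc[~mask, ['FATHER_TITLE']]
                .rename(columns={'FATHER_TITLE': 'title'})
                .drop_duplicates())
    pool_policy_new = pd.concat([pool_policy, new_rows], ignore_index=True)
    return df_etc[mask], pool_policy_new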
Example #2

    df_match = pd.DataFrame(res_list,
                            columns=[
                                'nums', 'POLICY_ID', 'POLICY_TITLE',
                                'SIMILARITY_ID', 'SIMILARITY_TITLE',
                                'SIMILARITY_PROB'
                            ])
    df_match.drop_duplicates(keep='first', inplace=True)
    df_match.to_csv('./test/policy_bj_match.txt',
                    index=False,
                    header=None,
                    encoding='utf-8',
                    sep='\t')


if __name__ == '__main__':
    # Init LoadData Class
    Load = LoadData()

    # Load Policy List Data (after run_policy_list)
    df_list = Load.load_data('./test/policy_bj_list.txt')

    # Load all policy titles from the local file to match
    history_policy = Load.load_data(
        './file/policy_content_include.txt')  # policies whose content is included

    df_etc = match_policy(df_list, history_policy)
    dict_policy = trans_dict(df_list, history_policy)

    # Save the lineage (parent-child) relations
    save_blood(df_etc, dict_policy)
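Every example here depends on the LoadData class from load_data, which the listing never shows. A minimal sketch, assuming tab-separated UTF-8 files and that the optional second argument names a column used for de-duplication (as in load_data(path, 'title') below):

import pandas as pd

class LoadData:
    # Stand-in for load_data.LoadData; separator, encoding, header handling
    # and the de-duplication semantics are all assumptions.
    def load_data(self, path, dedup_col=None):
        df = pd.read_csv(path, sep='\t', encoding='utf-8')
        if dedup_col is not None:
            df = df.drop_duplicates(subset=dedup_col, keep='first')
        return df

    def save_data(self, df, path):
        # Mirrors the to_csv calls used throughout the examples.
        df.to_csv(path, index=False, encoding='utf-8', sep='\t')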
Example #3
        temp = re.sub(r'&[a-z]+;', '', temp)  # strip lowercase HTML entities such as &nbsp;
        temp = temp.strip()
        if temp == '':
            del_index.append(index)
    if del_index:
        lines = [i for index, i in enumerate(lines) if index not in del_index]
    each_line = split_data(lines, sep_tag)
    return each_line


if __name__ == '__main__':
    # Init LoadData Class
    Load = LoadData()

    # Load Policy Content List Data (after run_policy_list)
    df_list = Load.load_data('./test/policy_bj_content_list.txt', 'title')
    print(df_list.shape)

    # clear Data
    df_list.loc[:, 'content'] = df_list['content'].map(clear_data)

    # predict Data
    model = Classify()
    df_list = model.paragraph_classify(_df=df_list)
    df_list = df_list[[
        'id', 'title', 'content', '政策背景', '支持内容', '申报条件', '申报材料', '申报方式',
        '其他内容', 'originalLink'
    ]]
    df_list.loc[:, 'content'] = df_list['content'].map(''.join)  # join the paragraph list into one string
    print(df_list.shape)
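The Classify model used above is also not part of this listing. A rough sketch of the interface the example appears to assume: paragraph_classify adds one column per section label and distributes the paragraphs held in 'content' across them. The _predict stub is a hypothetical placeholder, not the real classifier.

class Classify:
    # Interface sketch only; the actual model is not shown.
    LABELS = ['政策背景', '支持内容', '申报条件', '申报材料', '申报方式', '其他内容']

    def paragraph_classify(self, _df):
        for label in self.LABELS:
            _df[label] = ''
        for idx, paragraphs in _df['content'].items():
            for p in paragraphs:          # 'content' is assumed to hold a list of paragraphs
                _df.at[idx, self._predict(p)] += p
        return _df

    def _predict(self, paragraph):
        return '其他内容'                  # placeholder: route everything to "other content"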
Example #4

# coding:utf-8
import numpy as np
import warnings
import clear_data as cd
from load_data import LoadData

warnings.filterwarnings("ignore")
np.random.seed(0)

if __name__ == '__main__':
    # Init LoadData Class
    Load = LoadData()

    # Load zg_bj Data
    df = Load.load_data('./test/clear_zg_bj_0908.txt', 'title')
    # Load.save_data(df, './test/clear_zg_bj_0831.txt')  # save the cleaned data

    # # Load All Data
    # df_match = Load.load_data('./clear/clear_zg_all_0831.txt', 'title')
    # # Load.save_data(df_match, './test/clear_zg_all_0831.txt')  # save the data
    #
    # nums = df_match.shape[0]
    cols = [
        'title', 'level', 'scope', 'source', 'province', 'city', 'county',
        'power', 'funds', 'set', 'txt_id', 'originalLink', 'start_time',
        'end_time', 'content'
    ]
    df_ = df[cols].copy()

    # Fill province, city, county
    df_fill = cd.fill_data(df_)
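
clear_data.fill_data is not shown either. A minimal sketch, assuming it merely normalises the empty region fields; the real inference logic (for instance deriving city and county from 'source') is not in the listing.

import pandas as pd

def fill_data(df):
    # Assumed behaviour: replace empty province/city/county values with a
    # placeholder so downstream code never sees missing region fields.
    region_cols = ['province', 'city', 'county']
    out = df.copy()
    out[region_cols] = out[region_cols].replace('', pd.NA).fillna('unknown')
    return out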