예제 #1
0
파일: data_run.py 프로젝트: Yunnhan/extract
 def __init__(self):
     """Create the database handle, the extractor and the HTML viewer helper."""
     # Database access object.
     self.db = DataSQL()
     # Information is the main extraction routine; per the original notes, its
     # get_information method returns the content of each extracted field.
     # It lives in algorithm.bid_information.
     self.ifm = Information(pattern)
     # Helper for viewing results: per the original notes, it merges the
     # extraction result with the record's original info content into a single
     # HTML page and opens it in a browser, to help check extraction accuracy.
     self.open_html = See()
예제 #2
0
파일: data_run.py 프로젝트: Yunnhan/extract
class DataRun(object):
    """Batch driver for testing the extraction pipeline.

    A set of record ids is read with an SQL query; the extractor is then run
    on every id and the extracted fields are written to a result table.
    """

    def __init__(self):
        # Database access object used for the id query.
        self.db = DataSQL()
        # Information (algorithm.bid_information) is the main extraction
        # routine; its get_information method is what produces the fields.
        self.ifm = Information(pattern)
        # Viewer helper: per the original notes, it merges the extraction
        # result with the record's info content into one HTML page and opens
        # it in a browser so the extraction accuracy can be checked by eye.
        self.open_html = See()

    def _get_ids(self, sql):
        """Run *sql* and return the first column of every result row as a list."""
        return [row[0] for row in self.db.read_sql(sql)]

    def main_with_open(self):
        """Extract and store the fields of every ``cate_id = 2`` record.

        Each id is handled inside its own ``try`` block: when anything fails
        for one record the exception is printed and the batch moves on.
        """
        source_table = 'stang_cbid'
        target_table = 'stang_bid_extract_zid'

        # Ids of the records that make up the test batch.
        bid_ids = self._get_ids("SELECT id FROM stang_cbid WHERE cate_id = 2")
        # Database handle used to load the records and to write the results.
        conn = DataSQL()
        for bid_id in bid_ids:
            try:
                # Per the original notes, BidData loads the record for this id
                # from the database on construction and exposes id, title,
                # info, cate_id and table_name as attributes, plus helpers
                # such as get_info_text for the tag-stripped info content.
                record = BidData(conn, pattern, bid_id, source_table)
                # The extractor takes the BidData object and returns the fields.
                extracted = self.ifm.get_information(record)
                company, manager = str(extracted[0]), str(extracted[1])

                # Store the extracted fields.
                conn.insert_data_with_table_name(target_table,
                                                 bidid=bid_id,
                                                 first_bidcompany=company,
                                                 manager=manager,
                                                 tablename=source_table)

                # The browser preview through self.open_html is currently
                # disabled; results are only written to the database.
                conn.db.commit()
            except Exception as err:
                print(err)
예제 #3
0
class Res:
    """Run the extractor for a single record id and return its result."""

    def __init__(self):
        # Information (algorithm.bid_information) is the main extraction
        # routine; its get_information method produces the extracted fields.
        self.ifm = Information(pattern)
        # Database connection used to load records.
        self.cnn = DataSQL()

    def main(self, id, table_name):
        """Return the extraction result for record *id* of *table_name*."""
        bid_data = self._load(id, table_name)
        # The extractor is handed the BidData object itself.
        return self.ifm.get_information(bid_data)

    def _load(self, record_id, table_name):
        """Build the BidData object for one record.

        Per the original notes, BidData reads the record from the database on
        construction and exposes id, title, info, cate_id and table_name as
        attributes, plus helpers such as get_info_text for the tag-stripped
        info content.
        """
        return BidData(self.cnn, pattern, record_id, table_name)
예제 #4
0
파일: compute.py 프로젝트: Yunnhan/extract
    # Manual check of the surrounding-cell lookup on a single DataFrame
    # (the printed label reads "nearby lookup, single").
    print(
        '这是附近查找单个',
        dst.get_surrounding_cell_text_from_one_df(df,
                                                  pattern.MORE_KEY_PATTERN))
    # Same check across several DataFrames; the same df is passed twice
    # (the printed label reads "nearby lookup, multiple").
    print(
        '这是附近查找多个',
        dst.get_surrounding_cell_text_from_dfs(list((df, df)),
                                               pattern.FIRST_KEY_PATTERN))
    # Test area
    # import doctest
    # print(doctest.testmod())
    from extraction import pattern
    from algorithm.nlp_algorithm.ltp_algorithm import Ner
    from algorithm.feature_compute.compute import Computation
    from extraction.bid_data import BidData
    from algorithm.create_df.read_data_lib.data_base import DataSQL
    import os
    from algorithm.bid_information import Information
    # Move the working directory up two levels.
    # NOTE(review): presumably so relative paths resolve from the project root - confirm.
    os.chdir(os.pardir)
    os.chdir(os.pardir)
    cnn = DataSQL()
    cpt = Computation(pattern.KEY_PATTERN)
    ifm = Information(pattern)
    # Hand-picked record ids used for this manual run.
    ids = [4000115, 125452, 8000303, 8000296, 8000147]
    for id in ids:
        bid_data = BidData(cnn, pattern, id, 'stang_bid_new')
        try:
            # Only the first DataFrame of each record is inspected.
            print(ifm.cpt.get_df_type_pro(bid_data.get_dfs()[0], True))
        except Exception as e:
            # Best effort: report the failure and continue with the next id.
            print(e)
예제 #5
0
                # 当值为None,设定为空字符串。例如在第一中标候选单位提取程序,会返回None, 这是时候将None转换为空字符串.
                if value is None:
                    value = ''
            except KeyError:
                value = ''
            res[key] = value
        return res

if __name__ == '__main__':
    # Manual smoke test: load a sample Kafka message from disk, wrap it in
    # KafkaData, print what its accessors return and run the extractor on it.
    from extraction import pattern
    from algorithm.bid_information import Information
    import json
    import os
    # The sample file path below is relative to the parent directory.
    os.chdir(os.pardir)
    sample_path = os.path.join('main_data', 'kafka_test_json.txt')
    with open(sample_path) as sample_file:
        raw_text = sample_file.read()
        print(type(raw_text))
    kafka_data = KafkaData(json.loads(raw_text), pattern)
    print(kafka_data.title)
    # Call every accessor in turn and print its result.
    for accessor_name in ('get_info_sequence',
                          'get_info_text',
                          'get_info_soup',
                          'get_tag_sequence',
                          'is_valid',
                          'is_valid_bid'):
        print(getattr(kafka_data, accessor_name)())
    print(kafka_data.cate_id)
    ifm = Information(pattern)
    print(ifm.get_information(kafka_data))
    print()
예제 #6
0
 def __init__(self):
     """Create the extractor and the database connection."""
     # Information is the main extraction routine; per the original notes, its
     # get_information method returns the content of each extracted field.
     # It lives in algorithm.bid_information.
     self.ifm = Information(pattern)
     # Database connection.
     self.cnn = DataSQL()
예제 #7
0
    from extraction.bid_data import BidData
    from algorithm.create_df.read_data_lib.data_base import DataSQL
    import os
    from algorithm.create_df.data_frame_main import DataFramePre
    from algorithm.bid_information import Information
    from algorithm.clean_manager import CleanManager

    # Move the working directory up two levels.
    # NOTE(review): presumably so relative paths resolve from the project root - confirm.
    os.chdir(os.pardir)
    os.chdir(os.pardir)
    cnn = DataSQL()
    cpt = Computation(pattern.KEY_PATTERN)
    ner = Ner()
    clean_manager = CleanManager(pattern)
    cpt_manager = ComputationManager(pattern, ner, clean_manager)
    df_pre = DataFramePre(pattern)
    ifm = Information(pattern)
    # Hand-picked record ids used for this manual run.
    ids = [8905, 8896, 8898, 4000115, 125452, 8000303, 8000296, 8000147]
    for bid_id in ids:
        data = BidData(cnn, pattern, bid_id, 'stang_bid_new')
        first_bidcompany = ifm.get_information(data)[0]
        if not first_bidcompany:
            # No first bid candidate was extracted, so there is nothing to
            # anchor the manager search on. Skipping here also prevents the
            # result of a previous id from being reused (or a NameError on the
            # very first id), which is what happened when the final check ran
            # unconditionally.
            continue
        # For the project manager, the search is not for the target nearest to
        # the keyword but for the one nearest to the first bid candidate.
        res = cpt_manager.get_nearest_target_from_dfs(
            df_pre.get_all_valid_pandas_df(data), cpt_manager.get_location,
            first_bidcompany, cpt_manager.get_manager_loc, None)
        res = clean_manager.get_target_list(res, pattern.MANAGER_PATTERN,
                                            ner.get_persons_from_string)
        if res:
            print(res[0], first_bidcompany)