def __init__(self):
    """Create the collaborators used by the batch test run."""
    # Database access object.
    self.db = DataSQL()
    # Main extraction engine (lives in algorithm.bid_information); its
    # get_information method returns the content of each extracted field.
    self.ifm = Information(pattern)
    # Review helper: per the original notes it merges the extraction result
    # with the record's original info into a single HTML page and opens it
    # in a browser, so extraction accuracy can be checked by eye.
    self.open_html = See()
class DataRun(object):
    """Batch test driver for the extraction pipeline.

    Reads a set of record ids through a SQL query, runs the extractor on
    every record and stores the extracted fields.
    """

    def __init__(self):
        """Create the collaborators used by the batch test run."""
        # Database access object.
        self.db = DataSQL()
        # Main extraction engine (lives in algorithm.bid_information); its
        # get_information method returns the content of each extracted field.
        self.ifm = Information(pattern)
        # Review helper: per the original notes it merges the extraction
        # result with the record's original info into a single HTML page and
        # opens it in a browser, so extraction accuracy can be checked by eye.
        self.open_html = See()

    def _get_ids(self, sql):
        """Run *sql* and return the first column of every row as a list."""
        rows = self.db.read_sql(sql)
        first_column = []
        for row in rows:
            first_column.append(row[0])
        return first_column

    def _extract_and_store(self, conn, bid_id):
        """Extract the fields of one record, insert them and commit."""
        # Per the original notes, BidData loads the record for this id from
        # the database when it is instantiated.
        record = BidData(conn, pattern, bid_id, 'stang_cbid')
        # The extractor takes the BidData object and returns the fields;
        # only the first two entries are persisted here.
        fields = self.ifm.get_information(record)
        conn.insert_data_with_table_name(
            'stang_bid_extract_zid',
            bidid=bid_id,
            first_bidcompany=str(fields[0]),
            manager=str(fields[1]),
            tablename='stang_cbid')
        # Optional visual check, left disabled as in the original:
        # self.open_html.open_html(bid_id, 'stang_bid_new', extra_text=str(fields))
        conn.db.commit()

    def main_with_open(self):
        """Extract and store the fields of every record selected by the query.

        A failure on one record is printed and the run moves on to the next
        record.
        """
        # Ids of the batch under test.
        bid_ids = self._get_ids("SELECT id FROM stang_cbid WHERE cate_id = 2")
        # Separate database object used for loading and writing the records.
        conn = DataSQL()
        for bid_id in bid_ids:
            try:
                self._extract_and_store(conn, bid_id)
            except Exception as err:
                print(err)
class Res:
    """Run the extraction for a single record id and return its result."""

    def __init__(self):
        """Create the extractor and the database object."""
        # Main extraction engine (lives in algorithm.bid_information); its
        # get_information method returns the content of each extracted field.
        self.ifm = Information(pattern)
        # Database connection.
        self.cnn = DataSQL()

    def main(self, id, table_name):
        """Return the extraction result for record *id* of *table_name*."""
        # Per the original notes, BidData loads the record from the database
        # when it is instantiated; the extractor consumes that object.
        record = BidData(self.cnn, pattern, id, table_name)
        return self.ifm.get_information(record)
# Surrounding-cell text lookup: first on one DataFrame, then on a list of
# DataFrames.
print(
    '这是附近查找单个',
    dst.get_surrounding_cell_text_from_one_df(df, pattern.MORE_KEY_PATTERN))
print(
    '这是附近查找多个',
    dst.get_surrounding_cell_text_from_dfs([df, df], pattern.FIRST_KEY_PATTERN))

# Test area (doctest run left disabled):
# import doctest
# print(doctest.testmod())

from extraction import pattern
from algorithm.nlp_algorithm.ltp_algorithm import Ner
from algorithm.feature_compute.compute import Computation
from extraction.bid_data import BidData
from algorithm.create_df.read_data_lib.data_base import DataSQL
import os
from algorithm.bid_information import Information

# Move the working directory up two levels.
os.chdir(os.pardir)
os.chdir(os.pardir)

cnn = DataSQL()
cpt = Computation(pattern.KEY_PATTERN)
ifm = Information(pattern)

# Hard-coded sample ids from the 'stang_bid_new' table.
ids = [4000115, 125452, 8000303, 8000296, 8000147]
for bid_id in ids:
    record = BidData(cnn, pattern, bid_id, 'stang_bid_new')
    try:
        # Only the first DataFrame of the record is inspected.
        first_df = record.get_dfs()[0]
        print(ifm.cpt.get_df_type_pro(first_df, True))
    except Exception as err:
        # Report the failure and continue with the next id.
        print(err)
# When the value is None, replace it with an empty string; e.g. the first-winning-candidate extraction routine may return None, in which case None is converted to an empty string. if value is None: value = '' except KeyError: value = '' res[key] = value return res if __name__ == '__main__': from extraction import pattern from algorithm.bid_information import Information import json import os os.chdir(os.pardir) path = os.path.join('main_data', 'kafka_test_json.txt') with open(path) as f: data = f.read() print(type(data)) dict_data = json.loads(data) test_data_obj = KafkaData(dict_data, pattern) print(test_data_obj.title) print(test_data_obj.get_info_sequence()) print(test_data_obj.get_info_text()) print(test_data_obj.get_info_soup()) print(test_data_obj.get_tag_sequence()) print(test_data_obj.is_valid()) print(test_data_obj.is_valid_bid()) print(test_data_obj.cate_id) ifm = Information(pattern) print(ifm.get_information(test_data_obj)) print()
def __init__(self):
    """Create the extractor and the database object."""
    # Main extraction engine (lives in algorithm.bid_information); its
    # get_information method returns the content of each extracted field.
    self.ifm = Information(pattern)
    # Database connection.
    self.cnn = DataSQL()
from extraction.bid_data import BidData
from algorithm.create_df.read_data_lib.data_base import DataSQL
import os
from algorithm.create_df.data_frame_main import DataFramePre
from algorithm.bid_information import Information
from algorithm.clean_manager import CleanManager

# Move the working directory up two levels.
os.chdir(os.pardir)
os.chdir(os.pardir)

cnn = DataSQL()
cpt = Computation(pattern.KEY_PATTERN)
ner = Ner()
clean_manager = CleanManager(pattern)
cpt_manager = ComputationManager(pattern, ner, clean_manager)
df_pre = DataFramePre(pattern)
ifm = Information(pattern)

# Hard-coded sample ids from the 'stang_bid_new' table.
ids = [8905, 8896, 8898, 4000115, 125452, 8000303, 8000296, 8000147]
for bid_id in ids:
    data = BidData(cnn, pattern, bid_id, 'stang_bid_new')
    # Disabled debugging aid kept from the original:
    # print(cpt_manager.get_manager_loc(df_pre.get_all_valid_pandas_df(data)[0], None))
    first_bidcompany = ifm.get_information(data)[0]
    if not first_bidcompany:
        # Without a first winning bidder there is nothing to anchor on.
        continue
    # For the project manager, search for the candidate closest to the first
    # winning bidder rather than the one closest to the keyword.
    nearest = cpt_manager.get_nearest_target_from_dfs(
        df_pre.get_all_valid_pandas_df(data),
        cpt_manager.get_location,
        first_bidcompany,
        cpt_manager.get_manager_loc,
        None)
    managers = clean_manager.get_target_list(
        nearest, pattern.MANAGER_PATTERN, ner.get_persons_from_string)
    if managers:
        print(managers[0], first_bidcompany)