def begin_stang_bid_new_job(cnn):
    """Run one incremental extraction pass over the ``stang_bid_new`` table.

    The id range handled is ``[start_id, last_id)``: ``start_id`` is the value
    stored by the previous run and ``last_id`` is the newest id reported by
    ``get_last_id``. After the loop, ``last_id`` is stored as the start of
    the next run.

    Args:
        cnn: Database wrapper; this function uses
            ``cnn.insert_data_with_table_name`` and ``cnn.db.commit``.

    Returns:
        None. Returns early, without storing anything, when there are no
        new ids.

    Raises:
        AssertionError: If the stored start id is larger than the newest id
            in the table (for example after a wrong manual edit of the
            stored start id).
    """
    table = 'stang_bid_new'

    # The end id of the previous run is the start id of this one.
    start_id = load_bid_new_start_id()
    # The newest id in the database is the end id of this run.
    last_id = get_last_id(table, cnn)

    # Guard against a start id that was manually set beyond the newest id.
    if last_id < start_id:
        raise AssertionError('last_id < start_id stang_bid_new')
    # No new rows since the previous run: nothing to do.
    if last_id == start_id:
        return None

    for bid_id in range(start_id, last_id):
        print(bid_id, 'bid_new')
        # BidData loads the row and exposes everything the extractor needs.
        record = BidData(cnn, pattern, bid_id, table)
        # Per the original comment, is_valid_bid() checks that info is not
        # empty and that cate_id is 2 or the '中标' keyword is present.
        if record.is_valid_bid():
            # First winning bidder and project manager.
            company, manager = ifm.get_information(record)
            cnn.insert_data_with_table_name(
                'stang_bid_extract_zid',
                bidid=bid_id,
                first_bidcompany=str(company),
                manager=str(manager),
                tablename=table,
            )
            cnn.db.commit()

    # Store this run's end id as the next run's start id.
    write_bid_new_start_id(last_id)
def main_with_open(self):
    """Batch-extract bid information for every ``stang_cbid`` row with cate_id 2.

    Each id is processed independently: its extraction result is inserted
    into ``stang_bid_extract_zid`` and committed. Any exception raised for
    one id is printed and the loop moves on to the next id.
    """
    source_table = 'stang_cbid'

    # Ids of the batch-test rows.
    id_query = "SELECT id FROM stang_cbid WHERE cate_id = 2"
    bid_ids = self._get_ids(id_query)

    cnn = DataSQL()
    for bid_id in bid_ids:
        try:
            # Per the original author's note, BidData reads the row for this
            # id on construction and exposes id, title, info, cate_id and
            # table_name, plus helpers such as get_info_text.
            record = BidData(cnn, pattern, bid_id, source_table)
            # get_information is the main extraction entry point; it takes
            # the BidData object.
            extracted = self.ifm.get_information(record)
            row = {
                'bidid': bid_id,
                'first_bidcompany': str(extracted[0]),
                'manager': str(extracted[1]),
                'tablename': source_table,
            }
            cnn.insert_data_with_table_name('stang_bid_extract_zid', **row)
            # The browser preview of the merged HTML is currently disabled.
            cnn.db.commit()
        except Exception as e:
            print(e)
def open_nijian_data_summary(df, ifm, cnn, open):
    """Show the extracted summary for a random selection of up to 80 ids.

    Args:
        df: DataFrame with an ``id`` column.
        ifm: Extractor object providing ``get_summary``.
        cnn: Database wrapper passed through to ``BidData``.
        open: Viewer object providing ``open_html`` (this parameter name
            shadows the builtin ``open`` inside this function).
    """
    table = 'stang_bid_new'
    # sample(frac=1) returns all rows in random order; head(80) keeps the
    # first 80 of them.
    picked_ids = df.sample(frac=1)['id'].head(80)
    for row_id in picked_ids:
        record = BidData(cnn, pattern, row_id, table)
        summary = ifm.get_summary(record)
        open.open_html(row_id, table, extra_text=str(summary))
if not data.is_valid(): return None html_dfs = data.get_dfs() if html_dfs is None: return None # 筛选出有效的df html_dfs_filter_obj = filter(lambda df: self.DATA_FRAME_FILTER_KEY_WORDS.findall(df.to_string()), html_dfs) html_dfs = list(html_dfs_filter_obj) if not html_dfs: return None return html_dfs if __name__ == '__main__': from extraction import pattern from algorithm.nlp_algorithm.ltp_algorithm import Ner from algorithm.feature_compute.compute import Computation from extraction.bid_data import BidData from algorithm.create_df.read_data_lib.data_base import DataSQL import os os.chdir(os.pardir) cnn = DataSQL() df_p = DataFramePre(pattern) test_ids = [8897, 23599, 9751767, 8703, 19599] for i in test_ids: bid_data = BidData(cnn, pattern, i, 'stang_bid_new') # print(bid_data.info) print(df_p.get_all_valid_pandas_df(bid_data))
def test_one(self, id, table_name):
    """Return the result of ``get_first_bidmoney`` for a single row.

    Args:
        id: Id of the row to load.
        table_name: Name of the table the row is read from.

    Returns:
        Whatever ``self.ifm.get_first_bidmoney`` returns for the loaded row
        (the original comment describes it as the amount).
    """
    bid = BidData(self.cnn, pattern, id, table_name)
    return self.ifm.get_first_bidmoney(bid)
print( '这是附近查找单个', dst.get_surrounding_cell_text_from_one_df(df, pattern.MORE_KEY_PATTERN)) print( '这是附近查找多个', dst.get_surrounding_cell_text_from_dfs(list((df, df)), pattern.FIRST_KEY_PATTERN)) # 测试位置 # import doctest # print(doctest.testmod()) from extraction import pattern from algorithm.nlp_algorithm.ltp_algorithm import Ner from algorithm.feature_compute.compute import Computation from extraction.bid_data import BidData from algorithm.create_df.read_data_lib.data_base import DataSQL import os from algorithm.bid_information import Information os.chdir(os.pardir) os.chdir(os.pardir) cnn = DataSQL() cpt = Computation(pattern.KEY_PATTERN) ifm = Information(pattern) ids = [4000115, 125452, 8000303, 8000296, 8000147] for id in ids: bid_data = BidData(cnn, pattern, id, 'stang_bid_new') try: print(ifm.cpt.get_df_type_pro(bid_data.get_dfs()[0], True)) except Exception as e: print(e)
from algorithm.sequence_search import SequenceSearch


class Res:
    """Thin wrapper holding a ``SequenceSearch`` built from ``pattern``."""

    def __init__(self, pattern):
        self.sqs = SequenceSearch(pattern)

    def main(self, id, table_name):
        """Forward ``id`` and ``table_name`` to ``self.sqs.get_target``."""
        # NOTE(review): the script below calls get_target with three
        # arguments (sequence, key pattern, extractor) but this passes two.
        # Confirm against SequenceSearch.get_target's signature.
        return self.sqs.get_target(id, table_name)


if __name__ == '__main__':
    import os

    from extraction import pattern
    from algorithm.create_df.read_data_lib.data_base import DataSQL
    from extraction.bid_data import BidData
    from algorithm.nlp_algorithm.ltp_algorithm import Ner

    # Original author's note: this cannot be tested because the model fails
    # to load.
    ner = Ner()

    # Move two directory levels up before opening the database.
    for _ in range(2):
        os.chdir(os.pardir)

    searcher = Res(pattern)
    cnn = DataSQL()
    # This object is not used by the rest of the script.
    bid_data = BidData(cnn, pattern, 23540, 'stang_bid_new')

    sample_ids = [8160283, 10115126, 10080145, 9493171, 10125682, 10125613]
    for bid_id in sample_ids:
        record = BidData(cnn, pattern, bid_id, 'stang_bid_new')
        found = searcher.sqs.get_target(
            record.get_tag_sequence(),
            pattern.FIRST_KEY_PATTERN,
            ner.get_institution_from_string,
        )
        print(found)
def test_summary_main(id, ifm, cnn, open):
    """Return ``ifm.get_summary`` for one ``stang_bid_new`` row.

    Args:
        id: Id of the row to load.
        ifm: Extractor object providing ``get_summary``.
        cnn: Database wrapper passed through to ``BidData``.
        open: Not used by this function.

    Returns:
        Whatever ``ifm.get_summary`` returns for the loaded row.
    """
    record = BidData(cnn, pattern, id, 'stang_bid_new')
    return ifm.get_summary(record)
info_sequence, key_locs, get_target_function) res_next_tem = self._find_target_in_next_position( info_sequence, next_locs, get_target_function) if res_key_tem: res += res_key_tem if res_next_tem: res += res_next_tem if not res: return None return res if __name__ == '__main__': from extraction import pattern from algorithm.nlp_algorithm.ltp_algorithm import Ner from algorithm.feature_compute.compute import Computation from extraction.bid_data import BidData from algorithm.create_df.read_data_lib.data_base import DataSQL import os os.chdir(os.pardir) cnn = DataSQL() bid_data = BidData(cnn, pattern, 23540, 'stang_bid_new') cpt = Computation(pattern.KEY_PATTERN) ner = Ner() sequence_search = SequenceSearch(pattern) print( sequence_search.get_target(bid_data.get_info_sequence(), pattern.FIRST_KEY_PATTERN, ner.get_institution_from_string))
def main(self, id, table_name):
    """Load one row and return its extracted information.

    Args:
        id: Id of the row to load.
        table_name: Name of the table the row is read from.

    Returns:
        Whatever ``self.ifm.get_information`` returns for the loaded row.
    """
    # Per the original author's note, BidData reads the row for this id on
    # construction and exposes id, title, info, cate_id and table_name, plus
    # helpers such as get_info_text.
    record = BidData(self.cnn, pattern, id, table_name)
    # get_information is the main extraction entry point; it takes the
    # BidData object.
    return self.ifm.get_information(record)
text_segs = info_soup.find_all(name=['p', 'div', 'span']) res = [] for each in text_segs: each_text = each.text # 为了解决id为11269279这样的情况,所有的信息为一个大的txt, 所以必须要限定文本的长度。 if key_pattern.findall(each_text) and len(str(each_text)) < 50: target = get_target_function(each.text) if target is not None and target: res += target if not res: return None return res if __name__ == '__main__': from extraction import pattern from algorithm.nlp_algorithm.ltp_algorithm import Ner from algorithm.feature_compute.compute import Computation from extraction.bid_data import BidData from algorithm.create_df.read_data_lib.data_base import DataSQL import os os.chdir(os.pardir) cnn = DataSQL() ner = Ner() bid_data = BidData(cnn, pattern, 23540, 'stang_bid_new') tag_search = TagSearch() print( tag_search.get_target(bid_data.get_info_soup(), pattern.FIRST_KEY_PATTERN, ner.get_institution_from_string))