def main_spider(path, key_name, except_name):
    """Crawl court-document info for each company listed in *path* and store it in MongoDB.

    Args:
        path: file of company names, loaded via ``read_company2``.
        key_name: dedup key handed to ``Item_dump`` — companies already recorded
            (``item_dump()`` truthy) are skipped.  TODO confirm exact semantics
            against Item_dump's implementation.
        except_name: basename of the CSV under ``log/`` that collects companies
            whose page fetch failed.

    Side effects: inserts one document per new company into the
    ``qg_ss.cp_info_2`` collection on the local MongoDB instance; appends
    failed companies to ``log/<except_name>.csv``.
    """
    company_list = read_company2(path)
    # NOTE(review): the client is never closed here; acceptable for a one-shot
    # script, but consider a context manager if this becomes long-running.
    client = pymongo.MongoClient(host='127.0.0.1', port=27017)
    conn = client["qg_ss"]['cp_info_2']
    for company in company_list:
        # Guard clause: skip companies that are already stored.
        if Item_dump(company, key_name).item_dump():
            continue
        cp = Cp_spider(company)
        html_list = cp.get_info()
        if html_list == ["无符合条件的数据"]:
            # Spider sentinel meaning "no matching data" — pass it through as-is.
            cp_list = ["无符合条件的数据"]
        else:
            cp_list = Handel_html(html_list=html_list).run()
        if cp_list == ["获取网页内容时出现异常"]:
            # Fetch/parse failed: log the company name for a later retry.
            # Explicit encoding so Chinese names round-trip on any platform.
            with open('log/{}.csv'.format(except_name), 'a', encoding='utf-8') as f:
                f.write(company + '\n')
        else:
            item = {"company": company, "文书信息": cp_list}
            print(item)
            # item is already a plain dict — the original dict(item) copy was redundant.
            conn.insert_one(item)
            print("保存成功")
def run_text(self):
    """Dry-run variant: crawl each company and print the assembled record.

    Identical to ``run`` minus persistence/dedup — nothing is written anywhere.
    """
    for name in self.company_list:
        spider = Cp_spider(name)
        pages = spider.get_info()
        if pages == ["无符合条件的数据"]:
            # Propagate the spider's "no matching data" sentinel unchanged.
            docs = ["无符合条件的数据"]
        else:
            docs = Handel_html(pages).run()
        record = {
            "company": name,
            "type": self.type,
            "文书信息": docs,
        }
        print(record)
def run(self):
    """Crawl every company not yet stored, print the record, and persist it.

    ``Item_dump(...).item_dump()`` acts as a duplicate check — a truthy result
    means the company is already saved and is skipped.
    """
    for name in self.company_list:
        if Item_dump(name).item_dump():
            continue  # already stored — nothing to do
        pages = Cp_spider(name).get_info()
        if pages == ["无符合条件的数据"]:
            docs = ["无符合条件的数据"]
        else:
            docs = Handel_html(pages).run()
        record = {
            "company": name,
            "type": self.type,
            "文书信息": docs,
        }
        print(record)
        self.save_mongodb(record)
def run(self):
    """Crawl companies not yet stored; log fetch failures, otherwise print and save.

    Companies whose content fetch raised an exception (handler returns the
    sentinel ``["获取网页内容时出现异常"]``) are appended to
    ``log/except_company.csv`` instead of being persisted.
    """
    for name in self.company_list:
        # Dedup check: a truthy item_dump() means this company was saved before.
        if Item_dump(name).item_dump():
            continue
        pages = Cp_spider(name).get_info()
        if pages == ["无符合条件的数据"]:
            docs = ["无符合条件的数据"]
        else:
            docs = Handel_html(html_list=pages).run()
        if docs == ["获取网页内容时出现异常"]:
            # Record failed companies in the exception list for a later retry.
            with open('log/except_company.csv', 'a') as f:
                f.write(name + '\n')
        else:
            record = {
                "company": name,
                "type": self.type,
                "文书信息": docs,
            }
            print(record)
            self.save_mongodb(record)