Example #1
import pymongo

# read_company2, Item_dump, Cp_spider and Handel_html are project-local helpers
# assumed to be imported elsewhere in this module.
def main_spider(path, key_name, except_name):
    company_list = read_company2(path)
    client = pymongo.MongoClient(host='127.0.0.1', port=27017)
    conn = client["qg_ss"]['cp_info_2']
    for company in company_list:
        # Skip companies that have already been scraped.
        i = Item_dump(company, key_name)
        ret = i.item_dump()
        if not ret:
            item = {}
            cp = Cp_spider(company)
            html_list = cp.get_info()
            if html_list == ["无符合条件的数据"]:  # "no matching data"
                cp_list = ["无符合条件的数据"]
            else:
                h = Handel_html(html_list=html_list)
                cp_list = h.run()
            if cp_list == ["获取网页内容时出现异常"]:  # "exception while fetching the page"
                # Record the company in the exception log for a later retry.
                with open('log/{}.csv'.format(except_name), 'a') as f:
                    f.write(company + '\n')
            else:
                item["company"] = company
                item["文书信息"] = cp_list  # "document information"
                print(item)
                conn.insert_one(item)
                print("保存成功")  # "saved successfully"
Example #2
    def run_text(self):
        # Dry run: assemble and print each item without saving it.
        for company in self.company_list:
            item = {}
            cp = Cp_spider(company)
            html_list = cp.get_info()
            if html_list == ["无符合条件的数据"]:  # "no matching data"
                cp_list = ["无符合条件的数据"]
            else:
                h = Handel_html(html_list)
                cp_list = h.run()
            item["company"] = company
            item["type"] = self.type
            item["文书信息"] = cp_list  # "document information"
            print(item)
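
run_text reads self.company_list and self.type, so it lives on a spider class whose constructor is not shown; a minimal skeleton under that assumption (the class and parameter names are hypothetical):

class CpRunner:  # hypothetical name; the real class is not shown
    def __init__(self, company_list, type_label):
        # Companies to query, plus a label stored in each item's "type" field.
        self.company_list = company_list
        self.type = type_label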
Example #3
    def run(self):
        for company in self.company_list:
            # Skip companies that have already been scraped.
            i = Item_dump(company)
            ret = i.item_dump()
            if not ret:
                item = {}
                cp = Cp_spider(company)
                html_list = cp.get_info()
                if html_list == ["无符合条件的数据"]:  # "no matching data"
                    cp_list = ["无符合条件的数据"]
                else:
                    h = Handel_html(html_list)
                    cp_list = h.run()
                item["company"] = company
                item["type"] = self.type
                item["文书信息"] = cp_list  # "document information"
                print(item)
                self.save_mongodb(item)
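
self.save_mongodb is not shown in this example; a plausible sketch, modeled on the pymongo pattern in Example #1 and assuming the connection is created once in the constructor (the database and collection names are copied from Example #1 and are unconfirmed for this class):

import pymongo

class CpRunner:  # hypothetical class, extending the sketch after Example #2
    def __init__(self, company_list, type_label):
        self.company_list = company_list
        self.type = type_label
        # One client per runner instead of one per insert.
        client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.conn = client["qg_ss"]['cp_info_2']

    def save_mongodb(self, item):
        self.conn.insert_one(item)
        print("保存成功")  # "saved successfully", as in Example #1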
Example #4
    def run(self):
        for company in self.company_list:
            # Skip companies that have already been scraped.
            i = Item_dump(company)
            ret = i.item_dump()
            if not ret:
                item = {}
                cp = Cp_spider(company)
                html_list = cp.get_info()
                if html_list == ["无符合条件的数据"]:  # "no matching data"
                    cp_list = ["无符合条件的数据"]
                else:
                    h = Handel_html(html_list=html_list)
                    cp_list = h.run()

                # If fetching the page content raised an exception,
                # record the company in the exception list instead of saving it.
                if cp_list == ["获取网页内容时出现异常"]:  # "exception while fetching the page"
                    with open('log/except_company.csv', 'a') as f:
                        f.write(company + '\n')
                else:
                    item["company"] = company
                    item["type"] = self.type
                    item["文书信息"] = cp_list  # "document information"
                    print(item)
                    self.save_mongodb(item)
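
Across Examples #1, #3 and #4, Item_dump acts as a deduplication check: item_dump() must return something truthy when the company has already been stored, since every caller only scrapes on `if not ret:`. A minimal reconstruction under that assumption; the MongoDB lookup and the default key name are guesses based on Example #1, not the author's implementation:

import pymongo

class Item_dump:
    # Hypothetical reconstruction; the real implementation is not shown.
    def __init__(self, company, key_name='company'):
        self.company = company
        self.key_name = key_name
        client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.conn = client["qg_ss"]['cp_info_2']  # collection from Example #1

    def item_dump(self):
        # Returns the stored document (truthy) if the company already exists,
        # otherwise None, matching the `if not ret:` guard in the callers.
        return self.conn.find_one({self.key_name: self.company})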