def request(self):
    try:
        resp = requests.get(url=self.url, headers=HEADERS).text
    except Exception as error:
        print_info("{} 下载失败,错误信息:{}".format(self.url, str(error)))
        resp = False
    return resp
def get_post(self, item): xml = Download(item.get_info("sourceUrl")).request() if xml is False: return try: source_date = xml.xpath( '//p[@class="margin_top15 c999999 text_cencer"]')[0].text except Exception: print_info("{} 解析失败".format(item.get_info("sourceUrl"))) return source_date = source_date.split(" ") body = [] for p in xml.xpath('//div[@class="content-content"]/p'): if p.text: body.append(p.text) date = "{} {}".format(source_date[0].replace("时间:", ""), source_date[1]) update_info = { "date": date, "_id": generate_hash("{}{}".format(item.get_info("title"), date)), "source": source_date[3].replace("来源:", ""), "body": "\n".join(body), "effective": True } item.set_info(update_info)
def get_post(self, item): if item.get_info("sourceUrl").split(".")[-1] == "pdf": return xml = Download(item.get_info("sourceUrl")).request() if xml is False: return try: source_date = xml.xpath( '//div[@class="xxxq_text_tit"][1]/h6/span[2]')[0] source_date = ["深圳市卫生健康委员会", source_date.text.replace("发布日期:", "")] except Exception as e: print_info("{} 解析失败".format(item.get_info("sourceUrl"))) return body = [] for p in xml.xpath('//div[@class="TRS_Editor"]/p'): if p.text: body.append(p.text) else: continue date = source_date[1] update_info = { "date": date, "_id": generate_hash("{}{}".format(item.get_info("title"), date)), "source": source_date[0], "body": "\n".join(body), "effective": True } item.set_info(update_info)
def request(self):
    try:
        resp = requests.get(url=self.url, headers=HEADERS)
        root = etree.HTML(resp.content.decode("utf-8"))
    except Exception as error:
        print_info("{} 下载失败,错误信息:{}".format(self.url, str(error)))
        root = False
    return root
def get_rumors(self, type):
    url = "https://lab.isaaclin.cn/nCoV/api/rumors?num=all&rumorType={}".format(
        type)
    resp = requests.get(url=url, headers=HEADERS)
    if resp.status_code != 200:
        print_info("Something Wrong, Status Code is: {}".format(
            resp.status_code))
        return False
    return resp.json()
def get_notices(self):
    try:
        data_json = requests.get(self._url, headers=HEADERS).json()
    except Exception:
        print_info("丁香园新闻信息爬取失败")
        return False
    if data_json["success"]:
        return data_json["results"]
    else:
        return False
def insert(self, document):
    try:
        self.__collection.insert_one(document)
        return True
    except pymongo.errors.DuplicateKeyError:
        print_info("ID重复:{}".format(str(document)))
        return False
    except Exception:
        print_info("其他错误:{}".format(str(document)))
        return False
def spider(self):
    for type in range(3):
        try:
            data = self.get_rumors(type=type)
            if data:
                self.deal_item(data=data)
            else:
                continue
        except Exception as error:
            print_info(error)
            continue
def get_json(self, resp):
    # Strip the JSONP wrapper "jsonpNNN( ... )" to recover the raw JSON payload.
    last = re.compile(r"\)$")
    front = re.compile(r"^jsonp[0-9]+\(")
    resp_new = self.re_sub(pattern=front,
                           string=self.re_sub(pattern=last, string=resp, repl=""),
                           repl="")
    try:
        return json.loads(resp_new)
    except Exception:
        print_info("json解析错误:{}".format(str(resp)))
        return False
def get_post(self, item): xml = Download(item.get_info("sourceUrl")).request() if xml is False: return bodys = [] try: lis = xml.xpath('//div[@class="check_content_points"]/ul/li') if len(lis) > 1: for li in lis: if li.find("span").tail: bodys.append(li.find("span").tail) else: bodys.append(lis[0].text) except Exception: print_info("解析错误:{}".format(item.get_info("sourceUrl"))) return item.set_info({"body": "\n".join(bodys)})
def get_items(self, items):
    datas = self.get_notices()
    if datas is False:
        return False
    r = re.compile(r"^http[s]*://[\w\.]+")
    for data in datas:
        sourceUrl = data["sourceUrl"]
        if self.url_repeat(sourceUrl) is False:
            bodys = data["summary"]
            title = data["title"]
            date = time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.localtime(int(data["pubDate"]) / 1000))
            source = data["infoSource"]
            provinceName = data["provinceName"]
            provinceId = data["provinceId"]
            _id = generate_hash("{}{}".format(title, date))
            try:
                # Map the URL host to a known news agency.
                host = re.findall(r, sourceUrl)[0]
                agency = self._hosts[host]
            except KeyError:
                print_info("新Host:{}".format(host))
                agency = "未知"
            except IndexError:
                print_info("错误Host:{}".format(sourceUrl))
                agency = "微博"
            update_info = {
                "_id": _id,
                "title": title,
                "sourceUrl": sourceUrl,
                "agency": agency,
                "date": date,
                "source": source,
                "body": bodys,
                "provinceName": provinceName,
                "provinceId": provinceId
            }
            # Create the Item
            item = NewsDXYItem()
            item.set_info(update_info)
            items.append(item)
from setting import SPIDERS, SLEEP_TIME, LOG
import time
from units import print_info, update_redis
import sys

sys.stdout = open(LOG, "a")

if __name__ == '__main__':
    # Initialize the deduplication queue
    update_redis()
    Spiders = __import__("spider")
    num = 1
    while True:
        if len(SPIDERS) == 0:
            print_info("未指定爬虫,程序退出")
            break
        for spider in SPIDERS:
            print_info("{}开始第{}次运行".format(spider, num))
            spider_class = getattr(Spiders, spider)
            spider_object = spider_class()
            spider_object.spider()
            print_info("{}第{}次运行结束".format(spider, num))
        num += 1
        time.sleep(SLEEP_TIME)