Beispiel #1
0
 def request(self):
     """Download ``self.url`` and return the response body as text.

     Returns:
         str: decoded response body on success.
         bool: ``False`` when the download fails for any reason.
     """
     try:
         # A timeout keeps the spider from hanging forever on a dead host;
         # requests' Timeout is an Exception subclass, so failures still
         # fall through to the False return below.
         resp = requests.get(url=self.url, headers=HEADERS, timeout=30).text
     except Exception as error:
         # format() calls str() on the exception, same text as __str__().
         print_info("{} 下载失败,错误信息:{}".format(self.url, error))
         resp = False
     return resp
Beispiel #2
0
 def get_post(self, item):
     """Scrape date, source and body from the item's source page and
     store them on the item via ``set_info``.

     Leaves the item untouched when the download or the date-paragraph
     lookup fails.
     """
     page = Download(item.get_info("sourceUrl")).request()
     if page is False:
         return
     try:
         raw_date = page.xpath(
             '//p[@class="margin_top15 c999999 text_cencer"]')[0].text
     except Exception:
         print_info("{} 解析失败".format(item.get_info("sourceUrl")))
         return
     parts = raw_date.split(" ")
     # Keep only non-empty paragraphs of the article body.
     paragraphs = [p.text
                   for p in page.xpath('//div[@class="content-content"]/p')
                   if p.text]
     date = "{} {}".format(parts[0].replace("时间:", ""), parts[1])
     item.set_info({
         "date": date,
         "_id": generate_hash("{}{}".format(item.get_info("title"), date)),
         "source": parts[3].replace("来源:", ""),
         "body": "\n".join(paragraphs),
         "effective": True
     })
Beispiel #3
0
 def get_post(self, item):
     """Scrape date, source and body for a Shenzhen health-commission post.

     PDF links are skipped. On download or parse failure the item is left
     unmodified.
     """
     if item.get_info("sourceUrl").split(".")[-1] == "pdf":
         return
     xml = Download(item.get_info("sourceUrl")).request()
     if xml is False:
         return
     try:
         span = xml.xpath('//div[@class="xxxq_text_tit"][1]/h6/span[2]')[0]
         source_date = ["深圳市卫生健康委员会", span.text.replace("发布日期:", "")]
     except Exception:
         print_info("{} 解析失败".format(item.get_info("sourceUrl")))
         return
     # Keep only non-empty paragraphs of the article body.
     body = [p.text for p in xml.xpath('//div[@class="TRS_Editor"]/p')
             if p.text]
     date = source_date[1]
     item.set_info({
         "date": date,
         "_id": generate_hash("{}{}".format(item.get_info("title"), date)),
         "source": source_date[0],
         "body": "\n".join(body),
         "effective": True
     })
Beispiel #4
0
 def request(self):
     """Download ``self.url`` and parse it into an lxml HTML tree.

     Returns the parsed root element, or ``False`` when the request or
     the UTF-8 decoding fails.
     """
     try:
         response = requests.get(url=self.url, headers=HEADERS)
         tree = etree.HTML(response.content.decode("utf-8"))
     except Exception as error:
         print_info("{} 下载失败,错误信息:{}".format(self.url, error.__str__()))
         tree = False
     return tree
Beispiel #5
0
 def get_rumors(self, type):
     """Fetch every rumor of the given type from the isaaclin nCoV API.

     Returns the decoded JSON payload on HTTP 200, ``False`` otherwise.
     """
     url = ("https://lab.isaaclin.cn/nCoV/api/rumors"
            "?num=all&rumorType={}".format(type))
     resp = requests.get(url=url, headers=HEADERS)
     if resp.status_code == 200:
         return resp.json()
     print_info("Something Wrong, Status Code is: {}".format(
         resp.status_code))
     return False
Beispiel #6
0
 def get_notices(self):
     """Pull the DXY news feed and return its result list.

     Returns ``False`` when the request/JSON decode fails or when the
     payload reports ``success`` as falsy.
     """
     try:
         payload = requests.get(self._url, headers=HEADERS).json()
     except Exception:
         print_info("丁香园新闻信息爬取失败")
         return False
     return payload["results"] if payload["success"] else False
Beispiel #7
0
 def insert(self, document):
     """Insert *document* into the backing MongoDB collection.

     Returns ``True`` on success; ``False`` on a duplicate ``_id`` or on
     any other database error (both are logged).
     """
     try:
         self.__collection.insert_one(document)
     except pymongo.errors.DuplicateKeyError:
         print_info("ID重复:{}".format(str(document)))
         return False
     except Exception:
         print_info("其他错误:{}".format(str(document)))
         return False
     return True
Beispiel #8
0
 def spider(self):
     """Crawl all three rumor types, handing each batch to ``deal_item``.

     A failure for one type is logged and does not stop the remaining
     types.
     """
     # Renamed loop variable: the original shadowed the builtin ``type``.
     # The keyword argument name ``type=`` is part of get_rumors' API and
     # is kept unchanged.
     for rumor_type in range(3):
         try:
             data = self.get_rumors(type=rumor_type)
             if data:
                 self.deal_item(data=data)
         except Exception as error:
             print_info(error)
Beispiel #9
0
 def get_json(self, resp):
     """Strip a ``jsonpNNN(...)`` wrapper from *resp* and parse the JSON.

     Returns the decoded object, or ``False`` (after logging) when the
     remaining text is not valid JSON.
     """
     trailing = re.compile(r"\)$")
     leading = re.compile(r"^jsonp[0-9]+\(")
     # Remove the closing parenthesis first, then the jsonp prefix.
     stripped = self.re_sub(pattern=trailing, string=resp, repl="")
     stripped = self.re_sub(pattern=leading, string=stripped, repl="")
     try:
         return json.loads(stripped)
     except Exception:
         print_info("json解析错误:{}".format(str(resp)))
         return False
Beispiel #10
0
    def get_post(self, item):
        """Extract the rumor body text from the item's source page and
        store it under the ``body`` key."""
        page = Download(item.get_info("sourceUrl")).request()
        if page is False:
            return
        texts = []
        try:
            points = page.xpath('//div[@class="check_content_points"]/ul/li')
            if len(points) > 1:
                # Multi-point rumor: the text sits in each span's tail.
                texts = [li.find("span").tail for li in points
                         if li.find("span").tail]
            else:
                texts.append(points[0].text)
        except Exception:
            print_info("解析错误:{}".format(item.get_info("sourceUrl")))
            return

        item.set_info({"body": "\n".join(texts)})
Beispiel #11
0
 def get_items(self, items):
     """Convert unseen DXY notices into NewsDXYItem objects.

     Args:
         items: list that new NewsDXYItem instances are appended to
             (mutated in place).

     Returns:
         ``False`` when the notice feed could not be fetched; ``None``
         otherwise.
     """
     datas = self.get_notices()
     if datas is False:
         return False
     # Matches the scheme+host prefix of a URL, e.g. "https://example.com".
     r = re.compile(r"^http[s]*://[\w\.]+")
     for data in datas:
         sourceUrl = data["sourceUrl"]
         # Process only URLs not seen before (presumably url_repeat returns
         # True for already-crawled URLs — verify against its definition).
         if self.url_repeat(sourceUrl) is False:
             bodys = data["summary"]
             title = data["title"]
             # pubDate is treated as milliseconds since the epoch.
             date = time.strftime(
                 "%Y-%m-%d %H:%M:%S",
                 time.localtime(int(data["pubDate"]) / 1000))
             source = data["infoSource"]
             provinceName = data["provinceName"]
             provinceId = data["provinceId"]
             _id = generate_hash("{}{}".format(title, date))
             try:
                 host = re.findall(r, sourceUrl)[0]
                 agency = self._hosts[host]
             except KeyError:
                 # Host extracted but missing from the known-hosts table;
                 # ``host`` is always bound here since the lookup raised.
                 print_info("新Host:{}".format(host))
                 agency = "未知"
             except IndexError:
                 # The regex matched nothing, i.e. no scheme+host prefix.
                 print_info("错误Host:{}".format(sourceUrl))
                 agency = "微博"
             update_info = {
                 "_id": _id,
                 "title": title,
                 "sourceUrl": sourceUrl,
                 "agency": agency,
                 "date": date,
                 "source": source,
                 "body": bodys,
                 "provinceName": provinceName,
                 "provinceId": provinceId
             }
             # Create the item and hand it the collected fields.
             item = NewsDXYItem()
             item.set_info(update_info)
             items.append(item)
Beispiel #12
0
"""Spider runner: repeatedly executes every spider listed in SPIDERS."""
from setting import SPIDERS, SLEEP_TIME, LOG
import time
from units import print_info, update_redis
import sys

# Redirect stdout (and therefore print_info output) to the log file.
sys.stdout = open(LOG, "a")

if __name__ == '__main__':
    # Initialise the deduplication queue.
    update_redis()
    # NOTE(review): kept after update_redis() to preserve the original
    # import order of the spider module (it may rely on Redis being ready).
    Spiders = __import__("spider")
    num = 1
    while True:
        if not SPIDERS:
            print_info("未指定爬虫,程序退出")
            break
        for spider_name in SPIDERS:
            print_info("{}开始第{}次运行".format(spider_name, num))
            spider_class = getattr(Spiders, spider_name)
            spider_object = spider_class()
            spider_object.spider()
            print_info("{}第{}次运行结束".format(spider_name, num))
        num += 1
        time.sleep(SLEEP_TIME)