def get_post(self, item):
    # Skip PDF attachments; only HTML pages are parsed.
    if item.get_info("sourceUrl").split(".")[-1] == "pdf":
        return
    xml = Download(item.get_info("sourceUrl")).request()
    if xml is False:
        return
    try:
        source_date = xml.xpath(
            '//div[@class="xxxq_text_tit"][1]/h6/span[2]')[0]
        source_date = ["深圳市卫生健康委员会",
                       source_date.text.replace("发布日期:", "")]
    except Exception:
        print_info("{} 解析失败".format(item.get_info("sourceUrl")))  # parse failure
        return
    body = []
    for p in xml.xpath('//div[@class="TRS_Editor"]/p'):
        if p.text:
            body.append(p.text)
    date = source_date[1]
    update_info = {
        "date": date,
        "_id": generate_hash("{}{}".format(item.get_info("title"), date)),
        "source": source_date[0],
        "body": "\n".join(body),
        "effective": True
    }
    item.set_info(update_info)

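# Sketch of the Download contract assumed throughout this file (the real
# class lives elsewhere in the repo): .request() returns an lxml HTML tree
# on success and False on any network or parse failure, which is why every
# caller checks `if xml is False`. Illustrative stand-in only, hence the
# hypothetical name DownloadSketch.
import requests
from lxml import etree

class DownloadSketch:
    def __init__(self, url):
        self._url = url

    def request(self):
        try:
            resp = requests.get(self._url, timeout=10)
            resp.raise_for_status()
            return etree.HTML(resp.content)
        except Exception:
            return False
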
def get_post(self, item):
    xml = Download(item.get_info("sourceUrl")).request()
    if xml is False:
        return
    try:
        source_date = xml.xpath(
            '//p[@class="margin_top15 c999999 text_cencer"]')[0].text
    except Exception:
        print_info("{} 解析失败".format(item.get_info("sourceUrl")))  # parse failure
        return
    # Header text like "时间:<date> <time> ... 来源:<source>", split on spaces.
    source_date = source_date.split(" ")
    body = []
    for p in xml.xpath('//div[@class="content-content"]/p'):
        if p.text:
            body.append(p.text)
    date = "{} {}".format(source_date[0].replace("时间:", ""), source_date[1])
    update_info = {
        "date": date,
        "_id": generate_hash("{}{}".format(item.get_info("title"), date)),
        "source": source_date[3].replace("来源:", ""),
        "body": "\n".join(body),
        "effective": True
    }
    item.set_info(update_info)

def get_post_list(self, url, items):
    xml = Download(url).request()
    if xml is False:
        return
    lis = xml.xpath('//div[@class="section list"][1]/ul/li')
    for li in lis:
        a = li.find("a")
        span = li.find("span")
        if self.url_repeat(a.get("href")) is False:
            item = GDWJWItem()
            item.set_info({
                "title": a.get("title"),
                "sourceUrl": a.get("href"),
                "_id": generate_hash("{}{}".format(a.get("title"), span.text)),
                "agency": "广东省卫健委",
                "date": span.text,
                "effective": True
            })
            items.append(item)

def get_page_num(self):
    xml = Download(self._start_url).request()
    if xml is False:
        return 1
    # The list page renders its pager via an inline script call such as
    # createPageHTML(12, 0, "index", "html"); the first argument is the
    # total page count, which is all we extract here.
    js_func = xml.xpath('//div[@class="zx_ml_list_page"]/script/text()')[0]
    js_func = js_func.replace("createPageHTML(", "").replace(");", "")
    return int(js_func.split(",")[0])

def get_page_num(self):
    xml = Download(self._start_url).request()
    if xml is False:
        return 1
    # Derive the page count from the "last page" link (e.g. index_12.html).
    # Requires `import re` at module top.
    last_url = xml.xpath('//a[@class="last"]')[0].xpath("@href")[0]
    html_names = re.findall(pattern=r"index_\d+\.html", string=last_url)
    if len(html_names) >= 1:
        return int(html_names[0].replace("index_", "").replace(".html", ""))
    return 1

def get_post(self, item):
    xml = Download(item.get_info("sourceUrl")).request()
    if xml is False:
        return
    bodys = []
    try:
        lis = xml.xpath('//div[@class="check_content_points"]/ul/li')
        if len(lis) > 1:
            # Multi-item lists keep their text in the tail of each <span>.
            for li in lis:
                if li.find("span").tail:
                    bodys.append(li.find("span").tail)
        else:
            # Single-item pages keep the text directly on the <li>.
            bodys.append(lis[0].text)
    except Exception:
        print_info("解析错误:{}".format(item.get_info("sourceUrl")))  # parse error
        return
    item.set_info({"body": "\n".join(bodys)})

def get_post_list(self, url, items):
    xml = Download(url).request()
    if xml is False:
        return
    lis = xml.xpath('//div[@class="wendangListC"][1]//li')
    for li in lis:
        date = li.find("strong").text
        a = li.find("a")
        # Expand relative links ("./2020/...") against the section base URL.
        post_url = re.sub(r"^\.", "http://wjw.sz.gov.cn/yqxx", a.get("href"))
        if self.url_repeat(post_url) is False:
            item = SZWJWItem()
            item.set_info({
                "title": a.text,
                "sourceUrl": post_url,
                "_id": generate_hash("{}{}".format(a.text, date)),
                "agency": "深圳卫健委",
                "date": date,
                "effective": True,
                "source": "深圳市卫生健康委员会"
            })
            items.append(item)

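# Minimal crawl-loop sketch showing how the methods above are typically
# wired together (an assumption; the repo's actual driver is not shown
# here). `self._list_url_tpl` is a hypothetical page-URL template, e.g.
# "http://wjw.sz.gov.cn/yqxx/index_{}.html"; get_page_num, get_post_list
# and get_post are the real method names used above.
def crawl(self):
    items = []
    for page in range(self.get_page_num()):
        # Page 0 is the section index; later pages follow index_<N>.html.
        url = self._start_url if page == 0 else self._list_url_tpl.format(page)
        self.get_post_list(url, items)
    for item in items:
        self.get_post(item)  # fill in date, source and body for each new post
    return items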