def parse_detail(html, url):
    """Parse a detail page from the Sichuan Provincial Health Commission site.

    Extracts title, body text, linked URLs and publish time from *html*,
    then persists the record via save().

    :param html: raw HTML of the detail page
    :param url: page URL, stored on the record and used in log messages
    """
    alllog.logger.info("四川省卫健委: %s" % url)
    doc = pq(html)
    data = {}
    data["title"] = doc("title").text()
    data["content"] = doc(".wy_contMain").text().replace("\n", "")
    data["content_url"] = [
        item.attr("href") for item in doc(".wy_contMain a").items()
    ]
    try:
        # Fixed regex typo: the original pattern "(.{4}年.{1,2}月.d{1,2}日)"
        # used "." and ".d" (any char + literal "d") where the digit class
        # "\d" was intended, so it could match garbage or miss dates.
        data["publish_time"] = re.findall(r"(\d{4}年\d{1,2}月\d{1,2}日)", html)[0]
    except IndexError:  # no date on the page; narrowed from a bare except
        data["publish_time"] = ""
        errorlog.logger.error("url:%s 未找到publish_time" % url)
    data["classification"] = "四川省卫健委"
    data["url"] = url
    print(data)
    save(data)
def parse_index(html):
    """Parse an index API response from the Guizhou Provincial Tax Service.

    *html* is a JSON payload; each entry of its "data" list is converted to
    a record and persisted via save().  Items that fail to convert are
    printed and skipped (best-effort, matching the original behaviour).

    :param html: raw JSON response body
    """
    import ast  # local import: only needed for the double-encoded fallback

    # SECURITY: the original code ran eval() on the decoded payload, which
    # executes arbitrary code from a network response.  json.loads already
    # yields the object; if the payload is double-encoded (loads returns a
    # string), ast.literal_eval parses it safely — literals only, no code.
    items = json.loads(html)
    if isinstance(items, str):
        items = ast.literal_eval(items)
    print(items)
    for item in items["data"]:
        try:
            data = {}
            data["title"] = item["subject"]
            data["content"] = item["content"]
            data["content_url"] = ""
            data["publish_time"] = item["publishTime"]
            data["classification"] = "贵州省税务局"
            try:
                data["url"] = item["source"]
            except KeyError:  # no source URL: reconstruct it from entryId
                data["url"] = (
                    "http://guizhou.chinatax.gov.cn/zcwj/content.html?entryId="
                    + item["entryId"]
                )
            print(data)
            save(data)
        except Exception:  # narrowed from bare except; log item and continue
            print(item)
def parse_detail(html, url):
    """Parse a detail page from the Jilin Provincial Medical Products Administration site.

    Extracts title, body text, linked URLs and publish time from *html*,
    falling back to the ".articlecontent3" container when "#div_print" is
    empty, then persists the record via save().

    :param html: raw HTML of the detail page
    :param url: page URL, stored on the record and used in log messages
    """
    alllog.logger.info("吉林省药品监督管理局: %s" % url)
    doc = pq(html)
    data = {}
    data["title"] = doc("title").text()
    data["content"] = doc("#div_print").text().replace("\n", "")
    data["content_url"] = [
        item.attr("href") for item in doc("#div_print a").items()
    ]
    try:
        # Raw string for the regex; [0] raises IndexError when no date found.
        data["publish_time"] = re.findall(r"(\d{4}-\d{1,2}-\d{1,2})", html)[0]
    except IndexError:  # narrowed from a bare except
        data["publish_time"] = ""
        errorlog.logger.error("url:%s 未找到publish_time" % url)
    if not data["content"]:
        # Alternate page layout: content lives in ".articlecontent3".
        data["content"] = doc(".articlecontent3").text()
        data["content_url"] = [
            item.attr("href") for item in doc(".articlecontent3 a").items()
        ]
    data["classification"] = "吉林省药品监督管理局"
    data["url"] = url
    print(data)
    save(data)
def parse_detail(html, url):
    """Parse a detail page from the Shenzhen State-owned Assets Commission site.

    Extracts title, body text, linked URLs and publish time from *html*,
    then persists the record via save().

    :param html: raw HTML of the detail page
    :param url: page URL, stored on the record and used in log messages
    """
    alllog.logger.info("深圳国有资产委员会: %s" % url)
    doc = pq(html)
    data = {}
    data["title"] = doc("title").text()
    data["content"] = doc(".con_nrs").text().replace("\n", "")
    data["content_url"] = [
        item.attr("href") for item in doc(".con_nrs a").items()
    ]
    try:
        # Raw string for the regex; [0] raises IndexError when no date found.
        data["publish_time"] = re.findall(r"(\d{4}-\d{1,2}-\d{1,2})", html)[0]
    except IndexError:  # narrowed from a bare except
        data["publish_time"] = ""
        errorlog.logger.error("url:%s 未找到publish_time" % url)
    data["classification"] = "深圳国有资产委员会"
    data["url"] = url
    print(data)
    save(data)
def parse_detail(html, url):
    """Parse a detail page from the Shaanxi Provincial Health Commission site.

    The title comes from the full document; body text, linked URLs and the
    publish time are extracted from the markup starting at the "#zoom" div.
    The record is persisted via save().

    :param html: raw HTML of the detail page
    :param url: page URL, stored on the record and used in log messages
    """
    alllog.logger.info("陕西省卫健委: %s" % url)
    doc = pq(html)
    data = {}
    data["title"] = doc("title").text()
    # The original called .group(0) on the match unconditionally, raising
    # AttributeError whenever the "#zoom" div is absent.  Guard the match;
    # when found, narrow both the pyquery doc and the publish_time search
    # to the truncated snippet, matching the original behaviour.
    match = re.search(r'<div id="zoom".+', html)
    if match is not None:
        html = match.group(0)
        doc = pq(html)
    data["content"] = doc("#zoom").text().replace("\n", "")
    data["content_url"] = [
        item.attr("href") for item in doc("#zoom a").items()
    ]
    try:
        data["publish_time"] = re.findall(r"(\d{4}-\d{1,2}-\d{1,2})", html)[0]
    except IndexError:  # narrowed from a bare except
        data["publish_time"] = ""
        errorlog.logger.error("url:%s 未找到publish_time" % url)
    data["classification"] = "陕西省卫健委"
    data["url"] = url
    print(data)
    save(data)
def parse_detail(html, url):
    """Parse a detail page from the Xinjiang Department of Education site.

    Tries two title layouts (".list_newxq h3", then ".title h1"), extracts
    body text, linked URLs and publish time, then persists the record via
    save().

    :param html: raw HTML of the detail page
    :param url: page URL, stored on the record and used in log messages
    """
    alllog.logger.info("新疆教育厅: %s" % url)
    doc = pq(html)
    data = {}
    data["title"] = doc(".list_newxq h3").text()
    if not data["title"]:
        # Alternate page layout puts the headline under ".title h1".
        data["title"] = doc(".title h1").text()
    data["content"] = doc(".neirong").text().replace("\n", "")
    data["content_url"] = [
        item.attr("href") for item in doc(".neirong a").items()
    ]
    try:
        # Raw string for the regex; [0] raises IndexError when no date found.
        data["publish_time"] = re.findall(r"(\d{4}-\d{1,2}-\d{1,2})", html)[0]
    except IndexError:  # narrowed from a bare except
        data["publish_time"] = ""
        errorlog.logger.error("url:%s 未找到publish_time" % url)
    data["classification"] = "新疆教育厅"
    data["url"] = url
    print(data)
    save(data)
def parse_detail(html, url):
    """Parse a detail page from the Gansu Provincial Department of Human Resources and Social Security site.

    Extracts title, body text, linked URLs and publish time from *html*,
    falling back to the ".Custom_UnionStyle" container when ".content" is
    empty, then persists the record via save().

    :param html: raw HTML of the detail page
    :param url: page URL, stored on the record and used in log messages
    """
    alllog.logger.info("甘肃省人力资源和社会保障厅: %s" % url)
    doc = pq(html)
    data = {}
    data["title"] = doc(".content h1").text()
    data["content"] = doc(".content").text().replace("\n", "")
    data["content_url"] = [
        item.attr("href") for item in doc(".content a").items()
    ]
    try:
        # Raw string for the regex; [0] raises IndexError when no date found.
        data["publish_time"] = re.findall(r"(\d{4}-\d{1,2}-\d{1,2})", html)[0]
    except IndexError:  # narrowed from a bare except
        data["publish_time"] = ""
        errorlog.logger.error("url:%s 未找到publish_time" % url)
    if not data["content"]:
        data["content"] = doc(".Custom_UnionStyle").text()
        # Consistency fix: the original stored the anchors' .text() here,
        # while every other branch/function stores a list of href values.
        data["content_url"] = [
            item.attr("href") for item in doc(".Custom_UnionStyle a").items()
        ]
    data["classification"] = "甘肃省人力资源和社会保障厅"
    data["url"] = url
    print(data)
    save(data)
def parse_detail(html, url):
    """Parse a detail page from the Anhui State-owned Assets Commission site.

    Tries two title layouts and two content containers (".h-content", then
    ".dicontent_left"), extracts body text, linked URLs and publish time,
    then persists the record via save().

    :param html: raw HTML of the detail page
    :param url: page URL, stored on the record and used in log messages
    """
    alllog.logger.info("安徽省国有资产委员会: %s" % url)
    doc = pq(html)
    data = {}
    data["title"] = doc(".s_article_top h1").text()
    if not data["title"]:
        # Alternate page layout puts the headline under ".dicontent_bt h1".
        data["title"] = doc(".dicontent_bt h1").text()
    data["content"] = doc(".h-content").text().replace("\n", "")
    data["content_url"] = [
        item.attr("href") for item in doc(".h-content a").items()
    ]
    try:
        # Raw string for the regex; [0] raises IndexError when no date found.
        data["publish_time"] = re.findall(r"(\d{4}-\d{1,2}-\d{1,2})", html)[0]
    except IndexError:  # narrowed from a bare except
        data["publish_time"] = ""
        errorlog.logger.error("url:%s 未找到publish_time" % url)
    if not data["content"]:
        # Alternate page layout: content lives in ".dicontent_left".
        data["content"] = doc(".dicontent_left").text().replace("\n", "")
        data["content_url"] = [
            item.attr("href") for item in doc(".dicontent_left a").items()
        ]
    data["classification"] = "安徽省国有资产委员会"
    data["url"] = url
    print(data)
    save(data)