def get_post(self, item):
    # Skip PDF attachments; only HTML pages are parsed.
    if item.get_info("sourceUrl").split(".")[-1] == "pdf":
        return
    xml = Download(item.get_info("sourceUrl")).request()
    if xml is False:
        return
    try:
        source_date = xml.xpath(
            '//div[@class="xxxq_text_tit"][1]/h6/span[2]')[0]
        source_date = ["深圳市卫生健康委员会",
                       source_date.text.replace("发布日期:", "")]
    except Exception:
        print_info("{} 解析失败".format(item.get_info("sourceUrl")))  # parse failure
        return
    body = []
    for p in xml.xpath('//div[@class="TRS_Editor"]/p'):
        if p.text:
            body.append(p.text)
    date = source_date[1]
    update_info = {
        "date": date,
        "_id": generate_hash("{}{}".format(item.get_info("title"), date)),
        "source": source_date[0],
        "body": "\n".join(body),
        "effective": True
    }
    item.set_info(update_info)

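# Sketch of the Download contract assumed throughout this file (the real
# class lives elsewhere in the repo): .request() returns an lxml HTML tree
# on success and False on any network or parse failure, which is why every
# caller checks `if xml is False`. Illustrative stand-in only, hence the
# hypothetical name DownloadSketch.
import requests
from lxml import etree

class DownloadSketch:
    def __init__(self, url):
        self._url = url

    def request(self):
        try:
            resp = requests.get(self._url, timeout=10)
            resp.raise_for_status()
            return etree.HTML(resp.content)
        except Exception:
            return False
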
def get_post(self, item):
    xml = Download(item.get_info("sourceUrl")).request()
    if xml is False:
        return
    try:
        source_date = xml.xpath(
            '//p[@class="margin_top15 c999999 text_cencer"]')[0].text
    except Exception:
        print_info("{} 解析失败".format(item.get_info("sourceUrl")))  # parse failure
        return
    # Header text like "时间:<date> <time> ... 来源:<source>", split on spaces.
    source_date = source_date.split(" ")
    body = []
    for p in xml.xpath('//div[@class="content-content"]/p'):
        if p.text:
            body.append(p.text)
    date = "{} {}".format(source_date[0].replace("时间:", ""), source_date[1])
    update_info = {
        "date": date,
        "_id": generate_hash("{}{}".format(item.get_info("title"), date)),
        "source": source_date[3].replace("来源:", ""),
        "body": "\n".join(body),
        "effective": True
    }
    item.set_info(update_info)

def get_post_list(self, url, items):
    xml = Download(url).request()
    if xml is False:
        return
    lis = xml.xpath('//div[@class="section list"][1]/ul/li')
    for li in lis:
        a = li.find("a")
        span = li.find("span")
        if self.url_repeat(a.get("href")) is False:
            item = GDWJWItem()
            item.set_info({
                "title": a.get("title"),
                "sourceUrl": a.get("href"),
                "_id": generate_hash("{}{}".format(a.get("title"), span.text)),
                "agency": "广东省卫健委",
                "date": span.text,
                "effective": True
            })
            items.append(item)

def get_page_num(self):
    xml = Download(self._start_url).request()
    if xml is False:
        return 1
    # The list page renders its pager via an inline script call such as
    # createPageHTML(12, 0, "index", "html"); the first argument is the
    # total page count, which is all we extract here.
    js_func = xml.xpath('//div[@class="zx_ml_list_page"]/script/text()')[0]
    js_func = js_func.replace("createPageHTML(", "").replace(");", "")
    return int(js_func.split(",")[0])

def get_page_num(self):
    xml = Download(self._start_url).request()
    if xml is False:
        return 1
    # Derive the page count from the "last page" link (e.g. index_12.html).
    # Requires `import re` at module top.
    last_url = xml.xpath('//a[@class="last"]')[0].xpath("@href")[0]
    html_names = re.findall(pattern=r"index_\d+\.html", string=last_url)
    if len(html_names) >= 1:
        return int(html_names[0].replace("index_", "").replace(".html", ""))
    return 1

def get_post(self, item):
    xml = Download(item.get_info("sourceUrl")).request()
    if xml is False:
        return
    bodys = []
    try:
        lis = xml.xpath('//div[@class="check_content_points"]/ul/li')
        if len(lis) > 1:
            # Multi-item lists keep their text in the tail of each <span>.
            for li in lis:
                if li.find("span").tail:
                    bodys.append(li.find("span").tail)
        else:
            # Single-item pages keep the text directly on the <li>.
            bodys.append(lis[0].text)
    except Exception:
        print_info("解析错误:{}".format(item.get_info("sourceUrl")))  # parse error
        return
    item.set_info({"body": "\n".join(bodys)})

def get_post_list(self, url, items):
    xml = Download(url).request()
    if xml is False:
        return
    lis = xml.xpath('//div[@class="wendangListC"][1]//li')
    for li in lis:
        date = li.find("strong").text
        a = li.find("a")
        # Expand relative links ("./2020/...") against the section base URL.
        post_url = re.sub(r"^\.", "http://wjw.sz.gov.cn/yqxx", a.get("href"))
        if self.url_repeat(post_url) is False:
            item = SZWJWItem()
            item.set_info({
                "title": a.text,
                "sourceUrl": post_url,
                "_id": generate_hash("{}{}".format(a.text, date)),
                "agency": "深圳卫健委",
                "date": date,
                "effective": True,
                "source": "深圳市卫生健康委员会"
            })
            items.append(item)

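# Minimal crawl-loop sketch showing how the methods above are typically
# wired together (an assumption; the repo's actual driver is not shown
# here). `self._list_url_tpl` is a hypothetical page-URL template, e.g.
# "http://wjw.sz.gov.cn/yqxx/index_{}.html"; get_page_num, get_post_list
# and get_post are the real method names used above.
def crawl(self):
    items = []
    for page in range(self.get_page_num()):
        # Page 0 is the section index; later pages follow index_<N>.html.
        url = self._start_url if page == 0 else self._list_url_tpl.format(page)
        self.get_post_list(url, items)
    for item in items:
        self.get_post(item)  # fill in date, source and body for each new post
    return items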