Ejemplo n.º 1
0
 def _get_sections(self,
                   url,
                   list_rule,
                   section_rule,
                   book,
                   decode="utf-8"):
     """Scrape the chapter list at *url* and bulk-create NovelSection rows.

     Every node matched by *list_rule* advances the chapter order, but a
     NovelSection is only built when *section_rule* yields an anchor.
     Marks *book* as section-complete when done.
     """
     page = html_to_etree(requests_get(url=url, decode=decode))
     pending = []
     # Order counts every list node, even ones without a usable anchor.
     for order, node in enumerate(page.xpath(list_rule), start=1):
         anchors = node.xpath(section_rule)
         if not anchors:
             continue
         anchor = anchors[0]
         pending.append(
             NovelSection(novel=book,
                          name=anchor.text,
                          url=anchor.xpath("./@href")[0],
                          order=order))
     # Flush whatever was collected in a single bulk insert.
     if pending:
         NovelSection.objects.bulk_create(pending)
     # Flag the book so the section scrape is not repeated.
     book.section_complete = True
     book.save()
Ejemplo n.º 2
0
def grab_real_info(data, service=None):
    """Fetch real-time GPS info, formatting *data* into the service URL.

    Defaults to the eyuyao bus GPS endpoint; *service* must contain a
    ``{data}`` placeholder. Returns the parsed JSON response.
    """
    if service is None:
        service = "http://bm.eyuyao.com/bus/mobile/getGpsInfoCs.php?{data}"
    return requests_get(url=service.format(data=data), j=True)
Ejemplo n.º 3
0
def grab_ajax_data(real_url):
    """Pull the ajax query string out of the page at *real_url*.

    Scans the raw HTML for ``data:`` assignments, then extracts the first
    quoted query-string-looking token. Returns ``None`` when nothing matches.
    """
    import re

    page = requests_get(url=real_url)
    data_parts = re.findall("data:(.*)", page)
    tokens = re.findall(r"\"([a-zA-Z0-9=&]+)", "".join(data_parts))
    return tokens[0] if tokens else None
Ejemplo n.º 4
0
    def list(self, request, *args, **kwargs):
        """Scrape a chapter list for a book and persist it as NovelSection rows.

        Query params: ``url`` (required, chapter-list page), ``host``
        (optional filter for the grab rule), ``book_name`` (book lookup).
        Returns SuccessHR on success, ErrorHR on missing input/book/rule.
        """
        host = request.query_params.get("host")
        url = request.query_params.get("url")
        book_name = request.query_params.get("book_name")
        if not url:
            return ErrorHR("参数url缺失")
        if host:
            # NOTE(review): this mutates self.query_sql in place; if query_sql
            # is a class-level attribute the host filter accumulates across
            # requests — confirm it is reset per request.
            self.query_sql &= Q(host__contains=host)
        book = self.get_novel_entry(book_name=book_name)
        if not book:
            return ErrorHR("不存在该书")
        # Look up the grab rule for this site.
        rule = GraspRule.objects.filter(self.query_sql).first()
        if rule is None:
            # BUGFIX: previously fell through to rule.list_rule and crashed
            # with AttributeError (HTTP 500) when no rule matched.
            return ErrorHR("不存在抓取规则")
        list_rule = rule.list_rule
        section_rule_p = rule.section_rule_p
        section_rule = rule.section_rule
        decode = rule.decode

        res = requests_get(url=url, decode=decode)
        parse_html = html_to_etree(res)
        sections = []
        # Nodes of the chapter list; some may be parent-section headers.
        section_p = parse_html.xpath(list_rule)
        section_p_obj = None
        need_add_obj = []
        order = 0
        for i in section_p:
            # Parent-section header? (check the None guard first so the
            # attrib lookup is skipped when no parent rule is configured)
            if (section_rule_p is not None
                    and dict(i.attrib).get("class") == section_rule_p):
                order = 0
                # Flush chapters collected under the previous parent.
                if need_add_obj:
                    NovelSection.objects.bulk_create(need_add_obj)
                    need_add_obj.clear()
                _name = i.text
                section_p_obj = self.create_section(novel=book, name=_name)
            else:
                # Ordinary chapter entry.
                order += 1
                a = i.xpath(section_rule)
                if a:
                    o = a[0]
                    href = o.xpath("./@href")[0]
                    sec_name = o.text
                    need_add_obj.append(
                        NovelSection(novel=book,
                                     name=sec_name,
                                     url=href,
                                     parent=section_p_obj,
                                     order=order))
        # Flush the trailing batch.
        if need_add_obj:
            NovelSection.objects.bulk_create(need_add_obj)
        return SuccessHR("创建成功")
Ejemplo n.º 5
0
 def _get_book(self, url, rule, decode="utf-8"):
     """Return the active NovelEntry for *url*, scraping and creating it if absent."""
     try:
         # Fast path: the book already exists.
         return NovelEntry.objects.get(is_active=True, url=url)
     except NovelEntry.DoesNotExist:
         # Miss: fetch the page and extract the title via *rule*.
         page = html_to_etree(requests_get(url=url, decode=decode))
         matches = page.xpath(rule)
         title = matches[0].text if matches else ""
         return NovelEntry.objects.create(name=title, url=url)
Ejemplo n.º 6
0
def grab_bus_real_url(raw):
    """Resolve each bus line's live-status URL.

    *raw* is an iterable of dicts with ``id`` and ``url`` (a path relative to
    the eyuyao mobile host). Returns a list of ``{"id", "real_url"}`` dicts,
    skipping entries whose page has no live-status link.
    """
    host = 'http://bm.eyuyao.com/bus/mobile/'
    result = []
    for i in raw:
        pk = i.get("id")
        grab_url = i.get("url")
        # Fetch the line page and pull the live-status link out of the header.
        res = requests_get(url=host + grab_url)
        parse_html = html_to_etree(html_raw=res)
        real_url = parse_html.xpath('/html/body/header/div[2]/a/@href')
        # BUGFIX: removed leftover debug print(real_url) that polluted stdout.
        if real_url:
            result.append({"id": pk, "real_url": real_url[0]})
    return result
Ejemplo n.º 7
0
def grab_bus_real_info(pk, url):
    """Scrape the station list from a bus line's live-status page.

    Returns a list of ``{"id", "station_id", "name"}`` dicts for every
    ``li`` under ``#touchBox`` that carries an ``id`` attribute.
    """
    page = html_to_etree(html_raw=requests_get(url=url))
    stations = []
    for node in page.xpath('//*[@id="touchBox"]/li'):
        ids = node.xpath("./@id")
        if not ids:
            continue
        stations.append({
            "id": pk,
            "station_id": ids[0],
            "name": node.text
        })
    return stations
Ejemplo n.º 8
0
def grab_base_bus():
    """Scrape the base list of bus lines (name + href per line)."""
    url = "http://bm.eyuyao.com/bus/mobile/lineList.php?k=pp&q="
    list_rule = "/html/body/div/ul[@class='list borderNone mbNone']/li/a"

    page = html_to_etree(html_raw=requests_get(url=url))
    # TODO also fetch the reverse-direction lines
    lines = []
    for anchor in page.xpath(list_rule):
        hrefs = anchor.xpath('./@href')
        if not hrefs:
            continue
        lines.append({"name": anchor.text, "href": hrefs[0]})
    return lines
Ejemplo n.º 9
0
Archivo: utils.py Proyecto: MAOA-L/Blog
def parse_content(sections_url, content_rule, decode="utf-8"):
    """Extract the body text of one novel chapter page.

    Fetches *sections_url*, locates the container matched by *content_rule*,
    and joins the tail text of its child elements; children with no tail
    contribute a paragraph break. Returns ``None`` when the rule matches
    nothing.
    """
    # TODO fetch chapter contents with multiple threads
    page = html_to_etree(requests_get(url=sections_url, decode=decode))
    containers = page.xpath(content_rule)
    if not containers:
        return None
    # Text sits in the tails of the container's children (e.g. after <br>);
    # an empty tail is rendered as a blank line between paragraphs.
    return "".join(child.tail if child.tail else "\n\n"
                   for child in containers[0])