Example no. 1
0
def add_chuansongme_feed(url):
    """
    Parse a chuansongme.com account page and register it as a feed.

    :param url: account page URL
    :return: result of save_feed_to_db on success; None on fetch/parse failure
    """
    rsp = get_with_retry(url)

    if rsp is None or not rsp.ok:
        return None

    response = HtmlResponse(url=url, body=rsp.text, encoding='utf8')

    qrcode = response.selector.xpath("//img[@width='260px']/@src").extract_first()
    # extract_first() returns None when the node is missing; fall back to '' so
    # .strip() never raises AttributeError and the field check below fails cleanly
    avatar = (response.selector.xpath("//img[@class='profile_photo_img']/@src").extract_first() or '').strip()
    brief = (response.selector.xpath(
        "//*[@class='inline_editor_value']/div[@class='inline']/span/text()").extract_first() or '').strip()
    keywords = response.selector.xpath("//meta[@name='keywords']/@content").extract_first()

    cname, name = None, None
    try:
        # AttributeError when keywords is None; ValueError when fewer than 2 items
        cname, name = keywords.split(',')[:2]
    except (AttributeError, ValueError):
        logger.warning(f"标题解析失败:`{keywords}")

    if qrcode and name and avatar and cname and brief:
        return save_feed_to_db(name, cname, qrcode, avatar, brief, url)

    logger.warning(f'字段解析异常:`{url}')
    return None
Example no. 2
0
def add_wemp_feed(url):
    """
    Parse a wemp.app public-account page and register it as a feed.

    :param url: account page URL
    :return: result of save_feed_to_db on success; None on fetch/parse failure
    """
    rsp = get_with_retry(url)

    if rsp is None or not rsp.ok:
        return None

    response = HtmlResponse(url=url, body=rsp.text, encoding='utf8')

    qrcode = response.selector.xpath("//*[@class='mp-info__main']/img[@class='mp-info__qr']/@src").extract_first()
    # guard missing nodes: extract_first() may return None, which would make
    # .strip() raise AttributeError and abort the whole function
    avatar = (response.selector.xpath("//img[@class='post-item__avatar']/@src").extract_first() or '').strip()
    brief = (response.selector.xpath("//meta[@name='description']/@content").extract_first() or '').strip()
    title = (response.selector.xpath("//*[@class='mp-header']/h1/text()").extract_first() or '').strip()

    cname, name = None, None
    try:
        # re.search returns None on no match -> AttributeError caught below
        cname, name = re.search(r'^.+? (.+?)\((.+?)\) .+$', title).groups()
    except AttributeError:
        logger.warning(f"标题解析失败:`{title}")

    if qrcode and name and avatar and cname and brief:
        return save_feed_to_db(name, cname, qrcode, avatar, brief, url)

    logger.warning(f'字段解析异常:`{url}')
    return None
Example no. 3
0
def add_anyv_feed(url):
    """
    Parse an anyv account page and register it as a feed.

    The page provides no QR code or avatar, so the page URL is stored as the
    link and the avatar is left empty.

    :param url: account page URL
    :return: result of save_feed_to_db on success; None on fetch/parse failure
    """
    rsp = get_with_retry(url)

    if rsp is None or not rsp.ok:
        return None

    response = HtmlResponse(url=url, body=rsp.text, encoding='utf8')

    # no QR code or avatar on this site
    avatar, link = '', url

    # guard missing nodes and a missing ':' separator; the original chained
    # .extract_first().strip().split(...)[1] would raise on either
    raw_brief = (response.selector.xpath("//*[@class='user_group']/li[2]/text()").extract_first() or '').strip()
    brief_parts = raw_brief.split(':', 1)
    brief = brief_parts[1].split(', 微信搜索')[0] if len(brief_parts) > 1 else ''

    cname = (response.selector.xpath("//div[@class='subtitle']/h1/a/text()").extract_first() or '').strip()[:-4]
    name = (response.selector.xpath("//*[@class='user_group']/li/a/text()").extract_first() or '').strip().split(':')[-1]

    if not brief:
        brief = cname

    if name and cname and brief:
        return save_feed_to_db(name, cname, link, avatar, brief, url)

    logger.warning(f'字段解析异常:`{url}')
    return None
Example no. 4
0
def add_ershicimi_feed(url):
    """
    Parse an ershicimi.com account page and register it as a feed.

    :param url: account page URL
    :return: result of save_feed_to_db on success; None on fetch/parse failure
    """
    rsp = get_with_retry(url)

    if rsp is None or not rsp.ok:
        return None

    response = HtmlResponse(url=url, body=rsp.text, encoding='utf8')

    qrcode = response.selector.xpath("//img[@class='qr-code']/@src").extract_first()
    # extract_first() returns None for missing nodes; default to '' so .strip()
    # cannot raise AttributeError and the field check below fails cleanly
    cname = (response.selector.xpath(
        "//li[@class='title']//span[@class='name']/text()").extract_first() or '').strip()
    avatar = (response.selector.xpath("//img[@class='avatar']/@src").extract_first() or '').strip()
    brief = (response.selector.xpath(
        "//div[@class='Profile-sideColumnItemValue']/text()").extract_first() or '').strip()

    # keywords may be missing or have fewer than two comma-separated items;
    # the original split(',')[1] raised IndexError in that case
    keywords = response.selector.xpath("//meta[@name='keywords']/@content").extract_first() or ''
    keyword_parts = keywords.split(',')
    name = keyword_parts[1] if len(keyword_parts) > 1 else ''

    if qrcode and name and avatar and cname and brief:
        return save_feed_to_db(name, cname, qrcode, avatar, brief, url)

    logger.warning(f'字段解析异常:`{url}')
    return None
Example no. 5
0
def podcast_spider(site):
    """
    Crawl a podcast RSS source and store new episodes as articles.

    Episodes with an audio enclosure are rendered through the podlove player
    template; entries without audio fall back to the text brief plus cover image.

    :param site: feed model instance (provides rss/cname/brief/favicon/link/id)
    :return: True after processing the feed; None when the fetch fails
    """
    resp = get_with_retry(site.rss)

    if resp is None:
        logger.info(f"RSS 源可能失效了`{site.rss}")
        return None

    feed_obj = feedparser.parse(BytesIO(resp.content))

    for entry in feed_obj.entries:
        # some entries are empty
        if not entry:
            continue

        try:
            title = entry.title
        except AttributeError:
            logger.warning(f'title 获取失败:`{site.rss}')
            continue

        link = entry.get('link') or entry.get('guid')
        if not link:
            logger.warning(f'link 获取失败:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except (KeyError, TypeError):
            author = ''

        audio, img = None, ''
        if entry.get('links'):
            for el in entry['links']:
                # el.get('type') may be None; `in None` would raise TypeError
                if 'audio/' in (el.get('type') or '') or el.get('rel') == 'enclosure':
                    audio = el
                    break

        if entry.get('image'):
            img = entry.image.get('href')

        try:
            brief = entry.content[0].value
        except (AttributeError, IndexError, KeyError):
            brief = entry.get('description') or entry.link

        if audio is not None:
            # build the payload expected by the podlove web player
            episode = {
                "version": 5,
                "show": {
                    "title": site.cname,
                    "subtitle": site.brief,
                    "poster": site.favicon,
                    "link": site.link,
                },
                "title": title,
                "link": link,
                # "subtitle": brief,
                "publicationDate": entry.get('published'),
                "poster": img,
                "duration": to_podcast_duration(entry.get('itunes_duration', '')),
                "audio": [
                    {
                        "url": audio.href,
                        "mimeType": audio.type
                    }
                ]
            }
            episode = json.dumps(episode)
            episode = b64encode(bytes(episode, encoding='UTF8')).decode('UTF8')
            content = podcast_tmpl % episode + brief
        else:
            content = brief + f'''<p></p><img src="{img}">'''

        try:
            uindex = current_ts()

            article = Article(site=site, title=title, author=author, src_url=link, uindex=uindex)
            article.save()

            write_dat2_file(uindex, site.id, content)

            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            # duplicate row: still mark the URL so we stop retrying it
            logger.info(f'数据重复插入:`{title}`{link}')
            mark_crawled_url(link)
        except Exception:
            logger.warning(f'数据插入异常:`{title}`{link}')

    return True
Example no. 6
0
def atom_spider(site):
    """
    Crawl an Atom/RSS source and store new entries as articles.

    Relative image URLs in entry bodies are rewritten to absolute URLs
    based on the entry link before the content is written out.

    :param site: feed model instance (provides rss/star/id/pk)
    :return: True after processing the feed; None when the fetch fails
    """
    resp = get_with_retry(site.rss)

    if resp is None:
        # high-star sources get a louder log level
        if site.star > 9:
            logger.warning(f"RSS 源可能失效了`{site.rss}")
        else:
            logger.info(f"RSS源可能失效了`{site.rss}")
        return None

    content = BytesIO(resp.content)
    feed_obj = feedparser.parse(content)

    for entry in feed_obj.entries:
        # some entries are empty
        if not entry:
            continue

        try:
            title = entry.title
            link = entry.link
        except AttributeError:
            logger.warning(f'必要属性获取失败:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except (KeyError, TypeError):
            author = ''

        try:
            value = entry.content[0].value
        except (AttributeError, IndexError):
            value = None

        if not value:
            value = entry.get('description') or entry.link

        # rewrite relative image src attributes to absolute URLs
        try:
            content_soup = BeautifulSoup(value, "html.parser")

            for img in content_soup.find_all('img'):
                rel_src = img.attrs.get('src')
                abs_src = urllib.parse.urljoin(link, rel_src)
                img.attrs['src'] = abs_src

            value = str(content_soup)
        except Exception:
            logger.warning(f'修复图片路径异常:`{title}`{link}')

        try:
            uindex = current_ts()

            article = Article(site=site, title=title, author=author, src_url=link, uindex=uindex)
            article.save()

            write_dat2_file(uindex, site.id, value)

            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'数据重复插入:`{title}`{link}')
        except Exception:
            logger.warning(f'数据插入异常:`{title}`{link}')

    set_updated_site(site.pk)
    return True
Example no. 7
0
def atom_spider(site):
    """
    Crawl an Atom/RSS source and store new entries as articles.

    Only the 12 newest entries are considered. Relative image URLs are
    rewritten to absolute, and WeChat articles proxied via qnmlgb.tech are
    re-fetched from mp.weixin.qq.com for full content (best effort).

    :param site: feed model instance (provides rss/star/pk)
    :return: True after processing the feed; None when the fetch fails
    """
    resp = get_with_retry(site.rss)

    if resp is None:
        # high-star sources get a louder log level
        if site.star > 9:
            guard_log(f"RSS 源可能失效了`{site.rss}")
        else:
            logger.info(f"RSS源可能失效了`{site.rss}")
        return None

    content = BytesIO(resp.content)
    feed_obj = feedparser.parse(content)

    # only process the newest entries
    for entry in feed_obj.entries[:12]:
        # some entries are empty
        if not entry:
            continue

        try:
            title = entry.title
            link = entry.link
        except AttributeError:
            logger.warning(f'必要属性获取失败:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except (KeyError, TypeError):
            author = ''

        try:
            value = entry.content[0].value
        except (AttributeError, IndexError, KeyError):
            value = entry.get('description') or entry.link

        # rewrite relative image src attributes to absolute URLs
        try:
            content_soup = BeautifulSoup(value, "html.parser")

            for img in content_soup.find_all('img'):
                rel_src = img.attrs.get('src')
                abs_src = urllib.parse.urljoin(link, rel_src)
                img.attrs['src'] = abs_src

            value = str(content_soup)
        except Exception:
            logger.warning(f'修复图片路径异常:`{title}`{link}')

        # second-pass fetch for WeChat articles behind the qnmlgb.tech proxy
        if get_host_name(site.rss) in ('qnmlgb.tech', ):
            if get_host_name(link) in ('mp.weixin.qq.com', ):
                rsp = get_with_proxy(link)

                if rsp is not None and rsp.ok:
                    try:
                        title, author, value = parse_weixin_page(rsp)
                    except Exception:
                        # best effort: keep the feed-provided content on failure
                        pass

        try:
            article = Article(site=site,
                              title=title,
                              author=author,
                              src_url=link,
                              uindex=current_ts(),
                              content=value)
            article.save()

            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'数据重复插入:`{title}`{link}')
        except Exception:
            logger.warning(f'数据插入异常:`{title}`{link}')

    set_updated_site(site.pk)
    return True