Exemple #1
0
def atom_spider(site):
    """
    Refresh a site's content: fetch its RSS/Atom feed and store up to
    10 entries that have not been crawled yet.

    :param site: site model instance exposing ``rss`` (feed URL) and ``star``.
    :return: ``None`` when the feed could not be fetched.
    """
    try:
        # verify=False: tolerate feeds served with broken TLS certificates
        resp = requests.get(site.rss, timeout=30, verify=False)
    except requests.RequestException:
        # A failing high-star feed is worth a louder log level
        if site.star >= 9:
            logger.warning(f"RSS源可能失效了`{site.rss}")
        else:
            logger.info(f"RSS源可能失效了`{site.rss}")
        return None

    content = BytesIO(resp.content)
    feed_obj = feedparser.parse(content)

    for entry in feed_obj.entries[:10]:
        try:
            title = entry.title
            link = entry.link
        except AttributeError:
            logger.warning(f'必要属性获取失败:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        # Author is optional; truncate to 20 chars when present
        try:
            author = entry['author'][:20]
        except (KeyError, TypeError):
            author = None

        # Prefer full content; fall back to description, then the link itself
        try:
            value = entry.content[0].value
        except (AttributeError, IndexError):
            value = entry.get('description') or entry.link

        # Rewrite relative <img> src attributes into absolute URLs
        try:
            content_soup = BeautifulSoup(value, "html.parser")

            for img in content_soup.find_all('img'):
                rel_src = img.attrs.get('src')
                abs_src = urllib.parse.urljoin(link, rel_src)
                img.attrs['src'] = abs_src

            value = str(content_soup)
        except Exception:
            # Best-effort fix only; keep the original markup on failure
            logger.warning(f'修复图片路径异常:`{title}`{link}')

        try:
            article = Article(site=site, title=title, author=author, src_url=link, uindex=current_ts(),
                              content=value)
            article.save()
            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            # Row already exists; duplicates are informational only
            logger.info(f'数据重复插入:`{title}`{link}')
        except Exception:
            logger.warning(f'数据插入异常:`{title}`{link}')
Exemple #2
0
def wemp_spider(urls, site):
    """
    Crawl WeChat official-account article pages and persist them.

    :param urls: iterable of article URLs to fetch.
    :param site: site model instance the articles belong to.
    :return: None
    """
    for url in urls:
        if is_crawled_url(url):
            continue

        try:
            logger.info(f'开始爬取公众号地址:`{url}')
            rsp = requests.get(url, timeout=10)

            if rsp.ok:
                response = HtmlResponse(url=url,
                                        body=rsp.text,
                                        encoding='utf8')

                # extract_first() returns None when the node is missing;
                # guard with '' so .strip() cannot raise AttributeError
                title = (response.selector.xpath(
                    '//h2[@id="activity-name"]/text()').extract_first() or '').strip()
                content = (response.selector.xpath(
                    '//div[@id="js_content"]').extract_first() or '').strip()

                try:
                    author = response.selector.xpath('//span[@id="js_author_name"]/text()').\
                        extract_first().strip()
                except AttributeError:
                    # Alternate page layout keeps the author in a different node
                    author = (response.selector.xpath(
                        '//a[@id="js_name"]/text()').extract_first() or '').strip()

                if title and content:
                    # WeChat lazy-loads images via data-src; promote it to src
                    content_soup = BeautifulSoup(content, "html.parser")
                    for img in content_soup.find_all('img'):
                        if img.attrs.get('data-src'):
                            img.attrs['src'] = img.attrs['data-src']

                    article = Article(title=title,
                                      author=author,
                                      site=site,
                                      uindex=current_ts(),
                                      content=str(content_soup),
                                      src_url=url)
                    article.save()

                    mark_crawled_url(url)
                else:
                    logger.warning(f'公众号内容解析异常:`{title}`{author}`{content}')
        except (ConnectTimeout, HTTPError, ReadTimeout, Timeout,
                ConnectionError):
            logger.warning(f'公众号爬取出现网络异常:`{url}')
        except Exception:
            logger.warning(f'公众号爬取出现未知异常:`{url}')
Exemple #3
0
def parse_detail_page(job):
    """
    Parse a previously fetched article page and persist it.

    Dispatches to a site-specific parser based on ``job.action`` (20-24) or
    on the redirected host, then saves the Article and writes its content
    blob to a separate data file.

    :param job: crawl-job object carrying ``url``, ``rsp`` (response body),
        ``rsp_url`` (final URL after redirects), ``action`` (parser id),
        ``site`` and ``site_id``.
    :return: int status code — 2 saved OK, 4 parse failure,
        6 content removed/blocked, 7 DB insert error.
    """
    response = HtmlResponse(url=job.url, body=job.rsp, encoding='utf8')
    title, author, content = None, None, None

    # Host of the final (post-redirect) URL decides which parser applies
    host = get_host_name(job.rsp_url)

    if job.action == 20 or settings.MPWX_HOST in host:
        try:
            # Weixin renders a weui-msg banner when the post was removed
            # or blocked for policy reasons
            if response.selector.xpath("//div[@class='weui-msg__text-area']").extract_first():
                mark_crawled_url(job.url, job.rsp_url)
                logger.info(f"内容违规或删除:`{job.url}")
                return 6
        except:
            pass

        title, author, content = parse_mpwx_detail_page(response)

        if job.action != 20 and settings.MPWX_HOST in host:
            logger.info(f"跳转到微信原文:`{job.url}`{job.rsp_url}`{title}")

    elif job.action == 21:
        title, author, content = parse_ershicimi_detail_page(response)
    elif job.action == 22:
        title, author, content = parse_wemp_detail_page(response)
    elif job.action == 23:
        title, author, content = parse_chuansongme_detail_page(response)
    elif job.action == 24:
        title, author, content = parse_anyv_detail_page(response)

    # Mark as crawled regardless of parse outcome so the URL is not retried
    mark_crawled_url(job.url, job.rsp_url)

    if title is None:
        logger.warning(f"页面解析失败:`{title}`{job.url}")
        return 4
    else:
        try:
            uindex = current_ts()

            article = Article(title=title, author=author, site=job.site, uindex=uindex, src_url=job.url)
            article.save()

            # Content body is stored outside the DB, keyed by (uindex, site_id)
            write_dat2_file(uindex, job.site_id, content)
        except:
            logger.warning(f"插入文章异常:`{title}`{job.site}`{job.url}")
            return 7

        return 2
Exemple #4
0
def wemp_spider(url, site):
    """
    Crawl one WeChat article, dispatching by the resolved host
    (mp.weixin.qq.com or an ershicimi.com mirror), and persist it.

    :param url: article URL.
    :param site: site model instance the article belongs to.
    :return: None
    """
    if is_crawled_url(url):
        return

    try:
        rsp = requests.get(url, timeout=10)

        if rsp.ok:
            try:
                # rsp.url is the final URL after any redirects
                if get_host_name(rsp.url) == 'mp.weixin.qq.com':
                    title, author, content = parse_weixin_page(rsp)
                elif 'ershicimi.com' in get_host_name(rsp.url):
                    title, author, content = parse_ershicimi_page(rsp)
                else:
                    logger.warning(f'公众号域名解析异常:`{rsp.url}')
                    return
            except Exception:
                logger.info(f'公众号内容解析异常:`{rsp.url}')
                return

            article = Article(title=title,
                              author=author,
                              site=site,
                              uindex=current_ts(),
                              content=content,
                              src_url=url)
            article.save()

            mark_crawled_url(url)
    except (ConnectTimeout, HTTPError, ReadTimeout, Timeout, ConnectionError):
        logger.warning(f'公众号爬取出现网络异常:`{url}')
    except Exception:
        logger.warning(f'公众号爬取出现未知异常:`{url}')
Exemple #5
0
def wemp_spider(url, site):
    """
    Crawl one WeChat article through a proxy; supports both the direct
    weixin domain and ershicimi mirrors.

    :param url: article URL.
    :param site: site model instance the article belongs to.
    :return: None
    """
    if is_crawled_url(url):
        return

    rsp = get_with_proxy(url)
    if rsp is None:
        return

    if rsp.ok:
        try:
            # rsp.url is the final URL after any redirects
            if get_host_name(rsp.url) == 'mp.weixin.qq.com':
                title, author, content = parse_weixin_page(rsp)
            elif 'ershicimi.com' in get_host_name(rsp.url):
                title, author, content = parse_ershicimi_page(rsp)
            else:
                logger.warning(f'公众号域名解析异常:`{rsp.url}')
                return
        except Exception:
            logger.info(f'公众号内容解析异常:`{rsp.url}')
            return

        article = Article(title=title,
                          author=author,
                          site=site,
                          uindex=current_ts(),
                          content=content,
                          src_url=url)
        article.save()

        mark_crawled_url(url)
Exemple #6
0
def update_all_user_feed():
    """
    Scheduled task: refresh all user-created sites.

    Update frequency follows a 4-hour macro cycle keyed on the current
    hour: hour%4==0 updates every active feed, ==2 only star>=20,
    ==3 only star>=9, and ==1 is a rest slot (no updates).
    """
    logger.info('开始运行定时更新RSS任务')

    now = datetime.datetime.now()

    if now.hour % 4 == 0:
        feeds = Site.objects.filter(status='active',
                                    creator='user').order_by('-star')
    elif now.hour % 4 == 1:
        feeds = []
    elif now.hour % 4 == 2:
        feeds = Site.objects.filter(status='active',
                                    creator='user',
                                    star__gte=20).order_by('-star')
    else:
        # hour % 4 == 3; plain `else` guarantees `feeds` is always bound
        feeds = Site.objects.filter(status='active',
                                    creator='user',
                                    star__gte=9).order_by('-star')

    for site in feeds:
        try:
            resp = requests.get(site.rss, timeout=30, verify=False)
        except requests.RequestException:
            # A failing high-star feed is worth a louder log level
            if site.star >= 9:
                logger.warning(f"RSS源可能失效了`{site.rss}")
            else:
                logger.info(f"RSS源可能失效了`{site.rss}")
            continue

        content = BytesIO(resp.content)
        feed_obj = feedparser.parse(content)

        for entry in feed_obj.entries[:10]:
            try:
                title = entry.title
                link = entry.link
            except AttributeError:
                logger.warning(f'必要属性获取失败:`{site.rss}')
                continue

            if is_crawled_url(link):
                continue

            try:
                author = entry['author'][:11]
            except (KeyError, TypeError):
                author = None

            # Prefer full content; fall back to description, then the link
            try:
                value = entry.content[0].value
            except (AttributeError, IndexError):
                value = entry.get('description') or entry.link

            try:
                article = Article(site=site,
                                  title=title,
                                  author=author,
                                  src_url=link,
                                  uindex=current_ts(),
                                  content=value)
                article.save()
                mark_crawled_url(link)
            except django.db.utils.IntegrityError:
                logger.info(f'数据重复插入:`{title}`{link}')
            except Exception:
                logger.warning(f'数据插入异常:`{title}`{link}')
    logger.info('定时更新RSS任务运行结束')
Exemple #7
0
def podcast_spider(site):
    """
    Refresh a podcast feed: fetch ``site.rss`` and store new episodes,
    embedding a podlove web-player payload when an audio enclosure exists.

    :param site: site model instance (``rss``, ``cname``, ``brief``,
        ``favicon``, ``link``, ``id``).
    :return: ``None`` on fetch failure, ``True`` otherwise.
    """
    resp = get_with_retry(site.rss)

    if resp is None:
        logger.info(f"RSS 源可能失效了`{site.rss}")
        return None

    feed_obj = feedparser.parse(BytesIO(resp.content))

    for entry in feed_obj.entries:
        # Some feeds contain empty entries
        if not entry:
            continue

        try:
            title = entry.title
        except AttributeError:
            logger.warning(f'title 获取失败:`{site.rss}')
            continue

        link = entry.get('link') or entry.get('guid')
        if not link:
            logger.warning(f'link 获取失败:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except (KeyError, TypeError):
            author = ''

        # Locate the audio enclosure and the episode artwork
        audio, img = None, ''
        if entry.get('links'):
            for el in entry['links']:
                # el.get('type') can be None; guard it before the substring
                # test, which would otherwise raise TypeError
                if 'audio/' in (el.get('type') or '') or el.get('rel') == 'enclosure':
                    audio = el
                    break

        if entry.get('image'):
            img = entry.image.get('href')

        try:
            brief = entry.content[0].value
        except (AttributeError, IndexError):
            brief = entry.get('description') or entry.link

        if audio is not None:
            # Build the JSON payload consumed by the podlove web player
            episode = {
                "version": 5,
                "show": {
                    "title": site.cname,
                    "subtitle": site.brief,
                    "poster": site.favicon,
                    "link": site.link,
                },
                "title": title,
                "link": link,
                # "subtitle": brief,
                "publicationDate": entry.get('published'),
                "poster": img,
                "duration": to_podcast_duration(entry.get('itunes_duration', '')),
                "audio": [
                    {
                        "url": audio.href,
                        "mimeType": audio.type
                    }
                ]
            }
            episode = json.dumps(episode)
            episode = b64encode(bytes(episode, encoding='UTF8')).decode('UTF8')
            content = podcast_tmpl % episode + brief
        else:
            # No audio found; fall back to the brief plus artwork
            content = brief + f'''<p></p><img src="{img}">'''

        try:
            uindex = current_ts()

            article = Article(site=site, title=title, author=author, src_url=link, uindex=uindex)
            article.save()

            write_dat2_file(uindex, site.id, content)

            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            # Duplicate row: still mark as crawled so we stop retrying
            logger.info(f'数据重复插入:`{title}`{link}')
            mark_crawled_url(link)
        except Exception:
            logger.warning(f'数据插入异常:`{title}`{link}')

    return True
Exemple #8
0
def atom_spider(site):
    """
    Pull the feed at ``site.rss`` and persist every entry that has not
    been crawled yet, then record the site as updated.

    :param site: site model instance (``rss``, ``star``, ``id``, ``pk``).
    :return: ``None`` when the feed could not be fetched, ``True`` otherwise.
    """
    resp = get_with_retry(site.rss)

    if resp is None:
        if site.star > 9:
            logger.warning(f"RSS 源可能失效了`{site.rss}")
        else:
            logger.info(f"RSS源可能失效了`{site.rss}")
        return None

    feed_obj = feedparser.parse(BytesIO(resp.content))

    for item in feed_obj.entries:
        # Some feeds contain empty entries
        if not item:
            continue

        try:
            title, link = item.title, item.link
        except AttributeError:
            logger.warning(f'必要属性获取失败:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = item['author'][:20]
        except:
            author = ''

        try:
            body = item.content[0].value
        except (AttributeError, IndexError):
            body = None

        if not body:
            body = item.get('description') or item.link

        # Rewrite relative <img> src attributes into absolute URLs
        try:
            soup = BeautifulSoup(body, "html.parser")
            for image in soup.find_all('img'):
                image.attrs['src'] = urllib.parse.urljoin(link, image.attrs.get('src'))
            body = str(soup)
        except:
            logger.warning(f'修复图片路径异常:`{title}`{link}')

        try:
            uindex = current_ts()
            record = Article(site=site, title=title, author=author, src_url=link, uindex=uindex)
            record.save()
            write_dat2_file(uindex, site.id, body)
            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'数据重复插入:`{title}`{link}')
        except:
            logger.warning(f'数据插入异常:`{title}`{link}')

    set_updated_site(site.pk)
    return True
Exemple #9
0
def atom_spider(site):
    """
    Refresh a site's content: fetch its RSS/Atom feed and store up to
    12 entries that have not been crawled yet, re-fetching original
    WeChat pages for known proxy feeds.

    :param site: site model instance (``rss``, ``star``, ``pk``).
    :return: ``None`` when the feed could not be fetched, ``True`` otherwise.
    """
    resp = get_with_retry(site.rss)

    if resp is None:
        # High-star feeds go through guard_log; others are informational
        if site.star > 9:
            guard_log(f"RSS 源可能失效了`{site.rss}")
        else:
            logger.info(f"RSS源可能失效了`{site.rss}")
        return None

    content = BytesIO(resp.content)
    feed_obj = feedparser.parse(content)

    for entry in feed_obj.entries[:12]:
        # Some feeds contain empty entries
        if not entry:
            continue

        try:
            title = entry.title
            link = entry.link
        except AttributeError:
            logger.warning(f'必要属性获取失败:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        # Author is optional; truncate to 20 chars when present
        try:
            author = entry['author'][:20]
        except:
            author = ''

        # Prefer full content; fall back to description, then the link itself
        try:
            value = entry.content[0].value
        except:
            value = entry.get('description') or entry.link

        # Rewrite relative <img> src attributes into absolute URLs
        try:
            content_soup = BeautifulSoup(value, "html.parser")

            for img in content_soup.find_all('img'):
                rel_src = img.attrs.get('src')
                abs_src = urllib.parse.urljoin(link, rel_src)
                img.attrs['src'] = abs_src

            value = str(content_soup)
        except:
            logger.warning(f'修复图片路径异常:`{title}`{link}')

        # Second-pass fetch for WeChat proxy feeds: this host only serves
        # stubs, so pull the original mp.weixin.qq.com page via proxy and,
        # on success, replace title/author/value with the parsed originals
        if get_host_name(site.rss) in ('qnmlgb.tech', ):
            if get_host_name(link) in ('mp.weixin.qq.com', ):
                rsp = get_with_proxy(link)

                if rsp is not None and rsp.ok:
                    try:
                        title, author, value = parse_weixin_page(rsp)
                    except:
                        # Best-effort: keep the feed-provided content
                        pass
        try:
            article = Article(site=site,
                              title=title,
                              author=author,
                              src_url=link,
                              uindex=current_ts(),
                              content=value)
            article.save()

            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'数据重复插入:`{title}`{link}')
        except:
            logger.warning(f'数据插入异常:`{title}`{link}')

    set_updated_site(site.pk)
    return True