# Third-party imports used by the spiders below; project-level helpers such as
# Article, Site, current_ts, logger, guard_log, is_crawled_url, mark_crawled_url,
# get_host_name, parse_weixin_page, settings, podcast_tmpl and set_updated_site
# are assumed to be imported from the surrounding project (their modules are not
# shown in this section). HtmlResponse is assumed to be scrapy's response class.
import datetime
import json
import urllib.parse
from base64 import b64encode
from io import BytesIO

import django.db.utils
import feedparser
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectTimeout, ConnectionError, HTTPError, ReadTimeout, Timeout
from scrapy.http import HtmlResponse


def atom_spider(site):
    """
    Update the content of a feed source
    """
    try:
        resp = requests.get(site.rss, timeout=30, verify=False)
    except:
        if site.star >= 9:
            logger.warning(f"RSS源可能失效了`{site.rss}")
        else:
            logger.info(f"RSS源可能失效了`{site.rss}")
        return None

    content = BytesIO(resp.content)
    feed_obj = feedparser.parse(content)

    for entry in feed_obj.entries[:10]:
        try:
            title = entry.title
            link = entry.link
        except AttributeError:
            logger.warning(f'必要属性获取失败:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except:
            author = None

        try:
            value = entry.content[0].value
        except:
            value = entry.get('description') or entry.link

        # rewrite image src attributes to absolute URLs
        try:
            content_soup = BeautifulSoup(value, "html.parser")
            for img in content_soup.find_all('img'):
                rel_src = img.attrs.get('src')
                abs_src = urllib.parse.urljoin(link, rel_src)
                img.attrs['src'] = abs_src
            value = str(content_soup)
        except:
            logger.warning(f'修复图片路径异常:`{title}`{link}')

        try:
            article = Article(site=site, title=title, author=author, src_url=link, uindex=current_ts(),
                              content=value)
            article.save()
            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'数据重复插入:`{title}`{link}')
        except:
            logger.warning(f'数据插入异常:`{title}`{link}')
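# The deduplication helpers is_crawled_url / mark_crawled_url are referenced
# throughout this section but not shown. A minimal sketch of what they might
# look like, assuming the Django cache framework is used to remember crawled
# links; the real project may use a different store (e.g. Redis or a table).
import hashlib

from django.core.cache import cache

CRAWLED_KEY_PREFIX = 'crawled'  # hypothetical cache key prefix


def _crawled_key(url):
    # hash the URL so arbitrary-length links fit into a cache key
    return f"{CRAWLED_KEY_PREFIX}:{hashlib.md5(url.encode('utf-8')).hexdigest()}"


def is_crawled_url(url):
    return cache.get(_crawled_key(url)) is not None


def mark_crawled_url(url, *extra_urls):
    # mark the original URL and any redirected URLs as crawled
    for u in (url, ) + extra_urls:
        cache.set(_crawled_key(u), 1, timeout=None)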
def wemp_spider(urls, site):
    """
    Crawl WeChat official account content
    :param urls:
    :param site:
    :return:
    """
    for url in urls:
        if is_crawled_url(url):
            continue

        try:
            logger.info(f'开始爬取公众号地址:`{url}')
            rsp = requests.get(url, timeout=10)

            if rsp.ok:
                response = HtmlResponse(url=url, body=rsp.text, encoding='utf8')

                title = response.selector.xpath('//h2[@id="activity-name"]/text()').extract_first().strip()
                content = response.selector.xpath('//div[@id="js_content"]').extract_first().strip()

                try:
                    author = response.selector.xpath('//span[@id="js_author_name"]/text()').extract_first().strip()
                except:
                    author = response.selector.xpath('//a[@id="js_name"]/text()').extract_first().strip()

                if title and content:
                    content_soup = BeautifulSoup(content, "html.parser")

                    # WeChat lazy-loads images via data-src; copy it into src
                    for img in content_soup.find_all('img'):
                        if img.attrs.get('data-src'):
                            img.attrs['src'] = img.attrs['data-src']

                    article = Article(title=title, author=author, site=site, uindex=current_ts(),
                                      content=str(content_soup), src_url=url)
                    article.save()
                    mark_crawled_url(url)
                else:
                    logger.warning(f'公众号内容解析异常:`{title}`{author}`{content}')
        except (ConnectTimeout, HTTPError, ReadTimeout, Timeout, ConnectionError):
            logger.warning(f'公众号爬取出现网络异常:`{url}')
        except:
            logger.warning(f'公众号爬取出现未知异常:`{url}')
def parse_detail_page(job):
    response = HtmlResponse(url=job.url, body=job.rsp, encoding='utf8')
    title, author, content = None, None, None

    # check the domain we were redirected to
    host = get_host_name(job.rsp_url)

    if job.action == 20 or settings.MPWX_HOST in host:
        try:
            if response.selector.xpath("//div[@class='weui-msg__text-area']").extract_first():
                mark_crawled_url(job.url, job.rsp_url)
                logger.info(f"内容违规或删除:`{job.url}")
                return 6
        except:
            pass

        title, author, content = parse_mpwx_detail_page(response)

        if job.action != 20 and settings.MPWX_HOST in host:
            logger.info(f"跳转到微信原文:`{job.url}`{job.rsp_url}`{title}")
    elif job.action == 21:
        title, author, content = parse_ershicimi_detail_page(response)
    elif job.action == 22:
        title, author, content = parse_wemp_detail_page(response)
    elif job.action == 23:
        title, author, content = parse_chuansongme_detail_page(response)
    elif job.action == 24:
        title, author, content = parse_anyv_detail_page(response)

    mark_crawled_url(job.url, job.rsp_url)

    if title is None:
        logger.warning(f"页面解析失败:`{title}`{job.url}")
        return 4
    else:
        try:
            uindex = current_ts()
            article = Article(title=title, author=author, site=job.site, uindex=uindex, src_url=job.url)
            article.save()
            write_dat2_file(uindex, job.site_id, content)
        except:
            logger.warning(f"插入文章异常:`{title}`{job.site}`{job.url}")
            return 7

    return 2
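# write_dat2_file persists the article body outside the database, keyed by the
# article's uindex and the site id. The real .dat2 layout is not part of this
# section; the following is only a sketch, assuming one gzip-compressed HTML
# file per article under a per-site directory (paths and format are guesses).
import gzip
import os

DAT2_ROOT = '/data/articles'  # hypothetical storage root


def write_dat2_file(uindex, site_id, content):
    site_dir = os.path.join(DAT2_ROOT, str(site_id))
    os.makedirs(site_dir, exist_ok=True)

    with gzip.open(os.path.join(site_dir, f'{uindex}.dat2'), 'wb') as fp:
        fp.write(content.encode('utf-8'))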
def wemp_spider(url, site):
    """
    Crawl WeChat official account content
    :param url:
    :param site:
    :return:
    """
    if is_crawled_url(url):
        return

    try:
        rsp = requests.get(url, timeout=10)

        if rsp.ok:
            try:
                if get_host_name(rsp.url) == 'mp.weixin.qq.com':
                    title, author, content = parse_weixin_page(rsp)
                elif 'ershicimi.com' in get_host_name(rsp.url):
                    title, author, content = parse_ershicimi_page(rsp)
                else:
                    logger.warning(f'公众号域名解析异常:`{rsp.url}')
                    return
            except:
                logger.info(f'公众号内容解析异常:`{rsp.url}')
                return

            article = Article(title=title, author=author, site=site, uindex=current_ts(), content=content,
                              src_url=url)
            article.save()
            mark_crawled_url(url)
    except (ConnectTimeout, HTTPError, ReadTimeout, Timeout, ConnectionError):
        logger.warning(f'公众号爬取出现网络异常:`{url}')
    except:
        logger.warning(f'公众号爬取出现未知异常:`{url}')
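# parse_weixin_page is shared by the wemp_spider variants and by atom_spider
# further below, but its body is not part of this section. A sketch reusing the
# same XPath expressions as the inline version of wemp_spider above; the exact
# behaviour of the real helper may differ.
from scrapy.http import HtmlResponse


def parse_weixin_page(rsp):
    response = HtmlResponse(url=rsp.url, body=rsp.text, encoding='utf8')

    title = response.selector.xpath('//h2[@id="activity-name"]/text()').extract_first('').strip()
    content = response.selector.xpath('//div[@id="js_content"]').extract_first('').strip()

    author = response.selector.xpath('//span[@id="js_author_name"]/text()').extract_first()
    if not author:
        author = response.selector.xpath('//a[@id="js_name"]/text()').extract_first('')
    author = author.strip()

    return title, author, content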
def wemp_spider(url, site):
    """
    Crawl WeChat official account content; supports either the WeChat domain
    directly or the ershicimi domain
    :param url:
    :param site:
    :return:
    """
    if is_crawled_url(url):
        return

    rsp = get_with_proxy(url)

    if rsp is None:
        return

    if rsp.ok:
        try:
            if get_host_name(rsp.url) == 'mp.weixin.qq.com':
                title, author, content = parse_weixin_page(rsp)
            elif 'ershicimi.com' in get_host_name(rsp.url):
                title, author, content = parse_ershicimi_page(rsp)
            else:
                logger.warning(f'公众号域名解析异常:`{rsp.url}')
                return
        except:
            logger.info(f'公众号内容解析异常:`{rsp.url}')
            return

        article = Article(title=title, author=author, site=site, uindex=current_ts(), content=content,
                          src_url=url)
        article.save()
        mark_crawled_url(url)
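# get_with_proxy is used where a direct request to mp.weixin.qq.com is likely to
# be blocked. A minimal sketch, assuming a single HTTP proxy; PROXY_URL is a
# hypothetical placeholder for project configuration not shown here. It returns
# None on any network error so the callers above can simply bail out.
import requests

PROXY_URL = 'http://127.0.0.1:8001'  # hypothetical proxy address


def get_with_proxy(url, timeout=10):
    proxies = {'http': PROXY_URL, 'https': PROXY_URL}

    try:
        return requests.get(url, timeout=timeout, proxies=proxies)
    except requests.exceptions.RequestException:
        return None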
def update_all_user_feed():
    """
    Update all user-created sites
    """
    logger.info('开始运行定时更新RSS任务')

    now = datetime.datetime.now()

    # update at different frequencies, with 4 hours as one full cycle
    if now.hour % 4 == 0:
        feeds = Site.objects.filter(status='active', creator='user').order_by('-star')
    elif now.hour % 4 == 1:
        feeds = []
    elif now.hour % 4 == 2:
        feeds = Site.objects.filter(status='active', creator='user', star__gte=20).order_by('-star')
    elif now.hour % 4 == 3:
        feeds = Site.objects.filter(status='active', creator='user', star__gte=9).order_by('-star')

    for site in feeds:
        try:
            resp = requests.get(site.rss, timeout=30, verify=False)
        except:
            if site.star >= 9:
                logger.warning(f"RSS源可能失效了`{site.rss}")
            else:
                logger.info(f"RSS源可能失效了`{site.rss}")
            continue

        content = BytesIO(resp.content)
        feed_obj = feedparser.parse(content)

        for entry in feed_obj.entries[:10]:
            try:
                title = entry.title
                link = entry.link
            except AttributeError:
                logger.warning(f'必要属性获取失败:`{site.rss}')
                continue

            if is_crawled_url(link):
                continue

            try:
                author = entry['author'][:11]
            except:
                author = None

            try:
                value = entry.content[0].value
            except:
                value = entry.get('description') or entry.link

            try:
                article = Article(site=site, title=title, author=author, src_url=link, uindex=current_ts(),
                                  content=value)
                article.save()
                mark_crawled_url(link)
            except django.db.utils.IntegrityError:
                logger.info(f'数据重复插入:`{title}`{link}')
            except:
                logger.warning(f'数据插入异常:`{title}`{link}')

    logger.info('定时更新RSS任务运行结束')
def podcast_spider(site):
    """
    Update the content of a podcast feed source
    """
    resp = get_with_retry(site.rss)

    if resp is None:
        logger.info(f"RSS 源可能失效了`{site.rss}")
        return None

    feed_obj = feedparser.parse(BytesIO(resp.content))

    for entry in feed_obj.entries:
        # some entries are empty
        if not entry:
            continue

        try:
            title = entry.title
        except AttributeError:
            logger.warning(f'title 获取失败:`{site.rss}')
            continue

        link = entry.get('link') or entry.get('guid')
        if not link:
            logger.warning(f'link 获取失败:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except:
            author = ''

        audio, img = None, ''

        if entry.get('links'):
            for el in entry['links']:
                if 'audio/' in el.get('type') or el.get('rel') == 'enclosure':
                    audio = el
                    break

        if entry.get('image'):
            img = entry.image.get('href')

        try:
            brief = entry.content[0].value
        except:
            brief = entry.get('description') or entry.link

        if audio is not None:
            # build the data structure required by the podlove player
            episode = {
                "version": 5,
                "show": {
                    "title": site.cname,
                    "subtitle": site.brief,
                    "poster": site.favicon,
                    "link": site.link,
                },
                "title": title,
                "link": link,
                # "subtitle": brief,
                "publicationDate": entry.get('published'),
                "poster": img,
                "duration": to_podcast_duration(entry.get('itunes_duration', '')),
                "audio": [
                    {
                        "url": audio.href,
                        "mimeType": audio.type
                    }
                ]
            }
            episode = json.dumps(episode)
            episode = b64encode(bytes(episode, encoding='UTF8')).decode('UTF8')

            content = podcast_tmpl % episode + brief
        else:
            content = brief + f'''<p></p><img src="{img}">'''

        try:
            uindex = current_ts()
            article = Article(site=site, title=title, author=author, src_url=link, uindex=uindex)
            article.save()
            write_dat2_file(uindex, site.id, content)
            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'数据重复插入:`{title}`{link}')
            mark_crawled_url(link)
        except:
            logger.warning(f'数据插入异常:`{title}`{link}')

    return True
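# to_podcast_duration normalizes the feed's <itunes:duration> value for the
# podlove episode data built above. A sketch under the assumption that the
# input is either plain seconds ("3723") or a colon-separated "MM:SS" /
# "HH:MM:SS" string, and that the player wants "HH:MM:SS"; unparseable input
# falls back to "00:00:00".
def to_podcast_duration(duration):
    duration = (duration or '').strip()

    try:
        if ':' in duration:
            parts = [int(p) for p in duration.split(':')]
            while len(parts) < 3:
                parts.insert(0, 0)
            hours, minutes, seconds = parts[-3:]
            total = hours * 3600 + minutes * 60 + seconds
        else:
            total = int(float(duration))
    except ValueError:
        return '00:00:00'

    return '%02d:%02d:%02d' % (total // 3600, (total % 3600) // 60, total % 60)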
def atom_spider(site):
    """
    Update the content of a feed source
    """
    resp = get_with_retry(site.rss)

    if resp is None:
        if site.star > 9:
            logger.warning(f"RSS 源可能失效了`{site.rss}")
        else:
            logger.info(f"RSS源可能失效了`{site.rss}")
        return None

    content = BytesIO(resp.content)
    feed_obj = feedparser.parse(content)

    for entry in feed_obj.entries:
        # some entries are empty
        if not entry:
            continue

        try:
            title = entry.title
            link = entry.link
        except AttributeError:
            logger.warning(f'必要属性获取失败:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except:
            author = ''

        try:
            value = entry.content[0].value
        except (AttributeError, IndexError):
            value = None

        if not value:
            value = entry.get('description') or entry.link

        # rewrite image src attributes to absolute URLs
        try:
            content_soup = BeautifulSoup(value, "html.parser")
            for img in content_soup.find_all('img'):
                rel_src = img.attrs.get('src')
                abs_src = urllib.parse.urljoin(link, rel_src)
                img.attrs['src'] = abs_src
            value = str(content_soup)
        except:
            logger.warning(f'修复图片路径异常:`{title}`{link}')

        try:
            uindex = current_ts()
            article = Article(site=site, title=title, author=author, src_url=link, uindex=uindex)
            article.save()
            write_dat2_file(uindex, site.id, value)
            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'数据重复插入:`{title}`{link}')
        except:
            logger.warning(f'数据插入异常:`{title}`{link}')

    set_updated_site(site.pk)
    return True
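# get_with_retry replaces the plain requests.get call used in the earlier
# versions above. A minimal sketch, assuming a couple of attempts with SSL
# verification disabled (matching the earlier requests.get(..., verify=False))
# and None returned once all attempts fail; the real retry policy is not shown.
import requests


def get_with_retry(url, timeout=30, retries=2):
    for _ in range(retries):
        try:
            return requests.get(url, timeout=timeout, verify=False)
        except requests.exceptions.RequestException:
            continue

    return None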
def atom_spider(site):
    """
    Update the content of a feed source
    """
    resp = get_with_retry(site.rss)

    if resp is None:
        if site.star > 9:
            guard_log(f"RSS 源可能失效了`{site.rss}")
        else:
            logger.info(f"RSS源可能失效了`{site.rss}")
        return None

    content = BytesIO(resp.content)
    feed_obj = feedparser.parse(content)

    for entry in feed_obj.entries[:12]:
        # some entries are empty
        if not entry:
            continue

        try:
            title = entry.title
            link = entry.link
        except AttributeError:
            logger.warning(f'必要属性获取失败:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except:
            author = ''

        try:
            value = entry.content[0].value
        except:
            value = entry.get('description') or entry.link

        # rewrite image src attributes to absolute URLs
        try:
            content_soup = BeautifulSoup(value, "html.parser")
            for img in content_soup.find_all('img'):
                rel_src = img.attrs.get('src')
                abs_src = urllib.parse.urljoin(link, rel_src)
                img.attrs['src'] = abs_src
            value = str(content_soup)
        except:
            logger.warning(f'修复图片路径异常:`{title}`{link}')

        # second-pass fetch of the original article for WeChat RSS proxies
        if get_host_name(site.rss) in ('qnmlgb.tech', ):
            if get_host_name(link) in ('mp.weixin.qq.com', ):
                rsp = get_with_proxy(link)

                if rsp is not None and rsp.ok:
                    try:
                        title, author, value = parse_weixin_page(rsp)
                    except:
                        pass

        try:
            article = Article(site=site, title=title, author=author, src_url=link, uindex=current_ts(),
                              content=value)
            article.save()
            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'数据重复插入:`{title}`{link}')
        except:
            logger.warning(f'数据插入异常:`{title}`{link}')

    set_updated_site(site.pk)
    return True