def wemp_spider(urls, site):
    """
    Crawl WeChat official account content
    :param urls:
    :param site:
    :return:
    """
    for url in urls:
        if is_crawled_url(url):
            continue
        try:
            logger.info(f'开始爬取公众号地址:`{url}')
            rsp = requests.get(url, timeout=10)
            if rsp.ok:
                response = HtmlResponse(url=url, body=rsp.text, encoding='utf8')
                title = response.selector.xpath('//h2[@id="activity-name"]/text()').extract_first().strip()
                content = response.selector.xpath('//div[@id="js_content"]').extract_first().strip()
                try:
                    author = response.selector.xpath('//span[@id="js_author_name"]/text()').extract_first().strip()
                except:
                    author = response.selector.xpath('//a[@id="js_name"]/text()').extract_first().strip()
                if title and content:
                    # Rewrite lazy-loaded images so they render outside WeChat
                    content_soup = BeautifulSoup(content, "html.parser")
                    for img in content_soup.find_all('img'):
                        if img.attrs.get('data-src'):
                            img.attrs['src'] = img.attrs['data-src']
                    article = Article(title=title, author=author, site=site, uindex=current_ts(),
                                      content=str(content_soup), src_url=url)
                    article.save()
                    mark_crawled_url(url)
                else:
                    logger.warning(f'公众号内容解析异常:`{title}`{author}`{content}')
        except (ConnectTimeout, HTTPError, ReadTimeout, Timeout, ConnectionError):
            logger.warning(f'公众号爬取出现网络异常:`{url}')
        except:
            logger.warning(f'公众号爬取出现未知异常:`{url}')
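# Hypothetical sketch (not part of the source): is_crawled_url / mark_crawled_url are used
# throughout these spiders but not defined in this section. One plausible implementation keeps
# a Redis set of URL digests for de-duplication; the key name and connection details below
# are assumptions, not the project's actual helpers.
import hashlib

import redis

_redis = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
_CRAWLED_URLS_KEY = 'crawled:urls'


def _url_digest(url):
    # Hash the URL so the set stores fixed-size members
    return hashlib.md5(url.encode('utf-8')).hexdigest()


def is_crawled_url(url):
    # True if the URL has been processed before
    return _redis.sismember(_CRAWLED_URLS_KEY, _url_digest(url))


def mark_crawled_url(*urls):
    # Record one or more URLs, e.g. the request URL and the redirected URL
    for url in urls:
        _redis.sadd(_CRAWLED_URLS_KEY, _url_digest(url))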
def parse_detail_page(job):
    response = HtmlResponse(url=job.url, body=job.rsp, encoding='utf8')
    title, author, content = None, None, None
    # Check the host after redirects
    host = get_host_name(job.rsp_url)
    if job.action == 20 or settings.MPWX_HOST in host:
        try:
            if response.selector.xpath("//div[@class='weui-msg__text-area']").extract_first():
                mark_crawled_url(job.url, job.rsp_url)
                logger.info(f"内容违规或删除:`{job.url}")
                return 6
        except:
            pass
        title, author, content = parse_mpwx_detail_page(response)
        if job.action != 20 and settings.MPWX_HOST in host:
            logger.info(f"跳转到微信原文:`{job.url}`{job.rsp_url}`{title}")
    elif job.action == 21:
        title, author, content = parse_ershicimi_detail_page(response)
    elif job.action == 22:
        title, author, content = parse_wemp_detail_page(response)
    elif job.action == 23:
        title, author, content = parse_chuansongme_detail_page(response)
    elif job.action == 24:
        title, author, content = parse_anyv_detail_page(response)
    mark_crawled_url(job.url, job.rsp_url)
    if title is None:
        logger.warning(f"页面解析失败:`{title}`{job.url}")
        return 4
    else:
        try:
            uindex = current_ts()
            article = Article(title=title, author=author, site=job.site, uindex=uindex, src_url=job.url)
            article.save()
            write_dat2_file(uindex, job.site_id, content)
        except:
            logger.warning(f"插入文章异常:`{title}`{job.site}`{job.url}")
            return 7
    return 2
def wemp_spider(url, site):
    """
    Crawl WeChat official account content
    :param url:
    :param site:
    :return:
    """
    if is_crawled_url(url):
        return
    try:
        rsp = requests.get(url, timeout=10)
        if rsp.ok:
            try:
                if get_host_name(rsp.url) == 'mp.weixin.qq.com':
                    title, author, content = parse_weixin_page(rsp)
                elif 'ershicimi.com' in get_host_name(rsp.url):
                    title, author, content = parse_ershicimi_page(rsp)
                else:
                    logger.warning(f'公众号域名解析异常:`{rsp.url}')
                    return
            except:
                logger.info(f'公众号内容解析异常:`{rsp.url}')
                return
            article = Article(title=title, author=author, site=site, uindex=current_ts(),
                              content=content, src_url=url)
            article.save()
            mark_crawled_url(url)
    except (ConnectTimeout, HTTPError, ReadTimeout, Timeout, ConnectionError):
        logger.warning(f'公众号爬取出现网络异常:`{url}')
    except:
        logger.warning(f'公众号爬取出现未知异常:`{url}')
def wemp_spider(url, site):
    """
    Crawl WeChat official account content; supports both the native WeChat domain and the ershicimi domain
    :param url:
    :param site:
    :return:
    """
    if is_crawled_url(url):
        return
    rsp = get_with_proxy(url)
    if rsp is None:
        return
    if rsp.ok:
        try:
            if get_host_name(rsp.url) == 'mp.weixin.qq.com':
                title, author, content = parse_weixin_page(rsp)
            elif 'ershicimi.com' in get_host_name(rsp.url):
                title, author, content = parse_ershicimi_page(rsp)
            else:
                logger.warning(f'公众号域名解析异常:`{rsp.url}')
                return
        except:
            logger.info(f'公众号内容解析异常:`{rsp.url}')
            return
        article = Article(title=title, author=author, site=site, uindex=current_ts(),
                          content=content, src_url=url)
        article.save()
        mark_crawled_url(url)
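# Hypothetical sketch (not part of the source): get_with_proxy is called above but not defined
# in this section. A minimal version could wrap requests.get with an HTTP proxy and return None
# on network errors; the proxy address and timeout below are assumptions.
import requests


def get_with_proxy(url, timeout=15):
    proxies = {
        'http': 'http://127.0.0.1:8001',   # assumed local proxy endpoint
        'https': 'http://127.0.0.1:8001',
    }
    try:
        return requests.get(url, proxies=proxies, timeout=timeout)
    except requests.RequestException:
        return None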
def update_all_user_feed():
    """
    Update all sites
    """
    logger.info('开始运行定时更新RSS任务')
    now = datetime.datetime.now()
    # Update at different frequencies, on a 4-hour cycle
    if now.hour % 4 == 0:
        feeds = Site.objects.filter(status='active', creator='user').order_by('-star')
    elif now.hour % 4 == 1:
        feeds = []
    elif now.hour % 4 == 2:
        feeds = Site.objects.filter(status='active', creator='user', star__gte=20).order_by('-star')
    elif now.hour % 4 == 3:
        feeds = Site.objects.filter(status='active', creator='user', star__gte=9).order_by('-star')
    for site in feeds:
        try:
            resp = requests.get(site.rss, timeout=30, verify=False)
        except:
            if site.star >= 9:
                logger.warning(f"RSS源可能失效了`{site.rss}")
            else:
                logger.info(f"RSS源可能失效了`{site.rss}")
            continue
        content = BytesIO(resp.content)
        feed_obj = feedparser.parse(content)
        for entry in feed_obj.entries[:10]:
            try:
                title = entry.title
                link = entry.link
            except AttributeError:
                logger.warning(f'必要属性获取失败:`{site.rss}')
                continue
            if is_crawled_url(link):
                continue
            try:
                author = entry['author'][:11]
            except:
                author = None
            try:
                value = entry.content[0].value
            except:
                value = entry.get('description') or entry.link
            try:
                article = Article(site=site, title=title, author=author, src_url=link,
                                  uindex=current_ts(), content=value)
                article.save()
                mark_crawled_url(link)
            except django.db.utils.IntegrityError:
                logger.info(f'数据重复插入:`{title}`{link}')
            except:
                logger.warning(f'数据插入异常:`{title}`{link}')
    logger.info('定时更新RSS任务运行结束')
def build_whoosh_index_cron():
    """
    Build the full-text search index
    """
    from web.utils import whoosh_site_schema, whoosh_article_schema
    from whoosh.filedb.filestore import FileStorage
    from whoosh.qparser import QueryParser
    idx_dir = settings.WHOOSH_IDX_DIR
    first_boot = False
    if not os.path.exists(idx_dir):
        os.makedirs(idx_dir)
        first_boot = True
    storage = FileStorage(idx_dir)
    # Index sites
    if first_boot:
        idx = storage.create_index(whoosh_site_schema, indexname="site")
    else:
        idx = storage.open_index(indexname="site", schema=whoosh_site_schema)
    idx_writer = idx.writer()
    for site_id in get_active_sites():
        # Skip sites that are already indexed
        if is_indexed('site', site_id) and not first_boot:
            continue
        try:
            site = Site.objects.get(pk=site_id, status='active')
        except:
            continue
        cname = split_cn_words(site.cname, join=True)
        author = site.author or ''
        brief = split_cn_words(site.brief, join=True)
        logger.info(f"源分词结果:`{site_id}`{cname}`{brief}")
        try:
            idx_writer.add_document(id=site_id, cname=cname, author=author, brief=brief)
            set_indexed('site', site_id)
        except:
            logger.warning(f"源索引失败:`{site_id}")
    idx_writer.commit()
    # Index articles
    if first_boot:
        idx = storage.create_index(whoosh_article_schema, indexname="article")
    else:
        idx = storage.open_index(indexname="article", schema=whoosh_article_schema)
    idx_writer = idx.writer()
    for uindex in get_recent_articles():
        # Skip articles that are already indexed
        if is_indexed('article', uindex) and not first_boot:
            continue
        try:
            article = Article.objects.get(uindex=uindex, status='active')
        except:
            continue
        content = get_content(uindex, article.site_id)
        if content:
            title = split_cn_words(article.title, join=True)
            author = article.author or ''
            content_soup = BeautifulSoup(content, 'html.parser')
            content = split_cn_words(content_soup.get_text(), join=True, limit=20)
            logger.info(f"文章分词结果:`{uindex}`{title}")
            try:
                idx_writer.add_document(uindex=uindex, title=title, author=author, content=content)
                set_indexed('article', uindex)
            except:
                logger.warning(f"文章索引失败:`{uindex}")
    idx_writer.commit()
    # Purge articles older than one week from the index
    idx = storage.open_index(indexname="article", schema=whoosh_article_schema)
    idx_writer = idx.writer()
    lastweek_ts = str(current_ts() - 7 * 86400 * 1000)
    query = QueryParser("uindex", idx.schema).parse('uindex:{to %s]' % lastweek_ts)
    with idx.searcher() as searcher:
        idx_writer.delete_by_query(query, searcher)
    idx_writer.commit()
    return True
def in_site_search(request):
    """
    In-site search
    """
    user = get_login_user(request)
    keyword = request.POST.get('keyword', '').strip()
    scope = request.POST.get('scope', 'all')
    logger.warning(f"搜索关键字:`{keyword}")
    keyword = split_cn_words(keyword, join=True)
    logger.info(f"转换后的关键字:`{keyword}")
    if scope not in ('all', 'feed', 'article'):
        return HttpResponseForbidden('Param Error')
    if not keyword:
        return HttpResponseNotFound("Empty Keyword")
    storage = FileStorage(settings.WHOOSH_IDX_DIR)
    rel_sites, rel_articles = None, None
    # Look up matching sites
    if scope in ('feed', 'all'):
        idx = storage.open_index(indexname="site", schema=whoosh_site_schema)
        qp = MultifieldParser(['cname', 'author', 'brief'], schema=whoosh_site_schema)
        query = qp.parse(keyword)
        sites = []
        with idx.searcher() as s:
            results = s.search(query, limit=50)
            for ret in results:
                sites.append(ret['id'])
        rel_sites = Site.objects.filter(status='active', pk__in=sites).order_by('-star')
    elif scope == 'article':
        # Look up matching articles
        idx = storage.open_index(indexname="article", schema=whoosh_article_schema)
        qp = MultifieldParser(['title', 'author', 'content'], schema=whoosh_article_schema)
        query = qp.parse(keyword)
        articles = []
        with idx.searcher() as s:
            old_mask = TermRange("uindex", None, str(current_ts() - 7 * 86400 * 1000))
            results = s.search(query, mask=old_mask, limit=50)
            for ret in results:
                articles.append(ret['uindex'])
        rel_articles = Article.objects.filter(is_recent=True, status='active', uindex__in=articles).iterator()
    # The user's subscriptions
    user_sub_feeds = []
    if user:
        user_sub_feeds = get_user_subscribe_feeds(user.oauth_id, user_level=user.level)
    context = dict()
    context['user'] = user
    context['user_sub_feeds'] = user_sub_feeds
    context['rel_sites'] = rel_sites
    context['rel_articles'] = rel_articles
    context['keyword'] = keyword
    if scope == 'all':
        return render(request, 'search/search.html', context=context)
    elif scope == 'feed':
        return render(request, 'search/search_feeds.html', context=context)
    elif scope == 'article':
        return render(request, 'search/search_articles.html', context=context)
def podcast_spider(site):
    """
    Update feed content
    """
    resp = get_with_retry(site.rss)
    if resp is None:
        logger.info(f"RSS 源可能失效了`{site.rss}")
        return None
    feed_obj = feedparser.parse(BytesIO(resp.content))
    for entry in feed_obj.entries:
        # Some entries are empty
        if not entry:
            continue
        try:
            title = entry.title
        except AttributeError:
            logger.warning(f'title 获取失败:`{site.rss}')
            continue
        link = entry.get('link') or entry.get('guid')
        if not link:
            logger.warning(f'link 获取失败:`{site.rss}')
            continue
        if is_crawled_url(link):
            continue
        try:
            author = entry['author'][:20]
        except:
            author = ''
        audio, img = None, ''
        if entry.get('links'):
            for el in entry['links']:
                if 'audio/' in el.get('type', '') or el.get('rel') == 'enclosure':
                    audio = el
                    break
        if entry.get('image'):
            img = entry.image.get('href')
        try:
            brief = entry.content[0].value
        except:
            brief = entry.get('description') or entry.link
        if audio is not None:
            # Build the data required by the podlove web player
            episode = {
                "version": 5,
                "show": {
                    "title": site.cname,
                    "subtitle": site.brief,
                    "poster": site.favicon,
                    "link": site.link,
                },
                "title": title,
                "link": link,
                # "subtitle": brief,
                "publicationDate": entry.get('published'),
                "poster": img,
                "duration": to_podcast_duration(entry.get('itunes_duration', '')),
                "audio": [
                    {
                        "url": audio.href,
                        "mimeType": audio.type
                    }
                ]
            }
            episode = json.dumps(episode)
            episode = b64encode(bytes(episode, encoding='UTF8')).decode('UTF8')
            content = podcast_tmpl % episode + brief
        else:
            content = brief + f'''<p></p><img src="{img}">'''
        try:
            uindex = current_ts()
            article = Article(site=site, title=title, author=author, src_url=link, uindex=uindex)
            article.save()
            write_dat2_file(uindex, site.id, content)
            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'数据重复插入:`{title}`{link}')
            mark_crawled_url(link)
        except:
            logger.warning(f'数据插入异常:`{title}`{link}')
    return True
def atom_spider(site):
    """
    Update feed content
    """
    resp = get_with_retry(site.rss)
    if resp is None:
        if site.star > 9:
            logger.warning(f"RSS 源可能失效了`{site.rss}")
        else:
            logger.info(f"RSS源可能失效了`{site.rss}")
        return None
    content = BytesIO(resp.content)
    feed_obj = feedparser.parse(content)
    for entry in feed_obj.entries:
        # Some entries are empty
        if not entry:
            continue
        try:
            title = entry.title
            link = entry.link
        except AttributeError:
            logger.warning(f'必要属性获取失败:`{site.rss}')
            continue
        if is_crawled_url(link):
            continue
        try:
            author = entry['author'][:20]
        except:
            author = ''
        try:
            value = entry.content[0].value
        except (AttributeError, IndexError):
            value = None
        if not value:
            value = entry.get('description') or entry.link
        # Rewrite image sources to absolute URLs
        try:
            content_soup = BeautifulSoup(value, "html.parser")
            for img in content_soup.find_all('img'):
                rel_src = img.attrs.get('src')
                abs_src = urllib.parse.urljoin(link, rel_src)
                img.attrs['src'] = abs_src
            value = str(content_soup)
        except:
            logger.warning(f'修复图片路径异常:`{title}`{link}')
        try:
            uindex = current_ts()
            article = Article(site=site, title=title, author=author, src_url=link, uindex=uindex)
            article.save()
            write_dat2_file(uindex, site.id, value)
            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'数据重复插入:`{title}`{link}')
        except:
            logger.warning(f'数据插入异常:`{title}`{link}')
    set_updated_site(site.pk)
    return True
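# Hypothetical sketch (not part of the source): get_with_retry is called by the spiders above
# but not defined in this section. A minimal version could retry requests.get a few times and
# return None when every attempt fails; the retry count and timeout below are assumptions.
import requests


def get_with_retry(url, retries=3, timeout=30):
    for _ in range(retries):
        try:
            resp = requests.get(url, timeout=timeout, verify=False)
            if resp.ok:
                return resp
        except requests.RequestException:
            continue
    return None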
def handle(self, *args, **kwargs):
    time = timezone.now().strftime('%X')
    self.stdout.write("It's now %s" % time)
    logger.info("the job scheduler started")
    logger.info('开始运行定时更新RSS任务')
    now = datetime.now()
    if now.hour % 4 == 0:
        feeds = Site.objects.filter(status='active', creator='user').order_by('-star')
    elif now.hour % 4 == 1:
        feeds = Site.objects.filter(status='active', creator='user', star__gte=50).order_by('-star')
    elif now.hour % 4 == 2:
        feeds = Site.objects.filter(status='active', creator='user', star__gte=20).order_by('-star')
    elif now.hour % 4 == 3:
        feeds = Site.objects.filter(status='active', creator='user', star__gte=9).order_by('-star')
    for site in feeds:
        logger.info(f"RSS源`{site.rss}")
        feed_obj = feedparser.parse(site.rss)
        for entry in feed_obj.entries[:10]:
            try:
                title = entry.title
                link = entry.link
                # logger.info(f"RSS源`{title}")
            except AttributeError:
                logger.warning(f'必要属性获取失败:`{site.rss}')
                continue
            # if is_crawled_url(link):
            #     continue
            try:
                author = entry['author'][:11]
                logger.info(f"RSS源数据author`{author}")
            except:
                author = None
                # logger.info(f"RSS源数据author`{author}")
            try:
                value = entry.content[0].value
            except:
                value = entry.get('description') or entry.link
            try:
                article = Article(site=site, title=title, author=author, src_url=link,
                                  uindex=current_ts(), content=value)
                article.save()
                # mark_crawled_url(link)
            except django.db.utils.IntegrityError:
                logger.info(f'数据重复插入:`{title}`{link}')
            except:
                logger.warning(f'数据插入异常:`{title}`{link}')
    logger.info('定时更新RSS任务运行结束')
def update_subsite_user_feed(global_subsite):
    logger.info('开始更新自定义RSS任务')
    feeds = Site.objects.filter(name__in=global_subsite, status='active', creator='user',
                                star__gte=9).order_by('-star')
    for site in feeds:
        feed_obj = feedparser.parse(site.rss)
        for entry in feed_obj.entries[:10]:
            try:
                title = entry.title
                link = entry.link
            except AttributeError:
                logger.warning(f'必要属性获取失败:`{site.rss}')
                continue
            # if is_crawled_url(link):
            #     continue
            try:
                author = entry['author'][:11]
            except:
                author = None
            try:
                value = entry.content[0].value
            except:
                # TODO: get full descriptions
                value = entry.get('description') or entry.link
            try:
                article = Article(site=site, title=title, author=author, src_url=link,
                                  uindex=current_ts(), content=value)
                article.save()
                # mark_crawled_url(link)
            except django.db.utils.IntegrityError:
                logger.info(f'数据重复插入:`{title}`{link}')
            except:
                logger.warning(f'数据插入异常:`{title}`{link}')
    logger.info('更新自定义RSS任务运行结束')
def atom_spider(site):
    """
    Update feed content
    """
    try:
        resp = requests.get(site.rss, timeout=30, verify=False)
    except:
        if site.star >= 9:
            logger.warning(f"RSS源可能失效了`{site.rss}")
        else:
            logger.info(f"RSS源可能失效了`{site.rss}")
        return None
    content = BytesIO(resp.content)
    feed_obj = feedparser.parse(content)
    for entry in feed_obj.entries[:10]:
        try:
            title = entry.title
            link = entry.link
        except AttributeError:
            logger.warning(f'必要属性获取失败:`{site.rss}')
            continue
        if is_crawled_url(link):
            continue
        try:
            author = entry['author'][:20]
        except:
            author = None
        try:
            value = entry.content[0].value
        except:
            value = entry.get('description') or entry.link
        # Rewrite image sources to absolute URLs
        try:
            content_soup = BeautifulSoup(value, "html.parser")
            for img in content_soup.find_all('img'):
                rel_src = img.attrs.get('src')
                abs_src = urllib.parse.urljoin(link, rel_src)
                img.attrs['src'] = abs_src
            value = str(content_soup)
        except:
            logger.warning(f'修复图片路径异常:`{title}`{link}')
        try:
            article = Article(site=site, title=title, author=author, src_url=link,
                              uindex=current_ts(), content=value)
            article.save()
            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'数据重复插入:`{title}`{link}')
        except:
            logger.warning(f'数据插入异常:`{title}`{link}')
def atom_spider(site):
    """
    Update feed content
    """
    resp = get_with_retry(site.rss)
    if resp is None:
        if site.star > 9:
            guard_log(f"RSS 源可能失效了`{site.rss}")
        else:
            logger.info(f"RSS源可能失效了`{site.rss}")
        return None
    content = BytesIO(resp.content)
    feed_obj = feedparser.parse(content)
    for entry in feed_obj.entries[:12]:
        # Some entries are empty
        if not entry:
            continue
        try:
            title = entry.title
            link = entry.link
        except AttributeError:
            logger.warning(f'必要属性获取失败:`{site.rss}')
            continue
        if is_crawled_url(link):
            continue
        try:
            author = entry['author'][:20]
        except:
            author = ''
        try:
            value = entry.content[0].value
        except:
            value = entry.get('description') or entry.link
        # Rewrite image sources to absolute URLs
        try:
            content_soup = BeautifulSoup(value, "html.parser")
            for img in content_soup.find_all('img'):
                rel_src = img.attrs.get('src')
                abs_src = urllib.parse.urljoin(link, rel_src)
                img.attrs['src'] = abs_src
            value = str(content_soup)
        except:
            logger.warning(f'修复图片路径异常:`{title}`{link}')
        # Second fetch for WeChat articles delivered through an RSS proxy
        if get_host_name(site.rss) in ('qnmlgb.tech', ):
            if get_host_name(link) in ('mp.weixin.qq.com', ):
                rsp = get_with_proxy(link)
                if rsp is not None and rsp.ok:
                    try:
                        title, author, value = parse_weixin_page(rsp)
                    except:
                        pass
        try:
            article = Article(site=site, title=title, author=author, src_url=link,
                              uindex=current_ts(), content=value)
            article.save()
            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'数据重复插入:`{title}`{link}')
        except:
            logger.warning(f'数据插入异常:`{title}`{link}')
    set_updated_site(site.pk)
    return True