import json
import re
import urllib.parse
from base64 import b64encode
from io import BytesIO

import django.db.utils
import feedparser
from bs4 import BeautifulSoup
from scrapy.http import HtmlResponse

# Project-level helpers (get_with_retry, get_with_proxy, save_feed_to_db, logger,
# guard_log, is_crawled_url, mark_crawled_url, current_ts, write_dat2_file,
# set_updated_site, get_host_name, parse_weixin_page, podcast_tmpl, Article)
# are assumed to be imported from the surrounding project.


def add_chuansongme_feed(url):
    rsp = get_with_retry(url)
    if rsp is None or not rsp.ok:
        return None

    response = HtmlResponse(url=url, body=rsp.text, encoding='utf8')

    qrcode = response.selector.xpath("//img[@width='260px']/@src").extract_first()
    avatar = response.selector.xpath("//img[@class='profile_photo_img']/@src").extract_first(default='').strip()
    brief = response.selector.xpath(
        "//*[@class='inline_editor_value']/div[@class='inline']/span/text()").extract_first(default='').strip()
    keywords = response.selector.xpath("//meta[@name='keywords']/@content").extract_first()

    cname, name = None, None
    try:
        cname, name = keywords.split(',')[:2]
    except (AttributeError, ValueError):
        logger.warning(f"Failed to parse title:`{keywords}")

    if qrcode and name and avatar and cname and brief:
        return save_feed_to_db(name, cname, qrcode, avatar, brief, url)
    else:
        logger.warning(f'Field parsing error:`{url}')
    return None
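
# The add_*_feed parsers rely on a shared get_with_retry helper from the
# surrounding project. A minimal sketch of what it might look like, assuming
# plain requests plus urllib3 retries; the _sketch suffix, retry count and
# timeout are illustrative assumptions, not the project's actual values.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def get_with_retry_sketch(url, retries=3, timeout=10):
    """Hypothetical stand-in for the project's get_with_retry helper."""
    session = requests.Session()
    adapter = HTTPAdapter(max_retries=Retry(total=retries, backoff_factor=1))
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    try:
        return session.get(url, timeout=timeout)
    except requests.RequestException:
        # callers treat None as "request failed"
        return None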

def add_wemp_feed(url):
    rsp = get_with_retry(url)
    if rsp is None or not rsp.ok:
        return None

    response = HtmlResponse(url=url, body=rsp.text, encoding='utf8')

    qrcode = response.selector.xpath("//*[@class='mp-info__main']/img[@class='mp-info__qr']/@src").extract_first()
    avatar = response.selector.xpath("//img[@class='post-item__avatar']/@src").extract_first(default='').strip()
    brief = response.selector.xpath("//meta[@name='description']/@content").extract_first(default='').strip()
    title = response.selector.xpath("//*[@class='mp-header']/h1/text()").extract_first(default='').strip()

    cname, name = None, None
    try:
        # expected title shape: "<prefix> <cname>(<name>) <suffix>"
        cname, name = re.search(r'^.+? (.+?)\((.+?)\) .+$', title).groups()
    except AttributeError:
        logger.warning(f"Failed to parse title:`{title}")

    if qrcode and name and avatar and cname and brief:
        return save_feed_to_db(name, cname, qrcode, avatar, brief, url)
    else:
        logger.warning(f'Field parsing error:`{url}')
    return None

def add_anyv_feed(url):
    rsp = get_with_retry(url)
    if rsp is None or not rsp.ok:
        return None

    response = HtmlResponse(url=url, body=rsp.text, encoding='utf8')

    # the page provides no QR code or avatar
    avatar, link = '', url
    brief = response.selector.xpath(
        "//*[@class='user_group']/li[2]/text()"
    ).extract_first(default='').strip().split(':', 1)[-1].split(', 微信搜索')[0]
    # drop the fixed 4-character suffix of the page title
    cname = response.selector.xpath("//div[@class='subtitle']/h1/a/text()").extract_first(default='').strip()[:-4]
    name = response.selector.xpath("//*[@class='user_group']/li/a/text()").extract_first(default='').strip().split(':')[-1]

    if not brief:
        brief = cname

    if name and cname and brief:
        return save_feed_to_db(name, cname, link, avatar, brief, url)
    else:
        logger.warning(f'Field parsing error:`{url}')
    return None

def add_ershicimi_feed(url):
    """
    :return: parse result; a dict on success, None on failure
    """
    rsp = get_with_retry(url)
    if rsp is None or not rsp.ok:
        return None

    response = HtmlResponse(url=url, body=rsp.text, encoding='utf8')

    qrcode = response.selector.xpath("//img[@class='qr-code']/@src").extract_first()
    cname = response.selector.xpath("//li[@class='title']//span[@class='name']/text()").extract_first(default='').strip()
    avatar = response.selector.xpath("//img[@class='avatar']/@src").extract_first(default='').strip()
    brief = response.selector.xpath("//div[@class='Profile-sideColumnItemValue']/text()").extract_first(default='').strip()
    keywords = response.selector.xpath("//meta[@name='keywords']/@content").extract_first()
    try:
        name = keywords.split(',')[1]
    except (AttributeError, IndexError):
        name = None

    if qrcode and name and avatar and cname and brief:
        return save_feed_to_db(name, cname, qrcode, avatar, brief, url)
    else:
        logger.warning(f'Field parsing error:`{url}')
    return None
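
# All four add_*_feed parsers funnel into the same save_feed_to_db(name, cname,
# qrcode, avatar, brief, url) call. A minimal sketch of that contract, assuming
# a Django model named Site; the model, its field names and the returned dict
# are guesses for illustration, not the project's actual schema.
def save_feed_to_db_sketch(name, cname, qrcode, avatar, brief, url):
    try:
        site, _ = Site.objects.update_or_create(
            name=name,
            defaults={'cname': cname, 'qrcode': qrcode, 'favicon': avatar,
                      'brief': brief, 'link': url},
        )
    except django.db.utils.IntegrityError:
        logger.warning(f'Failed to save feed:`{url}')
        return None
    # add_ershicimi_feed documents a dict as its success value
    return {'name': site.name, 'cname': site.cname, 'link': url}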

def podcast_spider(site):
    """
    Update feed content
    """
    resp = get_with_retry(site.rss)
    if resp is None:
        logger.info(f"RSS feed may be dead:`{site.rss}")
        return None

    feed_obj = feedparser.parse(BytesIO(resp.content))

    for entry in feed_obj.entries:
        # some entries are empty
        if not entry:
            continue

        try:
            title = entry.title
        except AttributeError:
            logger.warning(f'Failed to get title:`{site.rss}')
            continue

        link = entry.get('link') or entry.get('guid')
        if not link:
            logger.warning(f'Failed to get link:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except (KeyError, TypeError):
            author = ''

        audio, img = None, ''
        for el in entry.get('links') or []:
            if 'audio/' in (el.get('type') or '') or el.get('rel') == 'enclosure':
                audio = el
                break
        if entry.get('image'):
            img = entry.image.get('href')

        try:
            brief = entry.content[0].value
        except (AttributeError, IndexError):
            brief = entry.get('description') or entry.link

        if audio is not None:
            # build the episode payload expected by the Podlove player
            episode = {
                "version": 5,
                "show": {
                    "title": site.cname,
                    "subtitle": site.brief,
                    "poster": site.favicon,
                    "link": site.link,
                },
                "title": title,
                "link": link,
                # "subtitle": brief,
                "publicationDate": entry.get('published'),
                "poster": img,
                "duration": to_podcast_duration(entry.get('itunes_duration', '')),
                "audio": [
                    {
                        "url": audio.href,
                        "mimeType": audio.type
                    }
                ]
            }
            episode = json.dumps(episode)
            episode = b64encode(bytes(episode, encoding='UTF8')).decode('UTF8')

            content = podcast_tmpl % episode + brief
        else:
            content = brief + f'''<p></p><img src="{img}">'''

        try:
            uindex = current_ts()

            article = Article(site=site, title=title, author=author, src_url=link, uindex=uindex)
            article.save()

            write_dat2_file(uindex, site.id, content)

            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'Duplicate data insert:`{title}`{link}')
            mark_crawled_url(link)
        except Exception:
            logger.warning(f'Data insert error:`{title}`{link}')
    return True
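
# to_podcast_duration above is a project helper. A sketch of the normalization
# it presumably performs: <itunes:duration> arrives loosely formatted ("123",
# "02:03", "1:02:03", sometimes with fractions), while the Podlove player wants
# a zero-padded HH:MM:SS string; the real helper may differ in detail.
def to_podcast_duration_sketch(raw):
    parts = [p for p in str(raw).strip().split(':') if p]
    try:
        nums = [int(float(p)) for p in parts]
    except ValueError:
        return '00:00:00'
    seconds = 0
    for n in nums:  # works for plain seconds, MM:SS and HH:MM:SS alike
        seconds = seconds * 60 + n
    return f'{seconds // 3600:02d}:{seconds % 3600 // 60:02d}:{seconds % 60:02d}'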

def atom_spider(site):
    """
    Update feed content
    """
    resp = get_with_retry(site.rss)
    if resp is None:
        if site.star > 9:
            logger.warning(f"RSS feed may be dead:`{site.rss}")
        else:
            logger.info(f"RSS feed may be dead:`{site.rss}")
        return None

    content = BytesIO(resp.content)
    feed_obj = feedparser.parse(content)

    for entry in feed_obj.entries:
        # some entries are empty
        if not entry:
            continue

        try:
            title = entry.title
            link = entry.link
        except AttributeError:
            logger.warning(f'Failed to get required attributes:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except (KeyError, TypeError):
            author = ''

        try:
            value = entry.content[0].value
        except (AttributeError, IndexError):
            value = None
        if not value:
            value = entry.get('description') or entry.link

        # rewrite relative image URLs to absolute ones
        try:
            content_soup = BeautifulSoup(value, "html.parser")
            for img in content_soup.find_all('img'):
                rel_src = img.attrs.get('src')
                abs_src = urllib.parse.urljoin(link, rel_src)
                img.attrs['src'] = abs_src
            value = str(content_soup)
        except Exception:
            logger.warning(f'Failed to fix image paths:`{title}`{link}')

        try:
            uindex = current_ts()

            article = Article(site=site, title=title, author=author, src_url=link, uindex=uindex)
            article.save()

            write_dat2_file(uindex, site.id, value)

            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'Duplicate data insert:`{title}`{link}')
        except Exception:
            logger.warning(f'Data insert error:`{title}`{link}')

    set_updated_site(site.pk)
    return True
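
# The urljoin-based rewrite above covers the three src shapes commonly seen in
# feed HTML. A quick illustration with the standard library alone:
#
#   >>> from urllib.parse import urljoin
#   >>> urljoin('https://example.com/a/post.html', 'img/1.png')
#   'https://example.com/a/img/1.png'
#   >>> urljoin('https://example.com/a/post.html', '/img/1.png')
#   'https://example.com/img/1.png'
#   >>> urljoin('https://example.com/a/post.html', 'https://cdn.example.com/1.png')
#   'https://cdn.example.com/1.png'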

def atom_spider(site):
    """
    Update feed content
    """
    resp = get_with_retry(site.rss)
    if resp is None:
        if site.star > 9:
            guard_log(f"RSS feed may be dead:`{site.rss}")
        else:
            logger.info(f"RSS feed may be dead:`{site.rss}")
        return None

    content = BytesIO(resp.content)
    feed_obj = feedparser.parse(content)

    for entry in feed_obj.entries[:12]:
        # some entries are empty
        if not entry:
            continue

        try:
            title = entry.title
            link = entry.link
        except AttributeError:
            logger.warning(f'Failed to get required attributes:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except (KeyError, TypeError):
            author = ''

        try:
            value = entry.content[0].value
        except (AttributeError, IndexError):
            value = entry.get('description') or entry.link

        # rewrite relative image URLs to absolute ones
        try:
            content_soup = BeautifulSoup(value, "html.parser")
            for img in content_soup.find_all('img'):
                rel_src = img.attrs.get('src')
                abs_src = urllib.parse.urljoin(link, rel_src)
                img.attrs['src'] = abs_src
            value = str(content_soup)
        except Exception:
            logger.warning(f'Failed to fix image paths:`{title}`{link}')

        # secondary crawl for WeChat official account RSS
        if get_host_name(site.rss) in ('qnmlgb.tech', ):
            if get_host_name(link) in ('mp.weixin.qq.com', ):
                rsp = get_with_proxy(link)
                if rsp is not None and rsp.ok:
                    try:
                        title, author, value = parse_weixin_page(rsp)
                    except Exception:
                        pass

        try:
            article = Article(site=site, title=title, author=author, src_url=link,
                              uindex=current_ts(), content=value)
            article.save()
            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'Duplicate data insert:`{title}`{link}')
        except Exception:
            logger.warning(f'Data insert error:`{title}`{link}')

    set_updated_site(site.pk)
    return True
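
# is_crawled_url / mark_crawled_url are the project's URL-dedup helpers. A
# minimal sketch of that contract backed by a Redis set; the key name, the
# Redis backend and the md5 hashing are assumptions for illustration only.
import hashlib

import redis

rdb = redis.Redis()


def mark_crawled_url_sketch(url):
    rdb.sadd('crawled:urls', hashlib.md5(url.encode()).hexdigest())


def is_crawled_url_sketch(url):
    return rdb.sismember('crawled:urls', hashlib.md5(url.encode()).hexdigest())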