def processCard(card):
    """Forward one feed card to the channel unless it was handled before.

    The card is skipped when ``shouldSend`` rejects it, or when its cleaned
    URL, ``wid``, ``rwid`` or ``hash`` is already in ``db.existing``. After
    ``album_sender.send`` returns, all four identifiers are recorded in
    ``db.existing``.
    """
    if not shouldSend(card):
        return

    url = clearUrl(card['scheme'])
    if url in db.existing.items:
        return

    r = weibo_2_album.get(url)
    print('hash', r.hash)
    # Identifiers are compared in their string form, in wid/rwid/hash order.
    if any(str(key) in db.existing.items for key in (r.wid, r.rwid, r.hash)):
        return

    print('sending', url, r.wid, r.rwid)
    timer.wait(10)  # throttle before sending; unit presumably seconds - confirm

    attempts = cache.get(r.hash, 0) + 1
    cache[r.hash] = attempts
    if attempts > 2:
        # For whatever reason this url does not send to telegram: record the
        # hash *before* sending, presumably so a send that fails again is not
        # retried on the next run - NOTE(review): confirm intent.
        db.existing.add(r.hash)

    album_sender.send(channel, url, r)
    for key in (url, r.wid, r.rwid, r.hash):
        db.existing.add(key)
def process(url):
    """Load the feed at ``url`` and run ``processCard`` on each of its cards.

    An exception raised while processing a card is not propagated; the card's
    cleaned URL and the error text are sent to ``debug_group`` instead, and
    the loop continues with the next card.
    """
    raw = sg.getContent(url)
    # NOTE(review): FullLoader is applied to fetched content; if the payload
    # is plain JSON/YAML data, yaml.safe_load would be the safer choice.
    feed = yaml.load(raw, Loader=yaml.FullLoader)
    cards = feed['data']['cards']
    for card in cards:
        try:
            processCard(card)
        except Exception as e:
            card_url = clearUrl(card['scheme'])
            debug_group.send_message(card_url + '\n' + str(e))
def test2():
    """Print the result of ``clearUrl`` for a fixed set of sample inputs.

    The samples include URLs with fragments and long query strings, plus
    ``None`` as the last entry.
    """
    samples = (
        'https://www.lgbtqnation.com/2020/07/trump-administration-memo-explains-spot-transgender-woman/#.Xxhk4Z7ztRQ.wechat',
        'http://mp.weixin.qq.com/s?__biz=MzUxMzAzMzk5Ng==&mid=2247484807&idx=1&sn=101bde2cf3bfbee2dcbd1e842475e06a&chksm=f95a17e4ce2d9ef27a9dbdd6c59c831b45caa8658ac566cc2d3681216c83b0f9e28a7ed0dc3f&mpshare=1&scene=1&srcid=0723RKjQqOqqKU0gQFh4vs4S&sharer_sharetime=1595435039677&sharer_shareid=f467668849c8544e583567bf8a259f31#rd',
        'https://mp.weixin.qq.com/s?__biz=MzA5MDM1MTcyNQ==&mid=2657277726&idx=1&sn=613c0be79aebcd1ba714cbbbae64f66c&chksm=8b9a861cbced0f0af93ef107a5e6fc885431024ff0316948f75cc6738276f33cc3e92dd25661&mpshare=1&scene=1&srcid=0722lacFTcoG3jda9bA9JOoK&sharer_sharetime=1595402746390&sharer_shareid=a468f7684ed03b370e7298eb88d56e49#rd',
        'https://mp.weixin.qq.com/s/DzSn0oX7nctjnsHRM1DpKQ?url=https%3A%2F%2Fmp.weixin.qq.com%2Fs%2FDzSn0oX7nctjnsHRM1DpKQ&share_menu=1&sinainternalbrowser=topnav&mid=4532409046147275&luicode=10000011&lfid=100103type%3D1%26q%3D%E7%90%86%E8%AE%BA&u=https%3A%2F%2Fmp.weixin.qq.com%2Fs%2FDzSn0oX7nctjnsHRM1DpKQ',
        'http://www.infzm.com/wap/#/content/198828',
        None,
    )
    for sample in samples:
        cleaned = clearUrl(sample)
        print(cleaned)
def getUrl(item, post_link):
    """Pick the link to attach to a post and try to export it.

    If ``item`` has a ``note-block`` div carrying a ``data-url``, or
    ``post_link`` matches a douban review/note prefix, the note URL (falling
    back to ``post_link``) is force-exported; the export result is returned,
    or the raw target when the export yields nothing.

    Otherwise, if ``item`` has a ``url-block`` div, the ``href`` of its first
    anchor is exported and the cleaned result (or the cleaned original) is
    returned. Returns ``None`` when neither block applies.
    """
    note_block = item.find('div', class_='note-block')
    note_url = note_block and note_block.get('data-url')
    douban_prefixes = ['https://book.douban.com/review/',
                       'https://www.douban.com/note/']
    if note_url or matchKey(post_link, douban_prefixes):
        target = note_url or post_link
        exported = export_to_telegraph.export(target, force=True)
        return exported or target

    url_block = item.find('div', class_='url-block')
    if not url_block:
        return None
    href = url_block.find('a')['href']
    return clearUrl(export_to_telegraph.export(href) or href)
def getCnLink(link):
    """Return the cleaned form of ``link`` if it points at a listed site.

    Returns ``None`` when ``getRawLink`` yields nothing or the raw link does
    not match any of the site keys, ``False`` when the page title matches one
    of the blocked phrases, and the cleaned link otherwise.
    """
    site_keys = ['douban.', 'thepaper', 'weixin', 'zhihu.', 'cnpoliti', 'weibo']
    blocked_titles = ['链接已过期', '仅限男生', '男生350分', '常玮平',
                      '【来自投稿】20世纪初', '做了套打拳入门']

    raw = getRawLink(link)
    if not raw:
        return None
    if not matchKey(raw, site_keys):
        return None

    cleaned = clearUrl(raw)
    # getTitle by default will cache
    title = export_to_telegraph.getTitle(cleaned)
    if matchKey(title, blocked_titles):
        return False
    return cleaned
def exportAllInText(soup):
    """Render ``soup`` as markdown-escaped text with its links rewritten.

    ``<br/>`` tags become newlines and the markup is reduced to stripped
    plain text. Then:

    * for every anchor with both ``title`` and ``href``, occurrences of the
      ``href`` in the text are replaced by the exported (or original) title
      URL, cleaned and surrounded by spaces; URLs containing ``_`` are
      written as ``[url](url)``;
    * for every anchor without a ``title`` whose text equals its ``href``,
      occurrences of that text are surrounded by spaces.

    Returns ``''`` for a falsy ``soup``; otherwise the result of
    ``escapeMarkdown`` on the rewritten text.
    """
    if not soup:
        return ''
    text = str(soup).replace('<br/>', '\n')
    quote = BeautifulSoup(text, features='lxml').text.strip()

    for link in soup.find_all('a', title=True, href=True):
        url = link['title']
        url = clearUrl(export(url) or url)
        if '_' in url:
            url = '[%s](%s)' % (url, url)
        quote = quote.replace(link['href'], ' ' + url + ' ')

    padded = set()
    for link in soup.find_all('a', title=False, href=True):
        label = link.text
        # Skip empty labels: replacing '' would insert padding between every
        # character. Skip repeats: replace() already handled every occurrence.
        if not label or label in padded or link['href'] != label:
            continue
        # str.replace returns a new string; the result must be kept
        # (the previous code discarded it, so no padding was ever applied).
        quote = quote.replace(label, ' ' + label + ' ')
        padded.add(label)

    return escapeMarkdown(quote)
def export(url, throw_exception=False, force=False, toSimplified=False,
           force_cache=False, noSourceLink=False, noAutoConvert=False):
    """Post the article at ``url`` as a page and return the page URL.

    Args:
        url: address of the article; it is passed through ``clearUrl`` first.
        throw_exception: when true, any exception is re-raised; when false,
            failures are swallowed and ``None`` is returned. Also forwarded
            to ``getArticle``.
        force: bypass the ``isConfidentUrl`` / ``_isEditable`` /
            ``isConfident`` checks.
        toSimplified, force_cache, noAutoConvert: forwarded to ``getArticle``.
        noSourceLink: forwarded to ``getAuthorField`` and ``getAuthorUrl``.

    Returns:
        The trimmed URL of the created page; the cleaned ``url`` itself when
        ``_isEditable`` reports it (and ``force`` is off); ``None`` when the
        URL or the article fails a confidence check, or on a swallowed error.
    """
    placeholder = '<div>TO BE ADDED</div>'
    try:
        url = clearUrl(url)
        if not (force or isConfidentUrl(url)):
            return None

        poster = _getPoster()
        if not force and _isEditable(poster, url):
            return url

        article = getArticle(
            url, throw_exception,
            toSimplified=toSimplified,
            force_cache=force_cache,
            noAutoConvert=noAutoConvert)
        if not article.text or not article.text.text.strip():
            article.text = placeholder

        def page_fields():
            # Keyword arguments shared by every post attempt; rebuilt on each
            # attempt, exactly as the individual calls would evaluate them.
            return {
                'title': article.title,
                'author': getAuthorField(article.author, noSourceLink),
                'author_url': getAuthorUrl(article, url, noSourceLink),
            }

        try:
            result = poster.post(**page_fields(), text=str(article.text))
        except Exception as e:
            reason = str(e)
            if 'CONTENT_TEXT_REQUIRED' in reason:
                # The service rejected the body: retry with the placeholder.
                result = poster.post(**page_fields(), text=placeholder)
            elif 'ACCESS_TOKEN_INVALID' in reason:
                # Retry the same content through a freshly built poster.
                result = TelegraphPoster().post(
                    **page_fields(), text=str(article.text))
            else:
                raise e

        if force or isConfident(url, article.text):
            return _trimUrl(result['url'])
    except Exception as e:
        if throw_exception:
            raise e