def getShortLink(link):
    if matchKey(link, ['weibo.', 'twitter.', 't.me/']):
        return
    raw_link = getRawLink(link)
    if isCN(export_to_telegraph.getTitle(raw_link)):
        return shorter(raw_link, link)
    if isCN(export_to_telegraph.getTitle(link)):
        return link

def processSite(site):
    try:
        links = link_extractor.getLinks(site)
    except Exception as e:
        print('web_bot, getLinks fail', str(e), site)
        return
    count = 0
    # per-run send limit depends on the site
    if 'douban' in site and 'people' not in site:
        limit = 1  # may change later
    elif matchKey(site, ['tempo', 'kompas', 'nature.com']):
        limit = 1
    else:
        limit = 20
    for link in links:
        if db.existing.contain(link):
            continue
        title = ''.join(export_to_telegraph.getTitle(link).split())
        if db.existing.contain(title):
            continue
        success = sendLink(site, link)
        db.existing.add(link)
        db.existing.add(title)
        count += 1
        if (not success) or count >= limit:
            return

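# A minimal usage sketch, not part of the bot itself: processSite can be driven
# over every subscribed site, the same way loopImp() below iterates
# db.sub.subscriptions().
def processAllSites():
    for site in db.sub.subscriptions():
        processSite(site)
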
def yieldPoliticsRead():
    posts = webgram.getPosts('freedom_watch', force_cache=True)[1:]
    for post in posts[::-1]:
        link = getLink(post.text, getShortLink)
        if not link:
            continue
        yield export_to_telegraph.getTitle(link), link

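# A minimal consumer sketch for yieldPoliticsRead(); the count and output format
# are illustrative assumptions, not the project's actual digest format.
def printPoliticsRead(count=5):
    for index, (title, link) in enumerate(yieldPoliticsRead()):
        if index >= count:
            break
        print('【%s】 %s' % (title, link))
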
def handleUrl(update, context):
    msg = update.effective_message
    if not msg:
        return
    raw_links = msg.text.split()
    raw_links = [x for x in raw_links if 'http' in x]
    if not raw_links:
        return
    existing = set()
    items = []
    for raw_link in raw_links:
        link = getCnLink(raw_link)
        if not link:
            continue
        title = compactText(export_to_telegraph.getTitle(link))
        if link in existing or title in existing:
            continue
        existing.add(link)
        existing.add(title)
        items.append((title, link))
    lines = ['【%s】%s' % item for item in items]
    lines = ['%d. %s' % (index + 1, item)
             for index, item in enumerate(lines)]
    # reply header reads "Daily article picks <date>"
    reply = ('《每日文章精选 %s》 https://t.me/daily_read \n\n' %
             date.today().strftime("%Y-%m-%d") + '\n\n'.join(lines))
    msg.reply_text(reply, disable_web_page_preview=True)

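# A minimal registration sketch, assuming the python-telegram-bot v13-style API
# and a hypothetical bot token; the actual bot wiring lives elsewhere in the repo.
from telegram.ext import Updater, MessageHandler, Filters

updater = Updater('BOT_TOKEN')  # hypothetical token
updater.dispatcher.add_handler(
    MessageHandler(Filters.text & ~Filters.command, handleUrl))
updater.start_polling()
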
def sendLink(site, link, fixed_channel=None):
    simplified = None
    telegraph = None
    album_result = None
    additional_info = getAdditionalInfo(site)
    channels = list(db.sub.channels(site, tele.bot))
    message = link
    success = True
    for channel, config in channels:
        if fixed_channel and channel.id != fixed_channel:
            continue
        # skip channels whose blocklist matches the link or its title
        blocked_keys = getMatchedKey(
            link + export_to_telegraph.getTitle(link) +
            export_to_telegraph.getTitle(link, toSimplified=True),
            blocklist.get(channel.id, []))
        if blocked_keys:
            message += ' filtered: ' + ' '.join(blocked_keys)
            continue
        # douban posts (other than notes) are sent as photo albums
        if not album_result and '.douban.' in link and '/note/' not in link:
            album_result = web_2_album.get(link, force_cache=True)
            if album_result.imgs:
                album_result.cap = cutCaptionHtml(album_result.cap, 800)
            else:
                album_result.cap = cutCaptionHtml(album_result.cap, 2000)
        if not album_result and 'to_album' in config:
            album_result = export_to_telegraph.getAlbum(link)
        if not simplified and 'to_simplify' in config:
            simplified = export_to_telegraph.export(
                link, force_cache=True, force=True, toSimplified=True) or link
        if not telegraph and not album_result and 'to_telegraph' in config:
            telegraph = export_to_telegraph.export(
                link, force_cache=True, force=True) or link
        message = link
        if 'to_simplify' in config:
            message = simplified
        if 'to_telegraph' in config:
            message = telegraph
        try:
            if album_result:
                album_sender.send_v2(channel, album_result)
            else:
                channel.send_message(
                    message + additional_info, parse_mode='HTML')
        except Exception as e:
            print(e)
            success = False
            debug_group.send_message(
                'send fail: %s %d %s' % (link, channel.id, e))
    log(message or link, site, [item[0] for item in channels])
    return success

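# A minimal usage sketch: re-send one article to a single channel only. The
# site, link, and channel id below are placeholders, not real subscriptions.
# sendLink('https://example.com/feed',
#          'https://example.com/article',
#          fixed_channel=-1001234567890)
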
def decorate(text):
    if 'http' not in text:
        text = 'https://' + text
    text = getRawLink(text)
    if matchKey(text, [
            'www.douban.com/people/', 'twitter.com', 'facebook.com',
            'm.weibo.cn'
    ]):
        return '\n\n' + text
    return '\n\n【%s】 %s' % (export_to_telegraph.getTitle(text), text)

def getLinkReplace(url, album):
    if 'telegra.ph' in url and 'douban.com/note/' in album.cap_html:
        return ''
    if 'telegra.ph' in url:
        soup = BeautifulSoup(
            cached_url.get(url, force_cache=True), 'html.parser')
        title = export_to_telegraph.getTitle(url)
        try:
            # recover the source link from the Telegraph page's <address> element
            return '\n\n【%s】 %s' % (
                title, soup.find('address').find('a')['href'])
        except:
            return ''
    return '\n\n' + url

def getCnLink(link):
    link = getRawLink(link)
    if not link:
        return
    # only keep links from the targeted Chinese-language sites
    if not matchKey(link, [
            'douban.', 'thepaper', 'weixin', 'zhihu.', 'cnpoliti', 'weibo'
    ]):
        return
    link = clearUrl(link)
    # getTitle by default will cache
    # drop pages whose titles mark them as expired or as known unwanted posts
    if matchKey(export_to_telegraph.getTitle(link), [
            '链接已过期', '仅限男生', '男生350分', '常玮平',
            '【来自投稿】20世纪初', '做了套打拳入门'
    ]):
        return False
    return link

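# Illustrative calls with hypothetical URLs: getCnLink() keeps matched
# Chinese-media links and returns None/False for everything else.
# getCnLink('https://www.douban.com/note/123456789/')    # -> cleaned douban URL
# getCnLink('https://www.nytimes.com/2021/01/01/a.html') # -> None (site not matched)
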
def loopImp():
    if not scheduled:
        for item in db.sub.subscriptions():
            scheduled.append(item)
        random.shuffle(scheduled)
    site = scheduled.pop()
    try:
        links = link_extractor.getLinks(site)
    except Exception as e:
        print('web_bot, getLinks fail', str(e), site)
        return
    for link in links:
        if not db.existing.add(link):
            continue
        title = ''.join(export_to_telegraph.getTitle(link).split())
        if not db.existing.add(title):
            continue
        sendLink(site, link)

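# A minimal driver sketch, assuming loopImp() is meant to be called
# periodically; the 10-minute interval is an assumption, not taken from the
# project's configuration.
import time

def loop():
    while True:
        loopImp()
        time.sleep(10 * 60)  # assumed interval
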
def gen(news_source='bbc', ebook_convert_app=ebook_convert_app,
        additional_setting='', filename_suffix=''):
    # filename looks like "0101_BBC新闻" ("新闻" = "news")
    filename = '%s_%s新闻' % (
        date.today().strftime("%m%d"), news_source.upper()) + filename_suffix
    os.system('rm -rf html_result')
    os.system('mkdir html_result > /dev/null 2>&1')
    links = []
    for link in findLinks(news_source):
        args = {}
        if 'twreporter.org/' in link:
            args['toSimplified'] = True
            limit = 7
        else:
            limit = 10
        name = export_to_telegraph.getTitle(link, **args)
        html = getArticleHtml(name, link, filename + '.html')
        if html:
            fn = cleanFileName(name)
            with open('html_result/%s.html' % fn, 'w') as f:
                f.write(html)
            links.append((name, fn))
        if len(links) > limit:
            break
    index_html_name = 'html_result/%s.html' % filename
    with open(index_html_name, 'w') as f:
        f.write(getIndexHtml(news_source, links))
    os.system('mkdir pdf_result > /dev/null 2>&1')
    pdf_name = 'pdf_result/%s.pdf' % filename
    command = '%s %s %s %s > /dev/null 2>&1'
    os.system(command % (
        ebook_convert_app, index_html_name, pdf_name, additional_setting))
    return pdf_name

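# A usage sketch, assuming ebook_convert_app points at Calibre's ebook-convert
# binary; the source name and resulting path are illustrative.
# pdf_path = gen(news_source='bbc')
# print(pdf_path)  # e.g. pdf_result/0101_BBC新闻.pdf
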
def testExport():
    for url in urls:
        print(export_to_telegraph.getTitle(url))

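# testExport() expects a module-level `urls` list; a hypothetical example:
# urls = ['https://www.bbc.com/zhongwen/simp/chinese-news-12345678']  # assumed sample
# testExport()
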