Example #1
0
def getShortLink(link):
    if matchKey(link, ['weibo.', 'twitter.', 't.me/']):
        return
    raw_link = getRawLink(link)
    if isCN(export_to_telegraph.getTitle(raw_link)):
        return shorter(raw_link, link)
    if isCN(export_to_telegraph.getTitle(link)):
        return link
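Note: `matchKey`, `isCN`, `getRawLink`, and `shorter` come from the surrounding codebase. A minimal sketch of the two predicates under assumed semantics (substring matching and CJK detection) might look like this; it is illustrative, not the canonical implementation:
def matchKey(text, keys):
    # Assumed behavior: True if any of the given substrings occurs in text.
    return bool(text) and any(key in text for key in keys)

def isCN(text):
    # Assumed heuristic: the text counts as Chinese if it contains at least
    # one character in the CJK Unified Ideographs range.
    return any('\u4e00' <= char <= '\u9fff' for char in (text or ''))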
Example #2
0
def processSite(site):
	try:
		links = link_extractor.getLinks(site)
	except Exception as e:
		print('web_bot, getLinks fail', str(e), site)
		return
	count = 0
	if 'douban' in site and 'people' not in site:
		limit = 1 # may change later
	elif matchKey(site, ['tempo', 'kompas', 'nature.com']):
		limit = 1
	else:
		limit = 20
	for link in links:
		if db.existing.contain(link):
			continue
		title = ''.join(export_to_telegraph.getTitle(link).split())
		if db.existing.contain(title):
			continue
		success = sendLink(site, link)
		db.existing.add(link)
		db.existing.add(title)
		count += 1
		if (not success) or count >= limit:
			return
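`db.existing` deduplicates by both the link and the whitespace-stripped title. A hypothetical in-memory stand-in, inferred from the call sites here and in `loopImp` below (where `add` returns whether the key was newly inserted):
class ExistingStore:
    # Hypothetical stand-in for db.existing, inferred from its call sites.
    def __init__(self):
        self._seen = set()

    def contain(self, key):
        return key in self._seen

    def add(self, key):
        # Returns True only on first insertion, matching the
        # `if not db.existing.add(link): continue` pattern in loopImp.
        if key in self._seen:
            return False
        self._seen.add(key)
        return True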
Example #3
0
def yieldPoliticsRead():
    posts = webgram.getPosts('freedom_watch', force_cache=True)[1:]
    for post in posts[::-1]:
        link = getLink(post.text, getShortLink)
        if not link:
            continue
        yield export_to_telegraph.getTitle(link), link
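Illustrative consumption of the generator (assumes the module-level setup above):
# Illustrative: print each (title, link) pair yielded by the generator.
for title, link in yieldPoliticsRead():
    print('【%s】%s' % (title, link))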
Example #4
0
def handleUrl(update, context):
    msg = update.effective_message
    if not msg:
        return
    raw_links = msg.text.split()
    raw_links = [x for x in raw_links if 'http' in x]
    if not raw_links:
        return
    existing = set()
    items = []
    for raw_link in raw_links:
        link = getCnLink(raw_link)
        if not link:
            continue
        title = compactText(export_to_telegraph.getTitle(link))
        if link in existing or title in existing:
            continue
        existing.add(link)
        existing.add(title)
        items.append((title, link))
    lines = ['【%s】%s' % item for item in items]
    lines = ['%d. %s' % (index + 1, item) for index, item in enumerate(lines)]
    reply = ('《每日文章精选 %s》 https://t.me/daily_read \n\n' %
             date.today().strftime("%Y-%m-%d") + '\n\n'.join(lines))
    msg.reply_text(reply, disable_web_page_preview=True)
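`compactText` is assumed to normalize titles for deduplication; Examples #2 and #9 strip whitespace with `''.join(title.split())` for the same purpose, so a consistent sketch is:
def compactText(text):
    # Assumed implementation: drop all whitespace so titles that differ
    # only in spacing dedupe to the same key.
    return ''.join((text or '').split())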
Example #5
0
def sendLink(site, link, fixed_channel=None):
	simplified = None
	telegraph = None
	album_result = None
	additional_info = getAdditionalInfo(site)
	channels = list(db.sub.channels(site, tele.bot))
	message = link
	success = True
	for channel, config in channels:
		if fixed_channel and channel.id != fixed_channel:
			continue
		blocked_keys = getMatchedKey(
			link + export_to_telegraph.getTitle(link) + export_to_telegraph.getTitle(link, toSimplified=True),
			blocklist.get(channel.id, []))
		if blocked_keys:
			message += ' filtered: ' + ' '.join(blocked_keys)
			continue
		if not album_result and '.douban.' in link and '/note/' not in link:
			album_result = web_2_album.get(link, force_cache=True)
			if album_result.imgs:
				album_result.cap = cutCaptionHtml(album_result.cap, 800)
			else:
				album_result.cap = cutCaptionHtml(album_result.cap, 2000)
		if not album_result and 'to_album' in config:
			album_result = export_to_telegraph.getAlbum(link)
		if not simplified and 'to_simplify' in config:
			simplified = export_to_telegraph.export(
				link, force_cache=True, force=True, toSimplified=True) or link
		if not telegraph and not album_result and 'to_telegraph' in config:
			telegraph = export_to_telegraph.export(
				link, force_cache=True, force=True) or link
		message = link
		if 'to_simplify' in config:
			message = simplified
		if 'to_telegraph' in config:
			message = telegraph
		try:
			if album_result:
				album_sender.send_v2(channel, album_result)
			else:
				channel.send_message(message + additional_info, parse_mode='HTML')
		except Exception as e:
			print(e)
			success = False
			debug_group.send_message('send fail: %s %d %s' % (link, channel.id, e))
	log(message or link, site, [item[0] for item in channels])
	return success
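`getMatchedKey` evidently returns the list of blocklist keys found in the text, since the result is joined with spaces above. A minimal sketch under that assumption:
def getMatchedKey(text, keys):
    # Assumed behavior: return every key that occurs in the text.
    text = text or ''
    return [key for key in keys if key in text]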
Example #6
0
def decorate(text):
    if 'http' not in text:
        text = 'https://' + text
    text = getRawLink(text)
    if matchKey(text, [
            'www.douban.com/people/', 'twitter.com', 'facebook.com',
            'm.weibo.cn'
    ]):
        return '\n\n' + text
    return '\n\n【%s】 %s' % (export_to_telegraph.getTitle(text), text)
Example #7
0
def getLinkReplace(url, album):
    if 'telegra.ph' in url and 'douban.com/note/' in album.cap_html:
        return ''
    if 'telegra.ph' in url:
        soup = BeautifulSoup(cached_url.get(url, force_cache=True),
                             'html.parser')
        title = export_to_telegraph.getTitle(url)
        try:
            return '\n\n【%s】 %s' % (title,
                                    soup.find('address').find('a')['href'])
        except Exception:
            return ''
    return '\n\n' + url
Example #8
0
def getCnLink(link):
    link = getRawLink(link)
    if not link:
        return
    if not matchKey(
            link,
        ['douban.', 'thepaper', 'weixin', 'zhihu.', 'cnpoliti', 'weibo']):
        return
    link = clearUrl(link)
    # getTitle by default will cache
    if matchKey(export_to_telegraph.getTitle(link),
                ['链接已过期', '仅限男生', '男生350分', '常玮平', '【来自投稿】20世纪初', '做了套打拳入门']):
        return False
    return link
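Illustrative usage of getCnLink (both the bare `return` and the `return False` exits are falsy, so callers can filter uniformly; the URLs below are placeholders):
# Illustrative: keep only links that getCnLink accepts.
candidates = ['https://www.douban.com/note/123456/', 'https://example.com/post']
cn_links = [link for link in (getCnLink(x) for x in candidates) if link]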
Example #9
0
def loopImp():
    if not scheduled:
        for item in db.sub.subscriptions():
            scheduled.append(item)
        random.shuffle(scheduled)
    site = scheduled.pop()
    try:
        links = link_extractor.getLinks(site)
    except Exception as e:
        print('web_bot, getLinks fail', str(e), site)
        return
    for link in links:
        if not db.existing.add(link):
            continue
        title = ''.join(export_to_telegraph.getTitle(link).split())
        if not db.existing.add(title):
            continue
        sendLink(site, link)
Example #10
0
def gen(news_source='bbc',
        ebook_convert_app=ebook_convert_app,
        additional_setting='',
        filename_suffix=''):
    filename = '%s_%s新闻' % (date.today().strftime("%m%d"),
                            news_source.upper()) + filename_suffix

    os.system('rm -rf html_result')
    os.system('mkdir html_result > /dev/null 2>&1')

    links = []
    for link in findLinks(news_source):
        args = {}
        if 'twreporter.org/' in link:
            args['toSimplified'] = True
            limit = 7
        else:
            limit = 10
        name = export_to_telegraph.getTitle(link, **args)
        html = getArticleHtml(name, link, filename + '.html')
        if html:
            fn = cleanFileName(name)
            with open('html_result/%s.html' % fn, 'w') as f:
                f.write(html)
            links.append((name, fn))
            if len(links) >= limit:
                break

    index_html_name = 'html_result/%s.html' % filename
    with open(index_html_name, 'w') as f:
        f.write(getIndexHtml(news_source, links))

    os.system('mkdir pdf_result > /dev/null 2>&1')
    pdf_name = 'pdf_result/%s.pdf' % filename
    command = '%s %s %s %s > /dev/null 2>&1'
    os.system(
        command %
        (ebook_convert_app, index_html_name, pdf_name, additional_setting))
    return pdf_name
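Illustrative invocation (assumes `ebook_convert_app` points at a converter such as calibre's ebook-convert, matching the `app input output options` command layout above):
# Illustrative: build the BBC digest; writes html_result/ and pdf_result/
# under the current directory and returns the generated PDF's path.
pdf_path = gen('bbc')
print(pdf_path)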
Example #11
0
def testExport():
    for url in urls:
        print(export_to_telegraph.getTitle(url))