def processCard(card):
    """Send one card to the channel unless it was already handled.

    The card is dropped early when ``shouldSend(card)`` is falsy, when its
    cleaned ``card['scheme']`` URL is already in ``db.existing.items``, or
    when the stringified ``wid``, ``rwid`` or ``hash`` of the fetched album
    is already recorded there.

    Returns None in every case; the work is done through side effects on
    ``cache``, ``db.existing`` and ``album_sender``.
    """
    if not shouldSend(card):
        return

    url = clearUrl(card['scheme'])
    seen = db.existing
    if url in seen.items:
        return

    album = weibo_2_album.get(url)
    print('hash', album.hash)
    # Checked lazily and in the order wid, rwid, hash.
    if any(str(getattr(album, field)) in seen.items
           for field in ('wid', 'rwid', 'hash')):
        return

    print('sending', url, album.wid, album.rwid)
    timer.wait(10)

    # Count how many times this hash has reached the sending stage.
    attempts = cache.get(album.hash, 0) + 1
    cache[album.hash] = attempts
    if attempts > 2:
        # For whatever reason this url does not send to telegram: record the
        # hash before the send below so it is skipped on later passes even
        # if the send raises again.
        seen.add(album.hash)

    album_sender.send(channel, url, album)

    # Only reached when the send did not raise: remember every identifier.
    for key in (url, album.wid, album.rwid, album.hash):
        seen.add(key)
def process(url):
    """Load the card list found at *url* and run processCard on each card.

    The content returned by ``sg.getContent(url)`` is parsed as YAML and
    every entry of ``content['data']['cards']`` is processed independently:
    an exception raised for one card is reported to ``debug_group`` and the
    loop moves on to the next card.
    """
    content = sg.getContent(url)
    # NOTE(review): yaml.load with FullLoader is kept as-is; if this content
    # is not fully trusted, yaml.safe_load is the documented safer choice.
    content = yaml.load(content, Loader=yaml.FullLoader)
    for card in content['data']['cards']:
        try:
            processCard(card)
        except Exception as e:
            # Building the report label must not raise itself (e.g. the card
            # has no 'scheme', or clearUrl fails / returns nothing);
            # otherwise one bad card would abort all remaining cards.
            try:
                label = clearUrl(card['scheme'])
            except Exception:
                label = None
            if not isinstance(label, str):
                label = '(unknown card)'
            debug_group.send_message(label + '\n' + str(e))
Example #3
0
def test2():
    """Print what clearUrl returns for a handful of sample inputs.

    The samples are long share links carrying fragments and tracking query
    parameters, plus a trailing None.
    """
    samples = (
        'https://www.lgbtqnation.com/2020/07/trump-administration-memo-explains-spot-transgender-woman/#.Xxhk4Z7ztRQ.wechat',
        'http://mp.weixin.qq.com/s?__biz=MzUxMzAzMzk5Ng==&mid=2247484807&idx=1&sn=101bde2cf3bfbee2dcbd1e842475e06a&chksm=f95a17e4ce2d9ef27a9dbdd6c59c831b45caa8658ac566cc2d3681216c83b0f9e28a7ed0dc3f&mpshare=1&scene=1&srcid=0723RKjQqOqqKU0gQFh4vs4S&sharer_sharetime=1595435039677&sharer_shareid=f467668849c8544e583567bf8a259f31#rd',
        'https://mp.weixin.qq.com/s?__biz=MzA5MDM1MTcyNQ==&mid=2657277726&idx=1&sn=613c0be79aebcd1ba714cbbbae64f66c&chksm=8b9a861cbced0f0af93ef107a5e6fc885431024ff0316948f75cc6738276f33cc3e92dd25661&mpshare=1&scene=1&srcid=0722lacFTcoG3jda9bA9JOoK&sharer_sharetime=1595402746390&sharer_shareid=a468f7684ed03b370e7298eb88d56e49#rd',
        'https://mp.weixin.qq.com/s/DzSn0oX7nctjnsHRM1DpKQ?url=https%3A%2F%2Fmp.weixin.qq.com%2Fs%2FDzSn0oX7nctjnsHRM1DpKQ&share_menu=1&sinainternalbrowser=topnav&mid=4532409046147275&luicode=10000011&lfid=100103type%3D1%26q%3D%E7%90%86%E8%AE%BA&u=https%3A%2F%2Fmp.weixin.qq.com%2Fs%2FDzSn0oX7nctjnsHRM1DpKQ',
        'http://www.infzm.com/wap/#/content/198828',
        None,
    )
    for sample in samples:
        print(clearUrl(sample))
Example #4
0
def getUrl(item, post_link):
    """Pick the link attached to *item* and return its exported form.

    Two sources are tried in order:

    1. The ``data-url`` of a ``div.note-block`` inside *item*, or
       *post_link* itself when it matches one of the douban review/note
       prefixes. The chosen link is exported with ``force=True``; if the
       export yields nothing the link is returned unchanged.
    2. The ``href`` of the first ``<a>`` inside a ``div.url-block``. It is
       exported without force and the result (or the raw href) is passed
       through clearUrl.

    Returns None when neither source applies.
    """
    note_block = item.find('div', class_='note-block')
    data_url = note_block and note_block.get('data-url')
    douban_prefixes = ['https://book.douban.com/review/',
                       'https://www.douban.com/note/']
    if data_url or matchKey(post_link, douban_prefixes):
        target = data_url or post_link
        return export_to_telegraph.export(target, force=True) or target

    url_block = item.find('div', class_='url-block')
    if not url_block:
        return None
    href = url_block.find('a')['href']
    exported = export_to_telegraph.export(href)
    return clearUrl(exported or href)
Example #5
0
def getCnLink(link):
    """Return the cleaned form of *link* when it passes the site/title checks.

    Return values:
      * None  - getRawLink produced nothing, or the raw link matches none
                of the accepted site keys;
      * False - the title fetched for the cleaned link matches one of the
                rejected title keys;
      * the cleaned link otherwise.
    """
    raw = getRawLink(link)
    if not raw:
        return

    accepted_sites = ['douban.', 'thepaper', 'weixin', 'zhihu.', 'cnpoliti',
                      'weibo']
    if not matchKey(raw, accepted_sites):
        return

    cleaned = clearUrl(raw)
    rejected_titles = ['链接已过期', '仅限男生', '男生350分', '常玮平',
                       '【来自投稿】20世纪初', '做了套打拳入门']
    # Per the original author's note, getTitle caches by default.
    title = export_to_telegraph.getTitle(cleaned)
    if matchKey(title, rejected_titles):
        return False
    return cleaned
Example #6
0
def exportAllInText(soup):
    """Render *soup* as markdown-escaped plain text with its links expanded.

    ``<br/>`` tags become newlines and the markup is reduced to stripped
    text. Then:

    * for every ``<a>`` selected with ``title=True, href=True``, each
      occurrence of its ``href`` in the text is replaced by the exported
      (or original) cleaned ``title`` URL, padded with spaces; URLs
      containing ``_`` are written as ``[url](url)``;
    * for every ``<a>`` selected with ``title=False, href=True`` whose text
      equals its ``href``, the link text is padded with spaces.

    Returns '' for a falsy *soup*, otherwise the result of escapeMarkdown.
    """
    if not soup:
        return ''
    text = str(soup).replace('<br/>', '\n')
    quote = BeautifulSoup(text, features='lxml').text.strip()
    for link in soup.find_all('a', title=True, href=True):
        href = link['href']
        if not href:
            # str.replace('', x) inserts x between every character, which
            # would wreck the whole text; there is nothing to substitute.
            continue
        url = link['title']
        url = clearUrl(export(url) or url)
        if '_' in url:
            url = '[%s](%s)' % (url, url)
        quote = quote.replace(href, ' ' + url + ' ')
    for link in soup.find_all('a', title=False, href=True):
        label = link.text
        # Same empty-string hazard as above, hence the truthiness check.
        if label and link['href'] == label:
            # Strings are immutable: the result has to be assigned back,
            # otherwise the padding is silently lost.
            quote = quote.replace(label, ' ' + label + ' ')
    return escapeMarkdown(quote)
def export(url,
           throw_exception=False,
           force=False,
           toSimplified=False,
           force_cache=False,
           noSourceLink=False,
           noAutoConvert=False):
    """Post the article behind *url* through a poster and return its URL.

    Args:
        url: link to export; it is passed through clearUrl first.
        throw_exception: forwarded to getArticle, and when truthy any
            exception raised here is re-raised instead of being swallowed.
        force: skip the isConfidentUrl / _isEditable / isConfident checks.
        toSimplified, force_cache, noAutoConvert: forwarded to getArticle.
        noSourceLink: forwarded to getAuthorField and getAuthorUrl.

    Returns:
        * None when not forced and isConfidentUrl rejects the URL;
        * the cleaned URL itself when not forced and _isEditable accepts it;
        * ``_trimUrl(r['url'])`` of the post result when forced or when
          isConfident accepts the article;
        * None otherwise, including when an exception was swallowed.
    """
    placeholder = '<div>TO BE ADDED</div>'

    def publish(poster, body):
        # Single place that assembles the post() arguments; `url` and
        # `article` are read from the enclosing scope at call time.
        return poster.post(
            title=article.title,
            author=getAuthorField(article.author, noSourceLink),
            author_url=getAuthorUrl(article, url, noSourceLink),
            text=body)

    try:
        url = clearUrl(url)
        if not (force or isConfidentUrl(url)):
            return None
        poster = _getPoster()
        if not force and _isEditable(poster, url):
            return url

        article = getArticle(url,
                             throw_exception,
                             toSimplified=toSimplified,
                             force_cache=force_cache,
                             noAutoConvert=noAutoConvert)
        # Substitute the placeholder when there is no usable body text.
        if not (article.text and article.text.text.strip()):
            article.text = placeholder

        try:
            r = publish(poster, str(article.text))
        except Exception as e:
            reason = str(e)
            if 'CONTENT_TEXT_REQUIRED' in reason:
                # Retry the same poster with the placeholder body.
                r = publish(poster, placeholder)
            elif 'ACCESS_TOKEN_INVALID' in reason:
                # Retry the original body with a fresh TelegraphPoster.
                r = publish(TelegraphPoster(), str(article.text))
            else:
                raise

        if force or isConfident(url, article.text):
            return _trimUrl(r['url'])
    except Exception:
        # Best-effort by default: failures yield None unless the caller
        # asked for them to propagate.
        if throw_exception:
            raise
    return None