def export(update, context):
    """Export the message's linked article; ack with a temporary 'received' note.

    Skips edits and messages that already carry a '[source]' footer.  In the
    'web_record' channel, social-media links (unless marked article/note) are
    deleted instead of exported.
    """
    if update.edited_message or update.edited_channel_post:
        return
    msg = update.effective_message
    # Already-processed group/channel messages carry a '[source]' markdown link.
    if msg.chat_id < 0 and ('source' in msg.text) and ('[source]' in msg.text_markdown):
        return
    if msg.chat.username == 'web_record':
        if (matchKey(msg.text_markdown, ['twitter', 'weibo', 'douban', 't.me/'])
                and not matchKey(msg.text_markdown, ['article', 'note'])):
            tryDelete(msg)
            return
    try:
        r = msg.chat.send_message('received')
    except Exception:  # was bare except: don't swallow SystemExit/KeyboardInterrupt
        return
    try:
        exportImp(msg)
        if msg.chat.username == 'web_record':
            tryDelete(msg)
    except Exception as e:
        msg.chat.send_message(str(e))
        if not matchKey(str(e), ['Content is too big.']):
            # bare raise preserves the original traceback (raise e rewrote it)
            raise
    finally:
        r.delete()
def veryBadMsg(msg, has_similar_log):
    """Return True when a message looks like spam and should be acted on.

    Checks, in order: forwarded-channel title against known spam titles and a
    text-quality score, message text/caption quality, suspiciously short
    anonymous sender names, known spam phrases, character sequences, attached
    contacts, and finally the caller-supplied similarity flag.
    NOTE(review): assumes msg.from_user is set — a channel post would raise here.
    """
    if msg.forward_from_chat:
        # Forwards from channels whose title matches known spam-channel names.
        if matchKey(msg.forward_from_chat.title, [
                '新闻频道', '新闻网', '我的频道', '点我有惊喜', '引流推广',
                '自由之声🌈', '业务咨询', '大家好', '信息']):
            return True
        # badTextScore returns a tuple; element 0 is the verdict.
        if badTextScore(msg.forward_from_chat.title)[0]:
            return True
    if badText(msg.text):
        return True
    # No last name, no username, and a very short first name: throwaway account.
    if ((not msg.from_user.last_name) and (not msg.from_user.username)
            and len(msg.from_user.first_name) <= 3):
        return True
    if badText(msg.caption):
        return True
    # Literal spam phrases seen in the wild.
    if matchKey(msg.text, [
            '群发需要滴滴', 'joinchat', 'Pua把妹教程五百多套', '前一百名加我的兄弟',
            '联系我', '+v', 'Louisville_fx', 'LY11618', '滴我', 'Binance']):
        return True
    # hasSeq matches character sequences rather than whole substrings.
    if hasSeq(msg.text, ['原生话费收粮', '啦裙發']):
        return True
    if msg.contact:  # unsolicited contact cards are spam
        return True
    if has_similar_log:  # caller saw a near-duplicate recently
        return True
    return False
def _formatImgUrl(raw, domain):
    """Normalize an image URL to a large size and an absolute scheme/host.

    Rewrites width markers to 1300 for known CDNs (skipping Guardian 'guim'
    URLs), upgrades twreport '-tiny' images to '-desktop', and resolves
    protocol-relative ('//') and root-relative ('/') URLs against *domain*.
    """
    parts = raw.split('/')
    success = False
    for index, part in enumerate(parts):
        if part == 'max':
            # Fix bare except: only a missing or non-numeric next segment is expected.
            try:
                if int(parts[index + 1]) > 0:
                    success = True
                    break
            except (ValueError, IndexError):
                pass
    if success and 'guim' not in raw:
        parts[index + 1] = '1300'
        raw = '/'.join(parts)
    # Fix: regex patterns were non-raw strings; '\d' is an invalid escape
    # sequence (SyntaxWarning on modern Python). Use raw strings.
    if not matchKey(raw, ['guim']):
        raw = re.sub(r'width=\d\d*', 'width=1300', raw)
    if matchKey(raw, ['condecdn']):
        raw = re.sub(r'/\d\d*/', '/1300/', raw)
    if matchKey(raw, ['twreport']) and matchKey(raw, ['-tiny']):
        raw = raw.replace('-tiny', '-desktop')
    if raw.startswith('//'):
        return 'https:' + raw
    if raw.startswith('/'):
        return domain + raw
    return raw
def _decompseAds(soup):
    """Remove known advertisement blocks from the parsed document in place."""
    # Article paragraphs whose text matches the div-level ad word list.
    for block in soup.find_all("div", class_="article-paragraph"):
        if matchKey(block.text, DIV_AD_WORDS):
            _decompose(block)
    # Plain paragraphs matching the p-level ad words, or the literal '广告' marker.
    for paragraph in soup.find_all("p"):
        is_ad = matchKey(paragraph.text, P_AD_WORDS) or paragraph.text in ['广告']
        if is_ad:
            _decompose(paragraph)
def export(update, context):
    """Export the message's linked article, log the outcome, clean up acks.

    Variant used by the bot that records results to `info_log` and optionally
    deletes the origin message for chats opted into `remove_origin`.
    """
    if update.edited_message or update.edited_channel_post:
        return
    msg = update.effective_message
    # Already-processed messages carry a trailing 'source</a>' anchor.
    if msg.chat_id < 0 and 'source</a>' in msg.text_html_urled:
        return
    if msg.chat.username == 'web_record':
        # Skip social-media links unless explicitly marked article/note.
        if (matchKey(msg.text_markdown, ['twitter', 'weibo', 'douban', 't.me/'])
                and not matchKey(msg.text_markdown, ['article', 'note'])):
            return
    try:
        tmp_msg_1 = msg.chat.send_message('received')
    except Exception:  # was bare except: don't swallow SystemExit/KeyboardInterrupt
        return
    error = ''
    result = []
    try:
        result = list(exportImp(msg))
        if str(msg.chat.id) in remove_origin._db.items:
            tryDelete(msg)
    except Exception as e:
        tmp_msg_2 = msg.chat.send_message(str(e))
        autoDestroy(tmp_msg_2, 0.05)
        error = ' error: ' + str(e)
    finally:
        info_log.send_message(
            getBasicLog(msg) + error + ' result: ' + ' '.join(result),
            parse_mode='html', disable_web_page_preview=True)
        tmp_msg_1.delete()
def command(update, context):
    """Dispatch a private-message bot command to its handler."""
    msg = update.message
    if matchKey(msg.text, ['auth', 'token']):
        return msgTelegraphToken(msg)
    if matchKey(msg.text, ['toggle', 'source']):
        return toggleSourceLink(msg)
    # Positive chat id means a private chat; fall back to the help text there.
    if msg.chat_id > 0:
        msg.reply_text(help_message)
def commandInternal(msg):
    """Handle blocklist admin commands; return the reply text when one matches."""
    command, argument = splitCommand(msg.text)
    if matchKey(command, ['/abl', '/d_ba', 'blocklist_ba']):
        return blocklist.add(argument)
    if matchKey(command, ['/d_br', 'blocklist_br']):
        return blocklist.remove(argument)
    if matchKey(command, ['/d_bl', 'blocklist_list']):
        return 'blocklist:\n' + '\n'.join(blocklist.items())
def shouldSend(card):
    """Decide whether a weibo card clears its popularity threshold.

    White/black lists short-circuit; prefer/popular lists adjust the like-count
    bar (300 / 10000); everything else needs more than 1000.
    """
    content = str(card)
    if matchKey(content, db.whitelist.items):
        return True
    if matchKey(content, db.blacklist.items):
        return False
    if matchKey(content, db.preferlist.items):
        return getCount(card.get('mblog')) > 300
    if matchKey(content, db.popularlist.items):
        return getCount(card.get('mblog')) > 10000
    return getCount(card.get('mblog')) > 1000
def shouldFlipFirst(key):
    """Decide whether the item at *key* should be flipped before others.

    Channel level -1 never flips, levels 0..2 always flip; otherwise short
    items without files/links are skipped, and blocklisted content is skipped.
    """
    channel = key.split('/')[0]
    level = channels.get(channel)
    if level == -1:
        return False
    # Fix: with an unknown channel, `0 <= None <= 2` raised TypeError;
    # guard the None before the chained comparison.
    if level is not None and 0 <= level <= 2:
        return True
    content = index.get(key)
    if len(content) < 20 and not matchKey(content, ['hasFile', 'hasLink']):
        return False
    return not matchKey(content, blocklist.items())
def command(update, context):
    """Dispatch a bot command from either a direct message or a channel post."""
    msg = update.message or update.channel_post
    if matchKey(msg.text, ['auth', 'token']):
        return msgTelegraphToken(msg)
    if matchKey(msg.text, ['source', 'tnsl', 'toggle_no_source_link']):
        return toggleSourceLink(msg)
    if matchKey(msg.text, ['origin', 'trmo', 'toggle_remove_origin']):
        return toggleRemoveOrigin(msg)
    # Positive chat id means a private chat; reply with the help text there.
    if msg.chat_id > 0:
        msg.reply_text(help_message)
def command(update, context):
    """Route a private-chat bot command to the matching toggle/handler."""
    msg = update.message
    if matchKey(msg.text, ['auth', 'token']):
        return get_telegraph_token(msg)
    if matchKey(msg.text, ['source']):
        return switch_source_flag(msg)
    if matchKey(msg.text, ['simplify']):
        return switch_simplify_flag(msg)
    if msg.chat_id > 0:  # from private
        msg.reply_text(help_message)
def getUrl(msg):
    """Return the first http(s) link found in the message's HTML, if any.

    Skips messages that already carry a 'source</a>' footer, and (in the
    'web_record' channel) links that are already weixin/telegraph exports.
    """
    if matchKey(msg.text_html_urled, ['source</a>']):
        return
    if (matchKey(msg.text_html_urled, ['mp.weixin.qq.com', 'telegra.ph'])
            and msg.chat.username == 'web_record'):
        return
    soup = BeautifulSoup(msg.text_html_urled, 'html.parser')
    for item in soup.find_all('a'):
        href = item.get('href')
        # Fix: <a> tags without an href yield None; the original
        # `'http' in item.get('href')` raised TypeError on them.
        if href and 'http' in href:
            return href
def wantSee(item, page):
    """Return True when a douban item attracts enough engagement to repost."""
    content = str(item)
    if matchKey(content, ['people/gyz', '4898454']):
        return True
    if matchKey(content, blocklist.items()):
        return False
    threshold = 120 + page
    if 'people/renjiananhuo' in str(item.parent):
        # This author is so popular that everything gets likes — raise the bar.
        threshold *= 4
    engagement = sum(list(dataCount(item))[:3])
    return engagement > threshold
def getDoubanId(link):
    """Extract the numeric id from a douban note/topic/status/album link.

    Returns None for list pages ('notes'/'statuses'), non-http text, or links
    of other kinds.
    """
    if not matchKey(link, ['note', 'group/topic', 'status', 'album']):
        return
    if matchKey(link, ['notes', 'statuses']):  # plural forms are list pages
        return
    if 'http' not in link:
        return
    # The id is the first purely-numeric path segment (last segment excluded).
    for segment in link.split('/')[:-1]:
        try:
            int(segment)
        except ValueError:
            continue
        return segment
def getCnLink(link):
    """Return a cleaned link for supported Chinese sites, or None/False.

    None means "not applicable"; False means the target page's title matched a
    known-bad list (expired/blocked content).
    """
    link = getRawLink(link)
    if not link:
        return
    supported_sites = ['douban.', 'thepaper', 'weixin', 'zhihu.',
                       'cnpoliti', 'weibo']
    if not matchKey(link, supported_sites):
        return
    link = clearUrl(link)
    # getTitle by default will cache
    blocked_titles = ['链接已过期', '仅限男生', '男生350分', '常玮平',
                      '【来自投稿】20世纪初', '做了套打拳入门']
    if matchKey(export_to_telegraph.getTitle(link), blocked_titles):
        return False
    return link
def _findOrgName(soup):
    """Return (organization name, is_news_org) inferred from the page head.

    Known domains map to fixed names; the boolean flags hard-news outlets.
    Falls back to scanning the soup for a plausible org item, else 'Source'.
    """
    head = str(soup.find('head'))
    if matchKey(head, ['bbc.com']):
        return 'BBC', True
    if matchKey(head, ['nyt.com', 'new york times']):
        return 'NYT', True
    if matchKey(head, ['stackoverflow']):
        return 'StackOverflow', False
    if matchKey(head, ['medium.com']):
        return 'Medium', False
    # Fix: the key was misspelled 'dw.come', which can never occur in a DW
    # page head, so the DW branch was dead code.
    if matchKey(head, ['dw.com']):
        return 'DW', True
    r = _findPossibleRawContent(_yieldPossibleOrgItem(soup))
    if r:
        return r, False
    return 'Source', False
def log(url, card, key, channels, sent):
    """Send an HTML-formatted record of a processed weibo card to the logger.

    Skips low-engagement cards (< 20) and cards whose hash was already logged.
    Best-effort: a failed send is printed and rate-limited, never raised.
    """
    if weiboo.getCount(card) < 20:
        return
    # log_existing.add returns falsy when the hash was already present (dedup).
    whash = weiboo.getHash(card)
    if not log_existing.add(whash):
        return
    additional_info = weibo_2_album.getAdditionalInfo(card['mblog'])
    if additional_info:
        additional_info += ' '
    # Keep the link preview only when the card actually has media markers.
    disable_web_page_preview = not matchKey(additional_info, ['imgs:', 'video:'])
    if sent:
        sent = ' weibo_bot_sent'
    else:
        sent = ''
    # Mark cards that reached none of the core channels.
    if set([channel.id for channel in channels]) & core_channels_ids:
        mark = ''
    else:
        mark = ' weibo_channel_ignore'
    message = '%s\n\n%skey: %s channel_id: %s %s%s%s %s <a href="%s">source</a>' % (
        weibo_2_album.getCap(card['mblog']), additional_info, key,
        ' '.join([str(channel.id) for channel in channels]),
        getChannelsLog(channels), sent, mark, url, url)
    try:
        logger.send_message(message, parse_mode='html',
                            disable_web_page_preview=disable_web_page_preview)
    except Exception as e:
        print('log failed', str(e), message)
        time.sleep(5)  # back off; likely hit telegram rate limiting
def cleanupCap(text):
    """Clean a weibo caption: keep text, <br> and useful <a>; normalize blanks.

    Anchors to search/profile pages and @-mentions are flattened to plain
    text; other tags (bold, spans, ...) are dropped entirely.  Single newlines
    are doubled, then runs of 3+ newlines collapsed back to exactly two.
    """
    text = getPrintableForProd(text).strip()
    soup = BeautifulSoup(text, features="html.parser")
    result = []
    for item in soup:
        if item.name == 'br':
            result.append('\n')
            continue
        # NavigableString (plain text) has name == None; keep it verbatim.
        if item.name == None:
            result.append(str(item))
            continue
        if item.name == 'a':
            link = item.get('href')
            if urlShouldRemove(link):
                continue
            # Flatten anchors without a target, internal weibo pages,
            # and @-mentions to their text.
            if (not link or matchKey(
                    link, ['weibo.cn/p', 'weibo.cn/search', 'weibo.com/show'])
                    or item.text[:1] == '@'):
                result.append(item.text)
                continue
            result.append('<a href="%s">%s</a>' % (link, item.text))
        # Any other tag falls through and is discarded.
    text = ''.join(result).strip()
    text = text.replace('\n', '\n\n')
    # Five passes suffice to collapse any realistic run of blank lines.
    for _ in range(5):
        text = text.replace('\n\n\n', '\n\n')
    return text.strip()
def handleCommand(update, context):
    """Handle dating-bot commands: search, start, question, update, preview.

    Every incoming message is forwarded to the debug group first.  'update'
    deliberately falls through to 'preview'/help when the profile is complete.
    """
    global ban
    usr = update.effective_user
    msg = update.effective_message
    msg.forward(debug_group.id)
    # From here on `usr` is the username string, not the User object.
    usr = usr.username
    command, text = splitCommand(msg.text)
    if matchKey(command, ['get', 'search']):
        keys = text.split()
        # Candidates: every stored user whose raw profile matches all keywords,
        # excluding the requester and banned users; shuffled for fairness.
        usrs = [x for x in db.usrs() if matchAll(db.getRaw(x), keys)]
        usrs = [x for x in usrs if x != usr and x not in ban]
        random.shuffle(usrs)
        if usr != test_usr:  # the test account sees unlimited results
            usrs = usrs[:LIMIT]
        if not usrs:
            return msg.reply_text(strings['e4'])
        for x in usrs:
            sendUsr(x, msg)
        return
    # All remaining commands require a username.
    if not usr:
        return msg.reply_text(strings['e0'])
    if 'start' in command:
        msg.reply_text(strings['h1'])
        return askNext(usr, msg)
    if 'question' in command:
        for q in db.questions:
            msg.reply_text(strings['q' + q])
        return
    if 'update' in command:
        db.save(usr, 'key', text)
        if not checkProfileFinish(usr, msg):
            return
        # Profile complete: intentionally fall through to the help reply below.
    if 'preview' in command:
        sendUsr(usr, msg)
        return
    msg.reply_text(strings['h2'])
def processSite(site):
    """Fetch links from *site* and send unseen ones, up to a per-site limit.

    Both the link and its whitespace-stripped title are used for dedup and
    recorded after sending.  Processing stops at the limit or the first
    failed send.
    """
    try:
        links = link_extractor.getLinks(site)
    except Exception as e:
        print('web_bot, getLinks fail', str(e), site)
        return
    count = 0
    # douban personal pages are exempt from the strict limit.
    if 'douban' in site and 'people' not in site:
        limit = 1  # may change later
    elif matchKey(site, ['tempo', 'kompas', 'nature.com']):
        limit = 1
    else:
        limit = 20
    for link in links:
        if db.existing.contain(link):
            continue
        # Dedup also by title with all whitespace removed.
        title = ''.join(export_to_telegraph.getTitle(link).split())
        if db.existing.contain(title):
            continue
        success = sendLink(site, link)
        db.existing.add(link)
        db.existing.add(title)
        count += 1
        # A failed send also terminates this site's run.
        if (not success) or count >= limit:
            return
def _isOffTopic(attrs):
    """Return the first off-topic marker found in a tag's attributes, else None.

    Link-ish attributes (href/src/url/...) are excluded from the scan, except
    'data-component' which is always kept.
    """
    if not attrs:
        return
    kept = []
    for name, value in attrs.items():
        skip = name != 'data-component' and matchKey(
            name, ['href', 'src', 'url', 'alt', 'data', 'xmlns:fb'])
        if skip:
            continue
        kept.append(str(name) + ' : ' + str(value))
    blob = '\n'.join(kept)
    for marker in OFFTOPIC_ATT:
        if marker in blob:
            return marker
    # Markers that only count when none of their exception words appear.
    for marker in OFFTOPIC_ATT_WITH_EXCEPTION:
        if marker in blob and not matchKey(
                blob, OFFTOPIC_ATT_WITH_EXCEPTION[marker]):
            return marker
    return
def getShortLink(link):
    """Return a shortened link for Chinese-titled pages; None otherwise.

    Social links are skipped outright.  Prefers shortening via the raw link;
    falls back to the original link when only its own title is Chinese.
    """
    if matchKey(link, ['weibo.', 'twitter.', 't.me/']):
        return
    raw_link = getRawLink(link)
    if isCN(export_to_telegraph.getTitle(raw_link)):
        return shorter(raw_link, link)
    if isCN(export_to_telegraph.getTitle(link)):
        return link
def getVideo(b):
    """Return the src of the first <video> whose grandparent id is video_info-like."""
    for video in b.find_all('video'):
        parent = video.parent
        if not parent or not parent.parent:
            continue
        wrapper = parent.parent
        # Only videos wrapped two levels inside a 'video_info' element count.
        if matchKey(str(wrapper.get('id')), ['video_info']):
            return video['src']
def search(keywords):
    """Walk every feed in the pool, printing the root url of pages that match.

    Each feed advances to its next position after being checked; the loop
    continues while the pool still has entries.
    """
    pool = Pool()
    while pool.pool:
        for name, pos in pool.items():
            if matchKey(getSoup(name, pos).text, keywords):
                # Fix: `post` was an undefined name (NameError at runtime);
                # the loop variable `pos` was intended.
                print(getRootUrl(name, pos))
            next_pos = getNextPos(name, pos)
            pool.update(name, next_pos)
def shouldKick(self, user):
    """Return True when *user* trips the configured name-based kick rules."""
    limit = self.kick_if_name_longer_than
    if limit:
        # Combined first+last name length against the configured maximum.
        name_length = len(user.first_name or '') + len(user.last_name or '')
        if name_length > limit:
            return True
    # bool() keeps the strict True/False contract of the original.
    return bool(matchKey(getDisplayUser(user), self.kick_if_name_contains))
def remindIfNecessary(msg):
    """Reply with gentle etiquette reminders for problematic wording.

    Each reminder is sent as a self-destructing reply (10 time units).
    Checks are independent: one message can trigger several reminders.
    """
    if not msg.text:
        return
    # Strongly judgmental words — unless the message is quoting (in `quotes`).
    if matchKey(msg.text, better_avoid_words) and not matchKey(msg.text, quotes):
        reminder = '建议避免使用带有强烈主观判断的词哦,比如:' + ', '.join(better_avoid_words) + \
            '。 谢谢啦!'
        autoDestroy(msg.reply_text(reminder), 10)
    # Emotional wording, or 3+ question marks (full- or half-width).
    emotional_words = ['意淫', '凭什么']
    if matchKey(
            msg.text, emotional_words) or msg.text.count('?') + msg.text.count('?') >= 3:
        reminder = '反问,反讽不利于友好交流哦,建议您换成大家更容易理解的表达哦。谢谢啦!'
        autoDestroy(msg.reply_text(reminder), 10)
    # Dismissive/attacking wording.
    attacking_words = ['太low']
    if matchKey(msg.text, attacking_words):
        reminder = '请友好交流,争取互相理解。谢谢啦!'
        autoDestroy(msg.reply_text(reminder), 10)
def decorate(text):
    """Format a link for posting: bare for social sites, titled otherwise."""
    if 'http' not in text:
        text = 'https://' + text
    text = getRawLink(text)
    social_sites = [
        'www.douban.com/people/', 'twitter.com', 'facebook.com', 'm.weibo.cn'
    ]
    if matchKey(text, social_sites):
        return '\n\n' + text
    return '\n\n【%s】 %s' % (export_to_telegraph.getTitle(text), text)
def getSrc(img):
    """Return the image src when its grandparent has a picture-gallery class.

    Returns None for images without src, without two ancestor levels, or whose
    wrapper class doesn't match the known gallery classes.
    """
    src = img.get('src') and img.get('src').strip()
    if not src:
        return
    if not img.parent or not img.parent.parent:
        return
    wrapper = img.parent.parent
    # Fix: the parenthesis was misplaced — `str(wrapper.get('class')) or ''`
    # makes a missing class the truthy string 'None', so the `or ''` fallback
    # was dead code.  Apply the fallback before str().
    if matchKey(str(wrapper.get('class') or ''), ['f-m-img', 'group-pic']):
        return src
    return
def getMatches(text):
    """Resolve a query into a list of candidate user identifiers."""
    if not text:
        return []
    looks_like_handle = matchKey(text, ['/', '@']) or isInt(text)
    if looks_like_handle:
        user_id = getUserId(text)
        if user_id and isInt(text):
            return [user_id]
        if user_id:
            # Keep the raw text too so callers can match either form.
            return [user_id, text]
    return [text]
def handleCommand(update, context, dbs):
    """Handle subscription-bot commands; admin-only past the s3_* commands.

    The incoming command message itself is scheduled for quick destruction.
    """
    msg = update.effective_message
    autoDestroy(msg, 0.1)
    if msg.from_user and matchKey(msg.from_user.first_name, ['telegram']):
        # don't deal with group message auto forwarded linked channel
        return
    command, text = splitCommand(msg.text)
    if 's3_l' in command:
        # List this chat's subscriptions as "index: chat" lines.
        subscriptions = dbs.getList(msg.chat_id)
        subscriptions = [str(index) + ': ' + \
            formatChat(context.bot, x['id']) for \
            index, x in enumerate(subscriptions)]
        r = msg.reply_text('subscription list: \n\n' + '\n'.join(subscriptions),
            quote=False, parse_mode='Markdown', disable_web_page_preview=True)
        autoDestroy(r)
        return
    if 's3_un' in command:
        # Unsubscribe by numeric index from the s3_l listing.
        try:
            index = int(text)
        except:
            r = msg.reply_text('please give index')
            autoDestroy(r)
            return
        r = dbs.deleteIndex(msg.chat_id, index)
        autoDestroy(msg.reply_text(r, quote=False))
        return
    if 's3_s' in command:
        # Subscribe this chat to the chat named in `text`.
        chat = getChat(context.bot, text)
        if not chat:
            return
        r = dbs.add(msg.chat, chat)
        autoDestroy(msg.reply_text(r, quote=False))
        return
    # Everything below is restricted to the debug group account.
    if not msg.from_user or msg.from_user.id != debug_group.id:
        return
    if 'repeat' in command:
        # Re-send the replied-to message as markdown without a preview.
        msg.bot.send_message(msg.chat.id, msg.reply_to_message.text_markdown,
            parse_mode='Markdown', disable_web_page_preview=True)
    # guard this feature
    # if 'all' in command:
    #     sendAll(msg, dbs)
    #     return
    if 'delete' in command:
        # Delete every recorded forward of the replied-to message, best-effort.
        to_delete = msg.reply_to_message
        key = (to_delete.chat_id, to_delete.message_id)
        for r in forward_all_record[key]:
            try:
                r.delete()
            except:
                pass