def is_spammer(self, chat_id, msg_date, text):
    """The user is not yet allowed to post links (because of their join
    date or message count) and is still trying to.

    The user is allowed to post URLs if:
      a) they are a group admin,
      b) they did not join recently, i.e. they joined more than
         INIT_TIME_ALLOW_URLS minutes ago, or
      c) they have posted more than INIT_MIN_MSG_ALLOW_URLS messages.
    """
    if self.is_admin or self.is_verified:
        return False
    chat_config = Config.get(chat_id=chat_id)
    # Let's check for URLs
    extractor = URLExtract()
    any_url = extractor.has_urls(text)
    if not any_url:
        # No URL posted, so give the user the benefit of the doubt
        return False
    # OK, the user has posted URLs.
    # Check if allowed by time in group or number of posted messages.
    user_hours_in_group = (msg_date - self.join_date).total_seconds() // 3600
    return (user_hours_in_group < chat_config.time_for_allow_urls) or (
        self.num_messages < chat_config.num_messages_for_allow_urls)
def applyYaml(self, path):
    yamlDict = yaml.safe_load(open(path))
    # ensure type of loaded config
    for k, v in yamlDict.items():
        # check if key actually exists inside of our config
        if k not in self.__dict__:
            raise AttributeError(
                "Error in config: '%s'\n'%s' object has no attribute '%s'"
                % (path, self.__class__.__name__, k))
        # prepare type check
        t1, v1, t2, v2 = type(v), v, type(getattr(self, k)), getattr(self, k)
        if t1 != t2:
            raise ValueError(
                "Configuration parameter '%s' has failed type check! "
                "'%s'<'%s'> should be '%s'<'%s'>" % (k, v1, t1, v2, t2))
    # special evaluations
    if (self.max_payout_per_run <
            self.btc_per_transaction * self.number_payout_contributors_per_run):
        raise ValueError(
            "The specified payout amount (self.btc_per_transaction * "
            "self.number_payout_contributors_per_run) exceeds the maximum "
            "payout (max_payout_per_run)")
    self.apply(yamlDict)
    # block url in note
    extractor = URLExtract()
    if extractor.has_urls(self.email_note):
        raise ValueError("Using URLs in note not possible")
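# A minimal, self-contained sketch of the same YAML type-check pattern.
# The Config class, field names, and config file here are hypothetical, not
# the original project's; the only assumption is that PyYAML is installed.
import yaml

class Config:
    # Defaults double as the type schema for the check below.
    max_payout_per_run = 0.5     # float
    btc_per_transaction = 0.001  # float
    email_note = ""              # str

    def apply_yaml(self, path):
        with open(path) as f:
            loaded = yaml.safe_load(f)
        for k, v in loaded.items():
            expected = type(getattr(self, k))  # raises AttributeError on unknown keys
            if not isinstance(v, expected):
                raise ValueError("'%s' is %s, expected %s"
                                 % (k, type(v).__name__, expected.__name__))
            setattr(self, k, v)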
def replace_url(s):
    extractor = URLExtract()
    if extractor.has_urls(s):
        urls = extractor.find_urls(s, only_unique=True)
        for url in urls:
            s = s.replace(url, "<url>")
    return s
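# A quick usage check for replace_url above (hypothetical input string; the
# output assumes URLExtract's TLD list recognizes example.com):
print(replace_url("docs at https://example.com/guide and https://example.com/guide"))
# -> "docs at <url> and <url>"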
def read_stream_from_assia_tv(self, response, event_url, event_name, event_date):
    scripts = response.css("script")
    extractor = URLExtract()
    for s in scripts:
        text = s.get()
        if extractor.has_urls(text):
            for url in extractor.gen_urls(text):
                if "video.assia.tv" in url:
                    self.logger.info(
                        "#read_stream_from_assia_tv - found video stream url %s!" % (url))
                    if "m3u8" in url:
def message(update: Update, context: CallbackContext) -> None:
    extractor = URLExtract()
    if extractor.has_urls(update.message.text):
        result_text = update.message.text
        for url in extractor.gen_urls(update.message.text):
            print(f"Url found: {url}")
            unshorten_url = unshort_url(url)
            print(f"Unshorten: {unshorten_url}")
            sanitized_url = trim_utm(unshorten_url)
            print(f"Sanitized: {sanitized_url}")
            if url != sanitized_url:
                result_text = result_text.replace(url, sanitized_url)
        if result_text != update.message.text:
            update.message.reply_text(result_text)
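# unshort_url and trim_utm are helpers not shown in the snippet above.
# A minimal sketch of a trim_utm-style cleaner, assuming its job is to strip
# utm_* tracking parameters from the query string:
from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse

def trim_utm(url: str) -> str:
    parts = urlparse(url)
    # Keep every query parameter that is not a utm_* tracking key.
    query = [(k, v) for k, v in parse_qsl(parts.query) if not k.startswith("utm_")]
    return urlunparse(parts._replace(query=urlencode(query)))

# trim_utm("https://example.com/a?utm_source=x&id=1") -> "https://example.com/a?id=1"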
def __extract_domain_from_sent_field(self, sent: str) -> str:
    """
    Get the url out of a 'sent' field in a measurement.

    Parameters
    ----------
    sent: str
        Examples:
        * An empty string ("") meaning the sent packet wasn't recorded.
        * "GET / HTTP/1.1\r\nHost: example5718349450314.com\r\n" (echo/discard)
        * "GET www.bbc.co.uk HTTP/1.1\r\nHost: /content.html\r\n" (discard error)
        * or just "www.apple.com" (HTTP/S)

    Returns
    -------
    str
        Just the url, if found.
    """
    extractor = URLExtract()
    # Update the known TLD list when it is older than 7 days.
    extractor.update_when_older(7)

    if sent == '':
        return sent

    match = re.search(QuackConstants.SENT_PATTERN.value, sent)
    if match:
        path = match.group(1)
        domain = match.group(2)
        # This is a bug where the domain and path were reversed in content sent.
        # We do our best to reconstruct the intended url by swapping them back
        # to their intended positions.
        if extractor.has_urls(path):
            domain, path = path, domain
        if path == '/':
            return domain
        return domain + path

    if ' ' not in sent:
        return sent

    raise Exception(f"unknown sent field format: {sent}")
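# QuackConstants.SENT_PATTERN is not shown in this snippet. Below is a
# plausible shape (an assumption, for illustration only) together with the
# "discard error" case from the docstring, where domain and path arrive swapped:
import re

SENT_PATTERN = r"GET (\S+) HTTP/1\.1\r\nHost: (\S+)\r\n"
m = re.search(SENT_PATTERN, "GET www.bbc.co.uk HTTP/1.1\r\nHost: /content.html\r\n")
path, domain = m.group(1), m.group(2)  # "www.bbc.co.uk", "/content.html"
# has_urls(path) is True here, so the method swaps them back:
domain, path = path, domain
print(domain + path)  # www.bbc.co.uk/content.html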
def applyYaml(self, path):
    yamlDict = yaml.safe_load(open(path))
    ### ensure validity of provided yaml
    if self.validateConfig(yamlDict, path=path):
        ### apply config because it is valid
        self.apply(yamlDict)
    # special evaluations
    if (self.payout_per_run < self.random_split_btc_per_picked_contributor *
            self.random_split_picked_contributors):
        raise ValueError(
            "The specified payout amount (self.random_split_btc_per_picked_contributor"
            " * self.random_split_picked_contributors) exceeds the maximum payout"
            " (payout_per_run)")
    # block url in note
    extractor = URLExtract()
    if extractor.has_urls(self.optional_email_message):
        raise ValueError("Using URLs in note not possible")
def applyYaml(self, path):
    yamlDict = yaml.safe_load(open(path))
    # ensure type of loaded config
    for k, v in yamlDict.items():
        t1, v1, t2, v2 = type(v), v, type(getattr(self, k)), getattr(self, k)
        if t1 != t2:
            raise ValueError(
                "Configuration parameter '%s' has failed type check! "
                "'%s'<'%s'> should be '%s'<'%s'>" % (k, v1, t1, v2, t2))
    # special evaluations
    if (self.max_payout_per_run <
            self.btc_per_transaction * self.number_payout_contributors_per_run):
        raise ValueError(
            "The specified payout amount (self.btc_per_transaction * "
            "self.number_payout_contributors_per_run) exceeds the maximum "
            "payout (max_payout_per_run)")
    self.apply(yamlDict)
    # block url in note
    extractor = URLExtract()
    if extractor.has_urls(self.email_note):
        raise ValueError("Using URLs in note not possible")
def left_user(bot, update):
    """'Member left the group' event handler.
    In this case we try to remove the service message that Telegram sends
    when the user is removed from the group."""
    chat_id = update.message.chat.id
    chat_config = storage.get_chat_config(chat_id)
    if not chat_config.enabled:
        return
    message_id = update.message.message_id
    user = update.message.left_chat_member
    left_user_name = "{} {}".format(user.first_name, user.last_name)
    log.info("{} left the group {}".format(left_user_name, chat_id))
    try:
        extractor = URLExtract()
        if extractor.has_urls(left_user_name):
            bot.delete_message(chat_id, message_id)
        else:
            if len(left_user_name) > conf.MAX_USERNAME_LENGTH:
                bot.delete_message(chat_id, message_id)
    except Exception as e:
        log.error("Error on deleting left message {}".format(e))
def extract_urls(body):
    urlset = set()
    extractor = URLExtract()
    excluded = [
        '.id', '.you', '.lol', '.like', '.now', '.my', '.love', '.phone',
        '.how', '.post', '.me', '.got', '.hot', '.im', '.best'
    ]
    try:
        generatedUrls = extractor.gen_urls(body)
        for url in generatedUrls:
            if len(url) < 5 or '.' not in url:
                continue
            if url.count('http') == 1:
                url = url.split('http')[1]
                url = 'http{}'.format(url)
            if '(' in url:
                rurl = url.split('(')
                if extractor.has_urls(rurl[1]):
                    url = rurl[1]
                elif extractor.has_urls(rurl[0]):
                    url = rurl[0]
                else:
                    continue
            if ')' in url:
                lurl = url.split(')')
                if extractor.has_urls(lurl[0]):
                    url = lurl[0]
                elif extractor.has_urls(lurl[1]):
                    url = lurl[1]
                else:
                    continue
            sem = 0
            for suffix in excluded:
                if url.endswith(suffix):
                    sem = 1
            if sem == 1:
                continue
            if '[IMG]' in url:
                try:
                    url = url.split('[IMG]')[1]
                except IndexError:
                    pass
            if '[/IMG]' in url:
                try:
                    url = url.split('[/IMG]')[0]
                except IndexError:
                    pass
            if url.endswith('?fb'):
                url = url.replace('?fb', '')
            if url.endswith('?noredirect'):
                url = url.replace('?noredirect', '')
            elif url.endswith('_d.jpg?maxwidth=640&shape=thumb&fidelity=medium'):
                url = url.replace('_d.jpg?maxwidth=640&shape=thumb&fidelity=medium', '')
            elif url.endswith('?s=sms'):
                url = url.replace('?s=sms', '')
            if '//m.imgur.com' in url:
                url = url.replace('//m.imgur.com', '//imgur.com')
            if url.startswith('https://thumbs.gfycat.com/'):
                url = url.replace('https://thumbs.gfycat.com/', 'https://gfycat.com/')
            if url.endswith('-size_restricted.gif'):
                url = url.replace('-size_restricted.gif', '')
            urlset.add(url)
        return urlset
    except AttributeError as e:
        # Log the known URLExtract issue; the `finally` below returns whatever
        # has been collected so far (possibly an empty set).
        print(
            "While generating urls, an AttributeError (specifically {e}) was "
            "raised. Moving on without extracting urls for now. This is likely "
            "an error with the python library URLExtract "
            "(https://github.com/lipoja/URLExtract). The issue has been fixed "
            "(see issue fix here: "
            "https://github.com/lipoja/URLExtract/commit/aa51f52e77b104932c49fb14882c632f12b6e940) "
            "but it has not been included in the most recent release. Please "
            "install the version from GitHub to fix this issue (eg. "
            "pip3 install git+https://github.com/lipoja/URLExtract.git)".format(e=e))
    finally:
        return urlset  # may be empty or partial if the AttributeError was hit
class VK:
    def __init__(self):
        log_tag = 'VK - init'
        self.settings_tag = 'VK'
        self.extractor = URLExtract()
        self.config = Config()
        try:
            self.vk_bot = vk_api.VkApi(
                token=str(self.config.read(self.settings_tag, 'bot_token')))
            self.api_bot_vk = self.vk_bot.get_api()
            Log().info(log_tag, 'VK bot token initialized successfully.')
        except Exception as e:
            Log().error(log_tag, e)
        p_name = 'ЛИНКЕР'
        p_channel = 'hackathon'
        p_version = '0.0.1'
        desc = 'A bot that creates shortened vk.cc links right in the dialog.'
        self.info = f'{p_name} {p_version} ({p_channel})\n\n{desc}\n\nconversation %peer_id%'

    def long_poll(self):
        tag = 'VK - Message LongPoll'
        from vk_api.bot_longpoll import VkBotLongPoll, VkBotEventType
        long_poll_bot = VkBotLongPoll(
            self.vk_bot,
            int(self.config.read(self.settings_tag, "community_id")))
        for event in long_poll_bot.listen():
            try:
                if event.type == VkBotEventType.MESSAGE_NEW:
                    Log().info(
                        tag,
                        f'New message from "https://vk.com/id{event.obj.from_id}".\n'
                        f'Message text:\t\n{event.obj.text}\n'
                        f'Attachments:\t\n{event.obj.attachments}\n'
                        f'Forwarded messages:\t\n{event.obj.fwd_messages}')
                    self.listener(event)
                elif event.type == VkBotEventType.MESSAGE_REPLY:
                    Log().info(tag, f'The bot replied in chat {event.obj.peer_id}.')
                else:
                    Log().info(
                        tag,
                        f'New action detected: {event.type} from '
                        f'"https://vk.com/id{event.obj.from_id}"')
            except Exception as e:
                Log().error(tag, e)

    def listener(self, event):
        tag = "VK - Message Listener"
        Log().info(tag, 'Processing message...')
        from_id = event.obj.from_id
        peer_id = event.obj.peer_id
        msg_text = str(event.obj.text)
        msg_attach = event.obj.attachments
        msg_fwd = event.obj.fwd_messages
        Log().info(tag, 'Processing finished.')
        if self.extractor.has_urls(msg_text) or msg_attach or msg_fwd:
            response_links = []
            if self.extractor.has_urls(msg_text):
                links = self.extractor.find_urls(msg_text)
                Log().info(tag, 'Found link-type objects.')
                if len(links) > 1:
                    for i in range(len(links)):
                        response_links.append(
                            self.get_cc_link(links[i], 0)['short_url'])
                else:
                    # A single URL: pass the string, not the whole list.
                    response_links.append(
                        self.get_cc_link(links[0], 0)['short_url'])
            if msg_attach:
                for i in range(len(msg_attach)):
                    attach_type = msg_attach[i]['type']
                    if attach_type == 'link':
                        ath_url = msg_attach[i][attach_type]['url']
                        response_links.append(
                            str(self.get_cc_link(ath_url, 0)['short_url']))
            if msg_fwd:
                for i_fwd in range(len(msg_fwd)):
                    fwd_text = msg_fwd[i_fwd]['text']
                    fwd_attaches = msg_fwd[i_fwd]['attachments']
                    for i_ath in range(len(fwd_attaches)):
                        fwd_ath_type = fwd_attaches[i_ath]['type']
                        if fwd_ath_type == 'link':
                            fwd_ath_link = msg_fwd[i_fwd]['attachments'][
                                i_ath][fwd_ath_type]['url']
                            response_links.append(
                                str(self.get_cc_link(fwd_ath_link, 0)['short_url']))
                    if self.extractor.find_urls(fwd_text):
                        response_links.append(
                            str(self.get_cc_link(fwd_text, 0)['short_url']))
            response_links_wd = list(dict.fromkeys(response_links))
            if len(response_links_wd) > 1:
                response_str = '🔗 Here are your links from the message:\n\n'
                for i_link in range(len(response_links_wd)):
                    response_str += response_links_wd[i_link] + '\n'
            else:
                response_str = ('🔗 Only one link was found in the message: '
                                + response_links_wd[0])
            self.send_message(peer_id, response_str)
        elif (from_id == 140830142) and \
                (msg_text.__contains__('info') or msg_text.__contains__('инфо') or
                 msg_text.__contains__('i')) or \
                (msg_text.__contains__('ping') or msg_text.__contains__('пинг')):
            Log().info(tag, 'Bot info.')
            self.send_message(peer_id, 'pong')
            self.send_message(peer_id,
                              self.info.replace("%peer_id%", str(peer_id)))
        else:
            Log().info(tag, 'Unknown command.')
            self.send_message(event.obj.peer_id, '🐸 No links found.')

    def get_cc_link(self, url, private):
        cc_link = self.api_bot_vk.utils.getShortLink(url=url, private=private)
        return cc_link

    def send_message(self, user_id, text):
        self.api_bot_vk.messages.send(peer_id=user_id,
                                      message=text,
                                      random_id=get_random_id(),
                                      dont_parse_links=1)
def main():
    LOG.info("Started and opening connection.")
    with grpc.secure_channel(
            os.getenv("SEABIRD_HOST_PORT"),
            grpc.ssl_channel_credentials(),
    ) as channel:
        channel = grpc.intercept_channel(
            channel,
            add_header(
                "authorization",
                f'Bearer {os.getenv("SEABIRD_TOKEN")}',
            ),
        )
        LOG.info("Successfully connected.")
        stub = seabird_pb2_grpc.SeabirdStub(channel)
        LOG.info("Monitoring for events.")
        for event in stub.StreamEvents(
                seabird_pb2.StreamEventsRequest(commands={
                    "inspect_image": seabird_pb2.CommandMetadata(
                        name="inspect_image",
                        short_help="AWS Rekognition to analyze an image",
                        full_help="Analyze an image's content",
                    ),
                    "inspect_celebrity": seabird_pb2.CommandMetadata(
                        name="inspect_celebrity",
                        short_help="AWS Rekognition to analyze an image",
                        full_help="Analyze an image's content",
                    ),
                })):
            LOG.debug("Event received: %s", event.message.text)
            command = event.command
            message = event.message
            if not command.command and not message:
                continue
            extractor = URLExtract(extract_localhost=False)
            if command.command == "inspect_image":
                if command.arg and extractor.has_urls(command.arg):
                    LOG.info(
                        "Image command detected from %s",
                        command.source.user.display_name,
                    )
                    handle_image(stub, command)
                elif not extractor.has_urls(command.arg):
                    LOG.info(
                        "No URL detected from %s",
                        command.source.user.display_name,
                    )
                    stub.SendMessage.with_call(
                        seabird_pb2.SendMessageRequest(
                            channel_id=command.source.channel_id,
                            text=f"{command.source.user.display_name}: Missing URL",
                        ))
                else:
                    LOG.info(
                        "Image command invalid from %s",
                        command.source.user.display_name,
                    )
                    stub.SendMessage.with_call(
                        seabird_pb2.SendMessageRequest(
                            channel_id=command.source.channel_id,
                            text=f"{command.source.user.display_name}: Something's not right",
                        ))
                continue
            elif command.command == "inspect_celebrity":
                if command.arg and extractor.has_urls(command.arg):
                    LOG.info(
                        "Celebrity command detected from %s",
                        command.source.user.display_name,
                    )
                    handle_celebrity(stub, command)
                elif not extractor.has_urls(command.arg):
                    LOG.info(
                        "No URL detected from %s",
                        command.source.user.display_name,
                    )
                    stub.SendMessage.with_call(
                        seabird_pb2.SendMessageRequest(
                            channel_id=command.source.channel_id,
                            text=f"{command.source.user.display_name}: Missing URL",
                        ))
                else:
                    LOG.info(
                        "Celebrity command invalid from %s",
                        command.source.user.display_name,
                    )
                    stub.SendMessage.with_call(
                        seabird_pb2.SendMessageRequest(
                            channel_id=command.source.channel_id,
                            text=f"{command.source.user.display_name}: Something's not right",
                        ))
                continue
            elif extractor.has_urls(message.text):
                LOG.info(
                    "Detected link from %s",
                    message.source.user.display_name,
                )
                handle_url(stub, message)
            else:
                continue
def new_user(bot, update):
    """New member joined the group event handler."""
    message = update.message
    chat_id = message.chat_id
    chat_config = storage.get_chat_config(chat_id)
    if not chat_config.enabled:
        return
    message_id = message.message_id
    msg_from_user_id = message.from_user.id
    msg_from_alias = message.from_user.name
    join_date = message.date
    lang = chat_config.language
    # For each new user that joined or has been added
    for join_user in message.new_chat_members:
        join_user_id = join_user.id
        join_user_alias = join_user.name
        join_user_name = "{} {}".format(join_user.first_name, join_user.last_name)
        # We do not allow certain user names
        if storage.is_name_in_black_list([join_user_alias, join_user_name]):
            log.info(
                "Possible spammer [blacklisted] kicked %s on chat %s",
                join_user_name,
                chat_id,
            )
            delete_message(
                chat_id,
                join_user_id,
                message_id,
                "[SPAMMER] {}".format(message.text),
                bot,
            )
            kick_user_from_chat(
                bot, join_user_id, join_user_alias, chat_id, "name blacklisted"
            )
            continue
        # If the added user is myself (this Bot)
        if bot.id == join_user_id:
            # The Anti-Spam Bot has been added to a group
            anti_spam_bot_added_event(chat_id, bot, update)
            continue
        else:
            to_register_user = True
        # If the message source user is not the joining user,
        # they were invited/added by someone else
        if msg_from_user_id != join_user_id and join_user.is_bot:
            # If a user has added a bot, check whether it may be added,
            # and do not register it if not
            to_register_user = try_to_add_a_bot_event(
                bot, msg_from_user_id, join_user, chat_id
            )
            if not to_register_user:
                # If it is not a legit bot, log it and do nothing
                log.warn(
                    "{msg_from_user_id} has tried to join {join_user} to {chat_id}".format(
                        msg_from_user_id=msg_from_user_id,
                        join_user=join_user,
                        chat_id=chat_id,
                    )
                )
                continue
        if to_register_user and (msg_from_user_id != join_user_id):
            if not storage.is_user_allowed_to_add_users(
                bot, msg_from_user_id, chat_id
            ):
                log.warn(
                    "%s has tried to add another user: %s on chat %s",
                    msg_from_alias,
                    join_user_name,
                    chat_id,
                    exc_info=0,
                )
                delete_message(
                    chat_id,
                    join_user_id,
                    message_id,
                    "[ADDER] {}".format(message.text),
                    bot,
                )
                kick_user_from_chat(
                    bot,
                    join_user_id,
                    join_user_name,
                    chat_id,
                    "Tried to add another user",
                )
                continue
        if to_register_user:
            # Check if there is an URL in the user name
            extractor = URLExtract()
            has_url = extractor.has_urls(join_user_name) or extractor.has_urls(
                join_user_alias
            )
            if has_url:
                log.warn(
                    "Spammer (URL name) join detected.\n (Chat) - ({}).".format(
                        chat_id
                    )
                )
                if len(join_user_name) > 15:
                    join_user_name = "{}...".format(join_user_name)[0:10]
                try:
                    bot.delete_message(chat_id, message_id)
                    bot_message = msg(lang, "USER_URL_NAME_JOIN").format(
                        join_user_name
                    )
                    log.info(
                        "Spammer (URL name) join message successfully removed.\n"
                        " (Chat) - ({}).".format(chat_id)
                    )
                    notifications.tlg_send_selfdestruct_msg(
                        bot, chat_id, bot_message
                    )
                except Exception as e:
                    log.error(
                        "Exception when deleting a Spammer (URL name) join "
                        "message - {}".format(str(e))
                    )
                    if str(e) == "Message can't be deleted":
                        bot_message = msg(
                            lang, "USER_URL_NAME_JOIN_CANT_REMOVE"
                        ).format(join_user_name)
                        notifications.tlg_send_selfdestruct_msg(
                            bot, chat_id, bot_message
                        )
                continue
            else:
                # Check if user name and last name are too long
                if len(join_user_name) > conf.MAX_USERNAME_LENGTH:
                    join_user_name = "{}...".format(join_user_name)[0:10]
                    try:
                        bot.delete_message(chat_id, message_id)
                        bot_message = msg(lang, "USER_LONG_NAME_JOIN").format(
                            join_user_name
                        )
                        log.info(
                            "Spammer (long name) join message successfully removed."
                            " (Chat) - ({}).".format(chat_id)
                        )
                    except Exception as e:
                        log.error(
                            "Exception when deleting a Spammer (long name) join "
                            "message - {}".format(str(e))
                        )
                        if str(e) == "Message can't be deleted":
                            bot_message = msg(
                                lang, "USER_LONG_NAME_JOIN_CANT_REMOVE"
                            ).format(join_user_name)
                            notifications.tlg_send_selfdestruct_msg(
                                bot, chat_id, bot_message
                            )
                if len(join_user_alias) > conf.MAX_USERNAME_ALIAS:
                    # If the alias is too long, just shorten it
                    join_user_alias = "{}...".format(join_user_alias)[
                        0 : conf.MAX_USERNAME_ALIAS - 3
                    ]
                if (conf.VERBOSE_LIMIT > 0) and (
                    storage.last_addition(chat_id) > conf.VERBOSE_LIMIT
                ):
                    notifications.tlg_send_selfdestruct_msg(
                        bot=bot,
                        chat_id=chat_id,
                        message=msg(lang, "WELCOME_MSG").format(
                            join_user_alias,
                            chat_config.num_messages_for_allow_urls,
                            chat_config.time_for_allow_urls,
                        ),
                        minutes=conf.VERBOSE_LIMIT,
                    )
                storage.register_new_user(
                    chat_id=chat_id,
                    user_id=join_user_id,
                    user_name=join_user_alias,
                    first_name=join_user.first_name,
                    last_name=join_user.last_name,
                    join_date=join_date,
                    allow_user=False,
                )
                log.info("{} added to the group {}".format(join_user_alias, chat_id))
def renderMarkdown(text, ignoreLinks=False, heading=False, alignment=False,
                   properties=False, view_type=False):
    isAttribute = False
    if ':hiccup' in text:
        # THIS DOES NOT WORK WELL !!! VERY BROKEN
        # text = 'hr '
        data = re.sub(r'\n', '', text.strip())
        data = re.sub(r':hiccup \[:hr\]', r'<hr>', data)
        data = re.sub(r'(\[\s*?):([\w-]+)', r'\1"\2",', data)
        data = re.sub(r':([\w-]+)', r'"\1":', data)
        data = re.sub(r'([\}\]\:][\s]*?)(\w+)([\s]*?[\[\{\]])', r'\1"\2"\3', data)
        data = re.sub(r'([\}\]\"])([\s\n]*?)([\[\{\"])', r'\1,\2\3', data)
        # print(data[9:])
        # data = re.sub(r'(hr)', r'hr', data)  # this tag is not being converted correctly
        # print(data[10:])
        # print(json.loads(data[10:]))
        # print(convert(data))
        # return convert(data)
        return data

    if ignoreLinks is False:
        global wordcount
        wordcount += len(text.split())

    # todo correctly render page alias {{alias: [[Roam Research]] Roam}}
    # todo fix URLs that contain a #
    # todo if attribute exists set a flag so the attribute can be picked up
    #      and attributed to the parent block
    if re.match(r'\b(.+)\:\:', text, flags=0):
        isAttribute = True

    text = re.sub(r'^\[\[>\]\](.*)', r'<blockquote>\1</blockquote>', text)  # blockquote
    text = re.sub(r'\b(.+)\:\:', lambda x: _processAttribute(x, text), text)  # attributes
    text = re.sub(r'^(\-\-\-)$', r'<hr>', text)
    text = re.sub(r'{{\[\[TODO\]\]}}', _processCheckmark(False), text)  # unchecked TODO
    text = re.sub(r'{{{\[\[DONE\]\]}}}}', _processCheckmark(True), text)  # checked TODO alt
    text = re.sub(r'{{\[\[DONE\]\]}}', _processCheckmark(True), text)  # checked TODO
    text = re.sub(r'\!\[([^\[\]]*?)\]\((.+?)\)', r'<img src="\2" alt="\1" />', text)  # markdown images
    text = re.sub(r'\{\{\[\[youtube\]\]:(.+?)\}\}',
                  lambda x: _processExternalEmbed(x, text, "youtube"), text)  # external clojure embeds
    text = re.sub(r'\{\{\[\[query\]\]:(.+?)\}\}',
                  lambda x: _processQueries(x, text), text)  # queries
    text = re.sub(r'\{\{(.*):.*[^\{\}]\((.+?)\)\)(.*)\}\}',
                  lambda x: _processInternalEmbed(x, text), text)  # clojure embeds and block aliases
    text = re.sub(r'\{\{(.*):.*[^\{\}]\[(.+?)\]\](.*)\}\}',
                  lambda x: _processInternaPagelEmbed(x, text), text)  # clojure page aliases
    text = re.sub(r'\{\{\[\[slider\]\](.*)\}\}',
                  lambda x: _processSlider(x, text, properties), text)  # sliders
    text = re.sub(r'(\{\{or:(.+?)\}\})',
                  lambda x: _processTextVersion(x, text), text)  # text versioning

    if ignoreLinks:
        text = re.sub(r'\[\[(.+?)\]\]', r'\1', text)  # page links
        text = re.sub(r'\[([^\[\]]+?)\]\((.+?)\)', r'\1', text)  # external links
        text = re.sub(r'\b(.+)\:\:', lambda x: _processAttribute(x, text), text)  # attributes
    else:
        text = re.sub(r'\[([^\[\]]+?)\]\(\[\[(.+?)\]\]\)',
                      lambda x: _processInternalAlias(x, text), text)  # internal page aliases
        text = re.sub(r'\[([^\[\]]+?)\]\(\(\((.+?)\)\)\)',
                      lambda x: _processInternalBlockAlias(x, text), text)  # internal block aliases
        text = re.sub(r'\[([^\[\]]+?)\]\(([^\[\]\(].+?)\)',
                      lambda x: _processExternalAlias(x, text), text)  # external aliases
        text = re.sub(r'(?<!href="\/[A-Za-z0-9\-\_]{8})(#(\w+))',
                      lambda x: _processInternalTag(x, text), text)  # tags without brackets
        text = re.sub(r'(\#\[\[(.+?)\]\])',
                      lambda x: _processInternalTag(x, text), text)  # tags with brackets
        text = re.sub(r'(?<!\#)\[\[(.+?)\]\]',
                      lambda x: _processInternalLink(x, text), text)  # pages with brackets

    text = re.sub(r'\n', r'<br>', text)  # newline
    text = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', text)  # bold
    text = re.sub(r'\_\_(.*?)\_\_', r'<em>\1</em>', text)  # italic
    text = re.sub(r'\~\~(.+?)\~\~', r'<s>\1</s>', text)  # strikethrough
    text = re.sub(r'\^\^(.+?)\^\^', r'<span class="highlight">\1</span>', text)  # highlight
    text = re.sub(r'\`\`\`(.+?)\`\`\`', r'<code>\1</code>', text)  # large codeblock
    text = re.sub(r'\`(.+?)\`', r'<code>\1</code>', text)  # inline codeblock

    def isBlockPrivate(blockID, blockText):
        if blockID in block_ids:
            return renderMarkdown(block_ids[blockID]['string'])
        else:
            # A re.sub callback must return a str, so private blocks are
            # replaced with an empty string instead of None.
            return ''

    text = re.sub(r'\(\((.+?)\)\)', lambda x: isBlockPrivate(x.group(1), text), text)  # block ref

    # Deal with bare URLs; not a huge fan of this
    forbidden_chars = ['<a', '<img', '[', '<code', '<iframe']
    results = []
    for substring in forbidden_chars:
        results.append(substring in text)
    if not any(results):
        extractor = URLExtract()
        if extractor.has_urls(text):
            for url in extractor.gen_urls(text):
                text = text.replace(url, _processBareURL(url))

    if heading:
        text = f'<h{heading}>{text}</h{heading}>'
    if alignment:
        text = f'<div style="text-align:{alignment};">{text}</div>'
    return text
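# _processBareURL is defined elsewhere in this module. A minimal hypothetical
# stand-in (the real helper may add CSS classes or target attributes):
def _processBareURL(url: str) -> str:
    href = url if url.startswith(('http://', 'https://')) else f'https://{url}'
    return f'<a href="{href}">{url}</a>'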
tweets['clean_text'] = tweets['clean_text'].str.replace("\t", " ")
# Replace & with and
tweets['clean_text'] = tweets['clean_text'].str.replace("&", " and ")

# ========================================== EXTRACT AND REMOVE URLs ========================================================
# TBD: MAKE FUNCTION
from urlextract import URLExtract

extractor = URLExtract()
# Adding more right stop chars to handle parentheses near URLs
stop_chars = list(extractor.get_stop_chars_right())
stop_chars.append(')')
extractor.set_stop_chars_right(set(stop_chars))

tweets['url_count'] = 0
for i, t in enumerate(tweets['clean_text']):
    if extractor.has_urls(t):
        urls = extractor.find_urls(t)
        # Add URL count feature
        tweets['url_count'].iloc[i] = len(urls)
        print(i, len(urls))
        # Remove URLs from the text
        line = t
        for item in urls:
            # re.escape keeps URL metacharacters ('?', '.', ...) from being
            # interpreted as regex syntax.
            line = re.sub(re.escape(item), '', line)
        tweets['clean_text'].iloc[i] = line

print(tweets.info())
print(tweets.describe())
print(tweets['clean_text'].iloc[1060])

# ============================================= HASHTAGS AND MENTIONS COUNT =============================================
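# A self-contained sketch of why the ')' stop char matters, on a hypothetical
# sample text (exact extraction can vary between urlextract versions):
from urlextract import URLExtract

ex = URLExtract()
text = "(more at https://example.com/page)"
print(ex.find_urls(text))  # without the tweak, ')' may be kept in the URL
ex.set_stop_chars_right(ex.get_stop_chars_right() | {')'})
print(ex.find_urls(text))  # now extraction stops before ')'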
def validate_local_restrictions(local_restrictions):
    extractor = URLExtract()
    if not extractor.has_urls(local_restrictions):
        raise ValidationError(
            _('Please provide a link to your government website outlining this.'))
def check_citation(self, citation):
    # NOTE: when implementing, wrap the method in a try/except and print out
    # any error + the citation status
    try:
        pattern = re.compile("[ ][0-9]{4}")
        result = pattern.search(citation)
        self.year = result.group(0)[1:]
    except:
        raise Exception("Unable to find year in citation.")

    self.citation_status = MLACitationStatus.AUTHOR
    cursor = 0
    while True:
        ascii_value = ord(citation[cursor])
        # Check if the current character is not " &-'." or any alphanumeric
        # in English or Latin-1
        if citation[cursor:cursor + 2] != ". " and (
                ascii_value == 32 or ascii_value == 39
                or 44 <= ascii_value <= 46 or 65 <= ascii_value <= 90
                or 97 <= ascii_value <= 122 or 192 <= ascii_value <= 255):
            cursor += 1
        else:
            break

    if cursor != 0:
        author_section = ""
        if citation[cursor:cursor + 2] == ". ":
            author_section = citation[:cursor + 1]
        else:
            raise Exception(
                "Bad formatting in the author section (unknown error).")

        # three or more authors
        if ", et al." in author_section:
            temp = author_section.replace(", et al", "")
            authors = temp.split(", ")
            filteredAuthor = [self.filter_latin(i) for i in authors]
            if re.match("^[A-Za-z][A-Za-z-' ]+$", filteredAuthor[0]) is not None \
                    and re.match("^[A-Z][A-Za-z-'. ]+[.]$", filteredAuthor[1]) is not None:
                self.authors.append(authors[0] + ", et al.")
            else:
                raise Exception("Bad formatting in the author section: '" +
                                author_section + "'")
        # two authors
        elif ", and " in author_section:
            authors = author_section.split(", and ")
            if ", " not in authors[0]:
                raise Exception("Bad formatting in the author section: '" +
                                author_section + "'")
            firstAuthor = authors[0].split(", ")
            filteredFirstAuthor = [self.filter_latin(i) for i in firstAuthor]
            if re.match("^[A-Za-z][A-Za-z-' ]+$", filteredFirstAuthor[0]) is not None \
                    and re.match("^[A-Z][A-Za-z-'. ]+$", filteredFirstAuthor[1]) is not None:
                self.authors.append(firstAuthor[0])
            else:
                raise Exception("Bad formatting in the author section: '" +
                                author_section + "'")
            if " " not in authors[1]:
                raise Exception("Bad formatting in the author section: '" +
                                author_section + "'")
            secondAuthor = authors[1].split(" ", 1)
            filteredSecondAuthor = [self.filter_latin(i) for i in secondAuthor]
            if re.match("^[A-Z][A-Za-z-']+$", filteredSecondAuthor[0]) is not None \
                    and re.match("^[A-Za-z][A-Za-z-'. ]+[.]$", filteredSecondAuthor[1]) is not None:
                self.authors.append(filteredSecondAuthor[1][:-1])
            elif re.match("^[A-Za-z][.]$", filteredSecondAuthor[1]) is not None:
                author_cursor = cursor + 2
                actualSecondAuthor = ""
                while citation[author_cursor:author_cursor + 2] != ". ":
                    actualSecondAuthor += citation[author_cursor]
                    author_cursor += 1
                self.authors.append(actualSecondAuthor)
            else:
                raise Exception("Bad formatting in the author section: '" +
                                author_section + "'")
        # one author
        elif ", " in author_section:
            authors = author_section.split(", ")
            filteredAuthor = [self.filter_latin(i) for i in authors]
            if re.match("^[A-Za-z][A-Za-z-' ]+$", filteredAuthor[0]) is not None \
                    and re.match("^[A-Z][A-Za-z-' ]+[.]$", filteredAuthor[1]) is not None:
                self.authors.append(authors[0])
            else:
                raise Exception("Bad formatting in the author section: '" +
                                author_section + "'")
        elif "et. al." in author_section or "et.al." in author_section:
            raise Exception("'Et al.' should not have a period after the 'Et'.")
        # no match; bad formatting
        else:
            raise Exception("Bad formatting in the author section: '" +
                            author_section + "'")

    self.citation_status = MLACitationStatus.TITLE
    cursor += 1

    # check the title section
    if citation[cursor:cursor + 3] == "<i>":
        cursor += 3
    elif citation[cursor + 1:cursor + 4] == "<i>":
        cursor += 4
    elif citation[cursor + 1] == "\"":
        cursor += 2
    elif citation[cursor - 1:cursor + 1] == ".\"":
        raise Exception("Bad formatting in the title section.")

    title = ""
    while citation[cursor] != ".":
        title += citation[cursor]
        cursor += 1
    title = title.replace("\"", "")
    title = title.replace("</i>", "")
    if title[0] == " ":
        title = title[1:]
    if citation[cursor + 1] == "\"":
        cursor += 2
    else:
        cursor += 1
    # now the cursor should be at the beginning of the italics

    result = url("https://brettterpstra.com/titlecase/?title=" + title)
    title_cased_title = result.read().decode('utf-8')
    if title != title_cased_title:
        self.warnings.append(
            "the title might contain improper capitalization: '" + title + "'")
    self.title = title

    # check for url
    self.citation_status = MLACitationStatus.URL
    extractor = URLExtract()
    if extractor.has_urls(citation):
        urls = extractor.find_urls(citation)
        self.url = urls[0][:-1]
        if self.url + "." not in citation:
            raise Exception("Bad formatting in the URL section.")

    if citation[cursor:cursor + 3] != "<i>" and citation[cursor + 1:cursor + 4] != "<i>":
        self.warnings.append(
            "the container may not exist or may not be italicized")
    elif citation[cursor:cursor + 3] == "<i>" and citation[cursor + 1:cursor + 4] == "<i>":
        self.warnings.append(
            "the container might exist when not necessary (if the citation is "
            "about a book), or the block immediately following the title may "
            "be improperly italicized.")

    if self.url != "":
        # str.replace returns a new string, so assign the result back to
        # actually strip the URL from the citation.
        citation = citation.replace(self.url + ".", "")

    # check for other info
    # Right now it's too complex to validate the entire MLA citation without
    # prior knowledge of what type of citation it is, so the other info is
    # just stored without checking.
    self.citation_status = MLACitationStatus.OTHER_INFO
    remainingText = citation[cursor:]
    info = remainingText.split(", ")
    self.otherInfo = [i for i in info]
class Preprocessor(object):
    def __init__(self, min_times):
        self.min_times = min_times
        self.wnl = WordNetLemmatizer()
        self.stem_tool = PorterStemmer()
        self.url_extractor = URLExtract()
        self.UK_check_list = ['u.k', 'u.k.', 'uk', 'UK', 'U.K', 'U.K.']
        self.USA_check_list = [
            'u.s.', 'u.s', 'U.S', 'U.S.', 'U.S.A', 'usa', 'USA', 'u.s.a',
            'U.S.A.', 'u.s.a.'
        ]
        self.twoword_pattern = re.compile("[a-z][A-Z]{1}[a-z]", re.S)
        self.mark_words = []
        with open("tools/mark_words.txt", mode='r', encoding='utf-8') as f:
            for line in f.readlines():
                self.mark_words.append(line.strip())
        # Add the Fox stop-word list
        self.stopwords = []
        with open("tools/FoxStoplist.txt", mode='r', encoding='utf-8') as f:
            for line in f.readlines():
                self.stopwords.append(line.strip())
        # Add punctuation as stop words
        self.stopwords.extend(list(string.punctuation))
        # Add the standard NLTK stop words
        nltk_stopwords = list(stopwords.words('english'))
        self.stopwords.extend(nltk_stopwords)
        # Counter for word frequencies
        self.word_counter = Counter()

    def deal_text(self, text):
        # Replace URLs with a placeholder token
        text = self.filter(text)
        if self.url_extractor.has_urls(text):
            sentence_urls = self.url_extractor.find_urls(text)
            for url in sentence_urls:
                text = text.replace(url, 'website-holder')
        doc_words = []
        for word in self.lemmatize_text(text):
            word = word.strip()
            match_string = re.findall(self.twoword_pattern, word)
            if len(match_string) != 0:
                # `seg` avoids shadowing the `string` module used in __init__
                for seg in match_string:
                    words = word.replace(seg, seg[0] + ' ' + seg[1:]).split(" ")
                    doc_words.extend(words)
            if word in self.stopwords:
                continue
            if word in self.UK_check_list:
                doc_words.append("uk")
            if word in self.USA_check_list:
                doc_words.append("usa")
            if word in self.mark_words:
                pattern = re.compile('[a-z]{1}-[a-z]{1}')
                match_string = re.findall(pattern, word)
                for seg in match_string:
                    word = word.replace(seg, seg[0] + seg[2:])
                doc_words.append(word)
            # Check whether the token is a number
            try:
                word_float = float(word)
                doc_words.append("#number#")
            except ValueError:
                pass
            if word == 'website-holder':
                doc_words.append("#website#")
            else:
                doc_words.append(word)
        # Normalize to lowercase and stem
        for i, word in enumerate(doc_words):
            doc_words[i] = self.stem_tool.stem(word.lower())
        # Update word counts
        self.word_counter.update(doc_words)
        return doc_words

    def lemmatize_text(self, text):
        wnl = WordNetLemmatizer()
        sentences = sent_tokenize(text, language='english')
        for sentence in sentences:
            for word, tag in pos_tag(
                    word_tokenize(sentence, language='english')):
                if tag.startswith('NN'):
                    yield wnl.lemmatize(word, pos='n')
                elif tag.startswith('VB'):
                    yield wnl.lemmatize(word, pos='v')
                elif tag.startswith('JJ'):
                    yield wnl.lemmatize(word, pos='a')
                elif tag.startswith('R'):
                    yield wnl.lemmatize(word, pos='r')
                else:
                    yield wnl.lemmatize(word)

    def filter(self, text):
        """Filter out useless text."""
        if isinstance(text, float):
            return ''
        pattern = re.compile("\[This API is no longer available\.(.*?)\]", re.S)
        a = pattern.findall(text)
        if len(a) > 0:
            replace_item = "[This API is no longer available." + a[0] + ']'
            text = text.replace(replace_item, '').strip()
        else:
            text = text.strip()
        return text

    def read_data(self, csv_path):
        df = pd.read_csv(csv_path)
        all_process_cuts = []
        all_name = []
        all_tags = []
        all_labels = []
        label_map = dict()
        tag_map = {"UNK": 0}
        for i in tqdm(range(len(df))):
            row_data = df.iloc[i, :]
            # The trailing " API" suffix has already been stripped here
            APIName = row_data['name'].strip().lower()[:-4]
            # Handle tags
            tag_list = row_data['tags'].strip().lower().split(",")
            # Build tag_map
            for tag in tag_list:
                if tag not in tag_map:
                    tag_map[tag] = len(tag_map)
            # description
            raw_desc = row_data['description'].strip()
            desc_cuts = self.deal_text(raw_desc)
            # Handle the category
            category = row_data['category'].strip()
            if category not in label_map:
                label_map[category] = len(label_map)
            all_name.append(APIName.strip())
            all_tags.append(tag_list)
            all_labels.append(category)
            all_process_cuts.append(desc_cuts)
        word_map = {'UNK': 0}
        for word, times in self.word_counter.items():
            if times < self.min_times:
                continue
            else:
                if word in word_map:
                    continue
                else:
                    word_map[word] = len(word_map)
        return all_name, all_process_cuts, all_labels, all_tags, word_map, tag_map, label_map

    def begin(self, csv_path, evaluate=False, old_word_map=None,
              old_tag_map=None, old_label_map=None):
        all_name, all_process_cuts, all_labels, \
            all_tags, get_word_map, get_tag_map, get_label_map = self.read_data(csv_path)
        all_encoded_tag = []
        all_encoded_category = []
        all_encoded_docs = []
        if not evaluate:
            word_map = get_word_map
            tag_map = get_tag_map
            label_map = get_label_map
        else:
            assert old_word_map is not None and old_tag_map is not None \
                and old_label_map is not None
            word_map = old_word_map
            tag_map = old_tag_map
            label_map = old_label_map
        # Encode tags
        for doc_tag in all_tags:
            doc_encoded_tag = []
            for tag in doc_tag:
                if tag in tag_map:
                    doc_encoded_tag.append(tag_map[tag])
                else:
                    doc_encoded_tag.append(tag_map['UNK'])
            all_encoded_tag.append(doc_encoded_tag)
        # Encode labels
        for doc_label in all_labels:
            all_encoded_category.append(label_map[doc_label])
        # Encode descriptions
        for words in all_process_cuts:
            encoded_doc = []
            for word in words:
                if word in word_map:
                    encoded_doc.append(word_map[word])
                else:
                    encoded_doc.append(word_map['UNK'])
            all_encoded_docs.append(encoded_doc)
        # Collect the data
        data_dict = dict()
        data_dict['all_name'] = all_name
        data_dict['all_process_cuts'] = all_process_cuts
        data_dict['all_labels'] = all_labels
        data_dict['all_tags'] = all_tags
        data_dict['all_encoded_category'] = all_encoded_category
        data_dict['all_encoded_tag'] = all_encoded_tag
        data_dict['all_encoded_docs'] = all_encoded_docs
        data_dict['word_map'] = word_map
        data_dict['label_map'] = label_map
        data_dict['tag_map'] = tag_map
        return data_dict
def extract_urls(body):
    urlset = set()
    extractor = URLExtract()
    excluded = [
        '.id', '.you', '.lol', '.like', '.now', '.my', '.love', '.phone',
        '.how', '.post', '.me', '.got', '.hot', '.im', '.best'
    ]
    for url in extractor.gen_urls(body):
        if len(url) < 5 or '.' not in url:
            continue
        if url.count('http') == 1:
            url = url.split('http')[1]
            url = 'http{}'.format(url)
        if '(' in url:
            rurl = url.split('(')
            if extractor.has_urls(rurl[1]):
                url = rurl[1]
            elif extractor.has_urls(rurl[0]):
                url = rurl[0]
            else:
                continue
        if ')' in url:
            lurl = url.split(')')
            if extractor.has_urls(lurl[0]):
                url = lurl[0]
            elif extractor.has_urls(lurl[1]):
                url = lurl[1]
            else:
                continue
        sem = 0
        for suffix in excluded:
            if url.endswith(suffix):
                sem = 1
        if sem == 1:
            continue
        if '[IMG]' in url:
            try:
                url = url.split('[IMG]')[1]
            except IndexError:
                pass
        if '[/IMG]' in url:
            try:
                url = url.split('[/IMG]')[0]
            except IndexError:
                pass
        if url.endswith('?fb'):
            url = url.replace('?fb', '')
        if url.endswith('?noredirect'):
            url = url.replace('?noredirect', '')
        elif url.endswith('_d.jpg?maxwidth=640&shape=thumb&fidelity=medium'):
            url = url.replace('_d.jpg?maxwidth=640&shape=thumb&fidelity=medium', '')
        elif url.endswith('?s=sms'):
            url = url.replace('?s=sms', '')
        if '//m.imgur.com' in url:
            url = url.replace('//m.imgur.com', '//imgur.com')
        if url.startswith('https://thumbs.gfycat.com/'):
            url = url.replace('https://thumbs.gfycat.com/', 'https://gfycat.com/')
        if url.endswith('-size_restricted.gif'):
            url = url.replace('-size_restricted.gif', '')
        urlset.add(url)
    return urlset
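# A quick check of extract_urls on a hypothetical body (assumes URLExtract
# finds both links):
body = ("see https://m.imgur.com/abc123 and "
        "https://thumbs.gfycat.com/SomeClip-size_restricted.gif")
print(extract_urls(body))
# -> {'https://imgur.com/abc123', 'https://gfycat.com/SomeClip'}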
# plt.bar(counts.index, counts.values, label="mean: " + str(np.mean(counts.values)))
# plt.xticks(counts.index, rotation='vertical')
# plt.subplots_adjust(bottom=0.25)
# plt.legend()
# plt.show()
# #plt.savefig('tweetsperday.png', format='png')

# selected = []
# for i in range(len(df['Date'])):
#     if df['Date'][i] == '12/2/2019':
#         selected.append(trump['text'][i])

vec = CountVectorizer(stop_words="english")
w = []
for i in trump['text']:
    if extractor.has_urls(i):
        url = extractor.find_urls(i)
        for k in url:
            i = i.replace(k, '')
        w.append(i)
    else:
        w.append(i)

w.remove(' https://t.co/G6lGfyxSUs')

# remove '@' and '#'
for i in range(len(w)):
    if '@' in w[i]:
        w[i] = w[i].replace('@', '')
    if '#' in w[i]:
        w[i] = w[i].replace('#', '')
    if '&' in w[i]:
gc.collect()

list_full_urls = []
list_domains = []
length = len(trolls)
for i in range(length):  # TODO set length
    if i % 100 == 0:
        print(i)
    troll_text = trolls.iloc[i][TWITTER_MESSAGE_CONTENT]
    all_urls = []
    if extractor.has_urls(troll_text):
        all_urls = extractor.find_urls(troll_text)
    for url in all_urls:
        try:
            unshortened_url = get_unshortened_url(url)
            url_domain = get_domain_from_url(unshortened_url)
            # print(unshortened_url, url_domain)
            list_full_urls.append(unshortened_url)
            list_domains.append(url_domain)
        except:
            print("broken url ", url)
            continue

troll_url_data = pd.DataFrame({"url": list_full_urls, "domain": list_domains})
troll_url_data.to_csv("/output/" + TROLL_TYPE + "_urls.csv", index=False)
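# get_unshortened_url and get_domain_from_url are helpers not shown above.
# Minimal sketches, assuming the first follows redirects (via requests) and
# the second takes the netloc of the parsed URL:
import requests
from urllib.parse import urlparse

def get_unshortened_url(url, timeout=5.0):
    # Follow redirects (e.g. t.co shorteners) and return the final URL.
    return requests.head(url, allow_redirects=True, timeout=timeout).url

def get_domain_from_url(url):
    return urlparse(url).netloc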