Example #1
    def is_spammer(self, chat_id, msg_date, text):
        """User is not enabled to post links due to his join date or the
        number of messages, and his still trying
        The user is allowed to post urls if:
            a) is group admin
            b) is a new user that not has been joined recently
            thats is more than INIT_TIME_ALLOW_URLS minutes ago.
            c) is a new user who has posts more that
            INIT_MIN_MSG_ALLOW_URLS posts
        """

        if self.is_admin or self.is_verified:
            return False
        chat_config = Config.get(chat_id=chat_id)

        # Let's check for urls
        extractor = URLExtract()
        any_url = extractor.has_urls(text)
        if not any_url:
            # no URL posted, so give the user the benefit of the doubt
            return False

        # The user has posted URLs.
        # Check whether they are allowed by join time or number of messages posted.

        user_hours_in_group = (msg_date -
                               self.join_date).total_seconds() // 3600
        return (user_hours_in_group < chat_config.time_for_allow_urls) or (
            self.num_messages < chat_config.num_messages_for_allow_urls)
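
A minimal sketch of the time arithmetic used above (standard-library datetime only; the dates are made up):

    from datetime import datetime, timedelta

    join_date = datetime(2024, 1, 1, 12, 0)
    msg_date = join_date + timedelta(hours=5, minutes=30)
    # same computation as in is_spammer: whole hours spent in the group
    user_hours_in_group = (msg_date - join_date).total_seconds() // 3600
    print(user_hours_in_group)  # 5.0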
Example #2
    def applyYaml(self, path):
        with open(path) as f:
            yamlDict = yaml.safe_load(f)
        # ensure the types of the loaded config values
        for k, v in yamlDict.items():
            ### check that the key actually exists in our config
            if k not in self.__dict__:
                raise AttributeError(
                    "Error in config: '%s'\n'%s' object has no attribute '%s'"
                    % (path, self.__class__.__name__, k))
            ### prepare the type check
            t1, v1, t2, v2 = type(v), v, type(getattr(self, k)), getattr(self, k)
            if t1 != t2:
                raise ValueError(
                    "Configuration parameter '%s' has failed type check! 's'<'%s'> should be 's'<'%s'>"
                    % (k, v1, t1, v2, t2))

        self.apply(yamlDict)

        # special evaluations (run after apply so they see the loaded values)
        if self.max_payout_per_run < self.btc_per_transaction * self.number_payout_contributors_per_run:
            raise ValueError(
                "The specified payout amount (btc_per_transaction * number_payout_contributors_per_run) exceeds the maximum payout (max_payout_per_run)"
            )

        # block url in note
        extractor = URLExtract()
        if extractor.has_urls(self.email_note):
            raise ValueError("Using URLs in note not possible")
Example #3
def replace_url(s):
    extractor = URLExtract()

    if extractor.has_urls(s):
        urls = extractor.find_urls(s, only_unique=True)
        for url in urls:
            s = s.replace(url, "<url>")

    return s
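
A quick doctest-style check of replace_url (this assumes urlextract is installed and URLExtract is imported as in the snippet):

    >>> replace_url("read https://example.com and http://test.org/page")
    'read <url> and <url>'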
Example #4
    def read_stream_from_assia_tv(self, response, event_url, event_name, event_date):
        scripts = response.css("script")
        extractor = URLExtract()
        for s in scripts:
            text = s.get()
            if extractor.has_urls(text):
                for url in extractor.gen_urls(text):
                    if "video.assia.tv" in url:
                        self.logger.info("#read_stream_from_assia_tv - found video stream url %s!" % url)
                        if "m3u8" in url:
                            pass  # the original snippet is truncated here
Example #5
def message(update: Update, context: CallbackContext) -> None:
    extractor = URLExtract()
    if extractor.has_urls(update.message.text):
        result_text = update.message.text
        for url in extractor.gen_urls(update.message.text):
            print(f"Url found: {url}")
            unshortened_url = unshort_url(url)
            print(f"Unshortened: {unshortened_url}")
            sanitized_url = trim_utm(unshortened_url)
            print(f"Sanitized: {sanitized_url}")
            if url != sanitized_url:
                result_text = result_text.replace(url, sanitized_url)
        if result_text != update.message.text:
            update.message.reply_text(result_text)
Example #6
    def __extract_domain_from_sent_field(self, sent: str) -> str:
        """
        Get the url out of a 'sent' field in a measurement.

        Parameters
        ----------
        sent: str
            Examples:

            * An empty string ("") meaning the sent packet wasn't recorded.
            * "GET / HTTP/1.1\r\nHost: example5718349450314.com\r\n" (echo/discard)
            * "GET www.bbc.co.uk HTTP/1.1\r\nHost: /content.html\r\n" (discard error) or just "www.apple.com" (HTTP/S)

        Returns
        -------
        str
            Just the url, if found.
        """
        extractor = URLExtract()
        extractor.update_when_older(7)  # update the known TLD list when it is older than 7 days

        if sent == '':
            return sent

        match = re.search(QuackConstants.SENT_PATTERN.value, sent)
        if match:
            path = match.group(1)
            domain = match.group(2)

            # This is a bug where the domain and path were reversed in content sent.
            # We do our best to reconstruct the intended url
            # by swapping them to their intended position
            if extractor.has_urls(path):
                domain, path = path, domain

            if path == '/':
                return domain
            return domain + path

        if ' ' not in sent:
            return sent

        raise Exception(f"unknown sent field format: {sent}")
Example #7
    def applyYaml(self, path):
        with open(path) as f:
            yamlDict = yaml.safe_load(f)
        ### ensure validity of provided yaml
        if self.validateConfig(yamlDict, path=path):
            ### apply config because it is valid
            self.apply(yamlDict)

            # special evaluations
            if (self.payout_per_run <
                    self.random_split_btc_per_picked_contributor *
                    self.random_split_picked_contributors):
                raise ValueError(
                    "The specified payout amount (random_split_btc_per_picked_contributor * random_split_picked_contributors) exceeds the maximum payout (payout_per_run)"
                )

            # block url in note
            extractor = URLExtract()
            if extractor.has_urls(self.optional_email_message):
                raise ValueError("Using URLs in note not possible")
Example #8
    def applyYaml(self, path):
        with open(path) as f:
            yamlDict = yaml.safe_load(f)
        # ensure the types of the loaded config values
        for k, v in yamlDict.items():
            t1, v1, t2, v2 = type(v), v, type(getattr(self, k)), getattr(self, k)
            if t1 != t2:
                raise ValueError(
                    "Configuration parameter '%s' has failed type check! 's'<'%s'> should be 's'<'%s'>"
                    % (k, v1, t1, v2, t2))

        self.apply(yamlDict)

        # special evaluations (run after apply so they see the loaded values)
        if self.max_payout_per_run < self.btc_per_transaction * self.number_payout_contributors_per_run:
            raise ValueError(
                "The specified payout amount (btc_per_transaction * number_payout_contributors_per_run) exceeds the maximum payout (max_payout_per_run)"
            )

        # block url in note
        extractor = URLExtract()
        if extractor.has_urls(self.email_note):
            raise ValueError("Using URLs in note not possible")
Example #9
def left_user(bot, update):
    """Member left the group event handler. On this case we try to
    avoid the message that telegram sends when removing the user
    form the group."""

    chat_id = update.message.chat.id
    chat_config = storage.get_chat_config(chat_id)
    if not chat_config.enabled:
        return

    message_id = update.message.message_id
    user = update.message.left_chat_member
    left_user_name = "{} {}".format(user.first_name, user.last_name)
    log.info("{} left the group {}".format(left_user_name, chat_id))

    try:
        extractor = URLExtract()
        if extractor.has_urls(left_user_name):
            bot.delete_message(chat_id, message_id)
        elif len(left_user_name) > conf.MAX_USERNAME_LENGTH:
            bot.delete_message(chat_id, message_id)
    except Exception as e:
        log.error("Error on deleting left message {}".format(e))
Example #10
def extract_urls(body):
    urlset = set()
    extractor = URLExtract()
    excluded = [
        '.id', '.you', '.lol', '.like', '.now', '.my', '.love', '.phone',
        '.how', '.post', '.me', '.got', '.hot', '.im', '.best'
    ]
    try:
        generatedUrls = extractor.gen_urls(body)
        for url in generatedUrls:
            if len(url) < 5 or '.' not in url:
                continue
            if url.count('http') == 1:
                url = url.split('http')[1]
                url = 'http{}'.format(url)
            if '(' in url:
                rurl = url.split('(')
                if extractor.has_urls(rurl[1]):
                    url = rurl[1]
                elif extractor.has_urls(rurl[0]):
                    url = rurl[0]
                else:
                    continue
            if ')' in url:
                lurl = url.split(')')
                if extractor.has_urls(lurl[0]):
                    url = lurl[0]
                elif extractor.has_urls(lurl[1]):
                    url = lurl[1]
                else:
                    continue
            if any(url.endswith(suffix) for suffix in excluded):
                continue
            # """
            if '[IMG]' in url:
                try:
                    url = url.split('[IMG]')[1]
                except IndexError:
                    pass
            if '[/IMG]' in url:
                try:
                    url = url.split('[/IMG]')[0]
                except IndexError:
                    pass
            if url.endswith('?fb'):
                url = url.replace('?fb', '')
            if url.endswith('?noredirect'):
                url = url.replace('?noredirect', '')
            elif url.endswith(
                    '_d.jpg?maxwidth=640&amp;shape=thumb&amp;fidelity=medium'):
                url = url.replace(
                    '_d.jpg?maxwidth=640&amp;shape=thumb&amp;fidelity=medium',
                    '')
            elif url.endswith('?s=sms'):
                url = url.replace('?s=sms', '')
            if '//m.imgur.com' in url:
                url = url.replace('//m.imgur.com', '//imgur.com')
            if url.startswith('https://thumbs.gfycat.com/'):
                url = url.replace('https://thumbs.gfycat.com/',
                                  'https://gfycat.com/')
            if url.endswith('-size_restricted.gif'):
                url = url.replace('-size_restricted.gif', '')
            # """
            urlset.add(url)
    except AttributeError as e:
        print(
            "While generating urls, an AttributeError (specifically {e}) was raised. Moving on without extracting urls for now. This is likely an error with the python library URLExtract (https://github.com/lipoja/URLExtract). The issue has been fixed (see the fix here: https://github.com/lipoja/URLExtract/commit/aa51f52e77b104932c49fb14882c632f12b6e940) but it has not been included in the most recent release. Please install the version from GitHub to fix this issue (e.g. pip3 install git+https://github.com/lipoja/URLExtract.git)."
            .format(e=e))
    return urlset
Example #11
File: Vk.py Project: plaxeen/linker
class VK:
    def __init__(self):

        log_tag = 'VK - init'
        self.settings_tag = 'VK'

        self.extractor = URLExtract()
        self.config = Config()
        try:
            self.vk_bot = vk_api.VkApi(
                token=str(self.config.read(self.settings_tag, 'bot_token')))
            self.api_bot_vk = self.vk_bot.get_api()
            Log().info(log_tag, 'VK bot token initialized successfully.')
        except Exception as e:
            Log().error(log_tag, e)

        p_name = 'ЛИНКЕР'
        p_channel = 'hackathon'
        p_version = '0.0.1'
        desc = 'A bot that creates shortened vk.cc links right in the chat.'
        self.info = f'{p_name} {p_version} ({p_channel})\n\n{desc}\n\nchat %peer_id%'

    def long_poll(self):
        tag = 'VK - Message LongPoll'
        from vk_api.bot_longpoll import VkBotLongPoll, VkBotEventType

        long_poll_bot = VkBotLongPoll(
            self.vk_bot,
            int(self.config.read(self.settings_tag, "community_id")))

        for event in long_poll_bot.listen():
            try:
                if event.type == VkBotEventType.MESSAGE_NEW:
                    Log().info(
                        tag,
                        f'New message from "https://vk.com/id{event.obj.from_id}".\n'
                        f'Message text:\t\n{event.obj.text}\n'
                        f'Attachments:\t\n{event.obj.attachments}\n'
                        f'Forwarded messages:\t\n{event.obj.fwd_messages}')
                    self.listener(event)

                elif event.type == VkBotEventType.MESSAGE_REPLY:
                    Log().info(tag, f'The bot replied in chat {event.obj.peer_id}.')

                else:
                    Log().info(
                        tag, f'New activity detected: {event.type} from '
                        f'"https://vk.com/id{event.obj.from_id}"')

            except Exception as e:
                Log().error(tag, e)

    def listener(self, event):
        tag = "VK - Message Listener"
        Log().info(tag, 'Processing the message...')
        from_id = event.obj.from_id
        peer_id = event.obj.peer_id
        msg_text = str(event.obj.text)
        msg_attach = event.obj.attachments
        msg_fwd = event.obj.fwd_messages
        Log().info(tag, 'Processing finished.')

        if self.extractor.has_urls(msg_text) or msg_attach or msg_fwd:
            response_links = []
            if self.extractor.has_urls(msg_text):
                links = self.extractor.find_urls(msg_text)
                Log().info(tag, 'Found link objects.')
                for link in links:
                    response_links.append(
                        self.get_cc_link(link, 0)['short_url'])

            if msg_attach:
                for attach in msg_attach:
                    attach_type = attach['type']
                    if attach_type == 'link':
                        ath_url = attach[attach_type]['url']
                        response_links.append(
                            str(self.get_cc_link(ath_url, 0)['short_url']))

            if msg_fwd:
                for fwd in msg_fwd:
                    fwd_text = fwd['text']
                    for fwd_attach in fwd['attachments']:
                        fwd_ath_type = fwd_attach['type']
                        if fwd_ath_type == 'link':
                            fwd_ath_link = fwd_attach[fwd_ath_type]['url']
                            response_links.append(
                                str(self.get_cc_link(fwd_ath_link, 0)['short_url']))

                    # shorten URLs found in the forwarded text itself
                    for fwd_link in self.extractor.find_urls(fwd_text):
                        response_links.append(
                            str(self.get_cc_link(fwd_link, 0)['short_url']))

            response_links_wd = list(dict.fromkeys(response_links))

            if len(response_links_wd) > 1:
                response_str = '🔗 Here are the links from your message:\n\n'
                for link in response_links_wd:
                    response_str += link + '\n'
            else:
                response_str = ('🔗 Only one link was found in the message: '
                                + response_links_wd[0])

            self.send_message(peer_id, response_str)

        elif (from_id == 140830142) and \
                ('info' in msg_text or 'инфо' in msg_text or 'i' in msg_text) or \
                ('ping' in msg_text or 'пинг' in msg_text):
            Log().info(tag, 'Bot info requested.')
            self.send_message(peer_id, 'pong')
            self.send_message(peer_id,
                              self.info.replace("%peer_id%", str(peer_id)))

        else:
            Log().info(tag, 'Unknown command.')
            self.send_message(event.obj.peer_id, '🐸 No links here.')

    def get_cc_link(self, url, private):
        cc_link = self.api_bot_vk.utils.getShortLink(url=url, private=private)
        return cc_link

    def send_message(self, user_id, text):
        self.api_bot_vk.messages.send(peer_id=user_id,
                                      message=text,
                                      random_id=get_random_id(),
                                      dont_parse_links=1)
Example #12
def main():
    LOG.info("Started and opening connection.")
    with grpc.secure_channel(
            os.getenv("SEABIRD_HOST_PORT"),
            grpc.ssl_channel_credentials(),
    ) as channel:
        channel = grpc.intercept_channel(
            channel,
            add_header(
                "authorization",
                f'Bearer {os.getenv("SEABIRD_TOKEN")}',
            ),
        )
        LOG.info("Successfully connected.")
        stub = seabird_pb2_grpc.SeabirdStub(channel)
        LOG.info("Monitoring for events.")
        for event in stub.StreamEvents(
                seabird_pb2.StreamEventsRequest(commands={
                    "inspect_image":
                    seabird_pb2.CommandMetadata(
                        name="inspect_image",
                        short_help="AWS Rekognition to analyze an image",
                        full_help="Analyze an image's content",
                    ),
                    "inspect_celebrity":
                    seabird_pb2.CommandMetadata(
                        name="inspect_image",
                        short_help="AWS Rekognition to analyze an image",
                        full_help="Analyze an image's content",
                    ),
                }, )):
            LOG.debug("Event received: %s", event.message.text)
            command = event.command
            message = event.message

            if not command.command and not message:
                continue

            extractor = URLExtract(extract_localhost=False)

            if command.command == "inspect_image":
                if command.arg and extractor.has_urls(command.arg):
                    LOG.info(
                        "Image command detected from %s",
                        command.source.user.display_name,
                    )
                    handle_image(stub, command)
                elif not extractor.has_urls(command.arg):
                    LOG.info(
                        "No URL detected from %s",
                        command.source.user.display_name,
                    )
                    stub.SendMessage.with_call(
                        seabird_pb2.SendMessageRequest(
                            channel_id=command.source.channel_id,
                            text=
                            f"{command.source.user.display_name}: Missing URL",
                        ))
                else:
                    LOG.info(
                        "Image command invalid from %s",
                        command.source.user.display_name,
                    )
                    stub.SendMessage.with_call(
                        seabird_pb2.SendMessageRequest(
                            channel_id=command.source.channel_id,
                            text=
                            f"{command.source.user.display_name}: Something's not right",
                        ))
                continue
            elif command.command == "inspect_celebrity":
                if command.arg and extractor.has_urls(command.arg):
                    LOG.info(
                        "Celebrity command detected from %s",
                        command.source.user.display_name,
                    )
                    handle_celebrity(stub, command)
                elif not extractor.has_urls(command.arg):
                    LOG.info(
                        "No URL detected from %s",
                        command.source.user.display_name,
                    )
                    stub.SendMessage.with_call(
                        seabird_pb2.SendMessageRequest(
                            channel_id=command.source.channel_id,
                            text=
                            f"{command.source.user.display_name}: Missing URL",
                        ))
                else:
                    LOG.info(
                        "Celebrity command invalid from %s",
                        command.source.user.display_name,
                    )
                    stub.SendMessage.with_call(
                        seabird_pb2.SendMessageRequest(
                            channel_id=command.source.channel_id,
                            text=
                            f"{command.source.user.display_name}: Something's not right",
                        ))
                continue
            elif extractor.has_urls(message.text):
                LOG.info(
                    "Detected link from %s",
                    message.source.user.display_name,
                )
                handle_url(stub, message)
            else:
                continue
Example #13
def new_user(bot, update):
    """New member join the group event handler"""

    message = update.message
    chat_id = message.chat_id
    chat_config = storage.get_chat_config(chat_id)
    if not chat_config.enabled:
        return
    message_id = message.message_id
    msg_from_user_id = message.from_user.id
    msg_from_alias = message.from_user.name
    join_date = message.date

    lang = chat_config.language
    # For each new user that joined or was added

    for join_user in message.new_chat_members:
        join_user_id = join_user.id
        join_user_alias = join_user.name
        join_user_name = "{} {}".format(join_user.first_name, join_user.last_name)

        # we do not allow certain user names
        if storage.is_name_in_black_list([join_user_alias, join_user_name]):
            log.info(
                "Possible spammer [blacklisted] kicked %s on chat %s",
                join_user_name,
                chat_id,
            )
            delete_message(
                chat_id,
                join_user_id,
                message_id,
                "[SPAMMER] {}".format(message.text),
                bot,
            )
            kick_user_from_chat(
                bot, join_user_id, join_user_alias, chat_id, "name blacklisted"
            )
            continue

        # If the added user is this bot itself
        if bot.id == join_user_id:
            # The Anti-Spam Bot has been added to a group
            anti_spam_bot_added_event(chat_id, bot, update)
            continue
        else:
            to_register_user = True
            # If the message source user is not the joining user,
            # the user was invited/added by someone else
            if msg_from_user_id != join_user_id and join_user.is_bot:
                # If a user has added a bot, check whether it may be added, and delete it if not
                to_register_user = try_to_add_a_bot_event(
                    bot, msg_from_user_id, join_user, chat_id
                )
                if not to_register_user:
                    # if it is not a legitimate bot, log it and do nothing
                    log.warn(
                        "{msg_from_user_id} has tried to join {join_user} to  {chat_id}".format(
                            msg_from_user_id=msg_from_user_id,
                            join_user=join_user,
                            chat_id=chat_id,
                        )
                    )
                    continue
            if to_register_user and (msg_from_user_id != join_user_id):
                if not storage.is_user_allowed_to_add_users(
                    bot, msg_from_user_id, chat_id
                ):
                    log.warn(
                        "%s is has tried to add another user: %s on chat %s",
                        msg_from_alias,
                        join_user_name,
                        chat_id,
                        exc_info=0,
                    )
                    delete_message(
                        chat_id,
                        join_user_id,
                        message_id,
                        "[ADDER] {}".format(message.text),
                        bot,
                    )
                    kick_user_from_chat(
                        bot,
                        join_user_id,
                        join_user_name,
                        chat_id,
                        "Tried to add another user",
                    )
                    continue

            if to_register_user:
                # Check if there is a URL in the user name
                extractor = URLExtract()
                has_url = extractor.has_urls(join_user_name) or extractor.has_urls(
                    join_user_alias
                )
                if has_url:
                    log.warn(
                        "Spammer (URL name) join detected.\n  (Chat) - ({}).".format(
                            chat_id
                        )
                    )
                    if len(join_user_name) > 15:
                        join_user_name = join_user_name[:10] + "..."
                    try:
                        bot.delete_message(chat_id, message_id)
                        bot_message = msg(lang, "USER_URL_NAME_JOIN").format(
                            join_user_name
                        )
                        log.info(
                            "Spammer (URL name) join message successfully removed.\n"
                            "  (Chat) - ({}).".format(chat_id)
                        )
                        notifications.tlg_send_selfdestruct_msg(
                            bot, chat_id, bot_message
                        )
                    except Exception as e:
                        log.error(
                            "Exception when deleting a Spammer (URL name) join "
                            "message - {}".format(str(e))
                        )
                        if str(e) == "Message can't be deleted":
                            bot_message = msg(
                                lang, "USER_URL_NAME_JOIN_CANT_REMOVE"
                            ).format(join_user_name)
                            notifications.tlg_send_selfdestruct_msg(
                                bot, chat_id, bot_message
                            )
                    continue
                else:
                    # Check if user name and last name are too long
                    if len(join_user_name) > conf.MAX_USERNAME_LENGTH:
                        join_user_name = join_user_name[:10] + "..."
                        try:
                            bot.delete_message(chat_id, message_id)
                            bot_message = msg(lang, "USER_LONG_NAME_JOIN").format(
                                join_user_name
                            )
                            log.info(
                                "Spammer (long name) join message successfully removed."
                                "  (Chat) - ({}).".format(chat_id)
                            )

                        except Exception as e:
                            log.error(
                                "Exception when deleting a Spammer (long name) join "
                                "message - {}".format(str(e))
                            )
                            if str(e) == "Message can't be deleted":
                                bot_message = msg(
                                    lang, "USER_LONG_NAME_JOIN_CANT_REMOVE"
                                ).format(join_user_name)

                        notifications.tlg_send_selfdestruct_msg(
                            bot, chat_id, bot_message
                        )

                if len(join_user_alias) > conf.MAX_USERNAME_ALIAS:
                    # if the alias is too long, just shorten it
                    join_user_alias = join_user_alias[:conf.MAX_USERNAME_ALIAS - 3] + "..."
            if (conf.VERBOSE_LIMIT > 0) and (storage.last_addition(chat_id) > conf.VERBOSE_LIMIT):
                notifications.tlg_send_selfdestruct_msg(
                    bot=bot,
                    chat_id=chat_id,
                    message=msg(lang, "WELCOME_MSG").format(
                        join_user_alias,
                        chat_config.num_messages_for_allow_urls,
                        chat_config.time_for_allow_urls,
                    ),
                    minutes=conf.VERBOSE_LIMIT,
                )

            storage.register_new_user(
                chat_id=chat_id,
                user_id=join_user_id,
                user_name=join_user_alias,
                first_name=join_user.first_name,
                last_name=join_user.last_name,
                join_date=join_date,
                allow_user=False,
            )

            log.info("{} added to the group {}".format(join_user_alias, chat_id))
Example #14
def renderMarkdown(text, ignoreLinks=False, heading=False, alignment=False, properties=False, view_type=False):
    isAttribute = False
    if ':hiccup' in text:
        # THIS DOES NOT WORK WELL !!! VERY BROKEN
        # text = 'hr '
        data = re.sub(r'\n', '', text.strip())
        data = re.sub(r':hiccup \[:hr\]', r'<hr>', data)
        data = re.sub(r'(\[\s*?):([\w-]+)', r'\1"\2",', data)
        data = re.sub(r':([\w-]+)', r'"\1":', data)
        data = re.sub(r'([\}\]\:][\s]*?)(\w+)([\s]*?[\[\{\]])', r'\1"\2"\3', data)
        data = re.sub(r'([\}\]\"])([\s\n]*?)([\[\{\"])', r'\1,\2\3', data)
        # print(data[9:])
        # data = re.sub(r'(hr)', r'hr', data)  # this tag is not being converted correctly

        # print(data[10:])
        # print(json.loads(data[10:]))
        # print(convert(data))
        # return convert(data)
        return data

    if ignoreLinks is False:
        global wordcount
        wordcount += len(text.split())
    # todo correctly render page alias {{alias: [[Roam Research]] Roam}}
    # todo fix URLs that contain a #
    # todo if attribute exists set a flag so the attribute can be picked up and attributed to the parent block
    if re.match(r'\b(.+)\:\:', text, flags=0):
        isAttribute = True
    text = re.sub(r'^\[\[>\]\](.*)', r'<blockquote>\1</blockquote>', text)  # blockquote
    text = re.sub(r'\b(.+)\:\:', lambda x: _processAttribute(x, text), text)  # attributes
    text = re.sub(r'^(\-\-\-)$', r'<hr>', text)
    text = re.sub(r'{{\[\[TODO\]\]}}', _processCheckmark(False), text)  # unchecked TO DO
    text = re.sub(r'{{{\[\[DONE\]\]}}}}', _processCheckmark(True), text)  # checked TO DO alt
    text = re.sub(r'{{\[\[DONE\]\]}}', _processCheckmark(True), text)  # checked TO DO
    text = re.sub(r'\!\[([^\[\]]*?)\]\((.+?)\)', r'<img src="\2" alt="\1" />', text)  # markdown images
    text = re.sub(r'\{\{\[\[youtube\]\]:(.+?)\}\}', lambda x: _processExternalEmbed(x, text, "youtube"), text)  # external clojure embeds
    text = re.sub(r'\{\{\[\[query\]\]:(.+?)\}\}', lambda x: _processQueries(x, text), text)  # queries
    text = re.sub(r'\{\{(.*):.*[^\{\}]\((.+?)\)\)(.*)\}\}', lambda x: _processInternalEmbed(x, text), text)  # clojure embeds and Block aliases
    text = re.sub(r'\{\{(.*):.*[^\{\}]\[(.+?)\]\](.*)\}\}', lambda x: _processInternaPagelEmbed(x, text), text)  # clojure page aliases
    text = re.sub(r'\{\{\[\[slider\]\](.*)\}\}', lambda x: _processSlider(x, text, properties), text)  # sliders

    text = re.sub(r'(\{\{or:(.+?)\}\})', lambda x: _processTextVersion(x, text), text)  # text versioning
    if ignoreLinks:
        text = re.sub(r'\[\[(.+?)\]\]', r'\1', text)  # page links
        text = re.sub(r'\[([^\[\]]+?)\]\((.+?)\)', r'\1', text)  # external links
        text = re.sub(r'\b(.+)\:\:', lambda x: _processAttribute(x, text), text)  # attributes

    else:
        text = re.sub(r'\[([^\[\]]+?)\]\(\[\[(.+?)\]\]\)', lambda x: _processInternalAlias(x, text), text)  # internal page aliases
        text = re.sub(r'\[([^\[\]]+?)\]\(\(\((.+?)\)\)\)', lambda x: _processInternalBlockAlias(x, text), text)  # internal block aliases
        text = re.sub(r'\[([^\[\]]+?)\]\(([^\[\]\(].+?)\)', lambda x: _processExternalAlias(x, text), text)  # external aliases
        text = re.sub(r'(?<!href="\/[A-Za-z0-9\-\_]{8})(#(\w+))', lambda x: _processInternalTag(x, text), text)  # tags without brackets

        text = re.sub(r'(\#\[\[(.+?)\]\])', lambda x: _processInternalTag(x, text), text)  # tag with brackets
        text = re.sub(r'(?<!\#)\[\[(.+?)\]\]', lambda x: _processInternalLink(x, text), text)  # pages with brackets

    text = re.sub(r'\n', r'<br>', text)  # newline
    text = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', text)  # bold
    text = re.sub(r'\_\_(.*?)\_\_', r'<em>\1</em>', text)  # italic
    text = re.sub(r'\~\~(.+?)\~\~', r'<s>\1</s>', text)  # strikethrough
    text = re.sub(r'\^\^(.+?)\^\^', r'<span class="highlight">\1</span>', text)  # highlight
    text = re.sub(r'\`\`\`(.+?)\`\`\`', r'<code>\1</code>', text)  # large codeblock
    text = re.sub(r'\`(.+?)\`', r'<code>\1</code>', text)  # inline codeblock

    def isBlockPrivate(blockID, blockText):
        if blockID in block_ids:
            return renderMarkdown(block_ids[blockID]['string'])
        else:
            # the referenced block is private; render nothing in its place
            # (returning '' keeps re.sub from failing on a None replacement)
            return ''

    text = re.sub(r'\(\((.+?)\)\)', lambda x: isBlockPrivate(x.group(1), text), text)  # block ref

    # deal with bare URLs
    # (skip if the text already contains rendered HTML or markdown links)
    forbidden_chars = ['<a', '<img', '[', '<code', '<iframe']
    if not any(substring in text for substring in forbidden_chars):
        extractor = URLExtract()
        if extractor.has_urls(text):
            for url in extractor.gen_urls(text):
                text = text.replace(url, _processBareURL(url))
                # print(text)

    if heading:
        text = f'<h{heading}>{text}</h{heading}>'
    if alignment:
        text = f'<div style="text-align:{alignment};">{text}</div>'
    return text
Example #15
tweets['clean_text'] = tweets['clean_text'].str.replace("\t"," ")
# Replace & with and 
tweets['clean_text'] = tweets['clean_text'].str.replace("&amp;"," and ")

# ========================================== EXTRACT AND REMOVE URL's ========================================================
#TBD: MAKE FUNCTION
from urlextract import URLExtract
extractor = URLExtract()
# Add more right stop chars in case of parentheses near URLs
stop_chars = list(extractor.get_stop_chars_right())
stop_chars.append(')')
extractor.set_stop_chars_right(set(stop_chars))
     
tweets['url_count'] = 0
for i, t in enumerate(tweets['clean_text']):
    if extractor.has_urls(t):
        urls = extractor.find_urls(t)
        # Add URL count feature
        tweets.loc[tweets.index[i], 'url_count'] = len(urls)
        print(i, len(urls))
        # Remove URLs from the text (plain string replace; URLs are not valid regexes)
        line = t
        for item in urls:
            line = line.replace(item, '')
        tweets.loc[tweets.index[i], 'clean_text'] = line

print(tweets.info())
print(tweets.describe())
print(tweets['clean_text'].iloc[1060])
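
A minimal standalone illustration of the stop-char tweak above (get_stop_chars_right and set_stop_chars_right are urlextract's public API; the sample string is made up, and a set union is equivalent to the list round-trip used here):

    from urlextract import URLExtract

    ex = URLExtract()
    ex.set_stop_chars_right(ex.get_stop_chars_right() | {')'})
    print(ex.find_urls("(details: http://example.com/page)"))
    # ['http://example.com/page'] -- the ')' no longer sticks to the URL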

# ============================================= HASHTAGS AND MENTIONS COUNT =============================================
Example #16
def validate_local_restrictions(local_restrictions):
    extractor = URLExtract()
    if not extractor.has_urls(local_restrictions):
        raise ValidationError(
            _('Please provide a link to your government website outlining this.'
              ))
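
Presumably this function is registered as a Django field validator; a hypothetical usage sketch (the model and field names are made up):

    from django.db import models

    class Report(models.Model):
        local_restrictions = models.TextField(
            validators=[validate_local_restrictions])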
Example #17
    def check_citation(self, citation):
        #NOTE: when implementing, wrap the method in a try catch and print out any error + the citation status

        try:
            pattern = re.compile("[ ][0-9]{4}")
            result = pattern.search(citation)
            self.year = result.group(0)[1:]
        except AttributeError:
            # pattern.search returned None, i.e. no 4-digit year was found
            raise Exception("Unable to find year in citation.")

        self.citation_status = MLACitationStatus.AUTHOR

        cursor = 0

        while True:
            ascii_value = ord(citation[cursor])

            # advance while the character is one of space, apostrophe, ",-./",
            # an ASCII letter, or a Latin-1 letter, and we have not reached
            # the end of the author section (". ")
            if citation[cursor:cursor + 2] != ". " and (
                    ascii_value == 32 or ascii_value == 39
                    or 44 <= ascii_value <= 46 or 65 <= ascii_value <= 90
                    or 97 <= ascii_value <= 122 or 192 <= ascii_value <= 255):
                cursor += 1
            else:
                break

        if cursor != 0:
            author_section = ""
            if citation[cursor:cursor + 2] == ". ":
                author_section = citation[:cursor + 1]
            else:
                raise Exception(
                    "Bad formatting in the author section (unknown error).")

            # three or more authors
            if ", et al." in author_section:
                temp = author_section.replace(", et al", "")
                authors = temp.split(", ")
                filteredAuthor = [self.filter_latin(i) for i in authors]

                if re.match("^[A-Za-z][A-Za-z-' ]+$", filteredAuthor[0]) is not None \
                and re.match("^[A-Z][A-Za-z-'. ]+[.]$", filteredAuthor[1]) is not None:
                    self.authors.append(authors[0] + ", et al.")
                else:
                    raise Exception("Bad formatting in the author section: '" +
                                    author_section + "'")

            # two authors
            elif ", and " in author_section:
                authors = author_section.split(", and ")
                if ", " not in authors[0]:
                    raise Exception("Bad formatting in the author section: '" +
                                    author_section + "'")

                firstAuthor = authors[0].split(", ")
                filteredFirstAuthor = [
                    self.filter_latin(i) for i in firstAuthor
                ]

                if re.match("^[A-Za-z][A-Za-z-' ]+$", filteredFirstAuthor[0]) is not None \
                and re.match("^[A-Z][A-Za-z-'. ]+$", filteredFirstAuthor[1]) is not None:
                    self.authors.append(firstAuthor[0])
                else:
                    raise Exception("Bad formatting in the author section: '" +
                                    author_section + "'")

                if " " not in authors[1]:
                    raise Exception("Bad formatting in the author section: '" +
                                    author_section + "'")

                secondAuthor = authors[1].split(" ", 1)
                filteredSecondAuthor = [
                    self.filter_latin(i) for i in secondAuthor
                ]

                if re.match("^[A-Z][A-Za-z-']+$", filteredSecondAuthor[0]) is not None \
                and re.match("^[A-Za-z][A-Za-z-'. ]+[.]$", filteredSecondAuthor[1]) is not None:
                    self.authors.append(filteredSecondAuthor[1][:-1])

                elif re.match("^[A-Za-z][.]$",
                              filteredSecondAuthor[1]) is not None:
                    author_cursor = cursor + 2
                    actualSecondAuthor = ""

                    while citation[author_cursor:author_cursor + 2] != ". ":
                        actualSecondAuthor += citation[author_cursor]
                        author_cursor += 1

                    self.authors.append(actualSecondAuthor)

                else:
                    raise Exception("Bad formatting in the author section: '" +
                                    author_section + "'")

            # one author
            elif ", " in author_section:
                authors = author_section.split(", ")
                filteredAuthor = [self.filter_latin(i) for i in authors]

                if re.match("^[A-Za-z][A-Za-z-' ]+$", filteredAuthor[0]) is not None \
                and re.match("^[A-Z][A-Za-z-' ]+[.]$", filteredAuthor[1]) is not None:
                    self.authors.append(authors[0])
                else:
                    raise Exception("Bad formatting in the author section: '" +
                                    author_section + "'")

            elif "et. al." in author_section or "et.al." in author_section:
                raise Exception(
                    "'Et al.' should not have a period after the 'Et'.")
            # no match; bad formatting
            else:
                raise Exception("Bad formatting in the author section: '" +
                                author_section + "'")

        self.citation_status = MLACitationStatus.TITLE
        cursor += 1
        # check the title section
        if citation[cursor:cursor + 3] == "<i>":
            cursor += 3
        elif citation[cursor + 1:cursor + 4] == "<i>":
            cursor += 4
        elif citation[cursor + 1] == "\"":
            cursor += 2
        elif citation[cursor - 1:cursor + 1] == ".\"":
            raise Exception("Bad formatting in the title section.")

        title = ""

        while citation[cursor] != ".":
            title += citation[cursor]
            cursor += 1

        title = title.replace("\"", "")
        title = title.replace("</i>", "")

        if title[0] == " ":
            title = title[1:]

        if citation[cursor + 1] == "\"":
            cursor += 2
        else:
            cursor += 1
        #now cursor should be at the beginning of italics

        result = url("https://brettterpstra.com/titlecase/?title=" + title)
        title_cased_title = result.read().decode('utf-8')

        if title != title_cased_title:
            self.warnings.append(
                "the title might contain improper capitalization: '" + title +
                "'")

        self.title = title

        # check for url
        self.citation_status = MLACitationStatus.URL

        extractor = URLExtract()
        if extractor.has_urls(citation):
            urls = extractor.find_urls(citation)
            self.url = urls[0][:-1]
            if self.url + "." not in citation:
                raise Exception("Bad formatting in the URL section.")

            if citation[cursor:cursor +
                        3] != "<i>" and citation[cursor + 1:cursor +
                                                 4] != "<i>":
                self.warnings.append(
                    "the container may not exist or may not be italicized")

        elif citation[cursor:cursor +
                      3] == "<i>" and citation[cursor + 1:cursor + 4] == "<i>":
            self.warnings.append(
                "the container might exist when not necessary (if the citation is about a book), or the block immediately following the title may be improperly italicized."
            )

        if self.url != "":
            citation.replace(self.url + ".", "")

        # check for other info
        # right now, it's too complex to validate the entire MLA citation without prior knowledge on what type of citation it is,
        # so the other info is just stored without checking
        self.citation_status = MLACitationStatus.OTHER_INFO

        remainingText = citation[cursor:]
        self.otherInfo = remainingText.split(", ")
Example #18
class Preprocessor(object):
    def __init__(self, min_times):
        self.min_times = min_times
        self.wnl = WordNetLemmatizer()
        self.stem_tool = PorterStemmer()
        self.url_extractor = URLExtract()
        self.UK_check_list = ['u.k', 'u.k.', 'uk', 'UK', 'U.K', 'U.K.']
        self.USA_check_list = [
            'u.s.', 'u.s', 'U.S', 'U.S.', 'U.S.A', 'usa', 'USA', 'u.s.a',
            'U.S.A.', 'u.s.a.'
        ]
        self.twoword_pattern = re.compile("[a-z][A-Z]{1}[a-z]", re.S)

        self.mark_words = []
        with open("tools/mark_words.txt", mode='r', encoding='utf-8') as f:
            for line in f.readlines():
                self.mark_words.append(line.strip())

        # add the Fox stopword list
        self.stopwords = []
        with open("tools/FoxStoplist.txt", mode='r', encoding='utf-8') as f:
            for line in f.readlines():
                self.stopwords.append(line.strip())
        # add punctuation marks as stopwords
        self.stopwords.extend(list(string.punctuation))
        # add the standard NLTK stopwords
        nltk_stopwords = list(stopwords.words('english'))
        self.stopwords.extend(nltk_stopwords)
        # counter for word frequencies
        self.word_counter = Counter()

    def deal_text(self, text):
        # replace URLs with a placeholder
        text = self.filter(text)
        if self.url_extractor.has_urls(text):
            sentence_urls = self.url_extractor.find_urls(text)
            for url in sentence_urls:
                text = text.replace(url, 'website-holder')

        doc_words = []
        for word in self.lemmatize_text(text):
            word = word.strip()

            match_string = re.findall(self.twoword_pattern, word)
            if len(match_string) != 0:
                for match in match_string:
                    words = word.replace(match, match[0] + ' ' +
                                         match[1:]).split(" ")
                    doc_words.extend(words)
            if word in self.stopwords:
                continue
            if word in self.UK_check_list:
                doc_words.append("uk")
            if word in self.USA_check_list:
                doc_words.append("usa")
            if word in self.mark_words:
                pattern = re.compile('[a-z]{1}-[a-z]{1}')
                match_string = re.findall(pattern, word)
                for match in match_string:
                    word = word.replace(match, match[0] + match[2:])
                    doc_words.append(word)
            # check whether the token is a number
            try:
                word_float = float(word)
                doc_words.append("#number#")
            except ValueError:
                pass
            if word == 'website-holder':
                doc_words.append("#website#")
            else:
                doc_words.append(word)
        # normalize to lowercase and stem
        for i, word in enumerate(doc_words):
            doc_words[i] = self.stem_tool.stem(word.lower())
        # update the word-frequency counts
        self.word_counter.update(doc_words)

        return doc_words

    def lemmatize_text(self, text):
        wnl = WordNetLemmatizer()
        sentences = sent_tokenize(text, language='english')
        for sentence in sentences:
            for word, tag in pos_tag(
                    word_tokenize(sentence, language='english')):
                if tag.startswith('NN'):
                    yield wnl.lemmatize(word, pos='n')
                elif tag.startswith('VB'):
                    yield wnl.lemmatize(word, pos='v')
                elif tag.startswith('JJ'):
                    yield wnl.lemmatize(word, pos='a')
                elif tag.startswith('R'):
                    yield wnl.lemmatize(word, pos='r')
                else:
                    yield wnl.lemmatize(word)

    def filter(self, text):
        """
        Filter out useless text.
        """
        if isinstance(text, float):
            return ''
        pattern = re.compile(r"\[This API is no longer available\.(.*?)\]",
                             re.S)
        a = pattern.findall(text)
        if len(a) > 0:
            replace_item = "[This API is no longer available." + a[0] + ']'
            text = text.replace(replace_item, '').strip()
        else:
            text = text.strip()
        return text

    def read_data(self, csv_path):
        df = pd.read_csv(csv_path)
        all_process_cuts = []
        all_name = []
        all_tags = []
        all_labels = []
        label_map = dict()
        tag_map = {"UNK": 0}
        for i in tqdm(range(len(df))):
            row_data = df.iloc[i, :]
            APIName = row_data['name'].strip().lower()[:-4]  # the trailing " api" suffix has already been dropped here
            # process tags
            tag_list = row_data['tags'].strip().lower().split(",")
            # build tag_map
            for tag in tag_list:
                if tag not in tag_map:
                    tag_map[tag] = len(tag_map)
            # description
            raw_desc = row_data['description'].strip()
            desc_cuts = self.deal_text(raw_desc)
            # process the category
            category = row_data['category'].strip()
            if category not in label_map:
                label_map[category] = len(label_map)

            all_name.append(APIName.strip())
            all_tags.append(tag_list)
            all_labels.append(category)
            all_process_cuts.append(desc_cuts)

        word_map = {'UNK': 0}
        for word, times in self.word_counter.items():
            if times < self.min_times:
                continue
            else:
                if word in word_map:
                    continue
                else:
                    word_map[word] = len(word_map)

        return all_name, all_process_cuts, all_labels, all_tags, word_map, tag_map, label_map

    def begin(self,
              csv_path,
              evaluate=False,
              old_word_map=None,
              old_tag_map=None,
              old_label_map=None):
        all_name, all_process_cuts, all_labels, \
        all_tags, get_word_map, get_tag_map, get_label_map = self.read_data(csv_path)

        all_encoded_tag = []
        all_encoded_category = []
        all_encoded_docs = []
        if not evaluate:
            word_map = get_word_map
            tag_map = get_tag_map
            label_map = get_label_map
        else:
            assert old_word_map is not None and old_tag_map is not None and old_label_map is not None
            word_map = old_word_map
            tag_map = old_tag_map
            label_map = old_label_map

        # encode tags
        for doc_tag in all_tags:
            doc_encoded_tag = []
            for tag in doc_tag:
                if tag in tag_map:
                    doc_encoded_tag.append(tag_map[tag])
                else:
                    doc_encoded_tag.append(tag_map['UNK'])
            all_encoded_tag.append(doc_encoded_tag)

        # encode labels
        for doc_label in all_labels:
            all_encoded_category.append(label_map[doc_label])

        # encode descriptions
        for words in all_process_cuts:
            encoded_doc = []
            for word in words:
                if word in word_map:
                    encoded_doc.append(word_map[word])
                else:
                    encoded_doc.append(word_map['UNK'])
            all_encoded_docs.append(encoded_doc)

        # collect the data
        data_dict = dict()
        data_dict['all_name'] = all_name
        data_dict['all_process_cuts'] = all_process_cuts
        data_dict['all_labels'] = all_labels
        data_dict['all_tags'] = all_tags

        data_dict['all_encoded_category'] = all_encoded_category
        data_dict['all_encoded_tag'] = all_encoded_tag
        data_dict['all_encoded_docs'] = all_encoded_docs

        data_dict['word_map'] = word_map
        data_dict['label_map'] = label_map
        data_dict['tag_map'] = tag_map

        return data_dict
Example #19
def extract_urls(body):
    urlset = set()
    extractor = URLExtract()
    excluded = [
        '.id', '.you', '.lol', '.like', '.now', '.my', '.love', '.phone',
        '.how', '.post', '.me', '.got', '.hot', '.im', '.best'
    ]
    for url in extractor.gen_urls(body):
        if len(url) < 5 or '.' not in url:
            continue
        if url.count('http') == 1:
            url = url.split('http')[1]
            url = 'http{}'.format(url)
        if '(' in url:
            rurl = url.split('(')
            if extractor.has_urls(rurl[1]):
                url = rurl[1]
            elif extractor.has_urls(rurl[0]):
                url = rurl[0]
            else:
                continue
        if ')' in url:
            lurl = url.split(')')
            if extractor.has_urls(lurl[0]):
                url = lurl[0]
            elif extractor.has_urls(lurl[1]):
                url = lurl[1]
            else:
                continue
        if any(url.endswith(suffix) for suffix in excluded):
            continue
        # """
        if '[IMG]' in url:
            try:
                url = url.split('[IMG]')[1]
            except IndexError:
                pass
        if '[/IMG]' in url:
            try:
                url = url.split('[/IMG]')[0]
            except IndexError:
                pass
        if url.endswith('?fb'):
            url = url.replace('?fb', '')
        if url.endswith('?noredirect'):
            url = url.replace('?noredirect', '')
        elif url.endswith(
                '_d.jpg?maxwidth=640&amp;shape=thumb&amp;fidelity=medium'):
            url = url.replace(
                '_d.jpg?maxwidth=640&amp;shape=thumb&amp;fidelity=medium', '')
        elif url.endswith('?s=sms'):
            url = url.replace('?s=sms', '')
        if '//m.imgur.com' in url:
            url = url.replace('//m.imgur.com', '//imgur.com')
        if url.startswith('https://thumbs.gfycat.com/'):
            url = url.replace('https://thumbs.gfycat.com/',
                              'https://gfycat.com/')
        if url.endswith('-size_restricted.gif'):
            url = url.replace('-size_restricted.gif', '')
        # """
        urlset.add(url)
    return urlset
Example #20
# plt.bar(counts.index, counts.values, label="mean: " +str(np.mean(counts.values)))
# plt.xticks(counts.index, rotation='vertical')
# plt.subplots_adjust(bottom=0.25)
# plt.legend()
# plt.show()
# #plt.savefig('tweetsperday.png', format='png')

# selected = []
# for i in range(len(df['Date'])):
#     if df['Date'][i] == '12/2/2019':
#         selected.append(trump['text'][i])

vec = CountVectorizer(stop_words="english")
w = []
for i in trump['text']:
    if extractor.has_urls(i):
        # strip every URL from the tweet before keeping it
        for k in extractor.find_urls(i):
            i = i.replace(k, '')
    w.append(i)

w.remove(' https://t.co/G6lGfyxSUs')
#remove '@' and '#'
for i in range(len(w)):
    if '@' in w[i]:
        w[i] = w[i].replace('@', '')
    if '#' in w[i]:
        w[i] = w[i].replace('#', '')
    if '&amp' in w[i]:
gc.collect()

list_full_urls = []
list_domains = []

length = len(trolls)

for i in range(length):  # TODO set length

    if i % 100 == 0:
        print(i)
    troll_text = trolls.iloc[i][TWITTER_MESSAGE_CONTENT]
    all_urls = []


    if extractor.has_urls(troll_text):
        all_urls = extractor.find_urls(troll_text)

    for url in all_urls:
        try:
            unshortened_url = get_unshortened_url(url)
            url_domain = get_domain_from_url(unshortened_url)
            #print(unshortened_url, url_domain)
            list_full_urls.append(unshortened_url)
            list_domains.append(url_domain)
        except Exception:
            print("broken url ", url)
            continue

troll_url_data = pd.DataFrame({"url" : list_full_urls, "domain" : list_domains})
troll_url_data.to_csv("/output/" + TROLL_TYPE + "_urls.csv", index=False)