Python search_urls Exemples, sopel.tools.web.search_urls Python Exemples

Exemple #1

0

Afficher le fichier

def test_search_urls_exclusion_char():
    """Check that a URL prefixed by the exclusion character is skipped."""
    # a lone excluded URL yields no result at all
    found = list(search_urls('!http://example.com', exclusion_char='!'))
    assert not found, 'Must not find URL, found %d' % len(found)

    # the non-excluded URL is returned wherever the excluded one appears:
    # first after it, then before it
    for text in ('http://b.com !http://a.com', '!http://a.com http://b.com'):
        found = list(search_urls(text, exclusion_char='!'))
        assert len(found) == 1, 'Must find 1 URL, found %d' % len(found)
        assert 'http://b.com' in found

Exemple #2

0

Afficher le fichier

Fichier : url.py Projet : r4f4/sopel

def title_command(bot, trigger):
    """
    Reply with the title of each URL given to the command; without an
    argument, fall back to the last URL seen for ``trigger.sender``.
    """
    if trigger.group(2):
        # an argument was given: look for URLs in the trigger itself
        urls = web.search_urls(
            trigger,
            exclusion_char=bot.config.url.exclusion_char)
    else:
        # nothing remembered for this sender: nothing to do
        if trigger.sender not in bot.memory['last_seen_url']:
            return
        # stop here when check_callbacks reports a match for that URL
        if check_callbacks(bot, bot.memory['last_seen_url'][trigger.sender]):
            return
        urls = [bot.memory['last_seen_url'][trigger.sender]]

    for url, title, domain, tinyurl in process_urls(bot, trigger, urls):
        suffix = ' ( %s )' % tinyurl if tinyurl else ''
        bot.reply('[ %s ] - %s' % (title, domain) + suffix)
        # remember the URL that was just handled for this sender
        bot.memory['last_seen_url'][trigger.sender] = url

Exemple #3

0

Afficher le fichier

Fichier : url.py Projet : heroku-miraheze/Exambot-Source

def title_auto(bot, trigger):
    """
    Say the title of every URL found in the trigger, unless the trigger is
    the title command itself or is flagged in the safety cache.
    """
    # the title command handles its own URLs
    if re.match(bot.config.core.prefix + 'title', trigger):
        return

    # Avoid fetching known malicious links: bail out when the safety cache
    # holds this trigger with more than one positive
    memory = bot.memory
    if (
        'safety_cache' in memory
        and trigger in memory['safety_cache']
        and memory['safety_cache'][trigger]['positives'] > 1
    ):
        return

    found_urls = web.search_urls(
        trigger,
        exclusion_char=bot.config.url.exclusion_char,
        clean=True,
    )

    for url, title, domain, tinyurl in process_urls(bot, trigger, found_urls):
        suffix = ' ( %s )' % tinyurl if tinyurl else ''
        message = '[ %s ] - %s' % (title, domain) + suffix
        # Guard against responding to other instances of this bot.
        if message == trigger:
            continue
        bot.say(message)
        bot.memory['last_seen_url'][trigger.sender] = url

Exemple #4

0

Afficher le fichier

def test_search_urls_exclusion_char_only_once():
    """Check that excluding one occurrence of a URL is not a global exclude.

    ``http://a.com`` appears twice, once with the exclusion character and
    once without: the second occurrence must still be returned, next to
    ``http://b.com``.
    """
    # assert only the instance excluded is excluded
    # ie. that it is not a global exclude, otherwise that would return 1 url
    urls = list(
        search_urls('!http://a.com http://a.com http://b.com',
                    exclusion_char='!'))
    # the failure message now agrees with the asserted count of 2
    assert len(urls) == 2, 'Must find 2 URLs, found %d' % len(urls)
    assert 'http://a.com' in urls
    assert 'http://b.com' in urls

Exemple #5

0

Afficher le fichier

def test_search_urls_multiple_urls_unique_keep_ordering():
    """Check that duplicates are dropped and first-seen order is kept."""
    text = 'http://a.com/ http://c.com/ http://b.com/ http://a.com/'
    found = list(search_urls(text))

    assert len(found) == 3, 'Must find 3 URLs, found %d' % len(found)
    for expected in ('http://a.com/', 'http://b.com/', 'http://c.com/'):
        assert expected in found

    # order of first appearance: a, c, b
    assert found == ['http://a.com/', 'http://c.com/', 'http://b.com/']

Exemple #6

0

Afficher le fichier

def test_search_urls_defined_schemes(scheme):
    """Check that only URLs of the requested ``scheme`` are returned."""
    url_by_scheme = {
        'http': 'http://a.com',
        'https': 'https://c.com',
        'ftp': 'ftp://b.com',
        'steam': 'steam://portal2',
    }
    # None when ``scheme`` is not one of the four keys above
    expected = url_by_scheme.get(scheme)

    text = 'http://a.com ftp://b.com https://c.com steam://portal2'
    found = list(search_urls(text, schemes=[scheme]))

    assert len(found) == 1, 'Only %s URLs must be found' % scheme
    assert expected in found

Exemple #7

0

Afficher le fichier

Fichier : coretasks.py Projet : GewoonYorick/sopel

def handle_url_callbacks(bot, trigger):
    """Call the URL callbacks matching each URL of the trigger.

    URLs are searched in the trigger using the schemes from
    ``bot.config.core.auto_url_schemes``; for each one, every callback
    returned by ``bot.search_url_callbacks`` that carries a ``url_regex``
    attribute is called with the corresponding match.
    """
    allowed_schemes = bot.config.core.auto_url_schemes
    for found_url in web.search_urls(trigger, schemes=allowed_schemes):
        for callback, url_match in bot.search_url_callbacks(found_url):
            # skip anything that lacks the ``url_regex`` attribute
            if not hasattr(callback, 'url_regex'):
                continue
            callback(bot, trigger, match=url_match)

Exemple #8

0

Afficher le fichier

Fichier : coretasks.py Projet : Sickmantella/sopel

def handle_url_callbacks(bot, trigger):
    """Hand the URL callbacks matching each URL of the trigger to ``bot.call``.

    URLs are searched in the trigger using the schemes from
    ``bot.config.core.auto_url_schemes``; for each one, every callback
    returned by ``bot.search_url_callbacks`` that carries a ``url_regex``
    attribute is wrapped and passed to ``bot.call``.
    """
    allowed_schemes = bot.config.core.auto_url_schemes
    for found_url in web.search_urls(trigger, schemes=allowed_schemes):
        for callback, url_match in bot.search_url_callbacks(found_url):
            # skip anything that lacks the ``url_regex`` attribute
            if not hasattr(callback, 'url_regex'):
                continue

            # wrap the callback so that ``match`` is supplied on its behalf
            # NOTE(review): the wrapper reads the loop variables when it is
            # called; this presumes ``bot.call`` runs it before the next
            # iteration -- confirm
            @functools.wraps(callback)
            def decorated(bot, trigger):
                return callback(bot, trigger, match=url_match)

            bot.call(decorated, bot, trigger)

Exemple #9

0

Afficher le fichier

Fichier : url.py Projet : njsmith/sopel

def title_command(bot, trigger):
    """
    Reply with the title of each URL given to the command; without an
    argument, fall back to the last URL seen for ``trigger.sender``. An
    apology is sent when fewer titles than URLs could be produced.
    """
    if trigger.group(2):
        # materialized as a list because its length is needed further down
        urls = list(
            web.search_urls(trigger,
                            exclusion_char=bot.config.url.exclusion_char))
    else:
        # nothing remembered for this sender: nothing to do
        if trigger.sender not in bot.memory['last_seen_url']:
            return
        # stop here when check_callbacks reports a match for that URL
        if check_callbacks(bot, bot.memory['last_seen_url'][trigger.sender]):
            return
        urls = [bot.memory['last_seen_url'][trigger.sender]]

    result_count = 0
    for url, title, domain, tinyurl in process_urls(bot, trigger, urls):
        suffix = ' ( %s )' % tinyurl if tinyurl else ''
        bot.reply('%s | %s' % (title, domain) + suffix)
        bot.memory['last_seen_url'][trigger.sender] = url
        result_count += 1

    expected_count = len(urls)
    if result_count >= expected_count:
        # every URL produced a reply
        return

    if expected_count == 1:
        bot.reply(
            "Sorry, fetching that title failed. Make sure the site is working."
        )
    elif result_count == 0:
        bot.reply("Sorry, I couldn't fetch titles for any of those.")
    else:
        bot.reply(
            "I couldn't get all of the titles, but I fetched what I could!"
        )

Exemple #10

0

Afficher le fichier

def title_command(bot: SopelWrapper, trigger: Trigger):
    """
    Reply with the title of each URL given to the command; without an
    argument, fall back to the last URL seen for ``trigger.sender``. An
    apology is sent when fewer results than URLs were obtained.
    """
    result_count = 0

    if trigger.group(2):
        # materialized as a list because its length is needed further down
        urls = list(web.search_urls(trigger))
    elif trigger.sender in bot.memory['last_seen_url']:
        urls = [bot.memory["last_seen_url"][trigger.sender]]
    else:
        # no argument and nothing remembered for this sender
        return

    results = process_urls(bot, trigger, urls, requested=True)
    for url, title, domain, tinyurl, dispatched in results:
        # dispatched results count as handled but produce no reply here
        result_count += 1
        if dispatched:
            continue
        suffix = ' ( %s )' % tinyurl if tinyurl else ''
        bot.reply("%s | %s" % (title, domain) + suffix)
        bot.memory['last_seen_url'][trigger.sender] = url

    expected_count = len(urls)
    if result_count >= expected_count:
        # every URL produced a result
        return

    if expected_count == 1:
        bot.reply(
            "Sorry, fetching that title failed. Make sure the site is working."
        )
    elif result_count == 0:
        bot.reply("Sorry, I couldn't fetch titles for any of those.")
    else:
        bot.reply(
            "I couldn't get all of the titles, but I fetched what I could!"
        )

Exemple #11

0

Afficher le fichier

def title_auto(bot: SopelWrapper, trigger: Trigger):
    """
    Automatically show titles for URLs. For shortened URLs/redirects, find
    where the URL redirects to and show the title for that (or call a function
    from another plugin to give more information).

    Nothing is done when ``bot.settings.url.enable_auto_title`` is false or
    when the trigger looks like a command (the configured prefix followed by
    non-space characters). URLs flagged in the ``safety_cache`` or
    ``safety_cache_local`` memory entries, and URLs without a hostname, are
    skipped.
    """
    # Enabled or disabled by feature flag
    if not bot.settings.url.enable_auto_title:
        return

    # Avoid fetching links from another command
    if re.match(bot.config.core.prefix + r'\S+', trigger):
        return

    unchecked_urls = web.search_urls(
        trigger, exclusion_char=bot.config.url.exclusion_char, clean=True)

    urls = []
    safety_cache = bot.memory.get("safety_cache", {})
    safety_cache_local = bot.memory.get("safety_cache_local", {})
    for url in unchecked_urls:
        # Avoid fetching known malicious links
        if url in safety_cache and safety_cache[url]["positives"] > 0:
            continue
        # ``hostname`` is None when the URL has no host part; skip such URLs
        # instead of failing on ``None.lower()``
        hostname = urlparse(url).hostname
        if not hostname or hostname.lower() in safety_cache_local:
            continue
        urls.append(url)

    for url, title, domain, tinyurl, dispatched in process_urls(
            bot, trigger, urls):
        if not dispatched:
            message = '%s | %s' % (title, domain)
            if tinyurl:
                message += ' ( %s )' % tinyurl
            # Guard against responding to other instances of this bot.
            if message != trigger:
                bot.say(message)
        # remembered for dispatched and non-dispatched results alike
        bot.memory["last_seen_url"][trigger.sender] = url

Exemple #12

0

Afficher le fichier

    def __init__(self, own_nick, line, url_schemes=None):
        """Parse a raw IRC line and store its components on the instance.

        :param own_nick: the bot's nick; when the first argument of the line
                         equals it (both lowercased), ``sender`` is set to
                         the nick taken from the hostmask instead
        :param line: the raw line; CR/LF characters are stripped from both
                     ends before parsing
        :param url_schemes: forwarded as ``schemes`` to ``web.search_urls``

        Sets ``line``, ``tags``, ``time``, ``hostmask``, ``text``, ``event``,
        ``args``, ``nick``, ``user``, ``host``, ``sender``, ``urls`` and
        ``plain``.
        """
        line = line.strip('\r\n')
        self.line = line
        self.urls = tuple()
        self.plain = ''

        # Break off IRCv3 message tags, if present
        self.tags = {}
        if line.startswith('@'):
            tagstring, line = line.split(' ', 1)
            for tag in tagstring[1:].split(';'):
                tag = tag.split('=', 1)
                if len(tag) > 1:
                    self.tags[tag[0]] = tag[1]
                else:
                    # a tag without "=" is stored with a None value
                    self.tags[tag[0]] = None

        # Default to the current time (timezone-aware, UTC); replaced by the
        # "time" tag when it is present and parses
        self.time = datetime.datetime.now(datetime.timezone.utc)
        # A valueless "time" tag is stored as None, which strptime would
        # reject with TypeError rather than ValueError: skip it (and the
        # empty string, which can never match the format) up front.
        tag_time = self.tags.get('time')
        if tag_time:
            try:
                self.time = datetime.datetime.strptime(
                    tag_time,
                    "%Y-%m-%dT%H:%M:%S.%fZ",
                ).replace(tzinfo=datetime.timezone.utc)
            except ValueError:
                pass  # Server isn't conforming to spec, ignore the server-time

        # Grabs hostmask from line.
        # Example: line = ':Sopel!foo@bar PRIVMSG #sopel :foobar!'
        #          print(hostmask)  # Sopel!foo@bar
        # All lines start with ":" except PING.
        if line.startswith(':'):
            self.hostmask, line = line[1:].split(' ', 1)
        else:
            self.hostmask = None

        # Parses the line into a list of arguments.
        # Some events like MODE don't have a secondary string argument, i.e. no ' :' inside the line.
        # Example 1:  line = 'PRIVMSG #sopel :foo bar!'  (hostmask already removed)
        #             print(text)    # 'foo bar!'
        #             print(argstr)  # 'PRIVMSG #sopel'
        #             print(args)    # ['PRIVMSG', '#sopel', 'foo bar!']
        # Example 2:  line = 'irc.libera.chat MODE Sopel +i'
        #             print(text)    # '+i'
        #             print(args)    # ['irc.libera.chat', 'MODE', 'Sopel', '+i']
        if ' :' in line:
            argstr, self.text = line.split(' :', 1)
            self.args = argstr.split(' ')
            self.args.append(self.text)
        else:
            self.args = line.split(' ')
            self.text = self.args[-1]

        # The first item is the event; the remaining ones are its arguments
        self.event = self.args[0]
        self.args = self.args[1:]
        components = PreTrigger.component_regex.match(self.hostmask
                                                      or '').groups()
        self.nick, self.user, self.host = components
        self.nick = tools.Identifier(self.nick)

        # If we have arguments, the first one is the sender
        # Unless it's a QUIT event
        if self.args and self.event != 'QUIT':
            target = tools.Identifier(self.args[0])
        else:
            target = None

        # Unless we're messaging the bot directly, in which case that second
        # arg will be our bot's name.
        if target and target.lower() == own_nick.lower():
            target = self.nick
        self.sender = target

        # Parse CTCP into a form consistent with IRCv3 intents
        if self.event == 'PRIVMSG' or self.event == 'NOTICE':
            intent_match = PreTrigger.intent_regex.match(self.args[-1])
            if intent_match:
                intent, message = intent_match.groups()
                self.tags['intent'] = intent
                self.args[-1] = message or ''

            # Search URLs after CTCP parsing
            self.urls = tuple(
                web.search_urls(self.args[-1], schemes=url_schemes))

        # Populate account from extended-join messages
        if self.event == 'JOIN' and len(self.args) == 3:
            # Account is the second arg `...JOIN #Sopel account :realname`
            self.tags['account'] = self.args[1]

        # get plain text message
        if self.args:
            self.plain = formatting.plain(self.args[-1])

Exemple #13

0

Afficher le fichier

def test_search_urls():
    """Check that a text made of a single URL yields that URL."""
    found = list(search_urls('http://example.com'))
    count = len(found)
    assert count == 1, 'Must find 1 URL, found %d' % count
    assert 'http://example.com' in found

Exemple #14

0

Afficher le fichier

def test_search_urls_default_schemes():
    """Check that http, ftp and https URLs are found without ``schemes``."""
    found = list(search_urls('http://a.com ftp://b.com https://c.com'))
    assert len(found) == 3, 'Must find all three URLs'
    for expected in ('http://a.com', 'ftp://b.com', 'https://c.com'):
        assert expected in found

Exemple #15

0

Afficher le fichier

def test_search_urls_with_text():
    """Check that a URL surrounded by other words is still found."""
    found = list(search_urls('before http://example.com after'))
    count = len(found)
    assert count == 1, 'Must find 1 URL, found %d' % count
    assert 'http://example.com' in found

Exemple #16

0

Afficher le fichier

def test_search_urls_exclusion_char_with_text():
    """Check the exclusion character when URLs are mixed with other words."""
    text = 'before !http://a.com between http://b.com after'
    found = list(search_urls(text, exclusion_char='!'))
    count = len(found)
    assert count == 1, 'Must find 1 URL, found %d' % count
    assert 'http://b.com' in found

Exemple #17

0

Afficher le fichier

    def __init__(
        self,
        own_nick: identifiers.Identifier,
        line: str,
        url_schemes: Optional[Sequence] = None,
        identifier_factory: IdentifierFactory = identifiers.Identifier,
    ):
        """Parse a raw IRC line and store its components on the instance.

        :param own_nick: the bot's nick; when the first argument of the line
                         equals it (both lowercased), ``sender`` is set to
                         the nick taken from the hostmask instead
        :param line: the raw line; CR/LF characters are stripped from both
                     ends before parsing
        :param url_schemes: forwarded as ``schemes`` to ``web.search_urls``
        :param identifier_factory: callable used to build the ``nick`` and
                                   ``sender`` identifiers; kept as
                                   ``make_identifier``

        Sets ``line``, ``tags``, ``time``, ``hostmask``, ``text``, ``event``,
        ``args``, ``nick``, ``user``, ``host``, ``sender``, ``ctcp``,
        ``urls`` and ``plain``.
        """
        self.make_identifier = identifier_factory
        line = line.strip('\r\n')
        self.line: str = line
        self.urls: Tuple[str, ...] = tuple()
        self.plain: str = ''
        self.ctcp: Optional[str] = None

        # Break off IRCv3 message tags, if present
        self.tags: Dict[str, Optional[str]] = {}
        if line.startswith('@'):
            tagstring, line = line.split(' ', 1)
            for raw_tag in tagstring[1:].split(';'):
                tag = raw_tag.split('=', 1)
                if len(tag) > 1:
                    self.tags[tag[0]] = tag[1]
                else:
                    # a tag without "=" is stored with a None value
                    self.tags[tag[0]] = None

        # Client time or server time
        # Default to the current time (timezone-aware, UTC), obtained without
        # the deprecated ``utcnow()``; replaced below by the "time" tag when
        # it is present and parses
        self.time = datetime.datetime.now(datetime.timezone.utc)
        if 'time' in self.tags:
            # ensure "time" is a string (typecheck)
            tag_time = self.tags['time'] or ''
            try:
                self.time = datetime.datetime.strptime(
                    tag_time,
                    "%Y-%m-%dT%H:%M:%S.%fZ",
                ).replace(tzinfo=datetime.timezone.utc)
            except ValueError:
                pass  # Server isn't conforming to spec, ignore the server-time

        # Grabs hostmask from line.
        # Example: line = ':Sopel!foo@bar PRIVMSG #sopel :foobar!'
        #          print(hostmask)  # Sopel!foo@bar
        # All lines start with ":" except PING.
        self.hostmask: Optional[str]
        if line.startswith(':'):
            self.hostmask, line = line[1:].split(' ', 1)
        else:
            self.hostmask = None

        # Parses the line into a list of arguments.
        # Some events like MODE don't have a secondary string argument, i.e. no ' :' inside the line.
        # Example 1:  line = 'PRIVMSG #sopel :foo bar!'  (hostmask already removed)
        #             print(text)    # 'foo bar!'
        #             print(argstr)  # 'PRIVMSG #sopel'
        #             print(args)    # ['PRIVMSG', '#sopel', 'foo bar!']
        # Example 2:  line = 'irc.libera.chat MODE Sopel +i'
        #             print(text)    # '+i'
        #             print(args)    # ['irc.libera.chat', 'MODE', 'Sopel', '+i']
        if ' :' in line:
            argstr, self.text = line.split(' :', 1)
            self.args = argstr.split(' ')
            self.args.append(self.text)
        else:
            self.args = line.split(' ')
            self.text = self.args[-1]

        # The first item is the event; the remaining ones are its arguments
        self.event = self.args[0]
        self.args = self.args[1:]

        # The regex will always match any string, even an empty one
        components_match = cast(
            Match, PreTrigger.component_regex.match(self.hostmask or ''))
        nick, self.user, self.host = components_match.groups()
        self.nick: identifiers.Identifier = self.make_identifier(nick)

        # If we have arguments, the first one is the sender
        # Unless it's a QUIT event
        target: Optional[identifiers.Identifier] = None

        if self.args and self.event != 'QUIT':
            target = self.make_identifier(self.args[0])

            # Unless we're messaging the bot directly, in which case that
            # second arg will be our bot's name.
            if target.lower() == own_nick.lower():
                target = self.nick

        self.sender = target

        # Parse CTCP
        if self.event == 'PRIVMSG' or self.event == 'NOTICE':
            ctcp_match = PreTrigger.ctcp_regex.match(self.args[-1])
            if ctcp_match is not None:
                ctcp, message = ctcp_match.groups()
                self.ctcp = ctcp
                # the last argument keeps only the CTCP message part
                self.args[-1] = message or ''

            # Search URLs after CTCP parsing
            self.urls = tuple(
                web.search_urls(self.args[-1], schemes=url_schemes))

        # Populate account from extended-join messages
        if self.event == 'JOIN' and len(self.args) == 3:
            # Account is the second arg `...JOIN #Sopel account :realname`
            self.tags['account'] = self.args[1]

        # get plain text message
        if self.args:
            self.plain = formatting.plain(self.args[-1])

Exemple #18

0

Afficher le fichier

def test_search_urls_multiple_urls_with_text():
    """Check that two URLs separated by other words are both found."""
    text = 'before http://a.com/ between http://b.com/ after'
    found = list(search_urls(text))
    assert len(found) == 2, 'Must find 2 URLs, found %d' % len(found)
    for expected in ('http://a.com/', 'http://b.com/'):
        assert expected in found

Exemple #19

0

Afficher le fichier

def test_search_urls_multiple_urls_unique():
    """Check that a URL appearing twice in the text is returned only once."""
    text = 'http://a.com/ http://b.com/ http://a.com/'
    found = list(search_urls(text))
    assert len(found) == 2, 'Must find 2 URLs, found %d' % len(found)
    for expected in ('http://a.com/', 'http://b.com/'):
        assert expected in found