Python URLExtract.update_when_older Exemples, urlextract.URLExtract.update_when_older Python Exemples

Exemple #1

0

Afficher le fichier

def get_urls(body) -> List[str]:
    """Return the URLs found in ``body`` with duplicates removed.

    A fresh ``URLExtract`` is created on every call; its TLD list is
    refreshed first when the cached copy is more than 7 days old.
    """
    url_finder = URLExtract()
    url_finder.update_when_older(7)
    # only_unique=True asks the extractor itself to drop repeated URLs.
    return url_finder.find_urls(body, only_unique=True)

Exemple #2

0

Afficher le fichier

    def __extract_domain_from_sent_field(self, sent: str) -> str:
        """
        Get the url out of a 'sent' field in a measurement.

        Parameters
        ----------
        sent: str
            Examples:

            * An empty string ("") meaning the sent packet wasn't recorded.
            * "GET / HTTP/1.1\r\nHost: example5718349450314.com\r\n" (echo/discard)
            * "GET www.bbc.co.uk HTTP/1.1\r\nHost: /content.html\r\n" (discard error) or just "www.apple.com" (HTTP/S)

        Returns
        -------
        str
            Just the url, if found. An empty ``sent`` is returned unchanged,
            and so is a value that does not match the pattern but contains
            no spaces.

        Raises
        ------
        Exception
            If ``sent`` is non-empty, does not match
            ``QuackConstants.SENT_PATTERN`` and contains a space.
        """
        # Nothing was recorded; no need to build an extractor for this case.
        if sent == '':
            return sent

        match = re.search(QuackConstants.SENT_PATTERN.value, sent)
        if match:
            path = match.group(1)
            domain = match.group(2)

            # The extractor is only consulted on this branch, so it is
            # created here rather than on every call.
            extractor = URLExtract()
            # updates known TLDs when the list is older than 7 days
            extractor.update_when_older(7)

            # This is a bug where the domain and path were reversed in content sent.
            # We do our best to reconstruct the intended url
            # by swapping them to their intended position
            if extractor.has_urls(path):
                domain, path = path, domain

            if path == '/':
                return domain
            return domain + path

        # A bare token without spaces is taken to be the url itself.
        if ' ' not in sent:
            return sent

        raise Exception(f"unknown sent field format: {sent}")

Exemple #3

0

Afficher le fichier

def incoming(update, context):
    """Scan an incoming message for URLs and reply with bypass attempts.

    Every URL whose domain is a key of the chat's 'active domains' mapping
    gets a typing indicator followed by the text produced by ``add_bypass``.
    When the message holds exactly one URL, it is remembered as the chat's
    'last url' and ``url_bookkeeping`` is run.
    """
    url_finder = URLExtract()
    # gets the latest list of TLDs from iana.org every 7 days
    url_finder.update_when_older(7)
    found = url_finder.find_urls(update.effective_message.text,
                                 check_dns=True)

    # Kept as a dict for legacy reasons; only key membership is used here.
    watched_domains = context.chat_data.get('active domains', {})

    for link in found:
        if get_domain(link) in watched_domains:
            context.bot.send_chat_action(
                chat_id=update.effective_message.chat_id,
                action=ChatAction.TYPING)
            reply = add_bypass(link, context=context)
            say(reply, update, context)

    if len(found) == 1:
        context.chat_data['last url'] = found[0]
        url_bookkeeping(context)

Exemple #4

0

Afficher le fichier

Fichier : linkchecker.py Projet : morenoMe/pajbot

import pajbot.utils
from pajbot.apiwrappers.safebrowsing import SafeBrowsingAPI
from pajbot.managers.adminlog import AdminLogManager
from pajbot.managers.db import Base
from pajbot.managers.db import DBManager
from pajbot.managers.handler import HandlerManager
from pajbot.models.command import Command
from pajbot.models.command import CommandExample
from pajbot.modules import BaseModule
from pajbot.modules import ModuleSetting

# Module-level logger named after this module.
log = logging.getLogger(__name__)


# Single extractor created once at import time. Its TLD list is refreshed
# here if the cached copy is older than 14 days.
extractor = URLExtract()
extractor.update_when_older(14)


def is_subdomain(x, y):
    """Tell whether host ``x`` equals ``y`` or sits underneath it.

    A leading "www." on ``y`` is ignored before comparing; ``x`` is used
    as given.

    Example:
    is_subdomain('pajlada.se', 'pajlada.se') = True
    is_subdomain('test.pajlada.se', 'pajlada.se') = True
    is_subdomain('test.pajlada.se', 'pajlada.com') = False
    """
    prefix = "www."
    parent = y[len(prefix):] if y.startswith(prefix) else y
    if x.endswith("." + parent):
        return True
    return x == parent

Exemple #5

0

Afficher le fichier

class EmailHandler(object):
    """Send emails through an SES client, one stackless tasklet per request.

    Callers use ``send_email``; the request is passed over ``in_channel`` to
    a dispatcher tasklet, which starts a worker tasklet that builds the SES
    payload, sends it and returns the outcome over the request's own channel.
    """

    def __init__(self, config):
        """Read settings from ``config`` and start the dispatcher tasklet.

        Keys read: ``mud.region`` (default "us-east-1"), ``email.endpoint``
        (default None), ``email.useSsl`` (default True) and ``email.mocked``
        (default False).
        """
        self.config = config

        self.region = self.config.get("mud", {}).get("region", "us-east-1")

        email_config = self.config.get("email", {})
        self.endpoint = email_config.get("endpoint", None)
        self.use_ssl = email_config.get("useSsl", True)

        self.session = boto3.session.Session(region_name=self.region)
        self.ses = self.session.client('ses',
                                       endpoint_url=self.endpoint,
                                       use_ssl=self.use_ssl)

        self.mocked = email_config.get("mocked", False)

        self.extractor = URLExtract()
        # Replace the extractor's own update routine with our downloader.
        self.extractor.update = self._update_tlds
        self.extractor_lock = Lock()

        # Splits an optional "scheme://" prefix from the rest of a URL.
        self.url_re = re.compile(r'^(?P<type>.*://)?(?P<remainder>.*)$')
        self.html2text = HTML2Text()
        self.html2text.ignore_links = False
        self.html2text.ignore_images = False
        self.html2text.ignore_tables = False
        self.html2text.ignore_emphasis = False
        self.html2text_lock = Lock()

        self.in_channel = stackless.channel()

        stackless.tasklet(self._send_email_loop)()

    def _update_tlds(self):
        """Download the IANA TLD list into the extractor's cache file.

        Returns:
            True after the cache file was rewritten and reloaded, False when
            the download did not return HTTP 200 (the cache is left as is).
        """
        response = requests.get(
            "https://data.iana.org/TLD/tlds-alpha-by-domain.txt",
            timeout=30)
        if response.status_code != 200:
            # Never overwrite a usable TLD cache with an error page.
            logger.warning(
                "TLD list download failed with HTTP status %s; "
                "keeping the cached list", response.status_code)
            return False

        data = response.content
        # noinspection PyProtectedMember
        filename = self.extractor._get_cache_file_path(None)
        with open(filename, "wb") as f:
            f.write(data)
        # noinspection PyProtectedMember
        self.extractor._reload_tlds_from_file()
        return True

    def send_email(self, from_, to, subject, body_html=None, body_text=None):
        """Queue an email and wait for the worker tasklet's answer.

        Returns whatever the worker sends back on the request's channel.
        """
        request = EmailRequest(from_, to, subject, body_html, body_text)
        self.in_channel.send(request)
        response = request.channel.receive()
        return response

    def _send_email_loop(self):
        """Dispatcher: start one worker tasklet per request, forever."""
        while True:
            request = self.in_channel.receive()
            stackless.tasklet(self._send_email_tasklet)(request)

    @staticmethod
    def _log_email(request, message, level="info"):
        """Log ``message`` globally and once per account in the request.

        ``level`` names the logger method to use; an unknown name falls back
        to ``logger.debug``.
        """
        this_logger = getattr(logger, level, logger.debug)

        # First the global copy
        this_logger(message)

        # Then send it to the accounts involved
        for account in request.accounts:
            this_logger(AccountLogMessage(account, message))

    def _send_email_tasklet(self, request):
        """Worker: fill in the missing body variant, send, report back.

        Raises nothing for ``Exception`` subclasses: failures are logged and
        reported through ``request.channel``, so the caller blocked in
        ``send_email`` is always woken up.
        """
        response = {}
        try:
            self._log_email(
                request,
                "Sending email from %s to %s" % (request.from_, request.to))
            if not request.body_html:
                if not request.body_text:
                    raise Exception("No body given")

                request.body_html = self.htmlize_body(request.body_text)

            if not request.body_text:
                request.body_text = self.unhtmlize_body(request.body_html)

            email = {
                "Destination": {
                    "ToAddresses": [
                        request.to,
                    ],
                },
                "Source": request.from_,
                "Message": {
                    "Body": {
                        "Html": {
                            "Charset": "utf-8",
                            "Data": request.body_html,
                        },
                        "Text": {
                            "Charset": "utf-8",
                            "Data": request.body_text,
                        },
                    },
                    "Subject": {
                        "Charset": "utf-8",
                        "Data": request.subject,
                    },
                },
                "ConfigurationSetName": "havokmud",
            }

            # This is necessary for SES to send the email
            # When using localstack, we "verify" as we go
            # In real life, the domain should be verified before using this,
            # preferably by using Easy DKIM.
            if self.mocked:
                self.ses.verify_email_identity(EmailAddress=request.from_)

            response = self.ses.send_email(**email)
            self._log_email(
                request,
                "Email sent: MessageID: %s" % response.get("MessageId", None))
        except ClientError as e:
            # Look the message up defensively once: a KeyError raised in this
            # handler would skip the channel send below.
            error_message = e.response.get("Error", {}).get(
                "Message", "unknown error")
            self._log_email(
                request,
                "Error sending email: %s" % error_message,
                level="error")
            # NOTE(review): this branch answers with a bare string while the
            # generic branch answers with a dict - confirm what callers expect.
            response = error_message
        except Exception as e:
            self._log_email(request,
                            "Error sending email: %s" % str(e),
                            level="exception")
            response = {"Error": {"Message": str(e)}}

        request.channel.send(response)

    def htmlize_body(self, text):
        """Wrap plain ``text`` in a minimal HTML document with clickable URLs.

        Text outside URLs is HTML-escaped. Each URL found by the extractor
        becomes an ``<a>`` element; "http://" is prepended when the URL has
        no scheme.

        Returns:
            The HTML document as a ``str``.
        """
        # Want to put links on URLs, and make the rest html-safe
        parts = [b"<html><body>"]
        with self.extractor_lock:
            self.extractor.update_when_older(7)

            remaining = text
            while remaining:
                urls = self.extractor.find_urls(remaining)
                if not urls:
                    parts.append(self._escape(remaining))
                    break

                # Handle the first URL, then rescan what follows it.
                url = urls[0]
                start_index = remaining.index(url)
                end_index = start_index + len(url)
                parts.append(self._escape(remaining[:start_index]))

                match = self.url_re.match(url)
                if not match:
                    parts.append(self._escape(url))
                else:
                    if not match.group("type"):
                        url = "http://%s" % url
                    # bytes %-formatting only accepts bytes, so escape first;
                    # this also keeps quotes in the URL from ending the
                    # href attribute early.
                    link = self._escape(url)
                    parts.append(b'<a href="%s">%s</a>' % (link, link))
                remaining = remaining[end_index:]

        parts.append(b"</body></html>")
        return b"".join(parts).decode("utf-8")

    @staticmethod
    def _escape(text):
        """HTML-escape ``text`` and return ASCII bytes.

        Non-ASCII characters become numeric character references.
        """
        return html.escape(text).encode("ascii", "xmlcharrefreplace")

    def unhtmlize_body(self, html_):
        """Convert ``html_`` to text with the shared HTML2Text instance."""
        # Strip this down to Markdown text
        with self.html2text_lock:
            return self.html2text.handle(html_)