Beispiel #1
0
    def process_unknown_code(self, response):
        """Handle an "unknown code" response, retrying once on HTTP 200.

        Any status other than 200 is delegated to ``BaseService``.  For a
        200 the shortcode URL (built from ``params['url_template']``) is
        fetched again with GET and the target is scraped from the
        ``btn ignore`` anchor of the HTML-unescaped page.

        Returns a ``(URLStatus.ok, url, encoding)`` tuple.

        Raises ``UnexpectedNoResult`` when the second fetch is not a 200 or
        when the anchor cannot be found in the page.
        """
        if response.status_code != 200:
            return BaseService.process_unknown_code(self, response)

        retry_url = self.params['url_template'].format(
            shortcode=self.current_shortcode)
        retry = self.fetch_url(retry_url, 'get')

        if retry.status_code != 200:
            message = "Didn't get OK on second try. Got {0} for {1}".format(
                retry.status_code, self.current_shortcode)
            raise UnexpectedNoResult(message)

        # Copied from tinyback. I don't think code will reach here anymore
        anchor_pattern = "<a class=\"btn ignore\" href=\"(.*?)\" title="
        found = re.search(anchor_pattern, html_unescape(retry.text))

        if found is None:
            message = "Didn't get match on second try for {0}".format(
                self.current_shortcode)
            raise UnexpectedNoResult(message)

        return (URLStatus.ok, found.group(1), retry.encoding)
Beispiel #2
0
    def process_redirect(self, response):
        """Resolve an HTTP 302 that points at the bit.ly warning page.

        Any status other than 302 is delegated to ``BaseService``.  For a
        302 the ``location`` header must be ``http://bit.ly/a/warning`` with
        exactly one ``url`` and one ``hash`` query parameter, and ``hash``
        must equal the current shortcode.

        Returns ``(URLStatus.ok, unshortened_url, None)``.

        Raises ``UnexpectedNoResult`` when the header is missing, points
        somewhere else, lacks the expected parameters, or carries a
        different hash.
        """
        if response.status_code != 302:
            return BaseService.process_redirect(self, response)

        if 'location' not in response.headers:
            raise UnexpectedNoResult()

        location = urlparse.urlparse(response.headers['location'])
        expected_target = ("http", "bit.ly", "/a/warning")

        if (location.scheme, location.netloc, location.path) != expected_target:
            raise UnexpectedNoResult("Unexpected Location header after HTTP status 302")

        # On Python 2 the query is parsed as latin-1 bytes and the result is
        # decoded back to text afterwards.
        running_py2 = sys.version_info[0] == 2
        raw_query = location.query.encode('latin-1') if running_py2 else location.query
        query = urlparse.parse_qs(raw_query)

        targets = query.get("url", ())
        hashes = query.get("hash", ())

        if len(targets) != 1 or len(hashes) != 1:
            raise UnexpectedNoResult("Unexpected Location header after HTTP status 302")
        if hashes[0] != self.current_shortcode:
            raise UnexpectedNoResult("Hash mismatch for HTTP status 302")

        unshortened_url = targets[0].decode('latin-1') if running_py2 else targets[0]

        return (URLStatus.ok, unshortened_url, None)
Beispiel #3
0
    def _preview(self, code, affiliate_url):
        """Resolve *code* through the TinyURL preview page.

        Fetches ``https://tinyurl.com/preview.php?num=<code>`` with GET and
        extracts the ``redirecturl`` anchor.  When that anchor's href is
        empty, the result of ``self._scrub_url(code, affiliate_url)`` is
        returned instead.

        Returns ``(URLStatus.ok, url, encoding)`` with the href
        HTML-unescaped.

        Raises ``UnexpectedNoResult`` when the preview page is not a 200 or
        contains no redirect anchor.
        """
        response = self.fetch_url("https://tinyurl.com/preview.php?num=" +
                                  code,
                                  method='get')

        if response.status_code != 200:
            raise UnexpectedNoResult(
                "Unexpected HTTP status %i on preview page %s" %
                (response.status_code, response.url))

        match = re.search(
            "<a id=\"redirecturl\" href=\"(.*?)\">Proceed to this site.</a>",
            response.text, re.DOTALL)

        if not match:
            raise UnexpectedNoResult("No redirect on preview page {0}".format(
                response.url))

        url = match.group(1)

        if url == "":
            return self._scrub_url(code, affiliate_url)

        # HTMLParser.unescape() was removed in Python 3.9; prefer
        # html.unescape() and fall back to the parser method only where the
        # html module does not provide it (Python 2 / < 3.4).
        try:
            from html import unescape
        except ImportError:
            unescape = html_parser.HTMLParser().unescape

        return (URLStatus.ok, unescape(url), response.encoding)
    def process_redirect(self, response):
        """Extract the redirect target from a redirect response.

        Resolution order:

        1. A ``Location`` header is passed to ``check_anti_regex``.
        2. Otherwise, if ``params['body_regex']`` is set, the body is
           scraped via ``process_redirect_body``.
        3. Otherwise, if ``tolerate_missing_location_header`` is set, the
           response is handed to ``process_no_redirect``.
        4. Otherwise ``UnexpectedNoResult`` is raised.
        """
        if 'Location' in response.headers:
            result_url = response.headers['Location']

            # Headers are treated as latin-1
            # This is needed so that unit tests don't need to
            # do implicit unicode conversion. Ick!
            is_py2_bytes = (
                sys.version_info[0] == 2 and
                isinstance(result_url, terroroftinytown.six.binary_type)
            )
            if is_py2_bytes:
                result_url = result_url.decode('latin-1')

            response.content  # read the response to allow connection reuse
            return self.check_anti_regex(response, result_url, None)

        if self.params.get('body_regex'):
            return self.process_redirect_body(response)

        tolerant = self.tolerate_missing_location_header
        response.content  # read the response to allow connection reuse

        if tolerant:
            return self.process_no_redirect(response)

        raise UnexpectedNoResult(
            'Unexpectedly did not get a redirect result for {0}'
            .format(repr(response.url))
        )
Beispiel #5
0
    def _parse_errorhelp(self, response):
        """Extract the target URL from a TinyURL "errorhelp" page.

        The page is expected to contain a meta refresh pointing at
        ``http://tinyurl.com/errorb.php`` with exactly one ``url`` and one
        ``path`` query parameter, where ``path`` equals ``"/"`` plus the
        current shortcode.

        Returns ``(URLStatus.ok, result_url, encoding)``.  On Python 2 the
        URL bytes are decoded as UTF-8, then cp1252, then latin-1; the
        reported encoding is ``response.encoding`` unless a fallback codec
        was needed, in which case it is that codec's name.

        Raises ``UnexpectedNoResult`` when the refresh tag is missing, points
        elsewhere, lacks the expected parameters, or names another code.
        """
        found = re.search('<meta http-equiv="refresh" content="0;url=(.*?)">',
                          response.text)

        if found is None:
            raise UnexpectedNoResult(
                "No redirect on \"errorhelp\" page on HTTP status 200 for {0}".
                format(response.url))

        unexpected_redirect = (
            "Unexpected redirect on \"errorhelp\" page  on HTTP status 200 for {0}"
        )
        redirect = urlparse.urlparse(found.group(1))
        expected_target = ("http", "tinyurl.com", "/errorb.php")

        if (redirect.scheme, redirect.netloc, redirect.path) != expected_target:
            raise UnexpectedNoResult(unexpected_redirect.format(response.url))

        running_py2 = sys.version_info[0] == 2
        raw_query = redirect.query.encode('utf-8') if running_py2 else redirect.query
        query = urlparse.parse_qs(raw_query)

        targets = query.get("url", ())
        paths = query.get("path", ())

        if len(targets) != 1 or len(paths) != 1:
            raise UnexpectedNoResult(unexpected_redirect.format(response.url))

        if paths[0] != ("/" + self.current_shortcode):
            raise UnexpectedNoResult(
                "Code mismatch on \"errorhelp\" on HTTP status 200")

        if not running_py2:
            return (URLStatus.ok, targets[0], response.encoding)

        # Python 2: try progressively more permissive codecs.  A successful
        # UTF-8 decode keeps the response's own encoding; a fallback codec
        # is reported as the encoding instead.
        raw_target = targets[0]
        attempts = (('utf-8', response.encoding), ('cp1252', 'cp1252'))

        for codec, reported_encoding in attempts:
            try:
                decoded = raw_target.decode(codec)
            except UnicodeError:
                continue
            return (URLStatus.ok, decoded, reported_encoding)

        return (URLStatus.ok, raw_target.decode('latin-1'), 'latin-1')
    def process_redirect_body(self, response):
        """Scrape the redirect target out of the response body.

        Searches the HTML-unescaped body with ``params['body_regex']`` and
        passes the first capture group, together with the response's
        encoding, to ``check_anti_regex``.

        Raises ``UnexpectedNoResult`` when the regex does not match.
        """
        body_regex = self.params['body_regex']
        found = re.search(body_regex, html_unescape(response.text))

        if found is None:
            raise UnexpectedNoResult(
                'Unexpectedly did not get a body result for {0}'
                .format(repr(response.url))
            )

        return self.check_anti_regex(response, found.group(1), response.encoding)
Beispiel #7
0
    def _parse_spam_blocklist(self, response):
        """Extract the target URL from a TinyURL "spam redirect" page.

        Looks for the ``<p>This TinyURL went to: ...</p>`` paragraph and
        returns ``(URLStatus.ok, url, encoding)`` with the captured text
        HTML-unescaped.

        Raises ``UnexpectedNoResult`` when the paragraph is not present.
        """
        match = re.search("<p>This TinyURL went to: (.*?)</p>", response.text,
                          re.DOTALL)

        if not match:
            raise UnexpectedNoResult(
                "No redirect on \"spam redirect\" page on HTTP status 200 for {0}"
                .format(response.url))

        url = match.group(1)

        # HTMLParser.unescape() was removed in Python 3.9; prefer
        # html.unescape() and fall back to the parser method only where the
        # html module does not provide it (Python 2 / < 3.4).
        try:
            from html import unescape
        except ImportError:
            unescape = html_parser.HTMLParser().unescape

        return (URLStatus.ok, unescape(url), response.encoding)
Beispiel #8
0
    def _parse_tinyurl_redirect(self, response):
        """Extract the target URL from a "redirects back to a TinyURL" page.

        Looks for the intro paragraph (optionally followed by a ``<script>``
        element) and captures the href of the anchor that follows it.
        Returns ``(URLStatus.ok, url, encoding)`` with the href
        HTML-unescaped.

        Raises ``UnexpectedNoResult`` when the paragraph is not present.
        """
        match = re.search(
            "<p class=\"intro\">The URL you followed redirects back to a TinyURL and therefore we can't directly send you to the site\\. The URL it redirects to is (?:<script>.*?</script>)?<a href=\"(.*?)\">",
            response.text, re.DOTALL)

        if not match:
            raise UnexpectedNoResult(
                "No redirect on \"tinyurl redirect\" page on HTTP status 200 for {0}"
                .format(response.url))

        url = match.group(1)

        # HTMLParser.unescape() was removed in Python 3.9; prefer
        # html.unescape() and fall back to the parser method only where the
        # html module does not provide it (Python 2 / < 3.4).
        try:
            from html import unescape
        except ImportError:
            unescape = html_parser.HTMLParser().unescape

        return (URLStatus.ok, unescape(url), response.encoding)
Beispiel #9
0
    def process_unavailable(self, response):
        """Handle an HTTP 410 "blocked" page that may still reveal the target.

        Any status other than 410 is delegated to ``BaseService``.  For a
        410:

        * if the page contains a "was forwarding to" line, its target is
          HTML-unescaped and returned as
          ``(URLStatus.ok, url, encoding)``;
        * otherwise, if the page carries one of the "removed for spamming"
          notices, ``(URLStatus.unavailable, None, None)`` is returned;
        * otherwise ``UnexpectedNoResult`` is raised.
        """
        if response.status_code != 410:
            return BaseService.process_unavailable(self, response)

        found = re.search(
            r'was forwarding to: <BR> <font color=red>(.*)</font>',
            response.text)

        if found is not None:
            url = html_unescape(found.group(1))
            return URLStatus.ok, url, response.encoding

        # No target on the page: see whether it is a known spam notice.
        removed_for_spam = (
            re.search(r'This shortURL address was REMOVED for SPAMMING',
                      response.text)
            or 'REMOVED FOR SPAMMING' in response.text
        )

        if removed_for_spam:
            return URLStatus.unavailable, None, None

        raise UnexpectedNoResult(
            "Could not find target URL on blocked page for {0}"
            .format(self.current_shortcode))
Beispiel #10
0
    def process_unknown_code(self, response):
        """Resolve a shortcode through the "snipped URL" preview page.

        Statuses other than 200 and 500 are delegated to ``BaseService``.
        Otherwise the shortcode URL (built from ``params['url_template']``)
        is fetched again with GET and the long URL is scraped from the
        ``quote`` div, trying in turn: the page as is, the page with
        ``"<br />\\n"`` sequences removed, and the page with ``re.DOTALL``.

        Returns ``(URLStatus.ok, url, encoding)`` where the URL is
        HTML-unescaped and stripped of CR/LF characters.

        Raises ``UnhandledStatusCode`` when the second request's status is
        neither 200 nor 500, and ``UnexpectedNoResult`` when no attempt
        matches.
        """
        accepted_statuses = (200, 500)
        first_status_code = response.status_code

        if first_status_code not in accepted_statuses:
            return BaseService.process_unknown_code(self, response)

        retry_url = self.params['url_template'].format(
            shortcode=self.current_shortcode)
        response = self.fetch_url(retry_url, 'get')
        second_status_code = response.status_code

        if second_status_code not in accepted_statuses:
            raise UnhandledStatusCode(
                "HTTP status changed from %s to %i on second request for %s" %
                (first_status_code, second_status_code,
                 self.current_shortcode))

        pattern = "<p>You clicked on a snipped URL, which will take you to the following looong URL: </p> <div class=\"quote\"><span class=\"quotet\"></span><br/>(.*?)</div> <br />"

        # Each later attempt only runs when the previous one found nothing.
        found = (
            re.search(pattern, response.text)
            or re.search(pattern, response.text.replace("<br />\n", ""))
            or re.search(pattern, response.text, re.DOTALL)
        )

        if found is None:
            raise UnexpectedNoResult(
                "Could not find target URL on preview page for {0}".format(
                    self.current_shortcode))

        target = html_unescape(found.group(1))
        target = target.replace('\n', '').replace('\r', '')

        return URLStatus.ok, target, response.encoding
Beispiel #11
0
    def process_redirect(self, response):
        """Classify a response page and scrape its redirect target.

        * A page containing any of the known abuse/termination markers
          yields ``(URLStatus.unavailable, None, None)``.
        * A page that is empty after stripping whitespace yields
          ``(URLStatus.not_found, None, None)``.
        * Otherwise candidate links are collected from meta-refresh,
          ``<frame src>`` and ``rel="canonical"`` markup.  The first
          candidate that does not contain ``ad.adjix.com`` is returned; if
          every candidate contains it, the first candidate is returned.

        Raises ``UnexpectedNoResult`` when no candidate link is found.
        """
        abuse_markers = (
            '<title>Spammer</title>',
            '<title>Phisher</title>',
            'It has automatically been terminated.',
            'This link was created by a spammer',
            'This link was created by an unknown spammer',
            'This link was abused by',
            '<title>Abuse</title>',
            '<title>Link Removed</title>',
            '<title>Phishing Link</title>',
            '<title>TOS</title>',
        )

        if any(marker in response.text for marker in abuse_markers):
            return (URLStatus.unavailable, None, None)

        if not response.text.strip():
            return (URLStatus.not_found, None, None)

        candidates = re.findall((r'CONTENT="\d+;URL=(.*)(?:\r\n|">)|'
                                 '<frame src="(.*)(?:\r\n|">)|'
                                 'rel="canonical" href="(.*)"/>'), response.text)

        # Each findall entry has three capture slots; at most one is filled.
        for meta_url, frame_url, canonical_url in candidates:
            link = html_unescape(meta_url or frame_url or canonical_url)

            if 'ad.adjix.com' not in link:
                return (URLStatus.ok, link, response.encoding)

        # Every candidate contained ad.adjix.com: settle for the first one.
        if candidates:
            meta_url, frame_url, canonical_url = candidates[0]
            link = html_unescape(meta_url or frame_url or canonical_url)
            return (URLStatus.ok, link, response.encoding)

        raise UnexpectedNoResult("Didn't get anything for {0}".format(
            self.current_shortcode))