Example #1
0
    def process_unknown_code(self, response):
        """Fetch the shortcode's page again and scrape the target link from it.

        A response whose status is anything other than 200 is handed to
        ``BaseService.process_unknown_code``. Otherwise the URL built from
        ``self.params['url_template']`` is requested with ``self.fetch_url``
        and the HTML-unescaped body is searched for the "btn ignore" anchor.

        Returns:
            A ``(URLStatus.ok, href, encoding)`` tuple, where ``href`` is the
            anchor target captured from the second response.

        Raises:
            UnexpectedNoResult: if the second response is not a 200, or if
                the anchor cannot be found in its body.
        """
        if response.status_code != 200:
            return BaseService.process_unknown_code(self, response)

        retry_url = self.params['url_template'].format(
            shortcode=self.current_shortcode)
        retry = self.fetch_url(retry_url, 'get')

        if retry.status_code != 200:
            message = "Didn't get OK on second try. Got {0} for {1}".format(
                retry.status_code, self.current_shortcode)
            raise UnexpectedNoResult(message)

        # Copied from tinyback. The original author doubted this part is
        # still reachable.
        page = html_unescape(retry.text)
        anchor = re.search('<a class="btn ignore" href="(.*?)" title=', page)

        if anchor is None:
            message = "Didn't get match on second try for {0}".format(
                self.current_shortcode)
            raise UnexpectedNoResult(message)

        return (URLStatus.ok, anchor.group(1), retry.encoding)
Example #2
0
    def process_unknown_code(self, response):
        """Request the shortcode's page again and scrape the long URL from it.

        Only 200 and 500 responses are handled here; any other status is
        passed to ``BaseService.process_unknown_code``. The second request
        must also answer with 200 or 500.

        Returns:
            A ``(URLStatus.ok, url, encoding)`` tuple. ``url`` is the
            HTML-unescaped capture with CR and LF characters removed.

        Raises:
            UnhandledStatusCode: if the second response is neither 200 nor 500.
            UnexpectedNoResult: if no search attempt finds the target URL.
        """
        accepted_statuses = (200, 500)
        initial_status = response.status_code

        if initial_status not in accepted_statuses:
            return BaseService.process_unknown_code(self, response)

        preview_url = self.params["url_template"].format(
            shortcode=self.current_shortcode
        )
        response = self.fetch_url(preview_url, "get")
        retry_status = response.status_code

        if retry_status not in accepted_statuses:
            raise UnhandledStatusCode(
                "HTTP status changed from %s to %i on second request for %s"
                % (initial_status, retry_status, self.current_shortcode)
            )

        preview_pattern = (
            '<p>You clicked on a snipped URL, which will take you to the '
            'following looong URL: </p> <div class="quote">'
            '<span class="quotet"></span><br/>(.*?)</div> <br />'
        )
        body = response.text

        # Three attempts, in order: the page as-is, the page with
        # "<br />\n" sequences removed, and finally the untouched page with
        # "." allowed to span newlines.
        found = (
            re.search(preview_pattern, body)
            or re.search(preview_pattern, body.replace("<br />\n", ""))
            or re.search(preview_pattern, body, re.DOTALL)
        )

        if found is None:
            raise UnexpectedNoResult(
                "Could not find target URL on preview page for {0}".format(
                    self.current_shortcode
                )
            )

        target = html_unescape(found.group(1))
        target = target.replace("\n", "").replace("\r", "")

        return URLStatus.ok, target, response.encoding
Example #3
0
    def process_redirect(self, response):
        """Classify a response body and extract the link it points to.

        Returns:
            ``(URLStatus.unavailable, None, None)`` when the body contains one
            of the known abuse/removal markers;
            ``(URLStatus.not_found, None, None)`` when the body is blank;
            otherwise ``(URLStatus.ok, link, encoding)`` for the first
            extracted link that does not contain ``ad.adjix.com``, or for the
            first extracted link if all of them do.

        Raises:
            UnexpectedNoResult: if no link can be extracted from the body.
        """
        page = response.text

        removal_markers = (
            '<title>Spammer</title>',
            '<title>Phisher</title>',
            'It has automatically been terminated.',
            'This link was created by a spammer',
            'This link was created by an unknown spammer',
            'This link was abused by',
            '<title>Abuse</title>',
            '<title>Link Removed</title>',
            '<title>Phishing Link</title>',
            '<title>TOS</title>',
        )
        if any(marker in page for marker in removal_markers):
            return (URLStatus.unavailable, None, None)

        if not page.strip():
            return (URLStatus.not_found, None, None)

        # Three alternatives, one capture group each: a meta-refresh style
        # CONTENT attribute, a frame source, and a canonical link.
        link_pattern = (
            r'CONTENT="\d+;URL=(.*)(?:\r\n|">)|'
            '<frame src="(.*)(?:\r\n|">)|'
            'rel="canonical" href="(.*)"/>'
        )
        candidates = [
            refresh or frame or canonical
            for refresh, frame, canonical in re.findall(link_pattern, page)
        ]

        for raw in candidates:
            link = html_unescape(raw)

            if 'ad.adjix.com' not in link:
                return (URLStatus.ok, link, response.encoding)

        # Every candidate was an ad.adjix.com link: settle for the first one.
        if candidates:
            return (
                URLStatus.ok, html_unescape(candidates[0]), response.encoding)

        raise UnexpectedNoResult(
            "Didn't get anything for {0}".format(self.current_shortcode))
Example #4
0
    def process_redirect(self, response):
        """Work out where a response body redirects to.

        The body is first screened for abuse/removal notices
        (``URLStatus.unavailable``) and for being blank
        (``URLStatus.not_found``). Otherwise links are pulled out of it with
        a regular expression. The first one that does not contain
        ``ad.adjix.com`` is preferred, and if every one of them does, the
        first link is used anyway.

        Returns:
            A ``(status, link, encoding)`` tuple; ``link`` and ``encoding``
            are ``None`` for the unavailable and not-found outcomes.

        Raises:
            UnexpectedNoResult: if the regular expression finds nothing.
        """
        body = response.text

        for notice in ('<title>Spammer</title>',
                       '<title>Phisher</title>',
                       'It has automatically been terminated.',
                       'This link was created by a spammer',
                       'This link was created by an unknown spammer',
                       'This link was abused by',
                       '<title>Abuse</title>',
                       '<title>Link Removed</title>',
                       '<title>Phishing Link</title>',
                       '<title>TOS</title>'):
            if notice in body:
                return (URLStatus.unavailable, None, None)

        if body.strip() == '':
            return (URLStatus.not_found, None, None)

        matches = re.findall((r'CONTENT="\d+;URL=(.*)(?:\r\n|">)|'
                              '<frame src="(.*)(?:\r\n|">)|'
                              'rel="canonical" href="(.*)"/>'), body)

        # Each match is a 3-tuple with one slot per regex alternative; keep
        # the first non-empty slot.
        captured = [first or second or third
                    for first, second, third in matches]

        # The generator keeps the unescaping lazy, stopping at the first
        # acceptable link.
        chosen = next(
            (link for link in (html_unescape(text) for text in captured)
             if 'ad.adjix.com' not in link),
            None)

        if chosen is None and captured:
            # Only ad.adjix.com links were found: fall back to the first.
            chosen = html_unescape(captured[0])

        if chosen is None:
            raise UnexpectedNoResult("Didn't get anything for {0}".format(
                self.current_shortcode))

        return (URLStatus.ok, chosen, response.encoding)
Example #5
0
    def process_unavailable(self, response):
        """Recover the target URL from a 410 page, or report it unavailable.

        Any status other than 410 is delegated to
        ``BaseService.process_unavailable``.

        Returns:
            ``(URLStatus.ok, url, encoding)`` when the page still shows the
            address it "was forwarding to";
            ``(URLStatus.unavailable, None, None)`` when the page carries a
            removed-for-spamming notice instead.

        Raises:
            UnexpectedNoResult: if the page has neither a forwarding address
                nor a spam notice.
        """
        if response.status_code != 410:
            return BaseService.process_unavailable(self, response)

        page = response.text
        forwarding = re.search(
            r'was forwarding to: <BR> <font color=red>(.*)</font>', page)

        if forwarding is not None:
            return (
                URLStatus.ok,
                html_unescape(forwarding.group(1)),
                response.encoding,
            )

        # Two spellings of the spam notice are recognised; either one marks
        # the shortcode as unavailable.
        spam_notice = re.search(
            r'This shortURL address was REMOVED for SPAMMING', page)

        if spam_notice or 'REMOVED FOR SPAMMING' in page:
            return URLStatus.unavailable, None, None

        raise UnexpectedNoResult(
            "Could not find target URL on blocked page for {0}"
            .format(self.current_shortcode))
Example #6
0
    def process_unknown_code(self, response):
        """Re-fetch the shortcode's page and extract the long URL from it.

        Statuses other than 200 and 500 go to
        ``BaseService.process_unknown_code``. For 200/500 the URL built from
        ``self.params['url_template']`` is fetched again and its body is
        searched for the quoted target URL.

        Returns:
            A ``(URLStatus.ok, url, encoding)`` tuple; ``url`` is
            HTML-unescaped and has its CR/LF characters stripped.

        Raises:
            UnhandledStatusCode: if the second status is not 200 or 500.
            UnexpectedNoResult: if the target URL is not found in the body.
        """
        status_before = response.status_code

        if not (status_before == 200 or status_before == 500):
            return BaseService.process_unknown_code(self, response)

        response = self.fetch_url(
            self.params['url_template'].format(
                shortcode=self.current_shortcode),
            'get')
        status_after = response.status_code

        if not (status_after == 200 or status_after == 500):
            details = (status_before, status_after, self.current_shortcode)
            raise UnhandledStatusCode(
                "HTTP status changed from %s to %i on second request for %s" %
                details)

        quote_pattern = (
            '<p>You clicked on a snipped URL, which will take you to the '
            'following looong URL: </p> <div class="quote">'
            '<span class="quotet"></span><br/>(.*?)</div> <br />')
        line_bound = re.compile(quote_pattern)
        html = response.text

        hit = line_bound.search(html)

        if hit is None:
            # Retry with "<br />" + newline sequences removed from the page.
            hit = line_bound.search(html.replace("<br />\n", ""))

        if hit is None:
            # Last resort: original page, with "." allowed to cross newlines.
            hit = re.compile(quote_pattern, re.DOTALL).search(html)

        if hit is None:
            raise UnexpectedNoResult(
                "Could not find target URL on preview page for {0}".format(
                    self.current_shortcode))

        unescaped = html_unescape(hit.group(1))
        url = ''.join(char for char in unescaped if char not in '\n\r')

        return URLStatus.ok, url, response.encoding