def process_unknown_code(self, response):
    """Fetch the shortcode URL a second time and scrape the target link.

    A first response whose status is not 200 is handed to
    ``BaseService.process_unknown_code``. Otherwise the URL built from
    ``self.params['url_template']`` and ``self.current_shortcode`` is
    requested again with ``self.fetch_url`` and the HTML-unescaped body
    is searched for the ``btn ignore`` anchor.

    Returns:
        ``(URLStatus.ok, href, encoding)`` where ``href`` is the captured
        anchor target and ``encoding`` is the second response's encoding.

    Raises:
        UnexpectedNoResult: the second response is not 200, or the
            anchor is not present in its body.
    """
    if response.status_code != 200:
        return BaseService.process_unknown_code(self, response)

    retry_url = self.params['url_template'].format(
        shortcode=self.current_shortcode
    )
    retry = self.fetch_url(retry_url, 'get')

    if retry.status_code != 200:
        message = "Didn't get OK on second try. Got {0} for {1}".format(
            retry.status_code, self.current_shortcode
        )
        raise UnexpectedNoResult(message)

    # Copied from tinyback; this path is believed to be unreachable now.
    page = html_unescape(retry.text)
    anchor = re.search('<a class="btn ignore" href="(.*?)" title=', page)
    if anchor is None:
        message = "Didn't get match on second try for {0}".format(
            self.current_shortcode
        )
        raise UnexpectedNoResult(message)

    return (URLStatus.ok, anchor.group(1), retry.encoding)
def process_unknown_code(self, response):
    """Re-request the shortcode and read the target off the preview page.

    Statuses other than 200 and 500 on the first response are delegated
    to ``BaseService.process_unknown_code``. Otherwise the URL built from
    ``self.params["url_template"]`` is fetched again and the preview
    markup is searched for the long URL in up to three attempts: the body
    as-is, the body with ``<br />`` + newline sequences removed, and the
    body as-is with ``re.DOTALL``.

    Returns:
        ``(URLStatus.ok, url, encoding)`` where ``url`` is the captured
        text, HTML-unescaped and with CR/LF characters stripped.

    Raises:
        UnhandledStatusCode: the second response is neither 200 nor 500.
        UnexpectedNoResult: none of the three searches matched.
    """
    accepted_statuses = (200, 500)

    first_status_code = response.status_code
    if first_status_code not in accepted_statuses:
        return BaseService.process_unknown_code(self, response)

    preview_url = self.params["url_template"].format(
        shortcode=self.current_shortcode
    )
    preview = self.fetch_url(preview_url, "get")

    second_status_code = preview.status_code
    if second_status_code not in accepted_statuses:
        details = (
            first_status_code,
            second_status_code,
            self.current_shortcode,
        )
        raise UnhandledStatusCode(
            "HTTP status changed from %s to %i on second request for %s"
            % details
        )

    # The same pattern serves all three attempts; only the searched text
    # or the regex flags differ between them.
    pattern = (
        '<p>You clicked on a snipped URL, which will take you to the '
        'following looong URL: </p> '
        '<div class="quote"><span class="quotet"></span><br/>(.*?)</div> <br />'
    )

    found = re.search(pattern, preview.text)
    if found is None:
        without_breaks = preview.text.replace("<br />\n", "")
        found = re.search(pattern, without_breaks)
    if found is None:
        # Last resort: let "." span line breaks in the untouched body.
        found = re.search(pattern, preview.text, re.DOTALL)
    if found is None:
        raise UnexpectedNoResult(
            "Could not find target URL on preview page for {0}".format(
                self.current_shortcode
            )
        )

    target = html_unescape(found.group(1))
    target = target.replace("\n", "").replace("\r", "")
    return URLStatus.ok, target, preview.encoding
def process_redirect(self, response):
    """Classify a response body and extract the link it points at.

    The body is first checked for a set of removal/abuse notices, then
    for being blank, and finally scanned for a ``CONTENT="<n>;URL=``
    attribute, a ``<frame src=`` attribute or a ``rel="canonical"`` link.

    Returns:
        ``(URLStatus.unavailable, None, None)`` when a notice is present;
        ``(URLStatus.not_found, None, None)`` when the body is blank;
        otherwise ``(URLStatus.ok, link, encoding)`` using the first
        candidate that does not contain ``ad.adjix.com``, or the very
        first candidate when all of them do.

    Raises:
        UnexpectedNoResult: the body yields no candidate link at all.
    """
    page = response.text

    blocked_markers = (
        '<title>Spammer</title>',
        '<title>Phisher</title>',
        'It has automatically been terminated.',
        'This link was created by a spammer',
        'This link was created by an unknown spammer',
        'This link was abused by',
        '<title>Abuse</title>',
        '<title>Link Removed</title>',
        '<title>Phishing Link</title>',
        '<title>TOS</title>',
    )
    if any(marker in page for marker in blocked_markers):
        return (URLStatus.unavailable, None, None)

    if not page.strip():
        return (URLStatus.not_found, None, None)

    # Only the first fragment is a raw string. The second holds literal
    # CR and LF characters, which the regex matches verbatim.
    link_pattern = (
        r'CONTENT="\d+;URL=(.*)(?:\r\n|">)|'
        '<frame src="(.*)(?:\r\n|">)|'
        'rel="canonical" href="(.*)"/>'
    )
    candidates = re.findall(link_pattern, page)

    # Prefer the first candidate that is not an ad.adjix.com link.
    for refresh_url, frame_url, canonical_url in candidates:
        link = html_unescape(refresh_url or frame_url or canonical_url)
        if 'ad.adjix.com' not in link:
            return (URLStatus.ok, link, response.encoding)

    # Every candidate was an ad link: settle for the first one.
    if candidates:
        refresh_url, frame_url, canonical_url = candidates[0]
        link = html_unescape(refresh_url or frame_url or canonical_url)
        return (URLStatus.ok, link, response.encoding)

    raise UnexpectedNoResult(
        "Didn't get anything for {0}".format(self.current_shortcode)
    )
def process_redirect(self, response):
    """Work out where a response body redirects to, if anywhere.

    Returns:
        ``(URLStatus.unavailable, None, None)`` if the body carries one
        of the known removal/abuse notices,
        ``(URLStatus.not_found, None, None)`` if the body is blank, and
        otherwise ``(URLStatus.ok, link, encoding)``. ``link`` is the
        first extracted link without ``ad.adjix.com`` in it; if every
        extracted link contains that host, the first one is used.

    Raises:
        UnexpectedNoResult: nothing link-like was found in the body.
    """
    body = response.text

    for notice in (
        '<title>Spammer</title>',
        '<title>Phisher</title>',
        'It has automatically been terminated.',
        'This link was created by a spammer',
        'This link was created by an unknown spammer',
        'This link was abused by',
        '<title>Abuse</title>',
        '<title>Link Removed</title>',
        '<title>Phishing Link</title>',
        '<title>TOS</title>',
    ):
        if notice in body:
            return (URLStatus.unavailable, None, None)

    if not body.strip():
        return (URLStatus.not_found, None, None)

    # Three alternatives, one capture group each. The first fragment is
    # raw; the second embeds literal CR/LF characters.
    hits = re.findall(
        r'CONTENT="\d+;URL=(.*)(?:\r\n|">)|'
        '<frame src="(.*)(?:\r\n|">)|'
        'rel="canonical" href="(.*)"/>',
        body,
    )

    fallback = None
    for hit in hits:
        link = html_unescape(hit[0] or hit[1] or hit[2])
        if 'ad.adjix.com' not in link:
            return (URLStatus.ok, link, response.encoding)
        if fallback is None:
            # Remember the first ad link in case nothing better turns up.
            fallback = link

    if fallback is not None:
        return (URLStatus.ok, fallback, response.encoding)

    raise UnexpectedNoResult(
        "Didn't get anything for {0}".format(self.current_shortcode)
    )
def process_unavailable(self, response):
    """Recover the target URL from a 410 page, or report it as removed.

    Responses with any status other than 410 are delegated to
    ``BaseService.process_unavailable``.

    Returns:
        ``(URLStatus.ok, url, encoding)`` when the page still states
        what the short URL "was forwarding to" (the captured text is
        HTML-unescaped), or ``(URLStatus.unavailable, None, None)`` when
        the page carries one of the removed-for-spamming notices.

    Raises:
        UnexpectedNoResult: the page has neither the forwarding line nor
            a spam notice.
    """
    if response.status_code != 410:
        return BaseService.process_unavailable(self, response)

    page = response.text

    forwarding = re.search(
        r'was forwarding to: <BR> <font color=red>(.*)</font>', page
    )
    if forwarding:
        return URLStatus.ok, html_unescape(forwarding.group(1)), response.encoding

    # Two spellings of the notice are checked; both are plain substrings.
    spam_notices = (
        'This shortURL address was REMOVED for SPAMMING',
        'REMOVED FOR SPAMMING',
    )
    if any(notice in page for notice in spam_notices):
        return URLStatus.unavailable, None, None

    raise UnexpectedNoResult(
        "Could not find target URL on blocked page for {0}".format(
            self.current_shortcode
        )
    )
def process_unknown_code(self, response):
    """Request the shortcode again and pull the long URL from the preview.

    When the first response's status is neither 200 nor 500 the call is
    forwarded to ``BaseService.process_unknown_code``. Otherwise the URL
    formatted from ``self.params['url_template']`` is fetched a second
    time and the long URL is captured from the preview markup.

    Returns:
        ``(URLStatus.ok, url, encoding)``; ``url`` is HTML-unescaped and
        has its CR/LF characters removed.

    Raises:
        UnhandledStatusCode: the second response's status is neither 200
            nor 500.
        UnexpectedNoResult: the preview markup could not be matched.
    """
    first_status_code = response.status_code
    if not (first_status_code == 200 or first_status_code == 500):
        return BaseService.process_unknown_code(self, response)

    second = self.fetch_url(
        self.params['url_template'].format(shortcode=self.current_shortcode),
        'get',
    )
    second_status_code = second.status_code
    if not (second_status_code == 200 or second_status_code == 500):
        raise UnhandledStatusCode(
            "HTTP status changed from %s to %i on second request for %s"
            % (first_status_code, second_status_code, self.current_shortcode)
        )

    preview_re = (
        "<p>You clicked on a snipped URL, which will take you to the "
        "following looong URL: </p> "
        "<div class=\"quote\"><span class=\"quotet\"></span><br/>(.*?)</div> <br />"
    )

    # Tried in order, stopping at the first hit: the body unchanged, the
    # body without "<br />" + newline, then the body with DOTALL.
    hit = (
        re.search(preview_re, second.text)
        or re.search(preview_re, second.text.replace("<br />\n", ""))
        or re.search(preview_re, second.text, re.DOTALL)
    )
    if not hit:
        raise UnexpectedNoResult(
            "Could not find target URL on preview page for {0}".format(
                self.current_shortcode
            )
        )

    long_url = html_unescape(hit.group(1)).replace('\n', '').replace('\r', '')
    return URLStatus.ok, long_url, second.encoding