def vul_scan(domain): """ 常见漏洞扫描 """ urls = Crawler(target=domain, dynamic=False).run() Xss(targets=urls).scan() Sqli(targets=urls).scan() Struts2(urls).scan()
def crawl():
    crawler = Crawler()
    param_did = input("Enter the did value from this user's cookie: ")
    crawler.set_did(param_did)
    uid = input("Enter the user id to crawl: ")
    crawler.add_to_list(uid)
    crawler.crawl()
    input("Press Enter to exit......")
def post(self, collection):
    """Crawl collection"""
    try:
        Crawler.crawl_and_save_articles_and_keywords(collection)
        return (
            json.dumps({"success": True}),
            200,
            {"ContentType": "application/json"},
        )
    except Exception:
        return (
            json.dumps({"success": False}),
            500,
            {"ContentType": "application/json"},
        )
def site_scan(domain):
    # Strip the scheme (and optional "www.") prefix from the target
    if domain.startswith('http://www.'):
        domain = domain[11:]
    elif domain.startswith('https://www.'):
        domain = domain[12:]
    elif domain.startswith('http://'):
        domain = domain[7:]
    elif domain.startswith('https://'):
        domain = domain[8:]
    id = Database().insert_task(domain)
    domains, ips = Domain(domain, id).run()
    for domain in domains:
        url = Crawler(domain).scan()
        Vul(url, id).run()
    Sendir(domains, id).run()
    Port(id).run(ips)
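# A hedged alternative to the manual prefix slicing above: urllib.parse.urlsplit
# extracts the host without hard-coded offsets. This is only a sketch, not the
# project's own helper; the name normalize_domain is an assumption.
from urllib.parse import urlsplit

def normalize_domain(target):
    # Add a scheme if missing so urlsplit places the host in netloc
    if '://' not in target:
        target = 'http://' + target
    host = urlsplit(target).netloc
    # Drop a leading "www." like the slicing above does
    if host.startswith('www.'):
        host = host[4:]
    return host

# normalize_domain('https://www.example.com/path') -> 'example.com'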
def main():
    # Get the access tokens from .env or from the command-line arguments
    if dotenv.load_dotenv():
        consumer_key = os.getenv("CLIENT_ID")
        consumer_secret = os.getenv("CLIENT_SECRET")
        access_token = os.getenv("ACCESS_TOKEN")
        access_token_secret = os.getenv("ACCESS_TOKEN_SECRET")
    elif len(sys.argv) >= 5:
        consumer_key = sys.argv[1]
        consumer_secret = sys.argv[2]
        access_token = sys.argv[3]
        access_token_secret = sys.argv[4]
    else:
        print("\033[31mERROR\033[0m: No credentials have been passed.")
        return

    # Create a tweepy.API instance with the access tokens
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

    # Print the logged-in user's info (nothing like auth.status was found, so this is a compromise)
    try:
        target_user_object = api.me()
        print(f"Fetch target: {target_user_object.name}(@{target_user_object.screen_name})")
    except tweepy.TweepError:
        print("\033[31mERROR\033[0m: Couldn't get user object. Make sure you have passed a valid token.")
        return

    # Start the tweet-collecting and tweet-deleting threads and connect them with a queue
    tweetqueue = StatusQueue()
    crawlthread = CrawlThread(Crawler(api), tweetqueue, endreq_event)
    eliminatethread = EliminateThread(Eliminator(api), tweetqueue, endreq_event)
    crawlthread.start()
    eliminatethread.start()
    print("Process started.")
    try:
        crawlthread.join()
        eliminatethread.join()
    except KeyboardInterrupt:
        print("Ctrl+C")
        endreq_event.set()
        crawlthread.join()
        eliminatethread.join()
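# A minimal, self-contained sketch of the queue-plus-event shutdown pattern used
# above. CrawlThread, EliminateThread and StatusQueue are project classes that are
# not shown here; the produce/consume stand-ins below are assumptions for
# illustration only.
import queue
import threading

endreq_event = threading.Event()
tweet_queue = queue.Queue()

def produce():
    # Producer: push work items until done or a shutdown is requested
    for n in range(10):
        if endreq_event.is_set():
            break
        tweet_queue.put(f"status-{n}")

def consume():
    # Consumer: drain the queue, re-checking the shutdown event between timeouts
    while not endreq_event.is_set() or not tweet_queue.empty():
        try:
            item = tweet_queue.get(timeout=0.5)
        except queue.Empty:
            continue
        print("processed", item)

producer = threading.Thread(target=produce)
consumer = threading.Thread(target=consume)
producer.start()
consumer.start()
producer.join()
endreq_event.set()  # tell the consumer to stop once the queue is drained
consumer.join()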
def enumerate_twit(args):
    twit = Crawler(args.config_path, args.enum_search)
import os
import time

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

from lib.config import Config
from lib.zip import Zip


def init_driver(config_instance: Config):
    drv = webdriver.PhantomJS(executable_path=config_instance.phantomjs_bin_path)
    drv.wait = WebDriverWait(drv, 1)
    return drv


if __name__ == "__main__":
    # init config object and driver
    config = Config('conf/config.yml')
    driver = init_driver(config)
    crawler = Crawler(driver, config)

    print("====Driver logging in====")
    crawler.login()
    print("====Driver logged in====")
    print("====Starting backup process====")

    # head driver to pages section
    crawler.goto_pages()
    time.sleep(2)

    # create folder for backups if it doesn't exist
    current_datetime = time.strftime('%d%m%Y%H%M%S')
    backup_dir_for_now = config.dir_backup + os.path.sep + current_datetime
    if not os.path.exists(config.dir_backup):
        os.makedirs(config.dir_backup)
def main():
    opts, args = parse_options()
    url = args[0]
    if opts.links:
        getLinks(url)
        raise SystemExit, 0
    depth_limit = opts.depth_limit
    confine_prefix = opts.confine
    exclude = opts.exclude
    sTime = time.time()
    print >> sys.stderr, "Crawling %s (Max Depth: %d)" % (url, depth_limit)
    crawler = Crawler(url, depth_limit, confine_prefix, exclude)
    crawler.crawl()

    # create log directory
    if not os.path.exists(LOG_DIRECTORY):
        os.makedirs(LOG_DIRECTORY)

    num_links = 0
    if opts.out_urls:
        for url_crawl in crawler.urls_seen:
            parsed_uri = urlparse.urlparse(url_crawl)
            # only base url
            if not re.match(".*%s" % parsed_uri.netloc.replace('www.', ''), url):  # and not opts.skip_host:
                continue
            if not opts.out_path:
                print url_crawl
            else:
                domain = '{uri.netloc}'.format(uri=parsed_uri)
                log_file = "%s/%s.log" % (LOG_DIRECTORY, domain)
                logging.basicConfig(
                    filename=log_file,
                    filemode='w+',
                    level=logging.DEBUG,
                    format='%(asctime)-15s [%(levelname)s] (%(threadName)-10s) %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S %p')
                try:
                    directory = opts.out_path + domain + '/'
                    path = directory + toSeoFriendly(url_crawl, 50) + '.html'
                    if not os.path.exists(directory):
                        os.makedirs(directory)
                    r = requests.get(url_crawl, allow_redirects=True, timeout=30)
                    if not os.path.exists(path):
                        target = open(path, 'w')
                        target.write(r.text.encode('utf-8'))
                        target.close()
                        num_links = num_links + 1
                        logging.debug("Saving: {0}".format(url_crawl))
                except IOError as e:
                    logging.error("IOError: {0} {1}".format(url, e.message))
                except Exception as e:
                    logging.error("Error({0}): {1}".format(url, e.__doc__, e.message), exc_info=True)

    if opts.out_links:
        print "\n".join([str(l) for l in crawler.links_remembered])

    if opts.out_dot:
        d = DotWriter()
        d.asDot(crawler.links_remembered)

    eTime = time.time()
    tTime = eTime - sTime

    print >> sys.stderr, "Found: %d" % num_links
    print >> sys.stderr, "Stats: (%d/s after %0.2fs)" % (
        int(math.ceil(float(num_links) / tTime)), tTime)
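# Note: logging.basicConfig() only configures the root logger on its first call, so
# calling it inside the per-URL loop above keeps writing to the first domain's log
# file. A hedged alternative sketch using one logger per domain (the helper name
# and "logs" directory are assumptions, not part of the original script):
import logging
import os

def get_domain_logger(domain, log_directory="logs"):
    """Return a logger that writes to <log_directory>/<domain>.log."""
    logger = logging.getLogger("crawler.%s" % domain)
    if not logger.handlers:  # avoid adding duplicate handlers on repeated calls
        if not os.path.exists(log_directory):
            os.makedirs(log_directory)
        handler = logging.FileHandler(os.path.join(log_directory, "%s.log" % domain))
        handler.setFormatter(logging.Formatter(
            "%(asctime)-15s [%(levelname)s] (%(threadName)-10s) %(message)s",
            datefmt="%m/%d/%Y %I:%M:%S %p"))
        logger.addHandler(handler)
        logger.setLevel(logging.DEBUG)
    return logger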
class Taobaoke(MongoClient):
    """ Taobaoke """
    start_urls = (
        'http://pub.alimama.com/promo/item/channel/index.htm?channel=qqhd&toPage={}&catIds={}&level=1&perPageSize=100'
    )
    crawler = Crawler()

    def start_requests(self):
        # item list range
        for cat in range(1, 18):
            for page in range(10, 31):
                print "crawl:%s,%s" % (cat, page)
                self.requests_url(self.start_urls.format(page, cat), callback=self.callback)

    def requests_url(self, url, callback=None):
        """
        :param url:
        :param callback:
        :return:
        """
        crawler = self.crawler.crawl(url)
        callback(crawler)

    def safe_febx_text(self, crawler, xpath, count=1, max_count=10):
        """
        :param crawler:
        :param xpath:
        :param count:
        :param max_count:
        :return:
        """
        if count > max_count:
            return ""
        try:
            return crawler.febx(xpath).text.strip()
        except Exception:
            print "try:%s" % count
            return self.safe_febx_text(crawler, xpath, count=count + 1, max_count=max_count)

    def callback(self, crawler):
        """
        Callback handler
        :param crawler:
        :return:
        """
        time.sleep(10)
        for index in range(1, 101):
            item = {
                "title": self.safe_febx_text(
                    crawler,
                    u".//*[@id='J_search_results']/div/div[{}]"
                    u"/div[@class='box-content']/div[1]/p/a/node()".format(index)),
                "category": self.safe_febx_text(crawler, u".//*[@class='top-nav-tag']/span")
            }
            if not item["title"] or not item["category"] or self.db.tbk_test.find({
                    "title": item["title"]
            }).count():
                print "continue", item["category"], item["title"]
                continue
            print item["category"], item["title"]
            self.db.tbk_test.insert(item)
browser.quit()


def content_func(res):
    words = []
    soup = BeautifulSoup(res.text, 'lxml')
    content = ''
    divs = soup.select('div.comp_detail')
    del divs[-1]
    for div in divs:
        content += div.text.replace('\n', '').replace('\xa0', '')
    words += list(
        set(
            re.findall('java script|objective c|visual basic|[A-Za-z.+#]+',
                       content, re.IGNORECASE)))
    return words


# Create an instance of Crawler class
crawler = Crawler(open_thread=True)
# Call grab_content_th_auto to get content page by page
crawler.grab_content_th_auto(alinks, content_func, sleep_time=1)
# Call get_counter to get word count result
print(crawler.get_counter().most_common())
with open('yes123_1_new.csv', 'w') as f:
    for lang, counts in crawler.get_counter().most_common():
        f.write('{},{}\n'.format(lang, counts))
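# For reference, a tiny sketch of the word-count aggregation that get_counter()
# appears to expose, using collections.Counter. The Crawler internals are not shown
# here, so this is an assumption for illustration, not the library's actual code.
from collections import Counter

counter = Counter()
for page_words in [['python', 'java'], ['python', 'go']]:  # e.g. content_func results
    counter.update(word.lower() for word in page_words)
print(counter.most_common())  # [('python', 2), ('java', 1), ('go', 1)]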
from lib.signature import Signature
from lib.crawler import Crawler
import coloredlogs

coloredlogs.install(fmt="%(asctime)s %(name)s [%(levelname)s] %(message)s")

signature = Signature(status_code=200)
signature.add(lambda x: "an error" not in x.text)

c = Crawler("https://www.youtube.com", "./test_dic.txt", signature=signature)
c.start()
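# The Signature object above pairs a status-code check with arbitrary predicates
# over the response. A minimal stand-in illustrating that idea (an assumption about
# the interface, not lib.signature's actual implementation):
class SimpleSignature:
    def __init__(self, status_code=200):
        self.status_code = status_code
        self.checks = []

    def add(self, check):
        # check is a callable that takes a response and returns a bool
        self.checks.append(check)

    def match(self, response):
        return (response.status_code == self.status_code
                and all(check(response) for check in self.checks))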
# lock = threading.Lock()  # global resource lock
# data extraction, multithreading, storage, exception handling, logging
initDir = "f:/thz/"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Referer": "http://thzbbt.net",
}
initUrls = [
    "http://thzbbt.net/forum-181-{num}.html".format(num=num) for num in range(1, 10)
]
print "initUrls", initUrls

crawler = Crawler(initUrls, initDir, headers, "crawler-thz")


def fn1(url):
    arr = []
    r = requests.get(url, headers=headers, timeout=100).text
    for father in BeautifulSoup(r, 'lxml').find_all('th', class_="common"):
        link = father.find_all("a")[3]["href"]
        arr.append("http://thzbbt.net/" + link)
    return arr


def fn2(url):
    arr = []
    # print "url",url
    r = requests.get(url, headers=headers, timeout=100).text
def vul_scan(domain):
    url = Crawler(domain).scan()
    Vul(url).run()
def crawl(collection):
    Crawler.crawl_and_save_articles_and_keywords(collection)
def crawl_scan(domain):
    Crawler(domain).scan()
def crawl():
    c = Crawler()
    c.set_did(param_did)
    c.crawl_like("3xzixigzwy4kj5c")
    links = soup.select('div.jbInfo > div > h3 > a')
    if res.status_code == 200 and len(links) != 0:
        for link in links:
            page_links.append('https:' + str(link['href']))
    return page_links


# Get total page from the website you want to crawl
HOST = 'https://www.1111.com.tw'
url = HOST + '/job-bank/job-index.asp?si=1&d0=140400,140200,140300&fs=1&ps=100&page=1'
res = requests.get(url)
soup = BeautifulSoup(res.text, 'lxml')
page = int(soup.select_one('div.pagedata').text.split('/')[1].split('頁')[0])

# Create an instance of Crawler class
crawler = Crawler(open_thread=True)
# Call grab_pagelinks_th_auto to get all links
page_url = HOST + '/job-bank/job-index.asp?si=1&d0=140400,140200,140300&fs=1&ps=100&page={}'
crawler.grab_pagelinks_th_auto(page_url, page_func, page, sleep_time=1)
# Call get_alinks to get all links crawled from previous pages
links = crawler.get_alinks()
# Call grab_content_th_auto to get content page by page
crawler.grab_content_th_auto(links, content_func, sleep_time=1)
# Call get_counter to get word count result
print(crawler.get_counter().most_common())
with open('1111_1_new.csv', 'w') as f:
"user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36", "Referer": "http://www.mmjpg.com", } initUrls = [ "http://www.mmjpg.com/mm/{num}".format(num=num) for num in range(1, 2) ] print "initUrls", initUrls def getCurFileName(): filename = os.path.basename(__file__) return filename[0:filename.find(".")] crawler = Crawler(initUrls, initDir, headers, getCurFileName()) print "crawler初始化成功" def fn1(url): r = requests.get(url, headers=headers, timeout=100).text maxCount = BeautifulSoup(r, 'lxml').find('div', class_="page").find_all('a')[-2].text # print maxCount page_urls = [url + "/" + str(i) for i in range(1, int(maxCount) + 1)] return page_urls def fn2(url): r = requests.get(url, headers=headers, timeout=100).text
URL = "https://www.ptt.cc/" headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)\ Chrome/58.0.3029.96 Safari/537.36', "cookies": "over18=1" } board = "Soft_Job" #希望爬取ptt板面、但是頁面擷取的網址內容會缺乏URL,需要自行補上 res = requests.get(URL + "bbs/" + board + "/index.html", headers=headers) soup = BeautifulSoup(res.text, 'lxml') bottons = soup.select('a.btn.wide') #totalpage=上一頁的頁數+1 totalpage = int(bottons[1]['href'].split('index')[1].split('.')[0]) + 1 crawler = Crawler(open_thread=True) page_url = URL + "bbs/" + board + "/index{}.html" crawler.grab_pagelinks_th_auto(page_url, pttURL_crawler, totalpage, sleep_time=1, header=headers) links = crawler.get_alinks() print(links) # Call grab_content_th_auto to get content page by page crawler.grab_content_th_auto(links, content_func, sleep_time=2) # Call get_counter to get word count result
def crawl():
    crawler = Crawler(False)
    crawler.set_did(param_did)
    crawler.crawl()