def run(self):
    robot_url = "http://allrecipes.com/"
    root = 'http://allrecipes.com/Recipes/ViewAll.aspx?Page=1'
    depth_limit = 5
    # Raw strings so the regex escapes survive Python string escaping; the
    # literal dots before "aspx" are escaped as well.
    confine_reg = [r'http://allrecipes.com/Recipes/ViewAll\.aspx\?Page=[0-9]*$',
                   r'http://allrecipes.com/Recipe/[a-zA-Z0-9\-]*/Detail\.aspx$']
    c = Crawler(root, depth_limit, confine_reg, robot_url)
    c.crawl()
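# A quick, illustrative sanity check that the confinement patterns above match
# the two URL shapes they are meant to restrict the crawl to (the sample URLs
# are made up):
import re

page_re = r'http://allrecipes.com/Recipes/ViewAll\.aspx\?Page=[0-9]*$'
recipe_re = r'http://allrecipes.com/Recipe/[a-zA-Z0-9\-]*/Detail\.aspx$'
assert re.match(page_re, 'http://allrecipes.com/Recipes/ViewAll.aspx?Page=2')
assert re.match(recipe_re, 'http://allrecipes.com/Recipe/banana-bread/Detail.aspx')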
def runScan(target):
    crawler = Crawler()
    print("Scanning:", target)
    findings = {"target": target, "sqlinjection": [], "WeakPassword": []}
    if not crawler.init(target):
        return
    crawler.crawl()
    crawler.findLoginPanel()
    AuthBypass.check_authbypass(crawler.loginFormEndpoints, findings)
    WeakPasswords.check_weak_passwords(crawler.loginFormEndpoints, findings)
    if len(crawler.loginFormEndpoints) > 0:
        findings["loginForm"] = "yes"
    else:
        findings["loginForm"] = "no"
    sqli_scan_urls(crawler.uEndPoints, findings)
    sqli_scan_forms(crawler.fEndpoints, findings)
    CommonFunctions.save_findings(findings)
def main() -> None:
    with open('./urls.json') as f:
        urls = json.load(f)
    for i in urls:
        crawler = Crawler(i, urls[i])
        crawler.crawl()
        add_data(i, crawler.sorted_time_table)
    save()
def main():
    # Configuration
    configurator = Config("./ConfigFile.xml")
    ret = configurator.config()

    # Crawl
    crawler = Crawler()
    crawler.crawl()

    # Parse
    parser = Parser()
    parser.parse()
def scrape_and_crawl(input_page: str, file_path: str,
                     link_status_report: dict = None,
                     all_checked_links: dict = None,
                     is_local_file: bool = False):
    # None defaults avoid the shared-mutable-default-argument pitfall.
    if link_status_report is None:
        link_status_report = {}
    if all_checked_links is None:
        all_checked_links = {}
    scraper = Scraper()
    # Local files have no base URL for resolving relative links.
    if is_local_file:
        links = list(scraper.extract_links(input_page, ""))
    else:
        links = list(scraper.extract_links(input_page, file_path))
    crawler = Crawler(urls=links, checked=all_checked_links)
    crawler.crawl()
    checked_links = crawler.get_responses()
    link_status_report[file_path] = checked_links
    return checked_links, crawler.get_checked()
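# A hypothetical invocation (the file name is illustrative); assumes the
# Scraper and Crawler from this snippet's project are importable:
#
#     with open("docs/index.html") as fh:
#         statuses, checked = scrape_and_crawl(fh.read(), "docs/index.html",
#                                              is_local_file=True)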
def main():
    parser = argparse.ArgumentParser(
        description='Process parameters for searching')
    parser.add_argument('key', type=str, help='Input the key of the Google API')
    args = parser.parse_args()

    # command loop
    while True:
        print('1 Add new class data to events library\n'
              '2 Events classification\n'
              '3 Query classification\n'
              '4 Exit\n')
        cmd = input('Please input an option:\n')
        if cmd == '1':
            # create the crawler
            spider = Crawler(args.key)
            while True:
                print('\nEnter the search item and keywords like this\n'
                      'num_of_res item keyword_1 keyword_2 ... keyword_n\n'
                      "--- type 'exit' to exit ---\n")
                cmd = input('Please input your command\n')
                # input check for cmd
                if cmd == '':
                    print('Empty string!')
                elif cmd == 'exit':
                    break
                else:
                    cmd = cmd.split(' ')
                    if cmd[0].isdigit():
                        spider.crawl(cmd[0], cmd[2:], cmd[1])
                        print('crawling...')
                    else:
                        print('The number of search results is invalid!\n')
        elif cmd == '2':
            print('Events classifier is in development...\n')
        elif cmd == '3':
            print('Query classifier is in development...\n')
        elif cmd == '4' or cmd == 'exit':
            break
        else:
            print('Command error, please input your option again\n')
def main(self):
    if self.config.has_option("sources", "bootstrap"):
        self.bootstrap(filename=self.config.get("sources", "bootstrap"))
    b = Base(
        endpoint=self.config.get("xserver", "endpoint"),
        base=self.config.get("xserver", "base")
    )
    c = Crawler(base=b)
    c.crawl(callback=self.callback)
    self.processCache()
    self.addTopConcepts()
    self.addLinks()
    self.write()
    self.writeTables()
    shutil.rmtree("temp")
def crawl(max_page):
    text.delete('1.0', END)
    text.insert(END, 'Currently crawling, please wait\n')
    search_engine.update()
    count = int(max_page)
    while len(Crawler.queue) > 0 and count > 0:
        url = str(Crawler.queue.pop())
        Crawler.crawl(url)
        count -= 1
        text.insert(END, 'Currently crawling: ' + url + '\n')
        search_engine.update()
    print('Crawl finished, can now search')
    text.delete('1.0', END)
    text.insert(END, 'Crawl finished, can now search\n')
    text.insert(END, str(len(Crawler.crawled)) + " URLs have been crawled and indexed\n")
    text.insert(END, str(len(Crawler.queue)) + " URLs remain in the queue\n")
    search_engine.update()
    Crawler.save_lists()
def test_crawl_limit(self):
    c = Crawler("http://a.com")
    c.SLEEP_TIME = 0

    def side_effect():
        c.process_q.pop(0)

    c._process_next_url = mock.Mock(side_effect=side_effect)
    c.render_sitemap = mock.Mock()

    # Queue shorter than the limit: every queued URL is processed.
    c.URL_LIMIT = 10
    c.process_q = ["test"] * 5
    c.crawl()
    self.assertEqual(c._process_next_url.call_count, 5)

    # Queue longer than the limit: processing stops at the limit.
    c._process_next_url.call_count = 0
    c.process_q = ["test"] * 10
    c.URL_LIMIT = 5
    c.crawl()
    self.assertEqual(c._process_next_url.call_count, 5)

    # No limit: the whole queue is drained.
    c._process_next_url.call_count = 0
    c.process_q = ["test"] * 10
    c.URL_LIMIT = float("inf")
    c.crawl()
    self.assertEqual(c._process_next_url.call_count, 10)
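# The test above pins down a contract for Crawler.crawl(): process queued URLs
# until the queue empties or URL_LIMIT is reached, then render the sitemap.
# A minimal sketch of a loop satisfying that contract (hypothetical; the real
# Crawler implementation may differ):
import time

class SketchCrawler:
    SLEEP_TIME = 0
    URL_LIMIT = float("inf")

    def __init__(self, urls):
        self.process_q = list(urls)

    def _process_next_url(self):
        self.process_q.pop(0)  # stand-in for fetching and parsing a page

    def render_sitemap(self):
        pass  # stand-in for writing the sitemap

    def crawl(self):
        processed = 0
        while self.process_q and processed < self.URL_LIMIT:
            self._process_next_url()
            processed += 1
            time.sleep(self.SLEEP_TIME)
        self.render_sitemap()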
def scrape_documents(min_count=0):
    doc_count = 0
    s = Crawler()
    docs = s.crawl(min_count)
    # Assumes crawl() keeps yielding (or is re-iterable) until min_count is met.
    while min_count <= 0 or doc_count < min_count:
        for doc in docs:
            log.debug('uploaded image doc from %s', doc.url)
            doc_count += 1
            if doc_count % 100 == 0:
                log.info('%d images and counting...', doc_count)
            yield doc
def getWebPage(self, URL, depth):
    '''
    Retrieve all the text data from a webpage or set of webpages.

    @param URL: URL which is going to be the source
    @param depth: the depth of the links from the URL which should be
                  searched (default = 0)
    @return: string of all text from all webpages.
    '''
    if int(depth) != 0:
        t = ""
        crawler = Crawler(URL, int(depth) - 1)
        crawler.crawl()
        for l in crawler.links_remembered:
            text = self.Alchemy.URLGetText(str(l.dst))
            element = ET.XML(text)
            t += element.findtext("text")
    else:
        text = self.Alchemy.URLGetText(URL)
        element = ET.XML(text)
        t = element.findtext("text")
    return t.encode('ascii', 'ignore')
def test_crawl_inline(self):
    path_rules = {
        "start": "./",
        "file": {
            "include": ["\\.py$"]
        }
    }
    rules = {
        "search_author": {
            "include": "author",
            "result": {
                # Backslashes doubled consistently so the regex reaches the
                # engine intact (the original mixed "\\s" and "\s").
                "author": "author[\\s_]+=\\s+'([\\w\\s]+)'"
            }
        }
    }
    result = {
        "BUILT-IN": ["FILENAME"]
    }
    output = None
    crawl_res_sync = Crawler.crawl(path_rules, rules, result, output)
    current_test_file = "./test/test_crawler.py"
    self.assertIsNotNone(crawl_res_sync.get(current_test_file))
    self.assertTrue("matches" in crawl_res_sync[current_test_file]
                    and len(crawl_res_sync[current_test_file]) > 0)
    self.assertEqual(
        crawl_res_sync[current_test_file]["matches"]["search_author"]["author"][0],
        __author__)
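# For reference, a minimal illustration (with a made-up author string) of what
# the "search_author" capture group above extracts:
import re

m = re.search(r"author[\s_]+=\s+'([\w\s]+)'", "__author__ = 'Jane Doe'")
assert m.group(1) == 'Jane Doe'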
def main():
    urls = raw_input("\n Pages to crawl: ")
    maxLinksToCrawl = int(raw_input(" Maximum number of links to crawl: "))
    crawler = Crawler(urls, maxLinksToCrawl)
    crawler.crawl()
# Written by Kevin Keraudren, 14/06/2011

import argparse
from Crawler import Crawler

parser = argparse.ArgumentParser(usage="Usage: %(prog)s seed_url [options]")
parser.add_argument('seed', metavar='seed_url',
                    help='url for starting the crawl')
parser.add_argument('--dir', default='./',
                    help='root directory to store the result of the crawl')
parser.add_argument('--verbose', action='store_true', default=True,
                    help='verbose mode')
args = parser.parse_args()

crawler = Crawler(args.seed, rootdir=args.dir, verbose=args.verbose)
print(crawler)
crawler.crawl()
print(crawler)
print("Crawl complete")
import logging
import time

from Analyzer import Analyzer
from Cleaner import Cleaner
from Cluster import Cluster
from Crawler import Crawler
from Uploader import Uploader

this_date = time.strftime("%Y%m%d", time.localtime())

# Crawl the news
crawler = Crawler(this_date=this_date)
crawler.crawl()

# Cluster the articles
cluster = Cluster(date=this_date)
cluster.remove_useless_articles()
cluster.load_articles()
cluster.cluster()
cluster.upload_groups_to_DB()

# Sentiment analysis
analyzer = Analyzer(date=this_date)
analyzer.analyze()

# Upload to LeanCloud
uploader = Uploader(date=this_date)
uploader.upload_new_groups()

# Remove news groups that are too old or scored too low
cleaner = Cleaner(date=this_date)
def test_crawl_fake_directCrawl(self):
    parameters = FileOperations.get_from_JSON_file("./test/search_parameters.json")
    Crawler.crawl(parameters["crawling"], parameters["rules"],
                  parameters["result"], parameters["output"])
    self.assertTrue(os.path.isfile(parameters["output"]["path"]))
    result_from_file = FileOperations.get_from_JSON_file(parameters["output"]["path"])
    self.assertEqual(len(result_from_file), 3)
        auto_tags = []
        for regex, tags in self.regexes_tags:
            if regex.match(resource):
                auto_tags.extend(tags)
        resources_tags.append((resource, auto_tags))

    assert isinstance(resources_tags, list)
    return resources_tags


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~ Main ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    from Crawler import Crawler

    crawler = Crawler(white_list=set(('..',)))
    auto_tagger = AutoTagger({
        r'^.*\.py$': ['python', 'development'],
        r'^.*\.css$': ['css', 'development'],
        r'^.*\.js$': ['javascript', 'development'],
    })
    for resource, tags in auto_tagger.process(crawler.crawl()):
        print(resource, tags)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
from Crawler import Crawler

if __name__ == '__main__':
    spider = Crawler('severe weather')
    spider.crawl()
import sys
import signal

from Crawler import Crawler


def exit_handler(sig, frame):
    sys.exit(0)


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: main.py [root-url]")
        sys.exit(1)
    signal.signal(signal.SIGINT, exit_handler)
    crawler = Crawler()
    crawler.crawl(sys.argv[1])
import re

from Crawler import Crawler, Crawler_SQLite

if __name__ == '__main__':
    crawler = Crawler(Crawler_SQLite('crawler.db'), depth=2)
    # Predicate matching the root path, presumably so the root page is always
    # re-fetched rather than served from the SQLite cache.
    root_re = re.compile('^/$').match
    crawler.crawl('http://wmp.uksw.edu.pl', no_cache=root_re)
    print(crawler.content['wmp.uksw.edu.pl'].keys())
    print(len(crawler.content['wmp.uksw.edu.pl'].keys()))
def main():
    config = get_args()
    crawler = Crawler(config)
    crawler.crawl()
import os.path

parser = argparse.ArgumentParser(description='Crawl files and execute regex rules on them')
parser.add_argument('-p', metavar='ParameterFilePath', type=argparse.FileType('r'), required=True,
                    help="path to a parameter JSON file. The parameter file should contain "
                         "'crawling', 'rules' and 'result' keys")
parser.add_argument('-o', metavar='OutputFilePath', type=argparse.FileType('w+'),
                    help='output file. This argument is required if no output is specified in the '
                         'parameter file. The file must be either a .csv or a .json')
parser.add_argument('-mt', metavar='ThreadNumber', type=int,
                    help='run a multi-threaded crawler (1 thread per file) with the given number '
                         'of concurrent threads')
parser.add_argument('-s', metavar='StartDirectory', type=str,
                    help='directory in which the crawling will start. This parameter is necessary '
                         'if there is no "crawling" dictionary in the parameter file')
args = parser.parse_args()

# -p is required, so argparse guarantees args.p is set here.
param = FO.get_from_JSON_file(args.p.name)
if "rules" not in param or (args.o is None and "output" not in param):
    parser.error(parser.format_usage())
if "crawling" not in param and args.s is None:
    parser.error(parser.format_usage())
elif args.s is not None:
    param["crawling"] = {"start": args.s}
if args.o is not None:
    output_name, output_extension = os.path.splitext(args.o.name)
    param["output"] = {
        "path": args.o.name,
        "type": "csv" if ".csv" in output_extension else "json"
    }

if args.mt is not None:
    Crawler.crawl_multithread(param.get("crawling"), param.get("rules"),
                              param.get("result"), param["output"], args.mt)
else:
    Crawler.crawl(param.get("crawling"), param.get("rules"),
                  param.get("result"), param["output"])
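# For illustration, a hypothetical parameter file matching the keys this
# script expects ("crawling", "rules", "result", "output"); the values are
# made up but mirror the structure used in the tests above:
#
# {
#     "crawling": {"start": "./src", "file": {"include": ["\\.py$"]}},
#     "rules": {
#         "search_todo": {"include": "TODO",
#                         "result": {"todo": "TODO[:\\s]+(.*)"}}
#     },
#     "result": {"BUILT-IN": ["FILENAME"]},
#     "output": {"path": "./out.json", "type": "json"}
# }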
import re
from urllib.parse import urlparse

from Extractor import Extractor
from UrlMatcher import UrlMatcher
from Crawler import Crawler


class Title(Extractor):
    def get_url(self, url, bs):
        data = url
        msg = ''
        return data, msg

    def get_title(self, url, bs):
        data = bs.head.title.get_text()
        msg = ''
        return data, msg


args = {
    'scheme_pattern': r'http|https',
    'domain_pattern': r'.*',
    'path_pattern': r'.*',
    'extractors': [Title('title')],
    'workingList': [],
    'autoAddInternalLinks': True
}

c = Crawler(**args)
c.crawl()
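# Following the same pattern, further extractors can be plugged in. A
# hypothetical sketch reusing the Extractor contract shown above, where each
# get_* method receives the URL and a BeautifulSoup document and returns a
# (data, msg) pair:

class MetaDescription(Extractor):
    def get_description(self, url, bs):
        # Pull the content of <meta name="description">, if present.
        tag = bs.find('meta', attrs={'name': 'description'})
        data = tag['content'] if tag and tag.has_attr('content') else ''
        msg = ''
        return data, msg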
def start(url, numpages):
    if url != '' and numpages > 0:
        crawler = Crawler('Output', url, 5)
        crawler.crawl(numpages)
    else:
        raise ValueError('The input is invalid')