def main():
    """Run two Connector/Crawler pairs over three seed URLs.

    Builds a task queue holding the three seed tasks, starts two
    ``Connector`` workers (ports 3000 and 3001) that share one output file,
    starts the two matching ``Crawler`` workers, and waits for all four.
    """
    seed_urls = (
        "http://www.laurentluce.com/posts/python-threads-synchronization-locks-rlocks-semaphores-conditions-events-and-queues/",
        "http://stackoverflow.com/questions/15651128/in-this-semaphore-example-is-it-necessary-to-lock-for-refill-and-buy",
        "http://bbs.byr.cn/",
    )
    seed_tasks = [task(url) for url in seed_urls]

    event = Event()
    tasks = TaskQueue(event)
    pages = TaskQueue(None)
    for seed in seed_tasks:
        tasks.add(seed)

    taskLock = BoundedSemaphore(tasks.numOfNewTasks)
    pageLock = BoundedSemaphore(1)

    # The context manager guarantees the shared output file is closed even
    # when a start()/join() call raises.
    with open("test.txt", 'w') as f:
        Connector0 = Connector(tasks, taskLock, pages, pageLock, event, '', f, 3000)
        Connector1 = Connector(tasks, taskLock, pages, pageLock, event, '', f, 3001)
        Connector0.start()
        Connector1.start()

        Crawler0 = Crawler('', 3000)
        Crawler1 = Crawler('', 3001)
        Crawler0.start()
        Crawler1.start()

        # Same join order as before; the file stays open until every
        # worker has finished.
        Connector1.join()
        Connector0.join()
        Crawler0.join()
        Crawler1.join()
def __init__(self):
    """Initialise the ``Crawler`` base class, then reset this object's fields."""
    Crawler.__init__(self)
    # Start with empty history, status-code and cookie fields.
    self.history = ''
    self.status_code = ''
    self.crawl_cookie = {}
def run(self):
    """Crawl allrecipes.com starting from page 1 of the "ViewAll" listing.

    Builds a ``Crawler`` with a depth limit of 5 and two URL patterns
    (listing pages and recipe detail pages), then calls its ``crawl()``.
    """
    robot_url = "http://allrecipes.com/"
    root = 'http://allrecipes.com/Recipes/ViewAll.aspx?Page=1'
    depth_limit = 5
    # Raw strings: "\?", "\=" and "\-" are not valid string escapes, so a
    # plain literal triggers an invalid-escape warning. The pattern text
    # is unchanged.
    confine_reg = [
        r'http://allrecipes.com/Recipes/ViewAll.aspx\?Page\=[0-9]*$',
        r'http://allrecipes.com/Recipe/[a-zA-Z0-9\-]*/Detail.aspx$',
    ]
    c = Crawler(root, depth_limit, confine_reg, robot_url)
    c.crawl()
def main():
    """Create a ``Crawler`` and run it; print a notice if interrupted by Ctrl-C."""
    try:
        Crawler().go()
    except KeyboardInterrupt:
        print("Stopped!")
def start():
    """Start the crawl process: take one node from the pending set and fetch it.

    Blocks until the pending set is non-empty, pops one user token, then
    tries up to ``try_limit`` times to fetch that user's profile through a
    proxy. The visible part of the function ends after the retry loop.
    """
    # Initialise the storage clients.
    mongo_peoples, redis_client = Init()

    # Wait (polling every ``wait_time`` seconds) while the pending set is empty.
    while redis_client.scard(info_success_set) == 0:
        time.sleep(wait_time)

    # Pop one node from the pending set.
    node = redis_client.spop(info_success_set)
    urlToken = node

    # Fetch the profile page of the user this node represents.
    printx('正在抓取用户 %s 的个人信息……' % urlToken)
    try_cnt = try_limit
    while try_cnt > 0:
        try:
            c = Crawler(isCookie=False, timeout=socket_timeout)
            # Set the proxy IP by hand.
            ip = proxyip.get()
            c.set_proxyip(ip)
            people = get_Info(c, urlToken)
            if people is None:
                # Call form instead of the Python-2-only "raise X, msg".
                raise Exception('抓取的用户信息为空')
        except Exception as e:  # "except X, e" is a syntax error on Python 3
            try_cnt -= 1
            print(e)
            printx('用户 %s 个人信息抓取出错,还可以尝试抓取 %d 次' % (urlToken, try_cnt))
        else:
            break
def __init__(self):
    """Initialise the ``crawler`` base, then blank out every article field."""
    crawler.__init__(self)
    # All article fields start as empty strings.
    for field in ('title', 'press', 'date', 'time', 'contents'):
        setattr(self, field, '')
def __init__(self):
    """Run the full demo: crawl, compute PageRank, index, then score queries.

    Each stage prints its own result under a heading.
    """
    # Stage 1: crawl the configured links.
    crawler = Crawler(self.LINKS)
    visited = crawler.getVisited()
    links = crawler.getLinkStructure()
    print("Link-Struktur:\n")
    crawler.printLinkStructure()

    # Stage 2: PageRank over the link structure.
    ranker = PageRank(links)
    ranks = ranker.getPageRank()
    print("\n\nPageRanks:\n")
    ranker.printPageRank()

    # Stage 3: index over the visited URLs.
    indexer = Index(self.STOPWORDS, visited)
    built_index = indexer.getIndex()
    print("\n\nIndex:\n")
    indexer.printIndex()

    # Stage 4: scoring. ``usePageRank(True)`` is deliberately not called.
    scorer = Scorer(ranks, built_index, links)
    print("\n\nDokumentenlängen:\n")
    scorer.printDocumentLengths()

    print("\n\nSuchergebnisse:\n")
    sample_queries = (
        ["tokens"],
        ["index"],
        ["classification"],
        ["tokens", "classification"],
    )
    for terms in sample_queries:
        scorer.calculateScores(terms)
def get_article(self, url):
    """Download the article at *url* and fill title, date, time, press, contents.

    Raises ``IndexError`` when one of the expected page elements is missing.
    """
    # Fetch and parse the page.
    html = Crawler().get_page(url)
    soup = BeautifulSoup(html, 'html.parser')

    # Strip the "link news" blocks before reading any text.
    for node in soup('div', {'class': 'link_news'}):
        node.extract()

    # Article title.
    self.title = soup('h3', {'id': 'articleTitle'})[0].text

    # Creation date and time (whitespace-separated in one element).
    stamp_parts = soup('span', {'class': 't11'})[0].text.split()
    self.date = stamp_parts[0]
    self.time = stamp_parts[1]

    # Press name is the alt text of the logo image.
    logo_box = soup('div', {'class': 'press_logo'})[0]
    self.press = logo_box.find('img')['alt']

    # Article body with line breaks removed.
    body_text = soup('div', {'id': 'articleBodyContents'})[0].text
    self.contents = body_text
    self.contents = re.sub('[\n\r]', '', self.contents)
def test_crawl_native_fakeCrawler(self):
    """Run ``crawl_native`` with file-loaded parameters and expect 3 saved results."""
    parameter_file = "./test/search_parameters.json"
    params = FileOperations.get_from_JSON_file(parameter_file)
    crawler = Crawler("SimpleTest", parameters=params)
    self.assertEqual(crawler.name, "SimpleTest")

    crawler.crawl_native()

    self.assertTrue(os.path.isfile(parameter_file))
    saved = FileOperations.get_from_JSON_file(crawler.output["path"])
    self.assertEqual(len(saved), 3)
def test_crawl_multithread_mmcoreAsync(self):
    """Crawl with the "dotAsync" settings (``threads=None``) and save the data."""
    crawlerName = "dotAsync"
    settings = FileOperations.get_from_JSON_file("./test/search_async.json")["crawlers"]
    crawler = Crawler(crawlerName, parameters=settings[crawlerName])

    data = crawler.crawl_native(threads=None)

    self.assertTrue(len(data) > 0)
    crawler.save_crawler_data(data, settings[crawlerName]["output"])
def test_crawl_clientIntegrations(self):
    """Crawl with the "Integration" settings and save the non-empty result."""
    crawlerName = "Integration"
    settings = FileOperations.get_from_JSON_file("./test/search_integration.json")["crawlers"]
    crawler = Crawler(crawlerName, parameters=settings[crawlerName])

    data = crawler.crawl_native()

    self.assertTrue(len(data) > 0)
    crawler.save_crawler_data(data, settings[crawlerName]["output"])
def __init__(self, forced=False):
    """Initialise the ``Crawler`` base and this object's bookkeeping fields.

    :param forced: stored unchanged on ``self.forced``.
    """
    Crawler.__init__(self)
    self.forced = forced
    self.results = set()
    self.blacklist = []
    # Counters are unset (None) until assigned elsewhere.
    self.success_count = self.failure_count = None
    self.name_exceptions = [
        "http://www.cplusplus.com/reference/string/swap/",
    ]
def test__process_html(self):
    """``_process_html`` dispatches 3 assets and 4 links for the test document."""
    soup = BeautifulSoup(self.html_test_string)
    crawler = Crawler("http://test.com")
    # Replace both handlers with mocks so only dispatch counts are checked.
    asset_handler = crawler._process_html_asset = mock.Mock()
    link_handler = crawler._process_html_link = mock.Mock()

    crawler._process_html(soup)

    self.assertEqual(asset_handler.call_count, 3)
    self.assertEqual(link_handler.call_count, 4)
def test_render_sitemap(self):
    """``render_sitemap`` creates ``sitemap.pdf`` in the working directory."""
    target = "sitemap.pdf"
    # Start from a clean slate; a missing file is fine.
    try:
        os.remove(target)
    except OSError:
        pass
    self.assertEqual(os.path.exists(target), False)

    Crawler("http://a.com").render_sitemap()

    self.assertEqual(os.path.exists(target), True)
def test__process_html_link(self):
    """Feeding every ``<a>`` tag to ``_process_html_link`` fills sitemap and queue."""
    crawler = Crawler("http://test.com")
    anchors = BeautifulSoup(self.html_test_string).find_all("a")
    for anchor in anchors:
        crawler._process_html_link(anchor, "/")

    self.assertEqual(len(crawler.sitemap.nodes()), 3)
    self.assertEqual(len(crawler.sitemap.edges()), 2)
    self.assertEqual(len(crawler.process_q), 3)
def test__process_html_good_asset(self):
    """Assets whose static file "exists" (mocked True) are added to the sitemap."""
    crawler = Crawler("http://test.com")
    soup = BeautifulSoup(self.html_test_string)
    exists_mock = mock.Mock(return_value=True)
    crawler._does_static_file_exist = exists_mock

    # Every tag that carries a ``src`` attribute counts as an asset.
    for tag in soup.find_all(True, src=True):
        crawler._process_html_asset(tag, "/")

    self.assertEqual(exists_mock.call_count, 2)
    self.assertEqual(len(crawler.sitemap.nodes()), 3)
    self.assertEqual(len(crawler.sitemap.edges()), 2)
def test__parse_url(self):
    """``_parse_url`` returns an absolute URL for paths and None for the other inputs."""
    expectations = {
        "http://www.test2.com": None,
        "/a/b/c": "https://www.test.com/a/b/c",
        "/?q=1": "https://www.test.com/?q=1",
        "https://sub.test.com": None,
    }
    for raw_url, expected in expectations.items():
        # A fresh crawler per case, as in the original test.
        crawler = Crawler("https://www.test.com")
        self.assertEqual(crawler._parse_url(raw_url), expected)
def test_saveLog(self):
    """Check that ``saveLog`` writes the log entries, newline-joined, to ``logPath``."""
    crawler = Crawler()
    crawler.log = ['first', 'second']
    crawler.saveLog()

    # The log file must exist and be a regular file.
    file_ok = os.path.exists(crawler.logPath) and os.path.isfile(crawler.logPath)
    self.assertTrue(file_ok)

    # The log file content must match the entries.
    with open(crawler.logPath, 'r') as fin:
        written = fin.read()
    self.assertEqual(written, 'first\nsecond')
def crawl_one():
    """Handle the crawl endpoint.

    GET returns ``"Hello"``. POST reads ``yr``/``month``/``day``,
    ``duration`` (days) and ``id`` from the JSON body, calls
    ``Crawler.crawl_one`` and returns its result as a JSON string.
    Any other method returns ``None``.
    """
    method = request.method
    if method == "GET":
        return "Hello"
    if method != "POST":
        return None

    payload = request.get_json()
    print(payload)
    crawler = Crawler()
    start_date = datetime.date(
        int(payload["yr"]), int(payload["month"]), int(payload["day"]))
    span = datetime.timedelta(days=int(payload["duration"]))
    listing_id = payload["id"]
    return json.dumps(crawler.crawl_one(listing_id, start_date, span))
def make_database(db):
    """Fill *db* with the courses and users fetched from the students API.

    Users are fetched before courses, but courses are inserted first.
    """
    source = Crawler('https://hackbulgaria.com/api/students/')
    fetched_users = source.get_users()
    fetched_courses = source.get_courses()

    # First table: courses.
    for entry in fetched_courses:
        db.add_course(entry)

    # Second and third tables: users.
    for entry in fetched_users:
        db.add_user(entry)
def start():
    """Parse the command line and start crawling the target URL.

    Arguments: ``url`` (positional) and ``-n/--number-of-levels``
    (int, default 1). Crawling always begins at level 1.
    """
    cli = argparse.ArgumentParser(description="")
    cli.add_argument("-n", "--number-of-levels", type=int, default=1, help="")
    cli.add_argument("url", nargs=1, help="target URL")
    options = cli.parse_args()

    # ``nargs=1`` yields a one-element list; take that element.
    target_url = options.url.pop()
    max_level = options.number_of_levels
    first_level = 1
    Crawler().startCrawler(target_url, first_level, max_level)
def __init__(self, forced=False):
    """Set up the ``Crawler`` base and the result/counter fields.

    :param forced: stored unchanged on ``self.forced``.
    """
    Crawler.__init__(self)
    self.forced = forced
    self.results = set()
    self.blacklist = []
    # Both counters stay None until assigned elsewhere.
    self.success_count = None
    self.failure_count = None
    self.name_exceptions = ['http://www.cplusplus.com/reference/string/swap/']
def __init__(self, query):
    """Build a filtered query from *query* and start a ``Crawler`` for it.

    ``POS.POS(query)`` yields items whose first element is a word; the
    words are concatenated with single spaces, printed, and passed to
    ``Crawler(2, 1000, ...)``, which is then started.
    """
    self.query = ""
    for tagged in POS.POS(query):
        word = tagged[0]
        # No separator while the accumulated query is still empty.
        if self.query != "":
            self.query += " " + word
        else:
            self.query += word
    print(self.query)

    self.crawler = Crawler(2, 1000, self.query)
    self.crawler.start()
def reindex():
    """Crawl the site again and rebuild the stored reverse index.

    Crawls from the fixed root with an indexing limit of 100, loads the
    stop-word list that sits next to this script, then builds the reverse
    index and saves it and the page digests under ``assets/``.

    :return: always ``True``.
    """
    # Indexing limit (N) and crawl root.
    limit = 100
    root = "http://lyle.smu.edu/~fmoore/"
    myCrawl = Crawler(root, limit)
    myCrawl.startCrawl()

    # Load the stop words; the context manager closes the file handle,
    # which the previous open(...).read() chain never did.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    stop_words_path = os.path.join(script_dir, './assets/stopwords.txt')
    with open(stop_words_path) as stop_words_file:
        stopWords = stop_words_file.read().splitlines()

    # Build the reverse index and persist both artefacts.
    indexer = Indexer(myCrawl.getTreeIndex(), stopWords)
    indexer.createReverseIndex() \
        .saveIndex('assets/reverse_index.json') \
        .savePageDigests('assets/page_digests.json')
    return True
def main():
    """Run the three stages in order: configure, crawl, parse."""
    # Configuration (the result of ``config()`` is not used here).
    Config("./ConfigFile.xml").config()

    # Crawl
    Crawler().crawl()

    # Parse
    Parser().parse()
def __init__(self, b_type='Chrome', username='******', pwd='Blank123'):
    """Store the credentials and create the ``Crawler`` controller.

    :param b_type: passed to ``Crawler`` as ``type``.
    :param username: account name; a name containing ``'simas'`` prints
        a warning that the real account is in use.
    :param pwd: account password.
    """
    is_real_account = 'simas' in username
    if is_real_account:
        print(
            'This is the real account. Please be cautious while trading. semiAuto mode is recommended.'
        )

    self.controller = Crawler(type=b_type)
    self.username, self.pwd = username, pwd
    # Browser and strategy are unset until assigned elsewhere.
    self.browser = None
    self.strategy = None
def main():
    """Feed book URLs to the crawler while fewer than 5 threads are counted.

    Clears the error log first. URLs come from ``get_one_url()`` until it
    returns ``None``. When ``thread_count.total`` is 5 or more the loop
    sleeps 10 seconds and re-checks.
    """
    thread_count = ThreadCount()   # thread counter object
    error_log = ErrorLog()         # error-log recorder
    error_log.clear_error_log()    # create the file if missing, then empty it
    crawler = Crawler(thread_count)

    url = get_one_url()
    while url is not None:
        if thread_count.total >= 5:
            # Too many threads: sleep 10 seconds and check again.
            sleep(10)
            continue
        print("加载:" + url)
        crawler.get_book(url)      # let the crawler fetch the book page
        url = get_one_url()        # next URL
def test__nromalize_url(self):
    """``_normalize_url`` drops fragments and normalises the split URL."""
    expectations = {
        "http://www.a.com#abc": "http://www.a.com/",
        "http://www.a.com/a/b/c": "http://www.a.com/a/b/c",
        # Without a scheme, urlsplit treats the domain name as the path,
        # so no trailing "/" is expected after www.a.com.
        "www.a.com?abc=123#abc": "://www.a.com?abc=123",
    }
    for raw_url, expected in expectations.items():
        parts = urlparse.urlsplit(raw_url)
        crawler = Crawler("http://mydomain.com")
        self.assertEqual(crawler._normalize_url(parts), expected)
def _count_process_done(self, files, folders, others, total):
    """Start the crawler after counting, unless counting was cancelled.

    On cancel only the button states are reset. Otherwise a ``Crawler``
    is created with the counts, its signals are wired to this object's
    handlers, and it is started.
    """
    if self.counter.canceled:
        self._end_process_set_button_states()
        return

    worker = Crawler(self, files, folders, others, total)
    self.crawler = worker
    worker.processDone.connect(self._process_done)
    worker.updateStatusBar.connect(self._update_status_bar)
    worker.setProgressBarValue.connect(self._update_progress_bar_value)
    worker.updateDuplicateInformation.connect(
        self._update_duplicate_information)
    worker.createDuplicateInformation.connect(
        self._create_duplicate_information)
    worker.start()
def valuta_sito_web(self, sito):
    """Evaluate the web site *sito* and return the resulting report.

    Builds a page report with ``Crawler`` (printed as JSON) and a photo
    report with ``ImageProcessor``, then combines them through
    ``self.evaluate_report``.
    """
    page_crawler = Crawler()
    photo_processor = ImageProcessor()

    page_report = page_crawler.generate_reportpagine(sito)
    print(page_report.toJSON())

    # TODO: photo report generation
    photo_report = photo_processor.generate_report_foto('web')
    # TODO: delete the photo directory

    # TODO: evaluate the report
    return self.evaluate_report(sito, page_report, photo_report)
def scrape_documents(min_count=0):
    """Yield the documents produced by ``Crawler().crawl(min_count)``.

    Each document is logged at debug level, and a running total is logged
    at info level every 100 documents.

    :param min_count: passed to ``crawl`` and used in the outer loop
        condition; a value ``<= 0`` keeps that condition true forever.
    """
    doc_count = 0
    s = Crawler()
    docs = s.crawl(min_count)
    # NOTE(review): the inner ``for`` never checks ``min_count``, so the
    # count limit is only re-evaluated after ``docs`` is exhausted. If
    # ``docs`` is a finite iterable and ``min_count <= 0`` (or fewer than
    # ``min_count`` docs exist), this outer loop looks like it never ends.
    # Confirm against what ``Crawler.crawl`` returns.
    while min_count <= 0 or doc_count < min_count:
        for doc in docs:
            log.debug('uploaded image doc from %s', doc.url)
            doc_count += 1
            # Progress line every 100 documents.
            if doc_count % 100 == 0:
                log.info('%d images and counting...', doc_count)
            yield doc
def test_extract1(self):
    """Test extracting a single string from a web page.

    In this case every target of the extractor exists on the page.
    """
    crawler = Crawler(extractors=[self.Extractor_1('title')])
    page_url = self.testUrls[0]
    page_bs = crawler.getPageBs(page_url)

    message = crawler.extract(page_url, page_bs)

    # The returned information must be correct.
    self.assertEqual(message, 'Get title: An Interesting Title')

    # The record file must be correct.
    expected_record = {'0': {'title': 'An Interesting Title'}}
    with open('tmp/title.json', 'r') as fin:
        recorded = json.loads(fin.read())
    self.assertEqual(recorded, expected_record)
def main(start=0, end=len(const.city_code)):
    """Fetch info for every ordered city pair whose origin index is in range.

    For each origin index in ``[start, end)`` that ``check_compete`` does
    not report as done, one ``get_all_info(origin, destination)`` job per
    other city is submitted to a process pool. Returns after all submitted
    jobs have finished.

    :param start: first origin index (converted with ``int``).
    :param end: one past the last origin index (converted with ``int``).
    """
    city_list = const.city_code
    c = Crawler()
    # Leave two cores free, but never ask for fewer than one worker:
    # Pool(0) or Pool(-1) raises ValueError on machines with <= 2 CPUs.
    worker_count = max(1, multiprocessing.cpu_count() - 2)
    pool = multiprocessing.Pool(worker_count)
    for i in range(int(start), int(end)):
        if c.check_compete(city_list[i]):
            continue
        for j, destination in enumerate(city_list):
            if i != j:
                pool.apply_async(c.get_all_info, (city_list[i], destination))
    pool.close()
    pool.join()
class CrawlerTest(unittest.TestCase):
    """Smoke test: navigate to a page and print the links ``findNext`` returns."""

    def setUp(self):
        """Create a fresh ``Crawler`` for each test."""
        self.crawler = Crawler()

    def testNavigateAndRetrieveLinks(self):
        """Navigate to google.ca and print every match for ``.*``."""
        crawler = self.crawler
        crawler.navigate("http://www.google.ca")
        ret = crawler.findNext(".*")
        for r in ret:
            # Function-call form works on Python 2 and 3; the old
            # ``print r`` statement is a syntax error on Python 3.
            print(r)

    def tearDown(self):
        """Close the crawler created in ``setUp``."""
        self.crawler.close()
def _crawl_prompt_loop(spider):
    """Read crawl commands until the user types ``exit``.

    Each command is ``num_of_res item keyword_1 ... keyword_n``; it is
    forwarded as ``spider.crawl(num_of_res, keywords, item)``.
    """
    while True:
        print('\nEnter the search item and keywords like this\n'
              'num_of_res item keyword_1 keyword_2 ... keyword_n\n'
              "--- type 'exit' to exit ---\n")
        cmd = input('Please input your command\n')
        if cmd == '':
            print('Empty string!')
            continue
        if cmd == 'exit':
            return

        parts = cmd.split(' ')
        if not parts[0].isdigit():
            print('The number of search item is invalid!\n')
            continue
        if len(parts) < 2:
            # A bare number used to crash with IndexError on parts[1].
            print('The search item is missing!\n')
            continue
        spider.crawl(parts[0], parts[2:], parts[1])
        print('crawling...')


def main():
    """Interactive menu for the events library.

    Takes the Google API key as the single positional argument ``key``.
    Option 1 opens the crawl prompt, options 2 and 3 only print a
    placeholder, option 4 (or ``exit``) leaves the menu.
    """
    parser = argparse.ArgumentParser(
        description='Process Parameters for searching')
    parser.add_argument('key', type=str, help='Input the keys of Google API')
    args = parser.parse_args()

    # Command loop.
    while True:
        print('1 Add new class data to events library\n'
              '2 Events classification\n'
              '3 Query classification\n'
              '4 Exit\n')
        cmd = input('Please input a option:\n')
        if cmd == '1':
            # A new crawler is created every time option 1 is chosen.
            _crawl_prompt_loop(Crawler(args.key))
        elif cmd == '2':
            print('Events classifier in developing...\n')
        elif cmd == '3':
            print('Query classifier in developing...\n')
        elif cmd == '4' or cmd == 'exit':
            break
        else:
            print('Command error, please input your option again\n')
def crawl(seed_list, token):
    """Start one ``Crawler`` per seed and report progress every 10 seconds.

    Each crawler is named after the third ``/``-separated piece of its
    seed. The status loop never terminates on its own.
    """
    # One crawler per seed.
    workers = []
    for seed_url in seed_list:
        worker = Crawler(seed_url.split('/')[2], seed_url, token)
        worker.start()
        workers.append(worker)

    # Simple work checker.
    while True:
        time.sleep(10)
        print('#######{}########'.format(time.ctime()))
        for worker in workers:
            status = '{}: {}/{}'.format(
                worker.name, worker.num_post, worker.num_delete)
            print(status)
def step2_download_zipfiles():
    """Download historical data for every currency using a pool of threads.

    Creates the raw-data directory under the desktop directory if needed,
    reads the currency list from the database, and starts
    ``THREAD_NUMBER`` threads that each drain the shared list with their
    own ``Crawler``. The threads are started but not joined here.
    """
    desktop_path = Helper.get_desktop_dir()
    directory = os.path.join(desktop_path, RAW_DATA_PATH)
    if not os.path.exists(directory):
        os.makedirs(directory)

    db = Database()
    currency_list = db.get_currency_list()
    crawler_list = [Crawler(db) for _ in range(THREAD_NUMBER)]
    lock = threading.RLock()

    def down_data(crawler):
        """Worker: take currencies off the shared list until it is empty."""
        while True:
            # The emptiness check and the removal must happen under the
            # same lock. Checking outside it let two threads both see one
            # remaining item, and the loser crashed with IndexError.
            with lock:
                if not currency_list:
                    break
                currency = currency_list.pop(0)
            crawler.download_historical_data(
                currency["symbol"], currency["time"], directory)
        crawler.quit()

    for crawler in crawler_list:
        t = threading.Thread(target=down_data, args=(crawler,))
        t.start()
class Getter():
    """Fetch proxies through the crawler and store them in Redis."""

    def __init__(self):
        # Create the storage client and the crawler.
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Return True when the proxy pool has reached the configured size."""
        return self.redis.count() >= POOL_UPPER_HRESHOLD

    def run(self):
        """Run every crawl function and add the proxies, unless the pool is full."""
        print('获取器开始执行...')
        if self.is_over_threshold():
            return
        # Walk the registered crawl functions by index.
        for index in range(self.crawler.__CrawlFuncCount__):
            crawl_func = self.crawler.__CrawlFunc__[index]
            # Fetch the proxies produced by this crawl function.
            found = self.crawler.get_proxies(crawl_func)
            sys.stdout.flush()
            # Store every proxy that was fetched.
            for proxy in found:
                self.redis.add(proxy)
class Main(Process):
    """Polls the database and starts the crawler when the target URL is enabled.

    NOTE(review): the methods use the *module-level* global
    ``urlToCrawl``, not the class attribute of the same name declared
    below. Confirm this is intended.
    NOTE(review): ``__init__`` does not call the ``Process`` initialiser
    and ``start`` is overridden with a different signature. Confirm
    against how ``Process`` is used.
    """

    urlToCrawl = ''
    crawling = False      # shared flag: True while a crawl should be run
    crawler = Crawler()   # one crawler shared by all instances
    db = DbHandler()      # one DB handler shared by all instances

    def __init__(self, url=None):
        """Remember *url* in the module-level ``urlToCrawl`` when given."""
        global urlToCrawl
        if url is not None:
            urlToCrawl = url

    def start(self, url):
        """Set the target URL and enter the polling loop (blocks)."""
        global urlToCrawl, crawler
        urlToCrawl = url
        self.idle()

    def printen(self, url):
        """Print *url*."""
        # Function-call form: ``print url`` is a syntax error on Python 3.
        print(url)

    def idle(self):
        """Poll until the URL is enabled, crawl it, and repeat.

        The outer loop runs while the ``'crawler'`` state is truthy. The
        inner loop re-checks the URL's state every 30 minutes.
        """
        while self.db.getCrawlstate('crawler')[0]:
            while not Main.crawling:
                status = self.db.getCrawlstate(urlToCrawl)
                if status[0]:
                    Main.crawling = True
                else:
                    time.sleep(1800)
            Main.crawling = False
            Main.crawler.startCrawler(urlToCrawl, status[1])
def getPriceInfo(self, code):
    """Fetch the quote line for *code* and return its comma-separated fields.

    Sleeps a random 0-3 seconds first. The page is fetched as GBK via
    ``Crawler.getPage``; the fields are taken from the text between the
    first pair of double quotes.
    """
    time.sleep(random.uniform(0, 3))
    quote_url = self.sinaApi % (code)
    raw_page = Crawler.getPage(quote_url, "gbk")
    quoted_sections = raw_page.split('"')
    return quoted_sections[1].split(',')
def main():
    """Command-line entry point: ``context [place] [--clean]``.

    With 1 to 3 arguments a ``Crawler(place, context)`` is opened and
    ``recursivePull()`` is run. Unless ``--clean`` was given, saved state
    is loaded first when both list files exist. Any other argument count
    prints the help text.
    """
    argc = len(sys.argv)
    if not (argc > 1 and argc < 5):
        printHelp()
        return

    context = sys.argv[1]
    place = "./"
    clean = False
    if argc == 3:
        # The single extra argument is either the flag or the place.
        if sys.argv[2] == "--clean":
            clean = True
        else:
            place = sys.argv[2]
    elif argc == 4:
        place = sys.argv[2]
        if sys.argv[3] == "--clean":
            clean = True
        else:
            print("Unrecognized option '" + sys.argv[3] + "'")

    with Crawler(place, context) as spider:
        if not clean:
            have_state = (os.path.exists(spider.listFileName)
                          and os.path.exists(spider.visitedFileName))
            if have_state:
                spider.loadState(spider.listFileName, spider.visitedFileName)
            else:
                print("Could not load one or more of the URL lists.")
        spider.recursivePull()
def CompanyInfoThread(category):
    """Save the company info of every company in a job category as JSON.

    Reads the distinct company links whose category code starts with
    *category* from ``db_104.job_link``, fetches each company's info with
    ``Crawler.company_info`` and writes one ``<name>.json`` file per
    company. A failure for one company is reported and skipped.

    :param category: category-code prefix to match.
    """
    # Open the SQL connection.
    conn_cfg = {'host': '', 'user': '', 'password': '', 'db': ''}
    conn = pymysql.connect(**conn_cfg)
    try:
        cursor = conn.cursor()
        # Parameterised query: the driver escapes the value, so a quote in
        # *category* can no longer break out of the LIKE pattern.
        sql = "select distinct `公司連結` from db_104.job_link where `職類編號` LIKE %s"
        cursor.execute(sql, ("%s%%" % category,))
        companylink = [row[0] for row in cursor.fetchall()]  # company-link list
    finally:
        # Close the connection even when the query fails.
        conn.close()

    # NOTE(review): ``dn`` is a placeholder and is joined to the file name
    # with no separator - confirm it is meant to end with a path separator.
    dn = "[directory]"

    # Read each company link in turn and store the company info as JSON.
    for l in companylink:
        try:
            url = "https://" + l
            content = Crawler.company_info(url)
            fn = url.split("company/")[1].split("?")[0]
            if not os.path.exists(dn):
                os.makedirs(dn)
            with open(dn + fn + ".json", "w", encoding="utf-8") as f:
                json.dump(content, f)
            print(url)
            print("職務類型", category, "的公司:", fn, "complete")
        except Exception as exc:
            # Still best-effort per company, but no longer silent, and
            # Ctrl-C is no longer swallowed.
            print("skipped", l, "-", exc)
def start():
    """Read the target URL and depth from the command line, then crawl.

    ``url`` is the required positional argument and
    ``-n/--number-of-levels`` (int, default 1) is the maximum depth. The
    crawl starts at depth 1.
    """
    arg_parser = argparse.ArgumentParser(description="")
    arg_parser.add_argument(
        "-n", "--number-of-levels", type=int, default=1, help="")
    arg_parser.add_argument("url", nargs=1, help="target URL")
    parsed = arg_parser.parse_args()

    # ``url`` is parsed as a one-element list.
    target_url = parsed.url.pop()
    deep = parsed.number_of_levels
    currentDeep = 1

    Crawler().startCrawler(target_url, currentDeep, deep)
def test_blocks():
    """Check the transaction counts of a random sample of blocks.

    For 100 random block numbers, request
    https://etherchain.org/api/block/:block/tx for the list of
    transactions in that block, and compare its length with the
    transactions of the locally stored (mongo) block.
    """
    crawler = Crawler.Crawler(start=False)
    collection = crawler.mongo_client
    sample = random.sample(range(1, 1700000), 100)
    sample_size = len(sample)
    num_error = "Incorrect number of transactions in {}% of {} blocks."

    # Numbers of the blocks whose transaction count differs.
    mismatched = []
    for local_block in collection.find({"number": {"$in": sample}}):
        number = local_block["number"]
        uri = "https://etherchain.org/api/block/{}/tx".format(number)
        remote = json.loads(requests.get(uri).text)
        # Compare the number of transactions in the block.
        if len(remote["data"]) != len(local_block["transactions"]):
            mismatched.append(number)

    mismatch_count = len(mismatched)
    pprint.pprint(mismatched)
    assert mismatch_count == 0, num_error.format(
        100.0 * mismatch_count / sample_size, sample_size)
def __sync_thread(self):
    """Endlessly crawl, tag and sync resources, sleeping between rounds.

    Each round crawls with the current black list, white list and
    already-crawled set, tags the result, and sends it with ``SyncAgent``.
    On success the crawled-resources set is updated. On failure the error
    is printed and the round is retried after the configured interval.
    """
    while True:
        try:
            resources_tags = self.auto_tagger.process(
                Crawler(
                    self.get('black-list'),
                    self.get('white-list'),
                    self.get('crawled-resources'),
                ).crawl())
            SyncAgent(
                self.get('settings')['server'],
                self.get('settings')['user-token'],
                self.get('settings')['device-token'],
            ).sync(resources_tags)
        except Exception as new_exception:
            # Format the exception itself: ``.message`` does not exist on
            # Python 3 exceptions, so reading it raised AttributeError
            # inside the handler and ended the loop.
            print('[ERROR]: When trying to sync: {0}'.format(new_exception))
        else:
            # Only remember resources after a successful sync.
            self.get('crawled-resources').update(
                set(resource for resource, _ in resources_tags))
        time.sleep(self.get('settings')['sync']['interval'])
def __init__(self):
    """Load the week's data from the JSON cache, or crawl for it.

    When ``../CafeAPI/data.json`` exists it is loaded into ``self.base``.
    Otherwise the ``Crawler`` is run up to 4 times until one attempt
    succeeds, and the result is stored in ``self.base`` and written to
    disk. If every attempt fails, ``self.base`` is left unset.
    """
    # Check whether a JSON file with the data for the week exists.
    # TODO: Check if the file is up to date
    if os.path.isfile('../CafeAPI/data.json'):
        # Read the cached data.
        with open('../CafeAPI/data.json', 'r') as f:
            self.base = json.load(f)
        print("Database: Retrieved data from file")
    else:
        # Retry a few times (4 attempts) in case of unstable internet.
        for i in range(4):
            try:
                # Release the crawler, navigate and collect the data.
                with Crawler() as c:
                    c.nav()
                    self.base = c.get_info()
                # Write the data to a file for future reference.
                # NOTE(review): this path differs from the one checked
                # above ('../CafeAPI/data.json') - confirm which is meant.
                with open('../Throwaway/CafeAPI/data.json', 'w') as f:
                    json.dump(self.base, f)
                print(f"Database: Retrieved data from Crawler on try #{i}")
                break
            except Exception:
                # Something failed, so retry. ``Exception`` rather than a
                # bare ``except`` so Ctrl-C and SystemExit still get out.
                print(
                    f"Database: Something went wrong, loading data retry #{i}"
                )
    print("Database: Initiated Data Collection")
class Getter():
    """Collect proxies from every crawl function until the pool is full."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Judge whether the number limit of proxies is reached."""
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        """Call each crawl function in turn, stopping once the pool is full."""
        total_funcs = self.crawler.__CrawlFuncCount__
        print('crawlerCount: ', total_funcs)
        for index in range(total_funcs):
            # Re-check the pool size before every crawl function.
            if self.is_over_threshold():
                print('The proxies in the pool is too many!')
                break
            crawl_func = self.crawler.__CrawlFunc__[index]
            for proxy in self.crawler.get_proxies(crawl_func):
                self.redis.add(proxy)
def __init__(self, start_month="2017/01"):
    """Set up the crawler and the statistics-bureau indicator table.

    :param start_month: first month, ``"YYYY/MM"``. It is passed to
        ``Crawler`` and also stored without the slash in ``_sj_start``.
    """
    # Other web pages.
    self._crawler = Crawler(start_month=start_month)
    # National Bureau of Statistics web page.
    self._url = "http://data.stats.gov.cn/easyquery.htm?"
    self._sj_start = start_month.replace("/", "")
    # Indicator name -> code. The commented-out entries below are
    # disabled and kept only for reference.
    self._index_dict = {
        "制造业采购经理指数(%)": "A0B0101",
        "非制造业商务活动指数(%)": "A0B0201",
        "工业生产者出厂价格指数(上年同月=100)": "A01080101",
        "工业增加值累计增长(%)": "A020102",
        "房地产投资累计值(亿元)": "A060101",
        "流通中现金(M0)供应量期末值(亿元)": "A0D0105",
        "货币(M1)供应量期末值(亿元)": "A0D0103",
        "货币和准货币(M2)供应量期末值(亿元)": "A0D0101",
        "GDP": "A010201",
        # "生产指数": "A0B0102",
        # "新订单指数": "A0B0103",
        # "新出口订单指数": "A0B0104",
        # "在手订单指数": "A0B0105",
        # "产成品库存指数": "A0B0106",
        # "采购量指数": "A0B0107",
        # "进口指数": "A0B0108",
        # "出厂价格指数": "A0B0109",
        # "主要原材料购进价格指数": "A0B010A",
        # "原材料库存指数": "A0B010B",
        # "从业人员指数": "A0B010C",
        # "供应商配送时间指数": "A0B010D",
        # "生产经营活动预期指数": "A0B010E",
        #
        # "新订单指数": "A0B0202",
        # "新出口订单指数": "A0B0203",
        # "在手订单指数": "A0B0204",
        # "存货指数": "A0B0205",
        # "投入品价格指数": "A0B0206",
        # "销售价格指数": "A0B0207",
        # "从业人员指数": "A0B0208",
        # "供应商配送时间指数": "A0B0209",
        # "业务活动预期指数": "A0B020A",
        #
        # "综合PMI产出指数": "A0B0301"
    }
    # Indicators listed under "by quarter" (presumably reported
    # quarterly - confirm against the code that reads this list).
    self._data_by_quarter = ["GDP"]
def main(self):
    """Run the whole export: optional bootstrap, crawl, post-process, write.

    The ``temp`` directory is removed at the end.
    """
    # Bootstrap only when the config names a bootstrap source.
    if self.config.has_option("sources", "bootstrap"):
        self.bootstrap(filename=self.config.get("sources", "bootstrap"))

    endpoint = self.config.get("xserver", "endpoint")
    base_name = self.config.get("xserver", "base")
    server_base = Base(endpoint=endpoint, base=base_name)

    Crawler(base=server_base).crawl(callback=self.callback)

    self.processCache()
    self.addTopConcepts()
    self.addLinks()
    self.write()
    self.writeTables()
    shutil.rmtree("temp")
def main():
    """Crawl the site rooted at the ``root`` command-line argument.

    ``root`` must begin with ``http://`` or ``https://``. Anything else
    ends the program with a usage error.
    """
    argparser = argparse.ArgumentParser(description="Scrapes a Web site and writes the generated HTML to disk for caching")
    argparser.add_argument('root', help='The starting point URL for the crawl (beginning with http:// or https://)')
    args = argparser.parse_args()

    # Validate explicitly: an ``assert`` is stripped under ``python -O``
    # and would let a bad URL through. ``error()`` prints the usage text
    # and exits with status 2.
    if not args.root.startswith(('https://', 'http://')):
        argparser.error('root must begin with http:// or https://')

    policy = ScrapingPolicy(args.root)
    Crawler(policy).crawl()
async def main():
    """Wire store, processor and crawler together and crawl the initial URLs.

    The crawler is created with ``max_depth=3`` and ``verbose=True``.
    """
    store = WebpageStore()
    processor = WebpageProcessor(store)
    crawler = Crawler(processor, max_depth=3, verbose=True)

    # Alternative seeds used during development:
    #   ["https://en.wikipedia.org/wiki/Web_scraping"]
    #   [f"https://swapi.co/api/people/{i}" for i in range(1, 3)]
    seeds = get_initial_urls()
    await crawler.run(seeds)
def test_extract3(self):
    """Test extracting with several extractors.

    In this case some targets of the extractors do not exist.
    """
    self.maxDiff = None
    crawler = Crawler(extractors=[
        self.Extractor_1('title'),
        self.Extractor_2('para'),
        self.Extractor_3('gift1'),
        self.Extractor_4('giftTitles'),
    ])
    page_url = self.testUrls[1]
    page_bs = crawler.getPageBs(page_url)
    info = crawler.extract(page_url, page_bs)

    para = (
        'Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. '
        'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. '
        'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. '
        'Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.'
    )

    # The two paragraph lines may appear in either order.
    pattern1 = self.clean(
        f'Get title: An Interesting Title\nGet fakeLatin: {para}\nGet classIsBody: {para}\nFailed to get gift1: .*\nFailed to get giftTitles: .*'
    )
    pattern2 = self.clean(
        f'Get title: An Interesting Title\nGet classIsBody: {para}\nGet fakeLatin: {para}\nFailed to get gift1: .*\nFailed to get giftTitles: .*'
    )
    # The returned information must match one of the patterns.
    flattened = info.replace('\n', ' ')
    self.assertTrue(re.match(f'({pattern1})|({pattern2})', flattened))

    # Every extracted record file must hold the expected content.
    expected_records = (
        ('tmp/title.json', {'0': {'title': 'An Interesting Title'}}),
        ('tmp/para.json', {'0': {'fakeLatin': para, 'classIsBody': para}}),
        ('tmp/gift1.json', {'0': {'gift1': ''}}),
        ('tmp/giftTitles.json', {'0': {'giftTitles': ''}}),
    )
    for record_path, expectDict in expected_records:
        with open(record_path, 'r') as fin:
            self.assertEqual(json.loads(fin.read()), expectDict)
def runScan(target):
    """Scan *target* for auth-bypass, weak-password and SQL-injection findings.

    Returns early with nothing saved when ``crawler.init(target)`` fails.
    Otherwise crawls, runs the login-form checks and the SQL-injection
    scans, and saves the collected findings.
    """
    crawler = Crawler()
    print("Scanning: ", target)
    findings = {"target": target, "sqlinjection": [], "WeakPassword": []}

    if not crawler.init(target):
        return

    crawler.crawl()
    crawler.findLoginPanel()

    # Login-form checks.
    AuthBypass.check_authbypass(crawler.loginFormEndpoints, findings)
    WeakPasswords.check_weak_passwords(crawler.loginFormEndpoints, findings)
    has_login_form = len(crawler.loginFormEndpoints) > 0
    findings["loginForm"] = "yes" if has_login_form else "no"

    # SQL-injection checks on URL and form endpoints.
    sqli_scan_urls(crawler.uEndPoints, findings)
    sqli_scan_forms(crawler.fEndpoints, findings)

    CommonFunctions.save_findings(findings)
def run(self):
    """Start the crawlers, wait for the queue to drain, then shut them down."""
    # Spawn the crawlers and start crawling.
    for index in range(self.no_crawlers):
        worker = Crawler(index, self.queue, self.visited_urls, self.mutex,
                         self.excluded, self.target_domain, self.robotparser)
        self.crawlers.append(worker)
        worker.start()

    # Wait until every queued item has been processed.
    self.queue.join()

    # One ``None`` per crawler tells each of them to stop.
    for _ in range(self.no_crawlers):
        self.queue.put(None)
    self.queue.join()

    # Wait for all crawlers to exit.
    for worker in self.crawlers:
        worker.join()
def build(self, keyWord, num, bfs=False):
    """Return a ``Crawler`` whose queue is seeded with the top-10 URLs.

    :param keyWord: search keyword handed to ``__getTop10``.
    :param num: passed to ``Crawler`` as its first argument.
    :param bfs: a ``WebQueue`` is used only when this is exactly
        ``False``; any other value selects a ``BfsQueue``.
    """
    queue = WebQueue() if bfs is False else BfsQueue()
    # Every seed URL is offered with the value 0.
    for seed_url in self.__getTop10(keyWord):
        queue.offer(seed_url, 0)
    return Crawler(num, queue)
def crawl(max_page):
    """Crawl up to *max_page* queued URLs, reporting progress in the text box.

    Stops early when the queue runs out. Afterwards shows the totals and
    saves the crawler's lists.
    """
    text.delete('1.0', END)
    text.insert(END, 'Currently Crawling Please Wait\n')
    search_engine.update()

    remaining = int(max_page)
    while len(Crawler.queue) > 0 and remaining > 0:
        current_url = str(Crawler.queue.pop())
        Crawler.crawl(current_url)
        remaining -= 1
        text.insert(END, 'Currently Crawling: ' + current_url + '\n')
        search_engine.update()

    print('Crawl Finished Can Now Search')
    text.delete('1.0', END)
    summary_lines = (
        'Crawl Finished Can Now Search\n',
        str(len(Crawler.crawled)) + " Url's have been Crawled and Indexed \n",
        str(len(Crawler.queue)) + " Total Number of Url's In Queue\n",
    )
    for line in summary_lines:
        text.insert(END, line)
    search_engine.update()
    Crawler.save_lists()
def get_list(section, date):
    """Collect the article URLs listed for *section* on *date*.

    Walks the paginated NAVER news list starting at page 1 and stops when
    a page repeats an already-seen first article, has no articles, or has
    no list container.

    :param section: value for the ``sid1`` query parameter.
    :param date: value for the ``date`` query parameter.
    :return: list of article URLs in page order.
    """
    # NAVER news url
    naver_news_url = 'http://news.naver.com/main/list.nhn'
    naver_news_parameter = {'mode': 'LSD', 'mid': 'sec', 'sid1': '',
                            'date': date, 'page': ''}
    naver_news_parameter['sid1'] = section

    page = 1
    url_list = []
    seen = set()  # same URLs as url_list, for O(1) membership tests
    while True:
        naver_news_parameter['page'] = page
        url = naver_news_url + '?' + urllib.urlencode(naver_news_parameter)

        # Get and parse the HTML data.
        crawler = Crawler()
        web_data = crawler.get_page(url)
        soup = BeautifulSoup(web_data, 'html.parser')

        containers = soup('div', {'class': 'list_body newsflash_body'})
        if not containers:
            # No list on this page: treat it as the end instead of
            # crashing with IndexError.
            break

        # Collect each article's URL.
        current_list = [item.find('a')['href']
                        for item in containers[0].findAll('li')]

        # An empty page, or one whose first article was already seen,
        # means the end of the list has been reached.
        if not current_list or current_list[0] in seen:
            break

        url_list += current_list
        seen.update(current_list)
        page += 1
    return url_list
def getWebPage(self, URL, depth):
    '''
    Retrieve all the text data from a web page or a set of linked pages.

    @param URL: URL which is going to be the source
    @param depth: the depth of the links from the URL which should be
                  searched; 0 fetches only the URL itself
    @return: all text from the fetched pages, encoded as ASCII with
             non-ASCII characters dropped
    '''
    if int(depth) != 0:
        crawler = Crawler(URL, int(depth) - 1)
        crawler.crawl()
        pieces = []
        for l in crawler.links_remembered:
            text = self.Alchemy.URLGetText(str(l.dst))
            element = ET.XML(text)
            # findtext() returns None when there is no <text> element;
            # treat that as empty text instead of failing on ``str + None``.
            pieces.append(element.findtext("text") or "")
        # One join instead of repeated ``+=`` on a growing string.
        t = "".join(pieces)
    else:
        text = self.Alchemy.URLGetText(URL)
        element = ET.XML(text)
        t = element.findtext("text") or ""
    return t.encode('ascii', 'ignore')
def test__does_static_file_exist(self):
    """Status codes 200/300/304 count as existing; 404/500 do not."""
    expectations = (
        ("200", True),
        ("300", True),
        ("304", True),
        ("404", False),
        ("500", False),
    )
    crawler = Crawler("http://test.com")
    with mock.patch("Crawler.requests") as mock_requests:
        # Every HEAD request returns the same mock response; only its
        # status code is changed between checks.
        mock_response = mock.Mock()
        mock_requests.head.return_value = mock_response
        for status_code, expected in expectations:
            mock_response.status_code = status_code
            self.assertEqual(crawler._does_static_file_exist(""), expected)
# NOTE(review): this definition is damaged in this copy of the file. The
# text between the "Username: ", "Password: " and "Please enter your
# starting IP address: " prompts has been replaced by a "******" mask, so the
# assignments of ``password`` and ``ip_address`` are missing. The ``try:``
# that belongs to the final ``except IOError`` and the definition of
# ``working_file`` are also not visible. Restore it from the unmasked source
# before editing; the code below is left exactly as found.
def topology_generator(): '''Creates the Crawler object and calls all the functions to fill a file with the information.''' username = raw_input("Username: "******"Password: "******"Please enter your starting IP address: ") crawler = Crawler(ip_address, username, password) for address in crawler.address_list: crawler.update_address(address) err_flag = get_lldp.get_switch(crawler, working_file) if not err_flag: l3_scrape.bgpcreation(crawler, working_file) print "\nScript complete! Check the 'neighbors.txt' file that has been generated.\n" except IOError, e: print e