def test_attributes(self):
    """Test the setting of attributes.

    1. Test default attributes.
    2. Test user-defined attributes. All arguments are different from the
       default settings.
    """
    # Test default attributes
    patterns = {
        'scheme_pattern': r'http|https',
        'domain_pattern': r'.*',
        'path_pattern': r'.*',
    }
    c = Crawler()

    ## Test the WorkingListManager
    self.assertTrue(isinstance(c.WLM, WorkingListManager))
    self.assertFalse(c.WLM.workExists())

    ## Test the UrlMatcher
    self.assertTrue(isinstance(c.UM, UrlMatcher))
    for k, p in patterns.items():
        self.assertEqual(getattr(c.UM, k), p)

    ## Test other attributes
    self.assertFalse(c.curUrl)
    self.assertFalse(c.extractors)
    self.assertTrue(c.autoAddInternalLinks)

    #----------------------------#

    # Test user-defined attributes
    extractors = [
        self.Extractor_1('title'),
        self.Extractor_2('para'),
        self.Extractor_3('gift1'),
        self.Extractor_4('giftTitles')
    ]
    patterns = {
        'scheme_pattern': 'http',
        'domain_pattern': 'www.yahoo.com',
        'path_pattern': r'/path/page.*$',
    }
    c = Crawler(workingList=self.testUrls, **patterns,
                extractors=extractors, autoAddInternalLinks=False)

    ## Test the WorkingListManager
    self.assertTrue(c.WLM.workExists())
    self.assertEqual(c.WLM.records, deque(self.testUrls))

    ## Test the UrlMatcher
    self.assertTrue(isinstance(c.UM, UrlMatcher))
    for k, p in patterns.items():
        self.assertEqual(getattr(c.UM, k), p)

    ## Test other attributes
    self.assertFalse(c.curUrl)
    self.assertEqual(c.extractors, extractors)
    self.assertFalse(c.autoAddInternalLinks)
def set_crawlers(self):
    old_page = Page.get_or_create(self.db, self.project_config.project_name,
                                  self.versions.old, Url.clean_url(Constants.DOCKER_URL))
    new_page = Page.get_or_create(self.db, self.project_config.project_name,
                                  self.versions.new, Url.clean_url(Constants.DOCKER_URL))
    self.old_crawler = Crawler(old_page, self.old_project.port)
    self.new_crawler = Crawler(new_page, self.new_project.port)
def test_scrape_keyword(self):
    lista = [
        "https://www.cucineluberoma.it",
        "https://lubecreomilano.it/",
    ]
    crw = Crawler()
    crw.scrape_keyword(lista)
def __init__(self):
    # Checks to see if a json file with the data for the week exists
    # TODO: Check if the file is up to date
    if os.path.isfile('../CafeAPI/data.json'):
        # Reads the data from the file into a variable
        with open('../CafeAPI/data.json', 'r') as f:
            self.base = json.load(f)
        print("Database: Retrieved data from file")
    else:
        # Run the Crawler up to 4 times for more stability in case of unstable internet
        for i in range(4):
            try:
                # Release Crawler
                with Crawler() as c:
                    # Navigate and collect data
                    c.nav()
                    # Set data to variable
                    self.base = c.get_info()
                    # Write the data to a file for future reference
                    with open('../Throwaway/CafeAPI/data.json', 'w') as f:
                        json.dump(self.base, f)
                # Break if all of the above works successfully
                print(f"Database: Retrieved data from Crawler on try #{i}")
                break
            except Exception:
                # This means that something failed and the program has to retry
                print(f"Database: Something went wrong, loading data retry #{i}")
    print("Database: Initiated Data Collection")
def step2_download_zipfiles():
    desktop_path = Helper.get_desktop_dir()
    directory = os.path.join(desktop_path, RAW_DATA_PATH)
    if not os.path.exists(directory):
        os.makedirs(directory)

    db = Database()
    currency_list = db.get_currency_list()
    crawler_list = [Crawler(db) for x in range(THREAD_NUMBER)]
    lock = threading.RLock()

    def down_data(crawler):
        while True:
            # Take the next currency under the lock so two workers cannot grab
            # the same item or read from an already emptied list.
            with lock:
                if not currency_list:
                    break
                currency = currency_list.pop(0)
            crawler.download_historical_data(currency["symbol"], currency["time"], directory)
        crawler.quit()

    for crawler in crawler_list:
        t = threading.Thread(target=down_data, args=(crawler, ))
        t.start()
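# step2_download_zipfiles() above starts its worker threads but never waits for them,
# so the caller can return while downloads are still running. A minimal sketch of the
# same loop that keeps the Thread objects and joins them before returning; the
# down_data worker and crawler_list are assumed to be the ones defined above.
import threading

def start_and_wait(crawler_list, down_data):
    threads = []
    for crawler in crawler_list:
        t = threading.Thread(target=down_data, args=(crawler,))
        t.start()
        threads.append(t)
    # Block until every worker has drained the shared work list and quit.
    for t in threads:
        t.join()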
def start():
    '''Entry point of the crawling process: take one node at a time and crawl it.'''
    # Initialization
    mongo_peoples, redis_client = Init()

    # Wait while the set of nodes waiting to be crawled is empty
    while redis_client.scard(waiting_set) == 0:
        # Empty: sleep for wait_time seconds
        time.sleep(wait_time)

    # Pop a random node from the (right end of the) waiting set
    node = redis_client.spop(waiting_set)
    urlToken = node

    # Crawl the profile of the user this node represents
    # printx('Preparing proxy...')
    printx('Crawling the profile of user %s ...' % urlToken)
    try_cnt = try_limit
    while try_cnt > 0:
        try:
            c = Crawler(isCookie=False, timeout=socket_timeout)
            # Manually set a proxy IP
            ip = proxyip.get()
            c.set_proxyip(ip)
            people = get_Info(c, urlToken)
            if people is None:
                raise Exception('The crawled user profile is empty')
        except Exception as e:
            try_cnt -= 1
            print(e)
            printx('Error crawling the profile of user %s, %d retries left' % (urlToken, try_cnt))
        else:
            break
def load_more(url):
    crawler = Crawler()
    crawler.get(url)
    assert "Influencer Love | Fashion ID" in crawler.title, "TITLE INCORRECT"
    try:
        times_clicked = 0
        start = int(time())
        while True:
            # close the tab if we accidentally trip a Twitter tab open
            if "Twitter" in crawler.getTitle():
                crawler.driver.find_element_by_tag_name('body').send_keys(Keys.COMMAND + 'w')
                any_button = crawler.findElementsByXPath("//a")[0]
                any_button.send_keys(Keys.COMMAND + 'w')
                crawler.closeExtraTabs()
            # find the load-more button
            load_more_button = crawler.findElementByXPath("//a[@id='ctf-more']")
            crawler.highlight("//a[@id='ctf-more']")
            crawler.click(load_more_button)
            times_clicked += 1
            print('%s CLICKS' % times_clicked)
            crawler.closeExtraTabs()
    except Exception as e:
        print('EXCEPTION', e)
        crawler.close()
        end = int(time())
        print(start)
        print(end)
        print('TOTAL TIME ELAPSED: %s' % (end - start))
def main():
    if len(sys.argv) > 1 and len(sys.argv) < 5:
        context = sys.argv[1]
        place = "./"
        clean = False
        if len(sys.argv) == 3:
            if sys.argv[2] == "--clean":
                clean = True
            else:
                place = sys.argv[2]
        elif len(sys.argv) == 4:
            place = sys.argv[2]
            if sys.argv[3] == "--clean":
                clean = True
            else:
                print("Unrecognized option '" + sys.argv[3] + "'")
        with Crawler(place, context) as spider:
            if not clean:
                if (os.path.exists(spider.listFileName) and
                        os.path.exists(spider.visitedFileName)):
                    spider.loadState(spider.listFileName, spider.visitedFileName)
                else:
                    print("Could not load one or more of the URL lists.")
            spider.recursivePull()
    else:
        printHelp()
def random_page(url):
    crawler = Crawler()
    crawler.get(url)
    assert "Urban Dictionary" in crawler.title, "TITLE INCORRECT"
    try:
        # find random page
        random_button = crawler.findElementByXPath(
            "//a[@class='circle-button' and @href='/random.php']")
        crawler.highlight(
            "//a[@class='circle-button' and @href='/random.php']")
        crawler.click(random_button)
        # extract content
        content = {}
        content["word"] = crawler.findElementByXPath(
            "(//a[@class='word'])[1]").text
        crawler.highlight("(//a[@class='word'])[1]")
        content["meaning"] = crawler.findElementByXPath(
            "(//div[@class='meaning'])[1]").text
        crawler.highlight("(//div[@class='meaning'])[1]")
        content_dict = dumps(content)
        return content_dict
    except Exception as e:
        # bind the exception explicitly; a bare "except:" left "e" undefined below
        print('MISSING', e)
        crawler.close()
def __sync_thread(self):
    while True:
        try:
            resources_tags = self.auto_tagger.process(
                Crawler(
                    self.get('black-list'),
                    self.get('white-list'),
                    self.get('crawled-resources'),
                ).crawl())
            SyncAgent(
                self.get('settings')['server'],
                self.get('settings')['user-token'],
                self.get('settings')['device-token'],
            ).sync(resources_tags)
        except Exception as new_exception:
            # format the exception itself (the .message attribute is not available on Python 3)
            print('[ERROR]: When trying to sync: {0}'.format(new_exception))
        else:
            self.get('crawled-resources').update(
                set(resource for resource, _ in resources_tags))
        time.sleep(self.get('settings')['sync']['interval'])
def test_blocks():
    """
    Check transactions in each of a random sample of blocks.

    Send a request to https://etherchain.org/api/block/:block/tx to get a
    list of all transactions that occurred in that block. Cross-reference
    with the transactions in the local block (in mongo).
    """
    c = Crawler.Crawler(start=False)
    client = c.mongo_client
    sample = random.sample(range(1, 1700000), 100)
    N = len(sample)
    # Track the number of times the number of transactions is different.
    wrong_blocks = list()
    num_error = "Incorrect number of transactions in {}% of {} blocks."
    blocks = client.find({"number": {"$in": sample}})
    for block in blocks:
        n = block["number"]
        uri = "https://etherchain.org/api/block/{}/tx".format(n)
        ethchain = json.loads(requests.get(uri).text)
        # Check the number of transactions in the block
        if len(ethchain["data"]) != len(block["transactions"]):
            wrong_blocks.append(n)
    wrong_nums = len(wrong_blocks)
    pprint.pprint(wrong_blocks)
    assert wrong_nums == 0, num_error.format(100.0 * wrong_nums / N, N)
def check_with_cosine():
    url_file = open("categories/index/url.txt", encoding="utf-8")
    urls = url_file.read().split()
    login_vector_file = open("categories/login/vector.txt", encoding="utf-8")
    index_vector_file = open("categories/index/vector.txt", encoding="utf-8")
    register_vector_file = open("categories/register/vector.txt", encoding="utf-8")
    login_vector = login_vector_file.read().split()
    for i in range(0, len(login_vector)):
        login_vector[i] = int(login_vector[i])
    indexVector = index_vector_file.read().split()
    registerVector = register_vector_file.read().split()
    crawler = Crawler()
    for url in urls:
        # url = "https://mail.sjtu.edu.cn/"
        try:
            words = open("categories/words.txt", encoding='utf-8').read().split("\n")
            vector = crawler.word_frequency_statistics_by_url(url, words)
            print(login_vector)
            print(vector)
            print("Possibility of " + url + " being a login page is " +
                  str(cos(login_vector, vector)))
        except Exception:
            continue
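# check_with_cosine() above relies on a cos(a, b) helper that is not shown here; it is
# assumed to compute the cosine similarity of two equal-length word-frequency vectors.
# A minimal, dependency-free sketch of such a helper (the name cos is kept only to
# match the call above):
import math

def cos(a, b):
    """Cosine similarity of two equal-length numeric vectors (0.0 if either is all zeros)."""
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)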
def runScan(target):
    crawler = Crawler()
    print("Scanning: ", target)
    findings = {"target": target, "sqlinjection": [], "WeakPassword": []}
    if not crawler.init(target):
        return
    crawler.crawl()
    crawler.findLoginPanel()
    AuthBypass.check_authbypass(crawler.loginFormEndpoints, findings)
    WeakPasswords.check_weak_passwords(crawler.loginFormEndpoints, findings)
    if len(crawler.loginFormEndpoints) > 0:
        findings["loginForm"] = "yes"
    else:
        findings["loginForm"] = "no"
    sqli_scan_urls(crawler.uEndPoints, findings)
    sqli_scan_forms(crawler.fEndpoints, findings)
    CommonFunctions.save_findings(findings)
def get_per_followerList(urlToken, page, sum_page):
    '''Crawl one page of the follower list.'''
    printx('Crawling page %d/%d ...' % (page, sum_page))
    try_cnt = try_limit
    follower_list = []
    while try_cnt > 0:
        try:
            # Set up the crawler
            c = Crawler(isCookie=False, timeout=socket_timeout)
            # Manually set a proxy IP
            ip = proxyip.get()
            c.set_proxyip(ip)
            # Parse the HTML of the current page
            url = '%s/people/%s/followers?page=%d' % (host, urlToken, page)
            html = c.get_html(url)
            s = BS(html, 'html.parser')
            # Get all followers on the current page
            data = s.find('div', attrs={'id': 'data'})['data-state']
            data = json.loads(data)
            items = data['people']['followersByUser'][urlToken]['ids']
            for item in items:
                if item != None and item != False and item != True and item != '知乎用户'.decode('utf8'):
                    node = item.encode('utf8')
                    follower_list.append(node)
        except Exception as e:
            try_cnt -= 1
            # printx(e)
            printx('User %s: error crawling page %d, %d retries left' % (urlToken, page, try_cnt))
        else:
            break
def main(token, args):
    config = args.config
    storageConfig = config['storage']
    rootLogger.setLevel(args.log or config['general']['log_level'].upper())

    crawler = Crawler(config)
    RawEvent.BASE_TZ = config['crawler']['defaults']['timezone']

    repo, icsSha, jsonSha, jsonContent = loadFromGit(token, storageConfig)
    if not repo:
        return 1

    importEvents = json.loads(jsonContent or '[]')
    if importEvents and not args.ignore_previous_crawls:
        crawler.importJSON(importEvents)

    crawler.discover()
    crawler.resolve()

    exportedJSON = crawler.exportJSON(force=args.force_write)
    exportedICS = crawler.exportICS(force=args.force_write)
    if exportedJSON is False or exportedICS is False:
        logger.info('No new events')
        return 0

    res = storeToGit(repo, storageConfig, icsSha, exportedICS, jsonSha, exportedJSON)
    return 0 if res else 1
def get_article(self, url):
    crawler = Crawler()
    # get html data from url
    web_data = crawler.get_page(url)
    soup = BeautifulSoup(web_data, 'html.parser')
    # remove link news
    for e in soup('div', {'class': 'link_news'}):
        e.extract()
    # article title
    self.title = soup('h3', {'id': 'articleTitle'})[0].text
    # creation date and time of the article
    date_time = soup('span', {'class': 't11'})[0].text.split()
    self.date = date_time[0]
    self.time = date_time[1]
    # press name
    press_logo = soup('div', {'class': 'press_logo'})[0]
    self.press = press_logo.find('img')['alt']
    del press_logo
    # article contents
    self.contents = soup('div', {'id': 'articleBodyContents'})[0].text
    self.contents = re.sub('[\n\r]', '', self.contents)
class Main(Process):
    urlToCrawl = ''
    crawling = False
    crawler = Crawler()
    db = DbHandler()

    def __init__(self, url=None):
        global urlToCrawl
        if url is not None:
            urlToCrawl = url

    def start(self, url):
        global urlToCrawl, crawler
        urlToCrawl = url
        self.idle()

    def printen(self, url):
        print url

    def idle(self):
        while self.db.getCrawlstate('crawler')[0]:
            while not Main.crawling:
                status = self.db.getCrawlstate(urlToCrawl)
                if status[0]:
                    Main.crawling = True
                else:
                    time.sleep(1800)
            Main.crawling = False
            Main.crawler.startCrawler(urlToCrawl, status[1])
def test_getPageBs(self):
    """Test the method of getting the :obj:`BeautifulSoup` object from a web page.

    1. Test getting page bs object from an available page
    2. Test getting page bs object from an unavailable page
    """
    c = Crawler()

    # Test getting page bs object from an available page
    bs = c.getPageBs(self.testUrls[0])
    self.assertTrue(isinstance(bs, BeautifulSoup))  # Returned data type correctness
    self.assertEqual(
        self.strIO.getvalue(),
        f'Getting: {self.testUrls[0]}\n')  # Standard output correctness
    self.assertTrue(c.curUrl)  # Current url existence
    self.assertEqual(c.curUrl, self.testUrls[0])  # Current url attribute correctness

    #----------------------------------------------------#

    # Test getting page bs object from an unavailable page
    sys.stdout = self.strIO = io.StringIO()
    bs = c.getPageBs('')
    self.assertFalse(c.curUrl)
    self.assertFalse(bs or isinstance(bs, BeautifulSoup))
    self.assertTrue(
        re.match(f'Getting: \nFailed to get : ', self.strIO.getvalue()))
def test_printInfo(self):
    """Test the :obj:`printInfo` method."""
    c = Crawler(workingList=self.testUrls)
    c.printInfo('this is a message')
    expectedStr = f'Result:\nthis is a message\nRemained Work Amount: 3\n{"-"*20}\n'
    self.assertEqual(self.strIO.getvalue(), expectedStr)  # Standard output string correctness
def step1_get_currency_list():
    db = Database()
    crawler = Crawler(db)
    currency_list, time_list = crawler.get_currency_list_with_url(DEFAULT_SITE_URL + CURRENCYLIST_URL)
    currency_list_dict = [{"symbol": currency_list[index], "time": int(time_list[index])}
                          for index in range(len(currency_list))]
    db.currency_list.insert_many(currency_list_dict)
    crawler.quit()
    db.close()
def main():
    argparser = argparse.ArgumentParser(
        description="Scrapes a Web site and writes the generated HTML to disk for caching")
    argparser.add_argument(
        'root',
        help='The starting point URL for the crawl (beginning with http:// or https://)')
    args = argparser.parse_args()
    assert args.root.startswith(('https://', 'http://'))
    policy = ScrapingPolicy(args.root)
    Crawler(policy).crawl()
async def main():
    webpage_store = WebpageStore()
    webpage_processor = WebpageProcessor(webpage_store)
    crawler = Crawler(webpage_processor, max_depth=3, verbose=True)
    # initial_urls = ["https://en.wikipedia.org/wiki/Web_scraping"]
    # initial_urls = [f"https://swapi.co/api/people/{i}" for i in range(1, 3)]
    initial_urls = get_initial_urls()
    await crawler.run(initial_urls)
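# main() above is a coroutine, so it has to be scheduled on an event loop to run.
# A typical entry point, assuming Python 3.7+ and that this sits in the same module:
import asyncio

if __name__ == "__main__":
    asyncio.run(main())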
def run_crawler(self):
    while True:
        try:
            crawler = Crawler(
                host=self.__host,
                port=self.__port,
                key=self.__key)
            crawler.run()
        except Exception:
            pass
        sleep(self.__cr_time)
def download_articles_from(titles_list):
    crawler = Crawler()
    print("Starting download")
    pool = ThreadPoolExecutor(max_workers=5)
    for title in titles_list:
        pool.submit(crawler.search, title)
    pool.shutdown(wait=True)
    crawler.write_fails()
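# ThreadPoolExecutor also works as a context manager, which calls shutdown(wait=True)
# automatically on exit. An equivalent sketch of the function above, assuming the same
# Crawler.search and Crawler.write_fails methods; the crawler is passed in only to keep
# the sketch self-contained:
from concurrent.futures import ThreadPoolExecutor

def download_articles_from_ctx(titles_list, crawler):
    print("Starting download")
    with ThreadPoolExecutor(max_workers=5) as pool:
        for title in titles_list:
            pool.submit(crawler.search, title)
    # All submitted searches have completed once the with-block exits.
    crawler.write_fails()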
def test_download(self):
    crawler = Crawler()
    with open(r'C:\Users\matti\OneDrive\Desktop\lista.txt') as f:
        content = f.readlines()
    # you may also want to remove whitespace characters like `\n` at the end of each line
    content = [x.strip() for x in content]
    for x in content:
        print(x)
    crawler.scrape_photos("https://www.cucineluberoma.it/", content)
def test_extendWorkingList(self):
    """Test the method of extending the working list.

    1. Test extending an empty working list.
    2. Test extending a non-empty working list.
    """
    # Test extending an empty working list
    c = Crawler()
    self.assertFalse(c.WLM.records)
    c.extendWorkingList(self.testUrls[:2])
    self.assertTrue(c.WLM.records)
    self.assertEqual(c.WLM.records, deque(self.testUrls[:2]))

    # Test extending a non-empty working list
    c = Crawler(workingList=self.testUrls[:1])
    self.assertTrue(c.WLM.records)
    c.extendWorkingList(self.testUrls[1:2])
    self.assertEqual(c.WLM.records, deque(self.testUrls[:2]))
def setup_project(self, version: str, db: Session) -> bool:
    project = self.deploy_version(version)
    if project is not None:
        self.projects[version] = project
        page = Page.get_or_create(db, self.project_config.project_name, version,
                                  Url.clean_url(Constants.DOCKER_URL))
        self.crawlers[version] = Crawler(page, self.projects[version].port)
        return True
    return False
def main() -> None:
    urls = None
    with open('./urls.json') as f:
        urls = json.loads(f.read())
    for i in urls:
        crawler = Crawler(i, urls[i])
        crawler.crawl()
        add_data(i, crawler.sorted_time_table)
    save()
    return
def build(self, keyWord, num, bfs=False):
    if bfs is False:
        queue = WebQueue()
    else:
        queue = BfsQueue()
    top10List = self.__getTop10(keyWord)
    for url in top10List:
        queue.offer(url, 0)
    return Crawler(num, queue)
def __init__(self, threads, tor=False):
    self.counter = 0
    self.threads = threads
    self.tor = tor
    self.q = (Queue.Queue(), Queue.Queue())
    self.qq = []
    for i in xrange(self.threads):
        self.qq.append((Queue.Queue(), Queue.Queue()))
        c = Crawler(self.qq[-1], self.tor)
        daemon_thread(c.run)
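# daemon_thread(c.run) above is assumed to be a small helper that runs a callable on a
# background thread marked as a daemon, so it will not block interpreter shutdown.
# A minimal sketch of such a helper (hypothetical; written to work on the same Python 2
# environment implied by the Queue/xrange usage above, and also on Python 3):
import threading

def daemon_thread(target, *args, **kwargs):
    t = threading.Thread(target=target, args=args, kwargs=kwargs)
    t.daemon = True  # daemon threads are killed when the main thread exits
    t.start()
    return t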