def GET(): """ This method fetches the information needed to create the statistic of average comment karma by user from Reddit.com and returns it to the main application. :return: data needed to calculate the average comment karma by user """ logging.debug('GetKarmaStats method called') errors = [] try: chosen_username = web.input().chosen_username except Exception as e: errors.append('GetKarmaStats error:' + str(e)) logging.error('GetKarmaStats error:' + str(e)) my_crawler = Crawler(db) result = my_crawler.retrieve_user_avg_karma(chosen_username) del my_crawler if len(errors) == 0: return json.dumps({'result': result, 'code': 0}) else: return json.dumps({ 'result': result, 'code': len(errors), 'errors': list(errors) })
def GET(): """ This method fetches the information needed to create the ranking of top users by submissions from Reddit.com and returns it to the main application. :return: data needed to create the Ranking of top users by comments score """ logging.debug('GetTopUsersCommentsScore method called') errors = [] try: chosen_subreddit = web.input().chosen_subreddit except Exception as e: errors.append('GetTopUsersCommentsScore error:' + str(e)) logging.error('GetTopUsersCommentsScore error:' + str(e)) my_crawler = Crawler(db) result = my_crawler.retrieve_total_user_comments_score( chosen_subreddit) del my_crawler if len(errors) == 0: return json.dumps({'result': result, 'code': 0}) else: return json.dumps({ 'result': result, 'code': len(errors), 'errors': list(errors) })
def GET(): """ This method fetches the information needed to create the list of submissions by user from Reddit.com and returns it to the main application. :return: data needed to create the list of submissions by user """ logging.debug('GetPostsByUser method called') errors = [] try: chosen_username = web.input().chosen_username except Exception as e: errors.append('GetPostsByUser error:' + str(e)) logging.error('GetPostsByUser error:' + str(e)) my_crawler = Crawler(db) result = my_crawler.retrieve_user_posts(chosen_username) del my_crawler if len(errors) == 0: return json.dumps({'result': result, 'code': 0}) else: return json.dumps({ 'result': result, 'code': len(errors), 'errors': list(errors) })
def test_init_positive02(self):
    my_crawler = Crawler()
    site_to_check = 'http://zabaykin.ru/?p=505'
    res = my_crawler.load_and_tokenize([site_to_check], depth=2)
    # check number of parsed urls
    self.assertEqual(len(set(res.keys())), 36)
def test_init_positive01(self):
    my_crawler = Crawler()
    site_to_check = 'http://zabaykin.ru'
    res = my_crawler.load_and_tokenize([site_to_check], depth=1)
    # check number of paragraphs
    self.assertEqual(len(res[site_to_check]), 244)
def quick_test(request):
    """ quick test page """
    form, results, url_to_test = None, None, u''
    if "POST" == request.method:
        form = QuickTestCheckForm(request.POST)
        if form.is_valid():
            url_to_test = form.cleaned_data["url"]
    if "url-to-test" in request.session:
        url_to_test = request.session.pop("url-to-test")
    if url_to_test:
        # let's check
        c = Crawler(url_to_test)
        raw_results = c.run()
        results = {
            "error": raw_results["error"],
            "results_by_category": (
                (u'External links', 'ext', raw_results["external"],
                 len(raw_results["external"]["web"]) + len(raw_results["external"]["img"])),
                (u'Internal links', 'int', raw_results["internal"],
                 len(raw_results["internal"]["web"]) + len(raw_results["internal"]["img"])),
                (u'System', 'system', raw_results["system"],
                 len(raw_results["system"]["css"]) + len(raw_results["system"]["js"])),
            )
        }
    if form is None:
        initial = {}
        if url_to_test:
            initial.update({"url": url_to_test})
        form = QuickTestCheckForm(initial=initial)
    return render_to_response('index/quick-test.html',
                              {"form": form, "results": results},
                              context_instance=RequestContext(request))
def post(self):
    data = request.form

    # Check for data validity:
    if data is None:
        error(400, "Missing one or more required fields")
    if data.get('oecilogin') is None:
        error(400, "Missing OECI login username")
    if data.get('oecipassword') is None:
        error(400, "Missing OECI login password")

    response = make_response()
    credentials = {'username': data['oecilogin'], 'password': data['oecipassword']}
    cipher = DataCipher(key=current_app.config.get("SECRET_KEY"))
    encrypted_credentials = cipher.encrypt(credentials)

    # Try to log into the OECI database
    try:
        Crawler.attempt_login(requests.Session(), credentials['username'], credentials['password'])
        response.set_cookie(
            "oeci_token",
            secure=os.getenv("TIER") == "production",
            samesite="strict",
            value=encrypted_credentials
        )
    except UnableToReachOECI:
        error(404, "Unable to reach OECI database")
    except InvalidLoginCreds:
        error(401, "Invalid login credentials")
    return response, 201
def setUp(self):
    self.mock_handler = Mock()
    self.mock_http = AsyncMock()
    self.mock_robots_agent = Mock()
    self.crawler = Crawler(
        self.mock_robots_agent, self.mock_http, self.mock_handler
    )
def get_list(self, page=1, category_code="ALL"):
    is_continue = True
    res = rq.get(Naver.MOVIES_URL % (category_code, page))
    soup = BeautifulSoup(res.content, 'lxml')
    items = soup.select('.lst_thum_wrap .lst_thum li a')
    results = []
    Crawler.progress_bar(len(items), 0, 0)
    for idx, item in enumerate(items):
        href, product_no, title, body = self.parse(item)
        movie = Movie(href, product_no, title, body, category_code)
        sleep = 0
        if not movie.is_exist_by_redis():
            movie.save()
            results.append(movie)
            sleep = 1
        Crawler.progress_bar(len(items), idx + 1, sleep)
    if len(items) != Naver.MAX_CNT_OF_PAGE:
        is_continue = False
    return is_continue, results
def main():
    _setup_logging()
    downloader = SeleniumHTMLDownloader(
        r'C:\Users\hananavr\Documents\לימודים\מעבדה בין תחומית\facebookCrawler copy\crawler\chromedriver.exe'
    )
    # mongodb://<dbuser>:<dbpassword>@ds215633.mlab.com:15633/pytheas
    store = MongoItemStore(host='ds215633.mlab.com',
                           port='15633',
                           db='pytheas',
                           article_collection='hanan',
                           username='******',
                           password='******')
    items_loader = MongoItemsLoader(host='ds215633.mlab.com',
                                    port='15633',
                                    db='pytheas',
                                    items_collection='hanan',
                                    username='******',
                                    password='******')
    crawler = Crawler(downloader, {
        'www.facebook.com/': FacebookParser(),
        'www.twitter.com/': TwitterParser()
    }, store, items_loader)
    crawler.crawl(FACEBOOK_PAGE_TO_DOWNLOAD_FROM)
def crawl(url, output_dir, depth=2, method="normal", gecko_path="geckodriver",
          page_name=None, custom_stats_handler=None, custom_process_handler=None):
    head_handlers = {}
    get_handlers = {}

    # Get the page name for sub-directories etc. if no custom name is given
    if page_name is None:
        page_name = urlparse(url).netloc

    get_handlers['application/pdf'] = LocalStoragePDFHandler(
        directory=output_dir, subdirectory=page_name)

    if custom_stats_handler is None:
        head_handlers['application/pdf'] = CSVStatsPDFHandler(directory=output_dir, name=page_name)
    else:
        for content_type, Handler in custom_stats_handler.items():
            head_handlers[content_type] = Handler

    if custom_process_handler is None:
        process_handler = ProcessHandler()
    else:
        process_handler = custom_process_handler

    if not get_handlers and not head_handlers:
        raise ValueError('You did not specify any output')

    crawler = Crawler(
        downloader=requests_downloader,
        head_handlers=head_handlers,
        get_handlers=get_handlers,
        follow_foreign_hosts=False,
        crawl_method=method,
        gecko_path=gecko_path,
        process_handler=process_handler
    )
    crawler.crawl(url, depth)
def main():
    c = Crawler()
    indices = c.get_master_indices()
    fs = FileSaver()
    fs.save_files(indices)
def __init__(self, starting_url=False, save=False, initial_seed=False):
    Crawler.__init__(self)
    # No database values, just these attributes
    self.url = starting_url if starting_url else self.base_url + "/w/Category:Artist"
    self.save = save
    self.initial_seed = initial_seed
    # self.url = self.tree.xpath("//div[@class='listPagination'][1]/a[contains(text(), 'next')]/@href")[0]
    return
def main():
    _setup_logging()
    downloader = SeleniumHTMLDownloader('./lib/chromedriver.exe')
    store = MongoArticleStore("localhost", 27017, "Crawler", "Articles")
    crawler = Crawler(downloader, {
        'ynet.co.il': YnetParser()
    }, store)
    crawler.crawl('https://www.ynet.co.il')
def test_find_all_urls(self):
    c = Crawler('http://www.hotline.ua')
    expected_urls = [
        'http://www.hotline.ua/help/hl_checkout/h1/',
        'http://www.hotline.ua/help/hl_checkout/',
        'http://www.hotline.ua/help/hl_checkout/h3/',
        'http://www.hotline.ua/help/hl_checkout/h2/'
    ]
    self.assertEqual(sorted(c.find_all_urls(self.page)), sorted(expected_urls))
def test_structure_positive_01(self):
    my_crawler = Crawler()
    site_to_check = 'http://zabaykin.ru/?p=505'
    res = my_crawler.load_and_tokenize([site_to_check], depth=2)
    self.assertIsInstance(res, OrderedDict)
    for k, el in res.items():
        self.assertIsInstance(k, str)
        self.assertTrue(k.startswith('http'))
        # isinstance() cannot be used with subscripted generics such as List[str],
        # so check the container type and the element types separately
        self.assertIsInstance(el, list)
        self.assertTrue(all(isinstance(token, str) for token in el))
def __init__(self):
    try:
        self.crawler = Crawler()
        self.pgrpah = Graph()
        self.parser = Parser()
    except Exception as e:
        print("ERROR " + str(e))
        sys.exit(-1)
def test_lucky_path(self):
    # Simulates parsed pages with posts represented by ints in the range
    # [page_index * 7, page_index * 7 + 7)
    crawler = Crawler(FetcherMock(lambda x: range(x * 7, (x + 1) * 7)), ParserMock())

    # Check when the number of posts is not a multiple of posts per page
    result = crawler.crawl(32)
    self.assertEqual(result, list(range(32)))

    # Check when the number of posts is a multiple of posts per page
    result = crawler.crawl(21)
    self.assertEqual(result, list(range(21)))
def main():
    configs = load_configs()
    conf_db = configs["database"]
    conf_param = configs["params"]

    crawler = Crawler()
    db = MongoDb(conf_db["address"], conf_db["port"])
    for stock_id in conf_param["stock_ids"]:
        data = crawler.fetch_stock_data(stock_id, conf_param["period"])
        db.append_one(data)
def __init__(self, url, save=False, initial_seed=False):
    Crawler.__init__(self)
    # Database values
    self.row_id = False
    self.name = ''
    self.url = url
    # Other variables
    self.save = save  # If not saving, print for debugging and testing purposes
    self.initial_seed = initial_seed
    return
def stopCrawler(request):
    try:
        id = request.POST.get('id')
        source = Source.objects.get(id=id)
        sourceurl = source.url
        crawlerone = Crawler(sourceurl)
        crawlerone.stop()
        runingcrawlers.update({'id': id, 'inst': crawlerone})
        return redirect('dashboard')
    except ObjectDoesNotExist:
        return redirect('dashboard')
def crawl(url, cfgs):
    click.secho("Crawler will begin on '{url}' with below settings:\n".format(url=url), fg='green')
    config = configs.load_config_section(config_section=cfgs)
    if config is None:
        print(f"Invalid config {cfgs}. Switching to DEFAULT.")
        config = configs.load_config_section(config_section='DEFAULT')
    else:
        print(f"Config set {cfgs} loaded.")
    click.echo()

    crawler = Crawler()
    print(f"Target URL = {url}")
    crawler.crawl(url, config['traversal'], config['user_agent'],
                  int(config['max_depth']), int(config['max_total']))
def test(url, vector_input, sensitive_input, random, speed):
    """Uses provided vectors and input to test against target."""
    # TODO(piper): pass files in to test.
    c = Crawler(url[0], auth=True)
    if vector_input:
        vectored = c.crawl([Gatherer()])
    if sensitive_input:
        [print(line) for line in sensitive_input]
        # result = c.crawl([VectorGatherer()])
    print("Finished testing...")
def test_searcher_with_seen_urls(self):
    with patch.object(Crawler, 'get_html') as mock_get_html:
        mock_get_html.return_value = '<a href=http://scala-lang.org></a>' \
                                     '<a href=https://scala11.html></a>'
        with patch.object(Crawler, 'write_html') as mock_write_html:
            mock_write_html.return_value = None
            test_crawler = Crawler(
                'https://docs.scala-lang.org/ru/tour/tour-of-scala.html',
                ['scala'], {}, 2)
            test_crawler.seen_urls.add(Page(URL('http://scala-lang.org')))
            test_result = test_crawler.crawl()
            test_crawler.close()
            assert 'http://scala-lang.org' not in test_result
def start(self):
    for game_type, crawl_range in self.crawl_range().items():
        crawler = Crawler(game_type)
        for date in pd.date_range(start=crawl_range["begin"], end=crawl_range["end"]):
            logging.debug(f"command: crawling {game_type} at {date}")
            crawler.run(self.format_date(date))
            # random sleep
            time.sleep(abs(np.random.normal(self.config["commander"]["queryPeriod"])))
class CrawlerTestCase(unittest.TestCase):

    def setUp(self):
        self.crawler = Crawler()

    def test_scrape(self):
        flag = self.crawler.scrape('手機')
        self.assertTrue(flag)

    def test_scrape_return_none(self):
        flag = self.crawler.scrape()
        self.assertFalse(flag)
def test_crawler_zero_result(self):
    with patch.object(Crawler, 'get_html') as mock_get_html:
        mock_get_html.return_value = '<a href=https://scala1.html></a>'
        with patch.object(Crawler, 'write_html') as mock_write_html:
            mock_write_html.return_value = None
            test_crawler = Crawler(
                'https://docs.scala-lang.org/ru/tour/tour-of-scala.html',
                ['dog'], {}, 2)
            test_result = test_crawler.crawl()
            test_crawler.close()
            self.assertEqual(test_result, set())
def main():
    c = Crawler()
    fs = FileSaver()
    for i in range(EARLIEST, LATEST + 1):
        for j in range(Q1, Q4 + 1):
            print("\nStarting on year %s, quarter %s\n" % (str(i), str(j)))
            if i == 2016 and j == 4:
                pass
            else:
                f = open(BASE_MASTER_PATH.format(i, j), 'r')
                for line in f:
                    if contains_8k(line):
                        line = line.split('|')
                        doc = c.get_8k_form(line[URL])
                        wait = random.randint(1, 30)
                        print('Waiting %s seconds' % (str(wait)))
                        time.sleep(int(wait))
                        print('Current time is %s' % strftime('%Y-%m-%d %H:%M:%S', gmtime()))
                        if isinstance(line[DATE], str) and isinstance(line[COMPANY], str):
                            form_date = line[DATE]
                            string_array = line[COMPANY].split('/')
                            form_company = ""
                            for idx in range(len(string_array)):
                                form_company += string_array[idx]
                        else:
                            form_date = line[DATE].decode('utf-8', 'ignore')
                            form_company = line[COMPANY].decode('utf-8', 'ignore')
                        print('Found form from company %s on date of %s' % (form_company, form_date))
                        fs.save_8k_file(i, j, doc, form_date, form_company)
def main():
    logging.config.fileConfig("logging.conf")
    logger = logging.getLogger("sLogger")

    logger.info("Crawling started.")
    crawler = Crawler()
    crawler.crawl()
    logger.info("Crawling finished.")

    logger.info("Generating CSV files...")
    generate_data.generate()
    logger.info("CSV Files are generated.")

    sys.exit()
def setUp(self):
    self.database_writer = MagicMock()
    self.database_reader = MagicMock()
    self.parser = MagicMock()
    self.database_reader.get_weburls_table_size = MagicMock(return_value=50)
    self.database_reader.get_weburls_and_content_table_size = MagicMock(return_value=10)
    self.database_reader.get_next_url = MagicMock(return_value=None)
    self.database_writer.database_limit = 10
    self.crawler = Crawler(self.database_writer, self.database_reader, self.parser)
    self.local_index_html_file = "file://" + os.path.abspath("test/website/index.html")
    self.crawler.crawl(self.local_index_html_file)
def test_update_parents(self):
    with patch.object(Crawler, 'get_html') as mock_get_html:
        mock_get_html.return_value = '<a href=http://a/c/></a>' \
                                     '<a href=http://a/b/></a>'
        with patch.object(Crawler, 'write_html') as mock_write_html:
            mock_write_html.return_value = None
            test_crawler = Crawler('http://a', [''], {}, max_urls_count=3)
            test_result = test_crawler.crawl()
            test_crawler.close()
            for page in test_result:
                if page.parent:
                    self.assertEqual(page.parent, Page(URL('http://a')))
async def _main(site, concurrent, minimum_wait, maximum_wait, max_items):
    start_time = time.time()

    robot_agent = create_robot_agent(USER_AGENT, site)
    emitter = Emitter(max_items)
    handler = KolonialHandlers(site, emitter)
    http_client = HttpClient(USER_AGENT, minimum_wait, maximum_wait)
    crawler = Crawler(robot_agent, http_client, handler)

    queue = asyncio.Queue()
    queue.put_nowait("/")
    workers = [
        asyncio.create_task(worker_task(n, crawler, queue))
        for n in range(concurrent)
    ]
    await queue.join()
    for worker in workers:
        worker.cancel()
    await asyncio.gather(*workers, return_exceptions=True)
    await http_client.close()

    end_time = time.time()
    print(f"{http_client.gets} requests")
    print(f"{emitter.count} entries")
    print(f"took {end_time - start_time}s")
class TestCrawler(unittest.TestCase):

    def setUp(self):
        self.START_URL = 'http://www.facebook.com/'
        self.BRANCHING_FACTOR = 3
        self.c = Crawler(self.START_URL, self.BRANCHING_FACTOR)

    def test_get_url(self):
        ans = self.c.get_url(self.START_URL, [])
        self.assertTrue(len(ans) > 0)

    def test_start_dfs(self):
        ans = self.c.start_dfs()
        self.assertTrue(len(ans) > 0)

    def test_start_bfs(self):
        ans = self.c.start_bfs()
        self.assertTrue(len(ans) > 0)
def main():
    nflcrawler = Crawler()

    seeds = [
        "http://www.nfl.com/teams/roster?team=STL",
        "http://www.nfl.com/teams/roster?team=TEN",
        "http://www.nfl.com/teams/roster?team=WAS",
        "http://www.nfl.com/teams/roster?team=CAR",
        "http://www.nfl.com/teams/roster?team=CLE",
        "http://www.nfl.com/teams/roster?team=JAC",
        "http://www.nfl.com/teams/roster?team=KC",
    ]
    nflcrawler.add_seeds(seeds)

    rules = {
        r"^(http://www.nfl.com/teams/roster)(\?team=[a-zA-Z]+)$": [
            r"^(http://www.nfl\.com/player/)([a-zA-Z]+/[0-9]+/profile)$"
        ],
        r"^(http://www.nfl\.com/player/)([a-zA-Z]+/[0-9]+/profile)$": [
            r"^(http://www.nfl\.com/player/)([a-zA-Z]+/[0-9]+/careerstats)$"
        ],
    }
    nflcrawler.add_rules(rules)

    nflcrawler.start()
def main():
    nfltweetcrawler = Crawler()

    seeds = ['http://www.tweeting-athletes.com/index.cfm?CatID=2&People=1']
    nfltweetcrawler.add_seeds(seeds)

    rules = {
        r'^(http://www.tweeting-athletes.com/)(index.cfm\?CatID=2&People=1)$': [
            r'^(http://www.tweeting-athletes.com/)(index.cfm\?AthleteID=[0-9]+)$'
        ],
        r'^(http://www.tweeting-athletes.com/)(index.cfm\?AthleteID=[0-9]+)$': [
            r'^(http://www.tweeting-athletes.com/index.cfm)(\?CatID=0&AthleteID=[0-9]+&p=[0-9]+)$'
        ],
        r'^(http://www.tweeting-athletes.com/index.cfm)(\?CatID=0&AthleteID=[0-9]+&p=[0-9]+)$': [
            r'^(http://www.tweeting-athletes.com/index.cfm)(\?CatID=0&AthleteID=[0-9]+&p=[0-9]+)$'
        ]
    }
    nfltweetcrawler.add_rules(rules)

    nfltweetcrawler.start()
from crawler.crawler import Crawler

mycrawler = Crawler()

# List of seed urls to start crawling from.
seeds = ['http://www.fdprice.com/']
mycrawler.add_seeds(seeds)

# Your crawling rules: a dictionary whose keys are regular expressions for urls
# and whose values are lists of regular expressions for the urls you want to
# follow from each url matched by the key.
rules = {
    r'^(http://.+fdprice\.com)(.+)$': [
        r'^(http://.+fdprice\.com)(.+)$'
    ]
}
mycrawler.add_rules(rules)

# Start crawling.
mycrawler.start()
def setUp(self):
    self.START_URL = 'http://www.facebook.com/'
    self.BRANCHING_FACTOR = 3
    self.c = Crawler(self.START_URL, self.BRANCHING_FACTOR)
from posts.repository import Posts
from crawler.parser import Parser
from crawler.fetcher import Fetcher
from crawler.crawler import Crawler
import sys

if __name__ == "__main__":
    # Parse arguments
    nb_requested_posts = int(sys.argv[1]) if len(sys.argv) > 1 else 10
    library_file = sys.argv[2] if len(sys.argv) > 2 else "posts.pdl"

    # Crawl
    crawler = Crawler(Fetcher(), Parser())
    posts = crawler.crawl(nb_requested_posts)

    # Persist crawled posts
    repository = Posts(library_file, True)
    for post in posts:
        repository.addPost(post)

    print("%s posts have been parsed and saved to %s"
          % (repository.getPostsCount(), library_file))
from crawler.crawler import Crawler
from knowledge.trainingSet import training_set

if __name__ == "__main__":
    train_list = ["Ann Arbor"]
    dest_list = [
        "Ann Arbor",
        "Ypsilanti",
        "Detroit",
        "Romulus",
        "Troy",
        "Auburn Hills",
    ]
    crawler = Crawler(dest_list, train_list, training_set)
    crawler.learn()
    for dest in dest_list:
        crawler.crawl_dest(dest)
class CrawlerTest(TestCase):

    def setUp(self):
        self.crawler = Crawler()
        self.base_url = 'http://www.oldclassiccar.co.uk/forum/phpbb/phpBB2/viewtopic.php?t=12591'

    def test_get_page_returns_html(self):
        page = self.crawler.get_request(self.base_url)
        self.assertIn('</html>', page)

    def test_soupify_response(self):
        page = self.crawler.get_request(self.base_url)
        soup = self.crawler.soupify(page)
        self.assertEqual(str(type(soup)), "<class 'bs4.BeautifulSoup'>")

    def test_get_pages_returns_url_list(self):
        page = self.crawler.get_request(self.base_url)
        soup = self.crawler.soupify(page)
        url_list = self.crawler.get_links(soup)
        self.assertIs(type(url_list), list)
        self.assertEqual(len(url_list), 8)

    def test_get_post_id_list_returns_list(self):
        page = self.crawler.get_request(self.base_url)
        soup = self.crawler.soupify(page)
        post_ids = self.crawler.get_post_id_list(soup)
        self.assertIs(type(post_ids), list)
        self.assertIs(type(post_ids[0]), int)
        self.assertEqual(len(post_ids), 15)

    def test_get_tag_by_id_returns_id_tag(self):
        page = self.crawler.get_request(self.base_url)
        soup = self.crawler.soupify(page)
        tag = self.crawler.get_tag_by_id(soup, 87120)
        self.assertEqual(str(type(tag)), "<class 'bs4.element.Tag'>")

    def test_get_user_by_post_id(self):
        page = self.crawler.get_request(self.base_url)
        soup = self.crawler.soupify(page)
        tag_1 = self.crawler.get_tag_by_id(soup, 87120)
        user_1 = self.crawler.get_user_by_post_id(tag_1)
        self.assertIs(type(user_1), str)
        self.assertEqual(user_1, "Rick")
        tag_2 = self.crawler.get_tag_by_id(soup, 87131)
        user_2 = self.crawler.get_user_by_post_id(tag_2)
        self.assertIs(type(user_2), str)
        self.assertEqual(user_2, "pigtin")

    def test_get_post_date(self):
        page = self.crawler.get_request(self.base_url)
        soup = self.crawler.soupify(page)
        tag = self.crawler.get_tag_by_id(soup, 87120)
        post_date = self.crawler.get_post_date(tag)
        self.assertIs(type(post_date), str)
        self.assertNotIn(post_date, "Posted: ")

    def test_get_post_msg(self):
        page = self.crawler.get_request(self.base_url)
        soup = self.crawler.soupify(page)
        tag = self.crawler.get_tag_by_id(soup, 87120)
        post_msg = self.crawler.get_post_msg(tag)
        self.assertIs(type(post_msg), str)
        self.assertIn("Tonight, 8pm, might be worth a look", post_msg)
        tag_2 = self.crawler.get_tag_by_id(soup, 87131)
        post_msg_2 = self.crawler.get_post_msg(tag_2)
        self.assertIs(type(post_msg_2), str)
        self.assertIn("Oh dear! Just switched off", post_msg_2)
        self.assertNotIn("_________________", post_msg_2)

    def test_gets_next_msg_span_if_empty(self):
        page = self.crawler.get_request('http://www.oldclassiccar.co.uk/forum/phpbb/phpBB2/viewtopic.php?t=12591')
        soup = self.crawler.soupify(page)
        tag = self.crawler.get_tag_by_id(soup, 87140)
        post_msg = self.crawler.get_post_msg(tag)
        self.assertIs(type(post_msg), str)
        self.assertIn("i wouldn't bother", post_msg)

    def test_build_post_data(self):
        page = self.crawler.get_request(self.base_url)
        soup = self.crawler.soupify(page)
        data = self.crawler.build_post_data(soup, 87120)
        self.assertEqual(data[0:3], [87120, "Rick", "Mon Sep 24, 2012 4:53 pm"])

    def test_to_csv(self):
        id_1 = 87120
        id_2 = 871233
        name_1 = "Rick"
        name_2 = "Chuck"
        date_1 = "Mon Sep 24, 2012 4:53 pm"
        date_2 = "Mon Sep 23, 2012 4:53 pm"
        msg = "Tonight, 8pm, might be worth a look...?\n\nRJ".encode('unicode_escape')
        data = [
            [id_1, name_1, date_1, msg],
            [id_2, name_2, date_2, msg]
        ]
        csv = self.crawler.to_csv(data, "test_forum")
        self.assertEqual(data, csv)
def setUp(self):
    self.crawler = Crawler()
    self.base_url = 'http://www.oldclassiccar.co.uk/forum/phpbb/phpBB2/viewtopic.php?t=12591'