def main():
    _setup_logging()

    downloader = SeleniumHTMLDownloader(
        r'C:\Users\hananavr\Documents\לימודים\מעבדה בין תחומית\facebookCrawler copy\crawler\chromedriver.exe'
    )

    # mongodb://<dbuser>:<dbpassword>@ds215633.mlab.com:15633/pytheas
    store = MongoItemStore(host='ds215633.mlab.com',
                           port='15633',
                           db='pytheas',
                           article_collection='hanan',
                           username='******',
                           password='******')

    items_loader = MongoItemsLoader(host='ds215633.mlab.com',
                                    port='15633',
                                    db='pytheas',
                                    items_collection='hanan',
                                    username='******',
                                    password='******')

    crawler = Crawler(downloader, {
        'www.facebook.com/': FacebookParser(),
        'www.twitter.com/': TwitterParser()
    }, store, items_loader)

    crawler.crawl(FACEBOOK_PAGE_TO_DOWNLOAD_FROM)
def crawl(url, output_dir, depth=2, method="normal", gecko_path="geckodriver",
          page_name=None, custom_stats_handler=None, custom_process_handler=None):
    head_handlers = {}
    get_handlers = {}

    # get name of page for sub-directories etc. if no custom name is given
    if page_name is None:
        page_name = urlparse(url).netloc

    get_handlers['application/pdf'] = LocalStoragePDFHandler(
        directory=output_dir, subdirectory=page_name)

    if custom_stats_handler is None:
        head_handlers['application/pdf'] = CSVStatsPDFHandler(directory=output_dir,
                                                              name=page_name)
    else:
        for content_type, Handler in custom_stats_handler.items():
            head_handlers[content_type] = Handler

    if custom_process_handler is None:
        process_handler = ProcessHandler()
    else:
        process_handler = custom_process_handler

    if not get_handlers and not head_handlers:
        raise ValueError('You did not specify any output')

    crawler = Crawler(
        downloader=requests_downloader,
        head_handlers=head_handlers,
        get_handlers=get_handlers,
        follow_foreign_hosts=False,
        crawl_method=method,
        gecko_path=gecko_path,
        process_handler=process_handler
    )
    crawler.crawl(url, depth)
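A minimal sketch of how the crawl() helper above might be invoked; the URL and output directory are hypothetical placeholders, not values from the original project.

# Hypothetical invocation of the crawl() helper defined above. The start URL
# and output directory are placeholders; depth and method use the defaults
# shown in the signature.
if __name__ == "__main__":
    crawl(
        url="https://example.org/publications",   # hypothetical start page
        output_dir="./downloads",                 # hypothetical output directory
        depth=2,                                  # follow links two levels deep
        method="normal",                          # plain requests-based crawling
    )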
def main():
    _setup_logging()

    downloader = SeleniumHTMLDownloader('./lib/chromedriver.exe')
    store = MongoArticleStore("localhost", 27017, "Crawler", "Articles")

    crawler = Crawler(downloader, {
        'ynet.co.il': YnetParser()
    }, store)

    crawler.crawl('https://www.ynet.co.il')
def test_lucky_path(self):
    # Simulates parsed pages with posts represented by ints in range [page_index*7, page_index*7 + 7)
    crawler = Crawler(FetcherMock(lambda x: range(x*7, (x+1)*7)), ParserMock())

    # Check when number of posts is not a multiple of posts per page
    result = crawler.crawl(32)
    self.assertEqual(result, list(range(32)))

    # Check when number of posts is a multiple of posts per page
    result = crawler.crawl(21)
    self.assertEqual(result, list(range(21)))
def crawl(url, cfgs):
    click.secho("Crawler will begin on '{url}' with the following settings:\n".format(url=url), fg='green')

    config = configs.load_config_section(config_section=cfgs)
    if config is None:
        print(f"Invalid config {cfgs}. Switching to DEFAULT.")
        config = configs.load_config_section(config_section='DEFAULT')
    else:
        print(f"Config set {cfgs} loaded.")
    click.echo()

    crawler = Crawler()
    print(f"Target URL = {url}")
    crawler.crawl(url, config['traversal'], config['user_agent'],
                  int(config['max_depth']), int(config['max_total']))
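The snippet above only shows that the loaded config section must provide traversal, user_agent, max_depth and max_total. Below is a minimal sketch of such a section built with the standard configparser module; the concrete values are illustrative assumptions, not the project's defaults.

# Sketch of a config section exposing the keys read above
# (traversal, user_agent, max_depth, max_total). Values are illustrative.
import configparser

parser = configparser.ConfigParser()
parser.read_string("""
[DEFAULT]
traversal = bfs
user_agent = Mozilla/5.0 (compatible; ExampleCrawler/1.0)
max_depth = 3
max_total = 100
""")
config = parser['DEFAULT']
print(config['traversal'], int(config['max_depth']), int(config['max_total']))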
def main():
    logging.config.fileConfig("logging.conf")
    logger = logging.getLogger("sLogger")

    logger.info("Crawling started.")
    crawler = Crawler()
    crawler.crawl()
    logger.info("Crawling finished.")

    logger.info("Generating CSV files...")
    generate_data.generate()
    logger.info("CSV Files are generated.")

    sys.exit()
def test(url, vector_input, sensitive_input, random, speed):
    """Uses provided vectors and input to test against target."""
    # TODO(piper): pass files in to test.
    c = Crawler(url[0], auth=True)

    if vector_input:
        vectored = c.crawl([Gatherer()])

    if sensitive_input:
        [print(line) for line in sensitive_input]
        # result = c.crawl([VectorGatherer()])

    print("Finished testing...")
def test_searcher_with_seen_urls(self):
    with patch.object(Crawler, 'get_html') as mock_get_html:
        mock_get_html.return_value = '<a href=http://scala-lang.org></a>' \
                                     '<a href=https://scala11.html></a>'
        with patch.object(Crawler, 'write_html') as mock_write_html:
            mock_write_html.return_value = None
            test_crawler = Crawler(
                'https://docs.scala-lang.org/ru/tour/tour-of-scala.html',
                ['scala'], {}, 2)
            test_crawler.seen_urls.add(Page(URL('http://scala-lang.org')))
            test_result = test_crawler.crawl()
            test_crawler.close()
            assert 'http://scala-lang.org' not in test_result
def test_crawler_zero_result(self):
    with patch.object(Crawler, 'get_html') as mock_get_html:
        mock_get_html.return_value = '<a href=https://scala1.html></a>'
        with patch.object(Crawler, 'write_html') as mock_write_html:
            mock_write_html.return_value = None
            test_crawler = Crawler(
                'https://docs.scala-lang.org/ru/tour/tour-of-scala.html',
                ['dog'], {}, 2)
            test_result = test_crawler.crawl()
            test_crawler.close()
            self.assertEqual(test_result, set())
def test_update_parents(self):
    with patch.object(Crawler, 'get_html') as mock_get_html:
        mock_get_html.return_value = '<a href=http://a/c/></a>' \
                                     '<a href=http://a/b/></a>'
        with patch.object(Crawler, 'write_html') as mock_write_html:
            mock_write_html.return_value = None
            test_crawler = Crawler(
                'http://a', [''], {}, max_urls_count=3)
            test_result = test_crawler.crawl()
            test_crawler.close()
            for page in test_result:
                if page.parent:
                    self.assertEqual(page.parent, Page(URL('http://a')))
def run():
    """Execute the cli action

    Returns:
        None
    """
    parser = argparse.ArgumentParser(
        description="Simple one domain web crawler by Jonathan Harden")
    parser.add_argument(
        "domain",
        help="Domain to crawl (will not leave the subdomain specified and will ignore any path part)",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Verbose, print INFO level logging",
    )
    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.INFO)

    crawler = Crawler(args.domain)
    crawler.crawl()

    for page in crawler.site_map.all_pages():
        print("Page: {}".format(page.link))
        print(" Outbound Links:")
        for out_link in set(page.out_links):
            print(" {}".format(out_link))
        print("\n\n")
def main(config):
    login_args = '{student} {password} {semester}'.format(**config['account'])
    cookie = loginceiba.info(*login_args.split())
    if cookie == 1:
        print("can't login!!")
        return
    print('login_success, cookie:')
    print(cookie)

    crawler = Crawler(cookie.strip('\n'))
    courses = crawler.crawl()

    notifications, calendars, downloads = diff.diff(courses[0], [])

    notifier = Notifier()
    notifier.show_diff_notifications(notifications)

    downloadfile.downloadfile(downloads)
def discover(url, common_words=None):
    """Retrieves information from the provided URL."""
    print("Beginning...")
    c = Crawler(url, auth=True)
    result = c.crawl([CountGatherer(), GuessingGatherer(), Gatherer(), URLParamsGatherer()])
    print("FINISHED CRAWLING")
    print("")
    print("")
    print("")
    print(result)
    print("")
    print("")
    print("")
    r = requests.get(url, verify=False)
    report(inputs, None, cookies)
def test_searcher_with_result(self):
    with patch.object(Crawler, 'get_html') as mock_get_html:
        mock_get_html.return_value = '<a href=https://scala1.html></a>' \
                                     '<a href=https://scala2.html></a>' \
                                     '<a href=https://scala3.html></a>' \
                                     '<a href=https://scala4.html></a>' \
                                     '<a href=https://scala5.html></a>' \
                                     '<a href=https://scala6.html></a>' \
                                     '<a href=https://scala7.html></a>' \
                                     '<a href=https://scala8.html></a>' \
                                     '<a href=https://scala9.html></a>' \
                                     '<a href=https://scala10.html></a>' \
                                     '<a href=https://scala11.html></a>'
        with patch.object(Crawler, 'write_html') as mock_write_html:
            mock_write_html.return_value = None
            test_crawler = Crawler(
                'https://docs.scala-lang.org/ru/tour/tour-of-scala.html',
                ['scala'], {})
            test_result = test_crawler.crawl()
            test_crawler.close()
            self.assertEqual(len(test_result), 10)
                              default='pages',
                              help='Directory for downloaded pages')
arguments_parser.add_argument('-g', action='store_true', help='Show graph')
arguments_parser.add_argument('-w', action='store_true', help='Save found pages')
args = arguments_parser.parse_args()

white_domains = []
for domain in args.wildcard:
    if domain.startswith('*'):
        white_domains.append(re.compile(fr'[^.]+.{domain[1::]}'))
    else:
        white_domains.append(domain)

if args.start_url[-1] == '/':
    url = args.start_url[:-1]
else:
    url = args.start_url

crawler = Crawler(url, args.request, white_domains, args.d, args.f, args.w)
try:
    result = crawler.crawl()
    if args.g:
        show_graph(result)
    for link in result:
        print(link)
    print('Program is completed')
    plt.show()
except KeyboardInterrupt:
    print('Program is completed')
finally:
    crawler.close()
import os
import sys

from crawler.crawler import Crawler
from crawler.database_writer import DatabaseWriter

# Set Python's recursion depth limit (default is 1000)
sys.setrecursionlimit(15499)

sites_to_crawl = "file://" + os.path.abspath("sites_to_crawl.html")
database_writer = DatabaseWriter('postgresql://localhost/eureka_development', 10000)

crawler = Crawler(database_writer)
crawler.crawl(sites_to_crawl)
from urllib.request import urlopen
from posts.repository import Posts
from crawler.parser import Parser
from crawler.fetcher import Fetcher
from crawler.crawler import Crawler
import sys

if __name__ == "__main__":
    # Parse arguments
    nb_requested_posts = int(sys.argv[1]) if len(sys.argv) > 1 else 10
    library_file = sys.argv[2] if len(sys.argv) > 2 else "posts.pdl"

    # Crawl
    crawler = Crawler(Fetcher(), Parser())
    posts = crawler.crawl(nb_requested_posts)

    # Persist crawled posts
    repository = Posts(library_file, True)
    for post in posts:
        repository.addPost(post)

    print("%s posts have been parsed and saved to %s" % (repository.getPostsCount(), library_file))
class TestingCrawler(unittest.TestCase):

    def setUp(self):
        self.database_writer = MagicMock()
        self.database_reader = MagicMock()
        self.parser = MagicMock()
        self.database_reader.get_weburls_table_size = MagicMock(return_value=50)
        self.database_reader.get_weburls_and_content_table_size = MagicMock(return_value=10)
        self.database_reader.get_next_url = MagicMock(return_value=None)
        self.database_writer.database_limit = 10
        self.crawler = Crawler(self.database_writer, self.database_reader, self.parser)
        self.local_index_html_file = "file://" + os.path.abspath("test/website/index.html")
        self.crawler.crawl(self.local_index_html_file)

    def get_test_soup(self):
        test_soup = BeautifulSoup(
            '<!DOCTYPE html>\n<html>\n\n<head>\n <title>Cats and Dogs</title> \n<meta name="description" content="Page about cats and dogs"> \n <meta name="keywords" content="cats,dogs">\n</head><body><a href="www.dogs.com">Dogs</a><a href="www.cats.com">Cats</a></body></html>',
            'html.parser')
        return test_soup

    def test_crawler_is_instance_of_Crawler(self):
        self.assertIsInstance(self.crawler, Crawler)

    def test_crawl_calls_database_writer_write_url(self):
        self.database_writer.write_url = MagicMock()
        self.crawler.crawl(self.local_index_html_file)
        self.database_writer.write_url.assert_called_once_with(self.local_index_html_file)

    def test_crawl_accepts_and_assigns_url(self):
        self.assertEqual(self.crawler.url, self.local_index_html_file)

    def test_return_all_content_calls_database_writer_write_urls_and_content(self):
        self.crawler.database_writer.write_urls_and_content = MagicMock()
        self.crawler.return_all_content()
        self.crawler.database_writer.write_urls_and_content.assert_called_once()

    def test_return_all_content_calls_crawl_next_url(self):
        self.crawler.crawl_next_url = MagicMock()
        self.crawler.return_all_content()
        self.crawler.crawl_next_url.assert_called_once()

    def test_return_all_content_calls_parser_create_soup_and_save_content(self):
        self.crawler.page = bytes()
        self.crawler.save_found_weburls = MagicMock()
        self.parser.create_soup_and_save_content = MagicMock()
        self.crawler.return_all_content()
        self.parser.create_soup_and_save_content.assert_called_once()

    def test_save_found_weburls_calls_database_writer_prepare_urls_for_writing_to_db(self):
        self.database_writer.prepare_urls_for_writing_to_db = MagicMock()
        self.crawler.save_found_weburls()
        test_urls_array = ["www.dogs.com", "www.cats.com"]
        self.database_writer.prepare_urls_for_writing_to_db.assert_called_once()

    def test_crawl_next_url_calls_database_reader_get_next_url(self):
        self.crawler.crawl_next_url()
        self.database_reader.get_next_url.assert_called()
                  help="End page of search results [2]")
parser.add_option("-d", "--download-searches",
                  dest="download_searches",
                  default="True",
                  help="Download search result pages (True/False) [True]")
parser.add_option("-n", "--download-offers",
                  dest="download_offers",
                  default="True",
                  help="Download offers (True/False) [True]")
parser.add_option("-r", "--remove-files",
                  dest="remove_files",
                  default="True",
                  help="Empty dirs containing old files (True/False) [True]")
opt, _ = parser.parse_args()

crawler = Crawler(opt.city, opt.property_type)
crawler.crawl(download_searches=opt.download_searches.lower() in "true",
              download_offers=opt.download_offers.lower() in "true",
              remove_files=opt.remove_files.lower() in "true",
              start_page=opt.start_page,
              end_page=opt.end_page,
              rent=opt.offer_type)
import os

from crawler.crawler import Crawler

url = os.getenv('URL')
output_dir = os.getenv('OUTPUT_DIR')

crawler = Crawler(url, output_dir)
crawler.crawl()
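A hypothetical way to supply the two environment variables the snippet above reads; the values are placeholders, and exporting URL and OUTPUT_DIR from the shell before running the script works just as well.

# Placeholder values for the environment-driven snippet above; both variables
# must be set before that code runs.
import os

os.environ['URL'] = 'https://example.com'        # hypothetical start URL
os.environ['OUTPUT_DIR'] = './crawled_pages'     # hypothetical output directory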
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-u", "--url",
                        help="Start URL that will be the first step of crawling",
                        type=str, required=True)
    parser.add_argument("-d", "--max_depth",
                        help="Max depth of crawling",
                        type=int, default=2, required=False)
    parser.add_argument("-ureg", "--url_regex",
                        help="URL regex of the URLs you want to crawl",
                        type=str, default=None, required=False)
    parser.add_argument("-ua", "--user_agent",
                        help="User agent string to use while crawling",
                        type=str, required=False)
    parser.add_argument("-op", "--pickle_path",
                        help="The path of a pickle object that stores a dict where each key is a URL "
                             "and the values are a list of URLs found there.",
                        type=str, required=False)
    parser.add_argument("-csv", "--csv_path",
                        help="The path of a csv file that stores the URLs found in each page.",
                        type=str, required=False)
    args = parser.parse_args()

    if not validators.url(args.url):
        raise ValueError(
            f"The first argument must be a URL. It is {args.url} instead")
    if args.max_depth < 1:
        raise ValueError(
            f"Max depth argument must be greater than or equal to 1. It is {args.max_depth} instead")

    url_filter: Optional[UrlFilter] = None
    if args.url_regex:
        try:
            url_filter = UrlFilter(args.url_regex)
        except re.error:
            raise ValueError(
                f"URL regex argument is not a valid regex. It is {args.url_regex} instead")

    if not args.pickle_path and not args.csv_path:
        raise ValueError("An output file path (pickle or CSV) is required.")

    crawl_delay = (1, 2)
    logger = make_logger()
    crawler = Crawler(start_url=args.url,
                      stop_condition=StopCondition.depth_is_reached(args.max_depth),
                      url_filters=[url_filter],
                      user_agent=UserAgent.none(),
                      timeout=None,
                      logger=logger,
                      delay=Delay.uniform(*crawl_delay))
    crawl: Crawl = asyncio.run(crawler.crawl())

    if args.pickle_path:
        PickleWriter(crawl).write(args.pickle_path)
    if args.csv_path:
        CSVWriter(crawl).write(args.csv_path)
elif test == "test_dom_family":
    dom_family("a.com", "b.a.com")
    dom_family("x.com", "x.com/login")

elif test == "crawler":
    url = "http://forum.18.217.9.142.xip.io/login/"
    # url = "http://w.com"
    # url = "http://email.kumo.x10host.com"
    # url = "mizio.herokuapp.com/test"
    # url = "http://18.217.9.142.xip.io/"
    method = "bfs"
    agent = HTTP_UA_CHROME
    depth = 20
    pages = 100
    crawler = Crawler()
    crawler.crawl(url, method, agent, depth, pages)

# configs
elif test == "configs":
    config = configs.DEFAULT_CONFIGS
    for val in config:
        print("%s : %s" % (val, config[val]))

# http_local
elif test == "http_local":
    host = "localhost"
    port = 5000
    url = "/"
    ua = "chrome"

    # Test GET
    request = HttpRequest(host, port, "GET")