def test_add_duplicate():
    """ Adding a link that is already queued should not grow the queue. """
    new_crawler = crawler.Crawler(["test-url-1"], max_size=10)
    assert new_crawler.link_queue.qsize() == 1
    new_crawler.add_new_link("test-url-2", 1)
    assert new_crawler.link_queue.qsize() == 2
    new_crawler.add_new_link("test-url-2", 1)
    assert new_crawler.link_queue.qsize() == 2
def test_crawler_cloud_args():
    """ GCS and BigQuery arguments are split into project and bucket/dataset. """
    new_crawler = crawler.Crawler(["test-url"], gcs="gcs-project.gcs-bucket", bq="bq-project.bq-dataset")
    assert new_crawler.log.gcs_project == "gcs-project"
    assert new_crawler.log.gcs_bucket == "gcs-bucket"
    assert new_crawler.log.bq_project == "bq-project"
    assert new_crawler.log.bq_dataset == "bq-dataset"
def start_crawler():
    """ Parses command-line args and starts the crawler. """
    parser = argparse.ArgumentParser(description="SQL Web Crawler")
    parser.add_argument("urls", help="A space-separated list of URLs to be crawled", nargs='+')
    parser.add_argument("--max_depth", help="The max depth of the crawler (default=3)", type=int, default=3)
    parser.add_argument("--max_size", help="The maximum number of links to be crawled (default=100)", type=int, default=100)
    parser.add_argument("--cloud_storage", help="Project and bucket to store in GCS. Formatted as project_id.bucket (default=None)", default=None)
    parser.add_argument("--bigquery", help="Project and dataset to store in BQ. Formatted as project_id.dataset (default=None)", default=None)
    args = parser.parse_args()
    new_crawler = crawler.Crawler(args.urls, max_size=args.max_size, max_depth=args.max_depth, gcs=args.cloud_storage, bq=args.bigquery)
    new_crawler.crawl()
def start_crawler():
    """ Parses command-line args and starts the crawler. """
    parser = argparse.ArgumentParser(description="SQL Web Crawler")
    parser.add_argument("urls", help="A space-separated list of URLs to be crawled", nargs='+')
    parser.add_argument("--max_depth", help="The max depth of the crawler (default=3)", type=int, default=3)
    parser.add_argument("--max_size", help="The maximum number of links to be crawled (default=100)", type=int, default=100)
    parser.add_argument("--cloud_storage", help="Project and bucket to store in GCS. Formatted as project_id.bucket (default=None)", default=None)
    parser.add_argument("--bigquery", help="Project and dataset to store in BQ. Formatted as project_id.dataset (default=None)", default=None)
    parser.add_argument("--stream", help="Stream data to BigQuery instead of saving locally. This is a flag and takes no value. Requires --bigquery to be set as well", action='store_true', default=False)
    args = parser.parse_args()
    if args.stream and args.bigquery is None:
        logging.error("Need to specify BigQuery table if streaming data")
        return
    new_crawler = crawler.Crawler(args.urls, max_size=args.max_size, max_depth=args.max_depth, gcs=args.cloud_storage, bq=args.bigquery, stream=args.stream)
    new_crawler.crawl()
def start_crawler():
    """ Reads URLs from the command line and starts the crawler. """
    urls = sys.argv[1:]
    new_crawler = crawler.Crawler(urls, max_size=50)
    new_crawler.crawl()
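# Hedged usage sketch (an assumption, not taken from the source): any of the
# start_crawler variants above would typically sit behind a standard
# entry-point guard so the module can be run directly, e.g.
#
#     python main.py https://example.com --max_depth 2 --max_size 50
#
# The module name "main.py" and the example URL are placeholders for
# illustration only.

if __name__ == "__main__":
    start_crawler()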
def test_invalid_request():
    """ A malformed or failing request returns None instead of raising. """
    new_crawler = crawler.Crawler(["test-url"], max_size=10)
    assert new_crawler.get_html("incorrect-request") is None
def test_crawler_args():
    """ Positional max_depth and max_size arguments are stored on the crawler. """
    new_crawler = crawler.Crawler(["test-url"], 5, 10)
    assert new_crawler.max_depth == 5
    assert new_crawler.max_size == 10
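# Hedged sketch (an assumption, not the project's actual crawler.Crawler):
# a minimal shape that would satisfy the tests above -- seed URLs go into
# link_queue, duplicates are tracked in a visited set, "project.bucket" /
# "project.dataset" strings are split on the first dot, and get_html returns
# None when a request fails. Attribute and method names mirror the tests;
# everything else (including the class and helper names) is illustrative.

import queue


class _LogConfigSketch:
    """ Holds the parsed GCS / BigQuery destinations the tests check. """
    def __init__(self, gcs=None, bq=None):
        self.gcs_project, self.gcs_bucket = gcs.split(".", 1) if gcs else (None, None)
        self.bq_project, self.bq_dataset = bq.split(".", 1) if bq else (None, None)


class _CrawlerSketch:
    """ Minimal stand-in for crawler.Crawler, for illustration only. """
    def __init__(self, urls, max_depth=3, max_size=100, gcs=None, bq=None, stream=False):
        self.max_depth = max_depth
        self.max_size = max_size
        self.stream = stream
        self.log = _LogConfigSketch(gcs, bq)
        self.link_queue = queue.Queue()
        self.seen = set()
        for url in urls:
            self.add_new_link(url, 0)

    def add_new_link(self, url, depth):
        # Duplicate URLs are ignored, so re-adding a link leaves the queue size unchanged.
        if url not in self.seen and self.link_queue.qsize() < self.max_size:
            self.seen.add(url)
            self.link_queue.put((url, depth))

    def get_html(self, url):
        # A malformed or failing request yields None rather than raising.
        try:
            from urllib.request import urlopen
            with urlopen(url, timeout=10) as response:
                return response.read().decode("utf-8", errors="replace")
        except Exception:
            return None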