Example #1
def test_add_duplicate():
    new_crawler = crawler.Crawler(["test-url-1"], max_size=10)
    assert new_crawler.link_queue.qsize() == 1
    new_crawler.add_new_link("test-url-2", 1)
    assert new_crawler.link_queue.qsize() == 2
    new_crawler.add_new_link("test-url-2", 1)
    assert new_crawler.link_queue.qsize() == 2
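
The duplicate check in this test implies the crawler remembers which URLs it has already queued. Below is a minimal sketch of that behaviour, assuming a hypothetical seen-set next to the queue; the real Crawler internals are not shown in these examples.

import queue

class DedupQueue:
    """Illustrative only: enqueue each link at most once."""

    def __init__(self, seed_urls):
        self.link_queue = queue.Queue()
        self._seen = set()
        for url in seed_urls:
            self.add_new_link(url, 0)

    def add_new_link(self, url, depth):
        # Ignore URLs that have already been queued.
        if url in self._seen:
            return
        self._seen.add(url)
        self.link_queue.put((url, depth))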
Example #2
def test_crawler_cloud_args():
    new_crawler = crawler.Crawler(["test-url"],
                                  gcs="gcs-project.gcs-bucket",
                                  bq="bq-project.bq-dataset")
    assert new_crawler.log.gcs_project == "gcs-project"
    assert new_crawler.log.gcs_bucket == "gcs-bucket"
    assert new_crawler.log.bq_project == "bq-project"
    assert new_crawler.log.bq_dataset == "bq-dataset"
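
The assertions above imply that the "project.bucket" and "project.dataset" strings are split on the first dot. A minimal sketch of that parsing, with a hypothetical helper name (the project's actual parsing code is not shown here):

def split_cloud_arg(arg):
    # "gcs-project.gcs-bucket" -> ("gcs-project", "gcs-bucket")
    project, _, target = arg.partition(".")
    return project, target

assert split_cloud_arg("gcs-project.gcs-bucket") == ("gcs-project", "gcs-bucket")
assert split_cloud_arg("bq-project.bq-dataset") == ("bq-project", "bq-dataset")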
Example #3
def start_crawler():
    """ Parses command-line args and starts the crawler.
    """

    parser = argparse.ArgumentParser(description="SQL Web Crawler")
    parser.add_argument("urls", help="A space-separated list of URLs to be crawled", nargs='+')
    parser.add_argument("--max_depth", help="The max depth of the crawler (default=3)", type=int, default=3)
    parser.add_argument("--max_size", help="The maximum number of links to be crawled (default=100)", type=int, default=100)
    parser.add_argument("--cloud_storage", help="Project and bucket to store in GCS. Formatted as project_id.bucket (default=None)", default=None)
    parser.add_argument("--bigquery", help="Project and dataset to store in BQ. Formatted as project_id.dataset (default=None)", default=None)
    args = parser.parse_args()
    new_crawler = crawler.Crawler(args.urls, max_size=args.max_size, max_depth=args.max_depth,
                                  gcs=args.cloud_storage, bq=args.bigquery)
    new_crawler.crawl()
Example #4
def start_crawler():
    """ Parses command-line args and starts the crawler.
    """

    parser = argparse.ArgumentParser(description="SQL Web Crawler")
    parser.add_argument("urls", help="A space-separated list of URLs to be crawled", nargs='+')
    parser.add_argument("--max_depth", help="The max depth of the crawler (default=3)", type=int, default=3)
    parser.add_argument("--max_size", help="The maximum number of links to be crawled (default=100)", type=int, default=100)
    parser.add_argument("--cloud_storage", help="Project and bucket to store in GCS. Formatted as project_id.bucket (default=None)", default=None)
    parser.add_argument("--bigquery", help="Project and dataset to store in BQ. Formatted as project_id.dataset (default=None)", default=None)
    parser.add_argument("--stream", help="Only stream data instead of saving locally. Simply put '--stream' to set this; no variable required afterward. Requires --bigquery variable to be set as well", action='store_true', default=False)
    args = parser.parse_args()

    if args.stream and args.bigquery is None:
        logging.error("Need to specify BigQuery table if streaming data")
        return

    new_crawler = crawler.Crawler(args.urls, max_size=args.max_size, max_depth=args.max_depth,
                                  gcs=args.cloud_storage, bq=args.bigquery, stream=args.stream)
    new_crawler.crawl()
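
For reference, the snippet below re-creates the same argument definitions as the parser above and feeds them a sample argv list, so the exact script name can be left out; the sample URL and values are made up for illustration only.

import argparse

parser = argparse.ArgumentParser(description="SQL Web Crawler")
parser.add_argument("urls", nargs='+')
parser.add_argument("--max_depth", type=int, default=3)
parser.add_argument("--max_size", type=int, default=100)
parser.add_argument("--cloud_storage", default=None)
parser.add_argument("--bigquery", default=None)
parser.add_argument("--stream", action='store_true', default=False)

args = parser.parse_args([
    "https://example.com",
    "--max_depth", "2",
    "--bigquery", "bq-project.bq-dataset",
    "--stream",
])
# args.urls == ["https://example.com"], args.max_depth == 2,
# args.bigquery == "bq-project.bq-dataset", args.stream is True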
Example #5
def start_crawler():
    # Every argument after the script name is treated as a seed URL.
    urls = sys.argv[1:]
    new_crawler = crawler.Crawler(urls, max_size=50)
    new_crawler.crawl()
Example #6
def test_invalid_request():
    new_crawler = crawler.Crawler(["test-url"], max_size=10)
    assert new_crawler.get_html("incorrect-request") is None
Example #7
def test_crawler_args():
    new_crawler = crawler.Crawler(["test-url"], 5, 10)
    assert new_crawler.max_depth == 5
    assert new_crawler.max_size == 10
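
Taken together, the examples suggest a constructor along the following lines. This is a hypothetical signature inferred only from the calls shown above (defaults mirror the argparse defaults); the project's actual Crawler may differ and the rest of its setup is omitted.

class Crawler:
    def __init__(self, urls, max_depth=3, max_size=100, gcs=None, bq=None, stream=False):
        # Positional order matches Crawler(["test-url"], 5, 10) in the test above.
        self.urls = urls
        self.max_depth = max_depth
        self.max_size = max_size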