Example #1
def get_test_config(
    self,
    data_dir: Optional[Path] = None,
    num_browsers: int = NUM_BROWSERS,
    display_mode: str = "headless",
) -> Tuple[ManagerParams, List[BrowserParams]]:
    """Load and return the default test parameters."""
    if not data_dir:
        data_dir = self.tmpdir
    assert data_dir is not None  # Mypy doesn't understand this without help
    manager_params = ManagerParams(num_browsers=num_browsers)
    browser_params = [BrowserParams() for _ in range(num_browsers)]
    manager_params.log_directory = data_dir
    manager_params.num_browsers = num_browsers
    for i in range(num_browsers):
        browser_params[i].display_mode = display_mode
    return manager_params, browser_params
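A sketch of how a test method might consume the returned configuration, following the TaskManager / SQLiteStorageProvider pattern from the later examples; the database filename and the visited URL are placeholders, not part of the original:

# Inside the same test class, assuming TaskManager, SQLiteStorageProvider,
# CommandSequence and GetCommand are imported as in the later examples.
manager_params, browser_params = self.get_test_config()
with TaskManager(
    manager_params,
    browser_params,
    SQLiteStorageProvider(manager_params.log_directory / "crawl-data.sqlite"),
    None,
) as manager:
    sequence = CommandSequence("http://example.com")
    sequence.append_command(GetCommand(url="http://example.com", sleep=1), timeout=60)
    manager.execute_command_sequence(sequence)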
Example #2
def get_test_config(
    self, data_dir="", num_browsers=NUM_BROWSERS, display_mode="headless"
):
    """Load and return the default test parameters."""
    if not data_dir:
        data_dir = self.tmpdir
    manager_params = ManagerParams(num_browsers=num_browsers)
    browser_params = [BrowserParams() for _ in range(num_browsers)]
    manager_params.data_directory = data_dir
    manager_params.log_directory = data_dir
    manager_params.num_browsers = num_browsers
    for i in range(num_browsers):
        browser_params[i].display_mode = display_mode
    manager_params.database_name = join(
        manager_params.data_directory, manager_params.database_name
    )
    return manager_params, browser_params
Example #3
def default_params(
    tmp_path: Path,
    num_browsers: int = NUM_BROWSERS
) -> Tuple[ManagerParams, List[BrowserParams]]:
    """Just a simple wrapper around task_manager.load_default_params"""

    manager_params = ManagerParams(
        num_browsers=num_browsers
    )  # num_browsers is necessary to let TaskManager know how many browsers to spawn

    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(num_browsers)
    ]
    manager_params.data_directory = tmp_path
    manager_params.log_directory = tmp_path
    for i in range(num_browsers):
        browser_params[i].display_mode = "headless"
    return manager_params, browser_params
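default_params reads like a pytest helper: tmp_path is the name of pytest's built-in temporary-directory fixture. A minimal sketch of exposing it as a fixture, assuming pytest is in use here (an assumption, not shown in the original):

import pytest

@pytest.fixture
def params(tmp_path):
    # Hand each test a fresh ManagerParams / BrowserParams pair rooted in a temp dir
    return default_params(tmp_path)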
Example #4
    # Record HTTP Requests and Responses
    browser_params[i].http_instrument = True
    # Record cookie changes
    browser_params[i].cookie_instrument = True
    # Record Navigations
    browser_params[i].navigation_instrument = True
    # Record JS Web API calls
    browser_params[i].js_instrument = True
    # Record the callstack of all WebRequests made
    browser_params[i].callstack_instrument = True
    # Record DNS resolution
    browser_params[i].dns_instrument = True

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params.data_directory = "~/Desktop/"
manager_params.log_directory = "~/Desktop/"

# memory_watchdog and process_watchdog are useful for large scale cloud crawls.
# Please refer to docs/Configuration.md#platform-configuration-options for more information
# manager_params.memory_watchdog = True
# manager_params.process_watchdog = True

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager(manager_params, browser_params)

# Visits the sites
for site in sites:

    # Parallelize the sites over the number of browsers set above.
    command_sequence = CommandSequence(
Example #5
def processSite(site):
    # The list of sites that we wish to crawl
    NUM_BROWSERS = 1
    sites = [site]

    # Loads the default ManagerParams
    # and NUM_BROWSERS copies of the default BrowserParams

    manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)
    ]

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i].http_instrument = True
        # Record cookie changes
        browser_params[i].cookie_instrument = True
        # Record Navigations
        browser_params[i].navigation_instrument = True
        # Record JS Web API calls
        browser_params[i].js_instrument = True
        # Record the callstack of all WebRequests made
        browser_params[i].callstack_instrument = True
        # Record DNS resolution
        browser_params[i].dns_instrument = True

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params.data_directory = Path("./datadir/")
    manager_params.log_directory = Path("./datadir/")

    # memory_watchdog and process_watchdog are useful for large scale cloud crawls.
    # Please refer to docs/Configuration.md#platform-configuration-options for more information
    # manager_params.memory_watchdog = True
    # manager_params.process_watchdog = True

    # Commands time out by default after 60 seconds
    with TaskManager(
            manager_params,
            browser_params,
            SQLiteStorageProvider(Path("./datadir/crawl-data.sqlite")),
            None,
    ) as manager:
        # Visits the sites
        for index, site in enumerate(sites):

            def callback(success: bool, val: str = site) -> None:
                print(
                    f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}"
                )

            # Parallelize the sites over the number of browsers set above.
            command_sequence = CommandSequence(
                site,
                site_rank=index,
                callback=callback,
            )
            # Start by visiting the page
            command_sequence.append_command(GetCommand(url=site, sleep=3),
                                            timeout=60)
            # Have a look at custom_command.py to see how to implement your own command
            command_sequence.append_command(LinkCountingCommand())
            # Run commands across all browsers (simple parallelization)
            manager.execute_command_sequence(command_sequence)
    return None
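processSite builds and tears down its own TaskManager per call, so each site is crawled as a complete, independent run. A minimal driver sketch; the URLs are placeholders for illustration:

if __name__ == "__main__":
    # Each call runs a full crawl of a single site, appending to ./datadir/crawl-data.sqlite
    for site in ["http://www.example.com", "http://www.princeton.edu"]:
        processSite(site)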
Example #6
    # Record HTTP Requests and Responses
    browser_params[i].http_instrument = True
    # Record cookie changes
    browser_params[i].cookie_instrument = True
    # Record Navigations
    browser_params[i].navigation_instrument = True
    # Record JS Web API calls
    browser_params[i].js_instrument = True
    # Record the callstack of all WebRequests made
    browser_params[i].callstack_instrument = True
    # Record DNS resolution
    browser_params[i].dns_instrument = True

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params.data_directory = Path("./datadir/")
manager_params.log_directory = Path("./datadir/")

# memory_watchdog and process_watchdog are useful for large scale cloud crawls.
# Please refer to docs/Configuration.md#platform-configuration-options for more information
# manager_params.memory_watchdog = True
# manager_params.process_watchdog = True

# Commands time out by default after 60 seconds
with TaskManager(
        manager_params,
        browser_params,
        SQLiteStorageProvider(Path("./datadir/crawl-data.sqlite")),
        None,
) as manager:
    # Visits the sites
    for index, site in enumerate(sites):
Example #7
    browser_params[i].navigation_instrument = NAVIGATION_INSTRUMENT
    browser_params[i].callstack_instrument = CALLSTACK_INSTRUMENT
    browser_params[i].js_instrument = JS_INSTRUMENT
    browser_params[i].js_instrument_settings = JS_INSTRUMENT_SETTINGS
    if SAVE_CONTENT == "1":
        browser_params[i].save_content = True
    elif SAVE_CONTENT == "0":
        browser_params[i].save_content = False
    else:
        browser_params[i].save_content = SAVE_CONTENT
    if PREFS:
        browser_params[i].prefs = json.loads(PREFS)

# Manager configuration
manager_params.data_directory = Path("~/Desktop/") / CRAWL_DIRECTORY
manager_params.log_directory = Path("~/Desktop/") / CRAWL_DIRECTORY

structured = GcsStructuredProvider(
    project=GCP_PROJECT,
    bucket_name=GCS_BUCKET,
    base_path=CRAWL_DIRECTORY,
    token=AUTH_TOKEN,
)
unstructured = GcsUnstructuredProvider(
    project=GCP_PROJECT,
    bucket_name=GCS_BUCKET,
    base_path=CRAWL_DIRECTORY + "/data",
    token=AUTH_TOKEN,
)
# Instantiates the measurement platform
# Commands time out by default after 60 seconds
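The excerpt cuts off right after these comments, before the TaskManager is actually created. Based on the constructor pattern in the other examples (and the logger_kwargs usage in Example #8), the missing part presumably looks roughly like this; a sketch, not the original code, and `sites` is assumed to be defined earlier in the script:

with TaskManager(
    manager_params,
    browser_params,
    structured,
    unstructured,
    logger_kwargs=LOGGER_SETTINGS,  # LOGGER_SETTINGS as in Example #8
) as manager:
    for index, site in enumerate(sites):
        command_sequence = CommandSequence(site, site_rank=index)
        command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60)
        manager.execute_command_sequence(command_sequence)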
Example #8
    browser_params[i].navigation_instrument = NAVIGATION_INSTRUMENT
    browser_params[i].callstack_instrument = CALLSTACK_INSTRUMENT
    browser_params[i].js_instrument = JS_INSTRUMENT
    browser_params[i].js_instrument_settings = JS_INSTRUMENT_SETTINGS
    if SAVE_CONTENT == "1":
        browser_params[i].save_content = True
    elif SAVE_CONTENT == "0":
        browser_params[i].save_content = False
    else:
        browser_params[i].save_content = SAVE_CONTENT
    if PREFS:
        browser_params[i].prefs = json.loads(PREFS)

# Manager configuration
manager_params.data_directory = "~/Desktop/%s/" % CRAWL_DIRECTORY
manager_params.log_directory = "~/Desktop/%s/" % CRAWL_DIRECTORY
manager_params.output_format = "s3"
manager_params.s3_bucket = S3_BUCKET
manager_params.s3_directory = CRAWL_DIRECTORY

# Allow the use of localstack's mock s3 service
S3_ENDPOINT = os.getenv("S3_ENDPOINT")
if S3_ENDPOINT:
    boto3.DEFAULT_SESSION = LocalS3Session(endpoint_url=S3_ENDPOINT)
    manager_params.s3_bucket = local_s3_bucket(boto3.resource("s3"), name=S3_BUCKET)

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager(manager_params, browser_params, logger_kwargs=LOGGER_SETTINGS)

# At this point, Sentry should be initiated
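The snippet ends where Sentry would be initialized. A minimal sketch of what that typically looks like, assuming the sentry_sdk package and a SENTRY_DSN environment variable (both assumptions; neither appears in the original snippet):

import os
import sentry_sdk

SENTRY_DSN = os.getenv("SENTRY_DSN")  # hypothetical environment variable
if SENTRY_DSN:
    sentry_sdk.init(dsn=SENTRY_DSN)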
Example #9
def crawl(sites, db_filename):
    '''
    sites is the list of sites that we wish to crawl
    db_filename is the file name of the output database
    '''

    # Loads the default ManagerParams
    # and NUM_BROWSERS copies of the default BrowserParams
    NUM_BROWSERS = 12

    manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)
    ]

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i].http_instrument = True
        # Record cookie changes
        browser_params[i].cookie_instrument = True
        # Record Navigations
        browser_params[i].navigation_instrument = True
        # Record JS Web API calls
        browser_params[i].js_instrument = True
        # Record the callstack of all WebRequests made
        browser_params[i].callstack_instrument = True
        # Record DNS resolution
        browser_params[i].dns_instrument = True

        browser_params[i].bot_mitigation = True

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params.data_directory = Path("./datadir/")
    manager_params.log_directory = Path("./datadir/")

    # Commands time out by default after 60 seconds
    with TaskManager(
            manager_params,
            browser_params,
            SQLiteStorageProvider(
                Path("./datadir/{}.sqlite".format(db_filename))),
            None,
    ) as manager:
        # Visits the sites
        for index, site in enumerate(sites):

            def callback(success: bool, val: str = site) -> None:
                print(
                    f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}"
                )

            # Parallelize the sites over the number of browsers set above.
            command_sequence = CommandSequence(
                site,
                site_rank=index,
                reset=True,
                callback=callback,
            )

            # Start by visiting the page
            command_sequence.append_command(GetCommand(url=site, sleep=3),
                                            timeout=60)

            # Run commands across all browsers (simple parallelization)
            manager.execute_command_sequence(command_sequence)
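A minimal sketch of invoking this crawl function; the URL list and database name are placeholders for illustration:

if __name__ == "__main__":
    sites = ["http://www.example.com", "http://www.princeton.edu"]
    crawl(sites, "example_crawl")  # writes ./datadir/example_crawl.sqlite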