def run_default_profile_creation_control(anticv_on=False, is_monitoring=False, **kwargs):
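    """Create the default control profile: set up the profile directories,
    launch a control driver from the matching starting-point profile, visit
    google.com once so Chrome writes the profile out, then block forever so
    the driver (and its profile) stays alive until the process is killed."""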

    thread_name = "Process-" + randomword(5)
    logger.debug("%s - Created thread name " % thread_name)

    profile_path = collect_core.PROFILE_STARTING_POINT__CONTROL
    if is_monitoring:
        profile_path = collect_core.PROFILE_STARTING_POINT_MONITORING__CONTROL
    if anticv_on:
        profile_path = collect_core.PROFILE_STARTING_POINT__ANTICV_ON__CONTROL
        if is_monitoring:
            profile_path = collect_core.PROFILE_STARTING_POINT_MONITORING__ANTICV_ON__CONTROL

    # make profile directories
    if not is_monitoring:
        collect_core.create_profile_directories()
    else:
        collect_core.create_profile_directories_monitoring()

    driver_name, driver = create_control_driver(profile_path=profile_path, **kwargs)

    collect_core._visit_domain(driver,
                               driver_name,
                               "https://google.com",
                               0,
                               None,
                               thread_name,
                               log_prefix="Main Driver")

    logger.debug("%s  - Thread will run forever" % str(thread_name))

    # Busy-wait to keep the driver and its profile alive until the process is killed.
    while True:
        continue

    # Unreachable, but kept so the cleanup path mirrors the other entry points.
    quit_drivers([(driver_name, driver)])

def update_filter_list_for_default_profiles(
    abp_extension_absolute_path="/home/ubuntu/github/adblockpluschrome-anticv/devenv.chrome",
    anticv_on=False, 
    thread_name=None,
    **kwargs):
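    """Start a virtual display, launch a variant driver from the default (or
    anti-CV) starting-point profile, update the Adblock Plus filter list via
    the extension's options page, then clean up the driver, the display, and
    any leftover Chrome Singleton* lock files in the profile."""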

    virtual_display = collect_core.start_virtual_screen()

    if not thread_name:
        thread_name = "Process-" + randomword(5)
        logger.debug("%s - Created thread name ", thread_name)

    # variant
    logger.debug("%s - Updating variant profile filter list", thread_name)

    profile_path = collect_core.PROFILE_STARTING_POINT
    if anticv_on:
        profile_path = collect_core.PROFILE_STARTING_POINT__ANTICV_ON
    
    variant_driver_name, variant_driver = create_variant_driver(
        profile_path=profile_path, **kwargs)

    selen_exception = None
    try:
        update_filter_list_adblock_plus_through_options(variant_driver, abp_extension_absolute_path)
    except Exception as e:
        logger.warning("Could not update filter list")
        selen_exception = e
    finally:
        time.sleep(5)
        quit_drivers([(variant_driver_name, variant_driver)])
        # stop the virtual display
        if virtual_display is not None:
            collect_core.stop_virtual_screen(virtual_display)
        
        # clean up first before throwing exception
        if selen_exception:
            raise selen_exception

        chrome_singletons = glob.glob(profile_path + os.sep + "*Singleton*")
        for f in chrome_singletons:
            os.remove(f)

        time.sleep(5)
def create_new_profile(starting_profile_path,
                       profile_directory,
                       new_profile_prefix=None):
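    """Copy the starting profile into a new randomly named directory under
    profile_directory (optionally prefixed) and return the new profile path."""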
    dest_profile_name = randomword(10)
    if new_profile_prefix:
        dest_profile_name = new_profile_prefix + dest_profile_name

    dest_profile_path = profile_directory + dest_profile_name + os.sep
    logger.debug("Creating new profile " + dest_profile_name +
                 " FROM original profile " + starting_profile_path)

    try:
        shutil.copytree(starting_profile_path, dest_profile_path)
        return dest_profile_path
    except OSError as e:
        # If the error was caused because the source wasn't a directory,
        # fall back to copying it as a single file.
        if e.errno == errno.ENOTDIR:
            shutil.copy(starting_profile_path, dest_profile_path)
            return dest_profile_path
        else:
            logger.warning('Directory not copied. Error: %s' % e)
def plot_time_series_process(process_index,
                             all_rows,
                             output_csv_queue,
                             output_directory,
                             positive_label_domains=None,
                             negative_label_domains=None,
                             chunk_csv=None,
                             trials=4,
                             thread_limit=5,
                             chunk_size=50):
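    """Split all_rows into chunks and process them with up to thread_limit
    TimeseriesPrepThread workers at a time, pushing each prepared CSV row
    onto output_csv_queue. Runs inside one worker process (process_index)."""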

    logger.debug("Starting process " + str(process_index))

    # chunking
    THREADS_LIMIT = thread_limit  # maximum worker threads running at once in this process
    chunks = chunk(all_rows, n=chunk_size)

    chunk_count = len(chunks)
    current_threads = []
    chunk_index = 0
    chunk_completed = 0
    logger.debug("Processing chunk %d out of %d", chunk_index + 1, chunk_count)

    stop = False
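    # Keep launching threads until every chunk has been processed; when the
    # thread pool is full, poll for finished threads instead.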
    while chunk_completed < chunk_count and not stop:
        if len(current_threads) < THREADS_LIMIT and chunk_index < chunk_count:
            thread_name = "Thread-" + randomword(5)
            logger.debug("Creating " + thread_name)

            some_thread = TimeseriesPrepThread(
                chunk_index,
                thread_name,
                chunks[chunk_index],
                output_directory,
                output_csv_queue,
                chunk_csv=chunk_csv,
                positive_label_domains=positive_label_domains,
                negative_label_domains=negative_label_domains,
                trials=trials)

            # Start new Threads
            some_thread.start()
            logger.debug("Processing chunk %d out of %d with thread %s",
                         chunk_index + 1, chunk_count, some_thread.name)
            current_threads.append(some_thread)
            chunk_index += 1
            time.sleep(1)
        else:
            done_threads = []
            for t in current_threads:
                if not t.is_alive():
                    done_threads.append(t)

            if len(done_threads) == 0:
                logger.debug("Found no done threads")
                time.sleep(2)
            else:
                for done_thread in done_threads:
                    logger.debug(
                        "Done with thread %s and chunk index %d out of %d",
                        done_thread.name, done_thread.threadID + 1,
                        chunk_count)
                    chunk_completed += 1
                    current_threads.remove(done_thread)
                time.sleep(2)
    def run(self):
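        """For each assigned row, pair the control and variant DOM/webrequest
        files for every trial, prepare the time-series CSVs on disk, and push
        one summary row per URL onto the output CSV queue."""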
        logger.debug("Running thread %s", self.name)

        FILE_PATH_WR_CONTROL = "File Path WR Vanilla"
        FILE_PATH_WR_VARIANT = "File Path WR"
        FILE_PATH_DOM_CONTROL = "File Path DOM Vanilla"
        FILE_PATH_DOM_VARIANT = "File Path DOM"

        for row in self.rows:
            url = row[URL_CRAWLED]
            logger.info("%s - Processing URL %s", self.name, url)

            crawl_chunk = row["Chunk"]

            if self.chunk_csv and self.chunk_csv != crawl_chunk:
                continue

            cv_detect = row[CV_DETECT_TARGET_NAME]

            # when label filtering is on, skip URLs not in either label set
            if self.positive_label_domains or self.negative_label_domains:
                if url not in (self.positive_label_domains or []) and \
                        url not in (self.negative_label_domains or []):
                    continue

            ts_trials_json = dict()
            for trial_index in range(self.trials):
                trial_label = get_trial_label(trial_index)
                dom_control_name = FILE_PATH_DOM_CONTROL + " " + trial_label
                wr_control_name = FILE_PATH_WR_CONTROL + " " + trial_label

                dom_variant_name = FILE_PATH_DOM_VARIANT + " " + trial_label
                wr_variant_name = FILE_PATH_WR_VARIANT + " " + trial_label

                control_file_dom = row[dom_control_name] or ""
                control_file_wr = row[wr_control_name] or ""
                variant_file_dom = row[dom_variant_name] or ""
                variant_file_wr = row[wr_variant_name] or ""

                # proceed only if all four trial files exist
                random_part = randomword(10)
                if len(control_file_dom) > 0 and len(control_file_wr) > 0 and \
                    len(variant_file_dom) > 0 and len(variant_file_wr) > 0:

                    try:

                        chunk_path = self.output_directory + os.sep + crawl_chunk
                        if not os.path.isdir(chunk_path):
                            os.mkdir(chunk_path)

                        control_prep = chunk_path + os.sep + random_part + trial_label + "_control"
                        prep_timeseries_file_json(control_file_dom,
                                                  control_file_wr,
                                                  control_prep)

                        variant_prep = chunk_path + os.sep + random_part + trial_label + "_variant"
                        prep_timeseries_file_json(variant_file_dom,
                                                  variant_file_wr,
                                                  variant_prep)

                        control_prep_csv = control_prep + ".csv"
                        variant_prep_csv = variant_prep + ".csv"

                        if os.path.isfile(control_prep_csv) and os.path.isfile(
                                variant_prep_csv):
                            logger.debug(
                                "%s - Success creating control+variant json for time series %s",
                                self.name, url)
                            ts_trials_json[trial_label] = (control_prep_csv,
                                                           variant_prep_csv)
                            time.sleep(2)
                    except Exception as e:
                        logger.error(e)
                        logger.warning(
                            "%s - Could not create plot for %s, exception plotting",
                            self.name, url + " " + str(trial_label))
                        ts_trials_json[trial_label] = (None, None)

                else:
                    ts_trials_json[trial_label] = (None, None)
                    logger.warning(
                        "%s - Could not create plot for %s, not enough files",
                        self.name, url + " " + str(trial_label))

            # write out rows
            csv_row = [url, crawl_chunk, cv_detect]
            for trial_index in range(self.trials):
                trial_label = get_trial_label(trial_index)
                ctr_prep, var_prep = ts_trials_json.get(trial_label)
                csv_row.append(ctr_prep if ctr_prep else "")
                csv_row.append(var_prep if var_prep else "")

            # add to queue
            self.output_csv_queue.put(csv_row)
def _run_measurement_for_beyond_pages_only(
        driver,
        driver_name,
        domain,
        rank,
        pagesource_directory,
        screenshot_directory,
        profile_path=None,
        thread_name=None,
        sleep_time_sec=2,
        chrome_default_download_directory=None,
        is_control=True,
        scrollto_height=None,
        find_more_pages=False,
        random_suffix_input=None,
        domain_separator="__",
        trial_suffix="trial0",
        use_https=True,
        **kwargs):
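    """Visit one domain with a control or variant driver, simulate scrolling,
    optionally collect links beyond the landing page, trigger the filename JS
    event for the custom extensions, then quit the driver. Returns (success,
    scrollto_height, potential_pages, random_suffix, is_https)."""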

    logger.debug("%s - Running measurements..." % str(thread_name))

    potential_pages = []
    success = False
    original_domain = domain
    trunc_domain = domain
    random_suffix = random_suffix_input
    is_https = True
    if random_suffix is None:
        random_suffix = randomword(15)

    logger.debug("\t%s - Attempting to process domain %s and rank %s",
                str(thread_name), domain, rank)
    try:
        should_sleep = False

        # create drivers
        if driver is None:
            if is_control:
                driver_name, driver = create_control_driver(
                    profile_path=profile_path,
                    chrome_default_download_directory=
                    chrome_default_download_directory,
                    **kwargs)
            else:
                driver_name, driver = create_variant_driver(
                    profile_path=profile_path,
                    chrome_default_download_directory=
                    chrome_default_download_directory,
                    **kwargs)

        should_sleep, is_https = collect_core._visit_domain(
            driver,
            driver_name,
            domain,
            rank,
            pagesource_directory,
            thread_name,
            log_prefix="Main Driver",
            check_source_file=False,
            use_https=use_https)

        before_time = time.time()

        # simulate scrolling down and up
        scrollto_height = collect_core._simulate_scrolling(
            driver,
            driver_name,
            domain,
            thread_name,
            log_prefix="Main Driver",
            scrollto_height=scrollto_height)

        # sleeping
        if should_sleep:
            sleep_time = collect_core._get_sleep_time(
                before_time, collect_core.MEASUREMENT_TIMER)
            if sleep_time > 0:
                logger.debug("%s - Sleeping for %d seconds" %
                             (str(thread_name), sleep_time))
                time.sleep(sleep_time)
            else:
                time.sleep(1)

        # get domain from driver if it is really different
        domain_from_driver = driver.current_url
        logger.debug("%s - Original Domain: %s, Domain from Driver: %s" %
                     (str(thread_name), domain, domain_from_driver))
        if domain not in domain_from_driver:
            domain = domain_from_driver
            logger.debug("%s - Updated domain to %s " %
                         (str(thread_name), domain))

        # get more pages from this domain
        if find_more_pages:
            potential_pages = collect_core.find_beyond_landing_page(
                driver, domain)

        # change the domain to a different value we can use when saving files
        trunc_domain = (domain[:50] + domain_separator + random_suffix +
                        domain_separator + trial_suffix)
        logger.debug("%s - Using truncated domain %s" %
                     (str(thread_name), trunc_domain))
        # trigger an event for custom extensions to pick up
        collect_core.trigger_js_event_for_filename(driver, trunc_domain)

        # quit regular drivers
        quit_drivers([(driver_name, driver)])
        driver = None
        driver_name = None
        success = True

    except Exception as e:
        logger.warning(str(e))
        logger.warning(
            str(thread_name) + " - Could not crawl: " + original_domain)

        if driver:
            if len(trunc_domain) > 150:
                trunc_domain = (trunc_domain[:50] + domain_separator +
                                random_suffix + domain_separator)
                logger.debug("Truncating domain name to %s" % trunc_domain)

            collect_core._save_page_source_exception(
                [(driver_name, driver)],
                trunc_domain,
                pagesource_directory,
                thread_name=thread_name,
                original_domain=original_domain)

        logger.debug("%s - Sleeping before quitting drivers" % str(thread_name))
        time.sleep(sleep_time_sec)

        # cleanup
        quit_drivers([(driver_name, driver)])
        driver = None
        driver_name = None
        success = False

    time.sleep(1)

    logger.info("%s - Done with running measurements, domain %s, success: %s" %
                (str(thread_name), original_domain, str(success)))

    # return the scrollto_height to reuse later
    return success, scrollto_height, potential_pages, random_suffix, is_https
def run_data_collection(csv_file_path,
                        output_directory,
                        crawler_group_name,
                        anticv_on=False,
                        csv_delimiter=',',
                        start_index=0,
                        end_index=50,
                        sleep_time_sec=2,
                        use_dynamic_profile=True,
                        trials=4,
                        beyond_landing_pages=True,
                        beyond_landing_pages_only=False,
                        by_rank=True,
                        **kwargs):
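    """Read the (rank, domain) list from csv_file_path, select the slice given
    by start_index/end_index (by rank or by file order), then crawl it with
    process_sites inside a virtual display, retrying up to max_retry times on
    a WebDriverException."""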

    thread_name = "Process-" + randomword(5)
    logger.info("%s - Created thread name " % thread_name)

    # prepare directories
    pagesource_dir, screenshot_dir, downloads_dir = collect_core.create_directories(
        output_directory, crawler_group_name)

    site_file_path = csv_file_path

    logger.info("%s - Using list of websites from %s" %
                (thread_name, site_file_path))
    logger.info("%s - Parsing start_index %d to end index %d" %
                (thread_name, start_index, end_index))

    # read the site list, closing the file when done
    with open(site_file_path) as f:
        reader = csv.reader(f, delimiter=csv_delimiter)

        logger.debug("%s - Reading in data from file" % thread_name)
        file_data = []
        CSV_RANK_INDEX = 0
        CSV_URL_INDEX = 1
        file_data_chunk = []
        for row in reader:
            rank = int(row[CSV_RANK_INDEX])
            domain = row[CSV_URL_INDEX]
            if by_rank:
                if start_index <= rank <= end_index:
                    file_data_chunk.append((rank, domain))
            else:
                file_data.append((rank, domain))

    #print("Done reading in data from file")
    if not by_rank:
        # go by file index order and not rank
        file_data_chunk = file_data[start_index:end_index]

    retry = True
    max_retry = 3
    retry_count = 0

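    # Retry the whole crawl with a fresh virtual display if a WebDriverException
    # escapes process_sites, up to max_retry additional attempts.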
    while retry_count <= max_retry and retry:
        # start virtual display
        virtual_display = collect_core.start_virtual_screen()

        try:
            process_sites(file_data_chunk,
                          pagesource_dir,
                          screenshot_dir,
                          downloads_dir,
                          anticv_on=anticv_on,
                          use_dynamic_profile=use_dynamic_profile,
                          thread_name=thread_name,
                          trials=trials,
                          beyond_landing_pages=beyond_landing_pages,
                          beyond_landing_pages_only=beyond_landing_pages_only,
                          **kwargs)

            retry = False
        except WebDriverException as e:
            retry_count += 1
            logger.warning(e)
            logger.warning("%s - Major exception: Retrying crawl, attempt %d" %
                           (str(thread_name), retry_count))

        # stop the virtual display
        if virtual_display is not None:
            collect_core.stop_virtual_screen(virtual_display)

        time.sleep(30)

    logger.info("%s - Done" % str(thread_name))