def run_default_profile_creation_control(anticv_on=False, is_monitoring=False, **kwargs):
    thread_name = "Process-" + randomword(5)
    logger.debug("%s - Created thread name" % thread_name)

    profile_path = collect_core.PROFILE_STARTING_POINT__CONTROL
    if is_monitoring:
        profile_path = collect_core.PROFILE_STARTING_POINT_MONITORING__CONTROL

    if anticv_on:
        profile_path = collect_core.PROFILE_STARTING_POINT__ANTICV_ON__CONTROL
        if is_monitoring:
            profile_path = collect_core.PROFILE_STARTING_POINT_MONITORING__ANTICV_ON__CONTROL

    # make profile directories
    if not is_monitoring:
        collect_core.create_profile_directories()
    else:
        collect_core.create_profile_directories_monitoring()

    driver_name, driver = create_control_driver(profile_path=profile_path, **kwargs)

    collect_core._visit_domain(driver,
                               driver_name,
                               "https://google.com",
                               0,
                               None,
                               thread_name,
                               log_prefix="Main Driver")

    logger.debug("%s - Thread will run forever" % str(thread_name))
    while True:
        continue

    # only reached if the loop above is ever broken
    quit_drivers([(driver_name, driver)])
def update_filter_list_for_default_profiles(
        abp_extension_absolute_path="/home/ubuntu/github/adblockpluschrome-anticv/devenv.chrome",
        anticv_on=False,
        thread_name=None,
        **kwargs):
    virtual_display = collect_core.start_virtual_screen()

    if not thread_name:
        thread_name = "Process-" + randomword(5)
        logger.debug("%s - Created thread name", thread_name)

    # variant
    logger.debug("%s - Updating variant profile filter list", thread_name)
    profile_path = collect_core.PROFILE_STARTING_POINT
    if anticv_on:
        profile_path = collect_core.PROFILE_STARTING_POINT__ANTICV_ON

    variant_driver_name, variant_driver = create_variant_driver(
        profile_path=profile_path, **kwargs)

    selen_exception = None
    try:
        update_filter_list_adblock_plus_through_options(variant_driver,
                                                        abp_extension_absolute_path)
    except Exception as e:
        logger.warning("Could not update filter list")
        selen_exception = e
    finally:
        time.sleep(5)
        quit_drivers([(variant_driver_name, variant_driver)])

    # stop the virtual display
    if virtual_display is not None:
        collect_core.stop_virtual_screen(virtual_display)

    # clean up first before throwing the exception
    if selen_exception:
        raise selen_exception

    # remove Chrome singleton lock files so the profile can be reused
    chrome_singletons = glob.glob(profile_path + os.sep + "*" + "Singleton" + "*")
    for f in chrome_singletons:
        os.remove(f)

    time.sleep(5)
def create_new_profile(starting_profile_path, profile_directory, new_profile_prefix=None):
    dest_profile_name = randomword(10)
    if new_profile_prefix:
        dest_profile_name = new_profile_prefix + dest_profile_name

    dest_profile_path = profile_directory + dest_profile_name + os.sep
    logger.debug("Creating new profile " + dest_profile_name +
                 " FROM original profile " + starting_profile_path)
    try:
        shutil.copytree(starting_profile_path, dest_profile_path)
        return dest_profile_path
    except OSError as e:
        # If the error was caused because the source wasn't a directory
        if e.errno == errno.ENOTDIR:
            shutil.copy(starting_profile_path, dest_profile_path)
            return dest_profile_path
        else:
            logger.warning('Directory not copied. Error: %s' % e)
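# Hypothetical usage sketch (not part of the original module): clone a seeded
# Chrome profile into a per-crawl directory before launching a driver. The paths
# and prefix below are placeholders; the real starting-point profiles come from
# the collect_core.PROFILE_STARTING_POINT* constants used elsewhere in this file.
#
#   new_profile = create_new_profile("/path/to/seed_profile/",
#                                    "/path/to/profiles/",
#                                    new_profile_prefix="control_")
#   driver_name, driver = create_control_driver(profile_path=new_profile)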
def plot_time_series_process(process_index,
                             all_rows,
                             output_csv_queue,
                             output_directory,
                             positive_label_domains=None,
                             negative_label_domains=None,
                             chunk_csv=None,
                             trials=4,
                             thread_limit=5,
                             chunk_size=50):
    logger.debug("Starting process " + str(process_index))

    # chunking
    THREADS_LIMIT = thread_limit  # how many threads may run at once
    chunks = chunk(all_rows, n=chunk_size)
    chunk_count = len(chunks)

    current_threads = []
    chunk_index = 0
    chunk_completed = 0
    logger.debug("Processing chunk %d out of %d", chunk_index + 1, chunk_count)

    stop = False
    while chunk_completed < chunk_count and not stop:
        if len(current_threads) < THREADS_LIMIT and chunk_index < chunk_count:
            thread_name = "Thread-" + randomword(5)
            logger.debug("Creating " + thread_name)
            some_thread = TimeseriesPrepThread(
                chunk_index,
                thread_name,
                chunks[chunk_index],
                output_directory,
                output_csv_queue,
                chunk_csv=chunk_csv,
                positive_label_domains=positive_label_domains,
                negative_label_domains=negative_label_domains,
                trials=trials)

            # start the new thread
            some_thread.start()
            logger.debug("Processing chunk %d out of %d with thread %s",
                         chunk_index + 1, chunk_count, some_thread.name)

            current_threads.append(some_thread)
            chunk_index += 1
            time.sleep(1)
        else:
            done_threads = []
            for t in current_threads:
                if not t.is_alive():
                    done_threads.append(t)

            if len(done_threads) == 0:
                logger.debug("Found no done threads")
                time.sleep(2)
            else:
                for done_thread in done_threads:
                    logger.debug(
                        "Done with thread %s and chunk index %d out of %d",
                        done_thread.name, done_thread.threadID + 1, chunk_count)
                    chunk_completed += 1
                    current_threads.remove(done_thread)
                time.sleep(2)
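# Illustrative sketch only: the `chunk` helper used above is assumed to split the
# rows into consecutive sublists of at most `n` items (one sublist per
# TimeseriesPrepThread). Its real implementation lives elsewhere in the project;
# this hypothetical version documents the expected behavior of the call
# `chunk(all_rows, n=chunk_size)`.
def _chunk_sketch(rows, n=50):
    # e.g. _chunk_sketch([1, 2, 3, 4, 5], n=2) -> [[1, 2], [3, 4], [5]]
    return [rows[i:i + n] for i in range(0, len(rows), n)]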
def run(self):
    logger.debug("Running thread %s", self.name)

    FILE_PATH_WR_CONTROL = "File Path WR Vanilla"
    FILE_PATH_WR_VARIANT = "File Path WR"
    FILE_PATH_DOM_CONTROL = "File Path DOM Vanilla"
    FILE_PATH_DOM_VARIANT = "File Path DOM"

    for row in self.rows:
        url = row[URL_CRAWLED]
        logger.info("%s - Processing URL %s", self.name, url)

        crawl_chunk = row["Chunk"]
        if self.chunk_csv and self.chunk_csv != crawl_chunk:
            continue

        cv_detect = row[CV_DETECT_TARGET_NAME]

        if self.positive_label_domains or self.negative_label_domains:
            if url not in self.positive_label_domains and url not in self.negative_label_domains:
                continue

        ts_trials_json = dict()
        for trial_index in range(self.trials):
            trial_label = get_trial_label(trial_index)
            dom_control_name = FILE_PATH_DOM_CONTROL + " " + trial_label
            wr_control_name = FILE_PATH_WR_CONTROL + " " + trial_label
            dom_variant_name = FILE_PATH_DOM_VARIANT + " " + trial_label
            wr_variant_name = FILE_PATH_WR_VARIANT + " " + trial_label

            control_file_dom = row[dom_control_name] or ""
            control_file_wr = row[wr_control_name] or ""
            variant_file_dom = row[dom_variant_name] or ""
            variant_file_wr = row[wr_variant_name] or ""

            # only proceed if all four files exist
            random_part = randomword(10)
            if len(control_file_dom) > 0 and len(control_file_wr) > 0 and \
                    len(variant_file_dom) > 0 and len(variant_file_wr) > 0:
                try:
                    chunk_path = self.output_directory + os.sep + crawl_chunk
                    if not os.path.isdir(chunk_path):
                        os.mkdir(chunk_path)

                    control_prep = chunk_path + os.sep + random_part + trial_label + "_control"
                    prep_timeseries_file_json(control_file_dom, control_file_wr, control_prep)

                    variant_prep = chunk_path + os.sep + random_part + trial_label + "_variant"
                    prep_timeseries_file_json(variant_file_dom, variant_file_wr, variant_prep)

                    control_prep_csv = control_prep + ".csv"
                    variant_prep_csv = variant_prep + ".csv"
                    if os.path.isfile(control_prep_csv) and os.path.isfile(variant_prep_csv):
                        logger.debug(
                            "%s - Success creating control+variant json for time series %s",
                            self.name, url)
                        ts_trials_json[trial_label] = (control_prep_csv, variant_prep_csv)

                    time.sleep(2)
                except Exception as e:
                    logger.error(e)
                    logger.warning(
                        "%s - Could not create plot for %s, exception plotting",
                        self.name, url + " " + str(trial_label))
                    ts_trials_json[trial_label] = (None, None)
            else:
                ts_trials_json[trial_label] = (None, None)
                logger.warning(
                    "%s - Could not create plot for %s, not enough files",
                    self.name, url + " " + str(trial_label))

        # write out rows
        csv_row = [url, crawl_chunk, cv_detect]
        for trial_index in range(self.trials):
            trial_label = get_trial_label(trial_index)
            ctr_prep, var_prep = ts_trials_json.get(trial_label, (None, None))
            csv_row.append(ctr_prep if ctr_prep else "")
            csv_row.append(var_prep if var_prep else "")

        # add to queue
        self.output_csv_queue.put(csv_row)
def _run_measurement_for_beyond_pages_only(driver,
                                           driver_name,
                                           domain,
                                           rank,
                                           pagesource_directory,
                                           screenshot_directory,
                                           profile_path=None,
                                           thread_name=None,
                                           sleep_time_sec=2,
                                           chrome_default_download_directory=None,
                                           is_control=True,
                                           scrollto_height=None,
                                           find_more_pages=False,
                                           random_suffix_input=None,
                                           domain_separator="__",
                                           trial_suffix="trial0",
                                           use_https=True,
                                           **kwargs):
    logger.debug("%s - Running measurements..." % str(thread_name))

    potential_pages = []
    success = False
    original_domain = domain
    trunc_domain = domain
    random_suffix = random_suffix_input
    is_https = True
    if random_suffix is None:
        random_suffix = randomword(15)

    logger.debug("\t%s - Attempting to process domain %s and rank %s",
                 str(thread_name), domain, rank)
    try:
        should_sleep = False

        # create drivers
        if driver is None:
            if is_control:
                driver_name, driver = create_control_driver(
                    profile_path=profile_path,
                    chrome_default_download_directory=chrome_default_download_directory,
                    **kwargs)
            else:
                driver_name, driver = create_variant_driver(
                    profile_path=profile_path,
                    chrome_default_download_directory=chrome_default_download_directory,
                    **kwargs)

        should_sleep, is_https = collect_core._visit_domain(driver,
                                                            driver_name,
                                                            domain,
                                                            rank,
                                                            pagesource_directory,
                                                            thread_name,
                                                            log_prefix="Main Driver",
                                                            check_source_file=False,
                                                            use_https=use_https)
        before_time = time.time()

        # simulate scrolling down and up
        scrollto_height = collect_core._simulate_scrolling(
            driver,
            driver_name,
            domain,
            thread_name,
            log_prefix="Main Driver",
            scrollto_height=scrollto_height)

        # sleeping
        if should_sleep:
            sleep_time = collect_core._get_sleep_time(before_time,
                                                      collect_core.MEASUREMENT_TIMER)
            if sleep_time > 0:
                logger.debug("%s - Sleeping for %d seconds" %
                             (str(thread_name), sleep_time))
                time.sleep(sleep_time)
        else:
            time.sleep(1)

        # get domain from driver if it is really different
        domain_from_driver = driver.current_url
        logger.debug("%s - Original Domain: %s, Domain from Driver: %s" %
                     (str(thread_name), domain, domain_from_driver))
        if domain not in domain_from_driver:
            domain = domain_from_driver
            logger.debug("%s - Updated domain to %s" % (str(thread_name), domain))

        # get more pages from this domain
        if find_more_pages:
            potential_pages = collect_core.find_beyond_landing_page(driver, domain)

        # change the domain to a different value we can use to save files
        trunc_domain = domain[:50] + domain_separator + random_suffix + domain_separator + trial_suffix
        logger.debug("%s - Using truncated domain %s" % (str(thread_name), trunc_domain))

        # trigger an event for custom extensions to pick up
        collect_core.trigger_js_event_for_filename(driver, trunc_domain)

        # quit regular drivers
        quit_drivers([(driver_name, driver)])
        driver = None
        driver_name = None
        success = True
    except Exception as e:
        logger.warning(str(e))
        logger.warning(str(thread_name) + " - Could not crawl: " + original_domain)
        if driver:
            if len(trunc_domain) > 150:
                trunc_domain = trunc_domain[:50] + domain_separator + random_suffix + domain_separator
                logger.debug("truncating domain name to %s" % trunc_domain)
            collect_core._save_page_source_exception([(driver_name, driver)],
                                                     trunc_domain,
                                                     pagesource_directory,
                                                     thread_name=thread_name,
                                                     original_domain=original_domain)

        logger.debug("%s - Sleeping before quitting drivers" % str(thread_name))
        time.sleep(sleep_time_sec)

        # cleanup
        quit_drivers([(driver_name, driver)])
        driver = None
        driver_name = None
        success = False

    time.sleep(1)
    logger.info("%s - Done with running measurements, domain %s, success: %s" %
                (str(thread_name), original_domain, str(success)))

    # return the scrollto_height to reuse later
    return success, scrollto_height, potential_pages, random_suffix, is_https
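# Illustrative caller sketch (assumed, not the project's actual code): the
# returned scrollto_height and random_suffix are meant to be fed back into the
# next trial so every trial scrolls the same distance and shares one file
# suffix. The loop, variable names, and parameter values here are placeholders.
#
#   scroll_height, suffix = None, None
#   for trial_index in range(trials):
#       success, scroll_height, pages, suffix, is_https = \
#           _run_measurement_for_beyond_pages_only(
#               None, None, domain, rank, pagesource_dir, screenshot_dir,
#               profile_path=profile_path, scrollto_height=scroll_height,
#               random_suffix_input=suffix, trial_suffix="trial%d" % trial_index)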
def run_data_collection(csv_file_path,
                        output_directory,
                        crawler_group_name,
                        anticv_on=False,
                        csv_delimiter=',',
                        start_index=0,
                        end_index=50,
                        sleep_time_sec=2,
                        use_dynamic_profile=True,
                        trials=4,
                        beyond_landing_pages=True,
                        beyond_landing_pages_only=False,
                        by_rank=True,
                        **kwargs):
    thread_name = "Process-" + randomword(5)
    logger.info("%s - Created thread name" % thread_name)

    # prepare directories
    pagesource_dir, screenshot_dir, downloads_dir = collect_core.create_directories(
        output_directory, crawler_group_name)

    site_file_path = csv_file_path
    logger.info("%s - Using list of websites from %s" % (thread_name, site_file_path))
    logger.info("%s - Parsing start_index %d to end index %d" %
                (thread_name, start_index, end_index))

    f = open(site_file_path)
    reader = csv.reader(f, delimiter=csv_delimiter)

    logger.debug("%s - Reading in data from file" % thread_name)
    file_data = []
    CSV_RANK_INDEX = 0
    CSV_URL_INDEX = 1
    file_data_chunk = []
    for row in reader:
        rank = int(row[CSV_RANK_INDEX])
        domain = row[CSV_URL_INDEX]
        if by_rank:
            if start_index <= rank <= end_index:
                file_data_chunk.append((rank, domain))
        else:
            file_data.append((rank, domain))
    f.close()

    if not by_rank:
        # go by file index order and not rank
        file_data_chunk = file_data[start_index:end_index]

    retry = True
    max_retry = 3
    retry_count = 0
    while retry_count <= max_retry and retry:
        # start virtual display
        virtual_display = collect_core.start_virtual_screen()
        try:
            process_sites(file_data_chunk,
                          pagesource_dir,
                          screenshot_dir,
                          downloads_dir,
                          anticv_on=anticv_on,
                          use_dynamic_profile=use_dynamic_profile,
                          thread_name=thread_name,
                          trials=trials,
                          beyond_landing_pages=beyond_landing_pages,
                          beyond_landing_pages_only=beyond_landing_pages_only,
                          **kwargs)
            retry = False
        except WebDriverException as e:
            retry_count += 1
            logger.warning(e)
            logger.warning("%s - Major exception: Retrying crawling again %d" %
                           (str(thread_name), retry_count))

        # stop the virtual display
        if virtual_display is not None:
            collect_core.stop_virtual_screen(virtual_display)

        time.sleep(30)

    logger.info("%s - Done" % str(thread_name))
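# Example invocation (hypothetical paths and values; the real entry point and
# site list live elsewhere in the project). The CSV is expected to have the rank
# in column 0 and the domain in column 1, matching CSV_RANK_INDEX/CSV_URL_INDEX
# above.
#
#   run_data_collection("/data/sites.csv", "/data/output", "crawl_group_1",
#                       anticv_on=False, start_index=0, end_index=100,
#                       trials=4, by_rank=True)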