def visit_landing_url_n_times(self, crawl_log, n_times, revisit_dir_prefix,
        word_md5, word_md5_delimiter):
    """
    @parameter
    crawl_log: crawl log to visit
    n_times: visit crawl_log for n_times
    """
    valid_instance(crawl_log, CD.CrawlLog)
    valid_instance(n_times, int)
    # prepare landing_url_set
    landing_url_set = crawl_log_attr_set(crawl_log, "landing_url")
    landing_url_set_size = len(landing_url_set)
    if landing_url_set_size < 8:
        record_maximum_threads = self.crawl_config.maximum_threads
        self.crawl_config.maximum_threads = 2
    url_fetcher = UrlFetcher(self.crawl_config)
    for i in range(n_times):
        # the time label is set for each iteration of visit
        revisit_now_suffix = datetime.now().strftime(".%Y%m%d-%H%M%S")
        self.crawl_config.user_agent_md5_dir = word_md5.join(
            revisit_dir_prefix.split(word_md5_delimiter)) + \
            '.revisit_time' + revisit_now_suffix + '/'
        url_fetcher.update_dir(self.crawl_config.user_agent_md5_dir)
        self.visit_landing_url(landing_url_set, url_fetcher)
        self.write_crawl_log(False)
    url_fetcher.quit()
    if landing_url_set_size < 8:
        self.crawl_config.maximum_threads = record_maximum_threads
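# Hedged sketch (not part of the original code): demonstrates how the WORD_MD5
# placeholder inside revisit_dir_prefix is swapped for the word's md5 by the
# word_md5.join(prefix.split(delimiter)) idiom above, which is equivalent to
# prefix.replace(word_md5_delimiter, word_md5). All values are illustrative.
def _example_revisit_dir_substitution():
    word_md5_delimiter = "WORD_MD5"
    revisit_dir_prefix = "words.selenium.crawl/WORD_MD5/UA_MD5.20150101-120000"
    word_md5 = "0cc175b9c0f1b6a831c399e269772661"  # hex md5 of "a", for illustration
    revisit_dir = word_md5.join(revisit_dir_prefix.split(word_md5_delimiter))
    expected = ("words.selenium.crawl/"
        "0cc175b9c0f1b6a831c399e269772661/UA_MD5.20150101-120000")
    assert revisit_dir == expected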
def __init__(self, url_file, user_agent_file, crawl_config):
    valid_instance(crawl_config, CD.CrawlConfig)
    self.crawl_config = CD.CrawlConfig()
    self.crawl_config.CopyFrom(crawl_config)
    # Prepare the input
    self.urls = filter(bool, open(url_file, 'r').read().split('\n'))
    self.user_agents = filter(bool, open(user_agent_file, 'r').read().split('\n'))
    # self.referers = filter(bool, open(referer_file, 'r').read().split('\n'))
    # Prepare the output directory
    crawl_type = None
    for user_agent in self.user_agents:
        if "bot" in user_agent:
            crawl_type = "bot"
            break
    if not crawl_type:
        crawl_type = "user"
    now = datetime.now().strftime("%Y%m%d-%H%M%S")
    self.base_dir = url_file + '.' + crawl_type + '.' + now + '.selenium.crawl/'
    mkdir_if_not_exist(self.base_dir)
    # Prepare log files
    # self.htmls_f = open(self.base_dir + 'html_path_list', 'a')
    self.md5_UA_filename = self.base_dir + 'md5_UA.log'
    self.crawl_log_filename = self.base_dir + 'crawl_log'
def __init__(self, crawl_config):
    # user_agent should be set.
    valid_instance(crawl_config, CD.CrawlConfig)
    self.crawl_config = CD.CrawlConfig()
    self.crawl_config.CopyFrom(crawl_config)
    switch_vpn_state(True)
    self.connected = False
def visit_landing_url(self, landing_url_set, url_fetcher=None):
    """
    @parameter
    landing_url_set: landing url set to visit
    url_fetcher: selenium handles to use for crawl
    """
    valid_instance(landing_url_set, set)
    mkdir_if_not_exist(self.crawl_config.user_agent_md5_dir)
    # crawl web pages
    landing_url_set_size = len(landing_url_set)
    if landing_url_set_size < 8:
        record_maximum_threads = self.crawl_config.maximum_threads
        self.crawl_config.maximum_threads = 2
    quit_fetcher_when_done = False
    if not url_fetcher:
        url_fetcher = UrlFetcher(self.crawl_config)
        quit_fetcher_when_done = True
    thread_computer = ThreadComputer(url_fetcher, 'fetch_url', landing_url_set)
    if quit_fetcher_when_done:
        url_fetcher.quit()
    if landing_url_set_size < 8:
        self.crawl_config.maximum_threads = record_maximum_threads
    # create and fill current_search, including urls, search_term etc.
    current_search = CD.CrawlSearchTerm()
    for p, s in thread_computer.result:
        result = current_search.result.add()
        result.CopyFrom(s)
    # update current_log
    if self.first:
        self.first = False
        self.current_log = CD.CrawlLog()
    result_search = self.current_log.result_search.add()
    result_search.CopyFrom(current_search)
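# Hedged usage sketch, assuming CD, hex_md5 and the Visit class from this module
# are in scope. It mirrors how collect_site_for_plot drives visit_landing_url;
# the directory names, log filename and URL below are illustrative only.
def _example_visit_landing_url():
    crawl_config = CD.CrawlConfig()
    crawl_config.maximum_threads = 1
    crawl_config.browser_type = CD.CrawlConfig.CHROME
    crawl_config.crawl_log_dir = "example.crawl/"
    crawl_config.log_filename = "example_crawl_log"
    crawl_config.user_agent = "Mozilla/5.0"
    crawl_config.user_agent_md5_dir = "example.crawl/" + hex_md5(crawl_config.user_agent) + "/"
    visit = Visit(crawl_config)
    visit.visit_landing_url(set(["http://www.example.com/"]))
    visit.write_crawl_log(False)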
def __init__(self, crawl_config, max_word_per_file=5):
    # user_agent, user_agent_md5_dir should be set.
    valid_instance(crawl_config, CD.CrawlConfig)
    self.crawl_config = CD.CrawlConfig()
    self.crawl_config.CopyFrom(crawl_config)
    self.first = True
    self.max_word_per_file = max_word_per_file
    self.counter = 0
def __init__(self, crawl_config):
    valid_instance(crawl_config, CD.CrawlConfig)
    self.crawl_config = CD.CrawlConfig()
    self.crawl_config.CopyFrom(crawl_config)
    self.browser_queue = Queue.Queue()
    for i in xrange(self.crawl_config.maximum_threads):
        browser = start_browser(self.crawl_config.browser_type, incognito=False,
            user_agent=self.crawl_config.user_agent)
        browser.set_page_load_timeout(15)
        self.browser_queue.put(browser)
    self.lock = threading.Lock()
def build_site_simhash_dict(observed_sites):
    """
    Return two dicts: one maps site name to all of its simhashes, the other maps
    site name to the observed site.
    """
    valid_instance(observed_sites, CD.ObservedSites)
    site_simhash_dict = dict()
    observed_sites_dict = dict()
    attr_name = get_simhash_type(observed_sites.config.simhash_type)
    for observed_site in observed_sites.site:
        if observed_site.name not in site_simhash_dict:
            site_simhash_dict[observed_site.name] = set()
            observed_sites_dict[observed_site.name] = observed_site
        for observation in observed_site.observation:
            site_simhash_dict[observed_site.name].add(getattr(observation, attr_name))
    return site_simhash_dict, observed_sites_dict
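# Hedged usage sketch, assuming CD, read_proto_from_file and build_site_simhash_dict
# from this module are in scope; the input filename is illustrative only.
def _example_build_site_simhash_dict():
    observed_sites = CD.ObservedSites()
    read_proto_from_file(observed_sites, "user.text.observed_sites")  # hypothetical path
    site_simhash_dict, observed_sites_dict = build_site_simhash_dict(observed_sites)
    for name in site_simhash_dict:
        print name, len(site_simhash_dict[name]), "distinct simhash values"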
def crawl_log_attr_set(crawl_log, attr_name, success_only=True):
    """
    Get attribute set from CrawlLog.

    @parameter
    crawl_log: the crawl log to extract the attribute set from.
    attr_name: the name of the attribute in crawl_log to collect.
    @return
    attr_set: the set of attributes corresponding to attr_name
    """
    valid_instance(crawl_log, CD.CrawlLog)
    attr_set = set()
    for result_search in crawl_log.result_search:
        for result in result_search.result:
            # only collect from successful results unless success_only is False
            if success_only:
                if result.success:
                    attr_set.add(getattr(result, attr_name))
            else:
                attr_set.add(getattr(result, attr_name))
    return attr_set
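# Hedged usage sketch: read a CrawlLog from disk and pull attribute sets out of it.
# The log filename is illustrative; CD and read_proto_from_file come from this module.
def _example_crawl_log_attr_set():
    crawl_log = CD.CrawlLog()
    read_proto_from_file(crawl_log, "ad_crawl_log.20150101-120000")  # hypothetical path
    landing_urls = crawl_log_attr_set(crawl_log, "landing_url")  # successful results only
    all_landing_urls = crawl_log_attr_set(crawl_log, "landing_url", success_only=False)
    return landing_urls, all_landing_urls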
def collect_site_for_plot(site_set, outdir, mode="user"):
    """
    Collect user and google observations for each site in site_set. This is
    scheduled by a cron job, in order to show how the hash values of websites
    change over time.

    @parameter
    site_set: the set of urls to visit
    outdir: the output directory
    mode: which user agent to use; supported modes are user, google, both
    """
    valid_instance(site_set, set)
    mkdir_if_not_exist(outdir)
    user_UA = "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/" \
        "537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36"
    google_UA = "AdsBot-Google (+http://www.google.com/adsbot.html)"
    crawl_config = CD.CrawlConfig()
    crawl_config.maximum_threads = 1
    crawl_config.browser_type = CD.CrawlConfig.CHROME
    crawl_config.crawl_log_dir = outdir
    now_suffix = datetime.now().strftime(".%Y%m%d-%H%M%S")
    UAs = dict()
    if mode == "user":
        UAs["user"] = user_UA
    elif mode == "google":
        UAs["google"] = google_UA
    elif mode == "both":
        UAs["user"] = user_UA
        UAs["google"] = google_UA
    else:
        raise Exception("Unknown mode {0}".format(mode))
    for mode in UAs:
        crawl_config.user_agent = UAs[mode]
        crawl_config.user_agent_md5_dir = outdir + hex_md5(crawl_config.user_agent) \
            + now_suffix + '/'
        crawl_config.log_filename = mode + '_crawl_log' + now_suffix
        mode_visit = Visit(crawl_config)
        mode_visit.visit_landing_url(site_set)
        mode_visit.write_crawl_log(False)
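# Hedged sketch of the cron entry point the docstring describes: read the site
# list from a file and record both user and googlebot observations. The input
# file and output directory names are illustrative only.
def _example_collect_site_for_plot():
    site_set = set(filter(bool, open("plot_sites.txt", 'r').read().split('\n')))
    collect_site_for_plot(site_set, "plot.observations/", mode="both")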
def update_crawl_config(self, crawl_config):
    valid_instance(crawl_config, CD.CrawlConfig)
    self.crawl_config = CD.CrawlConfig()
    self.crawl_config.CopyFrom(crawl_config)
def search_and_revisit(word_file, n, threads=6, ad_only=False):
    """
    This function does the following things.
    1. Search each word in the word file.
    2. Grab the top 200 returned results and the corresponding ads.
    3. Visit all the results and ads with the "chrome user agent", repeat n times.
    4. Visit all the landing pages from step 3 with the "google ads bot user agent".

    @parameter
    word_file: the filename containing the words to search
    n: repeat step 3 for n times
    ad_only: only retrieve the advertisements. In this case, we only view the
        first 5 pages.
    @output
    The following are the outputs of this function.
    Running log:
    [WORD_FILE].selenium.crawl/running_log.[SEARCH_TIME]
    "chrome user agent" results:
    [WORD_FILE].selenium.crawl/ad_crawl_log.[SEARCH_TIME].[WORD_MD5]
    [WORD_FILE].selenium.crawl/search_crawl_log.[SEARCH_TIME].[WORD_MD5]
    [WORD_FILE].selenium.crawl/[WORD_MD5]/[UA_MD5].[SEARCH_TIME]/[URL_MD5]/index.html
    "google ads bot user agent" results:
    [WORD_FILE].selenium.crawl/ad_crawl_log.[SEARCH_TIME].[WORD_MD5].google
    [WORD_FILE].selenium.crawl/search_crawl_log.[SEARCH_TIME].[WORD_MD5].google
    [WORD_FILE].selenium.crawl/[WORD_MD5]/[UA_MD5].[SEARCH_TIME].revisit.[REVISIT_TIME]/[URL_MD5]/index.html
    """
    valid_instance(threads, int)
    # prepare search and visit
    user_UA = "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/" \
        "537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36"
    user_suffix = "selenium.crawl/"
    search_now_suffix = datetime.now().strftime(".%Y%m%d-%H%M%S")
    word_md5_delimiter = "WORD_MD5"
    # compute base_dir and start logging
    base_dir = '.'.join([word_file, user_suffix])
    mkdir_if_not_exist(base_dir)
    logging.basicConfig(filename=base_dir + 'running_log' + search_now_suffix,
        level=logging.DEBUG)
    logging.getLogger("global")
    # set search and visit crawl_config
    search_config = CD.CrawlConfig()
    search_config.maximum_threads = threads
    search_config.user_agent = user_UA
    # number of top search results to be inspected
    if ad_only:
        search_config.count = 50
    search_config.browser_type = CD.CrawlConfig.CHROME
    ad_crawl_config = CD.CrawlConfig()
    ad_crawl_config.CopyFrom(search_config)
    ad_crawl_config.result_type = CD.AD
    ad_crawl_config.crawl_log_dir = base_dir
    ad_log_filename_prefix = 'ad_crawl_log' + search_now_suffix
    ad_dir_prefix = base_dir + word_md5_delimiter + "/" + \
        hex_md5(ad_crawl_config.user_agent) + search_now_suffix + '/'
    search_crawl_config = CD.CrawlConfig()
    search_crawl_config.CopyFrom(search_config)
    search_crawl_config.result_type = CD.SEARCH
    search_crawl_config.crawl_log_dir = base_dir
    search_log_filename_prefix = 'search_crawl_log' + search_now_suffix
    search_dir_prefix = base_dir + word_md5_delimiter + "/" + \
        hex_md5(search_crawl_config.user_agent) + search_now_suffix + '/'
    # print crawl_config.user_agent
    words = SearchTerm(word_file)
    search = Search(search_config)
    ad_visit = Visit(ad_crawl_config, 1)
    search_visit = Visit(search_crawl_config, 1)
    # prepare the revisit
    google_ad_UA = "AdsBot-Google (+http://www.google.com/adsbot.html)"
    google_search_UA = "Googlebot/2.1 (+http://www.google.com/bot.html)"
    # set revisit crawl_config
    revisit_crawl_config = CD.CrawlConfig()
    revisit_crawl_config.maximum_threads = threads
    revisit_crawl_config.browser_type = CD.CrawlConfig.CHROME
    # the base directory uses search_now_suffix to correlate the visit and the revisit
    revisit_crawl_config.crawl_log_dir = base_dir
    # search, visit and revisit each word
    for word in words.get_word_list():
        print "Processing {0} word: {1}".format(words.current(), word)
        # update word_md5 related directories
        print word
        word_md5 = hex_md5(word)
        ad_crawl_config.log_filename = ad_log_filename_prefix + "." + word_md5
        ad_crawl_config.user_agent_md5_dir = word_md5.join(
            ad_dir_prefix.split(word_md5_delimiter))
        search_crawl_config.log_filename = search_log_filename_prefix + "." + word_md5
        search_crawl_config.user_agent_md5_dir = word_md5.join(
            search_dir_prefix.split(word_md5_delimiter))
        ad_visit.update_crawl_config(ad_crawl_config)
        search_visit.update_crawl_config(search_crawl_config)
        # search and crawl
        right_click = not ad_only
        ad_set, search_set = search.search(word, right_click)
        ad_crawl_log_filename = ad_visit.visit(ad_set, word)
        if ad_only:
            search_crawl_log_filename = None
        else:
            search_crawl_log_filename = search_visit.visit(search_set, word)
        # revisit
        crawl_log_file_list = list()
        if ad_crawl_log_filename:
            crawl_log_file_list.append(ad_crawl_log_filename)
        if search_crawl_log_filename:
            crawl_log_file_list.append(search_crawl_log_filename)
        for crawl_log_file in crawl_log_file_list:
            if crawl_log_file == ad_crawl_log_filename:
                revisit_crawl_config.user_agent = google_ad_UA
            else:
                revisit_crawl_config.user_agent = google_search_UA
            revisit_dir_prefix = base_dir + word_md5_delimiter + "/" + \
                hex_md5(revisit_crawl_config.user_agent) + search_now_suffix
            revisit_crawl_config.log_filename = crawl_log_file.split('/')[-1] + '.google'
            revisit = Visit(revisit_crawl_config)
            crawl_log = CD.CrawlLog()
            read_proto_from_file(crawl_log, crawl_log_file)
            revisit.visit_landing_url_n_times(crawl_log, int(n), revisit_dir_prefix,
                word_md5, word_md5_delimiter)
        words.next()
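# Hedged usage sketch: run the full search-and-revisit pipeline for every word in
# a word file, revisiting each landing page twice and restricting the crawl to
# advertisements. The word filename is illustrative only.
def _example_search_and_revisit():
    search_and_revisit("hot_words.txt", 2, threads=6, ad_only=True)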
def update_groundtruth_redundant(count, original_expected, original_u_text,
        original_u_dom, original_g_text, original_g_dom, add_expected, add_all,
        out_expected, out_u_text, out_u_dom, out_g_text, out_g_dom):
    valid_instance(count, int)
    in_e = CD.ObservedSites()
    read_proto_from_file(in_e, original_expected)
    in_u_t = CD.ObservedSites()
    read_proto_from_file(in_u_t, original_u_text)
    in_u_d = CD.ObservedSites()
    read_proto_from_file(in_u_d, original_u_dom)
    in_g_t = CD.ObservedSites()
    read_proto_from_file(in_g_t, original_g_text)
    in_g_d = CD.ObservedSites()
    read_proto_from_file(in_g_d, original_g_dom)
    add_e = CD.ObservedSites()
    read_proto_from_file(add_e, add_expected)
    in_e_set = sites_name_set(in_e)
    add_e_set = sites_name_set(add_e)
    diff_e_set = add_e_set - in_e_set
    logger = logging.getLogger("global")
    logger.info("size of original set: {0}, size of add set: {1}, size of diff set: {2}".format(
        len(in_e_set), len(add_e_set), len(diff_e_set)))
    logger.info("diff set is")
    logger.info(diff_e_set)
    diff_e_list = list(diff_e_set)
    logger.info(len(diff_e_list))
    random.shuffle(diff_e_list)
    diff_e_sample = diff_e_list[:count]
    """
    Get the sites that are in the "de-deduplicated" examples and add them. This is
    necessary because there are sites that are cloaking but were removed in the
    de-dup phase; the reason is unknown.
    """
    add_u_text_fs = filter(bool, open(add_all, 'r').read().split('\n'))
    diff_e_sample = set(_output_sample_sites(diff_e_sample, add_u_text_fs,
        add_all + ".u.text.temp"))
    # use the updated diff expected set to generate the new data
    _output_sample_sites(diff_e_sample, [add_expected], add_expected + ".temp")
    add_u_dom_fs = _replace_list_by(add_u_text_fs, 'text', 'dom')
    _output_sample_sites(diff_e_sample, add_u_dom_fs, add_all + ".u.dom.temp")
    add_g_text_fs = _replace_list_by(add_u_text_fs, 'user', 'google')
    _output_sample_sites(diff_e_sample, add_g_text_fs, add_all + ".g.text.temp")
    add_g_dom_fs = _replace_list_by(add_u_dom_fs, 'user', 'google')
    _output_sample_sites(diff_e_sample, add_g_dom_fs, add_all + ".g.dom.temp")
    out_expected_sites = merge_observed_sites([original_expected, add_expected + ".temp"])
    out_u_t_sites = merge_observed_sites([original_u_text, add_all + ".u.text.temp"])
    out_u_d_sites = merge_observed_sites([original_u_dom, add_all + ".u.dom.temp"])
    out_g_t_sites = merge_observed_sites([original_g_text, add_all + ".g.text.temp"])
    out_g_d_sites = merge_observed_sites([original_g_dom, add_all + ".g.dom.temp"])
    out_u_t_sites.config.CopyFrom(in_u_t.config)
    out_u_d_sites.config.CopyFrom(in_u_d.config)
    out_g_t_sites.config.CopyFrom(in_g_t.config)
    out_g_d_sites.config.CopyFrom(in_g_d.config)
    write_proto_to_file(out_expected_sites, out_expected)
    write_proto_to_file(out_u_t_sites, out_u_text)
    write_proto_to_file(out_u_d_sites, out_u_dom)
    write_proto_to_file(out_g_t_sites, out_g_text)
    write_proto_to_file(out_g_d_sites, out_g_dom)
def _replace_list_by(to_replace_list, src, dst):
    valid_instance(to_replace_list, list)
    return [item.replace(src, dst) for item in to_replace_list]
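# Hedged example mirroring how update_groundtruth_redundant derives the DOM and
# googlebot file lists from the user text file list; the filenames are illustrative.
def _example_replace_list_by():
    user_text_files = ["add.user.text.sites", "add_2.user.text.sites"]
    user_dom_files = _replace_list_by(user_text_files, 'text', 'dom')
    google_text_files = _replace_list_by(user_text_files, 'user', 'google')
    return user_dom_files, google_text_files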