from datetime import datetime
import logging

# Project-local imports are assumed: CD is the crawl protobuf module, and
# hex_md5, mkdir_if_not_exist, read_proto_from_file, crawl_log_attr_set,
# valid_instance, SearchTerm, Search and Visit come from this repo's own
# modules (their exact import paths are not shown in this section).


def revisit(crawl_log_file_list, word_file, n):
    """Visit the landing URLs in each crawl log n times.

    @parameter
    crawl_log_file_list: list of crawl_log filenames
    word_file: file containing the words in crawl_log_file, used for
        creating base_dir
    n: number of times to visit
    """
    # google_UA is not used in search and crawl, only in this later revisit.
    google_UA = "AdsBot-Google (+http://www.google.com/adsbot.html)"
    google_suffix = 'google.crawl/'
    for i in range(int(n)):
        # the time label is set for each iteration of the visit
        now_suffix = datetime.now().strftime(".%Y%m%d-%H%M%S")
        for crawl_log_file in crawl_log_file_list:
            # compute base_dir and start logging
            base_dir = '.'.join([word_file, google_suffix])
            mkdir_if_not_exist(base_dir)
            # NOTE: logging.basicConfig only takes effect on its first call;
            # later iterations keep logging to the first file.
            logging.basicConfig(filename=base_dir + 'running_log' + now_suffix,
                                level=logging.DEBUG)
            logging.getLogger("global")
            # set crawl_config
            crawl_config = CD.CrawlConfig()
            crawl_config.maximum_threads = 6
            crawl_config.user_agent = google_UA
            crawl_config.user_agent_md5_dir = base_dir + \
                hex_md5(crawl_config.user_agent) + now_suffix + '/'
            crawl_config.browser_type = CD.CrawlConfig.CHROME
            google_crawl_log = crawl_log_file.split('/')[-1] + '.google'
            crawl_config.log_filename = google_crawl_log + now_suffix
            revisit = Visit(crawl_config)
            # load the previous crawl log and visit every landing URL in it
            crawl_log = CD.CrawlLog()
            read_proto_from_file(crawl_log, crawl_log_file)
            landing_url_set = crawl_log_attr_set(crawl_log, "landing_url")
            revisit.visit_landing_url(landing_url_set)
            revisit.write_crawl_log(False)
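# crawl_log_attr_set is defined elsewhere in this repo. A minimal sketch of
# what it presumably does, kept as a comment so it does not shadow the real
# helper; the repeated `result` field name on CrawlLog is an assumption, not
# the repo's actual proto definition:
#
# def crawl_log_attr_set(crawl_log, attr_name):
#     """Collect the distinct values of attr_name across all crawl results."""
#     return set(getattr(result, attr_name) for result in crawl_log.result)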
def search_and_crawl(word_file, max_word_per_file=50):
    """Search the words in word_file, get clickstrings for the search
    results and ads, then visit these clickstrings.

    @parameter
    word_file: the filename containing the words to search
    max_word_per_file: the maximum number of words to store in one
        crawl_log file
    """
    # define constants
    user_UA = "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/" \
        "537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36"
    user_suffix = "selenium.crawl/"
    now_suffix = datetime.now().strftime(".%Y%m%d-%H%M%S")

    # compute base_dir and start logging
    base_dir = '.'.join([word_file, user_suffix])
    mkdir_if_not_exist(base_dir)
    logging.basicConfig(filename=base_dir + 'running_log' + now_suffix,
                        level=logging.DEBUG)
    logging.getLogger("global")

    # set crawl_config
    crawl_config = CD.CrawlConfig()
    crawl_config.maximum_threads = 6
    crawl_config.user_agent = user_UA
    crawl_config.user_agent_md5_dir = base_dir + \
        hex_md5(crawl_config.user_agent) + now_suffix + '/'
    crawl_config.browser_type = CD.CrawlConfig.CHROME

    words = SearchTerm(word_file)
    search = Search(crawl_config)
    # the same config is reused: switch result_type and log_filename before
    # constructing each Visit
    crawl_config.result_type = CD.AD
    crawl_config.log_filename = 'ad_crawl_log' + now_suffix
    ad_visit = Visit(crawl_config, max_word_per_file)
    crawl_config.result_type = CD.SEARCH
    crawl_config.log_filename = 'search_crawl_log' + now_suffix
    search_visit = Visit(crawl_config, max_word_per_file)

    # search each word, then visit the ad and search clickstrings it yields
    for word in words.get_word_list():
        ad_set, search_set = search.search(word)
        ad_visit.visit(ad_set, word)
        search_visit.visit(search_set, word)
        words.next()
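# mkdir_if_not_exist comes from the repo's utility module. A common
# implementation, shown here only as an assumption of its behavior:
#
# import os
#
# def mkdir_if_not_exist(path):
#     # create the directory (and any missing parents) if it does not exist
#     if not os.path.exists(path):
#         os.makedirs(path)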
def search_and_revisit(word_file, n, threads=6, ad_only=False):
    """This function does the following things.

    1. Search each word in word_file.
    2. Grab the top 200 returned results and the corresponding ads.
    3. Visit all the results and ads with the Chrome user agent.
    4. Revisit all the landing pages from step 3 n times with the Google
       bot user agents (AdsBot for ads, Googlebot for search results).

    @parameter
    word_file: the filename containing the words to search
    n: the number of times to revisit the landing pages in step 4
    threads: the number of crawling threads
    ad_only: only retrieve the advertisements; in this case, only the
        first 5 pages are viewed
    @output
    The following are the outputs of this function.
    Running log:
    [WORD_FILE].selenium.crawl/running_log.[SEARCH_TIME]
    Chrome user agent results:
    [WORD_FILE].selenium.crawl/ad_crawl_log.[SEARCH_TIME].[WORD_MD5]
    [WORD_FILE].selenium.crawl/search_crawl_log.[SEARCH_TIME].[WORD_MD5]
    [WORD_FILE].selenium.crawl/[WORD_MD5]/[UA_MD5].[SEARCH_TIME]/[URL_MD5]/index.html
    Google bot user agent results:
    [WORD_FILE].selenium.crawl/ad_crawl_log.[SEARCH_TIME].[WORD_MD5].google
    [WORD_FILE].selenium.crawl/search_crawl_log.[SEARCH_TIME].[WORD_MD5].google
    [WORD_FILE].selenium.crawl/[WORD_MD5]/[UA_MD5].[SEARCH_TIME].revisit.[REVISIT_TIME]/[URL_MD5]/index.html
    """
    valid_instance(threads, int)
    # prepare search and visit
    user_UA = "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/" \
        "537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36"
    user_suffix = "selenium.crawl/"
    search_now_suffix = datetime.now().strftime(".%Y%m%d-%H%M%S")
    # placeholder substituted with each word's md5 once the word is known
    word_md5_delimiter = "WORD_MD5"

    # compute base_dir and start logging
    base_dir = '.'.join([word_file, user_suffix])
    mkdir_if_not_exist(base_dir)
    logging.basicConfig(filename=base_dir + 'running_log' + search_now_suffix,
                        level=logging.DEBUG)
    logging.getLogger("global")

    # set search and visit crawl_config
    search_config = CD.CrawlConfig()
    search_config.maximum_threads = threads
    search_config.user_agent = user_UA
    # number of top search results to be inspected
    if ad_only:
        search_config.count = 50
    search_config.browser_type = CD.CrawlConfig.CHROME

    ad_crawl_config = CD.CrawlConfig()
    ad_crawl_config.CopyFrom(search_config)
    ad_crawl_config.result_type = CD.AD
    ad_crawl_config.crawl_log_dir = base_dir
    ad_log_filename_prefix = 'ad_crawl_log' + search_now_suffix
    ad_dir_prefix = base_dir + word_md5_delimiter + "/" + \
        hex_md5(ad_crawl_config.user_agent) + search_now_suffix + '/'

    search_crawl_config = CD.CrawlConfig()
    search_crawl_config.CopyFrom(search_config)
    search_crawl_config.result_type = CD.SEARCH
    search_crawl_config.crawl_log_dir = base_dir
    search_log_filename_prefix = 'search_crawl_log' + search_now_suffix
    search_dir_prefix = base_dir + word_md5_delimiter + "/" + \
        hex_md5(search_crawl_config.user_agent) + search_now_suffix + '/'

    words = SearchTerm(word_file)
    search = Search(search_config)
    ad_visit = Visit(ad_crawl_config, 1)
    search_visit = Visit(search_crawl_config, 1)

    # prepare the revisit
    google_ad_UA = "AdsBot-Google (+http://www.google.com/adsbot.html)"
    google_search_UA = "Googlebot/2.1 (+http://www.google.com/bot.html)"
    # set revisit crawl_config
    revisit_crawl_config = CD.CrawlConfig()
    revisit_crawl_config.maximum_threads = threads
    revisit_crawl_config.browser_type = CD.CrawlConfig.CHROME
    # the base directory uses search_now_suffix to correlate the two crawls
    revisit_crawl_config.crawl_log_dir = base_dir

    # search, visit and revisit each word
    for word in words.get_word_list():
        print "Processing {0} word: {1}".format(words.current(), word)
        # substitute the WORD_MD5 placeholder in the precomputed paths with
        # this word's md5, and point the log filenames at this word
        word_md5 = hex_md5(word)
        ad_crawl_config.log_filename = ad_log_filename_prefix + "." + word_md5
        ad_crawl_config.user_agent_md5_dir = word_md5.join(
            ad_dir_prefix.split(word_md5_delimiter))
        search_crawl_config.log_filename = search_log_filename_prefix + \
            "." + word_md5
        search_crawl_config.user_agent_md5_dir = word_md5.join(
            search_dir_prefix.split(word_md5_delimiter))
        ad_visit.update_crawl_config(ad_crawl_config)
        search_visit.update_crawl_config(search_crawl_config)

        # search and crawl
        right_click = not ad_only
        ad_set, search_set = search.search(word, right_click)
        ad_crawl_log_filename = ad_visit.visit(ad_set, word)
        if ad_only:
            search_crawl_log_filename = None
        else:
            search_crawl_log_filename = search_visit.visit(search_set, word)

        # revisit the landing pages with the matching Google bot user agent
        crawl_log_file_list = list()
        if ad_crawl_log_filename:
            crawl_log_file_list.append(ad_crawl_log_filename)
        if search_crawl_log_filename:
            crawl_log_file_list.append(search_crawl_log_filename)
        for crawl_log_file in crawl_log_file_list:
            if crawl_log_file == ad_crawl_log_filename:
                revisit_crawl_config.user_agent = google_ad_UA
            else:
                revisit_crawl_config.user_agent = google_search_UA
            revisit_dir_prefix = base_dir + word_md5_delimiter + "/" + \
                hex_md5(revisit_crawl_config.user_agent) + search_now_suffix
            revisit_crawl_config.log_filename = \
                crawl_log_file.split('/')[-1] + '.google'
            revisit = Visit(revisit_crawl_config)
            crawl_log = CD.CrawlLog()
            read_proto_from_file(crawl_log, crawl_log_file)
            revisit.visit_landing_url_n_times(crawl_log, int(n),
                                              revisit_dir_prefix, word_md5,
                                              word_md5_delimiter)
        words.next()
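# A hypothetical command-line entry point; this module's real CLI, if any,
# is not shown in this section, so the argument handling below is only an
# illustrative sketch:
if __name__ == '__main__':
    import sys
    # e.g. python <this_module>.py words.txt 3
    search_and_revisit(sys.argv[1], int(sys.argv[2]))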