def crawl_worker(agent_cfg, url_tuple):
    """Crawl the given URL. Runs in parallel; cannot be a class method."""
    MAX_SLEEP_BEFORE_JOB = 10  # prevent starting all parallel processes at the same instant
    sleep(random() * MAX_SLEEP_BEFORE_JOB)  # sleep for a while
    try:
        idx, url = url_tuple
        idx = str(idx)
        stdout_log = os.path.join(agent_cfg['job_dir'],
                                  fu.get_out_filename_from_url(url, idx, '.txt'))

        if url[:5] not in ('data:', 'http:', 'https', 'file:'):
            url = 'http://' + url

        proxy_opt = mitm.init_mitmproxy(stdout_log[:-4], agent_cfg['timeout'],
                                        agent_cfg['mitm_proxy_logs']) \
            if agent_cfg['use_mitm_proxy'] else ""

        if 'chrome_clicker' not in agent_cfg['type']:
            cmd = get_visit_cmd(agent_cfg, proxy_opt, stdout_log, url)
            wl_log.info('>> %s (%s) %s' % (url, idx, cmd))
            status, output = ut.run_cmd(cmd)  # run the visit command
            if status and status != ERR_CMD_TIMEDOUT:
                wl_log.critical('Error while visiting %s (%s) w/ command: %s: (%s) %s'
                                % (url, idx, cmd, status, output))
            else:
                wl_log.info(' >> ok %s (%s)' % (url, idx))
        else:
            cr.crawl_url(agent_cfg['type'], url, proxy_opt)

        sleep(2)  # make sure mitmdump has timed out before we start to process the network dump

        if agent_cfg['post_visit_func']:
            # pluggable function that parses the logs and does any post-processing we want
            agent_cfg['post_visit_func'](stdout_log, crawl_id=agent_cfg['crawl_id'], url=url)
    except Exception as exc:
        wl_log.exception('Exception in worker function %s %s' % (url_tuple, exc))
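
# --- Example (not part of the crawler): dispatching crawl_worker in parallel ---
# crawl_worker must stay a module-level function so multiprocessing can pickle it.
# The sketch below shows one plausible way to fan it out over a process pool;
# run_crawl, num_procs and the urls list are illustrative assumptions, not code
# from this project.
from multiprocessing import Pool

def run_crawl(agent_cfg, urls, num_procs=4):
    """Hypothetical dispatcher: fan crawl_worker out over a process pool."""
    pool = Pool(processes=num_procs)
    try:
        for url_tuple in enumerate(urls):  # (idx, url) tuples, as crawl_worker expects
            pool.apply_async(crawl_worker, (agent_cfg, url_tuple))
        pool.close()
        pool.join()  # wait for all visits to finish
    except KeyboardInterrupt:
        pool.terminate()
        raise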
def _new_identity(self):
    wl_log.info("Creating a new identity...")
    try:
        # Ctrl+Shift+U is Tor Browser's "New Identity" keyboard shortcut
        ActionChains(self.driver).send_keys(Keys.CONTROL + Keys.SHIFT + 'U').perform()
    except WebDriverException:
        pass
    except Exception:
        wl_log.exception("Exception while creating new identity.")
def add_captcha(self):
    try:
        captcha_filepath = ut.capture_dirpath_to_captcha(self.path)
        move(self.path, captcha_filepath)
        self.captchas[self.global_visit] = True
    except OSError as e:
        wl_log.exception('%s could not be renamed to %s', self.path, captcha_filepath)
        raise e
def parse_crawl_log(filename, dump_fun=None, crawl_id=0, url=""):
    """Populate a DomainInfo object by parsing the crawl log file of a site.

    Calls the dump function to output the dump log. Logs parsed by this
    function are generated either by setting the environment variable
    FC_DEBUG to 1 (see the fontconfig library for details) or by the
    instrumented browser.
    """
    origins_to_fonts = {}  # maps origin -> set of loaded fonts
    domaInfo = DomainInfo()

    file_content = fu.read_file(filename)
    wl_log.info("Parsing log for %s %s" % (url, filename))

    # match the family field of a font request (not the matched one)
    fonts_by_fc_debug = re.findall(r"Sort Pattern.*$\W+family: \"([^\"]*)",
                                   file_content, re.MULTILINE)

    # offsetWidth / offsetHeight access attempts
    domaInfo.num_offsetWidth_calls = len(re.findall(r"Element::offsetWidth", file_content))
    domaInfo.num_offsetHeight_calls = len(re.findall(r"Element::offsetHeight", file_content))
    # TODO: add getBoundingClientRect

    # output from the modified browser
    font_and_urls = re.findall(r"CSSFontSelector::getFontData:? (.*) ([^\s]*)", file_content)
    font_face_pairs = re.findall(r"CSSFontFace::getFontData (.*)->(.*)", file_content)
    domaInfo.log_complete = int(bool(re.findall(r"Finished all steps!", file_content)))

    js_log_prefix = ">>>FPLOG"
    fpd_logs = re.findall(r"%s.*" % js_log_prefix, file_content)  # output from the modified browser
    domaInfo.fpd_logs = [call[len(js_log_prefix) + 1:] for call in set(fpd_logs)]

    for font_name, font_url in font_and_urls:
        if font_url.startswith("http") and len(font_name) > 1 \
                and font_name[:5] not in ('data:', 'http:', 'https'):
            # TODO: unify the Chrome source code to log as Phantom does, then remove this line
            font_name = font_name.lower().strip()
            # origin = pub_suffix.get_public_suffix(font_url)
            origin = font_url
            if origin in origins_to_fonts:
                origins_to_fonts[origin].add(font_name)
            else:
                origins_to_fonts[origin] = set([font_name])

    for font, face in font_face_pairs:
        font = font.lower().strip()
        face = face.lower().strip()
        # replace all occurrences of this font-family name with the face
        for fonts_by_origin in origins_to_fonts.itervalues():
            try:
                fonts_by_origin.remove(font)
            except KeyError:
                pass  # this font is not in this origin's list
            else:
                fonts_by_origin.add(face)

    for origin, fonts in origins_to_fonts.iteritems():
        domaInfo.fonts_by_origins[origin] = list(fonts)
        domaInfo.fonts_loaded += domaInfo.fonts_by_origins[origin]

    # filter out the data URLs and web fonts
    domaInfo.fc_dbg_font_loads = list(set([font.lower() for font in fonts_by_fc_debug
                                           if font[:5] not in ('data:', 'http:', 'https')]))
    domaInfo.fonts_loaded = list(set([font.lower() for font in domaInfo.fonts_loaded
                                      if font[:5] not in ('data:', 'http:', 'https')]))

    requests = re.findall(r"^requested: (http.*)", file_content, re.MULTILINE)
    if not requests and filename.endswith(MITM_LOG_EXTENSION):
        requests = re.findall(r"(http.*)", file_content, re.MULTILINE)
    responses = ""

    # populate the domain info object
    domaInfo.num_font_loads = len(domaInfo.fonts_loaded)
    domaInfo.requests = list(set(requests))
    domaInfo.responses = list(set(responses))
    domaInfo.fp_detected = get_fp_from_reqs(requests)
    domaInfo.url = url
    # NOTE: rank is only meaningful if this is a top-Alexa crawl
    domaInfo.rank = get_rank_domain_from_filename(filename)[0]
    domaInfo.log_filename = filename
    domaInfo.crawl_id = crawl_id

    # Detect canvas read/write events and register them in the per-crawl canvas log
    urls_read_from_canvas = Set()
    urls_wrote_to_canvas = Set()
    canvas_log = os.path.join(cm.BASE_FP_LOGS_FOLDER, str(crawl_id) + "canvas.log")
    read = wrote = False
    for read_event in cm.CANVAS_READ_EVENTS:
        if read_event in file_content:
            read = True
            break
    for write_event in cm.CANVAS_WRITE_EVENTS:
        if write_event in file_content:
            wrote = True
            break
    if read and wrote:
        wl_log.info("Found both canvas read and write events in log %s, registering in: %s"
                    % (filename, canvas_log))
        with open(canvas_log, "a+") as f:
            f.write(" ".join([str(domaInfo.rank), domaInfo.url]) + "\n")

    if dump_fun:  # call the dump function
        try:
            dump_fun(domaInfo)
        except KeyboardInterrupt:
            raise
        except Exception as exc:
            wl_log.exception("Exception while dumping %s: %s" % (domaInfo.url, exc))
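
# --- Example (illustrative only): what the parsing patterns above extract ---
# The real log lines come from the instrumented browsers / mitmproxy dumps and are
# not reproduced here; the sample below uses made-up lines of the same shape to
# show what each regex captures.
import re

sample = ("CSSFontSelector::getFontData Arial http://example.com/page\n"
          "CSSFontFace::getFontData webfont-a->arial\n"
          "requested: http://example.com/font.woff\n")

print(re.findall(r"CSSFontSelector::getFontData:? (.*) ([^\s]*)", sample))
# -> [('Arial', 'http://example.com/page')]  (font family, URL)
print(re.findall(r"CSSFontFace::getFontData (.*)->(.*)", sample))
# -> [('webfont-a', 'arial')]  (font-family name, resolved face)
print(re.findall(r"^requested: (http.*)", sample, re.MULTILINE))
# -> ['http://example.com/font.woff']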
def _do_visits(self):
    self._config_driver()
    with TorBrowserDriver(**self.driver_config) as driver:
        self.driver = driver
        failed = 0
        while self.job.visit < self.job.visits:
            self.job.visit += 1
            raised = False
            num_retry, retry = 0, RETRY
            while True:
                wl_log.info("*** Visit #%s to %s ***", self.job.visit, self.job.url)
                try:
                    ut.create_dir(self.job.path)
                    self.save_checkpoint()
                    self.set_page_load_timeout()
                    try:
                        self._do_instance()
                        self.get_screenshot_if_enabled()
                    except (cm.HardTimeoutException, TimeoutException,
                            SnifferTimeoutError):
                        wl_log.exception("Visit to %s has timed out!", self.job.url)
                    else:
                        self.post_visit()
                    finally:
                        self._reset_tor()
                        self.cleanup_visit()
                except OSError as ose:
                    wl_log.exception("OS Error %s" % repr(ose))
                    raise ose  # better to halt here: we may be out of disk space
                except TBDriverPortError as tbde:
                    raise tbde  # cannot connect to Tor; the following visits would fail too
                except cm.ConnErrorPage as cnpe:
                    raised = cnpe
                except stem.SocketError as sckte:
                    raised = sckte
                except Exception as exc:
                    wl_log.exception("Unknown exception: %s" % repr(exc))
                    raised = exc
                finally:
                    if not raised:
                        break
                    # there was a non-timeout exception
                    failed += 1
                    if failed >= MAX_FAILED:
                        self._mark_failed()
                        self.job.visit = self.job.visits
                        break
                    # retry the visit?
                    if not retry:
                        break
                    wl_log.info("Will retry the visit. Retry num %s/%s"
                                % (num_retry + 1, cm.MAX_RETRIES))
                    rmtree(self.job.path)
                    num_retry += 1
                    if num_retry == cm.MAX_RETRIES:
                        retry = False
                    self.controller.restart_tor()
        if self.job.visit == self.job.visits:
            self.job.visit = 0
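
# --- Example (not part of the crawler): the retry pattern used by _do_visits ---
# Stripped of the crawler specifics, the success/retry/give-up decision made in the
# finally block above reduces to the sketch below; do_visit and max_retries are
# placeholders, not names from this project.
def visit_with_retries(do_visit, max_retries=3):
    """Minimal sketch of the retry control flow in _do_visits."""
    num_retry = 0
    while True:
        raised = None
        try:
            do_visit()
        except Exception as exc:  # the real code only retries recoverable errors
            raised = exc
        finally:
            if not raised:
                break  # visit succeeded
            if num_retry >= max_retries:
                break  # give up after max_retries failed attempts
            num_retry += 1  # otherwise clean up and try again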