Example #1
def crawl_worker(agent_cfg, url_tuple):
    """Crawl given url. Will work in parallel. Cannot be class method."""
    MAX_SLEEP_BEFORE_JOB = 10 # prevent starting all parallel processes at the same instance
    sleep(random() * MAX_SLEEP_BEFORE_JOB) # sleep for a while
    
    try:
        idx, url = url_tuple
        idx = str(idx)
        
        stdout_log = os.path.join(agent_cfg['job_dir'], fu.get_out_filename_from_url(url, str(idx), '.txt'))

        if url[:5] not in ('data:', 'http:', 'https', 'file:'):  # 'https' covers https: URLs via the 5-char slice
            url = 'http://' + url
        
        proxy_opt = mitm.init_mitmproxy(stdout_log[:-4], agent_cfg['timeout'], agent_cfg['mitm_proxy_logs']) if agent_cfg['use_mitm_proxy'] else ""
        
        if 'chrome_clicker' not in agent_cfg['type']:
            cmd = get_visit_cmd(agent_cfg, proxy_opt, stdout_log, url)
            wl_log.info('>> %s (%s) %s' % (url, idx, cmd))
            status, output = ut.run_cmd(cmd) # Run the command
            if status and status != ERR_CMD_TIMEDOUT:
                wl_log.critical('Error while visiting %s(%s) w/ command: %s: (%s) %s' % (url, idx, cmd, status, output))
            else:
                wl_log.info(' >> ok %s (%s)' % (url, idx))
            
        else:
            cr.crawl_url(agent_cfg['type'], url, proxy_opt)
            
        sleep(2)  # make sure mitmdump has timed out before we start to process the network dump
        if agent_cfg['post_visit_func']:  # this pluggable function will parse the logs and do whatever we want
            agent_cfg['post_visit_func'](stdout_log, crawl_id=agent_cfg['crawl_id'], url=url)
            
    except Exception as exc:
        wl_log.exception('Exception in worker function %s %s' % (url_tuple, exc))
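
The docstring notes that crawl_worker must stay a module-level function because it runs in parallel worker processes. A minimal sketch of how it could be driven with multiprocessing.Pool; the helper names (_crawl_job, run_crawl) and the default pool size are assumptions for illustration, not part of the original code:

from multiprocessing import Pool

def _crawl_job(args):
    # hypothetical top-level wrapper so each job pickles cleanly for the worker processes
    agent_cfg, url_tuple = args
    return crawl_worker(agent_cfg, url_tuple)

def run_crawl(agent_cfg, urls, num_procs=4):
    # enumerate() yields the (idx, url) tuples that crawl_worker unpacks
    jobs = [(agent_cfg, url_tuple) for url_tuple in enumerate(urls)]
    pool = Pool(processes=num_procs)
    try:
        pool.map(_crawl_job, jobs)
    finally:
        pool.close()
        pool.join()

Bound methods do not pickle under multiprocessing on Python 2, which is presumably why the docstring insists the worker cannot be a class method.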
    def _new_identity(self):
        wl_log.info("Creating a new identity...")
        try:
            # Ctrl+Shift+U is the Tor Browser keyboard shortcut for "New Identity"
            ActionChains(self.driver).send_keys(Keys.CONTROL + Keys.SHIFT +
                                                'U').perform()
        except WebDriverException:
            pass
        except Exception:
            wl_log.exception("Exception while creating new identity.")
    def add_captcha(self):
        try:
            captcha_filepath = ut.capture_dirpath_to_captcha(self.path)
            move(self.path, captcha_filepath)

            self.captchas[self.global_visit] = True
        except OSError as e:
            wl_log.exception('%s could not be renamed to %s', self.path,
                             captcha_filepath)
            raise  # re-raise with the original traceback
Example #4
def crawl_worker(agent_cfg, url_tuple):
    """Crawl given url. Will work in parallel. Cannot be class method."""
    MAX_SLEEP_BEFORE_JOB = 10  # prevent starting all parallel processes at the same instance
    sleep(random() * MAX_SLEEP_BEFORE_JOB)  # sleep for a while

    try:
        idx, url = url_tuple
        idx = str(idx)

        stdout_log = os.path.join(
            agent_cfg['job_dir'],
            fu.get_out_filename_from_url(url, str(idx), '.txt'))

        if url[:5] not in ('data:', 'http:', 'https', 'file:'):  # 'https' covers https: URLs via the 5-char slice
            url = 'http://' + url

        proxy_opt = mitm.init_mitmproxy(
            stdout_log[:-4], agent_cfg['timeout'], agent_cfg['mitm_proxy_logs']
        ) if agent_cfg['use_mitm_proxy'] else ""

        if 'chrome_clicker' not in agent_cfg['type']:
            cmd = get_visit_cmd(agent_cfg, proxy_opt, stdout_log, url)
            wl_log.info('>> %s (%s) %s' % (url, idx, cmd))
            status, output = ut.run_cmd(cmd)  # Run the command
            if status and status != ERR_CMD_TIMEDOUT:
                wl_log.critical(
                    'Error while visiting %s(%s) w/ command: %s: (%s) %s' %
                    (url, idx, cmd, status, output))
            else:
                wl_log.info(' >> ok %s (%s)' % (url, idx))

        else:
            cr.crawl_url(agent_cfg['type'], url, proxy_opt)

        sleep(2)  # make sure mitmdump has timed out before we start to process the network dump
        if agent_cfg['post_visit_func']:  # this pluggable function will parse the logs and do whatever we want
            agent_cfg['post_visit_func'](stdout_log,
                                         crawl_id=agent_cfg['crawl_id'],
                                         url=url)

    except Exception as exc:
        wl_log.exception('Exception in worker function %s %s' %
                         (url_tuple, exc))
Example #5
def parse_crawl_log(filename, dump_fun=None, crawl_id=0, url=""):
    """Populate domain info object by parsing crawl log file of a site.
    Call dump function to output dump log.
    
    Logs to be parsed with this function are generated by setting env. variable FC_DEBUG=1 to 1 or logs from the browser. 
    See, fontconfig library for details.  
    
    """
    origins_to_fonts = {}  # will keep origin to loaded fonts mapping

    domaInfo = DomainInfo()

    file_content = fu.read_file(filename)
    wl_log.info("Parsing log for %s %s" % (url, filename))

    fonts_by_fc_debug = re.findall(
        r"Sort Pattern.*$\W+family: \"([^\"]*)", file_content, re.MULTILINE
    )  # match family field of font request (not the matched one)
    domaInfo.num_offsetWidth_calls = len(re.findall(r"Element::offsetWidth", file_content))  # offset width attempts
    domaInfo.num_offsetHeight_calls = len(re.findall(r"Element::offsetHeight", file_content))  # offset height attempts
    # TODO add getBoundingClientRect

    font_and_urls = re.findall(
        r"CSSFontSelector::getFontData:? (.*) ([^\s]*)", file_content
    )  # output from modified browser
    # print 'font_and_urls', font_and_urls

    font_face_pairs = re.findall(r"CSSFontFace::getFontData (.*)->(.*)", file_content)  # output from modified browser
    # print 'font_and_urls', font_and_urls
    domaInfo.log_complete = int(bool(re.findall(r"Finished all steps!", file_content)))  # output from modified browser
    # print 'domaInfo.log_complete', domaInfo.log_complete
    js_log_prefix = ">>>FPLOG"
    fpd_logs = re.findall(r"%s.*" % js_log_prefix, file_content)  # output from modified browser
    domaInfo.fpd_logs = [call[len(js_log_prefix) + 1 :] for call in set(fpd_logs)]

    for font_name, font_url in font_and_urls:
        if font_url.startswith("http") and len(font_name) > 1 and not font_name[:5] in ("data:", "http:", "https"):
            # font_name = font_name.rsplit(' ', 1)[0] if font_name.endswith(' onURL:') else font_name # TODO: unify chrome source code to log as Phantom do. then remove this line
            font_name = font_name.lower().strip()
            #             origin = pub_suffix.get_public_suffix(font_url)\
            origin = font_url
            if origin in origins_to_fonts:
                origins_to_fonts[origin].add(font_name)
                # print 'added', font_name, 'to', origin, origins_to_fonts[origin]
            else:
                origins_to_fonts[origin] = set([font_name])

    for font, face in font_face_pairs:
        font = font.lower().strip()
        face = face.lower().strip()
        # replace all occurrences of this font-family name with the face
        for fonts_by_origin in origins_to_fonts.itervalues():
            try:
                fonts_by_origin.remove(font)
            except KeyError:  # this font is not in this origin's list
                pass
            else:
                fonts_by_origin.add(face)
                # print 'removed', font, 'added', face

    for origin, fonts in origins_to_fonts.iteritems():
        domaInfo.fonts_by_origins[origin] = list(fonts)
        domaInfo.fonts_loaded += domaInfo.fonts_by_origins[origin]

    domaInfo.fc_dbg_font_loads = list(
        set([font.lower() for font in fonts_by_fc_debug if font[:5] not in ("data:", "http:", "https")])
    )  # filter out the data urls and web fonts

    domaInfo.fonts_loaded = list(
        set([font.lower() for font in domaInfo.fonts_loaded if font[:5] not in ("data:", "http:", "https")])
    )  # filter out the data urls and web fonts

    requests = re.findall(r"^requested: (http.*)", file_content, re.MULTILINE)
    if not requests and filename.endswith(MITM_LOG_EXTENSION):
        requests = re.findall(r"(http.*)", file_content, re.MULTILINE)
    responses = ""
    # populate domain info obj

    domaInfo.num_font_loads = len(domaInfo.fonts_loaded)
    domaInfo.requests = list(set(requests))
    domaInfo.responses = list(set(responses))
    domaInfo.fp_detected = get_fp_from_reqs(requests)
    domaInfo.url = url
    domaInfo.rank = get_rank_domain_from_filename(filename)[0]  # !!! rank may not be right; it's only correct for a top-Alexa crawl
    domaInfo.log_filename = filename
    domaInfo.crawl_id = crawl_id

    # Read canvas events and print them to log in canvas
    urls_read_from_canvas = Set()
    urls_wrote_to_canvas = Set()

    canvas_log = os.path.join(cm.BASE_FP_LOGS_FOLDER, str(crawl_id) + "canvas.log")
    read = wrote = False
    for read_event in cm.CANVAS_READ_EVENTS:
        if read_event in file_content:
            read = True
            break
    for write_event in cm.CANVAS_WRITE_EVENTS:
        if write_event in file_content:
            wrote = True
            break

    if read and wrote:
        wl_log.info("Found both canvas read and write events in log %s, registering in : %s" % (filename, canvas_log))
        with open(canvas_log, "a+") as f:
            f.write(" ".join([str(domaInfo.rank), domaInfo.url]) + "\n")

    if dump_fun:  # call dump function
        try:
            dump_fun(domaInfo)
        except KeyboardInterrupt:
            raise
        except Exception as exc:
            wl_log.exception("Exception while dumping %s: %s" % (domaInfo.url, exc))
    def _do_visits(self):
        self._config_driver()
        with TorBrowserDriver(**self.driver_config) as driver:
            self.driver = driver
            failed = 0
            while self.job.visit < self.job.visits:
                self.job.visit += 1

                raised = False
                num_retry, retry = 0, RETRY
                while True:
                    wl_log.info("*** Visit #%s to %s ***", self.job.visit,
                                self.job.url)
                    try:
                        ut.create_dir(self.job.path)
                        self.save_checkpoint()
                        self.set_page_load_timeout()
                        try:
                            self._do_instance()
                            self.get_screenshot_if_enabled()
                        except (cm.HardTimeoutException, TimeoutException,
                                SnifferTimeoutError):
                            wl_log.exception("Visit to %s has timed out!",
                                             self.job.url)
                        else:
                            self.post_visit()
                        finally:
                            self._reset_tor()
                            self.cleanup_visit()
                    except OSError as ose:
                        wl_log.exception("OS Error %s" % repr(ose))
                        raise ose  # we better halt here, we may be out of disk space
                    except TBDriverPortError as tbde:
                        raise tbde  # cannot connect to Tor, the following ones will fail
                    except cm.ConnErrorPage as cnpe:
                        raised = cnpe
                    except stem.SocketError as sckte:
                        raised = sckte
                    except Exception as exc:
                        wl_log.exception("Unknown exception: %s" % repr(exc))
                        raised = exc
                    finally:
                        if not raised:
                            break

                        # there was a non-timeout exception
                        failed += 1
                        if failed >= MAX_FAILED:
                            self._mark_failed()
                            self.job.visit = self.job.visits
                            break

                        # retry visit?
                        if not retry:
                            break
                        wl_log.info("Will retry the visit. Retry num %s/%s" %
                                    (retry, cm.MAX_RETRIES))
                        rmtree(self.job.path)
                        num_retry += 1
                        if num_retry == cm.MAX_RETRIES:
                            retry = False
                        self.controller.restart_tor()

            if self.job.visit == self.job.visits:
                self.job.visit = 0
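
The nested try/finally blocks above track two separate counters: failed counts failed visits across the whole job (capped by MAX_FAILED), while num_retry counts retries of the current visit (capped by cm.MAX_RETRIES). A stripped-down sketch of the same control flow with hypothetical names, dropping the timeout handling and Tor-specific cleanup:

def visit_with_retries(do_visit, max_retries=3, max_failed=5, failed=0):
    # Mirrors the retry/failure bookkeeping in _do_visits (simplified sketch).
    num_retry, retry = 0, True
    while True:
        try:
            do_visit()
            return failed        # visit succeeded
        except Exception:
            failed += 1          # one more failed visit for this job
            if failed >= max_failed:
                return failed    # too many failures overall: caller should mark the job failed
            if not retry:
                return failed    # retries were already exhausted
            num_retry += 1
            if num_retry == max_retries:
                retry = False    # allow this final retry, then stop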
Example #7
def parse_crawl_log(filename, dump_fun=None, crawl_id=0, url=""):
    """Populate domain info object by parsing crawl log file of a site.
    Call dump function to output dump log.
    
    Logs to be parsed with this function are generated by setting env. variable FC_DEBUG=1 to 1 or logs from the browser. 
    See, fontconfig library for details.  
    
    """
    origins_to_fonts = {}  # will keep origin to loaded fonts mapping

    domaInfo = DomainInfo()

    file_content = fu.read_file(filename)
    wl_log.info('Parsing log for %s %s' % (url, filename))

    fonts_by_fc_debug = re.findall(
        r"Sort Pattern.*$\W+family: \"([^\"]*)", file_content, re.MULTILINE
    )  # match family field of font request (not the matched one)
    domaInfo.num_offsetWidth_calls = len(
        re.findall(r"Element::offsetWidth",
                   file_content))  # offset width attempts
    domaInfo.num_offsetHeight_calls = len(
        re.findall(r"Element::offsetHeight",
                   file_content))  # offset height attempts
    # TODO add getBoundingClientRect

    font_and_urls = re.findall(r"CSSFontSelector::getFontData:? (.*) ([^\s]*)",
                               file_content)  # output from modified browser
    #print 'font_and_urls', font_and_urls

    font_face_pairs = re.findall(r"CSSFontFace::getFontData (.*)->(.*)",
                                 file_content)  # output from modified browser
    #print 'font_and_urls', font_and_urls
    domaInfo.log_complete = int(
        bool(re.findall(r"Finished all steps!",
                        file_content)))  # output from modified browser
    #print 'domaInfo.log_complete', domaInfo.log_complete
    js_log_prefix = ">>>FPLOG"
    fpd_logs = re.findall(r'%s.*' % js_log_prefix,
                          file_content)  # output from modified browser
    domaInfo.fpd_logs = [
        call[len(js_log_prefix) + 1:] for call in set(fpd_logs)
    ]

    for font_name, font_url in font_and_urls:
        if font_url.startswith('http') and len(font_name) > 1 and \
                font_name[:5] not in ('data:', 'http:', 'https'):
            #font_name = font_name.rsplit(' ', 1)[0] if font_name.endswith(' onURL:') else font_name # TODO: unify chrome source code to log as Phantom do. then remove this line
            font_name = font_name.lower().strip()
            #             origin = pub_suffix.get_public_suffix(font_url)\
            origin = font_url
            if origin in origins_to_fonts:
                origins_to_fonts[origin].add(font_name)
                #print 'added', font_name, 'to', origin, origins_to_fonts[origin]
            else:
                origins_to_fonts[origin] = set([font_name])

    for font, face in font_face_pairs:
        font = font.lower().strip()
        face = face.lower().strip()
        # replace all occurrences of this font-family name with the face
        for fonts_by_origin in origins_to_fonts.itervalues():
            try:
                fonts_by_origin.remove(font)
            except KeyError:  # this font is not in this origin's list
                pass
            else:
                fonts_by_origin.add(face)
                # print 'removed', font, 'added', face

    for origin, fonts in origins_to_fonts.iteritems():
        domaInfo.fonts_by_origins[origin] = list(fonts)
        domaInfo.fonts_loaded += domaInfo.fonts_by_origins[origin]

    domaInfo.fc_dbg_font_loads = list(set(
        [font.lower() for font in fonts_by_fc_debug
         if font[:5] not in ('data:', 'http:', 'https')]))  # filter out the data urls and web fonts

    domaInfo.fonts_loaded = list(set(
        [font.lower() for font in domaInfo.fonts_loaded
         if font[:5] not in ('data:', 'http:', 'https')]))  # filter out the data urls and web fonts

    requests = re.findall(r"^requested: (http.*)", file_content, re.MULTILINE)
    if not requests and filename.endswith(MITM_LOG_EXTENSION):
        requests = re.findall(r"(http.*)", file_content, re.MULTILINE)
    responses = ''
    # populate domain info obj

    domaInfo.num_font_loads = len(domaInfo.fonts_loaded)
    domaInfo.requests = list(set(requests))
    domaInfo.responses = list(set(responses))
    domaInfo.fp_detected = get_fp_from_reqs(requests)
    domaInfo.url = url
    domaInfo.rank = get_rank_domain_from_filename(filename)[0]  # !!! rank may not be right; it's only correct for a top-Alexa crawl
    domaInfo.log_filename = filename
    domaInfo.crawl_id = crawl_id

    # Read canvas events and print them to log in canvas
    urls_read_from_canvas = Set()
    urls_wrote_to_canvas = Set()

    canvas_log = os.path.join(cm.BASE_FP_LOGS_FOLDER,
                              str(crawl_id) + "canvas.log")
    read = wrote = False
    for read_event in cm.CANVAS_READ_EVENTS:
        if read_event in file_content:
            read = True
            break
    for write_event in cm.CANVAS_WRITE_EVENTS:
        if write_event in file_content:
            wrote = True
            break

    if read and wrote:
        wl_log.info(
            'Found both canvas read and write events in log %s, registering in : %s'
            % (filename, canvas_log))
        with open(canvas_log, "a+") as f:
            f.write(" ".join([str(domaInfo.rank), domaInfo.url]) + "\n")

    if dump_fun:  # call dump function
        try:
            dump_fun(domaInfo)
        except KeyboardInterrupt:
            raise
        except Exception as exc:
            wl_log.exception("Exception while dumping %s: %s" %
                             (domaInfo.url, exc))