Example #3
def crawl_worker(agent_cfg, url_tuple):
    """Crawl given url. Will work in parallel. Cannot be class method."""
    MAX_SLEEP_BEFORE_JOB = 10  # prevent starting all parallel processes at the same instant
    sleep(random() * MAX_SLEEP_BEFORE_JOB) # sleep for a while
    
    try:
        idx, url = url_tuple
        idx = str(idx)
        
        stdout_log =  os.path.join(agent_cfg['job_dir'], fu.get_out_filename_from_url(url, str(idx), '.txt'))
       
        if not url[:5] in ('data:', 'http:', 'https', 'file:'):
            url = 'http://' + url
        
        proxy_opt = mitm.init_mitmproxy(stdout_log[:-4], agent_cfg['timeout'], agent_cfg['mitm_proxy_logs']) if agent_cfg['use_mitm_proxy'] else ""
        
        if not 'chrome_clicker' in agent_cfg['type']:
            cmd = get_visit_cmd(agent_cfg, proxy_opt, stdout_log, url)
            wl_log.info('>> %s (%s) %s' % (url, idx, cmd))
            status, output = ut.run_cmd(cmd) # Run the command
            if status and status != ERR_CMD_TIMEDOUT:
                wl_log.critical('Error while visiting %s(%s) w/ command: %s: (%s) %s' % (url, idx, cmd, status, output))
            else:
                wl_log.info(' >> ok %s (%s)' % (url, idx))
            
        else:
            cr.crawl_url(agent_cfg['type'], url, proxy_opt)
            
        sleep(2) # this will make sure mitmdump is timed out before we start to process the network dump
        if agent_cfg['post_visit_func']: # this pluggable function will parse the logs and do whatever we want
            agent_cfg['post_visit_func'](stdout_log, crawl_id=agent_cfg['crawl_id'])
            
    except Exception as exc:
        wl_log.critical('Exception in worker function %s %s' % (url_tuple, exc))
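
The docstring notes that crawl_worker "will work in parallel" and "cannot be class method": that is the usual constraint when jobs are dispatched with multiprocessing, whose workers must be picklable module-level functions. Below is a minimal sketch of such a dispatcher; the helper names and the pool size are illustrative assumptions, not part of the project.

from multiprocessing import Pool


def _crawl_job(args):
    """Module-level wrapper so the job stays picklable for multiprocessing."""
    agent_cfg, url_tuple = args
    crawl_worker(agent_cfg, url_tuple)


def run_crawl_in_parallel(agent_cfg, url_tuples, num_procs=4):
    """Dispatch crawl_worker over (rank, url) tuples with a process pool (sketch)."""
    pool = Pool(processes=num_procs)
    try:
        pool.map(_crawl_job, [(agent_cfg, t) for t in url_tuples])
    finally:
        pool.close()
        pool.join()
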
Example #4
def mysql_init_db(db_name='fp_detective'):
    db_conn = None
    try:
        db_conn = mdb.connect(DB_IP_ADDRESS, DB_USERNAME, DB_PASSWD, db_name)
    except mdb.Error as e:
        wl_log.critical("Error %d: %s" % (e.args[0], e.args[1]))
        raise e
    return db_conn
Example #5
def gen_url_list(stop, start=1, get_rank=False, filename="", sep=','):
    """Yield URLs for a given rank range from a given file (or the default Alexa list).

    start and stop are inclusive, 1-based indexes (to match the ranks).
    """
    if not filename:
        filename = ALEXA_TOP1M_PATH
    
    if not ospath.isfile(filename):
        wl_log.critical('Cannot find URL list (Top Alexa CSV etc.) file!')
        return
    
    for line in open(filename).readlines()[start-1:stop]:
        if sep in line:
            rank, site_url = line.split(sep, 1)  # we expect a comma between rank and URL (Alexa format)
                                                 # beware: URLs may also include commas
            site_url = site_url.rstrip()
            if get_rank:  # if the caller asked for the rank
                yield int(rank), site_url
            else:
                yield site_url
        else:
            if get_rank:
                yield 0, line.rstrip() # we couldn't find the rank, just send 0
            else:
                yield line.rstrip() # no comma
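
Because gen_url_list is a generator, callers can stream ranks and URLs without loading the whole list themselves. A minimal usage sketch, assuming an Alexa-style CSV where each line looks like "1,google.com" (the file path below is a placeholder):

# print the top 100 entries of a "rank,domain" CSV
for rank, site_url in gen_url_list(100, start=1, get_rank=True,
                                   filename='/tmp/top-1m.csv'):
    print rank, site_url  # e.g. 1 google.com
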
    def launch_tor_service(self, logfile='/dev/null'):
        """Launch Tor service and return the process."""
        self.log_file = logfile
        self.tmp_tor_data_dir = ut.clone_dir_with_timestap(
            cm.get_tor_data_path(self.tbb_version))

        self.torrc_dict.update({
            'DataDirectory': self.tmp_tor_data_dir,
            'Log': ['INFO file %s' % logfile]
        })

        wl_log.debug("Tor config: %s" % self.torrc_dict)
        try:
            self.tor_process = stem.process.launch_tor_with_config(
                config=self.torrc_dict,
                init_msg_handler=self.tor_log_handler,
                tor_cmd=cm.get_tor_bin_path(self.tbb_version),
                timeout=270)
            self.controller = Controller.from_port()
            self.controller.authenticate()

        except stem.SocketError as exc:
            wl_log.critical("Unable to connect to tor on port %s: %s" %
                            (cm.SOCKS_PORT, exc))
            sys.exit(1)
        except:
            # most of the time this is due to another instance of
            # tor running on the system
            wl_log.critical("Error launching Tor", exc_info=True)
            sys.exit(1)

        wl_log.info("Tor running at port {0} & controller port {1}.".format(
            cm.SOCKS_PORT, cm.CONTROLLER_PORT))
        return self.tor_process
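
launch_tor_service only adds DataDirectory and Log to self.torrc_dict, so the SOCKS and control ports referenced through cm.SOCKS_PORT and cm.CONTROLLER_PORT are presumably configured elsewhere in that dict. A stripped-down sketch of the same stem calls outside the class; the port numbers and paths are illustrative assumptions:

import stem.process
from stem.control import Controller

torrc = {
    'SocksPort': '9050',            # assumed to match cm.SOCKS_PORT
    'ControlPort': '9051',          # assumed to match cm.CONTROLLER_PORT
    'DataDirectory': '/tmp/tor-data',
    'Log': ['INFO file /tmp/tor.log'],
}
tor_process = stem.process.launch_tor_with_config(config=torrc, timeout=270)
controller = Controller.from_port(port=9051)
controller.authenticate()
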
Example #8
def is_targz_archive_corrupt(arc_path):
    # http://stackoverflow.com/a/2001749/3104416
    tar_gz_check_cmd = "gunzip -c %s | tar t > /dev/null" % arc_path
    tar_status, tar_txt = commands.getstatusoutput(tar_gz_check_cmd)
    if tar_status:
        wl_log.critical("Tar check failed: %s tar_status: %s tar_txt: %s" %
                        (tar_gz_check_cmd, tar_status, tar_txt))
        return tar_status
    return False  # no error
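
The check above shells out to gunzip and tar through the Python 2 commands module. An equivalent pure-Python sketch using the standard tarfile module is shown below; it mirrors the return convention of the function above (truthy on error, False otherwise) but is an alternative, not the project's code:

import tarfile

def is_targz_archive_corrupt_py(arc_path):
    """Return a truthy value if the .tar.gz cannot be read, False otherwise."""
    try:
        with tarfile.open(arc_path, 'r:gz') as tar:
            for _member in tar:  # walking the members forces the stream to be read
                pass
    except (tarfile.TarError, IOError, OSError) as exc:
        wl_log.critical("Tar check failed for %s: %s" % (arc_path, exc))
        return True
    return False  # no error
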
Example #10
def init_mitmproxy(basename, timeout, logging):
    port = pid = None  # make sure these exist even if run_mitmdump raises
    try:
        port, pid = run_mitmdump(basename, timeout+1, logging)  # runs a mitmdump process with the timeout+1 sec
    except:
        wl_log.critical('Exception initializing mitmdump')
    else:
        wl_log.info('mitmdump will listen on port %s, pid %s' % (port, pid))

    return "127.0.0.1:%s " % port if port and pid else ""
Example #11
def insert_to_db(db_conn, query, args):
    with closing(db_conn.cursor(mdb.cursors.DictCursor)) as db_cursor:
        #db_cursor = db_conn.cursor(mdb.cursors.DictCursor)
        try:
            db_cursor.execute(query, args)
        except Exception as ex:
            wl_log.critical('Exception executing query: %s %s' % (query, args))
            raise ex
        db_conn.commit()
        return db_cursor.lastrowid
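
insert_to_db hands query and args straight to cursor.execute, so callers should use MySQLdb's %s placeholders rather than formatting values into the SQL string. A short usage sketch; the table and column names are made up for illustration:

db_conn = mysql_init_db('fp_detective')
new_id = insert_to_db(db_conn,
                      "INSERT INTO example_table (crawl_id, url) VALUES (%s, %s)",
                      (42, 'http://example.com'))
wl_log.info('Inserted row with id %s' % new_id)
db_conn.close()
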
Example #12
def update_crawl_time(db_conn, crawl_id):

    with closing(db_conn.cursor(mdb.cursors.DictCursor)) as db_cursor:
        finish_time = time.strftime('%Y-%m-%d %H:%M:%S')
        try:
            # use a parameterized query so the datetime gets quoted/escaped properly
            db_cursor.execute("UPDATE crawl_job SET finish_time = %s WHERE crawl_id = %s",
                              (finish_time, crawl_id))
        except Exception as ex:
            wl_log.critical('Exception executing UPDATE query for crawl %s: %s' % (crawl_id, ex))
            raise ex
        db_conn.commit()
        return db_cursor.lastrowid
Example #15
    def crawl(self, num_batches=cm.NUM_BATCHES, num_instances=cm.NUM_INSTANCES, start_line=0):
        wl_log.info('Crawl configuration: batches %s, instances: %s, tbb_version %s, no of URLs: %s, crawl dir: %s, XVFB: %s, screenshot: %s'
                    % (num_batches, num_instances, self.tbb_version, len(self.urls), self.crawl_dir, self.xvfb, self.capture_screen))

        # for each batch
        for batch_num in range(num_batches):
            wl_log.info('********** Starting batch %s **********' % batch_num)
            site_num = start_line
            bg_site = None
            batch_dir = ut.create_dir(
                os.path.join(self.crawl_dir, str(batch_num)))

            # init/reset tor process to have a different circuit.
            # make sure that we're not using the same guard node again
            wl_log.info('********** Restarting Tor Before Batch **********')
            self.tor_controller.restart_tor()
            sites_crawled_with_same_proc = 0

            # for each site
            for page_url in self.urls:
                sites_crawled_with_same_proc += 1
                if sites_crawled_with_same_proc > cm.MAX_SITES_PER_TOR_PROCESS:
                    wl_log.info('********** Restarting Tor Process **********')
                    self.tor_controller.restart_tor()
                    sites_crawled_with_same_proc = 0

                wl_log.info('********** Crawling %s **********' % page_url)
                page_url = page_url[:cm.MAX_FNAME_LENGTH]
                site_dir = ut.create_dir(os.path.join(
                    batch_dir, ut.get_filename_from_url(page_url, site_num)))

                for instance_num in range(num_instances):
                    wl_log.info('********** Visit #%s to %s **********' %
                                (instance_num, page_url))
                    self.visit = None
                    try:
                        self.visit = Visit(batch_num, site_num, instance_num, page_url, site_dir,
                                           self.tbb_version, self.tor_controller, bg_site, self.xvfb, self.capture_screen)
                        self.visit.get()
                    except KeyboardInterrupt:  # CTRL + C
                        raise KeyboardInterrupt
                    except (ut.TimeExceededError, TimeoutException) as exc:
                        wl_log.critical('Visit to %s timed out! %s %s' % (
                            page_url, exc, type(exc)))
                        if self.visit:
                            self.visit.cleanup_visit()
                    except Exception:
                        wl_log.critical('Exception crawling %s' %
                                        page_url, exc_info=True)
                        if self.visit:
                            self.visit.cleanup_visit()

                # END - for each visit
                site_num += 1
                time.sleep(cm.PAUSE_BETWEEN_SITES)
Example #17
def crawl_urls(br_type, urls, fn=lambda x: x):
    for url in urls:
        try:
            br = init_browser(br_type)
        except Exception as e:
            wl_log.critical('Error initializing browser: %s' % e)
        else:
            try:
                crawl_url(br, url, fn)
            except Exception as e:
                wl_log.error("Error crawling %s: %s" % (url, e))
            br.quit()
Example #19
def parse_mitm_dump(basename, worker, crawl_id):
    dumpfile = basename + '.dmp'
    wl_log.info("Will parse mitm dump %s for crawl: %s" % (dumpfile, crawl_id))
    requests = []
    responses = []
    if os.path.isfile(dumpfile):
        fr = flow.FlowReader(open(dumpfile))
        try:
            for msg in fr.stream():
                requests.append(msg.request.get_url())
                # responses.append(msg.response.get_url())
                worker(msg, crawl_id)  # this worker func should take care of db insertion, logging etc.
        except flow.FlowReadError as exc:
            pass
            #wl_log.critical("Error reading mitm dump %s" % exc)
    else:
        wl_log.critical("Cannot find mitm dump %s" % dumpfile)

    doma_info = lp.DomainInfo()
    doma_info.requests = requests
    doma_info.responses = responses
    doma_info.crawl_id = crawl_id
    doma_info.url = ""
    doma_info.fc_dbg_font_loads = []
    doma_info.fp_detected = lp.get_fp_from_reqs(requests)
    doma_info.log_complete = 1
    print os.path.basename(dumpfile[:-4]).split('-')[0]
    doma_info.rank = int(
        os.path.basename(dumpfile).split('-')[0]) if '-' in dumpfile else 0
    db_conn = dbu.mysql_init_db()
    site_info_id = dbu.add_site_info_to_db(doma_info, db_conn)

    # parse
    log_file = basename + '.txt'
    if not os.path.isfile(log_file):
        log_file = basename + '.' + MITM_LOG_EXTENSION

    insert_js_fun = functools.partial(lp.insert_js_info_to_db,
                                      site_info_id=site_info_id,
                                      db_conn=db_conn)
    lp.parse_crawl_log(log_file, insert_js_fun,
                       crawl_id)  # parse log, insert js info to db

    db_conn.commit()
    db_conn.close()
    wl_log.info("Parsed %s OK" % (dumpfile))
    if REMOVE_DMP_FILES:
        os.remove(dumpfile)
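
The worker argument of parse_mitm_dump is a pluggable callback that receives each mitmproxy flow and the crawl id ("this worker func should take care of db insertion, logging etc."). A minimal sketch of such a callback that only logs the request URL, relying on the same old mitmproxy flow API (request.get_url()) used above:

def log_only_worker(msg, crawl_id):
    """Minimal parse_mitm_dump callback: log each request instead of storing it."""
    wl_log.info('crawl %s requested %s' % (crawl_id, msg.request.get_url()))

# usage sketch with a made-up basename (parse_mitm_dump appends '.dmp' itself):
# parse_mitm_dump('/tmp/1-example.com', log_only_worker, crawl_id=7)
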
    def close_all_streams(self):
        """Close all streams of a controller."""
        wl_log.debug("Closing all streams")
        try:
            ut.timeout(cm.STREAM_CLOSE_TIMEOUT)
            for stream in self.controller.get_streams():
                wl_log.debug("Closing stream %s %s %s " %
                             (stream.id, stream.purpose, stream.target_address))
                self.controller.close_stream(stream.id)  # MISC reason
        except ut.TimeExceededError:
            wl_log.critical("Closing streams timed out!")
        except:
            wl_log.debug("Exception closing stream")
        finally:
            ut.cancel_timeout()
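
close_all_streams leans on ut.timeout, ut.cancel_timeout and ut.TimeExceededError, which are not shown on this page. A common way to implement that trio is with SIGALRM; the sketch below is an assumption about how such helpers might look, not the project's actual utils module:

import signal

class TimeExceededError(Exception):
    pass

def _raise_time_exceeded(signum, frame):
    raise TimeExceededError("Timed out!")

def timeout(duration):
    """Raise TimeExceededError in the main thread after `duration` seconds."""
    signal.signal(signal.SIGALRM, _raise_time_exceeded)
    signal.alarm(duration)  # schedule the alarm

def cancel_timeout():
    """Cancel a previously scheduled alarm."""
    signal.alarm(0)
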
Example #22
def get_free_port():
    """Get a free port number for mitmdump.
    
    http://stackoverflow.com/questions/1365265/on-localhost-how-to-pick-a-free-port-number?#answer-1365284
    
    """
    max_tries = 0
    while max_tries < MITM_MAX_TRIES:
        max_tries += 1
        try:
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.bind(('', 0))
            port = s.getsockname()[1]
        except Exception as ex:
            wl_log.critical('Exception when trying to bind to socket %s ' % ex)
            sleep(1)
        else:
            s.close()  # release the probe socket so the port can actually be reused
            return port
    return None
def pack_crawl_data(crawl_dir):
    """Compress the crawl dir into a tar archive."""
    if not os.path.isdir(crawl_dir):
        wl_log.critical("Cannot find the crawl dir: %s" % crawl_dir)
        return False
    if crawl_dir.endswith(os.path.sep):
        crawl_dir = crawl_dir[:-1]
    crawl_name = os.path.basename(crawl_dir)
    containing_dir = os.path.dirname(crawl_dir)
    os.chdir(containing_dir)
    arc_path = "%s.tar.gz" % crawl_name
    tar_cmd = "tar czvf %s %s" % (arc_path, crawl_name)
    wl_log.debug("Packing the crawl dir with cmd: %s" % tar_cmd)
    status, txt = commands.getstatusoutput(tar_cmd)
    if status or is_targz_archive_corrupt(arc_path):
        wl_log.critical("Tar command failed or archive is corrupt:\
                         %s \nSt: %s txt: %s" % (tar_cmd, status, txt))
        return False
    else:
        return True
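
pack_crawl_data already runs is_targz_archive_corrupt on the archive it creates, so callers only need to check its boolean result. A short usage sketch with an illustrative directory path:

crawl_dir = '/tmp/crawls/crawl_example'
if pack_crawl_data(crawl_dir):
    wl_log.info("Crawl dir %s packed and verified" % crawl_dir)
else:
    wl_log.critical("Could not pack crawl dir %s" % crawl_dir)
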
Example #26
def crawl_url(crawler_type, page_url, proxy_opt):
    
    if 'clicker' in crawler_type:
        worker = click_crawler
    else:
        worker = lazy_crawler
    
    br = init_browser('chrome', ['--allow-running-insecure-content', '--ignore-certificate-errors',
                                 '--disk-cache-size=0', '--enable-logging', '--v=1',
                                 "--proxy-server=%s" % proxy_opt])

    if not page_url.startswith('http') and not page_url.startswith('file:'):
        page_url = 'http://' + page_url

    wl_log.info('*** Will crawl %s ***' % page_url)
    
    try:
        ut.timeout(CRAWLER_CLICKER_VISIT_TIMEOUT)
        worker(br, page_url)  # run the worker function
    except ut.TimeExceededError as texc:
        wl_log.critical('***CRAWLER_CLICKER_VISIT_TIMEOUT at %s (%s)' % (page_url, texc))
    finally:
        ut.cancel_timeout()  # cancel the pending alarm so it cannot fire after the visit
        br.quit()
Example #28
    def get_public_suffix(self, url):
        try:
            return self.psl.get_public_suffix(urlparse(url).hostname)
        except Exception as e:
            wl_log.critical('Exception (%s) parsing url: %s' % (e, url))
            return ''
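
The method wraps a public-suffix list object stored in self.psl. With the publicsuffix package such an object is typically built once and reused; a hedged standalone sketch (the package choice and example URL are assumptions, the project may wire up self.psl differently):

from urlparse import urlparse          # urllib.parse on Python 3
from publicsuffix import PublicSuffixList

psl = PublicSuffixList()
hostname = urlparse('http://www.example.com/index.html').hostname
print psl.get_public_suffix(hostname)  # -> example.com
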
Example #29
def die(last_words):
    """Log last words and exit."""
    wl_log.critical(last_words)
    sys.exit(1)
Example #30
def parse_crawl_log(filename, dump_fun=None, crawl_id=0):
    """Populate domain info object by parsing crawl log file of a site.
    Call dump function to output dump log.
    
    Logs to be parsed with this function are generated by setting env. variable FC_DEBUG=1 to 1 or logs from the browser. 
    See, fontconfig library for details.  
    
    """
    origins_to_fonts = {}  # will keep origin to loaded fonts mapping

    domaInfo = DomainInfo()

    file_content = fu.read_file(filename)
    # TODO chromium?
    url_match = re.search(r"opening url: ([^,]*)", file_content)
    url = url_match.group(1) if url_match else filename

    wl_log.info('Parsing log for  %s %s' % (url, filename))

    fonts_by_fc_debug = re.findall(
        r"Sort Pattern.*$\W+family: \"([^\"]*)", file_content, re.MULTILINE
    )  # match family field of font request (not the matched one)
    domaInfo.num_offsetWidth_calls = len(
        re.findall(r"Element::offsetWidth",
                   file_content))  # offset width attempts
    domaInfo.num_offsetHeight_calls = len(
        re.findall(r"Element::offsetHeight",
                   file_content))  # offset height attempts
    # TODO add getBoundingClientRect

    font_and_urls = re.findall(r"CSSFontSelector::getFontData:? (.*) ([^\s]*)",
                               file_content)  # output from modified browser
    #print 'font_and_urls', font_and_urls

    font_face_pairs = re.findall(r"CSSFontFace::getFontData (.*)->(.*)",
                                 file_content)  # output from modified browser
    #print 'font_and_urls', font_and_urls
    domaInfo.log_complete = int(
        bool(re.findall(r"Finished all steps!",
                        file_content)))  # output from modified browser
    #print 'domaInfo.log_complete', domaInfo.log_complete
    js_log_prefix = ">>>FPLOG"
    fpd_logs = re.findall(r'%s.*' % js_log_prefix,
                          file_content)  # output from modified browser
    domaInfo.fpd_logs = [
        call[len(js_log_prefix) + 1:] for call in set(fpd_logs)
    ]

    for font_name, font_url in font_and_urls:
        if font_url.startswith('http') and len(
                font_name) > 1 and not font_name[:5] in ('data:', 'http:',
                                                         'https'):
            #font_name = font_name.rsplit(' ', 1)[0] if font_name.endswith(' onURL:') else font_name # TODO: unify chrome source code to log as Phantom do. then remove this line
            font_name = font_name.lower().strip()
            #             origin = pub_suffix.get_public_suffix(font_url)\
            origin = font_url
            if origin in origins_to_fonts:
                origins_to_fonts[origin].add(font_name)
                #print 'added', font_name, 'to', origin, origins_to_fonts[origin]
            else:
                origins_to_fonts[origin] = set([
                    font_name,
                ])

    for font, face in font_face_pairs:
        font = font.lower().strip()
        face = face.lower().strip()
        # replace all occurrences of this font-family name with the face
        for fonts_by_origin in origins_to_fonts.itervalues():
            try:
                fonts_by_origin.remove(font)
            except:  # we cannot find this font in this origin's list
                pass
            else:
                fonts_by_origin.add(face)
                # print 'removed', font, 'added', face

    for origin, fonts in origins_to_fonts.iteritems():
        domaInfo.fonts_by_origins[origin] = list(fonts)
        domaInfo.fonts_loaded += domaInfo.fonts_by_origins[origin]


    domaInfo.fc_dbg_font_loads = list(set([font.lower() for font in fonts_by_fc_debug \
                    if not font[:5] in ('data:', 'http:', 'https')])) # filter out the data urls and web fonts


    domaInfo.fonts_loaded = list(set([font.lower() for font in domaInfo.fonts_loaded \
                    if not font[:5] in ('data:', 'http:', 'https')])) # filter out the data urls and web fonts

    requests = re.findall(r"^requested: (http.*)", file_content, re.MULTILINE)
    if not requests and filename.endswith(MITM_LOG_EXTENSION):
        requests = re.findall(r"(http.*)", file_content, re.MULTILINE)
    responses = ''
    # populate domain info obj

    domaInfo.num_font_loads = len(domaInfo.fonts_loaded)
    domaInfo.requests = list(set(requests))
    domaInfo.responses = list(set(responses))
    domaInfo.fp_detected = get_fp_from_reqs(requests)
    domaInfo.url = url
    domaInfo.rank = get_rank_domain_from_filename(
        filename
    )[0]  # !!! rank may not be right. It's only true if we make top Alexa crawl.
    domaInfo.log_filename = filename
    domaInfo.crawl_id = crawl_id

    if dump_fun:  # call dump function
        try:
            dump_fun(domaInfo)
        except KeyboardInterrupt:
            raise
        except Exception as exc:
            wl_log.critical("Exception while dumping %s: %s" %
                            (domaInfo.url, exc))
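
The regular expressions above define the log lines this parser expects. For example, a hypothetical browser log line such as "CSSFontSelector::getFontData Arial http://example.com/page" is captured by the font_and_urls pattern as the pair ('Arial', 'http://example.com/page'); a tiny sketch of that extraction in isolation:

import re

sample = "CSSFontSelector::getFontData Arial http://example.com/page"
print re.findall(r"CSSFontSelector::getFontData:? (.*) ([^\s]*)", sample)
# -> [('Arial', 'http://example.com/page')]
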
    def crawl(self, num_batches=cm.NUM_BATCHES,
              num_instances=cm.NUM_INSTANCES, start_line=0):
        wl_log.info("Crawl configuration: batches: %s, instances: %s,"
                    " tbb_version: %s, experiment: %s, no of URLs: %s, "
                    "crawl dir: %s, XVFB: %s, screenshot: %s"
                    % (num_batches, num_instances, self.tbb_version,
                       self.experiment, len(self.urls), self.crawl_dir,
                       self.xvfb, self.capture_screen))
        # for each batch
        for batch_num in xrange(num_batches):
            wl_log.info("********** Starting batch %s **********" % batch_num)
            site_num = start_line
            bg_site = None
            batch_dir = ut.create_dir(os.path.join(self.crawl_dir,
                                                   str(batch_num)))
            # init/reset tor process to have a different circuit.
            # make sure that we're not using the same guard node again
            wl_log.info("********** Restarting Tor Before Batch **********")
            self.tor_controller.restart_tor()
            sites_crawled_with_same_proc = 0

            # for each site
            for page_url in self.urls:
                sites_crawled_with_same_proc += 1
                if sites_crawled_with_same_proc > cm.MAX_SITES_PER_TOR_PROCESS:
                    wl_log.info("********** Restarting Tor Process **********")
                    self.tor_controller.restart_tor()
                    sites_crawled_with_same_proc = 0

                wl_log.info("********** Crawling %s **********" % page_url)
                page_url = page_url[:cm.MAX_FNAME_LENGTH]
                site_dir = ut.create_dir(os.path.join(
                    batch_dir, ut.get_filename_from_url(page_url, site_num)))

                if self.experiment == cm.EXP_TYPE_MULTITAB_ALEXA:
                    bg_site = choice(self.urls)
                # for each visit
                for instance_num in range(num_instances):
                    wl_log.info("********** Visit #%s to %s **********" %
                                (instance_num, page_url))
                    self.visit = None
                    try:
                        self.visit = Visit(batch_num, site_num, instance_num, page_url, site_dir, self.tor_controller,
                                           bg_site, self.experiment, self.xvfb, self.capture_screen)

                        self.visit.get()
                    except KeyboardInterrupt:  # CTRL + C
                        raise KeyboardInterrupt
                    except (ut.TimeExceededError, TimeoutException) as exc:
                        wl_log.critical("Visit to %s timed out! %s %s" %
                                        (page_url, exc, type(exc)))
                        if self.visit:
                            self.visit.cleanup_visit()
                    except Exception:
                        wl_log.critical("Exception crawling %s" % page_url,
                                        exc_info=True)
                        if self.visit:
                            self.visit.cleanup_visit()
                # END - for each visit
                site_num += 1
                time.sleep(cm.PAUSE_BETWEEN_SITES)