Example #1
def main():
    # Parse arguments
    args = parse_arguments()

    # Read URLs
    url_list = read_list_urls(args)

    # Get torrc matching experiment type
    torrc_dict = cm.TORRC_BY_TYPE[args.experiment]

    # Instantiate crawler
    crawler = Crawler(url_list, torrc_dict,
                      output=args.output,
                      experiment=args.experiment,
                      xvfb=args.xvfb,
                      capture_screen=True)

    # Run the crawl
    try:
        crawler.crawl(args.batches, args.instances,
                      start_line=args.start_line - 1)
    except KeyboardInterrupt:
        wl_log.warning("Keyboard interrupt! Quitting...")
    except Exception as e:
        wl_log.error("Exception: \n%s" % (traceback.format_exc()))
    finally:
        crawler.stop_crawl()
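
All of these examples log through a shared wl_log logger. A minimal sketch of how such a module-level logger could be set up with the standard logging module (the handler and format below are assumptions, not the project's actual configuration):

import logging
import sys

# Hypothetical module-level logger in the spirit of wl_log.
wl_log = logging.getLogger("wl_log")
wl_log.setLevel(logging.INFO)

_console_handler = logging.StreamHandler(sys.stdout)
_console_handler.setFormatter(
    logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
wl_log.addHandler(_console_handler)
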
Example #2
def run():
    # build dirs
    build_crawl_dirs()

    # setup environment variables for exporting keys
    os.environ['TOR_CIRCUIT_KEY_EXPORT'] = cm.CRAWL_DIR + "/circuit-key.txt"
    os.environ['TOR_SSL_KEY_EXPORT'] = cm.CRAWL_DIR + "/ssl-key.txt"

    # Parse arguments
    args, config = parse_arguments()

    # Read URLs
    url_list = parse_url_list(args.url_file, args.start, args.stop)
    host_list = [urlparse(url).hostname for url in url_list]

    # Configure logger
    add_log_file_handler(wl_log, cm.DEFAULT_CRAWL_LOG)

    # Configure controller
    torrc_config = ut.get_dict_subconfig(config, args.config, "torrc")
    controller = TorController(cm.TBB_DIR,
                               torrc_dict=torrc_config,
                               pollute=False)

    # Configure browser
    ffprefs = ut.get_dict_subconfig(config, args.config, "ffpref")
    driver = TorBrowserWrapper(cm.TBB_DIR,
                               tbb_logfile_path=cm.DEFAULT_FF_LOG,
                               tor_cfg=USE_RUNNING_TOR,
                               pref_dict=ffprefs,
                               socks_port=int(torrc_config['socksport']))

    # Instantiate crawler
    crawl_type = getattr(crawler_mod, "Crawler" + args.type)
    crawler = crawl_type(driver, controller, args.screenshots, args.export_har)

    # Configure crawl
    job_config = ut.get_dict_subconfig(config, args.config, "job")
    job = crawler_mod.CrawlJob(job_config, url_list)

    # Run display
    xvfb_display = setup_virtual_display(args.virtual_display)

    # Run the crawl
    chdir(cm.CRAWL_DIR)
    try:
        crawler.crawl(job)
    except KeyboardInterrupt:
        wl_log.warning("Keyboard interrupt! Quitting...")
        sys.exit(-1)
    finally:
        # Post crawl
        post_crawl()

        # Close display
        ut.stop_xvfb(xvfb_display)

    # die
    sys.exit(0)
 def check_conn_error(self):
     if self.driver.current_url == "about:newtab":
         wl_log.warning('Stuck in about:newtab, visit #%s to %s' %
                        (self.job.visit, self.job.url))
     if self.driver.is_connection_error_page:
         wl_log.warning('Connection Error, visit #%s to %s' %
                        (self.job.visit, self.job.url))
         raise cm.ConnErrorPage
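
The snippet above raises cm.ConnErrorPage as a custom exception. A minimal sketch of how such an exception could be declared (hypothetical; the real cm module may define it differently):

class ConnErrorPage(Exception):
    """Raised when Tor Browser lands on its connection error page."""
    pass
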
Example #4
def store_swfs(msg, crawl_id, dir_path='/tmp', prefix='?'):
    
    referer = msg.request.headers['Referer'][0] if msg.request.headers['Referer'] else ""
    
    if msg.response and msg.response.content:
        print msg.request.get_url()
        if msg.response.content[:3] in SWF_MAGIC_NUMBERS:  # too wide, but the decompiler will discard false positives
            
            swf_hash = ut.hash_text(msg.response.content)
            swf_url = msg.request.get_url()
            
            db_conn = dbu.mysql_init_db()
            db_cursor = db_conn.cursor(dbu.mdb.cursors.DictCursor)
            rows = swu.get_swf_obj_from_db('hash', swf_hash, db_cursor)
            
            if not rows:
                swf_filename = os.path.join(dir_path, "%s-%s" % (prefix, msg.request.path.split('/')[-1]))
                swf_filename = swf_filename[:MAX_FILENAME_LEN]
                if not swf_filename.endswith('.swf'):
                    swf_filename += '.swf'
                    
                wl_log.info("SWF saved %s referrer: %s" % (os.path.basename(swf_filename), referer))
                
                fu.write_to_file(swf_filename, msg.response.content)
                vector = swu.get_occurence_vector_from_swf(swf_filename, os.path.join(dir_path, prefix))
                duplicate_swf = 0
            else:
                wl_log.info("A swf with same hash exists in DB: %s %s" % (swf_hash, swf_url))
                vector = swu.str_to_vector(rows[0]['occ_vector'])
                swf_filename = rows[0]['local_path']
                duplicate_swf = 1
            
            rank, domain = prefix.rsplit('/')[-1].split('-', 1)
            swf_info = swu.SwfInfo()
            
            swf_info.rank = rank # this might be fake
            swf_info.domain = domain
            swf_info.local_path = swf_filename
            swf_info.occ_vector = vector
            swf_info.hash = swf_hash
            swf_info.url = swf_url
            swf_info.referer = referer        
            swf_info.duplicate = duplicate_swf  # 1 for SWFs whose hash is already in the DB
            swf_info.feat_vector = []
            swf_info.page_url = ''
            swf_info.occ_string = ' '.join(swu.human_readable_occ_vector(vector))
            swf_info.crawl_id = crawl_id
            
            swu.add_swf_to_db(swf_info, db_conn)
            db_conn.commit()
            db_cursor.close()
            db_conn.close()
            
            
        elif '.swf' in msg.request.path:
            wl_log.warning(".swf in path but content seems non-swf %s %s" % (msg.request.path, msg.response.content[:100]))
        else:
            pass
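
SWF_MAGIC_NUMBERS is compared against the first three response bytes, i.e. the SWF file signature. A plausible definition (the constant name follows the example above; the three signatures themselves are the standard uncompressed, zlib-compressed and LZMA-compressed SWF headers):

# Standard SWF signatures: uncompressed, zlib-compressed, LZMA-compressed.
SWF_MAGIC_NUMBERS = ('FWS', 'CWS', 'ZWS')
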
def run():
    # build dirs
    build_crawl_dirs()

    # Parse arguments
    args, config = parse_arguments()

    # Read URLs
    url_list = read_list_urls(args.url_file, args.start, args.stop)
    host_list = [urlparse(url).hostname for url in url_list]

    # Configure logger
    add_log_file_handler(wl_log, cm.DEFAULT_CRAWL_LOG)

    # Configure controller
    torrc_config = ut.get_dict_subconfig(config, args.config, "torrc")
    controller = TorController(cm.TBB_DIR,
                               torrc_dict=torrc_config,
                               pollute=False)

    # Configure browser
    ffprefs = ut.get_dict_subconfig(config, args.config, "ffpref")
    driver = TorBrowserWrapper(cm.TBB_DIR,
                               tbb_logfile_path=cm.DEFAULT_FF_LOG,
                               tor_cfg=USE_RUNNING_TOR,
                               pref_dict=ffprefs,
                               socks_port=int(torrc_config['socksport']),
                               canvas_allowed_hosts=host_list)

    # Instantiate crawler
    crawl_type = getattr(crawler_mod, "Crawler" + args.type)
    crawler = crawl_type(driver, controller, args.screenshots)

    # Configure crawl
    job_config = ut.get_dict_subconfig(config, args.config, "job")
    job = crawler_mod.CrawlJob(job_config, url_list)

    # Run display
    xvfb_display = setup_virtual_display(args.virtual_display)

    # Run the crawl
    chdir(cm.CRAWL_DIR)
    try:
        crawler.crawl(job)
    except KeyboardInterrupt:
        wl_log.warning("Keyboard interrupt! Quitting...")
        sys.exit(-1)
    finally:
        # Post crawl
        post_crawl()

        # Close display
        ut.stop_xvfb(xvfb_display)

    # die
    sys.exit(0)
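
add_log_file_handler(wl_log, cm.DEFAULT_CRAWL_LOG) attaches a file handler so the crawl log is also written to disk. A minimal sketch of such a helper, assuming wl_log is a standard logging.Logger (the project's real implementation may differ):

import logging

def add_log_file_handler(logger, log_filename):
    """Attach a FileHandler that writes the log to log_filename."""
    handler = logging.FileHandler(log_filename)
    handler.setFormatter(
        logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
    logger.addHandler(handler)
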
 def stop_capture(self):
     ut.kill_all_children(self.p0.pid)  # self.p0.pid is the shell pid
     os.kill(self.p0.pid, signal.SIGINT)
     #self.p0.kill()
     if os.path.isfile(self.logpath):
         wl_log.info('Onionperf killed. Capture size: %s Bytes %s' %
                     (os.path.getsize(self.logpath), self.logpath))
     else:
         wl_log.warning('Onionperf killed but cannot find capture file: %s'
                        % self.logpath)
 def stop_capture(self):
     """Kill the dumpcap process."""
     ut.kill_all_children(self.p0.pid)  # self.p0.pid is the shell pid
     self.p0.kill()
     self.is_recording = False
     if os.path.isfile(self.pcap_file):
         wl_log.info('Dumpcap killed. Capture size: %s Bytes %s' %
                     (os.path.getsize(self.pcap_file), self.pcap_file))
     else:
         wl_log.warning('Dumpcap killed but cannot find capture file: %s'
                        % self.pcap_file)
 def stop_capture(self):
     """Kill the dumpcap process."""
     ut.kill_all_children(self.p0.pid)  # self.p0.pid is the shell pid
     self.p0.kill()
     self.is_recording = False
     if os.path.isfile(self.pcap_file):
         wl_log.info('Dumpcap killed. Capture size: %s Bytes %s' %
                     (os.path.getsize(self.pcap_file), self.pcap_file))
     else:
         wl_log.warning('Dumpcap killed but cannot find capture file: %s' %
                        self.pcap_file)
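
The comment "self.p0.pid is the shell pid" suggests the capture is launched through a shell, which is why stop_capture kills the children first and then the shell itself. A hypothetical start_capture counterpart under that assumption (not the project's actual code; dumpcap's -i/-w flags are standard):

import subprocess

def start_capture(self, pcap_file, iface="eth0"):
    self.pcap_file = pcap_file
    # shell=True makes self.p0.pid the shell's pid; dumpcap runs as its
    # child, hence kill_all_children() before killing self.p0 above.
    cmd = "dumpcap -i %s -w %s" % (iface, pcap_file)
    self.p0 = subprocess.Popen(cmd, shell=True)
    self.is_recording = True
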
Example #9
 def __do_batch(self):
     """
     Must init/restart the Tor process to get a different circuit.
     If the controller is configured not to pollute the profile, each
     restart forces a change of the entry guard.
     """
     with self.controller.launch():
         for self.job.site in xrange(len(self.job.urls)):
             if len(self.job.url) > cm.MAX_FNAME_LENGTH:
                 wl_log.warning("URL is too long: %s" % self.job.url)
                 continue
             self.__do_instance()
             sleep(float(self.job.config['pause_between_sites']))
Example #10
 def __do_batch(self):
     """
     Must init/restart the Tor process to get a different circuit.
     If the controller is configured not to pollute the profile, each
     restart forces a change of the entry guard.
     """
     with self.controller.launch():
         for self.job.site in xrange(len(self.job.urls)):
             if len(self.job.url) > cm.MAX_FNAME_LENGTH:
                 wl_log.warning("URL is too long: %s" % self.job.url)
                 continue
             self.__do_instance()
             sleep(float(self.job.config['pause_between_sites']))
 def stop_capture(self):
     """Kill the dumpcap process."""
     ut.kill_all_children(self.p0.pid)  # self.p0.pid is the shell pid
     self.p0.kill()
     self.is_recording = False
     captcha_filepath = ut.capture_filepath_to_captcha(self.pcap_file)
     if os.path.isfile(self.pcap_file):
         wl_log.info('Sniffer killed. Capture size: %s Bytes %s' %
                     (os.path.getsize(self.pcap_file), self.pcap_file))
     elif os.path.isfile(captcha_filepath):
         wl_log.info(
             'Sniffer killed, file renamed to captcha_*. Capture size: %s Bytes %s'
             % (os.path.getsize(captcha_filepath), captcha_filepath))
     else:
         wl_log.warning(
             'Sniffer killed but cannot find capture file: %s or %s' %
             (self.pcap_file, captcha_filepath))
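
ut.capture_filepath_to_captcha maps the pcap path to the name used when a capture is renamed after a CAPTCHA is detected. Judging from the log message ("file renamed to captcha_*"), it prefixes the base name; a hypothetical sketch (the real helper may differ):

import os

def capture_filepath_to_captcha(pcap_path):
    """Return the capture path with its base name prefixed by 'captcha_'."""
    dir_name, base_name = os.path.split(pcap_path)
    return os.path.join(dir_name, "captcha_" + base_name)
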
Example #12
def click_to_xpath_selector(br, page_url, selector):
    #wl_log.info('Will find els by selector %s on %s' % (selector, page_url))
    els = br.find_elements_by_xpath(selector)
    for el in els:
        if is_clickable(el, page_url):
            href = el.get_attribute('href') or "?"
            try:
                el.click()
            except Exception as es:
                wl_log.warning('Exception while clicking: href: %s %s %s' % (href, es, page_url))                
            else:
                wl_log.info('Clicked!: href: %s %s %s' % (href, selector, page_url))
                sleep(WAIT_AFTER_CLICK)
                get_and_sleep(br, page_url)
                return 1
    #wl_log.debug('No clickable element found for: %s %s' % (selector, page_url))
    return 0 # we couldn't find any element to click
Example #13
def click_to_xpath_selector(br, page_url, selector):
    #wl_log.info('Will find els by selector %s on %s' % (selector, page_url))
    els = br.find_elements_by_xpath(selector)
    for el in els:
        if is_clickable(el, page_url):
            href = el.get_attribute('href') or "?"
            try:
                el.click()
            except Exception as es:
                wl_log.warning('Exception while clicking: href: %s %s %s' %
                               (href, es, page_url))
            else:
                wl_log.info('Clicked!: href: %s %s %s' %
                            (href, selector, page_url))
                sleep(WAIT_AFTER_CLICK)
                get_and_sleep(br, page_url)
                return 1
    #wl_log.debug('No clickable element found for: %s %s' % (selector, page_url))
    return 0  # we couldn't find any element to click
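
Both click examples gate the click on an is_clickable(el, page_url) helper. A minimal sketch of what such a check could look like with Selenium's WebElement API (hypothetical; the project's real helper may apply more filters):

def is_clickable(el, page_url):
    """Treat an element as clickable if it is displayed and enabled."""
    try:
        return el.is_displayed() and el.is_enabled()
    except Exception:
        # Stale or detached elements are treated as not clickable.
        return False
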
    def _do_batch(self):
        """
        Must init/restart the Tor process to get a different circuit.
        If the controller is configured not to pollute the profile, each
        restart forces a change of the entry guard.
        """
        while self.job.site < len(self.job.urls):
            if self.job.url in self.job.batch_failed and self.job.batch_failed[
                    self.job.url]:
                wl_log.info("Skipping URL because has failed too many times")
                self.job.site += 1
                continue

            if len(self.job.url) > cm.MAX_FNAME_LENGTH:
                wl_log.warning("URL is too long: %s" % self.job.url)
                self.job.site += 1
                continue

            self._do_visits()
            sleep(float(self.job.config['pause_between_sites']))
            self.job.site += 1
        if self.job.site == len(self.job.urls):
            self.job.site = 0
Example #15
def store_swfs(msg, crawl_id, dir_path='/tmp', prefix='?'):

    referer = msg.request.headers['Referer'][0] if msg.request.headers[
        'Referer'] else ""

    if msg.response and msg.response.content:
        print msg.request.get_url()
        if msg.response.content[:3] in SWF_MAGIC_NUMBERS:  # too wide, but the decompiler will discard false positives

            swf_hash = ut.hash_text(msg.response.content)
            swf_url = msg.request.get_url()

            db_conn = dbu.mysql_init_db()
            db_cursor = db_conn.cursor(dbu.mdb.cursors.DictCursor)
            rows = swu.get_swf_obj_from_db('hash', swf_hash, db_cursor)

            if not rows:
                swf_filename = os.path.join(
                    dir_path,
                    "%s-%s" % (prefix, msg.request.path.split('/')[-1]))
                swf_filename = swf_filename[:MAX_FILENAME_LEN]
                if not swf_filename.endswith('.swf'):
                    swf_filename += '.swf'

                wl_log.info("SWF saved %s referrer: %s" %
                            (os.path.basename(swf_filename), referer))

                fu.write_to_file(swf_filename, msg.response.content)
                vector = swu.get_occurence_vector_from_swf(
                    swf_filename, os.path.join(dir_path, prefix))
                duplicate_swf = 0
            else:
                wl_log.info("A swf with same hash exists in DB: %s %s" %
                            (swf_hash, swf_url))
                vector = swu.str_to_vector(rows[0]['occ_vector'])
                swf_filename = rows[0]['local_path']
                duplicate_swf = 1

            rank, domain = prefix.rsplit('/')[-1].split('-', 1)
            swf_info = swu.SwfInfo()

            swf_info.rank = rank  # this might be fake
            swf_info.domain = domain
            swf_info.local_path = swf_filename
            swf_info.occ_vector = vector
            swf_info.hash = swf_hash
            swf_info.url = swf_url
            swf_info.referer = referer
            swf_info.duplicate = duplicate_swf  # 1 for SWFs whose hash is already in the DB
            swf_info.feat_vector = []
            swf_info.page_url = ''
            swf_info.occ_string = ' '.join(
                swu.human_readable_occ_vector(vector))
            swf_info.crawl_id = crawl_id

            swu.add_swf_to_db(swf_info, db_conn)
            db_conn.commit()
            db_cursor.close()
            db_conn.close()

        elif '.swf' in msg.request.path:
            wl_log.warning(".swf in path but content seems non-swf %s %s" %
                           (msg.request.path, msg.response.content[:100]))
        else:
            pass
def run():
    # Parse arguments
    args, config = parse_arguments()

    # build dirs
    build_crawl_dirs()

    # Read URLs
    if isfile(args.urls):
        url_list = parse_url_list(args.urls, args.start, args.stop)
    else:
        try:
            url_list = args.urls.split(',')
        except Exception as e:
            wl_log.error("ERROR: expects a string with comma-separated list "
                         "of URLs of a path to file")
    host_list = [urlparse(url).hostname for url in url_list]

    # Configure logger
    add_log_file_handler(wl_log, cm.CRAWL_LOG_FILENAME)

    # Configure controller
    torrc_config = ut.get_dict_subconfig(config, args.config, "torrc")
    controller = TorController(tbb_path=args.tbb_path,
                               tor_binary_path=args.tor_binary_path,
                               tor_data_path=args.tor_data_path,
                               torrc_dict=torrc_config,
                               pollute=False)

    # Configure browser
    ffprefs = ut.get_dict_subconfig(config, args.config, "ffpref")
    ffprefs = ut.set_dict_value_types(ffprefs)
    print(ffprefs)
    addons_path = [abspath(args.addons_dir)] if args.addons_dir else []
    driver_config = {'tbb_path': cm.TBB_DIR,
                     'tor_cfg': USE_RUNNING_TOR,
                     'pref_dict': ffprefs,
                     'extensions': addons_path,
                     'socks_port': int(torrc_config['socksport']),
                     'control_port': int(torrc_config['controlport']),
                     'canvas_allowed_hosts': host_list
                     }

    # Instantiate crawler
    crawl_type = getattr(crawler_mod, "Crawler" + args.type)
    crawler = crawl_type(controller,
                         driver_config=driver_config,
                         device=args.device,
                         screenshots=args.screenshots)

    # Configure crawl
    if args.recover_file is not None:
        if isfile(args.recover_file):
            with open(args.recover_file) as fchkpt:
                job = pickle.load(fchkpt)
                wl_log.info("Job recovered: %s" % str(job))
        else:
            wl_log.error("Checkpoint file %s does not exist" % args.recover_file)
            sys.exit(1)
    else:
        # parse job configuration
        job_config = ut.get_dict_subconfig(config, args.config, "job")

        # get chunk of urls to crawl
        chunk = int(job_config.get('chunk', 0))
        chunks = int(job_config.get('chunks', 1))
        range_chunk = len(url_list) / chunks
        if chunk == chunks - 1: # last chunk takes remaining urls
            url_list_chunk = url_list[chunk * range_chunk:]
        else:
            url_list_chunk = url_list[chunk * range_chunk:(chunk + 1) * range_chunk]
        job = crawler_mod.CrawlJob(job_config, url_list_chunk)

    # Run display
    xvfb_display = setup_virtual_display(args.virtual_display)

    # Run the crawl
    chdir(cm.CRAWL_DIR)
    try:
        crawler.crawl(job)
    except KeyboardInterrupt:
        wl_log.warning("Keyboard interrupt! Quitting...")
        sys.exit(-1)
    except Exception as e:
        wl_log.error("ERROR: unknown exception while crawling: %s" % e)
    finally:
        #driver.quit()
        #controller.quit()
        # Post crawl
        post_crawl()

        # Close display
        ut.stop_xvfb(xvfb_display)

    # die
    wl_log.info("[tbcrawler] the crawl has finished.")
    sys.exit(0)
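
The chunking logic above splits url_list into chunks slices of len(url_list) / chunks URLs (Python 2 integer division), with the last chunk absorbing the remainder. A small worked illustration:

# Worked illustration of the chunk slicing used above (Python 2 semantics).
url_list = ["url%d" % i for i in range(10)]
chunks = 3
range_chunk = len(url_list) / chunks            # 3
for chunk in range(chunks):
    if chunk == chunks - 1:                     # last chunk takes the rest
        part = url_list[chunk * range_chunk:]   # url6 .. url9
    else:
        part = url_list[chunk * range_chunk:(chunk + 1) * range_chunk]
    print chunk, part
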
Example #17
    if verbose:
        wl_log.setLevel(logging.DEBUG)
    else:
        wl_log.setLevel(logging.INFO)

    # Validate the given arguments
    # Read urls
    url_list = np.loadtxt(url_list_path, delimiter='\n', dtype=str)
    url_list = url_list.tolist()
    url_list = url_list[start_line - 1:stop_line]
    torrc_dict = cm.TORRC_DEFAULT

    if not tbb_version:
        tbb_version = cm.TBB_DEFAULT_VERSION
    elif tbb_version not in cm.TBB_KNOWN_VERSIONS:
        ut.die('Version of Tor browser is not recognized.')

    crawler = Crawler(torrc_dict, url_list, tbb_version, xvfb, capture_screen)
    wl_log.info('Command line parameters: %s' % sys.argv)

    # Run the crawl
    try:
        crawler.crawl(no_of_batches,
                      no_of_instances,
                      start_line=start_line - 1)
    except KeyboardInterrupt:
        wl_log.warning('Keyboard interrupt! Quitting...')
    except Exception as e:
        wl_log.error('Exception: \n%s' % (traceback.format_exc()))
    finally:
        crawler.stop_crawl()
 def check_captcha(self):
     if CHECK_CAPTCHA and ut.has_captcha(self.page_source):
         wl_log.warning('captcha found')
         self.job.add_captcha()
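
ut.has_captcha(self.page_source) is a heuristic over the page source. A hypothetical sketch of such a check (the real helper may look for different markers):

def has_captcha(page_source):
    """Rough heuristic: flag pages whose source mentions a CAPTCHA."""
    return "captcha" in page_source.lower()
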
Example #19
    if experiment == cm.EXP_TYPE_WANG_AND_GOLDBERG:
        torrc_dict = cm.TORRC_WANG_AND_GOLDBERG
    elif experiment == cm.EXP_TYPE_MULTITAB_ALEXA:
        torrc_dict = cm.TORRC_DEFAULT
    else:
        ut.die("Experiment type is not recognized."
               " Use --help to see the possible values.")

    if not tbb_version:
        # Assign the last stable version of TBB
        tbb_version = cm.TBB_DEFAULT_VERSION
    elif tbb_version not in cm.TBB_KNOWN_VERSIONS:
        ut.die("Version of Tor browser is not recognized."
               " Use --help to see which are the accepted values.")

    crawler = Crawler(torrc_dict, url_list, tbb_version, experiment, xvfb,
                      capture_screen)
    wl_log.info("Command line parameters: %s" % sys.argv)

    # Run the crawl
    try:
        crawler.crawl(no_of_batches,
                      no_of_instances,
                      start_line=start_line - 1)
    except KeyboardInterrupt:
        wl_log.warning("Keyboard interrupt! Quitting...")
    except Exception as e:
        wl_log.error("Exception: \n%s" % (traceback.format_exc()))
    finally:
        crawler.stop_crawl(pack_results=False)
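
ut.die is used in the argument-validation snippets above to abort on bad input. A minimal sketch, assuming it simply reports the error and exits (the real helper may log through wl_log instead):

import sys

def die(message):
    """Report a fatal configuration error and terminate."""
    sys.stderr.write(message + "\n")
    sys.exit(1)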