def test_update_visit(self):
    """Check that a visit's duration and incomplete flag survive add and update.

    The visit is stored with ADD_VISIT and read back with VISIT_BY_ID, then
    modified, stored again with UPDATE_VISIT and read back once more.
    """
    started_at = strftime("%Y%m%d-%H%M%S")
    event = cm.BrowserEvent()
    event.event_type = cm.EVENT_NEW_VISIT

    visit = cm.VisitInfo()
    visit.url = "http://example.com"
    visit.start_time = started_at
    visit.out_db = self.test_db
    visit.duration, visit.incomplete = 0, 1

    def assert_stored_matches():
        # Re-read the entry by id and compare the two fields under test.
        stored = dbu.get_db_entry(
            self.test_db, dbu.DBCmd.VISIT_BY_ID, visit.visit_id)
        self.assertEqual(visit.duration, stored.duration)
        self.assertEqual(visit.incomplete, stored.incomplete)

    # Initial insert: the returned id is what later lookups use.
    visit.visit_id = dbu.insert_to_db(dbu.DBCmd.ADD_VISIT, event, visit)
    assert_stored_matches()

    # Change both fields and push them with the update command.
    visit.duration, visit.incomplete = 33, 0
    dbu.insert_to_db(dbu.DBCmd.UPDATE_VISIT, event, visit)
    assert_stored_matches()
Beispiel #2
0
    def test_update_visit(self):
        """Verify duration/incomplete round-trip through ADD_VISIT and UPDATE_VISIT.

        After each write the visit is fetched with VISIT_BY_ID and the two
        fields are compared against the in-memory VisitInfo.
        """
        visit_start = strftime("%Y%m%d-%H%M%S")
        browser_event = cm.BrowserEvent()
        browser_event.event_type = cm.EVENT_NEW_VISIT

        visit_info = cm.VisitInfo()
        visit_info.url = "http://example.com"
        visit_info.start_time = visit_start
        visit_info.out_db = self.test_db
        visit_info.duration, visit_info.incomplete = 0, 1

        def check_against_db():
            # Fetch the stored visit and compare the fields being tested.
            from_db = dbu.get_db_entry(
                self.test_db, dbu.DBCmd.VISIT_BY_ID, visit_info.visit_id)
            self.assertEqual(visit_info.duration, from_db.duration)
            self.assertEqual(visit_info.incomplete, from_db.incomplete)

        # First write: keep the id returned by the insert for the lookups.
        visit_info.visit_id = dbu.insert_to_db(
            dbu.DBCmd.ADD_VISIT, browser_event, visit_info)
        check_against_db()

        # Second write: new values go in through the update command.
        visit_info.duration, visit_info.incomplete = 33, 0
        dbu.insert_to_db(dbu.DBCmd.UPDATE_VISIT, browser_event, visit_info)
        check_against_db()
Beispiel #3
0
 def test_lso_db_ops(self):
     """Store parsed LSO events and check the single entry read back.

     Events are parsed from self.lso_file, inserted with ADD_LSO_ITEMS and
     fetched with GET_FLASH_COOKIES; columns 2..7 of the only result are
     compared with the expected test constants.
     """
     parsed_events = lso.parse_strace_logs(self.vi, test_lso=self.lso_file)
     dbu.insert_to_db(dbu.DBCmd.ADD_LSO_ITEMS, parsed_events, self.vi)

     query_result = dbu.get_db_entry(
         self.vi.out_db, dbu.DBCmd.GET_FLASH_COOKIES, self.vi.visit_id)
     stored_items = query_result.fetchall()
     self.assertEqual(len(stored_items), 1)

     # Unpacking into six names fails loudly if the slice is too short.
     (url_col, domain_col, filename_col,
      path_col, key_col, content_col) = stored_items[0][2:8]

     comparisons = (
         (url_col, self.vi.url),
         (domain_col, cm.ONLINE_TEST_HOST),
         (filename_col, TEST_LSO_FILENAME),
         (path_col, TEST_LSO_REL_PATH),
         (key_col, TEST_LSO_KEYNAME),
         (content_col, TEST_LSO_VALUE),
     )
     for actual, wanted in comparisons:
         self.assertEqual(actual, wanted)
Beispiel #4
0
 def test_lso_db_ops(self):
     """Insert LSO items parsed from the test log and verify what is stored.

     Exactly one entry is expected from GET_FLASH_COOKIES for this visit;
     its fields at positions 2 through 7 must match the test fixtures.
     """
     events = lso.parse_strace_logs(self.vi, test_lso=self.lso_file)
     dbu.insert_to_db(dbu.DBCmd.ADD_LSO_ITEMS, events, self.vi)

     db_items = dbu.get_db_entry(
         self.vi.out_db,
         dbu.DBCmd.GET_FLASH_COOKIES,
         self.vi.visit_id,
     ).fetchall()
     self.assertEqual(len(db_items), 1)

     # Six-way unpack keeps the original failure mode for short results.
     got_url, got_domain, got_file, got_path, got_key, got_value = \
         db_items[0][2:8]

     checks = [
         (got_url, self.vi.url),
         (got_domain, cm.ONLINE_TEST_HOST),
         (got_file, TEST_LSO_FILENAME),
         (got_path, TEST_LSO_REL_PATH),
         (got_key, TEST_LSO_KEYNAME),
         (got_value, TEST_LSO_VALUE),
     ]
     for observed, expected in checks:
         self.assertEqual(observed, expected)
Beispiel #5
0
def process_crawler_output(ff_log_file, visit_info, flash=1):
    """Gather everything recorded during a visit and write it to the database.

    Args:
        ff_log_file: browser log handed to ff_log_parser.
        visit_info: visit record; its profile_dir and http_dump attributes
            are read here and the object itself is passed on to the parsers
            and to dbu.insert_to_db.
        flash: when truthy, strace logs are parsed as well (after a
            5 second pause) and stored under ADD_LSO_ITEMS.

    Returns:
        dict with the keys "calls", "cookies", "flash_cookies",
        "local_storage", "indexed_db", "cache" and "http_msgs".
        "flash_cookies" is an empty list when flash is falsy.
    """
    # wait until tmp files are merged to db, otherwise we won't
    # find the recently added items in db
    sleep_until_sqlite_checkpoint(visit_info.profile_dir)

    cache_items = cu.get_ff_cache(visit_info.profile_dir)
    canvas_calls, every_call = ff_log_parser(ff_log_file, visit_info)
    ff_cookies = cookie.get_ff_cookies(visit_info.profile_dir)
    ls_items = ls.get_ff_local_storage(visit_info.profile_dir)
    idb_items = indexedDB.gen_ff_indexedDB(visit_info)

    # One job per DB command; everything is inserted in a single call below.
    jobs = {
        dbu.DBCmd.ADD_CACHE_ITEMS: cache_items,
        dbu.DBCmd.ADD_CANVAS: canvas_calls,
        dbu.DBCmd.ADD_COOKIES: ff_cookies,
        dbu.DBCmd.ADD_LOCALSTORAGE_ITEMS: ls_items,
        dbu.DBCmd.ADD_INDEXEDDB_ITEMS: idb_items,
    }

    lso_items = []
    if flash:
        time.sleep(5)  # strace logs may not be available immediately
        lso_items = lso.parse_strace_logs(visit_info)
        jobs[dbu.DBCmd.ADD_LSO_ITEMS] = lso_items

    mitm_msgs = list(mitm.parse_mitm_dump(visit_info.http_dump))
    jobs[dbu.DBCmd.ADD_HTTP_HEADERS] = mitm_msgs

    dbu.insert_to_db(dbu.DBCmd.ADD_ALL_VISIT_DATA, jobs, visit_info)

    return dict(
        calls=every_call,
        cookies=ff_cookies,
        flash_cookies=lso_items,
        local_storage=ls_items,
        indexed_db=idb_items,
        cache=cache_items,
        http_msgs=mitm_msgs,
    )
Beispiel #6
0
def process_crawler_output(ff_log_file, visit_info, flash=1):
    """Collect the data left behind by a visit and insert it into the DB.

    Args:
        ff_log_file: browser log passed to ff_log_parser.
        visit_info: visit record; profile_dir and http_dump are read from
            it, and it is forwarded to the parsers and to dbu.insert_to_db.
        flash: if truthy, sleep 5 seconds and then parse the strace logs,
            adding the result as the ADD_LSO_ITEMS job.

    Returns:
        A dict keyed by "calls", "cookies", "flash_cookies",
        "local_storage", "indexed_db", "cache" and "http_msgs".
        "flash_cookies" stays an empty list when flash is falsy.
    """
    # wait until tmp files are merged to db, otherwise we won't
    # find the recently added items in db
    sleep_until_sqlite_checkpoint(visit_info.profile_dir)

    cached = cu.get_ff_cache(visit_info.profile_dir)
    js_canvas_calls, all_js_calls = ff_log_parser(ff_log_file, visit_info)
    browser_cookies = cookie.get_ff_cookies(visit_info.profile_dir)
    local_storage = ls.get_ff_local_storage(visit_info.profile_dir)
    indexed_db = indexedDB.gen_ff_indexedDB(visit_info)

    # Map each DB command to its payload; a single insert handles them all.
    pending = {
        dbu.DBCmd.ADD_CACHE_ITEMS: cached,
        dbu.DBCmd.ADD_CANVAS: js_canvas_calls,
        dbu.DBCmd.ADD_COOKIES: browser_cookies,
        dbu.DBCmd.ADD_LOCALSTORAGE_ITEMS: local_storage,
        dbu.DBCmd.ADD_INDEXEDDB_ITEMS: indexed_db,
    }

    lsos = []
    if flash:
        time.sleep(5)  # strace logs may not be available immediately
        lsos = lso.parse_strace_logs(visit_info)
        pending[dbu.DBCmd.ADD_LSO_ITEMS] = lsos

    http_traffic = list(mitm.parse_mitm_dump(visit_info.http_dump))
    pending[dbu.DBCmd.ADD_HTTP_HEADERS] = http_traffic

    dbu.insert_to_db(dbu.DBCmd.ADD_ALL_VISIT_DATA, pending, visit_info)

    summary = {}
    summary["calls"] = all_js_calls
    summary["cookies"] = browser_cookies
    summary["flash_cookies"] = lsos
    summary["local_storage"] = local_storage
    summary["indexed_db"] = indexed_db
    summary["cache"] = cached
    summary["http_msgs"] = http_traffic
    return summary
 def test_r_w_visit_to_db(self):
     """Write a visit with ADD_VISIT and check url/start_time read back equal."""
     started_at = strftime("%Y%m%d-%H%M%S")
     event = cm.BrowserEvent()
     event.event_type = cm.EVENT_NEW_VISIT

     visit = cm.VisitInfo()
     visit.url = "http://example.com"
     visit.start_time = started_at
     visit.out_db = self.test_db

     # The id returned by the insert is the lookup key for the read.
     new_id = dbu.insert_to_db(dbu.DBCmd.ADD_VISIT, event, visit)
     stored = dbu.get_db_entry(
         self.test_db, dbu.DBCmd.VISIT_BY_ID, new_id)

     self.assertEqual(visit.url, stored.url)
     self.assertEqual(visit.start_time, stored.start_time)
Beispiel #8
0
 def test_r_w_visit_to_db(self):
     """Round-trip a visit through the DB and compare url and start_time."""
     visit_start = strftime("%Y%m%d-%H%M%S")
     browser_event = cm.BrowserEvent()
     browser_event.event_type = cm.EVENT_NEW_VISIT

     written = cm.VisitInfo()
     written.url = "http://example.com"
     written.start_time = visit_start
     written.out_db = self.test_db

     # Insert, then fetch by the id the insert handed back.
     row_id = dbu.insert_to_db(dbu.DBCmd.ADD_VISIT, browser_event, written)
     read_back = dbu.get_db_entry(
         self.test_db, dbu.DBCmd.VISIT_BY_ID, row_id)

     self.assertEqual(written.url, read_back.url)
     self.assertEqual(written.start_time, read_back.start_time)
 def test_r_w_canvas_to_db(self):
     """Store a toDataURL canvas event and verify the entry read back by id.

     The event is inserted with ADD_CANVAS for visit id 1 and fetched with
     CANVAS_BY_ID, which yields (visit_id, data_url_id, event_time, event).
     """
     event = cm.BrowserEvent()
     event.event_type = cm.EVENT_TODATAURL
     event.url = "http://example.com"
     event.js_file = "http://example.com/fp.js"
     event.js_line = 5
     event.txt = "data:asdsads"

     visit = cm.VisitInfo()
     visit.visit_id = 1
     visit.out_db = self.test_db

     canvas_id = dbu.insert_to_db(dbu.DBCmd.ADD_CANVAS, event, visit)
     self.assertGreater(canvas_id, 0)

     stored_visit_id, stored_data_url_id, stored_time, stored_event = \
         dbu.get_db_entry(self.test_db, dbu.DBCmd.CANVAS_BY_ID, canvas_id)

     self.assertEqual(visit.visit_id, stored_visit_id)
     self.assertEqual(stored_data_url_id, 1)
     # Event attributes must come back unchanged.
     for attr in ("event_type", "url", "js_file", "js_line"):
         self.assertEqual(getattr(stored_event, attr), getattr(event, attr))
     self.assertEqual(stored_time, 0)
Beispiel #10
0
 def test_r_w_canvas_to_db(self):
     """Write a canvas event with ADD_CANVAS and check the CANVAS_BY_ID result.

     CANVAS_BY_ID is unpacked as (visit_id, data_url_id, event_time, event);
     each part is compared with what was written.
     """
     written_event = cm.BrowserEvent()
     written_event.event_type = cm.EVENT_TODATAURL
     written_event.url = "http://example.com"
     written_event.js_file = "http://example.com/fp.js"
     written_event.js_line = 5
     written_event.txt = "data:asdsads"

     visit_info = cm.VisitInfo()
     visit_info.visit_id = 1
     visit_info.out_db = self.test_db

     event_id = dbu.insert_to_db(
         dbu.DBCmd.ADD_CANVAS, written_event, visit_info)
     self.assertGreater(event_id, 0)

     entry = dbu.get_db_entry(
         self.test_db, dbu.DBCmd.CANVAS_BY_ID, event_id)
     db_visit_id, db_data_url_id, db_event_time, db_event = entry

     self.assertEqual(visit_info.visit_id, db_visit_id)
     self.assertEqual(db_data_url_id, 1)
     # Compare the event fields one by one, in the original order.
     for field in ("event_type", "url", "js_file", "js_line"):
         self.assertEqual(getattr(db_event, field),
                          getattr(written_event, field))
     self.assertEqual(db_event_time, 0)
Beispiel #11
0
def visit_page(url_tuple,
               timeout=cm.HARD_TIME_OUT,
               wait_on_site=cm.WAIT_ON_SITE,
               pre_crawl_sleep=False,
               out_dir=cm.BASE_TMP_DIR,
               flash_support=cm.FLASH_ENABLE,
               cookie_support=cm.COOKIE_ALLOW_ALL):
    """Visit one page in an instrumented browser and store what was recorded.

    Args:
        url_tuple: a ``(rank, url)`` pair, or a bare URL. Anything that
            cannot be unpacked into exactly two values is treated as a
            bare URL with rank 0.
        timeout: accepted for interface compatibility; not used in this
            function.
        wait_on_site: seconds to sleep after driver_get returns.
        pre_crawl_sleep: accepted for interface compatibility; not used in
            this function.
        out_dir: directory that receives the syscall/http/mitm files, the
            error and debug logs, and the database file.
        flash_support: forwarded to get_browser and process_crawler_output;
            when truthy, syscall logging is started for the browser process.
        cookie_support: forwarded to get_browser.

    Returns:
        The dict returned by process_crawler_output when the visit
        completes, or None when it times out or raises (the error is logged
        with cm.print_error and clean_up is called).
    """
    driver = None
    visit_info = cm.VisitInfo()
    try:
        visit_info.rank, visit_info.url = url_tuple
    except (TypeError, ValueError):
        # When rank of the page is not provided, we'll use rank=0.
        # Only unpacking failures are handled here so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        visit_info.rank, visit_info.url = 0, url_tuple

    visit_info.sys_log = join(
        out_dir, "syscall-%s-%s.log" % (visit_info.rank, ut.rand_str()))
    visit_info.http_log = join(
        out_dir, "http-%s-%s.log" % (visit_info.rank, ut.rand_str()))
    visit_info.http_dump = join(
        out_dir, "mitm-%s-%s.dmp" % (visit_info.rank, ut.rand_str()))
    visit_info.start_time = strftime("%Y%m%d-%H%M%S")
    visit_info.out_dir = out_dir
    visit_info.out_db = join(visit_info.out_dir, cm.DB_FILENAME)
    visit_info.err_log = join(out_dir, "error.log")
    visit_info.debug_log = join(out_dir, "debug.log")

    be = cm.BrowserEvent()
    be.event_type = cm.EVENT_NEW_VISIT

    # The log file is opened with the URL as given, before any scheme
    # prefix is added below.
    visit_info.ff_log = open_log_file(out_dir, visit_info.url)

    # Default to http:// when the URL has none of the known scheme prefixes.
    if visit_info.url[:5] not in ('data:', 'http:', 'https', 'file:'):
        visit_info.url = 'http://' + visit_info.url

    try:
        visit_info.visit_id = dbu.insert_to_db(dbu.DBCmd.ADD_VISIT, be,
                                               visit_info)
        cm.print_debug(
            visit_info, "Visiting: %s %s (%s)" %
            (visit_info.visit_id, visit_info.url, visit_info.rank))
        setup_nspr_logging(visit_info.http_log)
        visit_info.vdisplay = start_xvfb()
        port, visit_info.mitm_proc = start_mitm_capture(visit_info.http_dump)
        driver, visit_info.profile_dir, visit_info.sel_proc =\
            get_browser(visit_info.ff_log, port, flash_support, cookie_support)
        if flash_support:
            visit_info.strace_proc = log_syscalls(visit_info.sel_proc,
                                                  visit_info.sys_log)

        #############################################################
        driver_get(driver, visit_info, cm.SOFT_TIMEOUT)  # real visit
        #############################################################
        time.sleep(wait_on_site)
        close_driver(driver, timeout=10)
        # NOTE(review): strace_proc is only assigned above when
        # flash_support is truthy; presumably VisitInfo provides a default
        # - confirm.
        stop_strace(visit_info.strace_proc)
        result_dict = process_crawler_output(visit_info.ff_log, visit_info,
                                             flash_support)
        cm.print_debug(
            visit_info, "Visit OK: %s %s (%s)" %
            (visit_info.visit_id, visit_info.url, visit_info.rank))
        # Mark the visit as finished in the database.
        visit_info.incomplete = 0
        dbu.insert_to_db(dbu.DBCmd.UPDATE_VISIT, be, visit_info)
        quit_driver(driver)
        stop_xvfb(visit_info.vdisplay)
        remove_visit_files(visit_info)
    except (cm.TimeExceededError, sel_exceptions.TimeoutException) as texc:
        err_str = "Visit to %s(%s) timed out %s" % \
            (visit_info.url, visit_info.rank, texc)
        cm.print_error(visit_info, err_str)
        clean_up(visit_info, driver)
        return None
    except Exception as exc:
        # Top-level boundary for a single visit: log with traceback,
        # clean up and report failure.
        err_str = "Exception visiting %s(%s) %s %s" % \
            (visit_info.url, visit_info.rank, exc, traceback.format_exc())
        cm.print_error(visit_info, err_str)
        clean_up(visit_info, driver)
        return None
    else:
        return result_dict
Beispiel #12
0
def visit_page(url_tuple, timeout=cm.HARD_TIME_OUT,
               wait_on_site=cm.WAIT_ON_SITE, pre_crawl_sleep=False,
               out_dir=cm.BASE_TMP_DIR, flash_support=cm.FLASH_ENABLE,
               cookie_support=cm.COOKIE_ALLOW_ALL):
    """Visit one page in an instrumented browser and store what was recorded.

    Args:
        url_tuple: a ``(rank, url)`` pair, or a bare URL. Anything that
            cannot be unpacked into exactly two values is treated as a
            bare URL with rank 0.
        timeout: accepted for interface compatibility; not used in this
            function.
        wait_on_site: seconds to sleep after driver_get returns.
        pre_crawl_sleep: accepted for interface compatibility; not used in
            this function.
        out_dir: directory that receives the syscall/http/mitm files, the
            error and debug logs, and the database file.
        flash_support: forwarded to get_browser and process_crawler_output;
            when truthy, syscall logging is started for the browser process.
        cookie_support: forwarded to get_browser.

    Returns:
        The dict returned by process_crawler_output when the visit
        completes, or None when it times out or raises (the error is logged
        with cm.print_error and clean_up is called).
    """
    driver = None
    visit_info = cm.VisitInfo()
    try:
        visit_info.rank, visit_info.url = url_tuple
    except (TypeError, ValueError):
        # When rank of the page is not provided, we'll use rank=0.
        # Only unpacking failures are handled here so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        visit_info.rank, visit_info.url = 0, url_tuple

    visit_info.sys_log = join(out_dir, "syscall-%s-%s.log" %
                              (visit_info.rank, ut.rand_str()))
    visit_info.http_log = join(out_dir, "http-%s-%s.log" %
                               (visit_info.rank, ut.rand_str()))
    visit_info.http_dump = join(out_dir, "mitm-%s-%s.dmp" %
                                (visit_info.rank, ut.rand_str()))
    visit_info.start_time = strftime("%Y%m%d-%H%M%S")
    visit_info.out_dir = out_dir
    visit_info.out_db = join(visit_info.out_dir, cm.DB_FILENAME)
    visit_info.err_log = join(out_dir, "error.log")
    visit_info.debug_log = join(out_dir, "debug.log")

    be = cm.BrowserEvent()
    be.event_type = cm.EVENT_NEW_VISIT

    # The log file is opened with the URL as given, before any scheme
    # prefix is added below.
    visit_info.ff_log = open_log_file(out_dir, visit_info.url)

    # Default to http:// when the URL has none of the known scheme prefixes.
    if visit_info.url[:5] not in ('data:', 'http:', 'https', 'file:'):
        visit_info.url = 'http://' + visit_info.url

    try:
        visit_info.visit_id = dbu.insert_to_db(dbu.DBCmd.ADD_VISIT, be,
                                               visit_info)
        cm.print_debug(visit_info, "Visiting: %s %s (%s)" %
                       (visit_info.visit_id, visit_info.url, visit_info.rank))
        setup_nspr_logging(visit_info.http_log)
        visit_info.vdisplay = start_xvfb()
        port, visit_info.mitm_proc = start_mitm_capture(visit_info.http_dump)
        driver, visit_info.profile_dir, visit_info.sel_proc =\
            get_browser(visit_info.ff_log, port, flash_support, cookie_support)
        if flash_support:
            visit_info.strace_proc = log_syscalls(visit_info.sel_proc,
                                                  visit_info.sys_log)

        #############################################################
        driver_get(driver, visit_info, cm.SOFT_TIMEOUT)  # real visit
        #############################################################
        time.sleep(wait_on_site)
        close_driver(driver, timeout=10)
        # NOTE(review): strace_proc is only assigned above when
        # flash_support is truthy; presumably VisitInfo provides a default
        # - confirm.
        stop_strace(visit_info.strace_proc)
        result_dict = process_crawler_output(visit_info.ff_log, visit_info,
                                             flash_support)
        cm.print_debug(visit_info, "Visit OK: %s %s (%s)" %
                       (visit_info.visit_id, visit_info.url, visit_info.rank))
        # Mark the visit as finished in the database.
        visit_info.incomplete = 0
        dbu.insert_to_db(dbu.DBCmd.UPDATE_VISIT, be, visit_info)
        quit_driver(driver)
        stop_xvfb(visit_info.vdisplay)
        remove_visit_files(visit_info)
    except (cm.TimeExceededError, sel_exceptions.TimeoutException) as texc:
        err_str = "Visit to %s(%s) timed out %s" % \
            (visit_info.url, visit_info.rank, texc)
        cm.print_error(visit_info, err_str)
        clean_up(visit_info, driver)
        return None
    except Exception as exc:
        # Top-level boundary for a single visit: log with traceback,
        # clean up and report failure.
        err_str = "Exception visiting %s(%s) %s %s" % \
            (visit_info.url, visit_info.rank, exc, traceback.format_exc())
        cm.print_error(visit_info, err_str)
        clean_up(visit_info, driver)
        return None
    else:
        return result_dict