Exemple #1
0
def dump_profile_cookies(start_time, visit_id, webdriver, browser_params,
                         manager_params):
    """ Save changes to Firefox's cookies.sqlite to database

    We determine which cookies to save by the `start_time` timestamp.
    This timestamp should be taken prior to calling the `get` which
    creates these changes.

    Note that the extension's cookieInstrument is preferred to this approach,
    as this is likely to miss changes still present in the sqlite `wal` files.
    This will likely be removed in a future version.

    Parameters:
        start_time: timestamp handed to `get_cookies` to select the cookies
            to save (see above).
        visit_id: stored in the `visit_id` column of every inserted row.
        webdriver: passed to `tab_restart_browser` before anything is read.
        browser_params: mapping read for 'profile_path' and 'crawl_id'.
        manager_params: mapping read for 'aggregator_address', which is
            unpacked into the positional arguments of `sock.connect`.

    Returns None. An exception raised while connecting, reading the cookies
    or sending is propagated to the caller after the socket has been closed.
    """
    # The statement text is the same for every row, so build it once; the
    # per-row values travel next to it as a parameter tuple.
    insert_sql = ("INSERT INTO profile_cookies (crawl_id, visit_id, "
                  "baseDomain, name, value, host, path, expiry, accessed, "
                  "creationTime, isSecure, isHttpOnly) "
                  "VALUES (?,?,?,?,?,?,?,?,?,?,?,?)")

    tab_restart_browser(webdriver)  # kills traffic

    # Set up a connection to DataAggregator
    sock = clientsocket()
    try:
        sock.connect(*manager_params['aggregator_address'])

        # Cookies
        rows = get_cookies(browser_params['profile_path'], start_time)
        if rows is not None:
            for row in rows:
                # `row` has to be a tuple: it is concatenated onto the two
                # leading ids (presumably 10 values to match the 12
                # placeholders -- confirm against `get_cookies`).
                sock.send((insert_sql,
                           (browser_params['crawl_id'], visit_id) + row))
    finally:
        # Close connection to db, also when something above raised, so the
        # socket is never left open.
        sock.close()
Exemple #2
0
def dump_profile_cookies(start_time, visit_id, webdriver, browser_params, manager_params):
    """ Save changes to Firefox's cookies.sqlite to database

    We determine which cookies to save by the `start_time` timestamp.
    This timestamp should be taken prior to calling the `get` which
    creates these changes.

    Note that the extension's cookieInstrument is preferred to this approach,
    as this is likely to miss changes still present in the sqlite `wal` files.
    This will likely be removed in a future version.

    Parameters:
        start_time: timestamp handed to `get_cookies` to select the cookies
            to save (see above).
        visit_id: stored in the `visit_id` column of every inserted row.
        webdriver: passed to `tab_restart_browser` before anything is read.
        browser_params: mapping read for 'profile_path' and 'crawl_id'.
        manager_params: mapping read for 'aggregator_address', which is
            unpacked into the positional arguments of `sock.connect`.

    Returns None. An exception raised while connecting, reading the cookies
    or sending is propagated to the caller after the socket has been closed.
    """
    # Built once, since the statement text does not depend on the row.
    # The backslash continuations are inside the string literal, so the
    # leading spaces of the continued lines are part of the text that is
    # sent; they are kept exactly as they were.
    insert_sql = "INSERT INTO profile_cookies (crawl_id, visit_id, baseDomain, name, value, \
                      host, path, expiry, accessed, creationTime, isSecure, isHttpOnly) \
                      VALUES (?,?,?,?,?,?,?,?,?,?,?,?)"

    tab_restart_browser(webdriver)  # kills traffic so we can cleanly record data

    # Set up a connection to DataAggregator
    sock = clientsocket()
    try:
        sock.connect(*manager_params['aggregator_address'])

        # Cookies
        rows = get_cookies(browser_params['profile_path'], start_time)
        if rows is not None:
            for row in rows:
                # `row` has to be a tuple: it is concatenated onto the two
                # leading ids.
                params = (browser_params['crawl_id'], visit_id) + row
                sock.send((insert_sql, params))
    finally:
        # Close connection to db, also on the error path.
        sock.close()
Exemple #3
0
def dump_storage_vectors(top_url, start_time, webdriver, browser_params):
    """ Grab the newly changed items in supported storage vectors

    Flash cookies and profile cookies selected by `start_time` are sent,
    one INSERT per item, over a socket to the DataAggregator.

    Parameters:
        top_url: stored in the `page_url` column of every inserted row.
        start_time: handed to both `get_flash_cookies` and `get_cookies`.
        webdriver: passed to `tab_restart_browser` before anything is read.
        browser_params: mapping read for 'aggregator_address' (unpacked
            into the positional arguments of `sock.connect`), 'crawl_id'
            and 'profile_path'.

    Returns None. An exception raised while connecting, collecting or
    sending is propagated to the caller after the socket has been closed.
    """
    # Statement texts are loop-invariant, so they are built once up front.
    # The backslash continuations are inside the string literals: the leading
    # spaces of the continued lines belong to the text that is sent and are
    # kept exactly as they were.
    flash_sql = "INSERT INTO flash_cookies (crawl_id, page_url, domain, filename, local_path, \
                  key, content) VALUES (?,?,?,?,?,?,?)"
    cookie_sql = "INSERT INTO profile_cookies (crawl_id, page_url, baseDomain, name, value, \
                      host, path, expiry, accessed, creationTime, isSecure, isHttpOnly) \
                      VALUES (?,?,?,?,?,?,?,?,?,?,?,?)"

    tab_restart_browser(
        webdriver)  # kills traffic so we can cleanly record data

    # Set up a connection to DataAggregator
    sock = clientsocket()
    try:
        sock.connect(*browser_params['aggregator_address'])

        # Wait for SQLite Checkpointing - never happens when browser open

        # Flash cookies
        for cookie in get_flash_cookies(start_time):
            sock.send((flash_sql,
                       (browser_params['crawl_id'], top_url, cookie.domain,
                        cookie.filename, cookie.local_path, cookie.key,
                        cookie.content)))

        # Cookies
        rows = get_cookies(browser_params['profile_path'], start_time)
        if rows is not None:
            for row in rows:
                # `row` has to be a tuple: it is concatenated onto the two
                # leading values.
                sock.send((cookie_sql,
                           (browser_params['crawl_id'], top_url) + row))

        # localStorage - TODO this doesn't have a modified time support
        #rows = get_localStorage(profile_dir, start_time)
        #if rows is not None:
        #    for row in rows:
        #        query = ("INSERT INTO localStorage (crawl_id, page_url, scope, KEY, value) \
        #                  VALUES (?,?,?,?)",(crawl_id, top_url) + row)
        #        sock.send(query)
    finally:
        # Close connection to db, also when something above raised, so the
        # socket is never left open.
        sock.close()
def dump_storage_vectors(top_url, start_time, webdriver, browser_params, manager_params):
    """ Grab the newly changed items in supported storage vectors

    Flash cookies and profile cookies selected by `start_time` are sent,
    one INSERT per item, over a socket to the DataAggregator.

    Parameters:
        top_url: stored in the `page_url` column of every inserted row.
        start_time: handed to both `get_flash_cookies` and `get_cookies`.
        webdriver: passed to `tab_restart_browser` before anything is read.
        browser_params: mapping read for 'crawl_id' and 'profile_path'.
        manager_params: mapping read for 'aggregator_address', which is
            unpacked into the positional arguments of `sock.connect`.

    Returns None. An exception raised while connecting, collecting or
    sending is propagated to the caller after the socket has been closed.
    """
    # Statement texts are loop-invariant, so they are built once up front.
    # The backslash continuations are inside the string literals: the leading
    # spaces of the continued lines belong to the text that is sent and are
    # kept exactly as they were.
    flash_sql = "INSERT INTO flash_cookies (crawl_id, page_url, domain, filename, local_path, \
                  key, content) VALUES (?,?,?,?,?,?,?)"
    cookie_sql = "INSERT INTO profile_cookies (crawl_id, page_url, baseDomain, name, value, \
                      host, path, expiry, accessed, creationTime, isSecure, isHttpOnly) \
                      VALUES (?,?,?,?,?,?,?,?,?,?,?,?)"

    tab_restart_browser(webdriver)  # kills traffic so we can cleanly record data

    # Set up a connection to DataAggregator
    sock = clientsocket()
    try:
        sock.connect(*manager_params['aggregator_address'])

        # Wait for SQLite Checkpointing - never happens when browser open

        # Flash cookies
        flash_cookies = get_flash_cookies(start_time)
        for cookie in flash_cookies:
            flash_params = (browser_params['crawl_id'], top_url, cookie.domain,
                            cookie.filename, cookie.local_path,
                            cookie.key, cookie.content)
            sock.send((flash_sql, flash_params))

        # Cookies
        rows = get_cookies(browser_params['profile_path'], start_time)
        if rows is not None:
            for row in rows:
                # `row` has to be a tuple: it is concatenated onto the two
                # leading values.
                cookie_params = (browser_params['crawl_id'], top_url) + row
                sock.send((cookie_sql, cookie_params))

        # localStorage - TODO this doesn't have a modified time support
        #rows = get_localStorage(profile_dir, start_time)
        #if rows is not None:
        #    for row in rows:
        #        query = ("INSERT INTO localStorage (crawl_id, page_url, scope, KEY, value) \
        #                  VALUES (?,?,?,?)",(crawl_id, top_url) + row)
        #        sock.send(query)
    finally:
        # Close connection to db, also on the error path.
        sock.close()
Exemple #5
0
def dump_profile_cookies(top_url, start_time, webdriver, browser_params, manager_params):
    """ Save changes to Firefox's cookies.sqlite to database

    We determine which cookies to save by the `start_time` timestamp.
    This timestamp should be taken prior to calling the `get` which
    creates these changes.

    Note that the extension's cookieInstrument is preferred to this approach,
    as this is likely to miss changes still present in the sqlite `wal` files.
    This will likely be removed in a future version.

    Parameters:
        top_url: stored in the `page_url` column of every inserted row.
        start_time: timestamp handed to `get_cookies` to select the cookies
            to save (see above).
        webdriver: passed to `tab_restart_browser` before anything is read.
        browser_params: mapping read for 'profile_path' and 'crawl_id'.
        manager_params: mapping read for 'aggregator_address', which is
            unpacked into the positional arguments of `sock.connect`.

    Returns None. An exception raised while connecting, reading the cookies
    or sending is propagated to the caller after the socket has been closed.
    """
    # Built once, since the statement text does not depend on the row.
    # The backslash continuations are inside the string literal, so the
    # leading spaces of the continued lines are part of the text that is
    # sent; they are kept exactly as they were.
    insert_sql = "INSERT INTO profile_cookies (crawl_id, page_url, baseDomain, name, value, \
                      host, path, expiry, accessed, creationTime, isSecure, isHttpOnly) \
                      VALUES (?,?,?,?,?,?,?,?,?,?,?,?)"

    tab_restart_browser(webdriver)  # kills traffic so we can cleanly record data

    # Set up a connection to DataAggregator
    sock = clientsocket()
    try:
        sock.connect(*manager_params['aggregator_address'])

        # Cookies
        rows = get_cookies(browser_params['profile_path'], start_time)
        if rows is not None:
            for row in rows:
                # `row` has to be a tuple: it is concatenated onto the two
                # leading values.
                sock.send((insert_sql,
                           (browser_params['crawl_id'], top_url) + row))
    finally:
        # Close connection to db, also when something above raised, so the
        # socket is never left open.
        sock.close()

# def get_checkout_price(webdriver,browser_params):
    # pass
    # add to cart
    # proceed to checkout
    # ship to this address
    # continue
    # continue
    # div#subtotals-marketplace-table

    #option 1
    # a href="https://www.amazon.com/ref=ox_spc_footer_homepage"
    # a href="/gp/cart/view.html/ref=nav_crt_ewc_hd"
    # input value="Delete"
    # a a-link-normal sc-product-link

# def get_price_list(webdriver,browser_params):
#     """
#     CSS Selectors:
#     Prices  : span.a-size-large.a-color-price.olpOfferPrice-text-bold 
#     Vendors : p.a-spacing-small.olpSellerName
#     Delivery: div.a-column.a-span3.olpDeliveryColumn 
#     """
#     if  len(webdriver.find_elements(By.CSS_SELECTOR,"li#olpTabNew")) > 0:
#         webdriver.find_element(By.CSS_SELECTOR,"li#olpTabNew").click()
#         time.sleep(2)

#     product_data = {'prices':[],'vendors':[],'condition':[],'delivery':[],'vendor_index':[],'shipping':[]}
#     count = 1
#     while not webdriver.find_elements(By.CSS_SELECTOR,'li.a-disabled.a-last'):
#         print 'clicking next on offers list...'
#         product_data['prices'] +=  [element.text for element in webdriver.find_elements(By.CSS_SELECTOR,"span.a-size-large.a-color-price.olpOfferPrice")]
#         product_data['condition'] +=  [element.text for element in webdriver.find_elements(By.CSS_SELECTOR,"span.a-size-medium.olpCondition.a-text-bold")]
#         product_data['delivery'] +=  [element.text.split('\n')[0] for element in webdriver.find_elements(By.CSS_SELECTOR,"div.a-column.a-span3.olpDeliveryColumn")]
#         product_data['shipping'] += [ element.text for element in webdriver.find_elements(By.CSS_SELECTOR,"p.olpShippingInfo")]

#         for element in webdriver.find_elements(By.CSS_SELECTOR,"h3.a-spacing-none.olpSellerName"):
#             product_data['vendor_index'].append(count)
#             count = count + 1
#             if element.text:
#                 product_data['vendors'].append(element.text)
#             else:
#                 product_data['vendors'].append(element.find_element_by_tag_name('img').get_attribute('alt'))
#         webdriver.find_element(By.CSS_SELECTOR,"li.a-last").click()
#         time.sleep(2)

#     #we are done with pagination. Capture last page
#     product_data['prices']     +=  [element.text for element in webdriver.find_elements(By.CSS_SELECTOR,"span.a-size-large.a-color-price.olpOfferPrice")]
#     product_data['condition']  +=  [element.text for element in webdriver.find_elements(By.CSS_SELECTOR,"span.a-size-medium.olpCondition.a-text-bold")]
#     product_data['delivery']   +=  [element.text.split('\n')[0] for element in webdriver.find_elements(By.CSS_SELECTOR,"div.a-column.a-span3.olpDeliveryColumn")]
#     product_data['vendors']    +=  [ element.text if element.text else element.find_element_by_tag_name('img').get_attribute('alt') \
#                                         for element in webdriver.find_elements(By.CSS_SELECTOR,"h3.a-spacing-none.olpSellerName")]
#     for element in webdriver.find_elements(By.CSS_SELECTOR,"h3.a-spacing-none.olpSellerName"):
#         product_data['vendor_index'].append(count)
#         count = count + 1
#         if element.text:
#             product_data['vendors'].append(element.text)
#         else:
#             product_data['vendors'].append(element.find_element_by_tag_name('img').get_attribute('alt'))
#     product_data['shipping'] += [element.text for element in webdriver.find_elements(By.CSS_SELECTOR,"p.olpShippingInfo")]

#     num_items = len(product_data['prices'])
    
#     pd = []
#     for i in xrange(num_items):
#         p = PriceRecord(product_data['vendor_index'][i],product_data['prices'][i],product_data['vendors'][i],product_data['condition'][i],product_data['delivery'][i],product_data['shipping'][i])
#         pd.append(p)
#     return pd