def dump_profile_cookies(start_time, visit_id, webdriver,
                         browser_params, manager_params):
    """ Save changes to Firefox's cookies.sqlite to database

    We determine which cookies to save by the `start_time` timestamp.
    This timestamp should be taken prior to calling the `get` which
    creates these changes.

    Note that the extension's cookieInstrument is preferred to this approach,
    as this is likely to miss changes still present in the sqlite `wal` files.
    This will likely be removed in a future version.

    Parameters:
        start_time: timestamp passed to `get_cookies` to select rows.
        visit_id: stored in the `visit_id` column of every inserted row.
        webdriver: handed to `tab_restart_browser` before cookies are read.
        browser_params: mapping; `profile_path` and `crawl_id` are read.
        manager_params: mapping; `aggregator_address` is unpacked into the
            arguments of `sock.connect`.
    """
    # Set up a connection to DataAggregator
    tab_restart_browser(webdriver)  # kills traffic

    insert_sql = ("INSERT INTO profile_cookies (crawl_id, visit_id, "
                  "baseDomain, name, value, host, path, expiry, accessed, "
                  "creationTime, isSecure, isHttpOnly) "
                  "VALUES (?,?,?,?,?,?,?,?,?,?,?,?)")

    sock = clientsocket()
    sock.connect(*manager_params['aggregator_address'])
    try:
        # Cookies
        rows = get_cookies(browser_params['profile_path'], start_time)
        if rows is not None:
            for row in rows:
                # NOTE(review): `row` presumably carries the ten cookie
                # columns that follow crawl_id/visit_id - confirm against
                # get_cookies.
                query = (insert_sql,
                         (browser_params['crawl_id'], visit_id) + row)
                sock.send(query)
    finally:
        # Close connection to db, even when reading or sending fails, so
        # the connected socket is never left open.
        sock.close()
def dump_profile_cookies(start_time, visit_id, webdriver,
                         browser_params, manager_params):
    """ Save changes to Firefox's cookies.sqlite to database

    We determine which cookies to save by the `start_time` timestamp.
    This timestamp should be taken prior to calling the `get` which
    creates these changes.

    Note that the extension's cookieInstrument is preferred to this approach,
    as this is likely to miss changes still present in the sqlite `wal` files.
    This will likely be removed in a future version.

    Parameters:
        start_time: timestamp passed to `get_cookies` to select rows.
        visit_id: stored in the `visit_id` column of every inserted row.
        webdriver: handed to `tab_restart_browser` before cookies are read.
        browser_params: mapping; `profile_path` and `crawl_id` are read.
        manager_params: mapping; `aggregator_address` is unpacked into the
            arguments of `sock.connect`.
    """
    # Set up a connection to DataAggregator
    tab_restart_browser(webdriver)  # kills traffic so we can cleanly record data

    # The backslash continuations keep the indentation inside the literal;
    # SQL ignores the extra whitespace.
    insert_sql = "INSERT INTO profile_cookies (crawl_id, visit_id, baseDomain, name, value, \
            host, path, expiry, accessed, creationTime, isSecure, isHttpOnly) \
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?)"

    sock = clientsocket()
    sock.connect(*manager_params['aggregator_address'])
    try:
        # Cookies
        rows = get_cookies(browser_params['profile_path'], start_time)
        if rows is not None:
            for row in rows:
                # NOTE(review): `row` presumably carries the ten cookie
                # columns that follow crawl_id/visit_id - confirm against
                # get_cookies.
                query = (insert_sql,
                         (browser_params['crawl_id'], visit_id) + row)
                sock.send(query)
    finally:
        # Close connection to db, even when reading or sending fails, so
        # the connected socket is never left open.
        sock.close()
def dump_storage_vectors(top_url, start_time, webdriver, browser_params):
    """ Grab the newly changed items in supported storage vectors

    Flash cookies and profile cookies selected by `start_time` are sent
    to the data aggregator as parameterized INSERT queries.

    Parameters:
        top_url: stored in the `page_url` column of every inserted row.
        start_time: timestamp passed to `get_flash_cookies` and
            `get_cookies` to select items.
        webdriver: handed to `tab_restart_browser` before data is read.
        browser_params: mapping; `aggregator_address` (unpacked into
            `sock.connect`), `crawl_id` and `profile_path` are read.
    """
    # Set up a connection to DataAggregator
    tab_restart_browser(webdriver)  # kills traffic so we can cleanly record data

    # The backslash continuations keep the indentation inside the literals;
    # SQL ignores the extra whitespace.
    flash_sql = "INSERT INTO flash_cookies (crawl_id, page_url, domain, filename, local_path, \
            key, content) VALUES (?,?,?,?,?,?,?)"
    cookie_sql = "INSERT INTO profile_cookies (crawl_id, page_url, baseDomain, name, value, \
            host, path, expiry, accessed, creationTime, isSecure, isHttpOnly) \
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?)"

    sock = clientsocket()
    sock.connect(*browser_params['aggregator_address'])
    try:
        # Wait for SQLite Checkpointing - never happens when browser open

        # Flash cookies
        flash_cookies = get_flash_cookies(start_time)
        for cookie in flash_cookies:
            query = (flash_sql,
                     (browser_params['crawl_id'], top_url, cookie.domain,
                      cookie.filename, cookie.local_path, cookie.key,
                      cookie.content))
            sock.send(query)

        # Cookies
        rows = get_cookies(browser_params['profile_path'], start_time)
        if rows is not None:
            for row in rows:
                query = (cookie_sql,
                         (browser_params['crawl_id'], top_url) + row)
                sock.send(query)

        # localStorage - TODO this doesn't have a modified time support
        #rows = get_localStorage(profile_dir, start_time)
        #if rows is not None:
        #    for row in rows:
        #        query = ("INSERT INTO localStorage (crawl_id, page_url, scope, KEY, value) \
        #                  VALUES (?,?,?,?)",(crawl_id, top_url) + row)
        #        sock.send(query)
    finally:
        # Close connection to db, even when reading or sending fails, so
        # the connected socket is never left open.
        sock.close()
def dump_storage_vectors(top_url, start_time, webdriver,
                         browser_params, manager_params):
    """ Grab the newly changed items in supported storage vectors

    Flash cookies and profile cookies selected by `start_time` are sent
    to the data aggregator as parameterized INSERT queries.

    Parameters:
        top_url: stored in the `page_url` column of every inserted row.
        start_time: timestamp passed to `get_flash_cookies` and
            `get_cookies` to select items.
        webdriver: handed to `tab_restart_browser` before data is read.
        browser_params: mapping; `crawl_id` and `profile_path` are read.
        manager_params: mapping; `aggregator_address` is unpacked into the
            arguments of `sock.connect`.
    """
    # Set up a connection to DataAggregator
    tab_restart_browser(webdriver)  # kills traffic so we can cleanly record data

    # The backslash continuations keep the indentation inside the literals;
    # SQL ignores the extra whitespace.
    flash_sql = "INSERT INTO flash_cookies (crawl_id, page_url, domain, filename, local_path, \
            key, content) VALUES (?,?,?,?,?,?,?)"
    cookie_sql = "INSERT INTO profile_cookies (crawl_id, page_url, baseDomain, name, value, \
            host, path, expiry, accessed, creationTime, isSecure, isHttpOnly) \
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?)"

    sock = clientsocket()
    sock.connect(*manager_params['aggregator_address'])
    try:
        # Wait for SQLite Checkpointing - never happens when browser open

        # Flash cookies
        flash_cookies = get_flash_cookies(start_time)
        for cookie in flash_cookies:
            query = (flash_sql,
                     (browser_params['crawl_id'], top_url, cookie.domain,
                      cookie.filename, cookie.local_path, cookie.key,
                      cookie.content))
            sock.send(query)

        # Cookies
        rows = get_cookies(browser_params['profile_path'], start_time)
        if rows is not None:
            for row in rows:
                query = (cookie_sql,
                         (browser_params['crawl_id'], top_url) + row)
                sock.send(query)

        # localStorage - TODO this doesn't have a modified time support
        #rows = get_localStorage(profile_dir, start_time)
        #if rows is not None:
        #    for row in rows:
        #        query = ("INSERT INTO localStorage (crawl_id, page_url, scope, KEY, value) \
        #                  VALUES (?,?,?,?)",(crawl_id, top_url) + row)
        #        sock.send(query)
    finally:
        # Close connection to db, even when reading or sending fails, so
        # the connected socket is never left open.
        sock.close()
def dump_profile_cookies(top_url, start_time, webdriver,
                         browser_params, manager_params):
    """ Save changes to Firefox's cookies.sqlite to database

    We determine which cookies to save by the `start_time` timestamp.
    This timestamp should be taken prior to calling the `get` which
    creates these changes.

    Note that the extension's cookieInstrument is preferred to this approach,
    as this is likely to miss changes still present in the sqlite `wal` files.
    This will likely be removed in a future version.

    Parameters:
        top_url: stored in the `page_url` column of every inserted row.
        start_time: timestamp passed to `get_cookies` to select rows.
        webdriver: handed to `tab_restart_browser` before cookies are read.
        browser_params: mapping; `profile_path` and `crawl_id` are read.
        manager_params: mapping; `aggregator_address` is unpacked into the
            arguments of `sock.connect`.
    """
    # Set up a connection to DataAggregator
    tab_restart_browser(webdriver)  # kills traffic so we can cleanly record data

    # The backslash continuations keep the indentation inside the literal;
    # SQL ignores the extra whitespace.
    insert_sql = "INSERT INTO profile_cookies (crawl_id, page_url, baseDomain, name, value, \
            host, path, expiry, accessed, creationTime, isSecure, isHttpOnly) \
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?)"

    sock = clientsocket()
    sock.connect(*manager_params['aggregator_address'])
    try:
        # Cookies
        rows = get_cookies(browser_params['profile_path'], start_time)
        if rows is not None:
            for row in rows:
                # NOTE(review): `row` presumably carries the ten cookie
                # columns that follow crawl_id/page_url - confirm against
                # get_cookies.
                query = (insert_sql,
                         (browser_params['crawl_id'], top_url) + row)
                sock.send(query)
    finally:
        # Close connection to db, even when reading or sending fails, so
        # the connected socket is never left open.
        sock.close()


# NOTE: the helpers below are disabled (commented out) and kept only for
# reference.

# def get_checkout_price(webdriver,browser_params):
#     pass
    # add to cart
    # proceed to checkout
    # ship to this address
    # continue
    # continue
    # div#subtotals-marketplace-table

    #option 1
    # a href="https://www.amazon.com/ref=ox_spc_footer_homepage"
    # a href="/gp/cart/view.html/ref=nav_crt_ewc_hd"
    # input value="Delete"
    # a a-link-normal sc-product-link

# def get_price_list(webdriver,browser_params):
#     """
#     CSS Selectors:
#     Prices : span.a-size-large.a-color-price.olpOfferPrice-text-bold
#     Vendors : p.a-spacing-small.olpSellerName
#     Delivery: div.a-column.a-span3.olpDeliveryColumn
#     """
#     if len(webdriver.find_elements(By.CSS_SELECTOR,"li#olpTabNew")) > 0:
#         webdriver.find_element(By.CSS_SELECTOR,"li#olpTabNew").click()
#         time.sleep(2)
#     product_data = {'prices':[],'vendors':[],'condition':[],'delivery':[],'vendor_index':[],'shipping':[]}
#     count = 1
#     while not webdriver.find_elements(By.CSS_SELECTOR,'li.a-disabled.a-last'):
#         print 'clicking next on offers list...'
#         product_data['prices'] += [element.text for element in webdriver.find_elements(By.CSS_SELECTOR,"span.a-size-large.a-color-price.olpOfferPrice")]
#         product_data['condition'] += [element.text for element in webdriver.find_elements(By.CSS_SELECTOR,"span.a-size-medium.olpCondition.a-text-bold")]
#         product_data['delivery'] += [element.text.split('\n')[0] for element in webdriver.find_elements(By.CSS_SELECTOR,"div.a-column.a-span3.olpDeliveryColumn")]
#         product_data['shipping'] += [ element.text for element in webdriver.find_elements(By.CSS_SELECTOR,"p.olpShippingInfo")]
#         for element in webdriver.find_elements(By.CSS_SELECTOR,"h3.a-spacing-none.olpSellerName"):
#             product_data['vendor_index'].append(count)
#             count = count + 1
#             if element.text:
#                 product_data['vendors'].append(element.text)
#             else:
#                 product_data['vendors'].append(element.find_element_by_tag_name('img').get_attribute('alt'))
#         webdriver.find_element(By.CSS_SELECTOR,"li.a-last").click()
#         time.sleep(2)
#     #we are done with pagination. Capture last page
#     product_data['prices'] += [element.text for element in webdriver.find_elements(By.CSS_SELECTOR,"span.a-size-large.a-color-price.olpOfferPrice")]
#     product_data['condition'] += [element.text for element in webdriver.find_elements(By.CSS_SELECTOR,"span.a-size-medium.olpCondition.a-text-bold")]
#     product_data['delivery'] += [element.text.split('\n')[0] for element in webdriver.find_elements(By.CSS_SELECTOR,"div.a-column.a-span3.olpDeliveryColumn")]
#     product_data['vendors'] += [ element.text if element.text else element.find_element_by_tag_name('img').get_attribute('alt') \
#         for element in webdriver.find_elements(By.CSS_SELECTOR,"h3.a-spacing-none.olpSellerName")]
#     for element in webdriver.find_elements(By.CSS_SELECTOR,"h3.a-spacing-none.olpSellerName"):
#         product_data['vendor_index'].append(count)
#         count = count + 1
#         if element.text:
#             product_data['vendors'].append(element.text)
#         else:
#             product_data['vendors'].append(element.find_element_by_tag_name('img').get_attribute('alt'))
#     product_data['shipping'] += [element.text for element in webdriver.find_elements(By.CSS_SELECTOR,"p.olpShippingInfo")]
#     num_items = len(product_data['prices'])
#     pd = []
#     for i in xrange(num_items):
#         p = PriceRecord(product_data['vendor_index'][i],product_data['prices'][i],product_data['vendors'][i],product_data['condition'][i],product_data['delivery'][i],product_data['shipping'][i])
#         pd.append(p)
#     return pd