def _get_data_external_links(scripts, driver=None): """ @param scripts: a list of HTML internal scripts and exernal script links (src) @returns: an ordered list containing inline scripts and the contents of the REACHABLE external script links """ data = [] if driver is None: # use python requests for item in scripts: script_type = item[0] if script_type == "external_script": link = item[1] d = RequesterModule.requester(link) if RequesterModule.is_http_response_valid(d): d_str = str(d).strip() if (not d_str.startswith("""<!doctype html>""")) and ('doctype html' not in d_str): #ignore the case when resource is HTML, e.g, non-authenticated access via python requests data.append([script_type, d]) else: ## no valid content if constantsModule.DEBUG_PRINTS: print("+ InvalidResourceURL encountered!") continue else: data.append(item) return data else: # use browser for item in scripts: script_type = item[0] if script_type == "external_script": link = item[1] current_handle = driver.current_window_handle driver.execute_script("""window.open('', '_blank')""") # new tab time.sleep(1) driver.switch_to_window(driver.window_handles[1]) driver.get(link) time.sleep(1) d = driver.page_source driver.close() # closes the new tab driver.switch_to_window(current_handle) dp = BeautifulSoup(d, 'html.parser') d_str = dp.find('pre', recursive=True) # js is rendered in a pre tag in chrome if d_str is None: continue else: d_str = d_str.text # get the 'pre' tag content if (not d_str.startswith("""<!doctype html>""")): #ignore the case when resource is HTML, e.g, non-authenticated access via python requests data.append([script_type, d_str]) else: ## no valid content if constantsModule.DEBUG_PRINTS: print("+ InvalidResourceURL encountered!") continue else: data.append(item) return data
def get_external_resource(resource_url):
    """
    Fetch an external resource over HTTP.

    @param {string} resource_url
    @return {string} http response if valid, o/w empty string
    """
    fetched = RequesterModule.requester(resource_url)
    if not RequesterModule.is_http_response_valid(fetched):
        return ''
    return fetched
def main_data_collection():
    """
    Entry point: crawl each site in an inclusive range of site ids and
    collect its JS and page data.

    Command line: <low_site_id> [<high_site_id>]
    When only one argument is given, exactly that site is processed.
    """
    args = sys.argv
    if len(args) > 1:
        low = int(args[1])
        high = low
        if len(args) > 2:
            high = int(args[2])
        for i in range(low, high + 1):
            # fix: use the loop variable so every site in [low, high] is
            # visited; the original re-read args[1] and always crawled the
            # low-bound site
            site_id = str(i)
            # 1. get saved URLs or find URLs if needed
            urls = get_site_urls(site_id)
            # 2. collect js and data of the site, for each URL found
            if CrawlerConfig.PLATFORM == "linux":
                # headless X display for linux environments
                display = Display(visible=0, size=(800, 600))
                display.start()
            driver = seleniumModule.get_new_browser(xhr_logger=True,
                                                    event_logger=True,
                                                    headless_mode=False)
            ## load predefined states into the browser (e.g., login)
            driver = CrawlerModule.get_logged_driver(driver, site_id)
            for navigation_url in urls:
                d = RequesterModule.requester(navigation_url)
                ## check if the site base address is reachable
                if RequesterModule.is_http_response_valid(d):
                    try:
                        crawlerUtilityModule.collect_site_data(
                            site_id, navigation_url, driver)
                    # fix: Exception instead of BaseException, so that
                    # KeyboardInterrupt / SystemExit still terminate the crawl
                    except Exception:
                        print('chrome runinto error for site: %s' % site_id)
                        # restart the browser after a crash and carry on
                        driver = seleniumModule.get_new_browser(
                            xhr_logger=True, event_logger=True,
                            headless_mode=False)
                        continue
                else:
                    continue
            if CrawlerConfig.PLATFORM == "linux":
                display.stop()
def collect_site_data(site_id, url, driver, out_path=CrawlerConfig.OUTPUT_DATA_DIRECTORY):
    """
    Collect and persist all crawl artifacts for a single page: rendered and
    unrendered HTML, JS program and individual scripts, library files,
    document properties, captured XHR request logs, cookies and fired events.

    @param {string} site_id
    @param {string} url
    @param {object} driver: selenium driver handle
    @param {string} out_path
    @return {bool} whether or not the operation has succeeded
    """
    # possible return values
    ERR_INVALID_URL = False
    SUCCESS = True

    # created output file names
    NAME_HTML_UNRENDERED = "html_initial.html"
    NAME_HTML_RENDERED = "html_rendered.html"
    NAME_JS_PROGRAM = "js_program.js"
    NAME_URL_FILE = "navigation_url.out"
    NAME_DOCUMENT_PROPS = 'document_props.out'
    NAME_DOCUMENT_PROPS_MACHINE = 'document_props_short.out'
    NAME_LIBRARIES_FOLDER = "libraries"
    NAME_XHR_LOGS = "request_logs.out"
    NAME_XHR_LOGS_MACHINE = "request_logs_short.out"
    NAME_COOKIE_FILE = "cookies.pkl"
    NAME_COOKIE_FILE_STR = "cookies_str.out"
    NAME_FIRED_EVENTS = "events.out"
    NAME_FIRED_EVENTS_PICKLE = "events_pickle.pkl"

    # prepare save path directories: one folder per page, named by the hash
    # of its URL, under the site's output folder
    output_folder_of_this_site = os.path.join(out_path, str(site_id))
    folder_name_of_this_page = _hash(url)
    output_folder_path_name_of_this_page = os.path.join(
        output_folder_of_this_site, folder_name_of_this_page)
    if not os.path.exists(output_folder_path_name_of_this_page):
        os.makedirs(output_folder_path_name_of_this_page)

    # save the navigation url
    path_name_navigation_url = os.path.join(
        output_folder_path_name_of_this_page, NAME_URL_FILE)
    with open(path_name_navigation_url, "wb") as fp:
        fp.write(url.encode('utf-8'))

    # step 2: capture the rendered HTML page and JS
    dynamic_data = DOMCollectorModule.get_dynamic_data(site_id, url, driver,
                                                       close_conn=False)
    if dynamic_data is None:
        return ERR_INVALID_URL

    time.sleep(1)
    html_content = dynamic_data[0]
    soup_content = dynamic_data[1]
    js_of_page = DOMCollectorModule.combine_js_scripts(dynamic_data)
    inline_js_of_html = DOMCollectorModule.process_inline_dom_javascript(
        html_content, soup_content)

    # capture xhr requests via extension for later use
    xhr_logs = seleniumModule.get_xhr_logger_extension_data(driver)

    # cookies
    cookies = driver.get_cookies()

    # DOM level 3 spec: first inline HTML events are fired, then others
    path_name_js_program = os.path.join(output_folder_path_name_of_this_page,
                                        NAME_JS_PROGRAM)
    with open(path_name_js_program, "wb") as fp:
        fp.write(inline_js_of_html.encode('utf-8'))
        fp.write(b'\n')
        fp.write(js_of_page.encode('utf-8'))
    _beautify_js(path_name_js_program)

    path_name_html_rendered = os.path.join(
        output_folder_path_name_of_this_page, NAME_HTML_RENDERED)
    with open(path_name_html_rendered, "wb") as fp:
        fp.write(html_content.encode('utf-8'))
        fp.write(b'\n')

    # store individual script files
    scripts_folder = os.path.join(output_folder_path_name_of_this_page,
                                  "scripts")
    if not os.path.exists(scripts_folder):
        os.makedirs(scripts_folder)

    script_files = dynamic_data[2]
    script_files_counter = 0
    mappings = {}
    writeMapping = False
    for item in script_files:
        script_files_counter += 1
        script_content = item[1]
        if len(script_content.strip()) == 0:
            continue
        if item[0] == 'internal_script':
            # remove HTML comment obfuscation at the start and end of inline
            # script tags, i.e. the exact <!-- and --> markers.
            # fix: the original used lstrip('<!--')/rstrip('-->'), which strip
            # *character sets* {'<','!','-'} / {'-','>'} and can eat
            # legitimate leading/trailing characters of the script body.
            script_content = script_content.strip()
            if script_content.startswith('<!--'):
                script_content = script_content[len('<!--'):]
            if script_content.endswith('-->'):
                script_content = script_content[:-len('-->')]
        else:
            link = item[2]
            mappings[script_files_counter] = link
            writeMapping = True
        script_save_file_name = os.path.join(scripts_folder,
                                             str(script_files_counter) + '.js')
        with open(script_save_file_name, "w+") as fd:
            fd.write(script_content)
        _beautify_js(script_save_file_name)
    if writeMapping:
        # map each external script file number to its source URL
        with open(os.path.join(scripts_folder, "mappings.json"), 'w+',
                  encoding='utf-8') as fd:
            json.dump(mappings, fd, ensure_ascii=False, indent=4)

    # step 3: save library files
    lib_links_dictionary = dynamic_data[3]
    library_output_folder_of_this_site = os.path.join(
        output_folder_path_name_of_this_page, NAME_LIBRARIES_FOLDER)
    _save_program_libraries(library_output_folder_of_this_site,
                            lib_links_dictionary)

    # create timestamp for reports
    timestamp = get_current_timestamp()
    sep = get_output_header_sep()
    sep_templates = get_output_subheader_sep()

    # step 4: save document and form variables (accessible through
    # document.form_name.input_name)
    document_form_variables = HTMLParserModule.get_document_properties_from_html(
        soup_content)
    path_name_document_props = os.path.join(
        output_folder_path_name_of_this_page, NAME_DOCUMENT_PROPS)
    with open(path_name_document_props, 'w+') as fd:
        fd.write(sep)
        fd.write('[timestamp] generated on %s\n' % timestamp)
        fd.write(
            '[description] defined properties in HTML for \'document\' DOM API\n'
        )
        fd.write(sep + '\n\n')
        for counter, elm in enumerate(document_form_variables, start=1):
            fd.write("(%s): %s\n" % (counter, elm))

    path_name_document_props_machine = os.path.join(
        output_folder_path_name_of_this_page, NAME_DOCUMENT_PROPS_MACHINE)
    with open(path_name_document_props_machine, 'w+') as fd:
        fd.write(str(document_form_variables))

    # step 5: save captured onload requests via extension
    without_data_reqs = xhr_logs['without_data']  # no formData
    with_data_reqs = xhr_logs['with_data']  # also contains formData
    succ_reqs = xhr_logs['succ']  # all successfully accepted requests with 2xx

    path_name_xhr_logs_machine = os.path.join(
        output_folder_path_name_of_this_page, NAME_XHR_LOGS_MACHINE)
    with open(path_name_xhr_logs_machine, "w+") as fp:
        fp.write(str(xhr_logs))

    # save also a nicer human readable version; entries arrive either as
    # dicts or as JSON strings, so normalize each one before writing
    path_name_xhr_logs = os.path.join(output_folder_path_name_of_this_page,
                                      NAME_XHR_LOGS)
    with open(path_name_xhr_logs, "w+") as fp:
        for each_request in without_data_reqs:
            # fix: bare except replaced with Exception (best-effort logging
            # of malformed entries is kept, but Ctrl-C is not swallowed)
            try:
                req = (each_request if isinstance(each_request, dict)
                       else json.loads(each_request))
                xhr_url = _unquote_url(req['url'])
                xhr_status = _check_if_req_is_successful(
                    req['requestId'], succ_reqs)
                fp.write("Navigation_URL: '%s'\n" % (url))
                fp.write("Request_URL: '%s'\n" % (xhr_url))
                fp.write("Request_Accepted: '%s'\n" % (str(xhr_status[0])))
                fp.write("Response_HTTP_Status: '%s'\n" % (str(xhr_status[1])))
                fp.write(sep_templates)
            except Exception:
                continue
        for each_request in with_data_reqs:
            try:
                req = (each_request if isinstance(each_request, dict)
                       else json.loads(each_request))
                xhr_url = _unquote_url(req['url'])
                form_data_str = str(req['requestBody'])
                fp.write("Navigation_URL: '%s'\n" % (url))
                fp.write("Request_URL: '%s'\n" % (xhr_url))
                fp.write("Form_Data: \n%s\n" % (form_data_str))
                xhr_status = _check_if_req_is_successful(
                    req['requestId'], succ_reqs)
                # fix: the original dict branch wrote these two lines without
                # quotes while every other entry quotes the value; normalized
                # to the quoted form
                fp.write("Request_Accepted: '%s'\n" % (str(xhr_status[0])))
                fp.write("Response_HTTP_Status: '%s'\n" % (str(xhr_status[1])))
                fp.write(sep_templates)
            except Exception:
                continue

    # step 6: save cookies
    # @Thanks to: https://stackoverflow.com/questions/15058462/how-to-save-and-load-cookies-using-python-selenium-webdriver
    path_name_cookie_logs = os.path.join(output_folder_path_name_of_this_page,
                                         NAME_COOKIE_FILE)
    path_name_cookie_logs_str = os.path.join(
        output_folder_path_name_of_this_page, NAME_COOKIE_FILE_STR)
    with open(path_name_cookie_logs, "wb") as fp:
        pickle.dump(cookies, fp)
    with open(path_name_cookie_logs_str, "w+") as fd:
        fd.write(str(cookies))

    # step 7: save events
    logs = seleniumModule.get_chrome_console_logs(driver)
    with open(
            os.path.join(output_folder_path_name_of_this_page,
                         NAME_FIRED_EVENTS_PICKLE), 'wb') as fd:
        pickle.dump(logs, fd)
    with open(
            os.path.join(output_folder_path_name_of_this_page,
                         NAME_FIRED_EVENTS), 'w+') as fd:
        for log in logs:
            # only event messages emitted by the logger extension
            if log['level'] == 'INFO' and log['message'].startswith(
                    'chrome-extension://'):
                fd.write(str(log['message']) + '\n')

    # fetch the initial (unrendered) HTML; fall back to the browser's
    # view-source when python requests cannot reach the page
    d = RequesterModule.requester(url)
    if RequesterModule.is_http_response_valid(d):
        unrendered_html_page = str(d).strip()
    else:
        driver.get("view-source:" + str(url))
        unrendered_html_page = driver.page_source

    # save the initial html
    path_name_html_unrendered = os.path.join(
        output_folder_path_name_of_this_page, NAME_HTML_UNRENDERED)
    with open(path_name_html_unrendered, "wb") as fp:
        fp.write(unrendered_html_page.encode('utf-8'))
        fp.write(b'\n')

    return SUCCESS