Example #1
def _get_data_external_links(scripts, driver=None):
	"""
	@param scripts: a list of HTML internal scripts and external script links (src)
	@returns: an ordered list containing inline scripts and 
			  the contents of the REACHABLE external script links
	"""
	data = []
	if driver is None:
		# use python requests
		for item in scripts:
			script_type = item[0]
			if script_type == "external_script":
				link = item[1]
				d = RequesterModule.requester(link)
				if RequesterModule.is_http_response_valid(d):
					d_str = str(d).strip()
					if (not d_str.startswith("""<!doctype html>""")) and ('doctype html' not in d_str): # ignore the case where the resource is HTML, e.g., non-authenticated access via python requests
						data.append([script_type, d])
				else:
					## no valid content
					if constantsModule.DEBUG_PRINTS:
						print("+ InvalidResourceURL encountered!")
					continue
			else:
				data.append(item)
		return data
	else:
		# use browser
		for item in scripts:
			script_type = item[0]
			if script_type == "external_script":
				link = item[1]
				current_handle = driver.current_window_handle
				driver.execute_script("""window.open('', '_blank')""") # new tab
				time.sleep(1)
				driver.switch_to.window(driver.window_handles[-1]) # focus the newly opened tab (switch_to_window was removed in Selenium 4)
				driver.get(link)
				time.sleep(1)
				d = driver.page_source
				driver.close() # closes the new tab
				driver.switch_to.window(current_handle)

				dp = BeautifulSoup(d, 'html.parser')
				d_str = dp.find('pre', recursive=True) # Chrome renders a raw JS resource inside a <pre> tag
				if d_str is None: 
					continue
				else:
					d_str = d_str.text # get the 'pre' tag content

				if not d_str.startswith("""<!doctype html>"""): # ignore the case where the resource is HTML, e.g., non-authenticated access via the browser
					data.append([script_type, d_str])
				else:
					## no valid content
					if constantsModule.DEBUG_PRINTS:
						print("+ InvalidResourceURL encountered!")
					continue
			else:
				data.append(item)
		return data
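
A minimal usage sketch for the function above; the [type, payload] shape of the scripts list and the fetching behavior are assumptions read off the surrounding code, not part of this example:

# Hypothetical usage sketch: inline scripts carry their code, external
# scripts carry their src URL, mirroring the shapes consumed above.
scripts = [
    ["internal_script", "console.log('inline');"],
    ["external_script", "https://example.com/static/app.js"],
]
# Without a driver, external links are fetched with python requests;
# pass a selenium driver instead to resolve them through the browser.
resolved = _get_data_external_links(scripts)
for entry in resolved:
    print(entry[0], len(entry[1]))
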
Example #2
def get_external_resource(resource_url):
    """
	@param {string} resource_url
	@return {string} http response if valid, otherwise an empty string
	"""

    response = RequesterModule.requester(resource_url)
    if RequesterModule.is_http_response_valid(response):
        return response
    return ''
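
RequesterModule itself is project-internal and not shown here; below is a minimal sketch of the two calls used above, assuming they are thin wrappers around the requests library (the real module may differ):

# Hypothetical sketch of the RequesterModule interface, not the project's code.
import requests

def requester(url, timeout=15):
    # return the response body, or None if the request fails outright
    try:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException:
        return None

def is_http_response_valid(response):
    # treat any non-empty body as a valid response
    return response is not None and len(str(response).strip()) > 0
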
Example #3
def main_data_collection():

    args = sys.argv

    if len(args) > 1:
        low = int(args[1])
        high = low
        if len(args) > 2:
            high = int(args[2])

        for i in range(low, high + 1):

            site_id = str(i)  # use the loop variable; args[1] would repeat the first site id on every iteration
            # 1. get saved URLs or find URLs if needed
            urls = get_site_urls(site_id)

            # 2. collect js and data of the site, for each URL found
            if CrawlerConfig.PLATFORM == "linux":
                display = Display(visible=0, size=(800, 600))
                display.start()

            driver = seleniumModule.get_new_browser(xhr_logger=True,
                                                    event_logger=True,
                                                    headless_mode=False)

            ## load predefined states into the browser (e.g., login)
            driver = CrawlerModule.get_logged_driver(driver, site_id)

            for navigation_url in urls:
                # crawlerUtilityModule.collect_site_data(site_id, navigation_url, driver)

                d = RequesterModule.requester(navigation_url)
                ## check if the site base address is reachable
                if RequesterModule.is_http_response_valid(d):
                    try:
                        crawlerUtilityModule.collect_site_data(
                            site_id, navigation_url, driver)
                    except BaseException as error:
                        print('chrome ran into an error for site %s: %s' % (site_id, error))
                        driver = seleniumModule.get_new_browser(
                            xhr_logger=True,
                            event_logger=True,
                            headless_mode=False)
                        continue
                else:
                    continue

            if CrawlerConfig.PLATFORM == "linux":
                display.stop()
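
main_data_collection reads an inclusive range of site ids from sys.argv. A sketch of the entry point (the script name run_crawler.py is hypothetical):

# Hypothetical entry point; invocation examples:
#   python3 run_crawler.py 5       -> crawl site id 5 only
#   python3 run_crawler.py 5 10    -> crawl site ids 5 through 10, inclusive
if __name__ == "__main__":
    main_data_collection()
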
Example #4
def collect_site_data(site_id,
                      url,
                      driver,
                      out_path=CrawlerConfig.OUTPUT_DATA_DIRECTORY):
    """
	@param {string} site_id
	@param {string} url
	@param {object} driver: selenium driver handle
	@param {string} out_path
	@return {bool} whether or not the operation has succeeded
	"""

    # possible return values
    ERR_INVALID_URL = False
    SUCCESS = True

    # created output file names
    NAME_HTML_UNRENDERED = "html_initial.html"
    NAME_HTML_RENDERED = "html_rendered.html"
    NAME_JS_PROGRAM = "js_program.js"
    NAME_URL_FILE = "navigation_url.out"

    NAME_DOCUMENT_PROPS = 'document_props.out'
    NAME_DOCUMENT_PROPS_MACHINE = 'document_props_short.out'

    NAME_LIBRARIES_FOLDER = "libraries"

    NAME_XHR_LOGS = "request_logs.out"
    NAME_XHR_LOGS_MACHINE = "request_logs_short.out"

    NAME_COOKIE_FILE = "cookies.pkl"
    NAME_COOKIE_FILE_STR = "cookies_str.out"

    NAME_FIRED_EVENTS = "events.out"
    NAME_FIRED_EVENTS_PICKLE = "events_pickle.pkl"

    # prepare save path directories
    # site_map_name = sitesmapModule.get_site_data(site_id)[0]
    output_folder_of_this_site = os.path.join(out_path, str(site_id))
    folder_name_of_this_page = _hash(url)
    output_folder_path_name_of_this_page = os.path.join(
        output_folder_of_this_site, folder_name_of_this_page)

    if not os.path.exists(output_folder_path_name_of_this_page):
        os.makedirs(output_folder_path_name_of_this_page)

    # save the navigation url
    path_name_navigation_url = os.path.join(
        output_folder_path_name_of_this_page, NAME_URL_FILE)
    with open(path_name_navigation_url, "wb") as fp:
        fp.write(url.encode('utf-8'))

    # step 2: capture the rendered HTML page and JS
    dynamic_data = DOMCollectorModule.get_dynamic_data(site_id,
                                                       url,
                                                       driver,
                                                       close_conn=False)
    if dynamic_data is None:
        return ERR_INVALID_URL

    time.sleep(1)

    html_content = dynamic_data[0]
    soup_content = dynamic_data[1]
    js_of_page = DOMCollectorModule.combine_js_scripts(dynamic_data)
    inline_js_of_html = DOMCollectorModule.process_inline_dom_javascript(
        html_content, soup_content)

    # capture xhr requests via the extension for later use
    xhr_logs = seleniumModule.get_xhr_logger_extension_data(driver)

    # cookies
    cookies = driver.get_cookies()

    # DOM level 3 spec: first inline HTML events are fired, then others
    path_name_js_program = os.path.join(output_folder_path_name_of_this_page,
                                        NAME_JS_PROGRAM)
    with open(path_name_js_program, "wb") as fp:
        fp.write(inline_js_of_html.encode('utf-8'))
        fp.write(b'\n')
        fp.write(js_of_page.encode('utf-8'))
    _beautify_js(path_name_js_program)

    path_name_html_rendered = os.path.join(
        output_folder_path_name_of_this_page, NAME_HTML_RENDERED)
    with open(path_name_html_rendered, "wb") as fp:
        fp.write(html_content.encode('utf-8'))
        fp.write(b'\n')

    # store individual script files
    scripts_folder = os.path.join(output_folder_path_name_of_this_page,
                                  "scripts")
    if not os.path.exists(scripts_folder):
        os.makedirs(scripts_folder)

    script_files = dynamic_data[2]
    script_files_counter = 0

    mappings = {}
    writeMapping = False
    for item in script_files:
        script_files_counter += 1

        script_content = item[1]
        if len(script_content.strip()) == 0:
            continue

        if item[0] == 'internal_script':
            # remove HTML-comment obfuscation markers (<!-- and -->) around
            # inline scripts; note str.lstrip/rstrip would strip character
            # sets rather than the literal markers
            script_content = script_content.strip()
            if script_content.startswith('<!--'):
                script_content = script_content[len('<!--'):]
            if script_content.endswith('-->'):
                script_content = script_content[:-len('-->')]

        else:
            link = item[2]
            mappings[script_files_counter] = link
            writeMapping = True

        script_save_file_name = os.path.join(scripts_folder,
                                             str(script_files_counter) + '.js')
        with open(script_save_file_name, "w+") as fd:
            fd.write(script_content)
        _beautify_js(script_save_file_name)

    if writeMapping:
        with open(os.path.join(scripts_folder, "mappings.json"),
                  'w+',
                  encoding='utf-8') as fd:
            json.dump(mappings, fd, ensure_ascii=False, indent=4)

    # step 3: save library files
    lib_links_dictionary = dynamic_data[3]
    library_output_folder_of_this_site = os.path.join(
        output_folder_path_name_of_this_page, NAME_LIBRARIES_FOLDER)
    _save_program_libraries(library_output_folder_of_this_site,
                            lib_links_dictionary)

    # create timestamp for reports
    timestamp = get_current_timestamp()
    sep = get_output_header_sep()
    sep_templates = get_output_subheader_sep()

    # step 4: save document and form variables (accessible through document.form_name.input_name)
    document_form_variables = HTMLParserModule.get_document_properties_from_html(
        soup_content)
    path_name_document_props = os.path.join(
        output_folder_path_name_of_this_page, NAME_DOCUMENT_PROPS)
    with open(path_name_document_props, 'w+') as fd:
        fd.write(sep)
        fd.write('[timestamp] generated on %s\n' % timestamp)
        fd.write(
            '[description] defined properties in HTML for \'document\' DOM API\n'
        )
        fd.write(sep + '\n\n')

        for counter, elm in enumerate(document_form_variables, start=1):
            fd.write("(%s): %s\n" % (counter, elm))

    path_name_document_props_machine = os.path.join(
        output_folder_path_name_of_this_page, NAME_DOCUMENT_PROPS_MACHINE)
    with open(path_name_document_props_machine, 'w+') as fd:
        fd.write(str(document_form_variables))

    # step 5: save captured onload requests via extension
    without_data_reqs = xhr_logs['without_data']  # no formData
    with_data_reqs = xhr_logs['with_data']  # also contains formData
    succ_reqs = xhr_logs['succ']  # all successfully accepted requests with 2xx

    path_name_xhr_logs_machine = os.path.join(
        output_folder_path_name_of_this_page, NAME_XHR_LOGS_MACHINE)
    with open(path_name_xhr_logs_machine, "w+") as fp:
        fp.write(str(xhr_logs))

    # save also a nicer human readable version
    path_name_xhr_logs = os.path.join(output_folder_path_name_of_this_page,
                                      NAME_XHR_LOGS)
    with open(path_name_xhr_logs, "w+") as fp:

        for each_request in without_data_reqs:
            try:
                if isinstance(each_request, dict):

                    xhr_url = each_request['url']
                    xhr_url = _unquote_url(xhr_url)
                    xhr_status = _check_if_req_is_successful(
                        each_request['requestId'], succ_reqs)
                    fp.write("Navigation_URL: '%s'\n" % (url))
                    fp.write("Request_URL: '%s'\n" % (xhr_url))
                    fp.write("Request_Accepted: '%s'\n" % (str(xhr_status[0])))
                    fp.write("Response_HTTP_Status: '%s'\n" %
                             (str(xhr_status[1])))
                    fp.write(sep_templates)
                else:
                    d = json.loads(each_request)
                    xhr_url = d['url']
                    xhr_url = _unquote_url(xhr_url)
                    xhr_status = _check_if_req_is_successful(
                        d['requestId'], succ_reqs)
                    fp.write("Navigation_URL: '%s'\n" % (url))
                    fp.write("Request_URL: '%s'\n" % (xhr_url))
                    fp.write("Request_Accepted: '%s'\n" % (str(xhr_status[0])))
                    fp.write("Response_HTTP_Status: '%s'\n" %
                             (str(xhr_status[1])))
                    fp.write(sep_templates)
            except Exception:
                # skip malformed log entries
                continue

        for each_request in with_data_reqs:
            try:
                if isinstance(each_request, dict):
                    xhr_url = each_request['url']
                    xhr_url = _unquote_url(xhr_url)
                    form_data_dict = each_request['requestBody']
                    form_data_str = str(form_data_dict)
                    fp.write("Navigation_URL: '%s'\n" % (url))
                    fp.write("Request_URL: '%s'\n" % (xhr_url))
                    fp.write("Form_Data: \n%s\n" % (form_data_str))
                    xhr_status = _check_if_req_is_successful(
                        each_request['requestId'], succ_reqs)
                    fp.write("Request_Accepted: %s\n" % (str(xhr_status[0])))
                    fp.write("Response_HTTP_Status: %s\n" %
                             (str(xhr_status[1])))
                    fp.write(sep_templates)
                else:
                    d = json.loads(each_request)
                    xhr_url = d['url']
                    xhr_url = _unquote_url(xhr_url)
                    form_data_dict = d['requestBody']
                    form_data_str = str(form_data_dict)
                    fp.write("Navigation_URL: '%s'\n" % (url))
                    fp.write("Request_URL: '%s'\n" % (xhr_url))
                    fp.write("Form_Data: \n%s\n" % (form_data_str))
                    xhr_status = _check_if_req_is_successful(
                        d['requestId'], succ_reqs)
                    fp.write("Request_Accepted: '%s'\n" % (str(xhr_status[0])))
                    fp.write("Response_HTTP_Status: '%s'\n" %
                             (str(xhr_status[1])))
                    fp.write(sep_templates)
            except Exception:
                # skip malformed log entries
                continue

    # step 6: save cookies
    # @Thanks to: https://stackoverflow.com/questions/15058462/how-to-save-and-load-cookies-using-python-selenium-webdriver
    path_name_cookie_logs = os.path.join(output_folder_path_name_of_this_page,
                                         NAME_COOKIE_FILE)
    path_name_cookie_logs_str = os.path.join(
        output_folder_path_name_of_this_page, NAME_COOKIE_FILE_STR)
    with open(path_name_cookie_logs, "wb") as fp:
        pickle.dump(cookies, fp)

    with open(path_name_cookie_logs_str, "w+") as fd:
        fd.write(str(cookies))

    # step 7: save events
    logs = seleniumModule.get_chrome_console_logs(driver)
    with open(
            os.path.join(output_folder_path_name_of_this_page,
                         NAME_FIRED_EVENTS_PICKLE), 'wb') as fd:
        pickle.dump(logs, fd)

    with open(
            os.path.join(output_folder_path_name_of_this_page,
                         NAME_FIRED_EVENTS), 'w+') as fd:
        for log in logs:
            if log['level'] == 'INFO' and log['message'].startswith(
                    'chrome-extension://'):
                fd.write(str(log['message']) + '\n')

    d = RequesterModule.requester(url)
    if RequesterModule.is_http_response_valid(d):
        unrendered_html_page = str(d).strip()
    else:
        driver.get("view-source:" + str(url))
        unrendered_html_page = driver.page_source

    # save the initial html
    path_name_html_unrendered = os.path.join(
        output_folder_path_name_of_this_page, NAME_HTML_UNRENDERED)
    with open(path_name_html_unrendered, "wb") as fp:
        fp.write(unrendered_html_page.encode('utf-8'))
        fp.write(b'\n')

    return SUCCESS
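
The private helpers used throughout collect_site_data (_hash, _beautify_js, _unquote_url, _check_if_req_is_successful) belong to the surrounding project and are not shown. The sketch below gives one plausible shape for each, under the stated assumptions; it is not the project's actual implementation:

# Hypothetical sketches of the project-private helpers used above.
import hashlib
import json
from urllib.parse import unquote

import jsbeautifier  # third-party: pip install jsbeautifier


def _hash(url):
    # derive a stable, filesystem-safe folder name from the page URL
    return hashlib.sha1(url.encode('utf-8')).hexdigest()


def _beautify_js(path):
    # rewrite a saved script file with pretty-printed JavaScript
    with open(path, 'r', encoding='utf-8', errors='ignore') as fd:
        pretty = jsbeautifier.beautify(fd.read())
    with open(path, 'w', encoding='utf-8') as fd:
        fd.write(pretty)


def _unquote_url(url):
    # decode percent-encoded characters for the human-readable logs
    return unquote(url)


def _check_if_req_is_successful(request_id, succ_reqs):
    # look the request up among the 2xx responses captured by the extension;
    # the 'requestId'/'statusCode' keys are assumed to mirror chrome.webRequest
    for req in succ_reqs:
        entry = req if isinstance(req, dict) else json.loads(req)
        if str(entry.get('requestId')) == str(request_id):
            return (True, entry.get('statusCode'))
    return (False, None)
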