def main_data_collection():
    args = sys.argv
    if len(args) > 1:
        low = int(args[1])
        high = low
        if len(args) > 2:
            high = int(args[2])

        for i in range(low, high + 1):
            # id of the site currently being crawled
            site_id = str(i)

            # 1. get saved URLs, or find URLs if needed
            urls = get_site_urls(site_id)

            # 2. collect js and data of the site, for each URL found
            if CrawlerConfig.PLATFORM == "linux":
                # run the browser inside a virtual display on headless servers
                display = Display(visible=0, size=(800, 600))
                display.start()

            driver = seleniumModule.get_new_browser(xhr_logger=True, event_logger=True, headless_mode=False)

            ## load predefined states into the browser (e.g., login)
            driver = CrawlerModule.get_logged_driver(driver, site_id)

            for navigation_url in urls:
                # crawlerUtilityModule.collect_site_data(site_id, navigation_url, driver)
                d = RequesterModule.requester(navigation_url)
                ## check if the site base address is reachable
                if RequesterModule.is_http_response_valid(d):
                    try:
                        crawlerUtilityModule.collect_site_data(site_id, navigation_url, driver)
                    except BaseException as error:
                        print('chrome ran into an error for site: %s' % site_id)
                        # restart the browser and move on to the next URL
                        driver = seleniumModule.get_new_browser(xhr_logger=True, event_logger=True, headless_mode=False)
                        continue
                else:
                    continue

            if CrawlerConfig.PLATFORM == "linux":
                display.stop()
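# Usage sketch (hypothetical; the script name below and the exact CLI wiring are
# assumptions not shown in this section): main_data_collection() reads a single
# site id, or an inclusive id range, from the command line, e.g.:
#
#   python3 crawler.py 3        # crawl site id 3 only
#   python3 crawler.py 1 10     # crawl site ids 1 through 10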
def get_dynamic_data(siteId, url, driver=None, close_conn=True, internal_only=False):
    """
    @returns: None if the url is not reachable; otherwise a list containing
              the page_content, soup_content, and scripts (internal & external)
              of the reachable URL
    """
    if not driver:
        driver = seleniumModule.get_new_browser()

    try:
        driver.get(url)
    except:
        # url unreachable
        return None

    # inject the JS library detection script into the page and wait for its output
    driver.execute_script(open(constantsModule.JS_LIB_DETECTION_FILE_PATH_NAME, "r").read())
    time.sleep(constantsModule.JS_LIB_DETECTION_WAIT_TIME)

    # Note:
    # We skip re-analyzing JS libraries embedded in the pages
    # to make the JS property graph analyzer return faster
    # and to prevent re-analyzing millions of (similar) nodes
    # on our Neo4j DB for every single URL.
    jslibs = constantsModule.JS_LIB_DETECTION_DEFAULT_LIST_WHEN_FAILED
    jslibs += constantsModule.JS_LIB_DETECTION_ALWAYS_CHECK_FOR
    try:
        elements = driver.find_elements_by_class_name(constantsModule.JS_LIB_DETECTION_SLUG_CLASS_OUTPUT)
        if len(elements) > 0:
            lib_detection_output = elements[0].text
            libs_list = lib_detection_output.split(',')
            jslibs = _normalize_js_library_names(libs_list)
    except:
        if constantsModule.DEBUG_PRINTS:
            print("[Warning]: selenium found no library detection outputs!")

    page_content = driver.page_source
    soup_content = BeautifulSoup(page_content, "html.parser")

    domain = get_base_url(url)
    library_links = {}  # lib name -> lib address
    scripts = []
    internals = []

    for i in soup_content.find_all('script'):
        if not i.get('src'):
            if not i.get('type'):
                # script contains JS if the type attribute is absent
                scripts.append(['internal_script', i.text])
                internals.append(['internal_script', i.text])
            else:
                script_type = i.get('type')
                # filter out text/json, etc.
                if is_valid_script_type(script_type):
                    scripts.append(['internal_script', i.text])
                    internals.append(['internal_script', i.text])
        else:
            src = i.get('src')
            relative_link = src.lstrip('/')
            if src.startswith('//'):
                # protocol-relative link
                link = relative_link
            elif relative_link.startswith('www'):
                link = "http://" + relative_link
            elif relative_link.startswith('http'):
                link = relative_link
            else:
                link = domain + '/' + relative_link

            # filter out known libraries by checking whether any library name
            # keyword occurs in the link string
            addLink = True
            for keyword in jslibs:
                if addLink and keyword in link:
                    addLink = False
                    key = keyword + "___" + get_short_uuid()
                    library_links[key] = link
                    break

            if addLink:
                if constantsModule.DEBUG_PRINTS:
                    print("++ Ext JS Link: %s" % link)
                if not i.get('type'):
                    scripts.append(['external_script', link])
                else:
                    script_type = i.get('type')
                    if is_valid_script_type(script_type):
                        scripts.append(['external_script', link])

    if internal_only:
        all_scripts = internals
    else:
        # additionally fetch the content of the external script links
        all_scripts = _get_data_external_links(scripts, driver=driver)

    if close_conn:
        driver.close()

    return [page_content, soup_content, all_scripts, library_links]
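# Minimal usage sketch (hypothetical: the site id and URL below are assumptions
# for illustration; in the crawler this function is normally invoked by the
# data-collection pipeline with a shared, logged-in driver):
#
#   result = get_dynamic_data("1", "https://example.com")
#   if result is None:
#       print("URL unreachable")
#   else:
#       page_content, soup_content, all_scripts, library_links = result
#       print("collected %d scripts" % len(all_scripts))
#       print("detected libraries: %s" % ", ".join(library_links.keys()))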