Example #1
def main_data_collection():

    args = sys.argv

    if len(args) > 1:
        low = int(args[1])
        high = low
        if len(args) > 2:
            high = int(args[2])

        for i in range(low, high + 1):

            # current site id within the range given on the command line
            site_id = str(i)
            # 1. get saved URLs or find URLs if needed
            urls = get_site_urls(site_id)

            # 2. collect js and data of the site, for each URL found
            if CrawlerConfig.PLATFORM == "linux":
                display = Display(visible=0, size=(800, 600))
                display.start()

            driver = seleniumModule.get_new_browser(xhr_logger=True,
                                                    event_logger=True,
                                                    headless_mode=False)

            ## load predefined states into the browser (e.g., login)
            driver = CrawlerModule.get_logged_driver(driver, site_id)

            for navigation_url in urls:
                # crawlerUtilityModule.collect_site_data(site_id, navigation_url, driver)

                d = RequesterModule.requester(navigation_url)
                ## check if the site base address is reachable
                if RequesterModule.is_http_response_valid(d):
                    try:
                        crawlerUtilityModule.collect_site_data(
                            site_id, navigation_url, driver)
                    except BaseException as error:
                        print('Chrome ran into an error for site %s: %s' %
                              (site_id, error))
                        driver = seleniumModule.get_new_browser(
                            xhr_logger=True,
                            event_logger=True,
                            headless_mode=False)
                        continue
                else:
                    continue

            if CrawlerConfig.PLATFORM == "linux":
                display.stop()
def get_dynamic_data(siteId,
                     url,
                     driver=None,
                     close_conn=True,
                     internal_only=False):
    """
	@returns: 
		None if url is not reachable
		O.W. a list containing page_content + soup_content + scripts (internal & external) from a reachable URL
	"""
    if not driver:
        driver = seleniumModule.get_new_browser()
    try:
        driver.get(url)
    except:
        # url unreachable
        return None

    # inject the library-detection script into the loaded page
    with open(constantsModule.JS_LIB_DETECTION_FILE_PATH_NAME, "r") as fd:
        driver.execute_script(fd.read())
    time.sleep(constantsModule.JS_LIB_DETECTION_WAIT_TIME)

    # Note:
    # We skip re-analyzing JS libraries embedded in the pages
    # to make the JS property graph analyzer return faster
    # and prevent re-analyzing millions of (similar) nodes
    # on our Neo4j DB for every single URL
    # start from the fall-back keyword list; copy it so the module-level constants are not mutated
    jslibs = list(constantsModule.JS_LIB_DETECTION_DEFAULT_LIST_WHEN_FAILED)
    jslibs += constantsModule.JS_LIB_DETECTION_ALWAYS_CHECK_FOR
    try:
        elements = driver.find_elements_by_class_name(
            constantsModule.JS_LIB_DETECTION_SLUG_CLASS_OUTPUT)
        if (len(elements) > 0):
            lib_detection_output = elements[0].text
            libs_list = lib_detection_output.split(',')
            jslibs = _normalize_js_library_names(libs_list)
    except:
        if constantsModule.DEBUG_PRINTS:
            print("[Warning]: selenium found no library detection outputs!")

    page_content = driver.page_source
    soup_content = BeautifulSoup(page_content, "html.parser")

    domain = get_base_url(url)

    library_links = {}  # lib name -> lib address
    scripts = []
    internals = []
    for i in soup_content.find_all('script'):
        if not i.get('src'):
            if not i.get('type'):
                # script contains JS if type is absent
                scripts.append(['internal_script', i.text])
                internals.append(['internal_script', i.text])
            else:
                script_type = i.get('type')
                # filter out non-JS types such as text/json
                # (see the is_valid_script_type sketch after this function)
                if is_valid_script_type(script_type):
                    scripts.append(['internal_script', i.text])
                    internals.append(['internal_script', i.text])

        else:
            src = i.get('src')
            if src.startswith('//'):
                # protocol-relative URL, e.g. //cdn.example.com/lib.js
                link = 'http:' + src
            elif src.startswith('www'):
                link = "http://" + src
            elif src.startswith('http'):
                link = src
            else:
                link = domain + '/' + src.lstrip('/')

            # record known JS libraries separately (in library_links) instead of
            # adding them to scripts: check whether any library keyword occurs in the link
            addLink = True
            for keyword in jslibs:
                if keyword in link:
                    addLink = False
                    key = keyword + "___" + get_short_uuid()
                    library_links[key] = link
                    break
            if addLink:
                if constantsModule.DEBUG_PRINTS:
                    print("++ Ext JS Link: %s" % link)

                if not i.get('type'):
                    scripts.append(['external_script', link])
                else:
                    script_type = i.get('type')
                    if is_valid_script_type(script_type):
                        scripts.append(['external_script', link])

    if internal_only:
        all_scripts = internals
    else:
        all_scripts = _get_data_external_links(scripts, driver=driver)

    if close_conn:
        driver.close()

    return [page_content, soup_content, all_scripts, library_links]
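

# A hedged sketch of what is_valid_script_type() might check, based on the
# "filter out non-JS types" comment above; the project's actual implementation
# is not part of this listing and the accepted types below are an assumption.
_JS_SCRIPT_TYPES_SKETCH = {
    "text/javascript",
    "application/javascript",
    "application/ecmascript",
    "module",
}


def is_valid_script_type_sketch(script_type):
    # an absent or empty type attribute means executable JavaScript;
    # data blocks such as application/json or text/template are rejected
    return (not script_type) or script_type.strip().lower() in _JS_SCRIPT_TYPES_SKETCH


# Minimal usage example for get_dynamic_data() (the site id and URL below are
# illustrative placeholders, not values taken from the project):
if __name__ == "__main__":
    result = get_dynamic_data("1", "https://example.com", close_conn=True)
    if result is None:
        print("URL not reachable")
    else:
        page_content, soup_content, all_scripts, library_links = result
        print("collected %d scripts and %d library links" %
              (len(all_scripts), len(library_links)))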