# Third-party imports used below (the pre-Selenium-4 Proxy API). Project-local
# names (Globals, doLogin, initState, logger) are assumed to be imported
# elsewhere in this file.
import argparse

from selenium import webdriver
from selenium.webdriver.common.proxy import Proxy, ProxyType


def initializeParams(crawling_spec):
    login_url = None
    start_header = ""
    login_header = ""
    globalVariables = Globals()
    # globalVariables.bannedUrls.append("http://127.0.0.1:81/login/profile.html")
    #driver = webdriver.PhantomJS()
    myProxy = "localhost:8081"
    proxy = Proxy({
        'proxyType': ProxyType.MANUAL,
        'httpProxy': myProxy,
        'ftpProxy': myProxy,
        'sslProxy': myProxy,
        'noProxy': ''  # set this value as desired
    })
    driver = webdriver.Firefox(proxy=proxy)
    #driver = webdriver.Chrome()
    logger.info("Browser is Launched")
    #driver.get("http://127.0.0.1:81/login/login.php")
    '''
    if crawling_spec["proxy_address"]:
        globalVariables.setProxy(crawling_spec["proxy_address"])
    '''
    if crawling_spec["login_url"]:
        login_url = crawling_spec["login_url"]
    if crawling_spec["login_script"]:
        #print crawling_spec["login_script"].readlines()
        logger.info("Logging in Application")
        if not login_url:
            logger.error("No Login URL provided")
        else:
            print "performing login"
            start_header, login_header = doLogin(
                login_url, driver, globalVariables.proxy_address,
                scriptFileHandler=crawling_spec["login_script"])
    if crawling_spec["form_values_script"]:
        globalVariables.getFormValues(fileHandler=crawling_spec["form_values_script"])
    if crawling_spec["base_address"]:
        globalVariables.addBaseAddress(crawling_spec["base_address"])
    if crawling_spec["start_url"]:
        driver.get(crawling_spec["start_url"])
    if crawling_spec["scope_urls"]:
        globalVariables.addScopeUrl(crawling_spec["scope_urls"])
    if crawling_spec["black_list_urls"]:
        globalVariables.addBlackList(crawling_spec["black_list_urls"])
    if crawling_spec["depth"]:
        globalVariables.setDepth(int(crawling_spec["depth"]))
    if not crawling_spec["start_url"] and not crawling_spec["login_url"]:
        logger.error("No Start URL or Login URL provided")
        # no entry point to crawl from; shut the browser down before bailing out
        driver.close()
        return
    if crawling_spec["wait_time"]:
        globalVariables.setGlobalWait(crawling_spec["wait_time"])
    # time.sleep(5)
    # hand control to the controller to initiate crawler activity
    logger.info("Initiating the Crawler")
    fsm = initState(driver.page_source, driver.current_url, driver.title,
                    driver, globalVariables, 0, start_header, login_header)
    #assert "Welcome, " in driver.page_source
    driver.close()
    print "graph obj", fsm.graph.nodes()
    return fsm
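# Illustrative sketch (not part of the crawler): one way to build a
# crawling_spec for initializeParams(). Every URL, path, and value below is a
# placeholder. Note that initializeParams() indexes each key unconditionally,
# so all keys must be present even when unused, and "login_script" /
# "form_values_script" are expected to be open file handles (they are passed
# on as scriptFileHandler / fileHandler).
def exampleInitializeParams():
    crawling_spec = {
        "login_url": "http://127.0.0.1:81/login/login.php",
        "login_script": open("login_script.py"),        # file handle, not a path
        "form_values_script": open("form_values.txt"),  # file handle, not a path
        "base_address": "http://127.0.0.1:81/",
        "start_url": "http://127.0.0.1:81/login/login.php",
        "scope_urls": "http://127.0.0.1:81/",
        "black_list_urls": "http://127.0.0.1:81/login/logout.php",
        "depth": 3,
        "wait_time": 2,
        "proxy_address": "localhost:8081",  # only read by the commented-out setProxy block
    }
    fsm = initializeParams(crawling_spec)
    if fsm:
        print "graph nodes:", fsm.graph.nodes()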
def main():
    '''
    Crawls the demo website http://127.0.0.1:81/login/login.php with login credentials
    email = vinaysharma@gmail
    password = vinaykool
    '''
    login_url = None
    start_header = ""
    login_header = ""
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--login-script", action="store", dest="login_script",
                        help="Path to python login script")
    parser.add_argument("-u", "--login-url", action="store", dest="login_url",
                        help="Login Page Url")
    parser.add_argument("-f", "--form-script", action="store", dest="form_values_script",
                        help="Path to Form Values Script")
    parser.add_argument("-b", "--base-address", action="store", dest="base_address",
                        help="Base address")
    parser.add_argument("-s", "--start-url", action="store", dest="start_url",
                        help="Starting Page Url")
    parser.add_argument("-bl", "--black-list", action="store", dest="black_list_urls",
                        help="Black List Urls")
    parser.add_argument("-sc", "--scope", action="store", dest="scope_url",
                        help="Scope of the crawler")
    parser.add_argument("-d", "--depth", action="store", dest="depth",
                        help="Depth of crawl", type=int)
    parser.add_argument("-p", "--proxy", action="store", dest="proxy_address",
                        help="Proxy address")
    parser.add_argument("-t", action="store", dest="time", type=int,
                        help="Global wait time")
    args = parser.parse_args()

    globalVariables = Globals()
    proxy_address = ""
    # globalVariables.bannedUrls.append("http://127.0.0.1:81/login/profile.html")
    #driver = webdriver.PhantomJS()
    myProxy = "localhost:8081"
    proxy = Proxy({
        'proxyType': ProxyType.MANUAL,
        'httpProxy': myProxy,
        'ftpProxy': myProxy,
        'sslProxy': myProxy,
        'noProxy': ''  # set this value as desired
    })
    driver = webdriver.Firefox(proxy=proxy)
    #driver = webdriver.Chrome()
    logger.info("Browser is Launched")
    #driver.get("http://127.0.0.1:81/login/login.php")
    if args.proxy_address:
        proxy_address = args.proxy_address
    if args.login_url:
        login_url = args.login_url
    if args.login_script:
        logger.info("Logging in Application")
        if not login_url:
            logger.error("No Login URL provided")
        else:
            start_header, login_header = doLogin(login_url, driver, proxy_address,
                                                 scriptFilePath=args.login_script)
    if args.form_values_script:
        globalVariables.getFormValues(args.form_values_script)
    if args.base_address:
        globalVariables.addBaseAddress(args.base_address)
    if args.depth:
        globalVariables.setDepth(args.depth)
    if args.start_url:
        driver.get(args.start_url)
    if args.scope_url:
        globalVariables.addScopeUrl(args.scope_url)
    if args.black_list_urls:
        globalVariables.addBlackList(args.black_list_urls)
    if not args.start_url and not args.login_url:
        logger.error("No Start URL or Login URL provided")
        # no entry point to crawl from; shut the browser down before bailing out
        driver.close()
        return
    if args.time:
        globalVariables.setGlobalWait(args.time)
    # time.sleep(5)
    # hand control to the controller to initiate crawler activity
    logger.info("Initiating the Crawler")
    initState(driver.page_source, driver.current_url, driver.title,
              driver, globalVariables, 0, start_header, login_header)
    #assert "Welcome, " in driver.page_source
    driver.close()
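# Usage sketch: main() is argparse-driven, so it is normally run from the
# command line. The invocation below is illustrative only; the script name,
# URLs, and paths are placeholders, while the flags are the ones defined in
# main() above:
#
#   python crawler.py -u http://127.0.0.1:81/login/login.php \
#       -l login_script.py -f form_values.txt \
#       -b http://127.0.0.1:81/ -sc http://127.0.0.1:81/ \
#       -d 3 -t 2 -p localhost:8081
#
# Entry-point guard (assumed; drop it if the surrounding file already defines one).
if __name__ == "__main__":
    main()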