예제 #1
0
def initializeParams(crawling_spec):
    """Launch a proxied Firefox browser, apply the settings in *crawling_spec*,
    and hand control to the crawler state machine.

    crawling_spec is a dict-like object; keys read here: "login_url",
    "login_script" (file handle), "form_values_script" (file handle),
    "base_address", "start_url", "scope_urls", "black_list_urls",
    "depth", "wait_time".

    Returns the fsm object produced by initState(), or None when neither
    a start URL nor a login URL was supplied.
    """
    login_url = None
    start_header = ""
    login_header = ""
    globalVariables = Globals()

    # Route all browser traffic through the local intercepting proxy.
    # NOTE(review): address is hardcoded — presumably matches the proxy
    # started elsewhere in this project; confirm before changing.
    myProxy = "localhost:8081"

    proxy = Proxy({
        'proxyType': ProxyType.MANUAL,
        'httpProxy': myProxy,
        'ftpProxy': myProxy,
        'sslProxy': myProxy,
        'noProxy': ''  # set this value as desired
    })

    driver = webdriver.Firefox(proxy=proxy)
    logger.info("Browser is Launched")

    # Fail fast: without either a start URL or a login URL there is nothing
    # to crawl.  Close the browser before bailing out so the Firefox process
    # is not leaked (the original returned without cleaning up).
    if not crawling_spec["start_url"] and not crawling_spec["login_url"]:
        logger.error("No Start Url Provided nor Login Url Provided")
        driver.close()
        return None

    if crawling_spec["login_url"]:
        login_url = crawling_spec["login_url"]

    if crawling_spec["login_script"]:
        logger.info("Logging in Application")
        if not login_url:
            logger.error("No Login URL provided")
        else:
            # was a bare `print`; use the module logger consistently
            logger.info("performing login")
            start_header, login_header = doLogin(
                login_url, driver, globalVariables.proxy_address,
                scriptFileHandler=crawling_spec["login_script"])

    if crawling_spec["form_values_script"]:
        globalVariables.getFormValues(fileHandler=crawling_spec["form_values_script"])

    if crawling_spec["base_address"]:
        globalVariables.addBaseAddress(crawling_spec["base_address"])

    if crawling_spec["start_url"]:
        driver.get(crawling_spec["start_url"])

    if crawling_spec["scope_urls"]:
        globalVariables.addScopeUrl(crawling_spec["scope_urls"])

    if crawling_spec["black_list_urls"]:
        globalVariables.addBlackList(crawling_spec["black_list_urls"])

    if crawling_spec["depth"]:
        globalVariables.setDepth(int(crawling_spec["depth"]))

    if crawling_spec["wait_time"]:
        globalVariables.setGlobalWait(crawling_spec["wait_time"])

    # Hand the configured browser session over to the crawler FSM.
    logger.info("Initiating the Crawler")
    fsm = initState(
        driver.page_source,
        driver.current_url,
        driver.title,
        driver,
        globalVariables, 0, start_header, login_header)

    driver.close()

    logger.info("graph obj %s", fsm.graph.nodes())
    return fsm
예제 #2
0
def main():
    '''
    Command-line entry point.

    Crawls the demo website http://127.0.0.1:81/login/login.php
    with login credentials
    email = vinaysharma@gmail
    password = vinaykool
    '''
    login_url = None
    start_header = ""
    login_header = ""
    proxy_address = ""

    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--login-script", action="store", dest="login_script", help="Path to python login script")
    parser.add_argument("-u", "--login-url", action="store", dest="login_url", help="Login Page Url")
    parser.add_argument("-f", "--form-script", action="store", dest="form_values_script", help="Path to Form Values Script")
    parser.add_argument("-b", "--base-address", action="store", dest="base_address", help="Base address")
    parser.add_argument("-s", "--start-url", action="store", dest="start_url", help="Starting Page Url")
    parser.add_argument("-bl", "--black-list", action="store", dest="black_list_urls", help="Black List Urls")
    parser.add_argument("-sc", "--scope", action="store", dest="scope_url", help="scope of the crawler")
    parser.add_argument("-d", "--depth", action="store", dest="depth", help="depth of crawl", type=int)
    parser.add_argument("-p", "--proxy", action="store", dest="proxy_address", help="proxy address")
    parser.add_argument('-t', action="store", dest="time", type=int)
    args = parser.parse_args()

    # Validate the arguments BEFORE launching the browser: the original
    # returned from the middle of the function on this error, leaking the
    # already-launched Firefox process.
    if not args.start_url and not args.login_url:
        logger.error("No Start Url Provided nor Login Url Provided")
        return

    globalVariables = Globals()

    if args.proxy_address:
        proxy_address = args.proxy_address

    # Route all browser traffic through the local intercepting proxy.
    # NOTE(review): this webdriver proxy is hardcoded; the -p/--proxy value
    # above is only forwarded to doLogin() — confirm that is intentional.
    myProxy = "localhost:8081"
    proxy = Proxy({
        'proxyType': ProxyType.MANUAL,
        'httpProxy': myProxy,
        'ftpProxy': myProxy,
        'sslProxy': myProxy,
        'noProxy': ''  # set this value as desired
    })

    driver = webdriver.Firefox(proxy=proxy)
    logger.info("Browser is Launched")

    if args.login_url:
        login_url = args.login_url

    if args.login_script:
        logger.info("Logging in Application")
        if not login_url:
            logger.error("No Login URL provided")
        else:
            start_header, login_header = doLogin(
                login_url, driver, proxy_address,
                scriptFilePath=args.login_script)

    if args.form_values_script:
        globalVariables.getFormValues(args.form_values_script)

    if args.base_address:
        globalVariables.addBaseAddress(args.base_address)

    if args.depth:
        globalVariables.setDepth(args.depth)

    if args.start_url:
        driver.get(args.start_url)

    if args.scope_url:
        globalVariables.addScopeUrl(args.scope_url)

    if args.black_list_urls:
        globalVariables.addBlackList(args.black_list_urls)

    if args.time:
        globalVariables.setGlobalWait(args.time)

    # Hand the configured browser session over to the crawler FSM.
    logger.info("Initiating the Crawler")
    initState(
        driver.page_source,
        driver.current_url,
        driver.title,
        driver,
        globalVariables, 0, start_header, login_header)

    driver.close()
예제 #3
0
def initializeParams(crawling_spec):
    """Launch a Firefox browser, apply the settings in *crawling_spec*,
    and hand control to the crawler state machine.

    crawling_spec is a dict-like object; keys read here: "login_url",
    "login_script" (file handle), "form_values_script" (file handle),
    "base_address", "start_url", "scope_urls", "black_list_urls",
    "depth", "wait_time".

    Returns the fsm object produced by initState(), or None when neither
    a start URL nor a login URL was supplied.
    """
    login_url = None
    start_header = ""
    login_header = ""
    globalVariables = Globals()

    driver = webdriver.Firefox()
    logger.info("Browser is Launched")

    # Fail fast: without either a start URL or a login URL there is nothing
    # to crawl.  Close the browser before bailing out so the Firefox process
    # is not leaked (the original returned without cleaning up).
    if not crawling_spec["start_url"] and not crawling_spec["login_url"]:
        logger.error("No Start Url Provided nor Login Url Provided")
        driver.close()
        return None

    if crawling_spec["login_url"]:
        login_url = crawling_spec["login_url"]

    if crawling_spec["login_script"]:
        logger.info("Logging in Application")
        if not login_url:
            logger.error("No Login URL provided")
        else:
            # BUG FIX: the original assigned doLogin()'s results to
            # `start_url, login_url`, clobbering login_url and leaving
            # start_header/login_header empty when passed to initState().
            start_header, login_header = doLogin(
                login_url, driver,
                scriptFileHandler=crawling_spec["login_script"])

    if crawling_spec["form_values_script"]:
        globalVariables.getFormValues(fileHandler=crawling_spec["form_values_script"])

    if crawling_spec["base_address"]:
        globalVariables.addBaseAddress(crawling_spec["base_address"])

    if crawling_spec["start_url"]:
        driver.get(crawling_spec["start_url"])

    if crawling_spec["scope_urls"]:
        globalVariables.addScopeUrl(crawling_spec["scope_urls"])

    if crawling_spec["black_list_urls"]:
        globalVariables.addBlackList(crawling_spec["black_list_urls"])

    if crawling_spec["depth"]:
        globalVariables.setDepth(int(crawling_spec["depth"]))

    if crawling_spec["wait_time"]:
        globalVariables.setGlobalWait(crawling_spec["wait_time"])

    # Hand the configured browser session over to the crawler FSM.
    logger.info("Initiating the Crawler")
    fsm = initState(
        driver.page_source,
        driver.current_url,
        driver.title,
        driver,
        globalVariables, 0, start_header, login_header)

    driver.close()
    return fsm