Code example #1
File: Crawler.py Project: dip-kush/CrawlerUI
def addGraphNode(newNode, curNode, driver, fsm, entity,path):
    '''
    Add a node to the finite state machine,
    first checking that its DOM tree does not already exist.
    '''
    graph = fsm.graph
    curNodeUrl = graph.node[curNode]['nodedata'].link
    newNodeUrl = driver.current_url
    if curNodeUrl == newNodeUrl:
        logger.debug("found the same url %s %d" % (curNodeUrl, curNode))
        fsm.doBacktrack = True

    # Inherit the parent's backtrack path, then record the clickable
    # (entity) that produced this new state.
    for item in graph.node[curNode]['nodedata'].backtrackPath:
        newNode.backtrackPath.append(item)

    newNode.backtrackPath.append(entity)

    # checkNodeExists returns the matching node's number, or -1 when
    # this DOM tree has not been seen before.
    existNodeNumber = fsm.checkNodeExists(newNode.domString)
    header = printRequest()
    if existNodeNumber == -1:
        nodeNumber = fsm.numberOfNodes()
        print "========================="
        print header
        print "========================="
        # Diff the parent DOM against the new one to isolate the inserted markup.
        newNode.insertedDom = getDomDiff(graph.node[curNode]['nodedata'].domString, newNode.domString)

        if curNodeUrl == newNodeUrl:
            newNode.clickables = GetClickables(newNode.insertedDom)
        else:
            newNode.clickables = GetClickables(newNode.domString)
        fsm.addNode(nodeNumber, newNode)
        logger.info("Adding a New Node %d to Graph" % (nodeNumber))
        fsm.addEdges(curNode, nodeNumber, entity, header)
        logger.info(
            "Adding an Edge from Node %d to %d" %
            (curNode, nodeNumber))
        print nodeNumber, newNode.clickables
        c = newNode.clickables
        for item in c:
            print item.xpath
        time.sleep(1.5)
        imgpath = os.path.join(BASE_DIR, "static/screenshots/"+str(nodeNumber)+".png")
        driver.save_screenshot(imgpath)
        clearContent()
        return nodeNumber
    else:
        logger.info("Dom Tree Already Exist")
        fsm.addEdges(curNode, existNodeNumber, entity, header)
        clearContent()
        return -1
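
The snippet above leans on an fsm object whose class is not shown on this page. Below is a minimal sketch of what that state machine might provide, assuming it wraps a networkx directed graph and detects revisited states by comparing stored DOM strings; the method names mirror the calls above, but the bodies are guesses, not CrawlerUI's actual implementation:

import networkx as nx

class FSM(object):
    '''Hypothetical stand-in for the crawler's state machine.'''
    def __init__(self):
        self.graph = nx.DiGraph()
        self.doBacktrack = False

    def numberOfNodes(self):
        return self.graph.number_of_nodes()

    def checkNodeExists(self, domString):
        # Return the node number holding an identical DOM string, else -1.
        for n, data in self.graph.nodes(data=True):
            if data['nodedata'].domString == domString:
                return n
        return -1

    def addNode(self, nodeNumber, nodeData):
        self.graph.add_node(nodeNumber, nodedata=nodeData)

    def addEdges(self, src, dst, entity, header):
        # Label the transition with the clickable and the captured header.
        self.graph.add_edge(src, dst, entity=entity, header=header)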
Code example #2
File: ExtractDom.py Project: dip-kush/CrawlerUI
def doLogin(login_url, driver, path, scriptFilePath=None, scriptFileHandler=None):
    print "doing login"
    print login_url
    driver.get(login_url)
    start_header = printRequest()
    print "========================="
    print start_header
    print "========================="
    time.sleep(2)
    clearContent()

    if scriptFilePath:
        f = open(scriptFilePath)
    else:
        f = scriptFileHandler
    bs = BeautifulSoup(f)
    l = bs.findAll("tr")
    print len(l)
    # Each remaining <tr> is one recorded step of the login script:
    # (command, target, value), like a Selenium IDE table export.
    for i in range(1, len(l)):
        type = l[i].findAll("td")[0].text
        target = l[i].findAll("td")[1].text
        value = l[i].findAll("td")[2].text
        if value == "" and type == "clickAndWait":
            findSelector(driver, type, target, value)
        elif value != "":
            # Targets like "id=username" split into a locator strategy
            # and the field locator itself.
            target = str(target)
            index = target.find('=')
            type = target[0:index]
            fieldVal = target[index + 1:]
            if type == "id":
                element = driver.find_element_by_id(fieldVal)
                element.send_keys(value)
            elif type == "name":
                driver.find_element_by_name(fieldVal).send_keys(value)
            else:
                driver.find_element_by_xpath(fieldVal).send_keys(value)
    time.sleep(2)
    login_header = printRequest()
    print "========================="
    print login_header
    print "========================="
    clearContent()
    return (start_header, login_header)
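
doLogin walks a table of (command, target, value) rows, which matches the shape of a Selenium IDE HTML export. Here is a self-contained sketch of just that parsing step; the three-row table below is made up for illustration and is not taken from the project:

from bs4 import BeautifulSoup

html = """
<table>
  <tr><td>Command</td><td>Target</td><td>Value</td></tr>
  <tr><td>type</td><td>id=username</td><td>admin</td></tr>
  <tr><td>clickAndWait</td><td>//input[@type='submit']</td><td></td></tr>
</table>
"""

rows = BeautifulSoup(html, "html.parser").findAll("tr")
for row in rows[1:]:  # skip the header row, as doLogin does
    command, target, value = [td.text for td in row.findAll("td")]
    if value and "=" in target:
        # "id=username" splits into a locator strategy and a field locator.
        strategy, field = target.split("=", 1)
        print((command, strategy, field, value))
    else:
        print((command, target))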
Code example #3
File: Crawler.py Project: dip-kush/CrawlerUI
def backtrack(driver, fsm, node, formValues, tillEnd, path):
    logger.info("Doing backtrack")
    graph = fsm.graph
    # The node's recorded backtrack path: path[0] is the start URL,
    # later entries are clickables carrying an xpath attribute.
    path = graph.node[node]['nodedata'].backtrackPath
    driver.get(path[0])
    for i in range(1, len(path) - 1 + tillEnd):
        time.sleep(0.5)
        fillFormValues(formValues, driver)
        time.sleep(0.5)
        driver.find_element_by_xpath(path[i].xpath).click()

    clearContent()
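
The loop bound range(1, len(path)-1+tillEnd) is easy to misread: with tillEnd=0 the replay stops one clickable short of the target node, while tillEnd=1 replays every recorded click. A tiny demo of just that indexing, using a hypothetical five-entry path:

# path[0] is the start URL; the rest stand in for recorded clickables.
path = ["http://example.com/start", "click-1", "click-2", "click-3", "click-4"]

for tillEnd in (0, 1):
    replayed = [path[i] for i in range(1, len(path) - 1 + tillEnd)]
    print((tillEnd, replayed))
# tillEnd=0 -> replays click-1..click-3 (stops one step short)
# tillEnd=1 -> replays click-1..click-4 (the full path)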
Code example #4
File: Crawler.py Project: dip-kush/Ajax-Crawler
def backtrack(driver, fsm, node, formValues, tillEnd):
    logger.info("Doing backtrack")
    graph = fsm.graph
    path = graph.node[node]['nodedata'].backtrackPath
    # Reload the start URL, then replay the recorded clickables in order.
    driver.get(path[0])
    for i in range(1, len(path) - 1 + tillEnd):
        time.sleep(0.5)
        fillFormValues(formValues, driver)
        time.sleep(0.5)
        driver.find_element_by_xpath(path[i].xpath).click()
    clearContent()
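
Both versions of backtrack assume that every path entry after the first exposes an xpath attribute. Below is a guess at that record, written as a namedtuple; only xpath is actually read by the code above, and the extra fields (tag, attr, attrVal) are inferred from debugging code in the project, so treat the exact shape as an assumption:

from collections import namedtuple

# Hypothetical shape of one backtrack-path step; only .xpath is
# dereferenced by backtrack() above.
Clickable = namedtuple("Clickable", ["tag", "attr", "attrVal", "xpath"])

path = [
    "http://example.com/app",  # path[0]: the URL to reload first
    Clickable("a", "id", "menu", "//a[@id='menu']"),
    Clickable("input", "name", "go", "(//input[@name='go'])[1]"),
]

for step in path[1:]:
    print(step.xpath)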