Example #1
from urlparse import urlparse

def main(start, d):
           
    seeds = []
    seeds.append(start)  
        
#################LOOP FOR EACH SEED#################
    for x in range(0, len(seeds)):
        depth = d
        seed = seeds.pop(0), depth
        
        domainDetailList = []
        domainFileList = []
        uncrawledLinks = []
        crawledLinks = []
        crawlerObj = crawler()  #initialize crawler object
        domainDetail = crawlerObj.crawlDomain(seed)  #first crawls the domain of the seed URL. returns a DomainDetail object
        domainDetailList.append(domainDetail) #stores the returned DomainDetail object in the domainDetailList[]
        domainFile, childLinks = crawlerObj.crawlFilePath(seed)  #second crawls the file path of the seed URL. returns childLinks and a DomainFile object
        domainFileList.append([])  #appends an empty list at index 0 of the domainFileList
        domainFileList[0].append(domainFile)  #stores the domainFile from the seed URL at the row index matching the domain's index in the domainDetailList
        crawledLinks.append(seed[0]) #adds the seed URL to the list of crawled Links
        updateUncrawledLinks(childLinks, uncrawledLinks, crawledLinks)  #appends the childLinks obtained from the first crawl to the list of uncrawled Links
                                                                        #**may need to have updateUncrawledLinks() return new version of uncrawledLinks and crawledLinks
        
    #################WHILE LOOP FOR CRAWLING CHILDLINKS OF SEED#################
        while(len(uncrawledLinks) > 0):
            currUrl = uncrawledLinks.pop(0) #removes the link at the beginning of the uncrawledLinks list and sets currUrl to that link
            crawledLinks.append(currUrl[0])  #adds the link that was just removed from uncrawledLinks to crawledLinks
            scheme, domain, filePath, params, query, fragment = urlparse(currUrl[0])
            
            domainExists = False  #flag indicating whether the domain of currUrl already exists in the domainDetailList
            y = 0  #count to represent the index in the domainDetailList
            
            #loops while the domain is not found in the domainDetailList and we are within the bounds of the domainDetailList array
            while((domainExists == False) and (y < len(domainDetailList))):
                if(domainDetailList[y].getDomainName() == domain):  #checks if the domain already exists in the domainDetailList
                    domainFile, childLinks = crawlerObj.crawlFilePath(currUrl)  #since the domain has already been crawled, just crawl the page source of the URL
                    domainFileList[y].append(domainFile)  #append the domainFile to the same row index which is equal to the index of the domain in the domainDetailList
                    domainExists = True  #the domain was found in the domainDetailList, so set domainExists to True
                    updateUncrawledLinks(childLinks, uncrawledLinks, crawledLinks)
                y = y + 1
            
            if(domainExists != True):
                domainDetail = crawlerObj.crawlDomain(currUrl)  #domain has not been crawled yet, so crawl it first
                ddIndex = len(domainDetailList)  #row index where the new domain's files will be stored
                domainDetailList.append(domainDetail)
                domainFile, childLinks = crawlerObj.crawlFilePath(currUrl)  #then crawl the page source of the URL
                domainFileList.append([])  #adds a new row in domainFileList for the new domain
                domainFileList[ddIndex].append(domainFile)
                updateUncrawledLinks(childLinks, uncrawledLinks, crawledLinks)
    #################WHILE LOOP FOR CRAWLING CHILDLINKS OF SEED#################
    print "seed '" + seed[0] + "' done"
    
    print "commencing storage Of DomainDetail objects into database..."
    ssedaoObj = SSEDAO()
    for index in range(0, len(domainDetailList)):
        domainDetailList[index].setDomainFiles(domainFileList[index])
        ssedaoObj.storeCrawlerInformation(domainDetailList[index])
        print"DomainDetail object for '" + domainDetailList[index].getDomainName() + "' send to database"
Example #2
            found = True
        
        if(found == False):
            CLIndex = CLIndex + 1
    
    while (len(childLinks) > 0):
        uncrawledLinks.append(childLinks.pop(0))  #moves the remaining child links onto the uncrawled list
       
    return childLinks, uncrawledLinks, crawledLinks
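
#the lines above are only the tail of updateUncrawledLinks(); the version below is a hedged sketch
#of the complete helper, not the original code. It assumes each child link is a (url, depth) tuple
#like the seed, drops links that were already crawled or queued, and then appends the rest to
#uncrawledLinks, mutating the lists in place as the calls in Example #1 expect.
def updateUncrawledLinks(childLinks, uncrawledLinks, crawledLinks):
    x = 0
    while(x < len(childLinks)):
        alreadySeen = childLinks[x][0] in crawledLinks  #crawledLinks holds plain URL strings
        for queued in uncrawledLinks:
            if(queued[0] == childLinks[x][0]):  #uncrawledLinks holds (url, depth) tuples
                alreadySeen = True
        if(alreadySeen):
            childLinks.pop(x)  #discard a link that was already crawled or queued
        else:
            x = x + 1
    while(len(childLinks) > 0):
        uncrawledLinks.append(childLinks.pop(0))  #queue whatever is left as uncrawled
    return childLinks, uncrawledLinks, crawledLinks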
  

#################PROGRAM BEGINS HERE!#################        
seeds = []
seeds.append("http://www.bofa.com/")  

seeds = SSEDAO.getDomainsForCrawler()  #replaces the hand-coded seed list above with the domains stored in the database
for x in range(0, len(seeds)):
    seeds[x] = "http://" + seeds[x]  #prefixes each domain with the http scheme


#################LOOP FOR EACH SEED#################
for x in range(0, len(seeds)):
    depth = 5
    seed = seeds.pop(0), depth
    
    domainDetailList = []
    domainFileList = []
    uncrawledLinks = []
    crawledLinks = []
    crawlerObj = crawler()  #initialize crawler object
    domainDetail = crawlerObj.crawlDomain(seed)  #first crawls the domain of the seed URL. returns a DomainDetail object