Example #1
File: adhocCrawler.py Project: asamiam/SSE
from urlparse import urlparse  #Python 2 standard library; crawler, SSEDAO, and updateUncrawledLinks are assumed to come from other SSE modules

def main(start, d):
           
    seeds = [start]  #list of seed URLs to crawl; currently just the start URL
        
#################LOOP FOR EACH SEED#################
    for x in range(0, len(seeds)):
        depth = d
        seed = seeds.pop(0), depth  #pair the seed URL with its crawl depth as a (url, depth) tuple
        
        domainDetailList = []
        domainFileList = []
        uncrawledLinks = []
        crawledLinks = []
        crawlerObj = crawler()  #initialize crawler object
        domainDetail = crawlerObj.crawlDomain(seed)  #first crawls the domain of the seed URL. returns a DomainDetail object
        domainDetailList.append(domainDetail) #stores the returned DomainDetail object in the domainDetailList[]
        domainFile, childLinks = crawlerObj.crawlFilePath(seed)  #second crawls the file path of the seed URL. returns childLinks and a DomainFile object
        domainFileList.append([])  #appends a list at the first index of the domainFileList
        domainFileList[0].append(domainFile)  #appends the domainFile retrieved from the seed URL to the same row index which is equal to the index of the domain in the domainDetailList
        crawledLinks.append(seed[0]) #adds the seed URL to the list of crawled Links
        updateUncrawledLinks(childLinks, uncrawledLinks, crawledLinks)  #appends the childLinks obtained from the first crawl to the list of uncrawled Links
                                                                        #**may need to have updateUncrawledLinks() return new version of uncrawledLinks and crawledLinks
        
    #################WHILE LOOP FOR CRAWLING CHILDLINKS OF SEED#################
        while(len(uncrawledLinks) > 0):
            currUrl = uncrawledLinks.pop(0) #removes the link at the beginning of the uncrawledLinks list and sets currUrl to that link
            crawledLinks.append(currUrl[0])  #adds the link that was just removed from uncrawledLinks to crawledLinks
            scheme, domain, filePath, params, query, fragment = urlparse(currUrl[0])
            
            domainExists = False  #flag indicating whether the domain of currUrl already exists in the domainDetailList
            y = 0  #count to represent the index in the domainDetailList
            
            #loops while the domain is not found in the domainDetailList and we are within the bounds of the domainDetailList array
            while((domainExists == False) and (y < len(domainDetailList))):
                if(domainDetailList[y].getDomainName() == domain):  #checks if the domain already exists in the domainDetailList
                    domainFile, childLinks = crawlerObj.crawlFilePath(currUrl)  #since the domain has already been crawled, just crawl the page source of the URL
                    domainFileList[y].append(domainFile)  #append the domainFile to the same row index which is equal to the index of the domain in the domainDetailList
                    domainExists = True  #domain was found in the domainDetailList, so set domainExists to True
                    updateUncrawledLinks(childLinks, uncrawledLinks, crawledLinks)
                y = y + 1
            
            if not domainExists:  #the domain was not found, so crawl it for the first time
                domainDetail = crawlerObj.crawlDomain(currUrl)
                ddIndex = len(domainDetailList)
                domainDetailList.append(domainDetail)
                domainFile, childLinks = crawlerObj.crawlFilePath(currUrl)
                domainFileList.append([])
                domainFileList[ddIndex].append(domainFile)
                updateUncrawledLinks(childLinks, uncrawledLinks, crawledLinks)
    #################WHILE LOOP FOR CRAWLING CHILDLINKS OF SEED#################
    print "seed '" + seed[0] + "' done"
    
    print "commencing storage of DomainDetail objects into database..."
    ssedaoObj = SSEDAO()
    for index in range(0, len(domainDetailList)):
        domainDetailList[index].setDomainFiles(domainFileList[index])
        ssedaoObj.storeCrawlerInformation(domainDetailList[index])
        print "DomainDetail object for '" + domainDetailList[index].getDomainName() + "' sent to database"
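
The helper updateUncrawledLinks() is called throughout both excerpts but never shown, and the comment in example #1 notes it may eventually need to return updated lists. Below is a minimal sketch of what such a helper could look like, assuming childLinks and uncrawledLinks hold (url, depth) tuples and crawledLinks holds plain URL strings, as main() above uses them; the depth check and duplicate filtering are assumptions, not the SSE project's actual implementation.

def updateUncrawledLinks(childLinks, uncrawledLinks, crawledLinks):
    #hypothetical sketch: queue child links that have not been crawled or queued yet
    #assumes (url, depth) tuples in childLinks/uncrawledLinks and URL strings in crawledLinks
    queued = set(url for url, _ in uncrawledLinks)
    for url, depth in childLinks:
        if depth <= 0:  #assumed: stop expanding links once the crawl depth is exhausted
            continue
        if url in crawledLinks or url in queued:
            continue  #skip links that were already crawled or are already queued
        uncrawledLinks.append((url, depth))
        queued.add(url)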
Example #2
File: crawler.py Project: asamiam/SSE
        domainExists = False  #flag indicating whether the domain of currUrl already exists in the domainDetailList
        y = 0  #count to represent the index in the domainDetailList
        
        #loops while the domain is not found in the domainDetailList and we are within the bounds of the domainDetailList array
        while((domainExists == False) and (y < len(domainDetailList))):
            if(domainDetailList[y].getDomainName() == domain):  #checks if the domain already exists in the domainDetailList
                domainFile, childLinks = crawlerObj.crawlFilePath(currUrl)  #since the domain has already been crawled, just crawl the page source of the URL
                domainFileList[y].append(domainFile)  #append the domainFile to the same row index which is equal to the index of the domain in the domainDetailList
                domainExists = True  #domain was found in the domainDetailList, so set domainExists to True
                updateUncrawledLinks(childLinks, uncrawledLinks, crawledLinks)
            y = y + 1
        
        if not domainExists:  #the domain was not found, so crawl it for the first time
            domainDetail = crawlerObj.crawlDomain(currUrl)
            ddIndex = len(domainDetailList)
            domainDetailList.append(domainDetail)
            domainFile, childLinks = crawlerObj.crawlFilePath(currUrl)
            domainFileList.append([])
            domainFileList[ddIndex].append(domainFile)
            updateUncrawledLinks(childLinks, uncrawledLinks, crawledLinks)
    #################WHILE LOOP FOR CRAWLING CHILDLINKS OF SEED#################
print "seed '" + seed[0] + "' done"

print "commencing storage of DomainDetail objects into database..."
ssedaoObj = SSEDAO()
for index in range(0, len(domainDetailList)):
    domainDetailList[index].setDomainFiles(domainFileList[index])
    ssedaoObj.storeCrawlerInformation(domainDetailList[index])
    print "DomainDetail object for '" + domainDetailList[index].getDomainName() + "' sent to database"

#################LOOP FOR EACH SEED#################
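
Neither excerpt shows how main() is invoked. A hypothetical entry point for adhocCrawler.py, assuming start is the seed URL and d is the maximum crawl depth (both values below are placeholders, not taken from the project):

if __name__ == '__main__':
    main('http://example.com/index.html', 2)  #placeholder seed URL and crawl depth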