Example #1
def crawl_web(seed, mx_pg, mx_dp):
    # Breadth-first crawl from the seed URL, bounded by a maximum number of
    # pages (mx_pg) and a maximum link depth (mx_dp).
    tocrawl = [[seed, 0]]   # frontier of [url, depth] pairs
    crawled = []            # pages already visited
    index = {}              # keyword -> list of URLs containing it

    while tocrawl:
        page_url, depth = tocrawl.pop(0)
        print 'This is the number of pages crawled so far' + ' ' + str(len(crawled))
        print 'This is the DEPTH', depth, page_url

        for key in index.keys():
            print key, rm_url(index[key])

        if (page_url not in crawled) and (len(crawled) < mx_pg) and (depth <= mx_dp):
            content_soup, base_robot_parsed_url = get_page(page_url)

            make_index(index, page_url, content_soup)
            outlinks = all_links(content_soup, base_robot_parsed_url)
            add_to_tocrawl(tocrawl, outlinks, depth, crawled)
            crawled.append(page_url)

            print '++++++++++++++++++++++++++++++++++++++++++++++++++++++++++'
            # TODO: check for URLs that link back to already-crawled pages

    return index
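
All of the examples on this page call helpers that are not shown here: get_page, rm_url, make_index, all_links and add_to_tocrawl (Example #4 imports the last two from get_all_links and union). The block below is only a rough sketch of what they might look like, written against the depth-aware add_to_tocrawl signature used in Examples #1-#3; treat every name and behaviour in it as an assumption rather than the project's actual code.

# Minimal sketches of the helpers the examples rely on (not shown on this
# page); the real versions presumably also handle robots.txt, error handling
# and URL filtering, which are omitted here.
import urllib2
from urlparse import urlparse, urljoin
from bs4 import BeautifulSoup


def get_page(url):
    # Fetch a page and return its soup plus the parsed base URL.
    html = urllib2.urlopen(url).read()
    return BeautifulSoup(html), urlparse(url)


def all_links(content_soup, base_parsed_url):
    # Resolve every <a href> on the page against the base URL.
    base = base_parsed_url.geturl()
    return [urljoin(base, a['href'])
            for a in content_soup.find_all('a', href=True)]


def make_index(index, page_url, content_soup):
    # Map every word on the page to the list of URLs containing it
    # (the three-argument form used in Examples #1, #2 and #4).
    for word in content_soup.get_text().split():
        index.setdefault(word, [])
        if page_url not in index[word]:
            index[word].append(page_url)


def rm_url(urls):
    # Presumably de-duplicates a URL list while keeping first-seen order.
    unique = []
    for url in urls:
        if url not in unique:
            unique.append(url)
    return unique


def add_to_tocrawl(tocrawl, outlinks, depth, crawled):
    # Queue each new link one level deeper than the page it came from.
    for link in outlinks:
        if link not in crawled and [link, depth + 1] not in tocrawl:
            tocrawl.append([link, depth + 1])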
Example #2
def crawl_web(seed, mx_pg, mx_dp):
    tocrawl = [[seed, 0]]

    crawled = []

    index = {}
    while tocrawl:

        page_url, depth = tocrawl.pop(0)
        print 'This is the number of pages crawled so far' + ' ' + str(len(crawled))
        print 'This is the DEPTH', depth, page_url

        # Persist each keyword and its cleaned URL list before crawling on.
        for key in index.keys():
            searchword = Searchword(keyword=key, urls=rm_url(index[key]))
            searchword.save()
            print key, rm_url(index[key])

        if page_url not in crawled and len(crawled) < mx_pg and depth <= mx_dp:
            content_soup, base_robot_parsed_url = get_page(page_url)

            make_index(index, page_url, content_soup)
            outlinks = all_links(content_soup, base_robot_parsed_url)
            add_to_tocrawl(tocrawl, outlinks, depth, crawled)
            crawled.append(page_url)

            print '++++++++++++++++++++++++++++++++++++++++++++++++++++++++++'
            # TODO: check for URLs that link back to already-crawled pages

    return index
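
The only change from Example #1 is that each keyword and its cleaned URL list are also persisted through a Searchword object. That class is not defined on this page; from the keyword/urls constructor arguments and the save() call it looks like an ORM model, perhaps a Django model along these assumed lines:

from django.db import models


class Searchword(models.Model):
    # Assumed shape only: one record per keyword, URLs stored as plain text.
    keyword = models.CharField(max_length=255)
    urls = models.TextField()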
Example #3
def crawl_web(seed, mx_pg, mx_dp, filter_list=None):
    '''
    Drives the entire crawling process: building the index, collecting all
    of the links on each page, and so on.
    '''
    # db and ObjectId are expected to come from the surrounding module
    # (a pymongo database handle and bson.objectid.ObjectId).
    tocrawl = [[seed, 0]]   # frontier of [url, depth] pairs
    crawled = []            # pages already visited
    graph = {}              # page_url -> list of outgoing links
    No_crawled = 0

    while tocrawl:
        page_url, depth = tocrawl.pop(0)
        outlinks = []   # reset per page so a duplicate document does not reuse the previous page's links
        if page_url:
            # Count every URL removed from the frontier.
            db.crawler_web_statistic.update(
                {"_id": ObjectId("517dc20440ade61b20becb7d")},
                {"$inc": {"Number_of_removed_urls": 1}}, safe=True)

        if (page_url not in crawled) and (len(crawled) < mx_pg) and (depth <= mx_dp):
            print 'Crawling %s, depth %d' % (page_url, depth)
            # get_page obtains all of the page's info.
            content_soup, base_robot_parsed_url = get_page(page_url)

            if not test_doc(content_soup):
                make_index(page_url, content_soup)
                outlinks = all_links(content_soup, base_robot_parsed_url)
            else:
                # Count documents that turned out to be duplicates.
                db.crawler_web_statistic.update(
                    {"_id": ObjectId("517dc20440ade61b20becb7d")},
                    {"$inc": {"Number_of_duplicate_documents": 1}}, safe=True)

            # Pairing each page with its outgoing links builds the graph used
            # in the importance-score calculation.
            graph[page_url] = outlinks
            add_to_tocrawl(tocrawl, outlinks, depth, crawled)
            crawled.append(page_url)

            print '++++++++++++++++++++++++++++++++++++++++++++++++++++++++++'
            # TODO: check for URLs that link back to already-crawled pages
            No_crawled = len(crawled)
            print 'This is the number of pages crawled' + ' ' + str(len(crawled))
    return graph, No_crawled, mx_pg, seed
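
Unlike Examples #1 and #2, this variant returns the link graph together with some crawl statistics so that an importance score can be computed afterwards. The original call site is not shown; assuming the compute_ranks function from Example #4 below, it might be used roughly like this:

# Hypothetical call site: crawl up to 500 pages to depth 3, then rank them.
graph, no_crawled, max_pages, seed_url = crawl_web('http://joblistghana.com', 500, 3)
ranks = compute_ranks(graph)
for url in sorted(ranks, key=ranks.get, reverse=True)[:10]:
    print url, ranks[url]   # ten highest-scoring pages first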
Example #4
from get_all_links import links as all_links
from union import add_to_tocrawl


def crawl_web(seed):
    tocrawl = [seed]   # frontier of URLs still to visit
    crawled = []       # pages already visited
    index = {}         # keyword -> list of URLs
    graph = {}         # link graph (built here but not returned in this example)
    while tocrawl:
        page_url = tocrawl.pop()
        if page_url not in crawled:
            content_soup, base_robot_parsed_url = get_page(page_url)

            make_index(index, page_url, content_soup)
            outlinks = all_links(content_soup, base_robot_parsed_url)
            graph[page_url] = outlinks
            add_to_tocrawl(tocrawl, outlinks)
            crawled.append(page_url)
    return index


if __name__ == '__main__':
    crawl_web('http://joblistghana.com')

def compute_ranks(graph):
    damping_factor = 0.8
    num_of_loops = 10
    ranks = {}
    number_of_pages = len(graph)
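
Example #4 is cut off at this point. The variable names (damping_factor, num_of_loops, ranks) suggest the classic iterative PageRank computation over the crawl graph; a plausible completion, offered here as a sketch rather than the original author's code, would continue:

    # Give every page an equal starting rank.
    for page in graph:
        ranks[page] = 1.0 / number_of_pages

    # Repeatedly redistribute rank along the links in the graph.
    for _ in range(num_of_loops):
        newranks = {}
        for page in graph:
            newrank = (1 - damping_factor) / number_of_pages
            for node in graph:
                if page in graph[node]:
                    newrank += damping_factor * ranks[node] / len(graph[node])
            newranks[page] = newrank
        ranks = newranks
    return ranks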