def crawl_web(seed, mx_pg, mx_dp):
    # Breadth-first crawl from `seed`, limited to `mx_pg` pages and a link
    # depth of `mx_dp`. The frontier `tocrawl` holds [url, depth] pairs.
    tocrawl = [[seed, 0]]
    crawled = []
    index = {}
    while tocrawl:
        page_url, depth = tocrawl.pop(0)
        print 'Number of pages crawled so far' + ' ' + str(len(crawled))
        print 'This is the DEPTH', depth, page_url
        #print index
        for key in index.keys():
            print key, rm_url(index[key])
        if (page_url not in crawled) and (len(crawled) < mx_pg) and (depth <= mx_dp):
            content_soup, base_robot_parsed_url = get_page(page_url)
            make_index(index, page_url, content_soup)
            outlinks = all_links(content_soup, base_robot_parsed_url)
            add_to_tocrawl(tocrawl, outlinks, depth, crawled)
            crawled.append(page_url)
            print '++++++++++++++++++++++++++++++++++++++++++++++++++++++++++'
    # TODO: handle URLs that link back to pages that have already been crawled.
    return index
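# The helpers get_page, make_index, all_links, rm_url and add_to_tocrawl are
# defined in other modules of the project (a later revision imports all_links
# from get_all_links and add_to_tocrawl from union). Purely as a minimal
# sketch, assuming the frontier stores [url, depth] pairs as above,
# add_to_tocrawl could look roughly like this; the project's real
# implementation may differ.
def add_to_tocrawl(tocrawl, outlinks, depth, crawled):
    # Enqueue each newly discovered link one level deeper than the page it
    # was found on, skipping links already crawled or already queued.
    queued = [url for url, _ in tocrawl]
    for link in outlinks:
        if link not in crawled and link not in queued:
            tocrawl.append([link, depth + 1])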
def crawl_web(seed, mx_pg, mx_dp):
    tocrawl = [[seed, 0]]
    crawled = []
    index = {}
    while tocrawl:
        page_url, depth = tocrawl.pop(0)
        print 'Number of pages crawled so far' + ' ' + str(len(crawled))
        print 'This is the DEPTH', depth, page_url
        #print index
        for key in index.keys():
            # Persist each keyword together with its de-duplicated URL list.
            searchword = Searchword(keyword=key, urls=rm_url(index[key]))
            searchword.save()
            print key, rm_url(index[key])
        if page_url not in crawled and len(crawled) < mx_pg and depth <= mx_dp:
            content_soup, base_robot_parsed_url = get_page(page_url)
            make_index(index, page_url, content_soup)
            outlinks = all_links(content_soup, base_robot_parsed_url)
            add_to_tocrawl(tocrawl, outlinks, depth, crawled)
            crawled.append(page_url)
            print '++++++++++++++++++++++++++++++++++++++++++++++++++++++++++'
    # TODO: handle URLs that link back to pages that have already been crawled.
    return index
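# This revision persists the index through a Searchword model defined elsewhere
# in the project. Purely as an assumption for illustration (the real model, its
# fields and its storage backend are not shown here), a Django-style declaration
# compatible with the keyword/urls arguments used above might look like this.
from django.db import models

class Searchword(models.Model):
    keyword = models.CharField(max_length=255)
    # The URL list is stored as plain text in this sketch; the actual project
    # may represent it differently.
    urls = models.TextField()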
def crawl_web(seed, mx_pg, mx_dp, filter_list=None):
    '''
    Drive the entire crawling process: fetch each page, build the index,
    collect every outgoing link on the page, and record crawl statistics
    in MongoDB.
    '''
    tocrawl = [[seed, 0]]
    crawled = []
    graph = {}
    outlinks = []
    No_crawled = 0
    while tocrawl:
        # Count every document removed from the frontier.
        page_url, depth = tocrawl.pop(0)
        if page_url:
            db.crawler_web_statistic.update(
                {"_id": ObjectId("517dc20440ade61b20becb7d")},
                {"$inc": {"Number_of_removed_urls": 1}}, safe=True)
        if (page_url not in crawled) and (len(crawled) < mx_pg) and (depth <= mx_dp):
            print 'Crawling %s, depth %d' % (page_url, depth)
            # get_page obtains all the page information: the parsed content and
            # the base URL used to resolve links and robots.txt rules.
            content_soup, base_robot_parsed_url = get_page(page_url)
            if not test_doc(content_soup):
                make_index(page_url, content_soup)
                outlinks = all_links(content_soup, base_robot_parsed_url)
            else:
                # Count documents that turn out to be duplicates.
                db.crawler_web_statistic.update(
                    {"_id": ObjectId("517dc20440ade61b20becb7d")},
                    {"$inc": {"Number_of_duplicate_documents": 1}}, safe=True)
            # Record the relation between page_url and its outgoing links;
            # these pairs form the graph used in the importance-score (rank)
            # calculation.
            graph[page_url] = outlinks
            add_to_tocrawl(tocrawl, outlinks, depth, crawled)
            crawled.append(page_url)
            print '++++++++++++++++++++++++++++++++++++++++++++++++++++++++++'
    # TODO: handle URLs that link back to pages that have already been crawled.
    No_crawled = len(crawled)
    print 'This is the number of pages crawled' + ' ' + str(No_crawled)
    return graph, No_crawled, mx_pg, seed
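# Illustrative only: a hypothetical driver showing how this version's return
# values might be consumed. The seed URL is taken from the revision below; the
# page and depth limits are made-up examples, and compute_ranks refers to the
# rank routine started at the end of this section.
if __name__ == '__main__':
    graph, no_crawled, max_pages, seed = crawl_web('http://joblistghana.com', 500, 3)
    print 'Crawled %d of at most %d pages starting from %s' % (no_crawled, max_pages, seed)
    ranks = compute_ranks(graph)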
from get_all_links import links as all_links
from union import add_to_tocrawl
# get_page and make_index are used below but not imported here; they are
# defined elsewhere in the project.


def crawl_web(seed):
    # Depth-unbounded variant: crawl until the frontier is exhausted.
    tocrawl = [seed]
    crawled = []
    index = {}
    graph = {}
    while tocrawl:
        page_url = tocrawl.pop()
        if page_url not in crawled:
            content_soup, base_robot_parsed_url = get_page(page_url)
            make_index(index, page_url, content_soup)
            outlinks = all_links(content_soup, base_robot_parsed_url)
            graph[page_url] = outlinks
            add_to_tocrawl(tocrawl, outlinks)
            crawled.append(page_url)
    return index


if __name__ == '__main__':
    crawl_web('http://joblistghana.com')


def compute_ranks(graph):
    damping_factor = 0.8
    num_of_loops = 10
    ranks = {}
    number_of_pages = len(graph)
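# The compute_ranks definition above breaks off after its initial assignments.
# Below is a sketch of one possible completion, assuming the usual iterative
# rank update over the link graph produced by crawl_web; everything after the
# first four assignments is an assumption, not the project's original code.
def compute_ranks(graph):
    damping_factor = 0.8
    num_of_loops = 10
    ranks = {}
    number_of_pages = len(graph)
    # Start every page with an equal share of the total rank.
    for page in graph:
        ranks[page] = 1.0 / number_of_pages
    for _ in range(num_of_loops):
        new_ranks = {}
        for page in graph:
            # Every page keeps a base amount of rank...
            new_rank = (1 - damping_factor) / number_of_pages
            # ...and receives a share of the rank of each page that links to it.
            for node in graph:
                if page in graph[node]:
                    new_rank += damping_factor * ranks[node] / len(graph[node])
            new_ranks[page] = new_rank
        ranks = new_ranks
    return ranks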