def start_crawler(config):
    """Entry point: record the crawl target itself, then begin crawling.

    Stores *config* in the module-level ``this_config`` so that ``crawl``
    can read the configured depth limit, persists a root ``Link`` row for
    the target URL (no parent, depth 0), and starts the recursive crawl.
    """
    # crawl() reads the depth limit from this module-level config.
    global this_config
    this_config = config

    root_url = this_config['target']

    # The target is its own root: empty parent, level zero.
    root_link = Link(url=root_url, parent='', depth=0)
    add_obj_to_session(root_link)

    # Begin the recursive crawl from the target.
    crawl(root_url, root_url, 0)
def crawl(url, parent_url, parent_link_level):
    """Fetch *url*, store every child link found in its HTML, and recurse.

    Each discovered link is made absolute (resolved against *parent_url*
    when it has no scheme), persisted as a ``Link`` at depth
    ``parent_link_level + 1``, and crawled in turn until the depth limit
    from ``this_config['depth']`` is reached.

    Fix vs. original: the depth limit is parsed ONCE, before the loop,
    and only the parse is guarded. The original wrapped the recursive
    ``crawl()`` call in a bare ``except:``, so any error raised anywhere
    in the recursive descent was silently swallowed and misreported as a
    bad ``depth`` config value — and the parse was repeated per link.
    """
    res = request_url(url)
    if res:
        html_doc = res.read()
        child_links_list = get_child_links(html_doc)

        # Parse the configured depth limit once. Only the conversion is
        # guarded, so genuine errors from the recursive crawl propagate
        # instead of being masked by the config error message.
        try:
            max_depth = int(this_config['depth'])
        except (KeyError, ValueError, TypeError):
            print('Error in crawl(), check that depth is a number in your config')
            max_depth = None  # links are still stored; we just never recurse

        for link in child_links_list:
            # Make the url absolute if it is not already (no scheme yet).
            # NOTE(review): .encode('utf8') is Python-2 idiom; under
            # Python 3 this would yield bytes — confirm target runtime.
            link = link.encode('utf8')
            if urlparse(link).scheme == '':
                link = urljoin(parent_url, link)

            # Store the link, its parent, and its 'depth' (level).
            link_obj = Link(url=link, parent=parent_url,
                            depth=parent_link_level + 1)
            add_obj_to_session(link_obj)

            # Recurse while we are above the configured depth limit.
            if max_depth is not None and parent_link_level < max_depth:
                crawl(link, url, parent_link_level + 1)