import utils
import db_utils
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from pyvirtualdisplay import Display

# `capabilities` is used by crawl_url but never defined in this file; a plain
# Chrome capabilities dict is the most plausible intent (assumption).
capabilities = DesiredCapabilities.CHROME


def persistURLsForNextCrawlInDB(db, urls):
    urls = prepareURLsForNextCrawl(urls)
    crawl_queue = db.crawl_queue
    visited_urls = db.visited_urls
    for url in urls:
        domain = utils.domainOf(url)
        domain_hash = utils.get_url_hash(domain)
        url_hash = utils.get_url_hash(url)
        # Documents are keyed by the hash itself: {<hash>: <value>}.
        db_query = {domain_hash: domain}
        update = {url_hash: url}
        # Queue the URL only if its domain has not been visited yet.
        url_obj = visited_urls.find_one(db_query)
        if not url_obj:
            crawl_queue.insert(update)
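
# prepareURLsForNextCrawl is called above (and again in crawl_url) but is not
# defined in this file. A minimal sketch of what it plausibly does, given how
# its output is consumed: sanitize and deduplicate, keeping only http(s) URLs,
# which is all crawl_url will accept. The real implementation may differ.
def prepareURLsForNextCrawl(urls):
    prepared = set()
    for url in urls:
        if url and url.startswith("http"):
            prepared.add(utils.sanitize_url(url))
    return prepared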
def seedStartURLsInDB(db, start_urls):
    crawl_queue_table = db.crawl_queue
    url_hashes = [{utils.get_url_hash(url): url} for url in start_urls]
    for query in url_hashes:
        # Upsert so reseeding the same start URLs never duplicates queue entries.
        crawl_queue_table.find_and_modify(query=query, update=query,
                                          upsert=True, new=True)
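
# The helpers below are referenced throughout but live elsewhere (utils,
# db_utils) or are absent from this file entirely (regex_domain_match). These
# are minimal, hypothetical sketches consistent with the call sites; names
# such as the 'crawler' database and 'domain_blacklist' collection are guesses.
import hashlib
import re
from urlparse import urlparse       # Python 2, matching the code's print syntax
from pymongo import MongoClient

def get_url_hash(url):
    # Sketch of utils.get_url_hash: a stable lookup key for a URL.
    return hashlib.md5(url).hexdigest()

def domainOf(url):
    # Sketch of utils.domainOf: the network location, e.g. "example.com".
    return urlparse(url).netloc

def sanitize_url(url):
    # Sketch of utils.sanitize_url: one plausible normalization.
    return url.split('#')[0].rstrip('/')

def getDBInstance():
    # Sketch of db_utils.getDBInstance: a pymongo database handle.
    return MongoClient()['crawler']

def regex_domain_match(db, url):
    # Sketch of the blacklist check crawl_url relies on: does the URL's
    # domain match any regex pattern stored in the DB?
    domain = urlparse(url).netloc
    for entry in db.domain_blacklist.find():
        if re.search(entry.get('pattern', ''), domain):
            return True
    return False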
def crawl_url(url, headless=True, save_into_db=True):
    print "Crawling URL", url
    initial_url_hash = utils.get_url_hash(url)
    update = {initial_url_hash: url}
    db = db_utils.getDBInstance()
    if regex_domain_match(db, url):
        # Blacklisted domain: drop the queue entry and skip the crawl.
        print "Skipping: ", url
        db.crawl_queue.remove(update)
        return -1
    url = utils.sanitize_url(url)
    url_hash = utils.get_url_hash(url)
    db_query = {'url_hash': url_hash}
    if headless:
        # Virtual framebuffer so Chrome can run on a display-less box.
        display = Display(visible=0, size=(800, 600))
        display.start()
    obj_in_db = db.webpages.find_one(db_query)
    webpage = None
    if not obj_in_db:
        webpage = WebPage(url)
    browser = webdriver.Chrome(desired_capabilities=capabilities)
    browser.set_page_load_timeout(30)
    #browser.implicitly_wait(5)
    try:
        print "Visiting page: ", url
        if not url.startswith("http"):
            raise Exception("not an http(s) URL: %s" % url)
        browser.get(url)
        #time.sleep(1)
    except Exception, e:
        print "Error Occurred"
        browser.quit()
        print e
        return -1
    # Link extraction (populating a_links and js_links from the loaded page)
    # is not shown in this file.
    browser.quit()
    if save_into_db:
        update = None
        if webpage:
            update = webpage.json()
        else:
            update = obj_in_db
        db.webpages.find_and_modify(query=db_query, update=update,
                                    new=True, upsert=True)
        all_urls = a_links.union(js_links)
        all_urls = prepareURLsForNextCrawl(all_urls)
        for out_url in all_urls:
            url_hash = utils.get_url_hash(out_url)
            obj = db.webpages.find_one({'url_hash': url_hash})
            if obj:
                print "Updating for: ", out_url
                query = {'url_hash': url_hash}
                # Record the current page as a referrer, deduplicated.
                obj["incoming_links"].append(url)
                obj["incoming_links"] = list(set(obj["incoming_links"]))
                if "_id" in obj:
                    obj.pop("_id")      # a replacement document must not carry _id
                new_obj = db.webpages.find_and_modify(query=query, update=obj,
                                                      new=True, upsert=True)
            else:
                # First sighting of this outgoing URL: create its page record.
                new_webpage = WebPage(out_url)
                new_webpage.incoming_links.add(url)
                db.webpages.insert(new_webpage.json())
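
# crawl_url builds WebPage objects, but only the _url_hash method below
# survives in this file. A minimal sketch of the rest of the class, inferred
# from how crawl_url uses it; the real class almost certainly carries more
# page state (e.g. the a_links / js_links extracted from the page):
class WebPage(object):

    def __init__(self, url):
        self.url = url
        self.incoming_links = set()     # referrers; crawl_url adds to this set

    def json(self):
        # Document shape inferred from the db.webpages queries on 'url_hash'.
        return {'url': self.url,
                'url_hash': self._url_hash(),
                'incoming_links': list(self.incoming_links)}
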
    def _url_hash(self):
        return utils.get_url_hash(self.url)
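
# A hypothetical driver, not part of the original source: seed the queue,
# then drain it, crawling each queued entry. Assumes the {<url_hash>: <url>}
# document shape written by seedStartURLsInDB.
if __name__ == '__main__':
    db = db_utils.getDBInstance()
    seedStartURLsInDB(db, ["http://example.com"])
    for doc in db.crawl_queue.find():
        for key, queued_url in doc.items():
            if key != '_id':
                crawl_url(queued_url)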