def persistURLsForNextCrawlInDB(db, urls):
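    """Queue URLs for the next crawl pass in db.crawl_queue."""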
    urls = prepareURLsForNextCrawl(urls)
    crawl_queue = db.crawl_queue
    visited_urls = db.visited_urls

    for url in urls:
        domain = utils.domainOf(url)
        domain_hash = utils.get_url_hash(domain)
        url_hash = utils.get_url_hash(url)

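        # Skip any URL whose domain is already recorded in visited_urls;
        # otherwise queue it as a single {url_hash: url} document.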
        db_query = {domain_hash: domain}
        update = {url_hash: url}
        url_obj = visited_urls.find_one(db_query)
        if not url_obj:
            crawl_queue.insert(update)


def seedStartURLsInDB(db, start_urls):
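    """Upsert each start URL into crawl_queue, keyed by its URL hash."""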

    crawl_queue_table = db.crawl_queue

    url_hashes = [{utils.get_url_hash(url): url} for url in start_urls]
    for query in url_hashes:
        crawl_queue_table.find_and_modify(query=query, update=query,
                upsert=True, new=True)


def crawl_url(url, headless=True, save_into_db=True):
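    """Fetch `url` with a Selenium-driven Chrome browser, store the page in
    db.webpages, and update incoming-link records for the URLs found on it.
    Returns -1 if the page could not be fetched.
    """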
    print "Crawling URL",url
    iurl_hash = utils.get_url_hash(url)
    update = {iurl_hash: url}

    db = db_utils.getDBInstance()
    # Dequeue the URL first; if its domain matches the skip rules, do not crawl it.
    db.crawl_queue.remove(update)
    if regex_domain_match(db, url):
        print "Skipping: ", url
        return

    url = utils.sanitize_url(url)
    url_hash = utils.get_url_hash(url)
    db_query = {'url_hash': url_hash}

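    # When headless, run the browser inside a virtual X display (pyvirtualdisplay).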
    if headless:
        display = Display(visible=0, size=(800, 600))
        display.start()

    
    obj_in_db = db.webpages.find_one(db_query)
    
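    # Build a fresh WebPage object only if this URL has no record in db.webpages yet.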
    webpage = None
    if not obj_in_db:
        webpage = WebPage(url)


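    # `capabilities` is assumed to be a module-level Chrome DesiredCapabilities
    # dict configured elsewhere in this file.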
    browser = webdriver.Chrome(desired_capabilities=capabilities)
    browser.set_page_load_timeout(30)
    #browser.implicitly_wait(5)
    try:
        print "Visiting page: ",url
        if not url.startswith("http"):
            raise Exception

        browser.get(url)
        #time.sleep(1)
    except Exception, e:
        print "Error occurred while crawling", url
        print e
        browser.quit()
        if headless:
            display.stop()
        return -1
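    # NOTE: the page-parsing step (populating `webpage` and collecting the
    # a_links / js_links sets used below) is not shown in this excerpt.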
    browser.quit()
    if headless:
        display.stop()

    if save_into_db:
        update = None
        if webpage:
            update = webpage.json()
        else:
            update = obj_in_db

        db.webpages.find_and_modify(query=db_query, update=update, new=True, upsert=True)
        
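        # Merge the anchor-tag links and JavaScript-discovered links found on the page.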
        all_urls = a_links.union(js_links)
        all_urls = prepareURLsForNextCrawl(all_urls) 

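        # Record the crawled page as an incoming link on every outgoing URL,
        # creating a stub webpage record for URLs not seen before.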
        for out_url in all_urls:
            url_hash = utils.get_url_hash(out_url)
            obj = db.webpages.find_one({'url_hash':url_hash})
            if obj:
                print "Updating for: ", out_url
                query = {'url_hash': url_hash}
                obj["incoming_links"].append(url)
                obj["incoming_links"] = list(set(obj["incoming_links"]))
                if "_id" in obj:
                    obj.pop("_id")
                new_obj = db.webpages.find_and_modify(query=query, update=obj,
                        new=True, upsert=True)
            else:
                new_webpage = WebPage(out_url)
                new_webpage.incoming_links.add(url)
                db.webpages.insert(new_webpage.json())
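

# --- Illustrative usage sketch (not part of the original module) ---
# A minimal driver loop, assuming crawl_queue documents each hold a single
# {url_hash: url} pair, as written by seedStartURLsInDB above:
#
#   db = db_utils.getDBInstance()
#   seedStartURLsInDB(db, ["http://example.com"])
#   for doc in db.crawl_queue.find():
#       for key, queued_url in doc.items():
#           if key != "_id":
#               crawl_url(queued_url, headless=True, save_into_db=True)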