# Assumes module-level imports: from datetime import datetime,
# plus the Query and Page classes used below.
def run_job(self):
    self.status = {}
    self.status["scope"] = "running crawl job"
    if self.query is None:
        self.status["msg"] = "Unable to start crawl: no query has been set."
        self.status["code"] = 600.1
        self.status["status"] = False
        return False
    query = Query(self.query)
    # populate the sources collection before checking for seeds
    self.collect_sources()
    if self.db.sources.count() == 0:
        self.status["msg"] = "Unable to start crawl: no seeds have been set."
        self.status["code"] = 600.1
        self.status["status"] = False
        return False
    self.send_seeds_to_queue()
    start = datetime.now()
    if self.db.queue.count() == 0:
        self.status["msg"] = "Error while sending urls into queue: queue is empty"
        self.status["code"] = 600.1
        self.status["status"] = False
        return False
    self.status["msg"] = "running crawl on %i sources with query '%s'" % (len(self.db.sources.distinct("url")), self.query)
    while self.db.queue.count() > 0:
        for url in self.db.queue.distinct("url"):
            if url != "":
                page = Page(url)
                if page.check() and page.request() and page.control():
                    article = page.extract("article")
                    if article.status is True:
                        if article.is_relevant(query):
                            self.db.results.insert(article.repr())
                            if article.outlinks is not None and len(article.outlinks) > 0:
                                self.db.queue.insert(article.outlinks)
                        else:
                            # irrelevant article: keep a trace in the logs
                            self.db.logs.insert(article.logs)
                else:
                    # page failed check/request/control: log its status
                    self.db.logs.insert(page.status)
            self.db.queue.remove({"url": url})
            if self.db.queue.count() == 0:
                break
        if self.db.queue.count() == 0:
            break
    end = datetime.now()
    elapsed = end - start
    self.status["msg"] = "%s. Crawl done successfully in %s" % (self.status["msg"], str(elapsed))
    self.status["status"] = True
    return True
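# A minimal usage sketch: how a caller might consume the status dict that
# run_job() fills in. `Crawler` is a hypothetical owner class for run_job(),
# assumed to expose the query and db attributes the method reads from self.
job = Crawler("myproject", "open data")  # hypothetical constructor
if job.run_job():
    print job.status["msg"]
else:
    print "crawl failed (code %s): %s" % (job.status["code"], job.status["msg"])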
import datetime

from pymongo.errors import PyMongoError


def crawler(docopt_args):
    start = datetime.datetime.now()
    db_name = docopt_args['<project>']
    query = docopt_args['<query>']
    db = Database(db_name)
    db.create_colls()
    while db.queue.count() > 0:
        print "beginning crawl"
        print "Number of sources in the database:", db.sources.count()
        print "Number of urls to process:", len(db.queue.distinct("url"))
        for url in db.queue.distinct("url"):
            # skip urls that already have a result document
            if db.results.find_one({"url": url}) is None:
                p = Page(url, query)
                if p.check() and p.request() and p.control() and p.extract():
                    # upsert the page info and push the crawl date
                    db.results.update(p.info, {'$push': {"date": datetime.datetime.today()}}, upsert=True)
                    if p.outlinks is not None:
                        try:
                            for n_url in p.outlinks:
                                # enqueue only urls not yet seen in queue, results or log
                                if (n_url is not None
                                        and db.queue.find_one({"url": n_url}) is None
                                        and db.results.find_one({"url": n_url}) is None
                                        and db.log.find_one({"url": n_url}) is None):
                                    # Checking correct url before is problematic
                                    # next_p = Page(n_url, query)
                                    # if next_p.clean_url(p.url) is not None:
                                    db.queue.insert({"url": n_url})
                        except PyMongoError:
                            db.log.update({"url": url, "error_type": "pymongo error inserting outlinks", "query": query, "status": False}, {'$push': {"date": datetime.datetime.today()}}, upsert=True)
                elif p.error_type != 0:
                    # the page is not relevant: log it, do not store it in results
                    db.log.update(p.bad_status(), {'$push': {"date": datetime.datetime.today()}}, upsert=True)
                else:
                    # no result and no error: leave the url in the queue for a retry
                    continue
            db.queue.remove({"url": url})
            if db.queue.count() == 0:
                print db.stats()
                break
        if db.queue.count() == 0:
            print db.stats()
            break
    end = datetime.datetime.now()
    elapsed = end - start
    print "crawl finished, %i results and %i sources are stored in Mongo Database: %s in %s" % (db.results.count(), db.sources.count(), db_name, elapsed)
    return
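# A minimal sketch of how crawler() might be wired to docopt. The usage
# string is an assumption, written to match the '<project>' and '<query>'
# keys that crawler() reads from docopt_args.
USAGE = """crawtext.

Usage:
    crawtext.py <project> <query>
"""

if __name__ == "__main__":
    from docopt import docopt
    crawler(docopt(USAGE))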