Example #1
	def run_job(self):
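		# assumes 'from datetime import datetime' at module level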
		self.status = {}
		self.status["scope"] = "running crawl job"
		if self.query is None:
			self.status["msg"] = "Unable to start crawl: no query has been set."
			self.status["code"] = 600.1
			self.status["status"] = False
			return False 
		else:
			query = Query(self.query)
			
		self.collect_sources()  # populate self.db.sources with seed URLs
		if self.db.sources.count() == 0:
			self.status["msg"] = "Unable to start crawl: no seeds have been set."
			self.status["code"] = 600.1
			self.status["status"] = False
			return False
		else:
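			# push the collected seed URLs into the crawl queue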
			self.send_seeds_to_queue()
		
		start = datetime.now()
		if self.db.queue.count() == 0:
			self.status["msg"] = "Error while sending urls into queue: queue is empty"
			self.status["code"] = 600.1
			self.status["status"] = False
			return False
			
		else:
			self.status["msg"] = "running crawl on %i sources with query '%s'" %(len(self.db.sources.distinct("url")), self.query)
				
			while self.db.queue.count > 0:	
				for url in self.db.queue.distinct("url"):
					if url != "":
						page = Page(url)
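						# validate the URL, fetch the page, and run content checks before extracting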
						if page.check() and page.request() and page.control():
							article = page.extract("article")
							if article.status is True:
								if article.is_relevant(query):			
									self.db.results.insert(article.repr())
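									# enqueue the article's outlinks so they are crawled in later passes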
									if article.outlinks is not None and len(article.outlinks) > 0:
										self.db.queue.insert(article.outlinks)
							else:	
								self.db.logs.insert(article.logs)
						else:
							self.db.logs.insert(page.status)	
					self.db.queue.remove({"url": url})
					
					if self.db.queue.count() == 0:		
						break
				if self.db.queue.count() == 0:
					break
			end = datetime.now()
			elapsed = end - start

			self.status["msg"] = "%s. Crawl done successfully in %s" % (self.status["msg"], str(elapsed))
			self.status["status"] = True
			return True
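A minimal sketch of how this method might be driven, assuming a hypothetical `Crawler` class that sets `self.query` and `self.db` before `run_job()` is called (both names are taken from the method body above):

	crawler = Crawler(project="my_project", query="open data")  # hypothetical constructor
	if crawler.run_job():
		print crawler.status["msg"]
	else:
		print "crawl aborted: %s (code %s)" % (crawler.status["msg"], crawler.status["code"])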
Example #2
import datetime
import pymongo

def crawler(docopt_args):
	start = datetime.datetime.now()
	db_name = docopt_args['<project>']
	query = docopt_args['<query>']
	
	db = Database(db_name)
	db.create_colls()
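	# process queued URLs until the queue drains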
	while db.queue.count() > 0:

		print "beginning crawl"
		print "Nombre de sources dans la base", db.sources.count()
		print "Nombre d'url à traiter", len(db.queue.distinct("url"))
		for url in db.queue.distinct("url"):
			if url not in db.results.find({"url":url}):
				p = Page(url, query)
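				# run the full pipeline: URL check, HTTP request, content control, extraction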
				
				if p.check() and p.request() and p.control() and p.extract():
					#print "Links", p.outlinks
					db.results.update(p.info, {'$push': {"date": datetime.datetime.today()}}, upsert=True)
					#db.results.insert(p.info)
					if p.outlinks is not None:
						try:
							for n_url in p.outlinks:
								# enqueue only URLs not already queued, stored, or logged
								if n_url is not None and db.queue.find_one({"url": n_url}) is None and db.results.find_one({"url": n_url}) is None and db.log.find_one({"url": n_url}) is None:
									# cleaning the URL before enqueueing proved problematic:
									# next_p = Page(n_url, query)
									# if next_p.clean_url(p.url) is not None:
									db.queue.insert({"url": n_url})
						except pymongo.errors.PyMongoError:
							db.log.update({"url": url, "error_type": "pymongo error inserting outlinks", "query": query, "status": False}, {'$push': {"date": datetime.datetime.today()}}, upsert=True)
				elif p.error_type != 0:
					# the page failed a check; log its bad status rather than storing it
					db.log.update(p.bad_status(), {'$push': {"date": datetime.datetime.today()}}, upsert=True)
				else:
					continue

			db.queue.remove({"url": url})
			if db.queue.count() == 0:
				print db.stats()
				break
			
		if db.queue.count() == 0:
			print db.stats()		
			break
		

	end = datetime.datetime.now()
	elapsed = end - start
	print "crawl finished, %i results and %i sources are stored in Mongo Database: %s in %s" %(db.results.count(),db.sources.count(),db_name, elapsed)
	return 
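A minimal, hypothetical entry point for this function, assuming a docopt usage string that defines the `<project>` and `<query>` arguments read above:

	"""Usage: crawler.py <project> <query>"""
	from docopt import docopt

	if __name__ == "__main__":
		arguments = docopt(__doc__)  # parses sys.argv against the usage string
		crawler(arguments)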