Beispiel #1
0
def Login(params=None, title=None):
    """
    Renders the login page.

    There is no separate login for the application, this is passed to
    the database!

    The page holds a POST form with user name, password, host and port
    inputs plus a submit button, followed by a ``div`` that lists every
    entry of ``params['err']`` in its own ``code`` element.

    :param params: optional mapping; only its ``'err'`` entry (an iterable
        of messages) is read. ``None`` renders the page without errors.
    :param title: not used by this function; kept so existing callers
        keep working.
    :returns: the populated ``Page`` object.
    """
    p = Page()
    p.content('<!DOCTYPE html>')
    with p.html():
        with p.head():
            with p.title():
                p.content('pgui - Login')
            with p.link({'href': 'static/lib/bootstrap/bootstrap-3.3.4-dist/css/bootstrap.css', 'rel': 'stylesheet'}): pass
            with p.link({'href': 'static/login.css', 'rel': 'stylesheet'}): pass

        with p.body():

            with p.div({'class': 'container'}):
                with p.form({'method': 'POST', 'class': 'login'}):
                    with p.h2({'class': 'login-header'}):
                        p.content('Connect to a postgres database server')

                    with p.label({'for': 'name', 'class': 'sr-only'}):
                        p.content('User name')
                    with p.input({'type': 'input', 'id': 'name', 'name': 'name', 'class': 'form-control', 'placeholder': 'User name'}): pass

                    with p.label({'for': 'password', 'class': 'sr-only'}):
                        p.content('Password')
                    with p.input({'type': 'password', 'id': 'password', 'name': 'password', 'class': 'form-control', 'placeholder': 'Password'}): pass

                    with p.label({'for': 'host', 'class': 'sr-only'}):
                        # Was p.control('Host'); every other label writes its
                        # text with p.content().
                        p.content('Host')
                    with p.input({'type': 'input', 'id': 'host', 'name': 'host', 'class': 'form-control', 'value': 'localhost'}): pass

                    with p.label({'for': 'port', 'class': 'sr-only'}):
                        p.content('Port')
                    with p.input({'type': 'input', 'id': 'port', 'name': 'port', 'class': 'form-control', 'value': '5432'}): pass

                    with p.button({'class': 'btn btn-lg btn-success btn-block', 'type': 'submit'}, args=['autofocus']):
                        p.content('Connect')

                # The error container is always emitted, even when empty.
                with p.div({'class': 'login'}):
                    # params defaults to None, so guard before the membership
                    # test to avoid a TypeError.
                    if params is not None and 'err' in params and params['err']:
                        # NOTE(review): messages go through p.content() as-is;
                        # confirm Page escapes HTML if they can carry user input.
                        for err in params['err']:
                            with p.code():
                                p.content(err)

    return p
Beispiel #2
0
	def run_job(self):
		"""
		Run a crawl over every url in the queue and report the outcome.

		The outcome is written to ``self.status`` (keys ``scope``, ``msg``,
		``code`` and ``status``). The job aborts early with code 600.1 when
		no query is set, when there are no sources, or when the queue is
		still empty after the seeds were sent.

		For every non-empty queued url the page is checked, requested and
		controlled; relevant articles are inserted into ``self.db.results``
		and their outlinks are inserted into the queue. Failures are written
		to ``self.db.logs``. Each url is removed from the queue once handled.

		Returns True when the crawl ran to completion, False when it could
		not start.
		"""
		self.status = {}
		self.status["scope"] = "running crawl job"
		if self.query is None:
			self.status["msg"] = "Unable to start crawl: no query has been set."
			self.status["code"] = 600.1
			self.status["status"] = False
			return False
		query = Query(self.query)

		# The return value was never used; presumably this call fills
		# self.db.sources -- confirm against collect_sources().
		self.collect_sources()
		if self.db.sources.count() == 0:
			self.status["msg"] = "Unable to start crawl: no seeds have been set."
			self.status["code"] = 600.1
			self.status["status"] = False
			return False
		self.send_seeds_to_queue()

		start = datetime.now()
		# count must be *called*: comparing the bound method itself to 0 is
		# never equal, so this guard could not fire before.
		if self.db.queue.count() == 0:
			self.status["msg"] = "Error while sending urls into queue: queue is empty"
			self.status["code"] = 600.1
			self.status["status"] = False
			return False

		self.status["msg"] = "running crawl on %i sources with query '%s'" %(len(self.db.sources.distinct("url")), self.query)

		# Outlinks are added while crawling, so the url list is re-read
		# until the queue is drained.
		while self.db.queue.count() > 0:
			for url in self.db.queue.distinct("url"):
				if url != "":
					page = Page(url)
					if page.check() and page.request() and page.control():
						article = page.extract("article")
						if article.status is True:
							if article.is_relevant(query):
								self.db.results.insert(article.repr())
								if article.outlinks:
									self.db.queue.insert(article.outlinks)
						else:
							self.db.logs.insert(article.logs)
					else:
						self.db.logs.insert(page.status)
				# Always dequeue the url, including empty ones.
				self.db.queue.remove({"url": url})

				if self.db.queue.count() == 0:
					break

		elapsed = datetime.now() - start

		self.status["msg"] = "%s. Crawl done sucessfully in %s s" %(self.status["msg"],str(elapsed))
		self.status["status"] = True
		return True
Beispiel #3
0
def crawler(docopt_args):
	"""
	Crawl every url queued in the project's database until the queue is empty.

	docopt_args: mapping of command line arguments; ``'<project>'`` names
	the database and ``'<query>'`` is handed to every ``Page``.

	For each queued url that is not yet in ``db.results`` the page is
	checked, requested, controlled and extracted. On success its info is
	upserted into ``db.results`` and unseen outlinks are queued; on failure
	with a non-zero ``error_type`` the bad status is upserted into
	``db.log``. Every url is removed from the queue once handled.

	Progress and a final summary are printed; nothing is returned.
	"""
	start = datetime.datetime.now()
	db_name = docopt_args['<project>']
	query = docopt_args['<query>']

	db = Database(db_name)
	db.create_colls()
	# count must be called; the bare method object is not a number.
	while db.queue.count() > 0:

		# Single-argument print(...) gives the same output on Python 2 and
		# also parses on Python 3.
		print("beginning crawl")
		print("Nombre de sources dans la base %s" % db.sources.count())
		print("Nombre d'url à traiter %s" % len(db.queue.distinct("url")))
		for url in db.queue.distinct("url"):
			# `url not in cursor` compared a string with documents and was
			# always true; look the url up instead.
			if db.results.find_one({"url": url}) is None:
				p = Page(url, query)

				if p.check() and p.request() and p.control() and p.extract():
					db.results.update(p.info, {'$push': {"date": datetime.datetime.today()}}, upsert=True)
					if p.outlinks is not None:
						try:
							for n_url in p.outlinks:
								if n_url is None:
									continue
								# Queue an outlink only when it is unknown to
								# all three collections (the original `or`
								# chain accepted everything, None included).
								already_known = any(
									coll.find_one({"url": n_url}) is not None
									for coll in (db.queue, db.results, db.log)
								)
								if not already_known:
									db.queue.insert({"url": n_url})
						except mongo_err:
							# Was `db.log.udpate` with `self.query`; neither
							# name exists in this function.
							db.log.update({"url":url, "error_type": "pymongo error inserting outlinks", "query": query, "status":False},{'$push': {"date": datetime.datetime.today()}}, upsert=True)
				elif p.error_type != 0:
					# If the page is not relevant do not store it in results;
					# log it. The operator goes on the outside: {'$push': {field: value}}.
					db.log.update(p.bad_status(), {'$push': {"date": datetime.datetime.today()}}, upsert=True)
				# No `continue` for the remaining case: it skipped the removal
				# below, so the url stayed queued and the loop never ended.

			db.queue.remove({"url": url})
			if db.queue.count() == 0:
				break

		if db.queue.count() == 0:
			# Print the stats once (they were printed twice before).
			print(db.stats())
			break

	end = datetime.datetime.now()
	elapsed = end - start
	print("crawl finished, %i results and %i sources are stored in Mongo Database: %s in %s" %(db.results.count(),db.sources.count(),db_name, elapsed))
	return
Beispiel #4
0
def Login(params=None, title=None):
    """
    Renders the login page.

    There is no separate login for the application, this is passed to
    the database!

    The page holds a POST form with user name, password, host and port
    inputs plus a submit button, followed by a ``div`` that lists every
    entry of ``params['err']`` in its own ``code`` element.

    :param params: optional mapping; only its ``'err'`` entry (an iterable
        of messages) is read. ``None`` renders the page without errors.
    :param title: not used by this function; kept so existing callers
        keep working.
    :returns: the populated ``Page`` object.
    """
    p = Page()
    p.content('<!DOCTYPE html>')
    with p.html():
        with p.head():
            with p.title():
                p.content('pgui - Login')
            with p.link({
                    'href':
                    'static/lib/bootstrap/bootstrap-3.3.4-dist/css/bootstrap.css',
                    'rel': 'stylesheet'
            }):
                pass
            with p.link({'href': 'static/login.css', 'rel': 'stylesheet'}):
                pass

        with p.body():

            with p.div({'class': 'container'}):
                with p.form({'method': 'POST', 'class': 'login'}):
                    with p.h2({'class': 'login-header'}):
                        p.content('Connect to a postgres database server')

                    with p.label({'for': 'name', 'class': 'sr-only'}):
                        p.content('User name')
                    with p.input({
                            'type': 'input',
                            'id': 'name',
                            'name': 'name',
                            'class': 'form-control',
                            'placeholder': 'User name'
                    }):
                        pass

                    with p.label({'for': 'password', 'class': 'sr-only'}):
                        p.content('Password')
                    with p.input({
                            'type': 'password',
                            'id': 'password',
                            'name': 'password',
                            'class': 'form-control',
                            'placeholder': 'Password'
                    }):
                        pass

                    with p.label({'for': 'host', 'class': 'sr-only'}):
                        # Was p.control('Host'); every other label writes its
                        # text with p.content().
                        p.content('Host')
                    with p.input({
                            'type': 'input',
                            'id': 'host',
                            'name': 'host',
                            'class': 'form-control',
                            'value': 'localhost'
                    }):
                        pass

                    with p.label({'for': 'port', 'class': 'sr-only'}):
                        p.content('Port')
                    with p.input({
                            'type': 'input',
                            'id': 'port',
                            'name': 'port',
                            'class': 'form-control',
                            'value': '5432'
                    }):
                        pass

                    with p.button(
                        {
                            'class': 'btn btn-lg btn-success btn-block',
                            'type': 'submit'
                        },
                            args=['autofocus']):
                        p.content('Connect')

                # The error container is always emitted, even when empty.
                with p.div({'class': 'login'}):
                    # params defaults to None, so guard before the membership
                    # test to avoid a TypeError.
                    has_errors = (params is not None and 'err' in params
                                  and params['err'])
                    if has_errors:
                        # NOTE(review): messages go through p.content() as-is;
                        # confirm Page escapes HTML if they can carry user input.
                        for err in params['err']:
                            with p.code():
                                p.content(err)

    return p
Beispiel #5
0
def crawler(docopt_args):
    """
    Crawl every url queued in the project's database until the queue is empty.

    docopt_args: mapping of command line arguments; ``'<project>'`` names
    the database and ``'<query>'`` is handed to every ``Page``.

    For each queued url that is not yet in ``db.results`` the page is
    checked, requested, controlled and extracted. On success its info is
    upserted into ``db.results`` and unseen outlinks are queued; on failure
    with a non-zero ``error_type`` the bad status is upserted into
    ``db.log``. Every url is removed from the queue once handled.

    Progress and a final summary are printed; nothing is returned.
    """
    start = datetime.datetime.now()
    db_name = docopt_args['<project>']
    query = docopt_args['<query>']

    db = Database(db_name)
    db.create_colls()
    # count must be called; the bare method object is not a number.
    while db.queue.count() > 0:

        # Single-argument print(...) gives the same output on Python 2 and
        # also parses on Python 3.
        print("beginning crawl")
        print("Nombre de sources dans la base %s" % db.sources.count())
        print("Nombre d'url à traiter %s" % len(db.queue.distinct("url")))
        for url in db.queue.distinct("url"):
            # `url not in cursor` compared a string with documents and was
            # always true; look the url up instead.
            if db.results.find_one({"url": url}) is None:
                p = Page(url, query)

                if p.check() and p.request() and p.control() and p.extract():
                    db.results.update(
                        p.info,
                        {'$push': {"date": datetime.datetime.today()}},
                        upsert=True)
                    if p.outlinks is not None:
                        try:
                            for n_url in p.outlinks:
                                if n_url is None:
                                    continue
                                # Queue an outlink only when it is unknown to
                                # all three collections (the original `or`
                                # chain accepted everything, None included).
                                already_known = any(
                                    coll.find_one({"url": n_url}) is not None
                                    for coll in (db.queue, db.results, db.log)
                                )
                                if not already_known:
                                    db.queue.insert({"url": n_url})
                        except mongo_err:
                            # Was `db.log.udpate` with `self.query`; neither
                            # name exists in this function.
                            db.log.update(
                                {
                                    "url": url,
                                    "error_type":
                                    "pymongo error inserting outlinks",
                                    "query": query,
                                    "status": False
                                },
                                {'$push': {"date": datetime.datetime.today()}},
                                upsert=True)
                elif p.error_type != 0:
                    # If the page is not relevant do not store it in results;
                    # log it. The operator goes on the outside:
                    # {'$push': {field: value}}.
                    db.log.update(
                        p.bad_status(),
                        {'$push': {"date": datetime.datetime.today()}},
                        upsert=True)
                # No `continue` for the remaining case: it skipped the removal
                # below, so the url stayed queued and the loop never ended.

            db.queue.remove({"url": url})
            if db.queue.count() == 0:
                break

        if db.queue.count() == 0:
            # Print the stats once (they were printed twice before).
            print(db.stats())
            break

    end = datetime.datetime.now()
    elapsed = end - start
    print("crawl finished, %i results and %i sources are stored in Mongo Database: %s in %s" % (
        db.results.count(), db.sources.count(), db_name, elapsed))
    return