def Login(params=None, title=None):
    """
    Renders the login page.

    There is no separate login for the application, this is passed to the
    database!

    Args:
        params: Optional mapping. When it has a truthy ``'err'`` entry, every
            item of that entry is written into its own ``p.code()`` element
            below the form. ``None`` (the default) means "nothing to report".
        title: Accepted but not used by this function.

    Returns:
        The ``Page`` instance the markup was written to.
    """
    p = Page()
    p.content('<!DOCTYPE html>')
    with p.html():
        with p.head():
            with p.title():
                p.content('pgui - Login')
            with p.link({'href': 'static/lib/bootstrap/bootstrap-3.3.4-dist/css/bootstrap.css',
                         'rel': 'stylesheet'}):
                pass
            with p.link({'href': 'static/login.css', 'rel': 'stylesheet'}):
                pass
        with p.body():
            with p.div({'class': 'container'}):
                with p.form({'method': 'POST', 'class': 'login'}):
                    with p.h2({'class': 'login-header'}):
                        p.content('Connect to a postgres database server')

                    # NOTE(review): 'input' is not a standard HTML input type;
                    # browsers fall back to a text field. Left unchanged in
                    # case stylesheets or scripts select on it.
                    with p.label({'for': 'name', 'class': 'sr-only'}):
                        p.content('User name')
                    with p.input({'type': 'input', 'id': 'name', 'name': 'name',
                                  'class': 'form-control',
                                  'placeholder': 'User name'}):
                        pass

                    with p.label({'for': 'password', 'class': 'sr-only'}):
                        p.content('Password')
                    with p.input({'type': 'password', 'id': 'password',
                                  'name': 'password', 'class': 'form-control',
                                  'placeholder': 'Password'}):
                        pass

                    with p.label({'for': 'host', 'class': 'sr-only'}):
                        # Was p.control('Host'): every other label writes its
                        # text with p.content(), so the Host label was the only
                        # one left without text.
                        p.content('Host')
                    with p.input({'type': 'input', 'id': 'host', 'name': 'host',
                                  'class': 'form-control',
                                  'value': 'localhost'}):
                        pass

                    with p.label({'for': 'port', 'class': 'sr-only'}):
                        p.content('Port')
                    with p.input({'type': 'input', 'id': 'port', 'name': 'port',
                                  'class': 'form-control', 'value': '5432'}):
                        pass

                    with p.button({'class': 'btn btn-lg btn-success btn-block',
                                   'type': 'submit'}, args=['autofocus']):
                        p.content('Connect')

                with p.div({'class': 'login'}):
                    # params defaults to None; guard before the membership
                    # test so a call without arguments does not raise.
                    if params is not None and 'err' in params and params['err']:
                        for err in params['err']:
                            with p.code():
                                p.content(err)
    return p
def run_job(self):
    """Run the crawl job and record its outcome in ``self.status``.

    ``self.status`` is reset to a fresh dict on every call. It always holds
    ``"scope"``; ``"msg"`` and ``"status"`` are set on every exit path, and
    ``"code"`` (600.1) is set when the crawl cannot start.

    Returns:
        False when there is no query, no source, or nothing in the queue;
        True once the queue has been drained.
    """
    self.status = {"scope": "running crawl job"}

    def abort(message):
        # All start-up failures share the same code and a False status.
        self.status["msg"] = message
        self.status["code"] = 600.1
        self.status["status"] = False
        return False

    if self.query is None:
        return abort("Unable to start crawl: no query has been set.")
    query = Query(self.query)

    # The return value is not used by this method; only the resulting content
    # of self.db.sources is checked.
    self.collect_sources()
    if self.db.sources.count() == 0:
        return abort("Unable to start crawl: no seeds have been set.")
    self.send_seeds_to_queue()

    start = datetime.now()
    # count is a method: comparing the bound method itself to 0 was never
    # true, so this guard could not fire before it was called here.
    if self.db.queue.count() == 0:
        return abort("Error while sending urls into queue: queue is empty")

    self.status["msg"] = "running crawl on %i sources with query '%s'" % (
        len(self.db.sources.distinct("url")), self.query)

    while self.db.queue.count() > 0:
        for url in self.db.queue.distinct("url"):
            if url != "":
                page = Page(url)
                if page.check() and page.request() and page.control():
                    article = page.extract("article")
                    if article.status is True:
                        if article.is_relevant(query):
                            self.db.results.insert(article.repr())
                            if article.outlinks is not None and len(article.outlinks) > 0:
                                self.db.queue.insert(article.outlinks)
                    else:
                        # NOTE(review): pairing of this branch with the
                        # article.status check is reconstructed - confirm.
                        self.db.logs.insert(article.logs)
                else:
                    self.db.logs.insert(page.status)
            # Dequeue the url whatever happened above (empty urls included).
            self.db.queue.remove({"url": url})
            if self.db.queue.count() == 0:
                break
        if self.db.queue.count() == 0:
            break

    elapsed = datetime.now() - start
    self.status["msg"] = "%s. Crawl done sucessfully in %s s" % (
        self.status["msg"], str(elapsed))
    self.status["status"] = True
    return True
def crawler(docopt_args):
    """Process every url queued for a project until the queue is empty.

    Args:
        docopt_args: Mapping of parsed command-line arguments; the
            ``'<project>'`` entry names the database and ``'<query>'`` is
            handed to each ``Page``.

    Returns:
        None. Progress and a final summary are printed.
    """
    start = datetime.datetime.now()
    db_name = docopt_args['<project>']
    query = docopt_args['<query>']
    db = Database(db_name)
    db.create_colls()

    def push_date():
        # Update document shared by every upsert: append the current date.
        return {'$push': {"date": datetime.datetime.today()}}

    def is_known(candidate):
        # True when the url is already queued, stored as a result, or logged.
        lookup = {"url": candidate}
        return any(coll.find_one(lookup) is not None
                   for coll in (db.queue, db.results, db.log))

    # count is a method; the bare attribute was being compared to 0 before.
    while db.queue.count() > 0:
        # Single-argument print(...) emits the same text under the print
        # statement and the print function.
        print("beginning crawl")
        print("Nombre de sources dans la base %s" % db.sources.count())
        print("Nombre d'url à traiter %s" % len(db.queue.distinct("url")))
        for url in db.queue.distinct("url"):
            # `url not in cursor` compared a string with documents and was
            # always true; look the url up explicitly instead.
            if db.results.find_one({"url": url}) is None:
                p = Page(url, query)
                if p.check() and p.request() and p.control() and p.extract():
                    db.results.update(p.info, push_date(), upsert=True)
                    if p.outlinks is not None:
                        try:
                            for n_url in p.outlinks:
                                # The checks were chained with `or`, which let
                                # every link through (None included).
                                if n_url is not None and not is_known(n_url):
                                    db.queue.insert({"url": n_url})
                        except mongo_err:
                            # Fixed here: `udpate` typo and `self.query` in a
                            # plain function (no `self` in scope).
                            db.log.update(
                                {
                                    "url": url,
                                    "error_type": "pymongo error inserting outlinks",
                                    "query": query,
                                    "status": False,
                                },
                                push_date(),
                                upsert=True)
                elif p.error_type != 0:
                    # if the page is not relevant do not store in db
                    # ($push must be the outer key, as in the other updates.)
                    db.log.update(p.bad_status(), push_date(), upsert=True)
            # Always dequeue: the former `else: continue` skipped this line and
            # left the url queued, so the outer loop could never finish.
            db.queue.remove({"url": url})
            if db.queue.count() == 0:
                print(db.stats())
                break
        if db.queue.count() == 0:
            print(db.stats())
            break

    end = datetime.datetime.now()
    elapsed = end - start
    print("crawl finished, %i results and %i sources are stored in Mongo Database: %s in %s" % (
        db.results.count(), db.sources.count(), db_name, elapsed))
    return
def Login(params=None, title=None):
    """
    Renders the login page.

    There is no separate login for the application, this is passed to the
    database!

    Args:
        params: Optional mapping. When it has a truthy ``'err'`` entry, every
            item of that entry is written into its own ``p.code()`` element
            below the form. ``None`` (the default) means "nothing to report".
        title: Accepted but not used by this function.

    Returns:
        The ``Page`` instance the markup was written to.
    """
    p = Page()
    p.content('<!DOCTYPE html>')
    with p.html():
        with p.head():
            with p.title():
                p.content('pgui - Login')
            with p.link({'href': 'static/lib/bootstrap/bootstrap-3.3.4-dist/css/bootstrap.css',
                         'rel': 'stylesheet'}):
                pass
            with p.link({'href': 'static/login.css', 'rel': 'stylesheet'}):
                pass
        with p.body():
            with p.div({'class': 'container'}):
                with p.form({'method': 'POST', 'class': 'login'}):
                    with p.h2({'class': 'login-header'}):
                        p.content('Connect to a postgres database server')

                    # NOTE(review): 'input' is not a standard HTML input type;
                    # browsers fall back to a text field. Left unchanged in
                    # case stylesheets or scripts select on it.
                    with p.label({'for': 'name', 'class': 'sr-only'}):
                        p.content('User name')
                    with p.input({'type': 'input', 'id': 'name', 'name': 'name',
                                  'class': 'form-control',
                                  'placeholder': 'User name'}):
                        pass

                    with p.label({'for': 'password', 'class': 'sr-only'}):
                        p.content('Password')
                    with p.input({'type': 'password', 'id': 'password',
                                  'name': 'password', 'class': 'form-control',
                                  'placeholder': 'Password'}):
                        pass

                    with p.label({'for': 'host', 'class': 'sr-only'}):
                        # Was p.control('Host'): every other label writes its
                        # text with p.content(), so the Host label was the only
                        # one left without text.
                        p.content('Host')
                    with p.input({'type': 'input', 'id': 'host', 'name': 'host',
                                  'class': 'form-control',
                                  'value': 'localhost'}):
                        pass

                    with p.label({'for': 'port', 'class': 'sr-only'}):
                        p.content('Port')
                    with p.input({'type': 'input', 'id': 'port', 'name': 'port',
                                  'class': 'form-control', 'value': '5432'}):
                        pass

                    with p.button({'class': 'btn btn-lg btn-success btn-block',
                                   'type': 'submit'}, args=['autofocus']):
                        p.content('Connect')

                with p.div({'class': 'login'}):
                    # params defaults to None; guard before the membership
                    # test so a call without arguments does not raise.
                    if params is not None and 'err' in params and params['err']:
                        for err in params['err']:
                            with p.code():
                                p.content(err)
    return p
def crawler(docopt_args):
    """Process every url queued for a project until the queue is empty.

    Args:
        docopt_args: Mapping of parsed command-line arguments; the
            ``'<project>'`` entry names the database and ``'<query>'`` is
            handed to each ``Page``.

    Returns:
        None. Progress and a final summary are printed.
    """
    start = datetime.datetime.now()
    db_name = docopt_args['<project>']
    query = docopt_args['<query>']
    db = Database(db_name)
    db.create_colls()

    def push_date():
        # Update document shared by every upsert: append the current date.
        return {'$push': {"date": datetime.datetime.today()}}

    def is_known(candidate):
        # True when the url is already queued, stored as a result, or logged.
        lookup = {"url": candidate}
        return any(coll.find_one(lookup) is not None
                   for coll in (db.queue, db.results, db.log))

    # count is a method; the bare attribute was being compared to 0 before.
    while db.queue.count() > 0:
        # Single-argument print(...) emits the same text under the print
        # statement and the print function.
        print("beginning crawl")
        print("Nombre de sources dans la base %s" % db.sources.count())
        print("Nombre d'url à traiter %s" % len(db.queue.distinct("url")))
        for url in db.queue.distinct("url"):
            # `url not in cursor` compared a string with documents and was
            # always true; look the url up explicitly instead.
            if db.results.find_one({"url": url}) is None:
                p = Page(url, query)
                if p.check() and p.request() and p.control() and p.extract():
                    db.results.update(p.info, push_date(), upsert=True)
                    if p.outlinks is not None:
                        try:
                            for n_url in p.outlinks:
                                # The checks were chained with `or`, which let
                                # every link through (None included).
                                if n_url is not None and not is_known(n_url):
                                    db.queue.insert({"url": n_url})
                        except mongo_err:
                            # Fixed here: `udpate` typo and `self.query` in a
                            # plain function (no `self` in scope).
                            db.log.update(
                                {
                                    "url": url,
                                    "error_type": "pymongo error inserting outlinks",
                                    "query": query,
                                    "status": False,
                                },
                                push_date(),
                                upsert=True)
                elif p.error_type != 0:
                    # if the page is not relevant do not store in db
                    # ($push must be the outer key, as in the other updates.)
                    db.log.update(p.bad_status(), push_date(), upsert=True)
            # Always dequeue: the former `else: continue` skipped this line and
            # left the url queued, so the outer loop could never finish.
            db.queue.remove({"url": url})
            if db.queue.count() == 0:
                print(db.stats())
                break
        if db.queue.count() == 0:
            print(db.stats())
            break

    end = datetime.datetime.now()
    elapsed = end - start
    print("crawl finished, %i results and %i sources are stored in Mongo Database: %s in %s" % (
        db.results.count(), db.sources.count(), db_name, elapsed))
    return