def test_database_session():
    """Verify that a session can be opened against a freshly created engine.

    :return: None
    """
    db_engine = Engine.create(ini=ini)
    with Session(engine=db_engine) as db_session:
        # A truthy session object means the connection was established.
        assert db_session
def save(self, id, obj):
    """Save crawled data into database.

    Runs the crawled payload through every registered pipeline, then
    persists the webpage and port documents to Elasticsearch.

    :param id: task/domain UUID (NOTE: shadows the ``id`` builtin, kept
        for backward compatibility with existing callers).
    :param obj: crawled result object exposing ``webpage`` and ``port``.
    :return: None
    """
    Log.i("Saving crawled data")
    meta = {
        'id': id,
    }
    engine = Engine.create(ini=self.ini)
    with Session(engine=engine) as session:
        domain = session.query(Domain).filter_by(uuid=id).first()
    # Dispose only after the session context has closed — disposing the
    # engine while the session is still open can invalidate its connection.
    engine.dispose()

    # pass the pipeline before saving data (for preprocessing)
    for pipeline in pipelines.__all__:
        _class = pipeline(domain, data=obj, ini=self.ini)
        if _class.active:
            Log.d(f"handling the {_class.name} pipeline")
            try:
                _class.handle()
            except Exception:
                # A failing pipeline is best-effort: log it and continue with
                # the remaining pipelines instead of aborting the whole save.
                # (Was a bare except, which also swallowed SystemExit.)
                Log.e(f"Error while handling {_class.name} pipeline")
        else:
            Log.d(f"{_class.name} pipeline isn't active")
        del _class

    with Elastic(ini=self.ini):
        # upload screenshot at Amazon S3
        screenshot = self.upload_screenshot(obj.webpage.screenshot, id)
        Webpage(
            meta=meta,
            url=obj.webpage.url,
            domain=obj.webpage.domain,
            title=obj.webpage.title,
            time=datetime.now(),
            source=obj.webpage.source,
            screenshot=screenshot,
            language=obj.webpage.language,
            headers=obj.webpage.headers,
            tree=obj.webpage.tree,
        ).save()
        Port(meta=meta, services=[
            Service(number=port['number'], status=port['status'])
            for port in obj.port
        ]).save()
def test_manage_model():
    """Insert a Domain row into the in-memory database and check lookups.

    :return: None
    """
    db_engine = Engine.create(ini=ini)

    # persist one domain record
    with Session(engine=db_engine) as writer:
        writer.add(Domain('test', 'https://formed_url.onion'))
        writer.commit()

    # read it back in a fresh session
    with Session(engine=db_engine) as reader:
        hits = reader.query(Domain).filter(Domain.uuid == 'test').count()
        misses = reader.query(Domain).filter(
            Domain.uuid == 'is_not_exist').count()
        assert hits == 1
        assert misses == 0
def handle(self):
    """Extract Bitcoin addresses from the page source and persist them.

    Each syntactically plausible address is validated, linked to the
    current domain, and committed individually so one bad address does
    not discard the rest.

    :return: None
    """
    super(BitcoinPipeline, self).handle()
    # Base58Check alphabet excludes 0, O, I and l; the original class used
    # 0-9 which wrongly admitted the digit '0'.  P2PKH/P2SH addresses start
    # with '1' or '3'.  (Length bound {26,33} kept as-is — TODO confirm
    # against the 25-34 tail length of mainnet addresses.)
    addresses = re.findall(r'([13][a-km-zA-HJ-NP-Z1-9]{26,33})',
                           self.data.webpage.source)
    engine = Engine.create(ini=self.ini)
    with Session(engine=engine) as session:
        for address in addresses:
            if self.validate_address(address):
                Log.d("{} address is valid address".format(address))
                instance = get_or_create(session, Address, address=address)
                instance.domains.append(self.domain)
                session.add(instance)
                session.commit()
    engine.dispose()
def save(self):
    """
    Save domain on database and request crawling.
    :return: None
    """
    engine = Engine.create(self.ini)
    with Session(engine=engine) as session:
        # Iterate over a snapshot: the original looped over self.urls while
        # removing from it in `finally`, which skips every other element.
        for url in list(self.urls):
            task_id = uuid4().hex
            try:
                # add url into database
                session.add(Domain(uuid=task_id, url=url))
                session.commit()
                task = run_crawler.apply_async(args=(url, ),
                                               task_id=task_id)
                Log.i("Crawler issued a new task id {} at {}".format(
                    task.task_id, url))
            except Exception:
                # Most likely a unique-constraint violation on the URL.
                # Roll back so the session remains usable for later URLs
                # (a failed commit otherwise poisons the session).
                session.rollback()
                Log.d(
                    "This {} url already saved into database.".format(url))
            finally:
                self.urls.remove(url)
def test_create_engine():
    """Ensure a database engine is built from the test configuration.

    :return: None
    """
    created = Engine.create(ini=ini)
    assert created