def save(self, id, obj):
    """Save crawled data into database."""
    Log.i("Saving crawled data")
    meta = {
        'id': id,
    }

    engine = Engine.create(ini=self.ini)
    with Session(engine=engine) as session:
        domain = session.query(Domain).filter_by(uuid=id).first()
    engine.dispose()

    # pass the data through each pipeline before saving (for preprocessing)
    for pipeline in pipelines.__all__:
        _class = pipeline(domain, data=obj, ini=self.ini)
        if _class.active:
            Log.d(f"handling the {_class.name} pipeline")
            try:
                _class.handle()
            except Exception:
                Log.e(f"Error while handling {_class.name} pipeline")
        else:
            Log.d(f"{_class.name} pipeline isn't active")
        del _class

    with Elastic(ini=self.ini):
        # upload screenshot to Amazon S3
        screenshot = self.upload_screenshot(obj.webpage.screenshot, id)

        Webpage(
            meta=meta,
            url=obj.webpage.url,
            domain=obj.webpage.domain,
            title=obj.webpage.title,
            time=datetime.now(),
            source=obj.webpage.source,
            screenshot=screenshot,
            language=obj.webpage.language,
            headers=obj.webpage.headers,
            tree=obj.webpage.tree,
        ).save()

        Port(
            meta=meta,
            services=[
                Service(number=port['number'], status=port['status'])
                for port in obj.port
            ],
        ).save()
def run(source):
    _class = source()
    status = _class.active

    if _class.active:
        Log.i("Trying to run {} source".format(_class.name))
        try:
            _class.collect()
        except Exception:
            Log.e("Failed to collect data from {} source".format(_class.name))

        # only persist results if the source actually collected URLs
        if _class.urls:
            _class.save()
    else:
        Log.i("{} source is now disabled".format(_class.name))

    del _class
    return status
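# Example usage (illustrative sketch, not part of the original module): `run`
# expects a source class such as the freshonion collector below. The module
# name `sources` and its `__all__` registry are assumptions made for this
# example, mirroring the `pipelines.__all__` pattern used in save() above.
#
#   for source_class in sources.__all__:
#       run(source_class)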
def collect(self):
    Log.d("Start collecting from freshonion API")
    response = HTTP.request(
        url='http://zlal32teyptf4tvi.onion/json/all',
        tor_network=True,
        ini=self.ini
    )

    if not response:
        Log.e("Exception occurred while loading website.")
        return

    if response.status_code == 200:
        rows = response.json()
        Log.i("{} urls detected from freshonion".format(len(rows)))

        for row in rows:
            url = self._get_formed_url(row)
            if url not in self.urls:
                self.urls.append(url)
def request(cls, url, tor_network=False, ini=None, timeout=300):
    """Request URL and get response header and body"""
    try:
        if tor_network:
            if not ini:
                raise ValueError("Config file not found")

            server = '{}://{}:{}'.format(
                ini.read('TOR', 'PROTOCOL'),
                ini.read('TOR', 'HOST'),
                ini.read('TOR', 'PORT'),
            )
            proxies = {'http': server, 'https': server}
            return requests.get(
                url,
                timeout=timeout,
                proxies=proxies,
                headers=cls._generate_custom_http_header(),
            )
        else:
            return requests.get(
                url,
                timeout=timeout,
                headers=cls._generate_custom_http_header(),
            )
    except Exception as e:
        Log.e("Exception at HTTP.request\n{}".format(e))
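# Example usage (illustrative sketch, not part of the original module): fetch a
# page through the Tor proxy described by the [TOR] section of the ini file.
# The concrete `ini` object and the onion URL below are assumptions; on any
# request failure the method logs the exception and returns None.
#
#   response = HTTP.request('http://example.onion', tor_network=True, ini=ini)
#   if response and response.status_code == 200:
#       print(response.json())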
def run(self, url):
    try:
        self.driver.get(url)
    except Exception:
        # browser scan failed
        Log.e("Browser has an error.")
        return

    # if driver source is none
    if not self.get_source():
        return

    # run HTML parser to parse data from the source
    try:
        # BeautifulSoup object for parsing the HTML source
        self.soup = BeautifulSoup(self.driver.page_source, 'html.parser')
    except Exception:
        # website source code is not HTML
        Log.e("Invalid HTML source code.")
        return

    # get HAR from driver
    self.har = json.loads(self.driver.get_log('har')[0]['message'])

    report = DynamicObject({
        'url': url,
        'domain': urlparse(url).netloc,
        'title': self.get_title(),
        'screenshot': self.get_screenshot(),
        'source': self.get_source(),
        'sublinks': self.get_sublinks(),
        'language': self.get_language(),
        'headers': self.get_headers(),
        'tree': self.get_website_tree(),
    })

    return report
def test_write_error():
    Log.e("Test Error Message")