def scan(self, url):
    """Scan and crawl url which user requested.

    Visits the url through a headless tor browser, then (if the visit
    succeeded) probes the domain's common service ports. Returns a
    DynamicObject carrying the collected `webpage` and `port` results.
    """
    Log.i("Trying to crawl {} url".format(url))

    result = DynamicObject()
    domain = urlparse(url).netloc

    # Step 1. Visit website using headless tor browser
    Log.d("Step 1. Visiting {} website using headless browser".format(url))
    browser = HeadlessBrowser(ini=self.ini, tor_network=True)
    page = browser.run(url)
    del browser

    # Browser raised internally and produced nothing -> bail out early
    # with an empty result object.
    if not page:
        return result
    result.webpage = page

    # Step 2. Scan common service port
    Log.d(
        "Step 2. Scanning {} domain's common service port".format(domain))
    result.port = self._portscan(domain)

    # Step 3. TO-DO
    return result
def create(cls, ini):
    """Create and return a SQLAlchemy engine from the ini configuration.

    Reads the connection URL and the DEBUG flag from the [DATABASE]
    section, creates all mapped tables, and returns the engine.
    """
    Log.d("Creating database engine...")
    # BUG FIX: the original used `is 'true'`, an identity comparison
    # against a string literal, which is implementation-dependent and
    # almost always False for values read from config. Use `==`.
    engine = create_engine(
        ini.read('DATABASE', 'URL'),
        echo=ini.read('DATABASE', 'DEBUG') == 'true')
    Base.metadata.create_all(bind=engine)
    return engine
def save(self, id, obj):
    """Save crawled data into database.

    :param id: task uuid identifying the crawled domain.
    :param obj: DynamicObject produced by scan() (webpage + port data).
    """
    Log.i("Saving crawled data")
    meta = {
        'id': id,
    }
    engine = Engine.create(ini=self.ini)
    with Session(engine=engine) as session:
        domain = session.query(Domain).filter_by(uuid=id).first()
        engine.dispose()
    # pass the pipeline before saving data (for preprocessing)
    for pipeline in pipelines.__all__:
        _class = pipeline(domain, data=obj, ini=self.ini)
        if _class.active:
            Log.d(f"handling the {_class.name} pipeline")
            # FIX: the original bare `except:` also swallowed
            # SystemExit/KeyboardInterrupt; narrow it to Exception
            # and keep the pipeline run best-effort.
            try:
                _class.handle()
            except Exception:
                Log.e(f"Error while handling {_class.name} pipeline")
        else:
            Log.d(f"{_class.name} pipeline isn't active")
        del _class
    with Elastic(ini=self.ini):
        # upload screenshot at Amazon S3
        screenshot = self.upload_screenshot(obj.webpage.screenshot, id)
        Webpage(
            meta=meta,
            url=obj.webpage.url,
            domain=obj.webpage.domain,
            title=obj.webpage.title,
            time=datetime.now(),
            source=obj.webpage.source,
            screenshot=screenshot,
            language=obj.webpage.language,
            headers=obj.webpage.headers,
            tree=obj.webpage.tree,
        ).save()
        Port(meta=meta, services=[
            Service(number=port['number'], status=port['status'])
            for port in obj.port
        ]).save()
def handle(self):
    """Find bitcoin-address-shaped strings in the crawled page source and
    associate each syntactically valid one with the current domain."""
    super(BitcoinPipeline, self).handle()

    # P2PKH/P2SH-style base58 candidates (starts with 1 or 3).
    candidates = re.findall(r'([13][a-km-zA-HJ-NP-Z0-9]{26,33})',
                            self.data.webpage.source)

    engine = Engine.create(ini=self.ini)
    with Session(engine=engine) as session:
        for candidate in candidates:
            if not self.validate_address(candidate):
                continue
            Log.d("{} address is valid address".format(candidate))
            record = get_or_create(session, Address, address=candidate)
            record.domains.append(self.domain)
            session.add(record)
            # commit per address, matching the original behavior
            session.commit()
    engine.dispose()
def collect(self):
    """Collect onion urls from the freshonion API and append any urls
    not yet seen to ``self.urls``."""
    Log.d("Start collecting from freshonion API")
    response = HTTP.request(
        url='http://zlal32teyptf4tvi.onion/json/all',
        tor_network=True,
        ini=self.ini
    )
    if not response:
        # FIX: log message typo — "accrued" -> "occurred".
        Log.e("Exception occurred while loading website.")
        return
    if response.status_code == 200:
        rows = response.json()
        Log.i("{} url detected from freshonion".format(len(rows)))
        for row in rows:
            url = self._get_formed_url(row)
            if url not in self.urls:
                self.urls.append(url)
def save(self):
    """
    Save domain on database and request crawling.

    :return: None
    """
    engine = Engine.create(self.ini)
    with Session(engine=engine) as session:
        # BUG FIX: the original iterated `self.urls` directly while the
        # `finally` clause removed items from it — mutating a list during
        # iteration skips every other element. Iterate a snapshot instead.
        for url in list(self.urls):
            task_id = uuid4().hex
            try:
                # add url into database
                session.add(Domain(uuid=task_id, url=url))
                session.commit()
                task = run_crawler.apply_async(args=(url, ), task_id=task_id)
                Log.i("Crawler issued a new task id {} at {}".format(
                    task.task_id, url))
            # FIX: narrowed the bare `except:` — a duplicate url raises an
            # integrity error on commit; treat any Exception as "already
            # saved" like before, but no longer trap SystemExit et al.
            except Exception:
                Log.d(
                    "This {} url already saved into database.".format(url))
            finally:
                self.urls.remove(url)
def _portscan(self, domain):
    """Scan and check opened port.

    Probes a fixed list of common service ports on *domain* over tor and
    returns a list of ``{'number': int, 'status': bool}`` dicts.
    """
    prober = Socket(
        tor_network=True,
        ini=self.ini,
    )

    # common service port list (annotated where non-obvious)
    common_ports = (
        20, 21, 22, 23, 25, 80, 110,
        123,   # NTP
        143,
        194,   # IRC
        389, 443,
        993,   # IMAPS
        3306, 3389,
        5222,  # XMPP
        6667,  # Public IRC
        8060,  # OnionCat
        8333,  # Bitcoin
    )
    services = [{'number': number, 'status': False} for number in common_ports]

    for service in services:
        opened = prober.ping_check(domain, service['number'])
        service['status'] = opened
        Log.d("{} port is {}".format(service['number'],
                                     'opened' if opened else 'closed'))

    del prober
    return services
def test_write_debug():
    """Smoke test: writing a debug-level message must not raise."""
    Log.d("Test Debugging Message")