def scan(self, url):
    """Scan and crawl the url which the user requested."""
    Log.i("Trying to crawl {} url".format(url))
    domain = urlparse(url).netloc
    obj = DynamicObject()

    # Step 1. Visit the website using a headless tor browser
    Log.d("Step 1. Visiting {} website using headless browser".format(url))
    browser = HeadlessBrowser(ini=self.ini, tor_network=True)
    report = browser.run(url)
    del browser

    # if the browser raised an exception, return the empty object from here
    if not report:
        return obj

    obj.webpage = report

    # Step 2. Scan common service ports
    Log.d("Step 2. Scanning {} domain's common service port".format(domain))
    obj.port = self._portscan(domain)

    # Step 3. TO-DO

    return obj
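# A minimal sketch of the _portscan() helper referenced above; its real
# implementation isn't shown here, so this is an assumption. It probes a
# handful of common service ports with a plain TCP connect and returns the
# list-of-dicts shape that save() later consumes (keys 'number' and 'status').
# The COMMON_PORTS list and the 3-second timeout are illustrative values only,
# and a real crawler would need to resolve .onion hosts through Tor, which
# this sketch does not do.
import socket

COMMON_PORTS = [21, 22, 80, 443, 8080]

def _portscan(self, domain):
    """Check common service ports on the given domain (hypothetical sketch)."""
    ports = []
    for number in COMMON_PORTS:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(3)
        try:
            # connect_ex() returns 0 when the port accepted the connection
            status = sock.connect_ex((domain, number)) == 0
        except socket.error:
            status = False
        finally:
            sock.close()
        ports.append({'number': number, 'status': status})
    return ports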
def main():
    """Main method for running all sources."""
    scheduler = BackgroundScheduler()
    scheduler.start()
    Log.i("{} source(s) detected!".format(len(sources.__all__)))

    job_id = 1
    for source in sources.__all__:
        status = run(source)  # initial run of the source.
        if status:
            # register a job so the source runs periodically (only for active sources).
            scheduler.add_job(run,
                              "interval",
                              minutes=source().cycle,
                              id=str(job_id),
                              args=(source, ))
            Log.i("Successfully added a new job")
            job_id += 1

    try:
        while True:
            time.sleep(60)  # sleep 1 minute so the scheduler keeps running.
    except (KeyboardInterrupt, SystemExit):
        scheduler.shutdown()
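# A minimal entry point, assuming main() lives in a module that is run
# directly; the module layout is an assumption, not shown in the source.
if __name__ == "__main__":
    main()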
def run_crawler(self, url):
    Log.i(f"Starting crawler task for {url}")
    crawler = Crawler(ini=Ini(Env.read("CONFIG_FILE")))
    report = crawler.scan(url)

    # save the report only when the crawl succeeded and the final url matches
    if not report.is_empty() and report.webpage.url == url:
        crawler.save(self.request.id, report)

    del crawler
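# run_crawler() reads self.request.id, which in Celery means the task must be
# registered with bind=True so `self` is the task instance. The registration
# below is an assumption about how the project wires this up; the app name and
# broker URL are placeholders, not values taken from the source.
from celery import Celery

app = Celery("crawler", broker="redis://localhost:6379/0")  # hypothetical broker

@app.task(bind=True)
def run_crawler(self, url):
    ...  # body as above; self.request.id is the task id reused as the Domain uuid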
def save(self, id, obj):
    """Save crawled data into database."""
    Log.i("Saving crawled data")
    meta = {
        'id': id,
    }

    engine = Engine.create(ini=self.ini)
    with Session(engine=engine) as session:
        domain = session.query(Domain).filter_by(uuid=id).first()
    engine.dispose()

    # pass the data through the pipelines before saving (for preprocessing)
    for pipeline in pipelines.__all__:
        _class = pipeline(domain, data=obj, ini=self.ini)
        if _class.active:
            Log.d(f"handling the {_class.name} pipeline")
            try:
                _class.handle()
            except Exception:
                Log.e(f"Error while handling {_class.name} pipeline")
        else:
            Log.d(f"{_class.name} pipeline isn't active")
        del _class

    with Elastic(ini=self.ini):
        # upload the screenshot to Amazon S3
        screenshot = self.upload_screenshot(obj.webpage.screenshot, id)
        Webpage(
            meta=meta,
            url=obj.webpage.url,
            domain=obj.webpage.domain,
            title=obj.webpage.title,
            time=datetime.now(),
            source=obj.webpage.source,
            screenshot=screenshot,
            language=obj.webpage.language,
            headers=obj.webpage.headers,
            tree=obj.webpage.tree,
        ).save()
        Port(meta=meta,
             services=[
                 Service(number=port['number'], status=port['status'])
                 for port in obj.port
             ]).save()
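# save() expects every entry in pipelines.__all__ to be a class constructed as
# pipeline(domain, data=..., ini=...) that exposes .active, .name and .handle().
# The base class below is a sketch of that assumed contract, not the project's
# actual code; ExamplePipeline is purely illustrative.
class BasePipeline:
    name = "base"
    active = False

    def __init__(self, domain, data, ini):
        self.domain = domain
        self.data = data
        self.ini = ini

    def handle(self):
        raise NotImplementedError


class ExamplePipeline(BasePipeline):
    name = "example"
    active = True

    def handle(self):
        # preprocess the crawled data in place, e.g. normalize the page title
        self.data.webpage.title = (self.data.webpage.title or "").strip()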
def run(source):
    _class = source()
    status = _class.active

    if _class.active:
        Log.i("Trying to run {} source".format(_class.name))
        try:
            _class.collect()
        except Exception:
            Log.e("Failed to collect data from {} source".format(_class.name))

        if _class.urls:
            _class.save()
    else:
        Log.i("{} source is now disabled".format(_class.name))

    del _class
    return status
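# run() and the scheduler in main() rely on each entry in sources.__all__ being
# a class with .name, .active, .cycle (minutes between runs), .urls, .collect()
# and .save(). This base class is a sketch of that assumed interface; the
# attribute defaults are illustrative, not taken from the project.
class BaseSource:
    name = "base"
    active = False
    cycle = 60  # minutes between scheduled runs

    def __init__(self):
        self.urls = []

    def collect(self):
        """Populate self.urls with onion urls discovered from this source."""
        raise NotImplementedError

    def save(self):
        """Persist self.urls and issue crawler tasks (see save() below)."""
        raise NotImplementedError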
def collect(self):
    response = HTTP.request(url='https://thehiddenwiki.com/Main_Page')
    soup = BeautifulSoup(response.text, 'html.parser')

    for a in soup.find_all('a'):
        try:
            parse = urlparse(a['href'])
            # only keep links that look like valid onion domains
            if parse.scheme.startswith('http') and parse.netloc.endswith('onion'):
                url = "{}://{}".format(parse.scheme, parse.netloc)
                if url not in self.urls:
                    self.urls.append(url)
        except (KeyError, ValueError):
            # anchor tag without an href, or an unparsable link
            pass

    Log.i("{} url(s) detected from hiddenwiki".format(len(self.urls)))
def collect(self):
    Log.d("Start collecting from freshonion API")
    response = HTTP.request(
        url='http://zlal32teyptf4tvi.onion/json/all',
        tor_network=True,
        ini=self.ini
    )

    if not response:
        Log.e("Exception occurred while loading website.")
        return

    if response.status_code == 200:
        rows = response.json()
        Log.i("{} url(s) detected from freshonion".format(len(rows)))
        for row in rows:
            url = self._get_formed_url(row)
            if url not in self.urls:
                self.urls.append(url)
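# _get_formed_url() isn't shown here, so this is a guess at its behavior: it
# presumably builds a "scheme://host" string from one JSON row returned by the
# freshonion API. The field names 'domain' and 'port' below are placeholders
# for whatever keys the API actually returns.
def _get_formed_url(self, row):
    """Build a crawlable url from a freshonion API row (hypothetical sketch)."""
    scheme = 'https' if row.get('port') == 443 else 'http'
    return "{}://{}".format(scheme, row.get('domain'))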
def save(self):
    """
    Save domains to the database and request crawling.

    :return: None
    """
    engine = Engine.create(self.ini)
    with Session(engine=engine) as session:
        # iterate over a copy so removing urls doesn't skip elements
        for url in list(self.urls):
            task_id = uuid4().hex
            try:
                # add the url into the database
                session.add(Domain(uuid=task_id, url=url))
                session.commit()

                task = run_crawler.apply_async(args=(url, ), task_id=task_id)
                Log.i("Crawler issued a new task id {} at {}".format(
                    task.task_id, url))
            except Exception:
                session.rollback()
                Log.d("This {} url is already saved in the database.".format(url))
            finally:
                self.urls.remove(url)
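# The duplicate-url handling above implies the Domain model enforces uniqueness
# on the url column, so re-inserting a known url makes commit() raise. This
# declarative model is a sketch of that assumption; the column types, lengths
# and table name are illustrative only.
from sqlalchemy import Column, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Domain(Base):
    __tablename__ = "domain"

    id = Column(Integer, primary_key=True)
    uuid = Column(String(32), nullable=False)                 # celery task id (uuid4().hex)
    url = Column(String(255), nullable=False, unique=True)    # onion url to crawl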
def __init__(self, ini):
    Log.i("Starting crawler")
    self.ini = ini
def __del__(self):
    Log.i("Ending crawler")
def test_write_info():
    Log.i("Test Info Message")