Example #1
    def scan(self, url):
        """Scan and crawl url which user requested."""
        Log.i("Trying to crawl {} url".format(url))

        domain = urlparse(url).netloc
        obj = DynamicObject()

        # Step 1. Visit website using headless tor browser
        Log.d("Step 1. Visiting {} website using headless browser".format(url))

        browser = HeadlessBrowser(ini=self.ini, tor_network=True)

        report = browser.run(url)

        del browser

        # if the browser raised an exception, return an empty object
        if not report:
            return obj

        obj.webpage = report

        # Step 2. Scan common service port
        Log.d(
            "Step 2. Scanning {} domain's common service port".format(domain))
        obj.port = self._portscan(domain)

        # Step 3. TO-DO

        return obj
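The _portscan helper used in Step 2 is not shown. Below is a minimal sketch of what it might do, assuming a plain TCP connect scan; the port list and the method body are guesses, and only the dict keys are inferred from the Service(number=port['number'], status=port['status']) usage in Example #4. A real scan of a .onion host would also have to route through the Tor SOCKS proxy, which this sketch ignores.

import socket

COMMON_PORTS = (21, 22, 80, 443, 8080)  # hypothetical port list

def _portscan(self, domain):
    """Connect-scan a fixed set of common ports on the given host."""
    results = []
    for number in COMMON_PORTS:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(3)
        # connect_ex returns 0 when the TCP handshake succeeds
        is_open = (sock.connect_ex((domain, number)) == 0)
        sock.close()
        results.append({'number': number,
                        'status': 'open' if is_open else 'closed'})
    return results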
Example #2
def main():
    """Main method for running all sources."""
    scheduler = BackgroundScheduler()
    scheduler.start()

    Log.i("{} source(s) detected!".format(len(sources.__all__)))

    job_id = 1

    for source in sources.__all__:
        status = run(source)  # initial run of the source.

        if status:
            # register the job to run periodically (only for active sources)
            scheduler.add_job(run,
                              "interval",
                              minutes=source().cycle,
                              id=str(job_id),
                              args=(source, ))
            Log.i("Successfully add a new job")

            job_id += 1

    try:
        while True:
            time.sleep(60)  # sleep 1 minute so the scheduler keeps running.
    except (KeyboardInterrupt, SystemExit):
        scheduler.shutdown()
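Note that main() iterates over sources.__all__ and instantiates each entry, so __all__ here must hold the source classes themselves rather than the usual list of name strings. A minimal sketch of what sources/__init__.py might look like; the module and class names are assumptions based on Examples #6 and #7:

# sources/__init__.py (hypothetical layout)
from .hiddenwiki import HiddenWiki
from .freshonion import FreshOnion

# Unusually, __all__ holds classes instead of name strings,
# so main() can iterate and instantiate them directly.
__all__ = [HiddenWiki, FreshOnion]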
Example #3
def run_crawler(self, url):
    Log.i(f"Starting crawler task for {url}")

    crawler = Crawler(ini=Ini(Env.read("CONFIG_FILE")))

    report = crawler.scan(url)

    if not report.is_empty() and report.webpage.url == url:
        crawler.save(self.request.id, report)

    del crawler
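run_crawler reads self.request.id and is dispatched with apply_async in Example #8, which suggests it is a bound Celery task. A minimal sketch of how it is likely registered; the Celery app object and its name are assumptions:

from celery import Celery

app = Celery("crawler")  # hypothetical app name and broker config

@app.task(bind=True)  # bind=True exposes the task instance as self
def run_crawler(self, url):
    ...  # body as shown above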
Example #4
    def save(self, id, obj):
        """Save crawled data into database."""
        Log.i("Saving crawled data")

        meta = {
            'id': id,
        }

        engine = Engine.create(ini=self.ini)

        with Session(engine=engine) as session:
            domain = session.query(Domain).filter_by(uuid=id).first()

        engine.dispose()

        # run the data through each pipeline before saving (preprocessing)
        for pipeline in pipelines.__all__:
            _class = pipeline(domain, data=obj, ini=self.ini)

            if _class.active:
                Log.d(f"handling the {_class.name} pipeline")
                try:
                    _class.handle()
                except Exception:
                    Log.e(f"Error while handling the {_class.name} pipeline")
            else:
                Log.d(f"{_class.name} pipeline isn't active")

            del _class

        with Elastic(ini=self.ini):
            # upload the screenshot to Amazon S3
            screenshot = self.upload_screenshot(obj.webpage.screenshot, id)

            Webpage(
                meta=meta,
                url=obj.webpage.url,
                domain=obj.webpage.domain,
                title=obj.webpage.title,
                time=datetime.now(),
                source=obj.webpage.source,
                screenshot=screenshot,
                language=obj.webpage.language,
                headers=obj.webpage.headers,
                tree=obj.webpage.tree,
            ).save()

            Port(meta=meta,
                 services=[
                     Service(number=port['number'], status=port['status'])
                     for port in obj.port
                 ]).save()
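Webpage and Port are constructed with a meta dict and persisted with .save(), which matches the Document pattern from elasticsearch-dsl. A plausible definition of Webpage under that assumption; the field types and index name are guesses derived from the constructor call above:

from elasticsearch_dsl import Date, Document, Keyword, Object, Text

class Webpage(Document):
    url = Keyword()
    domain = Keyword()
    title = Text()
    time = Date()
    source = Keyword()
    screenshot = Keyword()  # S3 key returned by upload_screenshot()
    language = Keyword()
    headers = Object()
    tree = Text()

    class Index:
        name = "webpage"  # hypothetical index name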
Example #5
def run(source):
    _class = source()
    status = _class.active

    if status:
        Log.i("Trying to run {} source".format(_class.name))
        try:
            _class.collect()
        except Exception:
            Log.e("Failed to collect data from {} source".format(_class.name))
        if _class.urls:
            _class.save()
    else:
        Log.i("{} source is now disabled".format(_class.name))

    del _class

    return status
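Taken together, main() and run() imply a common interface for every source: an active flag, a name, a cycle interval in minutes, a collect() method that fills self.urls, and a save() method. A minimal sketch of that implied base class; the class name and the defaults are assumptions:

class Source:
    """Implied base class for entries in sources.__all__ (hypothetical)."""
    name = "base"
    active = True  # inactive sources are skipped by run()
    cycle = 60     # re-run interval in minutes, read by the scheduler

    def __init__(self):
        self.urls = []

    def collect(self):
        """Gather onion URLs into self.urls."""
        raise NotImplementedError

    def save(self):
        """Persist self.urls and enqueue crawling tasks."""
        raise NotImplementedError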
Example #6
    def collect(self):

        response = HTTP.request(url='https://thehiddenwiki.com/Main_Page')
        soup = BeautifulSoup(response.text, 'html.parser')

        for a in soup.find_all('a'):
            try:
                parse = urlparse(a['href'])

                # valid onion domain check routine
                if parse.scheme.startswith('http') and parse.netloc.endswith(
                        'onion'):
                    url = "{}://{}".format(parse.scheme, parse.netloc)
                    if url not in self.urls:
                        self.urls.append(url)
            except (KeyError, ValueError):
                # anchor without an href, or a malformed URL
                pass

        Log.i("{} url detected from hiddenwiki".format(len(self.urls)))
Example #7
    def collect(self):
        Log.d("Start collecting from freshonion API")
        response = HTTP.request(
            url='http://zlal32teyptf4tvi.onion/json/all',
            tor_network=True,
            ini=self.ini
        )

        if not response:
            Log.e("Exception accrued while loading website.")
            return

        if response.status_code == 200:
            rows = response.json()
            Log.i("{} url detected from freshonion".format(len(rows)))

            for row in rows:
                url = self._get_formed_url(row)
                if url not in self.urls:
                    self.urls.append(url)
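The _get_formed_url helper is not shown. One plausible implementation, assuming each freshonion row carries scheme, domain, and port fields; the actual JSON layout is not confirmed here:

def _get_formed_url(self, row):
    scheme = row.get('scheme', 'http')
    port = row.get('port', 80)
    # omit the port for the default ports of each scheme
    if port in (80, 443):
        return "{}://{}".format(scheme, row['domain'])
    return "{}://{}:{}".format(scheme, row['domain'], port)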
Example #8
    def save(self):
        """
        Save domain on database and request crawling.
        :return: None
        """
        engine = Engine.create(self.ini)
        with Session(engine=engine) as session:
            # iterate over a copy: the finally block removes entries from
            # self.urls, and mutating a list while looping over it skips items
            for url in list(self.urls):
                task_id = uuid4().hex

                try:
                    # add url into database
                    session.add(Domain(uuid=task_id, url=url))
                    session.commit()

                    task = run_crawler.apply_async(args=(url, ),
                                                   task_id=task_id)
                    Log.i("Crawler issued a new task id {} at {}".format(
                        task.task_id, url))
                except Exception:
                    session.rollback()  # reset the session after the failed commit
                    Log.d(
                        "This {} url is already saved in the database.".format(url))
                finally:
                    self.urls.remove(url)
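The duplicate-url branch above implies a unique constraint on Domain.url. A minimal sketch of the Domain model under SQLAlchemy's declarative mapping; the table name and column types are assumptions:

from sqlalchemy import Column, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Domain(Base):
    __tablename__ = 'domain'  # hypothetical table name

    uuid = Column(String(32), primary_key=True)  # uuid4().hex is 32 chars
    url = Column(String, unique=True)  # duplicates make session.commit() raise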
Example #9
    def __init__(self, ini):
        Log.i("Starting crawler")
        self.ini = ini
Example #10
    def __del__(self):
        Log.i("Ending crawler")
Example #11
def test_write_info():
    Log.i("Test Info Message")