Ejemplo n.º 1
0
    def save(self, id, obj):
        """Save crawled data into database.

        Looks up the ``Domain`` row whose ``uuid`` equals ``id``, passes the
        crawled object through every pipeline listed in ``pipelines.__all__``
        and finally stores a ``Webpage`` and a ``Port`` document inside an
        ``Elastic`` context.

        Args:
            id: Identifier used as the ``uuid`` filter for the domain lookup,
                as the ``id`` entry of the document ``meta`` and as the second
                argument to ``upload_screenshot``.
            obj: Crawl result; ``obj.webpage`` and ``obj.port`` are read.
        """
        Log.i("Saving crawled data")

        meta = {
            'id': id,
        }

        engine = Engine.create(ini=self.ini)
        try:
            with Session(engine=engine) as session:
                domain = session.query(Domain).filter_by(uuid=id).first()
        finally:
            # dispose the engine even when the lookup above raises
            engine.dispose()

        # pass the pipeline before saving data (for preprocessing)
        for pipeline in pipelines.__all__:
            _class = pipeline(domain, data=obj, ini=self.ini)

            if _class.active:
                Log.d(f"handling the {_class.name} pipeline")
                try:
                    _class.handle()
                except Exception:
                    # best effort: a failing pipeline is logged and the next
                    # one still runs.  ``Exception`` rather than a bare
                    # ``except`` so KeyboardInterrupt / SystemExit propagate.
                    Log.e(f"Error while handling {_class.name} pipeline")
            else:
                Log.d(f"{_class.name} pipeline isn't active")

            # drop the reference before the next pipeline is constructed
            del _class

        with Elastic(ini=self.ini):
            # upload screenshot at Amazon S3
            screenshot = self.upload_screenshot(obj.webpage.screenshot, id)

            Webpage(
                meta=meta,
                url=obj.webpage.url,
                domain=obj.webpage.domain,
                title=obj.webpage.title,
                time=datetime.now(),
                source=obj.webpage.source,
                screenshot=screenshot,
                language=obj.webpage.language,
                headers=obj.webpage.headers,
                tree=obj.webpage.tree,
            ).save()

            # one Service per entry of obj.port; each entry is indexed by
            # the 'number' and 'status' keys
            Port(meta=meta,
                 services=[
                     Service(number=port['number'], status=port['status'])
                     for port in obj.port
                 ]).save()
Ejemplo n.º 2
0
def run(source):
    """Instantiate a source and, when it is active, collect and save its URLs.

    Args:
        source: Callable returning a source object that exposes ``active``,
            ``name``, ``urls``, ``collect()`` and ``save()``.

    Returns:
        The ``active`` value read right after the source was instantiated.
    """
    _class = source()
    # read once so the branch taken and the returned status always agree
    status = _class.active

    if status:
        Log.i("Trying to run {} source".format(_class.name))
        try:
            _class.collect()
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed here.
            Log.e("Failed to collect data from {} source".format(_class.name))
        # URLs gathered before a failure are still saved
        if _class.urls:
            _class.save()
    else:
        Log.i("{} source is now disabled".format(_class.name))

    return status
Ejemplo n.º 3
0
    def collect(self):
        """Fetch the freshonion JSON listing and append unseen URLs to ``self.urls``.

        The request is made through ``HTTP.request`` with ``tor_network=True``.
        Nothing is collected when the response is falsy or its status code is
        not 200.
        """
        Log.d("Start collecting from freshonion API")
        response = HTTP.request(
            url='http://zlal32teyptf4tvi.onion/json/all',
            tor_network=True,
            ini=self.ini
        )

        # guard: no usable response at all
        if not response:
            Log.e("Exception accrued while loading website.")
            return

        # guard: only a 200 answer is parsed
        if response.status_code != 200:
            return

        rows = response.json()
        Log.i(f"{len(rows)} url detected from freshonion")

        for entry in rows:
            formed = self._get_formed_url(entry)
            if formed in self.urls:
                # already known, keep the list free of duplicates
                continue
            self.urls.append(formed)
Ejemplo n.º 4
0
    def request(cls, url, tor_network=False, ini=None, timeout=300):
        """Issue a GET request for ``url`` and return the response.

        When ``tor_network`` is true, the proxy address is assembled from the
        ``PROTOCOL``, ``HOST`` and ``PORT`` entries of the ``TOR`` section of
        ``ini`` and used for both http and https.  Any exception -- including
        the ``ValueError`` raised for a missing ``ini`` -- is logged and
        ``None`` is returned instead.
        """
        try:
            extra = {}
            if tor_network:
                if not ini:
                    raise ValueError("Config file not found")

                protocol = ini.read('TOR', 'PROTOCOL')
                host = ini.read('TOR', 'HOST')
                port = ini.read('TOR', 'PORT')
                server = f'{protocol}://{host}:{port}'
                # same proxy endpoint for both schemes
                extra['proxies'] = dict(http=server, https=server)

            return requests.get(url,
                                timeout=timeout,
                                headers=cls._generate_custom_http_header(),
                                **extra)
        except Exception as err:
            Log.e(f"Exception at HTTP.request\n{err}")
            return None
Ejemplo n.º 5
0
    def run(self, url):
        """Load ``url`` in the browser driver and build a report of the page.

        Args:
            url: Address handed to ``self.driver.get``.

        Returns:
            A ``DynamicObject`` with the keys ``url``, ``domain``, ``title``,
            ``screenshot``, ``source``, ``sublinks``, ``language``,
            ``headers`` and ``tree``; ``None`` when the driver fails to load
            the page, ``get_source()`` is falsy, or the page source cannot be
            parsed.
        """
        try:
            self.driver.get(url)
        except Exception:
            # browser scan failed; ``Exception`` rather than a bare ``except``
            # so KeyboardInterrupt / SystemExit still propagate
            Log.e("Browser has an error.")
            return

        # if driver source is none
        if not self.get_source():
            return

        # run HTML parser for parse data from source
        try:
            # beautifulsoup object for parse html source
            self.soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        except Exception:
            # website source code is not HTML
            Log.e("Invalid HTML Source code.")
            return

        # get HAR from driver: the 'message' of the first 'har' log entry is
        # parsed as JSON.  NOTE(review): this is not guarded -- an empty log
        # or a non-JSON message raises to the caller.
        self.har = json.loads(self.driver.get_log('har')[0]['message'])

        report = DynamicObject({
            'url': url,
            'domain': urlparse(url).netloc,
            'title': self.get_title(),
            'screenshot': self.get_screenshot(),
            'source': self.get_source(),
            'sublinks': self.get_sublinks(),
            'language': self.get_language(),
            'headers': self.get_headers(),
            'tree': self.get_website_tree(),
        })

        return report
Ejemplo n.º 6
0
def test_write_error():
    """Send one fixed message through ``Log.e``."""
    message = "Test Error Message"
    Log.e(message)