Example #1
0
    def tick(self):
        """Photograph a single url.

        Checks out a url, saves a LoadingPhoto for it to the index,
        lets the camera take the actual picture and saves that photo
        to the index as well. An EmptySearchResultException raised
        anywhere along the way is ignored. The method always sleeps
        for one second before it returns.
        """
        try:
            started_at = time.time()
            target = self._checkout_url()

            console.dp(f'taking photo of {target.to_string()}')

            # the loading photo is indexed before the camera is started
            destination = PhotoPath(self.datadir)
            placeholder = LoadingPhoto(
                url=target,
                path=destination,
                refresh_rate=self.refresh_rate
            )
            placeholder.save_loading_text()
            self.index.save_photo(placeholder)

            camera = c.Camera(
                viewport_width=self.viewport_width,
                viewport_height=self.viewport_height,
                viewport_max_height=self.viewport_max_height,
                addons={
                    'IDCAC': Addons.IDCAC,
                    'REFERER_HEADER': Addons.REFERER_HEADER,
                    'UBLOCK_ORIGIN': Addons.UBLOCK_ORIGIN,
                }
            )
            picture = camera.take_picture(
                target,
                destination,
                self.refresh_rate
            )
            self.index.save_photo(picture)

            elapsed = int(time.time() - started_at)
            console.p(
                f'photo was taken of {target.to_string()} took: {elapsed}s'
            )

        except EmptySearchResultException:
            pass
        finally:
            time.sleep(1)
Example #2
0
    def start_photographers(
        amount: int,
        refresh_rate: Type[refresh.RefreshRate],
        datadir: DataDirectory,
        viewport_width: int,
        viewport_height: int,
        viewport_max_height: Optional[int],
        elasticsearch_host: str,
        debug: bool
    ):
        """Start photographer threads.

        Every thread gets a fresh uuid and an entry in
        Controller.threads with 'running' set to True.

        Args:
            amount: amount of photographer threads to start
            refresh_rate: How often photographs should be refreshed,
                more exactly defines which lock should be placed on
                crawled urls
            datadir: Data directory to store pictures in
            viewport_width: width of camera viewport
            viewport_height: height of camera viewport
            viewport_max_height: max height of camera viewport
            elasticsearch_host: elasticsearch host
            debug: Display debugging information
        """
        console.p(f'starting {amount} photographer threads')
        Controller.PHOTOGRAPHER_PROCESSES = amount
        for _ in range(amount):
            thread_id = str(uuid.uuid4())
            thread = Thread(target=_photographer_thread, args=(
                refresh_rate,
                datadir,
                viewport_width,
                viewport_height,
                viewport_max_height,
                elasticsearch_host,
                debug,
                thread_id
            ))
            # Register the thread BEFORE starting it: the thread function
            # writes Controller.threads[thread_id]['running'] when it
            # finishes, so a thread that fails right away must already
            # find its entry. Registering afterwards could also overwrite
            # that flag with True for a thread that is already dead.
            Controller.threads[thread_id] = {
                'running': True
            }
            try:
                thread.start()
            except RuntimeError:
                # the thread never ran, so nothing would ever clear the
                # flag; drop the entry again before propagating
                del Controller.threads[thread_id]
                raise
Example #3
0
def _stats_thread(elasticsearch_host: str):
    """Stats thread.

    Wakes up once a second while Controller.SHOULD_RUN is set and
    prints throughput, load average, cpu and memory statistics each
    time the elapsed run time reaches a new multiple of five minutes.

    Args:
        elasticsearch_host: elasticsearch host
    """
    windows = (5, 15, 30, 60)
    started_at = time.time()
    printed_at = 1
    while Controller.SHOULD_RUN:

        time.sleep(1)
        elapsed_mins = int(int(time.time() - started_at) / 60)
        is_due = elapsed_mins % 5 == 0 and elapsed_mins > printed_at
        if not is_due:
            continue

        # a new index object is created for every report
        index = Index(host=elasticsearch_host)
        printed_at = elapsed_mins

        totals = [stats.throughput(index, window) for window in windows]
        throughput_line = (
            '[throughput]           5m: {}, 15m: {}, 30min: {}, 1h: {}'
        ).format(*totals)

        # an average is only shown once the whole window has elapsed
        averages = [
            round(stats.throughput(index, window) / window, 2)
            if elapsed_mins >= window else 'n/a'
            for window in windows
        ]
        average_line = '{}  5m: {}, 15m: {}, 30min: {}, 1h: {}'.format(
            '[throughput 1min avg]',
            *averages
        )

        loads = [stats.load_avg(minutes) for minutes in (1, 5, 15)]
        load_line = (
            '[load avg]             1m: {}, 5m: {}, 15min: {}'
        ).format(*loads)

        cpu_line = f'[current cpu usage]    {stats.cpu_usage(10)}%'
        memory_line = f'[memory usage]         {stats.memory_usage(10)}%'

        report = (
            throughput_line,
            average_line,
            load_line,
            cpu_line,
            memory_line,
        )
        for line in report:
            console.p(line)
Example #4
0
def _crawler_thread(
    url_file: str,
    ignore_found_urls: bool,
    stay_at_domain: bool,
    elasticsearch_host: str,
    debug: bool,
    thread_id: str
):
    """Crawler thread.

    Builds a Crawler and ticks it for as long as
    Controller.SHOULD_RUN is set. A missing url file stops the whole
    application through Controller.stop_all(). Any other exception
    is printed and only re-raised when debug is set. The thread is
    always flagged as not running in Controller.threads on exit.

    Args:
        url_file: path to url file
        ignore_found_urls: if crawler should ignore new urls found on
            pages it crawls
        stay_at_domain: if crawler should ignore urls from a different
            domain than the one it was found at
        elasticsearch_host: elasticsearch host
        debug: Display debugging information
        thread_id: id of thread
    """
    try:
        index = Index(host=elasticsearch_host)
        worker = Crawler(
            url_file=url_file,
            index=index,
            ignore_found_urls=ignore_found_urls,
            stay_at_domain=stay_at_domain,
        )
        while Controller.SHOULD_RUN:
            worker.tick()
    except UrlFileNotFoundError:
        console.p(f'ERROR: url_file was not found at \'{url_file}\'')
        time.sleep(2)
        # flag this thread as stopped before asking everything to stop
        Controller.threads[thread_id]['running'] = False
        Controller.stop_all()
    except Exception as error:
        console.p(f'error occured in crawler thread {thread_id}: {error}')
        if debug:
            raise error
    finally:
        Controller.threads[thread_id]['running'] = False
Example #5
0
    def start_filesystem(
        mountpoint: str,
        datadir: DataDirectory,
        refresh_rate: Type[refresh.RefreshRate],
        elasticsearch_host: str
    ):
        """Start filesystem process.

        The process is forked and the filesystem is mounted from the
        forked process. The pid of the forked process is stored in
        Controller.FUSE_PID by the main process.

        Args:
            mountpoint: where to mount filesystem
            datadir: Data directory to store pictures in
            refresh_rate: Which refresh rate filesystem should use
                for fetching photos
            elasticsearch_host: elasticsearch host

        Returns:
            True if main process, False if the forked process
            bool
        """
        console.p(f'mounting filesystem at: {real_path(mountpoint)}')

        child_pid = os.fork()
        if child_pid == 0:
            # forked process: mount, and report back once mount returns
            try:
                filesystem_index = Index(datadir, host=elasticsearch_host)
                Filesystem.mount(mountpoint, filesystem_index, refresh_rate)
            except RuntimeError as error:
                console.p(f'failed to mount FUSE filesystem: {error}')
            return False

        # main process: remember the pid of the forked process
        Controller.FUSE_PID = child_pid
        return True
Example #6
0
def _photographer_thread(
    refresh_rate: Type[refresh.RefreshRate],
    datadir: DataDirectory,
    viewport_width: int,
    viewport_height: int,
    viewport_max_height: Optional[int],
    elasticsearch_host: str,
    debug: bool,
    thread_id: str
):
    """Photographer thread.

    Builds a Photographer and ticks it for as long as
    Controller.SHOULD_RUN is set. Exceptions are printed and only
    re-raised when debug is set. The thread is always flagged as not
    running in Controller.threads on exit.

    Args:
        refresh_rate: How often photographs should be refreshed
        datadir: Data directory to store pictures in
        viewport_width: width of camera viewport
        viewport_height: height of camera viewport
        viewport_max_height: max height of camera viewport
        elasticsearch_host: elasticsearch host
        debug: Display debugging information
        thread_id: id of thread
    """
    try:
        index = Index(host=elasticsearch_host)
        worker = p.Photographer(
            index,
            refresh_rate,
            datadir,
            viewport_width,
            viewport_height,
            viewport_max_height
        )
        while Controller.SHOULD_RUN:
            worker.tick()
    except Exception as error:
        console.p(
            f'error occured in photographer thread {thread_id}: {error}'
        )
        if debug:
            raise error
    finally:
        Controller.threads[thread_id]['running'] = False
Example #7
0
    def stop_all():
        """Stop all threads.

        Clears Controller.SHOULD_RUN, waits in the polling loop below,
        and then sends SIGTERM to every pid in Controller.webdrivers and
        to Controller.FUSE_PID when it is set. A KeyboardInterrupt
        received while stopping restarts the whole procedure.
        """
        try:
            Controller.SHOULD_RUN = False

            # NOTE(review): this loops while _any_thread_is_running() is
            # falsy. The helper is not visible from here - confirm that
            # its return value matches its name, otherwise this condition
            # is inverted.
            polls = 0
            while not Controller._any_thread_is_running():
                # polls happen every 0.5s, so this prints every 5 seconds
                if polls % 10 == 0:
                    console.p('waiting for saas to stop')
                polls += 1
                time.sleep(0.5)

            console.p('cleaning up')
            pids = list(Controller.webdrivers)
            if Controller.FUSE_PID:
                pids.append(Controller.FUSE_PID)

            for pid in pids:
                try:
                    os.kill(pid, signal.SIGTERM)
                except ProcessLookupError:
                    # This process is already gone. Handle that per pid
                    # so the remaining webdrivers and the FUSE process
                    # are still terminated.
                    pass
        except KeyboardInterrupt:
            Controller.stop_all()
Example #8
0
    def start_crawlers(
        amount: int,
        url_file: str,
        ignore_found_urls: bool,
        stay_at_domain: bool,
        elasticsearch_host: str,
        debug: bool
    ):
        """Start crawler threads.

        Every thread gets a fresh uuid and an entry in
        Controller.threads with 'running' set to True.

        Args:
            amount: amount of crawlers to start
            url_file: path to urls file
            ignore_found_urls: if crawler should ignore new urls found on
                pages it crawls
            stay_at_domain: if crawler should ignore urls from a different
                domain than the one it was found at
            elasticsearch_host: elasticsearch host
            debug: Display debugging information
        """
        console.p(f'starting {amount} crawler threads')
        for _ in range(amount):
            thread_id = str(uuid.uuid4())
            thread = Thread(target=_crawler_thread, args=(
                url_file,
                ignore_found_urls,
                stay_at_domain,
                elasticsearch_host,
                debug,
                thread_id
            ))
            # Register the thread BEFORE starting it: the thread function
            # writes Controller.threads[thread_id]['running'] when it
            # finishes, so a thread that fails right away must already
            # find its entry. Registering afterwards could also overwrite
            # that flag with True for a thread that is already dead.
            Controller.threads[thread_id] = {
                'running': True
            }
            try:
                thread.start()
            except RuntimeError:
                # the thread never ran, so nothing would ever clear the
                # flag; drop the entry again before propagating
                del Controller.threads[thread_id]
                raise
Example #9
0
 def create_indices(self):
     """Create the uncrawled, crawled and photos indices.

     The indices are created one after another. A RequestError from
     any of the create calls is reported as the indices already
     existing, and the create calls that were still to come are
     skipped.
     """
     console.p('creating indices')
     try:
         uncrawled_body = {'mappings': Mappings.uncrawled}
         self.es.indices.create(Index.UNCRAWLED, body=uncrawled_body)

         crawled_body = {'mappings': Mappings.crawled}
         self.es.indices.create(Index.CRAWLED, body=crawled_body)

         # photos is the only index that also carries settings
         photos_body = {
             'mappings': Mappings.photos,
             'settings': Settings.photos,
         }
         self.es.indices.create(Index.PHOTOS, body=photos_body)

         console.p('done.')
     except RequestError:
         console.p('indices already exist, skipping.')
Example #10
0
 def clear(self):
     """Delete all indices.

     Issues a single delete for the '_all' index name with a very
     large request timeout, printing a message before and after.
     """
     console.p('clearing all indices')
     delete_options = {
         'index': '_all',
         'request_timeout': 1000000,
     }
     self.es.indices.delete(**delete_options)
     console.p('indices cleared')
Example #11
0
def main():
    """Entry point for saas.

    Parses the command line, checks the elasticsearch connection and
    configuration, optionally sets up or clears elasticsearch and the
    data directory, mounts the filesystem from a forked process and
    starts the stats, crawler and photographer threads.

    Afterwards the main thread stays in a loop. When stop_if_idle is
    non-zero it stops the application once the most recent crawled or
    photo document is at least that many minutes old. A
    KeyboardInterrupt stops the application as well.

    Exits with status 1 when elasticsearch is unreachable or not
    configured.
    """
    try:

        parser = arguments.get_argument_parser()
        args = parser.parse_args(sys.argv[1:])

        console.DEBUG = args.debug

        JavascriptSnippets.load()

        index = Index(host=args.elasticsearch_host)

        if not index.ping():
            console.p('ERROR: failed to connect to elasticsearch')
            # a failed start must not look like success to the caller
            sys.exit(1)

        # an unverified index is fine when this run configures it anyway
        will_configure = args.setup_elasticsearch or args.clear_elasticsearch
        if not index.verify() and not will_configure:
            console.p('ERROR: elasticsearch is not configured')
            console.p(
                '       start saas with --setup-elasticsearch '
                'to configure elasticsearch'
            )
            sys.exit(1)

        datadir = DataDirectory(args.data_dir, args.optimize_storage)

        refresh_rate = {
            'day': refresh.Daily,
            'hour': refresh.Hourly,
            'minute': refresh.EveryMinute,
        }[args.refresh_rate]

        if args.setup_elasticsearch:
            index.create_indices()

        if args.clear_elasticsearch:
            index.clear()
            index.create_indices()

        if args.clear_data_dir:
            datadir.clear()

        # start_filesystem returns False in the forked process, which
        # must not go on to start any threads
        if not Controller.start_filesystem(
                mountpoint=args.mountpoint,
                datadir=datadir,
                refresh_rate=refresh_rate,
                elasticsearch_host=args.elasticsearch_host):
            sys.exit()

        Controller.start_stats(elasticsearch_host=args.elasticsearch_host)

        Controller.start_crawlers(amount=args.crawler_threads,
                                  url_file=args.url_file,
                                  ignore_found_urls=args.ignore_found_urls,
                                  stay_at_domain=args.stay_at_domain,
                                  elasticsearch_host=args.elasticsearch_host,
                                  debug=args.debug)

        Controller.start_photographers(
            amount=args.photographer_threads,
            refresh_rate=refresh_rate,
            datadir=datadir,
            viewport_width=args.viewport_width,
            viewport_height=args.viewport_height,
            viewport_max_height=args.viewport_max_height,
            elasticsearch_host=args.elasticsearch_host,
            debug=args.debug)

        while True:

            # 0 disables the idle check; just keep the main thread alive
            if args.stop_if_idle == 0:
                time.sleep(10)
                continue

            try:
                crawled = index.timestamp_of_most_recent_document(
                    index.CRAWLED)
                photos = index.timestamp_of_most_recent_document(index.PHOTOS)

                last_activity = max(photos, crawled)

                seconds = int(time.time()) - last_activity
                mins = int(seconds / 60)
                if mins >= args.stop_if_idle:
                    console.p(f'was idle for {mins} minutes', end='')
                    raise StopIfIdleTimeoutExpired

            except EmptySearchResultException:
                # no timestamp could be fetched; try again on the next
                # round
                pass
            finally:
                time.sleep(2)

    except (KeyboardInterrupt, StopIfIdleTimeoutExpired):
        console.p(' terminating.')
        Controller.stop_all()
        console.p('')
Example #12
0
 def clear(self):
     """Clear data directory.

     Removes the whole directory tree at the root of the data
     directory and then passes the same path to create_dir.
     """
     root = self.root
     console.p(f'clearing data directory at: {root}')
     shutil.rmtree(root)
     create_dir(root)