Exemple #1
0
def view_config(ctx):
    """Print the active Findex configuration to stdout.

    Shows the config location followed by the application root, debug flag,
    async flag and database connection string, one per line.

    :param ctx: command context handed in by the caller; not read here
    """
    from findex_gui.bin.config import config

    def _show(template, key, as_text=False):
        # Look the value up right before printing so lines appear one by one.
        value = config(key)
        print(template % (str(value) if as_text else value))

    print("config location: %s" % cwd())
    _show("application_root: %s", "findex:findex:application_root")
    _show("debug: %s", "findex:findex:debug", as_text=True)
    _show("async: %s", "findex:findex:async")
    _show("database: %s", "findex:database:connection")
Exemple #2
0
def web(ctx, args, host, port, uwsgi, nginx):
    """
    Operate the Findex Web Interface.

    Changes into the findex_gui package directory, exports FINDEX_APP and
    FINDEX_CWD, and then serves the application either through Flask's own
    server or through a gevent WSGI server, depending on the
    `findex:findex:async` setting. Any exception raised while starting or
    serving is reported and the process exits with status 1.

    @TODO: figure out uwsgi help thingy

    :param ctx: command context; not read here
    :param args: extra arguments; not read here
    :param host: address to bind to
    :param port: port to bind to
    :param uwsgi: not read here
    :param nginx: placeholder flag, currently a no-op
    """
    logo(version)

    if nginx:
        # @TODO: nginx help msg here
        pass

    # The Web Interface still uses local imports here and there, so run from
    # inside the package directory and make that directory importable.
    # TODO Rename local imports to either Findex.web.* or relative imports.
    package_dir = findex_gui.__path__[0]
    os.chdir(package_dir)
    sys.path.insert(0, ".")
    os.environ["FINDEX_APP"] = "web"
    os.environ["FINDEX_CWD"] = cwd()

    try:
        app_debug = config("findex:findex:debug")

        def serve_with_flask():
            from findex_gui.web import create_app
            create_app().run(debug=app_debug,
                             host=host,
                             port=port,
                             use_reloader=False)

        def serve_with_gevent():
            # Patch the standard library before anything else gets imported.
            from gevent import monkey
            monkey.patch_all()

            from gevent.pywsgi import WSGIServer
            from findex_gui.web import create_app

            wsgi_app = create_app()
            server = WSGIServer((host, port), wsgi_app)
            banner = green(
                " * Running on http://%s:%s/ (Press CTRL+C to quit)")
            print(banner % (host, str(port)))
            server.serve_forever()

        serve = (serve_with_gevent
                 if config("findex:findex:async") else serve_with_flask)
        serve()

    except Exception as e:
        message = red("{0}: {1}".format(e.__class__.__name__, e))
        if log.handlers:
            log.critical(message)
        else:
            # No log handler configured: dump the traceback to stderr instead.
            sys.stderr.write("{0}\n".format(traceback.format_exc()))
        sys.exit(1)
Exemple #3
0
def check_specific_config(filename):
    """Validate every entry of every enabled section in one config file.

    Wildcard sections ("*" / "__star__") are skipped, as are sections whose
    `enabled` value is exactly False. Each remaining entry is looked up with
    `check=True, strict=True`.

    :param filename: key into `Config.configuration`
    """
    for section, entries in Config.configuration[filename].items():
        if section in ("*", "__star__"):
            continue

        prefix = "%s:%s" % (filename, section)

        # If an enabled field is present, check it beforehand.
        if config("%s:enabled" % prefix) is False:
            continue

        for key, _unused in entries.items():
            config("%s:%s" % (prefix, key), check=True, strict=True)
Exemple #4
0
    def render(self, template_path, theme=None, status_code=200, **kwargs):
        """Render a theme template with the standard context added.

        The context receives `env` (all lower-case app config keys plus
        `application_root`) and `user` (the current user). When user data is
        available the session locale is synchronised with the user's locale.

        :param template_path: template name relative to the theme's
            `templates` directory, without the `.html` suffix
        :param theme: theme name; falls back to `self.get_active()`
        :param status_code: status returned alongside the rendered page
        :param kwargs: extra template context
        :return: `(rendered, status_code)` on success; a plain error string
            when the template is missing or rendering fails
        """
        if not theme:
            theme = self.get_active()

        # @TO-DO: use a context processor
        env = {name: app.config[name] for name in app.config if name.islower()}
        env["application_root"] = app.config["APPLICATION_ROOT"]
        kwargs["env"] = env

        user = UserController.get_current_user()
        if get_current_user_data():
            # Locale missing from the session or out of date: take the user's.
            if not session.get("locale") or session["locale"] != user.locale:
                session["locale"] = user.locale
        kwargs["user"] = user

        try:
            template_file = "%s/templates/%s.html" % (theme, template_path)
            rendered = render_template(template_file, url_for=url_for, **kwargs)
            return rendered, status_code
        except TemplateNotFound as e:
            return "Template \"%s\" not found" % str(e)
        except Exception as ex:
            print(ex)
            if not config("findex:findex:debug"):
                return "Jinja2 error!"
            return "Jinja2 error!\n\n%s" % str(ex)
Exemple #5
0
def generate_crawl_config(ctx):
    """Print a crawler `settings.py` derived from the configured database DSN.

    The DSN stored under `findex:database:connection` is split into user,
    password, host, port and database name, which are handed to
    `findex_gui.bin.config.generate_crawl_config` together with a random bot
    name. The result is printed between two separator lines.

    :param ctx: command context; not read here
    """
    from urllib.parse import urlsplit

    logo(version)
    from findex_gui.bin.config import generate_crawl_config
    from findex_common.utils import random_str

    # Parse with the standard library instead of splitting on ":" / "@" by
    # hand: the manual split passed the host as the password and broke on
    # passwords containing ":" or "@".
    dsn = urlsplit(config("findex:database:connection"))

    db_host = dsn.hostname
    db_user = dsn.username
    db_pass = dsn.password or ""
    # Fall back to the default Postgres port when the DSN does not name one.
    db_port = dsn.port if dsn.port is not None else 5432
    db_name = dsn.path.lstrip("/")

    crawl_config = generate_crawl_config(bot_name="bot_%s" % random_str(8),
                                         db_host=db_host,
                                         db_port=db_port,
                                         db_name=db_name,
                                         db_user=db_user,
                                         db_pass=db_pass,
                                         db_max_bulk_inserts=1000)

    print("Save the following as `settings.py`")
    print("=" * 26)
    print(crawl_config)
    print("=" * 26)
Exemple #6
0
    def database():
        """Collect database status items for display.

        :return: OrderedDict holding, in order: the raw DSN, the server
            encoding (flagged "warning" unless it contains "UTF8"), one
            `db_*` item per DSN component, and the on-disk size
        """
        report = OrderedDict()

        dsn = config("findex:database:connection")
        report["dsn (RFC-1738)"] = _item(data=dsn, cls="info")

        encoding = DatabaseStatus.raw_query("SHOW SERVER_ENCODING", cls="ok")
        if "UTF8" not in encoding.data:
            encoding.cls = "warning"
        report["encoding"] = encoding

        parsed = dsnparse.parse(dsn)
        components = (
            ("user", parsed.username),
            ("pass", parsed.password),
            ("host", parsed.host),
            # use 5432 whenever the DSN does not carry an integer port
            ("port", parsed.port if isinstance(parsed.port, int) else 5432),
            ("db", parsed.paths[0]),
        )
        for label, value in components:
            report["db_%s" % label] = _item(data=value, cls="ok")

        report["Size on Disk"] = DatabaseStatus.get_size()
        return report
Exemple #7
0
def after_request(r):
    """Add response headers to every outgoing response.

    `Accept-Ranges: bytes` is always added. When `findex:findex:debug` is
    enabled, caching is switched off so edits show up immediately.

    :param r: the response object; must expose a `headers` mapping with `add`
    :return: the same response object
    """
    r.headers.add('Accept-Ranges', 'bytes')

    if config("findex:findex:debug"):
        # The previous version overwrote this value with
        # 'public, max-age=0' straight away, which contradicted the
        # Pragma/Expires headers below and left caching permitted.
        r.headers["Cache-Control"] = "no-cache, no-store, must-revalidate"
        r.headers["Pragma"] = "no-cache"
        r.headers["Expires"] = "0"
    return r
Exemple #8
0
    def __init__(self):
        """Read the database DSN and prepare a connection pool.

        `engine` and `session` start out as None; the pool obtains its
        connections through `self._getconn`.
        """
        self.engine = None
        self.session = None
        self.dsn = config("findex:database:connection")

        pool_options = {
            "creator": self._getconn,
            "max_overflow": 1,
            "pool_size": 300,
            "echo": False,  # config("findex:findex:debug")
        }
        self.pool = pool.QueuePool(**pool_options)
Exemple #9
0
    def findex():
        """Collect general Findex status items for display.

        :return: OrderedDict with version, debug flag, config location,
            application root, async mode, user count, availability of the
            'DIRECT' crawl mode and the scheduler cronjob state
        """
        status = OrderedDict()
        status["Version"] = _item(cls="info", data=version)
        status["Debug"] = _item(str(config("findex:findex:debug")))
        status["Config Location"] = _item(cwd())
        status["Application Root"] = _item(
            config("findex:findex:application_root"))

        patched = str(bool(is_gevent_monkey_patched()))
        status["Async mode (Gevent monkey patch)"] = _item(patched)
        status["No. Findex Users"] = FindexStatus.findex_get_nousers()

        # can_crawl() signals a problem by raising; its message is shown.
        try:
            Crawler.can_crawl()
            crawl_item = _item("Available", cls="ok")
        except Exception as ex:
            crawl_item = _item(str(ex), cls="error")
        status["'DIRECT' crawl mode"] = crawl_item

        if CronController.has_cronjob():
            cron_item = _item("Set", cls="info")
        else:
            cron_item = _item(
                "Not set, please activate it (Scheduler->Overview)",
                cls="error")
        status["Scheduler cronjob"] = cron_item
        return status
Exemple #10
0
def test_db(ctx):
    """Check that the configured database accepts connections and queries.

    Connects with the DSN from `findex:database:connection`, runs
    `SELECT 1;` and prints exactly one coloured verdict: a connection
    failure, "Database Error", or "Database OK".

    :param ctx: command context; not read here
    """
    import psycopg2
    dsn = config("findex:database:connection")

    conn = None
    try:
        conn = psycopg2.connect(dsn)
        cur = conn.cursor()
    except Exception:
        if conn is not None:
            conn.close()
        print(red("Could not connect to the database via \"%s\"" % dsn))
        return

    try:
        cur.execute("""SELECT 1;""")
        row = cur.fetchone()
        # Explicit check rather than `assert`, which is stripped under -O.
        healthy = row is not None and row[0] == 1
    except Exception:
        healthy = False
    finally:
        conn.close()

    if not healthy:
        # Previously execution fell through and also printed "Database OK".
        print(red("Database Error"))
        return
    print(green("Database OK"))
Exemple #11
0
def log_msg(msg: str, category: str, level: int = 1):
    """
    Print a message and store it as a `Logging` row.

    DEBUG messages (level 0) are dropped unless `findex:findex:debug` is set.
    Messages with an unknown category are rejected with a note on stderr.
    The stored row also records the caller's file (last three path
    components) and function name when they can be determined.

    :param msg: msg
    :param category: category; one of "scheduler", "meta_import"
    :param level: 0: DEBUG, 1: INFO, 2: WARNING, 3: ERROR
    :return:
    """
    from findex_gui.web import db
    from findex_gui.orm.models import Logging
    from findex_gui.bin.config import config

    debug_enabled = config("findex:findex:debug")
    if not debug_enabled and level == 0:
        return

    if category not in ("scheduler", "meta_import"):
        sys.stderr.write("cant log category %s - not in categories\n" % category)
        return

    level_name = ["DEBUG", "INFO", "WARNING", "ERROR"][level]
    print("[%s] %s" % (level_name, msg))

    # Best effort: describe the caller as "<last 3 path parts>:<function>".
    try:
        caller = sys._getframe(1).f_code
        short_path = "/".join(caller.co_filename.split("/")[-3:])
        origin = "%s:%s" % (short_path, caller.co_name)
    except BaseException:
        origin = None

    entry = Logging()
    entry.file = origin
    entry.message = msg
    entry.log_level = level
    entry.category = category

    db.session.add(entry)
    db.session.commit()
    db.session.flush()
Exemple #12
0
    def get_size():
        """Report the size of the configured database.

        :return: the query result item for the database size; when the
            follow-up `show data_directory` query succeeds its value is
            appended and the item is marked "info", otherwise an error note
            is appended. A failed size query is returned unchanged.
        """
        db_name = dsnparse.parse(config("findex:database:connection")).paths[0]
        sql = """
        SELECT
            pg_size_pretty(pg_database_size(pg_database.datname)) AS size
        FROM pg_database WHERE datname=:db_name;
        """
        size = DatabaseStatus.raw_query(sql, {"db_name": db_name})
        if size.cls != "ok":
            return size

        location = DatabaseStatus.raw_query("""show data_directory;""")
        if location.cls == "ok":
            size.cls = "info"
            size.data += " @ %s" % location.data
        else:
            size.data += " (error fetching data_dir)"
        return size
Exemple #13
0
    def _search(**kwargs):
        """Build and execute a file search from keyword filters.

        Keys read from `kwargs`: `key`, `file_type`, `file_size`,
        `file_categories`, `file_extensions`, `meta_movie_id`,
        `autocomplete`, `page` and `per_page`. Only `file_type` is read with
        `.get()`; the others are indexed directly and raise KeyError when
        absent.

        :return: the query results after
            `SearchController.assign_resource_objects`
        :raises Exception: when the search key is invalid or the query fails
        """
        # Normalise the search key; a falsy result means it is unusable.
        kwargs["key"] = CrawlController.make_valid_key(kwargs["key"])
        if not kwargs["key"]:
            raise Exception("Invalid search. Too short?")

        # Use the ZomboDB query class when ElasticSearch is enabled.
        q = ZdbQuery(Files, session=db.session) if config(
            "findex:elasticsearch:enabled") else Files.query

        # @TODO: filter by protocols / hosts
        # only find files that are not in "temp" mode
        # q = q.filter(Files.resource_id >= 1)

        # ignores certain filters
        ignore_filters = []

        # filter only files/dirs
        if kwargs.get("file_type"):
            # NOTE(review): the "both" branch is a no-op and the next test is
            # `if`, not `elif`, so it does not exclude the branches below.
            if "both" in kwargs["file_type"]:
                pass
            if "folders" in kwargs["file_type"]:
                q = q.filter(Files.file_isdir == True)
                # size/category/extension filters are skipped for folders
                ignore_filters.extend(
                    ("file_size", "file_categories", "file_extensions"))
            elif "files" in kwargs["file_type"]:
                q = q.filter(Files.file_isdir == False)

        # size
        # Expected form is "<min>-<max>", where either side may be "*".
        if kwargs["file_size"] and "file_size" not in ignore_filters:
            try:
                file_size = kwargs["file_size"].split("-")

                if not len(file_size) == 2:
                    raise Exception()

                if file_size[0] == "*":
                    q = q.filter(Files.file_size <= int(file_size[1]))
                elif file_size[1] == "*":
                    q = q.filter(Files.file_size >= int(file_size[0]))
                else:
                    q = q.filter(
                        Files.file_size.between(*[int(x) for x in file_size]))
            except:
                # A malformed size filter is silently ignored.
                pass

        # filter categories
        filecategories = FileCategories()

        cat_ids = []
        cats = kwargs.get("file_categories", [])
        cats = [] if cats is None else cats
        for cat in cats:
            cat_id = filecategories.id_by_name(cat)

            # unknown category names are skipped
            if cat_id is None:
                continue
            # NOTE(review): repeats the lookup above on a fresh
            # FileCategories instance instead of appending `cat_id`.
            cat_ids.append(FileCategories().id_by_name(cat))

        if cat_ids and "file_categories" not in ignore_filters:
            q = q.filter(Files.file_format.in_(cat_ids))

        # NOTE(review): `file_categories` is assigned here but never read
        # again in this function.
        if not kwargs["file_categories"]:
            file_categories = filecategories.get_names()

        # filter extensions
        if kwargs[
                "file_extensions"] and "file_extensions" not in ignore_filters:
            exts = []

            for ext in kwargs["file_extensions"]:
                # strip a single leading dot
                if ext.startswith("."):
                    ext = ext[1:]

                exts.append(ext)

            q = q.filter(Files.file_ext.in_(exts))

        # Applied only when an integer id was supplied.
        if isinstance(kwargs["meta_movie_id"], int):
            q = q.filter(Files.meta_movie_id == kwargs["meta_movie_id"])

        # Search
        if config("findex:elasticsearch:enabled"):
            val = kwargs["key"]
        else:
            # Prefix-only LIKE for autocomplete or very large tables,
            # substring LIKE otherwise.
            if kwargs["autocomplete"] or app.config["db_file_count"] > 5000000:
                print("warning: too many rows, enable ElasticSearch")
                val = "%s%%" % escape_like(kwargs["key"])
            else:
                val = "%%%s%%" % escape_like(kwargs["key"])

        # A bare "*" means no text filter at all.
        if val != "*":
            q = q.filter(Files.searchable.like(val))

        q = q.order_by(Files.file_size.desc())

        # pagination
        # NOTE(review): `page` is passed to offset() as-is — confirm callers
        # supply a row offset rather than a page number.
        q = q.offset(kwargs["page"])

        if kwargs["autocomplete"]:
            q = q.limit(5)
            # q = q.distinct(func.lower(Files.file_name))
            q = q.distinct(Files.file_size)
        else:
            q = q.limit(kwargs["per_page"])

        # fetch
        try:
            results = q.all()
        except Exception as ex:
            # Re-raised as a plain Exception wrapping the original error.
            raise Exception(ex)

        results = SearchController.assign_resource_objects(results)
        return results
Exemple #14
0
    def spawn(tasks, queue_size=5):
        """
        Spawns the crawler in 'DIRECT' mode. The suggested way
        is to use AMQP instead.

        Each task is turned into a crawl message; the messages are written
        as JSON to a temporary file, and a detached `twistd` process is
        started with FINDEX_CRAWL_* environment variables pointing at that
        file and at the configured database.

        :param tasks: sized collection of tasks, each passed to
            `CrawlController.crawl_message_make`
        :param queue_size: exported as FINDEX_CRAWL_QUEUE_SIZE
        :return: None; also returns early (after logging) when the messages
            cannot be serialised or the temp file cannot be created
        """
        path_fincrawl = "%s/findex-crawl/" % python_env["project_root"]
        log_msg("Spawning local crawler in DIRECT mode for %d tasks" %
                len(tasks),
                category="scheduler")

        # construct tasks json file
        blobs = []
        for t in tasks:
            crawl_message = CrawlController.crawl_message_make(t)
            if not crawl_message:
                continue
            blobs.append(crawl_message)

        try:
            crawl_messages = json.dumps(blobs, indent=4, sort_keys=True)
            # Keep the descriptor mkstemp() opens so it can be closed below;
            # discarding it leaked one file descriptor per call.
            crawl_fd, crawl_file = tempfile.mkstemp("_fincrawl.json")
        except Exception as ex:
            log_msg(":%s" % (str(ex)), level=3, category="scheduler")
            return

        print(crawl_messages)

        # write tmp tasks file for the crawler
        with os.fdopen(crawl_fd, "w") as f:
            f.write(crawl_messages)

        dsn = dsnparse.parse(config("findex:database:connection"))

        dsn_blob = {
            "user": dsn.username,
            "pass": dsn.password,
            "host": dsn.host,
            # use 5432 whenever the DSN does not carry an integer port
            "port": 5432 if not isinstance(dsn.port, int) else dsn.port,
            "db": dsn.paths[0]
        }

        # set env variables
        # should include the current python virtualenv
        shell_env = os.environ.copy()
        shell_env["FINDEX_CRAWL_MODE"] = "DIRECT"
        shell_env["FINDEX_CRAWL_FILE"] = crawl_file
        shell_env["FINDEX_CRAWL_FILE_CLEANUP"] = ":-D"
        shell_env["FINDEX_CRAWL_LOG_VERBOSITY"] = "20"
        shell_env["FINDEX_CRAWL_QUEUE_SIZE"] = str(queue_size)

        for k, v in dsn_blob.items():
            shell_env["FINDEX_CRAWL_DB_%s" % k.upper()] = str(v)

        # NOTE(review): this also prints FINDEX_CRAWL_DB_PASS to stdout.
        for k, v in shell_env.items():
            if k.startswith("FINDEX_CRAWL"):
                print("export %s=\"%s\"" % (k, str(v)))

        # non-blocking Popen
        # for some reason `/bin/bash -c` is needed, else it cant find the
        # relative `rpc.py`, even though `cwd=` is set
        command = [
            "/bin/bash", "-c",
            "%s/twistd -ony rpc.py &" %
            os.path.dirname(python_env["interpreter"])
        ]

        print("spawning shell: %s" % " ".join(command))
        subprocess.Popen(command,
                         cwd=path_fincrawl,
                         env=shell_env,
                         stdout=subprocess.DEVNULL,
                         stderr=subprocess.DEVNULL,
                         universal_newlines=True,
                         preexec_fn=os.setpgrp)
Exemple #15
0
import re
import uuid
from datetime import datetime, timedelta

from flask import request
import humanfriendly
import sqlalchemy_zdb
from findex_gui.bin.config import config
# NOTE(review): ES_HOST is assigned before the remaining sqlalchemy_zdb
# imports, presumably because they read it at import time — confirm before
# reordering these lines.
sqlalchemy_zdb.ES_HOST = config("findex:elasticsearch:host")
from sqlalchemy_zdb import ZdbColumn
from sqlalchemy_zdb.types import FULLTEXT

from sqlalchemy.orm import relationship, backref
from sqlalchemy.orm.attributes import flag_modified
from sqlalchemy import (Integer, String, Boolean, DateTime, BigInteger, Index,
                        TIMESTAMP, ForeignKey, Table, Column, SMALLINT, ARRAY)
from sqlalchemy_utils import IPAddressType, force_auto_coercion
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy_json import MutableJson

from findex_common.static_variables import ResourceStatus, FileProtocols, FileCategories
from findex_common.crawl import make_resource_search_column
from findex_common.utils import random_str
from findex_common.utils_time import TimeMagic
from findex_common import static_variables
from findex_gui.bin.utils import Extended
from findex_gui.controllers.auth.auth import AuthUser, get_current_user_data
from findex_gui.controllers.user.roles import RolesType

# Declarative base class, created under the name "Model".
BASE = declarative_base(name="Model")
# Enable sqlalchemy_utils automatic coercion; runs once at import time.
force_auto_coercion()
Exemple #16
0
    def get_resources(uid: int = None,
                      name: str = None,
                      address: str = None,
                      port: int = None,
                      limit: int = None,
                      offset: int = None,
                      by_owner: int = None,
                      search: str = None,
                      protocol: int = None,
                      scheduled: bool = None,
                      order_by: str = None):
        """
        Fetches resources matching the given filters. A filter is applied
        only when its argument has the expected type (and, for strings and
        limit/offset, is non-empty / non-zero).

        :param uid: resource id
        :param name: name of the server the resource belongs to
        :param address: address of the server the resource belongs to
        :param port: resource port
        :param limit: maximum number of rows
        :param offset: number of rows to skip
        :param by_owner: id of the user that created the resource
        :param search: performs a fulltext search on column 'search' which
            includes: IP/DOMAIN, NAME, DISPLAY_URL, PROTOCOL
        :param protocol: resource protocol
        :param scheduled: True selects resources whose next crawl date has
            passed; False selects resources that have a crawl end date
        :param order_by: name of a Resource column, sorted descending
        :return: list of matching resources
        :raises Exception: when `address` or `name` matches no server
        """
        def _server_id(criterion):
            # Resolve the first server matching the criterion or fail loudly.
            found = Server.query.filter(criterion).first()
            if not found:
                raise Exception("Could not find server")
            return found.id

        # normal sqla or zdb?
        use_zdb = search and config("findex:elasticsearch:enabled")
        if use_zdb:
            q = ZdbQuery(Resource, session=db.session)
        else:
            q = db.session.query(Resource)

        # plain integer equality filters
        equality_filters = (
            (by_owner, Resource.created_by_id),
            (uid, Resource.id),
            (protocol, Resource.protocol),
        )
        for value, column in equality_filters:
            if isinstance(value, int):
                q = q.filter(column == value)

        if isinstance(scheduled, bool):
            criterion = (Resource.date_crawl_next <= datetime.now()
                         if scheduled else
                         Resource.date_crawl_end.isnot(None))
            q = q.filter(criterion)

        if isinstance(address, str) and address:
            q = q.filter(
                Resource.server_id == _server_id(Server.address == address))

        if isinstance(port, int):
            q = q.filter(Resource.port == port)

        if isinstance(search, str) and search:
            q = q.filter(Resource.search.like(search))

        if isinstance(name, str) and name:
            q = q.filter(
                Resource.server_id == _server_id(Server.name == name))

        if isinstance(order_by, str):
            q = q.order_by(desc(getattr(Resource, order_by)))

        if offset and isinstance(offset, int):
            q = q.offset(offset)

        if limit and isinstance(limit, int):
            q = q.limit(limit)

        return q.all()
Exemple #17
0
def create_app():
    """Create and configure the Flask application.

    Rebinds the module-level globals `app`, `auth`, `babel`, `locales` and
    `themes`, bootstraps the database and imports the route and API modules.

    :return: the configured Flask application
    """
    global app, auth, babel, locales, themes
    app = Flask(import_name=__name__,
                static_folder=None,
                template_folder='themes')

    # setup config
    app.config['MAX_CONTENT_LENGTH'] = 1000 * 1024 * 1024
    app.config['SECRET_KEY'] = config("findex:findex:secret_token")
    app.config['dir_base'] = os.path.dirname(os.path.abspath(__file__))
    # parent directory of dir_base
    app.config['dir_root'] = '/'.join(app.config['dir_base'].split('/')[:-1])
    app.config['APPLICATION_ROOT'] = config("findex:findex:application_root")
    app.config['TEMPLATES_AUTO_RELOAD'] = config("findex:findex:debug")
    app.config['PIP_FREEZE'] = []
    # NOTE(review): this local is never read again inside this function.
    SECRET_KEY = config("findex:findex:secret_token")

    # ISO 8601 datetimes
    from findex_gui.bin.utils import ApiJsonEncoder
    app.json_encoder = ApiJsonEncoder

    from findex_gui.bin.utils import dirty_url_for
    dirty_url_for()

    # setup translations
    babel = Babel(app)
    locales = {'en': 'English', 'nl': 'Nederlands'}

    # init some flask stuff
    # NOTE(review): these modules are imported for their import-time effects
    # only (nothing is bound from them) — presumably they register handlers
    # on `app`; confirm before reordering.
    import findex_gui.bin.utils
    import findex_gui.controllers.routes.static
    import findex_gui.controllers.routes.errors
    import findex_gui.controllers.routes.before_request

    # init user authentication
    from findex_gui.controllers.auth.auth import Auth
    import hashlib
    auth = Auth(app)
    auth.user_timeout = 604800  # 7 days, in seconds
    auth.hash_algorithm = hashlib.sha256

    # bootstrap db with default values
    db.bootstrap()

    from findex_gui.bin.themes import ThemeController
    themes = ThemeController()

    # init routes
    # Each import rebinds the local name `routes`; only the import itself
    # matters here.
    from findex_gui.controllers.search import routes
    from findex_gui.controllers.browse import routes
    from findex_gui.controllers.relay import routes
    from findex_gui.controllers.user import routes
    from findex_gui.controllers.meta import routes
    from findex_gui.controllers.news import routes
    from findex_gui.controllers.admin import routes
    from findex_gui.controllers.admin.amqp import routes
    from findex_gui.controllers.admin.server import routes
    from findex_gui.controllers.admin.status import routes
    from findex_gui.controllers.admin.scheduler import routes

    @app.route("/")
    def root():
        """wait for url_for to get monkey patched before trying to import it"""
        from flask import url_for
        return redirect(url_for("news_home"))

    # Same pattern as the route modules above: `api` is rebound each time.
    from findex_gui.controllers.search import api
    from findex_gui.controllers.session import api
    from findex_gui.controllers.user import api
    from findex_gui.controllers.browse import api
    from findex_gui.controllers.resources import api
    from findex_gui.controllers.meta import api
    from findex_gui.controllers.news import api
    from findex_gui.controllers.admin import api
    from findex_gui.controllers.nmap import api
    from findex_gui.controllers.amqp import api
    from findex_gui.controllers.admin.status import api
    from findex_gui.controllers.admin.scheduler import api
    from findex_gui.controllers.admin.logs import api

    return app
Exemple #18
0
    def bootstrap(self):
        """Prepare the database for use.

        Enables the required Postgres extensions, creates all tables, checks
        that the ZomboDB type/index state matches the
        `findex:elasticsearch:enabled` setting, and inserts the default
        users and the "Default" resource group when they are missing.

        :raises DatabaseException: when the type/index state does not match
            the ElasticSearch setting
        """
        rebuild_hint = ("Try the following SQL to rebuild the table:\n"
                        "\tDROP TYPE type_files CASCADE;\n"
                        "\tDROP TABLE files;\n")

        # check necessary postgres extensions
        trgm_error = ("Postgres extension \"pg_trgm\" installed but "
                      "could not be enabled, "
                      "possibly missing administrator rights to enable "
                      "pg_trgm: `CREATE EXTENSION pg_trgm;`")
        self.create_extension(extension="pg_trgm",
                              msg_on_activate_error=trgm_error)
        if config("findex:elasticsearch:enabled"):
            zombodb_error = ("Postgres extension \"zombodb\" installed but "
                             "could not be enabled.")
            self.create_extension(extension="zombodb",
                                  msg_on_activate_error=zombodb_error)

        # create the tables, types and indexes
        BASE.metadata.create_all(bind=self.engine)

        if not config("findex:elasticsearch:enabled"):
            # a leftover zombodb index must be gone when ES is disabled
            if self.check_index(table_name="files", index="idx_zdb_files"):
                raise DatabaseException(
                    "Please remove the index `idx_zdb_files` before "
                    "using findex without ES enabled:\n"
                    "\tDROP INDEX idx_zdb_files\n"
                    "\tcurl -XDELETE <es_host> db.schema.table.index")
        else:
            # check required types for es
            if not self.check_type(type_name="type_files"):
                raise DatabaseException(
                    "Postgres type `type files` not found. " + rebuild_hint)
            # check if the zombodb index is present
            if not self.check_index(table_name="files", index="idx_zdb_files"):
                raise DatabaseException(
                    "Postgres index `idx_zdb_files` not found "
                    "while ElasticSearch was enabled.\n" + rebuild_hint)

        from findex_gui.controllers.user.user import UserController
        from findex_gui.controllers.user.roles import default_anon_roles
        from findex_gui.controllers.resources.resources import ResourceController

        # add some default users, groups and tasks to the database
        root_exists = UserController.user_view(username="******")
        if not root_exists:
            UserController.user_add(
                username="******",
                password=config("findex:users:default_root_password"),
                removeable=False,
                admin=True,
                skip_authorization=True)

        anon_exists = UserController.user_view(username="******")
        if not anon_exists:
            UserController.user_add(
                username="******",
                password=config("findex:users:default_anon_password"),
                privileges=default_anon_roles,
                removeable=False,
                skip_authorization=True)

        default_group = ResourceController.get_resource_group(name="Default")
        if not default_group:
            ResourceController.add_resource_group(
                name="Default",
                description="Default group",
                removable=False,
                skip_authorization=True,
                log_error=False,
                ignore_constraint_conflict=True)