def view_config(ctx):
    """Print the main Findex configuration values to stdout.

    :param ctx: context argument supplied by the caller; not used here.
    """
    from findex_gui.bin.config import config

    print("config location: %s" % cwd())

    application_root = config("findex:findex:application_root")
    print("application_root: %s" % application_root)

    debug_flag = config("findex:findex:debug")
    print("debug: %s" % str(debug_flag))

    async_flag = config("findex:findex:async")
    print("async: %s" % async_flag)

    connection = config("findex:database:connection")
    print("database: %s" % connection)
def web(ctx, args, host, port, uwsgi, nginx):
    """Operate the Findex Web Interface.

    Changes the working directory to the ``findex_gui`` package, exports
    ``FINDEX_APP`` / ``FINDEX_CWD`` and then serves the application, either
    through gevent's WSGIServer (when ``findex:findex:async`` is set) or
    through Flask's built-in server. Any exception is reported and the
    process exits with status 1.

    @TODO: figure out uwsgi help thingy

    :param ctx: not used in this function.
    :param args: not used in this function.
    :param host: address to bind to.
    :param port: port to bind to.
    :param uwsgi: not used in this function.
    :param nginx: only checked; the branch is still a placeholder.
    """
    logo(version)

    if nginx:
        # @TODO: nginx help msg here
        pass

    # The Web Interface uses local imports here and there, so switch to the
    # package directory and make the current path importable.
    # TODO Rename local imports to either Findex.web.* or relative imports.
    package_dir = findex_gui.__path__[0]
    os.chdir(package_dir)
    sys.path.insert(0, ".")

    os.environ["FINDEX_APP"] = "web"
    os.environ["FINDEX_CWD"] = cwd()

    def serve_blocking(debug):
        # Flask's own development server.
        from findex_gui.web import create_app
        application = create_app()
        application.run(debug=debug, host=host, port=port,
                        use_reloader=False)

    def serve_gevent():
        # Monkey patching has to happen before the application is imported.
        from gevent import monkey
        monkey.patch_all()
        from gevent.pywsgi import WSGIServer
        from findex_gui.web import create_app

        application = create_app()
        server = WSGIServer((host, port), application)
        print(
            green(" * Running on http://%s:%s/ (Press CTRL+C to quit)")
            % (host, str(port)))
        server.serve_forever()

    try:
        debug_enabled = config("findex:findex:debug")
        if config("findex:findex:async"):
            serve_gevent()
        else:
            serve_blocking(debug_enabled)
    except Exception as exc:
        message = red("{0}: {1}".format(exc.__class__.__name__, exc))
        if log.handlers:
            log.critical(message)
        else:
            sys.stderr.write("{0}\n".format(traceback.format_exc()))
        sys.exit(1)
def check_specific_config(filename):
    """Run a strict check on every entry of one configuration file.

    Sections named ``*`` / ``__star__`` are skipped, and so is any section
    whose ``enabled`` value is exactly ``False``.

    :param filename: key into ``Config.configuration``.
    """
    wildcard_sections = ("*", "__star__")

    for section, entries in Config.configuration[filename].items():
        if section in wildcard_sections:
            continue

        # If an enabled field is present, check it beforehand.
        enabled = config("%s:%s:enabled" % (filename, section))
        if enabled is False:
            continue

        for key, _ in entries.items():
            lookup = "%s:%s:%s" % (filename, section, key)
            config(lookup, check=True, strict=True)
def render(self, template_path, theme=None, status_code=200, **kwargs):
    """Render ``<theme>/templates/<template_path>.html``.

    :param template_path: template name, without the ``.html`` suffix.
    :param theme: theme to use; falls back to ``self.get_active()``.
    :param status_code: status returned alongside a successful render.
    :param kwargs: extra template variables; ``env`` and ``user`` are
        overwritten here.
    :return: ``(rendered_template, status_code)`` on success, or a plain
        error string when the template is missing or rendering fails.
    """
    if not theme:
        theme = self.get_active()

    # @TO-DO: use a context processor
    env = {}
    for name in app.config:
        if name.islower():
            env[name] = app.config[name]
    env["application_root"] = app.config["APPLICATION_ROOT"]
    kwargs["env"] = env

    user = UserController.get_current_user()
    user_context = get_current_user_data()

    if user_context:
        # Keep the session locale in sync with the user's locale.
        stored_locale = session.get("locale")
        if not stored_locale or stored_locale != user.locale:
            session["locale"] = user.locale

    kwargs["user"] = user

    try:
        template_name = "%s/templates/%s.html" % (theme, template_path)
        rendered = render_template(template_name, url_for=url_for, **kwargs)
        return rendered, status_code
    except TemplateNotFound as e:
        return "Template \"%s\" not found" % str(e)
    except Exception as ex:
        print(ex)
        # Only expose the error text when running in debug mode.
        if config("findex:findex:debug"):
            return "Jinja2 error!\n\n%s" % str(ex)
        return "Jinja2 error!"
def generate_crawl_config(ctx):
    """Print a crawler ``settings.py`` derived from the database DSN.

    The DSN is read from ``findex:database:connection`` and split into
    user, password, host, port and database name. When the DSN carries no
    port, 5432 is used.

    :param ctx: context argument supplied by the caller; not used here.
    """
    logo(version)

    from urllib.parse import urlsplit

    from findex_gui.bin.config import generate_crawl_config
    from findex_common.utils import random_str

    # Parse the DSN with the standard library instead of splitting on
    # ":" / "@" by hand: that broke on passwords containing those
    # characters and handed the host name to the crawler as its password.
    dsn = urlsplit(config("findex:database:connection"))

    db_user = dsn.username
    db_pass = dsn.password
    db_host = dsn.hostname
    db_port = dsn.port if dsn.port is not None else 5432
    # The path is "/<database>"; drop the leading slash.
    db_name = dsn.path.lstrip("/")

    crawl_config = generate_crawl_config(
        bot_name="bot_%s" % random_str(8),
        db_host=db_host,
        db_port=db_port,
        db_name=db_name,
        db_user=db_user,
        db_pass=db_pass,
        db_max_bulk_inserts=1000)

    print("Save the following as `settings.py`")
    print("=" * 26)
    print(crawl_config)
    print("=" * 26)
def database():
    """Collect database status entries.

    :return: OrderedDict with the DSN, the server encoding (flagged as a
        warning when it does not contain "UTF8"), the individual DSN
        components and the on-disk size.
    """
    dsn = config("findex:database:connection")

    report = OrderedDict()
    report["dsn (RFC-1738)"] = _item(data=dsn, cls="info")

    encoding = DatabaseStatus.raw_query("SHOW SERVER_ENCODING", cls="ok")
    if "UTF8" not in encoding.data:
        encoding.cls = "warning"
    report["encoding"] = encoding

    parsed = dsnparse.parse(dsn)
    # Fall back to 5432 when the DSN does not carry an integer port.
    port = parsed.port if isinstance(parsed.port, int) else 5432
    components = (
        ("user", parsed.username),
        ("pass", parsed.password),
        ("host", parsed.host),
        ("port", port),
        ("db", parsed.paths[0]),
    )
    for name, value in components:
        report["db_%s" % name] = _item(data=value, cls="ok")

    report["Size on Disk"] = DatabaseStatus.get_size()
    return report
def after_request(r):
    """Add response headers to every outgoing response.

    ``Accept-Ranges: bytes`` is always added. When ``findex:findex:debug``
    is enabled, caching is switched off through ``Cache-Control``,
    ``Pragma`` and ``Expires``.

    :param r: the response object; modified in place.
    :return: the same response object.
    """
    r.headers.add('Accept-Ranges', 'bytes')
    if config("findex:findex:debug"):
        # Cache-Control is set exactly once: assigning it a second time
        # would replace this no-store policy with whatever came last.
        r.headers["Cache-Control"] = "no-cache, no-store, must-revalidate"
        r.headers["Pragma"] = "no-cache"
        r.headers["Expires"] = "0"
    return r
def __init__(self):
    """Prepare the Postgres connection state.

    Reads the DSN from ``findex:database:connection`` and creates a
    ``QueuePool`` that obtains its connections from ``self._getconn``.
    ``engine`` and ``session`` start out as ``None``.
    """
    self.engine = None
    self.session = None
    self.dsn = config("findex:database:connection")

    pool_options = {
        "creator": self._getconn,
        "max_overflow": 1,
        "pool_size": 300,
        "echo": False,  # config("findex:findex:debug")
    }
    self.pool = pool.QueuePool(**pool_options)
def findex():
    """Collect general Findex status entries.

    :return: OrderedDict with version, debug flag, config location,
        application root, gevent state, user count, availability of the
        'DIRECT' crawl mode and the scheduler cronjob state.
    """
    status = OrderedDict()
    status["Version"] = _item(cls="info", data=version)
    status["Debug"] = _item(str(config("findex:findex:debug")))
    status["Config Location"] = _item(cwd())
    status["Application Root"] = _item(
        config("findex:findex:application_root"))

    if is_gevent_monkey_patched():
        async_label = "True"
    else:
        async_label = "False"
    status["Async mode (Gevent monkey patch)"] = _item(async_label)

    status["No. Findex Users"] = FindexStatus.findex_get_nousers()

    # can_crawl() signals a problem by raising; its message is shown as-is.
    try:
        Crawler.can_crawl()
        direct_mode = _item("Available", cls="ok")
    except Exception as ex:
        direct_mode = _item(str(ex), cls="error")
    status["'DIRECT' crawl mode"] = direct_mode

    if CronController.has_cronjob():
        cron_text, cron_cls = "Set", "info"
    else:
        cron_text = "Not set, please activate it (Scheduler->Overview)"
        cron_cls = "error"
    status["Scheduler cronjob"] = _item(cron_text, cls=cron_cls)

    return status
def test_db(ctx):
    """Check that the configured database is reachable and answers a query.

    Connects with the DSN from ``findex:database:connection``, runs
    ``SELECT 1`` and prints exactly one outcome: a connection error, a
    database error, or "Database OK".

    :param ctx: context argument supplied by the caller; not used here.
    """
    import psycopg2

    dsn = config("findex:database:connection")

    try:
        conn = psycopg2.connect(dsn)
    except Exception:
        print(red("Could not connect to the database via \"%s\"" % dsn))
        return

    try:
        cur = conn.cursor()
        cur.execute("""SELECT 1;""")
        row = cur.fetchone()
        # Explicit check rather than `assert`, which is stripped under -O.
        if row is None or row[0] != 1:
            raise ValueError("unexpected result for SELECT 1: %r" % (row,))
    except Exception:
        print(red("Database Error"))
        # Stop here; otherwise "Database OK" would be printed as well.
        return
    finally:
        conn.close()

    print(green("Database OK"))
def log_msg(msg: str, category: str, level: int = 1):
    """Log a message to stdout and to the ``Logging`` table.

    DEBUG messages (level 0) are dropped unless ``findex:findex:debug`` is
    enabled. Messages with an unknown category are rejected with a note on
    stderr.

    :param msg: the message text
    :param category: one of "scheduler" or "meta_import"
    :param level: 0: DEBUG, 1: INFO, 2: WARNING, 3: ERROR
    :return: None
    """
    from findex_gui.web import db
    from findex_gui.orm.models import Logging
    from findex_gui.bin.config import config

    if not config("findex:findex:debug") and level == 0:
        return

    categories = ("scheduler", "meta_import")
    if category not in categories:
        sys.stderr.write(
            "cant log category %s - not in categories\n" % category)
        return

    level_names = ("DEBUG", "INFO", "WARNING", "ERROR")
    print("[%s] %s" % (level_names[level], msg))

    # Record where the call came from as "<last 3 path parts>:<function>".
    try:
        caller = sys._getframe(1).f_code
        filename = "/".join(caller.co_filename.split("/")[-3:])
        origin = "%s:%s" % (filename, caller.co_name)
    except (AttributeError, ValueError):
        # No caller frame available; store the entry without an origin.
        origin = None

    entry = Logging()
    entry.file = origin
    entry.message = msg
    entry.log_level = level
    entry.category = category

    db.session.add(entry)
    # commit() flushes pending changes itself; no separate flush() needed.
    db.session.commit()
def get_size():
    """Report the size of the configured database.

    Queries ``pg_database_size`` for the database named in the DSN and,
    when that succeeds, appends the server's data directory to the result.

    :return: the result object of the size query. Its ``cls`` is set to
        "info" when the data directory could be fetched too; a failed size
        query is returned unchanged.
    """
    parsed = dsnparse.parse(config("findex:database:connection"))
    db_name = parsed.paths[0]

    size_sql = (
        " SELECT pg_size_pretty(pg_database_size(pg_database.datname))"
        " AS size FROM pg_database WHERE datname=:db_name; "
    )
    size = DatabaseStatus.raw_query(size_sql, {"db_name": db_name})
    if size.cls != "ok":
        return size

    data_dir = DatabaseStatus.raw_query("show data_directory;")
    if data_dir.cls == "ok":
        size.cls = "info"
        size.data += " @ %s" % data_dir.data
    else:
        size.data += " (error fetching data_dir)"
    return size
def _search(**kwargs):
    """Build and run the file search query described by ``kwargs``.

    Keys read from ``kwargs`` (``file_type`` and ``file_categories`` are
    optional; the others are accessed by subscript and must be present):

    - ``key``: search term, normalised by CrawlController.make_valid_key.
    - ``file_type``: may contain "folders" or "files". For folders the
      size, category and extension filters are skipped.
    - ``file_size``: "min-max" string where either side may be "*".
    - ``file_categories``: iterable of category names, or None.
    - ``file_extensions``: iterable of extensions; a leading dot is dropped.
    - ``meta_movie_id``: only applied when it is an int.
    - ``autocomplete``: truthy for autocomplete mode (prefix match, 5 rows).
    - ``page``: value passed to ``offset()``.
    - ``per_page``: row limit outside autocomplete mode.

    :return: the rows, passed through
        SearchController.assign_resource_objects.
    :raises Exception: when the key is invalid or the query fails.
    """
    key = CrawlController.make_valid_key(kwargs["key"])
    if not key:
        raise Exception("Invalid search. Too short?")

    use_es = config("findex:elasticsearch:enabled")
    q = ZdbQuery(Files, session=db.session) if use_es else Files.query

    # @TODO: filter by protocols / hosts

    # Filters that are skipped for the current file type.
    ignore_filters = []

    # filter only files/dirs
    file_type = kwargs.get("file_type")
    if file_type:
        if "folders" in file_type:
            q = q.filter(Files.file_isdir == True)  # noqa: E712
            ignore_filters.extend(
                ("file_size", "file_categories", "file_extensions"))
        elif "files" in file_type:
            q = q.filter(Files.file_isdir == False)  # noqa: E712

    # size
    if kwargs["file_size"] and "file_size" not in ignore_filters:
        # Best effort: a malformed size range is ignored instead of
        # failing the whole search.
        try:
            bounds = kwargs["file_size"].split("-")
            if len(bounds) == 2:
                low, high = bounds
                if low == "*":
                    q = q.filter(Files.file_size <= int(high))
                elif high == "*":
                    q = q.filter(Files.file_size >= int(low))
                else:
                    q = q.filter(
                        Files.file_size.between(int(low), int(high)))
        except (AttributeError, TypeError, ValueError):
            pass

    # filter categories
    filecategories = FileCategories()
    cats = kwargs.get("file_categories", [])
    cats = [] if cats is None else cats
    cat_ids = []
    for cat in cats:
        cat_id = filecategories.id_by_name(cat)
        if cat_id is not None:
            cat_ids.append(cat_id)

    if cat_ids and "file_categories" not in ignore_filters:
        q = q.filter(Files.file_format.in_(cat_ids))

    # filter extensions
    if kwargs["file_extensions"] and "file_extensions" not in ignore_filters:
        exts = [ext[1:] if ext.startswith(".") else ext
                for ext in kwargs["file_extensions"]]
        q = q.filter(Files.file_ext.in_(exts))

    if isinstance(kwargs["meta_movie_id"], int):
        q = q.filter(Files.meta_movie_id == kwargs["meta_movie_id"])

    # Search
    if use_es:
        val = key
    elif kwargs["autocomplete"] or app.config["db_file_count"] > 5000000:
        # Prefix match only, for autocomplete and for very large tables.
        print("warning: too many rows, enable ElasticSearch")
        val = "%s%%" % escape_like(key)
    else:
        val = "%%%s%%" % escape_like(key)

    if val != "*":
        q = q.filter(Files.searchable.like(val))

    q = q.order_by(Files.file_size.desc())

    # pagination
    q = q.offset(kwargs["page"])
    if kwargs["autocomplete"]:
        q = q.limit(5)
        # q = q.distinct(func.lower(Files.file_name))
        q = q.distinct(Files.file_size)
    else:
        q = q.limit(kwargs["per_page"])

    # fetch
    try:
        results = q.all()
    except Exception as ex:
        raise Exception(ex) from ex

    results = SearchController.assign_resource_objects(results)
    return results
def spawn(tasks, queue_size=5):
    """
    Spawns the crawler in 'DIRECT' mode. The suggested way is to use AMQP
    instead.

    The crawl messages for ``tasks`` are written to a temporary JSON file.
    Its path and the database settings from the DSN are handed to the
    crawler through ``FINDEX_CRAWL_*`` environment variables, and the
    crawler is started in the background without waiting for it.

    :param tasks: tasks to build crawl messages for; tasks that yield no
        message are skipped
    :param queue_size: exported as ``FINDEX_CRAWL_QUEUE_SIZE``
    :return: None
    """
    path_fincrawl = "%s/findex-crawl/" % python_env["project_root"]
    log_msg("Spawning local crawler in DIRECT mode for %d tasks" % len(tasks),
            category="scheduler")

    # construct tasks json file
    messages = (CrawlController.crawl_message_make(t) for t in tasks)
    blobs = [message for message in messages if message]

    try:
        crawl_messages = json.dumps(blobs, indent=4, sort_keys=True)
        crawl_fd, crawl_file = tempfile.mkstemp("_fincrawl.json")
    except Exception as ex:
        log_msg(":%s" % (str(ex)), level=3, category="scheduler")
        return

    print(crawl_messages)

    # write tmp tasks file for the crawler; reuse the descriptor returned
    # by mkstemp so that it is closed instead of leaked
    with os.fdopen(crawl_fd, "w") as f:
        f.write(crawl_messages)

    dsn = dsnparse.parse(config("findex:database:connection"))
    dsn_blob = {
        "user": dsn.username,
        "pass": dsn.password,
        "host": dsn.host,
        "port": 5432 if not isinstance(dsn.port, int) else dsn.port,
        "db": dsn.paths[0]
    }

    # set env variables
    shell_env = os.environ.copy()  # should include the current python virtualenv
    shell_env["FINDEX_CRAWL_MODE"] = "DIRECT"
    shell_env["FINDEX_CRAWL_FILE"] = crawl_file
    shell_env["FINDEX_CRAWL_FILE_CLEANUP"] = ":-D"
    shell_env["FINDEX_CRAWL_LOG_VERBOSITY"] = "20"
    shell_env["FINDEX_CRAWL_QUEUE_SIZE"] = str(queue_size)

    for k, v in dsn_blob.items():
        shell_env["FINDEX_CRAWL_DB_%s" % k.upper()] = str(v)

    # NOTE(review): this also prints FINDEX_CRAWL_DB_PASS to stdout.
    for k, v in shell_env.items():
        if k.startswith("FINDEX_CRAWL"):
            print("export %s=\"%s\"" % (k, str(v)))

    # non-blocking Popen
    # for some reason `/bin/bash -c` is needed, else it cant find the
    # relative `rpc.py`, even though `cwd=` is set
    command = [
        "/bin/bash", "-c",
        "%s/twistd -ony rpc.py &" % os.path.dirname(python_env["interpreter"])
    ]

    print("spawning shell: %s" % " ".join(command))
    subprocess.Popen(command, cwd=path_fincrawl, env=shell_env,
                     stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
                     universal_newlines=True, preexec_fn=os.setpgrp)
import re
import uuid
from datetime import datetime, timedelta

from flask import request
import humanfriendly
import sqlalchemy_zdb
from findex_gui.bin.config import config
# ES_HOST is assigned before anything else is imported from sqlalchemy_zdb.
# NOTE(review): presumably those imports read ES_HOST at import time, so
# keep this ordering — confirm against sqlalchemy_zdb.
sqlalchemy_zdb.ES_HOST = config("findex:elasticsearch:host")
from sqlalchemy_zdb import ZdbColumn
from sqlalchemy_zdb.types import FULLTEXT
from sqlalchemy.orm import relationship, backref
from sqlalchemy.orm.attributes import flag_modified
from sqlalchemy import (Integer, String, Boolean, DateTime, BigInteger, Index,
                        TIMESTAMP, ForeignKey, Table, Column, SMALLINT, ARRAY)
from sqlalchemy_utils import IPAddressType, force_auto_coercion
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy_json import MutableJson

from findex_common.static_variables import ResourceStatus, FileProtocols, FileCategories
from findex_common.crawl import make_resource_search_column
from findex_common.utils import random_str
from findex_common.utils_time import TimeMagic
from findex_common import static_variables
from findex_gui.bin.utils import Extended
from findex_gui.controllers.auth.auth import AuthUser, get_current_user_data
from findex_gui.controllers.user.roles import RolesType

# Declarative base class (named "Model") for the ORM models.
BASE = declarative_base(name="Model")
# Module-level side effect: turns on sqlalchemy_utils' automatic coercion.
force_auto_coercion()
def get_resources(uid: int = None, name: str = None, address: str = None,
                  port: int = None, limit: int = None, offset: int = None,
                  by_owner: int = None, search: str = None,
                  protocol: int = None, scheduled: bool = None,
                  order_by: str = None):
    """
    Fetches some resources. Every argument is an optional filter that is
    only applied when it has the expected type.

    :param uid: resource id
    :param name: name of the server the resource belongs to
    :param address: address of the server the resource belongs to
    :param port: resource port
    :param limit: maximum number of rows
    :param offset: number of rows to skip
    :param by_owner: id of the user that created the resource
    :param protocol: resource protocol
    :param order_by: name of a Resource column; sorted descending
    :param search: performs a fulltext search on column 'search' which
        includes: IP/DOMAIN, NAME, DISPLAY_URL, PROTOCOL
    :param scheduled: True selects resources whose next crawl date has
        passed, False selects resources that have a crawl end date
    :return: list of matching resources
    :raises Exception: when ``address`` or ``name`` matches no server
    """
    def _server_id(criterion):
        # Resolve a server through the given criterion, or fail loudly.
        server = Server.query.filter(criterion).first()
        if not server:
            raise Exception("Could not find server")
        return server.id

    # normal sqla or zdb?
    if search and config("findex:elasticsearch:enabled"):
        query = ZdbQuery(Resource, session=db.session)
    else:
        query = db.session.query(Resource)

    if isinstance(by_owner, int):
        query = query.filter(Resource.created_by_id == by_owner)

    if isinstance(uid, int):
        query = query.filter(Resource.id == uid)

    if isinstance(protocol, int):
        query = query.filter(Resource.protocol == protocol)

    if isinstance(scheduled, bool):
        if scheduled:
            criterion = Resource.date_crawl_next <= datetime.now()
        else:
            criterion = Resource.date_crawl_end.isnot(None)
        query = query.filter(criterion)

    if isinstance(address, str) and address:
        query = query.filter(
            Resource.server_id == _server_id(Server.address == address))

    if isinstance(port, int):
        query = query.filter(Resource.port == port)

    if isinstance(search, str) and search:
        query = query.filter(Resource.search.like(search))

    if isinstance(name, str) and name:
        query = query.filter(
            Resource.server_id == _server_id(Server.name == name))

    if isinstance(order_by, str):
        column = getattr(Resource, order_by)
        query = query.order_by(desc(column))

    if offset and isinstance(offset, int):
        query = query.offset(offset)

    if limit and isinstance(limit, int):
        query = query.limit(limit)

    return query.all()
def create_app():
    """Create and configure the Flask application.

    Rebinds the module-level globals ``app``, ``auth``, ``babel``,
    ``locales`` and ``themes``, bootstraps the database and imports the
    route and API modules.

    :return: the configured Flask application.
    """
    global app, auth, babel, locales, themes
    app = Flask(import_name=__name__, static_folder=None, template_folder='themes')

    # setup config
    app.config['MAX_CONTENT_LENGTH'] = 1000 * 1024 * 1024
    app.config['SECRET_KEY'] = config("findex:findex:secret_token")
    app.config['dir_base'] = os.path.dirname(os.path.abspath(__file__))
    # dir_root is dir_base with its last path component removed.
    app.config['dir_root'] = '/'.join(app.config['dir_base'].split('/')[:-1])
    app.config['APPLICATION_ROOT'] = config("findex:findex:application_root")
    app.config['TEMPLATES_AUTO_RELOAD'] = config("findex:findex:debug")
    app.config['PIP_FREEZE'] = []

    # NOTE(review): this local is never read in this function; the secret
    # token is already stored in app.config['SECRET_KEY'] above.
    SECRET_KEY = config("findex:findex:secret_token")

    # ISO 8601 datetimes
    from findex_gui.bin.utils import ApiJsonEncoder
    app.json_encoder = ApiJsonEncoder

    # NOTE(review): presumably this is what monkey patches url_for (see the
    # docstring of root() below) — confirm in findex_gui.bin.utils.
    from findex_gui.bin.utils import dirty_url_for
    dirty_url_for()

    # setup translations
    babel = Babel(app)
    locales = {'en': 'English', 'nl': 'Nederlands'}

    # init some flask stuff
    # These modules are imported only for their import-time side effects.
    import findex_gui.bin.utils
    import findex_gui.controllers.routes.static
    import findex_gui.controllers.routes.errors
    import findex_gui.controllers.routes.before_request

    # init user authentication
    from findex_gui.controllers.auth.auth import Auth
    import hashlib
    auth = Auth(app)
    auth.user_timeout = 604800  # presumably seconds (7 days) — TODO confirm
    auth.hash_algorithm = hashlib.sha256

    # bootstrap db with default values
    db.bootstrap()

    from findex_gui.bin.themes import ThemeController
    themes = ThemeController()

    # init routes
    # The name `routes` is rebound by every import below and never used;
    # the modules are imported for their side effects only.
    from findex_gui.controllers.search import routes
    from findex_gui.controllers.browse import routes
    from findex_gui.controllers.relay import routes
    from findex_gui.controllers.user import routes
    from findex_gui.controllers.meta import routes
    from findex_gui.controllers.news import routes
    from findex_gui.controllers.admin import routes
    from findex_gui.controllers.admin.amqp import routes
    from findex_gui.controllers.admin.server import routes
    from findex_gui.controllers.admin.status import routes
    from findex_gui.controllers.admin.scheduler import routes

    @app.route("/")
    def root():
        """Redirect to the ``news_home`` endpoint.

        url_for is imported inside the view so that the import happens
        after url_for has been monkey patched.
        """
        from flask import url_for
        return redirect(url_for("news_home"))

    # Same pattern as the route modules above: `api` is rebound by each
    # import and never used.
    from findex_gui.controllers.search import api
    from findex_gui.controllers.session import api
    from findex_gui.controllers.user import api
    from findex_gui.controllers.browse import api
    from findex_gui.controllers.resources import api
    from findex_gui.controllers.meta import api
    from findex_gui.controllers.news import api
    from findex_gui.controllers.admin import api
    from findex_gui.controllers.nmap import api
    from findex_gui.controllers.amqp import api
    from findex_gui.controllers.admin.status import api
    from findex_gui.controllers.admin.scheduler import api
    from findex_gui.controllers.admin.logs import api

    return app
def bootstrap(self):
    """Prepare the database for use by the application.

    Enables the required Postgres extensions, creates all tables, types
    and indexes, verifies that the ZomboDB objects match the
    ``findex:elasticsearch:enabled`` setting and inserts the default
    users and the default resource group when they are missing.

    :raises DatabaseException: when ElasticSearch is enabled but the
        ``type_files`` type or the ``idx_zdb_files`` index is missing, or
        when ElasticSearch is disabled while ``idx_zdb_files`` still
        exists.
    """
    # check necessary postgres extensions
    self.create_extension(
        extension="pg_trgm",
        msg_on_activate_error="Postgres extension \"pg_trgm\" installed but "
                              "could not be enabled, "
                              "possibly missing administrator rights to enable "
                              "pg_trgm: `CREATE EXTENSION pg_trgm;`")

    # Read the setting once; it is needed at several points below.
    es_enabled = config("findex:elasticsearch:enabled")

    if es_enabled:
        self.create_extension(
            extension="zombodb",
            msg_on_activate_error="Postgres extension \"zombodb\" installed but "
                                  "could not be enabled.")

    # create the tables, types and indexes
    BASE.metadata.create_all(bind=self.engine)

    if es_enabled:
        # check required types for es
        if not self.check_type(type_name="type_files"):
            raise DatabaseException(
                "Postgres type `type_files` not found. "
                "Try the following SQL to rebuild the table:\n"
                "\tDROP TYPE type_files CASCADE;\n"
                "\tDROP TABLE files;\n")

        # check if the zombodb index is present
        if not self.check_index(table_name="files", index="idx_zdb_files"):
            raise DatabaseException(
                "Postgres index `idx_zdb_files` not found "
                "while ElasticSearch was enabled.\n"
                "Try the following SQL to rebuild the table:\n"
                "\tDROP TYPE type_files CASCADE;\n"
                "\tDROP TABLE files;\n")
    else:
        # A leftover ZomboDB index must be removed before running without ES.
        if self.check_index(table_name="files", index="idx_zdb_files"):
            raise DatabaseException(
                "Please remove the index `idx_zdb_files` before "
                "using findex without ES enabled:\n"
                "\tDROP INDEX idx_zdb_files\n"
                "\tcurl -XDELETE <es_host> db.schema.table.index")

    # Imported here rather than at module level.
    from findex_gui.controllers.user.user import UserController
    from findex_gui.controllers.user.roles import default_anon_roles
    from findex_gui.controllers.resources.resources import ResourceController

    # add some default users, groups and tasks to the database
    if not UserController.user_view(username="******"):
        UserController.user_add(
            username="******",
            password=config("findex:users:default_root_password"),
            removeable=False,
            admin=True,
            skip_authorization=True)

    if not UserController.user_view(username="******"):
        UserController.user_add(
            username="******",
            password=config("findex:users:default_anon_password"),
            privileges=default_anon_roles,
            removeable=False,
            skip_authorization=True)

    if not ResourceController.get_resource_group(name="Default"):
        ResourceController.add_resource_group(
            name="Default",
            description="Default group",
            removable=False,
            skip_authorization=True,
            log_error=False,
            ignore_constraint_conflict=True)