Example 1
def restart_dataset(key):
	"""
	Run a dataset's query again

	Deletes all underlying datasets, marks dataset as unfinished, and queues a
	job for it.

	:param str key:  Dataset key
	:return:
	"""
	try:
		dataset = DataSet(key=key, db=db)
	except TypeError:
		return error(404, message="Dataset not found.")

	if current_user.get_id() != dataset.parameters.get("user", "") and not current_user.is_admin:
		return error(403, message="Not allowed.")

	if not dataset.is_finished():
		return render_template("error.html", message="This dataset is not finished yet - you cannot re-run it.")

	if "type" not in dataset.parameters:
		return render_template("error.html",
							   message="This is an older dataset that unfortunately lacks the information necessary to properly restart it.")

	for child in dataset.children:
		child.delete()

	dataset.unfinish()
	queue = JobQueue(logger=log, database=db)
	queue.add_job(jobtype=dataset.parameters["type"], remote_id=dataset.key)

	flash("Dataset queued for re-running.")
	return redirect("/results/" + dataset.key + "/")
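Note that `add_job` here is keyword-driven: the dataset's original `type` parameter doubles as the jobtype, and the dataset key is passed as the `remote_id` a worker later uses to find its input. As a rough, self-contained sketch of that contract (a toy stand-in, not 4CAT's real `JobQueue`):

# Toy stand-in illustrating the add_job contract; NOT 4CAT's implementation.
class ToyJobQueue:
    def __init__(self):
        self.jobs = []

    def add_job(self, jobtype, remote_id=None, details=None):
        # a job pairs a worker type with an ID pointing at its input
        job = {"jobtype": jobtype, "remote_id": remote_id, "details": details or {}}
        self.jobs.append(job)
        return job

queue = ToyJobQueue()
queue.add_job(jobtype="example-search", remote_id="dataset-key-123")
print(queue.jobs[0]["jobtype"])  # example-search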
Example 2
    def __init__(self,
                 logger,
                 job,
                 db=None,
                 queue=None,
                 manager=None,
                 modules=None):
        """
		Basic init, just make sure our thread name is meaningful

		:param Database db:  Database connection - if not given, a new one will be created
		:param JobQueue queue: Job Queue - if not given, a new one will be instantiated
		:param WorkerManager manager:  Worker manager reference
		"""
        super().__init__()
        self.name = self.type
        self.log = logger
        self.manager = manager
        self.job = job
        self.init_time = int(time.time())

        # all_modules cannot be easily imported into a worker because all_modules itself
        # imports all workers, so you get a recursive import that Python (rightly) blocks
        # so for workers, all_modules' content is passed as a constructor argument
        self.all_modules = modules

        self.db = Database(logger=self.log,
                           appname=self.type) if not db else db
        self.queue = JobQueue(logger=self.log,
                              database=self.db) if not queue else queue
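The `if not db else db` tails are a create-if-not-given pattern: callers can inject shared connections, and the worker only opens its own when none are passed. A minimal, self-contained sketch of the same idea with stand-in classes:

# Stand-in classes; this illustrates the injection pattern only.
class Connection:
    def __init__(self, label):
        self.label = label

class Worker:
    def __init__(self, db=None, queue=None):
        # fall back to fresh instances only when nothing was injected
        self.db = db if db else Connection("own-db")
        self.queue = queue if queue else Connection("own-queue")

shared = Connection("shared-db")
worker = Worker(db=shared)
assert worker.db is shared                 # reuses the injected connection
assert worker.queue.label == "own-queue"   # created its own queue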
Example 3
def api_status():
    """
	Get service status

	:return: Flask JSON response
	"""

    # get job stats
    queue = JobQueue(logger=log, database=db)
    jobs = queue.get_all_jobs()
    jobs_count = len(jobs)
    jobs_types = {job.data["jobtype"] for job in jobs}
    jobs_sorted = {
        jobtype: len([job for job in jobs if job.data["jobtype"] == jobtype])
        for jobtype in jobs_types
    }
    jobs_sorted["total"] = jobs_count

    # determine if backend is live by checking if the process is running
    lockfile = Path(config.PATH_ROOT, config.PATH_LOCKFILE, "4cat.pid")
    if lockfile.is_file():
        with lockfile.open() as pidfile:
            pid = int(pidfile.read())
            backend_live = pid in psutil.pids()
    else:
        backend_live = False

    response = {
        "code": API_SUCCESS,
        "items": {
            "backend": {
                "live": backend_live,
                "queued": jobs_sorted
            },
            "frontend": {
                "live": True  # duh
            }
        }
    }

    return jsonify(response)
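The liveness check boils down to "is the PID recorded in the lockfile still running?". A self-contained version of just that check, assuming `psutil` is installed:

import os
import psutil

def pid_is_live(pid: int) -> bool:
    # psutil.pids() lists the PIDs of all currently running processes
    return pid in psutil.pids()

print(pid_is_live(os.getpid()))  # True: the current process is running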
Example 4
cli = argparse.ArgumentParser()
cli.add_argument("-i", "--input", type=str, required=True, help="Input folder")
cli.add_argument("-d",
                 "--datasource",
                 type=str,
                 required=True,
                 help="Datasource ID")
cli.add_argument("-b", "--board", type=str, required=True, help="Board name")
args = cli.parse_args()

if not Path(args.input).exists() or not Path(args.input).is_dir():
    print("%s is not a valid folder name." % args.input)
    sys.exit(1)

input = Path(args.input).resolve()
jsons = input.glob("*.json")

print("Initialising queue...")
logger = Logger()
queue = JobQueue(logger=logger,
                 database=Database(logger=logger, appname="queue-folder"))

print("Adding files to queue...")
files = 0
deadline = time.time()
for file in jsons:
    files += 1
    file = str(file)
    queue.add_job(args.datasource + "-thread",
                  remote_id=file,
                  details={
                      "board": args.board,
                      "file": str(file)
                  },
                  claim_after=int(deadline))
    deadline += 0.1
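Because `deadline` advances by 0.1 seconds per file but `claim_after` truncates it with `int()`, roughly ten files become claimable in each successive second, spreading the load rather than releasing the whole folder at once. The arithmetic in isolation:

import time

start = time.time()
# ten 0.1-second steps fit in each whole second, so int() groups
# roughly ten jobs per claimable second
claim_times = [int(start + i * 0.1) for i in range(25)]
print(claim_times)  # e.g. ten equal values, then the next second, ...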
Example 5
def run(as_daemon=True):
    if not as_daemon:
        # centre the banner in the terminal, if it is wide enough
        indent_spaces = round(shutil.get_terminal_size().columns / 2) - 33
        indent = " " * indent_spaces if indent_spaces > 0 else ""
        banner = [
            "+---------------------------------------------------------------+",
            "|                                                               |",
            "|                           welcome to                          |",
            "|                                                               |",
            "|                  j88D   .o88b.  .d8b.  d888888b               |",
            "|                 j8~88  d8P  Y8 d8' `8b `~~88~~'               |",
            "|                j8' 88  8P      88ooo88    88                  |",
            "|                V88888D 8b      88~~~88    88                  |",
            "|                    88  Y8b  d8 88   88    88                  |",
            "|                    VP   `Y88P' YP   YP    YP                  |",
            "|                                                               |",
            "|               4CAT: Capture and Analysis Toolkit              |",
            "|                                                               |",
            "|                                                               |",
            "+---------------------------------------------------------------+",
            "|                  press q + enter to shut down                 |",
            "|                                                               |",
            "| WARNING: Not running as a daemon.  Quitting this process will |",
            "|                 shut down the backend as well.                |",
            "+---------------------------------------------------------------+",
        ]
        print("\n\n")
        for line in banner:
            print(indent + line)
        print("\n")

    # load everything
    log = Logger(output=not as_daemon)
    db = Database(logger=log, appname="main")
    queue = JobQueue(logger=log, database=db)

    # clean up after ourselves
    db.commit()
    queue.release_all()

    # make it happen
    WorkerManager(logger=log, database=db, queue=queue, as_daemon=as_daemon)
    log.info("4CAT Backend shut down.")
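The `round(columns / 2) - 33` indent centres the banner: half the terminal width minus half the banner width (33 ≈ half of the 65-column box). The same arithmetic in a general form, for a box of any width:

import shutil

banner_width = 65  # width of the box drawn above
columns = shutil.get_terminal_size().columns
indent = max((columns - banner_width) // 2, 0)
print(" " * indent + "+" + "-" * (banner_width - 2) + "+")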
Example 6
def process_standalone(processor):
    """
	Run a standalone processor

	This bypasses the usual 4CAT query-processor structure and allows running
	any available processor (see the `/api/get-standalone-processors/`
	endpoint) with one API call. The data is returned immediately and not saved
	server-side.

	Requires authentication.

	:param str processor:  ID of the processor to run on incoming data

	:request-body object data:  Data to process, a JSON-formatted list of
	objects with each object having at least they keys `post_id`,
	`thread_id`, body`, and `author`.

	:request-schema data: {
		type=object,
		properties={
			post_id={type=string},
			thread_id={type=string},
			body={type=string},
			author={type=string}
		}
	}

    :request-param str ?access_token:  Access token; only required if not
                                       logged in currently.

	:return:  A JSON object containing the processed data, with a
	processor-specific structure.

	:return-schema: {
		type=object,
		additionalProperties={}
	}

	:return-error 402: If an invalid processor is requested, or if the input is
	not properly-formatted JSON.
	:return-error 503: If too many other requests are currently being handled,
	so that the server does not have the capacity to deal with this request
	"""
    processors = get_standalone_processors().get_json()

    if processor not in processors:
        return error(402, error="Processor '%s' is not available" % processor)

    if not request.is_json:
        return error(
            402,
            error="This API endpoint only accepts JSON-formatted data as input"
        )

    try:
        input = request.get_json(force=True)
    except json.JSONDecodeError:
        return error(402, error="JSON decoding error")

    # check input integrity: every object needs the required fields
    required = ("id", "thread_id", "body", "author")
    try:
        for row in input:
            for field in required:
                if field not in row:
                    return error(402, error="Input is valid JSON, but not a list of data objects (missing field '%s')" % field)
    except TypeError:
        return error(402, error="Input is valid JSON, but not a list of data objects")

    if not input:
        return error(402, error="Input is empty")

    # ok, valid input!
    temp_dataset = DataSet(extension="csv",
                           type="standalone",
                           parameters={
                               "user": current_user.get_id(),
                               "after": [processor]
                           },
                           db=db)
    temp_dataset.finish(len(input))

    # make sure the file is deleted later, whichever way this request is
    # ultimately handled
    @after_this_request
    def delete_temporary_dataset(response):
        temp_dataset.delete()  # also deletes children!
        return response

    # write the input as a csv file so it can be accessed as normal by
    # processors
    result_file = temp_dataset.get_results_path()
    with result_file.open("w") as temp_csv:
        writer = csv.DictWriter(temp_csv, fieldnames=required)
        writer.writeheader()
        for row in input:
            writer.writerow({field: row[field] for field in required})

    # queue the postprocessor
    metadata = processors[processor]
    processed = DataSet(extension=metadata["extension"],
                        type=processor,
                        parent=temp_dataset.key,
                        db=db)

    queue = JobQueue(database=db, logger=log)
    job = queue.add_job(processor, {}, processed.key)
    place_in_queue = queue.get_place_in_queue(job)
    if place_in_queue > 5:
        job.finish()
        return error(code=503, error="Your request could not be handled as there are currently %i other jobs of this type in the queue. Please try again later." % place_in_queue)

    # wait up to half a minute for the job to be taken up
    # if not, tell the user to try again later
    start = time.time()
    while queue.get_place_in_queue(job) != 0:
        if time.time() > start + 30:
            job.finish()
            return error(code=503, error="The server is currently too busy to handle your request. Please try again later.")
        time.sleep(2)

    # job currently being processed, wait for it to finish
    while True:
        try:
            job = Job.get_by_remote_ID(job.data["remote_id"], db, processor)
        except JobNotFoundException:
            break

        if not job.is_finished:
            time.sleep(2)
        else:
            break

    # job finished, send file - temporary datasets will be cleaned up by
    # after_this_request function defined earlier
    return send_file(processed.get_results_path(), as_attachment=True)
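Both waiting loops above share a poll-with-timeout shape: check a condition, sleep, give up after a deadline. A generic helper capturing that shape (a sketch, not part of 4CAT):

import time

def wait_until(predicate, timeout=30.0, interval=2.0):
    """Poll predicate() until it returns True; give up after timeout seconds."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        if predicate():
            return True
        time.sleep(interval)
    return False

# e.g. wait_until(lambda: queue.get_place_in_queue(job) == 0)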
Example 7
from flask import Flask
from flask_login import LoginManager
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address

import config

from backend.lib.database import Database
from backend.lib.logger import Logger
from backend.lib.queue import JobQueue

# use the test database if the Flask app is configured for test debugging
is_test = hasattr(config.FlaskConfig, "DEBUG") and config.FlaskConfig.DEBUG == "Test"
database_name = config.DB_NAME_TEST if is_test else config.DB_NAME
login_manager = LoginManager()
app = Flask(__name__)
log = Logger()
db = Database(logger=log, dbname=database_name, appname="frontend")
queue = JobQueue(logger=log, database=db)

# initialize openapi endpoint collector for later specification generation
from webtool.lib.openapi_collector import OpenAPICollector
openapi = OpenAPICollector(app)

# initialize rate limiter
limiter = Limiter(app, key_func=get_remote_address)

# make sure a secret key was set in the config file, for secure session cookies
if config.FlaskConfig.SECRET_KEY == "REPLACE_THIS":
    raise Exception(
        "You need to set a FLASK_SECRET in config.py before running the web tool."
    )

# initialize login manager
login_manager.init_app(app)
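With `limiter` in place, individual routes can then be throttled via flask-limiter's decorator; a brief usage sketch (the route and rate string below are illustrative, not from 4CAT):

@app.route("/api/ping/")
@limiter.limit("10 per minute")  # illustrative rate, not 4CAT's actual limit
def ping():
    return "pong"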