Example #1
def queue_dataset():
    """
	Queue a 4CAT search query for processing into a dataset

	Requires authentication by logging in or providing a valid access token.
	Request parameters vary by data source. The ones mandated constitute the
	minimum but more may be required.

	:request-param str board:  Board ID to query
	:request-param str datasource:  Data source ID to query
	:request-param str body_match:  String to match in the post body
	:request-param str subject_match:  String to match in the post subject
    :request-param int min_date:  Timestamp marking the beginning of the match
                                  period
    :request-param int max_date:  Timestamp marking the end of the match period
    :request-param str ?access_token:  Access token; only required if not
                                       logged in currently.

	:return str:  The dataset key, which may be used to later retrieve dataset
	              status and results.
	:return-error 404: If the datasource does not exist.
	"""

    datasource_id = request.form.get("datasource", "")
    if datasource_id not in backend.all_modules.datasources:
        return error(404,
                     message="Datasource '%s' does not exist" % datasource_id)

    search_worker_id = datasource_id + "-search"
    if search_worker_id not in backend.all_modules.workers:
        return error(404,
                     message="Datasource '%s' has no search interface" %
                     datasource_id)

    search_worker = backend.all_modules.workers[search_worker_id]

    if hasattr(search_worker["class"], "validate_query"):
        try:
            sanitised_query = search_worker["class"].validate_query(
                request.form.to_dict(), request, current_user)
        except QueryParametersException as e:
            return "Invalid query. %s" % e
    else:
        sanitised_query = request.form.to_dict()

    sanitised_query["user"] = current_user.get_id()
    sanitised_query["datasource"] = datasource_id
    sanitised_query["type"] = search_worker_id

    dataset = DataSet(parameters=sanitised_query, db=db, type="search")

    if hasattr(search_worker["class"], "after_create"):
        search_worker["class"].after_create(sanitised_query, dataset, request)

    queue.add_job(jobtype=search_worker_id, remote_id=dataset.key)

    return dataset.key
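For orientation, here is a minimal client-side sketch of calling this view with the `requests` library. The server URL, the `/api/queue-query/` route, and the field values are assumptions for illustration; only the parameter names come from the docstring above.

# Minimal client sketch; the server URL and route below are hypothetical.
import requests

API_BASE = "https://4cat.example.com"  # hypothetical 4CAT instance

response = requests.post(API_BASE + "/api/queue-query/", data={
    "datasource": "example-source",  # data source ID (hypothetical value)
    "board": "example-board",        # board ID, as required by the data source
    "body_match": "example",         # string to match in post bodies
    "min_date": 1577836800,          # start of the match period (timestamp)
    "max_date": 1580515200,          # end of the match period (timestamp)
    "access_token": "TOKEN",         # only needed when not logged in
})

# on success, the response body is the dataset key, which can later be used
# to poll for the dataset's status and results
dataset_key = response.text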
Example #2
def queue_processor(key=None, processor=None):
	"""
	Queue a new processor

	Queues the processor for a given dataset; with the returned query key,
	the processor status can then be checked periodically to download the
	result when available.

	Note that apart from the required parameters, further parameters may be
	provided based on the configuration options available for the chosen
	processor. Available options may be found via the
	`/get-available-processors/` endpoint.

	:request-param str key:  Key of dataset to queue processor for
	:request-param str processor:  ID of processor to queue
    :request-param str ?access_token:  Access token; only required if not
                                       logged in currently.

	:return: A list of dataset properties, with each dataset an item with a `key`,
	        whether it had `finished`, a `html` snippet containing details,
	        a `url` at which the result may be downloaded when finished, and a
	        list of `messages` describing any warnings generated while queuing.

	:return-schema: {type=object,additionalProperties={type=object,properties={
		key={type=string},
		finished={type=boolean},
		html={type=string},
		url={type=string},
		messages={type=array,items={type=string}}
	}}}
	"""
	if request.files and "input_file" in request.files:
		input_file = request.files["input_file"]
		if not input_file:
			return jsonify({"error": "No file input provided"})

		if not input_file.filename.endswith(".csv"):
			return jsonify({"error": "File input is not a csv file"})

		# wrap the raw byte stream so the csv module can read it as text
		test_csv_file = csv.DictReader(io.TextIOWrapper(input_file.stream, encoding="utf-8"))
		if "body" not in test_csv_file.fieldnames:
			return jsonify({"error": "File must contain a 'body' column"})

		filename = secure_filename(input_file.filename)
		input_file.stream.seek(0)  # rewind; the header check above consumed the stream
		input_file.save(config.PATH_DATA + "/" + filename)

	elif not key:
		key = request.form.get("key", "")

	if not processor:
		processor = request.form.get("processor", "")

	# cover all bases - can only run processor on "parent" dataset
	try:
		dataset = DataSet(key=key, db=db)
	except TypeError:
		return jsonify({"error": "Not a valid dataset key."})

	# check if processor is available for this dataset
	if processor not in dataset.processors:
		return jsonify({"error": "This processor is not available for this dataset or has already been run."})

	# create a dataset now
	options = UserInput.parse_all(dataset.processors[processor]["options"], request.form.to_dict(), silently_correct=False)
	options["user"] = current_user.get_id()

	analysis = DataSet(parent=dataset.key, parameters=options, db=db,
					   extension=dataset.processors[processor]["extension"], type=processor)
	if analysis.is_new:
		# analysis has not been run or queued before - queue a job to run it
		queue.add_job(jobtype=processor, remote_id=analysis.key)
		job = Job.get_by_remote_ID(analysis.key, database=db)
		analysis.link_job(job)
		analysis.update_status("Queued")
	else:
		flash("This analysis (%s) is currently queued or has already been run with these parameters." %
			  dataset.processors[processor]["name"])

	return jsonify({
		"status": "success",
		"container": "*[data-dataset-key=" + dataset.key + "]",
		"key": analysis.key,
		"html": render_template("result-child.html", child=analysis, dataset=dataset, parent_key=dataset.key,
								processors=backend.all_modules.processors) if analysis.is_new else "",
		"messages": get_flashed_messages(),
		"is_filter": dataset.processors[processor]["is_filter"]
	})
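A similar hedged sketch for this endpoint: the `/api/queue-processor/` route, the server URL, and the key/processor values are assumptions, while the response fields follow the JSON object built above.

# Client sketch; the route, dataset key and processor ID are hypothetical.
import requests

response = requests.post("https://4cat.example.com/api/queue-processor/", data={
    "key": "abcd1234",                 # key of the dataset to process
    "processor": "example-processor",  # see /get-available-processors/
})

result = response.json()
if result.get("status") == "success":
    analysis_key = result["key"]  # poll this key until the analysis finishes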
Example #3
def queue_dataset():
	"""
	Queue a 4CAT search query for processing into a dataset

	Requires authentication by logging in or providing a valid access token.
	Request parameters vary by data source. The ones mandated constitute the
	minimum but more may be required.

	:request-param str board:  Board ID to query
	:request-param str datasource:  Data source ID to query
	:request-param str body_match:  String to match in the post body
	:request-param str subject_match:  String to match in the post subject
    :request-param int min_date:  Timestamp marking the beginning of the match
                                  period
    :request-param int max_date:  Timestamp marking the end of the match period
    :request-param str ?access_token:  Access token; only required if not
                                       logged in currently.

	:return str:  The dataset key, which may be used to later retrieve dataset
	              status and results.
	:return-error 404: If the datasource does not exist.
	"""

	datasource_id = request.form.get("datasource", "")
	if datasource_id not in backend.all_modules.datasources:
		return error(404, message="Datasource '%s' does not exist" % datasource_id)

	search_worker_id = datasource_id + "-search"
	if search_worker_id not in backend.all_modules.workers:
		return error(404, message="Datasource '%s' has no search interface" % datasource_id)

	search_worker = backend.all_modules.workers[search_worker_id]
	worker_class = backend.all_modules.load_worker_class(search_worker)

	if hasattr(worker_class, "validate_query"):
		try:
			# first sanitise values
			sanitised_query = UserInput.parse_all(worker_class.options, request.form.to_dict(), silently_correct=False)

			# then validate for this particular datasource
			sanitised_query = worker_class.validate_query(sanitised_query, request, current_user)
		except QueryParametersException as e:
			return "Invalid query. %s" % e
	else:
		raise NotImplementedError("Data sources MUST sanitise input values with validate_query")

	sanitised_query["user"] = current_user.get_id()
	sanitised_query["datasource"] = datasource_id
	sanitised_query["type"] = search_worker_id

	sanitised_query["pseudonymise"] = bool(request.form.to_dict().get("pseudonymise", False))

	extension = worker_class.extension if hasattr(worker_class, "extension") else "csv"
	dataset = DataSet(parameters=sanitised_query, db=db, type=search_worker_id, extension=extension)

	if hasattr(worker_class, "after_create"):
		worker_class.after_create(sanitised_query, dataset, request)

	queue.add_job(jobtype=search_worker_id, remote_id=dataset.key)

	return dataset.key
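Since this version refuses to queue a query unless the data source implements `validate_query`, a minimal sketch of such a hook may help; the class name and the specific checks are hypothetical, and only the call signature follows the code above.

# Illustrative sketch of a data source's validate_query hook; the class name
# and checks are hypothetical, only the signature matches the caller above.
class ExampleSearch:
    @staticmethod
    def validate_query(query, request, user):
        # require at least one text-matching parameter
        if not query.get("body_match") and not query.get("subject_match"):
            raise QueryParametersException("Provide a body or subject match.")

        # when both date bounds are given, the range must make sense
        if query.get("min_date") and query.get("max_date") \
                and query["min_date"] > query["max_date"]:
            raise QueryParametersException("Start date must precede end date.")

        return query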
Example #4
def process_standalone(processor):
    """
	Run a standalone processor

	This bypasses the usual 4CAT query-processor structure and allows running
	any available processor (see the `/api/get-standalone-processors/`
	endpoint) with one API call. The data is returned immediately and not saved
	server-side.

	Requires authentication.

	:param str processor:  ID of the processor to run on incoming data

	:request-body object data:  Data to process, a JSON-formatted list of
	objects with each object having at least they keys `post_id`,
	`thread_id`, body`, and `author`.

	:request-schema data: {
		type=object,
		properties={
			post_id={type=string},
			thread_id={type=string},
			body={type=string},
			author={type=string}
		}
	}

    :request-param str ?access_token:  Access token; only required if not
                                       logged in currently.

	:return:  A JSON object containing the processed data, with a
	processor-specific structure.

	:return-schema: {
		type=object,
		additionalProperties={}
	}

	:return-error 402: If an invalid processor is requested, or if the input is
	not properly-formatted JSON.
	:return-error 503: If too many other requests are currently being handled,
	so that the server does not have the capacity to deal with this request
	"""
    processors = get_standalone_processors().get_json()

    if processor not in processors:
        return error(402, error="Processor '%s' is not available" % processor)

    if not request.is_json:
        return error(402, error="This API endpoint only accepts JSON-formatted data as input")

    # force=True parses the body regardless of Content-Type; silent=True makes
    # a parse failure return None rather than raising an exception
    input = request.get_json(force=True, silent=True)
    if input is None:
        return error(402, error="JSON decoding error")

    # check file integrity
    required = ("id", "thread_id", "body", "author")
    try:
        for row in input:
            for field in required:
                if field not in row:
                    return error(402, error="Input is valid JSON, but not a list "
                                            "of data objects (missing field '%s')" % field)
    except TypeError:
        return error(402, error="Input is valid JSON, but not a list of data objects")

    if not input:
        return error(402, error="Input is empty")

    # ok, valid input!
    temp_dataset = DataSet(extension="csv",
                           type="standalone",
                           parameters={
                               "user": current_user.get_id(),
                               "after": [processor]
                           },
                           db=db)
    temp_dataset.finish(len(input))

    # make sure the file is deleted later, whichever way this request is
    # ultimately handled
    @after_this_request
    def delete_temporary_dataset(response):
        temp_dataset.delete()  # also deletes children!
        return response

    # write the input as a csv file so it can be accessed as normal by
    # processors
    result_file = temp_dataset.get_results_path()
    # open with newline="" since the csv module handles line endings itself
    with result_file.open("w", newline="") as temp_csv:
        writer = csv.DictWriter(temp_csv, fieldnames=required)
        writer.writeheader()
        for row in input:
            writer.writerow({field: row[field] for field in required})

    # queue the postprocessor
    metadata = processors[processor]
    processed = DataSet(extension=metadata["extension"],
                        type=processor,
                        parent=temp_dataset.key,
                        db=db)

    queue = JobQueue(database=db, logger=log)
    job = queue.add_job(processor, {}, processed.key)
    place_in_queue = queue.get_place_in_queue(job)
    if place_in_queue > 5:
        job.finish()
        return error(code=503, error="Your request could not be handled as there "
                                     "are currently %i other jobs of this type in "
                                     "the queue. Please try again later." % place_in_queue)

    # wait up to half a minute for the job to be taken up
    # if not, tell the user to try again later

    start = time.time()
    while True:
        if time.time() > start + 30:
            job.finish()
            return error(code=503, error="The server is currently too busy to "
                                         "handle your request. Please try again later.")

        if queue.get_place_in_queue(job) != 0:
            time.sleep(2)
            continue
        else:
            break

    # job currently being processed, wait for it to finish
    while True:
        try:
            job = Job.get_by_remote_ID(job.data["remote_id"], db, processor)
        except JobNotFoundException:
            break

        if not job.is_finished:
            time.sleep(2)
        else:
            break

    # job finished, send file - temporary datasets will be cleaned up by
    # after_this_request function defined earlier
    return send_file(processed.get_results_path(), as_attachment=True)
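Finally, a sketch of posting data to this standalone endpoint from a client; the route and processor ID are assumptions (the docstring only confirms `/api/get-standalone-processors/` for listing processors), and the payload uses the keys the code above actually checks.

# Client sketch; the route and processor ID below are hypothetical.
import requests

posts = [
    {"id": "1", "thread_id": "100", "body": "first post", "author": "anon"},
    {"id": "2", "thread_id": "100", "body": "a reply", "author": "anon"},
]

response = requests.post(
    "https://4cat.example.com/api/process/example-processor/",  # hypothetical
    json=posts,
)

# the processed result is returned directly as a file attachment
with open("result.csv", "wb") as outfile:
    outfile.write(response.content)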