Example #1
	def work(self):
		"""
		Go through all datasources and, for each datasource configured to
		automatically delete old datasets, do so for all qualifying datasets
		:return:
		"""
		for datasource_id in self.all_modules.datasources:
			datasource = self.all_modules.datasources[datasource_id]

			# default = never expire
			if not datasource.get("expire-datasets", None):
				continue

			cutoff = time.time() - datasource.get("expire-datasets")
			datasets = self.db.fetchall(
				"SELECT key FROM datasets WHERE key_parent = '' AND parameters::json->>'datasource' = %s AND timestamp < %s",
				(datasource_id, cutoff))

			# we instantiate the dataset, because its delete() method does all
			# the work (e.g. deleting child datasets) for us
			for dataset in datasets:
				dataset = DataSet(key=dataset["key"], db=self.db)
				dataset.delete()
				self.log.info("Deleting dataset %s/%s (expired per configuration)" % (datasource, dataset.key))

		self.job.finish()
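For reference, a minimal sketch of the setting this worker reads. It assumes a datasource definition is a plain dict (as the lookup above suggests) and that "expire-datasets" holds a lifetime in seconds, which follows from the cutoff calculation:

# hypothetical datasource settings; the worker above only reads "expire-datasets",
# interpreting it as a maximum dataset age in seconds
DATASOURCE_CONFIG = {
	"expire-datasets": 7 * 24 * 3600,  # expire qualifying datasets after one week
}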
Example #2
File: views.py Project: pgr-me/4cat
def preview_csv(key):
    """
	Preview a CSV file

	Simply passes the first 25 rows of a dataset's csv result file to the
	template renderer.

	:param str key:  Dataset key
	:return:  HTML preview
	"""
    try:
        dataset = DataSet(key=key, db=db)
    except TypeError:
        return error(404, "Dataset not found.")

    try:
        with dataset.get_results_path().open(encoding="utf-8") as csvfile:
            rows = []
            reader = csv.reader(csvfile)
            while len(rows) < 25:
                try:
                    row = next(reader)
                    rows.append(row)
                except StopIteration:
                    break
    except FileNotFoundError:
        abort(404)

    return render_template("result-csv-preview.html",
                           rows=rows,
                           filename=dataset.get_results_path().name)
Example #3
def restart_dataset(key):
	"""
	Run a dataset's query again

	Deletes all underlying datasets, marks dataset as unfinished, and queues a
	job for it.

	:param str key:  Dataset key
	:return:
	"""
	try:
		dataset = DataSet(key=key, db=db)
	except TypeError:
		return error(404, message="Dataset not found.")

	if current_user.get_id() != dataset.parameters.get("user", "") and not current_user.is_admin:
		return error(403, message="Not allowed.")

	if not dataset.is_finished():
		return render_template("error.html", message="This dataset is not finished yet - you cannot re-run it.")

	if "type" not in dataset.parameters:
		return render_template("error.html",
							   message="This is an older dataset that unfortunately lacks the information necessary to properly restart it.")

	for child in dataset.children:
		child.delete()

	dataset.unfinish()
	queue = JobQueue(logger=log, database=db)
	queue.add_job(jobtype=dataset.parameters["type"], remote_id=dataset.key)

	flash("Dataset queued for re-running.")
	return redirect("/results/" + dataset.key + "/")
Example #4
def delete_dataset(key=None):
    """
	Delete a dataset

	Only available to administrators. Deletes a dataset, as well as any
	children linked to it, from 4CAT. Calling this on a dataset that is
	currently being executed is undefined behaviour.

	:request-param str key:  ID of the dataset to delete
    :request-param str ?access_token:  Access token; only required if not
                                       logged in currently.

	:return: A dictionary with a successful `status`.

	:return-schema: {type=object,properties={status={type=string}}}

	:return-error 404:  If the dataset does not exist.
	"""
    if not current_user.is_admin():
        return error(403, message="Not allowed")

    dataset_key = request.form.get("key", "") if not key else key

    try:
        dataset = DataSet(key=dataset_key, db=db)
    except TypeError:
        return error(404, error="Dataset does not exist.")

    dataset.delete()
    return jsonify({"status": "success"})
Example #5
def available_processors():
	"""
	Get processors available for a dataset

	:request-param string key:  Dataset key to get processors for
	:return: An object containing the `error` if the request failed, or a list
	         of processors, each with a `name`, a `type` ID, a
	         `description` of what it does, the `extension` of the file it
	         produces, a `category` name, what types of datasets it `accepts`,
	         and a list of `options`, if applicable.

	:return-schema: {type=array,items={type=object,properties={
		name={type=string},
		type={type=string},
		description={type=string},
		extension={type=string},
		category={type=string},
		accepts={type=array,items={type=string}}
	}}}

	:return-error 404:  If the dataset does not exist.
	"""
	try:
		dataset = DataSet(key=request.args.get("key"), db=db)
	except TypeError:
		return error(404, error="Dataset does not exist.")

	# Class type is not JSON serialisable
	processors = dataset.get_available_processors()
	for key, value in processors.items():
		if "class" in value:
			del value["class"]

	return jsonify(processors)
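A minimal client-side sketch of calling this endpoint with the requests library; the /api/get-available-processors/ route (referenced elsewhere in this listing), the base URL and the dataset key are assumptions for illustration:

import requests

API_BASE = "http://localhost:5000"  # assumed 4CAT front-end address
dataset_key = "abcdef1234567890"    # hypothetical dataset key

response = requests.get(API_BASE + "/api/get-available-processors/",
                        params={"key": dataset_key})
for processor_id, details in response.json().items():
    # each entry carries name, description, extension, category and accepts
    print(processor_id, "-", details["name"])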
Example #6
File: views.py Project: pgr-me/4cat
def show_result(key):
    """
	Show result page

	The page contains dataset details and a download link, but also shows a list
	of finished and available processors.

	:param key:  Result key
	:return:  Rendered template
	"""
    try:
        dataset = DataSet(key=key, db=db)
    except TypeError:
        abort(404)

    # child datasets are not available via a separate page - redirect to parent
    if dataset.key_parent:
        genealogy = dataset.get_genealogy()
        nav = ",".join([family.key for family in genealogy])
        url = "/results/%s/#nav=%s" % (genealogy[0].key, nav)
        return redirect(url)

    # load list of processors compatible with this dataset
    is_processor_running = False

    # show preview
    if dataset.is_finished() and dataset.num_rows > 0:
        preview = get_preview(dataset)
    else:
        preview = None

    is_favourite = (db.fetchone(
        "SELECT COUNT(*) AS num FROM users_favourites WHERE name = %s AND key = %s",
        (current_user.get_id(), dataset.key))["num"] > 0)

    # if the datasource is configured for it, this dataset may be deleted at some point
    datasource = dataset.parameters.get("datasource", "")
    if datasource in backend.all_modules.datasources and backend.all_modules.datasources[
            datasource].get("expire-datasets", None):
        timestamp_expires = dataset.timestamp + int(
            backend.all_modules.datasources[datasource].get("expire-datasets"))
    else:
        timestamp_expires = None

    # we can either show this view as a separate page or as a bunch of html
    # to be retrieved via XHR
    standalone = "processors" not in request.url
    template = "result.html" if standalone else "result-details.html"
    return render_template(template,
                           preview=preview,
                           dataset=dataset,
                           parent_key=dataset.key,
                           processors=backend.all_modules.processors,
                           is_processor_running=is_processor_running,
                           messages=get_flashed_messages(),
                           is_favourite=is_favourite,
                           timestamp_expires=timestamp_expires)
Example #7
def check_processor():
    """
	Check processor status

	:request-param str subqueries:  A JSON-encoded list of dataset keys to get
	                                the status of
	:return: A list of dataset data, with each dataset an item with a `key`,
	        whether it had `finished`, a `html` snippet containing details, and
	        a `url` at which the result may be downloaded when finished.

	:return-schema:{type=array,items={type=object,properties={
		key={type=string},
		finished={type=boolean},
		html={type=string},
		url={type=string}
	}}}

	:return-error 406:  If the list of subqueries could not be parsed.
	"""
    try:
        keys = json.loads(request.args.get("subqueries"))
    except (TypeError, json.decoder.JSONDecodeError):
        return error(406,
                     error="Unexpected format for child dataset key list.")

    children = []

    for key in keys:
        try:
            dataset = DataSet(key=key, db=db)
        except TypeError:
            continue

        genealogy = dataset.get_genealogy()
        parent = genealogy[-2]
        top_parent = genealogy[0]

        children.append({
            "key":
            dataset.key,
            "finished":
            dataset.is_finished(),
            "html":
            render_template("result-child.html",
                            child=dataset,
                            dataset=parent,
                            query=dataset.get_genealogy()[0],
                            parent_key=top_parent.key,
                            processors=backend.all_modules.processors),
            "resultrow_html":
            render_template("result-result-row.html", dataset=top_parent),
            "url":
            "/result/" + dataset.data["result_file"]
        })

    return jsonify(children)
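To poll this endpoint, a client passes the child dataset keys as a JSON-encoded list in the subqueries parameter. A hedged sketch; the route, base URL and keys are assumptions:

import json
import requests

API_BASE = "http://localhost:5000"   # assumed 4CAT front-end address
child_keys = ["key-one", "key-two"]  # hypothetical child dataset keys

response = requests.get(API_BASE + "/api/check-processors/",  # hypothetical route for check_processor
                        params={"subqueries": json.dumps(child_keys)})
for child in response.json():
    print(child["key"], "finished:", child["finished"], "->", child["url"])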
Example #8
def delete_dataset(key=None):
	"""
	Delete a dataset

	Only available to administrators and dataset owners. Deletes a dataset, as
	well as any children linked to it, from 4CAT. Also tells the backend to stop
	any jobs dealing with the dataset.

	:request-param str key:  ID of the dataset to delete
    :request-param str ?access_token:  Access token; only required if not
    logged in currently.

	:return: A dictionary with a successful `status`.

	:return-schema: {type=object,properties={status={type=string}}}

	:return-error 404:  If the dataset does not exist.
	"""
	dataset_key = request.form.get("key", "") if not key else key

	try:
		dataset = DataSet(key=dataset_key, db=db)
	except TypeError:
		return error(404, error="Dataset does not exist.")

	if not current_user.is_admin() and not current_user.get_id() == dataset.parameters.get("user"):
		return error(403, message="Not allowed")

	# if there is an active or queued job for some child dataset, cancel and
	# delete it
	children = dataset.get_all_children()
	for child in children:
		try:
			job = Job.get_by_remote_ID(child.key, database=db, jobtype=child.type)
			call_api("cancel-job", {"remote_id": child.key, "jobtype": dataset.type, "level": BasicWorker.INTERRUPT_CANCEL})
			job.finish()
		except JobNotFoundException:
			pass

	# now cancel and delete the job for this one (if it exists)
	try:
		job = Job.get_by_remote_ID(dataset.key, database=db, jobtype=dataset.type)
		call_api("cancel-job", {"remote_id": dataset.key, "jobtype": dataset.type, "level": BasicWorker.INTERRUPT_CANCEL})
	except JobNotFoundException:
		pass

	# and delete the dataset and child datasets
	dataset.delete()

	return jsonify({"status": "success", "key": dataset.key})
Example #9
    def process(self):
        """
		This queues a series of post-processors to run in sequence, with an
		overarching dataset to which the results of the last processor in the
		sequence are copied. The processor pipeline is then attached to the
		overarching dataset so it is clear that all processors were run as part
		of that particular preset.
		"""
        pipeline = self.get_processor_pipeline()

        # make sure the last item in the pipeline copies to the preset's dataset
        pipeline = pipeline.copy()
        pipeline[-1]["parameters"]["attach_to"] = self.dataset.key

        # map the linear pipeline to a nested processor parameter set
        while len(pipeline) > 1:
            last = pipeline.pop()
            pipeline[-1]["parameters"]["next"] = [last]

        analysis_pipeline = DataSet(parameters=pipeline[0]["parameters"],
                                    type=pipeline[0]["type"],
                                    db=self.db,
                                    parent=self.dataset.key)

        # this starts the pipeline
        self.queue.add_job(pipeline[0]["type"],
                           remote_id=analysis_pipeline.key)
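For context, get_processor_pipeline() is expected to return a flat list of steps, each a dict with a type and parameters; the loop above then folds every step into its predecessor's next parameter. A hedged illustration with made-up processor IDs:

# hypothetical linear pipeline as returned by get_processor_pipeline()
pipeline = [
    {"type": "processor-a", "parameters": {}},
    {"type": "processor-b", "parameters": {}},
]

# after the while-loop above it collapses into a single nested step, with the
# preset's dataset key attached to the innermost (last) processor:
# [{"type": "processor-a", "parameters": {
#     "next": [{"type": "processor-b", "parameters": {"attach_to": "<preset dataset key>"}}]
# }}]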
Example #10
def toggle_favourite(key):
	"""
	'Like' a dataset

	Marks the dataset as being liked by the currently active user, which can be
	used for organisation in the front-end.

	:param str key: Key of the dataset to mark as favourite.

	:return: A JSON object with the status of the request
	:return-schema: {type=object,properties={success={type=boolean},favourite_status={type=boolean}}}

	:return-error 404:  If the dataset key was not found
	"""
	try:
		dataset = DataSet(key=key, db=db)
	except TypeError:
		return error(404, error="Dataset does not exist.")

	current_status = db.fetchone("SELECT * FROM users_favourites WHERE name = %s AND key = %s",
								 (current_user.get_id(), dataset.key))
	if not current_status:
		db.insert("users_favourites", data={"name": current_user.get_id(), "key": dataset.key})
		return jsonify({"success": True, "favourite_status": True})
	else:
		db.delete("users_favourites", where={"name": current_user.get_id(), "key": dataset.key})
		return jsonify({"success": True, "favourite_status": False})
Example #11
    def work(self):
        """
		Send pg_cancel_backend query to cancel query with given PID
		"""

        # fetch the dataset this job refers to
        try:
            dataset = DataSet(key=self.job.data["remote_id"], db=self.db)
            jobtype = dataset.data["type"]
        except TypeError:
            # dataset already deleted, apparently
            self.job.finish()
            return

        # now find the job that's tasked with creating this dataset, if it
        # exists
        try:
            job = Job.get_by_remote_ID(remote_id=self.job.data["remote_id"],
                                       jobtype=jobtype,
                                       database=self.db)
        except JobNotFoundException:
            # no job... dataset already fully finished?
            self.job.finish()
            return

        # ask the manager to interrupt this job
        self.manager.request_interrupt(job, self.INTERRUPT_CANCEL)

        # done
        self.job.finish()
Example #12
def queue_dataset():
    """
	Queue a 4CAT search query for processing into a dataset

	Requires authentication by logging in or providing a valid access token.
	Request parameters vary by data source. The parameters listed here are the
	minimum; depending on the data source, more may be required.

	:request-param str board:  Board ID to query
	:request-param str datasource:  Data source ID to query
	:request-param str body_match:  String to match in the post body
	:request-param str subject_match:  String to match in the post subject
    :request-param int min_date:  Timestamp marking the beginning of the match
                                  period
    :request-param int max_date:  Timestamp marking the end of the match period
    :request-param str ?access_token:  Access token; only required if not
                                       logged in currently.

	:return str:  The dataset key, which may be used to later retrieve dataset
	              status and results.
	:return-error 404: If the datasource does not exist.
	"""

    datasource_id = request.form.get("datasource", "")
    if datasource_id not in backend.all_modules.datasources:
        return error(404,
                     message="Datasource '%s' does not exist" % datasource_id)

    search_worker_id = datasource_id + "-search"
    if search_worker_id not in backend.all_modules.workers:
        return error(404,
                     message="Datasource '%s' has no search interface" %
                     datasource_id)

    search_worker = backend.all_modules.workers[search_worker_id]

    if hasattr(search_worker["class"], "validate_query"):
        try:
            sanitised_query = search_worker["class"].validate_query(
                request.form.to_dict(), request, current_user)
        except QueryParametersException as e:
            return "Invalid query. %s" % e
    else:
        sanitised_query = request.form.to_dict()

    sanitised_query["user"] = current_user.get_id()
    sanitised_query["datasource"] = datasource_id
    sanitised_query["type"] = search_worker_id

    dataset = DataSet(parameters=sanitised_query, db=db, type="search")

    if hasattr(search_worker["class"], "after_create"):
        search_worker["class"].after_create(sanitised_query, dataset, request)

    queue.add_job(jobtype=search_worker_id, remote_id=dataset.key)

    return dataset.key
Example #13
	def process(self):
		"""
		Run 4CAT search query

		Gets query details, passes them on to the object's search method, and
		writes the results to a CSV file. If that all went well, the query and
		job are marked as finished.
		"""

		query_parameters = self.dataset.get_parameters()
		results_file = self.dataset.get_results_path()

		self.log.info("Querying: %s" % str(query_parameters))

		# Execute the relevant query (string-based, random, countryflag-based)
		try:
			posts = self.search(query_parameters)
		except WorkerInterruptedException:
			raise ProcessorInterruptedException("Interrupted while collecting data, trying again later.")

		# Write posts to csv and update the DataBase status to finished
		num_posts = 0
		if posts:
			self.dataset.update_status("Writing posts to result file")
			num_posts = self.posts_to_csv(posts, results_file)
			self.dataset.update_status("Query finished, results are available.")
		elif posts is not None:
			self.dataset.update_status("Query finished, no results found.")

		# queue predefined post-processors
		if num_posts > 0 and query_parameters.get("next", []):
			for next in query_parameters.get("next"):
				next_parameters = next.get("parameters", {})
				next_type = next.get("type", "")
				available_processors = self.dataset.get_available_processors()

				# run it only if the post-processor is actually available for this query
				if next_type in available_processors:
					next_analysis = DataSet(parameters=next_parameters, type=next_type, db=self.db,
											parent=self.dataset.key,
											extension=available_processors[next_type]["extension"])
					self.queue.add_job(next_type, remote_id=next_analysis.key)

		# see if we need to register the result somewhere
		if query_parameters.get("copy_to", None):
			# copy the results to an arbitrary place that was passed
			if self.dataset.get_results_path().exists():
				# but only if we actually have something to copy
				shutil.copyfile(str(self.dataset.get_results_path()), query_parameters.get("copy_to"))
			else:
				# if copy_to was passed, that means it's important that this
				# file exists somewhere, so we create it as an empty file
				with open(query_parameters.get("copy_to"), "w") as empty_file:
					empty_file.write("")

		self.dataset.finish(num_rows=num_posts)
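The next and copy_to parameters consumed above would look roughly like this in a dataset's parameters; the processor type and target path are hypothetical:

# hypothetical query parameters illustrating chained post-processing and result copying
query_parameters = {
    "datasource": "custom",
    "next": [
        {"type": "some-processor", "parameters": {}}  # queued only if available for this dataset
    ],
    "copy_to": "/tmp/search-results.csv"  # results file is copied here (or created empty)
}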
Example #14
def show_results(page):
	"""
	Show results overview

	For each result, available analyses are also displayed.

	:return:  Rendered template
	"""
	page_size = 20
	offset = (page - 1) * page_size

	where = ["key_parent = ''"]
	replacements = []

	query_filter = request.args.get("filter", "")

	depth = request.args.get("depth", "own")
	if depth not in ("own", "favourites", "all"):
		depth = "own"

	if depth == "own":
		where.append("parameters::json->>'user' = %s")
		replacements.append(current_user.get_id())

	if depth == "favourites":
		where.append("key IN ( SELECT key FROM users_favourites WHERE name = %s )")
		replacements.append(current_user.get_id())

	if query_filter:
		where.append("query LIKE %s")
		replacements.append("%" + query_filter + "%")

	where = " AND ".join(where)

	num_datasets = db.fetchone("SELECT COUNT(*) AS num FROM datasets WHERE " + where, tuple(replacements))["num"]
	
	replacements.append(page_size)
	replacements.append(offset)
	datasets = db.fetchall("SELECT key FROM datasets WHERE " + where + " ORDER BY timestamp DESC LIMIT %s OFFSET %s",
						   tuple(replacements))
	
	if not datasets and page != 1:
		abort(404)

	pagination = Pagination(page, page_size, num_datasets)
	filtered = []
	processors = backend.all_modules.processors

	for dataset in datasets:
		filtered.append(DataSet(key=dataset["key"], db=db))

	favourites = [row["key"] for row in
				  db.fetchall("SELECT key FROM users_favourites WHERE name = %s", (current_user.get_id(),))]

	return render_template("results.html", filter={"filter": query_filter}, depth=depth, datasets=filtered,
						   pagination=pagination, favourites=favourites)
Example #15
import argparse
import csv
import shutil
import sys
import time

from pathlib import Path

# DataSet, Database and Logger are part of the 4CAT codebase; their exact
# import paths are omitted in this excerpt

cli = argparse.ArgumentParser()
cli.add_argument("-i", "--input", required=True, help="csv to import")
args = cli.parse_args()

input = Path(args.input)
if not input.exists():
	print("File not found")
	sys.exit(1)

with open(input) as i:
	reader = csv.DictReader(i)
	rows = 0
	for row in reader:
		rows += 1

required = ("id", "thread_id", "subject", "author", "timestamp", "body")
for field in required:
	if field not in reader.fieldnames:
		print("Column '%s' missing." % field)
		sys.exit(1)

logger = Logger()
new_set = DataSet(
	parameters={"user": "******", "filename": input.name, "time": int(time.time()), "datasource": "custom",
				"board": "upload"}, type="custom",
	db=Database(logger=logger))

shutil.copyfile(input, new_set.get_results_path())
new_set.finish(rows)
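The script expects a CSV containing at least the columns checked above. A small sketch that writes such a file (file name and values hypothetical); the importer would then be run with --input example-input.csv:

import csv

# write a minimal CSV with all required columns so the importer above accepts it
with open("example-input.csv", "w", newline="") as outfile:
    writer = csv.DictWriter(outfile, fieldnames=["id", "thread_id", "subject", "author", "timestamp", "body"])
    writer.writeheader()
    writer.writerow({"id": "1", "thread_id": "1", "subject": "Hello", "author": "anon",
                     "timestamp": "1609459200", "body": "First post"})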
Example #16
def queue_processor(key=None, processor=None):
	"""
	Queue a new processor

	Queues the processor for a given dataset; with the returned query key,
	the processor status can then be checked periodically to download the
	result when available.

	Note that apart from the required parameters, further parameters may be
	provided based on the configuration options available for the chosen
	processor. Available options may be found via the
	`/get-available-processors/` endpoint.

	:request-param str key:  Key of dataset to queue processor for
	:request-param str processor:  ID of processor to queue
    :request-param str ?access_token:  Access token; only required if not
                                       logged in currently.

	:return: A list of dataset properties, with each dataset an item with a `key`,
	        whether it had `finished`, a `html` snippet containing details,
	        a `url` at which the result may be downloaded when finished, and a
	        list of `messages` describing any warnings generated while queuing.

	:return-schema: {type=object,additionalProperties={type=object,properties={
		key={type=string},
		finished={type=boolean},
		html={type=string},
		url={type=string},
		messages={type=array,items={type=string}}
	}}}
	"""
	if request.files and "input_file" in request.files:
		input_file = request.files["input_file"]
		if not input_file:
			return jsonify({"error": "No file input provided"})

		if input_file.filename[-4:] != ".csv":
			return jsonify({"error": "File input is not a csv file"})

		test_csv_file = csv.DictReader(input_file.stream)
		if "body" not in test_csv_file.fieldnames:
			return jsonify({"error": "File must contain a 'body' column"})

		filename = secure_filename(input_file.filename)
		input_file.save(config.PATH_DATA + "/" + filename)

	elif not key:
		key = request.form.get("key", "")

	if not processor:
		processor = request.form.get("processor", "")

	# cover all bases - can only run processor on "parent" dataset
	try:
		dataset = DataSet(key=key, db=db)
	except TypeError:
		return jsonify({"error": "Not a valid dataset key."})

	# check if processor is available for this dataset
	if processor not in dataset.processors:
		return jsonify({"error": "This processor is not available for this dataset or has already been run."})

	# create a dataset now
	options = UserInput.parse_all(dataset.processors[processor]["options"], request.form.to_dict(), silently_correct=False)
	options["user"] = current_user.get_id()

	analysis = DataSet(parent=dataset.key, parameters=options, db=db,
					   extension=dataset.processors[processor]["extension"], type=processor)
	if analysis.is_new:
		# analysis has not been run or queued before - queue a job to run it
		queue.add_job(jobtype=processor, remote_id=analysis.key)
		job = Job.get_by_remote_ID(analysis.key, database=db)
		analysis.link_job(job)
		analysis.update_status("Queued")
	else:
		flash("This analysis (%s) is currently queued or has already been run with these parameters." %
			  dataset.processors[processor]["name"])

	return jsonify({
		"status": "success",
		"container": "*[data-dataset-key=" + dataset.key + "]",
		"key": analysis.key,
		"html": render_template("result-child.html", child=analysis, dataset=dataset, parent_key=dataset.key,
								processors=backend.all_modules.processors) if analysis.is_new else "",
		"messages": get_flashed_messages(),
		"is_filter": dataset.processors[processor]["is_filter"]
	})
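A hedged client-side sketch of queuing a processor for an existing dataset; the route, base URL, dataset key and processor ID are assumptions, not taken from this excerpt:

import requests

API_BASE = "http://localhost:5000"  # assumed 4CAT front-end address
response = requests.post(API_BASE + "/api/queue-processor/",        # hypothetical route for queue_processor
                         data={"key": "abcdef1234567890",           # hypothetical dataset key
                               "processor": "example-processor"})   # hypothetical processor ID
result = response.json()
print(result["key"], result["messages"])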
Example #17
class BasicProcessor(BasicWorker, metaclass=abc.ABCMeta):
    """
	Abstract post-processor class

	A post-processor takes a finished search query as input and processes its
	result in some way, with another result set as output. The input thus is
	a CSV file, and the output (usually) as well. In other words, the result of
	a post-processor run can be used as input for another post-processor
	(though whether and when this is useful is another question).
	"""
    db = None  # database handler
    dataset = None  # Dataset object representing the dataset to be created
    job = None  # Job object that requests the execution of this processor
    source_dataset = None  # Dataset object to be processed, if applicable
    source_file = None  # path to dataset to be processed, if applicable

    description = "No description available"  # processor description, shown in web front-end
    category = "Other"  # processor category, for sorting in web front-end
    extension = "csv"  # extension of files created by this processor
    options = {}  # configurable options for this processor
    parameters = {}  # values for the processor's configurable options

    is_running_in_preset = False  # is this processor running 'within' a preset processor?

    def work(self):
        """
		Process a dataset

		Loads dataset metadata, sets up the scaffolding for performing some kind
		of processing on that dataset, and then processes it. Afterwards, clean
		up.
		"""
        try:
            self.dataset = DataSet(key=self.job.data["remote_id"], db=self.db)
        except TypeError:
            # query has been deleted in the meantime. finish without error,
            # as deleting it will have been a conscious choice by a user
            self.job.finish()
            return

        is_running_in_preset = False
        if self.dataset.data.get("key_parent", None):
            # search workers never have parents (for now), so we don't need to
            # find out what the source_dataset dataset is if it's a search worker
            try:
                self.source_dataset = DataSet(
                    key=self.dataset.data["key_parent"], db=self.db)

                # for presets, transparently use the *top* dataset as a source_dataset
                # since that is where any underlying processors should get
                # their data from. However, this should only be done as long as the
                # preset is not finished yet, because after that there may be processors
                # that run on the final preset result
                if self.source_dataset.type.find(
                        "preset-"
                ) == 0 and not self.source_dataset.is_finished():
                    self.is_running_in_preset = True
                    self.source_dataset = self.source_dataset.get_genealogy(
                    )[0]

            except TypeError:
                # we need to know what the source_dataset dataset was to properly handle the
                # analysis
                self.log.warning(
                    "Processor %s queued for orphan query %s: cannot run, cancelling job"
                    % (self.type, self.dataset.key))
                self.job.finish()
                return

            if not self.source_dataset.is_finished(
            ) and not self.is_running_in_preset:
                # not finished yet - retry after a while
                # exception for presets, since these *should* be unfinished
                # until underlying processors are done
                self.job.release(delay=30)
                return

            self.source_file = self.source_dataset.get_results_path()
            if not self.source_file.exists():
                self.dataset.update_status("Finished, no input data found.")

        self.log.info("Running processor %s on dataset %s" %
                      (self.type, self.job.data["remote_id"]))

        processor_name = self.title if hasattr(self, "title") else self.type
        self.dataset.clear_log()
        self.dataset.log("Processing '%s' started for dataset %s" %
                         (processor_name, self.dataset.key))

        # start log file
        self.parameters = self.dataset.parameters.copy()
        self.dataset.update_status("Processing data")
        self.dataset.update_version(get_software_version())

        # now the parameters have been loaded into memory, clear any sensitive
        # ones. This has a side-effect that a processor may not run again
        # without starting from scratch, but this is the price of progress
        if hasattr(self, "options"):
            for option in self.options:
                if self.options[option].get("sensitive"):
                    self.dataset.delete_parameter(option)

        if self.interrupted:
            self.dataset.log("Processing interrupted, trying again later")
            return self.abort()

        if not self.dataset.is_finished():
            try:
                self.process()
                self.after_process()
            except WorkerInterruptedException as e:
                self.dataset.log(
                    "Processing interrupted (%s), trying again later" % str(e))
                self.abort()
            except Exception as e:
                self.dataset.log("Processor crashed (%s), trying again later" %
                                 str(e))
                frames = traceback.extract_tb(e.__traceback__)
                frames = [
                    frame.filename.split("/").pop() + ":" + str(frame.lineno)
                    for frame in frames[1:]
                ]
                location = "->".join(frames)

                # Not all datasets have source_dataset keys
                if len(self.dataset.get_genealogy()) > 1:
                    parent_key = " (via " + self.dataset.get_genealogy(
                    )[0].key + ")"
                else:
                    parent_key = ""

                raise ProcessorException(
                    "Processor %s raised %s while processing dataset %s%s in %s:\n   %s\n"
                    % (self.type, e.__class__.__name__, self.dataset.key,
                       parent_key, location, str(e)))
        else:
            # dataset already finished, job shouldn't be open anymore
            self.log.warning(
                "Job %s/%s was queued for a dataset already marked as finished, deleting..."
                % (self.job.data["jobtype"], self.job.data["remote_id"]))
            self.job.finish()

    def after_process(self):
        """
		After processing, declare job finished
		"""
        if self.dataset.data["num_rows"] > 0:
            self.dataset.update_status("Dataset saved.")

        if not self.dataset.is_finished():
            self.dataset.finish()

        if hasattr(self, "staging_area") and type(
                self.staging_area) == Path and self.staging_area.exists():
            shutil.rmtree(self.staging_area)

        # see if we have anything else lined up to run next
        for next in self.parameters.get("next", []):
            next_parameters = next.get("parameters", {})
            next_type = next.get("type", "")
            available_processors = self.dataset.get_available_processors()

            # run it only if the post-processor is actually available for this query
            if next_type in available_processors:
                next_analysis = DataSet(
                    parameters=next_parameters,
                    type=next_type,
                    db=self.db,
                    parent=self.dataset.key,
                    extension=available_processors[next_type]["extension"])
                self.queue.add_job(next_type, remote_id=next_analysis.key)
            else:
                self.log.warning(
                    "Dataset %s (of type %s) wants to run processor %s next, but it is incompatible"
                    % (self.dataset.key, self.type, next_type))

        # see if we need to register the result somewhere
        if "copy_to" in self.parameters:
            # copy the results to an arbitrary place that was passed
            if self.dataset.get_results_path().exists():
                shutil.copyfile(str(self.dataset.get_results_path()),
                                self.parameters["copy_to"])
            else:
                # if copy_to was passed, that means it's important that this
                # file exists somewhere, so we create it as an empty file
                with open(self.parameters["copy_to"], "w") as empty_file:
                    empty_file.write("")

        # see if this query chain is to be attached to another query
        # if so, the full genealogy of this query (minus the original dataset)
        # is attached to the given query - this is mostly useful for presets,
        # where a chain of processors can be marked as 'underlying' a preset
        if "attach_to" in self.parameters:
            try:
                # copy metadata and results to the surrogate
                surrogate = DataSet(key=self.parameters["attach_to"],
                                    db=self.db)

                if self.dataset.get_results_path().exists():
                    shutil.copyfile(str(self.dataset.get_results_path()),
                                    str(surrogate.get_results_path()))

                try:
                    surrogate.finish(self.dataset.data["num_rows"])
                except RuntimeError:
                    # already finished, could happen (though it shouldn't)
                    pass

                surrogate.update_status(self.dataset.get_status())

            except ValueError:
                # dataset with key to attach to doesn't exist...
                self.log.warning(
                    "Cannot attach dataset chain containing %s to %s (dataset does not exist)"
                    % (self.dataset.key, self.parameters["attach_to"]))

        self.job.finish()

    def abort(self):
        """
		Abort dataset creation and clean up so it may be attempted again later
		"""
        # remove any result files that have been created so far
        if self.dataset.get_results_path().exists():
            self.dataset.get_results_path().unlink(missing_ok=True)

        if self.dataset.get_staging_area().exists():
            shutil.rmtree(str(self.dataset.get_staging_area()))

        # we release instead of finish, since interrupting is just that - the
        # job should resume at a later point. Delay resuming by 10 seconds to
        # give 4CAT the time to do whatever it wants (though usually this isn't
        # needed since restarting also stops the spawning of new workers)
        self.dataset.update_status(
            "Dataset processing interrupted. Retrying later.")

        if self.interrupted == self.INTERRUPT_RETRY:
            # retry later - wait at least 10 seconds to give the backend time to shut down
            self.job.release(delay=10)
        elif self.interrupted == self.INTERRUPT_CANCEL:
            # cancel job
            self.job.finish()

    def iterate_items(self, path, bypass_map_item=False):
        """
		A generator that iterates through a CSV or NDJSON file

		With every iteration, the processor's 'interrupted' flag is checked,
		and if set a ProcessorInterruptedException is raised, which by default
		is caught and subsequently stops execution gracefully.

		Processors can define a method called `map_item` that can be used to
		map an item from the dataset file before it is processed any further.
		This is slower than storing the data file in the right format to begin
		with, but not all data sources allow for easy 'flat' mapping of items;
		e.g. tweets are nested objects when retrieved from the Twitter API,
		which are easier to store as a JSON file than as a flat CSV file, and
		it would be a shame to throw away that data.

		There are two file types that can be iterated (currently): CSV files
		and NDJSON (newline-delimited JSON) files. In the future, one could
		envision adding a pathway to retrieve items from e.g. a MongoDB
		collection directly instead of from a static file

		:param Path path: 	Path to file to read
		:return Generator:  A generator that yields each item as a dictionary
		"""

        # see if an item mapping function has been defined
        # open question if 'source_dataset' shouldn't be an attribute of the dataset
        # instead of the processor...
        item_mapper = None
        if hasattr(self, "source_dataset"
                   ) and self.source_dataset and not bypass_map_item:
            parent_processor = self.all_modules.processors.get(
                self.source_dataset.type)
            if parent_processor:
                parent_processor = self.all_modules.load_worker_class(
                    parent_processor)
                if hasattr(parent_processor, "map_item"):
                    item_mapper = parent_processor.map_item

        # go through items one by one, optionally mapping them
        if path.suffix.lower() == ".csv":
            with path.open(encoding="utf-8") as input:
                reader = csv.DictReader(input)

                for item in reader:
                    if self.interrupted:
                        raise ProcessorInterruptedException(
                            "Processor interrupted while iterating through CSV file"
                        )

                    if item_mapper:
                        item = item_mapper(item)

                    yield item

        elif path.suffix.lower() == ".ndjson":
            # in this format each line in the file is a self-contained JSON
            # file
            with path.open(encoding="utf-8") as input:
                for line in input:
                    if self.interrupted:
                        raise ProcessorInterruptedException(
                            "Processor interrupted while iterating through NDJSON file"
                        )

                    item = json.loads(line)
                    if item_mapper:
                        item = item_mapper(item)

                    yield item

        else:
            raise NotImplementedError("Cannot iterate through %s file" %
                                      path.suffix)

    def get_item_keys(self, path=None):
        """
		Get item attribute names

		It can be useful to know what attributes an item in the dataset is
		stored with, e.g. when one wants to produce a new dataset identical
		to the source_dataset one but with extra attributes. This method provides
		these, as a list.

		:param Path path:  Path to the dataset file; if left empty, use the
		processor's own dataset's path
		:return list:  List of keys, may be empty if there are no items in the
		dataset

		:todo: Figure out if this makes more sense as a Dataset method
		"""
        if not path:
            path = self.dataset.get_results_path()

        items = self.iterate_items(path)
        try:
            keys = list(items.__next__().keys())
        except StopIteration:
            return []
        finally:
            del items

        return keys

    def iterate_archive_contents(self, path, staging_area=None):
        """
		A generator that iterates through files in an archive

		With every iteration, the processor's 'interrupted' flag is checked,
		and if set a ProcessorInterruptedException is raised, which by default
		is caught and subsequently stops execution gracefully.

		Files are temporarily unzipped and deleted after use.

		:param Path path: 	Path to zip file to read
		:param Path staging_area:  Where to store the files while they're
		being worked with. If omitted, a temporary folder is created and
		deleted after use
		:return:  An iterator with a Path item for each file
		"""

        if not path.exists():
            return

        if staging_area and (not staging_area.exists()
                             or not staging_area.is_dir()):
            raise RuntimeError("Staging area %s is not a valid folder")
        else:
            if not hasattr(self, "staging_area") and not staging_area:
                self.staging_area = self.dataset.get_staging_area()
                staging_area = self.staging_area

        with zipfile.ZipFile(path, "r") as archive_file:
            archive_contents = sorted(archive_file.namelist())

            for archived_file in archive_contents:
                if self.interrupted:
                    if hasattr(self, "staging_area"):
                        shutil.rmtree(self.staging_area)
                    raise ProcessorInterruptedException(
                        "Interrupted while iterating zip file contents")

                file_name = archived_file.split("/")[-1]
                temp_file = staging_area.joinpath(file_name)
                archive_file.extract(file_name, staging_area)

                yield temp_file
                if hasattr(self, "staging_area"):
                    temp_file.unlink()

        if hasattr(self, "staging_area"):
            shutil.rmtree(self.staging_area)
            del self.staging_area

    def unpack_archive_contents(self, path, staging_area=None):
        """
		Unpack all files in an archive to a staging area

		With every iteration, the processor's 'interrupted' flag is checked,
		and if set a ProcessorInterruptedException is raised, which by default
		is caught and subsequently stops execution gracefully.

		Files are unzipped to a staging area. The staging area is *not*
		cleaned up automatically.

		:param Path path: 	Path to zip file to read
		:param Path staging_area:  Where to store the files while they're
		being worked with. If omitted, the dataset's own staging area is used;
		it is not cleaned up automatically
		:return Path:  A path to the staging area
		"""

        if not path.exists():
            return

        if staging_area and (not staging_area.exists()
                             or not staging_area.is_dir()):
            raise RuntimeError("Staging area %s is not a valid folder")
        else:
            if not hasattr(self, "staging_area"):
                self.staging_area = self.dataset.get_staging_area()

            staging_area = self.staging_area

        paths = []
        with zipfile.ZipFile(path, "r") as archive_file:
            archive_contents = sorted(archive_file.namelist())

            for archived_file in archive_contents:
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while iterating zip file contents")

                file_name = archived_file.split("/")[-1]
                temp_file = staging_area.joinpath(file_name)
                archive_file.extract(archived_file, staging_area)
                paths.append(temp_file)

        return staging_area

    def write_csv_items_and_finish(self, data):
        """
		Write data as csv to results file and finish dataset

		Determines result file path using dataset's path determination helper
		methods. After writing results, the dataset is marked finished. Will
		raise a ProcessorInterruptedException if the interrupted flag for this
		processor is set while iterating.

		:param data: A list or tuple of dictionaries, all with the same keys
		"""
        if not (isinstance(data, typing.List)
                or isinstance(data, typing.Tuple)) or isinstance(data, str):
            raise TypeError(
                "write_csv_items requires a list or tuple of dictionaries as argument"
            )

        if not data:
            raise ValueError(
                "write_csv_items requires a dictionary with at least one item")

        if not isinstance(data[0], dict):
            raise TypeError(
                "write_csv_items requires a list or tuple of dictionaries as argument"
            )

        self.dataset.update_status("Writing results file")
        with self.dataset.get_results_path().open("w",
                                                  encoding="utf-8",
                                                  newline='') as results:
            writer = csv.DictWriter(results, fieldnames=data[0].keys())
            writer.writeheader()

            for row in data:
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while writing results file")
                writer.writerow(row)

        self.dataset.update_status("Finished")
        self.dataset.finish(len(data))

    def write_archive_and_finish(self,
                                 files,
                                 num_items=None,
                                 compression=zipfile.ZIP_STORED):
        """
		Archive a bunch of files into a zip archive and finish processing

		:param list|Path files: If a list, all files will be added to the
		archive and deleted afterwards. If a folder, all files in the folder
		will be added and the folder will be deleted afterwards.
		:param int num_items: Items in the dataset. If None, the amount of
		files added to the archive will be used.
		:param int compression:  Type of compression to use. By default, files
		are not compressed, to speed up unarchiving.
		"""
        is_folder = False
        if issubclass(type(files), PurePath):
            is_folder = files
            if not files.exists() or not files.is_dir():
                raise RuntimeError(
                    "Folder %s is not a folder that can be archived" % files)

            files = files.glob("*")

        # create zip of archive and delete temporary files and folder
        self.dataset.update_status("Compressing results into archive")
        done = 0
        with zipfile.ZipFile(self.dataset.get_results_path(),
                             "w",
                             compression=compression) as zip:
            for output_path in files:
                zip.write(output_path, output_path.name)
                output_path.unlink()
                done += 1

        # delete temporary folder
        if is_folder:
            shutil.rmtree(is_folder)

        self.dataset.update_status("Finished")
        if num_items is None:
            num_items = done

        self.dataset.finish(num_items)

    def is_filter(self):
        """
		Is this processor a filter?

		Filters do not produce their own dataset but replace the source_dataset dataset
		instead.

		:todo: Make this a bit more robust than sniffing the processor category
		:return bool:
		"""
        return hasattr(
            self, "category"
        ) and self.category and "filter" in self.category.lower()

    @abc.abstractmethod
    def process(self):
        """
		Process data

		To be defined by the child processor.
		"""
        pass
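To illustrate how this scaffolding is meant to be used, a minimal sketch of a concrete processor follows. The attributes mirror those declared on BasicProcessor; the type ID, title and output columns are made up for the example:

class ExampleAuthorCount(BasicProcessor):
    """
    Hypothetical processor: counts items per author in the source dataset
    """
    type = "example-author-count"  # hypothetical processor/job type ID
    category = "Post metrics"
    title = "Count posts per author"
    description = "Counts how many items each author contributed (illustrative only)"
    extension = "csv"

    def process(self):
        counts = {}
        # iterate_items() handles CSV/NDJSON input and checks the interrupted flag
        for item in self.iterate_items(self.source_file):
            author = item.get("author", "")
            counts[author] = counts.get(author, 0) + 1

        # write the counts as the result file and mark the dataset finished
        rows = [{"author": author, "num_posts": num} for author, num in counts.items()]
        self.write_csv_items_and_finish(rows)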
Example #19
def check_dataset():
	"""
	Check dataset status

	Requires authentication by logging in or providing a valid access token.

	:request-param str key:  ID of the dataset for which to return the status
	:return: Dataset status, containing the `status`, `query`, number of `rows`,
	         the dataset `key`, whether the dataset is `done`, the `path` of the
	         result file and whether the dataset is `empty`.

	:return-schema: {
		type=object,
		properties={
			status={type=string},
			query={type=string},
			rows={type=integer},
			key={type=string},
			done={type=boolean},
			path={type=string},
			empty={type=boolean},
			is_favourite={type=boolean}
		}
	}

	:return-error 404:  If the dataset does not exist.
	"""
	dataset_key = request.args.get("key")
	try:
		dataset = DataSet(key=dataset_key, db=db)
	except TypeError:
		return error(404, error="Dataset does not exist.")

	results = dataset.check_dataset_finished()
	if results == 'empty':
		dataset_data = dataset.data
		dataset_data["parameters"] = json.loads(dataset_data["parameters"])
		path = False
	elif results:
		# Return absolute folder when using localhost for debugging
		path = results.name
		dataset_data = dataset.data
		dataset_data["parameters"] = json.loads(dataset_data["parameters"])
	else:
		path = ""

	status = {
		"status": dataset.get_status(),
		"status_html": render_template("result-status.html", dataset=dataset),
		"label": dataset.get_label(),
		"query": dataset.data["query"],
		"rows": dataset.data["num_rows"],
		"key": dataset_key,
		"done": True if dataset.is_finished() else False,
		"path": path,
		"empty": (dataset.data["num_rows"] == 0),
		"is_favourite": (db.fetchone("SELECT COUNT(*) AS num FROM users_favourites WHERE name = %s AND key = %s",
									 (current_user.get_id(), dataset.key))["num"] > 0)
	}

	return jsonify(status)
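A hedged sketch of polling this endpoint until a dataset is done; the route, base URL and key are assumptions for illustration:

import time
import requests

API_BASE = "http://localhost:5000"  # assumed 4CAT front-end address
dataset_key = "abcdef1234567890"    # hypothetical dataset key

while True:
    status = requests.get(API_BASE + "/api/check-query/",  # hypothetical route for check_dataset
                          params={"key": dataset_key}).json()
    if status["done"]:
        print("Finished with %s rows, result file: %s" % (status["rows"], status["path"]))
        break
    time.sleep(10)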
Example #20
def queue_dataset():
	"""
	Queue a 4CAT search query for processing into a dataset

	Requires authentication by logging in or providing a valid access token.
	Request parameters vary by data source. The parameters listed here are the
	minimum; depending on the data source, more may be required.

	:request-param str board:  Board ID to query
	:request-param str datasource:  Data source ID to query
	:request-param str body_match:  String to match in the post body
	:request-param str subject_match:  String to match in the post subject
    :request-param int min_date:  Timestamp marking the beginning of the match
                                  period
    :request-param int max_date:  Timestamp marking the end of the match period
    :request-param str ?access_token:  Access token; only required if not
                                       logged in currently.

	:return str:  The dataset key, which may be used to later retrieve dataset
	              status and results.
	:return-error 404: If the datasource does not exist.
	"""

	datasource_id = request.form.get("datasource", "")
	if datasource_id not in backend.all_modules.datasources:
		return error(404, message="Datasource '%s' does not exist" % datasource_id)

	search_worker_id = datasource_id + "-search"
	if search_worker_id not in backend.all_modules.workers:
		return error(404, message="Datasource '%s' has no search interface" % datasource_id)

	search_worker = backend.all_modules.workers[search_worker_id]
	worker_class = backend.all_modules.load_worker_class(search_worker)

	if hasattr(worker_class, "validate_query"):
		try:
			# first sanitise values
			sanitised_query = UserInput.parse_all(worker_class.options, request.form.to_dict(), silently_correct=False)

			# then validate for this particular datasource
			sanitised_query = worker_class.validate_query(sanitised_query, request, current_user)
		except QueryParametersException as e:
			return "Invalid query. %s" % e
	else:
		raise NotImplementedError("Data sources MUST sanitise input values with validate_query")

	sanitised_query["user"] = current_user.get_id()
	sanitised_query["datasource"] = datasource_id
	sanitised_query["type"] = search_worker_id

	sanitised_query["pseudonymise"] = bool(request.form.to_dict().get("pseudonymise", False))

	extension = worker_class.extension if hasattr(worker_class, "extension") else "csv"
	dataset = DataSet(parameters=sanitised_query, db=db, type=search_worker_id, extension=extension)

	if hasattr(worker_class, "after_create"):
		worker_class.after_create(sanitised_query, dataset, request)

	queue.add_job(jobtype=search_worker_id, remote_id=dataset.key)

	return dataset.key
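
A hedged sketch of submitting a query to this endpoint from a client. The base URL, route name and the example data source are assumptions; which form fields are actually mandatory depends on the data source's `options` and `validate_query()`.

# Hypothetical client-side submission to the queue endpoint above.
# URL, route and datasource are assumptions; required fields vary per datasource.
import requests

API_BASE = "https://4cat.example.com/api"  # assumed base URL

form = {
    "datasource": "4chan",            # assumed data source ID
    "board": "v",                     # board to query
    "body_match": "example keyword",  # string to match in post bodies
    "min_date": 1609459200,           # start of match period (unix timestamp)
    "max_date": 1612137600,           # end of match period (unix timestamp)
    "access_token": "YOUR_TOKEN",     # only needed when not logged in
}

response = requests.post(API_BASE + "/queue-query/", data=form)  # assumed route

# the endpoint returns the dataset key as plain text; it can then be used
# to poll the status endpoint documented earlier
dataset_key = response.text
print("Queued dataset", dataset_key)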
Example #21
0
def process_standalone(processor):
    """
	Run a standalone processor

	This bypasses the usual 4CAT query-processor structure and allows running
	any available processor (see the `/api/get-standalone-processors/`
	endpoint) with one API call. The data is returned immediately and not saved
	server-side.

	Requires authentication.

	:param str processor:  ID of the processor to run on incoming data

	:request-body object data:  Data to process, a JSON-formatted list of
	objects, with each object having at least the keys `post_id`,
	`thread_id`, `body`, and `author`.

	:request-schema data: {
		type=object,
		properties={
			post_id={type=string},
			thread_id={type=string},
			body={type=string},
			author={type=string}
		}
	}

    :request-param str ?access_token:  Access token; only required if not
                                       logged in currently.

	:return:  A JSON object containing the processed data, with a
	processor-specific structure.

	:return-schema: {
		type=object,
		additionalProperties={}
	}

	:return-error 402: If an invalid processor is requested, or if the input is
	not properly-formatted JSON.
	:return-error 503: If too many other requests are currently being handled,
	so that the server does not have the capacity to deal with this request.
	"""
    processors = get_standalone_processors().get_json()

    if processor not in processors:
        return error(402, error="Processor '%s' is not available" % processor)

    if not request.is_json:
        return error(
            402,
            error="This API endpoint only accepts JSON-formatted data as input"
        )

    try:
        input = request.get_json(force=True)
    except json.JSONDecodeError:
        return error(402, error="JSON decoding error")

    # check that the input is a list of objects with the required fields
    required = ("id", "thread_id", "body", "author")
    try:
        for row in input:
            for field in required:
                if field not in row:
                    return error(
                        402,
                        error=
                        "Input is valid JSON, but not a list of data objects (missing field '%s')"
                        % field)
    except TypeError:
        return error(
            402, error="Input is valid JSON, but not a list of data objects")

    if not input:
        return error(402, error="Input is empty")

    # ok, valid input!
    temp_dataset = DataSet(extension="csv",
                           type="standalone",
                           parameters={
                               "user": current_user.get_id(),
                               "after": [processor]
                           },
                           db=db)
    temp_dataset.finish(len(input))

    # make sure the file is deleted later, whichever way this request is
    # ultimately handled
    @after_this_request
    def delete_temporary_dataset(response):
        temp_dataset.delete()  # also deletes children!
        return response

    # write the input as a csv file so it can be accessed as normal by
    # processors
    result_file = temp_dataset.get_results_path()
    with result_file.open("w") as temp_csv:
        writer = csv.DictWriter(temp_csv, fieldnames=required)
        writer.writeheader()
        for row in input:
            writer.writerow({field: row[field] for field in required})

    # queue the postprocessor
    metadata = processors[processor]
    processed = DataSet(extension=metadata["extension"],
                        type=processor,
                        parent=temp_dataset.key,
                        db=db)

    queue = JobQueue(database=db, logger=log)
    job = queue.add_job(processor, {}, processed.key)
    place_in_queue = queue.get_place_in_queue(job)
    if place_in_queue > 5:
        job.finish()
        return error(
            code=503,
            error=
            "Your request could not be handled as there are currently %i other jobs of this type in the queue. Please try again later."
            % place_in_queue)

    # wait up to half a minute for the job to be taken up
    # if not, tell the user to try again later

    start = time.time()
    while True:
        if time.time() > start + 30:
            job.finish()
            return error(
                code=503,
                error=
                "The server is currently too busy to handle your request. Please try again later."
            )

        if queue.get_place_in_queue(job) != 0:
            time.sleep(2)
            continue
        else:
            break

    # job currently being processed, wait for it to finish
    while True:
        try:
            job = Job.get_by_remote_ID(job.data["remote_id"], db, processor)
        except JobNotFoundException:
            break

        if not job.is_finished:
            time.sleep(2)
        else:
            break

    # job finished, send file - temporary datasets will be cleaned up by
    # after_this_request function defined earlier
    return send_file(processed.get_results_path(), as_attachment=True)
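
A sketch of calling the standalone-processor endpoint with a small JSON payload. The base URL, route and processor ID are assumptions (available processor IDs can be listed via `/api/get-standalone-processors/`); the payload fields mirror the `required` tuple checked above.

# Hypothetical call to the standalone-processor endpoint above.
# Host, route and processor ID are assumptions.
import requests

API_BASE = "https://4cat.example.com/api"  # assumed base URL
processor_id = "example-processor"         # assumed; see /api/get-standalone-processors/

posts = [
    {"id": "1", "thread_id": "100", "body": "first post", "author": "anon"},
    {"id": "2", "thread_id": "100", "body": "second post", "author": "anon"},
]

response = requests.post(
    API_BASE + "/process-standalone/%s/" % processor_id,  # assumed route
    json=posts,
)

if response.ok:
    # the endpoint sends the processor's result file back as an attachment;
    # its format depends on the processor's extension (csv is assumed here)
    with open("standalone-result.csv", "wb") as outfile:
        outfile.write(response.content)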
Example #22
0
    description="Deletes a query, the corresponding job, and any sub-queries.")
cli.add_argument("-k", "--key", required=True, help="Query key to delete.")
cli.add_argument(
    "-q",
    "--quiet",
    type=bool,
    default=False,
    help="Whether to skip asking for confirmation. Defaults to false.")
args = cli.parse_args()

if not args.quiet:
    confirm = input(
        "This will delete the query, and any sub-queries. Are you sure? (y/n)")
    if confirm.strip().lower() != "y":
        sys.exit(0)

logger = Logger()
database = Database(logger=logger, appname="delete-query")

# Initialize query
try:
    parent = DataSet(key=args.key, db=database)
except TypeError:
    print("No query found with that key.")
    sys.exit(1)

parent.delete()
print(
    "Done. Note that running jobs for the queries above are not stopped; you will have to wait for them to finish on their own."
)
Example #23
0
    def after_process(self):
        """
		After processing, declare job finished
		"""
        if self.dataset.data["num_rows"] > 0:
            self.dataset.update_status("Dataset saved.")

        if not self.dataset.is_finished():
            self.dataset.finish()

        # see if we have anything else lined up to run next
        for next in self.parameters.get("next", []):
            next_parameters = next.get("parameters", {})
            next_type = next.get("type", "")
            available_processors = self.dataset.get_available_processors()

            # run it only if the post-processor is actually available for this query
            if next_type in available_processors:
                next_analysis = DataSet(
                    parameters=next_parameters,
                    type=next_type,
                    db=self.db,
                    parent=self.dataset.key,
                    extension=available_processors[next_type]["extension"])
                self.queue.add_job(next_type, remote_id=next_analysis.key)

        # see if we need to register the result somewhere
        if "copy_to" in self.parameters:
            # copy the results to an arbitrary place that was passed
            if self.dataset.get_results_path().exists():
                shutil.copyfile(str(self.dataset.get_results_path()),
                                self.parameters["copy_to"])
            else:
                # if copy_to was passed, that means it's important that this
                # file exists somewhere, so we create it as an empty file
                with open(self.parameters["copy_to"], "w") as empty_file:
                    empty_file.write("")

        # see if this query chain is to be attached to another query
        # if so, the full genealogy of this query (minus the original dataset)
        # is attached to the given query - this is mostly useful for presets,
        # where a chain of processors can be marked as 'underlying' a preset
        if "attach_to" in self.parameters:
            try:
                # copy metadata and results to the surrogate
                surrogate = DataSet(key=self.parameters["attach_to"],
                                    db=self.db)

                if self.dataset.get_results_path().exists():
                    shutil.copyfile(str(self.dataset.get_results_path()),
                                    str(surrogate.get_results_path()))

                top_parent = self.dataset.get_genealogy()[1]
                top_parent.link_parent(surrogate.key)

                try:
                    surrogate.finish(self.dataset.data["num_rows"])
                except RuntimeError:
                    # already finished, could happen (though it shouldn't)
                    pass

                surrogate.update_status(self.dataset.get_status())

            except ValueError:
                # dataset with key to attach to doesn't exist...
                self.log.warning(
                    "Cannot attach dataset chain containing %s to %s (dataset does not exist)"
                    % (self.dataset.key, self.parameters["attach_to"]))

        self.job.finish()
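
The `next`, `copy_to` and `attach_to` keys inspected by `after_process()` above are ordinary entries in the dataset's parameters dictionary. A hedged sketch of a parameter set that would exercise all three branches; the processor type, its options, the path and the dataset key are hypothetical:

# Hypothetical dataset parameters illustrating the chaining handled above.
chained_parameters = {
    "user": "some-user",
    "next": [
        {
            "type": "example-author-count",       # processor to queue on the result
            "parameters": {"timeframe": "month"}  # options for that processor
        }
    ],
    "copy_to": "/tmp/latest-result.csv",          # also copy the result file here
    "attach_to": "0123456789abcdef"               # attach the chain to this dataset key
}

With such parameters, `after_process()` queues the `example-author-count` processor on the finished dataset, copies the result file to `/tmp/latest-result.csv`, and links the dataset chain to the dataset with key `0123456789abcdef`.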
Example #24
0
class BasicProcessor(BasicWorker, metaclass=abc.ABCMeta):
    """
	Abstract post-processor class

	A post-processor takes a finished search query as input and processes its
	result in some way, producing another result set as output. The input is
	thus a CSV file, and usually the output is as well. In other words, the result of
	a post-processor run can be used as input for another post-processor
	(though whether and when this is useful is another question).
	"""
    db = None  # database handler
    dataset = None  # Dataset object representing the dataset to be created
    job = None  # Job object that requests the execution of this processor
    parent = None  # Dataset object to be processed, if applicable
    source_file = None  # path to dataset to be processed, if applicable

    description = "No description available"  # processor description, shown in web front-end
    category = "Other"  # processor category, for sorting in web front-end
    extension = "csv"  # extension of files created by this processor
    options = {}  # configurable options for this processor
    parameters = {}  # values for the processor's configurable options

    def work(self):
        """
		Process a dataset

		Loads dataset metadata, sets up the scaffolding for performing some kind
		of processing on that dataset, and then processes it. Afterwards, clean
		up.
		"""
        try:
            self.dataset = DataSet(key=self.job.data["remote_id"], db=self.db)
        except TypeError:
            # query has been deleted in the meantime. finish without error,
            # as deleting it will have been a conscious choice by a user
            self.job.finish()
            return

        if self.dataset.data.get("key_parent", None):
            # search workers never have parents (for now), so we don't need to
            # find out what the parent dataset is if it's a search worker
            try:
                self.parent = DataSet(key=self.dataset.data["key_parent"],
                                      db=self.db)
            except TypeError:
                # we need to know what the parent dataset was to properly handle the
                # analysis
                self.log.warning(
                    "Processor %s queued for orphan query %s: cannot run, cancelling job"
                    % (self.type, self.dataset.key))
                self.job.finish()
                return

            if not self.parent.is_finished():
                # not finished yet - retry after a while
                self.job.release(delay=30)
                return

            self.parent = DataSet(key=self.dataset.data["key_parent"],
                                  db=self.db)

            self.source_file = self.parent.get_results_path()
            if not self.source_file.exists():
                self.dataset.update_status("Finished, no input data found.")

        self.log.info("Running post-processor %s on query %s" %
                      (self.type, self.job.data["remote_id"]))

        self.parameters = self.dataset.parameters
        self.dataset.update_status("Processing data")
        self.dataset.update_version(get_software_version())

        if self.interrupted:
            return self.abort()

        if not self.dataset.is_finished():
            try:
                self.process()
                self.after_process()
            except WorkerInterruptedException:
                self.abort()
            except Exception as e:
                frames = traceback.extract_tb(e.__traceback__)
                frames = [
                    frame.filename.split("/").pop() + ":" + str(frame.lineno)
                    for frame in frames[1:]
                ]
                location = "->".join(frames)

                # Not all datasets have parent keys
                if len(self.dataset.get_genealogy()) > 1:
                    parent_key = " (via " + self.dataset.get_genealogy(
                    )[0].key + ")"
                else:
                    parent_key = ""

                raise ProcessorException(
                    "Processor %s raised %s while processing dataset %s%s in %s:\n   %s\n"
                    % (self.type, e.__class__.__name__, self.dataset.key,
                       parent_key, location, str(e)))
        else:
            # dataset already finished, job shouldn't be open anymore
            self.log.warning(
                "Job %s/%s was queued for a dataset already marked as finished, deleting..."
                % (self.job.data["jobtype"], self.job.data["remote_id"]))
            self.job.finish()

    def after_process(self):
        """
		After processing, declare job finished
		"""
        if self.dataset.data["num_rows"] > 0:
            self.dataset.update_status("Dataset saved.")

        if not self.dataset.is_finished():
            self.dataset.finish()

        # see if we have anything else lined up to run next
        for next in self.parameters.get("next", []):
            next_parameters = next.get("parameters", {})
            next_type = next.get("type", "")
            available_processors = self.dataset.get_available_processors()

            # run it only if the post-processor is actually available for this query
            if next_type in available_processors:
                next_analysis = DataSet(
                    parameters=next_parameters,
                    type=next_type,
                    db=self.db,
                    parent=self.dataset.key,
                    extension=available_processors[next_type]["extension"])
                self.queue.add_job(next_type, remote_id=next_analysis.key)

        # see if we need to register the result somewhere
        if "copy_to" in self.parameters:
            # copy the results to an arbitrary place that was passed
            if self.dataset.get_results_path().exists():
                shutil.copyfile(str(self.dataset.get_results_path()),
                                self.parameters["copy_to"])
            else:
                # if copy_to was passed, that means it's important that this
                # file exists somewhere, so we create it as an empty file
                with open(self.parameters["copy_to"], "w") as empty_file:
                    empty_file.write("")

        # see if this query chain is to be attached to another query
        # if so, the full genealogy of this query (minus the original dataset)
        # is attached to the given query - this is mostly useful for presets,
        # where a chain of processors can be marked as 'underlying' a preset
        if "attach_to" in self.parameters:
            try:
                # copy metadata and results to the surrogate
                surrogate = DataSet(key=self.parameters["attach_to"],
                                    db=self.db)

                if self.dataset.get_results_path().exists():
                    shutil.copyfile(str(self.dataset.get_results_path()),
                                    str(surrogate.get_results_path()))

                top_parent = self.dataset.get_genealogy()[1]
                top_parent.link_parent(surrogate.key)

                try:
                    surrogate.finish(self.dataset.data["num_rows"])
                except RuntimeError:
                    # already finished, could happen (though it shouldn't)
                    pass

                surrogate.update_status(self.dataset.get_status())

            except ValueError:
                # dataset with key to attach to doesn't exist...
                self.log.warning(
                    "Cannot attach dataset chain containing %s to %s (dataset does not exist)"
                    % (self.dataset.key, self.parameters["attach_to"]))

        self.job.finish()

    def abort(self):
        """
		Abort dataset creation and clean up so it may be attempted again later
		"""
        # remove any result files that have been created so far
        if self.dataset.get_results_path().exists():
            os.unlink(str(self.dataset.get_results_path()))

        if self.dataset.get_temporary_path().exists():
            shutil.rmtree(str(self.dataset.get_temporary_path()))

        # we release instead of finish, since interrupting is just that - the
        # job should resume at a later point. Delay resuming by 10 seconds to
        # give 4CAT the time to do whatever it wants (though usually this isn't
        # needed since restarting also stops the spawning of new workers)
        self.dataset.update_status(
            "Dataset processing interrupted. Retrying later.")

        if self.interrupted == self.INTERRUPT_RETRY:
            # retry later - wait at least 10 seconds to give the backend time to shut down
            self.job.release(delay=10)
        elif self.interrupted == self.INTERRUPT_CANCEL:
            # cancel job
            self.job.finish()

    def iterate_csv_items(self, path):
        """
		A generator that iterates through a CSV file

		With every iteration, the processor's 'interrupted' flag is checked,
		and if set a ProcessorInterruptedException is raised, which by default
		is caught and subsequently stops execution gracefully.

		:param Path path: 	Path to csv file to read
		:return:
		"""

        with open(path, encoding="utf-8") as input:

            reader = csv.DictReader(input)

            for item in reader:
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Processor interrupted while iterating through CSV file"
                    )

                yield item

    def write_csv_items_and_finish(self, data):
        """
		Write data as csv to results file and finish dataset

		Determines result file path using dataset's path determination helper
		methods. After writing results, the dataset is marked finished. Will
		raise a ProcessorInterruptedException if the interrupted flag for this
		processor is set while iterating.

		:param data: A list or tuple of dictionaries, all with the same keys
		"""
        if not (isinstance(data, typing.List)
                or isinstance(data, typing.Tuple)) or isinstance(data, str):
            raise TypeError(
                "write_csv_items requires a list or tuple of dictionaries as argument"
            )

        if not data:
            raise ValueError(
                "write_csv_items requires a list of dictionaries with at least one item")

        if not isinstance(data[0], dict):
            raise TypeError(
                "write_csv_items requires a list or tuple of dictionaries as argument"
            )

        self.dataset.update_status("Writing results file")
        with self.dataset.get_results_path().open("w",
                                                  encoding="utf-8",
                                                  newline='') as results:
            writer = csv.DictWriter(results, fieldnames=data[0].keys())
            writer.writeheader()

            for row in data:
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while writing results file")
                writer.writerow(row)

        self.dataset.update_status("Finished")
        self.dataset.finish(len(data))

    def is_filter(self):
        """
		Is this processor a filter?

		Filters do not produce their own dataset but replace the parent dataset
		instead.

		:todo: Make this a bit more robust than sniffing the processor category
		:return bool:
		"""
        return hasattr(
            self, "category"
        ) and self.category and "filter" in self.category.lower()

    @abc.abstractmethod
    def process(self):
        """
		Process data

		To be defined by the child processor.
		"""
        pass
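
A minimal sketch of a concrete processor built on `BasicProcessor`, implementing `process()` with the `iterate_csv_items()` and `write_csv_items_and_finish()` helpers above. The import path and the `type` identifier are assumptions for illustration.

from backend.abstract.processor import BasicProcessor  # assumed module path


class ExampleAuthorCounter(BasicProcessor):
    """
    Count how many items each author contributed to the parent dataset
    """
    type = "example-author-count"  # hypothetical processor/job type ID
    category = "Post metrics"      # category shown in the web front-end
    description = "Counts the number of items per author in the parent dataset"
    extension = "csv"              # format of the result file

    def process(self):
        counts = {}

        # iterate_csv_items() checks the interrupted flag on every row and
        # raises ProcessorInterruptedException if the worker is told to stop
        for item in self.iterate_csv_items(self.source_file):
            author = item.get("author", "")
            counts[author] = counts.get(author, 0) + 1

        # one row per author; write_csv_items_and_finish() writes the csv
        # and marks the dataset as finished with the number of rows written
        rows = [{"author": author, "num_items": num}
                for author, num in counts.items()]
        self.write_csv_items_and_finish(rows)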
Example #25
0
    def work(self):
        """
		Process a dataset

		Loads dataset metadata, sets up the scaffolding for performing some kind
		of processing on that dataset, and then processes it. Afterwards, clean
		up.
		"""
        try:
            self.dataset = DataSet(key=self.job.data["remote_id"], db=self.db)
        except TypeError:
            # query has been deleted in the meantime. finish without error,
            # as deleting it will have been a conscious choice by a user
            self.job.finish()
            return

        if self.dataset.data.get("key_parent", None):
            # search workers never have parents (for now), so we don't need to
            # find out what the parent dataset is if it's a search worker
            try:
                self.parent = DataSet(key=self.dataset.data["key_parent"],
                                      db=self.db)
            except TypeError:
                # we need to know what the parent dataset was to properly handle the
                # analysis
                self.log.warning(
                    "Processor %s queued for orphan query %s: cannot run, cancelling job"
                    % (self.type, self.dataset.key))
                self.job.finish()
                return

            if not self.parent.is_finished():
                # not finished yet - retry after a while
                self.job.release(delay=30)
                return

            self.parent = DataSet(key=self.dataset.data["key_parent"],
                                  db=self.db)

            self.source_file = self.parent.get_results_path()
            if not self.source_file.exists():
                self.dataset.update_status("Finished, no input data found.")

        self.log.info("Running post-processor %s on query %s" %
                      (self.type, self.job.data["remote_id"]))

        self.parameters = self.dataset.parameters
        self.dataset.update_status("Processing data")
        self.dataset.update_version(get_software_version())

        if self.interrupted:
            return self.abort()

        if not self.dataset.is_finished():
            try:
                self.process()
                self.after_process()
            except WorkerInterruptedException:
                self.abort()
            except Exception as e:
                frames = traceback.extract_tb(e.__traceback__)
                frames = [
                    frame.filename.split("/").pop() + ":" + str(frame.lineno)
                    for frame in frames[1:]
                ]
                location = "->".join(frames)

                # Not all datasets have parent keys
                if len(self.dataset.get_genealogy()) > 1:
                    parent_key = " (via " + self.dataset.get_genealogy(
                    )[0].key + ")"
                else:
                    parent_key = ""

                raise ProcessorException(
                    "Processor %s raised %s while processing dataset %s%s in %s:\n   %s\n"
                    % (self.type, e.__class__.__name__, self.dataset.key,
                       parent_key, location, str(e)))
        else:
            # dataset already finished, job shouldn't be open anymore
            self.log.warning(
                "Job %s/%s was queued for a dataset already marked as finished, deleting..."
                % (self.job.data["jobtype"], self.job.data["remote_id"]))
            self.job.finish()