Example 1
    def after_create(query, dataset, request):
        """
		Hook to execute after the dataset for this source has been created

		In this case, it is used to save the uploaded file to the dataset's
		result path, and finalise the dataset metadata.

		:param dict query:  Sanitised query parameters
		:param DataSet dataset:  Dataset created for this query
		:param request:  Flask request submitted for its creation
		"""
        hashtag = re.compile(r"#([^\s,.+=-]+)")
        usertag = re.compile(r"@([^\s,.+=-]+)")

        file = request.files["data_upload"]
        platform = dataset.parameters.get("platform")
        dataset.type = "%s-search" % platform
        dataset.datasource = platform

        file.seek(0)
        done = 0

        # With validated csvs, save as is but make sure the raw file is sorted
        if dataset.parameters.get("platform") == "instagram":
            with dataset.get_results_path().open("w",
                                                 encoding="utf-8",
                                                 newline="") as output_csv:
                wrapped_upload = io.TextIOWrapper(file, encoding="utf-8")
                reader = csv.DictReader(wrapped_upload)
                writer = csv.DictWriter(
                    output_csv,
                    fieldnames=("id", "thread_id", "parent_id", "body",
                                "author", "timestamp", "type", "url",
                                "thumbnail_url", "hashtags", "usertags",
                                "mentioned", "num_likes", "num_comments",
                                "subject"))
                writer.writeheader()

                dataset.update_status("Sorting by date...")
                posts = sorted(reader, key=lambda x: x["Created"])

                dataset.update_status("Processing posts...")
                for item in posts:
                    done += 1
                    url = item["URL"]
                    url = re.sub(r"/*$", "", url)

                    id = url.split("/")[-1]
                    caption = item["Description"]
                    hashtags = hashtag.findall(caption)
                    usertags = usertag.findall(caption)

                    datestamp = " ".join(item["Created"].split(" ")[:-1])
                    date = datetime.datetime.strptime(datestamp,
                                                      "%Y-%m-%d %H:%M:%S")

                    writer.writerow({
                        "id": id,
                        "thread_id": id,
                        "parent_id": id,
                        "body": caption if caption is not None else "",
                        "author": item["User Name"],
                        "timestamp": int(date.timestamp()),
                        "type": "picture" if item["Type"] == "Photo" else item["Type"].lower(),
                        "url": item["URL"],
                        "thumbnail_url": item["Photo"],
                        "hashtags": ",".join(hashtags),
                        "usertags": ",".join(usertags),
                        "mentioned": "",
                        "num_likes": item["Likes"],
                        "num_comments": item["Comments"],
                        "subject": item["Title"]
                    })

        elif platform == "tiktok":
            with dataset.get_results_path().open("w",
                                                 encoding="utf-8",
                                                 newline="") as output_csv:
                wrapped_upload = io.TextIOWrapper(file, encoding="utf-8")
                reader = csv.DictReader(wrapped_upload)
                writer = csv.DictWriter(
                    output_csv,
                    fieldnames=("id", "thread_id", "author", "subject", "body",
                                "timestamp", "is_harmful", "is_duet",
                                "music_name", "music_id", "music_author",
                                "video_url", "tiktok_url", "thumbnail_url",
                                "amount_likes", "amount_comments",
                                "amount_shares", "amount_plays", "hashtags"))
                writer.writeheader()

                dataset.update_status("Sorting by date...")
                posts = sorted(reader, key=lambda x: x["createTime"])

                dataset.update_status("Processing posts...")
                for item in posts:
                    hashtags = json.loads(item["hashtags"])
                    hashtags = [hashtag["name"] for hashtag in hashtags]

                    done += 1

                    writer.writerow({
                        "id": item["id"],
                        "thread_id": item["id"],
                        "author": item["authorMeta.name"],
                        "subject": "",
                        "body": item["text"],
                        "timestamp": int(item["createTime"]),
                        "is_harmful": -1,
                        "is_duet": -1,
                        "music_name": item["musicMeta.musicName"],
                        "music_id": item["musicMeta.musicId"],
                        "music_author": item["musicMeta.musicAuthor"],
                        "video_url": item["videoUrl"],
                        "tiktok_url": "https://tiktok.com/@%s/video/%s" % (item["authorMeta.id"], item["id"]),
                        "thumbnail_url": item["covers.default"],
                        "amount_likes": item["diggCount"],
                        "amount_comments": item["commentCount"],
                        "amount_shares": item["shareCount"],
                        "amount_plays": item["playCount"],
                        "hashtags": ",".join(hashtags),
                    })

        file.close()

        dataset.finish(done)
        dataset.update_status("Result processed")
        dataset.update_version(get_software_version())
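
A quick way to see what this hook does to each row in isolation: the same hashtag/usertag extraction and timestamp handling, run on a hypothetical caption and a hypothetical "Created" value in the "YYYY-MM-DD HH:MM:SS TZ" shape the code expects (only the regexes and the format string come from the example above).

import datetime
import re

hashtag = re.compile(r"#([^\s,.+=-]+)")
usertag = re.compile(r"@([^\s,.+=-]+)")

caption = "Spring again! #flowers #nofilter say hi to @a_friend"  # hypothetical caption
created = "2021-03-01 12:34:56 CET"                               # hypothetical "Created" value

hashtags = hashtag.findall(caption)  # ['flowers', 'nofilter']
usertags = usertag.findall(caption)  # ['a_friend']

# drop the trailing timezone token, then parse what remains
datestamp = " ".join(created.split(" ")[:-1])
date = datetime.datetime.strptime(datestamp, "%Y-%m-%d %H:%M:%S")
print(int(date.timestamp()), hashtags, usertags)
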
Example 2
	def after_create(query, dataset, request):
		"""
		Hook to execute after the dataset for this source has been created

		In this case, it is used to save the uploaded file to the dataset's
		result path, and finalise the dataset metadata.

		:param dict query:  Sanitised query parameters
		:param DataSet dataset:  Dataset created for this query
		:param request:  Flask request submitted for its creation
		"""
		hashtag = re.compile(r"#([^\s,.+=-]+)")
		usertag = re.compile(r"@([^\s,.+=-]+)")

		file = request.files["option-data_upload"]
		platform = dataset.parameters.get("platform")

		# this is a bit hacky, but sometimes we have multiple tools that can
		# all serve as input for the same datasource (e.g. CrowdTangle and
		# the DMI Instagram Scraper would both go to the 'instagram'
		# datasource), so just assume the datasource ID has no dashes in it
		# and ignore everything after a dash for the purposes of determining
		# what datasource to assign to the dataset
		datasource = platform.split("-")[0]
		dataset.type = "%s-search" % datasource
		dataset.datasource = datasource

		file.seek(0)
		done = 0

		encoding = sniff_encoding(file)

		# With validated csvs, save as is but make sure the raw file is sorted
		if platform == "instagram-crowdtangle":
			with dataset.get_results_path().open("w", encoding="utf-8", newline="") as output_csv:
				wrapped_upload = io.TextIOWrapper(file, encoding=encoding)
				reader = csv.DictReader(wrapped_upload)
				writer = csv.DictWriter(output_csv, fieldnames=(
					"id", "thread_id", "parent_id", "body", "author", "timestamp", "type", "url", "thumbnail_url",
					"hashtags", "usertags", "mentioned", "num_likes", "num_comments", "subject"))
				writer.writeheader()

				dataset.update_status("Sorting by date...")
				posts = sorted(reader, key=lambda x: x["Post Created"])

				dataset.update_status("Processing posts...")
				for item in posts:
					done += 1
					url = item["URL"]
					url = re.sub(r"/*$", "", url)

					id = url.split("/")[-1]
					caption = item["Description"]
					hashtags = hashtag.findall(caption)
					usertags = usertag.findall(caption)

					datestamp = " ".join(item["Post Created"].split(" ")[:-1])
					date = datetime.datetime.strptime(datestamp, "%Y-%m-%d %H:%M:%S")

					writer.writerow({
						"id": id,
						"thread_id": id,
						"parent_id": id,
						"body": caption if caption is not None else "",
						"author": item["User Name"],
						"timestamp": int(date.timestamp()),
						"type": "picture" if item["Type"] == "Photo" else item["Type"].lower(),
						"url": item["URL"],
						"thumbnail_url": item["Photo"],
						"hashtags": ",".join(hashtags),
						"usertags": ",".join(usertags),
						"mentioned": "",
						"num_likes": item["Likes"],
						"num_comments": item["Comments"],
						"subject": item["Title"]}
					)

		elif platform == "facebook-crowdtangle":
			with dataset.get_results_path().open("w", encoding="utf-8", newline="") as output_csv:
				wrapped_upload = io.TextIOWrapper(file, encoding=encoding)
				reader = csv.DictReader(wrapped_upload)

				entity_name = "Page Name" if "Page Name" in reader.fieldnames else "Group Name"

				writer = csv.DictWriter(output_csv, fieldnames=(
					"id", "thread_id", "body", "author", "timestamp", "page_id", "page_name", "page_likes",
					"page_followers", "page_shared_from", "type", "interactions", "likes", "comments", "shares",
					"likes_love", "likes_wow", "likes_haha", "likes_sad", "likes_angry", "likes_care", "views_post",
					"views_total", "views_total_crossposts", "video_length", "video_status", "url", "url_original",
					"body_image", "body_link", "body_description", "hashtags", "sponsor_id", "sponsor_name"))
				writer.writeheader()

				dataset.update_status("Sorting by date...")
				posts = sorted(reader, key=lambda x: x["Created"])

				dataset.update_status("Processing posts...")
				for item in posts:
					done += 1
					hashtags = hashtag.findall(item["Message"])

					date = datetime.datetime.strptime(" ".join(item["Created"].split(" ")[:2]), "%Y-%m-%d %H:%M:%S")

					is_from_elsewhere = item["Link"].find("https://www.facebook.com/" + item["User Name"]) < 0
					shared_page = item["Link"].split("/")[3] if is_from_elsewhere and item["Link"].find("https://www.facebook.com/") == 0 else ""

					writer.writerow({
						"id": item["URL"].split("/")[-1],
						"thread_id": item["URL"].split("/")[-1],
						"body": item["Message"],
						"author": item["User Name"],
						"timestamp": int(date.timestamp()),
						"page_name": item[entity_name],
						"page_likes": item["Likes at Posting"],
						"page_id": item["Facebook Id"],
						"page_followers": item["Followers at Posting"],
						"page_shared_from": shared_page,
						"type": item["Type"],
						"interactions": int(re.sub(r"[^0-9]", "", item["Total Interactions"])) if item["Total Interactions"] else 0,
						"comments": item["Comments"],
						"shares": item["Shares"],
						"likes": item["Likes"],
						"likes_love": item["Love"],
						"likes_wow": item["Wow"],
						"likes_haha": item["Haha"],
						"likes_sad": item["Sad"],
						"likes_angry": item["Angry"],
						"likes_care": item["Care"],
						"views_post": item["Post Views"],
						"views_total": item["Total Views"],
						"views_total_crossposts": item["Total Views For All Crossposts"],
						"video_length": "" if item["Video Length"] == "N/A" else item["Video Length"],
						"video_status": item["Video Share Status"],
						"url": item["URL"],
						"hashtags": ",".join(hashtags),
						"url_original": item["Link"],
						"body_image": item["Image Text"],
						"body_link": item["Link Text"],
						"body_description": item["Description"],
						"sponsor_id": item["Sponsor Id"],
						"sponsor_name": item["Sponsor Name"]
					})

		elif platform == "instagram-dmi-scraper":
			# in principle, this csv file should be good to go
			# however, we still need to know how many rows are in it, so we
			# nevertheless copy it line by line rather than in one go
			# as a bonus this also ensures it uses the right csv dialect
			with dataset.get_results_path().open("w", encoding="utf-8") as output_csv:
				wrapped_upload = io.TextIOWrapper(file, encoding=encoding)
				reader = csv.DictReader(wrapped_upload)
				writer = csv.DictWriter(output_csv, fieldnames=reader.fieldnames)
				writer.writeheader()
				for row in reader:
					done += 1
					writer.writerow(row)

		elif platform == "tiktok":
			with dataset.get_results_path().open("w", encoding="utf-8", newline="") as output_csv:
				wrapped_upload = io.TextIOWrapper(file, encoding=encoding)
				reader = csv.DictReader(wrapped_upload)
				writer = csv.DictWriter(output_csv, fieldnames=("id", "thread_id", "author", "subject", "body",
					"timestamp", "is_harmful", "is_duet", "music_name", "music_id", "music_author", "video_url",
					"tiktok_url", "thumbnail_url", "amount_likes", "amount_comments", "amount_shares", "amount_plays",
					"hashtags"))
				writer.writeheader()


				dataset.update_status("Sorting by date...")
				posts = sorted(reader, key=lambda x: x["createTime"])

				dataset.update_status("Processing posts...")
				for item in posts:
					hashtags = json.loads(item["hashtags"])
					hashtags = [hashtag["name"] for hashtag in hashtags]

					done += 1

					writer.writerow({
						"id": item["id"],
						"thread_id": item["id"],
						"author": item["authorMeta.name"],
						"subject": "",
						"body": item["text"],
						"timestamp": int(item["createTime"]),
						"is_harmful": -1,
						"is_duet": -1,
						"music_name": item["musicMeta.musicName"],
						"music_id": item["musicMeta.musicId"],
						"music_author": item["musicMeta.musicAuthor"],
						"video_url": item["videoUrl"],
						"tiktok_url": "https://tiktok.com/@%s/video/%s" % (item["authorMeta.id"], item["id"]),
						"thumbnail_url": item["covers.default"],
						"amount_likes": item["diggCount"],
						"amount_comments": item["commentCount"],
						"amount_shares": item["shareCount"],
						"amount_plays": item["playCount"],
						"hashtags": ",".join(hashtags),
					})

		elif platform == "facepager":
			with dataset.get_results_path().open("w", encoding="utf-8", newline="") as output_csv:
				wrapped_upload = io.TextIOWrapper(file, encoding=encoding)
				reader = csv.DictReader(wrapped_upload)
				writer = csv.DictWriter(output_csv, fieldnames=("id", "thread_id", "author", "subject", "body",
					"timestamp", "is_harmful", "is_duet", "music_name", "music_id", "music_author", "video_url",
					"tiktok_url", "thumbnail_url", "amount_likes", "amount_comments", "amount_shares", "amount_plays",
					"hashtags"))
				writer.writeheader()


				dataset.update_status("Sorting by date...")
				posts = sorted(reader, key=lambda x: x["createTime"])

				dataset.update_status("Processing posts...")
				for item in posts:
					hashtags = json.loads(item["hashtags"])
					hashtags = [hashtag["name"] for hashtag in hashtags]

					done += 1

					writer.writerow({
						"id": item["id"],
						"thread_id": item["id"],
						"author": item["authorMeta.name"],
						"subject": "",
						"body": item["text"],
						"timestamp": int(item["createTime"]),
						"is_harmful": -1,
						"is_duet": -1,
						"music_name": item["musicMeta.musicName"],
						"music_id": item["musicMeta.musicId"],
						"music_author": item["musicMeta.musicAuthor"],
						"video_url": item["videoUrl"],
						"tiktok_url": "https://tiktok.com/@%s/video/%s" % (item["authorMeta.id"], item["id"]),
						"thumbnail_url": item["covers.default"],
						"amount_likes": item["diggCount"],
						"amount_comments": item["commentCount"],
						"amount_shares": item["shareCount"],
						"amount_plays": item["playCount"],
						"hashtags": ",".join(hashtags),
					})


		file.close()

		dataset.finish(done)
		dataset.update_status("Result processed")
		dataset.update_version(get_software_version())
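
This variant calls a sniff_encoding() helper that is not part of the snippet. Going by the comment in Example 4 ("detect encoding - UTF-8 with or without BOM"), a minimal stand-in might only look for a byte-order mark, roughly as below; this is an assumption for experimentation, not the project's actual implementation.

import codecs

def sniff_encoding(file):
    # Hypothetical stand-in: report "utf-8-sig" when the binary upload starts
    # with a UTF-8 byte-order mark, plain "utf-8" otherwise.
    file.seek(0)
    has_bom = file.read(3) == codecs.BOM_UTF8
    file.seek(0)
    return "utf-8-sig" if has_bom else "utf-8"
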
Example 3
    def after_create(query, dataset, request):
        """
		Hook to execute after the dataset for this source has been created

		In this case, it is used to save the uploaded file to the dataset's
		result path, and finalise the dataset metadata.

		:param dict query:  Sanitised query parameters
		:param DataSet dataset:  Dataset created for this query
		:param request:  Flask request submitted for its creation
		"""

        strip_html = query.get("strip_html")

        file = request.files["data_upload"]

        file.seek(0)

        # Convert .tab files to comma delimited files
        if file.filename.endswith(".tab"):

            wrapped_upload = io.TextIOWrapper(file, encoding="utf-8")
            reader = csv.DictReader(wrapped_upload,
                                    delimiter="\t",
                                    quoting=csv.QUOTE_NONE)

            # Write to csv
            with dataset.get_results_path().open("w",
                                                 encoding="utf-8",
                                                 newline="") as output_csv:
                writer = csv.DictWriter(output_csv,
                                        fieldnames=reader.fieldnames)
                writer.writeheader()
                for row in reader:
                    if strip_html:  # Possibly strip HTML
                        row["body"] = strip_tags(row["body"])
                    writer.writerow(row)

            wrapped_upload.detach()

        else:
            # With validated csvs, just save the raw file
            if not strip_html:
                file.save(dataset.get_results_path().open("wb"))
            else:
                with dataset.get_results_path().open("w",
                                                     encoding="utf-8",
                                                     newline="") as output_csv:
                    wrapped_upload = io.TextIOWrapper(file, encoding="utf-8")
                    reader = csv.DictReader(wrapped_upload)
                    writer = csv.DictWriter(output_csv,
                                            fieldnames=reader.fieldnames)
                    writer.writeheader()
                    for row in reader:
                        row["body"] = strip_tags(row["body"])
                        writer.writerow(row)

        file.close()

        with dataset.get_results_path().open(encoding="utf-8") as input:
            if file.filename.endswith(".tab"):
                reader = csv.DictReader(input,
                                        delimiter="\t",
                                        quoting=csv.QUOTE_NONE)
            else:
                reader = csv.DictReader(input)

            dataset.finish(sum(1 for line in reader))
            dataset.update_status("Result processed")

        dataset.update_version(get_software_version())
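
strip_tags() is imported from the surrounding codebase and is not defined in these snippets. To run the example in isolation, a crude stand-in based on the standard library's html.parser could look like this (a sketch, not the project's helper):

from html.parser import HTMLParser

class _TagStripper(HTMLParser):
    # Collects text content and drops the markup around it
    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.chunks = []

    def handle_data(self, data):
        self.chunks.append(data)

def strip_tags(html):
    stripper = _TagStripper()
    stripper.feed(html or "")
    return "".join(stripper.chunks)

print(strip_tags("<p>Hello <b>world</b></p>"))  # Hello world
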
Example 4
    def after_create(query, dataset, request):
        """
		Hook to execute after the dataset for this source has been created

		In this case, it is used to save the uploaded file to the dataset's
		result path, and finalise the dataset metadata.

		:param dict query:  Sanitised query parameters
		:param DataSet dataset:  Dataset created for this query
		:param request:  Flask request submitted for its creation
		"""

        strip_html = query.get("strip_html")

        file = request.files["data_upload"]

        file.seek(0)

        # detect encoding - UTF-8 with or without BOM
        encoding = SearchCustom.sniff_encoding(file)

        wrapped_file = io.TextIOWrapper(file, encoding=encoding)
        sample = wrapped_file.read(1024 * 1024)
        wrapped_file.seek(0)
        dialect = csv.Sniffer().sniff(sample, delimiters=(",", ";", "\t"))

        # With validated csvs, save as is but make sure the raw file is sorted
        reader = csv.DictReader(wrapped_file, dialect=dialect)
        with dataset.get_results_path().open("w", encoding="utf-8",
                                             newline="") as output_csv:
            # Sort by timestamp
            # note that this relies on the timestamp format to be sortable
            # but the alternative - first converting timestamps and then
            # sorting - would be quite intensive
            dataset.update_status("Sorting file by date")
            sorted_reader = sorted(
                reader,
                key=lambda row: row["timestamp"]
                if isinstance(row["timestamp"], str) else "")

            dataset.update_status("Writing to file")
            fieldnames = list(reader.fieldnames)
            if "unix_timestamp" not in fieldnames:
                fieldnames.append("unix_timestamp")

            writer = csv.DictWriter(output_csv, fieldnames=fieldnames)
            writer.writeheader()
            for row in sorted_reader:
                try:
                    sanitised_time = parse_datetime(row["timestamp"])
                    row["timestamp"] = sanitised_time.strftime(
                        "%Y-%m-%d %H:%I:%S")
                    row["unix_timestamp"] = sanitised_time.timestamp()
                except (TypeError, ValueError):
                    # bad format, skip
                    continue

                if strip_html:
                    row["body"] = strip_tags(row["body"])
                writer.writerow(row)

        file.close()

        with dataset.get_results_path().open(encoding="utf-8") as input:
            if file.filename.endswith(".tab"):
                reader = csv.DictReader(input,
                                        delimiter="\t",
                                        quoting=csv.QUOTE_NONE)
            else:
                reader = csv.DictReader(input)

            dataset.finish(sum(1 for line in reader))
            dataset.update_status("Result processed")

        dataset.update_version(get_software_version())
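
The timestamp normalisation above depends on a parse_datetime() function imported elsewhere; judging by the name it behaves like dateutil's parser.parse, but that is an assumption. Under that assumption, the conversion for a single value boils down to:

# Assumption: parse_datetime is equivalent to dateutil.parser.parse (third-party python-dateutil)
from dateutil.parser import parse as parse_datetime

raw = "2021-03-01T12:34:56"  # hypothetical incoming timestamp value
sanitised_time = parse_datetime(raw)

# normalised string form plus the matching unix timestamp column
timestamp = sanitised_time.strftime("%Y-%m-%d %H:%M:%S")
unix_timestamp = sanitised_time.timestamp()
print(timestamp, unix_timestamp)
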
Example 5
    def work(self):
        """
		Process a dataset

		Loads dataset metadata, sets up the scaffolding for performing some kind
		of processing on that dataset, and then processes it. Afterwards, clean
		up.
		"""
        try:
            self.dataset = DataSet(key=self.job.data["remote_id"], db=self.db)
        except TypeError:
            # query has been deleted in the meantime. finish without error,
            # as deleting it will have been a conscious choice by a user
            self.job.finish()
            return

        if self.dataset.data.get("key_parent", None):
            # search workers never have parents (for now), so we don't need to
            # find out what the parent dataset is if it's a search worker
            try:
                self.parent = DataSet(key=self.dataset.data["key_parent"],
                                      db=self.db)
            except TypeError:
                # we need to know what the parent dataset was to properly handle the
                # analysis
                self.log.warning(
                    "Processor %s queued for orphan query %s: cannot run, cancelling job"
                    % (self.type, self.dataset.key))
                self.job.finish()
                return

            if not self.parent.is_finished():
                # not finished yet - retry after a while
                self.job.release(delay=30)
                return

            self.parent = DataSet(key=self.dataset.data["key_parent"],
                                  db=self.db)

            self.source_file = self.parent.get_results_path()
            if not self.source_file.exists():
                self.dataset.update_status("Finished, no input data found.")

        self.log.info("Running post-processor %s on query %s" %
                      (self.type, self.job.data["remote_id"]))

        self.parameters = self.dataset.parameters
        self.dataset.update_status("Processing data")
        self.dataset.update_version(get_software_version())

        if self.interrupted:
            return self.abort()

        if not self.dataset.is_finished():
            try:
                self.process()
                self.after_process()
            except WorkerInterruptedException:
                self.abort()
            except Exception as e:
                frames = traceback.extract_tb(e.__traceback__)
                frames = [
                    frame.filename.split("/").pop() + ":" + str(frame.lineno)
                    for frame in frames[1:]
                ]
                location = "->".join(frames)

                # Not all datasets have parent keys
                if len(self.dataset.get_genealogy()) > 1:
                    parent_key = " (via " + self.dataset.get_genealogy(
                    )[0].key + ")"
                else:
                    parent_key = ""

                raise ProcessorException(
                    "Processor %s raised %s while processing dataset %s%s in %s:\n   %s\n"
                    % (self.type, e.__class__.__name__, self.dataset.key,
                       parent_key, location, str(e)))
        else:
            # dataset already finished, job shouldn't be open anymore
            self.log.warning(
                "Job %s/%s was queued for a dataset already marked as finished, deleting..."
                % (self.job.data["jobtype"], self.job.data["remote_id"]))
            self.job.finish()
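
The error handling above compresses a traceback into a compact file:line trail before re-raising. The same idea in isolation, as a minimal sketch:

import traceback

def describe_traceback(exc):
    # Turn a traceback into "file.py:12->other.py:34", skipping the outermost frame
    frames = traceback.extract_tb(exc.__traceback__)
    frames = [frame.filename.split("/").pop() + ":" + str(frame.lineno) for frame in frames[1:]]
    return "->".join(frames)

def inner():
    raise RuntimeError("something went wrong")

def outer():
    inner()

try:
    outer()
except RuntimeError as e:
    print(describe_traceback(e))  # e.g. "sketch.py:14->sketch.py:11" (names and numbers vary)
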
Example 6
    def __init__(self,
                 parameters={},
                 key=None,
                 job=None,
                 data=None,
                 db=None,
                 parent=None,
                 extension="csv",
                 type=None):
        """
		Create new dataset object

		If the dataset is not in the database yet, it is added.

		:param parameters:  Parameters, e.g. search query, date limits, et cetera
		:param db:  Database connection
		"""
        self.db = db
        self.folder = Path(config.PATH_ROOT, config.PATH_DATA)

        if key is not None:
            self.key = key
            current = self.db.fetchone("SELECT * FROM datasets WHERE key = %s",
                                       (self.key, ))
            if not current:
                raise TypeError(
                    "DataSet() requires a valid dataset key for its 'key' argument, \"%s\" given"
                    % key)

            query = current["query"]
        elif job is not None:
            current = self.db.fetchone(
                "SELECT * FROM datasets WHERE parameters::json->>'job' = %s",
                (job, ))
            if not current:
                raise TypeError(
                    "DataSet() requires a valid job ID for its 'job' argument")

            query = current["query"]
            self.key = current["key"]
        elif data is not None:
            current = data
            if "query" not in data or "key" not in data or "parameters" not in data or "key_parent" not in data:
                raise ValueError(
                    "DataSet() requires a complete dataset record for its 'data' argument"
                )

            query = current["query"]
            self.key = current["key"]
        else:
            if parameters is None:
                raise TypeError(
                    "DataSet() requires either 'key', or 'parameters' to be given"
                )

            if not type:
                raise ValueError(
                    "Datasets must have their type set explicitly")

            query = self.get_label(parameters, default=type)
            self.key = self.get_key(query, parameters, parent)
            current = self.db.fetchone(
                "SELECT * FROM datasets WHERE key = %s AND query = %s",
                (self.key, query))

        if current:
            self.data = current
            self.parameters = json.loads(self.data["parameters"])
            self.is_new = False
        else:
            self.data = {
                "key": self.key,
                "query": self.get_label(parameters, default=type),
                "parameters": json.dumps(parameters),
                "result_file": "",
                "status": "",
                "type": type,
                "timestamp": int(time.time()),
                "is_finished": False,
                "software_version": get_software_version(),
                "software_file": "",
                "num_rows": 0
            }
            self.parameters = parameters

            if parent:
                self.data["key_parent"] = parent

            self.db.insert("datasets", data=self.data)
            self.reserve_result_file(parameters, extension)

        # retrieve analyses and processors that may be run for this dataset
        analyses = self.db.fetchall(
            "SELECT * FROM datasets WHERE key_parent = %s ORDER BY timestamp ASC",
            (self.key, ))
        self.children = sorted(
            [DataSet(data=analysis, db=self.db) for analysis in analyses],
            key=lambda dataset: dataset.is_finished(),
            reverse=True)
        self.processors = self.get_available_processors()
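
The children are sorted so that finished analyses come first: is_finished() returns a boolean, False sorts before True, and reverse=True flips that while keeping the sort stable. A self-contained illustration with a stand-in object (not the real DataSet class):

class StubDataSet:
    # Minimal stand-in; only is_finished() matters for the ordering
    def __init__(self, key, finished):
        self.key = key
        self.finished = finished

    def is_finished(self):
        return self.finished

children = [StubDataSet("a", False), StubDataSet("b", True), StubDataSet("c", False)]
ordered = sorted(children, key=lambda dataset: dataset.is_finished(), reverse=True)
print([dataset.key for dataset in ordered])  # ['b', 'a', 'c']
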
Example 7
    def work(self):
        """
		Process a dataset

		Loads dataset metadata, sets up the scaffolding for performing some kind
		of processing on that dataset, and then processes it. Afterwards, clean
		up.
		"""
        try:
            self.dataset = DataSet(key=self.job.data["remote_id"], db=self.db)
        except TypeError:
            # query has been deleted in the meantime. finish without error,
            # as deleting it will have been a conscious choice by a user
            self.job.finish()
            return

        self.is_running_in_preset = False
        if self.dataset.data.get("key_parent", None):
            # search workers never have parents (for now), so we don't need to
            # find out what the source_dataset dataset is if it's a search worker
            try:
                self.source_dataset = DataSet(
                    key=self.dataset.data["key_parent"], db=self.db)

                # for presets, transparently use the *top* dataset as a source_dataset
                # since that is where any underlying processors should get
                # their data from. However, this should only be done as long as the
                # preset is not finished yet, because after that there may be processors
                # that run on the final preset result
                if self.source_dataset.type.find("preset-") == 0 and not self.source_dataset.is_finished():
                    self.is_running_in_preset = True
                    self.source_dataset = self.source_dataset.get_genealogy()[0]

            except TypeError:
                # we need to know what the source_dataset dataset was to properly handle the
                # analysis
                self.log.warning(
                    "Processor %s queued for orphan query %s: cannot run, cancelling job"
                    % (self.type, self.dataset.key))
                self.job.finish()
                return

            if not self.source_dataset.is_finished() and not self.is_running_in_preset:
                # not finished yet - retry after a while
                # exception for presets, since these *should* be unfinished
                # until underlying processors are done
                self.job.release(delay=30)
                return

            self.source_file = self.source_dataset.get_results_path()
            if not self.source_file.exists():
                self.dataset.update_status("Finished, no input data found.")

        self.log.info("Running processor %s on dataset %s" %
                      (self.type, self.job.data["remote_id"]))

        processor_name = self.title if hasattr(self, "title") else self.type
        self.dataset.clear_log()
        self.dataset.log("Processing '%s' started for dataset %s" %
                         (processor_name, self.dataset.key))

        # start log file
        self.parameters = self.dataset.parameters.copy()
        self.dataset.update_status("Processing data")
        self.dataset.update_version(get_software_version())

        # now the parameters have been loaded into memory, clear any sensitive
        # ones. This has a side-effect that a processor may not run again
        # without starting from scratch, but this is the price of progress
        if hasattr(self, "options"):
            for option in self.options:
                if self.options[option].get("sensitive"):
                    self.dataset.delete_parameter(option)

        if self.interrupted:
            self.dataset.log("Processing interrupted, trying again later")
            return self.abort()

        if not self.dataset.is_finished():
            try:
                self.process()
                self.after_process()
            except WorkerInterruptedException as e:
                self.dataset.log(
                    "Processing interrupted (%s), trying again later" % str(e))
                self.abort()
            except Exception as e:
                self.dataset.log("Processor crashed (%s), trying again later" %
                                 str(e))
                frames = traceback.extract_tb(e.__traceback__)
                frames = [
                    frame.filename.split("/").pop() + ":" + str(frame.lineno)
                    for frame in frames[1:]
                ]
                location = "->".join(frames)

                # Not all datasets have source_dataset keys
                if len(self.dataset.get_genealogy()) > 1:
                    parent_key = " (via " + self.dataset.get_genealogy(
                    )[0].key + ")"
                else:
                    parent_key = ""

                raise ProcessorException(
                    "Processor %s raised %s while processing dataset %s%s in %s:\n   %s\n"
                    % (self.type, e.__class__.__name__, self.dataset.key,
                       parent_key, location, str(e)))
        else:
            # dataset already finished, job shouldn't be open anymore
            self.log.warning(
                "Job %s/%s was queued for a dataset already marked as finished, deleting..."
                % (self.job.data["jobtype"], self.job.data["remote_id"]))
            self.job.finish()
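
The sensitive-parameter cleanup near the top of work() only needs the processor's option definitions; stripped of the dataset object, the filtering amounts to the sketch below (plain dicts stand in for the real objects, and pop() stands in for delete_parameter()).

# Sketch with plain dicts instead of the real processor/dataset objects
options = {
    "api_key": {"sensitive": True},  # hypothetical option definitions
    "query": {"sensitive": False},
    "max_items": {},
}
parameters = {"api_key": "secret", "query": "cats", "max_items": 100}

# drop any parameter whose option definition is marked as sensitive
for option in options:
    if options[option].get("sensitive"):
        parameters.pop(option, None)

print(parameters)  # {'query': 'cats', 'max_items': 100}
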