def after_create(query, dataset, request):
    """
    Hook to execute after the dataset for this source has been created

    In this case, it is used to save the uploaded file to the dataset's
    result path, and finalise the dataset metadata.

    :param dict query: Sanitised query parameters
    :param DataSet dataset: Dataset created for this query
    :param request: Flask request submitted for its creation
    """
    hashtag = re.compile(r"#([^\s,.+=-]+)")
    usertag = re.compile(r"@([^\s,.+=-]+)")

    file = request.files["data_upload"]
    platform = dataset.parameters.get("platform")
    dataset.type = "%s-search" % platform
    dataset.datasource = platform

    file.seek(0)
    done = 0

    # With validated csvs, save as is but make sure the raw file is sorted
    if dataset.parameters.get("platform") == "instagram":
        with dataset.get_results_path().open("w", encoding="utf-8", newline="") as output_csv:
            wrapped_upload = io.TextIOWrapper(file, encoding="utf-8")
            reader = csv.DictReader(wrapped_upload)

            writer = csv.DictWriter(output_csv, fieldnames=(
                "id", "thread_id", "parent_id", "body", "author", "timestamp", "type", "url",
                "thumbnail_url", "hashtags", "usertags", "mentioned", "num_likes", "num_comments",
                "subject"))
            writer.writeheader()

            dataset.update_status("Sorting by date...")
            posts = sorted(reader, key=lambda x: x["Created"])

            dataset.update_status("Processing posts...")
            for item in posts:
                done += 1
                url = item["URL"]
                url = re.sub(r"/*$", "", url)
                id = url.split("/")[-1]

                caption = item["Description"]
                hashtags = hashtag.findall(caption)
                usertags = usertag.findall(caption)

                datestamp = " ".join(item["Created"].split(" ")[:-1])
                date = datetime.datetime.strptime(datestamp, "%Y-%m-%d %H:%M:%S")

                writer.writerow({
                    "id": id,
                    "thread_id": id,
                    "parent_id": id,
                    "body": caption if caption is not None else "",
                    "author": item["User Name"],
                    "timestamp": int(date.timestamp()),
                    "type": "picture" if item["Type"] == "Photo" else item["Type"].lower(),
                    "url": item["URL"],
                    "thumbnail_url": item["Photo"],
                    "hashtags": ",".join(hashtags),
                    "usertags": ",".join(usertags),
                    "mentioned": "",
                    "num_likes": item["Likes"],
                    "num_comments": item["Comments"],
                    "subject": item["Title"]
                })

    elif platform == "tiktok":
        with dataset.get_results_path().open("w", encoding="utf-8", newline="") as output_csv:
            wrapped_upload = io.TextIOWrapper(file, encoding="utf-8")
            reader = csv.DictReader(wrapped_upload)

            writer = csv.DictWriter(output_csv, fieldnames=(
                "id", "thread_id", "author", "subject", "body", "timestamp", "is_harmful",
                "is_duet", "music_name", "music_id", "music_author", "video_url", "tiktok_url",
                "thumbnail_url", "amount_likes", "amount_comments", "amount_shares",
                "amount_plays", "hashtags"))
            writer.writeheader()

            dataset.update_status("Sorting by date...")
            posts = sorted(reader, key=lambda x: x["createTime"])

            dataset.update_status("Processing posts...")
            for item in posts:
                hashtags = json.loads(item["hashtags"])
                hashtags = [hashtag["name"] for hashtag in hashtags]
                done += 1

                writer.writerow({
                    "id": item["id"],
                    "thread_id": item["id"],
                    "author": item["authorMeta.name"],
                    "subject": "",
                    "body": item["text"],
                    "timestamp": int(item["createTime"]),
                    "is_harmful": -1,
                    "is_duet": -1,
                    "music_name": item["musicMeta.musicName"],
                    "music_id": item["musicMeta.musicId"],
                    "music_author": item["musicMeta.musicAuthor"],
                    "video_url": item["videoUrl"],
                    "tiktok_url": "https://tiktok.com/@%s/video/%s" % (item["authorMeta.id"], item["id"]),
                    "thumbnail_url": item["covers.default"],
                    "amount_likes": item["diggCount"],
                    "amount_comments": item["commentCount"],
                    "amount_shares": item["shareCount"],
                    "amount_plays": item["playCount"],
                    "hashtags": ",".join(hashtags),
                })

    file.close()

    dataset.finish(done)
    dataset.update_status("Result processed")
    dataset.update_version(get_software_version())
def after_create(query, dataset, request):
    """
    Hook to execute after the dataset for this source has been created

    In this case, it is used to save the uploaded file to the dataset's
    result path, and finalise the dataset metadata.

    :param dict query: Sanitised query parameters
    :param DataSet dataset: Dataset created for this query
    :param request: Flask request submitted for its creation
    """
    hashtag = re.compile(r"#([^\s,.+=-]+)")
    usertag = re.compile(r"@([^\s,.+=-]+)")

    file = request.files["option-data_upload"]
    platform = dataset.parameters.get("platform")

    # this is a bit hacky, but sometimes we have multiple tools that can
    # all serve as input for the same datasource (e.g. CrowdTangle and
    # the DMI Instagram Scraper would both go to the 'instagram'
    # datasource), so just assume the datasource ID has no dashes in it
    # and ignore everything after a dash for the purposes of determining
    # what datasource to assign to the dataset
    datasource = platform.split("-")[0]
    dataset.type = "%s-search" % datasource
    dataset.datasource = datasource

    file.seek(0)
    done = 0
    encoding = sniff_encoding(file)

    # With validated csvs, save as is but make sure the raw file is sorted
    if platform == "instagram-crowdtangle":
        with dataset.get_results_path().open("w", encoding="utf-8", newline="") as output_csv:
            wrapped_upload = io.TextIOWrapper(file, encoding=encoding)
            reader = csv.DictReader(wrapped_upload)

            writer = csv.DictWriter(output_csv, fieldnames=(
                "id", "thread_id", "parent_id", "body", "author", "timestamp", "type", "url",
                "thumbnail_url", "hashtags", "usertags", "mentioned", "num_likes", "num_comments",
                "subject"))
            writer.writeheader()

            dataset.update_status("Sorting by date...")
            posts = sorted(reader, key=lambda x: x["Post Created"])

            dataset.update_status("Processing posts...")
            for item in posts:
                done += 1
                url = item["URL"]
                url = re.sub(r"/*$", "", url)
                id = url.split("/")[-1]

                caption = item["Description"]
                hashtags = hashtag.findall(caption)
                usertags = usertag.findall(caption)

                datestamp = " ".join(item["Post Created"].split(" ")[:-1])
                date = datetime.datetime.strptime(datestamp, "%Y-%m-%d %H:%M:%S")

                writer.writerow({
                    "id": id,
                    "thread_id": id,
                    "parent_id": id,
                    "body": caption if caption is not None else "",
                    "author": item["User Name"],
                    "timestamp": int(date.timestamp()),
                    "type": "picture" if item["Type"] == "Photo" else item["Type"].lower(),
                    "url": item["URL"],
                    "thumbnail_url": item["Photo"],
                    "hashtags": ",".join(hashtags),
                    "usertags": ",".join(usertags),
                    "mentioned": "",
                    "num_likes": item["Likes"],
                    "num_comments": item["Comments"],
                    "subject": item["Title"]
                })

    elif platform == "facebook-crowdtangle":
        with dataset.get_results_path().open("w", encoding="utf-8", newline="") as output_csv:
            wrapped_upload = io.TextIOWrapper(file, encoding=encoding)
            reader = csv.DictReader(wrapped_upload)
            entity_name = "Page Name" if "Page Name" in reader.fieldnames else "Group Name"

            writer = csv.DictWriter(output_csv, fieldnames=(
                "id", "thread_id", "body", "author", "timestamp", "page_id", "page_name",
                "page_likes", "page_followers", "page_shared_from", "type", "interactions",
                "likes", "comments", "shares", "likes_love", "likes_wow", "likes_haha",
                "likes_sad", "likes_angry", "likes_care", "views_post", "views_total",
                "views_total_crossposts", "video_length", "video_status", "url", "url_original",
                "body_image", "body_link", "body_description", "hashtags", "sponsor_id",
                "sponsor_name"))
            writer.writeheader()

            dataset.update_status("Sorting by date...")
            posts = sorted(reader, key=lambda x: x["Created"])

            dataset.update_status("Processing posts...")
            for item in posts:
                done += 1
                hashtags = hashtag.findall(item["Message"])
                date = datetime.datetime.strptime(" ".join(item["Created"].split(" ")[:2]), "%Y-%m-%d %H:%M:%S")

                is_from_elsewhere = item["Link"].find("https://www.facebook.com/" + item["User Name"]) < 0
                shared_page = item["Link"].split("/")[3] if is_from_elsewhere and item["Link"].find("https://www.facebook.com/") == 0 else ""

                writer.writerow({
                    "id": item["URL"].split("/")[-1],
                    "thread_id": item["URL"].split("/")[-1],
                    "body": item["Message"],
                    "author": item["User Name"],
                    "timestamp": int(date.timestamp()),
                    "page_name": item[entity_name],
                    "page_likes": item["Likes at Posting"],
                    "page_id": item["Facebook Id"],
                    "page_followers": item["Followers at Posting"],
                    "page_shared_from": shared_page,
                    "type": item["Type"],
                    "interactions": int(re.sub(r"[^0-9]", "", item["Total Interactions"])) if item["Total Interactions"] else 0,
                    "comments": item["Comments"],
                    "shares": item["Shares"],
                    "likes": item["Likes"],
                    "likes_love": item["Love"],
                    "likes_wow": item["Wow"],
                    "likes_haha": item["Haha"],
                    "likes_sad": item["Sad"],
                    "likes_angry": item["Angry"],
                    "likes_care": item["Care"],
                    "views_post": item["Post Views"],
                    "views_total": item["Total Views"],
                    "views_total_crossposts": item["Total Views For All Crossposts"],
                    "video_length": "" if item["Video Length"] == "N/A" else item["Video Length"],
                    "video_status": item["Video Share Status"],
                    "url": item["URL"],
                    "hashtags": ",".join(hashtags),
                    "url_original": item["Link"],
                    "body_image": item["Image Text"],
                    "body_link": item["Link Text"],
                    "body_description": item["Description"],
                    "sponsor_id": item["Sponsor Id"],
                    "sponsor_name": item["Sponsor Name"]
                })

    elif platform == "instagram-dmi-scraper":
        # in principle, this csv file should be good to go
        # however, we still need to know how many rows are in it, so we
        # nevertheless copy it line by line rather than in one go
        # as a bonus this also ensures it uses the right csv dialect
        with dataset.get_results_path().open("w", encoding="utf-8") as output_csv:
            wrapped_upload = io.TextIOWrapper(file, encoding=encoding)
            reader = csv.DictReader(wrapped_upload)
            writer = csv.DictWriter(output_csv, fieldnames=reader.fieldnames)
            writer.writeheader()

            for row in reader:
                done += 1
                writer.writerow(row)

    elif platform == "tiktok":
        with dataset.get_results_path().open("w", encoding="utf-8", newline="") as output_csv:
            wrapped_upload = io.TextIOWrapper(file, encoding=encoding)
            reader = csv.DictReader(wrapped_upload)

            writer = csv.DictWriter(output_csv, fieldnames=(
                "id", "thread_id", "author", "subject", "body", "timestamp", "is_harmful",
                "is_duet", "music_name", "music_id", "music_author", "video_url", "tiktok_url",
                "thumbnail_url", "amount_likes", "amount_comments", "amount_shares",
                "amount_plays", "hashtags"))
            writer.writeheader()

            dataset.update_status("Sorting by date...")
            posts = sorted(reader, key=lambda x: x["createTime"])

            dataset.update_status("Processing posts...")
            for item in posts:
                hashtags = json.loads(item["hashtags"])
                hashtags = [hashtag["name"] for hashtag in hashtags]
                done += 1

                writer.writerow({
                    "id": item["id"],
                    "thread_id": item["id"],
                    "author": item["authorMeta.name"],
                    "subject": "",
                    "body": item["text"],
                    "timestamp": int(item["createTime"]),
                    "is_harmful": -1,
                    "is_duet": -1,
                    "music_name": item["musicMeta.musicName"],
                    "music_id": item["musicMeta.musicId"],
                    "music_author": item["musicMeta.musicAuthor"],
                    "video_url": item["videoUrl"],
                    "tiktok_url": "https://tiktok.com/@%s/video/%s" % (item["authorMeta.id"], item["id"]),
                    "thumbnail_url": item["covers.default"],
                    "amount_likes": item["diggCount"],
                    "amount_comments": item["commentCount"],
                    "amount_shares": item["shareCount"],
                    "amount_plays": item["playCount"],
                    "hashtags": ",".join(hashtags),
                })

    elif platform == "facepager":
        with dataset.get_results_path().open("w", encoding="utf-8", newline="") as output_csv:
            wrapped_upload = io.TextIOWrapper(file, encoding=encoding)
            reader = csv.DictReader(wrapped_upload)

            writer = csv.DictWriter(output_csv, fieldnames=(
                "id", "thread_id", "author", "subject", "body", "timestamp", "is_harmful",
                "is_duet", "music_name", "music_id", "music_author", "video_url", "tiktok_url",
                "thumbnail_url", "amount_likes", "amount_comments", "amount_shares",
                "amount_plays", "hashtags"))
            writer.writeheader()

            dataset.update_status("Sorting by date...")
            posts = sorted(reader, key=lambda x: x["createTime"])

            dataset.update_status("Processing posts...")
            for item in posts:
                hashtags = json.loads(item["hashtags"])
                hashtags = [hashtag["name"] for hashtag in hashtags]
                done += 1

                writer.writerow({
                    "id": item["id"],
                    "thread_id": item["id"],
                    "author": item["authorMeta.name"],
                    "subject": "",
                    "body": item["text"],
                    "timestamp": int(item["createTime"]),
                    "is_harmful": -1,
                    "is_duet": -1,
                    "music_name": item["musicMeta.musicName"],
                    "music_id": item["musicMeta.musicId"],
                    "music_author": item["musicMeta.musicAuthor"],
                    "video_url": item["videoUrl"],
                    "tiktok_url": "https://tiktok.com/@%s/video/%s" % (item["authorMeta.id"], item["id"]),
                    "thumbnail_url": item["covers.default"],
                    "amount_likes": item["diggCount"],
                    "amount_comments": item["commentCount"],
                    "amount_shares": item["shareCount"],
                    "amount_plays": item["playCount"],
                    "hashtags": ",".join(hashtags),
                })

    file.close()

    dataset.finish(done)
    dataset.update_status("Result processed")
    dataset.update_version(get_software_version())
def after_create(query, dataset, request):
    """
    Hook to execute after the dataset for this source has been created

    In this case, it is used to save the uploaded file to the dataset's
    result path, and finalise the dataset metadata.

    :param dict query: Sanitised query parameters
    :param DataSet dataset: Dataset created for this query
    :param request: Flask request submitted for its creation
    """
    strip_html = query.get("strip_html")

    file = request.files["data_upload"]
    file.seek(0)

    # Convert .tab files to comma delimited files
    if file.filename.endswith(".tab"):
        wrapped_upload = io.TextIOWrapper(file, encoding="utf-8")
        reader = csv.DictReader(wrapped_upload, delimiter="\t", quoting=csv.QUOTE_NONE)

        # Write to csv
        with dataset.get_results_path().open("w", encoding="utf-8", newline="") as output_csv:
            writer = csv.DictWriter(output_csv, fieldnames=reader.fieldnames)
            writer.writeheader()

            for row in reader:
                if strip_html:
                    # Possibly strip HTML
                    row["body"] = strip_tags(row["body"])
                writer.writerow(row)

        wrapped_upload.detach()
    else:
        # With validated csvs, just save the raw file
        if not strip_html:
            file.save(dataset.get_results_path().open("wb"))
        else:
            with dataset.get_results_path().open("w", encoding="utf-8", newline="") as output_csv:
                wrapped_upload = io.TextIOWrapper(file, encoding="utf-8")
                reader = csv.DictReader(wrapped_upload)
                writer = csv.DictWriter(output_csv, fieldnames=reader.fieldnames)
                writer.writeheader()

                for row in reader:
                    row["body"] = strip_tags(row["body"])
                    writer.writerow(row)

    file.close()

    with dataset.get_results_path().open(encoding="utf-8") as input:
        if file.filename.endswith(".tab"):
            reader = csv.DictReader(input, delimiter="\t", quoting=csv.QUOTE_NONE)
        else:
            reader = csv.DictReader(input)

        dataset.finish(sum(1 for line in reader))

    dataset.update_status("Result processed")
    dataset.update_version(get_software_version())
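# strip_tags() is used above but not defined in this excerpt. A minimal sketch
# using only the standard library, assuming the helper simply removes HTML
# markup from a value and returns the remaining text; the real helper may be
# more elaborate (e.g. in how it handles entities or empty values).
from html.parser import HTMLParser

class _TagStripper(HTMLParser):
    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.chunks = []

    def handle_data(self, data):
        # keep only text nodes, discarding the surrounding tags
        self.chunks.append(data)

def strip_tags_sketch(value):
    stripper = _TagStripper()
    stripper.feed(value or "")
    return "".join(stripper.chunks)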
def after_create(query, dataset, request):
    """
    Hook to execute after the dataset for this source has been created

    In this case, it is used to save the uploaded file to the dataset's
    result path, and finalise the dataset metadata.

    :param dict query: Sanitised query parameters
    :param DataSet dataset: Dataset created for this query
    :param request: Flask request submitted for its creation
    """
    strip_html = query.get("strip_html")

    file = request.files["data_upload"]
    file.seek(0)

    # detect encoding - UTF-8 with or without BOM
    encoding = SearchCustom.sniff_encoding(file)

    wrapped_file = io.TextIOWrapper(file, encoding=encoding)
    sample = wrapped_file.read(1024 * 1024)
    wrapped_file.seek(0)
    dialect = csv.Sniffer().sniff(sample, delimiters=(",", ";", "\t"))

    # With validated csvs, save as is but make sure the raw file is sorted
    reader = csv.DictReader(wrapped_file, dialect=dialect)
    with dataset.get_results_path().open("w", encoding="utf-8", newline="") as output_csv:
        # Sort by timestamp
        # note that this relies on the timestamp format to be sortable
        # but the alternative - first converting timestamps and then
        # sorting - would be quite intensive
        dataset.update_status("Sorting file by date")
        sorted_reader = sorted(reader, key=lambda row: row["timestamp"] if isinstance(row["timestamp"], str) else "")

        dataset.update_status("Writing to file")
        fieldnames = list(reader.fieldnames)
        if "unix_timestamp" not in fieldnames:
            fieldnames.append("unix_timestamp")

        writer = csv.DictWriter(output_csv, fieldnames=fieldnames)
        writer.writeheader()

        for row in sorted_reader:
            try:
                sanitised_time = parse_datetime(row["timestamp"])
                row["timestamp"] = sanitised_time.strftime("%Y-%m-%d %H:%M:%S")
                row["unix_timestamp"] = sanitised_time.timestamp()
            except (TypeError, ValueError):
                # bad format, skip
                continue

            if strip_html:
                row["body"] = strip_tags(row["body"])

            writer.writerow(row)

    file.close()

    with dataset.get_results_path().open(encoding="utf-8") as input:
        if file.filename.endswith(".tab"):
            reader = csv.DictReader(input, delimiter="\t", quoting=csv.QUOTE_NONE)
        else:
            reader = csv.DictReader(input)

        dataset.finish(sum(1 for line in reader))

    dataset.update_status("Result processed")
    dataset.update_version(get_software_version())
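# Assumption: parse_datetime() above is a lenient parser along the lines of
# dateutil.parser.parse, turning arbitrary timestamp strings into datetime
# objects. A small usage sketch of the normalisation the loop above performs;
# the input string is hypothetical.
from dateutil.parser import parse as parse_datetime_sketch

def _demo_timestamp_sanitising():
    sanitised_time = parse_datetime_sketch("01/06/2020 13:37")
    print(sanitised_time.strftime("%Y-%m-%d %H:%M:%S"))  # 2020-01-06 13:37:00
    print(int(sanitised_time.timestamp()))               # unix timestamp, local timezone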
def work(self):
    """
    Process a dataset

    Loads dataset metadata, sets up the scaffolding for performing some kind
    of processing on that dataset, and then processes it. Afterwards, clean
    up.
    """
    try:
        self.dataset = DataSet(key=self.job.data["remote_id"], db=self.db)
    except TypeError:
        # query has been deleted in the meantime. finish without error,
        # as deleting it will have been a conscious choice by a user
        self.job.finish()
        return

    if self.dataset.data.get("key_parent", None):
        # search workers never have parents (for now), so we don't need to
        # find out what the parent dataset is if it's a search worker
        try:
            self.parent = DataSet(key=self.dataset.data["key_parent"], db=self.db)
        except TypeError:
            # we need to know what the parent dataset was to properly handle the
            # analysis
            self.log.warning("Processor %s queued for orphan query %s: cannot run, cancelling job" % (
                self.type, self.dataset.key))
            self.job.finish()
            return

        if not self.parent.is_finished():
            # not finished yet - retry after a while
            self.job.release(delay=30)
            return

        self.parent = DataSet(key=self.dataset.data["key_parent"], db=self.db)

        self.source_file = self.parent.get_results_path()
        if not self.source_file.exists():
            self.dataset.update_status("Finished, no input data found.")

    self.log.info("Running post-processor %s on query %s" % (self.type, self.job.data["remote_id"]))

    self.parameters = self.dataset.parameters
    self.dataset.update_status("Processing data")
    self.dataset.update_version(get_software_version())

    if self.interrupted:
        return self.abort()

    if not self.dataset.is_finished():
        try:
            self.process()
            self.after_process()
        except WorkerInterruptedException:
            self.abort()
        except Exception as e:
            frames = traceback.extract_tb(e.__traceback__)
            frames = [frame.filename.split("/").pop() + ":" + str(frame.lineno) for frame in frames[1:]]
            location = "->".join(frames)

            # Not all datasets have parent keys
            if len(self.dataset.get_genealogy()) > 1:
                parent_key = " (via " + self.dataset.get_genealogy()[0].key + ")"
            else:
                parent_key = ""

            raise ProcessorException("Processor %s raised %s while processing dataset %s%s in %s:\n %s\n" % (
                self.type, e.__class__.__name__, self.dataset.key, parent_key, location, str(e)))
    else:
        # dataset already finished, job shouldn't be open anymore
        self.log.warning("Job %s/%s was queued for a dataset already marked as finished, deleting..." % (
            self.job.data["jobtype"], self.job.data["remote_id"]))
        self.job.finish()
def __init__(self, parameters={}, key=None, job=None, data=None, db=None, parent=None, extension="csv", type=None):
    """
    Create new dataset object

    If the dataset is not in the database yet, it is added.

    :param parameters: Parameters, e.g. search query, date limits, et cetera
    :param db: Database connection
    """
    self.db = db
    self.folder = Path(config.PATH_ROOT, config.PATH_DATA)

    if key is not None:
        self.key = key
        current = self.db.fetchone("SELECT * FROM datasets WHERE key = %s", (self.key,))
        if not current:
            raise TypeError("DataSet() requires a valid dataset key for its 'key' argument, \"%s\" given" % key)

        query = current["query"]
    elif job is not None:
        current = self.db.fetchone("SELECT * FROM datasets WHERE parameters::json->>'job' = %s", (job,))
        if not current:
            raise TypeError("DataSet() requires a valid job ID for its 'job' argument")

        query = current["query"]
        self.key = current["key"]
    elif data is not None:
        current = data
        if "query" not in data or "key" not in data or "parameters" not in data or "key_parent" not in data:
            raise ValueError("DataSet() requires a complete dataset record for its 'data' argument")

        query = current["query"]
        self.key = current["key"]
    else:
        if parameters is None:
            raise TypeError("DataSet() requires either 'key', or 'parameters' to be given")

        if not type:
            raise ValueError("Datasets must have their type set explicitly")

        query = self.get_label(parameters, default=type)
        self.key = self.get_key(query, parameters, parent)
        current = self.db.fetchone("SELECT * FROM datasets WHERE key = %s AND query = %s", (self.key, query))

    if current:
        self.data = current
        self.parameters = json.loads(self.data["parameters"])
        self.is_new = False
    else:
        self.data = {
            "key": self.key,
            "query": self.get_label(parameters, default=type),
            "parameters": json.dumps(parameters),
            "result_file": "",
            "status": "",
            "type": type,
            "timestamp": int(time.time()),
            "is_finished": False,
            "software_version": get_software_version(),
            "software_file": "",
            "num_rows": 0
        }
        self.parameters = parameters

        if parent:
            self.data["key_parent"] = parent

        self.db.insert("datasets", data=self.data)
        self.reserve_result_file(parameters, extension)

    # retrieve analyses and processors that may be run for this dataset
    analyses = self.db.fetchall("SELECT * FROM datasets WHERE key_parent = %s ORDER BY timestamp ASC", (self.key,))
    self.children = sorted([DataSet(data=analysis, db=self.db) for analysis in analyses],
                           key=lambda dataset: dataset.is_finished(), reverse=True)

    self.processors = self.get_available_processors()
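# get_key() is referenced above but not shown in this excerpt. Assumption: it
# derives a deterministic key from the query, the parameters and the parent
# key, e.g. by hashing them. A minimal sketch, not necessarily the actual
# scheme used.
import hashlib
import json

def get_key_sketch(query, parameters, parent=""):
    # hash a canonical serialisation so identical inputs map to the same key
    plain_key = repr(query) + json.dumps(parameters, sort_keys=True) + str(parent or "")
    return hashlib.md5(plain_key.encode("utf-8")).hexdigest()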
def work(self):
    """
    Process a dataset

    Loads dataset metadata, sets up the scaffolding for performing some kind
    of processing on that dataset, and then processes it. Afterwards, clean
    up.
    """
    try:
        self.dataset = DataSet(key=self.job.data["remote_id"], db=self.db)
    except TypeError:
        # query has been deleted in the meantime. finish without error,
        # as deleting it will have been a conscious choice by a user
        self.job.finish()
        return

    self.is_running_in_preset = False
    if self.dataset.data.get("key_parent", None):
        # search workers never have parents (for now), so we don't need to
        # find out what the source dataset is if it's a search worker
        try:
            self.source_dataset = DataSet(key=self.dataset.data["key_parent"], db=self.db)

            # for presets, transparently use the *top* dataset as a source_dataset
            # since that is where any underlying processors should get
            # their data from. However, this should only be done as long as the
            # preset is not finished yet, because after that there may be processors
            # that run on the final preset result
            if self.source_dataset.type.find("preset-") == 0 and not self.source_dataset.is_finished():
                self.is_running_in_preset = True
                self.source_dataset = self.source_dataset.get_genealogy()[0]
        except TypeError:
            # we need to know what the source dataset was to properly handle the
            # analysis
            self.log.warning("Processor %s queued for orphan query %s: cannot run, cancelling job" % (
                self.type, self.dataset.key))
            self.job.finish()
            return

        if not self.source_dataset.is_finished() and not self.is_running_in_preset:
            # not finished yet - retry after a while
            # exception for presets, since these *should* be unfinished
            # until underlying processors are done
            self.job.release(delay=30)
            return

        self.source_file = self.source_dataset.get_results_path()
        if not self.source_file.exists():
            self.dataset.update_status("Finished, no input data found.")

    self.log.info("Running processor %s on dataset %s" % (self.type, self.job.data["remote_id"]))

    processor_name = self.title if hasattr(self, "title") else self.type
    self.dataset.clear_log()
    self.dataset.log("Processing '%s' started for dataset %s" % (processor_name, self.dataset.key))  # start log file

    self.parameters = self.dataset.parameters.copy()
    self.dataset.update_status("Processing data")
    self.dataset.update_version(get_software_version())

    # now the parameters have been loaded into memory, clear any sensitive
    # ones. This has a side-effect that a processor may not run again
    # without starting from scratch, but this is the price of progress
    if hasattr(self, "options"):
        for option in self.options:
            if self.options[option].get("sensitive"):
                self.dataset.delete_parameter(option)

    if self.interrupted:
        self.dataset.log("Processing interrupted, trying again later")
        return self.abort()

    if not self.dataset.is_finished():
        try:
            self.process()
            self.after_process()
        except WorkerInterruptedException as e:
            self.dataset.log("Processing interrupted (%s), trying again later" % str(e))
            self.abort()
        except Exception as e:
            self.dataset.log("Processor crashed (%s), trying again later" % str(e))
            frames = traceback.extract_tb(e.__traceback__)
            frames = [frame.filename.split("/").pop() + ":" + str(frame.lineno) for frame in frames[1:]]
            location = "->".join(frames)

            # Not all datasets have parent keys
            if len(self.dataset.get_genealogy()) > 1:
                parent_key = " (via " + self.dataset.get_genealogy()[0].key + ")"
            else:
                parent_key = ""

            raise ProcessorException("Processor %s raised %s while processing dataset %s%s in %s:\n %s\n" % (
                self.type, e.__class__.__name__, self.dataset.key, parent_key, location, str(e)))
    else:
        # dataset already finished, job shouldn't be open anymore
        self.log.warning("Job %s/%s was queued for a dataset already marked as finished, deleting..." % (
            self.job.data["jobtype"], self.job.data["remote_id"]))
        self.job.finish()
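# Illustrative only: the kind of processor subclass that work() above drives.
# The class name and the BasicProcessor base class are assumptions made for
# this sketch; only self.source_file, self.dataset and the process()/
# update_status()/finish() calls mirror the code above.
import csv

class CountRowsProcessorSketch(BasicProcessor):  # BasicProcessor assumed to define work()
    type = "count-rows-sketch"
    title = "Count rows (sketch)"

    def process(self):
        # read the source dataset's result file and record how many rows it has
        with self.source_file.open(encoding="utf-8") as infile:
            rows = sum(1 for _ in csv.DictReader(infile))

        self.dataset.update_status("Counted %i rows" % rows)
        self.dataset.finish(rows)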