def validate_query(query, request, user):
    """
    Validate input for a dataset query on the Douban data source.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    """
    filtered_query = {}

    # the dates need to make sense as a range to search within
    after, before = query.get("daterange")
    if before and after and before < after:
        raise QueryParametersException("Date range must start before it ends")

    filtered_query["min_date"], filtered_query["max_date"] = (after, before)

    # normalize groups to just their IDs, even if a URL was provided, and
    # limit to 25
    groups = [group.split("/group/").pop().split("/")[0].strip() for group in
              query["groups"].replace("\n", ",").split(",")]
    groups = [group for group in groups if group][:25]

    if not any(groups):
        raise QueryParametersException("No valid groups were provided.")

    filtered_query["groups"] = ",".join(groups)

    # max amount of topics is 200 because after that Douban starts throwing 429s
    filtered_query["amount"] = max(min(convert_to_int(query["amount"], 10), 200), 1)

    # strip HTML from posts?
    filtered_query["strip"] = bool(query.get("strip", False))

    return filtered_query
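
# Illustrative sketch (not part of the original module): a hypothetical
# client-side query dict as validate_query() above expects it. The
# "daterange" value is assumed to be a (start, end) pair of unix timestamps;
# "groups" may mix plain group IDs and full group URLs.
example_query = {
    "daterange": (1609459200, 1612137600),
    "groups": "https://www.douban.com/group/example/, anothergroup",
    "amount": "50",
    "strip": True
}
# validate_query(example_query, request, user) would then return a dict with
# "min_date"/"max_date", a comma-separated "groups" string of at most 25 IDs,
# an "amount" clamped to 1-200, and a boolean "strip".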
def get_posts_complex(self, query):
    """
    Execute a query; get messages for given parameters

    :param dict query:  Query parameters, as part of the DataSet object
    :return list:  Posts, sorted by thread and post ID, in ascending order
    """
    self.eventloop = asyncio.new_event_loop()
    session_path = Path(__file__).parent.joinpath("sessions", self.dataset.parameters.get("session"))

    client = None
    try:
        client = TelegramClient(str(session_path), self.dataset.parameters.get("api_id"),
                                self.dataset.parameters.get("api_hash"), loop=self.eventloop)
        client.start()
    except Exception as e:
        self.dataset.update_status("Error connecting to the Telegram API with provided credentials.", is_final=True)
        self.dataset.finish()
        if client and hasattr(client, "disconnect"):
            client.disconnect()
        return None

    # ready our parameters
    parameters = self.dataset.get_parameters()
    queries = [query.strip() for query in parameters.get("query", "").split(",")]
    max_items = convert_to_int(parameters.get("items", 10), 10)

    # userinfo needs some work before it can be retrieved, something with
    # async method calls
    userinfo = False  # bool(parameters.get("scrape-userinfo", False))

    try:
        posts = self.gather_posts(client, queries, max_items, userinfo)
    except Exception as e:
        self.dataset.update_status("Error scraping posts from Telegram")
        self.log.error("Telegram scraping error: %s" % traceback.format_exc())
        posts = None
    finally:
        client.disconnect()

    # delete personal data from parameters. We still have a Telegram
    # session saved to disk, but it's useless without this information.
    self.dataset.delete_parameter("api_id")
    self.dataset.delete_parameter("api_hash")
    self.dataset.delete_parameter("api_phone")

    return posts
def process(self):
    """
    This takes a 4CAT image archive as input, annotates each image with the
    Google Vision API, and writes one NDJSON line of annotations per image
    to the results file
    """
    api_key = self.parameters.get("api_key")
    self.dataset.delete_parameter("api_key")  # sensitive, delete after use
    features = self.parameters.get("features")
    features = [{"type": feature} for feature in features]

    if not api_key:
        self.dataset.update_status("You need to provide a valid API key", is_final=True)
        self.dataset.finish(0)
        return

    max_images = convert_to_int(self.parameters.get("amount", 0), 100)
    total = self.source_dataset.num_rows if not max_images else min(max_images, self.source_dataset.num_rows)
    done = 0

    for image_file in self.iterate_archive_contents(self.source_file):
        if self.interrupted:
            raise ProcessorInterruptedException("Interrupted while fetching data from Google Vision API")

        done += 1
        self.dataset.update_status("Annotating image %i/%i" % (done, total))

        try:
            annotations = self.annotate_image(image_file, api_key, features)
        except RuntimeError:
            # cannot continue fetching, e.g. when API key is invalid
            break

        if not annotations:
            continue

        annotations = {"file_name": image_file.name, **annotations}

        with self.dataset.get_results_path().open("a", encoding="utf-8") as outfile:
            outfile.write(json.dumps(annotations) + "\n")

        if max_images and done >= max_images:
            break

    self.dataset.update_status("Annotations retrieved for %i images" % done)
    self.dataset.finish(done)
def get_processor_pipeline(self):
    """
    This queues a series of post-processors to annotate images

    First, the required amount of images referenced in the dataset is
    downloaded, in order of most-referenced; then, the requested features
    are extracted using the Google Vision API; finally, the result is
    converted to a CSV file for easy processing.
    """
    amount = convert_to_int(self.parameters.get("amount", 10), 10)
    api_key = self.parameters.get("api_key", "")
    features = self.parameters.get("features", "")

    self.dataset.delete_parameter("api_key")  # sensitive, delete as soon as possible

    pipeline = [
        # first, extract top images
        {
            "type": "top-images",
            "parameters": {
                "overwrite": False
            }
        },
        # then, download the images we want to annotate
        {
            "type": "image-downloader",
            "parameters": {
                "amount": amount,
                "overwrite": False
            }
        },
        # then, annotate the downloaded images with the Google Vision API
        {
            "type": "google-vision-api",
            "parameters": {
                "features": features,
                "amount": amount,
                "api_key": api_key
            }
        },
        # finally, create a simplified CSV file from the downloaded NDJSON (which can also be retrieved later)
        {
            "type": "convert-vision-to-csv",
            "parameters": {}
        }
    ]

    return pipeline
def process(self):
    """
    Takes the thumbnails downloaded from YouTube metadata and turns them
    into an image wall.
    """
    results_path = self.dataset.get_results_path()
    dirname = Path(results_path.parent, results_path.name.replace(".", ""))

    # Get the required parameters
    # path to the YouTube csv data that was the source of the thumbnails
    root_csv = self.dataset.get_genealogy()[-3].get_results_path()
    max_amount = convert_to_int(self.parameters.get("max_amount", 0), 0)
    category_overlay = self.parameters.get("category_overlay")

    # Build that wall!
    self.make_imagewall(root_csv, max_amount=max_amount, category_overlay=category_overlay)
def process(self):
    """
    Unzips and appends tokens to fetch and write a tf-idf matrix
    """
    # Validate and process user inputs
    library = self.parameters.get("library", "gensim")

    if "-" not in self.parameters.get("n_size"):
        n_size = convert_to_int(self.parameters.get("n_size", 1), 1)
        n_size = (n_size, n_size)  # needs to be a tuple for sklearn.
    else:
        n_size_split = self.parameters.get("n_size").split("-")
        n_size = (convert_to_int(n_size_split[0]), convert_to_int(n_size_split[1]))

    min_occurrences = convert_to_int(self.parameters.get("min_occurrences", 1), 1)
    max_occurrences = convert_to_int(self.parameters.get("max_occurrences", -1), -1)
    max_output = convert_to_int(self.parameters.get("max_output", 10), 10)
    smartirs = self.parameters.get("smartirs", "nfc")

    # Get token sets
    self.dataset.update_status("Processing token sets")
    tokens = []
    dates = []

    # Go through all archived token sets and generate collocations for each
    for token_file in self.iterate_archive_contents(self.source_file):
        # Get the date
        date_string = token_file.stem
        dates.append(date_string)

        # we support both pickle and json dumps of vectors
        token_unpacker = pickle if token_file.suffix == ".pb" else json

        try:
            with token_file.open("rb") as binary_tokens:
                # these were saved as pickle dumps so we need the binary mode
                post_tokens = token_unpacker.load(binary_tokens)

                # Flatten the list of lists of tokens - we're treating the whole time series as one document.
                post_tokens = list(itertools.chain.from_iterable(post_tokens))

                # Add to the list of all dates' tokens
                tokens.append(post_tokens)
        except UnicodeDecodeError:
            self.dataset.update_status("Error reading input data. If it was imported from outside 4CAT, make sure it is encoded as UTF-8.", is_final=True)
            self.dataset.finish(0)
            return

    # Make sure `min_occurrences` and `max_occurrences` are valid
    if min_occurrences > len(tokens):
        min_occurrences = len(tokens) - 1

    if max_occurrences <= 0 or max_occurrences > len(tokens):
        max_occurrences = len(tokens)

    # Get the tf-idf matrix.
    self.dataset.update_status("Generating tf-idf for token set")
    try:
        if library == "gensim":
            results = self.get_tfidf_gensim(tokens, dates, top_n=max_output, smartirs=smartirs)
        elif library == "scikit-learn":
            results = self.get_tfidf_sklearn(tokens, dates, ngram_range=n_size, min_occurrences=min_occurrences,
                                             max_occurrences=max_occurrences, top_n=max_output)
        else:
            self.dataset.update_status("Invalid library.")
            self.dataset.finish(0)
            return

        if results:
            # Generate csv and finish
            self.dataset.update_status("Writing to csv and finishing")
            self.write_csv_items_and_finish(results)
    except MemoryError:
        self.dataset.update_status("Out of memory - dataset too large to run tf-idf analysis.")
        self.dataset.finish(0)
def get_items(self, query):
    """
    Get Douban posts

    In the case of Douban, there is no need for multiple pathways, so we can
    route it all to the one post query method. Will scrape posts from the
    most recent topics for a given list of groups. Douban prevents scraping
    old content, so this is mostly useful to get a sense of what a given
    group is talking about at the moment.

    :param query:  Filtered query parameters
    :return:
    """
    groups = query["groups"].split(",")
    max_topics = min(convert_to_int(query["amount"], 100), 500)
    start = query["min_date"]
    end = query["max_date"]
    strip = bool(query["strip"])

    topics_processed = 0
    posts_processed = 0

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"}

    for group in groups:
        # get URL for group index
        group = str(group)
        group_url = "https://www.douban.com/group/%s/discussion?start=" % group
        offset = 0

        while True:
            # get list of topics in group, for the given offset
            fetch_url = group_url + str(offset)
            request = self.get_douban_url(fetch_url, headers=headers)

            # this would usually mean the group doesn't exist, or we hit some rate limit
            if request.status_code != 200:
                self.dataset.update_status("Got response code %i for group %s. Continuing with next group..." % (
                    request.status_code, group))
                break

            self.dataset.update_status("Scraping group %s..." % group)

            # parse the HTML and get links to individual topics, as well as group name
            overview_page = BeautifulSoup(request.text, 'html.parser')
            group_name = overview_page.select_one(".group-item .title a").text

            for topic in overview_page.select("table.olt tr:not(.th)"):
                if self.interrupted:
                    raise ProcessorInterruptedException("Interrupted while scraping Douban topics")

                if topics_processed >= max_topics:
                    break

                # get topic URL, and whether it is an 'elite' topic
                topic_url = topic.find("a").get("href")
                topic_is_elite = "yes" if bool(topic.select_one(".elite_topic_lable")) else "no"
                topic_id = topic_url.split("/topic/").pop().split("/")[0]
                topic_updated = int(datetime.datetime.strptime(topic.select_one(".time").text, "%m-%d %H:%M").timestamp())

                # if a date range is given, ignore topics outside of it
                if start and topic_updated < start:
                    continue

                if end and topic_updated > end:
                    break

                self.dataset.update_status("%i posts scraped. Scraping topics %i-%i from group %s" % (
                    posts_processed, offset, min(max_topics, offset + 50), group_name))

                # request topic page - fortunately all comments are on a single page
                topic_request = self.get_douban_url(topic_url, headers=headers)
                time.sleep(5)  # don't hit rate limits

                topic_page = BeautifulSoup(topic_request.text, 'html.parser')
                topic = topic_page.select_one("#topic-content")
                topics_processed += 1

                # include original post as the first item
                try:
                    first_post = {
                        "id": topic_id,
                        "group_id": group,
                        "thread_id": topic_id,
                        "group_name": group_name,
                        "subject": topic_page.select_one("h1").text.strip(),
                        "body": topic_page.select_one(".topic-richtext").decode_contents(formatter="html").strip(),
                        "author": topic.select_one(".user-face img").get("alt"),
                        "author_id": topic.select_one(".user-face a").get("href").split("/people/").pop().split("/")[0],
                        "author_avatar": topic.select_one(".user-face img").get("src").replace("/u", "/ul"),
                        "timestamp": int(datetime.datetime.strptime(topic.select_one(".create-time").text, "%Y-%m-%d %H:%M:%S").timestamp()),
                        "likes": 0,
                        "is_highlighted": "no",
                        "is_reply": "no",
                        "is_topic_elite": topic_is_elite,
                        "image_urls": ",".join([img.get("src") for img in topic.select(".topic-richtext img")])
                    }
                except (AttributeError, ValueError):
                    self.dataset.log("Unexpected data format when parsing topic %s/%s, skipping" % (group_name, topic_id))
                    continue

                if strip:
                    first_post["body"] = strip_tags(first_post["body"])

                posts_processed += 1
                yield first_post

                # now loop through all comments on the page
                for comment in topic_page.select("ul#comments > li"):
                    comment_data = {
                        "id": comment.get("data-cid"),
                        "group_id": group,
                        "thread_id": topic_id,
                        "group_name": group_name,
                        "subject": "",
                        "body": comment.select_one(".reply-content").decode_contents(formatter="html").strip(),
                        "author": comment.select_one(".user-face img").get("alt"),
                        "author_id": comment.select_one(".user-face a").get("href").split("/people/").pop().split("/")[0],
                        "author_avatar": comment.select_one(".user-face img").get("src").replace("/u", "/ul"),
                        "timestamp": int(datetime.datetime.strptime(comment.select_one(".pubtime").text, "%Y-%m-%d %H:%M:%S").timestamp()),
                        "likes": convert_to_int(re.sub(r"[^0-9]", "", comment.select_one(".comment-vote.lnk-fav").text), 0),
                        "is_highlighted": "yes" if comment.get("data-cid") in [hl.get("data-cid") for hl in comment.select("ul#popular-comments li")] else "no",
                        "is_reply": "yes" if comment.select_one(".reply-quote-content") else "no",
                        "is_topic_elite": topic_is_elite,
                        "image_urls": ",".join([img.get("src") for img in comment.select(".reply-content img")])
                    }

                    if strip:
                        comment_data["body"] = strip_tags(comment_data["body"])

                    posts_processed += 1
                    yield comment_data

            if offset < max_topics - 50:
                offset += 50
            else:
                break
def process(self):
    """
    This takes previously generated Word2Vec models and uses them to find
    similar words based on a list of words
    """
    self.dataset.update_status("Processing sentences")

    words = self.parameters.get("words", "").split(",")
    if not words:
        self.dataset.update_status("No input words provided, cannot look for similar words.", is_final=True)
        self.dataset.finish(-1)
        return

    num_words = convert_to_int(self.parameters.get("num-words"), self.options["num-words"]["default"])
    try:
        threshold = float(self.parameters.get("threshold", self.options["threshold"]["default"]))
    except ValueError:
        threshold = float(self.options["threshold"]["default"])

    # prepare staging area
    temp_path = self.dataset.get_temporary_path()
    temp_path.mkdir()

    # go through all models and calculate similarity for all given input words
    result = []
    with zipfile.ZipFile(self.source_file, "r") as model_archive:
        model_files = model_archive.namelist()

        for model_file in model_files:
            if self.interrupted:
                raise ProcessorInterruptedException("Interrupted while processing token sets")

            # the model is stored as [interval].model
            model_name = model_file.split("/")[-1]
            interval = model_name.split(".")[0]

            # temporarily extract file (we cannot use ZipFile.open() as it doesn't support binary modes)
            temp_file = temp_path.joinpath(model_name)
            model_archive.extract(model_name, temp_path)

            # for each separate model, calculate top similar words for each
            # input word, giving us at most
            # [max amount] * [number of input] * [number of intervals]
            # items
            self.dataset.update_status("Running model %s..." % model_name)
            model = Word2Vec.load(str(temp_file))

            for word in words:
                similar_words = model.most_similar(positive=[word], topn=num_words)

                for similar_word in similar_words:
                    if similar_word[1] < threshold:
                        continue

                    result.append({
                        "date": interval,
                        "input": word,
                        "item": similar_word[0],
                        "value": similar_word[1]
                    })

            temp_file.unlink()

    # delete temporary folder
    shutil.rmtree(temp_path)

    self.write_csv_items_and_finish(result)
def process(self):
    """
    Reads a CSV file, counts occurrences of chosen values over all posts,
    and aggregates the results per chosen time frame
    """
    # convenience variables
    timeframe = self.parameters.get("timeframe", self.options["timeframe"]["default"])
    attribute = self.parameters.get("attribute", self.options["attribute"]["default"])
    rank_style = self.parameters.get("top-style", self.options["top-style"]["default"])
    cutoff = convert_to_int(self.parameters.get("top", self.options["top"]["default"]))

    try:
        filter = re.compile(self.parameters.get("regex", None))
    except (TypeError, re.error):
        self.dataset.update_status("Could not complete: regular expression invalid")
        self.dataset.finish(0)
        return

    # This is needed to check for URLs in the "domain" and "url" columns for Reddit submissions
    datasource = self.parent.parameters.get("datasource")

    # we need to be able to order the values later, chronologically, so use
    # an OrderedDict; all frequencies go into this variable
    items = OrderedDict()

    # if we're interested in overall top-ranking items rather than a
    # per-period ranking, we need to do a first pass in which all posts are
    # inspected to determine those overall top-scoring items
    overall_top = {}
    if rank_style == "overall":
        self.dataset.update_status("Determining overall top-%i items" % cutoff)
        for post in self.iterate_csv_items(self.source_file):
            values = self.get_values(post, attribute, filter)

            for value in values:
                if value not in overall_top:
                    overall_top[value] = 0

                overall_top[value] += 1

        overall_top = sorted(overall_top, key=lambda item: overall_top[item], reverse=True)[0:cutoff]

    # now for the real deal
    self.dataset.update_status("Reading source file")
    for post in self.iterate_csv_items(self.source_file):
        # determine where to put this data
        if timeframe == "all":
            time_unit = "overall"
        else:
            try:
                timestamp = int(datetime.datetime.strptime(post["timestamp"], "%Y-%m-%d %H:%M:%S").timestamp())
            except ValueError:
                timestamp = 0

            date = datetime.datetime.fromtimestamp(timestamp)
            if timeframe == "year":
                time_unit = str(date.year)
            elif timeframe == "month":
                time_unit = str(date.year) + "-" + str(date.month).zfill(2)
            else:
                time_unit = str(date.year) + "-" + str(date.month).zfill(2) + "-" + str(date.day).zfill(2)

        # again, we need to be able to sort, so OrderedDict it is
        if time_unit not in items:
            items[time_unit] = OrderedDict()

        # get values from post
        values = self.get_values(post, attribute, filter)

        # keep track of occurrences of found items per relevant time period
        for value in values:
            if rank_style == "overall" and value not in overall_top:
                continue

            if value not in items[time_unit]:
                items[time_unit][value] = 0

            items[time_unit][value] += 1

    # sort by time and frequency
    self.dataset.update_status("Sorting items")
    sorted_items = OrderedDict((key, items[key]) for key in sorted(items.keys()))
    for time_unit in sorted_items:
        sorted_unit = OrderedDict((item, sorted_items[time_unit][item]) for item in
                                  sorted(sorted_items[time_unit], reverse=True, key=lambda key: sorted_items[time_unit][key]))
        sorted_items[time_unit].clear()
        sorted_items[time_unit].update(sorted_unit)

        if cutoff > 0:
            # OrderedDict's API sucks and really needs some extra
            # convenience methods
            sorted_items[time_unit] = OrderedDict(islice(sorted_items[time_unit].items(), cutoff))

    # convert to flat list
    rows = []
    for time_unit in sorted_items:
        for item in sorted_items[time_unit]:
            row = {
                "date": time_unit,
                "item": item,
                "frequency": sorted_items[time_unit][item]
            }

            rows.append(row)

    # write as csv
    if rows:
        self.write_csv_items_and_finish(rows)
    else:
        self.dataset.update_status("No posts contain the requested attributes.")
        self.dataset.finish(0)
def process(self):
    """
    This takes a set of tokenised posts as input and trains a Word2Vec
    model for each time unit, as specified in the parameters.
    """
    self.dataset.update_status("Processing sentences")

    use_skipgram = 1 if self.parameters.get("algorithm") == "skipgram" else 0
    window = min(10, max(1, convert_to_int(self.parameters.get("window"))))
    use_negative = 5 if self.parameters.get("negative") else 0

    # prepare staging area
    temp_path = self.dataset.get_temporary_path()
    temp_path.mkdir()

    # go through all archived token sets and vectorise them
    models = 0
    with zipfile.ZipFile(self.source_file, "r") as token_archive:
        token_sets = token_archive.namelist()

        # create one model file per token file
        for token_set in token_sets:
            if self.interrupted:
                raise ProcessorInterruptedException("Interrupted while processing token sets")

            # the model file's name will be based on the token set name,
            # i.e. 2020-08-01.json becomes 2020-08-01.model
            token_set_name = token_set.split("/")[-1]

            # temporarily extract file (we cannot use ZipFile.open() as it doesn't support binary modes)
            temp_file = temp_path.joinpath(token_set_name)
            token_archive.extract(token_set_name, temp_path)

            # use the "list of lists" as input for the word2vec model
            # by default the tokeniser generates one list of tokens per
            # post... which may actually be preferable for short
            # 4chan-style posts. But alternatively it could generate one
            # list per sentence - this processor is agnostic in that regard
            self.dataset.update_status("Training model for token set %s..." % token_set_name)
            with temp_file.open() as input:
                model = Word2Vec(json.load(input), negative=use_negative, sg=use_skipgram, window=window)
                model_name = token_set_name.split(".")[0] + ".model"
                model.save(str(temp_path.joinpath(model_name)))
                models += 1

            temp_file.unlink()

    # create another archive with all model files in it
    with zipfile.ZipFile(self.dataset.get_results_path(), "w") as zip:
        for output_path in temp_path.glob("*.model"):
            zip.write(output_path, output_path.name)
            output_path.unlink()

    # delete temporary folder
    shutil.rmtree(temp_path)

    self.dataset.update_status("Finished")
    self.dataset.finish(models)
def process(self):
    """
    Reads vector set and creates a CSV with ranked vectors
    """
    # prepare staging area
    results_path = self.dataset.get_temporary_path()
    results_path.mkdir()

    self.dataset.update_status("Processing token sets")
    vector_paths = []

    # go through all archived token sets and vectorise them
    results = []

    def file_to_timestamp(file):
        """
        Get comparable datestamp value for token file

        Token files are named YYYY-m.pb. This function converts that to a
        YYYYmm string, then that string to an int, so that it may be
        compared for sorting chronologically.

        :param str file:  File name
        :return int:  Comparable datestamp
        """
        stem = file.split("/")[-1].split(".")[0].split("-")
        try:
            return int(stem[0] + stem[1].zfill(2))
        except (ValueError, IndexError):
            return 0

    results = []

    # truncate results as needed
    rank_style = self.parameters.get("top-style", self.options["top-style"]["default"])
    cutoff = convert_to_int(self.parameters.get("top", self.options["top"]["default"]), self.options["top"]["default"])

    # now rank the vectors by most prevalent per "file" (i.e. interval)
    overall_top = {}
    with zipfile.ZipFile(self.source_file, "r") as token_archive:
        vector_sets = sorted(token_archive.namelist(), key=file_to_timestamp)
        index = 0

        for vector_set in vector_sets:
            if self.interrupted:
                raise ProcessorInterruptedException("Interrupted while processing vector sets")

            index += 1
            vector_set_name = vector_set.split("/")[-1]  # we don't need the full path
            self.dataset.update_status("Processing token set %i/%i" % (index, len(vector_sets)))

            # temporarily extract file (we cannot use ZipFile.open() as it doesn't support binary modes)
            temp_path = results_path.joinpath(vector_set_name)
            token_archive.extract(vector_set_name, results_path)
            with temp_path.open("rb") as binary_tokens:
                # these were saved as pickle dumps so we need the binary mode
                vectors = pickle.load(binary_tokens)
            temp_path.unlink()

            vectors = sorted(vectors, key=lambda x: x[1], reverse=True)

            # for overall ranking we need the full vector space per interval
            # because maybe an overall top-ranking vector is at the bottom
            # in this particular interval - we'll truncate the top list at
            # a later point in that case. Else, truncate it here
            if rank_style == "per-item":
                vectors = vectors[0:cutoff]

            for vector in vectors:
                if not vector[0].strip():
                    continue

                results.append({
                    "date": vector_set_name.split(".")[0],
                    "item": vector[0],
                    "frequency": vector[1]
                })

                if vector[0] not in overall_top:
                    overall_top[vector[0]] = 0

                overall_top[vector[0]] += int(vector[1])

    # this eliminates all items from the results that were not in the
    # *overall* top-occurring items. This only has an effect when vectors
    # were generated for multiple intervals
    if rank_style == "overall":
        overall_top = {item: overall_top[item] for item in
                       sorted(overall_top, key=lambda x: overall_top[x], reverse=True)[0:cutoff]}
        filtered_results = []
        for item in results:
            if item["item"] in overall_top:
                filtered_results.append(item)

        results = filtered_results

    # delete temporary files and folder
    shutil.rmtree(results_path)

    # done!
    self.dataset.update_status("Writing results file")
    with open(self.dataset.get_results_path(), "w", encoding="utf-8") as output:
        writer = csv.DictWriter(output, fieldnames=("date", "item", "frequency"))
        writer.writeheader()
        for row in results:
            writer.writerow(row)

    self.dataset.update_status("Finished")
    self.dataset.finish(len(results))
async def execute_queries(self):
    """
    Get messages for queries

    This is basically what would be done in get_items(), except due to
    Telethon's architecture this needs to be called in an async method,
    which is this one.
    """
    # session file has been created earlier, and we can re-use it here in
    # order to avoid having to re-enter the security code
    query = self.parameters

    hash_base = query["api_phone"].replace("+", "") + query["api_id"] + query["api_hash"]
    session_id = hashlib.blake2b(hash_base.encode("ascii")).hexdigest()
    session_path = Path(__file__).parent.joinpath("sessions", session_id + ".session")

    client = None

    def cancel_start():
        """
        Replace interactive phone number input in Telethon

        By default, if Telethon cannot use the given session file to
        authenticate, it will interactively prompt the user for a phone
        number on the command line. That is not useful here, so instead
        raise a RuntimeError. This will be caught below and the user will
        be told they need to re-authenticate via 4CAT.
        """
        raise RuntimeError("Connection cancelled")

    try:
        client = TelegramClient(str(session_path), int(query.get("api_id")), query.get("api_hash"),
                                loop=self.eventloop)
        await client.start(phone=cancel_start)
    except RuntimeError:
        # session is no longer useable, delete file so user will be asked
        # for security code again
        self.dataset.update_status(
            "Session is not authenticated: login security code may have expired. You need to re-enter the security code.",
            is_final=True)
        session_path.unlink(missing_ok=True)
        if client and hasattr(client, "disconnect"):
            await client.disconnect()
        return None
    except Exception as e:
        self.dataset.update_status("Error connecting to the Telegram API with provided credentials.", is_final=True)
        if client and hasattr(client, "disconnect"):
            await client.disconnect()
        return None

    # ready our parameters
    parameters = self.dataset.get_parameters()
    queries = [query.strip() for query in parameters.get("query", "").split(",")]
    max_items = convert_to_int(parameters.get("items", 10), 10)

    try:
        posts = await self.gather_posts(client, queries, max_items)
    except Exception as e:
        self.dataset.update_status("Error scraping posts from Telegram")
        self.log.error("Telegram scraping error: %s" % traceback.format_exc())
        posts = None
    finally:
        await client.disconnect()

    return posts
def process(self):
    # parse parameters
    input_words = self.parameters.get("words", "")
    if not input_words or not input_words.split(","):
        self.dataset.update_status("No input words provided, cannot look for similar words.", is_final=True)
        self.dataset.finish(0)
        return

    input_words = input_words.split(",")

    try:
        threshold = float(self.parameters.get("threshold", self.options["threshold"]["default"]))
    except ValueError:
        threshold = float(self.options["threshold"]["default"])

    threshold = max(-1.0, min(1.0, threshold))
    num_words = convert_to_int(self.parameters.get("num-words"), self.options["num-words"]["default"])
    overlay = self.parameters.get("overlay")
    reduction_method = self.parameters.get("method")
    all_words = self.parameters.get("all-words")

    # load model files and initialise
    self.dataset.update_status("Unpacking word embedding models")
    staging_area = self.unpack_archive_contents(self.source_file)
    common_vocab = None
    vector_size = None
    models = {}

    # find words that are common to all models
    self.dataset.update_status("Determining cross-model common vocabulary")
    for model_file in staging_area.glob("*.model"):
        if self.interrupted:
            shutil.rmtree(staging_area)
            raise ProcessorInterruptedException("Interrupted while processing word embedding models")

        model = KeyedVectors.load(str(model_file)).wv
        models[model_file.stem] = model
        if vector_size is None:
            vector_size = model.vector_size  # needed later for dimensionality reduction

        if common_vocab is None:
            common_vocab = set(model.vocab.keys())
        else:
            common_vocab &= set(model.vocab.keys())  # intersect

    # sort common vocabulary by combined frequency across all models
    # this should make filtering for common words a bit faster further down
    self.dataset.update_status("Sorting vocabulary")
    common_vocab = list(common_vocab)
    common_vocab.sort(key=lambda w: sum([model.vocab[w].count for model in models.values()]), reverse=True)

    # initial boundaries of 2D space (to be adjusted later based on t-sne
    # outcome)
    max_x = 0.0 - sys.float_info.max
    max_y = 0.0 - sys.float_info.max
    min_x = sys.float_info.max
    min_y = sys.float_info.max

    # for each model, find the words that we may want to plot - these are
    # the nearest neighbours for the given query words
    relevant_words = {}

    # the vectors need to be reduced all at once - but the vectors are
    # grouped by model. To solve this, keep one numpy array of vectors,
    # but also keep track of which indexes of this array belong to which
    # model, by storing the index of the first vector for a model
    vectors = numpy.empty((0, vector_size))
    vector_offsets = {}

    # now process each model
    for model_name, model in models.items():
        relevant_words[model_name] = set()

        self.dataset.update_status("Finding similar words in model '%s'" % model_name)
        for query in input_words:
            if query not in model.vocab:
                self.dataset.update_status("Query '%s' was not found in model %s; cannot find nearest neighbours." % (
                    query, model_name), is_final=True)
                self.dataset.finish(0)
                return

            if self.interrupted:
                shutil.rmtree(staging_area)
                raise ProcessorInterruptedException("Interrupted while finding similar words")

            # use a larger sample (topn) than required since some of the
            # nearest neighbours may not be in the common vocabulary and
            # will therefore need to be ignored
            context = set([word[0] for word in model.most_similar(query, topn=1000)
                           if word[0] in common_vocab and word[1] >= threshold][:num_words])

            relevant_words[model_name] |= {query} | context  # always include query word

    # now do another loop to determine which words to plot for each model
    # this is either the same as relevant_words, or a superset which
    # combines all relevant words for all models
    plottable_words = {}
    last_model = max(relevant_words.keys())
    all_relevant_words = set().union(*relevant_words.values())

    for model_name, words in relevant_words.items():
        plottable_words[model_name] = []
        vector_offsets[model_name] = len(vectors)

        # determine which words to plot for this model. either the nearest
        # neighbours for this model, or all nearest neighbours found across
        # all models
        words_to_include = all_relevant_words if all_words else relevant_words[model_name]

        for word in words_to_include:
            if word in plottable_words[model_name] or (not overlay and model_name != last_model and word not in input_words):
                # only plot each word once per model, or if 'overlay'
                # is not set, only once overall (for the most recent
                # model)
                continue

            vector = models[model_name][word]
            plottable_words[model_name].append(word)
            vectors = numpy.append(vectors, [vector], axis=0)

    del models  # no longer needed

    # reduce the vectors of all words to be plotted for this model to
    # a two-dimensional coordinate with the previously initialised tsne
    # transformer. here the two-dimensional vectors are interpreted as
    # cartesian coordinates
    if reduction_method == "PCA":
        pca = PCA(n_components=2, random_state=0)
        vectors = pca.fit_transform(vectors)
    elif reduction_method == "t-SNE":
        # initialise t-sne transformer
        # parameters taken from Hamilton et al.
        # https://github.com/williamleif/histwords/blob/master/viz/common.py
        tsne = TSNE(n_components=2, random_state=0, learning_rate=150, init="pca")
        vectors = tsne.fit_transform(vectors)
    elif reduction_method == "TruncatedSVD":
        # standard sklearn parameters made explicit
        svd = TruncatedSVD(n_components=2, algorithm="randomized", n_iter=5, random_state=0)
        vectors = svd.fit_transform(vectors)
    else:
        shutil.rmtree(staging_area)
        self.dataset.update_status("Invalid dimensionality reduction technique selected", is_final=True)
        self.dataset.finish(0)
        return

    # also keep track of the boundaries of our 2D space, so we can plot
    # them properly later
    for position in vectors:
        max_x = max(max_x, position[0])
        max_y = max(max_y, position[1])
        min_x = min(min_x, position[0])
        min_y = min(min_y, position[1])

    # now we know for each model which words should be plotted and at what
    # position
    # with this knowledge, we can normalize the positions, and start
    # plotting them in a graph

    # a palette generated with https://medialab.github.io/iwanthue/
    colours = ["#d58eff", "#cf9000", "#3391ff", "#a15700", "#911ca7", "#00ddcb", "#cc25a9", "#d5c776",
               "#6738a8", "#ff9470", "#47c2ff", "#a4122c", "#00b0ca", "#9a0f76", "#ff70c8", "#713c88"]
    colour_index = 0

    # make sure all coordinates are positive
    max_x -= min_x
    max_y -= min_y

    # determine graph dimensions and proportions
    width = 1000  # arbitrary
    height = width * (max_y / max_x)  # retain proportions
    scale = width / max_x

    # margin around the plot to give room for labels and to look better
    margin = width * 0.1
    width += 2 * margin
    height += 2 * margin

    # normalize all known positions to fit within the graph
    vectors = [(margin + ((position[0] - min_x) * scale), margin + ((position[1] - min_y) * scale))
               for position in vectors]

    # now all positions are finalised, we can determine the "journey" of
    # each query - the sequence of positions in the graph it takes, so we
    # can draw lines from position to position later
    journeys = {}
    for query in input_words:
        journeys[query] = []
        for model_name, words in plottable_words.items():
            index = words.index(query)
            journeys[query].append(vectors[vector_offsets[model_name] + index])

    # font sizes proportional to width (which is static and thus predictable)
    fontsize_large = width / 50
    fontsize_normal = width / 75
    fontsize_small = width / 100

    # now we have the dimensions, the canvas can be instantiated
    model_type = self.source_dataset.parameters.get("model-type", "word2vec")
    canvas = get_4cat_canvas(self.dataset.get_results_path(), width, height,
                             header="%s nearest neighbours (fitting: %s) - '%s'" % (
                                 model_type, reduction_method, ",".join(input_words)),
                             fontsize_normal=fontsize_normal, fontsize_large=fontsize_large,
                             fontsize_small=fontsize_small)

    # use colour-coded backgrounds to distinguish the query words in the
    # graph, each model (= interval) with a separate colour
    for model_name in plottable_words:
        solid = Filter(id="solid-%s" % model_name)
        solid.feFlood(flood_color=colours[colour_index])
        solid.feComposite(in_="SourceGraphic")
        canvas.defs.add(solid)
        colour_index += 1

    # now plot each word for each model
    self.dataset.update_status("Plotting graph")
    words = SVG(insert=(0, 0), size=(width, height))
    queries = SVG(insert=(0, 0), size=(width, height))
    colour_index = 0

    for model_name, labels in plottable_words.items():
        positions = vectors[vector_offsets[model_name]:vector_offsets[model_name] + len(labels)]

        label_index = 0
        for position in positions:
            word = labels[label_index]
            is_query = word in input_words
            label_index += 1

            filter = ("url(#solid-%s)" % model_name) if is_query else "none"
            colour = "#FFF" if is_query else colours[colour_index]
            fontsize = fontsize_normal if is_query else fontsize_small

            if word in input_words:
                word += " (" + model_name + ")"

            label_container = SVG(insert=position, size=(1, 1), overflow="visible")
            label_container.add(Text(insert=("50%", "50%"), text=word, dominant_baseline="middle",
                                     text_anchor="middle", style="fill:%s;font-size:%ipx" % (colour, fontsize),
                                     filter=filter))

            # we make sure the queries are always rendered on top by
            # putting them in a separate SVG container
            if is_query:
                queries.add(label_container)
            else:
                words.add(label_container)

        colour_index = 0 if colour_index >= len(colours) else colour_index + 1

    # plot a line between positions for query words
    lines = SVG(insert=(0, 0), size=(width, height))
    for query, journey in journeys.items():
        previous_position = None
        for position in journey:
            if previous_position is None:
                previous_position = position
                continue

            lines.add(Line(start=previous_position, end=position, stroke="#CE1B28", stroke_width=2))
            previous_position = position

    canvas.add(lines)
    canvas.add(words)
    canvas.add(queries)

    canvas.save(pretty=True)
    shutil.rmtree(staging_area)
    self.dataset.finish(len(journeys))
def get_items(self, query):
    """
    Use the Twitter v2 API historical search to get tweets

    :param query:
    :return:
    """
    # this is pretty sensitive so delete it immediately after storing in
    # memory
    bearer_token = self.parameters.get("api_bearer_token")
    auth = {"Authorization": "Bearer %s" % bearer_token}

    endpoint = "https://api.twitter.com/2/tweets/search/all"

    # these are all expansions and fields available at the time of writing
    # since it does not cost anything extra in terms of rate limiting, go
    # for as much data per tweet as possible...
    tweet_fields = ("attachments", "author_id", "context_annotations", "conversation_id", "created_at", "entities",
                    "geo", "id", "in_reply_to_user_id", "lang", "public_metrics", "possibly_sensitive",
                    "referenced_tweets", "reply_settings", "source", "text", "withheld")
    user_fields = ("created_at", "description", "entities", "id", "location", "name", "pinned_tweet_id",
                   "profile_image_url", "protected", "public_metrics", "url", "username", "verified", "withheld")
    place_fields = ("contained_within", "country", "country_code", "full_name", "geo", "id", "name", "place_type")
    poll_fields = ("duration_minutes", "end_datetime", "id", "options", "voting_status")
    expansions = ("attachments.poll_ids", "attachments.media_keys", "author_id", "entities.mentions.username",
                  "geo.place_id", "in_reply_to_user_id", "referenced_tweets.id", "referenced_tweets.id.author_id")
    media_fields = ("duration_ms", "height", "media_key", "non_public_metrics", "organic_metrics",
                    "preview_image_url", "promoted_metrics", "public_metrics", "type", "url", "width")

    amount = convert_to_int(self.parameters.get("amount"), 10)

    params = {
        "query": self.parameters.get("query", ""),
        "expansions": ",".join(expansions),
        "tweet.fields": ",".join(tweet_fields),
        "user.fields": ",".join(user_fields),
        "poll.fields": ",".join(poll_fields),
        "place.fields": ",".join(place_fields),
        "media.fields": ",".join(media_fields),
        "max_results": max(10, min(amount, 500)) if amount > 0 else 500,  # 500 = upper limit, 10 = lower
    }

    if self.parameters.get("min_date"):
        params["start_time"] = datetime.datetime.fromtimestamp(self.parameters["min_date"]).strftime("%Y-%m-%dT%H:%M:%SZ")

    if self.parameters.get("max_date"):
        params["end_time"] = datetime.datetime.fromtimestamp(self.parameters["max_date"]).strftime("%Y-%m-%dT%H:%M:%SZ")

    tweets = 0
    self.dataset.log("Search parameters: %s" % repr(params))
    while True:
        if self.interrupted:
            raise ProcessorInterruptedException("Interrupted while getting tweets from the Twitter API")

        # there is a limit of one request per second, so stay on the safe side of this
        while self.previous_request == int(time.time()):
            time.sleep(0.1)
        time.sleep(0.05)
        self.previous_request = int(time.time())

        # now send the request, allowing for at least 5 retries if the connection seems unstable
        retries = 5
        api_response = None
        while retries > 0:
            try:
                api_response = requests.get(endpoint, headers=auth, params=params)
                break
            except (ConnectionError, requests.exceptions.RequestException) as e:
                retries -= 1
                wait_time = (5 - retries) * 10
                self.dataset.update_status("Got %s, waiting %i seconds before retrying" % (str(e), wait_time))
                time.sleep(wait_time)

        # all retries exhausted without a response
        if api_response is None:
            self.dataset.update_status("Could not connect to Twitter. Cancelling.", is_final=True)
            return

        # rate limited - the limit at time of writing is 300 reqs per 15
        # minutes
        # usually you don't hit this when requesting batches of 500 at
        # 1/second
        if api_response.status_code == 429:
            resume_at = convert_to_int(api_response.headers["x-rate-limit-reset"]) + 1
            resume_at_str = datetime.datetime.fromtimestamp(int(resume_at)).strftime("%c")
            self.dataset.update_status("Hit Twitter rate limit - waiting until %s to continue." % resume_at_str)
            while time.time() <= resume_at:
                time.sleep(0.5)
            continue

        # API keys that are valid but don't have access or haven't been
        # activated properly get a 403
        elif api_response.status_code == 403:
            try:
                structured_response = api_response.json()
                self.dataset.update_status("'Forbidden' error from Twitter API. Could not connect to Twitter API "
                                           "with this API key. %s" % structured_response.get("detail", ""),
                                           is_final=True)
            except (json.JSONDecodeError, ValueError):
                self.dataset.update_status("'Forbidden' error from Twitter API. Your key may not have access to "
                                           "the full-archive search endpoint.", is_final=True)
            finally:
                return

        # sometimes twitter says '503 service unavailable' for unclear
        # reasons - in that case just wait a while and try again
        elif api_response.status_code in (502, 503, 504):
            resume_at = time.time() + 60
            resume_at_str = datetime.datetime.fromtimestamp(int(resume_at)).strftime("%c")
            self.dataset.update_status("Twitter unavailable (status %i) - waiting until %s to continue." % (
                api_response.status_code, resume_at_str))
            while time.time() <= resume_at:
                time.sleep(0.5)
            continue

        # this usually means the query is too long or otherwise contains
        # a syntax error
        elif api_response.status_code == 400:
            msg = "Response %i from the Twitter API; " % api_response.status_code
            try:
                api_response = api_response.json()
                msg += api_response.get("title", "")
                if "detail" in api_response:
                    msg += ": " + api_response.get("detail", "")
            except (json.JSONDecodeError, TypeError):
                msg += "Some of your parameters (e.g. date range) may be invalid."

            self.dataset.update_status(msg, is_final=True)
            return

        # invalid API key
        elif api_response.status_code == 401:
            self.dataset.update_status("Invalid API key - could not connect to Twitter API", is_final=True)
            return

        # haven't seen one yet, but they probably exist
        elif api_response.status_code != 200:
            self.dataset.update_status("Unexpected HTTP status %i. Halting tweet collection." % api_response.status_code,
                                       is_final=True)
            self.log.warning("Twitter API v2 responded with status code %i. Response body: %s" % (
                api_response.status_code, api_response.text))
            return

        api_response = api_response.json()

        # The API response contains tweets (of course) and 'includes',
        # objects that can be referenced in tweets. Later we will splice
        # this data into the tweets themselves to make them easier to
        # process. So extract them first...
        included_users = api_response.get("includes", {}).get("users", {})
        included_media = api_response.get("includes", {}).get("media", {})
        included_polls = api_response.get("includes", {}).get("polls", {})
        included_tweets = api_response.get("includes", {}).get("tweets", {})
        included_places = api_response.get("includes", {}).get("places", {})

        for tweet in api_response.get("data", []):
            if 0 < amount <= tweets:
                break

            # splice referenced data back in
            # we use copy.deepcopy here because else we run into a
            # pass-by-reference quagmire
            tweet = self.enrich_tweet(tweet, included_users, included_media, included_polls, included_places,
                                      copy.deepcopy(included_tweets))

            tweets += 1
            if tweets % 500 == 0:
                self.dataset.update_status("Received %i tweets from Twitter API" % tweets)

            yield tweet

        # paginate
        if (amount <= 0 or tweets < amount) and api_response.get("meta") and "next_token" in api_response["meta"]:
            params["next_token"] = api_response["meta"]["next_token"]
        else:
            break
def process(self):
    """
    This takes a set of tokenised posts as input and trains a word
    embedding model (Word2Vec or FastText) for each time unit, as
    specified in the parameters.
    """
    self.dataset.update_status("Processing sentences")

    use_skipgram = 1 if self.parameters.get("algorithm") == "skipgram" else 0
    window = min(10, max(1, convert_to_int(self.parameters.get("window"), int(self.options["window"]["default"]))))
    use_negative = 5 if self.parameters.get("negative") else 0
    min_count = max(1, convert_to_int(self.parameters.get("min_count"), self.options["min_count"]["default"]))
    dimensionality = convert_to_int(self.parameters.get("dimensionality"), 100)
    detect_bigrams = self.parameters.get("detect-bigrams")
    model_type = self.parameters.get("model-type")
    if not model_type:
        model_type = self.options["model-type"]["default"]

    staging_area = self.dataset.get_staging_area()
    model_builder = {
        "Word2Vec": Word2Vec,
        "FastText": FastText
    }[model_type]

    # go through all archived token sets and vectorise them
    models = 0
    for temp_file in self.iterate_archive_contents(self.source_file):
        # use the "list of lists" as input for the word2vec model
        # by default the tokeniser generates one list of tokens per
        # post... which may actually be preferable for short
        # 4chan-style posts. But alternatively it could generate one
        # list per sentence - this processor is agnostic in that regard
        token_set_name = temp_file.name
        self.dataset.update_status("Extracting bigrams from token set %s..." % token_set_name)

        try:
            if detect_bigrams:
                bigram_transformer = Phrases(self.tokens_from_file(temp_file, staging_area))
                bigram_transformer = Phraser(bigram_transformer)
            else:
                bigram_transformer = None

            self.dataset.update_status("Training %s model for token set %s..." % (model_builder.__name__, token_set_name))
            try:
                model = model_builder(negative=use_negative, size=dimensionality, sg=use_skipgram,
                                      window=window, workers=3, min_count=min_count)

                # we do not simply pass a sentences argument to the model
                # builder because we are using a generator, which exhausts,
                # while Word2Vec needs to iterate over the sentences twice
                # https://stackoverflow.com/a/57632747
                model.build_vocab(self.tokens_from_file(temp_file, staging_area, phraser=bigram_transformer))
                model.train(self.tokens_from_file(temp_file, staging_area, phraser=bigram_transformer),
                            epochs=model.iter, total_examples=model.corpus_count)
            except RuntimeError as e:
                if "you must first build vocabulary before training the model" in str(e):
                    # not enough data. Skip - if this happens for all models
                    # an error will be generated later
                    continue
                else:
                    raise e

        except UnicodeDecodeError:
            self.dataset.update_status("Error reading input data. If it was imported from outside 4CAT, make sure it is encoded as UTF-8.", is_final=True)
            self.dataset.finish(0)
            return

        # save - we only save the KeyedVectors for the model, this
        # saves space and we don't need to re-train the model later
        model_name = token_set_name.split(".")[0] + ".model"
        model.wv.save(str(staging_area.joinpath(model_name)))

        # save vocabulary too, some processors need it
        del model
        models += 1

    if models == 0:
        self.dataset.update_status("Not enough data in source file to train %s models." % model_builder.__name__)
        shutil.rmtree(staging_area)
        self.dataset.finish(0)
        return

    # create another archive with all model files in it
    self.dataset.update_status("%s model(s) saved." % model_builder.__name__)
    self.write_archive_and_finish(staging_area)
def process(self):
    """
    This takes previously generated Word2Vec models and uses them to find
    similar words based on a list of words
    """
    self.dataset.update_status("Processing sentences")

    depth = max(1, min(3, convert_to_int(self.parameters.get("crawl_depth", self.options["crawl_depth"]["default"]),
                                         self.options["crawl_depth"]["default"])))
    input_words = self.parameters.get("words", "")
    if not input_words or not input_words.split(","):
        self.dataset.update_status("No input words provided, cannot look for similar words.", is_final=True)
        self.dataset.finish(0)
        return

    input_words = input_words.split(",")

    num_words = convert_to_int(self.parameters.get("num-words"), self.options["num-words"]["default"])
    try:
        threshold = float(self.parameters.get("threshold", self.options["threshold"]["default"]))
    except ValueError:
        threshold = float(self.options["threshold"]["default"])

    threshold = max(-1.0, min(1.0, threshold))

    # go through all models and calculate similarity for all given input words
    result = []
    staging_area = self.unpack_archive_contents(self.source_file)
    for model_file in staging_area.glob("*.model"):
        interval = model_file.stem

        # for each separate model, calculate top similar words for each
        # input word, giving us at most
        # [max amount] * [number of input] * [number of intervals]
        # items
        self.dataset.update_status("Running model %s..." % model_file.name)
        model = KeyedVectors.load(str(model_file))
        word_queue = set()
        checked_words = set()
        level = 1

        words = input_words.copy()
        while words:
            if self.interrupted:
                shutil.rmtree(staging_area)
                raise ProcessorInterruptedException("Interrupted while extracting similar words")

            word = words.pop()
            checked_words.add(word)

            try:
                similar_words = model.most_similar(positive=[word], topn=num_words)
            except KeyError:
                continue

            for similar_word in similar_words:
                if similar_word[1] < threshold:
                    continue

                result.append({
                    "date": interval,
                    "input": word,
                    "item": similar_word[0],
                    "value": similar_word[1],
                    "input_occurences": model.vocab[word].count,
                    "item_occurences": model.vocab[similar_word[0]].count,
                    "depth": level
                })

                # queue word for the next iteration if there is one and
                # it hasn't been seen yet
                if level < depth and similar_word[0] not in checked_words:
                    word_queue.add(similar_word[0])

            # if all words have been checked, but we still have an
            # iteration to go, load the queued words into the list
            if not words and word_queue and level < depth:
                level += 1
                words = word_queue.copy()
                word_queue = set()

    shutil.rmtree(staging_area)

    if not result:
        self.dataset.update_status("None of the words were found in the word embedding model.", is_final=True)
        self.dataset.finish(0)
    else:
        self.write_csv_items_and_finish(result)
def process(self):
    """
    Reads vector set and creates a CSV with ranked vectors
    """
    self.dataset.update_status("Processing token sets")

    def file_to_timestamp(file):
        """
        Get comparable datestamp value for token file

        Token files are named YYYY-m.pb. This function converts that to a
        YYYYmm string, then that string to an int, so that it may be
        compared for sorting chronologically.

        :param str file:  File name
        :return int:  Comparable datestamp
        """
        stem = file.split("/")[-1].split(".")[0].split("-")
        try:
            return int(stem[0] + stem[1].zfill(2))
        except (ValueError, IndexError):
            return 0

    results = []

    # truncate results as needed
    rank_style = self.parameters.get("top-style", self.options["top-style"]["default"])
    cutoff = convert_to_int(self.parameters.get("top", self.options["top"]["default"]), self.options["top"]["default"])

    # now rank the vectors by most prevalent per "file" (i.e. interval)
    overall_top = {}
    index = 0
    for vector_file in self.iterate_archive_contents(self.source_file):
        # we support both pickle and json dumps of vectors
        vector_unpacker = pickle if vector_file.suffix == ".pb" else json

        index += 1
        vector_set_name = vector_file.stem  # we don't need the full path
        self.dataset.update_status("Processing token set %i (%s)" % (index, vector_set_name))

        with vector_file.open("rb") as binary_tokens:
            # these were saved as pickle dumps so we need the binary mode
            vectors = vector_unpacker.load(binary_tokens)

        vectors = sorted(vectors, key=lambda x: x[1], reverse=True)

        # for overall ranking we need the full vector space per interval
        # because maybe an overall top-ranking vector is at the bottom
        # in this particular interval - we'll truncate the top list at
        # a later point in that case. Else, truncate it here
        if rank_style == "per-item":
            vectors = vectors[0:cutoff]

        for vector in vectors:
            if not vector[0].strip():
                continue

            results.append({
                "date": vector_set_name.split(".")[0],
                "item": vector[0],
                "value": vector[1]
            })

            if vector[0] not in overall_top:
                overall_top[vector[0]] = 0

            overall_top[vector[0]] += int(vector[1])

    # this eliminates all items from the results that were not in the
    # *overall* top-occurring items. This only has an effect when vectors
    # were generated for multiple intervals
    if rank_style == "overall":
        overall_top = {item: overall_top[item] for item in
                       sorted(overall_top, key=lambda x: overall_top[x], reverse=True)[0:cutoff]}
        filtered_results = []
        for item in results:
            if item["item"] in overall_top:
                filtered_results.append(item)

        results = filtered_results

    # done!
    self.dataset.update_status("Writing results file")
    with open(self.dataset.get_results_path(), "w", encoding="utf-8") as output:
        writer = csv.DictWriter(output, fieldnames=("date", "item", "value"))
        writer.writeheader()
        for row in results:
            writer.writerow(row)

    self.dataset.update_status("Finished")
    self.dataset.finish(len(results))
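
# Illustrative sketch (not part of the original module): the vector files
# iterated over above are assumed to be pickle (.pb) or JSON dumps of
# [token, count] pairs for a single interval, e.g.:
example_vectors = [["token_a", 103], ["token_b", 87], ["", 5]]
# sorting on the second element, skipping empty tokens and truncating at the
# cutoff reproduces the ranked rows written to the CSV above.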
def process(self):
    """
    Unzips and appends tokens to fetch and write a tf-idf matrix
    """
    # Validate and process user inputs - parse to int
    library = self.parameters.get("library", "gensim")
    n_size = convert_to_int(self.parameters.get("n_size", 1), 1)
    min_occurrences = convert_to_int(self.parameters.get("min_occurrences", 1), 1)
    max_occurrences = convert_to_int(self.parameters.get("max_occurrences", -1), -1)
    max_output = convert_to_int(self.parameters.get("max_output", 10), 10)
    smartirs = self.parameters.get("smartirs", "nfc")

    # Get token sets
    self.dataset.update_status("Processing token sets")
    tokens = []
    dates = []

    results_path = self.dataset.get_results_path()
    dirname = Path(results_path.parent, results_path.name.replace(".", ""))

    # Go through all archived token sets and generate collocations for each
    with zipfile.ZipFile(str(self.source_file), "r") as token_archive:
        token_sets = token_archive.namelist()
        index = 0

        # Loop through the tokens (can also be a single set)
        for tokens_name in token_sets:
            if self.interrupted:
                raise ProcessorInterruptedException("Interrupted while loading token sets")

            # Get the date
            date_string = tokens_name.split('.')[0]
            dates.append(date_string)

            # Temporarily extract file (we cannot use ZipFile.open() as it doesn't support binary modes)
            temp_path = dirname.joinpath(tokens_name)
            token_archive.extract(str(tokens_name), str(dirname))

            # we support both pickle and json dumps of vectors
            token_unpacker = pickle if tokens_name.split(".")[-1] == "pb" else json

            with temp_path.open("rb") as binary_tokens:
                # these were saved as pickle dumps so we need the binary mode
                post_tokens = token_unpacker.load(binary_tokens)

                # Flatten the list of lists of tokens - we're treating the whole time series as one document.
                post_tokens = list(itertools.chain.from_iterable(post_tokens))

                # Add to the list of all dates' tokens
                tokens.append(post_tokens)

            temp_path.unlink()

    # Make sure `min_occurrences` and `max_occurrences` are valid
    if min_occurrences > len(tokens):
        min_occurrences = len(tokens) - 1

    if max_occurrences <= 0 or max_occurrences > len(tokens):
        max_occurrences = len(tokens)

    # Get the tf-idf matrix.
    self.dataset.update_status("Generating tf-idf for token set")
    try:
        if library == "gensim":
            results = self.get_tfidf_gensim(tokens, dates, top_n=max_output, smartirs=smartirs)
        elif library == "scikit-learn":
            results = self.get_tfidf_sklearn(tokens, dates, ngram_range=n_size, min_occurrences=min_occurrences,
                                             max_occurrences=max_occurrences, top_n=max_output)
        else:
            self.dataset.update_status("Invalid library.")
            self.dataset.finish(0)
            return

        if results:
            # Generate csv and finish
            self.dataset.update_status("Writing to csv and finishing")
            self.write_csv_items_and_finish(results)
    except MemoryError:
        self.dataset.update_status("Out of memory - dataset too large to run tf-idf analysis.")
        self.dataset.finish(0)
def process(self): items = {} max_weight = 1 colour_property = self.options.get( "colour_property", self.options["colour_property"]["default"]) size_property = self.options.get( "size_property", self.options["size_property"]["default"]) # first create a map with the ranks for each period with self.source_file.open() as input: reader = csv.DictReader(input) weight_attribute = "value" if "value" in reader.fieldnames else "frequency" item_attribute = "item" if "item" in reader.fieldnames else "text" date_attribute = "date" if "date" in reader.fieldnames else "time" weighted = (weight_attribute in reader.fieldnames) for row in reader: if row[date_attribute] not in items: items[row[date_attribute]] = {} weight = convert_to_int(row[weight_attribute], 1) if weighted else 1 items[row[date_attribute]][row[item_attribute]] = weight max_weight = max(max_weight, weight) # determine per-period changes # this is used for determining what colour to give to nodes, and # visualise outlying items in the data changes = {} max_change = 1 for period in items: changes[period] = {} for item in items[period]: now = items[period][item] then = -1 for previous_period in items: if previous_period == period: break for previous_item in items[previous_period]: if previous_item == item: then = items[previous_period][item] if then >= 0: change = abs(now - then) max_change = max(max_change, change) changes[period][item] = change else: changes[period][item] = 1 # some sizing parameters for the chart - experiment with those box_width = 12 box_height = 10 # boxes will never be smaller than this box_max_height = 100 box_gap_x = 90 box_gap_y = 5 # don't change this - initial X value for top left box box_start_x = 0 # we use this to know if and where to draw the flow curve between a box # and its previous counterpart previous_boxes = {} previous = [] # we need to store the svg elements before drawing them to the canvas # because we need to know what elements to draw before we can set the # canvas up for drawing to boxes = [] labels = [] flows = [] definitions = [] # this is the default colour for items (it's blue-ish) # we're using HSV, so we can increase the hue for more prominent items base_colour = [.55, .95, .95] max_y = 0 # go through all periods and draw boxes and flows for period in items: # reset Y coordinate, i.e. 
start at top box_start_y = 0 for item in items[period]: # determine weight (and thereby height) of this particular item weight = items[period][item] weight_factor = weight / max_weight height = int(max(box_height, box_max_height * weight_factor) ) if size_property and weighted else box_height # colour ranges from blue to red change = changes[period][item] change_factor = 0 if not weighted or change <= 0 else ( changes[period][item] / max_change) colour = base_colour.copy() colour[0] += (1 - base_colour[0]) * ( weight_factor if colour_property == "weight" else change_factor) # first draw the box box_fill = "rgb(%i, %i, %i)" % tuple( [int(v * 255) for v in colorsys.hsv_to_rgb(*colour)]) box = Rect(insert=(box_start_x, box_start_y), size=(box_width, height), fill=box_fill) boxes.append(box) # then the text label label_y = (box_start_y + (height / 2)) + 3 label = Text( text=(item + (" (%s)" % weight if weight != 1 else "")), insert=(box_start_x + box_width + box_gap_y, label_y)) labels.append(label) # store the max y coordinate, which marks the SVG overall height max_y = max(max_y, (box["y"] + box["height"])) # then draw the flow curve, if the box was ranked in an earlier # period as well if item in previous: previous_box = previous_boxes[item] # create a gradient from the colour of the previous box for # this item to this box's colour colour_from = previous_box["fill"] colour_to = box["fill"] gradient = LinearGradient(start=(0, 0), end=(1, 0)) gradient.add_stop_color(offset="0%", color=colour_from) gradient.add_stop_color(offset="100%", color=colour_to) definitions.append(gradient) # the addition of ' none' in the auto-generated fill colour # messes up some viewers/browsers, so get rid of it gradient_key = gradient.get_paint_server().replace( " none", "") # calculate control points for the connecting bezier bar # the top_offset determines the 'steepness' of the curve, # experiment with the "/ 2" part to make it less or more # steep top_offset = (box["x"] - previous_box["x"] + previous_box["width"]) / 2 control_top_left = (previous_box["x"] + previous_box["width"] + top_offset, previous_box["y"]) control_top_right = (box["x"] - top_offset, box["y"]) bottom_offset = top_offset # mirroring looks best control_bottom_left = (previous_box["x"] + previous_box["width"] + bottom_offset, previous_box["y"] + previous_box["height"]) control_bottom_right = (box["x"] - bottom_offset, box["y"] + box["height"]) # now add the bezier curves - svgwrite has no convenience # function for beziers unfortunately. 
we're using cubic # beziers though quadratic could work as well since our # control points are, in principle, mirrored flow_start = (previous_box["x"] + previous_box["width"], previous_box["y"]) flow = Path(fill=gradient_key, opacity="0.35") flow.push("M %f %f" % flow_start) # go to start flow.push("C %f %f %f %f %f %f" % (*control_top_left, *control_top_right, box["x"], box["y"])) # top bezier flow.push( "L %f %f" % (box["x"], box["y"] + box["height"])) # right boundary flow.push("C %f %f %f %f %f %f" % (*control_bottom_right, *control_bottom_left, previous_box["x"] + previous_box["width"], previous_box["y"] + previous_box["height"])) # bottom bezier flow.push("L %f %f" % flow_start) # back to start flow.push("Z") # close path flows.append(flow) # mark this item as having appeared previously previous.append(item) previous_boxes[item] = box box_start_y += height + box_gap_y box_start_x += (box_gap_x + box_width) # generate SVG canvas to add elements to canvas = Drawing(self.dataset.get_results_path(), size=(len(items) * (box_width + box_gap_x), max_y), style="font-family:monospace;font-size:8px;") # now add the various shapes and paths. We only do this here rather than # as we go because only at this point can the canvas be instantiated, as # before we don't know the dimensions of the SVG drawing. # add our gradients so they can be referenced for definition in definitions: canvas.defs.add(definition) # add flows (which should go beyond the boxes) for flow in flows: canvas.add(flow) # add boxes and labels: for item in (*boxes, *labels): canvas.add(item) # finally, save the svg file canvas.saveas(pretty=True, filename=str(self.dataset.get_results_path())) self.dataset.finish(len(items) * len(list(items.items()).pop()))
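# The box fill above is produced by nudging a base HSV colour's hue towards red as an
# item's weight (or rank change) increases, then converting it to an rgb() string for
# svgwrite. A standalone sketch of that mapping, using the same base colour as the
# code above (weight_to_fill is an illustrative name, not part of 4CAT):
import colorsys

def weight_to_fill(weight, max_weight, base_colour=(.55, .95, .95)):
    """Map a weight in [0, max_weight] to an 'rgb(r, g, b)' SVG fill string."""
    weight_factor = weight / max_weight if max_weight else 0
    hue, saturation, value = base_colour
    hue += (1 - hue) * weight_factor  # shift the hue further away from blue for heavier items
    r, g, b = colorsys.hsv_to_rgb(hue, saturation, value)
    return "rgb(%i, %i, %i)" % (int(r * 255), int(g * 255), int(b * 255))

# weight_to_fill(1, 10) stays close to the blue-ish base; weight_to_fill(10, 10)
# ends up at the red end of the range.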
def process(self): """ This takes a 4CAT results file as input, and outputs a plain text file containing all post bodies as one continuous string, sanitized. """ link_regex = re.compile(r"https?://[^\s]+") delete_regex = re.compile(r"[^a-zA-Z)(.,\n -]") # settings strip_urls = self.parameters.get("strip-urls", self.options["strip-urls"]["default"]) strip_symbols = self.parameters.get( "strip-symbols", self.options["strip-symbols"]["default"]) sides = self.parameters.get("sides", self.options["sides"]["default"]) self.align = self.parameters.get("align", self.options["align"]["default"]) window = convert_to_int( self.parameters.get("window", self.options["window"]["default"]), 5) + 1 query = self.parameters.get("query", self.options["query"]["default"]) self.limit = convert_to_int( self.parameters.get("limit", self.options["limit"]["default"]), 100) left_branches = [] right_branches = [] # do some validation if not query.strip() or re.sub(r"\s", "", query) != query: self.dataset.update_status( "Invalid query for word tree generation. Query cannot be empty or contain whitespace." ) self.dataset.finish(0) return window = min(window, self.options["window"]["max"] + 1) window = max(1, window) # find matching posts processed = 0 for post in self.iterate_csv_items(self.source_file): processed += 1 if processed % 500 == 0: self.dataset.update_status( "Processing and tokenising post %i" % processed) body = post["body"] if strip_urls: body = link_regex.sub("", body) if strip_symbols: body = delete_regex.sub("", body) body = word_tokenize(body) positions = [ i for i, x in enumerate(body) if x.lower() == query.lower() ] # get lists of tokens for both the left and right side of the tree # on the left side, all lists end with the query, on the right side, # they start with the query for position in positions: right_branches.append(body[position:position + window]) left_branches.append(body[max(0, position - window):position + 1]) # Some settings for rendering the tree later self.step = self.fontsize * 0.6 # approximately the width of a monospace char self.gap = (7 * self.step) # space for lines between nodes width = 1 # will be updated later # invert the left side of the tree (because that's the way we want the # branching to work for that side) # we'll visually invert the nodes in the tree again later left_branches = [list(reversed(branch)) for branch in left_branches] # first create vertical slices of tokens per level self.dataset.update_status("Generating token tree from posts") levels_right = [{} for i in range(0, window)] levels_left = [{} for i in range(0, window)] tokens_left = [] tokens_right = [] # for each "level" (each branching point representing a level), turn # tokens into nodes, record the max amount of occurences for any # token in that level, and keep track of what nodes are in which level. # The latter is needed because a token may occur multiple times, at # different points in the graph. Do this for both the left and right # side of the tree. 
for i in range(0, window): for branch in right_branches: if i >= len(branch): continue token = branch[i].lower() if token not in levels_right[i]: parent = levels_right[i - 1][branch[ i - 1].lower()] if i > 0 else None levels_right[i][token] = Node(token, parent=parent, occurrences=1, is_top_root=(parent is None)) tokens_right.append(levels_right[i][token]) else: levels_right[i][token].occurrences += 1 occurrences = levels_right[i][token].occurrences self.max_occurrences[i] = max( occurrences, self.max_occurrences[i] ) if i in self.max_occurrences else occurrences for branch in left_branches: if i >= len(branch): continue token = branch[i].lower() if token not in levels_left[i]: parent = levels_left[i - 1][branch[ i - 1].lower()] if i > 0 else None levels_left[i][token] = Node(token, parent=parent, occurrences=1, is_top_root=(parent is None)) tokens_left.append(levels_left[i][token]) else: levels_left[i][token].occurrences += 1 occurrences = levels_left[i][token].occurrences self.max_occurrences[i] = max( occurrences, self.max_occurrences[i] ) if i in self.max_occurrences else occurrences # nodes that have no siblings can be merged with their parents, else # the graph becomes unnecessarily large with lots of single-word nodes # connected to single-word nodes. additionally, we want the nodes with # the most branches to be sorted to the top, and then only retain the # most interesting (i.e. most-occurring) branches self.dataset.update_status("Merging and sorting tree nodes") for token in tokens_left: self.merge_upwards(token) self.sort_node(token) self.limit_subtree(token) for token in tokens_right: self.merge_upwards(token) self.sort_node(token) self.limit_subtree(token) # somewhat annoyingly, anytree does not simply delete nodes detached # from the tree in the previous steps, but makes them root nodes. We # don't need these root nodes (we only need the original root), so the # next step is to remove all root nodes that are not the main root. 
# We cannot modify a list in-place, so make a new list with the # relevant nodes level_sizes = {} filtered_tokens_right = [] for token in tokens_right: if token.is_root and not token.is_top_root: continue filtered_tokens_right.append(token) filtered_tokens_left = [] for token in tokens_left: if token.is_root and not token.is_top_root: continue filtered_tokens_left.append(token) # now we know which nodes are left, and can therefore determine how # large the canvas needs to be - this is based on the max number of # branches found on any level of the tree, in other words, the number # of "terminal nodes" height_left = self.whitespace * self.fontsize * max([ self.max_breadth(node) for node in filtered_tokens_left if node.is_top_root ]) height_right = self.whitespace * self.fontsize * max([ self.max_breadth(node) for node in filtered_tokens_right if node.is_top_root ]) height = max(height_left, height_right) canvas = Drawing(str(self.dataset.get_results_path()), size=(width, height), style="font-family:monospace;font-size:%ipx" % self.fontsize) # the nodes on the left side of the graph now have the wrong word order, # because we reversed them earlier to generate the correct tree # hierarchy - now reverse the node labels so they are proper language # again for token in tokens_left: self.invert_node_labels(token) wrapper = SVG(overflow="visible") self.dataset.update_status("Rendering tree to SVG file") if sides != "right": wrapper = self.render(wrapper, [ token for token in filtered_tokens_left if token.is_root and token.children ], height=height, side=self.SIDE_LEFT) if sides != "left": wrapper = self.render(wrapper, [ token for token in filtered_tokens_right if token.is_root and token.children ], height=height, side=self.SIDE_RIGHT) # things may have been rendered outside the canvas, in which case we # need to readjust the SVG properties wrapper.update({"x": 0 if self.x_min >= 0 else self.x_min * -1}) canvas.update({"width": (self.x_max - self.x_min)}) canvas.add(wrapper) canvas.save(pretty=True) self.dataset.update_status("Finished") self.dataset.finish(len(tokens_left) + len(tokens_right))
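# merge_upwards() above collapses runs of single-child nodes so the rendered tree is
# not a long chain of one-word boxes. A minimal sketch of that idea with anytree
# (merge_singleton is an illustrative helper, not the 4CAT implementation, and the
# exact merge rules in merge_upwards() may differ):
from anytree import Node

def merge_singleton(node):
    """Fold a node into its parent's label if it is the parent's only child."""
    parent = node.parent
    if parent is None or len(parent.children) != 1 or getattr(parent, "is_top_root", False):
        return
    parent.name = "%s %s" % (parent.name, node.name)
    # re-attach any grandchildren to the merged parent, then detach this node;
    # note that anytree turns detached nodes into new root nodes rather than deleting them
    for child in list(node.children):
        child.parent = parent
    node.parent = None

root = Node("fish", is_top_root=True)
and_node = Node("and", parent=root, is_top_root=False)
chips = Node("chips", parent=and_node, is_top_root=False)
merge_singleton(chips)  # the tree is now fish -> "and chips"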
def process(self): graphs = {} intervals = [] smooth = self.parameters.get("smooth", self.options["smooth"]["default"]) normalise_values = self.parameters.get( "normalise", self.options["normalise"]["default"]) completeness = convert_to_int( self.parameters.get("complete", self.options["complete"]["default"]), 0) graph_label = self.parameters.get("label", self.options["label"]["default"]) top = convert_to_int( self.parameters.get("top", self.options["top"]["default"]), 10) # first gather graph data: each distinct item gets its own graph and # for each graph we have a sequence of intervals, each interval with # its own value first_date = "9999-99-99" last_date = "0000-00-00" with self.source_file.open() as input: reader = csv.DictReader(input) item_key = "text" if "text" in reader.fieldnames else "item" date_key = "time" if "time" in reader.fieldnames else "date" value_key = "value" if "value" in reader.fieldnames else "frequency" for row in self.iterate_csv_items(self.source_file): if row[item_key] not in graphs: graphs[row[item_key]] = {} # make sure the months and days are zero-padded interval = row.get(date_key, "") interval = "-".join([ str(bit).zfill(2 if len(bit) != 4 else 4) for bit in interval.split("-") ]) first_date = min(first_date, interval) last_date = max(last_date, interval) if interval not in intervals: intervals.append(interval) if interval not in graphs[row[item_key]]: graphs[row[item_key]][interval] = 0 graphs[row[item_key]][interval] += float(row.get(value_key, 0)) # first make sure we actually have something to render intervals = sorted(intervals) if len(intervals) <= 1: self.dataset.update_status( "Not enough data for a side-by-side over-time visualisation.") self.dataset.finish(0) return # only retain most-occurring series - sort by sum of all frequencies if len(graphs) > top: selected_graphs = { graph: graphs[graph] for graph in sorted( graphs, key=lambda x: sum( [graphs[x][interval] for interval in graphs[x]]), reverse=True)[0:top] } graphs = selected_graphs # there may be items that do not have values for all intervals # this will distort the graph, so the next step is to make sure all # graphs consist of the same continuous interval list missing = {graph: 0 for graph in graphs} for graph in graphs: missing[graph], graphs[graph] = pad_interval( graphs[graph], first_interval=first_date, last_interval=last_date) # now that's done, make sure the graph datapoints are in order intervals = sorted(list(graphs[list(graphs)[0]].keys())) # delete graphs that do not have the required amount of intervals # this is useful to get rid of outliers and items that only occur # very few times over the full interval if completeness > 0: intervals_required = len(intervals) * (completeness / 100) disqualified = [] for graph in graphs: if len(intervals) - missing[graph] < intervals_required: disqualified.append(graph) graphs = { graph: graphs[graph] for graph in graphs if graph not in disqualified } # determine max value per item, so we can normalize them later limits = {} max_limit = 0 for graph in graphs: for interval in graphs[graph]: limits[graph] = max(limits.get(graph, 0), abs(graphs[graph][interval])) max_limit = max(max_limit, abs(graphs[graph][interval])) # order graphs by highest (or lowest) value) limits = { limit: limits[limit] for limit in sorted(limits, key=lambda l: limits[l]) } graphs = {graph: graphs[graph] for graph in limits} if not graphs: # maybe nothing is actually there to be graphed self.dataset.update_status( "No items match the selection criteria - nothing to 
visualise." ) self.dataset.finish(0) return None # how many vertical grid lines (and labels) are to be included at most # 12 is a sensible default because it allows one label per month for a full # year's data max_gridlines = 12 # If True, label is put at the lower left bottom of the graph rather than # outside it. Automatically set to True if one of the labels is long, as # else the label would fall off the screen label_in_graph = max([len(item) for item in graphs]) > 30 # determine how wide each interval should be # the graph has a minimum width - but the graph's width will be # extended if at this minimum width each item does not have the # minimum per-item width min_full_width = 600 min_item_width = 50 item_width = max(min_item_width, min_full_width / len(intervals)) # determine how much space each graph should get # same trade-off as for the interval width min_full_height = 300 min_item_height = 100 item_height = max(min_item_height, min_full_height / len(graphs)) # margin - this should be enough for the text labels to fit in margin_base = 50 margin_right = margin_base * 4 margin_top = margin_base * 3 # this determines the "flatness" of the isometric projection and an be # tweaked for different looks - basically corresponds to how far the # camera is above the horizon plane_angle = 120 # don't change these plane_obverse = radians((180 - plane_angle) / 2) plane_angle = radians(plane_angle) # okay, now determine the full graphic size with these dimensions projected # semi-isometrically. We can also use these values later for drawing for # drawing grid lines, et cetera. The axis widths and heights here are the # dimensions of the bounding box wrapping the isometrically projected axes. x_axis_length = (item_width * (len(intervals) - 1)) y_axis_length = (item_height * len(graphs)) x_axis_width = (sin(plane_angle / 2) * x_axis_length) y_axis_width = (sin(plane_angle / 2) * y_axis_length) canvas_width = x_axis_width + y_axis_width # leave room for graph header if graph_label: margin_top += (2 * (canvas_width / 50)) x_axis_height = (cos(plane_angle / 2) * x_axis_length) y_axis_height = (cos(plane_angle / 2) * y_axis_length) canvas_height = x_axis_height + y_axis_height # now we have the dimensions, the canvas can be instantiated canvas = get_4cat_canvas( self.dataset.get_results_path(), width=(canvas_width + margin_base + margin_right), height=(canvas_height + margin_base + margin_top), header=graph_label) # draw gridlines - vertical gridline_x = y_axis_width + margin_base gridline_y = margin_top + canvas_height step_x_horizontal = sin(plane_angle / 2) * item_width step_y_horizontal = cos(plane_angle / 2) * item_width step_x_vertical = sin(plane_angle / 2) * item_height step_y_vertical = cos(plane_angle / 2) * item_height # labels for x axis skip = max(1, int(len(intervals) / max_gridlines)) for i in range(0, len(intervals)): if i % skip == 0: canvas.add( Line(start=(gridline_x, gridline_y), end=(gridline_x - y_axis_width, gridline_y - y_axis_height), stroke="grey", stroke_width=0.25)) # to properly position the rotated and skewed text a container # element is needed label1 = str(intervals[i])[0:4] center = (gridline_x, gridline_y) container = SVG(x=center[0] - 25, y=center[1], width="50", height="1.5em", overflow="visible", style="font-size:0.8em;") container.add( Text(insert=("25%", "100%"), text=label1, transform="rotate(%f) skewX(%f)" % (-degrees(plane_obverse), degrees(plane_obverse)), text_anchor="middle", baseline_shift="-0.5em", style="font-weight:bold;")) if 
re.match(r"^[0-9]{4}-[0-9]{2}", intervals[i]): label2 = month_abbr[int(str(intervals[i])[5:7])] if re.match(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}", intervals[i]): label2 += " %i" % int(intervals[i][8:10]) container.add( Text(insert=("25%", "150%"), text=label2, transform="rotate(%f) skewX(%f)" % (-degrees(plane_obverse), degrees(plane_obverse)), text_anchor="middle", baseline_shift="-0.5em")) canvas.add(container) gridline_x += step_x_horizontal gridline_y -= step_y_horizontal # draw graphs as filled beziers top = step_y_vertical * 1.5 graph_start_x = y_axis_width + margin_base graph_start_y = margin_top + canvas_height # draw graphs in reverse order, so the bottom one is most in the # foreground (in case of overlap) for graph in reversed(list(graphs)): self.dataset.update_status("Rendering graph for '%s'" % graph) # path starting at lower left corner of graph area_graph = Path(fill=self.colours[self.colour_index]) area_graph.push("M %f %f" % (graph_start_x, graph_start_y)) previous_value = None graph_x = graph_start_x graph_y = graph_start_y for interval in graphs[graph]: # normalise value value = graphs[graph][interval] try: limit = limits[graph] if normalise_values else max_limit value = top * copysign(abs(value) / limit, value) except ZeroDivisionError: value = 0 if previous_value is None: # vertical line upwards to starting value of graph area_graph.push("L %f %f" % (graph_start_x, graph_start_y - value)) elif not smooth: area_graph.push("L %f %f" % (graph_x, graph_y - value)) else: # quadratic bezier from previous value to current value control_left = (graph_x - (step_x_horizontal / 2), graph_y + step_y_horizontal - previous_value - (step_y_horizontal / 2)) control_right = (graph_x - (step_x_horizontal / 2), graph_y - value + (step_y_horizontal / 2)) area_graph.push("C %f %f %f %f %f %f" % (*control_left, *control_right, graph_x, graph_y - value)) previous_value = value graph_x += step_x_horizontal graph_y -= step_y_horizontal # line to the bottom of the graph at the current Y position area_graph.push( "L %f %f" % (graph_x - step_x_horizontal, graph_y + step_y_horizontal)) area_graph.push("Z") # then close the Path canvas.add(area_graph) # add text labels - skewing is a bit complicated and we need a # "center" to translate the origins properly. if label_in_graph: insert = (graph_start_x + 5, graph_start_y - 10) else: insert = (graph_x - (step_x_horizontal) + 5, graph_y + step_y_horizontal - 10) # we need to take the skewing into account for the translation offset_y = tan(plane_obverse) * insert[0] canvas.add( Text(insert=(0, 0), text=graph, transform="skewY(%f) translate(%f %f)" % (-degrees(plane_obverse), insert[0], insert[1] + offset_y))) # cycle colours, back to the beginning if all have been used self.colour_index += 1 if self.colour_index >= len(self.colours): self.colour_index = 0 graph_start_x -= step_x_vertical graph_start_y -= step_y_vertical # draw gridlines - horizontal gridline_x = margin_base gridline_y = margin_top + canvas_height - y_axis_height for graph in graphs: gridline_x += step_x_vertical gridline_y += step_y_vertical canvas.add( Line(start=(gridline_x, gridline_y), end=(gridline_x + x_axis_width, gridline_y - x_axis_height), stroke="black", stroke_width=1)) # x axis canvas.add( Line(start=(margin_base + y_axis_width, margin_top + canvas_height), end=(margin_base + canvas_width, margin_top + canvas_height - x_axis_height), stroke="black", stroke_width=2)) # and finally save the SVG canvas.save(pretty=True) self.dataset.finish(len(graphs))
def process(self): """ Reads a CSV file, counts occurences of chosen values over all posts, and aggregates the results per chosen time frame """ # convenience variables timeframe = self.parameters.get("timeframe", self.options["timeframe"]["default"]) scope = self.parameters.get("scope", self.options["scope"]["default"]) rank_style = self.parameters.get("top-style", self.options["top-style"]["default"]) cutoff = convert_to_int(self.parameters.get("top", self.options["top"]["default"])) # This is needed to check for URLs in the "domain" and "url" columns for Reddit submissions datasource = self.parent.parameters.get("datasource") # now for the real deal self.dataset.update_status("Reading source file") overall_top = {} interval_top = {} for post in self.iterate_csv_items(self.source_file): # determine where to put this data if timeframe == "all": time_unit = "overall" else: try: timestamp = int(datetime.datetime.strptime(post["timestamp"], "%Y-%m-%d %H:%M:%S").timestamp()) except ValueError: timestamp = 0 date = datetime.datetime.fromtimestamp(timestamp) if timeframe == "year": time_unit = str(date.year) elif timeframe == "month": time_unit = str(date.year) + "-" + str(date.month).zfill(2) else: time_unit = str(date.year) + "-" + str(date.month).zfill(2) + "-" + str(date.day).zfill(2) if time_unit not in interval_top: interval_top[time_unit] = {} if scope == "unambiguous": terms = post["hatebase_terms_unambiguous"] elif scope == "ambiguous": terms = post["hatebase_terms_ambiguous"] else: terms = post["hatebase_terms"] terms = terms.split(",") if not terms: continue for term in terms: if not term.strip(): continue if term not in overall_top: overall_top[term] = 0 overall_top[term] += 1 if term not in interval_top[time_unit]: interval_top[time_unit][term] = 0 interval_top[time_unit][term] += 1 # this eliminates all items from the results that were not in the # *overall* top-occuring items. This only has an effect when vectors # were generated for multiple intervals if rank_style == "overall": overall_top = {item: overall_top[item] for item in sorted(overall_top, key=lambda x: overall_top[x], reverse=True)[0:cutoff]} filtered_results = {} for interval in interval_top: filtered_results[interval] = {} for term in interval_top[interval]: if term in overall_top: filtered_results[interval][term] = interval_top[interval][term] interval_top = filtered_results rows = [] for interval in interval_top: interval_top[interval] = {term: interval_top[interval][term] for term in sorted(interval_top[interval], reverse=True, key=lambda x: interval_top[interval][x])[0:cutoff]} for interval in sorted(interval_top): for term in interval_top[interval]: rows.append({ "date": interval, "item": term, "frequency": interval_top[interval][term] }) # write as csv if rows: self.write_csv_items_and_finish(rows) else: self.dataset.finish(0)
def process(self): """ This takes a 4CAT results file as input, and outputs a new CSV file with one column with image hashes, one with the first file name used for the image, and one with the amount of times the image was used """ self.dataset.update_status("Reading source file") # prepare ImageFile.LOAD_TRUNCATED_IMAGES = True sample_max = 75 # image size for colour sampling def numpy_to_rgb(numpy_array): """ Helper function to go from numpy array to list of RGB strings Used in the K-Means clustering part """ return ",".join([str(int(value)) for value in numpy_array]) max_images = convert_to_int(self.parameters.get("amount"), 100) sizing_mode = self.parameters.get("tile-size", self.options["tile-size"]["default"]) sort_mode = self.parameters.get("sort-mode") # is there anything to put on a wall? if self.source_dataset.num_rows == 0: self.dataset.update_status( "No images available to render to image wall.", is_final=True) self.dataset.finish(0) return # 0 = use as many images as in the archive, up to the max if max_images == 0: max_images = self.options["amount"]["max"] # we loop through the images twice - once to reduce them to a value # that can be sorted, and another time to actually copy them to the # canvas for the image wall # we create a staging area manually here, so it is not automatically # deleted after one loop, since we need two staging_area = self.dataset.get_staging_area() # first, extract and reduce, and store the sortable value in a # dictionary with the image file name as key image_colours = {} dimensions = {} # used to calculate optimal tile size later index = 0 random_values = list(range(0, self.source_dataset.num_rows)) random.shuffle(random_values) for path in self.iterate_archive_contents(self.source_file, staging_area): if self.interrupted: raise ProcessorInterruptedException( "Interrupted while determining image wall order") try: picture = Image.open(str(path)) except UnidentifiedImageError: self.dataset.update_status( "Image %s could not be parsed. Skipping." % path) continue self.dataset.update_status( "Analysing %s (%i/%i)" % (path.name, len(dimensions), self.source_dataset.num_rows)) # these calculations can take ages for huge images, so resize if it is # larger than the threshold dimensions[path.name] = (picture.width, picture.height) if sort_mode not in ("", "random") and (picture.height > sample_max or picture.width > sample_max): sample_width = int(sample_max * picture.width / max(picture.width, picture.height)) sample_height = int(sample_max * picture.height / max(picture.width, picture.height)) picture = ImageOps.fit(picture, (sample_width, sample_height)) if sort_mode not in ("", "random"): # ensure we get RGB values for pixels picture = picture.convert("RGB") # determine a 'representative colour' if sort_mode == "random": # just randomly sort it, don't even look at the colours value = random_values.pop() elif sort_mode in ("average-rgb", "average-hsv"): # average colour, as RGB or HSV pixels = picture.getdata() if sort_mode == "average-hsv": pixels = [colorsys.rgb_to_hsv(*pixel) for pixel in pixels] sum_colour = (sum([p[0] for p in pixels]), sum([p[1] for p in pixels]), sum([p[2] for p in pixels])) avg_colour = (sum_colour[0] / len(pixels), sum_colour[1] / len(pixels), sum_colour[2] / len(pixels)) # this is a bit dumb, but since all the other modes return rgb... 
if sort_mode == "average-hsv": avg_colour = colorsys.hsv_to_rgb(*avg_colour) value = avg_colour elif sort_mode == "dominant": # most-occurring colour colours = picture.getcolors(picture.width * picture.height) colours = sorted(colours, key=lambda x: x[0], reverse=True) value = colours[0][1] elif sort_mode in ("kmeans-dominant", "kmeans-average"): # use k-means clusters to determine the representative colour # this is more computationally expensive but gives far better # results. # determine k-means clusters for this image, i.e. the n most # dominant "average" colours, in this case n=3 (make parameter?) pixels = picture.getdata() clusters = KMeans(n_clusters=3, random_state=0) # 0 so it is deterministic predicted_centroids = clusters.fit_predict(pixels).tolist() # now we have two options - if sort_mode == "kmeans-dominant": # the colour of the single most dominant k-means centroid ranked_centroids = {} for index in range(0, len(clusters.cluster_centers_)): ranked_centroids[numpy_to_rgb( clusters.cluster_centers_[index] )] = predicted_centroids.count(index) value = [ int(v) for v in sorted(ranked_centroids, key=lambda k: ranked_centroids[k], reverse=True)[0].split(",") ] elif sort_mode == "kmeans-average": # average colour of all k-means centroids, weighted by the # dominance of each centroid value = [0, 0, 0] for index in clusters.labels_: value[0] += clusters.cluster_centers_[index][0] value[1] += clusters.cluster_centers_[index][1] value[2] += clusters.cluster_centers_[index][2] value[0] /= len(clusters.labels_) value[1] /= len(clusters.labels_) value[2] /= len(clusters.labels_) else: value = (0, 0, 0) # converted to HSV, because RGB does not sort nicely image_colours[path.name] = colorsys.rgb_to_hsv(*value) index += 1 # only retain the top n of the sorted list of images - this gives us # our final image set sorted_image_files = [ path for path in sorted( image_colours, key=lambda k: image_colours[k])[:max_images] ] dimensions = {path: dimensions[path] for path in sorted_image_files} average_size = (sum([k[0] for k in dimensions.values()]) / len(dimensions), sum([k[1] for k in dimensions.values()]) / len(dimensions)) self.dataset.update_status("Determining canvas and image sizes") # calculate 'tile sizes' (a tile is an image) and also the size of the # canvas we will need to fit them all. The canvas can never be larger than # this: max_pixels = self.TARGET_WIDTH * self.TARGET_HEIGHT if sizing_mode == "fit-height": # assuming every image has the overall average height, how wide would # the canvas need to be (if everything is on a single row)? 
full_width = 0 tile_y = average_size[1] for dimension in dimensions.values(): # ideally, we make everything the average height optimal_ratio = average_size[1] / dimension[1] full_width += dimension[0] * optimal_ratio # now we can calculate the total amount of pixels needed fitted_pixels = full_width * tile_y if fitted_pixels > max_pixels: # try again with a lower height area_ratio = max_pixels / fitted_pixels tile_y = int(tile_y * math.sqrt(area_ratio)) fitted_pixels = max_pixels # find the canvas size that can fit this amount of pixels at the # required proportions, provided that y = multiple of avg height ideal_height = math.sqrt(fitted_pixels / (self.TARGET_WIDTH / self.TARGET_HEIGHT)) size_y = math.ceil(ideal_height / tile_y) * tile_y size_x = fitted_pixels / size_y tile_x = -1 # varies elif sizing_mode == "square": # assuming each image is square, find a canvas with the right # proportions that would fit all of them # assume the average dimensions tile_size = int(sum(average_size) / 2) # this is how many pixels we need fitted_pixels = tile_size * tile_size * len(sorted_image_files) # does that fit our canvas? if fitted_pixels > max_pixels: tile_size = math.floor( math.sqrt(max_pixels / len(sorted_image_files))) fitted_pixels = tile_size * tile_size * len(sorted_image_files) ideal_width = math.sqrt(fitted_pixels / (self.TARGET_HEIGHT / self.TARGET_WIDTH)) size_x = math.ceil(ideal_width / tile_size) * tile_size size_y = math.ceil(fitted_pixels / size_x / tile_size) * tile_size tile_x = tile_y = tile_size elif sizing_mode == "average": tile_x = int(average_size[0]) tile_y = int(average_size[1]) fitted_pixels = tile_x * tile_y * len(sorted_image_files) if fitted_pixels > max_pixels: area_ratio = max_pixels / fitted_pixels tile_x = int(tile_x * math.sqrt(area_ratio)) tile_y = int(tile_y * math.sqrt(area_ratio)) fitted_pixels = tile_x * tile_y * len(sorted_image_files) ideal_width = math.sqrt(fitted_pixels / (self.TARGET_HEIGHT / self.TARGET_WIDTH)) size_x = math.ceil(ideal_width / tile_x) * tile_x size_y = math.ceil(fitted_pixels / size_x / tile_y) * tile_y else: raise NotImplementedError("Sizing mode '%s' not implemented" % sizing_mode) self.dataset.log("Canvas size is %ix%i" % (size_x, size_y)) wall = Image.new("RGBA", (int(size_x), int(size_y))) ImageDraw.floodfill(wall, (0, 0), (255, 255, 255, 0)) # transparent background counter = 0 offset_x = 0 offset_y = 0 tile_x = int(tile_x) tile_y = int(tile_y) # now actually putting the images on a wall is relatively trivial for path in sorted_image_files: counter += 1 self.dataset.update_status( "Rendering %s (%i/%i) to image wall" % (path, counter, len(sorted_image_files))) picture = Image.open(str(staging_area.joinpath(path))) if tile_x == -1: picture_x = max(1, int(picture.width * (tile_y / picture.height))) picture = ImageOps.fit(picture, (picture_x, tile_y), method=Image.BILINEAR) else: picture = ImageOps.fit(picture, (tile_x, tile_y), method=Image.BILINEAR) # simply put them side by side until the right edge is reached, # then move to a new row if offset_x + picture.width > wall.width: offset_x = 0 offset_y += picture.height # this can happen in some edge cases: there is an extra row of # images we hadn't accounted for. In that case, simply enlarge the # canvas. 
if offset_y + picture.height > wall.height: new_wall = Image.new("RGBA", (wall.width, offset_y + picture.height)) ImageDraw.floodfill( new_wall, (0, 0), (255, 255, 255, 0)) # transparent background new_wall.paste(wall, (0, 0)) wall = new_wall wall.paste(picture, (offset_x, offset_y)) offset_x += picture.width # finish up self.dataset.update_status("Saving result") wall.save(str(self.dataset.get_results_path())) shutil.rmtree(staging_area) self.dataset.update_status("Finished") self.dataset.finish(counter)
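# The sort key used above reduces every image to a single HSV triple before ordering.
# A minimal sketch of the "average-rgb" branch on its own, assuming Pillow is
# available (average_hsv is an illustrative helper, not the 4CAT implementation):
import colorsys
from PIL import Image

def average_hsv(path, sample_max=75):
    """Downscale an image and return the HSV of its average RGB colour."""
    picture = Image.open(path).convert("RGB")
    # shrink large images first: the average barely changes, but getdata() gets much cheaper
    picture.thumbnail((sample_max, sample_max))
    pixels = list(picture.getdata())
    average = tuple(sum(channel) / len(pixels) for channel in zip(*pixels))
    return colorsys.rgb_to_hsv(*average)

# sorting file paths by this key groups similarly coloured images together, which is
# what gives the finished wall its gradient-like look:
# sorted(image_paths, key=average_hsv)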