def process(self):
    """
    Annotate the images in the source archive with the Google Vision API

    Each image in the source archive is passed to `self.annotate_image`
    together with the API key and the requested features. Every non-empty
    response is appended to the results file as one JSON object per line,
    with the image's file name added under the `file_name` key.

    The API key is deleted from the dataset parameters as soon as it has
    been read, since it is sensitive. Processing stops early when the
    requested amount of images has been handled, or when annotating raises
    a RuntimeError.
    """
    api_key = self.parameters.get("api_key")
    self.dataset.delete_parameter("api_key")  # sensitive, delete after use

    if not api_key:
        self.dataset.update_status("You need to provide a valid API key", is_final=True)
        self.dataset.finish(0)
        return

    # the parameter may be missing; treat that as "no features requested"
    # instead of failing on iterating over None
    features = [{"type": feature} for feature in self.parameters.get("features") or []]

    max_images = convert_to_int(self.parameters.get("amount", 0), 100)
    total = self.source_dataset.num_rows if not max_images else min(max_images, self.source_dataset.num_rows)
    done = 0

    for image_file in self.iterate_archive_contents(self.source_file):
        if self.interrupted:
            raise ProcessorInterruptedException("Interrupted while fetching data from Google Vision API")

        done += 1
        self.dataset.update_status("Annotating image %i/%i" % (done, total))

        try:
            annotations = self.annotate_image(image_file, api_key, features)
        except RuntimeError:
            # cannot continue fetching, e.g. when API key is invalid
            break

        if annotations:
            annotations = {"file_name": image_file.name, **annotations}
            with self.dataset.get_results_path().open("a", encoding="utf-8") as outfile:
                outfile.write(json.dumps(annotations) + "\n")

        # this check must also run for images without annotations, else an
        # empty response would let the loop run past the requested amount
        if max_images and done >= max_images:
            break

    self.dataset.update_status("Annotations retrieved for %i images" % done)
    self.dataset.finish(done)
def get_processor_pipeline(self):
    """
    Assemble the chain of post-processors that annotates images

    The pipeline has four stages: the top images in the dataset are
    extracted, the requested amount of them is downloaded, the downloaded
    images are annotated with the Google Vision API, and finally the
    resulting NDJSON is converted to a CSV file for easy processing.

    :return list:  Processor descriptions (`type` and `parameters`), in the
    order in which they should run
    """
    settings = self.parameters
    image_count = convert_to_int(settings.get("amount", 10), 10)
    vision_key = settings.get("api_key", "")
    vision_features = settings.get("features", "")

    # the key is sensitive: now that we hold our own copy, remove it from
    # the dataset as soon as possible
    self.dataset.delete_parameter("api_key")

    stages = []

    # stage 1: extract top images
    stages.append({
        "type": "top-images",
        "parameters": {"overwrite": False},
    })

    # stage 2: download the images we want to annotate
    stages.append({
        "type": "image-downloader",
        "parameters": {"amount": image_count, "overwrite": False},
    })

    # stage 3: annotate the downloaded images with the Google Vision API
    stages.append({
        "type": "google-vision-api",
        "parameters": {
            "features": vision_features,
            "amount": image_count,
            "api_key": vision_key,
        },
    })

    # stage 4: create a simplified CSV file from the downloaded NDJSON
    # (which can also be retrieved later)
    stages.append({
        "type": "convert-vision-to-csv",
        "parameters": {},
    })

    return stages
def process(self):
    """
    Takes the thumbnails downloaded from YouTube metadata and turns them
    into an image wall.

    The actual rendering is delegated to `self.make_imagewall`; this method
    only collects the arguments for it.
    """
    # path to the YouTube csv data that was the source of the thumbnails:
    # this is read from the third-to-last entry of this dataset's genealogy
    root_csv = self.dataset.get_genealogy()[-3].get_results_path()

    max_amount = convert_to_int(self.parameters.get("max_amount", 0), 0)
    category_overlay = self.parameters.get("category_overlay")

    # Build that wall!
    self.make_imagewall(root_csv, max_amount=max_amount, category_overlay=category_overlay)
def process(self):
    """
    Render per-item values over time as a semi-isometric SVG graph

    Rows are read from the source file; each row's `item` becomes a
    separate graph, its `date` the interval and its `value` is added to
    that interval's total. Only the `top` items with the highest summed
    values are kept, the graphs are padded to a shared interval range with
    `pad_interval`, and items that fall below the `complete` percentage of
    intervals are dropped. The remaining graphs are drawn behind one
    another as filled areas on an SVG canvas, which is saved to the
    dataset's results path.

    The dataset is finished with 0 items if there are fewer than two
    intervals or if no graphs remain after filtering; otherwise it is
    finished with the number of graphs drawn.
    """
    graphs = {}
    intervals = []

    smooth = self.parameters.get("smooth")
    normalise_values = self.parameters.get("normalise")
    completeness = convert_to_int(self.parameters.get("complete"), 0)
    graph_label = self.parameters.get("label")
    top = convert_to_int(self.parameters.get("top"), 10)

    # first gather graph data: each distinct item gets its own graph and
    # for each graph we have a sequence of intervals, each interval with
    # its own value
    # the sentinels below sort after/before any real zero-padded date string
    first_date = "9999-99-99"
    last_date = "0000-00-00"

    for row in self.iterate_items(self.source_file):
        if row["item"] not in graphs:
            graphs[row["item"]] = {}

        # make sure the months and days are zero-padded
        # (four-character parts, i.e. years, are left as they are)
        interval = row.get("date", "")
        interval = "-".join([
            str(bit).zfill(2 if len(bit) != 4 else 4)
            for bit in interval.split("-")
        ])

        # string comparison works here because all parts are zero-padded
        first_date = min(first_date, interval)
        last_date = max(last_date, interval)

        if interval not in intervals:
            intervals.append(interval)

        if interval not in graphs[row["item"]]:
            graphs[row["item"]][interval] = 0

        graphs[row["item"]][interval] += float(row.get("value", 0))

    # first make sure we actually have something to render
    intervals = sorted(intervals)
    if len(intervals) <= 1:
        self.dataset.update_status(
            "Not enough data for a side-by-side over-time visualisation.")
        self.dataset.finish(0)
        return

    # only retain most-occurring series - sort by sum of all frequencies
    if len(graphs) > top:
        selected_graphs = {
            graph: graphs[graph]
            for graph in sorted(
                graphs,
                key=lambda x: sum(
                    [graphs[x][interval] for interval in graphs[x]]),
                reverse=True)[0:top]
        }
        graphs = selected_graphs

    # there may be items that do not have values for all intervals
    # this will distort the graph, so the next step is to make sure all
    # graphs consist of the same continuous interval list
    # NOTE(review): pad_interval presumably returns (number of intervals
    # that had to be added, padded graph) - confirm against its definition
    missing = {graph: 0 for graph in graphs}
    for graph in graphs:
        missing[graph], graphs[graph] = pad_interval(
            graphs[graph],
            first_interval=first_date,
            last_interval=last_date)

    # now that's done, make sure the graph datapoints are in order
    # (the interval list is taken from the first graph, since after padding
    # all graphs are expected to share the same intervals)
    intervals = sorted(list(graphs[list(graphs)[0]].keys()))

    # delete graphs that do not have the required amount of intervals
    # this is useful to get rid of outliers and items that only occur
    # very few times over the full interval
    if completeness > 0:
        # `completeness` is a percentage of the total number of intervals
        intervals_required = len(intervals) * (completeness / 100)
        disqualified = []
        for graph in graphs:
            if len(intervals) - missing[graph] < intervals_required:
                disqualified.append(graph)

        graphs = {
            graph: graphs[graph]
            for graph in graphs if graph not in disqualified
        }

    # determine max value per item, so we can normalize them later
    # `limits` holds the peak absolute value per graph, `max_limit` the peak
    # absolute value over all graphs
    limits = {}
    max_limit = 0
    for graph in graphs:
        for interval in graphs[graph]:
            limits[graph] = max(limits.get(graph, 0),
                                abs(graphs[graph][interval]))
            max_limit = max(max_limit, abs(graphs[graph][interval]))

    # order graphs by their peak absolute value, lowest first
    limits = {
        limit: limits[limit]
        for limit in sorted(limits, key=lambda l: limits[l])
    }
    graphs = {graph: graphs[graph] for graph in limits}

    if not graphs:
        # maybe nothing is actually there to be graphed
        self.dataset.update_status(
            "No items match the selection criteria - nothing to visualise."
        )
        self.dataset.finish(0)
        return None

    # how many vertical grid lines (and labels) are to be included at most
    # 12 is a sensible default because it allows one label per month for a full
    # year's data
    max_gridlines = 12

    # If True, label is put at the lower left bottom of the graph rather than
    # outside it. Automatically set to True if one of the labels is long, as
    # else the label would fall off the screen
    label_in_graph = max([len(item) for item in graphs]) > 30

    # determine how wide each interval should be
    # the graph has a minimum width - but the graph's width will be
    # extended if at this minimum width each item does not have the
    # minimum per-item width
    min_full_width = 600
    min_item_width = 50
    item_width = max(min_item_width, min_full_width / len(intervals))

    # determine how much space each graph should get
    # same trade-off as for the interval width
    min_full_height = 300
    min_item_height = 100
    item_height = max(min_item_height, min_full_height / len(graphs))

    # margin - this should be enough for the text labels to fit in
    margin_base = 50
    margin_right = margin_base * 4
    margin_top = margin_base * 3

    # this determines the "flatness" of the isometric projection and can be
    # tweaked for different looks - basically corresponds to how far the
    # camera is above the horizon
    plane_angle = 120

    # don't change these
    # both angles are converted to radians; note that `plane_angle` is
    # overwritten with its radian value here
    plane_obverse = radians((180 - plane_angle) / 2)
    plane_angle = radians(plane_angle)

    # okay, now determine the full graphic size with these dimensions projected
    # semi-isometrically. We can also use these values later for drawing
    # grid lines, et cetera. The axis widths and heights here are the
    # dimensions of the bounding box wrapping the isometrically projected axes.
    x_axis_length = (item_width * (len(intervals) - 1))
    y_axis_length = (item_height * len(graphs))

    x_axis_width = (sin(plane_angle / 2) * x_axis_length)
    y_axis_width = (sin(plane_angle / 2) * y_axis_length)
    canvas_width = x_axis_width + y_axis_width

    # leave room for graph header
    if graph_label:
        margin_top += (2 * (canvas_width / 50))

    x_axis_height = (cos(plane_angle / 2) * x_axis_length)
    y_axis_height = (cos(plane_angle / 2) * y_axis_length)
    canvas_height = x_axis_height + y_axis_height

    # now we have the dimensions, the canvas can be instantiated
    canvas = get_4cat_canvas(
        self.dataset.get_results_path(),
        width=(canvas_width + margin_base + margin_right),
        height=(canvas_height + margin_base + margin_top),
        header=graph_label)

    # draw gridlines - vertical
    # start at the front corner of the graph area and step along the x axis
    gridline_x = y_axis_width + margin_base
    gridline_y = margin_top + canvas_height

    # projected on-canvas offsets for one step along the interval axis
    # ("horizontal") and one step along the item axis ("vertical")
    step_x_horizontal = sin(plane_angle / 2) * item_width
    step_y_horizontal = cos(plane_angle / 2) * item_width
    step_x_vertical = sin(plane_angle / 2) * item_height
    step_y_vertical = cos(plane_angle / 2) * item_height

    # labels for x axis
    # month and week both follow the same pattern
    # it's not always possible to distinguish between them but we will try
    # by looking for months greater than 12 in which case we are dealing
    # with weeks
    # we need to know this because for months there is an extra row in the
    # label with the full month
    is_week = False
    for i in range(0, len(intervals)):
        if re.match(r"^[0-9]{4}-[0-9]{2}", intervals[i]) and int(
                intervals[i].split("-")[1]) > 12:
            is_week = True
            break

    # only every `skip`-th interval gets a gridline and label, so that no
    # more than roughly `max_gridlines` of them are drawn
    skip = max(1, int(len(intervals) / max_gridlines))
    for i in range(0, len(intervals)):
        if i % skip == 0:
            canvas.add(
                Line(start=(gridline_x, gridline_y),
                     end=(gridline_x - y_axis_width,
                          gridline_y - y_axis_height),
                     stroke="grey",
                     stroke_width=0.25))

            # to properly position the rotated and skewed text a container
            # element is needed
            # first label row: the first four characters of the interval
            label1 = str(intervals[i])[0:4]
            center = (gridline_x, gridline_y)
            container = SVG(x=center[0] - 25,
                            y=center[1],
                            width="50",
                            height="1.5em",
                            overflow="visible",
                            style="font-size:0.8em;")
            container.add(
                Text(insert=("25%", "100%"),
                     text=label1,
                     transform="rotate(%f) skewX(%f)" %
                     (-degrees(plane_obverse), degrees(plane_obverse)),
                     text_anchor="middle",
                     baseline_shift="-0.5em",
                     style="font-weight:bold;"))

            # second label row: abbreviated month name, plus the day of the
            # month if the interval has a day part
            if re.match(r"^[0-9]{4}-[0-9]{2}",
                        intervals[i]) and not is_week:
                label2 = month_abbr[int(str(intervals[i])[5:7])]
                if re.match(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}", intervals[i]):
                    label2 += " %i" % int(intervals[i][8:10])

                container.add(
                    Text(insert=("25%", "150%"),
                         text=label2,
                         transform="rotate(%f) skewX(%f)" %
                         (-degrees(plane_obverse), degrees(plane_obverse)),
                         text_anchor="middle",
                         baseline_shift="-0.5em"))

            canvas.add(container)

        # move to the next interval's position, whether or not a gridline
        # was drawn for this one
        gridline_x += step_x_horizontal
        gridline_y -= step_y_horizontal

    # draw graphs as filled beziers
    # NOTE: from here on `top` no longer holds the "top" parameter but the
    # on-canvas height that the largest value is scaled to
    top = step_y_vertical * 1.5
    graph_start_x = y_axis_width + margin_base
    graph_start_y = margin_top + canvas_height

    # draw graphs in reverse order, so the bottom one is most in the
    # foreground (in case of overlap)
    for graph in reversed(list(graphs)):
        self.dataset.update_status("Rendering graph for '%s'" % graph)

        # path starting at lower left corner of graph
        area_graph = Path(fill=self.colours[self.colour_index])
        area_graph.push("M %f %f" % (graph_start_x, graph_start_y))
        previous_value = None

        graph_x = graph_start_x
        graph_y = graph_start_y
        for interval in graphs[graph]:
            # normalise value
            # either relative to this graph's own peak or to the peak over
            # all graphs; copysign retains the sign of the original value
            value = graphs[graph][interval]
            try:
                limit = limits[graph] if normalise_values else max_limit
                value = top * copysign(abs(value) / limit, value)
            except ZeroDivisionError:
                value = 0

            if previous_value is None:
                # vertical line upwards to starting value of graph
                area_graph.push("L %f %f" %
                                (graph_start_x, graph_start_y - value))
            elif not smooth:
                # straight line from previous value to current value
                area_graph.push("L %f %f" % (graph_x, graph_y - value))
            else:
                # cubic bezier (SVG "C" command, two control points) from
                # previous value to current value
                control_left = (graph_x - (step_x_horizontal / 2),
                                graph_y + step_y_horizontal -
                                previous_value - (step_y_horizontal / 2))
                control_right = (graph_x - (step_x_horizontal / 2),
                                 graph_y - value + (step_y_horizontal / 2))
                area_graph.push("C %f %f %f %f %f %f" %
                                (*control_left, *control_right, graph_x,
                                 graph_y - value))

            previous_value = value
            graph_x += step_x_horizontal
            graph_y -= step_y_horizontal

        # line to the bottom of the graph at the current Y position
        # (graph_x/graph_y were already advanced past the last point, so
        # step back once)
        area_graph.push(
            "L %f %f" %
            (graph_x - step_x_horizontal, graph_y + step_y_horizontal))
        area_graph.push("Z")  # then close the Path
        canvas.add(area_graph)

        # add text labels - skewing is a bit complicated and we need a
        # "center" to translate the origins properly.
        if label_in_graph:
            insert = (graph_start_x + 5, graph_start_y - 10)
        else:
            insert = (graph_x - (step_x_horizontal) + 5,
                      graph_y + step_y_horizontal - 10)

        # we need to take the skewing into account for the translation
        offset_y = tan(plane_obverse) * insert[0]
        canvas.add(
            Text(insert=(0, 0),
                 text=graph,
                 transform="skewY(%f) translate(%f %f)" %
                 (-degrees(plane_obverse), insert[0],
                  insert[1] + offset_y)))

        # cycle colours, back to the beginning if all have been used
        self.colour_index += 1
        if self.colour_index >= len(self.colours):
            self.colour_index = 0

        # the next graph is drawn one step further back along the item axis
        graph_start_x -= step_x_vertical
        graph_start_y -= step_y_vertical

    # draw gridlines - horizontal
    # one line per graph, starting from the far end of the item axis
    gridline_x = margin_base
    gridline_y = margin_top + canvas_height - y_axis_height
    for graph in graphs:
        gridline_x += step_x_vertical
        gridline_y += step_y_vertical
        canvas.add(
            Line(start=(gridline_x, gridline_y),
                 end=(gridline_x + x_axis_width,
                      gridline_y - x_axis_height),
                 stroke="black",
                 stroke_width=1))

    # x axis
    canvas.add(
        Line(start=(margin_base + y_axis_width, margin_top + canvas_height),
             end=(margin_base + canvas_width,
                  margin_top + canvas_height - x_axis_height),
             stroke="black",
             stroke_width=2))

    # and finally save the SVG
    canvas.save(pretty=True)
    self.dataset.finish(len(graphs))
def process(self):
    """
    This takes previously generated Word2Vec models and uses them to find
    similar words based on a list of words

    For every `*.model` file in the source archive, the most similar words
    are looked up for each input word. Words at or above the similarity
    threshold are recorded, and - if the crawl depth allows - queued so
    that their own similar words are looked up in the next level. The
    result is written as a CSV file with one row per (model, input word,
    similar word) combination.

    The unpacked archive is always removed again, also when processing is
    interrupted or fails.
    """
    self.dataset.update_status("Processing sentences")

    # crawl depth is clamped to 1-3
    depth = max(1, min(3, convert_to_int(self.parameters.get("crawl_depth"))))

    # split on commas, dropping surrounding whitespace and empty entries, so
    # that "cat, dog" looks up "dog" rather than " dog"
    raw_words = self.parameters.get("words", "") or ""
    input_words = [word.strip() for word in raw_words.split(",")]
    input_words = [word for word in input_words if word]
    if not input_words:
        self.dataset.update_status("No input words provided, cannot look for similar words.", is_final=True)
        self.dataset.finish(0)
        return

    num_words = convert_to_int(self.parameters.get("num-words"))
    try:
        threshold = float(self.parameters.get("threshold"))
    except (ValueError, TypeError):
        # unparseable (ValueError) or missing (TypeError) value: use the
        # default defined in the processor options
        threshold = float(self.get_options()["threshold"]["default"])

    # the threshold is clamped to the range -1 to 1
    threshold = max(-1.0, min(1.0, threshold))

    # go through all models and calculate similarity for all given input words
    result = []
    staging_area = self.unpack_archive_contents(self.source_file)
    try:
        for model_file in staging_area.glob("*.model"):
            interval = model_file.stem

            # for each separate model, calculate top similar words for each
            # input word, giving us at most
            #   [max amount] * [number of input] * [number of intervals]
            # items
            self.dataset.update_status("Running model %s..." % model_file.name)
            model = KeyedVectors.load(str(model_file))
            word_queue = set()
            checked_words = set()
            level = 1

            words = input_words.copy()
            while words:
                if self.interrupted:
                    raise ProcessorInterruptedException("Interrupted while extracting similar words")

                word = words.pop()
                checked_words.add(word)

                try:
                    similar_words = model.most_similar(positive=[word], topn=num_words)
                except KeyError:
                    # word is not in this model's vocabulary. There is
                    # nothing to record, but do not skip the rest of the
                    # iteration: if this was the last word of this level,
                    # the queued words for the next level still need to be
                    # loaded below
                    similar_words = []

                for similar_word in similar_words:
                    if similar_word[1] < threshold:
                        continue

                    result.append({
                        "date": interval,
                        "input": word,
                        "item": similar_word[0],
                        "value": similar_word[1],
                        "input_occurences": model.vocab[word].count,
                        "item_occurences": model.vocab[similar_word[0]].count,
                        "depth": level
                    })

                    # queue word for the next iteration if there is one and
                    # it hasn't been seen yet
                    if level < depth and similar_word[0] not in checked_words:
                        word_queue.add(similar_word[0])

                # if all words have been checked, but we still have an
                # iteration to go, load the queued words into the list
                if not words and word_queue and level < depth:
                    level += 1
                    words = word_queue.copy()
                    word_queue = set()
    finally:
        # the unpacked models are no longer needed, whatever happened
        shutil.rmtree(staging_area)

    if not result:
        self.dataset.update_status("None of the words were found in the word embedding model.", is_final=True)
        self.dataset.finish(0)
    else:
        self.write_csv_items_and_finish(result)
def process(self):
    """
    Reads vector set and creates a CSV with ranked vectors

    Each file in the source archive holds the vectors for one interval, as
    a list of (item, value) pairs. Per interval the vectors are sorted by
    value, highest first. Depending on the `top-style` parameter, either
    the top items per interval are kept ("per-item"), or only the items
    that rank highest when their values are summed over all intervals
    ("overall"). The result is written as a CSV file with `date`, `item`
    and `value` columns.
    """
    self.dataset.update_status("Processing token sets")

    results = []

    # truncate results as needed
    rank_style = self.parameters.get("top-style")
    cutoff = convert_to_int(self.parameters.get("top"))

    # now rank the vectors by most prevalent per "file" (i.e. interval)
    overall_top = {}
    index = 0
    for vector_file in self.iterate_archive_contents(self.source_file):
        # we support both pickle and json dumps of vectors
        # note that Path.suffix includes the leading dot
        # NOTE(security): unpickling runs arbitrary code, so .pb files must
        # only ever come from trusted (i.e. our own) processors
        vector_unpacker = pickle if vector_file.suffix == ".pb" else json

        index += 1
        vector_set_name = vector_file.stem  # we don't need the full path
        self.dataset.update_status("Processing token set %i (%s)" % (index, vector_set_name))

        with vector_file.open("rb") as binary_tokens:
            # pickle dumps need binary mode; json.load accepts it as well
            vectors = vector_unpacker.load(binary_tokens)

        vectors = sorted(vectors, key=lambda x: x[1], reverse=True)

        # for overall ranking we need the full vector space per interval
        # because maybe an overall top-ranking vector is at the bottom
        # in this particular interval - we'll truncate the top list at
        # a later point in that case. Else, truncate it here
        if rank_style == "per-item":
            vectors = vectors[0:cutoff]

        for vector in vectors:
            # skip items that are empty or whitespace only
            if not vector[0].strip():
                continue

            results.append({
                "date": vector_set_name.split(".")[0],
                "item": vector[0],
                "value": vector[1]
            })

            overall_top[vector[0]] = overall_top.get(vector[0], 0) + int(vector[1])

    # this eliminates all items from the results that were not in the
    # *overall* top-occuring items. This only has an effect when vectors
    # were generated for multiple intervals
    if rank_style == "overall":
        top_items = set(sorted(overall_top, key=lambda x: overall_top[x], reverse=True)[0:cutoff])
        results = [item for item in results if item["item"] in top_items]

    # done!
    self.dataset.update_status("Writing results file")
    # newline="" lets the csv module control line endings itself
    with open(self.dataset.get_results_path(), "w", encoding="utf-8", newline="") as output:
        writer = csv.DictWriter(output, fieldnames=("date", "item", "value"))
        writer.writeheader()
        writer.writerows(results)

    self.dataset.update_status("Finished")
    self.dataset.finish(len(results))
def process(self):
    """
    Train a word embedding model for each token set in the source archive

    For every file in the source archive a model of the configured type
    (Word2Vec or FastText) is trained on the tokens in that file,
    optionally after detecting bigrams. Only the KeyedVectors of each model
    are saved, to a staging area that is archived as the result once all
    token sets have been processed.

    Token sets with too little data to build a vocabulary are skipped. If
    no model could be trained at all, or if the input cannot be decoded,
    the staging area is removed and the dataset is finished with 0 items.
    """
    self.dataset.update_status("Processing sentences")

    use_skipgram = 1 if self.parameters.get("algorithm") == "skipgram" else 0
    # window is clamped to 1-10
    window = min(10, max(1, convert_to_int(self.parameters.get("window"))))
    use_negative = 5 if self.parameters.get("negative") else 0
    min_count = max(1, convert_to_int(self.parameters.get("min_count")))
    dimensionality = convert_to_int(self.parameters.get("dimensionality"), 100)
    detect_bigrams = self.parameters.get("detect-bigrams")
    model_type = self.parameters.get("model-type")
    max_words = convert_to_int(self.parameters.get("max_words"))

    if max_words == 0:
        # unlimited amount of words in model
        max_words = None

    # resolve the model class before creating the staging area, so that an
    # unknown model type does not leave an empty staging area behind
    model_builder = {
        "Word2Vec": Word2Vec,
        "FastText": FastText
    }[model_type]

    staging_area = self.dataset.get_staging_area()

    # go through all archived token sets and vectorise them
    models = 0
    for temp_file in self.iterate_archive_contents(self.source_file):
        # use the "list of lists" as input for the word2vec model
        # by default the tokeniser generates one list of tokens per
        # post... which may actually be preferable for short
        # 4chan-style posts. But alternatively it could generate one
        # list per sentence - this processor is agnostic in that regard
        token_set_name = temp_file.name
        self.dataset.update_status("Extracting bigrams from token set %s..." % token_set_name)

        try:
            if detect_bigrams:
                bigram_transformer = Phrases(self.tokens_from_file(temp_file, staging_area))
                bigram_transformer = Phraser(bigram_transformer)
            else:
                bigram_transformer = None

            self.dataset.update_status(
                "Training %s model for token set %s..." % (model_builder.__name__, token_set_name))
            try:
                model = model_builder(negative=use_negative, size=dimensionality, sg=use_skipgram, window=window,
                                      workers=3, min_count=min_count, max_final_vocab=max_words)

                # we do not simply pass a sentences argument to model builder
                # because we are using a generator, which exhausts, while
                # Word2Vec needs to iterate over the sentences twice
                # https://stackoverflow.com/a/57632747
                model.build_vocab(self.tokens_from_file(temp_file, staging_area, phraser=bigram_transformer))
                model.train(self.tokens_from_file(temp_file, staging_area, phraser=bigram_transformer),
                            epochs=model.iter, total_examples=model.corpus_count)

            except RuntimeError as e:
                if "you must first build vocabulary before training the model" in str(e):
                    # not enough data. Skip - if this happens for all models
                    # an error will be generated later
                    continue
                else:
                    raise

        except UnicodeDecodeError:
            self.dataset.update_status(
                "Error reading input data. If it was imported from outside 4CAT, make sure it is encoded as UTF-8.",
                is_final=True)
            # clean up, like the other failure path below does
            shutil.rmtree(staging_area)
            self.dataset.finish(0)
            return

        # save - we only save the KeyedVectors for the model, this
        # saves space and we don't need to re-train the model later
        model_name = token_set_name.split(".")[0] + ".model"
        model.wv.save(str(staging_area.joinpath(model_name)))

        # release the full model before training the next one
        del model
        models += 1

    if models == 0:
        self.dataset.update_status("Not enough data in source file to train %s models." % model_builder.__name__)
        shutil.rmtree(staging_area)
        self.dataset.finish(0)
        return

    # create another archive with all model files in it
    self.dataset.update_status("%s model(s) saved." % model_builder.__name__)
    self.write_archive_and_finish(staging_area)
def process(self):
    """
    Unzips and appends tokens to fetch and write a tf-idf matrix

    Every file in the source archive is treated as one document: its name
    (without extension) is used as the date label and all its tokens are
    flattened into a single list. The tf-idf scores are then calculated
    with either gensim or scikit-learn, depending on the `library`
    parameter, and written to a CSV file.

    The dataset is finished with 0 items if the input cannot be decoded,
    the library is unknown, no results are generated or memory runs out.
    """
    # Validate and process user inputs
    library = self.parameters.get("library", "gensim")

    # n_size is either a single number ("2") or a range ("1-3"); normalise
    # to a string first so a missing or numeric value does not crash the
    # substring check
    n_size_raw = self.parameters.get("n_size")
    n_size_raw = "1" if n_size_raw is None else str(n_size_raw)
    if "-" not in n_size_raw:
        n_size = convert_to_int(n_size_raw, 1)
        n_size = (n_size, n_size)  # needs to be a tuple for sklearn.
    else:
        n_size_split = n_size_raw.split("-")
        n_size = (convert_to_int(n_size_split[0]), convert_to_int(n_size_split[1]))

    min_occurrences = convert_to_int(self.parameters.get("min_occurrences", 1), 1)
    max_occurrences = convert_to_int(self.parameters.get("max_occurrences", -1), -1)
    max_output = convert_to_int(self.parameters.get("max_output", 10), 10)
    smartirs = self.parameters.get("smartirs", "nfc")

    # Get token sets
    self.dataset.update_status("Processing token sets")
    tokens = []
    dates = []

    # Go through all archived token sets and collect the tokens for each
    for token_file in self.iterate_archive_contents(self.source_file):
        # Get the date
        date_string = token_file.stem
        dates.append(date_string)

        # we support both pickle and json dumps of vectors
        # note that Path.suffix includes the leading dot
        # NOTE(security): unpickling runs arbitrary code, so .pb files must
        # only ever come from trusted (i.e. our own) processors
        token_unpacker = pickle if token_file.suffix == ".pb" else json

        try:
            with token_file.open("rb") as binary_tokens:
                # pickle dumps need binary mode; json.load accepts it as well
                post_tokens = token_unpacker.load(binary_tokens)

                # Flatten the list of list of tokens - we're treating the whole time series as one document.
                post_tokens = list(itertools.chain.from_iterable(post_tokens))

                # Add to all date's tokens
                tokens.append(post_tokens)

        except UnicodeDecodeError:
            self.dataset.update_status(
                "Error reading input data. If it was imported from outside 4CAT, make sure it is encoded as UTF-8.",
                is_final=True)
            self.dataset.finish(0)
            return

    # Make sure `min_occurrences` and `max_occurrences` are valid
    # (they are compared against the number of documents, i.e. token sets)
    if min_occurrences > len(tokens):
        min_occurrences = len(tokens) - 1
    if max_occurrences <= 0 or max_occurrences > len(tokens):
        max_occurrences = len(tokens)

    # Get the tf-idf matrix.
    self.dataset.update_status("Generating tf-idf for token set")
    try:
        if library == "gensim":
            results = self.get_tfidf_gensim(tokens, dates, top_n=max_output, smartirs=smartirs)
        elif library == "scikit-learn":
            results = self.get_tfidf_sklearn(tokens, dates, ngram_range=n_size, min_occurrences=min_occurrences,
                                             max_occurrences=max_occurrences, top_n=max_output)
        else:
            self.dataset.update_status("Invalid library.")
            self.dataset.finish(0)
            return

        if results:
            # Generate csv and finish
            self.dataset.update_status("Writing to csv and finishing")
            self.write_csv_items_and_finish(results)
        else:
            # without this the dataset would never be marked as finished
            self.dataset.update_status("No tf-idf results were generated for this dataset.", is_final=True)
            self.dataset.finish(0)

    except MemoryError:
        self.dataset.update_status("Out of memory - dataset too large to run tf-idf analysis.")
        self.dataset.finish(0)
async def execute_queries(self):
    """
    Get messages for queries

    This does the work one would expect in get_items(); it lives in its own
    coroutine because Telethon's architecture requires it to be called from
    an async method.

    :return:  The result of `self.gather_posts()`, or `None` if connecting
    to Telegram or collecting the posts failed
    """
    credentials = self.parameters

    # the session file was created earlier and is named after a hash of the
    # credentials; re-using it avoids having to re-enter the security code
    phone_digits = credentials["api_phone"].replace("+", "")
    session_key = "".join((phone_digits, credentials["api_id"], credentials["api_hash"]))
    session_name = hashlib.blake2b(session_key.encode("ascii")).hexdigest() + ".session"
    session_path = Path(config.PATH_ROOT, config.PATH_SESSIONS, session_name)

    def refuse_phone_prompt():
        """
        Replace interactive phone number input in Telethon

        If Telethon cannot authenticate with the given session file, it
        would by default prompt for a phone number on the command line.
        That is of no use here, so a RuntimeError is raised instead; it is
        caught below and the user is told to re-authenticate via 4CAT.
        """
        raise RuntimeError("Connection cancelled")

    async def release(connection):
        """
        Disconnect the client, if one was created and it can disconnect
        """
        if connection and hasattr(connection, "disconnect"):
            await connection.disconnect()

    client = None
    try:
        client = TelegramClient(
            str(session_path),
            int(credentials.get("api_id")),
            credentials.get("api_hash"),
            loop=self.eventloop
        )
        await client.start(phone=refuse_phone_prompt)
    except RuntimeError:
        # session is no longer useable, delete file so user will be asked
        # for security code again
        self.dataset.update_status(
            "Session is not authenticated: login security code may have expired. You need to re-enter the security code.",
            is_final=True)
        session_path.unlink(missing_ok=True)
        await release(client)
        return None
    except Exception:
        self.dataset.update_status("Error connecting to the Telegram API with provided credentials.", is_final=True)
        await release(client)
        return None

    # ready our parameters
    stored_parameters = self.dataset.get_parameters()
    queries = []
    for entity in stored_parameters.get("query", "").split(","):
        queries.append(entity.strip())
    max_items = convert_to_int(stored_parameters.get("items", 10), 10)

    # the client is disconnected in all cases before the result is returned
    try:
        return await self.gather_posts(client, queries, max_items)
    except Exception:
        self.dataset.update_status("Error scraping posts from Telegram")
        self.log.error("Telegram scraping error: %s" % traceback.format_exc())
        return None
    finally:
        await client.disconnect()
def get_items(self, query):
    """
    Use the Twitter v2 API historical search to get tweets

    This is a generator: tweets are requested page by page from the
    full-archive search endpoint and yielded one by one, after the objects
    they reference (users, media, polls, places, other tweets) have been
    spliced into them with `self.enrich_tweet`. Collection stops when the
    requested amount has been reached, when there are no more pages, or -
    with a final status update - when the API returns an error that cannot
    be recovered from.

    :param query:  Not used; the search is configured via `self.parameters`
    :return:  Generator that yields enriched tweet objects
    :raises ProcessorInterruptedException:  If the processor is interrupted
    while collecting tweets
    """
    # the bearer token is sensitive: it is only sent as a request header
    # and is not part of the search parameters that are logged below
    bearer_token = self.parameters.get("api_bearer_token")
    auth = {"Authorization": "Bearer %s" % bearer_token}
    endpoint = "https://api.twitter.com/2/tweets/search/all"

    # these are all expansions and fields available at the time of writing
    # since it does not cost anything extra in terms of rate limiting, go
    # for as much data per tweet as possible...
    tweet_fields = ("attachments", "author_id", "context_annotations", "conversation_id", "created_at",
                    "entities", "geo", "id", "in_reply_to_user_id", "lang", "public_metrics",
                    "possibly_sensitive", "referenced_tweets", "reply_settings", "source", "text", "withheld")
    user_fields = ("created_at", "description", "entities", "id", "location", "name", "pinned_tweet_id",
                   "profile_image_url", "protected", "public_metrics", "url", "username", "verified", "withheld")
    place_fields = ("contained_within", "country", "country_code", "full_name", "geo", "id", "name", "place_type")
    poll_fields = ("duration_minutes", "end_datetime", "id", "options", "voting_status")
    expansions = ("attachments.poll_ids", "attachments.media_keys", "author_id", "entities.mentions.username",
                  "geo.place_id", "in_reply_to_user_id", "referenced_tweets.id", "referenced_tweets.id.author_id")
    media_fields = ("duration_ms", "height", "media_key", "non_public_metrics", "organic_metrics",
                    "preview_image_url", "promoted_metrics", "public_metrics", "type", "url", "width")
    amount = convert_to_int(self.parameters.get("amount"), 10)

    params = {
        "query": self.parameters.get("query", ""),
        "expansions": ",".join(expansions),
        "tweet.fields": ",".join(tweet_fields),
        "user.fields": ",".join(user_fields),
        "poll.fields": ",".join(poll_fields),
        "place.fields": ",".join(place_fields),
        "media.fields": ",".join(media_fields),
        "max_results": max(10, min(amount, 500)) if amount > 0 else 500,  # 500 = upper limit, 10 = lower
    }

    # the trailing "Z" marks these as UTC, so the timestamps need to be
    # converted to UTC rather than to the server's local time
    if self.parameters.get("min_date"):
        params["start_time"] = datetime.datetime.fromtimestamp(
            self.parameters["min_date"], tz=datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    if self.parameters.get("max_date"):
        params["end_time"] = datetime.datetime.fromtimestamp(
            self.parameters["max_date"], tz=datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    tweets = 0
    self.dataset.log("Search parameters: %s" % repr(params))
    while True:
        if self.interrupted:
            raise ProcessorInterruptedException("Interrupted while getting tweets from the Twitter API")

        # there is a limit of one request per second, so stay on the safe side of this
        while self.previous_request == int(time.time()):
            time.sleep(0.1)
        time.sleep(0.05)
        self.previous_request = int(time.time())

        # now send the request, allowing for up to 5 attempts if the connection seems unstable
        retries = 5
        api_response = None
        while retries > 0:
            try:
                api_response = requests.get(endpoint, headers=auth, params=params)
                break
            except (ConnectionError, requests.exceptions.RequestException) as e:
                retries -= 1
                wait_time = (5 - retries) * 10
                self.dataset.update_status("Got %s, waiting %i seconds before retrying" % (str(e), wait_time))
                time.sleep(wait_time)

        # all attempts failed - this needs to be checked before anything
        # is read from the response
        if api_response is None:
            self.dataset.update_status("Could not connect to Twitter. Cancelling.", is_final=True)
            return

        # rate limited - the limit at time of writing is 300 reqs per 15
        # minutes
        # usually you don't hit this when requesting batches of 500 at
        # 1/second
        if api_response.status_code == 429:
            resume_at = convert_to_int(api_response.headers["x-rate-limit-reset"]) + 1
            resume_at_str = datetime.datetime.fromtimestamp(int(resume_at)).strftime("%c")
            self.dataset.update_status("Hit Twitter rate limit - waiting until %s to continue." % resume_at_str)
            while time.time() <= resume_at:
                time.sleep(0.5)
            continue

        # API keys that are valid but don't have access or haven't been
        # activated properly get a 403
        elif api_response.status_code == 403:
            try:
                structured_response = api_response.json()
                message = ("'Forbidden' error from Twitter API. Could not connect to Twitter API "
                           "with this API key. %s" % structured_response.get("detail", ""))
            except (json.JSONDecodeError, ValueError, AttributeError):
                # response body is not JSON, or not a JSON object
                message = ("'Forbidden' error from Twitter API. Your key may not have access to "
                           "the full-archive search endpoint.")

            self.dataset.update_status(message, is_final=True)
            return

        # sometimes twitter says '503 service unavailable' for unclear
        # reasons - in that case just wait a while and try again
        elif api_response.status_code in (502, 503, 504):
            resume_at = time.time() + 60
            resume_at_str = datetime.datetime.fromtimestamp(int(resume_at)).strftime("%c")
            self.dataset.update_status("Twitter unavailable (status %i) - waiting until %s to continue." % (
                api_response.status_code, resume_at_str))
            while time.time() <= resume_at:
                time.sleep(0.5)
            continue

        # this usually means the query is too long or otherwise contains
        # a syntax error
        elif api_response.status_code == 400:
            msg = "Response %i from the Twitter API; " % api_response.status_code
            try:
                error_body = api_response.json()
                msg += error_body.get("title", "")
                if "detail" in error_body:
                    msg += ": " + error_body.get("detail", "")
            except (json.JSONDecodeError, ValueError, TypeError, AttributeError):
                # response body is not JSON, or not shaped as expected
                msg += "Some of your parameters (e.g. date range) may be invalid."

            self.dataset.update_status(msg, is_final=True)
            return

        # invalid API key
        elif api_response.status_code == 401:
            self.dataset.update_status("Invalid API key - could not connect to Twitter API", is_final=True)
            return

        # haven't seen one yet, but they probably exist
        elif api_response.status_code != 200:
            self.dataset.update_status(
                "Unexpected HTTP status %i. Halting tweet collection." % api_response.status_code, is_final=True)
            self.log.warning("Twitter API v2 responded with status code %i. Response body: %s" % (
                api_response.status_code, api_response.text))
            return

        api_response = api_response.json()

        # The API response contains tweets (of course) and 'includes',
        # objects that can be referenced in tweets. Later we will splice
        # this data into the tweets themselves to make them easier to
        # process. So extract them first...
        included_users = api_response.get("includes", {}).get("users", {})
        included_media = api_response.get("includes", {}).get("media", {})
        included_polls = api_response.get("includes", {}).get("polls", {})
        included_tweets = api_response.get("includes", {}).get("tweets", {})
        included_places = api_response.get("includes", {}).get("places", {})

        for tweet in api_response.get("data", []):
            # an amount of 0 or lower means there is no limit
            if 0 < amount <= tweets:
                break

            # splice referenced data back in
            # we use copy.deepcopy here because else we run into a
            # pass-by-reference quagmire
            tweet = self.enrich_tweet(tweet, included_users, included_media, included_polls, included_places,
                                      copy.deepcopy(included_tweets))

            tweets += 1
            if tweets % 500 == 0:
                self.dataset.update_status("Received %i tweets from Twitter API" % tweets)

            yield tweet

        # paginate
        if (amount <= 0 or tweets < amount) and api_response.get("meta") and "next_token" in api_response["meta"]:
            params["next_token"] = api_response["meta"]["next_token"]
        else:
            break
def process(self):
    """
    Render the images in the source archive as a single 'image wall'

    Every image in the source archive is first reduced to a sortable value
    (a representative colour, or a random number). The first `amount`
    images of that ordering are then scaled to tiles and pasted side by
    side, row by row, onto one canvas, which is saved to the dataset's
    results path.

    Parameters read from `self.parameters`:

    - `amount`: maximum number of images; 0 means "use the maximum allowed
      by this processor's options"
    - `tile-size`: `fit-height`, `square` or `average`
    - `sort-mode`: `random`, `average-rgb`, `average-hsv`, `dominant`,
      `kmeans-dominant` or `kmeans-average`; with any other value all
      images get the same sort value and thus keep their archive order

    :raises ProcessorInterruptedException:  If interrupted while the images
    are being analysed
    :raises NotImplementedError:  If `tile-size` has an unknown value
    """
    self.dataset.update_status("Reading source file")

    # is there anything to put on a wall?
    if self.source_dataset.num_rows == 0:
        self.dataset.update_status(
            "No images available to render to image wall.", is_final=True)
        self.dataset.finish(0)
        return

    # prepare
    ImageFile.LOAD_TRUNCATED_IMAGES = True
    sample_max = 75  # image size for colour sampling

    def numpy_to_rgb(numpy_array):
        """
        Helper function to go from numpy array to a comma-separated string
        of integer channel values

        Used in the K-Means clustering part
        """
        return ",".join([str(int(value)) for value in numpy_array])

    max_images = convert_to_int(self.parameters.get("amount"), 100)
    sizing_mode = self.parameters.get("tile-size")
    sort_mode = self.parameters.get("sort-mode")

    # pixel data is only needed when the sort value is based on colours
    needs_pixels = sort_mode not in ("", "random")

    # 0 = use as many images as in the archive, up to the max
    if max_images == 0:
        max_images = self.get_options()["amount"]["max"]

    # we loop through the images twice - once to reduce them to a value
    # that can be sorted, and another time to actually copy them to the
    # canvas for the image wall
    # we create a staging area manually here, so it is not automatically
    # deleted after one loop, since we need two
    staging_area = self.dataset.get_staging_area()

    # whatever happens from here on (interruption, unknown sizing mode,
    # unreadable images), the staging area should not be left behind
    try:
        # first, extract and reduce, and store the sortable value in a
        # dictionary with the image file name as key
        image_colours = {}
        dimensions = {}  # used to calculate optimal tile size later

        for path in self.iterate_archive_contents(self.source_file,
                                                  staging_area):
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while determining image wall order")

            try:
                picture = Image.open(str(path))
            except UnidentifiedImageError:
                self.dataset.update_status(
                    "Image %s could not be parsed. Skipping." % path)
                continue

            self.dataset.update_status(
                "Analysing %s (%i/%i)" %
                (path.name, len(dimensions), self.source_dataset.num_rows))

            dimensions[path.name] = (picture.width, picture.height)

            if sort_mode == "random":
                # just randomly sort it, don't even look at the colours.
                # The random number is the sort key as-is: it is not a
                # colour, so it must not go through the HSV conversion
                # below (which expects three channel values)
                image_colours[path.name] = (random.random(), 0.0, 0.0)
                continue

            # these calculations can take ages for huge images, so resize
            # if it is larger than the threshold
            longest_side = max(picture.width, picture.height)
            if needs_pixels and longest_side > sample_max:
                sample_width = int(sample_max * picture.width / longest_side)
                sample_height = int(sample_max * picture.height / longest_side)
                picture = ImageOps.fit(picture, (sample_width, sample_height))

            if needs_pixels:
                # ensure we get RGB values for pixels
                picture = picture.convert("RGB")

            # determine a 'representative colour'
            if sort_mode in ("average-rgb", "average-hsv"):
                # average colour, as RGB or HSV
                pixels = list(picture.getdata())
                if sort_mode == "average-hsv":
                    pixels = [colorsys.rgb_to_hsv(*pixel) for pixel in pixels]

                value = tuple(
                    sum(channel) / len(pixels) for channel in zip(*pixels))

                # this is a bit dumb, but since all the other modes return
                # rgb...
                if sort_mode == "average-hsv":
                    value = colorsys.hsv_to_rgb(*value)

            elif sort_mode == "dominant":
                # most-occurring colour; getcolors() yields (count, colour)
                colours = picture.getcolors(picture.width * picture.height)
                value = max(colours, key=lambda colour: colour[0])[1]

            elif sort_mode in ("kmeans-dominant", "kmeans-average"):
                # use k-means clusters to determine the representative
                # colour. this is more computationally expensive but gives
                # far better results.

                # determine k-means clusters for this image, i.e. the n
                # most dominant "average" colours, in this case n=3
                pixels = picture.getdata()
                # random_state=0 so it is deterministic
                clusters = KMeans(n_clusters=3, random_state=0)
                predicted_centroids = clusters.fit_predict(pixels).tolist()

                if sort_mode == "kmeans-dominant":
                    # the colour of the single most dominant k-means
                    # centroid
                    ranked_centroids = {}
                    for label, centroid in enumerate(clusters.cluster_centers_):
                        ranked_centroids[numpy_to_rgb(
                            centroid)] = predicted_centroids.count(label)

                    dominant = max(ranked_centroids,
                                   key=lambda k: ranked_centroids[k])
                    value = [int(v) for v in dominant.split(",")]
                else:
                    # average colour of all k-means centroids, weighted by
                    # the dominance of each centroid
                    value = [0, 0, 0]
                    for label in clusters.labels_:
                        centroid = clusters.cluster_centers_[label]
                        value[0] += centroid[0]
                        value[1] += centroid[1]
                        value[2] += centroid[2]

                    value = [
                        channel / len(clusters.labels_) for channel in value
                    ]

            else:
                value = (0, 0, 0)

            # converted to HSV, because RGB does not sort nicely
            image_colours[path.name] = colorsys.rgb_to_hsv(*value)

        # only retain the top n of the sorted list of images - this gives
        # us our final image set
        sorted_image_files = sorted(
            image_colours, key=lambda k: image_colours[k])[:max_images]

        if not sorted_image_files:
            # e.g. when none of the files in the archive could be parsed;
            # without this check the averages below divide by zero
            self.dataset.update_status(
                "No images available to render to image wall.",
                is_final=True)
            self.dataset.finish(0)
            return

        num_images = len(sorted_image_files)
        dimensions = {path: dimensions[path] for path in sorted_image_files}
        average_size = (
            sum([k[0] for k in dimensions.values()]) / num_images,
            sum([k[1] for k in dimensions.values()]) / num_images)

        self.dataset.update_status("Determining canvas and image sizes")

        # calculate 'tile sizes' (a tile is an image) and also the size of
        # the canvas we will need to fit them all. The canvas can never be
        # larger than this:
        max_pixels = self.TARGET_WIDTH * self.TARGET_HEIGHT

        if sizing_mode == "fit-height":
            # assuming every image has the overall average height, how wide
            # would the canvas need to be (if everything is on a single
            # row)?
            full_width = 0
            tile_y = average_size[1]
            for dimension in dimensions.values():
                # ideally, we make everything the average height
                optimal_ratio = average_size[1] / dimension[1]
                full_width += dimension[0] * optimal_ratio

            # now we can calculate the total amount of pixels needed
            fitted_pixels = full_width * tile_y
            if fitted_pixels > max_pixels:
                # try again with a lower height (but never lower than 1px)
                area_ratio = max_pixels / fitted_pixels
                tile_y = max(1, int(tile_y * math.sqrt(area_ratio)))
                fitted_pixels = max_pixels

            # find the canvas size that can fit this amount of pixels at
            # the required proportions, provided that y = multiple of avg
            # height
            ideal_height = math.sqrt(
                fitted_pixels / (self.TARGET_WIDTH / self.TARGET_HEIGHT))
            size_y = math.ceil(ideal_height / tile_y) * tile_y
            size_x = fitted_pixels / size_y

            tile_x = -1  # varies

        elif sizing_mode == "square":
            # assuming each image is square, find a canvas with the right
            # proportions that would fit all of them
            # assume the average dimensions
            tile_size = max(1, int(sum(average_size) / 2))

            # this is how many pixels we need
            fitted_pixels = tile_size * tile_size * num_images

            # does that fit our canvas?
            if fitted_pixels > max_pixels:
                tile_size = max(
                    1, math.floor(math.sqrt(max_pixels / num_images)))
                fitted_pixels = tile_size * tile_size * num_images

            ideal_width = math.sqrt(
                fitted_pixels / (self.TARGET_HEIGHT / self.TARGET_WIDTH))
            size_x = math.ceil(ideal_width / tile_size) * tile_size
            size_y = math.ceil(fitted_pixels / size_x / tile_size) * tile_size

            tile_x = tile_y = tile_size

        elif sizing_mode == "average":
            tile_x = max(1, int(average_size[0]))
            tile_y = max(1, int(average_size[1]))

            fitted_pixels = tile_x * tile_y * num_images
            if fitted_pixels > max_pixels:
                area_ratio = max_pixels / fitted_pixels
                tile_x = max(1, int(tile_x * math.sqrt(area_ratio)))
                tile_y = max(1, int(tile_y * math.sqrt(area_ratio)))
                fitted_pixels = tile_x * tile_y * num_images

            ideal_width = math.sqrt(
                fitted_pixels / (self.TARGET_HEIGHT / self.TARGET_WIDTH))
            size_x = math.ceil(ideal_width / tile_x) * tile_x
            size_y = math.ceil(fitted_pixels / size_x / tile_y) * tile_y

        else:
            raise NotImplementedError("Sizing mode '%s' not implemented" %
                                      sizing_mode)

        self.dataset.log("Canvas size is %ix%i" % (size_x, size_y))
        wall = Image.new("RGBA", (int(size_x), int(size_y)))
        ImageDraw.floodfill(wall, (0, 0),
                            (255, 255, 255, 0))  # transparent background

        offset_x = 0
        offset_y = 0

        tile_x = int(tile_x)
        tile_y = int(tile_y)

        # now actually putting the images on a wall is relatively trivial
        for counter, path in enumerate(sorted_image_files, start=1):
            self.dataset.update_status(
                "Rendering %s (%i/%i) to image wall" %
                (path, counter, num_images))
            picture = Image.open(str(staging_area.joinpath(path)))

            if tile_x == -1:
                # variable width: scale to the tile height, keep proportions
                picture_x = max(
                    1, int(picture.width * (tile_y / picture.height)))
                picture = ImageOps.fit(picture, (picture_x, tile_y),
                                       method=Image.BILINEAR)
            else:
                picture = ImageOps.fit(picture, (tile_x, tile_y),
                                       method=Image.BILINEAR)

            # simply put them side by side until the right edge is reached,
            # then move to a new row
            if offset_x + picture.width > wall.width:
                offset_x = 0
                offset_y += picture.height

            # this can happen in some edge cases: there is an extra row of
            # images we hadn't accounted for. In that case, simply enlarge
            # the canvas.
            if offset_y + picture.height > wall.height:
                new_wall = Image.new("RGBA",
                                     (wall.width, offset_y + picture.height))
                ImageDraw.floodfill(
                    new_wall, (0, 0),
                    (255, 255, 255, 0))  # transparent background
                new_wall.paste(wall, (0, 0))
                wall = new_wall

            wall.paste(picture, (offset_x, offset_y))
            offset_x += picture.width

        # finish up
        self.dataset.update_status("Saving result")
        wall.save(str(self.dataset.get_results_path()))
    finally:
        shutil.rmtree(staging_area, ignore_errors=True)

    self.dataset.update_status("Finished")
    self.dataset.finish(num_images)
def process(self):
    """
    Render a word tree for the query word as an SVG file

    All posts in the source file are tokenised; for every occurrence of
    the query, the tokens directly before and after it are collected as
    'branches'. The branches are merged into one tree per side of the
    query, which is then pruned, sorted and rendered to an SVG file at the
    dataset's results path.

    Parameters read from `self.parameters`: `query` (a single word),
    `window` (amount of context tokens per side), `sides` (`left`, `right`
    or `both`), `align`, `limit`, `strip-urls` and `strip-symbols`.

    The dataset is finished with 0 items if the query is empty or contains
    whitespace, or if there is no data for the requested side.
    """
    # do some validation first - nothing else is useful without a query
    query = self.parameters.get("query")
    if not query or not query.strip() or re.search(r"\s", query):
        self.dataset.update_status(
            "Invalid query for word tree generation. Query cannot be empty or contain whitespace.",
            is_final=True)
        self.dataset.finish(0)
        return

    link_regex = re.compile(r"https?://[^\s]+")
    delete_regex = re.compile(r"[^a-zA-Z)(.,\n -]")

    # settings
    strip_urls = self.parameters.get("strip-urls")
    strip_symbols = self.parameters.get("strip-symbols")
    sides = self.parameters.get("sides")
    self.align = self.parameters.get("align")
    self.limit = convert_to_int(self.parameters.get("limit"), 100)

    # the window includes the query itself, hence the + 1
    window = convert_to_int(self.parameters.get("window"), 5) + 1
    window = min(window, self.get_options()["window"]["max"] + 1)
    window = max(1, window)

    query_token = query.lower()
    left_branches = []
    right_branches = []

    # find matching posts
    for processed, post in enumerate(self.iterate_items(self.source_file),
                                     start=1):
        if processed % 500 == 0:
            self.dataset.update_status(
                "Processing and tokenising post %i" % processed)

        body = post["body"]
        if not body:
            continue

        if strip_urls:
            body = link_regex.sub("", body)

        if strip_symbols:
            body = delete_regex.sub("", body)

        body = word_tokenize(body)

        # get lists of tokens for both the left and right side of the tree
        # on the left side, all lists end with the query, on the right
        # side, they start with the query. Both contain at most `window`
        # tokens, since that is the amount of levels built below
        for position, token in enumerate(body):
            if token.lower() != query_token:
                continue

            right_branches.append(body[position:position + window])
            left_branches.append(
                body[max(0, position - window + 1):position + 1])

    # Some settings for rendering the tree later
    self.step = self.fontsize * 0.6  # approximately the width of a monospace char
    self.gap = (7 * self.step)  # space for lines between nodes
    width = 1  # will be updated later

    # invert the left side of the tree (because that's the way we want the
    # branching to work for that side)
    # we'll visually invert the nodes in the tree again later
    left_branches = [list(reversed(branch)) for branch in left_branches]

    def build_nodes(branches):
        """
        Turn the branches of one side of the tree into nodes

        For each "level" (each branching point representing a level), turn
        tokens into nodes, record the max amount of occurrences for any
        token in that level, and keep track of what nodes are in which
        level. The latter is needed because a token may occur multiple
        times, at different points in the graph.

        :param list branches:  Lists of tokens, each starting with the query
        :return list:  All nodes created, in order of creation
        """
        levels = [{} for _ in range(window)]
        nodes = []

        for depth in range(window):
            for branch in branches:
                if depth >= len(branch):
                    continue

                token = branch[depth].lower()
                if token not in levels[depth]:
                    parent = levels[depth - 1][branch[
                        depth - 1].lower()] if depth > 0 else None
                    levels[depth][token] = Node(
                        token,
                        parent=parent,
                        occurrences=1,
                        is_top_root=(parent is None))
                    nodes.append(levels[depth][token])
                else:
                    levels[depth][token].occurrences += 1

                occurrences = levels[depth][token].occurrences
                self.max_occurrences[depth] = max(
                    occurrences, self.max_occurrences[depth]
                ) if depth in self.max_occurrences else occurrences

        return nodes

    # first create vertical slices of tokens per level, for both the left
    # and the right side of the tree
    self.dataset.update_status("Generating token tree from posts")
    tokens_right = build_nodes(right_branches)
    tokens_left = build_nodes(left_branches)

    # nodes that have no siblings can be merged with their parents, else
    # the graph becomes unnecessarily large with lots of single-word nodes
    # connected to single-word nodes. additionally, we want the nodes with
    # the most branches to be sorted to the top, and then only retain the
    # most interesting (i.e. most-occurring) branches
    self.dataset.update_status("Merging and sorting tree nodes")
    for side_tokens in (tokens_left, tokens_right):
        for token in side_tokens:
            self.merge_upwards(token)
            self.sort_node(token)
            self.limit_subtree(token)

    # somewhat annoyingly, anytree does not simply delete nodes detached
    # from the tree in the previous steps, but makes them root nodes. We
    # don't need these root nodes (we only need the original root), so the
    # next step is to remove all root nodes that are not the main root.
    filtered_tokens_right = [
        token for token in tokens_right
        if token.is_top_root or not token.is_root
    ]
    filtered_tokens_left = [
        token for token in tokens_left
        if token.is_top_root or not token.is_root
    ]

    # now we know which nodes are left, and can therefore determine how
    # large the canvas needs to be - this is based on the max number of
    # branches found on any level of the tree, in other words, the number
    # of "terminal nodes"
    breadths_left = [
        self.max_breadth(node) for node in filtered_tokens_left
        if node.is_top_root
    ]
    breadths_right = [
        self.max_breadth(node) for node in filtered_tokens_right
        if node.is_top_root
    ]

    if not breadths_left:
        if sides == "left":
            self.dataset.update_status(
                "No data available to the left of the query", is_final=True)
            self.dataset.finish(0)
            return None
        elif sides == "both":
            # fall back to rendering only the side that has data
            sides = "right"

        breadths_left = [0]

    if not breadths_right:
        if sides == "right":
            self.dataset.update_status(
                "No data available to the right of the query", is_final=True)
            self.dataset.finish(0)
            return None
        elif sides == "both":
            sides = "left"

        breadths_right = [0]

    height_left = self.whitespace * self.fontsize * max(breadths_left)
    height_right = self.whitespace * self.fontsize * max(breadths_right)
    height = max(height_left, height_right)

    canvas = Drawing(str(self.dataset.get_results_path()),
                     size=(width, height),
                     style="font-family:monospace;font-size:%ipx" %
                     self.fontsize)

    # the nodes on the left side of the graph now have the wrong word order,
    # because we reversed them earlier to generate the correct tree
    # hierarchy - now reverse the node labels so they are proper language
    # again
    for token in tokens_left:
        self.invert_node_labels(token)

    wrapper = SVG(overflow="visible")

    self.dataset.update_status("Rendering tree to SVG file")
    if sides != "right":
        wrapper = self.render(wrapper, [
            token for token in filtered_tokens_left
            if token.is_root and token.children
        ], height=height, side=self.SIDE_LEFT)

    if sides != "left":
        wrapper = self.render(wrapper, [
            token for token in filtered_tokens_right
            if token.is_root and token.children
        ], height=height, side=self.SIDE_RIGHT)

    # things may have been rendered outside the canvas, in which case we
    # need to readjust the SVG properties
    wrapper.update({"x": 0 if self.x_min >= 0 else self.x_min * -1})
    canvas.update({"width": (self.x_max - self.x_min)})

    canvas.add(wrapper)
    canvas.save(pretty=True)

    self.dataset.update_status("Finished")
    self.dataset.finish(len(tokens_left) + len(tokens_right))
def process(self):
    """
    Plot the nearest neighbours of the query words for each embedding model

    The word embedding models in the source archive are loaded, and for
    each model the words most similar to each query word are determined
    (restricted to words that occur in all models). The vectors of these
    words are reduced to two dimensions and plotted as an SVG file, with a
    line connecting the positions of each query word across models.

    Parameters read from `self.parameters`: `words` (comma-separated query
    words), `threshold` (minimum similarity, clamped to [-1, 1]),
    `num-words` (neighbours per query), `overlay`, `all-words` and
    `method` (`PCA`, `t-SNE` or `TruncatedSVD`).

    The dataset is finished with 0 items if no query words are given, if
    the archive contains no models, if a query word is missing from a
    model, or if the reduction method is unknown.

    :raises ProcessorInterruptedException:  If interrupted while loading
    or querying the models
    """
    # parse parameters. Queries are comma-separated: strip surrounding
    # whitespace, drop empty items (e.g. after a trailing comma) and
    # de-duplicate while retaining the order in which they were given
    raw_words = self.parameters.get("words", "") or ""
    input_words = list(
        dict.fromkeys(
            word.strip() for word in raw_words.split(",") if word.strip()))
    if not input_words:
        self.dataset.update_status(
            "No input words provided, cannot look for similar words.",
            is_final=True)
        self.dataset.finish(0)
        return

    try:
        threshold = float(self.parameters.get("threshold"))
    except (TypeError, ValueError):
        # missing (None) or non-numeric value
        threshold = float(self.get_options()["threshold"]["default"])

    threshold = max(-1.0, min(1.0, threshold))
    num_words = convert_to_int(self.parameters.get("num-words"))
    overlay = self.parameters.get("overlay")
    reduction_method = self.parameters.get("method")
    all_words = self.parameters.get("all-words")
    query_words = set(input_words)

    # load model files and initialise
    self.dataset.update_status("Unpacking word embedding models")
    staging_area = self.unpack_archive_contents(self.source_file)

    # the staging area is removed no matter how this method is left
    try:
        common_vocab = None
        vector_size = None
        models = {}

        # find words that are common to all models
        self.dataset.update_status(
            "Determining cross-model common vocabulary")
        for model_file in staging_area.glob("*.model"):
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while processing word embedding models")

            model = KeyedVectors.load(str(model_file)).wv
            models[model_file.stem] = model
            if vector_size is None:
                # needed later for dimensionality reduction
                vector_size = model.vector_size

            vocabulary = set(model.vocab.keys())
            if common_vocab is None:
                common_vocab = vocabulary
            else:
                common_vocab &= vocabulary  # intersect

        if not models:
            self.dataset.update_status(
                "No word embedding models found in the source dataset.",
                is_final=True)
            self.dataset.finish(0)
            return

        # for each model, find the words that we may want to plot - these
        # are the nearest neighbours for the given query words. The common
        # vocabulary is kept as a set, so the membership test per
        # neighbour is cheap
        relevant_words = {}
        for model_name, model in models.items():
            relevant_words[model_name] = set()
            self.dataset.update_status(
                "Finding similar words in model '%s'" % model_name)

            for query in input_words:
                if query not in model.vocab:
                    self.dataset.update_status(
                        "Query '%s' was not found in model %s; cannot find nearest neighbours."
                        % (query, model_name),
                        is_final=True)
                    self.dataset.finish(0)
                    return

                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while finding similar words")

                # use a larger sample (topn) than required since some of
                # the nearest neighbours may not be in the common
                # vocabulary and will therefore need to be ignored
                neighbours = [
                    word
                    for word, similarity in model.most_similar(query,
                                                               topn=1000)
                    if word in common_vocab and similarity >= threshold
                ][:num_words]

                # always include query word
                relevant_words[model_name] |= {query} | set(neighbours)

        # now do another loop to determine which words to plot for each
        # model. this is either the same as relevant_words, or a superset
        # which combines all relevant words for all models
        #
        # the vectors need to be reduced all at once - but the vectors are
        # grouped by model. To solve this, keep one list of vectors, but
        # also keep track of which indexes of this list belong to which
        # model, by storing the index of the first vector for a model
        plottable_words = {}
        vector_offsets = {}
        rows = []
        last_model = max(relevant_words.keys())
        all_relevant_words = set().union(*relevant_words.values())

        for model_name in relevant_words:
            vector_offsets[model_name] = len(rows)

            # determine which words to plot for this model. either the
            # nearest neighbours for this model, or all nearest neighbours
            # found across all models
            words_to_include = all_relevant_words if all_words else relevant_words[
                model_name]

            labels = []
            # sorted, so the input of the (seeded) reduction below does
            # not depend on set iteration order
            for word in sorted(words_to_include):
                if not overlay and model_name != last_model and word not in query_words:
                    # if 'overlay' is not set, only plot non-query words
                    # once overall (for the most recent model)
                    continue

                labels.append(word)
                rows.append(models[model_name][word])

            plottable_words[model_name] = labels

        del models  # no longer needed

        vectors = numpy.asarray(rows, dtype=numpy.float64).reshape(
            -1, vector_size)

        # reduce the vectors of all words to be plotted to a
        # two-dimensional coordinate. here the two-dimensional vectors are
        # interpreted as cartesian coordinates
        if reduction_method == "PCA":
            pca = PCA(n_components=2, random_state=0)
            vectors = pca.fit_transform(vectors)
        elif reduction_method == "t-SNE":
            # initialise t-sne transformer
            # parameters taken from Hamilton et al.
            # https://github.com/williamleif/histwords/blob/master/viz/common.py
            tsne = TSNE(n_components=2,
                        random_state=0,
                        learning_rate=150,
                        init="pca")
            vectors = tsne.fit_transform(vectors)
        elif reduction_method == "TruncatedSVD":
            # standard sklearn parameters made explicit
            svd = TruncatedSVD(n_components=2,
                               algorithm="randomized",
                               n_iter=5,
                               random_state=0)
            vectors = svd.fit_transform(vectors)
        else:
            self.dataset.update_status(
                "Invalid dimensionality reduction technique selected",
                is_final=True)
            self.dataset.finish(0)
            return

        # determine the boundaries of our 2D space, so we can plot the
        # positions properly later
        min_x, min_y = vectors.min(axis=0)
        max_x, max_y = vectors.max(axis=0)

        # make sure all coordinates are positive; if all points share one
        # x coordinate there is nothing to scale, so avoid dividing by 0
        span_x = float(max_x - min_x) or 1.0
        span_y = float(max_y - min_y)

        # determine graph dimensions and proportions
        width = 1000  # arbitrary
        height = width * (span_y / span_x)  # retain proportions
        scale = width / span_x

        # margin around the plot to give room for labels and to look better
        margin = width * 0.1
        width += 2 * margin
        height += 2 * margin

        # normalize all known positions to fit within the graph
        vectors = [(float(margin + ((x - min_x) * scale)),
                    float(margin + ((y - min_y) * scale)))
                   for x, y in vectors]

        # now all positions are finalised, we can determine the "journey"
        # of each query - the sequence of positions in the graph it takes,
        # so we can draw lines from position to position later
        journeys = {}
        for query in input_words:
            journeys[query] = [
                vectors[vector_offsets[model_name] + words.index(query)]
                for model_name, words in plottable_words.items()
            ]

        # font sizes proportional to width (which is static and thus
        # predictable)
        fontsize_large = width / 50
        fontsize_normal = width / 75
        fontsize_small = width / 100

        # now we have the dimensions, the canvas can be instantiated
        model_type = self.source_dataset.parameters.get(
            "model-type", "word2vec")
        canvas = get_4cat_canvas(
            self.dataset.get_results_path(),
            width,
            height,
            header="%s nearest neighbours (fitting: %s) - '%s'" %
            (model_type, reduction_method, ",".join(input_words)),
            fontsize_normal=fontsize_normal,
            fontsize_large=fontsize_large,
            fontsize_small=fontsize_small)

        # a palette generated with https://medialab.github.io/iwanthue/
        colours = [
            "#d58eff", "#cf9000", "#3391ff", "#a15700", "#911ca7", "#00ddcb",
            "#cc25a9", "#d5c776", "#6738a8", "#ff9470", "#47c2ff", "#a4122c",
            "#00b0ca", "#9a0f76", "#ff70c8", "#713c88"
        ]

        # each model (= interval) gets a separate colour. the palette is
        # reused from the start if there are more models than colours;
        # this can get kind of confusing, but you shouldn't be using this
        # with more than 16 models anyway
        model_colours = {
            model_name: colours[model_index % len(colours)]
            for model_index, model_name in enumerate(plottable_words)
        }

        # use colour-coded backgrounds to distinguish the query words in
        # the graph
        for model_name in plottable_words:
            solid = Filter(id="solid-%s" % model_name)
            solid.feFlood(flood_color=model_colours[model_name])
            solid.feComposite(in_="SourceGraphic")
            canvas.defs.add(solid)

        # now plot each word for each model
        self.dataset.update_status("Plotting graph")
        words = SVG(insert=(0, 0), size=(width, height))
        queries = SVG(insert=(0, 0), size=(width, height))

        for model_name, labels in plottable_words.items():
            offset = vector_offsets[model_name]
            positions = vectors[offset:offset + len(labels)]

            for word, position in zip(labels, positions):
                is_query = word in query_words

                label_filter = ("url(#solid-%s)" %
                                model_name) if is_query else "none"
                colour = "#FFF" if is_query else model_colours[model_name]
                fontsize = fontsize_normal if is_query else fontsize_small

                if is_query:
                    word += " (" + model_name + ")"

                label_container = SVG(insert=position,
                                      size=(1, 1),
                                      overflow="visible")
                label_container.add(
                    Text(insert=("50%", "50%"),
                         text=word,
                         dominant_baseline="middle",
                         text_anchor="middle",
                         style="fill:%s;font-size:%ipx" % (colour, fontsize),
                         filter=label_filter))

                # we make sure the queries are always rendered on top by
                # putting them in a separate SVG container
                if is_query:
                    queries.add(label_container)
                else:
                    words.add(label_container)

        # plot a line between consecutive positions for query words
        lines = SVG(insert=(0, 0), size=(width, height))
        for journey in journeys.values():
            for previous_position, position in zip(journey, journey[1:]):
                lines.add(
                    Line(start=previous_position,
                         end=position,
                         stroke="#CE1B28",
                         stroke_width=2))

        canvas.add(lines)
        canvas.add(words)
        canvas.add(queries)

        canvas.save(pretty=True)
    finally:
        shutil.rmtree(staging_area, ignore_errors=True)

    self.dataset.finish(len(journeys))