def process(self):
	"""
	Generate a word tree from a 4CAT results file.

	Takes a 4CAT results file as input, tokenises every post body, finds
	all occurrences of the configured query token, and renders the words
	preceding and following it as a branching word tree. The result is
	written to the dataset's results path as an SVG file.

	Reads processor parameters: strip-urls, strip-symbols, sides, align,
	window, query and limit. Finishes the dataset with the number of
	tree nodes generated (or 0 when the query is invalid or not found).
	"""
	link_regex = re.compile(r"https?://[^\s]+")
	delete_regex = re.compile(r"[^a-zA-Z)(.,\n -]")

	# settings
	strip_urls = self.parameters.get("strip-urls", self.options["strip-urls"]["default"])
	strip_symbols = self.parameters.get("strip-symbols", self.options["strip-symbols"]["default"])
	sides = self.parameters.get("sides", self.options["sides"]["default"])
	self.align = self.parameters.get("align", self.options["align"]["default"])
	# window is the number of tokens to each side of the query; +1 because
	# the query token itself occupies one slot in each branch
	window = convert_to_int(self.parameters.get("window", self.options["window"]["default"]), 5) + 1
	query = self.parameters.get("query", self.options["query"]["default"])
	self.limit = convert_to_int(self.parameters.get("limit", self.options["limit"]["default"]), 100)

	left_branches = []
	right_branches = []

	# do some validation: the query must be a single non-empty token
	if not query.strip() or re.sub(r"\s", "", query) != query:
		self.dataset.update_status(
			"Invalid query for word tree generation. Query cannot be empty or contain whitespace.")
		self.dataset.finish(0)
		return

	# clamp the window to the configured maximum (and at least 1)
	window = min(window, self.options["window"]["max"] + 1)
	window = max(1, window)

	# find matching posts
	processed = 0
	for post in self.iterate_csv_items(self.source_file):
		processed += 1
		if processed % 500 == 0:
			self.dataset.update_status("Processing and tokenising post %i" % processed)
		body = post["body"]

		if strip_urls:
			body = link_regex.sub("", body)

		if strip_symbols:
			body = delete_regex.sub("", body)

		body = word_tokenize(body)
		positions = [i for i, x in enumerate(body) if x.lower() == query.lower()]

		# get lists of tokens for both the left and right side of the tree
		# on the left side, all lists end with the query, on the right side,
		# they start with the query
		for position in positions:
			right_branches.append(body[position:position + window])
			left_branches.append(body[max(0, position - window):position + 1])

	# if the query token never occurred, there is no tree to draw; bail out
	# cleanly instead of crashing on max() over an empty sequence below
	if not left_branches and not right_branches:
		self.dataset.update_status("Query was not found in the dataset; no word tree to generate.")
		self.dataset.finish(0)
		return

	# Some settings for rendering the tree later
	self.step = self.fontsize * 0.6  # approximately the width of a monospace char
	self.gap = (7 * self.step)  # space for lines between nodes
	width = 1  # will be updated later

	# invert the left side of the tree (because that's the way we want the
	# branching to work for that side)
	# we'll visually invert the nodes in the tree again later
	left_branches = [list(reversed(branch)) for branch in left_branches]

	# first create vertical slices of tokens per level
	self.dataset.update_status("Generating token tree from posts")
	levels_right = [{} for i in range(0, window)]
	levels_left = [{} for i in range(0, window)]
	tokens_left = []
	tokens_right = []

	def tally_level(branches, levels, tokens, i):
		# Turn the i-th token of each branch into a tree node (or bump its
		# occurrence count if already seen at this level), linking it to
		# its parent node from level i-1. Also tracks the maximum
		# occurrence count per level in self.max_occurrences, which is
		# used later when rendering node sizes.
		for branch in branches:
			if i >= len(branch):
				continue

			token = branch[i].lower()
			if token not in levels[i]:
				parent = levels[i - 1][branch[i - 1].lower()] if i > 0 else None
				levels[i][token] = Node(token, parent=parent, occurrences=1, is_top_root=(parent is None))
				tokens.append(levels[i][token])
			else:
				levels[i][token].occurrences += 1

			occurrences = levels[i][token].occurrences
			self.max_occurrences[i] = max(
				occurrences, self.max_occurrences[i]) if i in self.max_occurrences else occurrences

	# for each "level" (each branching point representing a level), turn
	# tokens into nodes, record the max amount of occurences for any
	# token in that level, and keep track of what nodes are in which level.
	# The latter is needed because a token may occur multiple times, at
	# different points in the graph. Do this for both the left and right
	# side of the tree.
	for i in range(0, window):
		tally_level(right_branches, levels_right, tokens_right, i)
		tally_level(left_branches, levels_left, tokens_left, i)

	# nodes that have no siblings can be merged with their parents, else
	# the graph becomes unnecessarily large with lots of single-word nodes
	# connected to single-word nodes. additionally, we want the nodes with
	# the most branches to be sorted to the top, and then only retain the
	# most interesting (i.e. most-occurring) branches
	self.dataset.update_status("Merging and sorting tree nodes")
	for token in tokens_left:
		self.merge_upwards(token)
		self.sort_node(token)
		self.limit_subtree(token)

	for token in tokens_right:
		self.merge_upwards(token)
		self.sort_node(token)
		self.limit_subtree(token)

	# somewhat annoyingly, anytree does not simply delete nodes detached
	# from the tree in the previous steps, but makes them root nodes. We
	# don't need these root nodes (we only need the original root), so the
	# next step is to remove all root nodes that are not the main root.
	# We cannot modify a list in-place, so make a new list with the
	# relevant nodes
	filtered_tokens_right = [
		token for token in tokens_right
		if not (token.is_root and not token.is_top_root)
	]
	filtered_tokens_left = [
		token for token in tokens_left
		if not (token.is_root and not token.is_top_root)
	]

	# now we know which nodes are left, and can therefore determine how
	# large the canvas needs to be - this is based on the max number of
	# branches found on any level of the tree, in other words, the number
	# of "terminal nodes"; default=0 guards against a side with no top
	# roots after filtering
	height_left = self.whitespace * self.fontsize * max(
		[self.max_breadth(node) for node in filtered_tokens_left if node.is_top_root], default=0)
	height_right = self.whitespace * self.fontsize * max(
		[self.max_breadth(node) for node in filtered_tokens_right if node.is_top_root], default=0)
	height = max(height_left, height_right)

	canvas = Drawing(
		str(self.dataset.get_results_path()),
		size=(width, height),
		style="font-family:monospace;font-size:%ipx" % self.fontsize)

	# the nodes on the left side of the graph now have the wrong word order,
	# because we reversed them earlier to generate the correct tree
	# hierarchy - now reverse the node labels so they are proper language
	# again
	for token in tokens_left:
		self.invert_node_labels(token)

	wrapper = SVG(overflow="visible")

	self.dataset.update_status("Rendering tree to SVG file")
	if sides != "right":
		wrapper = self.render(
			wrapper,
			[token for token in filtered_tokens_left if token.is_root and token.children],
			height=height, side=self.SIDE_LEFT)

	if sides != "left":
		wrapper = self.render(
			wrapper,
			[token for token in filtered_tokens_right if token.is_root and token.children],
			height=height, side=self.SIDE_RIGHT)

	# things may have been rendered outside the canvas, in which case we
	# need to readjust the SVG properties
	wrapper.update({"x": 0 if self.x_min >= 0 else self.x_min * -1})
	canvas.update({"width": (self.x_max - self.x_min)})

	canvas.add(wrapper)
	canvas.save(pretty=True)

	self.dataset.update_status("Finished")
	self.dataset.finish(len(tokens_left) + len(tokens_right))