Esempio n. 1
0
    def process(self):
        """
		This takes a 4CAT results file as input, and outputs a plain text file
		containing all post bodies as one continuous string, sanitized.
		"""

        link_regex = re.compile(r"https?://[^\s]+")
        delete_regex = re.compile(r"[^a-zA-Z)(.,\n -]")

        # settings
        strip_urls = self.parameters.get("strip-urls",
                                         self.options["strip-urls"]["default"])
        strip_symbols = self.parameters.get(
            "strip-symbols", self.options["strip-symbols"]["default"])
        sides = self.parameters.get("sides", self.options["sides"]["default"])
        self.align = self.parameters.get("align",
                                         self.options["align"]["default"])
        window = convert_to_int(
            self.parameters.get("window", self.options["window"]["default"]),
            5) + 1
        query = self.parameters.get("query", self.options["query"]["default"])
        self.limit = convert_to_int(
            self.parameters.get("limit", self.options["limit"]["default"]),
            100)

        left_branches = []
        right_branches = []

        # do some validation
        if not query.strip() or re.sub(r"\s", "", query) != query:
            self.dataset.update_status(
                "Invalid query for word tree generation. Query cannot be empty or contain whitespace."
            )
            self.dataset.finish(0)
            return

        window = min(window, self.options["window"]["max"] + 1)
        window = max(1, window)

        # find matching posts
        processed = 0
        for post in self.iterate_csv_items(self.source_file):
            processed += 1
            if processed % 500 == 0:
                self.dataset.update_status(
                    "Processing and tokenising post %i" % processed)
            body = post["body"]

            if strip_urls:
                body = link_regex.sub("", body)

            if strip_symbols:
                body = delete_regex.sub("", body)

            body = word_tokenize(body)
            positions = [
                i for i, x in enumerate(body) if x.lower() == query.lower()
            ]

            # get lists of tokens for both the left and right side of the tree
            # on the left side, all lists end with the query, on the right side,
            # they start with the query
            for position in positions:
                right_branches.append(body[position:position + window])
                left_branches.append(body[max(0, position - window):position +
                                          1])

        # Some settings for rendering the tree later
        self.step = self.fontsize * 0.6  # approximately the width of a monospace char
        self.gap = (7 * self.step)  # space for lines between nodes
        width = 1  # will be updated later

        # invert the left side of the tree (because that's the way we want the
        # branching to work for that side)
        # we'll visually invert the nodes in the tree again later
        left_branches = [list(reversed(branch)) for branch in left_branches]

        # first create vertical slices of tokens per level
        self.dataset.update_status("Generating token tree from posts")
        levels_right = [{} for i in range(0, window)]
        levels_left = [{} for i in range(0, window)]
        tokens_left = []
        tokens_right = []

        # for each "level" (each branching point representing a level), turn
        # tokens into nodes, record the max amount of occurences for any
        # token in that level, and keep track of what nodes are in which level.
        # The latter is needed because a token may occur multiple times, at
        # different points in the graph. Do this for both the left and right
        # side of the tree.
        for i in range(0, window):
            for branch in right_branches:
                if i >= len(branch):
                    continue

                token = branch[i].lower()
                if token not in levels_right[i]:
                    parent = levels_right[i - 1][branch[
                        i - 1].lower()] if i > 0 else None
                    levels_right[i][token] = Node(token,
                                                  parent=parent,
                                                  occurrences=1,
                                                  is_top_root=(parent is None))
                    tokens_right.append(levels_right[i][token])
                else:
                    levels_right[i][token].occurrences += 1

                occurrences = levels_right[i][token].occurrences
                self.max_occurrences[i] = max(
                    occurrences, self.max_occurrences[i]
                ) if i in self.max_occurrences else occurrences

            for branch in left_branches:
                if i >= len(branch):
                    continue

                token = branch[i].lower()
                if token not in levels_left[i]:
                    parent = levels_left[i - 1][branch[
                        i - 1].lower()] if i > 0 else None
                    levels_left[i][token] = Node(token,
                                                 parent=parent,
                                                 occurrences=1,
                                                 is_top_root=(parent is None))
                    tokens_left.append(levels_left[i][token])
                else:
                    levels_left[i][token].occurrences += 1

                occurrences = levels_left[i][token].occurrences
                self.max_occurrences[i] = max(
                    occurrences, self.max_occurrences[i]
                ) if i in self.max_occurrences else occurrences

        # nodes that have no siblings can be merged with their parents, else
        # the graph becomes unnecessarily large with lots of single-word nodes
        # connected to single-word nodes. additionally, we want the nodes with
        # the most branches to be sorted to the top, and then only retain the
        # most interesting (i.e. most-occurring) branches
        self.dataset.update_status("Merging and sorting tree nodes")
        for token in tokens_left:
            self.merge_upwards(token)
            self.sort_node(token)
            self.limit_subtree(token)

        for token in tokens_right:
            self.merge_upwards(token)
            self.sort_node(token)
            self.limit_subtree(token)

        # somewhat annoyingly, anytree does not simply delete nodes detached
        # from the tree in the previous steps, but makes them root nodes. We
        # don't need these root nodes (we only need the original root), so the
        # next step is to remove all root nodes that are not the main root.
        # We cannot modify a list in-place, so make a new list with the
        # relevant nodes
        level_sizes = {}
        filtered_tokens_right = []
        for token in tokens_right:
            if token.is_root and not token.is_top_root:
                continue

            filtered_tokens_right.append(token)

        filtered_tokens_left = []
        for token in tokens_left:
            if token.is_root and not token.is_top_root:
                continue

            filtered_tokens_left.append(token)

        # now we know which nodes are left, and can therefore determine how
        # large the canvas needs to be - this is based on the max number of
        # branches found on any level of the tree, in other words, the number
        # of "terminal nodes"
        height_left = self.whitespace * self.fontsize * max([
            self.max_breadth(node)
            for node in filtered_tokens_left if node.is_top_root
        ])
        height_right = self.whitespace * self.fontsize * max([
            self.max_breadth(node)
            for node in filtered_tokens_right if node.is_top_root
        ])
        height = max(height_left, height_right)

        canvas = Drawing(str(self.dataset.get_results_path()),
                         size=(width, height),
                         style="font-family:monospace;font-size:%ipx" %
                         self.fontsize)

        # the nodes on the left side of the graph now have the wrong word order,
        # because we reversed them earlier to generate the correct tree
        # hierarchy - now reverse the node labels so they are proper language
        # again
        for token in tokens_left:
            self.invert_node_labels(token)

        wrapper = SVG(overflow="visible")

        self.dataset.update_status("Rendering tree to SVG file")
        if sides != "right":
            wrapper = self.render(wrapper, [
                token for token in filtered_tokens_left
                if token.is_root and token.children
            ],
                                  height=height,
                                  side=self.SIDE_LEFT)

        if sides != "left":
            wrapper = self.render(wrapper, [
                token for token in filtered_tokens_right
                if token.is_root and token.children
            ],
                                  height=height,
                                  side=self.SIDE_RIGHT)

        # things may have been rendered outside the canvas, in which case we
        # need to readjust the SVG properties
        wrapper.update({"x": 0 if self.x_min >= 0 else self.x_min * -1})
        canvas.update({"width": (self.x_max - self.x_min)})

        canvas.add(wrapper)
        canvas.save(pretty=True)

        self.dataset.update_status("Finished")
        self.dataset.finish(len(tokens_left) + len(tokens_right))