Example No. 1
    def process(self):
        """
        This takes an image archive as input and annotates each image with the
        Google Vision API, writing the annotations for each image as one JSON
        object per line (NDJSON) of the results file
        """
        api_key = self.parameters.get("api_key")
        self.dataset.delete_parameter("api_key")  # sensitive, delete after use

        features = self.parameters.get("features")
        features = [{"type": feature} for feature in features]

        if not api_key:
            self.dataset.update_status("You need to provide a valid API key",
                                       is_final=True)
            self.dataset.finish(0)
            return

        max_images = convert_to_int(self.parameters.get("amount", 0), 100)
        total = self.source_dataset.num_rows if not max_images else min(
            max_images, self.source_dataset.num_rows)
        done = 0

        for image_file in self.iterate_archive_contents(self.source_file):
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while fetching data from Google Vision API")

            done += 1
            self.dataset.update_status("Annotating image %i/%i" %
                                       (done, total))

            try:
                annotations = self.annotate_image(image_file, api_key,
                                                  features)
            except RuntimeError:
                # cannot continue fetching, e.g. when API key is invalid
                break

            if not annotations:
                continue

            annotations = {"file_name": image_file.name, **annotations}

            with self.dataset.get_results_path().open(
                    "a", encoding="utf-8") as outfile:
                outfile.write(json.dumps(annotations) + "\n")

            if max_images and done >= max_images:
                break

        self.dataset.update_status("Annotations retrieved for %i images" %
                                   done)
        self.dataset.finish(done)
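
Each annotated image is written to the results file as one JSON object per line (NDJSON). A minimal reading sketch, assuming a hypothetical annotations.ndjson path and relying only on the "file_name" key the processor adds above:

    import json

    def iterate_ndjson(path):
        # yield one parsed annotation dictionary per non-empty line
        with open(path, encoding="utf-8") as infile:
            for line in infile:
                if line.strip():
                    yield json.loads(line)

    # count annotations per file name ("file_name" is spliced in by the processor)
    counts = {}
    for annotation in iterate_ndjson("annotations.ndjson"):  # hypothetical path
        counts[annotation["file_name"]] = counts.get(annotation["file_name"], 0) + 1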
Example No. 2
    def get_processor_pipeline(self):
        """
        This queues a series of post-processors to annotate images

        First, the required number of images referenced in the dataset is
        downloaded, in order of most-referenced; then, the requested
        features are extracted using the Google Vision API; finally, the result
        is converted to a CSV file for easy processing.
        """
        amount = convert_to_int(self.parameters.get("amount", 10), 10)
        api_key = self.parameters.get("api_key", "")
        features = self.parameters.get("features", "")

        self.dataset.delete_parameter(
            "api_key")  # sensitive, delete as soon as possible

        pipeline = [
            # first, extract top images
            {
                "type": "top-images",
                "parameters": {
                    "overwrite": False
                }
            },
            # then, download the images we want to annotate
            {
                "type": "image-downloader",
                "parameters": {
                    "amount": amount,
                    "overwrite": False
                }
            },
            # then, annotate the downloaded images with the Google Vision API
            {
                "type": "google-vision-api",
                "parameters": {
                    "features": features,
                    "amount": amount,
                    "api_key": api_key
                }
            },
            # finally, create a simplified CSV file from the downloaded NDJSON (which can also be retrieved later)
            {
                "type": "convert-vision-to-csv",
                "parameters": {}
            }
        ]

        return pipeline
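
Each pipeline step is a plain dictionary with a processor "type" and a "parameters" dictionary. A small sketch of checking that shape before queueing, using two of the step types from the pipeline above (the validate_pipeline helper is hypothetical, not part of 4CAT):

    def validate_pipeline(pipeline):
        # every step needs a non-empty processor type and a parameters dict
        for step in pipeline:
            if not step.get("type"):
                raise ValueError("Pipeline step is missing a processor type")
            if not isinstance(step.get("parameters"), dict):
                raise ValueError("Step %s has no parameters dict" % step["type"])
        return True

    validate_pipeline([
        {"type": "top-images", "parameters": {"overwrite": False}},
        {"type": "image-downloader", "parameters": {"amount": 10, "overwrite": False}},
    ])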
Example No. 3
	def process(self):
		"""
		Takes the thumbnails downloaded from YouTube metadata and
		turns them into an image wall.

		"""
		results_path = self.dataset.get_results_path()
		dirname = Path(results_path.parent, results_path.name.replace(".", ""))

		# Get the required parameters
		# path to the YouTube csv data that was the source of the thumbnails
		root_csv = self.dataset.get_genealogy()[-3].get_results_path()
		max_amount = convert_to_int(self.parameters.get("max_amount", 0), 0)
		category_overlay = self.parameters.get("category_overlay")

		# Build that wall!
		self.make_imagewall(root_csv, max_amount=max_amount, category_overlay=category_overlay)
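
The thumbnail folder is derived from the results path by stripping the dots from the file name, so it ends up next to the results file itself. A standalone sketch of that derivation with a hypothetical path:

    from pathlib import Path

    results_path = Path("/data/results/dataset-abc123.csv")  # hypothetical path
    dirname = Path(results_path.parent, results_path.name.replace(".", ""))
    print(dirname)  # /data/results/dataset-abc123csv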
Example No. 4
    def process(self):
        graphs = {}
        intervals = []

        smooth = self.parameters.get("smooth")
        normalise_values = self.parameters.get("normalise")
        completeness = convert_to_int(self.parameters.get("complete"), 0)
        graph_label = self.parameters.get("label")
        top = convert_to_int(self.parameters.get("top"), 10)

        # first gather graph data: each distinct item gets its own graph and
        # for each graph we have a sequence of intervals, each interval with
        # its own value
        first_date = "9999-99-99"
        last_date = "0000-00-00"

        for row in self.iterate_items(self.source_file):
            if row["item"] not in graphs:
                graphs[row["item"]] = {}

            # make sure the months and days are zero-padded
            interval = row.get("date", "")
            interval = "-".join([
                str(bit).zfill(2 if len(bit) != 4 else 4)
                for bit in interval.split("-")
            ])
            first_date = min(first_date, interval)
            last_date = max(last_date, interval)

            if interval not in intervals:
                intervals.append(interval)

            if interval not in graphs[row["item"]]:
                graphs[row["item"]][interval] = 0

            graphs[row["item"]][interval] += float(row.get("value", 0))

        # first make sure we actually have something to render
        intervals = sorted(intervals)
        if len(intervals) <= 1:
            self.dataset.update_status(
                "Not enough data for a side-by-side over-time visualisation.")
            self.dataset.finish(0)
            return

        # only retain most-occurring series - sort by sum of all frequencies
        if len(graphs) > top:
            selected_graphs = {
                graph: graphs[graph]
                for graph in sorted(
                    graphs,
                    key=lambda x: sum(
                        [graphs[x][interval] for interval in graphs[x]]),
                    reverse=True)[0:top]
            }
            graphs = selected_graphs

        # there may be items that do not have values for all intervals
        # this will distort the graph, so the next step is to make sure all
        # graphs consist of the same continuous interval list
        missing = {graph: 0 for graph in graphs}
        for graph in graphs:
            missing[graph], graphs[graph] = pad_interval(
                graphs[graph],
                first_interval=first_date,
                last_interval=last_date)

        # now that's done, make sure the graph datapoints are in order
        intervals = sorted(list(graphs[list(graphs)[0]].keys()))

        # delete graphs that do not have the required number of intervals
        # this is useful to get rid of outliers and items that only occur
        # very few times over the full interval
        if completeness > 0:
            intervals_required = len(intervals) * (completeness / 100)
            disqualified = []
            for graph in graphs:
                if len(intervals) - missing[graph] < intervals_required:
                    disqualified.append(graph)

            graphs = {
                graph: graphs[graph]
                for graph in graphs if graph not in disqualified
            }

        # determine max value per item, so we can normalize them later
        limits = {}
        max_limit = 0
        for graph in graphs:
            for interval in graphs[graph]:
                limits[graph] = max(limits.get(graph, 0),
                                    abs(graphs[graph][interval]))
                max_limit = max(max_limit, abs(graphs[graph][interval]))

        # order graphs by highest (or lowest) value
        limits = {
            limit: limits[limit]
            for limit in sorted(limits, key=lambda l: limits[l])
        }
        graphs = {graph: graphs[graph] for graph in limits}

        if not graphs:
            # maybe nothing is actually there to be graphed
            self.dataset.update_status(
                "No items match the selection criteria - nothing to visualise."
            )
            self.dataset.finish(0)
            return None

        # how many vertical grid lines (and labels) are to be included at most
        # 12 is a sensible default because it allows one label per month for a full
        # year's data
        max_gridlines = 12

        # If True, the label is put in the lower left corner of the graph rather
        # than outside it. Automatically set to True if one of the labels is
        # long, as otherwise the label would fall off the screen
        label_in_graph = max([len(item) for item in graphs]) > 30

        # determine how wide each interval should be
        # the graph has a minimum width - but the graph's width will be
        # extended if at this minimum width each item does not have the
        # minimum per-item width
        min_full_width = 600
        min_item_width = 50
        item_width = max(min_item_width, min_full_width / len(intervals))

        # determine how much space each graph should get
        # same trade-off as for the interval width
        min_full_height = 300
        min_item_height = 100
        item_height = max(min_item_height, min_full_height / len(graphs))

        # margin - this should be enough for the text labels to fit in
        margin_base = 50
        margin_right = margin_base * 4
        margin_top = margin_base * 3

        # this determines the "flatness" of the isometric projection and can be
        # tweaked for different looks - basically corresponds to how far the
        # camera is above the horizon
        plane_angle = 120

        # don't change these
        plane_obverse = radians((180 - plane_angle) / 2)
        plane_angle = radians(plane_angle)

        # okay, now determine the full graphic size with these dimensions projected
        # semi-isometrically. We can also use these values later for drawing
        # grid lines, et cetera. The axis widths and heights here are the
        # dimensions of the bounding box wrapping the isometrically projected axes.
        x_axis_length = (item_width * (len(intervals) - 1))
        y_axis_length = (item_height * len(graphs))

        x_axis_width = (sin(plane_angle / 2) * x_axis_length)
        y_axis_width = (sin(plane_angle / 2) * y_axis_length)
        canvas_width = x_axis_width + y_axis_width

        # leave room for graph header
        if graph_label:
            margin_top += (2 * (canvas_width / 50))

        x_axis_height = (cos(plane_angle / 2) * x_axis_length)
        y_axis_height = (cos(plane_angle / 2) * y_axis_length)
        canvas_height = x_axis_height + y_axis_height

        # now we have the dimensions, the canvas can be instantiated
        canvas = get_4cat_canvas(
            self.dataset.get_results_path(),
            width=(canvas_width + margin_base + margin_right),
            height=(canvas_height + margin_base + margin_top),
            header=graph_label)

        # draw gridlines - vertical
        gridline_x = y_axis_width + margin_base
        gridline_y = margin_top + canvas_height

        step_x_horizontal = sin(plane_angle / 2) * item_width
        step_y_horizontal = cos(plane_angle / 2) * item_width
        step_x_vertical = sin(plane_angle / 2) * item_height
        step_y_vertical = cos(plane_angle / 2) * item_height

        # labels for x axis
        # month and week intervals both follow the same YYYY-NN pattern
        # it's not always possible to distinguish between them, but we try by
        # looking for 'months' greater than 12, in which case we are dealing
        # with weeks
        # we need to know this because month labels get an extra row with the
        # full month name
        is_week = False
        for i in range(0, len(intervals)):
            if re.match(r"^[0-9]{4}-[0-9]{2}",
                        intervals[i]) and int(intervals[i].split("-")[1]) > 12:
                is_week = True
                break

        skip = max(1, int(len(intervals) / max_gridlines))
        for i in range(0, len(intervals)):
            if i % skip == 0:
                canvas.add(
                    Line(start=(gridline_x, gridline_y),
                         end=(gridline_x - y_axis_width,
                              gridline_y - y_axis_height),
                         stroke="grey",
                         stroke_width=0.25))

                # to properly position the rotated and skewed text a container
                # element is needed
                label1 = str(intervals[i])[0:4]
                center = (gridline_x, gridline_y)
                container = SVG(x=center[0] - 25,
                                y=center[1],
                                width="50",
                                height="1.5em",
                                overflow="visible",
                                style="font-size:0.8em;")
                container.add(
                    Text(insert=("25%", "100%"),
                         text=label1,
                         transform="rotate(%f) skewX(%f)" %
                         (-degrees(plane_obverse), degrees(plane_obverse)),
                         text_anchor="middle",
                         baseline_shift="-0.5em",
                         style="font-weight:bold;"))

                if re.match(r"^[0-9]{4}-[0-9]{2}",
                            intervals[i]) and not is_week:
                    label2 = month_abbr[int(str(intervals[i])[5:7])]
                    if re.match(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}", intervals[i]):
                        label2 += " %i" % int(intervals[i][8:10])

                    container.add(
                        Text(insert=("25%", "150%"),
                             text=label2,
                             transform="rotate(%f) skewX(%f)" %
                             (-degrees(plane_obverse), degrees(plane_obverse)),
                             text_anchor="middle",
                             baseline_shift="-0.5em"))

                canvas.add(container)

            gridline_x += step_x_horizontal
            gridline_y -= step_y_horizontal

        # draw graphs as filled beziers
        top = step_y_vertical * 1.5
        graph_start_x = y_axis_width + margin_base
        graph_start_y = margin_top + canvas_height

        # draw graphs in reverse order, so the bottom one is most in the
        # foreground (in case of overlap)
        for graph in reversed(list(graphs)):
            self.dataset.update_status("Rendering graph for '%s'" % graph)

            # path starting at lower left corner of graph
            area_graph = Path(fill=self.colours[self.colour_index])
            area_graph.push("M %f %f" % (graph_start_x, graph_start_y))
            previous_value = None

            graph_x = graph_start_x
            graph_y = graph_start_y
            for interval in graphs[graph]:
                # normalise value
                value = graphs[graph][interval]
                try:
                    limit = limits[graph] if normalise_values else max_limit
                    value = top * copysign(abs(value) / limit, value)
                except ZeroDivisionError:
                    value = 0

                if previous_value is None:
                    # vertical line upwards to starting value of graph
                    area_graph.push("L %f %f" %
                                    (graph_start_x, graph_start_y - value))
                elif not smooth:
                    area_graph.push("L %f %f" % (graph_x, graph_y - value))
                else:
                    # quadratic bezier from previous value to current value
                    control_left = (graph_x - (step_x_horizontal / 2),
                                    graph_y + step_y_horizontal -
                                    previous_value - (step_y_horizontal / 2))
                    control_right = (graph_x - (step_x_horizontal / 2),
                                     graph_y - value + (step_y_horizontal / 2))
                    area_graph.push("C %f %f %f %f %f %f" %
                                    (*control_left, *control_right, graph_x,
                                     graph_y - value))

                previous_value = value
                graph_x += step_x_horizontal
                graph_y -= step_y_horizontal

            # line to the bottom of the graph at the current Y position
            area_graph.push(
                "L %f %f" %
                (graph_x - step_x_horizontal, graph_y + step_y_horizontal))
            area_graph.push("Z")  # then close the Path
            canvas.add(area_graph)

            # add text labels - skewing is a bit complicated and we need a
            # "center" to translate the origins properly.
            if label_in_graph:
                insert = (graph_start_x + 5, graph_start_y - 10)
            else:
                insert = (graph_x - (step_x_horizontal) + 5,
                          graph_y + step_y_horizontal - 10)

            # we need to take the skewing into account for the translation
            offset_y = tan(plane_obverse) * insert[0]
            canvas.add(
                Text(insert=(0, 0),
                     text=graph,
                     transform="skewY(%f) translate(%f %f)" %
                     (-degrees(plane_obverse), insert[0],
                      insert[1] + offset_y)))

            # cycle colours, back to the beginning if all have been used
            self.colour_index += 1
            if self.colour_index >= len(self.colours):
                self.colour_index = 0

            graph_start_x -= step_x_vertical
            graph_start_y -= step_y_vertical

        # draw gridlines - horizontal
        gridline_x = margin_base
        gridline_y = margin_top + canvas_height - y_axis_height
        for graph in graphs:
            gridline_x += step_x_vertical
            gridline_y += step_y_vertical
            canvas.add(
                Line(start=(gridline_x, gridline_y),
                     end=(gridline_x + x_axis_width,
                          gridline_y - x_axis_height),
                     stroke="black",
                     stroke_width=1))

        # x axis
        canvas.add(
            Line(start=(margin_base + y_axis_width,
                        margin_top + canvas_height),
                 end=(margin_base + canvas_width,
                      margin_top + canvas_height - x_axis_height),
                 stroke="black",
                 stroke_width=2))

        # and finally save the SVG
        canvas.save(pretty=True)
        self.dataset.finish(len(graphs))
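
The canvas size follows from projecting both axes at half the plane angle: axis lengths are multiplied by sin() for the projected width and by cos() for the projected height. A standalone sketch of that calculation, using the minimum tile sizes from above and hypothetical interval/graph counts:

    from math import sin, cos, radians

    plane_angle = radians(120)          # "flatness" of the projection
    item_width, item_height = 50, 100   # minimum per-interval and per-graph sizes
    num_intervals, num_graphs = 12, 5   # hypothetical counts

    x_axis_length = item_width * (num_intervals - 1)
    y_axis_length = item_height * num_graphs

    # bounding box of the semi-isometrically projected axes
    canvas_width = sin(plane_angle / 2) * (x_axis_length + y_axis_length)
    canvas_height = cos(plane_angle / 2) * (x_axis_length + y_axis_length)
    print(round(canvas_width), round(canvas_height))  # 909 525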
Example No. 5
    def process(self):
        """
        This takes previously generated Word2Vec models and uses them to find
        similar words based on a list of words
        """
        self.dataset.update_status("Processing sentences")

        depth = max(1,
                    min(3, convert_to_int(self.parameters.get("crawl_depth"))))
        input_words = self.parameters.get("words", "")
        if not input_words or not input_words.split(","):
            self.dataset.update_status(
                "No input words provided, cannot look for similar words.",
                is_final=True)
            self.dataset.finish(0)
            return

        input_words = input_words.split(",")

        num_words = convert_to_int(self.parameters.get("num-words"))
        try:
            threshold = float(self.parameters.get("threshold"))
        except ValueError:
            threshold = float(self.get_options()["threshold"]["default"])

        threshold = max(-1.0, min(1.0, threshold))

        # go through all models and calculate similarity for all given input words
        result = []
        staging_area = self.unpack_archive_contents(self.source_file)
        for model_file in staging_area.glob("*.model"):
            interval = model_file.stem

            # for each separate model, calculate top similar words for each
            # input word, giving us at most
            #   [max amount] * [number of input] * [number of intervals]
            # items
            self.dataset.update_status("Running model %s..." % model_file.name)
            model = KeyedVectors.load(str(model_file))
            word_queue = set()
            checked_words = set()
            level = 1

            words = input_words.copy()
            while words:
                if self.interrupted:
                    shutil.rmtree(staging_area)
                    raise ProcessorInterruptedException(
                        "Interrupted while extracting similar words")

                word = words.pop()
                checked_words.add(word)

                try:
                    similar_words = model.most_similar(positive=[word],
                                                       topn=num_words)
                except KeyError:
                    continue

                for similar_word in similar_words:
                    if similar_word[1] < threshold:
                        continue

                    result.append({
                        "date": interval,
                        "input": word,
                        "item": similar_word[0],
                        "value": similar_word[1],
                        "input_occurences": model.vocab[word].count,
                        "item_occurences": model.vocab[similar_word[0]].count,
                        "depth": level
                    })

                    # queue word for the next iteration if there is one and
                    # it hasn't been seen yet
                    if level < depth and similar_word[0] not in checked_words:
                        word_queue.add(similar_word[0])

                # if all words have been checked, but we still have an
                # iteration to go, load the queued words into the list
                if not words and word_queue and level < depth:
                    level += 1
                    words = word_queue.copy()
                    word_queue = set()

        shutil.rmtree(staging_area)

        if not result:
            self.dataset.update_status(
                "None of the words were found in the word embedding model.",
                is_final=True)
            self.dataset.finish(0)
        else:
            self.write_csv_items_and_finish(result)
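
The crawl is breadth-first: similar words found at one depth level are queued and only expanded after the current level is exhausted, up to crawl_depth levels. A toy sketch of that queue/level logic with a stand-in similarity function instead of a real Word2Vec model:

    def toy_similar(word):
        # stand-in for model.most_similar(); returns (word, score) pairs
        neighbours = {"dog": [("cat", 0.8), ("puppy", 0.7)],
                      "cat": [("kitten", 0.75)], "puppy": [("kitten", 0.6)]}
        return neighbours.get(word, [])

    depth, level = 2, 1
    words, queue, checked = ["dog"], set(), set()
    results = []
    while words:
        word = words.pop()
        checked.add(word)
        for similar, score in toy_similar(word):
            results.append((word, similar, score, level))
            if level < depth and similar not in checked:
                queue.add(similar)
        if not words and queue and level < depth:
            level, words, queue = level + 1, list(queue), set()

    print(results)  # each tuple records which word led to which neighbour, at which depth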
Example No. 6
    def process(self):
        """
        Reads vector set and creates a CSV with ranked vectors
        """
        self.dataset.update_status("Processing token sets")

        def file_to_timestamp(file):
            """
            Get comparable datestamp value for token file

            Token files are named YYYY-m.pb. This function converts that to a
            YYYYmm string, then that string to an int, so that it may be
            compared for sorting chronologically.

            :param str file:  File name
            :return int:  Comparable datestamp
            """
            stem = file.split("/")[-1].split(".")[0].split("-")
            try:
                return int(stem[0] + stem[1].zfill(2))
            except (ValueError, IndexError):
                return 0

        results = []

        # truncate results as needed
        rank_style = self.parameters.get("top-style")
        cutoff = convert_to_int(self.parameters.get("top"))

        # now rank the vectors by most prevalent per "file" (i.e. interval)
        overall_top = {}
        index = 0
        for vector_file in self.iterate_archive_contents(self.source_file):
            # we support both pickle and json dumps of vectors
            vector_unpacker = pickle if vector_file.suffix == ".pb" else json

            index += 1
            vector_set_name = vector_file.stem  # we don't need the full path
            self.dataset.update_status("Processing token set %i (%s)" %
                                       (index, vector_set_name))

            with vector_file.open("rb") as binary_tokens:
                # these were saved as pickle dumps so we need the binary mode
                vectors = vector_unpacker.load(binary_tokens)

            vectors = sorted(vectors, key=lambda x: x[1], reverse=True)

            # for overall ranking we need the full vector space per interval
            # because maybe an overall top-ranking vector is at the bottom
            # in this particular interval - we'll truncate the top list at
            # a later point in that case. Else, truncate it here
            if rank_style == "per-item":
                vectors = vectors[0:cutoff]

            for vector in vectors:
                if not vector[0].strip():
                    continue

                results.append({
                    "date": vector_set_name.split(".")[0],
                    "item": vector[0],
                    "value": vector[1]
                })

                if vector[0] not in overall_top:
                    overall_top[vector[0]] = 0

                overall_top[vector[0]] += int(vector[1])

        # this eliminates all items from the results that were not in the
        # *overall* top-occurring items. This only has an effect when vectors
        # were generated for multiple intervals
        if rank_style == "overall":
            overall_top = {
                item: overall_top[item]
                for item in sorted(overall_top,
                                   key=lambda x: overall_top[x],
                                   reverse=True)[0:cutoff]
            }
            filtered_results = []
            for item in results:
                if item["item"] in overall_top:
                    filtered_results.append(item)

            results = filtered_results

        # done!
        self.dataset.update_status("Writing results file")
        with open(self.dataset.get_results_path(), "w",
                  encoding="utf-8") as output:
            writer = csv.DictWriter(output,
                                    fieldnames=("date", "item", "value"))
            writer.writeheader()
            for row in results:
                writer.writerow(row)

        self.dataset.update_status("Finished")
        self.dataset.finish(len(results))
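
The file_to_timestamp() helper turns file names like 2020-3.pb into a single sortable integer (202003), which is what makes chronological ordering of token files possible. A standalone usage sketch:

    def file_to_timestamp(file):
        # "2020-3.pb" -> ("2020", "3") -> 202003; unparseable names sort first
        stem = file.split("/")[-1].split(".")[0].split("-")
        try:
            return int(stem[0] + stem[1].zfill(2))
        except (ValueError, IndexError):
            return 0

    print(sorted(["2020-10.pb", "2020-3.pb", "2019-12.pb"], key=file_to_timestamp))
    # ['2019-12.pb', '2020-3.pb', '2020-10.pb']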
Example No. 7
    def process(self):
        """
        This takes previously generated token sets as input and trains a
        Word2Vec or FastText word embedding model for each of them.
        """
        self.dataset.update_status("Processing sentences")

        use_skipgram = 1 if self.parameters.get(
            "algorithm") == "skipgram" else 0
        window = min(10, max(1, convert_to_int(self.parameters.get("window"))))
        use_negative = 5 if self.parameters.get("negative") else 0
        min_count = max(1, convert_to_int(self.parameters.get("min_count")))
        dimensionality = convert_to_int(self.parameters.get("dimensionality"),
                                        100)
        detect_bigrams = self.parameters.get("detect-bigrams")
        model_type = self.parameters.get("model-type")
        max_words = convert_to_int(self.parameters.get("max_words"))

        if max_words == 0:
            # unlimited amount of words in model
            max_words = None

        staging_area = self.dataset.get_staging_area()
        model_builder = {
            "Word2Vec": Word2Vec,
            "FastText": FastText
        }[model_type]

        # go through all archived token sets and vectorise them
        models = 0
        for temp_file in self.iterate_archive_contents(self.source_file):
            # use the "list of lists" as input for the word2vec model
            # by default the tokeniser generates one list of tokens per
            # post... which may actually be preferable for short
            # 4chan-style posts. But alternatively it could generate one
            # list per sentence - this processor is agnostic in that regard
            token_set_name = temp_file.name
            self.dataset.update_status(
                "Extracting bigrams from token set %s..." % token_set_name)

            try:
                if detect_bigrams:
                    bigram_transformer = Phrases(
                        self.tokens_from_file(temp_file, staging_area))
                    bigram_transformer = Phraser(bigram_transformer)
                else:
                    bigram_transformer = None

                self.dataset.update_status(
                    "Training %s model for token set %s..." %
                    (model_builder.__name__, token_set_name))
                try:
                    model = model_builder(negative=use_negative,
                                          size=dimensionality,
                                          sg=use_skipgram,
                                          window=window,
                                          workers=3,
                                          min_count=min_count,
                                          max_final_vocab=max_words)

                    # we do not simply pass a sentences argument to model builder
                    # because we are using a generator, which exhausts, while
                    # Word2Vec needs to iterate over the sentences twice
                    # https://stackoverflow.com/a/57632747
                    model.build_vocab(
                        self.tokens_from_file(temp_file,
                                              staging_area,
                                              phraser=bigram_transformer))
                    model.train(self.tokens_from_file(
                        temp_file, staging_area, phraser=bigram_transformer),
                                epochs=model.iter,
                                total_examples=model.corpus_count)

                except RuntimeError as e:
                    if "you must first build vocabulary before training the model" in str(
                            e):
                        # not enough data. Skip - if this happens for all models
                        # an error will be generated later
                        continue
                    else:
                        raise e

            except UnicodeDecodeError:
                self.dataset.update_status(
                    "Error reading input data. If it was imported from outside 4CAT, make sure it is encoded as UTF-8.",
                    is_final=True)
                self.dataset.finish(0)
                return

            # save - we only save the KeyedVectors for the model, this
            # saves space and we don't need to re-train the model later
            model_name = token_set_name.split(".")[0] + ".model"
            model.wv.save(str(staging_area.joinpath(model_name)))

            # the full model is no longer needed once its vectors are saved
            del model
            models += 1

        if models == 0:
            self.dataset.update_status(
                "Not enough data in source file to train %s models." %
                model_builder.__name__)
            shutil.rmtree(staging_area)
            self.dataset.finish(0)
            return

        # create another archive with all model files in it
        self.dataset.update_status("%s model(s) saved." %
                                   model_builder.__name__)
        self.write_archive_and_finish(staging_area)
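
The comment about not passing a sentences argument matters because a generator can only be consumed once, while build_vocab() and train() each need a full pass over the corpus. A minimal, library-free sketch of the usual workaround: an iterable that re-opens its source on every pass (the token file name is hypothetical):

    class RestartableCorpus:
        # re-reads the token file on every iteration, so it can be consumed twice
        def __init__(self, path):
            self.path = path

        def __iter__(self):
            with open(self.path, encoding="utf-8") as infile:
                for line in infile:
                    yield line.split()

    corpus = RestartableCorpus("tokens.txt")  # hypothetical file, one post per line
    first_pass = sum(1 for _ in corpus)   # e.g. the vocabulary-building pass
    second_pass = sum(1 for _ in corpus)  # e.g. the training pass
    assert first_pass == second_pass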
Example No. 8
	def process(self):
		"""
		Unzips and appends tokens to fetch and write a tf-idf matrix
		"""

		# Validate and process user inputs
		library = self.parameters.get("library", "gensim")

		if "-" not in self.parameters.get("n_size"):
			n_size = convert_to_int(self.parameters.get("n_size", 1), 1) 
			n_size = (n_size, n_size) # needs to be a tuple for sklearn.
		else:
			n_size_split = self.parameters.get("n_size").split("-")
			n_size = (convert_to_int(n_size_split[0]), convert_to_int(n_size_split[1]))
		
		min_occurrences = convert_to_int(self.parameters.get("min_occurrences", 1), 1)
		max_occurrences = convert_to_int(self.parameters.get("max_occurrences", -1), -1)
		max_output = convert_to_int(self.parameters.get("max_output", 10), 10)
		smartirs = self.parameters.get("smartirs", "nfc")

		# Get token sets
		self.dataset.update_status("Processing token sets")
		tokens = []
		dates = []

		# Go through all archived token sets and generate collocations for each
		for token_file in self.iterate_archive_contents(self.source_file):
			# Get the date
			date_string = token_file.stem
			dates.append(date_string)

			# we support both pickle and json dumps of vectors
			token_unpacker = pickle if token_file.suffix == ".pb" else json

			try:
				with token_file.open("rb") as binary_tokens:
					# these were saved as pickle dumps so we need the binary mode
					post_tokens = token_unpacker.load(binary_tokens)

					# Flatten the list of list of tokens - we're treating the whole time series as one document.
					post_tokens = list(itertools.chain.from_iterable(post_tokens))

					# Add to all date's tokens
					tokens.append(post_tokens)

			except UnicodeDecodeError:
				self.dataset.update_status("Error reading input data. If it was imported from outside 4CAT, make sure it is encoded as UTF-8.", is_final=True)
				self.dataset.finish(0)
				return

		# Make sure `min_occurrences` and `max_occurrences` are valid
		if min_occurrences > len(tokens):
			min_occurrences = len(tokens) - 1
		if max_occurrences <= 0 or max_occurrences > len(tokens):
			max_occurrences = len(tokens)

		# Get the tf-idf matrix.
		self.dataset.update_status("Generating tf-idf for token set")
		try:

			if library == "gensim":
				results = self.get_tfidf_gensim(tokens, dates, top_n=max_output, smartirs=smartirs)
			elif library == "scikit-learn":
				results = self.get_tfidf_sklearn(tokens, dates, ngram_range=n_size, min_occurrences=min_occurrences,
								 max_occurrences=max_occurrences, top_n=max_output)
			else:
				self.dataset.update_status("Invalid library.")
				self.dataset.finish(0)
				return

			if results:
				# Generate csv and finish
				self.dataset.update_status("Writing to csv and finishing")
				self.write_csv_items_and_finish(results)

		except MemoryError:
			self.dataset.update_status("Out of memory - dataset too large to run tf-idf analysis.")
			self.dataset.finish(0)
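
The n_size parameter is either a single number or a range such as "1-2", and is turned into the (min, max) tuple that an ngram_range argument expects. A standalone sketch of that parsing, with convert_to_int replaced by a plain int() fallback:

    def parse_n_size(n_size, default=1):
        # "2" -> (2, 2); "1-3" -> (1, 3); anything unparseable -> (default, default)
        try:
            if "-" not in str(n_size):
                value = int(n_size)
                return (value, value)
            low, high = str(n_size).split("-")
            return (int(low), int(high))
        except ValueError:
            return (default, default)

    print(parse_n_size("2"), parse_n_size("1-3"), parse_n_size("oops"))
    # (2, 2) (1, 3) (1, 1)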
Example No. 9
    async def execute_queries(self):
        """
        Get messages for queries

        This is basically what would be done in get_items(), except due to
        Telethon's architecture this needs to be called in an async method,
        which is this one.
        """
        # session file has been created earlier, and we can re-use it here in
        # order to avoid having to re-enter the security code
        query = self.parameters

        hash_base = query["api_phone"].replace(
            "+", "") + query["api_id"] + query["api_hash"]
        session_id = hashlib.blake2b(hash_base.encode("ascii")).hexdigest()
        session_path = Path(config.PATH_ROOT).joinpath(config.PATH_SESSIONS,
                                                       session_id + ".session")

        client = None

        def cancel_start():
            """
            Replace interactive phone number input in Telethon

            By default, if Telethon cannot use the given session file to
            authenticate, it will interactively prompt the user for a phone
            number on the command line. That is not useful here, so instead
            raise a RuntimeError. This will be caught below and the user will
            be told they need to re-authenticate via 4CAT.
            """
            raise RuntimeError("Connection cancelled")

        try:
            client = TelegramClient(str(session_path),
                                    int(query.get("api_id")),
                                    query.get("api_hash"),
                                    loop=self.eventloop)
            await client.start(phone=cancel_start)
        except RuntimeError:
            # session is no longer useable, delete file so user will be asked
            # for security code again
            self.dataset.update_status(
                "Session is not authenticated: login security code may have expired. You need to re-enter the security code.",
                is_final=True)
            session_path.unlink(missing_ok=True)
            if client and hasattr(client, "disconnect"):
                await client.disconnect()
            return None
        except Exception as e:
            self.dataset.update_status(
                "Error connecting to the Telegram API with provided credentials.",
                is_final=True)
            if client and hasattr(client, "disconnect"):
                await client.disconnect()
            return None

        # ready our parameters
        parameters = self.dataset.get_parameters()
        queries = [
            query.strip() for query in parameters.get("query", "").split(",")
        ]
        max_items = convert_to_int(parameters.get("items", 10), 10)

        try:
            posts = await self.gather_posts(client, queries, max_items)
        except Exception as e:
            self.dataset.update_status("Error scraping posts from Telegram")
            self.log.error("Telegram scraping error: %s" %
                           traceback.format_exc())
            posts = None
        finally:
            await client.disconnect()

        return posts
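
The session file name is a BLAKE2b digest of the phone number (minus the plus sign), the API ID and the API hash, so the same credentials always resolve to the same session file. A minimal sketch with placeholder credentials:

    import hashlib

    api_phone, api_id, api_hash = "+10000000000", "12345", "abcdef"  # placeholders
    hash_base = api_phone.replace("+", "") + api_id + api_hash
    session_id = hashlib.blake2b(hash_base.encode("ascii")).hexdigest()
    print(session_id)  # deterministic: identical credentials give an identical file name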
Example No. 10
    def get_items(self, query):
        """
        Use the Twitter v2 API historical search to get tweets

        :param query:
        :return:
        """
        # this is pretty sensitive so delete it immediately after storing in
        # memory
        bearer_token = self.parameters.get("api_bearer_token")
        auth = {"Authorization": "Bearer %s" % bearer_token}

        endpoint = "https://api.twitter.com/2/tweets/search/all"

        # these are all expansions and fields available at the time of writing
        # since it does not cost anything extra in terms of rate limiting, go
        # for as much data per tweet as possible...
        tweet_fields = ("attachments", "author_id", "context_annotations",
                        "conversation_id", "created_at", "entities", "geo",
                        "id", "in_reply_to_user_id", "lang", "public_metrics",
                        "possibly_sensitive", "referenced_tweets",
                        "reply_settings", "source", "text", "withheld")
        user_fields = ("created_at", "description", "entities", "id",
                       "location", "name", "pinned_tweet_id",
                       "profile_image_url", "protected", "public_metrics",
                       "url", "username", "verified", "withheld")
        place_fields = ("contained_within", "country", "country_code",
                        "full_name", "geo", "id", "name", "place_type")
        poll_fields = ("duration_minutes", "end_datetime", "id", "options",
                       "voting_status")
        expansions = ("attachments.poll_ids", "attachments.media_keys",
                      "author_id", "entities.mentions.username",
                      "geo.place_id", "in_reply_to_user_id",
                      "referenced_tweets.id", "referenced_tweets.id.author_id")
        media_fields = ("duration_ms", "height", "media_key",
                        "non_public_metrics", "organic_metrics",
                        "preview_image_url", "promoted_metrics",
                        "public_metrics", "type", "url", "width")
        amount = convert_to_int(self.parameters.get("amount"), 10)

        params = {
            "query": self.parameters.get("query", ""),
            "expansions": ",".join(expansions),
            "tweet.fields": ",".join(tweet_fields),
            "user.fields": ",".join(user_fields),
            "poll.fields": ",".join(poll_fields),
            "place.fields": ",".join(place_fields),
            "media.fields": ",".join(media_fields),
            "max_results": max(10, min(amount, 500))
            if amount > 0 else 500,  # 500 = upper limit, 10 = lower
        }

        if self.parameters.get("min_date"):
            params["start_time"] = datetime.datetime.fromtimestamp(
                self.parameters["min_date"]).strftime("%Y-%m-%dT%H:%M:%SZ")

        if self.parameters.get("max_date"):
            params["end_time"] = datetime.datetime.fromtimestamp(
                self.parameters["max_date"]).strftime("%Y-%m-%dT%H:%M:%SZ")

        tweets = 0
        self.dataset.log("Search parameters: %s" % repr(params))
        while True:
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while getting tweets from the Twitter API")

            # there is a limit of one request per second, so stay on the safe side of this
            while self.previous_request == int(time.time()):
                time.sleep(0.1)
            time.sleep(0.05)
            self.previous_request = int(time.time())

            # now send the request, allowing for up to 5 retries if the connection seems unstable
            retries = 5
            api_response = None
            while retries > 0:
                try:
                    api_response = requests.get(endpoint,
                                                headers=auth,
                                                params=params)
                    break
                except (ConnectionError,
                        requests.exceptions.RequestException) as e:
                    retries -= 1
                    wait_time = (5 - retries) * 10
                    self.dataset.update_status(
                        "Got %s, waiting %i seconds before retrying" %
                        (str(e), wait_time))
                    time.sleep(wait_time)

            # if all retries failed there is no response to work with
            if api_response is None:
                self.dataset.update_status(
                    "Could not connect to Twitter. Cancelling.", is_final=True)
                return

            # rate limited - the limit at time of writing is 300 reqs per 15
            # minutes
            # usually you don't hit this when requesting batches of 500 at
            # 1/second
            if api_response.status_code == 429:
                resume_at = convert_to_int(
                    api_response.headers["x-rate-limit-reset"]) + 1
                resume_at_str = datetime.datetime.fromtimestamp(
                    int(resume_at)).strftime("%c")
                self.dataset.update_status(
                    "Hit Twitter rate limit - waiting until %s to continue." %
                    resume_at_str)
                while time.time() <= resume_at:
                    time.sleep(0.5)
                continue

            # API keys that are valid but don't have access or haven't been
            # activated properly get a 403
            elif api_response.status_code == 403:
                try:
                    structured_response = api_response.json()
                    self.dataset.update_status(
                        "'Forbidden' error from Twitter API. Could not connect to Twitter API "
                        "with this API key. %s" %
                        structured_response.get("detail", ""),
                        is_final=True)
                except (json.JSONDecodeError, ValueError):
                    self.dataset.update_status(
                        "'Forbidden' error from Twitter API. Your key may not have access to "
                        "the full-archive search endpoint.",
                        is_final=True)
                finally:
                    return

            # sometimes twitter says '503 service unavailable' for unclear
            # reasons - in that case just wait a while and try again
            elif api_response.status_code in (502, 503, 504):
                resume_at = time.time() + 60
                resume_at_str = datetime.datetime.fromtimestamp(
                    int(resume_at)).strftime("%c")
                self.dataset.update_status(
                    "Twitter unavailable (status %i) - waiting until %s to continue."
                    % (api_response.status_code, resume_at_str))
                while time.time() <= resume_at:
                    time.sleep(0.5)
                continue

            # this usually means the query is too long or otherwise contains
            # a syntax error
            elif api_response.status_code == 400:
                msg = "Response %i from the Twitter API; " % api_response.status_code
                try:
                    api_response = api_response.json()
                    msg += api_response.get("title", "")
                    if "detail" in api_response:
                        msg += ": " + api_response.get("detail", "")
                except (json.JSONDecodeError, TypeError):
                    msg += "Some of your parameters (e.g. date range) may be invalid."

                self.dataset.update_status(msg, is_final=True)
                return

            # invalid API key
            elif api_response.status_code == 401:
                self.dataset.update_status(
                    "Invalid API key - could not connect to Twitter API",
                    is_final=True)
                return

            # haven't seen one yet, but they probably exist
            elif api_response.status_code != 200:
                self.dataset.update_status(
                    "Unexpected HTTP status %i. Halting tweet collection." %
                    api_response.status_code,
                    is_final=True)
                self.log.warning(
                    "Twitter API v2 responded with status code %i. Response body: %s"
                    % (api_response.status_code, api_response.text))
                return

            api_response = api_response.json()

            # The API response contains tweets (of course) and 'includes',
            # objects that can be referenced in tweets. Later we will splice
            # this data into the tweets themselves to make them easier to
            # process. So extract them first...
            included_users = api_response.get("includes", {}).get("users", {})
            included_media = api_response.get("includes", {}).get("media", {})
            included_polls = api_response.get("includes", {}).get("polls", {})
            included_tweets = api_response.get("includes",
                                               {}).get("tweets", {})
            included_places = api_response.get("includes",
                                               {}).get("places", {})

            for tweet in api_response.get("data", []):
                if 0 < amount <= tweets:
                    break

                # splice referenced data back in
                # we use copy.deepcopy here because else we run into a
                # pass-by-reference quagmire
                tweet = self.enrich_tweet(tweet, included_users,
                                          included_media, included_polls,
                                          included_places,
                                          copy.deepcopy(included_tweets))

                tweets += 1
                if tweets % 500 == 0:
                    self.dataset.update_status(
                        "Received %i tweets from Twitter API" % tweets)

                yield tweet

            # paginate
            if (amount <= 0 or tweets < amount) and api_response.get(
                    "meta") and "next_token" in api_response["meta"]:
                params["next_token"] = api_response["meta"]["next_token"]
            else:
                break
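
The max_results value sent to the API is clamped between the endpoint's lower bound of 10 and upper bound of 500 per request; a requested amount of 0 or less means "no limit" and simply uses 500. A short worked sketch of that expression:

    def clamp_max_results(amount):
        # 500 = upper limit per request, 10 = lower limit
        return max(10, min(amount, 500)) if amount > 0 else 500

    for amount in (0, 5, 100, 10000):
        print(amount, "->", clamp_max_results(amount))
    # 0 -> 500, 5 -> 10, 100 -> 100, 10000 -> 500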
Example No. 11
    def process(self):
        """
        This takes an archive of downloaded images as input and renders them
        into a single image wall, with tile size and ordering determined by
        the chosen parameters
        """
        self.dataset.update_status("Reading source file")

        # prepare
        ImageFile.LOAD_TRUNCATED_IMAGES = True
        sample_max = 75  # image size for colour sampling

        def numpy_to_rgb(numpy_array):
            """
            Helper function to go from a numpy array to a comma-separated RGB
            string

            Used in the K-Means clustering part
            """
            return ",".join([str(int(value)) for value in numpy_array])

        max_images = convert_to_int(self.parameters.get("amount"), 100)
        sizing_mode = self.parameters.get("tile-size")
        sort_mode = self.parameters.get("sort-mode")

        # is there anything to put on a wall?
        if self.source_dataset.num_rows == 0:
            self.dataset.update_status(
                "No images available to render to image wall.", is_final=True)
            self.dataset.finish(0)
            return

        # 0 = use as many images as in the archive, up to the max
        if max_images == 0:
            max_images = self.get_options()["amount"]["max"]

        # we loop through the images twice - once to reduce them to a value
        # that can be sorted, and another time to actually copy them to the
        # canvas for the image wall

        # we create a staging area manually here, so it is not automatically
        # deleted after one loop, since we need two
        staging_area = self.dataset.get_staging_area()

        # first, extract and reduce, and store the sortable value in a
        # dictionary with the image file name as key
        image_colours = {}
        dimensions = {}  # used to calculate optimal tile size later
        index = 0
        random_values = list(range(0, self.source_dataset.num_rows))
        random.shuffle(random_values)

        for path in self.iterate_archive_contents(self.source_file,
                                                  staging_area):
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while determining image wall order")

            try:
                picture = Image.open(str(path))
            except UnidentifiedImageError:
                self.dataset.update_status(
                    "Image %s could not be parsed. Skipping." % path)
                continue

            self.dataset.update_status(
                "Analysing %s (%i/%i)" %
                (path.name, len(dimensions), self.source_dataset.num_rows))

            # these calculations can take ages for huge images, so resize if it is
            # larger than the threshold
            dimensions[path.name] = (picture.width, picture.height)
            if sort_mode not in ("",
                                 "random") and (picture.height > sample_max
                                                or picture.width > sample_max):
                sample_width = int(sample_max * picture.width /
                                   max(picture.width, picture.height))
                sample_height = int(sample_max * picture.height /
                                    max(picture.width, picture.height))
                picture = ImageOps.fit(picture, (sample_width, sample_height))

            if sort_mode not in ("", "random"):
                # ensure we get RGB values for pixels
                picture = picture.convert("RGB")

            # determine a 'representative colour'
            if sort_mode == "random":
                # just randomly sort it, don't even look at the colours
                value = random_values.pop()

            elif sort_mode in ("average-rgb", "average-hsv"):
                # average colour, as RGB or HSV
                pixels = picture.getdata()
                if sort_mode == "average-hsv":
                    pixels = [colorsys.rgb_to_hsv(*pixel) for pixel in pixels]

                sum_colour = (sum([p[0] for p in pixels]),
                              sum([p[1] for p in pixels]),
                              sum([p[2] for p in pixels]))
                avg_colour = (sum_colour[0] / len(pixels),
                              sum_colour[1] / len(pixels),
                              sum_colour[2] / len(pixels))

                # this is a bit dumb, but since all the other modes return rgb...
                if sort_mode == "average-hsv":
                    avg_colour = colorsys.hsv_to_rgb(*avg_colour)

                value = avg_colour

            elif sort_mode == "dominant":
                # most-occurring colour
                colours = picture.getcolors(picture.width * picture.height)
                colours = sorted(colours, key=lambda x: x[0], reverse=True)
                value = colours[0][1]

            elif sort_mode in ("kmeans-dominant", "kmeans-average"):
                # use k-means clusters to determine the representative colour
                # this is more computationally expensive but gives far better
                # results.

                # determine k-means clusters for this image, i.e. the n most
                # dominant "average" colours, in this case n=3 (make parameter?)
                pixels = picture.getdata()
                clusters = KMeans(n_clusters=3,
                                  random_state=0)  # 0 so it is deterministic
                predicted_centroids = clusters.fit_predict(pixels).tolist()

                # now we have two options -
                if sort_mode == "kmeans-dominant":
                    # the colour of the single most dominant k-means centroid
                    ranked_centroids = {}
                    for index in range(0, len(clusters.cluster_centers_)):
                        ranked_centroids[numpy_to_rgb(
                            clusters.cluster_centers_[index]
                        )] = predicted_centroids.count(index)

                    value = [
                        int(v)
                        for v in sorted(ranked_centroids,
                                        key=lambda k: ranked_centroids[k],
                                        reverse=True)[0].split(",")
                    ]

                elif sort_mode == "kmeans-average":
                    # average colour of all k-means centroids, weighted by the
                    # dominance of each centroid
                    value = [0, 0, 0]
                    for index in clusters.labels_:
                        value[0] += clusters.cluster_centers_[index][0]
                        value[1] += clusters.cluster_centers_[index][1]
                        value[2] += clusters.cluster_centers_[index][2]

                    value[0] /= len(clusters.labels_)
                    value[1] /= len(clusters.labels_)
                    value[2] /= len(clusters.labels_)

            else:
                value = (0, 0, 0)

            # converted to HSV, because RGB does not sort nicely
            image_colours[path.name] = colorsys.rgb_to_hsv(*value)
            index += 1

        # only retain the top n of the sorted list of images - this gives us
        # our final image set
        sorted_image_files = [
            path for path in sorted(
                image_colours, key=lambda k: image_colours[k])[:max_images]
        ]
        dimensions = {path: dimensions[path] for path in sorted_image_files}
        average_size = (sum([k[0]
                             for k in dimensions.values()]) / len(dimensions),
                        sum([k[1]
                             for k in dimensions.values()]) / len(dimensions))

        self.dataset.update_status("Determining canvas and image sizes")

        # calculate 'tile sizes' (a tile is an image) and also the size of the
        # canvas we will need to fit them all. The canvas can never be larger than
        # this:
        max_pixels = self.TARGET_WIDTH * self.TARGET_HEIGHT

        if sizing_mode == "fit-height":
            # assuming every image has the overall average height, how wide would
            # the canvas need to be (if everything is on a single row)?
            full_width = 0
            tile_y = average_size[1]
            for dimension in dimensions.values():
                # ideally, we make everything the average height
                optimal_ratio = average_size[1] / dimension[1]
                full_width += dimension[0] * optimal_ratio

            # now we can calculate the total amount of pixels needed
            fitted_pixels = full_width * tile_y
            if fitted_pixels > max_pixels:
                # try again with a lower height
                area_ratio = max_pixels / fitted_pixels
                tile_y = int(tile_y * math.sqrt(area_ratio))
                fitted_pixels = max_pixels

            # find the canvas size that can fit this amount of pixels at the
            # required proportions, provided that y = multiple of avg height
            ideal_height = math.sqrt(fitted_pixels /
                                     (self.TARGET_WIDTH / self.TARGET_HEIGHT))
            size_y = math.ceil(ideal_height / tile_y) * tile_y
            size_x = fitted_pixels / size_y

            tile_x = -1  # varies

        elif sizing_mode == "square":
            # assuming each image is square, find a canvas with the right
            # proportions that would fit all of them
            # assume the average dimensions
            tile_size = int(sum(average_size) / 2)

            # this is how many pixels we need
            fitted_pixels = tile_size * tile_size * len(sorted_image_files)

            # does that fit our canvas?
            if fitted_pixels > max_pixels:
                tile_size = math.floor(
                    math.sqrt(max_pixels / len(sorted_image_files)))
                fitted_pixels = tile_size * tile_size * len(sorted_image_files)
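                # worked example (hypothetical canvas budget): with 100 images
                # and max_pixels = 2560 * 1440 = 3,686,400, tile_size shrinks
                # to floor(sqrt(3,686,400 / 100)) = 192 pixels per side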

            ideal_width = math.sqrt(fitted_pixels /
                                    (self.TARGET_HEIGHT / self.TARGET_WIDTH))
            size_x = math.ceil(ideal_width / tile_size) * tile_size
            size_y = math.ceil(fitted_pixels / size_x / tile_size) * tile_size

            tile_x = tile_y = tile_size

        elif sizing_mode == "average":
            tile_x = int(average_size[0])
            tile_y = int(average_size[1])

            fitted_pixels = tile_x * tile_y * len(sorted_image_files)
            if fitted_pixels > max_pixels:
                area_ratio = max_pixels / fitted_pixels
                tile_x = int(tile_x * math.sqrt(area_ratio))
                tile_y = int(tile_y * math.sqrt(area_ratio))
                fitted_pixels = tile_x * tile_y * len(sorted_image_files)

            ideal_width = math.sqrt(fitted_pixels /
                                    (self.TARGET_HEIGHT / self.TARGET_WIDTH))
            size_x = math.ceil(ideal_width / tile_x) * tile_x
            size_y = math.ceil(fitted_pixels / size_x / tile_y) * tile_y

        else:
            raise NotImplementedError("Sizing mode '%s' not implemented" %
                                      sizing_mode)

        self.dataset.log("Canvas size is %ix%i" % (size_x, size_y))
        wall = Image.new("RGBA", (int(size_x), int(size_y)))
        ImageDraw.floodfill(wall, (0, 0),
                            (255, 255, 255, 0))  # transparent background
        counter = 0
        offset_x = 0
        offset_y = 0

        tile_x = int(tile_x)
        tile_y = int(tile_y)

        # now actually putting the images on a wall is relatively trivial
        for path in sorted_image_files:
            counter += 1
            self.dataset.update_status(
                "Rendering %s (%i/%i) to image wall" %
                (path, counter, len(sorted_image_files)))
            picture = Image.open(str(staging_area.joinpath(path)))

            if tile_x == -1:
                picture_x = max(1,
                                int(picture.width * (tile_y / picture.height)))
                picture = ImageOps.fit(picture, (picture_x, tile_y),
                                       method=Image.BILINEAR)
            else:
                picture = ImageOps.fit(picture, (tile_x, tile_y),
                                       method=Image.BILINEAR)

            # simply put them side by side until the right edge is reached,
            # then move to a new row
            if offset_x + picture.width > wall.width:
                offset_x = 0
                offset_y += picture.height

            # this can happen in some edge cases: there is an extra row of
            # images we hadn't accounted for. In that case, simply enlarge the
            # canvas.
            if offset_y + picture.height > wall.height:
                new_wall = Image.new("RGBA",
                                     (wall.width, offset_y + picture.height))
                ImageDraw.floodfill(
                    new_wall, (0, 0),
                    (255, 255, 255, 0))  # transparent background
                new_wall.paste(wall, (0, 0))
                wall = new_wall

            wall.paste(picture, (offset_x, offset_y))
            offset_x += picture.width

        # finish up
        self.dataset.update_status("Saving result")
        wall.save(str(self.dataset.get_results_path()))
        shutil.rmtree(staging_area)

        self.dataset.update_status("Finished")
        self.dataset.finish(counter)
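
A minimal standalone sketch of the k-means "dominant colour" plus HSV-sort technique used above, assuming Pillow and scikit-learn are installed; the function names and the 100x100 downscale are illustrative choices, not part of the processor.

import colorsys
from pathlib import Path

from PIL import Image
from sklearn.cluster import KMeans


def dominant_colour(image_path, n_clusters=3):
    # downscale and force RGB so clustering stays fast and channels are fixed
    picture = Image.open(image_path).convert("RGB").resize((100, 100))
    pixels = list(picture.getdata())

    clusters = KMeans(n_clusters=n_clusters, random_state=0)
    labels = clusters.fit_predict(pixels).tolist()

    # the dominant centroid is the one with the most pixels assigned to it
    dominant = max(range(n_clusters), key=labels.count)
    return tuple(int(value) for value in clusters.cluster_centers_[dominant])


def sort_by_colour(folder):
    # convert RGB to HSV before sorting, since RGB values do not sort into a
    # visually pleasing gradient
    paths = Path(folder).glob("*.jpg")
    return sorted(paths, key=lambda p: colorsys.rgb_to_hsv(*dominant_colour(p)))
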
Example No. 12
0
    def process(self):
        """
        This takes a 4CAT results file as input and renders a word tree of the
        contexts in which the query word occurs, saved as an SVG file.
        """

        link_regex = re.compile(r"https?://[^\s]+")
        delete_regex = re.compile(r"[^a-zA-Z)(.,\n -]")

        # settings
        strip_urls = self.parameters.get("strip-urls")
        strip_symbols = self.parameters.get("strip-symbols")
        sides = self.parameters.get("sides")
        self.align = self.parameters.get("align")
        window = convert_to_int(self.parameters.get("window"), 5) + 1
        query = self.parameters.get("query")
        self.limit = convert_to_int(self.parameters.get("limit"), 100)

        left_branches = []
        right_branches = []

        # do some validation
        if not query.strip() or re.sub(r"\s", "", query) != query:
            self.dataset.update_status(
                "Invalid query for word tree generation. Query cannot be empty or contain whitespace."
            )
            self.dataset.finish(0)
            return

        window = min(window, self.get_options()["window"]["max"] + 1)
        window = max(1, window)

        # find matching posts
        processed = 0
        for post in self.iterate_items(self.source_file):
            processed += 1
            if processed % 500 == 0:
                self.dataset.update_status(
                    "Processing and tokenising post %i" % processed)
            body = post["body"]
            if not body:
                continue

            if strip_urls:
                body = link_regex.sub("", body)

            if strip_symbols:
                body = delete_regex.sub("", body)

            body = word_tokenize(body)
            positions = [
                i for i, x in enumerate(body) if x.lower() == query.lower()
            ]

            # get lists of tokens for both the left and right side of the tree
            # on the left side, all lists end with the query, on the right side,
            # they start with the query
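            # e.g. for the query "cat" in "the black cat sat on the mat" with
            # window=3, the right branch is ["cat", "sat", "on"] and the left
            # branch is ["the", "black", "cat"]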
            for position in positions:
                right_branches.append(body[position:position + window])
                left_branches.append(body[max(0, position - window):position +
                                          1])

        # Some settings for rendering the tree later
        self.step = self.fontsize * 0.6  # approximately the width of a monospace char
        self.gap = (7 * self.step)  # space for lines between nodes
        width = 1  # will be updated later

        # invert the left side of the tree (because that's the way we want the
        # branching to work for that side)
        # we'll visually invert the nodes in the tree again later
        left_branches = [list(reversed(branch)) for branch in left_branches]

        # first create vertical slices of tokens per level
        self.dataset.update_status("Generating token tree from posts")
        levels_right = [{} for i in range(0, window)]
        levels_left = [{} for i in range(0, window)]
        tokens_left = []
        tokens_right = []

        # for each "level" (each branching point representing a level), turn
        # tokens into nodes, record the maximum number of occurrences of any
        # token in that level, and keep track of what nodes are in which level.
        # The latter is needed because a token may occur multiple times, at
        # different points in the graph. Do this for both the left and right
        # side of the tree.
        for i in range(0, window):
            for branch in right_branches:
                if i >= len(branch):
                    continue

                token = branch[i].lower()
                if token not in levels_right[i]:
                    parent = levels_right[i - 1][branch[
                        i - 1].lower()] if i > 0 else None
                    levels_right[i][token] = Node(token,
                                                  parent=parent,
                                                  occurrences=1,
                                                  is_top_root=(parent is None))
                    tokens_right.append(levels_right[i][token])
                else:
                    levels_right[i][token].occurrences += 1

                occurrences = levels_right[i][token].occurrences
                self.max_occurrences[i] = max(
                    occurrences, self.max_occurrences[i]
                ) if i in self.max_occurrences else occurrences

            for branch in left_branches:
                if i >= len(branch):
                    continue

                token = branch[i].lower()
                if token not in levels_left[i]:
                    parent = levels_left[i - 1][branch[
                        i - 1].lower()] if i > 0 else None
                    levels_left[i][token] = Node(token,
                                                 parent=parent,
                                                 occurrences=1,
                                                 is_top_root=(parent is None))
                    tokens_left.append(levels_left[i][token])
                else:
                    levels_left[i][token].occurrences += 1

                occurrences = levels_left[i][token].occurrences
                self.max_occurrences[i] = max(
                    occurrences, self.max_occurrences[i]
                ) if i in self.max_occurrences else occurrences

        # nodes that have no siblings can be merged with their parents, else
        # the graph becomes unnecessarily large with lots of single-word nodes
        # connected to single-word nodes. additionally, we want the nodes with
        # the most branches to be sorted to the top, and then only retain the
        # most interesting (i.e. most-occurring) branches
        self.dataset.update_status("Merging and sorting tree nodes")
        for token in tokens_left:
            self.merge_upwards(token)
            self.sort_node(token)
            self.limit_subtree(token)

        for token in tokens_right:
            self.merge_upwards(token)
            self.sort_node(token)
            self.limit_subtree(token)

        # somewhat annoyingly, anytree does not simply delete nodes detached
        # from the tree in the previous steps, but makes them root nodes. We
        # don't need these root nodes (we only need the original root), so the
        # next step is to remove all root nodes that are not the main root.
        # We cannot modify a list in-place, so make a new list with the
        # relevant nodes
        level_sizes = {}
        filtered_tokens_right = []
        for token in tokens_right:
            if token.is_root and not token.is_top_root:
                continue

            filtered_tokens_right.append(token)

        filtered_tokens_left = []
        for token in tokens_left:
            if token.is_root and not token.is_top_root:
                continue

            filtered_tokens_left.append(token)

        # now we know which nodes are left, and can therefore determine how
        # large the canvas needs to be - this is based on the max number of
        # branches found on any level of the tree, in other words, the number
        # of "terminal nodes"
        breadths_left = [
            self.max_breadth(node) for node in filtered_tokens_left
            if node.is_top_root
        ]
        breadths_right = [
            self.max_breadth(node) for node in filtered_tokens_right
            if node.is_top_root
        ]

        if not breadths_left:
            if sides == "left":
                self.dataset.update_status(
                    "No data available to the left of the query",
                    is_final=True)
                self.dataset.finish(0)
                return None
            elif sides == "both":
                sides = "right"
                breadths_left = [0]

        if not breadths_right:
            if sides == "right":
                self.dataset.update_status(
                    "No data available to the right of the query",
                    is_final=True)
                self.dataset.finish(0)
                return None
            elif sides == "both":
                sides = "left"
                breadths_right = [0]

        height_left = self.whitespace * self.fontsize * max(breadths_left)
        height_right = self.whitespace * self.fontsize * max(breadths_right)
        height = max(height_left, height_right)

        canvas = Drawing(str(self.dataset.get_results_path()),
                         size=(width, height),
                         style="font-family:monospace;font-size:%ipx" %
                         self.fontsize)

        # the nodes on the left side of the graph now have the wrong word order,
        # because we reversed them earlier to generate the correct tree
        # hierarchy - now reverse the node labels so they are proper language
        # again
        for token in tokens_left:
            self.invert_node_labels(token)

        wrapper = SVG(overflow="visible")

        self.dataset.update_status("Rendering tree to SVG file")
        if sides != "right":
            wrapper = self.render(wrapper, [
                token for token in filtered_tokens_left
                if token.is_root and token.children
            ],
                                  height=height,
                                  side=self.SIDE_LEFT)

        if sides != "left":
            wrapper = self.render(wrapper, [
                token for token in filtered_tokens_right
                if token.is_root and token.children
            ],
                                  height=height,
                                  side=self.SIDE_RIGHT)

        # things may have been rendered outside the canvas, in which case we
        # need to readjust the SVG properties
        wrapper.update({"x": 0 if self.x_min >= 0 else self.x_min * -1})
        canvas.update({"width": (self.x_max - self.x_min)})

        canvas.add(wrapper)
        canvas.save(pretty=True)

        self.dataset.update_status("Finished")
        self.dataset.finish(len(tokens_left) + len(tokens_right))
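
A minimal standalone sketch of the windowing and per-level node building used by the word tree processor above, assuming nltk (with its punkt tokenizer data downloaded) and anytree are installed; the sample posts, query and window size are illustrative.

from anytree import Node, RenderTree
from nltk.tokenize import word_tokenize

query = "cat"
window = 3
posts = ["The black cat sat on the mat", "A cat sat down"]

# one dict of token -> Node per depth, so identical tokens at the same depth
# share a node and their occurrence counts accumulate
levels = [{} for _ in range(window)]

for post in posts:
    tokens = [token.lower() for token in word_tokenize(post)]
    for position in (i for i, token in enumerate(tokens) if token == query):
        branch = tokens[position:position + window]
        for depth, token in enumerate(branch):
            if token not in levels[depth]:
                parent = levels[depth - 1][branch[depth - 1]] if depth else None
                levels[depth][token] = Node(token, parent=parent, occurrences=1)
            else:
                levels[depth][token].occurrences += 1

# print the right-hand side of the tree, with occurrence counts per node
for prefix, _, node in RenderTree(levels[0][query]):
    print("%s%s (%i)" % (prefix, node.name, node.occurrences))
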
Example No. 13
0
    def process(self):
        # parse parameters
        input_words = self.parameters.get("words", "")
        if not input_words.strip():
            self.dataset.update_status(
                "No input words provided, cannot look for similar words.",
                is_final=True)
            self.dataset.finish(0)
            return

        input_words = input_words.split(",")

        try:
            threshold = float(self.parameters.get("threshold"))
        except ValueError:
            threshold = float(self.get_options()["threshold"]["default"])

        threshold = max(-1.0, min(1.0, threshold))
        num_words = convert_to_int(self.parameters.get("num-words"))
        overlay = self.parameters.get("overlay")
        reduction_method = self.parameters.get("method")
        all_words = self.parameters.get("all-words")

        # load model files and initialise
        self.dataset.update_status("Unpacking word embedding models")
        staging_area = self.unpack_archive_contents(self.source_file)
        common_vocab = None
        vector_size = None
        models = {}

        # find words that are common to all models
        self.dataset.update_status("Determining cross-model common vocabulary")
        for model_file in staging_area.glob("*.model"):
            if self.interrupted:
                shutil.rmtree(staging_area)
                raise ProcessorInterruptedException(
                    "Interrupted while processing word embedding models")

            model = KeyedVectors.load(str(model_file)).wv
            models[model_file.stem] = model
            if vector_size is None:
                vector_size = model.vector_size  # needed later for dimensionality reduction

            if common_vocab is None:
                common_vocab = set(model.vocab.keys())
            else:
                common_vocab &= set(model.vocab.keys())  # intersect

        # sort common vocabulary by combined frequency across all models
        # this should make filtering for common words a bit faster further down
        self.dataset.update_status("Sorting vocabulary")
        common_vocab = list(common_vocab)
        common_vocab.sort(key=lambda w: sum(
            [model.vocab[w].count for model in models.values()]),
                          reverse=True)

        # initial boundaries of 2D space (to be adjusted later based on t-sne
        # outcome)
        max_x = 0.0 - sys.float_info.max
        max_y = 0.0 - sys.float_info.max
        min_x = sys.float_info.max
        min_y = sys.float_info.max

        # for each model, find the words that we may want to plot - these are
        # the nearest neighbours for the given query words
        relevant_words = {}

        # the vectors need to be reduced all at once - but the vectors are
        # grouped by model. To solve this, keep one numpy array of vectors,
        # but also keep track of which indexes of this array belong to which
        # model, by storing the index of the first vector for a model
        vectors = numpy.empty((0, vector_size))
        vector_offsets = {}

        # now process each model
        for model_name, model in models.items():
            relevant_words[model_name] = set()  # words relevant to this model
            self.dataset.update_status("Finding similar words in model '%s'" %
                                       model_name)

            for query in input_words:
                if query not in model.vocab:
                    self.dataset.update_status(
                        "Query '%s' was not found in model %s; cannot find nearest neighbours."
                        % (query, model_name),
                        is_final=True)
                    self.dataset.finish(0)
                    return

                if self.interrupted:
                    shutil.rmtree(staging_area)
                    raise ProcessorInterruptedException(
                        "Interrupted while finding similar words")

                # use a larger sample (topn) than required since some of the
                # nearest neighbours may not be in the common vocabulary and
                # will therefore need to be ignored
                context = set([
                    word[0] for word in model.most_similar(query, topn=1000)
                    if word[0] in common_vocab and word[1] >= threshold
                ][:num_words])

                relevant_words[model_name] |= {
                    query
                } | context  # always include query word

        # now do another loop to determine which words to plot for each model
        # this is either the same as relevant_words, or a superset which
        # combines all relevant words for all models
        plottable_words = {}
        last_model = max(relevant_words.keys())
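        # model file names are assumed to sort chronologically (e.g. one model
        # per time interval), so max() picks the most recent model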
        all_relevant_words = set().union(*relevant_words.values())

        for model_name, words in relevant_words.items():
            plottable_words[model_name] = []
            vector_offsets[model_name] = len(vectors)

            # determine which words to plot for this model. either the nearest
            # neighbours for this model, or all nearest neighbours found across
            # all models
            words_to_include = all_relevant_words if all_words else relevant_words[
                model_name]

            for word in words_to_include:
                if word in plottable_words[model_name] or (
                        not overlay and model_name != last_model
                        and word not in input_words):
                    # only plot each word once per model, or if 'overlay'
                    # is not set, only once overall (for the most recent
                    # model)
                    continue

                vector = models[model_name][word]
                plottable_words[model_name].append(word)
                vectors = numpy.append(vectors, [vector], axis=0)
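                # numpy.append returns a new (copied) array on every call,
                # which is acceptable for the small number of plottable words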

        del models  # no longer needed

        # reduce the vectors of all words to be plotted (across all models) to
        # two-dimensional coordinates with the selected dimensionality
        # reduction technique; the 2D vectors are then interpreted as
        # cartesian coordinates
        if reduction_method == "PCA":
            pca = PCA(n_components=2, random_state=0)
            vectors = pca.fit_transform(vectors)
        elif reduction_method == "t-SNE":
            # initialise t-sne transformer
            # parameters taken from Hamilton et al.
            # https://github.com/williamleif/histwords/blob/master/viz/common.py
            tsne = TSNE(n_components=2,
                        random_state=0,
                        learning_rate=150,
                        init="pca")
            vectors = tsne.fit_transform(vectors)
        elif reduction_method == "TruncatedSVD":
            # standard sklearn parameters made explicit
            svd = TruncatedSVD(n_components=2,
                               algorithm="randomized",
                               n_iter=5,
                               random_state=0)
            vectors = svd.fit_transform(vectors)
        else:
            shutil.rmtree(staging_area)
            self.dataset.update_status(
                "Invalid dimensionality reduction technique selected",
                is_final=True)
            self.dataset.finish(0)
            return

        # also keep track of the boundaries of our 2D space, so we can plot
        # them properly later
        for position in vectors:
            max_x = max(max_x, position[0])
            max_y = max(max_y, position[1])
            min_x = min(min_x, position[0])
            min_y = min(min_y, position[1])

        # now we know for each model which words should be plotted and at what
        # position
        # with this knowledge, we can normalize the positions, and start
        # plotting them in a graph

        # a palette generated with https://medialab.github.io/iwanthue/
        colours = [
            "#d58eff", "#cf9000", "#3391ff", "#a15700", "#911ca7", "#00ddcb",
            "#cc25a9", "#d5c776", "#6738a8", "#ff9470", "#47c2ff", "#a4122c",
            "#00b0ca", "#9a0f76", "#ff70c8", "#713c88"
        ]
        colour_index = 0

        # make sure all coordinates are positive
        max_x -= min_x
        max_y -= min_y

        # determine graph dimensions and proportions
        width = 1000  # arbitrary
        height = width * (max_y / max_x)  # retain proportions
        scale = width / max_x

        # margin around the plot to give room for labels and to look better
        margin = width * 0.1
        width += 2 * margin
        height += 2 * margin

        # normalize all known positions to fit within the graph
        vectors = [(margin + ((position[0] - min_x) * scale),
                    margin + ((position[1] - min_y) * scale))
                   for position in vectors]

        # now all positions are finalised, we can determine the "journey" of
        # each query - the sequence of positions in the graph it takes, so we
        # can draw lines from position to position later
        journeys = {}
        for query in input_words:
            journeys[query] = []
            for model_name, words in plottable_words.items():
                index = words.index(query)
                journeys[query].append(vectors[vector_offsets[model_name] +
                                               index])

        # font sizes proportional to width (which is static and thus predictable)
        fontsize_large = width / 50
        fontsize_normal = width / 75
        fontsize_small = width / 100

        # now we have the dimensions, the canvas can be instantiated
        model_type = self.source_dataset.parameters.get(
            "model-type", "word2vec")
        canvas = get_4cat_canvas(
            self.dataset.get_results_path(),
            width,
            height,
            header="%s nearest neighbours (fitting: %s) - '%s'" %
            (model_type, reduction_method, ",".join(input_words)),
            fontsize_normal=fontsize_normal,
            fontsize_large=fontsize_large,
            fontsize_small=fontsize_small)

        # use colour-coded backgrounds to distinguish the query words in the
        # graph, each model (= interval) with a separate colour
        for model_name in plottable_words:
            solid = Filter(id="solid-%s" % model_name)
            solid.feFlood(flood_color=colours[colour_index])
            solid.feComposite(in_="SourceGraphic")
            canvas.defs.add(solid)
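            # feFlood fills the filter region with the model's colour and
            # feComposite layers the original text on top, giving each query
            # label a solid coloured background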

            # this can get kind of confusing, but you shouldn't be using this
            # with more than 16 models anyway
            colour_index = 0 if colour_index >= len(
                colours) - 1 else colour_index + 1

        # now plot each word for each model
        self.dataset.update_status("Plotting graph")
        words = SVG(insert=(0, 0), size=(width, height))
        queries = SVG(insert=(0, 0), size=(width, height))
        colour_index = 0

        for model_name, labels in plottable_words.items():
            positions = vectors[
                vector_offsets[model_name]:vector_offsets[model_name] +
                len(labels)]

            label_index = 0
            for position in positions:
                word = labels[label_index]
                is_query = word in input_words
                label_index += 1

                filter = ("url(#solid-%s)" %
                          model_name) if is_query else "none"
                colour = "#FFF" if is_query else colours[colour_index]
                fontsize = fontsize_normal if is_query else fontsize_small

                if word in input_words:
                    word += " (" + model_name + ")"

                label_container = SVG(insert=position,
                                      size=(1, 1),
                                      overflow="visible")
                label_container.add(
                    Text(insert=("50%", "50%"),
                         text=word,
                         dominant_baseline="middle",
                         text_anchor="middle",
                         style="fill:%s;font-size:%ipx" % (colour, fontsize),
                         filter=filter))

                # we make sure the queries are always rendered on top by
                # putting them in a separate SVG container
                if is_query:
                    queries.add(label_container)
                else:
                    words.add(label_container)

            colour_index = 0 if colour_index >= len(
                colours) - 1 else colour_index + 1

        # plot a line between positions for query words
        lines = SVG(insert=(0, 0), size=(width, height))
        for query, journey in journeys.items():
            previous_position = None
            for position in journey:
                if previous_position is None:
                    previous_position = position
                    continue

                lines.add(
                    Line(start=previous_position,
                         end=position,
                         stroke="#CE1B28",
                         stroke_width=2))
                previous_position = position

        canvas.add(lines)
        canvas.add(words)
        canvas.add(queries)

        canvas.save(pretty=True)
        shutil.rmtree(staging_area)
        self.dataset.finish(len(journeys))
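
A minimal standalone sketch of the dimensionality reduction and coordinate normalisation steps used above, with random vectors standing in for word embeddings; the t-SNE parameters follow the Hamilton et al. settings referenced in the code, and the canvas width and margin are illustrative.

import numpy
from sklearn.manifold import TSNE

rng = numpy.random.default_rng(0)
vectors = rng.normal(size=(50, 100))  # 50 "words", 100-dimensional embeddings

# reduce to two dimensions; parameters as in the processor above
tsne = TSNE(n_components=2, random_state=0, learning_rate=150, init="pca")
coordinates = tsne.fit_transform(vectors)

# normalise positions to fit a fixed-width canvas with a margin on all sides
width, margin = 1000, 100
min_x, min_y = coordinates[:, 0].min(), coordinates[:, 1].min()
max_x, max_y = coordinates[:, 0].max(), coordinates[:, 1].max()
scale = width / (max_x - min_x)

points = [(margin + (x - min_x) * scale, margin + (y - min_y) * scale)
          for x, y in coordinates]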