Example #1
0
    def process(self):
        """
		This takes a 4CAT results file as input, and outputs a plain text file
		containing all post bodies as one continuous string, sanitized.
		"""

        # OrderedDict because dates and headers should have order
        intervals = {}

        timeframe = self.parameters.get("timeframe",
                                        self.options["timeframe"]["default"])

        first_interval = "9999"
        last_interval = "0000"

        self.dataset.update_status("Processing posts")
        with self.dataset.get_results_path().open("w") as results:
            counter = 0

            for post in self.iterate_csv_items(self.source_file):
                date = get_interval_descriptor(post, timeframe)

                # Add a count for the respective timeframe
                if date not in intervals:
                    intervals[date] = 1
                else:
                    intervals[date] += 1

                first_interval = min(first_interval, date)
                last_interval = max(last_interval, date)

                counter += 1

                if counter % 2500 == 0:
                    self.dataset.update_status("Counted through " +
                                               str(counter) + " posts.")

            # pad interval if needed, this is useful if the result is to be
            # visualised as a histogram, for example
            if self.parameters.get("pad", self.options["pad"].get(
                    "default", True)) and timeframe != "all":
                missing, intervals = pad_interval(intervals, first_interval,
                                                  last_interval)

            # Write to csv
            csv_writer = csv.DictWriter(results,
                                        fieldnames=("date", "item",
                                                    "frequency"))
            csv_writer.writeheader()
            for interval in intervals:
                csv_writer.writerow({
                    "date": interval,
                    "item": "activity",
                    "frequency": intervals[interval]
                })

        self.dataset.finish(len(intervals))
Example #2
0
    def process(self):
        """
		This takes a 4CAT results file as input, and outputs a plain text file
		containing all post bodies as one continuous string, sanitized.
		"""

        # OrderedDict because dates and headers should have order
        intervals = {}

        timeframe = self.parameters.get("timeframe",
                                        self.options["timeframe"]["default"])

        first_interval = "9999"
        last_interval = "0000"

        self.dataset.update_status("Processing posts")
        with self.dataset.get_results_path().open("w") as results:
            counter = 0

            for post in self.iterate_csv_items(self.source_file):
                # Add a count for the respective timeframe
                if timeframe == "all":
                    date = "overall"
                else:
                    try:
                        timestamp = int(
                            datetime.datetime.strptime(
                                post["timestamp"],
                                "%Y-%m-%d %H:%M:%S").timestamp())
                    except ValueError:
                        self.dataset.update_status(
                            "Invalid date found in dataset; cannot count posts per interval."
                        )
                        self.dataset.finish(0)
                        return

                    date = datetime.datetime.utcfromtimestamp(timestamp)
                    if timeframe == "year":
                        date = str(date.year)
                    elif timeframe == "month":
                        date = str(date.year) + "-" + str(date.month).zfill(2)
                    elif timeframe == "week":
                        date = str(date.isocalendar()[0]) + "-" + str(
                            date.isocalendar()[1]).zfill(2)
                    else:
                        date = str(date.year) + "-" + str(
                            date.month).zfill(2) + "-" + str(date.day).zfill(2)

                if date not in intervals:
                    intervals[date] = 1
                else:
                    intervals[date] += 1

                first_interval = min(first_interval, date)
                last_interval = max(last_interval, date)

                counter += 1

                if counter % 2500 == 0:
                    self.dataset.update_status("Counted through " +
                                               str(counter) + " posts.")

            # pad interval if needed, this is useful if the result is to be
            # visualised as a histogram, for example
            if self.parameters.get("pad", self.options["pad"].get(
                    "default", True)) and timeframe != "all":
                missing, intervals = pad_interval(intervals, first_interval,
                                                  last_interval)

            # Write to csv
            csv_writer = csv.DictWriter(results,
                                        fieldnames=("date", "item",
                                                    "frequency"))
            csv_writer.writeheader()
            for interval in intervals:
                csv_writer.writerow({
                    "date": interval,
                    "item": "activity",
                    "frequency": intervals[interval]
                })

        self.dataset.finish(len(intervals))
Example #3
0
	def process(self):
		"""
		Render an SVG histogram/bar chart using a previous frequency analysis
		as input.
		"""
		self.dataset.update_status("Reading source file")
		header = self.parameters.get("header", self.options["header"]["default"])
		max_posts = 0

		# collect post numbers per month
		intervals = {}
		for post in self.iterate_csv_items(self.source_file):
			intervals[post["date"]] = int(post["frequency"])
			max_posts = max(max_posts, int(post["frequency"]))

		if len(intervals) <= 1:
			self.dataset.update_status("Not enough data available for a histogram; need more than one time series.")
			self.dataset.finish(0)
			return

		self.dataset.update_status("Cleaning up data")
		(missing, intervals) = pad_interval(intervals)

		# create histogram
		self.dataset.update_status("Drawing histogram")

		# you may change the following four variables to adjust the graph dimensions
		width = 1024
		height = 786
		y_margin = 75
		x_margin = 50
		x_margin_left = x_margin * 2
		tick_width = 5

		fontsize_normal = int(height / 40)
		fontsize_small = int(height / 75)

		# better don't touch the following
		line_width = round(width / 512)
		y_margin_top = 150 if header else 50
		y_height = height - (y_margin + y_margin_top)
		x_width = width - (x_margin + x_margin_left)
		canvas = Drawing(filename=str(self.dataset.get_results_path()), size=(width, height),
						 style="font-family:monospace;font-size:%ipx" % fontsize_normal)

		# normalize the Y axis to a multiple of a power of 10
		magnitude = pow(10, len(str(max_posts)) - 1)  # ew
		max_neat = math.ceil(max_posts / magnitude) * magnitude
		self.dataset.update_status("Max (normalized): %i (%i) (magnitude: %i)" % (max_posts, max_neat, magnitude))

		# draw border
		canvas.add(Rect(
			insert=(0, 0),
			size=(width, height),
			stroke="#000",
			stroke_width=line_width,
			fill="#FFF"
		))

		# draw header on a black background if needed
		if header:
			if len(header) > 40:
				header = header[:37] + "..."

			header_rect_height = (y_margin_top / 1.5)
			header_fontsize = (width / len(header))

			header_container = SVG(insert=(0, 0), size=(width, header_rect_height))
			header_container.add(Rect(
				insert=(0, 0),
				size=(width, header_rect_height),
				fill="#000"
			))
			header_container.add(Text(
				insert=("50%", "50%"),
				text=header,
				dominant_baseline="middle",
				text_anchor="middle",
				fill="#FFF",
				style="font-size:%i" % header_fontsize
			))
			canvas.add(header_container)

		# horizontal grid lines
		for i in range(0, 10):
			offset = (y_height / 10) * i
			canvas.add(Line(
				start=(x_margin_left, y_margin_top + offset),
				end=(width - x_margin, y_margin_top + offset),
				stroke="#EEE",
				stroke_width=line_width
			))

		# draw bars
		item_width = (width - (x_margin + x_margin_left)) / len(intervals)
		item_height = (height - y_margin - y_margin_top)
		bar_width = item_width * 0.9
		x = x_margin_left + (item_width / 2) - (bar_width / 2)

		if bar_width >= 8:
			arc_adjust = max(8, int(item_width / 5)) / 2
		else:
			arc_adjust = 0

		for interval in intervals:
			posts = int(intervals[interval])
			bar_height = ((posts / max_neat) * item_height)
			self.dataset.update_status("%s: %i posts" % (interval, posts))
			bar_top = height - y_margin - bar_height
			bar_bottom = height - y_margin

			if bar_height == 0:
				x += item_width
				continue

			bar = Path(fill="#000")
			bar.push("M %f %f" % (x, bar_bottom))
			bar.push("L %f %f" % (x, bar_top + (arc_adjust if bar_height > arc_adjust else 0)))
			if bar_height > arc_adjust > 0:
				control = (x, bar_top)
				bar.push("C %f %f %f %f %f %f" % (*control, *control, x + arc_adjust, bar_top))
			bar.push("L %f %f" % (x + bar_width - arc_adjust, height - y_margin - bar_height))
			if bar_height > arc_adjust > 0:
				control = (x + bar_width, bar_top)
				bar.push("C %f %f %f %f %f %f" % (*control, *control, x + bar_width, bar_top + arc_adjust))
			bar.push("L %f %f" % (x + bar_width, height - y_margin))
			bar.push("Z")
			canvas.add(bar)

			x += item_width

		# draw X and Y axis
		canvas.add(Line(
			start=(x_margin_left, height - y_margin),
			end=(width - x_margin, height - y_margin),
			stroke="#000",
			stroke_width=2
		))
		canvas.add(Line(
			start=(x_margin_left, y_margin_top),
			end=(x_margin_left, height - y_margin),
			stroke="#000",
			stroke_width=2
		))

		# draw ticks on Y axis
		for i in range(0, 10):
			offset = (y_height / 10) * i
			canvas.add(Line(
				start=(x_margin_left - tick_width, y_margin_top + offset),
				end=(x_margin_left, y_margin_top + offset),
				stroke="#000",
				stroke_width=line_width
			))

		# draw ticks on X axis
		for i in range(0, len(intervals)):
			offset = (x_width / len(intervals)) * (i + 0.5)
			canvas.add(Line(
				start=(x_margin_left + offset, height - y_margin),
				end=(x_margin_left + offset, height - y_margin + tick_width),
				stroke="#000",
				stroke_width=line_width
			))

		# prettify

		# y labels
		origin = (x_margin_left / 2)
		step = y_height / 10
		for i in range(0, 11):
			label = str(int((max_neat / 10) * i))
			labelsize = (len(label) * fontsize_normal * 1.25, fontsize_normal)
			label_x = origin - (tick_width * 2)
			label_y = height - y_margin - (i * step) - (labelsize[1] / 2)
			label_container = SVG(
				insert=(label_x, label_y),
				size=(x_margin_left / 2, x_margin_left / 5)
			)
			label_container.add(Text(
				insert=("100%", "50%"),
				text=label,
				dominant_baseline="middle",
				text_anchor="end"
			))
			canvas.add(label_container)

		# x labels
		label_width = max(fontsize_small * 6, item_width)
		label_x = x_margin_left
		label_y = height - y_margin + (tick_width * 2)
		next = 0
		for interval in intervals:
			if len(interval) == 7:
				label = month_abbr[int(interval[5:7])] + "\n" + interval[0:4]
			elif len(interval) == 10:
				label = str(int(interval[8:10])) + month_abbr[int(interval[5:7])] + "\n" + interval[0:4]
			else:
				label = interval.replace("-", "\n")

			if label_x > next:
				shift = 0
				for line in label.split("\n"):
					label_container = SVG(
						insert=(label_x + (item_width / 2) - (label_width / 2), label_y + (tick_width * 2)),
						size=(label_width, y_margin), overflow="visible")
					label_container.add(Text(
						insert=("50%", "0%"),
						text=line,
						dominant_baseline="middle",
						text_anchor="middle",
						baseline_shift=-shift
					))
					shift += fontsize_small * 2
					canvas.add(label_container)
					next = label_x + (label_width * 0.9)
			label_x += item_width

		# 4cat logo
		label = "made with 4cat - 4cat.oilab.nl"
		footersize = (fontsize_small * len(label) * 0.7, fontsize_small * 2)
		footer = SVG(insert=(width - footersize[0], height - footersize[1]), size=footersize)
		footer.add(Rect(insert=(0, 0), size=("100%", "100%"), fill="#000"))
		footer.add(Text(
			insert=("50%", "50%"),
			text=label,
			dominant_baseline="middle",
			text_anchor="middle",
			fill="#FFF",
			style="font-size:%i" % fontsize_small
		))
		canvas.add(footer)

		canvas.save(pretty=True)

		self.dataset.update_status("Finished")
		self.dataset.finish(len(intervals))
Example #4
0
    def process(self):
        graphs = {}
        intervals = []

        smooth = self.parameters.get("smooth",
                                     self.options["smooth"]["default"])
        normalise_values = self.parameters.get(
            "normalise", self.options["normalise"]["default"])
        completeness = convert_to_int(
            self.parameters.get("complete",
                                self.options["complete"]["default"]), 0)
        graph_label = self.parameters.get("label",
                                          self.options["label"]["default"])
        top = convert_to_int(
            self.parameters.get("top", self.options["top"]["default"]), 10)

        # first gather graph data: each distinct item gets its own graph and
        # for each graph we have a sequence of intervals, each interval with
        # its own value
        first_date = "9999-99-99"
        last_date = "0000-00-00"

        with self.source_file.open() as input:
            reader = csv.DictReader(input)

            item_key = "text" if "text" in reader.fieldnames else "item"
            date_key = "time" if "time" in reader.fieldnames else "date"
            value_key = "value" if "value" in reader.fieldnames else "frequency"

        for row in self.iterate_csv_items(self.source_file):
            if row[item_key] not in graphs:
                graphs[row[item_key]] = {}

            # make sure the months and days are zero-padded
            interval = row.get(date_key, "")
            interval = "-".join([
                str(bit).zfill(2 if len(bit) != 4 else 4)
                for bit in interval.split("-")
            ])
            first_date = min(first_date, interval)
            last_date = max(last_date, interval)

            if interval not in intervals:
                intervals.append(interval)

            if interval not in graphs[row[item_key]]:
                graphs[row[item_key]][interval] = 0

            graphs[row[item_key]][interval] += float(row.get(value_key, 0))

        # first make sure we actually have something to render
        intervals = sorted(intervals)
        if len(intervals) <= 1:
            self.dataset.update_status(
                "Not enough data for a side-by-side over-time visualisation.")
            self.dataset.finish(0)
            return

        # only retain most-occurring series - sort by sum of all frequencies
        if len(graphs) > top:
            selected_graphs = {
                graph: graphs[graph]
                for graph in sorted(
                    graphs,
                    key=lambda x: sum(
                        [graphs[x][interval] for interval in graphs[x]]),
                    reverse=True)[0:top]
            }
            graphs = selected_graphs

        # there may be items that do not have values for all intervals
        # this will distort the graph, so the next step is to make sure all
        # graphs consist of the same continuous interval list
        missing = {graph: 0 for graph in graphs}
        for graph in graphs:
            missing[graph], graphs[graph] = pad_interval(
                graphs[graph],
                first_interval=first_date,
                last_interval=last_date)

        # now that's done, make sure the graph datapoints are in order
        intervals = sorted(list(graphs[list(graphs)[0]].keys()))

        # delete graphs that do not have the required amount of intervals
        # this is useful to get rid of outliers and items that only occur
        # very few times over the full interval
        if completeness > 0:
            intervals_required = len(intervals) * (completeness / 100)
            disqualified = []
            for graph in graphs:
                if len(intervals) - missing[graph] < intervals_required:
                    disqualified.append(graph)

            graphs = {
                graph: graphs[graph]
                for graph in graphs if graph not in disqualified
            }

        # determine max value per item, so we can normalize them later
        limits = {}
        max_limit = 0
        for graph in graphs:
            for interval in graphs[graph]:
                limits[graph] = max(limits.get(graph, 0),
                                    abs(graphs[graph][interval]))
                max_limit = max(max_limit, abs(graphs[graph][interval]))

        # order graphs by highest (or lowest) value)
        limits = {
            limit: limits[limit]
            for limit in sorted(limits, key=lambda l: limits[l])
        }
        graphs = {graph: graphs[graph] for graph in limits}

        if not graphs:
            # maybe nothing is actually there to be graphed
            self.dataset.update_status(
                "No items match the selection criteria - nothing to visualise."
            )
            self.dataset.finish(0)
            return None

        # how many vertical grid lines (and labels) are to be included at most
        # 12 is a sensible default because it allows one label per month for a full
        # year's data
        max_gridlines = 12

        # If True, label is put at the lower left bottom of the graph rather than
        # outside it. Automatically set to True if one of the labels is long, as
        # else the label would fall off the screen
        label_in_graph = max([len(item) for item in graphs]) > 30

        # determine how wide each interval should be
        # the graph has a minimum width - but the graph's width will be
        # extended if at this minimum width each item does not have the
        # minimum per-item width
        min_full_width = 600
        min_item_width = 50
        item_width = max(min_item_width, min_full_width / len(intervals))

        # determine how much space each graph should get
        # same trade-off as for the interval width
        min_full_height = 300
        min_item_height = 100
        item_height = max(min_item_height, min_full_height / len(graphs))

        # margin - this should be enough for the text labels to fit in
        margin_base = 50
        margin_right = margin_base * 4
        margin_top = margin_base * 3

        # this determines the "flatness" of the isometric projection and an be
        # tweaked for different looks - basically corresponds to how far the
        # camera is above the horizon
        plane_angle = 120

        # don't change these
        plane_obverse = radians((180 - plane_angle) / 2)
        plane_angle = radians(plane_angle)

        # okay, now determine the full graphic size with these dimensions projected
        # semi-isometrically. We can also use these values later for drawing for
        # drawing grid lines, et cetera. The axis widths and heights here are the
        # dimensions of the bounding box wrapping the isometrically projected axes.
        x_axis_length = (item_width * (len(intervals) - 1))
        y_axis_length = (item_height * len(graphs))

        x_axis_width = (sin(plane_angle / 2) * x_axis_length)
        y_axis_width = (sin(plane_angle / 2) * y_axis_length)
        canvas_width = x_axis_width + y_axis_width

        # leave room for graph header
        if graph_label:
            margin_top += (2 * (canvas_width / 50))

        x_axis_height = (cos(plane_angle / 2) * x_axis_length)
        y_axis_height = (cos(plane_angle / 2) * y_axis_length)
        canvas_height = x_axis_height + y_axis_height

        # now we have the dimensions, the canvas can be instantiated
        canvas = get_4cat_canvas(
            self.dataset.get_results_path(),
            width=(canvas_width + margin_base + margin_right),
            height=(canvas_height + margin_base + margin_top),
            header=graph_label)

        # draw gridlines - vertical
        gridline_x = y_axis_width + margin_base
        gridline_y = margin_top + canvas_height

        step_x_horizontal = sin(plane_angle / 2) * item_width
        step_y_horizontal = cos(plane_angle / 2) * item_width
        step_x_vertical = sin(plane_angle / 2) * item_height
        step_y_vertical = cos(plane_angle / 2) * item_height

        # labels for x axis
        skip = max(1, int(len(intervals) / max_gridlines))
        for i in range(0, len(intervals)):
            if i % skip == 0:
                canvas.add(
                    Line(start=(gridline_x, gridline_y),
                         end=(gridline_x - y_axis_width,
                              gridline_y - y_axis_height),
                         stroke="grey",
                         stroke_width=0.25))

                # to properly position the rotated and skewed text a container
                # element is needed
                label1 = str(intervals[i])[0:4]
                center = (gridline_x, gridline_y)
                container = SVG(x=center[0] - 25,
                                y=center[1],
                                width="50",
                                height="1.5em",
                                overflow="visible",
                                style="font-size:0.8em;")
                container.add(
                    Text(insert=("25%", "100%"),
                         text=label1,
                         transform="rotate(%f) skewX(%f)" %
                         (-degrees(plane_obverse), degrees(plane_obverse)),
                         text_anchor="middle",
                         baseline_shift="-0.5em",
                         style="font-weight:bold;"))

                if re.match(r"^[0-9]{4}-[0-9]{2}", intervals[i]):
                    label2 = month_abbr[int(str(intervals[i])[5:7])]
                    if re.match(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}", intervals[i]):
                        label2 += " %i" % int(intervals[i][8:10])

                    container.add(
                        Text(insert=("25%", "150%"),
                             text=label2,
                             transform="rotate(%f) skewX(%f)" %
                             (-degrees(plane_obverse), degrees(plane_obverse)),
                             text_anchor="middle",
                             baseline_shift="-0.5em"))

                canvas.add(container)

            gridline_x += step_x_horizontal
            gridline_y -= step_y_horizontal

        # draw graphs as filled beziers
        top = step_y_vertical * 1.5
        graph_start_x = y_axis_width + margin_base
        graph_start_y = margin_top + canvas_height

        # draw graphs in reverse order, so the bottom one is most in the
        # foreground (in case of overlap)
        for graph in reversed(list(graphs)):
            self.dataset.update_status("Rendering graph for '%s'" % graph)

            # path starting at lower left corner of graph
            area_graph = Path(fill=self.colours[self.colour_index])
            area_graph.push("M %f %f" % (graph_start_x, graph_start_y))
            previous_value = None

            graph_x = graph_start_x
            graph_y = graph_start_y
            for interval in graphs[graph]:
                # normalise value
                value = graphs[graph][interval]
                try:
                    limit = limits[graph] if normalise_values else max_limit
                    value = top * copysign(abs(value) / limit, value)
                except ZeroDivisionError:
                    value = 0

                if previous_value is None:
                    # vertical line upwards to starting value of graph
                    area_graph.push("L %f %f" %
                                    (graph_start_x, graph_start_y - value))
                elif not smooth:
                    area_graph.push("L %f %f" % (graph_x, graph_y - value))
                else:
                    # quadratic bezier from previous value to current value
                    control_left = (graph_x - (step_x_horizontal / 2),
                                    graph_y + step_y_horizontal -
                                    previous_value - (step_y_horizontal / 2))
                    control_right = (graph_x - (step_x_horizontal / 2),
                                     graph_y - value + (step_y_horizontal / 2))
                    area_graph.push("C %f %f %f %f %f %f" %
                                    (*control_left, *control_right, graph_x,
                                     graph_y - value))

                previous_value = value
                graph_x += step_x_horizontal
                graph_y -= step_y_horizontal

            # line to the bottom of the graph at the current Y position
            area_graph.push(
                "L %f %f" %
                (graph_x - step_x_horizontal, graph_y + step_y_horizontal))
            area_graph.push("Z")  # then close the Path
            canvas.add(area_graph)

            # add text labels - skewing is a bit complicated and we need a
            # "center" to translate the origins properly.
            if label_in_graph:
                insert = (graph_start_x + 5, graph_start_y - 10)
            else:
                insert = (graph_x - (step_x_horizontal) + 5,
                          graph_y + step_y_horizontal - 10)

            # we need to take the skewing into account for the translation
            offset_y = tan(plane_obverse) * insert[0]
            canvas.add(
                Text(insert=(0, 0),
                     text=graph,
                     transform="skewY(%f) translate(%f %f)" %
                     (-degrees(plane_obverse), insert[0],
                      insert[1] + offset_y)))

            # cycle colours, back to the beginning if all have been used
            self.colour_index += 1
            if self.colour_index >= len(self.colours):
                self.colour_index = 0

            graph_start_x -= step_x_vertical
            graph_start_y -= step_y_vertical

        # draw gridlines - horizontal
        gridline_x = margin_base
        gridline_y = margin_top + canvas_height - y_axis_height
        for graph in graphs:
            gridline_x += step_x_vertical
            gridline_y += step_y_vertical
            canvas.add(
                Line(start=(gridline_x, gridline_y),
                     end=(gridline_x + x_axis_width,
                          gridline_y - x_axis_height),
                     stroke="black",
                     stroke_width=1))

        # x axis
        canvas.add(
            Line(start=(margin_base + y_axis_width,
                        margin_top + canvas_height),
                 end=(margin_base + canvas_width,
                      margin_top + canvas_height - x_axis_height),
                 stroke="black",
                 stroke_width=2))

        # and finally save the SVG
        canvas.save(pretty=True)
        self.dataset.finish(len(graphs))