Example #1
    def validate_query(query, request, user):
        """
        Validate input for a dataset query on the Douban data source.

        :param dict query:  Query parameters, from client-side.
        :param request:  Flask request
        :param User user:  User object of user who has submitted the query
        :return dict:  Safe query parameters
        """
        filtered_query = {}

        # the dates need to make sense as a range to search within
        after, before = query.get("daterange", (None, None))
        if before and after and before < after:
            raise QueryParametersException("Date range must start before it ends")

        filtered_query["min_date"], filtered_query["max_date"] = (after, before)

        # normalize groups to just their IDs, even if a URL was provided, and
        # limit to 25
        groups = [group.split("/group/").pop().split("/")[0].strip() for group in
                  query["groups"].replace("\n", ",").split(",")]
        groups = [group for group in groups if group][:25]
        if not any(groups):
            raise QueryParametersException("No valid groups were provided.")

        filtered_query["groups"] = ",".join(groups)

        # max amount of topics is 200 because after that Douban starts throwing 429s
        filtered_query["amount"] = max(min(convert_to_int(query["amount"], 10), 200), 1)

        # strip HTML from posts?
        filtered_query["strip"] = bool(query.get("strip", False))

        return filtered_query
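Every example on this page clamps user-supplied values with convert_to_int(value, default). The helper itself is not shown here; a minimal sketch of the behaviour these examples appear to rely on (the actual 4CAT implementation may differ):

    def convert_to_int(value, default=0):
        """Return value as an int, or default if it cannot be converted (sketch)."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return default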
Example #2
    def get_posts_complex(self, query):
        """
        Execute a query; get messages for given parameters

        :param dict query:  Query parameters, as part of the DataSet object
        :return list:  Posts, sorted by thread and post ID, in ascending order
        """
        self.eventloop = asyncio.new_event_loop()
        session_path = Path(__file__).parent.joinpath(
            "sessions", self.dataset.parameters.get("session"))

        client = None
        try:
            client = TelegramClient(str(session_path),
                                    self.dataset.parameters.get("api_id"),
                                    self.dataset.parameters.get("api_hash"),
                                    loop=self.eventloop)
            client.start()
        except Exception as e:
            self.dataset.update_status(
                "Error connecting to the Telegram API with provided credentials.",
                is_final=True)
            self.dataset.finish()
            if client and hasattr(client, "disconnect"):
                client.disconnect()
            return None

        # ready our parameters
        parameters = self.dataset.get_parameters()
        queries = [
            query.strip() for query in parameters.get("query", "").split(",")
        ]
        max_items = convert_to_int(parameters.get("items", 10), 10)

        # userinfo needs some work before it can be retrieved, something with
        # async method calls
        userinfo = False  # bool(parameters.get("scrape-userinfo", False))

        try:
            posts = self.gather_posts(client, queries, max_items, userinfo)
        except Exception as e:
            self.dataset.update_status("Error scraping posts from Telegram")
            self.log.error("Telegram scraping error: %s" %
                           traceback.format_exc())
            posts = None
        finally:
            client.disconnect()

        # delete personal data from parameters. We still have a Telegram
        # session saved to disk, but it's useless without this information.
        self.dataset.delete_parameter("api_id")
        self.dataset.delete_parameter("api_hash")
        self.dataset.delete_parameter("api_phone")

        return posts
Example #3
    def process(self):
        """
        This takes an image archive as input, annotates each image using the
        Google Vision API, and writes the results to an NDJSON file with one
        JSON object (tagged with the originating file name) per image
        """
        api_key = self.parameters.get("api_key")
        self.dataset.delete_parameter("api_key")  # sensitive, delete after use

        features = self.parameters.get("features", [])
        features = [{"type": feature} for feature in features]

        if not api_key:
            self.dataset.update_status("You need to provide a valid API key",
                                       is_final=True)
            self.dataset.finish(0)
            return

        max_images = convert_to_int(self.parameters.get("amount", 0), 100)
        total = self.source_dataset.num_rows if not max_images else min(
            max_images, self.source_dataset.num_rows)
        done = 0

        for image_file in self.iterate_archive_contents(self.source_file):
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while fetching data from Google Vision API")

            done += 1
            self.dataset.update_status("Annotating image %i/%i" %
                                       (done, total))

            try:
                annotations = self.annotate_image(image_file, api_key,
                                                  features)
            except RuntimeError:
                # cannot continue fetching, e.g. when API key is invalid
                break

            if not annotations:
                continue

            annotations = {"file_name": image_file.name, **annotations}

            with self.dataset.get_results_path().open(
                    "a", encoding="utf-8") as outfile:
                outfile.write(json.dumps(annotations) + "\n")

            if max_images and done >= max_images:
                break

        self.dataset.update_status("Annotations retrieved for %i images" %
                                   done)
        self.dataset.finish(done)
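The results file written above is newline-delimited JSON: one annotation object per image, each tagged with the originating file_name. A minimal sketch of reading it back (the file name is hypothetical):

    import json

    with open("google_vision_annotations.ndjson", encoding="utf-8") as infile:
        for line in infile:
            annotation = json.loads(line)
            print(annotation["file_name"], sorted(annotation.keys()))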
Example #4
    def get_processor_pipeline(self):
        """
        This queues a series of post-processors to annotate images

        First, the required amount of images referenced in the dataset is
        downloaded, in order of most-referenced; then, the requested
        features are extracted using the Google Vision API; finally, the result
        is converted to a CSV file for easy processing.
        """
        amount = convert_to_int(self.parameters.get("amount", 10), 10)
        api_key = self.parameters.get("api_key", "")
        features = self.parameters.get("features", "")

        self.dataset.delete_parameter(
            "api_key")  # sensitive, delete as soon as possible

        pipeline = [
            # first, extract top images
            {
                "type": "top-images",
                "parameters": {
                    "overwrite": False
                }
            },
            # then, download the images we want to annotate
            {
                "type": "image-downloader",
                "parameters": {
                    "amount": amount,
                    "overwrite": False
                }
            },
            # then, annotate the downloaded images with the Google Vision API
            {
                "type": "google-vision-api",
                "parameters": {
                    "features": features,
                    "amount": amount,
                    "api_key": api_key
                }
            },
            # finally, create a simplified CSV file from the annotation NDJSON (which can also be retrieved later)
            {
                "type": "convert-vision-to-csv",
                "parameters": {}
            }
        ]

        return pipeline
Example #5
	def process(self):
		"""
		Takes the thumbnails downloaded from YouTube metadata and
		turns them into an image wall.

		"""
		results_path = self.dataset.get_results_path()
		dirname = Path(results_path.parent, results_path.name.replace(".", ""))

		# Get the required parameters
		# path to the YouTube csv data that was the source of the thumbnails
		root_csv = self.dataset.get_genealogy()[-3].get_results_path()
		max_amount = convert_to_int(self.parameters.get("max_amount", 0), 0)
		category_overlay = self.parameters.get("category_overlay")

		# Build that wall!
		self.make_imagewall(root_csv, max_amount=max_amount, category_overlay=category_overlay)
Example #6
	def process(self):
		"""
		Unzips and appends tokens to fetch and write a tf-idf matrix
		"""

		# Validate and process user inputs
		library = self.parameters.get("library", "gensim")

		if "-" not in self.parameters.get("n_size"):
			n_size = convert_to_int(self.parameters.get("n_size", 1), 1) 
			n_size = (n_size, n_size) # needs to be a tuple for sklearn.
		else:
			n_size_split = self.parameters.get("n_size").split("-")
			n_size = (convert_to_int(n_size_split[0]), convert_to_int(n_size_split[1]))
		
		min_occurrences = convert_to_int(self.parameters.get("min_occurrences", 1), 1)
		max_occurrences = convert_to_int(self.parameters.get("max_occurrences", -1), -1)
		max_output = convert_to_int(self.parameters.get("max_output", 10), 10)
		smartirs = self.parameters.get("smartirs", "nfc")

		# Get token sets
		self.dataset.update_status("Processing token sets")
		tokens = []
		dates = []

		# Go through all archived token sets and generate collocations for each
		for token_file in self.iterate_archive_contents(self.source_file):
			# Get the date
			date_string = token_file.stem
			dates.append(date_string)

			# we support both pickle and json dumps of vectors
			token_unpacker = pickle if token_file.suffix == ".pb" else json

			try:
				with token_file.open("rb") as binary_tokens:
					# binary mode works for both the pickle and the JSON dumps
					post_tokens = token_unpacker.load(binary_tokens)

					# Flatten the list of list of tokens - we're treating the whole time series as one document.
					post_tokens = list(itertools.chain.from_iterable(post_tokens))

					# Add to all date's tokens
					tokens.append(post_tokens)

			except UnicodeDecodeError:
				self.dataset.update_status("Error reading input data. If it was imported from outside 4CAT, make sure it is encoded as UTF-8.", is_final=True)
				self.dataset.finish(0)
				return

		# Make sure `min_occurrences` and `max_occurrences` are valid
		if min_occurrences > len(tokens):
			min_occurrences = len(tokens) - 1
		if max_occurrences <= 0 or max_occurrences > len(tokens):
			max_occurrences = len(tokens)

		# Get the tf-idf matrix.
		self.dataset.update_status("Generating tf-idf for token set")
		try:

			if library == "gensim":
				results = self.get_tfidf_gensim(tokens, dates, top_n=max_output, smartirs=smartirs)
			elif library == "scikit-learn":
				results = self.get_tfidf_sklearn(tokens, dates, ngram_range=n_size, min_occurrences=min_occurrences,
								 max_occurrences=max_occurrences, top_n=max_output)
			else:
				self.dataset.update_status("Invalid library.")
				self.dataset.finish(0)
				return

			if results:
				# Generate csv and finish
				self.dataset.update_status("Writing to csv and finishing")
				self.write_csv_items_and_finish(results)

		except MemoryError:
			self.dataset.update_status("Out of memory - dataset too large to run tf-idf analysis.")
			self.dataset.finish(0)
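For reference, a standalone sketch of how the n_size parameter above is interpreted: a single value such as "2" becomes the tuple (2, 2), while a range such as "1-3" becomes (1, 3). It reuses the convert_to_int sketch given after Example #1:

    def parse_ngram_range(n_size, default=1):
        """Sketch: turn "2" into (2, 2) and "1-3" into (1, 3)."""
        if "-" not in n_size:
            n = convert_to_int(n_size, default)
            return (n, n)
        low, high = n_size.split("-")[:2]
        return (convert_to_int(low, default), convert_to_int(high, default))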
Example #7
    def get_items(self, query):
        """
        Get Douban posts

        In the case of Douban, there is no need for multiple pathways, so we
        can route it all to the one post query method. Will scrape posts from the
        most recent topics for a given list of groups. Douban prevents scraping
        old content, so this is mostly useful to get a sense of what a given
        group is talking about at the moment.

        :param query:  Filtered query parameters
        :return:
        """
        groups = query["groups"].split(",")
        max_topics = min(convert_to_int(query["amount"], 100), 500)
        start = query["min_date"]
        end = query["max_date"]
        strip = bool(query["strip"])
        topics_processed = 0
        posts_processed = 0

        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"}

        for group in groups:
            # get URL for group index
            group = str(group)
            group_url = "https://www.douban.com/group/%s/discussion?start=" % group

            offset = 0
            while True:
                # get list of topics in group, for the given offset
                fetch_url = group_url + str(offset)
                request = self.get_douban_url(fetch_url, headers=headers)

                # this would usually mean the group doesn't exist, or we hit some rate limit
                if request.status_code != 200:
                    self.dataset.update_status(
                        "Got response code %i for group %s. Continuing with next group..." % (request.status_code, group))
                    break

                self.dataset.update_status("Scraping group %s...")

                # parse the HTML and get links to individual topics, as well as group name
                overview_page = BeautifulSoup(request.text, 'html.parser')
                group_name = overview_page.select_one(".group-item .title a").text

                for topic in overview_page.select("table.olt tr:not(.th)"):
                    if self.interrupted:
                        raise ProcessorInterruptedException("Interrupted while scraping Douban topics")

                    if topics_processed >= max_topics:
                        break

                    # get topic URL, and whether it is an 'elite' topic
                    topic_url = topic.find("a").get("href")
                    topic_is_elite = "yes" if bool(topic.select_one(".elite_topic_lable")) else "no"
                    topic_id = topic_url.split("/topic/").pop().split("/")[0]
                    topic_updated = int(
                        datetime.datetime.strptime(topic.select_one(".time").text, "%m-%d %H:%M").timestamp())

                    # if a date range is given, ignore topics outside of it
                    if start and topic_updated < start:
                        continue

                    if end and topic_updated > end:
                        break

                    self.dataset.update_status("%i posts scraped. Scraping topics %i-%i from group %s" % (
                    posts_processed, offset, min(max_topics, offset + 50), group_name))

                    # request topic page - fortunately all comments are on a single page
                    topic_request = self.get_douban_url(topic_url, headers=headers)
                    time.sleep(5)  # don't hit rate limits
                    topic_page = BeautifulSoup(topic_request.text, 'html.parser')
                    topic = topic_page.select_one("#topic-content")

                    topics_processed += 1

                    # include original post as the first item
                    try:
                        first_post = {
                            "id": topic_id,
                            "group_id": group,
                            "thread_id": topic_id,
                            "group_name": group_name,
                            "subject": topic_page.select_one("h1").text.strip(),
                            "body": topic_page.select_one(".topic-richtext").decode_contents(formatter="html").strip(),
                            "author": topic.select_one(".user-face img").get("alt"),
                            "author_id": topic.select_one(".user-face a").get("href").split("/people/").pop().split("/")[0],
                            "author_avatar": topic.select_one(".user-face img").get("src").replace("/u", "/ul"),
                            "timestamp": int(datetime.datetime.strptime(topic.select_one(".create-time").text,
                                                                        "%Y-%m-%d %H:%M:%S").timestamp()),
                            "likes": 0,
                            "is_highlighted": "no",
                            "is_reply": "no",
                            "is_topic_elite": topic_is_elite,
                            "image_urls": ",".join([img.get("src") for img in topic.select(".topic-richtext img")])
                        }
                    except (AttributeError, ValueError):
                        self.dataset.log("Unexpected data format when parsing topic %s/%s, skipping" % (group_name, topic_id))
                        continue

                    if strip:
                        first_post["body"] = strip_tags(first_post["body"])

                    posts_processed += 1
                    yield first_post

                    # now loop through all comments on the page
                    for comment in topic_page.select("ul#comments > li"):
                        comment_data = {
                            "id": comment.get("data-cid"),
                            "group_id": group,
                            "thread_id": topic_id,
                            "group_name": group_name,
                            "subject": "",
                            "body": comment.select_one(".reply-content").decode_contents(formatter="html").strip(),
                            "author": comment.select_one(".user-face img").get("alt"),
                            "author_id":
                                comment.select_one(".user-face a").get("href").split("/people/").pop().split("/")[0],
                            "author_avatar": comment.select_one(".user-face img").get("src").replace("/u", "/ul"),
                            "timestamp": int(datetime.datetime.strptime(comment.select_one(".pubtime").text,
                                                                        "%Y-%m-%d %H:%M:%S").timestamp()),
                            "likes": convert_to_int(
                                re.sub(r"[^0-9]", "", comment.select_one(".comment-vote.lnk-fav").text), 0),
                            "is_highlighted": "yes" if comment.get("data-cid") in [hl.get("data-cid") for hl in
                                                                                   comment.select(
                                                                                       "ul#popular-comments li")] else "no",
                            "is_reply": "yes" if comment.select_one(".reply-quote-content") else "no",
                            "is_topic_elite": topic_is_elite,
                            "image_urls": ",".join([img.get("src") for img in comment.select(".reply-content img")])
                        }

                        if strip:
                            comment_data["body"] = strip_tags(comment_data["body"])

                        posts_processed += 1
                        yield comment_data

                if offset < max_topics - 50:
                    offset += 50
                else:
                    break
Example #8
    def process(self):
        """
        This takes previously generated Word2Vec models and uses them to find
        similar words based on a list of words
        """
        self.dataset.update_status("Processing sentences")

        words = [word.strip() for word in
                 self.parameters.get("words", "").split(",") if word.strip()]
        if not words:
            self.dataset.update_status(
                "No input words provided, cannot look for similar words.",
                is_final=True)
            self.dataset.finish(-1)
            return

        num_words = convert_to_int(self.parameters.get("num-words"),
                                   self.options["num-words"]["default"])
        try:
            threshold = float(
                self.parameters.get("threshold",
                                    self.options["threshold"]["default"]))
        except ValueError:
            threshold = float(self.options["threshold"]["default"])

        # prepare staging area
        temp_path = self.dataset.get_temporary_path()
        temp_path.mkdir()

        # go through all models and calculate similarity for all given input words
        result = []
        with zipfile.ZipFile(self.source_file, "r") as model_archive:
            model_files = model_archive.namelist()

            for model_file in model_files:
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while processing token sets")

                # the model is stored as [interval].model
                model_name = model_file.split("/")[-1]
                interval = model_name.split(".")[0]

                # temporarily extract file (gensim loads models from a path on disk, not from a ZipFile stream)
                temp_file = temp_path.joinpath(model_name)
                model_archive.extract(model_name, temp_path)

                # for each separate model, calculate top similar words for each
                # input word, giving us at most
                #   [max amount] * [number of input] * [number of intervals]
                # items
                self.dataset.update_status("Running model %s..." % model_name)
                model = Word2Vec.load(str(temp_file))
                for word in words:
                    similar_words = model.most_similar(positive=[word],
                                                       topn=num_words)
                    for similar_word in similar_words:
                        if similar_word[1] < threshold:
                            continue

                        result.append({
                            "date": interval,
                            "input": word,
                            "item": similar_word[0],
                            "value": similar_word[1]
                        })

                temp_file.unlink()

        # delete temporary folder
        shutil.rmtree(temp_path)

        self.write_csv_items_and_finish(result)
Example #9
    def process(self):
        """
        Reads a CSV file, counts occurrences of chosen values over all posts,
        and aggregates the results per chosen time frame
        """

        # convenience variables
        timeframe = self.parameters.get("timeframe",
                                        self.options["timeframe"]["default"])
        attribute = self.parameters.get("attribute",
                                        self.options["attribute"]["default"])
        rank_style = self.parameters.get("top-style",
                                         self.options["top-style"]["default"])
        cutoff = convert_to_int(
            self.parameters.get("top", self.options["top"]["default"]))

        try:
            filter = re.compile(self.parameters.get("regex", None))
        except (TypeError, re.error):
            self.dataset.update_status(
                "Could not complete: regular expression invalid")
            self.dataset.finish(0)
            return

        # This is needed to check for URLs in the "domain" and "url" columns for Reddit submissions
        datasource = self.parent.parameters.get("datasource")

        # we need to be able to order the values later, chronologically, so use
        # an OrderedDict; all frequencies go into this variable
        items = OrderedDict()

        # if we're interested in overall top-ranking items rather than a
        # per-period ranking, we need to do a first pass in which all posts are
        # inspected to determine those overall top-scoring items
        overall_top = {}
        if rank_style == "overall":
            self.dataset.update_status("Determining overall top-%i items" %
                                       cutoff)
            for post in self.iterate_csv_items(self.source_file):
                values = self.get_values(post, attribute, filter)
                for value in values:
                    if value not in overall_top:
                        overall_top[value] = 0

                    overall_top[value] += 1

            overall_top = sorted(overall_top,
                                 key=lambda item: overall_top[item],
                                 reverse=True)[0:cutoff]

        # now for the real deal
        self.dataset.update_status("Reading source file")
        for post in self.iterate_csv_items(self.source_file):
            # determine where to put this data
            if timeframe == "all":
                time_unit = "overall"
            else:
                try:
                    timestamp = int(
                        datetime.datetime.strptime(
                            post["timestamp"],
                            "%Y-%m-%d %H:%M:%S").timestamp())
                except ValueError:
                    timestamp = 0
                date = datetime.datetime.fromtimestamp(timestamp)
                if timeframe == "year":
                    time_unit = str(date.year)
                elif timeframe == "month":
                    time_unit = str(date.year) + "-" + str(date.month).zfill(2)
                else:
                    time_unit = str(date.year) + "-" + str(
                        date.month).zfill(2) + "-" + str(date.day).zfill(2)

            # again, we need to be able to sort, so OrderedDict it is
            if time_unit not in items:
                items[time_unit] = OrderedDict()

            # get values from post
            values = self.get_values(post, attribute, filter)

            # keep track of occurrences of found items per relevant time period
            for value in values:
                if rank_style == "overall" and value not in overall_top:
                    continue

                if value not in items[time_unit]:
                    items[time_unit][value] = 0

                items[time_unit][value] += 1

        # sort by time and frequency
        self.dataset.update_status("Sorting items")
        sorted_items = OrderedDict(
            (key, items[key]) for key in sorted(items.keys()))
        for time_unit in sorted_items:
            sorted_unit = OrderedDict(
                (item, sorted_items[time_unit][item]) for item in sorted(
                    sorted_items[time_unit],
                    reverse=True,
                    key=lambda key: sorted_items[time_unit][key]))
            sorted_items[time_unit].clear()
            sorted_items[time_unit].update(sorted_unit)

            if cutoff > 0:
                # OrderedDict's API sucks and really needs some extra
                # convenience methods
                sorted_items[time_unit] = OrderedDict(
                    islice(sorted_items[time_unit].items(), cutoff))

        # convert to flat list
        rows = []
        for time_unit in sorted_items:
            for item in sorted_items[time_unit]:
                row = {
                    "date": time_unit,
                    "item": item,
                    "frequency": sorted_items[time_unit][item]
                }

                rows.append(row)

        # write as csv
        if rows:
            self.write_csv_items_and_finish(rows)
        else:
            self.dataset.update_status(
                "No posts contain the requested attributes.")
            self.dataset.finish(0)
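As an illustration of the time bucketing above (not part of the processor itself), a post's timestamp string is reduced to a "year", "month" or "day" key roughly like this:

    import datetime

    def to_time_unit(timestamp_str, timeframe):
        """Sketch: "2021-03-05 14:30:00" -> "2021", "2021-03" or "2021-03-05"."""
        date = datetime.datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S")
        if timeframe == "year":
            return str(date.year)
        if timeframe == "month":
            return "%i-%02i" % (date.year, date.month)
        return "%i-%02i-%02i" % (date.year, date.month, date.day)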
Example #10
    def process(self):
        """
        This takes previously generated token sets as input and trains one
        word2vec model per token set (i.e. per time interval), collecting the
        models in a zip archive.
        """
        self.dataset.update_status("Processing sentences")

        use_skipgram = 1 if self.parameters.get(
            "algorithm") == "skipgram" else 0
        window = min(10, max(1, convert_to_int(self.parameters.get("window"))))
        use_negative = 5 if self.parameters.get("negative") else 0

        # prepare staging area
        temp_path = self.dataset.get_temporary_path()
        temp_path.mkdir()

        # go through all archived token sets and vectorise them
        models = 0
        with zipfile.ZipFile(self.source_file, "r") as token_archive:
            token_sets = token_archive.namelist()

            # create one model file per token file
            for token_set in token_sets:
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while processing token sets")

                # the model file's name will be based on the token set name,
                # i.e. 2020-08-01.json becomes 2020-08-01.model
                token_set_name = token_set.split("/")[-1]

                # temporarily extract file so it can be read from a regular path on disk rather than a ZipFile stream
                temp_file = temp_path.joinpath(token_set_name)
                token_archive.extract(token_set_name, temp_path)

                # use the "list of lists" as input for the word2vec model
                # by default the tokeniser generates one list of tokens per
                # post... which may actually be preferable for short
                # 4chan-style posts. But alternatively it could generate one
                # list per sentence - this processor is agnostic in that regard
                self.dataset.update_status(
                    "Training model for token set %s..." % token_set_name)
                with temp_file.open() as input:
                    model = Word2Vec(json.load(input),
                                     negative=use_negative,
                                     sg=use_skipgram,
                                     window=window)
                    model_name = token_set_name.split(".")[0] + ".model"
                    model.save(str(temp_path.joinpath(model_name)))
                    models += 1

                temp_file.unlink()

        # create another archive with all model files in it
        with zipfile.ZipFile(self.dataset.get_results_path(), "w") as zip:
            for output_path in temp_path.glob("*.model"):
                zip.write(output_path, output_path.name)
                output_path.unlink()

        # delete temporary folder
        shutil.rmtree(temp_path)

        self.dataset.update_status("Finished")
        self.dataset.finish(models)
Example #11
    def process(self):
        """
        Reads vector sets and creates a CSV with ranked vectors
        """

        # prepare staging area
        results_path = self.dataset.get_temporary_path()
        results_path.mkdir()

        self.dataset.update_status("Processing token sets")
        vector_paths = []

        # go through all archived token sets and vectorise them
        results = []

        def file_to_timestamp(file):
            """
            Get comparable datestamp value for token file

            Token files are named YYYY-m.pb. This function converts that to a
            YYYYmm string, then that string to an int, so that it may be
            compared for sorting chronologically.

            :param str file:  File name
            :return int:  Comparable datestamp
            """
            stem = file.split("/")[-1].split(".")[0].split("-")
            try:
                return int(stem[0] + stem[1].zfill(2))
            except (ValueError, IndexError):
                return 0

        # truncate results as needed
        rank_style = self.parameters.get("top-style",
                                         self.options["top-style"]["default"])
        cutoff = convert_to_int(
            self.parameters.get("top", self.options["top"]["default"]),
            self.options["top"]["default"])

        # now rank the vectors by most prevalent per "file" (i.e. interval)
        overall_top = {}
        with zipfile.ZipFile(self.source_file, "r") as token_archive:
            vector_sets = sorted(token_archive.namelist(),
                                 key=file_to_timestamp)
            index = 0

            for vector_set in vector_sets:
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while processing vector sets")

                index += 1
                vector_set_name = vector_set.split("/")[
                    -1]  # we don't need the full path
                self.dataset.update_status("Processing token set %i/%i" %
                                           (index, len(vector_sets)))

                # temporarily extract file so the pickle dump can be read from a regular path on disk
                temp_path = results_path.joinpath(vector_set_name)
                token_archive.extract(vector_set_name, results_path)
                with temp_path.open("rb") as binary_tokens:
                    # these were saved as pickle dumps so we need the binary mode
                    vectors = pickle.load(binary_tokens)
                temp_path.unlink()

                vectors = sorted(vectors, key=lambda x: x[1], reverse=True)

                # for overall ranking we need the full vector space per interval
                # because maybe an overall top-ranking vector is at the bottom
                # in this particular interval - we'll truncate the top list at
                # a later point in that case. Else, truncate it here
                if rank_style == "per-item":
                    vectors = vectors[0:cutoff]

                for vector in vectors:
                    if not vector[0].strip():
                        continue

                    results.append({
                        "date": vector_set_name.split(".")[0],
                        "item": vector[0],
                        "frequency": vector[1]
                    })

                    if vector[0] not in overall_top:
                        overall_top[vector[0]] = 0

                    overall_top[vector[0]] += int(vector[1])

        # this eliminates all items from the results that were not in the
        # *overall* top-occuring items. This only has an effect when vectors
        # were generated for multiple intervals
        if rank_style == "overall":
            overall_top = {
                item: overall_top[item]
                for item in sorted(overall_top,
                                   key=lambda x: overall_top[x],
                                   reverse=True)[0:cutoff]
            }
            filtered_results = []
            for item in results:
                if item["item"] in overall_top:
                    filtered_results.append(item)

            results = filtered_results

        # delete temporary files and folder
        shutil.rmtree(results_path)

        # done!
        self.dataset.update_status("Writing results file")
        with open(self.dataset.get_results_path(), "w",
                  encoding="utf-8") as output:
            writer = csv.DictWriter(output,
                                    fieldnames=("date", "item", "frequency"))
            writer.writeheader()
            for row in results:
                writer.writerow(row)

        self.dataset.update_status("Finished")
        self.dataset.finish(len(results))
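A quick illustration of the file_to_timestamp() helper above: the month part is zero-padded before both parts are concatenated into an integer, so archive members sort chronologically rather than alphabetically.

    file_to_timestamp("2020-8.pb")    # 202008
    file_to_timestamp("2019-12.pb")   # 201912, sorts before 202008 as intended
    file_to_timestamp("malformed")    # 0 (fallback for unparseable names)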
Example #12
    async def execute_queries(self):
        """
        Get messages for queries

        This is basically what would be done in get_items(), except due to
        Telethon's architecture this needs to be called in an async method,
        which is this one.
        """
        # session file has been created earlier, and we can re-use it here in
        # order to avoid having to re-enter the security code
        query = self.parameters

        hash_base = query["api_phone"].replace(
            "+", "") + query["api_id"] + query["api_hash"]
        session_id = hashlib.blake2b(hash_base.encode("ascii")).hexdigest()
        session_path = Path(__file__).parent.joinpath("sessions",
                                                      session_id + ".session")

        client = None

        def cancel_start():
            """
            Replace interactive phone number input in Telethon

            By default, if Telethon cannot use the given session file to
            authenticate, it will interactively prompt the user for a phone
            number on the command line. That is not useful here, so instead
            raise a RuntimeError. This will be caught below and the user will
            be told they need to re-authenticate via 4CAT.
            """
            raise RuntimeError("Connection cancelled")

        try:
            client = TelegramClient(str(session_path),
                                    int(query.get("api_id")),
                                    query.get("api_hash"),
                                    loop=self.eventloop)
            await client.start(phone=cancel_start)
        except RuntimeError:
            # session is no longer useable, delete file so user will be asked
            # for security code again
            self.dataset.update_status(
                "Session is not authenticated: login security code may have expired. You need to re-enter the security code.",
                is_final=True)
            session_path.unlink(missing_ok=True)
            if client and hasattr(client, "disconnect"):
                await client.disconnect()
            return None
        except Exception as e:
            self.dataset.update_status(
                "Error connecting to the Telegram API with provided credentials.",
                is_final=True)
            if client and hasattr(client, "disconnect"):
                await client.disconnect()
            return None

        # ready our parameters
        parameters = self.dataset.get_parameters()
        queries = [
            query.strip() for query in parameters.get("query", "").split(",")
        ]
        max_items = convert_to_int(parameters.get("items", 10), 10)

        try:
            posts = await self.gather_posts(client, queries, max_items)
        except Exception as e:
            self.dataset.update_status("Error scraping posts from Telegram")
            self.log.error("Telegram scraping error: %s" %
                           traceback.format_exc())
            posts = None
        finally:
            await client.disconnect()

        return posts
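Example #2 stores an event loop on self.eventloop before building the client; an async entry point like execute_queries() would presumably be driven from synchronous code along these lines (a sketch of the general asyncio pattern, not the actual 4CAT wiring):

    def get_items(self, query):
        # hypothetical synchronous wrapper; assumes `import asyncio` as in Example #2
        self.eventloop = asyncio.new_event_loop()
        try:
            return self.eventloop.run_until_complete(self.execute_queries())
        finally:
            self.eventloop.close()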
Example #13
    def process(self):
        # parse parameters
        input_words = self.parameters.get("words", "")
        if not input_words.strip():
            self.dataset.update_status(
                "No input words provided, cannot look for similar words.",
                is_final=True)
            self.dataset.finish(0)
            return

        input_words = [word.strip() for word in input_words.split(",") if word.strip()]

        try:
            threshold = float(
                self.parameters.get("threshold",
                                    self.options["threshold"]["default"]))
        except ValueError:
            threshold = float(self.options["threshold"]["default"])

        threshold = max(-1.0, min(1.0, threshold))
        num_words = convert_to_int(self.parameters.get("num-words"),
                                   self.options["num-words"]["default"])
        overlay = self.parameters.get("overlay")
        reduction_method = self.parameters.get("method")
        all_words = self.parameters.get("all-words")

        # load model files and initialise
        self.dataset.update_status("Unpacking word embedding models")
        staging_area = self.unpack_archive_contents(self.source_file)
        common_vocab = None
        vector_size = None
        models = {}

        # find words that are common to all models
        self.dataset.update_status("Determining cross-model common vocabulary")
        for model_file in staging_area.glob("*.model"):
            if self.interrupted:
                shutil.rmtree(staging_area)
                raise ProcessorInterruptedException(
                    "Interrupted while processing word embedding models")

            model = KeyedVectors.load(str(model_file)).wv
            models[model_file.stem] = model
            if vector_size is None:
                vector_size = model.vector_size  # needed later for dimensionality reduction

            if common_vocab is None:
                common_vocab = set(model.vocab.keys())
            else:
                common_vocab &= set(model.vocab.keys())  # intersect

        # sort common vocabulary by combined frequency across all models
        # this should make filtering for common words a bit faster further down
        self.dataset.update_status("Sorting vocabulary")
        common_vocab = list(common_vocab)
        common_vocab.sort(key=lambda w: sum(
            [model.vocab[w].count for model in models.values()]),
                          reverse=True)

        # initial boundaries of 2D space (to be adjusted later based on t-sne
        # outcome)
        max_x = 0.0 - sys.float_info.max
        max_y = 0.0 - sys.float_info.max
        min_x = sys.float_info.max
        min_y = sys.float_info.max

        # for each model, find the words that we may want to plot - these are
        # the nearest neighbours for the given query words
        relevant_words = {}

        # the vectors need to be reduced all at once - but the vectors are
        # grouped by model. To solve this, keep one numpy array of vectors,
        # but also keep track of which indexes of this array belong to which
        # model, by storing the index of the first vector for a model
        vectors = numpy.empty((0, vector_size))
        vector_offsets = {}

        # now process each model
        for model_name, model in models.items():
            relevant_words[model_name] = set()  # query words plus their nearest neighbours in this model
            self.dataset.update_status("Finding similar words in model '%s'" %
                                       model_name)

            for query in input_words:
                if query not in model.vocab:
                    self.dataset.update_status(
                        "Query '%s' was not found in model %s; cannot find nearest neighbours."
                        % (query, model_name),
                        is_final=True)
                    self.dataset.finish(0)
                    return

                if self.interrupted:
                    shutil.rmtree(staging_area)
                    raise ProcessorInterruptedException(
                        "Interrupted while finding similar words")

                # use a larger sample (topn) than required since some of the
                # nearest neighbours may not be in the common vocabulary and
                # will therefore need to be ignored
                context = set([
                    word[0] for word in model.most_similar(query, topn=1000)
                    if word[0] in common_vocab and word[1] >= threshold
                ][:num_words])

                relevant_words[model_name] |= {
                    query
                } | context  # always include query word

        # now do another loop to determine which words to plot for each model
        # this is either the same as relevant_words, or a superset which
        # combines all relevant words for all models
        plottable_words = {}
        last_model = max(relevant_words.keys())
        all_relevant_words = set().union(*relevant_words.values())

        for model_name, words in relevant_words.items():
            plottable_words[model_name] = []
            vector_offsets[model_name] = len(vectors)

            # determine which words to plot for this model. either the nearest
            # neighbours for this model, or all nearest neighbours found across
            # all models
            words_to_include = all_relevant_words if all_words else relevant_words[
                model_name]

            for word in words_to_include:
                if word in plottable_words[model_name] or (
                        not overlay and model_name != last_model
                        and word not in input_words):
                    # only plot each word once per model, or if 'overlay'
                    # is not set, only once overall (for the most recent
                    # model)
                    continue

                vector = models[model_name][word]
                plottable_words[model_name].append(word)
                vectors = numpy.append(vectors, [vector], axis=0)

        del models  # no longer needed

        # reduce the vectors of all words to be plotted for this model to
        # a two-dimensional coordinate with the previously initialised tsne
        # transformer. here the two-dimensional vectors are interpreted as
        # cartesian coordinates
        if reduction_method == "PCA":
            pca = PCA(n_components=2, random_state=0)
            vectors = pca.fit_transform(vectors)
        elif reduction_method == "t-SNE":
            # initialise t-sne transformer
            # parameters taken from Hamilton et al.
            # https://github.com/williamleif/histwords/blob/master/viz/common.py
            tsne = TSNE(n_components=2,
                        random_state=0,
                        learning_rate=150,
                        init="pca")
            vectors = tsne.fit_transform(vectors)
        elif reduction_method == "TruncatedSVD":
            # standard sklearn parameters made explicit
            svd = TruncatedSVD(n_components=2,
                               algorithm="randomized",
                               n_iter=5,
                               random_state=0)
            vectors = svd.fit_transform(vectors)
        else:
            shutil.rmtree(staging_area)
            self.dataset.update_status(
                "Invalid dimensionality reduction technique selected",
                is_final=True)
            self.dataset.finish(0)
            return

        # also keep track of the boundaries of our 2D space, so we can plot
        # them properly later
        for position in vectors:
            max_x = max(max_x, position[0])
            max_y = max(max_y, position[1])
            min_x = min(min_x, position[0])
            min_y = min(min_y, position[1])

        # now we know for each model which words should be plotted and at what
        # position
        # with this knowledge, we can normalize the positions, and start
        # plotting them in a graph

        # a palette generated with https://medialab.github.io/iwanthue/
        colours = [
            "#d58eff", "#cf9000", "#3391ff", "#a15700", "#911ca7", "#00ddcb",
            "#cc25a9", "#d5c776", "#6738a8", "#ff9470", "#47c2ff", "#a4122c",
            "#00b0ca", "#9a0f76", "#ff70c8", "#713c88"
        ]
        colour_index = 0

        # make sure all coordinates are positive
        max_x -= min_x
        max_y -= min_y

        # determine graph dimensions and proportions
        width = 1000  # arbitrary
        height = width * (max_y / max_x)  # retain proportions
        scale = width / max_x

        # margin around the plot to give room for labels and to look better
        margin = width * 0.1
        width += 2 * margin
        height += 2 * margin

        # normalize all known positions to fit within the graph
        vectors = [(margin + ((position[0] - min_x) * scale),
                    margin + ((position[1] - min_y) * scale))
                   for position in vectors]

        # now all positions are finalised, we can determine the "journey" of
        # each query - the sequence of positions in the graph it takes, so we
        # can draw lines from position to position later
        journeys = {}
        for query in input_words:
            journeys[query] = []
            for model_name, words in plottable_words.items():
                index = words.index(query)
                journeys[query].append(vectors[vector_offsets[model_name] +
                                               index])

        # font sizes proportional to width (which is static and thus predictable)
        fontsize_large = width / 50
        fontsize_normal = width / 75
        fontsize_small = width / 100

        # now we have the dimensions, the canvas can be instantiated
        model_type = self.source_dataset.parameters.get(
            "model-type", "word2vec")
        canvas = get_4cat_canvas(
            self.dataset.get_results_path(),
            width,
            height,
            header="%s nearest neighbours (fitting: %s) - '%s'" %
            (model_type, reduction_method, ",".join(input_words)),
            fontsize_normal=fontsize_normal,
            fontsize_large=fontsize_large,
            fontsize_small=fontsize_small)

        # use colour-coded backgrounds to distinguish the query words in the
        # graph, each model (= interval) with a separate colour
        for model_name in plottable_words:
            solid = Filter(id="solid-%s" % model_name)
            solid.feFlood(flood_color=colours[colour_index])
            solid.feComposite(in_="SourceGraphic")
            canvas.defs.add(solid)
            colour_index += 1

        # now plot each word for each model
        self.dataset.update_status("Plotting graph")
        words = SVG(insert=(0, 0), size=(width, height))
        queries = SVG(insert=(0, 0), size=(width, height))
        colour_index = 0

        for model_name, labels in plottable_words.items():
            positions = vectors[
                vector_offsets[model_name]:vector_offsets[model_name] +
                len(labels)]

            label_index = 0
            for position in positions:
                word = labels[label_index]
                is_query = word in input_words
                label_index += 1

                filter = ("url(#solid-%s)" %
                          model_name) if is_query else "none"
                colour = "#FFF" if is_query else colours[colour_index]
                fontsize = fontsize_normal if is_query else fontsize_small

                if word in input_words:
                    word += " (" + model_name + ")"

                label_container = SVG(insert=position,
                                      size=(1, 1),
                                      overflow="visible")
                label_container.add(
                    Text(insert=("50%", "50%"),
                         text=word,
                         dominant_baseline="middle",
                         text_anchor="middle",
                         style="fill:%s;font-size:%ipx" % (colour, fontsize),
                         filter=filter))

                # we make sure the queries are always rendered on top by
                # putting them in a separate SVG container
                if is_query:
                    queries.add(label_container)
                else:
                    words.add(label_container)

            colour_index = 0 if colour_index >= len(colours) - 1 else colour_index + 1

        # plot a line between positions for query words
        lines = SVG(insert=(0, 0), size=(width, height))
        for query, journey in journeys.items():
            previous_position = None
            for position in journey:
                if previous_position is None:
                    previous_position = position
                    continue

                lines.add(
                    Line(start=previous_position,
                         end=position,
                         stroke="#CE1B28",
                         stroke_width=2))
                previous_position = position

        canvas.add(lines)
        canvas.add(words)
        canvas.add(queries)

        canvas.save(pretty=True)
        shutil.rmtree(staging_area)
        self.dataset.finish(len(journeys))
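The canvas maths near the end of the example can be read in isolation: points are shifted so the minimum sits at zero, scaled so the x-extent spans the chosen width, and offset by a 10% margin. A small sketch of that normalisation, assumed to mirror the code above:

    def normalise(points, width=1000):
        """Sketch: map raw 2D positions into canvas coordinates with a margin."""
        min_x = min(p[0] for p in points)
        max_x = max(p[0] for p in points)
        min_y = min(p[1] for p in points)
        scale = width / (max_x - min_x)
        margin = width * 0.1
        return [(margin + (x - min_x) * scale, margin + (y - min_y) * scale)
                for x, y in points]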
Example #14
    def get_items(self, query):
        """
        Use the Twitter v2 API historical search to get tweets

        :param query:
        :return:
        """
        # this is pretty sensitive so delete it immediately after storing in
        # memory
        bearer_token = self.parameters.get("api_bearer_token")
        auth = {"Authorization": "Bearer %s" % bearer_token}

        endpoint = "https://api.twitter.com/2/tweets/search/all"

        # these are all expansions and fields available at the time of writing
        # since it does not cost anything extra in terms of rate limiting, go
        # for as much data per tweet as possible...
        tweet_fields = ("attachments", "author_id", "context_annotations",
                        "conversation_id", "created_at", "entities", "geo",
                        "id", "in_reply_to_user_id", "lang", "public_metrics",
                        "possibly_sensitive", "referenced_tweets",
                        "reply_settings", "source", "text", "withheld")
        user_fields = ("created_at", "description", "entities", "id",
                       "location", "name", "pinned_tweet_id",
                       "profile_image_url", "protected", "public_metrics",
                       "url", "username", "verified", "withheld")
        place_fields = ("contained_within", "country", "country_code",
                        "full_name", "geo", "id", "name", "place_type")
        poll_fields = ("duration_minutes", "end_datetime", "id", "options",
                       "voting_status")
        expansions = ("attachments.poll_ids", "attachments.media_keys",
                      "author_id", "entities.mentions.username",
                      "geo.place_id", "in_reply_to_user_id",
                      "referenced_tweets.id", "referenced_tweets.id.author_id")
        media_fields = ("duration_ms", "height", "media_key",
                        "non_public_metrics", "organic_metrics",
                        "preview_image_url", "promoted_metrics",
                        "public_metrics", "type", "url", "width")
        amount = convert_to_int(self.parameters.get("amount"), 10)

        params = {
            "query": self.parameters.get("query", ""),
            "expansions": ",".join(expansions),
            "tweet.fields": ",".join(tweet_fields),
            "user.fields": ",".join(user_fields),
            "poll.fields": ",".join(poll_fields),
            "place.fields": ",".join(place_fields),
            "media.fields": ",".join(media_fields),
            "max_results": max(10, min(amount, 500))
            if amount > 0 else 500,  # 500 = upper limit, 10 = lower
        }

        if self.parameters.get("min_date"):
            params["start_time"] = datetime.datetime.fromtimestamp(
                self.parameters["min_date"]).strftime("%Y-%m-%dT%H:%M:%SZ")

        if self.parameters.get("max_date"):
            params["end_time"] = datetime.datetime.fromtimestamp(
                self.parameters["max_date"]).strftime("%Y-%m-%dT%H:%M:%SZ")

        tweets = 0
        self.dataset.log("Search parameters: %s" % repr(params))
        while True:
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while getting tweets from the Twitter API")

            # there is a limit of one request per second, so stay on the safe side of this
            while self.previous_request == int(time.time()):
                time.sleep(0.1)
            time.sleep(0.05)
            self.previous_request = int(time.time())

            # now send the request, retrying up to 5 times if the connection seems unstable
            retries = 5
            api_response = None
            while retries > 0:
                try:
                    api_response = requests.get(endpoint,
                                                headers=auth,
                                                params=params)
                    break
                except (ConnectionError,
                        requests.exceptions.RequestException) as e:
                    retries -= 1
                    wait_time = (5 - retries) * 10
                    self.dataset.update_status(
                        "Got %s, waiting %i seconds before retrying" %
                        (str(e), wait_time))
                    time.sleep(wait_time)

            # if the request failed even after retries, give up
            if api_response is None:
                self.dataset.update_status(
                    "Could not connect to Twitter. Cancelling.", is_final=True)
                return

            # rate limited - the limit at time of writing is 300 reqs per 15
            # minutes; usually you don't hit this when requesting batches of
            # 500 at 1/second
            if api_response.status_code == 429:
                resume_at = convert_to_int(
                    api_response.headers["x-rate-limit-reset"]) + 1
                resume_at_str = datetime.datetime.fromtimestamp(
                    int(resume_at)).strftime("%c")
                self.dataset.update_status(
                    "Hit Twitter rate limit - waiting until %s to continue." %
                    resume_at_str)
                while time.time() <= resume_at:
                    time.sleep(0.5)
                continue

            # API keys that are valid but don't have access or haven't been
            # activated properly get a 403
            elif api_response.status_code == 403:
                try:
                    structured_response = api_response.json()
                    self.dataset.update_status(
                        "'Forbidden' error from Twitter API. Could not connect to Twitter API "
                        "with this API key. %s" %
                        structured_response.get("detail", ""),
                        is_final=True)
                except (json.JSONDecodeError, ValueError):
                    self.dataset.update_status(
                        "'Forbidden' error from Twitter API. Your key may not have access to "
                        "the full-archive search endpoint.",
                        is_final=True)
                finally:
                    return

            # sometimes twitter says '503 service unavailable' for unclear
            # reasons - in that case just wait a while and try again
            elif api_response.status_code in (502, 503, 504):
                resume_at = time.time() + 60
                resume_at_str = datetime.datetime.fromtimestamp(
                    int(resume_at)).strftime("%c")
                self.dataset.update_status(
                    "Twitter unavailable (status %i) - waiting until %s to continue."
                    % (api_response.status_code, resume_at_str))
                while time.time() <= resume_at:
                    time.sleep(0.5)
                continue

            # this usually means the query is too long or otherwise contains
            # a syntax error
            elif api_response.status_code == 400:
                msg = "Response %i from the Twitter API; " % api_response.status_code
                try:
                    api_response = api_response.json()
                    msg += api_response.get("title", "")
                    if "detail" in api_response:
                        msg += ": " + api_response.get("detail", "")
                except (json.JSONDecodeError, TypeError):
                    msg += "Some of your parameters (e.g. date range) may be invalid."

                self.dataset.update_status(msg, is_final=True)
                return

            # invalid API key
            elif api_response.status_code == 401:
                self.dataset.update_status(
                    "Invalid API key - could not connect to Twitter API",
                    is_final=True)
                return

            # haven't seen one yet, but they probably exist
            elif api_response.status_code != 200:
                self.dataset.update_status(
                    "Unexpected HTTP status %i. Halting tweet collection." %
                    api_response.status_code,
                    is_final=True)
                self.log.warning(
                    "Twitter API v2 responded with status code %i. Response body: %s"
                    % (api_response.status_code, api_response.text))
                return

            api_response = api_response.json()

            # The API response contains tweets (of course) and 'includes',
            # objects that can be referenced in tweets. Later we will splice
            # this data into the tweets themselves to make them easier to
            # process. So extract them first...
            included_users = api_response.get("includes", {}).get("users", {})
            included_media = api_response.get("includes", {}).get("media", {})
            included_polls = api_response.get("includes", {}).get("polls", {})
            included_tweets = api_response.get("includes", {}).get("tweets", {})
            included_places = api_response.get("includes", {}).get("places", {})

            for tweet in api_response.get("data", []):
                if 0 < amount <= tweets:
                    break

                # splice referenced data back in
                # we use copy.deepcopy here because else we run into a
                # pass-by-reference quagmire
                tweet = self.enrich_tweet(tweet, included_users,
                                          included_media, included_polls,
                                          included_places,
                                          copy.deepcopy(included_tweets))

                tweets += 1
                if tweets % 500 == 0:
                    self.dataset.update_status(
                        "Received %i tweets from Twitter API" % tweets)

                yield tweet

            # paginate
            if (amount <= 0 or tweets < amount) and api_response.get(
                    "meta") and "next_token" in api_response["meta"]:
                params["next_token"] = api_response["meta"]["next_token"]
            else:
                break
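Note: the retry, rate-limit and pagination handling above can be reduced to a compact standalone sketch. This is an illustration only, not 4CAT code; the endpoint URL, bearer token and parameters are placeholders to be filled in.

import time
import requests

def paginated_search(endpoint, bearer_token, params, max_items=500):
    """Yield items from a cursor-paginated API, sleeping through 429 responses."""
    headers = {"Authorization": "Bearer %s" % bearer_token}
    collected = 0

    while True:
        response = requests.get(endpoint, headers=headers, params=params)

        if response.status_code == 429:
            # wait until the rate limit window resets, as advertised by the API
            resume_at = int(response.headers.get("x-rate-limit-reset", time.time() + 60)) + 1
            time.sleep(max(0, resume_at - time.time()))
            continue

        response.raise_for_status()
        page = response.json()

        for item in page.get("data", []):
            yield item
            collected += 1
            if collected >= max_items:
                return

        # follow the pagination cursor, if any
        next_token = page.get("meta", {}).get("next_token")
        if not next_token:
            return
        params["next_token"] = next_token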
Example No. 15
    def process(self):
        """
		This takes a 4CAT results file as input, and outputs a number of files containing
		tokenised posts, grouped per time unit as specified in the parameters.
		"""
        self.dataset.update_status("Processing sentences")

        use_skipgram = 1 if self.parameters.get(
            "algorithm") == "skipgram" else 0
        window = min(
            10,
            max(
                1,
                convert_to_int(self.parameters.get("window"),
                               int(self.options["window"]["default"]))))
        use_negative = 5 if self.parameters.get("negative") else 0
        min_count = max(
            1,
            convert_to_int(self.parameters.get("min_count"),
                           self.options["min_count"]["default"]))
        dimensionality = convert_to_int(self.parameters.get("dimensionality"),
                                        100)
        detect_bigrams = self.parameters.get("detect-bigrams")
        model_type = self.parameters.get("model-type")

        if not model_type:
            model_type = self.options["model-type"]["default"]

        staging_area = self.dataset.get_staging_area()
        model_builder = {
            "Word2Vec": Word2Vec,
            "FastText": FastText
        }[model_type]

        # go through all archived token sets and vectorise them
        models = 0
        for temp_file in self.iterate_archive_contents(self.source_file):
            # use the "list of lists" as input for the word2vec model
            # by default the tokeniser generates one list of tokens per
            # post... which may actually be preferable for short
            # 4chan-style posts. But alternatively it could generate one
            # list per sentence - this processor is agnostic in that regard
            token_set_name = temp_file.name
            self.dataset.update_status(
                "Extracting bigrams from token set %s..." % token_set_name)

            try:
                if detect_bigrams:
                    bigram_transformer = Phrases(
                        self.tokens_from_file(temp_file, staging_area))
                    bigram_transformer = Phraser(bigram_transformer)
                else:
                    bigram_transformer = None

                self.dataset.update_status(
                    "Training %s model for token set %s..." %
                    (model_builder.__name__, token_set_name))
                try:
                    model = model_builder(negative=use_negative,
                                          size=dimensionality,
                                          sg=use_skipgram,
                                          window=window,
                                          workers=3,
                                          min_count=min_count)

                    # we do not simply pass a sentences argument to model builder
                    # because we are using a generator, which exhausts, while
                    # Word2Vec needs to iterate over the sentences twice
                    # https://stackoverflow.com/a/57632747
                    model.build_vocab(
                        self.tokens_from_file(temp_file,
                                              staging_area,
                                              phraser=bigram_transformer))
                    model.train(self.tokens_from_file(
                        temp_file, staging_area, phraser=bigram_transformer),
                                epochs=model.iter,
                                total_examples=model.corpus_count)

                except RuntimeError as e:
                    if "you must first build vocabulary before training the model" in str(
                            e):
                        # not enough data. Skip - if this happens for all models
                        # an error will be generated later
                        continue
                    else:
                        raise e

            except UnicodeDecodeError:
                self.dataset.update_status(
                    "Error reading input data. If it was imported from outside 4CAT, make sure it is encoded as UTF-8.",
                    is_final=True)
                self.dataset.finish(0)
                return

            # save - we only save the KeyedVectors for the model, this
            # saves space and we don't need to re-train the model later
            model_name = token_set_name.split(".")[0] + ".model"
            model.wv.save(str(staging_area.joinpath(model_name)))

            # we're done with this model; free the memory
            del model
            models += 1

        if models == 0:
            self.dataset.update_status(
                "Not enough data in source file to train %s models." %
                model_builder.__name__)
            shutil.rmtree(staging_area)
            self.dataset.finish(0)
            return

        # create another archive with all model files in it
        self.dataset.update_status("%s model(s) saved." %
                                   model_builder.__name__)
        self.write_archive_and_finish(staging_area)
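As the comment about exhausted generators suggests, an alternative is a re-iterable corpus object, so that gensim can be given a sentences argument directly and handle build_vocab() and train() itself. A minimal sketch, assuming the token dumps are JSON files containing a list of token lists; TokenCorpus is a hypothetical helper, not part of 4CAT.

import json
from pathlib import Path

class TokenCorpus:
    """Re-iterable corpus: __iter__ re-opens the token dump on every pass."""

    def __init__(self, path, phraser=None):
        self.path = Path(path)  # JSON dump: a list of lists of tokens
        self.phraser = phraser  # optional gensim Phraser for bigram merging

    def __iter__(self):
        with self.path.open(encoding="utf-8") as infile:
            for tokens in json.load(infile):
                yield self.phraser[tokens] if self.phraser else tokens

With such a wrapper, model_builder(sentences=TokenCorpus(path), ...) would iterate over the corpus as many times as it needs.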
Example No. 16
    def process(self):
        """
		This takes previously generated Word2Vec models and uses them to find
		similar words based on a list of words
		"""
        self.dataset.update_status("Processing sentences")

        depth = max(
            1,
            min(
                3,
                convert_to_int(
                    self.parameters.get(
                        "crawl_depth", self.options["crawl_depth"]["default"]),
                    self.options["crawl_depth"]["default"])))
        input_words = self.parameters.get("words", "")
        input_words = [word.strip() for word in input_words.split(",")
                       if word.strip()]
        if not input_words:
            self.dataset.update_status(
                "No input words provided, cannot look for similar words.",
                is_final=True)
            self.dataset.finish(0)
            return

        num_words = convert_to_int(self.parameters.get("num-words"),
                                   self.options["num-words"]["default"])
        try:
            threshold = float(
                self.parameters.get("threshold",
                                    self.options["threshold"]["default"]))
        except ValueError:
            threshold = float(self.options["threshold"]["default"])

        threshold = max(-1.0, min(1.0, threshold))

        # go through all models and calculate similarity for all given input words
        result = []
        staging_area = self.unpack_archive_contents(self.source_file)
        for model_file in staging_area.glob("*.model"):
            interval = model_file.stem

            # for each separate model, calculate top similar words for each
            # input word, giving us at most
            #   [max amount] * [number of input] * [number of intervals]
            # items
            self.dataset.update_status("Running model %s..." % model_file.name)
            model = KeyedVectors.load(str(model_file))
            word_queue = set()
            checked_words = set()
            level = 1

            words = input_words.copy()
            while words:
                if self.interrupted:
                    shutil.rmtree(staging_area)
                    raise ProcessorInterruptedException(
                        "Interrupted while extracting similar words")

                word = words.pop()
                checked_words.add(word)

                try:
                    similar_words = model.most_similar(positive=[word],
                                                       topn=num_words)
                except KeyError:
                    continue

                for similar_word in similar_words:
                    if similar_word[1] < threshold:
                        continue

                    result.append({
                        "date": interval,
                        "input": word,
                        "item": similar_word[0],
                        "value": similar_word[1],
                        "input_occurences": model.vocab[word].count,
                        "item_occurences": model.vocab[similar_word[0]].count,
                        "depth": level
                    })

                    # queue word for the next iteration if there is one and
                    # it hasn't been seen yet
                    if level < depth and similar_word[0] not in checked_words:
                        word_queue.add(similar_word[0])

                # if all words have been checked, but we still have an
                # iteration to go, load the queued words into the list
                if not words and word_queue and level < depth:
                    level += 1
                    words = word_queue.copy()
                    word_queue = set()

        shutil.rmtree(staging_area)

        if not result:
            self.dataset.update_status(
                "None of the words were found in the word embedding model.",
                is_final=True)
            self.dataset.finish(0)
        else:
            self.write_csv_items_and_finish(result)
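Stripped of the 4CAT bookkeeping, the depth-based expansion above is a breadth-first walk over most_similar() neighbours. A condensed sketch with made-up defaults (the model path is a placeholder):

from gensim.models import KeyedVectors

def expand_neighbours(model_path, seed_words, depth=2, topn=10, threshold=0.6):
    """Breadth-first expansion of similar words, level by level."""
    model = KeyedVectors.load(model_path)
    seen = set(seed_words)
    frontier = list(seed_words)
    results = []

    for level in range(1, depth + 1):
        next_frontier = []
        for word in frontier:
            try:
                neighbours = model.most_similar(positive=[word], topn=topn)
            except KeyError:
                continue  # word not in this model's vocabulary
            for neighbour, similarity in neighbours:
                if similarity < threshold:
                    continue
                results.append((level, word, neighbour, similarity))
                if neighbour not in seen:
                    seen.add(neighbour)
                    next_frontier.append(neighbour)
        frontier = next_frontier

    return results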
Example No. 17
    def process(self):
        """
		Reads vector set and creates a CSV with ranked vectors
		"""
        self.dataset.update_status("Processing token sets")

        def file_to_timestamp(file):
            """
			Get comparable datestamp value for token file

			Token files are named YYYY-m.pb. This function converts that to a
			YYYYmm string, then that string to an int, so that it may be
			compared for sorting chronologically.

			:param str file:  File name
			:return int:  Comparable datestamp
			"""
            stem = file.split("/")[-1].split(".")[0].split("-")
            try:
                return int(stem[0] + stem[1].zfill(2))
            except (ValueError, IndexError):
                return 0

        results = []

        # truncate results as needed
        rank_style = self.parameters.get("top-style",
                                         self.options["top-style"]["default"])
        cutoff = convert_to_int(
            self.parameters.get("top", self.options["top"]["default"]),
            self.options["top"]["default"])

        # now rank the vectors by most prevalent per "file" (i.e. interval)
        overall_top = {}
        index = 0
        for vector_file in self.iterate_archive_contents(self.source_file):
            # we support both pickle and json dumps of vectors
            vector_unpacker = pickle if vector_file.suffix == ".pb" else json

            index += 1
            vector_set_name = vector_file.stem  # we don't need the full path
            self.dataset.update_status("Processing token set %i (%s)" %
                                       (index, vector_set_name))

            with vector_file.open("rb") as binary_tokens:
                # these were saved as pickle dumps so we need the binary mode
                vectors = vector_unpacker.load(binary_tokens)

            vectors = sorted(vectors, key=lambda x: x[1], reverse=True)

            # for overall ranking we need the full vector space per interval
            # because maybe an overall top-ranking vector is at the bottom
            # in this particular interval - we'll truncate the top list at
            # a later point in that case. Else, truncate it here
            if rank_style == "per-item":
                vectors = vectors[0:cutoff]

            for vector in vectors:
                if not vector[0].strip():
                    continue

                results.append({
                    "date": vector_set_name.split(".")[0],
                    "item": vector[0],
                    "value": vector[1]
                })

                if vector[0] not in overall_top:
                    overall_top[vector[0]] = 0

                overall_top[vector[0]] += int(vector[1])

        # this eliminates all items from the results that were not among the
        # *overall* top-occurring items. This only has an effect when vectors
        # were generated for multiple intervals
        if rank_style == "overall":
            overall_top = {
                item: overall_top[item]
                for item in sorted(overall_top,
                                   key=lambda x: overall_top[x],
                                   reverse=True)[0:cutoff]
            }
            filtered_results = []
            for item in results:
                if item["item"] in overall_top:
                    filtered_results.append(item)

            results = filtered_results

        # done!
        self.dataset.update_status("Writing results file")
        with open(self.dataset.get_results_path(), "w",
                  encoding="utf-8") as output:
            writer = csv.DictWriter(output,
                                    fieldnames=("date", "item", "value"))
            writer.writeheader()
            for row in results:
                writer.writerow(row)

        self.dataset.update_status("Finished")
        self.dataset.finish(len(results))
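The difference between the "per-item" and "overall" ranking styles is easiest to see on toy data: in "overall" mode, items are first summed across all intervals and only rows for the overall top N survive. The values below are made up for illustration.

results = [
    {"date": "2021-01", "item": "apple", "value": 10},
    {"date": "2021-01", "item": "pear", "value": 3},
    {"date": "2021-02", "item": "apple", "value": 2},
    {"date": "2021-02", "item": "plum", "value": 8},
]
cutoff = 2

overall = {}
for row in results:
    overall[row["item"]] = overall.get(row["item"], 0) + row["value"]

top_overall = set(sorted(overall, key=overall.get, reverse=True)[:cutoff])
filtered = [row for row in results if row["item"] in top_overall]
# top_overall == {"apple", "plum"}; every "pear" row is dropped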
Example No. 18
	def process(self):
		"""
		Unzips and appends tokens to fetch and write a tf-idf matrix
		"""

		# Validate and process user inputs - parse to int
		library = self.parameters.get("library", "gensim")
		n_size = convert_to_int(self.parameters.get("n_size", 1), 1)
		min_occurrences = convert_to_int(self.parameters.get("min_occurrences", 1), 1)
		max_occurrences = convert_to_int(self.parameters.get("max_occurrences", -1), -1)
		max_output = convert_to_int(self.parameters.get("max_output", 10), 10)
		smartirs = self.parameters.get("smartirs", "nfc")

		# Get token sets
		self.dataset.update_status("Processing token sets")
		tokens = []
		dates = []

		results_path = self.dataset.get_results_path()
		dirname = Path(results_path.parent, results_path.name.replace(".", ""))

		# Go through all archived token sets and generate collocations for each
		with zipfile.ZipFile(str(self.source_file), "r") as token_archive:
			token_sets = token_archive.namelist()
			index = 0

			# Loop through the tokens (can also be a single set)
			for tokens_name in token_sets:
				if self.interrupted:
					raise ProcessorInterruptedException("Interrupted while loading token sets")

				# Get the date
				date_string = tokens_name.split('.')[0]
				dates.append(date_string)

				# Temporarily extract file (we cannot use ZipFile.open() as it doesn't support binary modes)
				temp_path = dirname.joinpath(tokens_name)
				token_archive.extract(str(tokens_name), str(dirname))

				# we support both pickle and json dumps of vectors
				token_unpacker = pickle if tokens_name.split(".")[-1] == "pb" else json

				with temp_path.open("rb") as binary_tokens:

					# these were saved as pickle dumps so we need the binary mode
					post_tokens = token_unpacker.load(binary_tokens)
	
					# Flatten the list of list of tokens - we're treating the whole time series as one document.
					post_tokens = list(itertools.chain.from_iterable(post_tokens))

					# Add to all date's tokens
					tokens.append(post_tokens)

				temp_path.unlink()

		# Make sure `min_occurrences` and `max_occurrences` are valid
		if min_occurrences > len(tokens):
			min_occurrences = len(tokens) - 1
		if max_occurrences <= 0 or max_occurrences > len(tokens):
			max_occurrences = len(tokens)

		# Get the tf-idf matrix.
		self.dataset.update_status("Generating tf-idf for token set")
		try:

			if library == "gensim":
				results = self.get_tfidf_gensim(tokens, dates, top_n=max_output, smartirs=smartirs)
			elif library == "scikit-learn":
				results = self.get_tfidf_sklearn(tokens, dates, ngram_range=n_size, min_occurrences=min_occurrences,
								 max_occurrences=max_occurrences, top_n=max_output)
			else:
				self.dataset.update_status("Invalid library.")
				self.dataset.finish(0)
				return

			if results:
				# Generate csv and finish
				self.dataset.update_status("Writing to csv and finishing")
				self.write_csv_items_and_finish(results)

		except MemoryError:
			self.dataset.update_status("Out of memory - dataset too large to run tf-idf analysis.")
			self.dataset.finish(0)
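The get_tfidf_sklearn() helper is not shown in this snippet; purely as an illustration (and assuming a recent scikit-learn with get_feature_names_out()), a per-interval tf-idf along the same lines could look like this, with each interval's flattened token list treated as one document:

from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_per_interval(tokens, dates, top_n=10):
    """Return the top_n highest-weighted terms per interval as CSV-ready rows."""
    documents = [" ".join(token_list) for token_list in tokens]
    vectoriser = TfidfVectorizer(min_df=1)
    matrix = vectoriser.fit_transform(documents)
    terms = vectoriser.get_feature_names_out()

    results = []
    for row, date in zip(matrix.toarray(), dates):
        ranked = sorted(zip(terms, row), key=lambda pair: pair[1], reverse=True)
        for term, weight in ranked[:top_n]:
            if weight > 0:
                results.append({"date": date, "item": term, "value": weight})
    return results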
Example No. 19
    def process(self):
        items = {}
        max_weight = 1
        colour_property = self.parameters.get(
            "colour_property", self.options["colour_property"]["default"])
        size_property = self.parameters.get(
            "size_property", self.options["size_property"]["default"])

        # first create a map with the ranks for each period
        with self.source_file.open() as input:
            reader = csv.DictReader(input)

            weight_attribute = "value" if "value" in reader.fieldnames else "frequency"
            item_attribute = "item" if "item" in reader.fieldnames else "text"
            date_attribute = "date" if "date" in reader.fieldnames else "time"

            weighted = (weight_attribute in reader.fieldnames)
            for row in reader:
                if row[date_attribute] not in items:
                    items[row[date_attribute]] = {}

                weight = convert_to_int(row[weight_attribute],
                                        1) if weighted else 1
                items[row[date_attribute]][row[item_attribute]] = weight
                max_weight = max(max_weight, weight)

        # determine per-period changes
        # this is used for determining what colour to give to nodes, and
        # visualise outlying items in the data
        changes = {}
        max_change = 1
        for period in items:
            changes[period] = {}
            for item in items[period]:
                now = items[period][item]
                then = -1
                for previous_period in items:
                    if previous_period == period:
                        break
                    for previous_item in items[previous_period]:
                        if previous_item == item:
                            then = items[previous_period][item]

                if then >= 0:
                    change = abs(now - then)
                    max_change = max(max_change, change)
                    changes[period][item] = change
                else:
                    changes[period][item] = 1

        # some sizing parameters for the chart - experiment with those
        box_width = 12
        box_height = 10  # boxes will never be smaller than this
        box_max_height = 100
        box_gap_x = 90
        box_gap_y = 5

        # don't change this - initial X value for top left box
        box_start_x = 0

        # we use this to know if and where to draw the flow curve between a box
        # and its previous counterpart
        previous_boxes = {}
        previous = []

        # we need to store the svg elements before drawing them to the canvas
        # because we need to know what elements to draw before we can set the
        # canvas up for drawing to
        boxes = []
        labels = []
        flows = []
        definitions = []

        # this is the default colour for items (it's blue-ish)
        # we're using HSV, so we can increase the hue for more prominent items
        base_colour = [.55, .95, .95]
        max_y = 0

        # go through all periods and draw boxes and flows
        for period in items:
            # reset Y coordinate, i.e. start at top
            box_start_y = 0

            for item in items[period]:
                # determine weight (and thereby height) of this particular item
                weight = items[period][item]
                weight_factor = weight / max_weight
                height = int(max(box_height, box_max_height * weight_factor)
                             ) if size_property and weighted else box_height

                # colour ranges from blue to red
                change = changes[period][item]
                change_factor = 0 if not weighted or change <= 0 else (
                    changes[period][item] / max_change)
                colour = base_colour.copy()
                colour[0] += (1 - base_colour[0]) * (
                    weight_factor
                    if colour_property == "weight" else change_factor)

                # first draw the box
                box_fill = "rgb(%i, %i, %i)" % tuple(
                    [int(v * 255) for v in colorsys.hsv_to_rgb(*colour)])
                box = Rect(insert=(box_start_x, box_start_y),
                           size=(box_width, height),
                           fill=box_fill)
                boxes.append(box)

                # then the text label
                label_y = (box_start_y + (height / 2)) + 3
                label = Text(
                    text=(item + (" (%s)" % weight if weight != 1 else "")),
                    insert=(box_start_x + box_width + box_gap_y, label_y))
                labels.append(label)

                # store the max y coordinate, which marks the SVG overall height
                max_y = max(max_y, (box["y"] + box["height"]))

                # then draw the flow curve, if the box was ranked in an earlier
                # period as well
                if item in previous:
                    previous_box = previous_boxes[item]

                    # create a gradient from the colour of the previous box for
                    # this item to this box's colour
                    colour_from = previous_box["fill"]
                    colour_to = box["fill"]

                    gradient = LinearGradient(start=(0, 0), end=(1, 0))
                    gradient.add_stop_color(offset="0%", color=colour_from)
                    gradient.add_stop_color(offset="100%", color=colour_to)
                    definitions.append(gradient)

                    # the addition of ' none' in the auto-generated fill colour
                    # messes up some viewers/browsers, so get rid of it
                    gradient_key = gradient.get_paint_server().replace(
                        " none", "")

                    # calculate control points for the connecting bezier bar
                    # the top_offset determines the 'steepness' of the curve,
                    # experiment with the "/ 2" part to make it less or more
                    # steep
                    top_offset = (box["x"] - previous_box["x"] +
                                  previous_box["width"]) / 2
                    control_top_left = (previous_box["x"] +
                                        previous_box["width"] + top_offset,
                                        previous_box["y"])
                    control_top_right = (box["x"] - top_offset, box["y"])

                    bottom_offset = top_offset  # mirroring looks best
                    control_bottom_left = (previous_box["x"] +
                                           previous_box["width"] +
                                           bottom_offset, previous_box["y"] +
                                           previous_box["height"])
                    control_bottom_right = (box["x"] - bottom_offset,
                                            box["y"] + box["height"])

                    # now add the bezier curves - svgwrite has no convenience
                    # function for beziers unfortunately. we're using cubic
                    # beziers though quadratic could work as well since our
                    # control points are, in principle, mirrored
                    flow_start = (previous_box["x"] + previous_box["width"],
                                  previous_box["y"])
                    flow = Path(fill=gradient_key, opacity="0.35")
                    flow.push("M %f %f" % flow_start)  # go to start
                    flow.push("C %f %f %f %f %f %f" %
                              (*control_top_left, *control_top_right, box["x"],
                               box["y"]))  # top bezier
                    flow.push(
                        "L %f %f" %
                        (box["x"], box["y"] + box["height"]))  # right boundary
                    flow.push("C %f %f %f %f %f %f" %
                              (*control_bottom_right, *control_bottom_left,
                               previous_box["x"] + previous_box["width"],
                               previous_box["y"] +
                               previous_box["height"]))  # bottom bezier
                    flow.push("L %f %f" % flow_start)  # back to start
                    flow.push("Z")  # close path

                    flows.append(flow)

                # mark this item as having appeared previously
                previous.append(item)
                previous_boxes[item] = box

                box_start_y += height + box_gap_y

            box_start_x += (box_gap_x + box_width)

        # generate SVG canvas to add elements to
        canvas = Drawing(self.dataset.get_results_path(),
                         size=(len(items) * (box_width + box_gap_x), max_y),
                         style="font-family:monospace;font-size:8px;")

        # now add the various shapes and paths. We only do this here rather than
        # as we go because only at this point can the canvas be instantiated, as
        # before we don't know the dimensions of the SVG drawing.

        # add our gradients so they can be referenced
        for definition in definitions:
            canvas.defs.add(definition)

        # add flows (which should go beyond the boxes)
        for flow in flows:
            canvas.add(flow)

        # add boxes and labels:
        for item in (*boxes, *labels):
            canvas.add(item)

        # finally, save the svg file
        canvas.saveas(pretty=True,
                      filename=str(self.dataset.get_results_path()))
        self.dataset.finish(len(items) * len(list(items.values()).pop()))
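As the comment in the example notes, svgwrite offers no convenience function for bezier curves, so the path commands are pushed as raw strings. A self-contained sketch of a single flow between two boxes, with all coordinates made up:

from svgwrite.drawing import Drawing
from svgwrite.path import Path
from svgwrite.shapes import Rect

canvas = Drawing("flow-demo.svg", size=(300, 120))
left = {"x": 10, "y": 10, "w": 12, "h": 40}
right = {"x": 200, "y": 60, "w": 12, "h": 30}

for box in (left, right):
    canvas.add(Rect(insert=(box["x"], box["y"]), size=(box["w"], box["h"]), fill="steelblue"))

# control points halfway between the boxes give a smooth, mirrored curve
offset = (right["x"] - (left["x"] + left["w"])) / 2
flow = Path(fill="steelblue", opacity="0.35")
flow.push("M %f %f" % (left["x"] + left["w"], left["y"]))
flow.push("C %f %f %f %f %f %f" % (left["x"] + left["w"] + offset, left["y"],
                                   right["x"] - offset, right["y"],
                                   right["x"], right["y"]))
flow.push("L %f %f" % (right["x"], right["y"] + right["h"]))
flow.push("C %f %f %f %f %f %f" % (right["x"] - offset, right["y"] + right["h"],
                                   left["x"] + left["w"] + offset, left["y"] + left["h"],
                                   left["x"] + left["w"], left["y"] + left["h"]))
flow.push("Z")
canvas.add(flow)
canvas.save()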
Example No. 20
    def process(self):
        """
		This takes a 4CAT results file as input, and outputs a plain text file
		containing all post bodies as one continuous string, sanitized.
		"""

        link_regex = re.compile(r"https?://[^\s]+")
        delete_regex = re.compile(r"[^a-zA-Z)(.,\n -]")

        # settings
        strip_urls = self.parameters.get("strip-urls",
                                         self.options["strip-urls"]["default"])
        strip_symbols = self.parameters.get(
            "strip-symbols", self.options["strip-symbols"]["default"])
        sides = self.parameters.get("sides", self.options["sides"]["default"])
        self.align = self.parameters.get("align",
                                         self.options["align"]["default"])
        window = convert_to_int(
            self.parameters.get("window", self.options["window"]["default"]),
            5) + 1
        query = self.parameters.get("query", self.options["query"]["default"])
        self.limit = convert_to_int(
            self.parameters.get("limit", self.options["limit"]["default"]),
            100)

        left_branches = []
        right_branches = []

        # do some validation
        if not query.strip() or re.sub(r"\s", "", query) != query:
            self.dataset.update_status(
                "Invalid query for word tree generation. Query cannot be empty or contain whitespace."
            )
            self.dataset.finish(0)
            return

        window = min(window, self.options["window"]["max"] + 1)
        window = max(1, window)

        # find matching posts
        processed = 0
        for post in self.iterate_csv_items(self.source_file):
            processed += 1
            if processed % 500 == 0:
                self.dataset.update_status(
                    "Processing and tokenising post %i" % processed)
            body = post["body"]

            if strip_urls:
                body = link_regex.sub("", body)

            if strip_symbols:
                body = delete_regex.sub("", body)

            body = word_tokenize(body)
            positions = [
                i for i, x in enumerate(body) if x.lower() == query.lower()
            ]

            # get lists of tokens for both the left and right side of the tree
            # on the left side, all lists end with the query, on the right side,
            # they start with the query
            for position in positions:
                right_branches.append(body[position:position + window])
                left_branches.append(body[max(0, position - window):position +
                                          1])

        # Some settings for rendering the tree later
        self.step = self.fontsize * 0.6  # approximately the width of a monospace char
        self.gap = (7 * self.step)  # space for lines between nodes
        width = 1  # will be updated later

        # invert the left side of the tree (because that's the way we want the
        # branching to work for that side)
        # we'll visually invert the nodes in the tree again later
        left_branches = [list(reversed(branch)) for branch in left_branches]

        # first create vertical slices of tokens per level
        self.dataset.update_status("Generating token tree from posts")
        levels_right = [{} for i in range(0, window)]
        levels_left = [{} for i in range(0, window)]
        tokens_left = []
        tokens_right = []

        # for each "level" (each branching point representing a level), turn
        # tokens into nodes, record the maximum number of occurrences for any
        # token in that level, and keep track of what nodes are in which level.
        # The latter is needed because a token may occur multiple times, at
        # different points in the graph. Do this for both the left and right
        # side of the tree.
        for i in range(0, window):
            for branch in right_branches:
                if i >= len(branch):
                    continue

                token = branch[i].lower()
                if token not in levels_right[i]:
                    parent = levels_right[i - 1][branch[
                        i - 1].lower()] if i > 0 else None
                    levels_right[i][token] = Node(token,
                                                  parent=parent,
                                                  occurrences=1,
                                                  is_top_root=(parent is None))
                    tokens_right.append(levels_right[i][token])
                else:
                    levels_right[i][token].occurrences += 1

                occurrences = levels_right[i][token].occurrences
                self.max_occurrences[i] = max(
                    occurrences, self.max_occurrences[i]
                ) if i in self.max_occurrences else occurrences

            for branch in left_branches:
                if i >= len(branch):
                    continue

                token = branch[i].lower()
                if token not in levels_left[i]:
                    parent = levels_left[i - 1][branch[
                        i - 1].lower()] if i > 0 else None
                    levels_left[i][token] = Node(token,
                                                 parent=parent,
                                                 occurrences=1,
                                                 is_top_root=(parent is None))
                    tokens_left.append(levels_left[i][token])
                else:
                    levels_left[i][token].occurrences += 1

                occurrences = levels_left[i][token].occurrences
                self.max_occurrences[i] = max(
                    occurrences, self.max_occurrences[i]
                ) if i in self.max_occurrences else occurrences

        # nodes that have no siblings can be merged with their parents, else
        # the graph becomes unnecessarily large with lots of single-word nodes
        # connected to single-word nodes. additionally, we want the nodes with
        # the most branches to be sorted to the top, and then only retain the
        # most interesting (i.e. most-occurring) branches
        self.dataset.update_status("Merging and sorting tree nodes")
        for token in tokens_left:
            self.merge_upwards(token)
            self.sort_node(token)
            self.limit_subtree(token)

        for token in tokens_right:
            self.merge_upwards(token)
            self.sort_node(token)
            self.limit_subtree(token)

        # somewhat annoyingly, anytree does not simply delete nodes detached
        # from the tree in the previous steps, but makes them root nodes. We
        # don't need these root nodes (we only need the original root), so the
        # next step is to remove all root nodes that are not the main root.
        # We cannot modify a list in-place, so make a new list with the
        # relevant nodes
        level_sizes = {}
        filtered_tokens_right = []
        for token in tokens_right:
            if token.is_root and not token.is_top_root:
                continue

            filtered_tokens_right.append(token)

        filtered_tokens_left = []
        for token in tokens_left:
            if token.is_root and not token.is_top_root:
                continue

            filtered_tokens_left.append(token)

        # now we know which nodes are left, and can therefore determine how
        # large the canvas needs to be - this is based on the max number of
        # branches found on any level of the tree, in other words, the number
        # of "terminal nodes"
        height_left = self.whitespace * self.fontsize * max([
            self.max_breadth(node)
            for node in filtered_tokens_left if node.is_top_root
        ])
        height_right = self.whitespace * self.fontsize * max([
            self.max_breadth(node)
            for node in filtered_tokens_right if node.is_top_root
        ])
        height = max(height_left, height_right)

        canvas = Drawing(str(self.dataset.get_results_path()),
                         size=(width, height),
                         style="font-family:monospace;font-size:%ipx" %
                         self.fontsize)

        # the nodes on the left side of the graph now have the wrong word order,
        # because we reversed them earlier to generate the correct tree
        # hierarchy - now reverse the node labels so they are proper language
        # again
        for token in tokens_left:
            self.invert_node_labels(token)

        wrapper = SVG(overflow="visible")

        self.dataset.update_status("Rendering tree to SVG file")
        if sides != "right":
            wrapper = self.render(wrapper, [
                token for token in filtered_tokens_left
                if token.is_root and token.children
            ],
                                  height=height,
                                  side=self.SIDE_LEFT)

        if sides != "left":
            wrapper = self.render(wrapper, [
                token for token in filtered_tokens_right
                if token.is_root and token.children
            ],
                                  height=height,
                                  side=self.SIDE_RIGHT)

        # things may have been rendered outside the canvas, in which case we
        # need to readjust the SVG properties
        wrapper.update({"x": 0 if self.x_min >= 0 else self.x_min * -1})
        canvas.update({"width": (self.x_max - self.x_min)})

        canvas.add(wrapper)
        canvas.save(pretty=True)

        self.dataset.update_status("Finished")
        self.dataset.finish(len(tokens_left) + len(tokens_right))
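The core of the tree-building step is folding token branches into a prefix tree of anytree Nodes carrying an occurrences attribute (anytree allows arbitrary keyword attributes on Node). A compact sketch on made-up branches:

from anytree import Node, RenderTree

branches = [
    ["free", "speech", "is", "dead"],
    ["free", "speech", "matters"],
    ["free", "software", "matters"],
]

root = Node("free", occurrences=len(branches))
levels = [{}, {}, {}]  # one dict of nodes per depth beyond the root

for branch in branches:
    parent = root
    for depth, token in enumerate(branch[1:4]):
        # reuse a node only if it hangs off the same parent
        if token not in levels[depth] or levels[depth][token].parent is not parent:
            levels[depth][token] = Node(token, parent=parent, occurrences=0)
        levels[depth][token].occurrences += 1
        parent = levels[depth][token]

for prefix, _, node in RenderTree(root):
    print("%s%s (%i)" % (prefix, node.name, node.occurrences))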
Example No. 21
    def process(self):
        graphs = {}
        intervals = []

        smooth = self.parameters.get("smooth",
                                     self.options["smooth"]["default"])
        normalise_values = self.parameters.get(
            "normalise", self.options["normalise"]["default"])
        completeness = convert_to_int(
            self.parameters.get("complete",
                                self.options["complete"]["default"]), 0)
        graph_label = self.parameters.get("label",
                                          self.options["label"]["default"])
        top = convert_to_int(
            self.parameters.get("top", self.options["top"]["default"]), 10)

        # first gather graph data: each distinct item gets its own graph and
        # for each graph we have a sequence of intervals, each interval with
        # its own value
        first_date = "9999-99-99"
        last_date = "0000-00-00"

        with self.source_file.open() as input:
            reader = csv.DictReader(input)

            item_key = "text" if "text" in reader.fieldnames else "item"
            date_key = "time" if "time" in reader.fieldnames else "date"
            value_key = "value" if "value" in reader.fieldnames else "frequency"

        for row in self.iterate_csv_items(self.source_file):
            if row[item_key] not in graphs:
                graphs[row[item_key]] = {}

            # make sure the months and days are zero-padded
            interval = row.get(date_key, "")
            interval = "-".join([
                str(bit).zfill(2 if len(bit) != 4 else 4)
                for bit in interval.split("-")
            ])
            first_date = min(first_date, interval)
            last_date = max(last_date, interval)

            if interval not in intervals:
                intervals.append(interval)

            if interval not in graphs[row[item_key]]:
                graphs[row[item_key]][interval] = 0

            graphs[row[item_key]][interval] += float(row.get(value_key, 0))

        # first make sure we actually have something to render
        intervals = sorted(intervals)
        if len(intervals) <= 1:
            self.dataset.update_status(
                "Not enough data for a side-by-side over-time visualisation.")
            self.dataset.finish(0)
            return

        # only retain most-occurring series - sort by sum of all frequencies
        if len(graphs) > top:
            selected_graphs = {
                graph: graphs[graph]
                for graph in sorted(
                    graphs,
                    key=lambda x: sum(
                        [graphs[x][interval] for interval in graphs[x]]),
                    reverse=True)[0:top]
            }
            graphs = selected_graphs

        # there may be items that do not have values for all intervals
        # this will distort the graph, so the next step is to make sure all
        # graphs consist of the same continuous interval list
        missing = {graph: 0 for graph in graphs}
        for graph in graphs:
            missing[graph], graphs[graph] = pad_interval(
                graphs[graph],
                first_interval=first_date,
                last_interval=last_date)

        # now that's done, make sure the graph datapoints are in order
        intervals = sorted(list(graphs[list(graphs)[0]].keys()))

        # delete graphs that do not have the required amount of intervals
        # this is useful to get rid of outliers and items that only occur
        # very few times over the full interval
        if completeness > 0:
            intervals_required = len(intervals) * (completeness / 100)
            disqualified = []
            for graph in graphs:
                if len(intervals) - missing[graph] < intervals_required:
                    disqualified.append(graph)

            graphs = {
                graph: graphs[graph]
                for graph in graphs if graph not in disqualified
            }

        # determine max value per item, so we can normalize them later
        limits = {}
        max_limit = 0
        for graph in graphs:
            for interval in graphs[graph]:
                limits[graph] = max(limits.get(graph, 0),
                                    abs(graphs[graph][interval]))
                max_limit = max(max_limit, abs(graphs[graph][interval]))

        # order graphs by highest (or lowest) value
        limits = {
            limit: limits[limit]
            for limit in sorted(limits, key=lambda l: limits[l])
        }
        graphs = {graph: graphs[graph] for graph in limits}

        if not graphs:
            # maybe nothing is actually there to be graphed
            self.dataset.update_status(
                "No items match the selection criteria - nothing to visualise."
            )
            self.dataset.finish(0)
            return None

        # how many vertical grid lines (and labels) are to be included at most
        # 12 is a sensible default because it allows one label per month for a full
        # year's data
        max_gridlines = 12

        # If True, label is put at the lower left bottom of the graph rather than
        # outside it. Automatically set to True if one of the labels is long, as
        # else the label would fall off the screen
        label_in_graph = max([len(item) for item in graphs]) > 30

        # determine how wide each interval should be
        # the graph has a minimum width - but the graph's width will be
        # extended if at this minimum width each item does not have the
        # minimum per-item width
        min_full_width = 600
        min_item_width = 50
        item_width = max(min_item_width, min_full_width / len(intervals))

        # determine how much space each graph should get
        # same trade-off as for the interval width
        min_full_height = 300
        min_item_height = 100
        item_height = max(min_item_height, min_full_height / len(graphs))

        # margin - this should be enough for the text labels to fit in
        margin_base = 50
        margin_right = margin_base * 4
        margin_top = margin_base * 3

        # this determines the "flatness" of the isometric projection and can
        # be tweaked for different looks - basically corresponds to how far
        # the camera is above the horizon
        plane_angle = 120

        # don't change these
        plane_obverse = radians((180 - plane_angle) / 2)
        plane_angle = radians(plane_angle)

        # okay, now determine the full graphic size with these dimensions projected
        # semi-isometrically. We can also use these values later for drawing
        # grid lines, et cetera. The axis widths and heights here are the
        # dimensions of the bounding box wrapping the isometrically projected axes.
        x_axis_length = (item_width * (len(intervals) - 1))
        y_axis_length = (item_height * len(graphs))

        x_axis_width = (sin(plane_angle / 2) * x_axis_length)
        y_axis_width = (sin(plane_angle / 2) * y_axis_length)
        canvas_width = x_axis_width + y_axis_width

        # leave room for graph header
        if graph_label:
            margin_top += (2 * (canvas_width / 50))

        x_axis_height = (cos(plane_angle / 2) * x_axis_length)
        y_axis_height = (cos(plane_angle / 2) * y_axis_length)
        canvas_height = x_axis_height + y_axis_height

        # now we have the dimensions, the canvas can be instantiated
        canvas = get_4cat_canvas(
            self.dataset.get_results_path(),
            width=(canvas_width + margin_base + margin_right),
            height=(canvas_height + margin_base + margin_top),
            header=graph_label)

        # draw gridlines - vertical
        gridline_x = y_axis_width + margin_base
        gridline_y = margin_top + canvas_height

        step_x_horizontal = sin(plane_angle / 2) * item_width
        step_y_horizontal = cos(plane_angle / 2) * item_width
        step_x_vertical = sin(plane_angle / 2) * item_height
        step_y_vertical = cos(plane_angle / 2) * item_height

        # labels for x axis
        skip = max(1, int(len(intervals) / max_gridlines))
        for i in range(0, len(intervals)):
            if i % skip == 0:
                canvas.add(
                    Line(start=(gridline_x, gridline_y),
                         end=(gridline_x - y_axis_width,
                              gridline_y - y_axis_height),
                         stroke="grey",
                         stroke_width=0.25))

                # to properly position the rotated and skewed text a container
                # element is needed
                label1 = str(intervals[i])[0:4]
                center = (gridline_x, gridline_y)
                container = SVG(x=center[0] - 25,
                                y=center[1],
                                width="50",
                                height="1.5em",
                                overflow="visible",
                                style="font-size:0.8em;")
                container.add(
                    Text(insert=("25%", "100%"),
                         text=label1,
                         transform="rotate(%f) skewX(%f)" %
                         (-degrees(plane_obverse), degrees(plane_obverse)),
                         text_anchor="middle",
                         baseline_shift="-0.5em",
                         style="font-weight:bold;"))

                if re.match(r"^[0-9]{4}-[0-9]{2}", intervals[i]):
                    label2 = month_abbr[int(str(intervals[i])[5:7])]
                    if re.match(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}", intervals[i]):
                        label2 += " %i" % int(intervals[i][8:10])

                    container.add(
                        Text(insert=("25%", "150%"),
                             text=label2,
                             transform="rotate(%f) skewX(%f)" %
                             (-degrees(plane_obverse), degrees(plane_obverse)),
                             text_anchor="middle",
                             baseline_shift="-0.5em"))

                canvas.add(container)

            gridline_x += step_x_horizontal
            gridline_y -= step_y_horizontal

        # draw graphs as filled beziers
        top = step_y_vertical * 1.5
        graph_start_x = y_axis_width + margin_base
        graph_start_y = margin_top + canvas_height

        # draw graphs in reverse order, so the bottom one is most in the
        # foreground (in case of overlap)
        for graph in reversed(list(graphs)):
            self.dataset.update_status("Rendering graph for '%s'" % graph)

            # path starting at lower left corner of graph
            area_graph = Path(fill=self.colours[self.colour_index])
            area_graph.push("M %f %f" % (graph_start_x, graph_start_y))
            previous_value = None

            graph_x = graph_start_x
            graph_y = graph_start_y
            for interval in graphs[graph]:
                # normalise value
                value = graphs[graph][interval]
                try:
                    limit = limits[graph] if normalise_values else max_limit
                    value = top * copysign(abs(value) / limit, value)
                except ZeroDivisionError:
                    value = 0

                if previous_value is None:
                    # vertical line upwards to starting value of graph
                    area_graph.push("L %f %f" %
                                    (graph_start_x, graph_start_y - value))
                elif not smooth:
                    area_graph.push("L %f %f" % (graph_x, graph_y - value))
                else:
                    # quadratic bezier from previous value to current value
                    control_left = (graph_x - (step_x_horizontal / 2),
                                    graph_y + step_y_horizontal -
                                    previous_value - (step_y_horizontal / 2))
                    control_right = (graph_x - (step_x_horizontal / 2),
                                     graph_y - value + (step_y_horizontal / 2))
                    area_graph.push("C %f %f %f %f %f %f" %
                                    (*control_left, *control_right, graph_x,
                                     graph_y - value))

                previous_value = value
                graph_x += step_x_horizontal
                graph_y -= step_y_horizontal

            # line to the bottom of the graph at the current Y position
            area_graph.push(
                "L %f %f" %
                (graph_x - step_x_horizontal, graph_y + step_y_horizontal))
            area_graph.push("Z")  # then close the Path
            canvas.add(area_graph)

            # add text labels - skewing is a bit complicated and we need a
            # "center" to translate the origins properly.
            if label_in_graph:
                insert = (graph_start_x + 5, graph_start_y - 10)
            else:
                insert = (graph_x - (step_x_horizontal) + 5,
                          graph_y + step_y_horizontal - 10)

            # we need to take the skewing into account for the translation
            offset_y = tan(plane_obverse) * insert[0]
            canvas.add(
                Text(insert=(0, 0),
                     text=graph,
                     transform="skewY(%f) translate(%f %f)" %
                     (-degrees(plane_obverse), insert[0],
                      insert[1] + offset_y)))

            # cycle colours, back to the beginning if all have been used
            self.colour_index += 1
            if self.colour_index >= len(self.colours):
                self.colour_index = 0

            graph_start_x -= step_x_vertical
            graph_start_y -= step_y_vertical

        # draw gridlines - horizontal
        gridline_x = margin_base
        gridline_y = margin_top + canvas_height - y_axis_height
        for graph in graphs:
            gridline_x += step_x_vertical
            gridline_y += step_y_vertical
            canvas.add(
                Line(start=(gridline_x, gridline_y),
                     end=(gridline_x + x_axis_width,
                          gridline_y - x_axis_height),
                     stroke="black",
                     stroke_width=1))

        # x axis
        canvas.add(
            Line(start=(margin_base + y_axis_width,
                        margin_top + canvas_height),
                 end=(margin_base + canvas_width,
                      margin_top + canvas_height - x_axis_height),
                 stroke="black",
                 stroke_width=2))

        # and finally save the SVG
        canvas.save(pretty=True)
        self.dataset.finish(len(graphs))
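For reference, the projection arithmetic amounts to decomposing each axis step with the sine and cosine of half the plane angle. A tiny numeric sketch with arbitrarily chosen values:

from math import radians, sin, cos

plane_angle = radians(120)
item_width = 50
item_height = 100

step_x_horizontal = sin(plane_angle / 2) * item_width   # ~43.3 px to the right per interval
step_y_horizontal = cos(plane_angle / 2) * item_width   # ~25.0 px up per interval
step_x_vertical = sin(plane_angle / 2) * item_height    # ~86.6 px to the right per graph
step_y_vertical = cos(plane_angle / 2) * item_height    # ~50.0 px down per graph

print(step_x_horizontal, step_y_horizontal, step_x_vertical, step_y_vertical)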
Example No. 22
	def process(self):
		"""
		Reads a CSV file, counts occurrences of chosen values over all posts,
		and aggregates the results per chosen time frame
		"""

		# convenience variables
		timeframe = self.parameters.get("timeframe", self.options["timeframe"]["default"])
		scope = self.parameters.get("scope", self.options["scope"]["default"])
		rank_style = self.parameters.get("top-style", self.options["top-style"]["default"])
		cutoff = convert_to_int(self.parameters.get("top", self.options["top"]["default"]))

		# This is needed to check for URLs in the "domain" and "url" columns for Reddit submissions
		datasource = self.parent.parameters.get("datasource")

		# now for the real deal
		self.dataset.update_status("Reading source file")
		overall_top = {}
		interval_top = {}

		for post in self.iterate_csv_items(self.source_file):
			# determine where to put this data
			if timeframe == "all":
				time_unit = "overall"
			else:
				try:
					timestamp = int(datetime.datetime.strptime(post["timestamp"], "%Y-%m-%d %H:%M:%S").timestamp())
				except ValueError:
					timestamp = 0
				date = datetime.datetime.fromtimestamp(timestamp)
				if timeframe == "year":
					time_unit = str(date.year)
				elif timeframe == "month":
					time_unit = str(date.year) + "-" + str(date.month).zfill(2)
				else:
					time_unit = str(date.year) + "-" + str(date.month).zfill(2) + "-" + str(date.day).zfill(2)

			if time_unit not in interval_top:
				interval_top[time_unit] = {}

			if scope == "unambiguous":
				terms = post["hatebase_terms_unambiguous"]
			elif scope == "ambiguous":
				terms = post["hatebase_terms_ambiguous"]
			else:
				terms = post["hatebase_terms"]

			terms = [term.strip() for term in terms.split(",") if term.strip()]
			if not terms:
				continue

			for term in terms:
				if not term.strip():
					continue

				if term not in overall_top:
					overall_top[term] = 0

				overall_top[term] += 1

				if term not in interval_top[time_unit]:
					interval_top[time_unit][term] = 0

				interval_top[time_unit][term] += 1

		# this eliminates all items from the results that were not among the
		# *overall* top-occurring items. This only has an effect when results
		# were generated for multiple intervals
		if rank_style == "overall":
			overall_top = {item: overall_top[item] for item in
						   sorted(overall_top, key=lambda x: overall_top[x], reverse=True)[0:cutoff]}

			filtered_results = {}
			for interval in interval_top:
				filtered_results[interval] = {}
				for term in interval_top[interval]:
					if term in overall_top:
						filtered_results[interval][term] = interval_top[interval][term]

			interval_top = filtered_results

		rows = []
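		# within each interval, keep only the top n most frequent items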
		for interval in interval_top:
			interval_top[interval] = {term: interval_top[interval][term] for term in
									  sorted(interval_top[interval], reverse=True,
											 key=lambda x: interval_top[interval][x])[0:cutoff]}

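		# then flatten the results into one row per interval/item pair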
		for interval in sorted(interval_top):
			for term in interval_top[interval]:
				rows.append({
					"date": interval,
					"item": term,
					"frequency": interval_top[interval][term]
				})

		# write as csv
		if rows:
			self.write_csv_items_and_finish(rows)
		else:
			self.dataset.finish(0)
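The counting pattern in this example (tally terms overall and per interval, then optionally keep only the overall top n) can be reproduced with collections.Counter. The sketch below is not the processor itself; the input rows and the cutoff are made up, and only the ranking and filtering logic mirrors the code above.

# standalone sketch of the counting/filtering pattern; data is made up
from collections import Counter, defaultdict

posts = [
    {"date": "2020-01", "terms": "foo,bar"},
    {"date": "2020-01", "terms": "foo"},
    {"date": "2020-02", "terms": "bar,baz"},
]
cutoff = 2

overall = Counter()
per_interval = defaultdict(Counter)
for post in posts:
    terms = [term.strip() for term in post["terms"].split(",") if term.strip()]
    overall.update(terms)
    per_interval[post["date"]].update(terms)

# "overall" ranking style: keep only terms that are in the overall top n
top_terms = {term for term, count in overall.most_common(cutoff)}
rows = [
    {"date": interval, "item": term, "frequency": count}
    for interval in sorted(per_interval)
    for term, count in per_interval[interval].most_common()
    if term in top_terms
]
print(rows)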
Example No. 23
    def process(self):
        """
		This takes a 4CAT results file as input, and outputs a new CSV file
		with one column with image hashes, one with the first file name used
		for the image, and one with the amount of times the image was used
		"""
        self.dataset.update_status("Reading source file")

        # prepare
        ImageFile.LOAD_TRUNCATED_IMAGES = True
        sample_max = 75  # image size for colour sampling

        def numpy_to_rgb(numpy_array):
            """
			Helper function to go from numpy array to list of RGB strings

			Used in the K-Means clustering part
			"""
            return ",".join([str(int(value)) for value in numpy_array])

        max_images = convert_to_int(self.parameters.get("amount"), 100)
        sizing_mode = self.parameters.get("tile-size",
                                          self.options["tile-size"]["default"])
        sort_mode = self.parameters.get("sort-mode")

        # is there anything to put on a wall?
        if self.source_dataset.num_rows == 0:
            self.dataset.update_status(
                "No images available to render to image wall.", is_final=True)
            self.dataset.finish(0)
            return

        # 0 = use as many images as in the archive, up to the max
        if max_images == 0:
            max_images = self.options["amount"]["max"]

        # we loop through the images twice - once to reduce them to a value
        # that can be sorted, and another time to actually copy them to the
        # canvas for the image wall

        # we create a staging area manually here, so it is not automatically
        # deleted after one loop, since we need two
        staging_area = self.dataset.get_staging_area()

        # first, extract and reduce, and store the sortable value in a
        # dictionary with the image file name as key
        image_colours = {}
        dimensions = {}  # used to calculate optimal tile size later
        index = 0
        random_values = list(range(0, self.source_dataset.num_rows))
        random.shuffle(random_values)
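        # the shuffled indices double as sort keys for the "random" sort mode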

        for path in self.iterate_archive_contents(self.source_file,
                                                  staging_area):
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while determining image wall order")

            try:
                picture = Image.open(str(path))
            except UnidentifiedImageError:
                self.dataset.update_status(
                    "Image %s could not be parsed. Skipping." % path)
                continue

            self.dataset.update_status(
                "Analysing %s (%i/%i)" %
                (path.name, len(dimensions), self.source_dataset.num_rows))

            # these calculations can take ages for huge images, so resize if it is
            # larger than the threshold
            dimensions[path.name] = (picture.width, picture.height)
            if sort_mode not in ("",
                                 "random") and (picture.height > sample_max
                                                or picture.width > sample_max):
                sample_width = int(sample_max * picture.width /
                                   max(picture.width, picture.height))
                sample_height = int(sample_max * picture.height /
                                    max(picture.width, picture.height))
                picture = ImageOps.fit(picture, (sample_width, sample_height))

            if sort_mode not in ("", "random"):
                # ensure we get RGB values for pixels
                picture = picture.convert("RGB")

            # determine a 'representative colour'
            if sort_mode == "random":
                # just randomly sort it, don't even look at the colours
                value = random_values.pop()

            elif sort_mode in ("average-rgb", "average-hsv"):
                # average colour, as RGB or HSV
                pixels = picture.getdata()
                if sort_mode == "average-hsv":
                    pixels = [colorsys.rgb_to_hsv(*pixel) for pixel in pixels]

                sum_colour = (sum([p[0] for p in pixels]),
                              sum([p[1] for p in pixels]),
                              sum([p[2] for p in pixels]))
                avg_colour = (sum_colour[0] / len(pixels),
                              sum_colour[1] / len(pixels),
                              sum_colour[2] / len(pixels))

                # this is a bit dumb, but since all the other modes return rgb...
                if sort_mode == "average-hsv":
                    avg_colour = colorsys.hsv_to_rgb(*avg_colour)

                value = avg_colour

            elif sort_mode == "dominant":
                # most-occurring colour
                colours = picture.getcolors(picture.width * picture.height)
                colours = sorted(colours, key=lambda x: x[0], reverse=True)
                value = colours[0][1]

            elif sort_mode in ("kmeans-dominant", "kmeans-average"):
                # use k-means clusters to determine the representative colour
                # this is more computationally expensive but gives far better
                # results.

                # determine k-means clusters for this image, i.e. the n most
                # dominant "average" colours, in this case n=3 (make parameter?)
                pixels = picture.getdata()
                clusters = KMeans(n_clusters=3,
                                  random_state=0)  # 0 so it is deterministic
                predicted_centroids = clusters.fit_predict(pixels).tolist()
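                # fit_predict yields one cluster label per pixel; counting the
                # labels per centroid ranks the centroids by dominance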

                # now we have two options -
                if sort_mode == "kmeans-dominant":
                    # the colour of the single most dominant k-means centroid
                    ranked_centroids = {}
                    for centroid_index in range(0, len(clusters.cluster_centers_)):
                        ranked_centroids[numpy_to_rgb(
                            clusters.cluster_centers_[centroid_index]
                        )] = predicted_centroids.count(centroid_index)

                    value = [
                        int(v)
                        for v in sorted(ranked_centroids,
                                        key=lambda k: ranked_centroids[k],
                                        reverse=True)[0].split(",")
                    ]

                elif sort_mode == "kmeans-average":
                    # average colour of all k-means centroids, weighted by the
                    # dominance of each centroid
                    value = [0, 0, 0]
                    for label in clusters.labels_:
                        value[0] += clusters.cluster_centers_[label][0]
                        value[1] += clusters.cluster_centers_[label][1]
                        value[2] += clusters.cluster_centers_[label][2]

                    value[0] /= len(clusters.labels_)
                    value[1] /= len(clusters.labels_)
                    value[2] /= len(clusters.labels_)

            else:
                value = (0, 0, 0)

            # converted to HSV, because RGB does not sort nicely (in random
            # mode the value is a plain index, not a colour, so keep it as-is)
            if sort_mode != "random":
                value = colorsys.rgb_to_hsv(*value)
            image_colours[path.name] = value
            index += 1

        # only retain the top n of the sorted list of images - this gives us
        # our final image set
        sorted_image_files = sorted(
            image_colours, key=lambda k: image_colours[k])[:max_images]
        dimensions = {path: dimensions[path] for path in sorted_image_files}
        average_size = (sum([k[0]
                             for k in dimensions.values()]) / len(dimensions),
                        sum([k[1]
                             for k in dimensions.values()]) / len(dimensions))

        self.dataset.update_status("Determining canvas and image sizes")

        # calculate 'tile sizes' (a tile is an image) and also the size of the
        # canvas we will need to fit them all. The canvas can never be larger than
        # this:
        max_pixels = self.TARGET_WIDTH * self.TARGET_HEIGHT

        if sizing_mode == "fit-height":
            # assuming every image has the overall average height, how wide would
            # the canvas need to be (if everything is on a single row)?
            full_width = 0
            tile_y = average_size[1]
            for dimension in dimensions.values():
                # ideally, we make everything the average height
                optimal_ratio = average_size[1] / dimension[1]
                full_width += dimension[0] * optimal_ratio

            # now we can calculate the total amount of pixels needed
            fitted_pixels = full_width * tile_y
            if fitted_pixels > max_pixels:
                # try again with a lower height
                area_ratio = max_pixels / fitted_pixels
                tile_y = int(tile_y * math.sqrt(area_ratio))
                fitted_pixels = max_pixels

            # find the canvas size that can fit this amount of pixels at the
            # required proportions, provided that y = multiple of avg height
            ideal_height = math.sqrt(fitted_pixels /
                                     (self.TARGET_WIDTH / self.TARGET_HEIGHT))
            size_y = math.ceil(ideal_height / tile_y) * tile_y
            size_x = fitted_pixels / size_y

            tile_x = -1  # varies

        elif sizing_mode == "square":
            # assuming each image is square, find a canvas with the right
            # proportions that would fit all of them
            # assume the average dimensions
            tile_size = int(sum(average_size) / 2)

            # this is how many pixels we need
            fitted_pixels = tile_size * tile_size * len(sorted_image_files)

            # does that fit our canvas?
            if fitted_pixels > max_pixels:
                tile_size = math.floor(
                    math.sqrt(max_pixels / len(sorted_image_files)))
                fitted_pixels = tile_size * tile_size * len(sorted_image_files)

            ideal_width = math.sqrt(fitted_pixels /
                                    (self.TARGET_HEIGHT / self.TARGET_WIDTH))
            size_x = math.ceil(ideal_width / tile_size) * tile_size
            size_y = math.ceil(fitted_pixels / size_x / tile_size) * tile_size
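            # worked example with illustrative numbers: 100 images at an
            # average 200x200px need 4,000,000px; with a (hypothetical) 4:3
            # target canvas, ideal_width is about 2309, rounded up to 2400
            # (12 tiles per row), so size_y = ceil(4000000 / 2400 / 200) * 200
            # = 1800, i.e. 9 rows of tiles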

            tile_x = tile_y = tile_size

        elif sizing_mode == "average":
            tile_x = int(average_size[0])
            tile_y = int(average_size[1])

            fitted_pixels = tile_x * tile_y * len(sorted_image_files)
            if fitted_pixels > max_pixels:
                area_ratio = max_pixels / fitted_pixels
                tile_x = int(tile_x * math.sqrt(area_ratio))
                tile_y = int(tile_y * math.sqrt(area_ratio))
                fitted_pixels = tile_x * tile_y * len(sorted_image_files)

            ideal_width = math.sqrt(fitted_pixels /
                                    (self.TARGET_HEIGHT / self.TARGET_WIDTH))
            size_x = math.ceil(ideal_width / tile_x) * tile_x
            size_y = math.ceil(fitted_pixels / size_x / tile_y) * tile_y

        else:
            raise NotImplementedError("Sizing mode '%s' not implemented" %
                                      sizing_mode)

        self.dataset.log("Canvas size is %ix%i" % (size_x, size_y))
        wall = Image.new("RGBA", (int(size_x), int(size_y)))
        ImageDraw.floodfill(wall, (0, 0),
                            (255, 255, 255, 0))  # transparent background
        counter = 0
        offset_x = 0
        offset_y = 0

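        # ImageOps.fit needs integer tile dimensions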
        tile_x = int(tile_x)
        tile_y = int(tile_y)

        # now actually putting the images on a wall is relatively trivial
        for path in sorted_image_files:
            counter += 1
            self.dataset.update_status(
                "Rendering %s (%i/%i) to image wall" %
                (path, counter, len(sorted_image_files)))
            picture = Image.open(str(staging_area.joinpath(path)))

            if tile_x == -1:
                picture_x = max(1,
                                int(picture.width * (tile_y / picture.height)))
                picture = ImageOps.fit(picture, (picture_x, tile_y),
                                       method=Image.BILINEAR)
            else:
                picture = ImageOps.fit(picture, (tile_x, tile_y),
                                       method=Image.BILINEAR)

            # simply put them side by side until the right edge is reached,
            # then move to a new row
            if offset_x + picture.width > wall.width:
                offset_x = 0
                offset_y += picture.height

            # this can happen in some edge cases: there is an extra row of
            # images we hadn't accounted for. In that case, simply enlarge the
            # canvas.
            if offset_y + picture.height > wall.height:
                new_wall = Image.new("RGBA",
                                     (wall.width, offset_y + picture.height))
                ImageDraw.floodfill(
                    new_wall, (0, 0),
                    (255, 255, 255, 0))  # transparent background
                new_wall.paste(wall, (0, 0))
                wall = new_wall

            wall.paste(picture, (offset_x, offset_y))
            offset_x += picture.width

        # finish up
        self.dataset.update_status("Saving result")
        wall.save(str(self.dataset.get_results_path()))
        shutil.rmtree(staging_area)

        self.dataset.update_status("Finished")
        self.dataset.finish(counter)
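As a closing illustration, the "kmeans-dominant" branch above can be exercised on its own. The sketch below assumes Pillow and scikit-learn's KMeans (which the Image/ImageOps/KMeans calls above appear to use); the file name and cluster count are placeholders, and it prints the dominant colour as HSV, the same form the processor stores for sorting.

# minimal, standalone sketch of the "kmeans-dominant" idea; file name and
# cluster count are illustrative only
import colorsys
from collections import Counter

from PIL import Image
from sklearn.cluster import KMeans

picture = Image.open("example.jpg").convert("RGB")
picture.thumbnail((75, 75))  # downsample before clustering, as the processor does

pixels = list(picture.getdata())
clusters = KMeans(n_clusters=3, random_state=0, n_init=10)
labels = clusters.fit_predict(pixels)

# the centroid of the largest cluster is the "dominant" colour (RGB floats)
dominant_label, _ = Counter(labels.tolist()).most_common(1)[0]
dominant_rgb = clusters.cluster_centers_[dominant_label]

# convert to HSV for sorting, mirroring the processor
print(colorsys.rgb_to_hsv(*dominant_rgb))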