def process(self):
        """
        This takes a Twitter NDJSON file and converts it into an NDJSON file that can be imported by TCAT's import-jsondump.php
        """
        posts = 0
        self.dataset.update_status("Converting posts")

        # This handles and writes one Tweet at a time
        with self.dataset.get_results_path().open("w") as output:
            for post in self.iterate_items(self.source_file,
                                           bypass_map_item=True):
                # stop processing if worker has been asked to stop
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while processing NDJSON file")

                posts += 1

                post = self.map_to_TCAT(post)

                # TCAT has a check on line 62 of /import/import-jsondump.php
                # that rejects strings larger than 40960 characters:
                # https://github.com/digitalmethodsinitiative/dmi-tcat/blob/9654fe3ff489fd3b0efc6ddcf7c19adf8ed7726d/import/import-jsondump.php#L62
                # This means some (very long) tweets are inevitably dropped here
                if len(json.dumps(post)) < 40960:
                    output.write(json.dumps(post, ensure_ascii=False))
                    # NDJSON file is expected by TCAT
                    output.write('\n')

        self.dataset.update_status("Finished.")
        self.dataset.finish(num_rows=posts)
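The 40960-character cap above mirrors TCAT's own import limit. As a minimal standalone sketch of the same guard (the tweets list and output path here are hypothetical):

import json

# Hypothetical input: a list of already-mapped tweet dictionaries
tweets = [{"id": 1, "text": "hello"}, {"id": 2, "text": "x" * 50000}]

TCAT_MAX_LINE = 40960  # TCAT's import-jsondump.php rejects longer strings

with open("tcat_import.ndjson", "w", encoding="utf-8") as output:
    for tweet in tweets:
        line = json.dumps(tweet, ensure_ascii=False)
        # skip tweets whose serialised form exceeds TCAT's limit
        if len(line) < TCAT_MAX_LINE:
            output.write(line + "\n")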
Example #2
    def fetch_posts(self,
                    post_ids,
                    where=None,
                    replacements=None,
                    groups=None):
        """
		Fetch post data from database

		:param list post_ids:  List of post IDs to return data for
		:return list: List of posts, with a dictionary representing the database record for each post
		"""
        if not where:
            where = []

        if not replacements:
            replacements = []

        columns = ", ".join(self.return_cols)
        where.append("id IN %s")
        replacements.append(post_ids)

        if self.interrupted:
            raise ProcessorInterruptedException(
                "Interrupted while fetching post data")

        if groups:
            where.append("id IN ( SELECT post_id FROM groups_" + self.prefix +
                         " WHERE \"group\" LIKE ANY(%s) )")
            replacements.append(groups)

        query = "SELECT " + columns + " FROM posts_" + self.prefix + " WHERE " + " AND ".join(
            where) + " ORDER BY id ASC"
        return self.db.fetchall_interruptable(self.queue, query, replacements)
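To illustrate how the query above is assembled, here is a sketch of the same string building with hypothetical stand-ins for self.return_cols and self.prefix; the replacement values are kept separate so the database driver can escape them:

# Hypothetical stand-ins for self.return_cols, self.prefix and the post IDs
return_cols = ["id", "body", "author"]
prefix = "example"
post_ids = (1, 2, 3)

where = ["id IN %s"]
replacements = [post_ids]

query = ("SELECT " + ", ".join(return_cols)
         + " FROM posts_" + prefix
         + " WHERE " + " AND ".join(where)
         + " ORDER BY id ASC")

print(query)         # SELECT id, body, author FROM posts_example WHERE id IN %s ORDER BY id ASC
print(replacements)  # [(1, 2, 3)]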
Example #3
    def call_penelope_api(self, endpoint, *args, **kwargs):
        """
		Call PENELOPE API and don't crash (immediately) if it fails

		:param endpoint: Endpoint to call relative to HTTP root
		:param args:
		:param kwargs:
		:return: Response, or `None`
		"""
        retries = 0
        while retries < self.max_retries:
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while fetching data from the Penelope API")

            try:
                url = "http://penelope.vub.be/guardian-climate-change-data/" + endpoint
                response = requests.get(url, *args, **kwargs)
                break
            except requests.RequestException as e:
                self.log.info(
                    "Error %s while querying PENELOPE Guardian API - retrying..."
                    % e)
                retries += 1

        if retries >= self.max_retries:
            self.log.error("Error during PENELOPE fetch of query %s" %
                           self.dataset.key)
            self.dataset.update_status(
                "Error while searching for posts on PENELOPE Guardian API")
            return None
        else:
            return response.json()
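The retry loop above can be reduced to a small helper usable outside the processor class; this is a stripped-down sketch only, with a placeholder URL and retry count:

import requests

def get_json_with_retries(url, max_retries=3, **kwargs):
    """Fetch `url` and return its parsed JSON, retrying on request errors."""
    retries = 0
    while retries < max_retries:
        try:
            response = requests.get(url, timeout=30, **kwargs)
            return response.json()
        except requests.RequestException as e:
            print("Error %s while querying %s - retrying..." % (e, url))
            retries += 1
    # all retries exhausted
    return None

# result = get_json_with_retries("https://example.com/api/endpoint")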
Example #4
    def get_post_by_id(self, blog_name, post_id):
        """
		Fetch an individual post
		:param blog_name, str: The blog's name
		:param post_id, int: The post ID

		:returns: a dictionary with the post's information, or None if the API
		did not return a post
		"""
        if self.interrupted:
            raise ProcessorInterruptedException(
                "Interrupted while fetching post from Tumblr")

        client = self.connect_to_tumblr()

        # Request the specific post.
        post = client.posts(blog_name, id=post_id)

        # Tumblr API can sometimes return with this kind of error:
        # {'meta': {'status': 500, 'msg': 'Server Error'}, 'response': {'error': 'Malformed JSON or HTML was returned.'}}
        if "posts" not in post:
            return None

        # Get the first element of the list - it's always one post.
        result = post["posts"][0]

        return result
Example #5
    def process(self):
        """
		This takes a CSV file as input and writes the same data as a JSON file
		"""
        posts = 0
        self.dataset.update_status("Converting posts")

        # we write to file per row, instead of json.dumps()ing all of it at
        # once, since else we risk having to keep a lot of data in memory,
        # and this buffers one row at most
        with self.dataset.get_results_path().open("w") as output:
            output.write("[")
            for post in self.iterate_items(self.source_file):
                # stop processing if worker has been asked to stop
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while processing CSV file")

                posts += 1

                if posts > 1:
                    output.write(",")

                output.write(json.dumps(post))
            output.write("]")

        self.dataset.update_status("Finished.")
        self.dataset.finish(num_rows=posts)
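Writing the opening bracket, a comma before every row except the first, and then the closing bracket keeps only one row in memory at a time. The same pattern as a standalone snippet, with a hypothetical rows generator:

import json

# Hypothetical row generator - in practice this would stream from a CSV reader
rows = ({"id": i, "body": "post %i" % i} for i in range(3))

with open("output.json", "w", encoding="utf-8") as output:
    output.write("[")
    for i, row in enumerate(rows):
        if i > 0:
            output.write(",")  # separator before every row except the first
        output.write(json.dumps(row))
    output.write("]")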
Example #6
    def process(self):
        """
        This takes a 4CAT image archive as input, retrieves annotations for
        each image from the Google Vision API, and writes the annotations to
        an NDJSON results file
        """
        api_key = self.parameters.get("api_key")
        self.dataset.delete_parameter("api_key")  # sensitive, delete after use

        features = self.parameters.get("features")
        features = [{"type": feature} for feature in features]

        if not api_key:
            self.dataset.update_status("You need to provide a valid API key",
                                       is_final=True)
            self.dataset.finish(0)
            return

        max_images = convert_to_int(self.parameters.get("amount", 0), 100)
        total = self.source_dataset.num_rows if not max_images else min(
            max_images, self.source_dataset.num_rows)
        done = 0

        for image_file in self.iterate_archive_contents(self.source_file):
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while fetching data from Google Vision API")

            done += 1
            self.dataset.update_status("Annotating image %i/%i" %
                                       (done, total))

            try:
                annotations = self.annotate_image(image_file, api_key,
                                                  features)
            except RuntimeError:
                # cannot continue fetching, e.g. when API key is invalid
                break

            if not annotations:
                continue

            annotations = {"file_name": image_file.name, **annotations}

            with self.dataset.get_results_path().open(
                    "a", encoding="utf-8") as outfile:
                outfile.write(json.dumps(annotations) + "\n")

            if max_images and done >= max_images:
                break

        self.dataset.update_status("Annotations retrieved for %i images" %
                                   done)
        self.dataset.finish(done)
    def tokens_from_file(self, file, staging_area, phraser=None):
        """
		Read tokens from token dump

		If the tokens were saved as JSON, take advantage of this and return
		them as a generator, reducing memory usage and allowing interruption.

		:param Path file:
		:param Path staging_area:  Path to staging area, so it can be cleaned
		up when the processor is interrupted
		:param Phraser phraser:  Optional. If given, the yielded sentence is
		passed through the phraser to detect (e.g.) bigrams.
		:return list:  A set of tokens
		"""

        if file.suffix == ".pb":
            with file.open("rb") as input:
                return pickle.load(input)

        with file.open("r") as input:
            input.seek(1)
            while True:
                line = input.readline()
                if not line:
                    # readline() returns an empty string at the end of the file
                    break

                if self.interrupted:
                    shutil.rmtree(staging_area)
                    raise ProcessorInterruptedException(
                        "Interrupted while reading tokens")

                if line.strip() == "]":
                    # this marks the end of the file
                    return

                try:
                    # the tokeniser dumps the json with one set of tokens per
                    # line, ending with a comma
                    line = line.strip()
                    if line[-1] == ",":
                        line = line[:-1]

                    token_set = json.loads(line)
                    if phraser:
                        yield phraser[token_set]
                    else:
                        yield token_set
                except json.JSONDecodeError:
                    # old-format json dumps are not suitable for the generator
                    # approach
                    input.seek(0)
                    everything = json.load(input)
                    return everything
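The generator relies on the tokeniser writing one JSON list per line inside an enclosing array. A self-contained sketch of that reading pattern, with a small dump written in the same layout for demonstration:

import json

def read_token_sets(path):
    """Yield one token list per line from a dump laid out as an array with one list per line."""
    with open(path, "r", encoding="utf-8") as infile:
        infile.seek(1)  # skip the opening "["
        for line in infile:
            line = line.strip()
            if not line:
                continue
            if line == "]":  # this marks the end of the file
                return
            if line.endswith(","):
                line = line[:-1]
            yield json.loads(line)

with open("tokens.json", "w", encoding="utf-8") as outfile:
    outfile.write('[\n["some", "tokens"],\n["more", "tokens"]\n]')

for token_set in read_token_sets("tokens.json"):
    print(token_set)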
Example #8
    def iterate_archive_contents(self, path, staging_area=None):
        """
		A generator that iterates through files in an archive

		With every iteration, the processor's 'interrupted' flag is checked,
		and if set a ProcessorInterruptedException is raised, which by default
		is caught and subsequently stops execution gracefully.

		Files are temporarily unzipped and deleted after use.

		:param Path path: 	Path to zip file to read
		:param Path staging_area:  Where to store the files while they're
		being worked with. If omitted, a temporary folder is created and
		deleted after use
		:return:  An iterator with a Path item for each file
		"""

        if not path.exists():
            return

        if staging_area and (not staging_area.exists()
                             or not staging_area.is_dir()):
            raise RuntimeError("Staging area %s is not a valid folder")
        else:
            if not hasattr(self, "staging_area") and not staging_area:
                self.staging_area = self.dataset.get_staging_area()
                staging_area = self.staging_area

        with zipfile.ZipFile(path, "r") as archive_file:
            archive_contents = sorted(archive_file.namelist())

            for archived_file in archive_contents:
                if self.interrupted:
                    if hasattr(self, "staging_area"):
                        shutil.rmtree(self.staging_area)
                    raise ProcessorInterruptedException(
                        "Interrupted while iterating zip file contents")

                file_name = archived_file.split("/")[-1]
                temp_file = staging_area.joinpath(file_name)
                archive_file.extract(file_name, staging_area)

                yield temp_file
                if hasattr(self, "staging_area"):
                    temp_file.unlink()

        if hasattr(self, "staging_area"):
            shutil.rmtree(self.staging_area)
            del self.staging_area
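Outside the processor class, the same iterate-and-clean-up pattern can be sketched with a temporary directory standing in for the dataset's staging area (the zip path in the usage comment is a placeholder):

import shutil
import tempfile
import zipfile
from pathlib import Path

def iterate_zip_contents(path):
    """Yield a temporary Path for each file in a zip, deleting each file after use."""
    staging_area = Path(tempfile.mkdtemp())
    try:
        with zipfile.ZipFile(path, "r") as archive_file:
            for archived_file in sorted(archive_file.namelist()):
                if archived_file.endswith("/"):
                    continue  # skip directory entries
                archive_file.extract(archived_file, staging_area)
                temp_file = staging_area.joinpath(archived_file)
                yield temp_file
                temp_file.unlink()  # delete once the caller is done with it
    finally:
        shutil.rmtree(staging_area)

# for item in iterate_zip_contents(Path("archive.zip")):
#     print(item.name, item.stat().st_size)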
Example #9
        def resolve_redirect(url, depth=0):
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while expanding URL")

            if hasattr(url, "group"):
                url = url.group(0)

            # get host name to compare to list of shorteners
            host_name = re.sub(r"^[a-z]*://", "", url).split("/")[0].lower()

            if depth >= 10:
                return url

            elif "api.parler.com/l" not in url and host_name not in self.redirect_domains:
                # skip non-redirects
                return url

            elif url in cache:
                return cache[url]

            # to avoid infinite recursion, do not go deeper than 10 redirects;
            # keep track of the current depth here:
            depth += 1

            # do this explicitly because it is a known issue and will save
            # one request
            if host_name == "t.co" and "http://" in url:
                url = url.replace("http://", "https://")

            try:
                time.sleep(0.1)
                head_request = requests.head(url, timeout=5)
            except (requests.RequestException, ConnectionError, ValueError,
                    TimeoutError) as e:
                return url

            # if the returned page's status code is in the 'valid request'
            # range, and if it has a Location header different from the page's
            # url, recursively resolve the page it redirects to up to a given
            # depth - infinite recursion is prevented by using a cache
            if 200 <= head_request.status_code < 400:
                redirected_to = head_request.headers.get("Location", url)
                if redirected_to != url:
                    cache[url] = redirected_to
                    return resolve_redirect(redirected_to, depth)

            return url
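A simplified, iterative version of the same redirect resolution, using the conventional 3xx status check and a plain hop limit instead of the recursion and shared cache above (timeouts and hop count are arbitrary choices):

from urllib.parse import urljoin

import requests

def resolve_redirect(url, max_hops=10):
    """Follow Location headers manually, up to max_hops, and return the final URL."""
    seen = set()
    for _ in range(max_hops):
        if url in seen:
            break  # redirect loop
        seen.add(url)
        try:
            head = requests.head(url, timeout=5, allow_redirects=False)
        except requests.RequestException:
            break
        if not (300 <= head.status_code < 400):
            break
        location = head.headers.get("Location")
        if not location:
            break
        url = urljoin(url, location)  # Location may be a relative path
    return url

# print(resolve_redirect("http://t.co/example"))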
Example #10
    def unpack_archive_contents(self, path, staging_area=None):
        """
		Unpack all files in an archive to a staging area

		With every iteration, the processor's 'interrupted' flag is checked,
		and if set a ProcessorInterruptedException is raised, which by default
		is caught and subsequently stops execution gracefully.

		Files are unzipped to a staging area. The staging area is *not*
		cleaned up automatically.

		:param Path path: 	Path to zip file to read
		:param Path staging_area:  Where to store the files while they're
		being worked with. If omitted, a temporary folder is created and
		deleted after use
		:return Path:  A path to the staging area
		"""

        if not path.exists():
            return

        if staging_area and (not staging_area.exists()
                             or not staging_area.is_dir()):
            raise RuntimeError("Staging area %s is not a valid folder")
        else:
            if not hasattr(self, "staging_area"):
                self.staging_area = self.dataset.get_staging_area()

            staging_area = self.staging_area

        paths = []
        with zipfile.ZipFile(path, "r") as archive_file:
            archive_contents = sorted(archive_file.namelist())

            for archived_file in archive_contents:
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while iterating zip file contents")

                file_name = archived_file.split("/")[-1]
                temp_file = staging_area.joinpath(file_name)
                archive_file.extract(archived_file, staging_area)
                paths.append(temp_file)

        return staging_area
Example #11
    async def gather_posts(self, client, queries, max_items):
        """
		Gather messages for each entity for which messages are requested

		:param TelegramClient client:  Telegram Client
		:param list queries:  List of entities to query (as string)
		:param int max_items:  Messages to scrape per entity
		:return list:  List of messages, each message a dictionary.
		"""
        posts = []
        for query in queries:
            self.dataset.update_status("Fetching messages for entity '%s'" %
                                       query)
            query_posts = []
            i = 0
            try:
                async for message in client.iter_messages(entity=query):
                    if self.interrupted:
                        raise ProcessorInterruptedException(
                            "Interrupted while fetching message data from the Telegram API"
                        )

                    if i % 500 == 0:
                        self.dataset.update_status(
                            "Retrieved %i posts for entity '%s'" %
                            (len(query_posts) + len(posts), query))

                    if message.action is not None:
                        # e.g. someone joins the channel - not an actual message
                        continue

                    parsed_message = self.import_message(message, query)
                    query_posts.append(parsed_message)

                    i += 1
                    if i > max_items:
                        break
            except (ValueError, UsernameInvalidError) as e:
                self.dataset.update_status("Could not scrape entity '%s'" %
                                           query)

            posts += list(reversed(query_posts))

        return posts
Example #12
    def items_to_ndjson(self, items, filepath):
        """
		Save retrieved items as an ndjson file

		NDJSON is a file with one valid JSON value per line, in this case each
		of these JSON values represents a retrieved item. This is useful if the
		retrieved data cannot easily be completely stored as a flat CSV file
		and we want to leave the choice of how to flatten it to the user. Note
		that no conversion (e.g. html stripping or pseudonymisation) is done
		here - the items are saved as-is.

		:param Iterator items:  Items to save
		:param Path filepath:  Location to save results file
		"""
        if not filepath:
            raise ResourceWarning("No valid results path supplied")

        # cache hashed author names, so the hashing function (which is
        # relatively expensive) is not run too often
        pseudonymise_author = bool(self.parameters.get("pseudonymise", None))
        if pseudonymise_author:
            hash_cache = {}
            hasher = hashlib.blake2b(digest_size=24)
            hasher.update(str(config.ANONYMISATION_SALT).encode("utf-8"))

        processed = 0
        with filepath.open("w", encoding="utf-8", newline="") as outfile:
            for item in items:
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while writing results to file")

                # replace author column with salted hash of the author name, if
                # pseudonymisation is enabled
                if pseudonymise_author:
                    check_cashe = CheckCashe(hash_cache, hasher)
                    self.search_and_update(item, ['author'],
                                           check_cashe.update_cache)

                outfile.write(json.dumps(item) + "\n")
                processed += 1

        return processed
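The CheckCashe helper is not shown in this excerpt, but the salted-hash caching it stands for can be sketched independently; the salt below is a placeholder for config.ANONYMISATION_SALT:

import hashlib

SALT = "example-salt"  # placeholder for config.ANONYMISATION_SALT

# prepare the salted hasher once; copying it is much cheaper than re-hashing the salt
base_hasher = hashlib.blake2b(digest_size=24)
base_hasher.update(SALT.encode("utf-8"))

hash_cache = {}

def pseudonymise(value):
    """Return a salted hash of `value`, re-using cached results where possible."""
    if value not in hash_cache:
        hasher = base_hasher.copy()
        hasher.update(str(value).encode("utf-8"))
        hash_cache[value] = hasher.hexdigest()
    return hash_cache[value]

print(pseudonymise("some_author"))
print(pseudonymise("some_author"))  # second call is served from the cache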
Example #13
    def write_csv_items_and_finish(self, data):
        """
		Write data as csv to results file and finish dataset

		Determines result file path using dataset's path determination helper
		methods. After writing results, the dataset is marked finished. Will
		raise a ProcessorInterruptedException if the interrupted flag for this
		processor is set while iterating.

		:param data: A list or tuple of dictionaries, all with the same keys
		"""
        if not (isinstance(data, typing.List)
                or isinstance(data, typing.Tuple)) or isinstance(data, str):
            raise TypeError(
                "write_csv_items requires a list or tuple of dictionaries as argument"
            )

        if not data:
            raise ValueError(
                "write_csv_items requires a dictionary with at least one item")

        if not isinstance(data[0], dict):
            raise TypeError(
                "write_csv_items requires a list or tuple of dictionaries as argument"
            )

        self.dataset.update_status("Writing results file")
        with self.dataset.get_results_path().open("w",
                                                  encoding="utf-8",
                                                  newline='') as results:
            writer = csv.DictWriter(results, fieldnames=data[0].keys())
            writer.writeheader()

            for row in data:
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while writing results file")
                writer.writerow(row)

        self.dataset.update_status("Finished")
        self.dataset.finish(len(data))
Example #14
    def process(self):
        """
		This takes a CSV file as input and rewrites it in a CSV dialect compatible with the macOS version of Microsoft Excel
		"""
        posts = 0
        self.dataset.update_status("Converting posts")

        # painstaking empirical work has determined that this dialect is
        # compatible with the MacOS version of Microsoft Excel
        csv.register_dialect("excel-mac",
                             delimiter=";",
                             doublequote=True,
                             escapechar=None,
                             lineterminator="\r\n",
                             quotechar='"',
                             quoting=csv.QUOTE_MINIMAL,
                             skipinitialspace=False,
                             strict=False)

        # recreate CSV file with the new dialect
        with self.dataset.get_results_path().open("w") as output:
            fieldnames = self.get_item_keys(self.source_file)

            writer = csv.DictWriter(output,
                                    fieldnames=fieldnames,
                                    dialect="excel-mac")
            writer.writeheader()

            for post in self.iterate_items(self.source_file):
                # stop processing if worker has been asked to stop
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while processing CSV file")

                writer.writerow(post)
                posts += 1

        # done!
        self.dataset.update_status("Finished.")
        self.dataset.finish(num_rows=posts)
Example #15
    def call_penelope_api(self, params, *args, **kwargs):
        """
        Call PENELOPE API and don't crash (immediately) if it fails

        :param params: Call parameters
        :param args:
        :param kwargs:
        :return: Response, or `None`
        """
        #https://penelope.vub.be/parliament-data/get-speeches/<search_query>/<dataset_name>/<start_date>/<end_date>/<max_number>
        url = "https://penelope.vub.be/parliament-data/get-speeches/%s/%s/%s/%s/"
        url = url % (urllib.parse.quote(
            params["dataset_name"]), urllib.parse.quote(
                params["start_date"]), urllib.parse.quote(params["end_date"]),
                     urllib.parse.quote(params["search_query"]))

        retries = 0
        while retries < self.max_retries:
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while fetching data from the Penelope API")

            try:
                response = requests.get(url, *args, **kwargs)
                break
            except requests.RequestException as e:
                self.log.info(
                    "Error %s while querying PENELOPE Parliament Speeches API - retrying..."
                    % e)
                retries += 1

        if retries >= self.max_retries:
            self.log.error("Error during PENELOPE fetch of query %s" %
                           self.dataset.key)
            self.dataset.update_status(
                "Error while searching for posts on PENELOPE Parliament Speeches API"
            )
            return None
        else:
            return response.json()["speeches"]
Example #16
    def items_to_csv(self, results, filepath):
        """
		Takes a list of result rows, converts it to a csv, and writes it to the
		given location. This is mostly a generic dictionary-to-CSV processor but
		some specific processing is done on the "body" key to strip HTML from it,
		and a human-readable timestamp is provided next to the UNIX timestamp.

		:param results:			List of dict rows from data source.
		:param filepath:    	Filepath for the resulting csv

		:return int:  Amount of posts that were processed

		"""
        if not filepath:
            raise ResourceWarning("No result file for query")

        # write the dictionary to a csv
        if not isinstance(filepath, Path):
            filepath = Path(filepath)

        # cache hashed author names, so the hashing function (which is
        # relatively expensive) is not run too often
        pseudonymise_author = bool(self.parameters.get("pseudonymise", None))
        hash_cache = {}

        # prepare hasher (which we may or may not need)
        # we use BLAKE2	for its (so far!) resistance against cryptanalysis and
        # speed, since we will potentially need to calculate a large amount of
        # hashes
        hasher = hashlib.blake2b(digest_size=24)
        hasher.update(str(config.ANONYMISATION_SALT).encode("utf-8"))

        processed = 0
        header_written = False
        with filepath.open("w", encoding="utf-8") as csvfile:
            # Parsing: remove the HTML tags, but keep the <br> as a newline
            # Takes around 1.5 times longer
            for row in results:
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while writing results to file")

                if not header_written:
                    fieldnames = list(row.keys())
                    fieldnames.append("unix_timestamp")
                    writer = csv.DictWriter(csvfile,
                                            fieldnames=fieldnames,
                                            lineterminator='\n')
                    writer.writeheader()
                    header_written = True

                processed += 1

                # Create human dates from timestamp
                from datetime import datetime, timezone

                if "timestamp" in row:
                    # Data sources should have "timestamp" as a unix epoch integer,
                    # but do some conversion if this is not the case.
                    timestamp = row["timestamp"]
                    if not isinstance(timestamp, int):
                        if isinstance(
                                timestamp, str
                        ) and "-" not in timestamp:  # String representation of epoch timestamp
                            timestamp = int(timestamp)
                        elif isinstance(
                                timestamp,
                                str) and "-" in timestamp:  # Date string
                            try:
                                timestamp = datetime.strptime(
                                    timestamp, "%Y-%m-%d %H:%M:%S").replace(
                                        tzinfo=timezone.utc).timestamp()
                            except ValueError:
                                timestamp = "undefined"
                        else:
                            timestamp = "undefined"

                    # Add a human-readable date format as well, if we have a valid timestamp.
                    row["unix_timestamp"] = timestamp
                    if timestamp != "undefined":
                        row["timestamp"] = datetime.utcfromtimestamp(
                            timestamp).strftime('%Y-%m-%d %H:%M:%S')
                    else:
                        row["timestamp"] = timestamp
                else:
                    row["timestamp"] = "undefined"

                # Parse html to text
                if row["body"]:
                    row["body"] = strip_tags(row["body"])

                # replace author column with salted hash of the author name, if
                # pseudonymisation is enabled
                if pseudonymise_author:
                    check_cashe = CheckCashe(hash_cache, hasher)
                    author_fields = [
                        field for field in row.keys() if "author" in field
                    ]
                    for author_field in author_fields:
                        row[author_field] = check_cashe.update_cache(
                            row[author_field])
                writer.writerow(row)

        return processed
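The timestamp handling above accepts epoch integers, epoch strings and date strings; the same branching in isolation (with a small hardening for non-numeric strings, which is an addition here) looks like this:

from datetime import datetime, timezone

def normalise_timestamp(value):
    """Return a unix timestamp for epoch ints, epoch strings or date strings."""
    if isinstance(value, int):
        return value
    if isinstance(value, str) and value.isdigit():
        return int(value)  # string representation of an epoch timestamp
    if isinstance(value, str) and "-" in value:
        try:
            parsed = datetime.strptime(value, "%Y-%m-%d %H:%M:%S")
            return int(parsed.replace(tzinfo=timezone.utc).timestamp())
        except ValueError:
            return "undefined"
    return "undefined"

print(normalise_timestamp(1600000000))             # already an epoch
print(normalise_timestamp("1600000000"))           # epoch as a string
print(normalise_timestamp("2020-09-13 12:26:40"))  # date string
print(normalise_timestamp("not a date"))           # -> "undefined"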
Example #17
    def get_post_notes(self, di_blogs_ids, only_text_reblogs=True):
        """
		Gets the post notes.
		:param di_blogs_ids, dict: A dictionary with blog names as keys and post IDs as values.
		:param only_text_reblogs, bool: Whether to only keep notes that are text reblogs.
		"""

        client = self.connect_to_tumblr()

        # List of dict to get reblogs. Items are: [{"blog_name": post_id}]
        text_reblogs = []

        max_date = None

        # Do some counting
        len_blogs = len(di_blogs_ids)
        count = 0

        # Stop trying to fetch the notes after this many retries
        max_notes_retries = 10
        notes_retries = 0

        for key, value in di_blogs_ids.items():

            count += 1

            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while fetching post notes from Tumblr")

            # First, get the blog names and post_ids from reblogs
            # Keep digging till there's nothing left, or until we can fetch no new notes
            while True:

                # Requests a post's notes
                notes = client.notes(key, id=value, before_timestamp=max_date)

                if only_text_reblogs:

                    if "notes" in notes:
                        notes_retries = 0

                        for note in notes["notes"]:
                            # If it's a reblog, extract the data and save the rest of the posts for later
                            if note["type"] == "reblog":
                                if note.get("added_text"):
                                    text_reblogs.append(
                                        {note["blog_name"]: note["post_id"]})

                        if notes.get("_links"):
                            max_date = notes["_links"]["next"]["query_params"][
                                "before_timestamp"]

                        # If there's no `_links` key, that's all.
                        else:
                            break

                    # If there's no "notes" key in the returned dict, something might be up
                    else:
                        self.log.update_status(
                            "Couldn't get notes for Tumblr request " +
                            str(notes))
                        notes_retries += 1
                        pass

                    if notes_retries > max_notes_retries:
                        self.failed_notes.append(key)
                        break

            self.dataset.update_status(
                "Identified %i text reblogs in %i/%i notes" %
                (len(text_reblogs), count, len_blogs))

        return text_reblogs
Example #18
    def get_posts_by_blog(self, blog, max_date=None, min_date=None):
        """
		Get Tumblr posts from a certain blog
		:param blog, str: the name of the blog you want to look for
		:param min_date: a unix timestamp; only fetch posts after this date.
		:param max_date: a unix timestamp; only fetch posts before this date.

	    :returns: a dict created from the JSON response
		"""

        blog = blog + ".tumblr.com"
        client = self.connect_to_tumblr()

        if not max_date:
            max_date = int(time.time())

        # Store all posts in here
        all_posts = []

        # Store notes here, if they exist and are requested
        all_notes = []

        # Some retries to make sure the Tumblr API actually returns everything
        retries = 0
        self.max_retries = 48  # 2 days

        # Get Tumblr posts until there's no more left.
        while True:
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while fetching blog posts from Tumblr")

            # Stop after the maximum number of retries
            if retries >= self.max_retries:
                self.dataset.update_status("No more posts")
                break

            try:
                # Use the pytumblr library to make the API call
                posts = client.posts(blog,
                                     before=max_date,
                                     limit=20,
                                     reblog_info=True,
                                     notes_info=True,
                                     filter="raw")
                posts = posts["posts"]

                #if (max_date - posts[0]["timestamp"]) > 500000:
                #self.dataset.update_status("ALERT - DATES LIKELY SKIPPED")
                #self.dataset.update_status([post["timestamp"] for post in posts])

            except Exception as e:

                self.dataset.update_status(
                    "Reached the limit of the Tumblr API. Last timestamp: %s" %
                    str(max_date))
                self.api_limit_reached = True
                break

            # Make sure the Tumblr API doesn't magically stop at an earlier date
            if not posts or isinstance(posts, str):
                retries += 1
                max_date -= 3600  # Decrease by an hour
                self.dataset.update_status(
                    "No posts returned by Tumblr - checking whether this is really all (retry %s/48)"
                    % str(retries))
                continue

            # Append posts to main list
            else:
                # Keep the notes, if so indicated
                if self.parameters.get("fetch_reblogs"):
                    for post in posts:
                        if "notes" in post:
                            all_notes.append(post["notes"])

                posts = self.parse_tumblr_posts(posts)

                # Get the lowest date
                max_date = sorted([post["timestamp"] for post in posts])[0]

                # Manually check if we have a lower date than the min date (`min_date`) already.
                # This functionality is not natively supported by Tumblr.
                if min_date:
                    if max_date < min_date:

                        # Drop all posts that are earlier than the min_date timestamp
                        posts = [
                            post for post in posts
                            if post["timestamp"] >= min_date
                        ]

                        if posts:
                            all_posts += posts
                        break

                retries = 0

                all_posts += posts

                #if (max_date - posts[len(posts) - 1]["timestamp"]) > 500000:
                #self.dataset.update_status("ALERT - DATES LIKELY SKIPPED")
                #self.dataset.update_status([post["timestamp"] for post in posts])

            if len(all_posts) >= self.max_posts:
                self.max_posts_reached = True
                break

            self.dataset.update_status("Collected %s posts" %
                                       str(len(all_posts)))

        return all_posts, all_notes
Example #19
    def get_posts_by_tag(self, tag, max_date=None, min_date=None):
        """
		Get Tumblr posts with a certain tag
		:param tag, str: the tag you want to look for
		:param min_date: a unix timestamp; only fetch posts after this date.
		:param max_date: a unix timestamp; only fetch posts before this date.

	    :returns: a dict created from the JSON response
		"""

        client = self.connect_to_tumblr()

        # Store all posts in here
        all_posts = []

        # Some retries to make sure the Tumblr API actually returns everything.
        retries = 0
        date_retries = 0

        # We're gonna change max_date, so store a copy for reference.
        max_date_original = max_date

        # We use the average time difference between posts to spot possible gaps in the data.
        all_time_difs = []
        avg_time_dif = 0
        time_difs_len = 0

        # Get Tumblr posts until there's no more left.
        while True:
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while fetching tag posts from Tumblr")

            # Stop after max for date reductions
            if date_retries >= self.max_date_retries:
                self.dataset.update_status("No more posts in this date range")
                break

            # Stop after max retries for API/connection stuff
            if retries >= self.max_retries:
                self.dataset.update_status("No more posts")
                break

            try:
                # Use the pytumblr library to make the API call
                posts = client.tagged(tag,
                                      before=max_date,
                                      limit=20,
                                      filter="raw")
            except ConnectionError:
                self.dataset.update_status(
                    "Encountered a connection error, waiting 10 seconds.")
                time.sleep(10)
                retries += 1
                continue

            # Get rid of posts that we already encountered,
            # preventing Tumblr API shenanigans or double posts because of
            # time reductions. Make sure it's no odd error string, though.
            unseen_posts = []
            for check_post in posts:
                # Sometimes the API responds with just "meta", "response", or "errors".
                if isinstance(check_post, str):
                    self.dataset.update_status("Couldn't add post: %s" % check_post)
                    retries += 1
                    break
                else:
                    retries = 0
                    if check_post["id"] not in self.seen_ids:
                        unseen_posts.append(check_post)
            posts = unseen_posts

            # For no clear reason, the Tumblr API sometimes provides posts with a higher timestamp than requested.
            # So we have to prevent this manually.
            if max_date_original:
                posts = [
                    post for post in posts
                    if post["timestamp"] <= max_date_original
                ]

            max_date_str = datetime.fromtimestamp(max_date).strftime(
                "%Y-%m-%d %H:%M:%S")

            # except Exception as e:
            # 	print(e)
            # 	self.dataset.update_status("Reached the limit of the Tumblr API. Last timestamp: %s" % str(max_date))
            # 	self.api_limit_reached = True
            # 	break

            # Make sure the Tumblr API doesn't magically stop at an earlier date
            if not posts:

                date_retries += 1

                # First check carefully for small time gaps by decreasing the
                # date by six hours at a time. If that doesn't return any new
                # posts, fall back to decreasing by a week per retry, to make
                # sure no data from much earlier is missing.

                if date_retries < 96:
                    max_date -= 21600  # Decrease by six hours
                    self.dataset.update_status(
                        "Collected %s posts for tag %s, but no new posts returned - decreasing time search with 6 hours to %s to make sure this is really it (retry %s/96)"
                        % (
                            str(len(all_posts)),
                            tag,
                            max_date_str,
                            str(date_retries),
                        ))
                elif date_retries <= self.max_date_retries:
                    max_date -= 604800  # Decrease by one week
                    retry_str = str(date_retries - 96)
                    self.dataset.update_status(
                        "Collected %s posts for tag %s, but no new posts returned - no new posts found with decreasing by 6 hours, decreasing with a week to %s instead (retry %s/150)"
                        % (
                            str(len(all_posts)),
                            tag,
                            max_date_str,
                            str(retry_str),
                        ))

                # We can stop when the max date drops below the min date.
                if min_date:
                    if max_date <= min_date:
                        break

                continue

            # Append posts to main list
            else:

                posts = self.parse_tumblr_posts(posts)

                # Get all timestamps and sort them.
                post_dates = sorted([post["timestamp"] for post in posts])

                # Get the lowest date and use it as the next "before" parameter.
                max_date = post_dates[0]

                # Tumblr's API is volatile - it doesn't neatly sort posts by date,
                # so it can happen that there's suddenly huge jumps in time.
                # Check if this is happening by extracting the difference between all consecutive dates.
                time_difs = list()
                post_dates.reverse()

                for i, date in enumerate(post_dates):

                    if i == (len(post_dates) - 1):
                        break

                    # Calculate and add time differences
                    time_dif = date - post_dates[i + 1]

                    # After having collected 250 posts, check whether the time
                    # difference between posts far exceeds the average time difference
                    # between posts. If it's more than five times this amount,
                    # restart the query with the timestamp just before the gap, minus the
                    # average time difference up to this point - something might be up with Tumblr's API.
                    if len(all_posts) >= 250 and time_dif > (avg_time_dif * 5):

                        time_str = datetime.fromtimestamp(date).strftime(
                            "%Y-%m-%d %H:%M:%S")
                        self.dataset.update_status(
                            "Time difference of %s spotted, restarting query at %s"
                            % (
                                str(time_dif),
                                time_str,
                            ))

                        self.seen_ids.update([post["id"] for post in posts])
                        posts = [
                            post for post in posts if post["timestamp"] >= date
                        ]
                        if posts:
                            all_posts += posts

                        max_date = date
                        break

                    time_difs.append(time_dif)

                # To start a new query
                if not posts:
                    break

                # Manually check if we have a lower date than the lowest allowed date already (min date).
                # This functionality is not natively supported by Tumblr.
                if min_date:
                    if max_date < min_date:

                        # Drop posts earlier than min_date or later than max_date_original
                        posts = [
                            post for post in posts
                            if post["timestamp"] >= min_date
                            and post["timestamp"] <= max_date_original
                        ]

                        if posts:
                            all_posts += posts
                            self.seen_ids.update(
                                [post["id"] for post in posts])
                        break

                # We got a new post, so we can reset the retry counts.
                date_retries = 0
                retries = 0

                # Add retrieved posts to the main list
                all_posts += posts

                # Add to seen ids
                self.seen_ids.update([post["id"] for post in posts])

                # Add time differences and calculate new average time difference
                all_time_difs += time_difs

                # Make the average time difference a moving average,
                # to be flexible with faster and slower post paces.
                # Delete the first 100 posts every hundred or so items.
                if (len(all_time_difs) - time_difs_len) > 100:
                    all_time_difs = all_time_difs[time_difs_len:]
                if all_time_difs:
                    time_difs_len = len(all_time_difs)
                    avg_time_dif = sum(all_time_difs) / len(all_time_difs)

            if len(all_posts) >= self.max_posts:
                self.max_posts_reached = True
                break

            self.dataset.update_status(
                "Collected %s posts for tag %s, now looking for posts before %s"
                % (
                    str(len(all_posts)),
                    tag,
                    max_date_str,
                ))

        return all_posts
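The gap detection in the loop above boils down to comparing each time difference against a multiple of the running average. A standalone sketch of that check, using synthetic timestamps:

def find_time_gap(timestamps, factor=5):
    """Return the index of the first spacing exceeding factor * average spacing, else None."""
    timestamps = sorted(timestamps)
    difs = [b - a for a, b in zip(timestamps, timestamps[1:])]
    if not difs:
        return None
    avg_dif = sum(difs) / len(difs)
    for i, dif in enumerate(difs):
        if avg_dif and dif > factor * avg_dif:
            return i
    return None

# posts spaced an hour apart, with one month-long jump in the middle
stamps = [1600000000 + i * 3600 for i in range(10)]
stamps += [stamps[-1] + 2592000 + i * 3600 for i in range(10)]
print(find_time_gap(stamps))  # 9: the gap follows the tenth post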
Example #20
    def process(self):
        """
		Opens the SpaCy output and gets the entities.

		"""

        # Validate whether the user enabled the right parameters.
        if "ner" not in self.source_dataset.parameters["enable"]:
            self.dataset.update_status(
                "Enable \"Named entity recognition\" in previous module")
            self.dataset.finish(0)
            return

        if self.source_dataset.num_rows > 25000:
            self.dataset.update_status(
                "Named entity recognition is only available for datasets smaller than 25.000 items."
            )
            self.dataset.finish(0)
            return

        else:
            # Extract the SpaCy docs first
            self.dataset.update_status("Unzipping SpaCy docs")

            # Store all the entities in this list
            li_entities = []
            nlp = spacy.load("en_core_web_sm")  # Load model

            for doc_file in self.iterate_archive_contents(self.source_file):
                with doc_file.open("rb") as pickle_file:
                    # Load DocBin
                    file = pickle.load(pickle_file)
                    doc_bin = DocBin().from_bytes(file)
                    docs = list(doc_bin.get_docs(nlp.vocab))

                for doc in docs:
                    post_entities = []

                    # stop processing if worker has been asked to stop
                    if self.interrupted:
                        raise ProcessorInterruptedException(
                            "Interrupted while processing documents")

                    for ent in doc.ents:
                        if ent.label_ in self.parameters["entities"]:
                            post_entities.append(
                                (ent.text, ent.label_))  # Add a tuple

                    li_entities.append(post_entities)

            results = []

            if li_entities:

                # Also add the data to the original csv file, if indicated.
                if self.parameters.get("overwrite"):
                    self.update_parent(li_entities)

                all_entities = []
                # Convert to lower and filter out one-letter words. Join the words with the entities so we can group easily.
                for post_ents in li_entities:
                    for pair in post_ents:
                        if pair and len(pair[0]) > 1:
                            pair = pair[0].lower() + " |#| " + pair[1]
                            all_entities.append(pair)

                # Group and rank
                count_nouns = Counter(all_entities).most_common()
                # Unsplit and list the count.
                results = [{
                    "word": tpl[0].split(" |#| ")[0],
                    "entity": tpl[0].split(" |#| ")[1],
                    "count": tpl[1]
                } for tpl in count_nouns]

            # done!
            if results:
                self.dataset.update_status("Finished")
                self.write_csv_items_and_finish(results)
            else:
                self.dataset.update_status(
                    "Finished, but no entities were extracted.")
                self.dataset.finish(0)
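The grouping of entities at the end of this processor does not depend on SpaCy itself; given one list of (text, label) tuples per post, the counting step can be sketched as follows (the sample entities are hypothetical):

from collections import Counter

# Hypothetical extraction output: one list of (text, label) tuples per post
li_entities = [
    [("Amsterdam", "GPE"), ("Reddit", "ORG")],
    [("amsterdam", "GPE")],
    [("EU", "ORG"), ("x", "ORG")],  # one-letter entities are filtered out
]

all_entities = []
for post_ents in li_entities:
    for text, label in post_ents:
        if len(text) > 1:
            # join text and label so identical pairs group together
            all_entities.append(text.lower() + " |#| " + label)

results = [
    {"word": pair.split(" |#| ")[0],
     "entity": pair.split(" |#| ")[1],
     "count": count}
    for pair, count in Counter(all_entities).most_common()
]
print(results)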
    def get_videos_query(self, session, query, csrftoken, detail):
        """
        Scrape videos for given BitChute search query

        :param session:  HTTP Session to use
        :param str query:  Search query to scrape videos for
        :param str csrftoken:  CSRF token to use for requests
        :param str detail:  Detail level to scrape, basic/detail/comments

        :return:  Video data dictionaries, as a generator
        """
        page = 0
        num_items = 0
        while True:
            self.dataset.update_status("Retrieved %i items for query '%s'" %
                                       (num_items, query))

            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while scraping BitChute")

            # prepare the request - the CSRF param *must* be the first or the request will fail
            post_data = {
                "csrfmiddlewaretoken": csrftoken,
                "query": query,
                "kind": "video",
                "duration": "",
                "sort": "",
                "page": str(page)
            }
            headers = {
                'Referer': "https://www.bitchute.com/search",
                'Origin': "https://www.bitchute.com/search"
            }
            response = self.request_from_bitchute(
                session, "POST", "https://www.bitchute.com/api/search/list/",
                headers, post_data)

            if not response["success"] or response[
                    "count"] == 0 or num_items >= self.max_items:
                break

            comments = []
            for video_data in response["results"]:
                if num_items >= self.max_items:
                    break
                else:
                    num_items += 1

                # note: deleted videos will have a published date of 'None'. To
                # avoid crashing the backend the easiest way is to set it to something
                # that is obviously not a valid date in this context.
                if video_data["published"] is None:
                    video_data["published"] = "1970-01-01"
                # this is only included as '5 months ago' and so forth, not exact date
                # so use dateparser to at least approximate the date
                dt = dateparser.parse(video_data["published"])

                video = {
                    "id": video_data["id"],
                    "thread_id": video_data["id"],
                    "subject": video_data["name"],
                    "body": video_data["description"],
                    "author": video_data["channel_name"],
                    "author_id": video_data["channel_path"].split("/")[2],
                    "timestamp": int(dt.timestamp()),
                    "url": "https://www.bitchute.com" + video_data["path"],
                    "views": video_data["views"],
                    "length": video_data["duration"],
                    "thumbnail_image": video_data["images"]["thumbnail"]
                }

                if detail != "basic":
                    video, comments = self.append_details(video, detail)
                    if not video:
                        # unrecoverable error while scraping details
                        return

                yield video
                for comment in comments:
                    # these need to be yielded *after* the video, because otherwise the
                    # result file will have the comments before the video, which is weird
                    yield comment

            page += 1
    def process(self):
        """
		This takes a 4CAT results file as input, extracts the Wikipedia links
		in the posts, fetches each linked page's categories from Wikipedia,
		and outputs a GDF network file of pages and their categories
		"""
        months = {}

        # we use these to extract URLs and host names if needed
        link_regex = re.compile(r"https?://en\.wikipedia\.org/wiki/[^\s.]+")
        wiki_page = re.compile(r"\[\[[^\]]+\]\]")
        category_regex = re.compile(r"\[\[Category:[^\]]+\]\]")
        trailing_comma = re.compile(r",$")

        # initialise
        links = {}
        all_categories = {}
        counter = 1
        errors = 0
        page_categories = {}
        page_links = {}
        deep_pages = {}

        # find all links in post bodies
        self.dataset.update_status("Reading source file")
        for post in self.iterate_items(self.source_file):
            wiki_links = link_regex.findall(post["body"])
            wiki_links = [trailing_comma.sub("", link) for link in wiki_links]

            # if we have a per-post URL, include that as well
            if "url" in post and post["url"] and link_regex.match(post["url"]):
                wiki_links.append(post["url"])

            for link in wiki_links:
                link = "/wiki/".join(link.split("/wiki/")[1:]).split("#")[0]
                if link not in links:
                    links[link] = 0

                links[link] += 1

        # just a helper function to get the HTML content of a node
        def stringify_children(node):
            from lxml.etree import tostring
            from itertools import chain
            parts = ([node.text] + list(
                chain(*([c.text, tostring(c), c.tail]
                        for c in node.getchildren()))) + [node.tail])
            # filter removes possible Nones in texts and tails
            return ''.join(filter(None, parts))

        self.dataset.update_status("Fetching categories from Wikipedia API...")
        for link in links:
            if link not in page_categories:
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while fetching data from Wikipedia")

                page_categories[link] = set()
                self.dataset.update_status(
                    "Fetching categories from Wikipedia API, page %i of %i" %
                    (counter, len(links)))
                counter += 1

                # fetch wikipedia source
                url = "https://en.wikipedia.org/w/index.php?title=" + link + "&action=edit"
                try:
                    page = requests.get(url)
                except requests.RequestException:
                    errors += 1
                    continue

                if page.status_code != 200:
                    errors += 1
                    continue

                # get link to image file from HTML returned
                parser = etree.HTMLParser()
                tree = etree.parse(StringIO(page.content.decode("utf-8")),
                                   parser)

                try:
                    wiki_source = stringify_children(
                        css("#wpTextbox1")(tree)[0])
                except IndexError:
                    # not a source page?
                    errors += 1
                    continue

                # extract category names from category link syntax
                categories = category_regex.findall(wiki_source)
                categories = set([
                    ":".join(category.split(":")[1:])[:-2].split("|")[0]
                    for category in categories
                ])

                # save category links
                for category in categories:

                    # Add " (cat)" to the category strings.
                    # This is needed because pages can sometimes have the same name as the category.
                    # This will result in a faulty graph, since there's duplicate nodes.

                    category += " (cat)"

                    if category not in all_categories:
                        all_categories[category] = 0

                    all_categories[category] += 1
                    page_categories[link].add(category)

                # if needed, also include pages linked to from within the
                # fetched page source
                if self.parameters.get("deep_pages", None):
                    linked_pages = wiki_page.findall(wiki_source)
                    for page in linked_pages:
                        page = page.split("|")[0]

                        if page not in deep_pages:
                            deep_pages[page] = 0

                        deep_pages[page] += 1

                        if link not in page_links:
                            page_links[link] = set()

                        page_links[link].add(page)

        # write GDF file
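        # GDF is a plain-text graph format: a node list followed by an edge
        # list, e.g. (illustrative):
        #   nodedef>name VARCHAR,type VARCHAR,weight INTEGER
        #   'Some page',page,3
        #   'Some category (cat)',category,2
        #   edgedef>node1 VARCHAR, node2 VARCHAR
        #   'Some page','Some category (cat)'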
        with self.dataset.get_results_path().open("w",
                                                  encoding="utf-8") as results:
            results.write("nodedef>name VARCHAR,type VARCHAR,weight INTEGER\n")
            for page in page_categories:
                results.write("'" + page.replace("_", " ").replace(",", "") +
                              "',page," + str(links[page]).replace(",", "") +
                              "\n")

            for category in all_categories:
                results.write("'" +
                              category.replace("_", " ").replace(",", "") +
                              "',category," +
                              str(all_categories[category]).replace(",", "") +
                              "\n")

            results.write("edgedef>node1 VARCHAR, node2 VARCHAR\n")
            for page in page_categories:
                for category in page_categories[page]:
                    results.write("'" +
                                  page.replace("_", " ").replace(",", "") +
                                  "','" +
                                  category.replace("_", " ").replace(",", "") +
                                  "'\n")

        self.dataset.finish(len(page_categories))
	def process(self):
		"""
		This takes a 4CAT results file as input, and outputs a new CSV file
		with one column with image hashes, one with the first file name used
		for the image, and one with the amount of times the image was used
		"""
		images = {}

		urls = []

		# is there anything for us to download?
		if self.source_dataset.num_rows == 0:
			self.dataset.update_status("No images to download.", is_final=True)
			self.dataset.finish(0)
			return

		# Get the source file data path
		top_parent = self.dataset.get_genealogy()[0]
		datasource = top_parent.parameters["datasource"]

		try:
			amount = max(0, min(1000, int(self.parameters.get("amount", 0))))
		except ValueError:
			amount = 100

		extensions = {}
		
		# 4chan is the odd one out (images are traced to and scraped from
		# external archives rather than 4chan itself) so here we collect the
		# relevant archive URLs for any 4chan images we encounter
		if datasource == "4chan":
			self.dataset.update_status("Reading source file")
			external = "fireden" if top_parent.parameters.get("board") == "v" else "4plebs"
			rate_limit = 1 if external == "fireden" else 16

			for post in self.iterate_items(self.source_file):
				# stop processing if worker has been asked to stop
				if self.interrupted:
					raise ProcessorInterruptedException("Interrupted while extracting image URLs")

				extension = post["filename"].split(".")[1].lower()
				if extension not in ("jpg", "jpeg", "png", "gif"):
					continue

				local_file = post["url_4cat"].split("/")[-1]
				local_path = Path(config.PATH_IMAGES, local_file)
				if local_path.exists():
					url = local_path
				else:
					url = post["url_" + external]

				urls.append(url)
				extensions[url] = extension

		# With other sources, simply take the URLs as they are provided by the
		# parent dataset
		else:
			for row in self.iterate_items(self.source_file):

				img_url = row["item"]
				extension = img_url.split(".")[-1].lower()
				extensions[img_url] = extension
				urls.append(img_url)

		# prepare staging area
		results_path = self.dataset.get_staging_area()
		counter = 0
		downloaded_images = 0

		# Used to overwrite top-images csv file with download status
		success = []

		# loop through images and download them - until we have as many images
		# as required. Note that images that cannot be downloaded or parsed do
		# not count towards that limit
		for path in urls:
			if downloaded_images >= amount:
				break

			# stop processing if worker has been asked to stop
			if self.interrupted:
				raise ProcessorInterruptedException("Interrupted while downloading images.")

			counter += 1
			success.append({"download_status": "failed", "img_name": ""})
			self.dataset.update_status("Downloading image %i of %i" % (counter, len(urls)))

			# acquire and resize image
			try:
				if datasource == "4chan":
					picture = self.get_4chan_image(path, rate_limit=rate_limit)
				else:
					picture, image_name = self.get_image(path)

			except (requests.RequestException, IndexError, FileNotFoundError) as e:
				continue

			# Again, some different processing for 4chan
			if datasource == "4chan":

				# hash needs to be hexified if it's a 4chan hash
				if not isinstance(path, Path) and path[-2:] == "==":
					md5 = hashlib.md5()
					b64hash = base64.b64decode(path.split("/")[-1].split(".")[0].replace("_", "/"))
					
					try:
						md5.update(b64hash)
					except binascii.Error:
						self.log.warning("Invalid base64 hash %s, skipping" % b64hash)
						continue

					hash = md5.hexdigest()
				
				# if we're using an already-saved image the image filename is good as it is
				else:
					hash = path.stem

				# determine file name and where to save
				image_name = hash + "." + extensions[path]
				imagepath = str(results_path.joinpath(image_name))

			# For other data sources, we keep the image name it already had.
			else:
				imagepath = str(results_path.joinpath(image_name))

			# save file
			try:
				picture.save(imagepath, format="png")
				downloaded_images += 1
			except (OSError, ValueError):
				self.log.warning("Could not save image %s to disk - invalid format" % path)
				continue

			# If this all succeeded, we update the download status and the filename.
			success[counter - 1]["download_status"] = "succeeded"
			success[counter - 1]["img_name"] = image_name

		# Also add the data to the original csv file, if indicated.
		if self.parameters.get("overwrite"):
			self.update_parent(success)

		# finish up
		self.dataset.update_status("Compressing images")
		self.write_archive_and_finish(results_path)
Exemple #24
0
    def process(self):
        """
		Run 4CAT search query

		Gets query details, passes them on to the object's search method, and
		writes the results to a CSV file. If that all went well, the query and
		job are marked as finished.
		"""

        query_parameters = self.dataset.get_parameters()
        results_file = self.dataset.get_results_path()

        self.log.info("Querying: %s" % str(query_parameters))

        # Execute the relevant query (string-based, random, countryflag-based)
        try:
            posts = self.search(query_parameters)
        except WorkerInterruptedException:
            raise ProcessorInterruptedException(
                "Interrupted while collecting data, trying again later.")

        # Write posts to the result file and update the database status to finished
        num_posts = 0
        if posts:
            self.dataset.update_status("Writing posts to result file")
            if not hasattr(self, "extension") or self.extension == "csv":
                num_posts = self.items_to_csv(posts, results_file)
            elif self.extension == "ndjson":
                num_posts = self.items_to_ndjson(posts, results_file)
            else:
                raise NotImplementedError(
                    "Datasource query cannot be saved as %s file" %
                    self.extension)

            self.dataset.update_status(
                "Query finished, results are available.")
        elif posts is not None:
            self.dataset.update_status("Query finished, no results found.")

        # queue predefined post-processors
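        # "next" is assumed to be a list of processor definitions, each a
        # dictionary along the lines of (hypothetical):
        #   {"type": "some-processor-type", "parameters": {...}}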
        if num_posts > 0 and query_parameters.get("next", []):
            for next in query_parameters.get("next"):
                next_parameters = next.get("parameters", {})
                next_type = next.get("type", "")
                available_processors = self.dataset.get_available_processors()

                # run it only if the post-processor is actually available for this query
                if next_type in available_processors:
                    next_analysis = DataSet(
                        parameters=next_parameters,
                        type=next_type,
                        db=self.db,
                        parent=self.dataset.key,
                        extension=available_processors[next_type]["extension"])
                    self.queue.add_job(next_type, remote_id=next_analysis.key)

        # see if we need to register the result somewhere
        if query_parameters.get("copy_to", None):
            # copy the results to an arbitrary place that was passed
            if self.dataset.get_results_path().exists():
                # but only if we actually have something to copy
                shutil.copyfile(str(self.dataset.get_results_path()),
                                query_parameters.get("copy_to"))
            else:
                # if copy_to was passed, that means it's important that this
                # file exists somewhere, so we create it as an empty file
                with open(query_parameters.get("copy_to"), "w") as empty_file:
                    empty_file.write("")

        self.dataset.finish(num_rows=num_posts)
    def process(self):
        """
		Post the stringified dataset to the VUB API and process the results
		"""
        self.dataset.update_status(
            "Sending post data to PENELOPE API endpoint")

        chunk_size = 50  # results may vary
        chunk = []
        processed = 0
        entities = 0

        # the API has some problems with fancy quote characters, etc, and they
        # presumably don't make a difference for the results, so strip
        # everything that's not plain text (or a few non-harmful characters)
        # would need updating if languages other than English are supported
        non_alpha = re.compile(r"[^a-zA-Z0-9%!?+*&@#)(/:;, -]")
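        # e.g. a (hypothetical) sentence like "It’s “complicated”, isn’t it?"
        # comes out as "Its complicated, isnt it?" after stripping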

        with self.dataset.get_results_path().open("w") as output:
            writer = csv.DictWriter(output,
                                    fieldnames=("sentence", "utterance",
                                                "frameEvokingElement", "cause",
                                                "effect"))
            writer.writeheader()
            reader = self.iterate_items(self.source_file)
            while True:
                # the API can't handle too many sentences at once, so send
                # them in chunks
                self.dataset.update_status(
                    "%i sentences processed via PENELOPE API..." % processed)
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while interfacing with PENELOPE API")

                end_of_the_line = False
                try:
                    post = next(reader)
                    sentence = non_alpha.sub("", post["sentence"])
                    processed += 1
                    if not sentence:
                        # could be that it's just symbols, no text
                        continue

                    chunk.append(sentence)
                except StopIteration:
                    end_of_the_line = True

                if len(chunk) == chunk_size or end_of_the_line:
                    payload = {"texts": chunk, "frames": ["Causation"]}
                    response = requests.post(
                        "https://penelope.vub.be/semantic-frame-extractor/texts-extract-frames",
                        data=json.dumps(payload),
                        headers={"Content-type": "application/json"})

                    if response.status_code != 200:
                        self.log.warning(
                            "PENELOPE Semantic Frame API crashed for chunk %s"
                            % repr(chunk))
                        self.dataset.update_status(
                            "PENELOPE API response could not be parsed.")
                        entities = 0
                        break

                    # filter response to only include those sentences that
                    # actually contained any semantic frames
                    for frameset_list in response.json().get("frameSets", []):
                        if not frameset_list:
                            continue

                        for frameset in frameset_list:
                            if not frameset.get("entities", None):
                                continue

                            for entity in frameset.get("entities"):
                                entities += 1
                                writer.writerow({
                                    "sentence":
                                    frameset["utterance"],
                                    "utterance":
                                    entity.get("utterance", ""),
                                    "frameEvokingElement":
                                    entity.get("frameEvokingElement", ""),
                                    "cause":
                                    entity.get("cause", ""),
                                    "effect":
                                    entity.get("effect", "")
                                })

                    chunk = []

                if end_of_the_line:
                    self.dataset.update_status("Finished")
                    break
                else:
                    # let 'em breathe
                    time.sleep(1)

        self.dataset.finish(entities)
    def get_videos_user(self, session, user, csrftoken, detail):
        """
        Scrape videos for given BitChute user

        :param session:  HTTP Session to use
        :param str user:  Username to scrape videos for
        :param str csrftoken:  CSRF token to use for requests
        :param str detail:  Detail level to scrape, basic/detail/comments

        :return:  Video data dictionaries, as a generator
        """
        offset = 0
        num_items = 0
        base_url = "https://www.bitchute.com/channel/%s/" % user
        url = base_url + "extend/"

        container = session.get(base_url)
        container_soup = BeautifulSoup(container.text, 'html.parser')
        headers = {'Referer': base_url, 'Origin': "https://www.bitchute.com/"}

        while True:
            self.dataset.update_status("Retrieved %i items for query '%s'" %
                                       (num_items, user))

            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while scraping BitChute")

            post_data = {
                "csrfmiddlewaretoken": csrftoken,
                "name": "",
                "offset": str(offset)
            }
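            # the channel's /extend/ endpoint returns an HTML fragment with the
            # next batch of videos; "offset" is advanced by one per parsed
            # video below, so each POST asks for the videos after those already
            # seen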

            try:
                self.dataset.log("Fetching data for BitChute video %s" % url)
                request = session.post(url, data=post_data, headers=headers)
                if request.status_code != 200:
                    raise ConnectionError()
                response = request.json()
            except (json.JSONDecodeError, requests.RequestException,
                    ConnectionError) as e:
                self.dataset.update_status(
                    "Error while interacting with BitChute (%s) - try again later."
                    % e,
                    is_final=True)
                return

            soup = BeautifulSoup(response["html"], 'html.parser')
            videos = soup.select(".channel-videos-container")
            comments = []

            if len(videos) == 0 or num_items >= self.max_items:
                break

            for video_element in videos:
                if num_items >= self.max_items:
                    break
                else:
                    num_items += 1

                offset += 1

                link = video_element.select_one(".channel-videos-title a")
                video = {
                    "id":
                    link["href"].split("/")[-2],
                    "thread_id":
                    link["href"].split("/")[-2],
                    "subject":
                    link.text,
                    "body":
                    video_element.select_one(".channel-videos-text").
                    encode_contents().decode("utf-8").strip(),
                    "author":
                    container_soup.select_one(".details .name a").text,
                    "author_id":
                    container_soup.select_one(
                        ".details .name a")["href"].split("/")[2],
                    "timestamp":
                    int(
                        dateparser.parse(
                            video_element.select_one(
                                ".channel-videos-details.text-right.hidden-xs"
                            ).text).timestamp()),
                    "url":
                    "https://www.bitchute.com" + link["href"],
                    "views":
                    video_element.select_one(".video-views").text.strip(),
                    "length":
                    video_element.select_one(".video-duration").text.strip(),
                    "thumbnail_image":
                    video_element.select_one(
                        ".channel-videos-image img")["src"],
                }

                if detail != "basic":
                    video, comments = self.append_details(video, detail)
                    if not video:
                        # unrecoverable error while scraping details
                        return

                yield video
                for comment in comments:
                    # these need to be yielded *after* the video, because otherwise the result file
                    # will have the comments before the video, which is weird
                    yield comment
Exemple #27
0
    def process(self):
        """
		This takes previously generated Word2Vec models and uses them to find
		similar words based on a list of words
		"""
        self.dataset.update_status("Processing sentences")

        depth = max(1,
                    min(3, convert_to_int(self.parameters.get("crawl_depth"))))
        input_words = self.parameters.get("words", "")
        if not input_words or not input_words.split(","):
            self.dataset.update_status(
                "No input words provided, cannot look for similar words.",
                is_final=True)
            self.dataset.finish(0)
            return

        input_words = input_words.split(",")

        num_words = convert_to_int(self.parameters.get("num-words"))
        try:
            threshold = float(self.parameters.get("threshold"))
        except ValueError:
            threshold = float(self.get_options()["threshold"]["default"])

        threshold = max(-1.0, min(1.0, threshold))

        # go through all models and calculate similarity for all given input words
        result = []
        staging_area = self.unpack_archive_contents(self.source_file)
        for model_file in staging_area.glob("*.model"):
            interval = model_file.stem

            # for each separate model, calculate top similar words for each
            # input word, giving us at most
            #   [max amount] * [number of input] * [number of intervals]
            # items
            self.dataset.update_status("Running model %s..." % model_file.name)
            model = KeyedVectors.load(str(model_file))
            word_queue = set()
            checked_words = set()
            level = 1

            words = input_words.copy()
            while words:
                if self.interrupted:
                    shutil.rmtree(staging_area)
                    raise ProcessorInterruptedException(
                        "Interrupted while extracting similar words")

                word = words.pop()
                checked_words.add(word)

                try:
                    similar_words = model.most_similar(positive=[word],
                                                       topn=num_words)
                except KeyError:
                    continue
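
                # most_similar() returns (word, cosine similarity) pairs sorted
                # by descending similarity; keep only those above the threshold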

                for similar_word in similar_words:
                    if similar_word[1] < threshold:
                        continue

                    result.append({
                        "date":
                        interval,
                        "input":
                        word,
                        "item":
                        similar_word[0],
                        "value":
                        similar_word[1],
                        "input_occurences":
                        model.vocab[word].count,
                        "item_occurences":
                        model.vocab[similar_word[0]].count,
                        "depth":
                        level
                    })

                    # queue word for the next iteration if there is one and
                    # it hasn't been seen yet
                    if level < depth and similar_word[0] not in checked_words:
                        word_queue.add(similar_word[0])

                # if all words have been checked, but we still have an
                # iteration to go, load the queued words into the list
                if not words and word_queue and level < depth:
                    level += 1
                    words = word_queue.copy()
                    word_queue = set()

        shutil.rmtree(staging_area)

        if not result:
            self.dataset.update_status(
                "None of the words were found in the word embedding model.",
                is_final=True)
            self.dataset.finish(0)
        else:
            self.write_csv_items_and_finish(result)
Exemple #28
0
    def get_items(self, query):
        """
        Run custom search

        Fetches data from the Parler API.
        """

        # ready our parameters
        parameters = self.dataset.get_parameters()
        max_posts = parameters.get("items", 100)
        min_timestamp = parameters.get("min_date", 0)
        max_timestamp = parameters.get("max_date", time.time())
        queries = [
            query.strip() for query in parameters.get("query", "").split(",")
        ]
        scrape_echoes = parameters.get("scrape_echoes", False)
        num_query = 0

        # start a HTTP session. Parler uses two session 'cookies' that are required on each request, else no response
        # will be given. These can only be obtained by logging in. Logging in via 4CAT is not preferred, because it will
        # lead to quick rate limiting and requires people to share their passwords. Instead, ask users to obtain these
        # values by logging in themselves.
        session = requests.Session()
        session.cookies.set("mst", parameters.get("mst", ""))
        session.cookies.set("jst", parameters.get("jst", ""))
        session.headers[
            "User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"

        user_map = {}
        ref_map = {}
        seen_parleys = set()

        for query in queries:
            if not query.strip():
                continue

            num_query += 1
            query = query.strip()
            is_hashtag = (query[0] == "#")

            if is_hashtag:
                params = {"tag": query[1:], "limit": 100}
                url = "https://api.parler.com/v1/post/hashtag"
            else:
                # for user queries, we need the user ID, which is *not* the username and can only be obtained
                # via the API
                try:
                    user_id_src = self.request_from_parler(
                        session,
                        "GET",
                        "https://api.parler.com/v1/profile",
                        data={"username": query})
                    user_id = user_id_src["_id"]
                except KeyError:
                    # user does not exist or no results
                    continue
                except json.JSONDecodeError as e:
                    self.log.warning("%s:\n\n%s" % (e, user_id_src.text))
                    continue
                params = {"id": user_id, "limit": 100}
                url = "https://api.parler.com/v1/post/creator"

            cursor = ""
            num_posts = 0
            while True:
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while scraping Parler")

                if cursor:
                    # handles pagination
                    params["startkey"] = cursor

                try:
                    chunk_posts = self.request_from_parler(session,
                                                           "GET",
                                                           url,
                                                           data=params)

                    if chunk_posts.status_code in (404, 400):
                        # no results
                        break

                    if chunk_posts.status_code != 200:
                        # no results
                        self.dataset.update_status(
                            "Got unexpected status from Parler API (%i) - cannot parse data, halting."
                            % chunk_posts.status_code,
                            is_final=True)
                        return

                    chunk_posts = chunk_posts.json()
                except json.JSONDecodeError:
                    # this would be weird
                    self.dataset.update_status(
                        "Got unexpected response from Parler API - cannot parse data, halting.",
                        is_final=True)
                    return
                except (requests.RequestException, ConnectionError):
                    # this would be weird
                    self.dataset.update_status(
                        "Error connecting to Parler - halting.", is_final=True)
                    return

                if "posts" not in chunk_posts:
                    self.log.warning(repr(chunk_posts))
                    break

                for user in chunk_posts.get("users", {}):
                    user_map[user["id"]] = user["username"]

                for ref in chunk_posts.get("postRefs", {}):
                    ref_map[ref["_id"]] = ref

                done = False
                for post in chunk_posts["posts"]:
                    # fairly straightforward - most of the API response maps 1-on-1 to 4CAT data fields
                    # in case of reposts (echoes), use the original data and mark it as a repost
                    if post.get("source_dataset") and int(post.get("depth",
                                                                   0)) == 1:
                        if not scrape_echoes:
                            continue

                        reposted_by = user_map.get(post["creator"])
                        post_src = ref_map[post.get("source_dataset")]
                    else:
                        reposted_by = ""
                        post_src = post

                    if post_src["_id"] in seen_parleys:
                        # items may be scraped twice e.g. when querying two
                        # separate hashtags that are both used in a single
                        # parley - so keep track of seen parleys and skip
                        continue

                    seen_parleys.add(post_src["_id"])

                    dt = datetime.datetime.strptime(post["createdAt"],
                                                    "%Y%m%d%H%M%S")
                    post = {
                        "id":
                        post_src["_id"],
                        "thread_id":
                        post_src["_id"],
                        "subject":
                        "",
                        "body":
                        post_src["body"],
                        "author":
                        user_map.get(post_src["creator"], ""),
                        "timestamp":
                        int(dt.timestamp()),
                        "comments":
                        self.expand_number(post_src["comments"]),
                        "urls":
                        ",".join([("https://api.parler.com/l/" + link)
                                  for link in post_src["links"]]),
                        "hashtags":
                        ",".join(post_src["hashtags"]),
                        "impressions":
                        self.expand_number(post_src["impressions"]),
                        "reposts":
                        self.expand_number(post_src["reposts"]),
                        "upvotes":
                        self.expand_number(post_src["upvotes"]),
                        "permalink":
                        post_src.get("shareLink", ""),
                        "reposted_by":
                        reposted_by
                    }

                    if min_timestamp and dt.timestamp() < min_timestamp:
                        done = True
                        break

                    if max_timestamp and dt.timestamp() >= max_timestamp:
                        continue

                    num_posts += 1
                    yield post

                    if num_posts >= max_posts:
                        break

                self.dataset.update_status(
                    "Retrieved %i posts for query '%s' (%i/%i)" %
                    (num_posts, query, num_query, len(queries)))

                # paginate, if needed
                if not done and num_posts < max_posts and not chunk_posts[
                        "last"]:
                    cursor = chunk_posts["next"]
                    time.sleep(1.5)
                else:
                    break

            time.sleep(1)
    def get_items(self, query):
        """
		Run custom search

		Fetches data from Instagram via instaloader.
		"""
        # this is useful to include in the results because researchers are
        # always thirsty for them hashtags
        hashtag = re.compile(r"#([^\s,.+=-]+)")
        mention = re.compile(r"@([a-zA-Z0-9_]+)")
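        # e.g. a (hypothetical) caption "loving #sunset_pics with @some_user"
        # yields hashtags ["sunset_pics"] and mentions ["some_user"]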

        instagram = instaloader.Instaloader(quiet=True,
                                            download_pictures=False,
                                            download_videos=False,
                                            download_comments=True,
                                            download_geotags=False,
                                            download_video_thumbnails=False,
                                            compress_json=False,
                                            save_metadata=True)

        # ready our parameters
        parameters = self.dataset.get_parameters()
        scope = parameters.get("search_scope", "")
        queries = [
            query.strip() for query in parameters.get("query", "").split(",")
        ]

        posts = []
        max_posts = self.dataset.parameters.get("items", 500)

        # for each query, get items
        for query in queries:
            chunk_size = 0
            self.dataset.update_status("Retrieving posts ('%s')" % query)
            try:
                if scope == "hashtag":
                    query = query.replace("#", "")
                    chunk = instagram.get_hashtag_posts(query)
                elif scope == "username":
                    query = query.replace("@", "")
                    profile = instaloader.Profile.from_username(
                        instagram.context, query)
                    chunk = profile.get_posts()
                else:
                    self.log.warning(
                        "Invalid search scope for instagram scraper: %s" %
                        repr(scope))
                    return []

                # "chunk" is a generator so actually retrieve the posts next
                posts_processed = 0
                for post in chunk:
                    if self.interrupted:
                        raise ProcessorInterruptedException(
                            "Interrupted while fetching posts from Instagram")

                    chunk_size += 1
                    self.dataset.update_status(
                        "Retrieving posts ('%s', %i posts)" %
                        (query, chunk_size))
                    if posts_processed >= max_posts:
                        break

                    # append the post the loop already retrieved; advancing the
                    # generator again here would silently skip every other post
                    posts.append(post)
                    posts_processed += 1
            except instaloader.InstaloaderException as e:
                # should we abort here and return 0 posts?
                self.log.warning("Instaloader exception during query %s: %s" %
                                 (self.dataset.key, e))
                self.dataset.update_status(
                    "Error while retrieving posts for query '%s'" % query)

        # go through posts, and retrieve comments
        results = []
        posts_processed = 0
        comments_bit = " and comments" if self.parameters.get(
            "scrape_comments", False) else ""

        for post in posts:
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while fetching post metadata from Instagram")

            posts_processed += 1
            self.dataset.update_status("Retrieving metadata%s for post %i" %
                                       (comments_bit, posts_processed))

            thread_id = post.shortcode

            try:
                results.append({
                    "id":
                    thread_id,
                    "thread_id":
                    thread_id,
                    "parent_id":
                    thread_id,
                    "body":
                    post.caption if post.caption is not None else "",
                    "author":
                    post.owner_username,
                    "timestamp":
                    int(post.date_utc.timestamp()),
                    "type":
                    "video" if post.is_video else "picture",
                    "url":
                    post.video_url if post.is_video else post.url,
                    "thumbnail_url":
                    post.url,
                    "hashtags":
                    ",".join(post.caption_hashtags),
                    "usertags":
                    ",".join(post.tagged_users),
                    "mentioned":
                    ",".join(
                        mention.findall(post.caption) if post.caption else ""),
                    "num_likes":
                    post.likes,
                    "num_comments":
                    post.comments,
                    "subject":
                    ""
                })
            except (instaloader.QueryReturnedNotFoundException,
                    instaloader.ConnectionException):
                pass

            if not self.parameters.get("scrape_comments", False):
                continue

            try:
                for comment in post.get_comments():
                    answers = [answer for answer in comment.answers]

                    try:
                        results.append({
                            "id":
                            comment.id,
                            "thread_id":
                            thread_id,
                            "parent_id":
                            thread_id,
                            "body":
                            comment.text,
                            "author":
                            comment.owner.username,
                            "timestamp":
                            int(comment.created_at_utc.timestamp()),
                            "type":
                            "comment",
                            "url":
                            "",
                            "hashtags":
                            ",".join(hashtag.findall(comment.text)),
                            "usertags":
                            "",
                            "mentioned":
                            ",".join(mention.findall(comment.text)),
                            "num_likes":
                            comment.likes_count if hasattr(
                                comment, "likes_count") else 0,
                            "num_comments":
                            len(answers),
                            "subject":
                            ""
                        })
                    except instaloader.QueryReturnedNotFoundException:
                        pass

                    # instagram only has one reply depth level at the time of
                    # writing, represented here
                    for answer in answers:
                        try:
                            results.append({
                                "id":
                                answer.id,
                                "thread_id":
                                thread_id,
                                "parent_id":
                                comment.id,
                                "body":
                                answer.text,
                                "author":
                                answer.owner.username,
                                "timestamp":
                                int(answer.created_at_utc.timestamp()),
                                "type":
                                "comment",
                                "url":
                                "",
                                "hashtags":
                                ",".join(hashtag.findall(answer.text)),
                                "usertags":
                                "",
                                "mentioned":
                                ",".join(mention.findall(answer.text)),
                                "num_likes":
                                answer.likes_count if hasattr(
                                    answer, "likes_count") else 0,
                                "num_comments":
                                0,
                                "subject":
                                ""
                            })
                        except instaloader.QueryReturnedNotFoundException:
                            pass

            except (instaloader.QueryReturnedNotFoundException,
                    instaloader.ConnectionException):
                # data not available...? this happens sometimes, not clear why
                pass

        # remove temporary fetched data and return posts
        return results
Exemple #30
0
    def get_items(self, query):
        """
        Use the Twitter v2 API historical search to get tweets

        :param query:
        :return:
        """
        # this is pretty sensitive so delete it immediately after storing in
        # memory
        bearer_token = self.parameters.get("api_bearer_token")
        auth = {"Authorization": "Bearer %s" % bearer_token}

        endpoint = "https://api.twitter.com/2/tweets/search/all"

        # these are all expansions and fields available at the time of writing
        # since it does not cost anything extra in terms of rate limiting, go
        # for as much data per tweet as possible...
        tweet_fields = ("attachments", "author_id", "context_annotations",
                        "conversation_id", "created_at", "entities", "geo",
                        "id", "in_reply_to_user_id", "lang", "public_metrics",
                        "possibly_sensitive", "referenced_tweets",
                        "reply_settings", "source", "text", "withheld")
        user_fields = ("created_at", "description", "entities", "id",
                       "location", "name", "pinned_tweet_id",
                       "profile_image_url", "protected", "public_metrics",
                       "url", "username", "verified", "withheld")
        place_fields = ("contained_within", "country", "country_code",
                        "full_name", "geo", "id", "name", "place_type")
        poll_fields = ("duration_minutes", "end_datetime", "id", "options",
                       "voting_status")
        expansions = ("attachments.poll_ids", "attachments.media_keys",
                      "author_id", "entities.mentions.username",
                      "geo.place_id", "in_reply_to_user_id",
                      "referenced_tweets.id", "referenced_tweets.id.author_id")
        media_fields = ("duration_ms", "height", "media_key",
                        "non_public_metrics", "organic_metrics",
                        "preview_image_url", "promoted_metrics",
                        "public_metrics", "type", "url", "width")
        amount = convert_to_int(self.parameters.get("amount"), 10)

        params = {
            "query": self.parameters.get("query", ""),
            "expansions": ",".join(expansions),
            "tweet.fields": ",".join(tweet_fields),
            "user.fields": ",".join(user_fields),
            "poll.fields": ",".join(poll_fields),
            "place.fields": ",".join(place_fields),
            "media.fields": ",".join(media_fields),
            "max_results": max(10, min(amount, 500))
            if amount > 0 else 500,  # 500 = upper limit, 10 = lower
        }
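        # e.g. amount=10 requests pages of 10 tweets, amount=2000 is capped at
        # the endpoint's maximum page size of 500, and amount<=0 ("no limit")
        # also pages by 500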

        if self.parameters.get("min_date"):
            params["start_time"] = datetime.datetime.fromtimestamp(
                self.parameters["min_date"]).strftime("%Y-%m-%dT%H:%M:%SZ")

        if self.parameters.get("max_date"):
            params["end_time"] = datetime.datetime.fromtimestamp(
                self.parameters["max_date"]).strftime("%Y-%m-%dT%H:%M:%SZ")

        tweets = 0
        self.dataset.log("Search parameters: %s" % repr(params))
        while True:
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while getting tweets from the Twitter API")

            # there is a limit of one request per second, so stay on the safe side of this
            while self.previous_request == int(time.time()):
                time.sleep(0.1)
            time.sleep(0.05)
            self.previous_request = int(time.time())

            # now send the request, retrying up to 5 times if the connection seems unstable
            retries = 5
            api_response = None
            while retries > 0:
                try:
                    api_response = requests.get(endpoint,
                                                headers=auth,
                                                params=params)
                    break
                except (ConnectionError,
                        requests.exceptions.RequestException) as e:
                    retries -= 1
                    wait_time = (5 - retries) * 10
                    self.dataset.update_status(
                        "Got %s, waiting %i seconds before retrying" %
                        (str(e), wait_time))
                    time.sleep(wait_time)

            # bail out if all retries failed and there's no response to work with
            if api_response is None:
                self.dataset.update_status(
                    "Could not connect to Twitter. Cancelling.", is_final=True)
                return

            # rate limited - the limit at time of writing is 300 reqs per 15
            # minutes; usually you don't hit this when requesting batches of
            # 500 at 1/second
            if api_response.status_code == 429:
                resume_at = convert_to_int(
                    api_response.headers["x-rate-limit-reset"]) + 1
                resume_at_str = datetime.datetime.fromtimestamp(
                    int(resume_at)).strftime("%c")
                self.dataset.update_status(
                    "Hit Twitter rate limit - waiting until %s to continue." %
                    resume_at_str)
                while time.time() <= resume_at:
                    time.sleep(0.5)
                continue

            # API keys that are valid but don't have access or haven't been
            # activated properly get a 403
            elif api_response.status_code == 403:
                try:
                    structured_response = api_response.json()
                    self.dataset.update_status(
                        "'Forbidden' error from Twitter API. Could not connect to Twitter API "
                        "with this API key. %s" %
                        structured_response.get("detail", ""),
                        is_final=True)
                except (json.JSONDecodeError, ValueError):
                    self.dataset.update_status(
                        "'Forbidden' error from Twitter API. Your key may not have access to "
                        "the full-archive search endpoint.",
                        is_final=True)
                finally:
                    return

            # sometimes twitter says '503 service unavailable' for unclear
            # reasons - in that case just wait a while and try again
            elif api_response.status_code in (502, 503, 504):
                resume_at = time.time() + 60
                resume_at_str = datetime.datetime.fromtimestamp(
                    int(resume_at)).strftime("%c")
                self.dataset.update_status(
                    "Twitter unavailable (status %i) - waiting until %s to continue."
                    % (api_response.status_code, resume_at_str))
                while time.time() <= resume_at:
                    time.sleep(0.5)
                continue

            # this usually means the query is too long or otherwise contains
            # a syntax error
            elif api_response.status_code == 400:
                msg = "Response %i from the Twitter API; " % api_response.status_code
                try:
                    api_response = api_response.json()
                    msg += api_response.get("title", "")
                    if "detail" in api_response:
                        msg += ": " + api_response.get("detail", "")
                except (json.JSONDecodeError, TypeError):
                    msg += "Some of your parameters (e.g. date range) may be invalid."

                self.dataset.update_status(msg, is_final=True)
                return

            # invalid API key
            elif api_response.status_code == 401:
                self.dataset.update_status(
                    "Invalid API key - could not connect to Twitter API",
                    is_final=True)
                return

            # haven't seen one yet, but they probably exist
            elif api_response.status_code != 200:
                self.dataset.update_status(
                    "Unexpected HTTP status %i. Halting tweet collection." %
                    api_response.status_code,
                    is_final=True)
                self.log.warning(
                    "Twitter API v2 responded with status code %i. Response body: %s"
                    % (api_response.status_code, api_response.text))
                return

            api_response = api_response.json()

            # The API response contains tweets (of course) and 'includes',
            # objects that can be referenced in tweets. Later we will splice
            # this data into the tweets themselves to make them easier to
            # process. So extract them first...
            included_users = api_response.get("includes", {}).get("users", {})
            included_media = api_response.get("includes", {}).get("media", {})
            included_polls = api_response.get("includes", {}).get("polls", {})
            included_tweets = api_response.get("includes",
                                               {}).get("tweets", {})
            included_places = api_response.get("includes",
                                               {}).get("places", {})

            for tweet in api_response.get("data", []):
                if 0 < amount <= tweets:
                    break

                # splice referenced data back in
                # we use copy.deepcopy here because else we run into a
                # pass-by-reference quagmire
                tweet = self.enrich_tweet(tweet, included_users,
                                          included_media, included_polls,
                                          included_places,
                                          copy.deepcopy(included_tweets))

                tweets += 1
                if tweets % 500 == 0:
                    self.dataset.update_status(
                        "Received %i tweets from Twitter API" % tweets)

                yield tweet

            # paginate
            if (amount <= 0 or tweets < amount) and api_response.get(
                    "meta") and "next_token" in api_response["meta"]:
                params["next_token"] = api_response["meta"]["next_token"]
            else:
                break