def process(self):
    """
    Convert a Twitter NDJSON file to NDJSON that TCAT can import

    Every item of the source file is passed through `map_to_TCAT()` and
    written as one JSON object per line, the layout expected by TCAT's
    import-jsondump.php.

    TCAT has a check on line 62 of /import/import-jsondump.php that rejects
    lines of 40960 or more; see
    https://github.com/digitalmethodsinitiative/dmi-tcat/blob/9654fe3ff489fd3b0efc6ddcf7c19adf8ed7726d/import/import-jsondump.php#L62
    Tweets that serialise to such a line are therefore skipped. The length is
    measured on the UTF-8 encoded form of exactly the line that is written, and
    the dataset is finished with the number of tweets actually written.

    :raises ProcessorInterruptedException: if the worker is asked to stop
    """
    max_line_length = 40960
    written = 0

    self.dataset.update_status("Converting posts")

    # explicit UTF-8: the lines are dumped with ensure_ascii=False, so the
    # platform default encoding may not be able to represent them
    with self.dataset.get_results_path().open("w", encoding="utf-8") as output:
        # this handles and writes one tweet at a time
        for post in self.iterate_items(self.source_file, bypass_map_item=True):
            # stop processing if worker has been asked to stop
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while processing NDJSON file")

            # serialise once, and check the same string that gets written
            line = json.dumps(self.map_to_TCAT(post), ensure_ascii=False)
            if len(line.encode("utf-8")) >= max_line_length:
                # TCAT would reject this line, so it is dropped
                continue

            # NDJSON file is expected by TCAT
            output.write(line + "\n")
            written += 1

    self.dataset.update_status("Finished.")
    self.dataset.finish(num_rows=written)
def fetch_posts(self, post_ids, where=None, replacements=None, groups=None):
    """
    Fetch post data from database

    Builds a `SELECT` on the `posts_<prefix>` table for the columns listed in
    `self.return_cols` and runs it through `self.db.fetchall_interruptable()`.

    :param post_ids:  Post IDs to return data for; bound to an `id IN %s`
    placeholder
    :param list where:  Optional extra SQL conditions, combined with `AND`.
    The list passed in is not modified.
    :param list replacements:  Optional values for the placeholders used in
    `where`. The list passed in is not modified.
    :param groups:  Optional group patterns; if given, only posts that are
    in a group matching any of these (`LIKE ANY`) are returned
    :return:  Whatever `fetchall_interruptable()` returns for the query
    (presumably one database record per post - verify against the database
    wrapper)
    :raises ProcessorInterruptedException: if the worker is asked to stop
    """
    # work on copies, so the caller's lists do not grow as a side effect
    where = list(where) if where else []
    replacements = list(replacements) if replacements else []

    if self.interrupted:
        raise ProcessorInterruptedException(
            "Interrupted while fetching post data")

    where.append("id IN %s")
    replacements.append(post_ids)

    if groups:
        where.append("id IN ( SELECT post_id FROM groups_" + self.prefix +
                     " WHERE \"group\" LIKE ANY(%s) )")
        replacements.append(groups)

    columns = ", ".join(self.return_cols)
    query = ("SELECT " + columns + " FROM posts_" + self.prefix + " WHERE " +
             " AND ".join(where) + " ORDER BY id ASC")

    return self.db.fetchall_interruptable(self.queue, query, replacements)
def call_penelope_api(self, endpoint, *args, **kwargs):
    """
    Query the PENELOPE Guardian API, retrying on request errors

    The request is attempted until it succeeds or `self.max_retries` request
    errors have occurred; in the latter case the failure is logged, the
    dataset status is updated and `None` is returned.

    :param endpoint:  Endpoint to call, relative to the API's HTTP root
    :param args:  Positional arguments passed on to `requests.get()`
    :param kwargs:  Keyword arguments passed on to `requests.get()`
    :return:  The decoded JSON response, or `None` if all attempts failed
    :raises ProcessorInterruptedException: if the worker is asked to stop
    """
    failed_attempts = 0

    while True:
        if failed_attempts >= self.max_retries:
            # out of attempts: report and give up
            self.log.error("Error during PENELOPE fetch of query %s" %
                           self.dataset.key)
            self.dataset.update_status(
                "Error while searching for posts on PENELOPE Guardian API")
            return None

        if self.interrupted:
            raise ProcessorInterruptedException(
                "Interrupted while fetching data from the Penelope API")

        try:
            url = "http://penelope.vub.be/guardian-climate-change-data/" + endpoint
            response = requests.get(url, *args, **kwargs)
        except requests.RequestException as e:
            self.log.info(
                "Error %s while querying PENELOPE Guardian API - retrying..." %
                e)
            failed_attempts += 1
            continue

        return response.json()
def get_post_by_id(self, blog_name, post_id):
    """
    Fetch an individual post

    :param str blog_name:  The blog's name
    :param post_id:  The post ID, passed to the client as `id`
    :return:  The first post in the response's `posts` list, or `None` if
    the response has no `posts` key or the list is empty
    :raises ProcessorInterruptedException: if the worker is asked to stop
    """
    if self.interrupted:
        raise ProcessorInterruptedException(
            "Interrupted while fetching post from Tumblr")

    client = self.connect_to_tumblr()

    # Request the specific post.
    response = client.posts(blog_name, id=post_id)

    # Tumblr API can sometimes return with this kind of error:
    # {'meta': {'status': 500, 'msg': 'Server Error'}, 'response': {'error': 'Malformed JSON or HTML was returned.'}}
    # An empty list of posts is treated the same way, rather than raising an
    # IndexError below.
    if "posts" not in response or not response["posts"]:
        return None

    # A request for one post ID is answered with a one-item list
    return response["posts"][0]
def process(self):
    """
    Write the items of the source file to the results file as a JSON array

    Items are serialised and written one at a time rather than dumping the
    whole collection at once, so that at most one item is held in memory.

    :raises ProcessorInterruptedException: if the worker is asked to stop
    """
    self.dataset.update_status("Converting posts")

    converted = 0
    results_path = self.dataset.get_results_path()

    with results_path.open("w") as output:
        output.write("[")

        for item in self.iterate_items(self.source_file):
            # stop processing if worker has been asked to stop
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while processing CSV file")

            # every item but the first is preceded by a separator
            if converted:
                output.write(",")

            output.write(json.dumps(item))
            converted += 1

        output.write("]")

    self.dataset.update_status("Finished.")
    self.dataset.finish(num_rows=converted)
def process(self):
    """
    Annotate the images in the source archive and store the annotations

    Each image yielded by `iterate_archive_contents()` is passed to
    `annotate_image()`; non-empty annotations are appended to the results
    file as one JSON object per line, together with the image's file name.
    The API key is removed from the dataset parameters as soon as it has been
    read. Processing stops early when `annotate_image()` raises a
    `RuntimeError` or when the configured maximum amount of images is reached.

    :raises ProcessorInterruptedException: if the worker is asked to stop
    """
    api_key = self.parameters.get("api_key")
    self.dataset.delete_parameter("api_key")  # sensitive, delete after use

    requested = self.parameters.get("features")
    features = [{"type": feature} for feature in requested]

    if not api_key:
        self.dataset.update_status("You need to provide a valid API key",
                                   is_final=True)
        self.dataset.finish(0)
        return

    max_images = convert_to_int(self.parameters.get("amount", 0), 100)
    if max_images:
        total = min(max_images, self.source_dataset.num_rows)
    else:
        total = self.source_dataset.num_rows

    done = 0
    for image_file in self.iterate_archive_contents(self.source_file):
        if self.interrupted:
            raise ProcessorInterruptedException(
                "Interrupted while fetching data from Google Vision API")

        done += 1
        self.dataset.update_status("Annotating image %i/%i" % (done, total))

        try:
            annotations = self.annotate_image(image_file, api_key, features)
        except RuntimeError:
            # cannot continue fetching, e.g. when API key is invalid
            break

        if not annotations:
            continue

        record = {"file_name": image_file.name, **annotations}
        results_path = self.dataset.get_results_path()
        with results_path.open("a", encoding="utf-8") as outfile:
            outfile.write(json.dumps(record) + "\n")

        if max_images and done >= max_images:
            break

    self.dataset.update_status("Annotations retrieved for %i images" % done)
    self.dataset.finish(done)
def tokens_from_file(self, file, staging_area, phraser=None):
    """
    Read tokens from token dump

    This is a generator: token sets are yielded one by one, which reduces
    memory usage for JSON dumps and allows interruption.

    Three layouts are understood:

    - pickled dumps (`.pb` files), which are loaded in one go
    - JSON dumps with one token set per line (each line optionally ending
      with a comma, the array closed by a line containing only `]`), which
      are parsed line by line
    - old-format JSON dumps that are not split over lines; as soon as a line
      cannot be parsed on its own, the whole file is loaded and the token
      sets that were not yielded yet are yielded from that

    :param Path file:  Token dump to read
    :param Path staging_area:  Path to staging area, so it can be cleaned up
    when the processor is interrupted
    :param Phraser phraser:  Optional. If given, each yielded token set is
    passed through the phraser to detect (e.g.) bigrams.
    :return:  A generator yielding one set of tokens at a time
    :raises ProcessorInterruptedException: if the worker is asked to stop
    """
    def abort_if_interrupted():
        # the staging area is removed before bailing out
        if self.interrupted:
            shutil.rmtree(staging_area)
            raise ProcessorInterruptedException(
                "Interrupted while reading tokens")

    def prepare(token_set):
        return phraser[token_set] if phraser else token_set

    # Path.suffix includes the leading dot
    if file.suffix == ".pb":
        # NOTE(review): unpickling is only safe for files written by this
        # application itself; never point this at untrusted data
        with file.open("rb") as infile:
            token_sets = pickle.load(infile)

        for token_set in token_sets:
            abort_if_interrupted()
            yield prepare(token_set)
        return

    remainder = []
    with file.open("r") as infile:
        # skip the character that opens the JSON array
        infile.read(1)
        parsed = 0

        while True:
            line = infile.readline()
            if not line:
                # readline() returns an empty string at the end of the file
                break

            abort_if_interrupted()

            line = line.strip()
            if not line:
                continue

            if line == "]":
                # this marks the end of the file
                break

            # the tokeniser dumps the json with one set of tokens per
            # line, ending with a comma
            if line.endswith(","):
                line = line[:-1]

            try:
                token_set = json.loads(line)
            except json.JSONDecodeError:
                # old-format json dumps are not suitable for the line-based
                # approach; load everything and continue after the token
                # sets that were already yielded
                infile.seek(0)
                remainder = json.load(infile)[parsed:]
                break

            yield prepare(token_set)
            parsed += 1

    for token_set in remainder:
        abort_if_interrupted()
        yield prepare(token_set)
def iterate_archive_contents(self, path, staging_area=None):
    """
    A generator that iterates through files in an archive

    With every iteration, the processor's 'interrupted' flag is checked,
    and if set a ProcessorInterruptedException is raised, which by default
    is caught and subsequently stops execution gracefully.

    Files are temporarily unzipped, under their base name, and deleted after
    use. Archive members in sub-folders are supported; folder entries are
    skipped.

    If the processor has a `staging_area` attribute, that folder is removed
    (and the attribute deleted) when iteration ends - whether it runs to
    completion, is interrupted, or the generator is closed early because the
    caller stopped iterating.

    :param Path path:  Path to zip file to read
    :param Path staging_area:  Where to store the files while they're
    being worked with. If omitted, the processor's own staging area is used,
    which is created first if it does not exist yet.
    :return:  An iterator with a Path item for each file
    :raises RuntimeError: if `staging_area` is given but is not an existing
    folder
    :raises ProcessorInterruptedException: if the worker is asked to stop
    """
    if not path.exists():
        return

    if staging_area and (not staging_area.exists() or not staging_area.is_dir()):
        raise RuntimeError("Staging area %s is not a valid folder" % staging_area)

    if not staging_area:
        if not hasattr(self, "staging_area"):
            self.staging_area = self.dataset.get_staging_area()
        staging_area = self.staging_area

    try:
        with zipfile.ZipFile(path, "r") as archive_file:
            for archived_file in sorted(archive_file.namelist()):
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while iterating zip file contents")

                file_name = archived_file.split("/")[-1]
                if file_name in ("", ".", ".."):
                    # folder entry, or not a usable file name
                    continue

                # copy the member to a flat file in the staging area; the
                # member is addressed by its full archive name, so files in
                # sub-folders of the archive can be read too
                temp_file = staging_area.joinpath(file_name)
                with archive_file.open(archived_file) as source, temp_file.open("wb") as target:
                    shutil.copyfileobj(source, target)

                yield temp_file

                if hasattr(self, "staging_area") and temp_file.exists():
                    temp_file.unlink()
    finally:
        # also reached when the generator is closed before it is exhausted
        if hasattr(self, "staging_area"):
            shutil.rmtree(self.staging_area, ignore_errors=True)
            del self.staging_area
def resolve_redirect(url, depth=0):
    """
    Follow a (shortened) URL to the address it redirects to

    Only URLs on one of the known redirect domains (or Parler link URLs) are
    looked up. Redirects are followed recursively up to a depth of 10; the
    address a URL redirects to is stored in the cache.

    :param url:  URL to resolve, or a regular expression match object, in
    which case the full match is used as URL
    :param int depth:  Number of redirects followed so far
    :return str:  The resolved URL, or the URL as given if it could not be
    (or did not need to be) resolved
    """
    if self.interrupted:
        raise ProcessorInterruptedException(
            "Interrupted while expanding URL")

    # allow use as a replacement callback for regular expressions
    if hasattr(url, "group"):
        url = url.group(0)

    # get host name to compare to list of shorteners
    without_scheme = re.sub(r"^[a-z]*://", "", url)
    host_name = without_scheme.split("/")[0].lower()

    # to avoid infinite recursion, do not go deeper than 10 levels
    if depth >= 10:
        return url

    # skip non-redirects
    is_parler_link = "api.parler.com/l" in url
    if not is_parler_link and host_name not in self.redirect_domains:
        return url

    if url in cache:
        return cache[url]

    # keep track of current depth here
    next_depth = depth + 1

    # do this explicitly because it is a known issue and will save
    # one request
    if host_name == "t.co":
        url = url.replace("http://", "https://")

    try:
        time.sleep(0.1)
        head_request = requests.head(url, timeout=5)
    except (requests.RequestException, ConnectionError, ValueError,
            TimeoutError):
        return url

    # only responses in the 'valid request' range are considered
    if not 200 <= head_request.status_code < 400:
        return url

    # if there is a Location header different from the page's url,
    # recursively resolve the page it redirects to
    redirected_to = head_request.headers.get("Location", url)
    if redirected_to == url:
        return url

    cache[url] = redirected_to
    return resolve_redirect(redirected_to, next_depth)
def unpack_archive_contents(self, path, staging_area=None):
    """
    Unpack all files in an archive to a staging area

    With every iteration, the processor's 'interrupted' flag is checked,
    and if set a ProcessorInterruptedException is raised, which by default
    is caught and subsequently stops execution gracefully.

    Files are unzipped to a staging area. The staging area is *not*
    cleaned up automatically.

    :param Path path:  Path to zip file to read
    :param Path staging_area:  Where to unpack the files. If omitted, the
    processor's own staging area is used, which is created first if it does
    not exist yet.
    :return Path:  A path to the staging area, or `None` if the archive does
    not exist
    :raises RuntimeError: if `staging_area` is given but is not an existing
    folder
    :raises ProcessorInterruptedException: if the worker is asked to stop
    """
    if not path.exists():
        return None

    if staging_area and (not staging_area.exists() or not staging_area.is_dir()):
        raise RuntimeError("Staging area %s is not a valid folder" % staging_area)

    # a staging area supplied by the caller takes precedence
    if not staging_area:
        if not hasattr(self, "staging_area"):
            self.staging_area = self.dataset.get_staging_area()
        staging_area = self.staging_area

    with zipfile.ZipFile(path, "r") as archive_file:
        for archived_file in sorted(archive_file.namelist()):
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while iterating zip file contents")

            archive_file.extract(archived_file, staging_area)

    return staging_area
async def gather_posts(self, client, queries, max_items):
    """
    Gather messages for each entity for which messages are requested

    Messages that represent an action (e.g. someone joining the channel) are
    skipped and do not count towards the maximum. The messages of each entity
    are added to the result in reverse order of retrieval.

    :param TelegramClient client:  Telegram Client
    :param list queries:  List of entities to query (as string)
    :param int max_items:  Messages to scrape per entity
    :return list:  List of messages, each message as returned by
    `import_message()`
    :raises ProcessorInterruptedException: if the worker is asked to stop
    """
    posts = []

    for query in queries:
        self.dataset.update_status("Fetching messages for entity '%s'" % query)
        query_posts = []

        try:
            async for message in client.iter_messages(entity=query):
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while fetching message data from the Telegram API"
                    )

                if len(query_posts) % 500 == 0:
                    self.dataset.update_status(
                        "Retrieved %i posts for entity '%s'" %
                        (len(query_posts) + len(posts), query))

                if message.action is not None:
                    # e.g. someone joins the channel - not an actual message
                    continue

                query_posts.append(self.import_message(message, query))

                # stop as soon as the requested amount has been collected,
                # instead of only after one message too many
                if len(query_posts) >= max_items:
                    break

        except (ValueError, UsernameInvalidError):
            # whatever was collected for this entity so far is kept
            self.dataset.update_status("Could not scrape entity '%s'" % query)

        posts.extend(reversed(query_posts))

    return posts
def items_to_ndjson(self, items, filepath):
    """
    Save retrieved items as an ndjson file

    NDJSON is a file with one valid JSON value per line, in this case each
    of these JSON values represents a retrieved item. This is useful if the
    retrieved data cannot easily be completely stored as a flat CSV file and
    we want to leave the choice of how to flatten it to the user.

    Items are written as they are, with one exception: if the `pseudonymise`
    parameter is set, each item is first passed to `search_and_update()` to
    replace its `author` values via a salted hasher.

    :param Iterator items:  Items to save
    :param Path filepath:  Location to save results file
    :return int:  Number of items written
    :raises ResourceWarning: if no file path is given
    :raises ProcessorInterruptedException: if the worker is asked to stop
    """
    if not filepath:
        raise ResourceWarning("No valid results path supplied")

    pseudonymise_author = bool(self.parameters.get("pseudonymise", None))
    if pseudonymise_author:
        # hashed author names are cached, so the hashing function (which is
        # relatively expensive) is not run too often
        hash_cache = {}
        hasher = hashlib.blake2b(digest_size=24)
        hasher.update(str(config.ANONYMISATION_SALT).encode("utf-8"))

    processed = 0
    with filepath.open("w", encoding="utf-8", newline="") as outfile:
        for processed, item in enumerate(items, start=1):
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while writing results to file")

            # replace author values with a salted hash of the author name,
            # if pseudonymisation is enabled
            if pseudonymise_author:
                check_cashe = CheckCashe(hash_cache, hasher)
                self.search_and_update(item, ['author'],
                                       check_cashe.update_cache)

            outfile.write(json.dumps(item) + "\n")

    return processed
def write_csv_items_and_finish(self, data):
    """
    Write data as csv to results file and finish dataset

    The result file path is determined via the dataset's own helper method.
    The keys of the first item are used as the CSV columns. After writing
    the results, the dataset is marked finished with the number of items as
    row count.

    :param data:  A list or tuple of dictionaries, all with the same keys
    :raises TypeError: if `data` is not a list or tuple, or its first item
    is not a dictionary
    :raises ValueError: if `data` is empty
    :raises ProcessorInterruptedException: if the interrupted flag for this
    processor is set while iterating
    """
    is_sequence = isinstance(data, (list, tuple))
    if isinstance(data, str) or not is_sequence:
        raise TypeError(
            "write_csv_items requires a list or tuple of dictionaries as argument"
        )

    if not data:
        raise ValueError(
            "write_csv_items requires a dictionary with at least one item")

    first_row = data[0]
    if not isinstance(first_row, dict):
        raise TypeError(
            "write_csv_items requires a list or tuple of dictionaries as argument"
        )

    self.dataset.update_status("Writing results file")

    results_path = self.dataset.get_results_path()
    with results_path.open("w", encoding="utf-8", newline='') as results:
        writer = csv.DictWriter(results, fieldnames=first_row.keys())
        writer.writeheader()

        for row in data:
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while writing results file")
            writer.writerow(row)

    self.dataset.update_status("Finished")
    self.dataset.finish(len(data))
def process(self):
    """
    Recreate the source CSV file in a dialect that Excel for macOS can read

    All items of the source file are written to the results file as CSV with
    the `excel-mac` dialect registered here (semicolon-delimited, CRLF line
    endings, minimal quoting).

    :raises ProcessorInterruptedException: if the worker is asked to stop
    """
    posts = 0
    self.dataset.update_status("Converting posts")

    # painstaking empirical work has determined that this dialect is
    # compatible with the MacOS version of Microsoft Excel
    csv.register_dialect("excel-mac",
                         delimiter=";",
                         doublequote=True,
                         escapechar=None,
                         lineterminator="\r\n",
                         quotechar='"',
                         quoting=csv.QUOTE_MINIMAL,
                         skipinitialspace=False,
                         strict=False)

    # recreate CSV file with the new dialect; newline="" leaves the line
    # endings to the csv writer, so the dialect's CRLF is not translated
    # again by the platform
    with self.dataset.get_results_path().open("w", newline="") as output:
        fieldnames = self.get_item_keys(self.source_file)

        writer = csv.DictWriter(output,
                                fieldnames=fieldnames,
                                dialect="excel-mac")
        writer.writeheader()

        for post in self.iterate_items(self.source_file):
            # stop processing if worker has been asked to stop
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while processing CSV file")

            writer.writerow(post)
            posts += 1

    # done!
    self.dataset.update_status("Finished.")
    self.dataset.finish(num_rows=posts)
def call_penelope_api(self, params, *args, **kwargs):
    """
    Query the PENELOPE Parliament Speeches API, retrying on request errors

    The request is attempted until it succeeds or `self.max_retries` request
    errors have occurred; in the latter case the failure is logged, the
    dataset status is updated and `None` is returned.

    :param dict params:  Call parameters; the keys `dataset_name`,
    `start_date`, `end_date` and `search_query` are read
    :param args:  Positional arguments passed on to `requests.get()`
    :param kwargs:  Keyword arguments passed on to `requests.get()`
    :return:  The `speeches` item of the decoded JSON response, or `None` if
    all attempts failed
    :raises ProcessorInterruptedException: if the worker is asked to stop
    """
    # Documented URL layout:
    # https://penelope.vub.be/parliament-data/get-speeches/{search_query}/{dataset_name}/{start_date}/{end_date}/{max_number}
    # NOTE(review): the values below are filled in as dataset_name,
    # start_date, end_date, search_query, which differs from the layout
    # above - confirm against the API which order is correct
    url_parts = [
        urllib.parse.quote(params[key])
        for key in ("dataset_name", "start_date", "end_date", "search_query")
    ]
    url = "https://penelope.vub.be/parliament-data/get-speeches/%s/%s/%s/%s/" % tuple(url_parts)

    failed_attempts = 0
    while True:
        if failed_attempts >= self.max_retries:
            # out of attempts: report and give up
            self.log.error("Error during PENELOPE fetch of query %s" %
                           self.dataset.key)
            self.dataset.update_status(
                "Error while searching for posts on PENELOPE Parliament Speeches API"
            )
            return None

        if self.interrupted:
            raise ProcessorInterruptedException(
                "Interrupted while fetching data from the Penelope API")

        try:
            response = requests.get(url, *args, **kwargs)
        except requests.RequestException as e:
            self.log.info(
                "Error %s while querying PENELOPE Parliament Speeches API - retrying..."
                % e)
            failed_attempts += 1
            continue

        return response.json()["speeches"]
def items_to_csv(self, results, filepath):
    """
    Write an iterable of result rows to a CSV file at the given location

    This is mostly a generic dictionary-to-CSV processor but some specific
    processing is done:

    - the `timestamp` value is replaced with a human-readable UTC date, and
      the UNIX timestamp it was derived from is stored in an extra
      `unix_timestamp` column. Timestamps may be integers, floats, numeric
      strings or `YYYY-MM-DD HH:MM:SS` strings (read as UTC); anything else
      becomes `undefined`, as does a missing timestamp.
    - a non-empty `body` value is passed through `strip_tags()`
    - if the `pseudonymise` parameter is set, the value of every column with
      `author` in its name is replaced with a salted hash

    The columns of the file are the keys of the first row, plus `timestamp`
    and `unix_timestamp` where the first row does not have them. Rows are
    modified in place.

    :param results:  Iterable of dict rows from data source.
    :param filepath:  Filepath for the resulting csv
    :return int:  Amount of posts that were processed
    :raises ResourceWarning: if no file path is given
    :raises ProcessorInterruptedException: if the worker is asked to stop
    """
    from datetime import datetime, timezone

    if not filepath:
        raise ResourceWarning("No result file for query")

    if not isinstance(filepath, Path):
        filepath = Path(filepath)

    # cache hashed author names, so the hashing function (which is
    # relatively expensive) is not run too often
    pseudonymise_author = bool(self.parameters.get("pseudonymise", None))
    hash_cache = {}
    hasher = None
    if pseudonymise_author:
        # we use BLAKE2 for its (so far!) resistance against cryptanalysis
        # and speed, since we will potentially need to calculate a large
        # amount of hashes
        hasher = hashlib.blake2b(digest_size=24)
        hasher.update(str(config.ANONYMISATION_SALT).encode("utf-8"))

    processed = 0
    writer = None

    # newline="" leaves line endings to the csv writer
    with filepath.open("w", encoding="utf-8", newline="") as csvfile:
        for row in results:
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while writing results to file")

            if writer is None:
                # the first row determines the columns; both timestamp
                # columns are always written, so they need to be included
                fieldnames = list(row.keys())
                for extra in ("timestamp", "unix_timestamp"):
                    if extra not in fieldnames:
                        fieldnames.append(extra)

                writer = csv.DictWriter(csvfile,
                                        fieldnames=fieldnames,
                                        lineterminator='\n')
                writer.writeheader()

            processed += 1

            if "timestamp" in row:
                # Data sources should have "timestamp" as a unix epoch
                # integer, but do some conversion if this is not the case.
                timestamp = row["timestamp"]
                if not isinstance(timestamp, int):
                    try:
                        if isinstance(timestamp, str) and "-" in timestamp:
                            # date string
                            timestamp = int(
                                datetime.strptime(
                                    timestamp, "%Y-%m-%d %H:%M:%S").replace(
                                        tzinfo=timezone.utc).timestamp())
                        elif isinstance(timestamp, (str, float)):
                            # (string representation of) epoch timestamp
                            timestamp = int(timestamp)
                        else:
                            timestamp = "undefined"
                    except (ValueError, OverflowError):
                        timestamp = "undefined"

                # Add a human-readable date format as well, if we have a
                # valid timestamp.
                row["unix_timestamp"] = timestamp
                if timestamp != "undefined":
                    try:
                        row["timestamp"] = datetime.utcfromtimestamp(
                            timestamp).strftime('%Y-%m-%d %H:%M:%S')
                    except (ValueError, OverflowError, OSError):
                        # epoch value outside of the supported date range
                        row["timestamp"] = "undefined"
                else:
                    row["timestamp"] = timestamp
            else:
                row["timestamp"] = "undefined"

            # Parse html to text
            if row.get("body"):
                row["body"] = strip_tags(row["body"])

            # replace author column with salted hash of the author name, if
            # pseudonymisation is enabled
            if pseudonymise_author:
                check_cashe = CheckCashe(hash_cache, hasher)
                author_fields = [field for field in row.keys() if "author" in field]
                for author_field in author_fields:
                    row[author_field] = check_cashe.update_cache(
                        row[author_field])

            writer.writerow(row)

    return processed
def get_post_notes(self, di_blogs_ids, only_text_reblogs=True):
    """
    Get the reblogs listed in the notes of a number of posts

    For every post, the notes are requested page by page until a response
    has no link to a next page. Blogs for which no notes could be retrieved
    after repeated attempts are added to `self.failed_notes`.

    :param dict di_blogs_ids:  A dictionary with blog names as keys and
    post IDs as values.
    :param bool only_text_reblogs:  Whether to only keep reblogs to which
    text was added. If `False`, all reblogs are kept.
    :return list:  A list of single-item dictionaries, each with the name
    of the reblogging blog as key and the ID of the reblog post as value
    :raises ProcessorInterruptedException: if the worker is asked to stop
    """
    client = self.connect_to_tumblr()

    # List of dict to get reblogs. Items are: [{"blog_name": post_id}]
    text_reblogs = []

    # Do some counting
    len_blogs = len(di_blogs_ids)

    # Stop trying to fetch the notes after this many retries
    max_notes_retries = 10
    notes_retries = 0

    for count, (blog_name, post_id) in enumerate(di_blogs_ids.items(), start=1):
        if self.interrupted:
            raise ProcessorInterruptedException(
                "Interrupted while fetching post notes from Tumblr")

        # every post is paged through from its most recent note; the cursor
        # of the previous post must not be carried over
        max_date = None

        # Keep digging till there's nothing left, or if we can fetch no new notes
        while True:
            # Requests a post's notes
            notes = client.notes(blog_name, id=post_id, before_timestamp=max_date)

            # If there's no "notes" key in the returned dict, something might be up
            if "notes" not in notes:
                self.dataset.update_status(
                    "Couldn't get notes for Tumblr request " + str(notes))
                notes_retries += 1

                if notes_retries > max_notes_retries:
                    self.failed_notes.append(blog_name)
                    break

                continue

            notes_retries = 0

            for note in notes["notes"]:
                # Only reblogs are of interest here
                if note["type"] != "reblog":
                    continue

                if not only_text_reblogs or note.get("added_text"):
                    text_reblogs.append({note["blog_name"]: note["post_id"]})

            # If there's no link to a next page, that's all.
            next_page = (notes.get("_links") or {}).get("next")
            if not next_page:
                break

            max_date = next_page["query_params"]["before_timestamp"]

        self.dataset.update_status(
            "Identified %i text reblogs in %i/%i notes" %
            (len(text_reblogs), count, len_blogs))

    return text_reblogs
def get_posts_by_blog(self, blog, max_date=None, min_date=None):
    """
    Get the Tumblr posts of a certain blog

    Posts are requested 20 at a time, each time asking for posts before the
    oldest timestamp seen so far. When a request yields nothing, the date is
    lowered by an hour and the request repeated, up to 48 times in a row.
    Collecting also stops when a request fails, when posts older than
    `min_date` are reached, or when `self.max_posts` posts were collected.

    :param str blog:  The name of the blog, without `.tumblr.com`
    :param max_date:  A unix timestamp; only posts before this date are
    requested. Defaults to the current time.
    :param min_date:  A unix timestamp; posts older than this are discarded
    :return tuple:  A list of all collected posts, and a list with the notes
    of the posts that had them (only filled if the `fetch_reblogs` parameter
    is set)
    :raises ProcessorInterruptedException: if the worker is asked to stop
    """
    blog_host = blog + ".tumblr.com"
    client = self.connect_to_tumblr()

    before = max_date if max_date else int(time.time())

    # Store all posts in here
    collected = []

    # Store notes here, if they exist and are requested
    collected_notes = []

    # Some retries to make sure the Tumblr API actually returns everything
    empty_responses = 0
    self.max_retries = 48  # 2 days

    # Get Tumblr posts until there's no more left.
    while True:
        if self.interrupted:
            raise ProcessorInterruptedException(
                "Interrupted while fetching blog posts from Tumblr")

        # Stop after too many empty responses in a row
        if empty_responses >= self.max_retries:
            self.dataset.update_status("No more posts")
            break

        try:
            # Use the pytumblr library to make the API call
            batch = client.posts(blog_host,
                                 before=before,
                                 limit=20,
                                 reblog_info=True,
                                 notes_info=True,
                                 filter="raw")
            batch = batch["posts"]
        except Exception as e:
            self.dataset.update_status(
                "Reached the limit of the Tumblr API. Last timestamp: %s" %
                str(before))
            self.api_limit_reached = True
            break

        # Make sure the Tumblr API doesn't magically stop at an earlier date
        if not batch or isinstance(batch, str):
            empty_responses += 1
            before -= 3600  # Decrease by an hour
            self.dataset.update_status(
                "No posts returned by Tumblr - checking whether this is really all (retry %s/48)"
                % str(empty_responses))
            continue

        # Keep the notes, if so indicated
        if self.parameters.get("fetch_reblogs"):
            collected_notes.extend(
                post["notes"] for post in batch if "notes" in post)

        batch = self.parse_tumblr_posts(batch)

        # The lowest date is where the next request continues
        before = sorted(post["timestamp"] for post in batch)[0]

        # Manually check if we have a lower date than the min date
        # (`min_date`) already. This functionality is not natively supported
        # by Tumblr.
        if min_date and before < min_date:
            # Only keep the posts that are not older than the min date
            collected.extend(
                [post for post in batch if post["timestamp"] >= min_date])
            break

        empty_responses = 0
        collected.extend(batch)

        if len(collected) >= self.max_posts:
            self.max_posts_reached = True
            break

        self.dataset.update_status("Collected %s posts" % str(len(collected)))

    return collected, collected_notes
def get_posts_by_tag(self, tag, max_date=None, min_date=None):
    """
    Get Tumblr posts with a certain tag

    Posts are requested 20 at a time via the client's `tagged()` method,
    each time asking for posts before `max_date`, which is lowered as posts
    come in. Posts whose ID is in `self.seen_ids` are ignored. When a
    request yields no new posts, `max_date` is lowered step by step to check
    whether there really are no older posts. The time differences between
    consecutive posts are tracked to spot unexpectedly large gaps.

    :param str tag:  The tag you want to look for
    :param max_date:  A unix timestamp; posts should be from before this
    date. NOTE(review): the default of `None` does not look usable, since
    the value is passed to `datetime.fromtimestamp()` - confirm that callers
    always provide it
    :param min_date:  A unix timestamp; posts should be from after this
    date
    :return list:  All collected posts, as returned by
    `parse_tumblr_posts()`
    :raises ProcessorInterruptedException: if the worker is asked to stop
    """
    client = self.connect_to_tumblr()

    # Store all posts in here
    all_posts = []

    # Some retries to make sure the Tumblr API actually returns everything.
    # `retries` counts API/connection problems, `date_retries` counts
    # consecutive requests that returned no new posts.
    retries = 0
    date_retries = 0

    # We're gonna change max_date, so store a copy for reference.
    max_date_original = max_date

    # We use the average time difference between posts to spot possible gaps in the data.
    all_time_difs = []
    avg_time_dif = 0
    time_difs_len = 0

    # Get Tumblr posts until there's no more left.
    while True:
        if self.interrupted:
            raise ProcessorInterruptedException(
                "Interrupted while fetching tag posts from Tumblr")

        # Stop after max for date reductions
        if date_retries >= self.max_date_retries:
            self.dataset.update_status("No more posts in this date range")
            break

        # Stop after max retries for API/connection stuff
        if retries >= self.max_retries:
            self.dataset.update_status("No more posts")
            break

        try:
            # Use the pytumblr library to make the API call
            posts = client.tagged(tag, before=max_date, limit=20, filter="raw")
        except ConnectionError:
            # NOTE(review): all other status updates in this method go
            # through self.dataset.update_status() - confirm that
            # self.update_status() exists
            self.update_status(
                "Encountered a connection error, waiting 10 seconds.")
            time.sleep(10)
            retries += 1
            continue

        # Get rid of posts that we already encountered,
        # preventing Tumblr API shenanigans or double posts because of
        # time reductions. Make sure it's no odd error string, though.
        unseen_posts = []
        for check_post in posts:
            # Sometimes the API responds just with "meta", "response", or "errors".
            if isinstance(check_post, str):
                # NOTE(review): the offending item is passed as a second
                # positional argument here rather than being made part of the
                # message - confirm this is what update_status() expects
                self.dataset.update_status("Couldnt add post:", check_post)
                retries += 1
                break
            else:
                retries = 0
                if check_post["id"] not in self.seen_ids:
                    unseen_posts.append(check_post)
        posts = unseen_posts

        # For no clear reason, the Tumblr API sometimes provides posts with a higher timestamp than requested.
        # So we have to prevent this manually.
        if max_date_original:
            posts = [
                post for post in posts
                if post["timestamp"] <= max_date_original
            ]

        # Human-readable version of the current max date, for status updates
        max_date_str = datetime.fromtimestamp(max_date).strftime(
            "%Y-%m-%d %H:%M:%S")

        # Make sure the Tumblr API doesn't magically stop at an earlier date
        if not posts:
            date_retries += 1

            # We're first gonna check carefully if there's small timegaps by
            # decreasing by six hours.
            # If that didn't result in any new posts, the remaining
            # date_retries are used for reductions of one week, just to be
            # sure there's no data from much earlier missing.
            if date_retries < 96:
                max_date -= 21600  # Decrease by six hours
                self.dataset.update_status(
                    "Collected %s posts for tag %s, but no new posts returned - decreasing time search with 6 hours to %s to make sure this is really it (retry %s/96)"
                    % (
                        str(len(all_posts)),
                        tag,
                        max_date_str,
                        str(date_retries),
                    ))
            elif date_retries <= self.max_date_retries:
                max_date -= 604800  # Decrease by one week
                retry_str = str(date_retries - 96)
                self.dataset.update_status(
                    "Collected %s posts for tag %s, but no new posts returned - no new posts found with decreasing by 6 hours, decreasing with a week to %s instead (retry %s/150)"
                    % (
                        str(len(all_posts)),
                        tag,
                        max_date_str,
                        str(retry_str),
                    ))

            # We can stop when the max date drops below the min date.
            if min_date:
                if max_date <= min_date:
                    break

            continue

        # Append posts to main list
        else:
            posts = self.parse_tumblr_posts(posts)

            # Get all timestamps and sort them.
            post_dates = sorted([post["timestamp"] for post in posts])

            # Get the lowest date and use it as the next "before" parameter.
            max_date = post_dates[0]

            # Tumblr's API is volatile - it doesn't neatly sort posts by date,
            # so it can happen that there's suddenly huge jumps in time.
            # Check if this is happening by extracting the difference between all consecutive dates.
            time_difs = list()
            # newest first, so each difference below is positive or zero
            post_dates.reverse()

            for i, date in enumerate(post_dates):
                # The last date has no successor to compare with
                if i == (len(post_dates) - 1):
                    break

                # Calculate and add time differences
                time_dif = date - post_dates[i + 1]

                # After having collected 250 posts, check whether the time
                # difference between posts far exceeds the average time difference
                # between posts. If it's more than five times this amount,
                # restart the query with the timestamp just before the gap
                # - something might be up with Tumblr's API.
                if len(all_posts) >= 250 and time_dif > (avg_time_dif * 5):
                    time_str = datetime.fromtimestamp(date).strftime(
                        "%Y-%m-%d %H:%M:%S")
                    self.dataset.update_status(
                        "Time difference of %s spotted, restarting query at %s"
                        % (
                            str(time_dif),
                            time_str,
                        ))

                    # Only the posts from before the gap are kept
                    self.seen_ids.update([post["id"] for post in posts])
                    posts = [
                        post for post in posts if post["timestamp"] >= date
                    ]
                    if posts:
                        all_posts += posts

                    max_date = date
                    break

                time_difs.append(time_dif)

            # Nothing left to add after the gap check
            # NOTE(review): when posts do remain after a gap was spotted,
            # they were already added to all_posts above and are added again
            # below - confirm whether that is intended
            if not posts:
                break

            # Manually check if we have a lower date than the lowest allowed date already (min date).
            # This functionality is not natively supported by Tumblr.
            if min_date:
                if max_date < min_date:
                    # Only keep the posts between the min date and the
                    # original max date
                    posts = [
                        post for post in posts
                        if post["timestamp"] >= min_date
                        and post["timestamp"] <= max_date_original
                    ]

                    if posts:
                        all_posts += posts
                        self.seen_ids.update(
                            [post["id"] for post in posts])
                    break

            # We got a new post, so we can reset the retry counts.
            date_retries = 0
            retries = 0

            # Add retrieved posts to the main list
            all_posts += posts

            # Add to seen ids
            self.seen_ids.update([post["id"] for post in posts])

            # Add time differences and calculate new average time difference
            all_time_difs += time_difs

            # Make the average time difference a moving average,
            # to be flexible with faster and slower post paces.
            # Once more than 100 differences were added since the last trim,
            # the differences from before that are dropped.
            if (len(all_time_difs) - time_difs_len) > 100:
                all_time_difs = all_time_difs[time_difs_len:]
            if all_time_difs:
                time_difs_len = len(all_time_difs)
                avg_time_dif = sum(all_time_difs) / len(all_time_difs)

        if len(all_posts) >= self.max_posts:
            self.max_posts_reached = True
            break

        self.dataset.update_status(
            "Collected %s posts for tag %s, now looking for posts before %s" %
            (
                str(len(all_posts)),
                tag,
                max_date_str,
            ))

    return all_posts
def process(self):
    """
    Extract named entities from the SpaCy output and rank them by frequency

    Reads the pickled, serialised `DocBin` files from the source archive,
    collects the entities of the requested types for each document, and
    writes one row per unique (word, entity type) combination together
    with the number of times it was found. Words are lowercased and
    one-letter words are ignored. If the "overwrite" parameter is set, the
    per-document entity lists are also passed to `update_parent()`.

    :raises ProcessorInterruptedException:  If the worker is asked to stop
    while documents are being processed.
    """
    # Validate whether the user enabled the right parameters. A missing
    # "enable" parameter is treated the same as NER not being enabled,
    # instead of crashing with a KeyError.
    if "ner" not in (self.source_dataset.parameters.get("enable") or []):
        self.dataset.update_status("Enable \"Named entity recognition\" in previous module")
        self.dataset.finish(0)
        return

    if self.source_dataset.num_rows > 25000:
        self.dataset.update_status(
            "Named entity recognition is only available for datasets smaller than 25.000 items.")
        self.dataset.finish(0)
        return

    # Extract the SpaCy docs first
    self.dataset.update_status("Unzipping SpaCy docs")

    wanted_labels = self.parameters.get("entities") or []
    nlp = spacy.load("en_core_web_sm")  # the model's vocab is needed to restore the docs

    # One list of (text, label) tuples per document, in document order
    li_entities = []

    for doc_file in self.iterate_archive_contents(self.source_file):
        # NOTE(review): pickle.load() executes arbitrary code from the file;
        # presumably these archives are only ever written by our own
        # preceding processor - confirm before accepting external archives.
        with doc_file.open("rb") as pickle_file:
            serialised_docs = pickle.load(pickle_file)

        doc_bin = DocBin().from_bytes(serialised_docs)

        for doc in doc_bin.get_docs(nlp.vocab):
            # stop processing if worker has been asked to stop
            if self.interrupted:
                raise ProcessorInterruptedException("Interrupted while processing documents")

            li_entities.append(
                [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in wanted_labels])

    results = []

    if li_entities:
        # Also add the data to the original csv file, if indicated.
        if self.parameters.get("overwrite"):
            self.update_parent(li_entities)

        # Group and rank. Counting (word, label) tuples directly avoids
        # joining and re-splitting on a separator string, which broke
        # for entity text that happened to contain that separator.
        entity_counts = Counter(
            (text.lower(), label)
            for post_entities in li_entities
            for text, label in post_entities
            if len(text) > 1
        )

        results = [
            {"word": word, "entity": label, "count": count}
            for (word, label), count in entity_counts.most_common()
        ]

    # done!
    if results:
        self.dataset.update_status("Finished")
        self.write_csv_items_and_finish(results)
    else:
        self.dataset.update_status("Finished, but no entities were extracted.")
        self.dataset.finish(0)
def get_videos_query(self, session, query, csrftoken, detail):
    """
    Scrape videos for given BitChute search query

    Requests result pages one by one until a page reports failure, a page
    has no results, or `self.max_items` videos have been yielded.

    :param session:  HTTP Session to use
    :param str query:  Search query to scrape videos for
    :param str csrftoken:  CSRF token to use for requests
    :param str detail:  Detail level to scrape, basic/detail/comments

    :return:  Video data dictionaries, as a generator. If `detail` is not
    "basic", each video is followed by the comments returned for it by
    `append_details()`.
    :raises ProcessorInterruptedException:  If the worker is asked to stop
    """
    page = 0
    num_items = 0

    while True:
        self.dataset.update_status("Retrieved %i items for query '%s'" % (num_items, query))

        if self.interrupted:
            raise ProcessorInterruptedException("Interrupted while scraping BitChute")

        # prepare the request - the CSRF param *must* be the first or the request will fail
        post_data = {
            "csrfmiddlewaretoken": csrftoken,
            "query": query,
            "kind": "video",
            "duration": "",
            "sort": "",
            "page": str(page),
        }
        headers = {
            'Referer': "https://www.bitchute.com/search",
            'Origin': "https://www.bitchute.com/search",
        }
        response = self.request_from_bitchute(
            session, "POST", "https://www.bitchute.com/api/search/list/", headers, post_data)

        if not response["success"] or response["count"] == 0 or num_items >= self.max_items:
            break

        comments = []
        for video_data in response["results"]:
            if num_items >= self.max_items:
                break

            num_items += 1

            # note: deleted videos will have a published date of 'None'. To
            # avoid crashing the backend the easiest way is to set it to something
            # that is obviously not a valid date in this context.
            if video_data["published"] is None:
                video_data["published"] = "1970-01-01"

            # this is only included as '5 months ago' and so forth, not exact date
            # so use dateparser to at least approximate the date. dateparser
            # returns None for strings it cannot interpret; fall back to the
            # same placeholder date used for deleted videos in that case
            # rather than crashing on `None.timestamp()`.
            dt = dateparser.parse(video_data["published"]) or dateparser.parse("1970-01-01")

            video = {
                "id": video_data["id"],
                "thread_id": video_data["id"],
                "subject": video_data["name"],
                "body": video_data["description"],
                "author": video_data["channel_name"],
                "author_id": video_data["channel_path"].split("/")[2],
                "timestamp": int(dt.timestamp()),
                "url": "https://www.bitchute.com" + video_data["path"],
                "views": video_data["views"],
                "length": video_data["duration"],
                "thumbnail_image": video_data["images"]["thumbnail"],
            }

            if detail != "basic":
                video, comments = self.append_details(video, detail)
                if not video:
                    # unrecoverable error while scraping details
                    return

            yield video

            # these need to be yielded *after* the video because else the
            # result file will have the comments before the video, which
            # is weird
            for comment in comments:
                yield comment

        page += 1
def process(self):
    """
    Create a GDF network of Wikipedia pages and their categories

    Collects all English-language Wikipedia links from the `body` (and,
    if present, `url`) of the items in the source file, fetches the wiki
    source of each linked page, and extracts the categories the page is
    in. The result is a GDF file with pages and categories as nodes and
    page-category memberships as edges.

    :raises ProcessorInterruptedException:  If the worker is asked to stop
    while pages are being fetched.
    """
    # we use these to extract URLs and wiki syntax
    link_regex = re.compile(r"https?://en.wikipedia\.org/wiki/[^\s.]+")
    # captures the inside of [[...]] wiki links
    wiki_page = re.compile(r"\[\[([^\]]+)\]\]")
    category_regex = re.compile(r"\[\[Category:[^\]]+\]\]")
    trailing_comma = re.compile(r",$")

    # initialise
    links = {}
    all_categories = {}
    counter = 1
    errors = 0
    page_categories = {}
    # NOTE(review): page_links and deep_pages are collected when the
    # "deep_pages" parameter is set, but nothing below writes them to the
    # result file - confirm whether that is intentional.
    page_links = {}
    deep_pages = {}

    # find all links in post bodies
    self.dataset.update_status("Reading source file")
    for post in self.iterate_items(self.source_file):
        wiki_links = [trailing_comma.sub("", link) for link in link_regex.findall(post["body"])]

        # if we have a per-post URL, include that as well
        if "url" in post and post["url"] and link_regex.match(post["url"]):
            wiki_links.append(post["url"])

        for link in wiki_links:
            # keep only the page title: everything after /wiki/, minus any #fragment
            link = "/wiki/".join(link.split("/wiki/")[1:]).split("#")[0]
            links[link] = links.get(link, 0) + 1

    # just a helper function to get the HTML content of a node
    def stringify_children(node):
        from lxml.etree import tostring

        parts = [node.text]
        for child in node.getchildren():
            # serialise to str (not bytes) so the parts can be joined, and
            # leave the tail out of the serialisation as it is added
            # separately
            parts.append(tostring(child, encoding="unicode", with_tail=False))
            parts.append(child.tail)
        parts.append(node.tail)

        # filter removes possible Nones in texts and tails
        return "".join(filter(None, parts))

    # names are written between single quotes; underscores become spaces
    # and commas are dropped because they separate the GDF columns
    def gdf_name(name):
        return name.replace("_", " ").replace(",", "")

    self.dataset.update_status("Fetching categories from Wikipedia API...")
    for link in links:
        if link in page_categories:
            continue

        if self.interrupted:
            raise ProcessorInterruptedException("Interrupted while fetching data from Wikipedia")

        page_categories[link] = set()
        self.dataset.update_status(
            "Fetching categories from Wikipedia API, page %i of %i" % (counter, len(links)))
        counter += 1

        # fetch wikipedia source
        url = "https://en.wikipedia.org/w/index.php?title=" + link + "&action=edit"
        try:
            page = requests.get(url)
        except requests.RequestException:
            errors += 1
            continue

        if page.status_code != 200:
            errors += 1
            continue

        # the wiki source is the content of the edit form's text box
        parser = etree.HTMLParser()
        tree = etree.parse(StringIO(page.content.decode("utf-8")), parser)

        try:
            wiki_source = stringify_children(css("#wpTextbox1")(tree)[0])
        except IndexError:
            # not a source page?
            errors += 1
            continue

        # extract category names from category link syntax: drop the
        # "[[Category:" prefix, the closing "]]" and any "|sort key"
        categories = set([
            ":".join(category.split(":")[1:])[:-2].split("|")[0]
            for category in category_regex.findall(wiki_source)
        ])

        # save category links
        for category in categories:
            # Add " (cat)" to the category strings.
            # This is needed because pages can sometimes have the same name as the category.
            # This will result in a faulty graph, since there's duplicate nodes.
            category += " (cat)"
            all_categories[category] = all_categories.get(category, 0) + 1
            page_categories[link].add(category)

        # if needed, also include pages linked to from within the
        # fetched page source
        if self.parameters.get("deep_pages", None):
            for linked_page in wiki_page.findall(wiki_source):
                linked_page = linked_page.split("|")[0]
                deep_pages[linked_page] = deep_pages.get(linked_page, 0) + 1
                page_links.setdefault(link, set()).add(linked_page)

    # write GDF file
    with self.dataset.get_results_path().open("w", encoding="utf-8") as results:
        results.write("nodedef>name VARCHAR,type VARCHAR,weight INTEGER\n")
        for page in page_categories:
            results.write("'" + gdf_name(page) + "',page," + str(links[page]).replace(",", "") + "\n")

        for category in all_categories:
            results.write(
                "'" + gdf_name(category) + "',category," + str(all_categories[category]).replace(",", "") + "\n")

        results.write("edgedef>node1 VARCHAR, node2 VARCHAR, weight INTEGER\n")
        for page in page_categories:
            for category in page_categories[page]:
                results.write("'" + gdf_name(page) + "','" + gdf_name(category) + "'\n")

    self.dataset.finish(len(page_categories))
def process(self):
    """
    Download the images referenced in the source dataset

    Collects image URLs from the source file, downloads up to the
    requested amount of them into a staging area, saves them and
    compresses the staging area into the result archive. If the
    "overwrite" parameter is set, the download status per image is passed
    to `update_parent()`.

    For 4chan datasets, images are taken from a local copy if one exists
    and else from an external archive (fireden for /v/, 4plebs
    otherwise); for other data sources the `item` column of the source
    file is used as the image URL.

    :raises ProcessorInterruptedException:  If the worker is asked to stop
    """
    urls = []

    # is there anything for us to download?
    if self.source_dataset.num_rows == 0:
        self.dataset.update_status("No images to download.", is_final=True)
        self.dataset.finish(0)
        return

    # Get the source file data path
    top_parent = self.dataset.get_genealogy()[0]
    datasource = top_parent.parameters["datasource"]

    try:
        amount = max(0, min(1000, int(self.parameters.get("amount", 0))))
    except (ValueError, TypeError):
        # TypeError covers an explicit `None` value for the parameter
        amount = 100

    extensions = {}

    # 4chan is the odd one out (images are traced to and scraped from
    # external archives rather than 4chan itself) so here we collect the
    # relevant archive URLs for any 4chan images we encounter
    if datasource == "4chan":
        self.dataset.update_status("Reading source file")
        external = "fireden" if top_parent.parameters.get("board") == "v" else "4plebs"
        rate_limit = 1 if external == "fireden" else 16

        for post in self.iterate_items(self.source_file):
            # stop processing if worker has been asked to stop
            if self.interrupted:
                raise ProcessorInterruptedException("Interrupted while extracting image URLs")

            # the extension is whatever follows the *last* dot, so file
            # names with multiple dots (or none) are handled as well
            extension = post["filename"].split(".")[-1].lower()
            if extension not in ("jpg", "jpeg", "png", "gif"):
                continue

            local_file = post["url_4cat"].split("/")[-1]
            local_path = Path(config.PATH_IMAGES, local_file)
            url = local_path if local_path.exists() else post["url_" + external]

            urls.append(url)
            extensions[url] = extension

    # With other sources, simply take the URLs as they are provided by the
    # parent dataset
    else:
        for row in self.iterate_items(self.source_file):
            img_url = row["item"]
            extensions[img_url] = img_url.split(".")[-1].lower()
            urls.append(img_url)

    # prepare staging area
    results_path = self.dataset.get_staging_area()
    counter = 0
    downloaded_images = 0

    # Used to overwrite top-images csv file with download status
    success = []

    # loop through images and download them - until we have as many images
    # as required. Note that images that cannot be downloaded or parsed do
    # not count towards that limit
    for path in urls:
        if downloaded_images >= amount:
            break

        # stop processing if worker has been asked to stop
        if self.interrupted:
            raise ProcessorInterruptedException("Interrupted while downloading images.")

        counter += 1
        # assume failure until the image has actually been saved
        status = {"download_status": "failed", "img_name": ""}
        success.append(status)
        self.dataset.update_status("Downloading image %i of %i" % (counter, len(urls)))

        # acquire and resize image
        try:
            if datasource == "4chan":
                picture = self.get_4chan_image(path, rate_limit=rate_limit)
            else:
                picture, image_name = self.get_image(path)
        except (requests.RequestException, IndexError, FileNotFoundError):
            continue

        # Again, some different processing for 4chan
        if datasource == "4chan":
            # hash needs to be hexified if it's a 4chan hash
            if not isinstance(path, Path) and path[-2:] == "==":
                encoded_hash = path.split("/")[-1].split(".")[0].replace("_", "/")
                try:
                    # it is the decoding that raises binascii.Error on
                    # invalid input, so that is what needs to be guarded
                    decoded_hash = base64.b64decode(encoded_hash)
                except binascii.Error:
                    self.log.warning("Invalid base64 hash %s, skipping" % encoded_hash)
                    continue

                file_hash = hashlib.md5(decoded_hash).hexdigest()

            # if we're using an already-saved image the image filename is good as it is
            else:
                file_hash = Path(str(path)).stem

            # determine file name and where to save
            image_name = file_hash + "." + extensions[path]

        # For other data sources, we take the imagename it already had.
        imagepath = str(results_path.joinpath(image_name))

        # save file
        try:
            picture.save(imagepath, format="png")
            downloaded_images += 1
        except (OSError, ValueError):
            self.log.warning("Could not save image %s to disk - invalid format" % path)
            continue

        # If this all succeeded, we update the download status and the filename.
        status["download_status"] = "succeeded"
        status["img_name"] = image_name

    # Also add the data to the original csv file, if indicated.
    if self.parameters.get("overwrite"):
        self.update_parent(success)

    # finish up
    self.dataset.update_status("Compressing images")
    self.write_archive_and_finish(results_path)
def process(self):
    """
    Run 4CAT search query

    Fetches the dataset's parameters, hands them to the object's `search`
    method and saves whatever it returns as a CSV or NDJSON file,
    depending on the `extension` attribute (CSV if there is none).
    Afterwards, any post-processors listed under the `next` parameter are
    queued and the result is copied to the `copy_to` location if one was
    given. Finally the dataset is marked as finished.

    :raises ProcessorInterruptedException:  If the search was interrupted
    :raises NotImplementedError:  If results should be saved in a format
    other than CSV or NDJSON
    """
    params = self.dataset.get_parameters()
    destination = self.dataset.get_results_path()

    self.log.info("Querying: %s" % str(params))

    # Execute the relevant query (string-based, random, countryflag-based)
    try:
        posts = self.search(params)
    except WorkerInterruptedException:
        raise ProcessorInterruptedException("Interrupted while collecting data, trying again later.")

    # Write posts to the result file and update the status accordingly
    written = 0
    if posts:
        self.dataset.update_status("Writing posts to result file")

        # no explicit extension means CSV
        output_format = getattr(self, "extension", "csv")
        if output_format == "csv":
            written = self.items_to_csv(posts, destination)
        elif output_format == "ndjson":
            written = self.items_to_ndjson(posts, destination)
        else:
            raise NotImplementedError("Datasource query cannot be saved as %s file" % output_format)

        self.dataset.update_status("Query finished, results are available.")
    elif posts is not None:
        self.dataset.update_status("Query finished, no results found.")

    # queue predefined post-processors
    follow_ups = params.get("next", [])
    if written > 0 and follow_ups:
        for follow_up in follow_ups:
            follow_up_parameters = follow_up.get("parameters", {})
            follow_up_type = follow_up.get("type", "")
            candidates = self.dataset.get_available_processors()

            # run it only if the post-processor is actually available for this query
            if follow_up_type not in candidates:
                continue

            analysis = DataSet(
                parameters=follow_up_parameters,
                type=follow_up_type,
                db=self.db,
                parent=self.dataset.key,
                extension=candidates[follow_up_type]["extension"])
            self.queue.add_job(follow_up_type, remote_id=analysis.key)

    # see if we need to register the result somewhere
    copy_target = params.get("copy_to", None)
    if copy_target:
        # copy the results to an arbitrary place that was passed
        if self.dataset.get_results_path().exists():
            # but only if we actually have something to copy
            shutil.copyfile(str(self.dataset.get_results_path()), copy_target)
        else:
            # if copy_to was passed, that means it's important that this
            # file exists somewhere, so we create it as an empty file
            with open(copy_target, "w") as placeholder:
                placeholder.write("")

    self.dataset.finish(num_rows=written)
def process(self):
    """
    Post the stringified dataset to the VUB API and process the results

    Sentences from the source file are stripped of anything that is not
    plain text and sent to the PENELOPE semantic frame extractor in
    chunks. For every entity in the returned "Causation" frames, one row
    is written to the result CSV. If the API responds with anything other
    than HTTP 200, processing stops and the dataset is finished with 0
    rows.

    :raises ProcessorInterruptedException:  If the worker is asked to stop
    """
    self.dataset.update_status("Sending post data to PENELOPE API endpoint")

    chunk_size = 50  # results may vary
    chunk = []
    processed = 0
    entities = 0

    # the API has some problems with fancy quote characters, etc, and they
    # presumably don't make a difference for the results, so strip
    # everything that's not plain text (or a few non-harmful characters)
    # would need updating if languages other than English are supported
    non_alpha = re.compile(r"[^a-zA-Z0-9%!?+*&@#)(/:;, -]")

    # newline="" is what the csv module expects of files it writes to; the
    # explicit encoding makes the output independent of the platform
    with self.dataset.get_results_path().open("w", encoding="utf-8", newline="") as output:
        writer = csv.DictWriter(
            output, fieldnames=("sentence", "utterance", "frameEvokingElement", "cause", "effect"))
        writer.writeheader()

        reader = iter(self.iterate_items(self.source_file))
        while True:
            # the API can't handle too many sentences at once, so send
            # them in chunks
            self.dataset.update_status("%i sentences processed via PENELOPE API..." % processed)

            if self.interrupted:
                raise ProcessorInterruptedException("Interrupted while interfacing with PENELOPE API")

            end_of_the_line = False
            try:
                post = next(reader)
            except StopIteration:
                end_of_the_line = True
            else:
                processed += 1
                sentence = non_alpha.sub("", post["sentence"])
                # could be that it's just symbols, no text
                if sentence:
                    chunk.append(sentence)

            # never send an empty chunk: there is nothing to analyse, and a
            # failed request would discard everything found so far
            if chunk and (len(chunk) >= chunk_size or end_of_the_line):
                payload = {"texts": chunk, "frames": ["Causation"]}
                response = requests.post(
                    "https://penelope.vub.be/semantic-frame-extractor/texts-extract-frames",
                    data=json.dumps(payload),
                    headers={"Content-type": "application/json"})

                if response.status_code != 200:
                    self.log.warning("PENELOPE Semantic Frame API crashed for chunk %s" % repr(chunk))
                    self.dataset.update_status("PENELOPE API response could not be parsed.")
                    entities = 0
                    break

                # filter response to only include those sentences that
                # actually contained any semantic frames
                for frameset_list in response.json().get("frameSets", []):
                    if not frameset_list:
                        continue

                    for frameset in frameset_list:
                        if not frameset.get("entities", None):
                            continue

                        for entity in frameset.get("entities"):
                            entities += 1
                            writer.writerow({
                                "sentence": frameset["utterance"],
                                "utterance": entity.get("utterance", ""),
                                "frameEvokingElement": entity.get("frameEvokingElement", ""),
                                "cause": entity.get("cause", ""),
                                "effect": entity.get("effect", ""),
                            })

                chunk = []

                if not end_of_the_line:
                    # let 'em breathe
                    time.sleep(1)

            if end_of_the_line:
                self.dataset.update_status("Finished")
                break

    self.dataset.finish(entities)
def get_videos_user(self, session, user, csrftoken, detail):
    """
    Scrape videos for given BitChute user

    Fetches the channel page once (for the author details) and then
    requests the channel's video listing in batches, using the number of
    videos seen so far as the offset, until a batch contains no videos or
    `self.max_items` videos have been yielded.

    :param session:  HTTP Session to use
    :param str user:  Username to scrape videos for
    :param str csrftoken:  CSRF token to use for requests
    :param str detail:  Detail level to scrape, basic/detail/comments

    :return:  Video data dictionaries, as a generator. If `detail` is not
    "basic", each video is followed by the comments returned for it by
    `append_details()`.
    :raises ProcessorInterruptedException:  If the worker is asked to stop
    """
    offset = 0
    num_items = 0
    base_url = "https://www.bitchute.com/channel/%s/" % user
    url = base_url + "extend/"

    # a failure here is handled the same way as a failure of the batch
    # requests below, instead of crashing the worker
    try:
        container = session.get(base_url)
    except requests.RequestException as e:
        self.dataset.update_status(
            "Error while interacting with BitChute (%s) - try again later." % e, is_final=True)
        return

    container_soup = BeautifulSoup(container.text, 'html.parser')
    headers = {'Referer': base_url, 'Origin': "https://www.bitchute.com/"}

    while True:
        self.dataset.update_status("Retrieved %i items for query '%s'" % (num_items, user))

        if self.interrupted:
            raise ProcessorInterruptedException("Interrupted while scraping BitChute")

        post_data = {"csrfmiddlewaretoken": csrftoken, "name": "", "offset": str(offset)}

        try:
            self.dataset.log("Fetching data for BitChute video %s" % url)
            request = session.post(url, data=post_data, headers=headers)
            if request.status_code != 200:
                raise ConnectionError()
            response = request.json()
        except (json.JSONDecodeError, requests.RequestException, ConnectionError) as e:
            self.dataset.update_status(
                "Error while interacting with BitChute (%s) - try again later." % e, is_final=True)
            return

        soup = BeautifulSoup(response["html"], 'html.parser')
        videos = soup.select(".channel-videos-container")
        comments = []

        if len(videos) == 0 or num_items >= self.max_items:
            break

        for video_element in videos:
            if num_items >= self.max_items:
                break

            num_items += 1
            offset += 1

            link = video_element.select_one(".channel-videos-title a")
            author_link = container_soup.select_one(".details .name a")
            published = video_element.select_one(".channel-videos-details.text-right.hidden-xs").text

            # dateparser returns None for strings it cannot interpret; use
            # an obviously invalid date in that case rather than crashing
            # on `None.timestamp()`
            dt = dateparser.parse(published) or dateparser.parse("1970-01-01")

            video = {
                "id": link["href"].split("/")[-2],
                "thread_id": link["href"].split("/")[-2],
                "subject": link.text,
                "body": video_element.select_one(".channel-videos-text").encode_contents().decode(
                    "utf-8").strip(),
                "author": author_link.text,
                "author_id": author_link["href"].split("/")[2],
                "timestamp": int(dt.timestamp()),
                "url": "https://www.bitchute.com" + link["href"],
                "views": video_element.select_one(".video-views").text.strip(),
                "length": video_element.select_one(".video-duration").text.strip(),
                "thumbnail_image": video_element.select_one(".channel-videos-image img")["src"],
            }

            if detail != "basic":
                video, comments = self.append_details(video, detail)
                if not video:
                    # unrecoverable error while scraping details
                    return

            yield video

            # these need to be yielded *after* the video because else the
            # result file will have the comments before the video, which
            # is weird
            for comment in comments:
                yield comment
def process(self):
    """
    Find words similar to a list of input words in Word2Vec models

    For each model in the source archive, the most similar words are
    retrieved for each input word. With a crawl depth greater than 1,
    the similar words that were found are in turn used as input words for
    the next level, up to the given depth (at most 3). Only words with a
    similarity at or above the threshold are included in the result.

    :raises ProcessorInterruptedException:  If the worker is asked to stop
    """
    self.dataset.update_status("Processing sentences")

    depth = max(1, min(3, convert_to_int(self.parameters.get("crawl_depth"))))

    # strip whitespace so "cat, dog" finds "dog" rather than " dog", and
    # drop empty and duplicate words while preserving order
    raw_words = self.parameters.get("words", "") or ""
    input_words = list(dict.fromkeys(word.strip() for word in raw_words.split(",") if word.strip()))
    if not input_words:
        self.dataset.update_status(
            "No input words provided, cannot look for similar words.", is_final=True)
        self.dataset.finish(0)
        return

    num_words = convert_to_int(self.parameters.get("num-words"))
    try:
        threshold = float(self.parameters.get("threshold"))
    except (ValueError, TypeError):
        # TypeError covers a missing (`None`) threshold parameter
        threshold = float(self.get_options()["threshold"]["default"])

    threshold = max(-1.0, min(1.0, threshold))

    # go through all models and calculate similarity for all given input words
    result = []
    staging_area = self.unpack_archive_contents(self.source_file)

    # the staging area is removed no matter how we leave this block, i.e.
    # also when interrupted or when a model cannot be loaded
    try:
        for model_file in staging_area.glob("*.model"):
            interval = model_file.stem

            # for each separate model, calculate top similar words for each
            # input word, giving us at most
            #   [max amount] * [number of input] * [number of intervals]
            # items
            self.dataset.update_status("Running model %s..." % model_file.name)
            model = KeyedVectors.load(str(model_file))
            word_queue = set()
            checked_words = set()
            level = 1

            words = list(input_words)
            while words:
                if self.interrupted:
                    raise ProcessorInterruptedException("Interrupted while extracting similar words")

                word = words.pop()
                checked_words.add(word)

                try:
                    similar_words = model.most_similar(positive=[word], topn=num_words)
                except KeyError:
                    # word is not in this model's vocabulary. Do not skip
                    # the rest of the iteration, or words queued for the
                    # next level would be lost if this was the last word
                    similar_words = []

                for similar_word, similarity in similar_words:
                    if similarity < threshold:
                        continue

                    result.append({
                        "date": interval,
                        "input": word,
                        "item": similar_word,
                        "value": similarity,
                        "input_occurences": model.vocab[word].count,
                        "item_occurences": model.vocab[similar_word].count,
                        "depth": level
                    })

                    # queue word for the next iteration if there is one and
                    # it hasn't been seen yet
                    if level < depth and similar_word not in checked_words:
                        word_queue.add(similar_word)

                # if all words have been checked, but we still have an
                # iteration to go, load the queued words into the list -
                # minus those that were checked after they were queued
                if not words and word_queue and level < depth:
                    level += 1
                    words = list(word_queue - checked_words)
                    word_queue = set()
    finally:
        shutil.rmtree(staging_area)

    if not result:
        self.dataset.update_status(
            "None of the words were found in the word embedding model.", is_final=True)
        self.dataset.finish(0)
    else:
        self.write_csv_items_and_finish(result)
def get_items(self, query):
    """
    Run custom search

    Fetches data from Parler via its HTTP API. The queries are read from
    the dataset's `query` parameter (comma-separated); queries starting
    with `#` are treated as hashtags, all others as usernames.

    :param query:  Not used; the queries are read from the dataset
    parameters instead
    :return:  Posts, as a generator of dictionaries
    :raises ProcessorInterruptedException:  If the worker is asked to stop
    """
    # ready our parameters
    parameters = self.dataset.get_parameters()
    max_posts = parameters.get("items", 100)
    min_timestamp = parameters.get("min_date", 0)
    max_timestamp = parameters.get("max_date", time.time())
    queries = [query.strip() for query in parameters.get("query", "").split(",")]
    scrape_echoes = parameters.get("scrape_echoes", False)
    num_query = 0

    # start a HTTP session. Parler uses two session 'cookies' that are required on each request, else no response
    # will be given. These can only be obtained by logging in. Logging in via 4CAT is not preferred, because it will
    # lead to quick rate limiting and requires people to share their passwords. Instead, ask users to obtain these
    # values by logging in themselves.
    session = requests.Session()
    session.cookies.set("mst", parameters.get("mst", ""))
    session.cookies.set("jst", parameters.get("jst", ""))
    session.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"

    user_map = {}
    ref_map = {}
    seen_parleys = set()

    for query in queries:
        if not query.strip():
            continue

        num_query += 1
        query = query.strip()
        is_hashtag = (query[0] == "#")

        if is_hashtag:
            params = {"tag": query[1:], "limit": 100}
            url = "https://api.parler.com/v1/post/hashtag"
        else:
            # for user queries, we need the user ID, which is *not* the username and can only be obtained
            # via the API
            try:
                user_id_src = self.request_from_parler(
                    session, "GET", "https://api.parler.com/v1/profile", data={"username": query})
                # the helper's return value is used as a response object
                # everywhere else (`.status_code`, `.json()`, `.text`), so
                # it needs to be decoded before the ID can be read from it
                user_id = user_id_src.json()["_id"]
            except KeyError:
                # user does not exist or no results
                continue
            except json.JSONDecodeError as e:
                self.log.warning("%s:\n\n%s" % (e, user_id_src.text))
                continue

            params = {"id": user_id, "limit": 100}
            url = "https://api.parler.com/v1/post/creator"

        cursor = ""
        num_posts = 0
        while True:
            if self.interrupted:
                raise ProcessorInterruptedException("Interrupted while scraping Parler")

            if cursor:
                # handles pagination
                params["startkey"] = cursor

            try:
                chunk_posts = self.request_from_parler(session, "GET", url, data=params)

                if chunk_posts.status_code in (404, 400):
                    # no results
                    break

                if chunk_posts.status_code != 200:
                    self.dataset.update_status(
                        "Got unexpected status from Parler API (%i) - cannot parse data, halting."
                        % chunk_posts.status_code, is_final=True)
                    return

                chunk_posts = chunk_posts.json()
            except json.JSONDecodeError:
                # this would be weird
                self.dataset.update_status(
                    "Got unexpected response from Parler API - cannot parse data, halting.", is_final=True)
                return
            except (requests.RequestException, ConnectionError):
                # this would be weird
                self.dataset.update_status("Error connecting to Parler - halting.", is_final=True)
                return

            if "posts" not in chunk_posts:
                self.log.warning(repr(chunk_posts))
                break

            # the response lists users and referenced posts separately;
            # remember them so posts can be matched to them below
            for user in chunk_posts.get("users", {}):
                user_map[user["id"]] = user["username"]

            for ref in chunk_posts.get("postRefs", {}):
                ref_map[ref["_id"]] = ref

            done = False
            for post in chunk_posts["posts"]:
                # fairly straighforward - most of the API response maps 1-on-1 to 4CAT data fields
                # in case of reposts (echoes), use the original data and mark it as a repost
                # NOTE(review): the "source_dataset" key looks like it may be
                # the result of a project-wide rename rather than an actual
                # Parler API field - confirm against an API response.
                if post.get("source_dataset") and int(post.get("depth", 0)) == 1:
                    if not scrape_echoes:
                        continue

                    reposted_by = user_map.get(post["creator"])
                    post_src = ref_map[post.get("source_dataset")]
                else:
                    reposted_by = ""
                    post_src = post

                if post_src["_id"] in seen_parleys:
                    # items may be scraped twice e.g. when querying two
                    # separate hashtags that are both used in a single
                    # parley - so keep track of seen parleys and skip
                    continue

                seen_parleys.add(post_src["_id"])

                dt = datetime.datetime.strptime(post["createdAt"], "%Y%m%d%H%M%S")
                post = {
                    "id": post_src["_id"],
                    "thread_id": post_src["_id"],
                    "subject": "",
                    "body": post_src["body"],
                    "author": user_map.get(post_src["creator"], ""),
                    "timestamp": int(dt.timestamp()),
                    "comments": self.expand_number(post_src["comments"]),
                    "urls": ",".join([("https://api.parler.com/l/" + link) for link in post_src["links"]]),
                    "hashtags": ",".join(post_src["hashtags"]),
                    "impressions": self.expand_number(post_src["impressions"]),
                    "reposts": self.expand_number(post_src["reposts"]),
                    "upvotes": self.expand_number(post_src["upvotes"]),
                    "permalink": post_src.get("shareLink", ""),
                    "reposted_by": reposted_by
                }

                # a post older than the minimum date ends this query
                if min_timestamp and dt.timestamp() < min_timestamp:
                    done = True
                    break

                # posts newer than the maximum date are skipped
                if max_timestamp and dt.timestamp() >= max_timestamp:
                    continue

                num_posts += 1
                yield post

                if num_posts >= max_posts:
                    break

            self.dataset.update_status(
                "Retrieved %i posts for query '%s' (%i/%i)" % (num_posts, query, num_query, len(queries)))

            # paginate, if needed
            if not done and num_posts < max_posts and not chunk_posts["last"]:
                cursor = chunk_posts["next"]
                time.sleep(1.5)
            else:
                break

        time.sleep(1)
def get_items(self, query):
    """
    Run custom search

    Fetches data from Instagram via instaloader. The queries are read from
    the dataset's `query` parameter (comma-separated) and interpreted as
    hashtags or usernames depending on the `search_scope` parameter. Up to
    `items` posts are retrieved per query; if the `scrape_comments`
    parameter is set, comments and their replies are retrieved for each
    post as well.

    :param query:  Not used; the queries are read from the dataset
    parameters instead
    :return list:  Posts (and comments, if requested), as dictionaries. An
    empty list if the search scope is invalid.
    :raises ProcessorInterruptedException:  If the worker is asked to stop
    """
    # this is useful to include in the results because researchers are
    # always thirsty for them hashtags
    hashtag = re.compile(r"#([^\s,.+=-]+)")
    mention = re.compile(r"@([a-zA-Z0-9_]+)")

    instagram = instaloader.Instaloader(
        quiet=True,
        download_pictures=False,
        download_videos=False,
        download_comments=True,
        download_geotags=False,
        download_video_thumbnails=False,
        compress_json=False,
        save_metadata=True
    )

    # ready our parameters
    parameters = self.dataset.get_parameters()
    scope = parameters.get("search_scope", "")
    queries = [query.strip() for query in parameters.get("query", "").split(",")]

    posts = []
    max_posts = self.dataset.parameters.get("items", 500)

    # for each query, get items
    for query in queries:
        self.dataset.update_status("Retrieving posts ('%s')" % query)
        try:
            if scope == "hashtag":
                query = query.replace("#", "")
                chunk = instagram.get_hashtag_posts(query)
            elif scope == "username":
                query = query.replace("@", "")
                profile = instaloader.Profile.from_username(instagram.context, query)
                chunk = profile.get_posts()
            else:
                self.log.warning("Invalid search scope for instagram scraper: %s" % repr(scope))
                return []

            # "chunk" is a generator so actually retrieve the posts next.
            # The loop variable *is* the retrieved post: also calling
            # next() on the generator inside the loop would silently drop
            # every other post.
            posts_processed = 0
            for post in chunk:
                if self.interrupted:
                    raise ProcessorInterruptedException("Interrupted while fetching posts from Instagram")

                if posts_processed >= max_posts:
                    break

                posts.append(post)
                posts_processed += 1
                self.dataset.update_status("Retrieving posts ('%s', %i posts)" % (query, posts_processed))

                # stop before the generator fetches a post we won't use
                if posts_processed >= max_posts:
                    break

        except instaloader.InstaloaderException as e:
            # should we abort here and return 0 posts?
            self.log.warning("Instaloader exception during query %s: %s" % (self.dataset.key, e))
            self.dataset.update_status("Error while retrieving posts for query '%s'" % query)

    # go through posts, and retrieve comments
    results = []
    posts_processed = 0
    scrape_comments = self.parameters.get("scrape_comments", False)
    comments_bit = " and comments" if scrape_comments else ""

    for post in posts:
        if self.interrupted:
            raise ProcessorInterruptedException("Interrupted while fetching post metadata from Instagram")

        posts_processed += 1
        self.dataset.update_status("Retrieving metadata%s for post %i" % (comments_bit, posts_processed))

        thread_id = post.shortcode

        try:
            results.append({
                "id": thread_id,
                "thread_id": thread_id,
                "parent_id": thread_id,
                "body": post.caption if post.caption is not None else "",
                "author": post.owner_username,
                "timestamp": int(post.date_utc.timestamp()),
                "type": "video" if post.is_video else "picture",
                "url": post.video_url if post.is_video else post.url,
                "thumbnail_url": post.url,
                "hashtags": ",".join(post.caption_hashtags),
                "usertags": ",".join(post.tagged_users),
                "mentioned": ",".join(mention.findall(post.caption) if post.caption else ""),
                "num_likes": post.likes,
                "num_comments": post.comments,
                "subject": ""
            })
        except (instaloader.QueryReturnedNotFoundException, instaloader.ConnectionException):
            # post metadata could not be retrieved; the post is left out
            # of the results
            pass

        if not scrape_comments:
            continue

        try:
            for comment in post.get_comments():
                answers = [answer for answer in comment.answers]

                try:
                    results.append({
                        "id": comment.id,
                        "thread_id": thread_id,
                        "parent_id": thread_id,
                        "body": comment.text,
                        "author": comment.owner.username,
                        "timestamp": int(comment.created_at_utc.timestamp()),
                        "type": "comment",
                        "url": "",
                        "hashtags": ",".join(hashtag.findall(comment.text)),
                        "usertags": "",
                        "mentioned": ",".join(mention.findall(comment.text)),
                        "num_likes": comment.likes_count if hasattr(comment, "likes_count") else 0,
                        "num_comments": len(answers),
                        "subject": ""
                    })
                except instaloader.QueryReturnedNotFoundException:
                    pass

                # instagram only has one reply depth level at the time of
                # writing, represented here
                for answer in answers:
                    try:
                        results.append({
                            "id": answer.id,
                            "thread_id": thread_id,
                            "parent_id": comment.id,
                            "body": answer.text,
                            "author": answer.owner.username,
                            "timestamp": int(answer.created_at_utc.timestamp()),
                            "type": "comment",
                            "url": "",
                            "hashtags": ",".join(hashtag.findall(answer.text)),
                            "usertags": "",
                            "mentioned": ",".join(mention.findall(answer.text)),
                            "num_likes": answer.likes_count if hasattr(answer, "likes_count") else 0,
                            "num_comments": 0,
                            "subject": ""
                        })
                    except instaloader.QueryReturnedNotFoundException:
                        pass

        except (instaloader.QueryReturnedNotFoundException, instaloader.ConnectionException):
            # data not available...? this happens sometimes, not clear why
            pass

    return results
def get_items(self, query):
    """
    Use the Twitter v2 API historical search to get tweets

    This is a generator. It pages through the full-archive search endpoint
    using the settings stored in `self.parameters` (`query`, `amount`,
    `min_date`, `max_date` and `api_bearer_token`) and yields tweets one
    at a time, after the objects referenced in the response's 'includes'
    have been spliced into them by `self.enrich_tweet()`.

    Rate limiting (HTTP 429) and temporary unavailability (HTTP 502, 503,
    504) are waited out, after which the same request is sent again. Any
    other non-200 response, or failing to connect after five attempts,
    sets a final dataset status and ends the generator early.

    :param query:  Not used by this method; search settings are read from
    `self.parameters` instead
    :return:  Generator yielding tweets as returned by
    `self.enrich_tweet()`
    :raises ProcessorInterruptedException:  If the worker is interrupted
    while tweets are being collected
    """
    # the bearer token is sensitive: it is only sent as a request header
    # and is deliberately kept out of `params`, which is logged below
    bearer_token = self.parameters.get("api_bearer_token")
    auth = {"Authorization": "Bearer %s" % bearer_token}

    endpoint = "https://api.twitter.com/2/tweets/search/all"

    # these are all expansions and fields available at the time of writing
    # since it does not cost anything extra in terms of rate limiting, go
    # for as much data per tweet as possible...
    tweet_fields = (
        "attachments", "author_id", "context_annotations", "conversation_id",
        "created_at", "entities", "geo", "id", "in_reply_to_user_id", "lang",
        "public_metrics", "possibly_sensitive", "referenced_tweets",
        "reply_settings", "source", "text", "withheld")
    user_fields = (
        "created_at", "description", "entities", "id", "location", "name",
        "pinned_tweet_id", "profile_image_url", "protected", "public_metrics",
        "url", "username", "verified", "withheld")
    place_fields = (
        "contained_within", "country", "country_code", "full_name", "geo",
        "id", "name", "place_type")
    poll_fields = ("duration_minutes", "end_datetime", "id", "options", "voting_status")
    expansions = (
        "attachments.poll_ids", "attachments.media_keys", "author_id",
        "entities.mentions.username", "geo.place_id", "in_reply_to_user_id",
        "referenced_tweets.id", "referenced_tweets.id.author_id")
    media_fields = (
        "duration_ms", "height", "media_key", "non_public_metrics",
        "organic_metrics", "preview_image_url", "promoted_metrics",
        "public_metrics", "type", "url", "width")

    # an amount of 0 or lower means 'no limit'
    amount = convert_to_int(self.parameters.get("amount"), 10)

    params = {
        "query": self.parameters.get("query", ""),
        "expansions": ",".join(expansions),
        "tweet.fields": ",".join(tweet_fields),
        "user.fields": ",".join(user_fields),
        "poll.fields": ",".join(poll_fields),
        "place.fields": ",".join(place_fields),
        "media.fields": ",".join(media_fields),
        "max_results": max(10, min(amount, 500)) if amount > 0 else 500,  # 500 = upper limit, 10 = lower
    }

    # NOTE(review): fromtimestamp() without a timezone produces *local*
    # time, while the trailing 'Z' labels the value as UTC. This is only
    # consistent if the host runs in UTC or if the timestamps were derived
    # from local time upstream - confirm against whatever sets
    # min_date/max_date before changing it.
    if self.parameters.get("min_date"):
        params["start_time"] = datetime.datetime.fromtimestamp(
            self.parameters["min_date"]).strftime("%Y-%m-%dT%H:%M:%SZ")

    if self.parameters.get("max_date"):
        params["end_time"] = datetime.datetime.fromtimestamp(
            self.parameters["max_date"]).strftime("%Y-%m-%dT%H:%M:%SZ")

    tweets = 0
    self.dataset.log("Search parameters: %s" % repr(params))

    # each iteration of this loop requests one page of results
    while True:
        if self.interrupted:
            raise ProcessorInterruptedException(
                "Interrupted while getting tweets from the Twitter API")

        # there is a limit of one request per second, so stay on the safe
        # side of this
        while self.previous_request == int(time.time()):
            time.sleep(0.1)
        time.sleep(0.05)
        self.previous_request = int(time.time())

        # now send the request, allowing for up to 5 attempts if the
        # connection seems unstable; the wait grows by 10 seconds after
        # every failed attempt
        retries = 5
        api_response = None
        while retries > 0:
            try:
                api_response = requests.get(endpoint, headers=auth, params=params)
                break
            except (ConnectionError, requests.exceptions.RequestException) as e:
                retries -= 1
                wait_time = (5 - retries) * 10
                self.dataset.update_status(
                    "Got %s, waiting %i seconds before retrying" % (str(e), wait_time))
                time.sleep(wait_time)

        # every attempt failed, so there is no response to inspect. This
        # has to be checked *before* looking at the status code, and with
        # an identity check: a Response object is falsy for any 4xx/5xx
        # status, so `not api_response` does not mean 'no response'.
        if api_response is None:
            self.dataset.update_status(
                "Could not connect to Twitter. Cancelling.", is_final=True)
            return

        # rate limited - the limit at time of writing is 300 reqs per 15
        # minutes
        # usually you don't hit this when requesting batches of 500 at
        # 1/second
        if api_response.status_code == 429:
            resume_at = convert_to_int(api_response.headers["x-rate-limit-reset"]) + 1
            resume_at_str = datetime.datetime.fromtimestamp(int(resume_at)).strftime("%c")
            self.dataset.update_status(
                "Hit Twitter rate limit - waiting until %s to continue." % resume_at_str)
            while time.time() <= resume_at:
                time.sleep(0.5)
            # same params, so the same page is requested again
            continue

        # API keys that are valid but don't have access or haven't been
        # activated properly get a 403
        elif api_response.status_code == 403:
            try:
                detail = api_response.json().get("detail", "")
            except (ValueError, AttributeError):
                # body is not JSON (JSON decode errors are ValueErrors) or
                # is not a JSON object
                self.dataset.update_status(
                    "'Forbidden' error from Twitter API. Your key may not have access to "
                    "the full-archive search endpoint.", is_final=True)
            else:
                self.dataset.update_status(
                    "'Forbidden' error from Twitter API. Could not connect to Twitter API "
                    "with this API key. %s" % detail, is_final=True)
            return

        # sometimes twitter says '503 service unavailable' for unclear
        # reasons - in that case just wait a while and try again
        elif api_response.status_code in (502, 503, 504):
            resume_at = time.time() + 60
            resume_at_str = datetime.datetime.fromtimestamp(int(resume_at)).strftime("%c")
            self.dataset.update_status(
                "Twitter unavailable (status %i) - waiting until %s to continue." % (
                    api_response.status_code, resume_at_str))
            while time.time() <= resume_at:
                time.sleep(0.5)
            continue

        # this usually means the query is too long or otherwise contains
        # a syntax error
        elif api_response.status_code == 400:
            msg = "Response %i from the Twitter API; " % api_response.status_code
            try:
                error_data = api_response.json()
                msg += error_data.get("title", "")
                if "detail" in error_data:
                    msg += ": " + error_data.get("detail", "")
            except (ValueError, TypeError, AttributeError):
                # no usable error description in the response body
                msg += "Some of your parameters (e.g. date range) may be invalid."

            self.dataset.update_status(msg, is_final=True)
            return

        # invalid API key
        elif api_response.status_code == 401:
            self.dataset.update_status(
                "Invalid API key - could not connect to Twitter API", is_final=True)
            return

        # haven't seen one yet, but they probably exist
        elif api_response.status_code != 200:
            self.dataset.update_status(
                "Unexpected HTTP status %i. Halting tweet collection." % api_response.status_code,
                is_final=True)
            self.log.warning(
                "Twitter API v2 responded with status code %i. Response body: %s" % (
                    api_response.status_code, api_response.text))
            return

        api_response = api_response.json()

        # The API response contains tweets (of course) and 'includes',
        # objects that can be referenced in tweets. Later we will splice
        # this data into the tweets themselves to make them easier to
        # process. So extract them first...
        included_users = api_response.get("includes", {}).get("users", {})
        included_media = api_response.get("includes", {}).get("media", {})
        included_polls = api_response.get("includes", {}).get("polls", {})
        included_tweets = api_response.get("includes", {}).get("tweets", {})
        included_places = api_response.get("includes", {}).get("places", {})

        for tweet in api_response.get("data", []):
            # stop as soon as the requested number of tweets was yielded
            if 0 < amount <= tweets:
                break

            # splice referenced data back in
            # we use copy.deepcopy here because else we run into a
            # pass-by-reference quagmire
            tweet = self.enrich_tweet(
                tweet, included_users, included_media, included_polls,
                included_places, copy.deepcopy(included_tweets))

            tweets += 1
            if tweets % 500 == 0:
                self.dataset.update_status("Received %i tweets from Twitter API" % tweets)

            yield tweet

        # paginate: keep going while more tweets are wanted and the API
        # reports another page
        if (amount <= 0 or tweets < amount) and api_response.get("meta") \
                and "next_token" in api_response["meta"]:
            params["next_token"] = api_response["meta"]["next_token"]
        else:
            break