def validate_query(query, request, user):
    """
    Validate input for a dataset query on the Douban data source.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    :raises QueryParametersException:  If the date range is reversed or no
    usable group was provided
    """
    # a range that ends before it starts cannot be searched
    after, before = query.get("daterange")
    if before and after and before < after:
        raise QueryParametersException("Date range must start before it ends")

    # groups may be given as bare IDs or as URLs, separated by commas or
    # newlines; keep only the ID part of each and drop empty entries
    group_ids = []
    for raw_group in query["groups"].replace("\n", ",").split(","):
        group_id = raw_group.split("/group/").pop().split("/")[0].strip()
        if group_id:
            group_ids.append(group_id)

    # no more than 25 groups per query
    group_ids = group_ids[:25]
    if not any(group_ids):
        raise QueryParametersException("No valid groups were provided.")

    # clamp to 1-200 topics; beyond 200 Douban starts throwing 429s
    amount = convert_to_int(query["amount"], 10)
    amount = max(min(amount, 200), 1)

    return {
        "min_date": after,
        "max_date": before,
        "groups": ",".join(group_ids),
        "amount": amount,
        # strip HTML from posts?
        "strip": bool(query.get("strip", False)),
    }
def validate_query(query, request, user):
    """
    Validate BitChute query input

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    :raises QueryParametersException:  If the query is empty or contains
    more than 15 items
    """
    search_terms = query.get("query", "")
    if not search_terms.strip():
        raise QueryParametersException("You must provide a search query.")

    # newlines and commas both separate items; normalise to commas only
    item_list = search_terms.replace("\n", ",")

    # n commas separate n + 1 items, so 15 commas already means 16 items
    if item_list.count(",") >= 15:
        raise QueryParametersException(
            "You cannot query more than 15 items at a time.")

    safe_query = {}
    safe_query["items"] = query.get("max_posts")
    safe_query["query"] = item_list
    safe_query["scope"] = query.get("search_scope")
    safe_query["item_type"] = query.get("search_type")
    return safe_query
def validate_query(query, request, user):
    """
    Validate custom data input

    Confirms that the uploaded file can be read as a UTF-8 CSV file with
    all columns required for the chosen platform and, if so, returns some
    metadata.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    :raises QueryParametersException:  If no file was uploaded, the
    platform is unknown, the file cannot be parsed, or required columns
    are missing
    """
    # do we have an uploaded file?
    if "data_upload" not in request.files:
        raise QueryParametersException("No file was offered for upload.")

    platform = query.get("platform", "")
    if platform not in ImportFromExternalTool.required_columns:
        raise QueryParametersException("Invalid platform")

    file = request.files["data_upload"]
    if not file:
        raise QueryParametersException("No file was offered for upload.")

    wrapped_upload = io.TextIOWrapper(file, encoding="utf-8")
    try:
        # validate file as csv; reading the header row is what triggers
        # decoding of the first line
        reader = csv.DictReader(wrapped_upload, delimiter=",")
        try:
            # fieldnames is None for an empty file; treat that as "no
            # columns" so it is reported as missing columns below instead
            # of crashing on the membership test
            fields = reader.fieldnames or []
        except (UnicodeDecodeError, csv.Error):
            raise QueryParametersException(
                "Uploaded file is not a well-formed CSV file.")

        # check if all required fields are present
        required = ImportFromExternalTool.required_columns[platform]
        missing = [field for field in required if field not in fields]
        if missing:
            raise QueryParametersException(
                "The following required columns are not present in the csv file: %s"
                % ", ".join(missing))
    finally:
        # release the upload from the text wrapper on every path, so the
        # wrapper being discarded does not close the underlying stream
        wrapped_upload.detach()

    # return metadata - the filename is sanitised and serves no purpose at
    # this point in time, but can be used to uniquely identify a dataset
    disallowed_characters = re.compile(r"[^a-zA-Z0-9._+-]")
    return {
        "filename": disallowed_characters.sub("", file.filename),
        "time": time.time(),
        "datasource": platform,
        "board": "upload",
        "platform": platform
    }
def validate_query(query, request, user):
    """
    Validate input for a dataset query on the 4chan data source.

    Requires a body search query and unpacks the submitted date range into
    separate minimum and maximum dates. The given dictionary is modified in
    place and returned.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    :raises QueryParametersException:  If no body query is given, or a
    lower date limit is given without an upper one
    """
    # without a body query the full data set cannot be narrowed down
    if not query.get("body_match"):
        raise QueryParametersException("Please provide a search query")

    # replace the combined range with two separate parameters
    date_range = query["daterange"]
    min_date, max_date = date_range
    query.update(min_date=min_date, max_date=max_date)
    del query["daterange"]

    # a lower limit is only accepted together with an upper limit
    if min_date and not max_date:
        raise QueryParametersException(
            "When setting a date range, please provide both an upper and lower limit."
        )

    return query
def validate_query(query, request, user):
    """
    Validate a search query consisting of up to five items

    Items may be separated by commas or newlines; all whitespace is removed
    from the resulting comma-separated list.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    :raises QueryParametersException:  If the query is empty or contains
    more than 5 items
    """
    search_terms = query.get("query", "")
    if not search_terms.strip():
        raise QueryParametersException("You must provide a search query.")

    # turn newlines into separators first, then drop remaining whitespace
    items = re.sub(r"\s+", "", search_terms.replace("\n", ","))
    item_count = len(items.split(","))
    if item_count > 5:
        raise QueryParametersException(
            "You cannot query more than 5 items at a time.")

    max_posts = query.get("max_posts")
    search_scope = query.get("search_scope")

    return {
        "items": max_posts,
        "query": items,
        # pluralised scope, used in web interface
        "board": search_scope + "s",
        "search_scope": search_scope,
        "scrape_comments": query.get("scrape_comments")
    }
def validate_query(query, request, user):
    """
    Validate input for a dataset query on the Usenet data source.

    Will raise a QueryParametersException if invalid parameters are
    encountered. The given dictionary is modified in place: the date range
    is split into a minimum and a maximum date.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    """
    # admins and users with the relevant privilege may query without any
    # keyword; everyone else needs a keyword or a random sample
    exempt = user.is_admin() or user.get_value(
        "usenet.can_query_without_keyword", False)
    has_keyword = query.get("body_match", None) or query.get(
        "subject_match", None)
    is_random_sample = query.get("search_scope", "") == "random-sample"

    if not (exempt or has_keyword or is_random_sample):
        raise QueryParametersException(
            "Please provide a body query, subject query or random sample size."
        )

    # a date range is either fully specified or not specified at all
    date_range = query.get("daterange")
    query["min_date"], query["max_date"] = date_range
    only_one_limit = any(date_range) and not all(date_range)
    if only_one_limit:
        raise QueryParametersException(
            "When providing a date range, set both an upper and lower limit."
        )

    del query["daterange"]

    # nothing left to object to
    return query
def validate_query(query, request, user):
    """
    Validate TikTok query input

    Checks the search scope, clamps the requested number of posts to what
    the user is allowed to fetch, and normalises the query to a
    comma-separated list of items prefixed with the sigil of the scope.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    :raises QueryParametersException:  If the scope is invalid, the query
    is empty, the number of posts is not a number, or more than 5 items
    are queried
    """
    # 'location' would be possible as well but apparently requires a login
    search_scope = query.get("search_scope", "")
    if search_scope not in ("hashtag", "username", "music"):
        raise QueryParametersException(
            "Invalid search scope: must be hashtag, username or music")

    # no query 4 u
    if not query.get("query", "").strip():
        raise QueryParametersException("You must provide a search query.")

    # 100 is mostly arbitrary - may need tweaking; privileged users and
    # admins get a higher ceiling
    max_posts = 100 if not user.get_value(
        "tiktok.allow_more_posts", False) and not user.is_admin() else 1000
    if query.get("max_posts", ""):
        try:
            max_posts = min(abs(int(query.get("max_posts"))), max_posts)
        except (TypeError, ValueError):
            # int() raises ValueError for non-numeric strings and TypeError
            # for values that are not strings or numbers at all
            raise QueryParametersException(
                "Provide a valid number of posts to query.")

    # reformat queries to be a comma-separated list with no wrapping
    # whitespace; sigils typed by the user are removed and re-added below
    whitespace = re.compile(r"[@#\s]+")
    items = whitespace.sub("", query.get("query").replace("\n", ",")).split(",")
    if len(items) > 5:
        raise QueryParametersException(
            "You cannot query more than 5 items at a time.")

    # NOTE(review): the username sigil below looks like a redacted "@" -
    # confirm against the upstream source before relying on it
    sigil = {
        "hashtag": "#",
        "username": "******",
        "music": "🎶"
    }[search_scope]
    items = ",".join([sigil + item for item in items if item])

    # simple!
    return {
        "items": max_posts,
        "query": items,
        "board": search_scope,  # used in web interface
        "search_scope": search_scope
    }
def validate_query(query, request, user):
    """
    Validate Tumblr query input

    Normalises the query to a list of at most five tags or blogs and
    converts the optional `min_date` and `max_date` parameters, given as
    `YYYY-MM-DD`, to integer timestamps.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    :raises QueryParametersException:  If the scope is invalid, a date
    cannot be parsed, or the query has no items or more than five
    """
    # 'location' would be possible as well but apparently requires a login
    if query.get("search_scope", "") not in ("tag", "blog"):
        raise QueryParametersException("Invalid search scope: must be tag or blog")

    # no query 4 u
    if not query.get("query", "").strip():
        raise QueryParametersException("You must provide a search query.")

    # reformat queries to be a list; items may be separated by commas or
    # line breaks and leading hashes are not part of the tag
    items = query.get("query").replace("\n", ",").replace("#", "").replace("\r", ",")
    # strip before filtering, so whitespace-only entries are discarded too
    items = [item.strip() for item in items.split(",")]
    items = [item for item in items if item]

    # convert the optional date limits to timestamps
    timestamps = {}
    for key in ("max_date", "min_date"):
        value = query.get(key)
        if not value:
            timestamps[key] = None
            continue
        try:
            timestamps[key] = int(datetime.datetime.strptime(value, "%Y-%m-%d").timestamp())
        except (TypeError, ValueError):
            # a malformed date is a user error, not a server error
            raise QueryParametersException("Invalid value for %s: %s" % (key, str(value)))

    # Not more than 5 plox
    if len(items) > 5:
        raise QueryParametersException("Only query for five or less tags or blogs.")

    # no query 4 u
    if not items:
        raise QueryParametersException("Invalid search search query.")

    # simple!
    return {
        "query": items,
        "board": query.get("search_scope") + "s",  # used in web interface
        "search_scope": query.get("search_scope"),
        "fetch_reblogs": bool(query.get("fetch_reblogs", False)),
        "before": timestamps["max_date"],
        "after": timestamps["min_date"]
    }
def validate_query(query, request, user):
    """
    Validate Telegram query

    Requires a query, an authenticated session and API credentials. The
    `api_phone` parameter, if present, is removed from the given
    dictionary and not included in the result.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    :raises QueryParametersException:  If the query, session or credentials
    are missing, the number of messages is not a number, or more than 25
    items are queried
    """
    # no query 4 u
    if not query.get("query", "").strip():
        raise QueryParametersException("You must provide a search query.")

    if not query.get("session", "").strip():
        raise QueryParametersException(
            "You need to authenticate with the Telegram API first.")

    if not query.get("api_id", None) or not query.get("api_hash", None):
        raise QueryParametersException(
            "You need to provide valid Telegram API credentials first.")

    # the phone number is dropped from the submitted parameters
    query.pop("api_phone", None)

    # 50000 is mostly arbitrary - may need tweaking
    max_posts = 50000
    if query.get("max_posts", ""):
        try:
            max_posts = min(abs(int(query.get("max_posts"))), max_posts)
        except (TypeError, ValueError):
            # int() raises ValueError for non-numeric strings and TypeError
            # for values that are not strings or numbers at all
            raise QueryParametersException(
                "Provide a valid number of messages to query.")

    # reformat queries to be a comma-separated list with no wrapping
    # whitespace
    whitespace = re.compile(r"\s+")
    items = whitespace.sub("", query.get("query").replace("\n", ","))
    if len(items.split(",")) > 25:
        raise QueryParametersException(
            "You cannot query more than 25 items at a time.")

    # eliminate empty queries
    items = ",".join([item for item in items.split(",") if item])

    # simple!
    return {
        "items": max_posts,
        "query": items,
        "board": "",  # needed for web interface
        "scrape-userinfo": bool(query.get("scrape-userinfo", False)),
        "session": query.get("session"),
        "api_id": query.get("api_id"),
        "api_hash": query.get("api_hash")
    }
def validate_query(query, request, user):
    """
    Validate Parler query input

    The `min_date` and `max_date` keys of the given dictionary are set from
    the submitted date range as a side effect.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    :raises QueryParametersException:  If the query or the 'JST'/'MST'
    values are missing, the number of posts is not a number, more than 15
    items are queried, or the date range is reversed
    """
    # no query 4 u
    if not query.get("query", "").strip():
        raise QueryParametersException("You must provide a search query.")

    if not query.get("jst") or not query.get("mst"):
        raise QueryParametersException(
            "You must provide the 'JST' and 'MST' values")

    # 2500 is mostly arbitrary - may need tweaking
    max_posts = 2500
    if query.get("max_posts", ""):
        try:
            max_posts = min(abs(int(query.get("max_posts"))), max_posts)
        except (TypeError, ValueError):
            # int() raises ValueError for non-numeric strings and TypeError
            # for values that are not strings or numbers at all
            raise QueryParametersException(
                "Provide a valid number of posts to query.")

    # reformat queries to be a comma-separated list with no wrapping
    # whitespace
    whitespace = re.compile(r"\s+")
    items = whitespace.sub("", query.get("query").replace("\n", ","))
    if len(items.split(",")) > 15:
        raise QueryParametersException(
            "You cannot query more than 15 items at a time.")

    # the dates need to make sense as a range to search within
    after, before = query.get("daterange")
    if before and after and before < after:
        raise QueryParametersException(
            "Date range must start before it ends")

    query["min_date"], query["max_date"] = (after, before)

    # simple!
    return {
        "items": max_posts,
        "query": items,
        "min_date": after,
        "max_date": before,
        "jst": query.get("jst"),
        "mst": query.get("mst"),
        "scrape_echoes": bool(query.get("scrape_echoes", False))
    }
def validate_query(query, request, user):
    """
    Validate input for a dataset query on the 4chan data source.

    Will raise a QueryParametersException if invalid parameters are
    encountered. Parameters that only apply to a search scope other than
    the chosen one are removed. The given dictionary is modified in place
    and returned.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    """
    # this is the bare minimum, else we can't narrow down the full data set
    if not user.is_admin() \
            and not user.get_value("4chan.can_query_without_keyword", False) \
            and not query.get("body_match", None) \
            and not query.get("subject_match", None) \
            and query.get("search_scope", "") != "random-sample":
        raise QueryParametersException("Please provide a message or subject search query")

    # replace the combined range with two separate parameters
    query["min_date"], query["max_date"] = query["daterange"]
    del query["daterange"]

    # scope-specific parameters are meaningless for other scopes; use pop()
    # with a default so a request that never sent them does not crash
    if query.get("search_scope") not in ("dense-threads",):
        query.pop("scope_density", None)
        query.pop("scope_length", None)

    if query.get("search_scope") not in ("match-ids",):
        query.pop("valid_ids", None)

    return query
def validate_query(query, request, user):
    """
    Validate a hashtag or username search query

    Checks the search scope, clamps the requested number of posts and
    normalises the query to a comma-separated list without whitespace.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    :raises QueryParametersException:  If the scope is invalid, the query
    is empty, the number of posts is not a number, or more than 5 items
    are queried
    """
    # 'location' would be possible as well but apparently requires a login
    search_scope = query.get("search_scope", "")
    if search_scope not in ("hashtag", "username"):
        raise QueryParametersException(
            "Invalid search scope: must be hashtag or username")

    # no query 4 u
    if not query.get("query", "").strip():
        raise QueryParametersException("You must provide a search query.")

    # 2500 is mostly arbitrary - may need tweaking
    max_posts = 2500
    if query.get("max_posts", ""):
        try:
            max_posts = min(abs(int(query.get("max_posts"))), max_posts)
        except (TypeError, ValueError):
            # int() raises ValueError for non-numeric strings and TypeError
            # for values that are not strings or numbers at all
            raise QueryParametersException(
                "Provide a valid number of posts to query.")

    # reformat queries to be a comma-separated list with no wrapping
    # whitespace
    whitespace = re.compile(r"\s+")
    items = whitespace.sub("", query.get("query").replace("\n", ","))
    if len(items.split(",")) > 5:
        raise QueryParametersException(
            "You cannot query more than 5 items at a time.")

    # simple!
    return {
        "items": max_posts,
        "query": items,
        "board": search_scope + "s",  # used in web interface
        "search_scope": search_scope,
        "scrape_comments": bool(query.get("scrape_comments", False))
    }
def validate_query(query, request, user):
    """
    Validate input for a dataset query within the 'deu' or 'gbr' corpus.

    Will raise a QueryParametersException if invalid parameters are
    encountered. A date range is mandatory, must start at or after
    timestamp 946684800 and may span six months at most. Parameters whose
    name ends in `_proxy` are not included in the result.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    """
    # this is the bare minimum, else we can't narrow down the full data set
    if not query.get("body_match", None) and not query.get("subject_match", None):
        raise QueryParametersException("Please provide a search query")

    if query.get("corpus") not in ("deu", "gbr"):
        raise QueryParametersException("Please choose a valid corpus to search within")

    # both dates need to be set, or none
    if query.get("min_date", None) and not query.get("max_date", None):
        raise QueryParametersException("When setting a date range, please provide both an upper and lower limit.")

    # a date range is required for this data source
    if not (query.get("min_date", None) and query.get("max_date", None)):
        raise QueryParametersException("You need to provide a date range for your query")

    # the dates need to make sense as a range to search within
    try:
        before = int(query.get("max_date", ""))
        after = int(query.get("min_date", ""))
    except (TypeError, ValueError):
        raise QueryParametersException("Please provide valid dates for the date range.")

    # 946684800 is the earliest accepted start of the range
    if after < 946684800:
        raise QueryParametersException("Please provide valid dates for the date range.")

    if before < after:
        raise QueryParametersException(
            "Please provide a valid date range where the start is before the end of the range.")

    # the span is end minus start; the reverse is never positive at this
    # point and would let ranges of any length through
    if before - after > (6 * 86400 * 30.25):
        raise QueryParametersException("The date range for this query can span 6 months at most.")

    query["min_date"] = after
    query["max_date"] = before

    # leave out placeholder fields
    is_placeholder = re.compile("_proxy$")
    filtered_query = {
        field: value for field, value in query.items()
        if not is_placeholder.search(field)
    }

    # if we made it this far, the query can be executed
    return filtered_query
def validate_query(query, request, user):
    """
    Validate a query consisting of up to ten line-separated tags or blogs

    The given dictionary is modified in place: the query is replaced by a
    comma-separated string, the date range is split into a minimum and a
    maximum date, and a `board` value is derived from the search scope.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    :raises QueryParametersException:  If the query has no items or more
    than ten, or only one limit of the date range is given
    """
    # no query 4 u
    if not query.get("query", "").strip():
        raise QueryParametersException("You must provide a search query.")

    # one item per line; leading hashes are not part of the tag. Strip
    # and discard empty entries *before* counting, so blank lines neither
    # count towards the limit nor end up in the query
    items = query.get("query").replace("#", "").split("\n")
    items = [item.strip() for item in items]
    items = [item for item in items if item]

    # Not more than 10 plox
    if len(items) > 10:
        raise QueryParametersException("Only query for ten or less tags or blogs." + str(len(items)))

    # no query 4 u - e.g. when the input consisted of hashes only
    if not items:
        raise QueryParametersException("Search query cannot be empty.")

    # So it shows nicely in the frontend.
    items = ", ".join(items)

    # the dates need to make sense as a range to search within
    date_range = query.get("daterange")
    query["min_date"], query["max_date"] = date_range
    if any(date_range) and not all(date_range):
        raise QueryParametersException("When providing a date range, set both an upper and lower limit.")

    del query["daterange"]

    query["query"] = items
    query["board"] = query.get("search_scope") + "s"  # used in web interface

    # if we made it this far, the query can be executed
    return query
def validate_query(query, request, user):
    """
    Validate input for a dataset query on the Twitter data source.

    Will raise a QueryParametersException if invalid parameters are
    encountered. Only known parameters are passed on.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    """
    search_query = query.get("query", None)
    bearer_token = query.get("api_bearer_token", None)

    # a query and a token are the bare minimum
    if not search_query:
        raise QueryParametersException("Please provide a query.")

    if not bearer_token:
        raise QueryParametersException(
            "Please provide a valid bearer token.")

    if len(search_query) > 1024:
        raise QueryParametersException(
            "Twitter API queries cannot be longer than 1024 characters.")

    # either limit of the range may be left out on Twitter, but if both
    # are given they need to be in the right order
    after, before = query.get("daterange")
    range_is_reversed = before and after and before < after
    if range_is_reversed:
        raise QueryParametersException(
            "Date range must start before it ends")

    # all good - pass on the parameters that are needed
    safe_query = {
        "query": search_query,
        "api_bearer_token": bearer_token,
        "min_date": after,
        "max_date": before,
    }
    safe_query["amount"] = query.get("amount")
    return safe_query
def validate_query(query, request, user):
    """
    Validate Telegram query

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    :raises QueryParametersException:  If the query or any of the API
    credentials is missing, or more than 25 items are queried
    """
    search_terms = query.get("query", "")
    if not search_terms.strip():
        raise QueryParametersException("You must provide a search query.")

    # all three credentials are required
    api_id = query.get("api_id", None)
    api_hash = query.get("api_hash", None)
    api_phone = query.get("api_phone", None)
    if not (api_id and api_hash and api_phone):
        raise QueryParametersException(
            "You need to provide valid Telegram API credentials first.")

    # newlines separate items too; all other whitespace is dropped
    compact = re.sub(r"\s+", "", search_terms.replace("\n", ","))
    entities = compact.split(",")

    # the limit applies before empty items are discarded
    if len(entities) > 25:
        raise QueryParametersException(
            "You cannot query more than 25 items at a time.")

    non_empty = filter(None, entities)

    return {
        "items": query.get("max_posts"),
        "query": ",".join(non_empty),
        "board": "",  # needed for web interface
        "api_id": api_id,
        "api_hash": api_hash,
        "api_phone": api_phone
    }
def validate_query(query, request, user):
    """
    Validate input for a dataset query on the Guardian data source.

    Will raise a QueryParametersException if invalid parameters are
    encountered. The given dictionary is modified in place: the mandatory
    date range is split into a minimum and a maximum date.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    """
    # without a body or subject query the data set can't be narrowed down
    has_keyword = query.get("body_match", None) or query.get("subject_match", None)
    if not has_keyword:
        raise QueryParametersException("Please provide a search query")

    # dense threads need a valid density percentage and thread length
    if query.get("search_scope", "") == "dense-threads":
        try:
            density = int(query.get("scope_density", ""))
        except ValueError:
            raise QueryParametersException("Please provide a valid numerical density percentage.")

        if not 15 <= density <= 100:
            raise QueryParametersException("Please provide a density percentage between 15 and 100.")

        try:
            length = int(query.get("scope_length", ""))
        except ValueError:
            raise QueryParametersException("Please provide a valid numerical dense thread length.")

        if length < 30:
            raise QueryParametersException("Please provide a dense thread length of at least 30.")

    # both limits of the date range are mandatory here
    date_range = query.get("daterange")
    if not all(date_range):
        raise QueryParametersException("You must provide a date range")

    min_date, max_date = date_range
    query["min_date"] = min_date
    query["max_date"] = max_date
    del query["daterange"]

    # six months, counted as 31 days each
    longest_span = 86400 * 31 * 6
    if max_date and (max_date - min_date) > longest_span:
        raise QueryParametersException("Date range may span 6 months at most")

    # nothing left to object to
    return query
def parse_value(settings, choice, silently_correct=True):
    """
    Filter user input

    Makes sure user input for post-processors is valid and within the
    parameters specified by the post-processor

    :param obj settings:  Settings, including defaults and valid options
    :param choice:  The chosen option, to be parsed
    :param bool silently_correct:  If true, replace invalid values with the
    given default value; else, raise a QueryParametersException if a value
    is invalid. Only choice and text inputs honour this; dates that cannot
    be parsed always yield `None`.
    :return:  Validated and parsed input
    """
    input_type = settings.get("type", "")

    if input_type in (UserInput.OPTION_INFO, UserInput.OPTION_DIVIDER):
        # these are structural form elements and can never return a value
        return None

    elif input_type == UserInput.OPTION_TOGGLE:
        # simple boolean toggle - anything that was sent at all counts as on
        return choice is not None

    elif input_type in (UserInput.OPTION_DATE, UserInput.OPTION_DATERANGE):
        # parse either integers (unix timestamps) or try to guess the date
        # format (the latter may be used for input if JavaScript is turned
        # off in the front-end and the input comes from there)
        try:
            return int(choice)
        except ValueError:
            # not a number - may still be a date string, see below
            pass
        except Exception:
            # deliberate best-effort: anything that is not even a string
            # or number (e.g. None) has no date value
            return None

        try:
            return int(parse_datetime(choice).timestamp())
        except Exception:
            # deliberate best-effort: input that cannot be read as a date
            # has no date value
            return None

    elif input_type == UserInput.OPTION_MULTI:
        # any number of values out of a list of possible values
        # comma-separated during input, returned as a list of valid options
        if not choice:
            return settings.get("default", [])

        valid_options = settings.get("options", [])
        return [item for item in choice.split(",") if item in valid_options]

    elif input_type == UserInput.OPTION_CHOICE:
        # select box
        # one out of multiple options
        # return option if valid, or default
        valid_options = settings.get("options", {})
        if choice in valid_options:
            return choice

        if not silently_correct:
            raise QueryParametersException(
                "Invalid value selected; must be one of %s." % ", ".join(valid_options.keys()))

        return settings.get("default", "")

    elif input_type in (UserInput.OPTION_TEXT, UserInput.OPTION_TEXT_LARGE):
        # text string
        # optionally clamp it as an integer; return default if not a valid
        # integer
        if "max" in settings:
            try:
                choice = min(settings["max"], int(choice))
            except (ValueError, TypeError):
                if not silently_correct:
                    raise QueryParametersException("Provide a value of %i or lower." % settings["max"])

                choice = settings.get("default")

        if "min" in settings:
            try:
                choice = max(settings["min"], int(choice))
            except (ValueError, TypeError):
                if not silently_correct:
                    raise QueryParametersException("Provide a value of %i or more." % settings["min"])

                choice = settings.get("default")

        if choice is None or choice == "":
            choice = settings.get("default")

        if choice is None:
            # no default either: fall back to a value of the expected type
            choice = 0 if "min" in settings or "max" in settings else ""

        return choice

    else:
        # no filtering
        return choice
def validate_query(query, request, user):
    """
    Validate input for a dataset query on the 4chan data source.

    Will raise a QueryParametersException if invalid parameters are
    encountered. Mutually exclusive parameters are sanitised by dropping
    one of them from the given dictionary, which is modified in place.
    Parameters whose name ends in `_proxy` are not included in the result.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    """
    body_match = query.get("body_match", None)
    subject_match = query.get("subject_match", None)
    search_scope = query.get("search_scope", "")

    # admins and users with the relevant privilege may query without any
    # keyword; everyone else needs a keyword or a random sample
    if not (user.is_admin()
            or user.get_value("4chan.can_query_without_keyword", False)
            or body_match
            or subject_match
            or search_scope == "random-sample"):
        raise QueryParametersException(
            "Please provide a body query, subject query or random sample size."
        )

    # if only one of the two match parameters was given, blank the other
    if subject_match and not body_match:
        query["body_match"] = ""
    elif body_match and not subject_match:
        query["subject_match"] = ""

    # a body query combined with full threads returns too many posts in
    # most cases, so full threads is dropped
    if body_match:
        query.pop("full_threads", None)

    # a random sample needs a sample size and can't use full threads either
    if search_scope == "random-sample":
        try:
            sample_size = int(query.get("random_amount", 0))
        except ValueError:
            raise QueryParametersException(
                "Please provide a valid numerical sample size.")

        if not 1 <= sample_size <= 100000:
            raise QueryParametersException(
                "Please provide a sample size between 1 and 100000.")

        query.pop("full_threads", None)

    # dense threads need a valid density percentage and thread length;
    # full threads is implied, so it is otherwise left alone here
    if search_scope == "dense-threads":
        try:
            density = int(query.get("scope_density", ""))
        except ValueError:
            raise QueryParametersException(
                "Please provide a valid numerical density percentage.")

        if not 15 <= density <= 100:
            raise QueryParametersException(
                "Please provide a density percentage between 15 and 100.")

        try:
            length = int(query.get("scope_length", ""))
        except ValueError:
            raise QueryParametersException(
                "Please provide a valid numerical dense thread length.")

        if length < 30:
            raise QueryParametersException(
                "Please provide a dense thread length of at least 30.")

    min_date = query.get("min_date", None)
    max_date = query.get("max_date", None)

    # a lower limit is only accepted together with an upper limit
    if min_date and not max_date:
        raise QueryParametersException(
            "When setting a date range, please provide both an upper and lower limit."
        )

    # if a full range is given, it needs to be numeric and in order
    if min_date and max_date:
        try:
            before = int(max_date)
            after = int(min_date)
        except ValueError:
            raise QueryParametersException(
                "Please provide valid dates for the date range.")

        if before < after:
            raise QueryParametersException(
                "Please provide a valid date range where the start is before the end of the range."
            )

        query["min_date"] = after
        query["max_date"] = before

    # leave out placeholder fields
    is_placeholder = re.compile("_proxy$")
    filtered_query = {
        field: value for field, value in query.items()
        if not is_placeholder.search(field)
    }

    # nothing left to object to
    return filtered_query
def validate_query(query, request, user):
    """
    Validate Tumblr query input

    Normalises the query to a list of at most five tags or blogs and
    converts the optional `min_date` and `max_date` parameters, given as
    either `YYYY-MM-DD` or `DD-MM-YYYY`, to integer timestamps.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    :raises QueryParametersException:  If the scope is invalid, a date
    cannot be parsed, or the query has no items or more than five
    """
    # 'location' would be possible as well but apparently requires a login
    if query.get("search_scope", "") not in ("tag", "blog"):
        raise QueryParametersException("Invalid search scope: must be tag or blog")

    # no query 4 u
    if not query.get("query", "").strip():
        raise QueryParametersException("You must provide a search query.")

    # reformat queries to be a list; items may be separated by commas or
    # line breaks and leading hashes are not part of the tag
    items = query.get("query").replace("\n", ",").replace("#", "").replace("\r", ",")
    # strip before filtering, so whitespace-only entries are discarded too
    items = [item.strip() for item in items.split(",")]
    items = [item for item in items if item]

    # On some OSes, the date is submitted as dd-mm-yyyy. Make sure to also
    # fetch these.
    ddmmyyyy = re.compile(r"^([0-2][0-9]|(3)[0-1])(-)(((0)[0-9])|((1)[0-2]))(-)\d{4}$")

    # Set dates, if given.
    timestamps = {}
    for key, label in (("max_date", "max date"), ("min_date", "min date")):
        value = query.get(key)
        if not value:
            timestamps[key] = None
            continue

        try:
            date_format = "%d-%m-%Y" if ddmmyyyy.match(value) else "%Y-%m-%d"
            timestamps[key] = int(datetime.datetime.strptime(value, date_format).timestamp())
        except (TypeError, ValueError):
            raise QueryParametersException("Invalid value for %s %s " % (label, str(value)))

    # Not more than 5 plox
    if len(items) > 5:
        raise QueryParametersException("Only query for five or less tags or blogs.")

    # no query 4 u
    if not items:
        raise QueryParametersException("Invalid search search query.")

    # simple!
    return {
        "query": items,
        "board": query.get("search_scope") + "s",  # used in web interface
        "search_scope": query.get("search_scope"),
        "fetch_reblogs": bool(query.get("fetch_reblogs", False)),
        "before": timestamps["max_date"],
        "after": timestamps["min_date"]
    }
def validate_query(query, request, user):
    """
    Validate input for a board-based dataset query.

    Will raise a QueryParametersException if invalid parameters are
    encountered. Mutually exclusive parameters may also be sanitised by
    ignoring either of the mutually exclusive options. Note that the given
    query dictionary is modified in place while validating.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters (all fields except those ending in
    `_proxy`)
    :raises QueryParametersException:  If a parameter is missing or invalid
    """
    # we need a board! strip whitespace *before* removing the prefix, since
    # the pattern is anchored and would not match " r/board" otherwise
    r_prefix = re.compile(r"^/?r/")
    boards = [r_prefix.sub("", board.strip())
              for board in query.get("board", "").split(",") if board.strip()]

    if not boards:
        raise QueryParametersException(
            "Please provide a board or a comma-separated list of boards to query."
        )

    # ignore leading r/ for boards
    query["board"] = ",".join(boards)

    # this is the bare minimum, else we can't narrow down the full data set
    may_skip_keyword = user.is_admin() or user.get_value("reddit.can_query_without_keyword", False)
    if not may_skip_keyword \
            and not query.get("body_match", "").strip() \
            and not query.get("subject_match", "").strip():
        raise QueryParametersException(
            "Please provide a body query or subject query.")

    # body query and full threads are incompatible, returning too many posts
    # in most cases
    if query.get("body_match", None):
        query.pop("full_threads", None)

    # Make sure no body or subject searches starting with just a minus sign
    # are possible, e.g. "-Trump"
    queries_to_check = []
    for field in ("body_match", "subject_match"):
        if query.get(field, None):
            queries_to_check += [part.strip() for part in query[field].split(" ")]

    if queries_to_check and all(part.startswith("-") for part in queries_to_check):
        raise QueryParametersException(
            "Please provide body queries that do not start with a minus sign."
        )

    # only one of two dense threads options may be chosen at the same time, and
    # it requires valid density and length parameters. full threads is implied,
    # so it is otherwise left alone here
    if query.get("search_scope", "") == "dense-threads":
        try:
            dense_density = int(query.get("scope_density", ""))
        except (TypeError, ValueError):
            raise QueryParametersException(
                "Please provide a valid numerical density percentage.")

        if dense_density < 15 or dense_density > 100:
            raise QueryParametersException(
                "Please provide a density percentage between 15 and 100.")

        try:
            dense_length = int(query.get("scope_length", ""))
        except (TypeError, ValueError):
            raise QueryParametersException(
                "Please provide a valid numerical dense thread length.")

        if dense_length < 30:
            raise QueryParametersException(
                "Please provide a dense thread length of at least 30.")

    # both dates need to be set, or none - check in both directions
    has_min_date = bool(query.get("min_date", None))
    has_max_date = bool(query.get("max_date", None))
    if has_min_date != has_max_date:
        raise QueryParametersException(
            "When setting a date range, please provide both an upper and lower limit."
        )

    # the dates need to make sense as a range to search within
    if has_min_date and has_max_date:
        try:
            before = int(query.get("max_date", ""))
            after = int(query.get("min_date", ""))
        except (TypeError, ValueError):
            raise QueryParametersException(
                "Please provide valid dates for the date range.")

        if before < after:
            raise QueryParametersException(
                "Please provide a valid date range where the start is before the end of the range."
            )

        query["min_date"] = after
        query["max_date"] = before

    # placeholder fields are an artifact of the web interface
    is_placeholder = re.compile("_proxy$")
    filtered_query = {field: query[field] for field in query if not is_placeholder.search(field)}

    # if we made it this far, the query can be executed
    return filtered_query
def validate_query(query, request, user):
    """
    Validate custom data input

    Confirms that the uploaded file is a valid CSV or tab file and, if so,
    returns some metadata. The delimiter is sniffed from the first part of
    the file.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    :raises QueryParametersException:  If no file was uploaded, the file
    could not be read as delimited text, required columns are missing or the
    first row has an unparseable timestamp.
    """
    # do we have an uploaded file?
    if "data_upload" not in request.files:
        raise QueryParametersException("No file was offered for upload.")

    upload = request.files["data_upload"]
    if not upload:
        raise QueryParametersException("No file was offered for upload.")

    encoding = SearchCustom.sniff_encoding(upload)
    wrapped_file = io.TextIOWrapper(upload, encoding=encoding)

    try:
        # neither an undecodable sample nor an undetectable dialect should
        # surface as an unhandled error
        try:
            sample = wrapped_file.read(1024 * 1024)
            wrapped_file.seek(0)
            dialect = csv.Sniffer().sniff(sample, delimiters=(",", ";", "\t"))
        except (UnicodeDecodeError, csv.Error):
            raise QueryParametersException(
                "Uploaded file is not a well-formed CSV or TAB file.")

        reader = csv.DictReader(wrapped_file, dialect=dialect)

        try:
            # fieldnames is None for an empty file; treat that as a file
            # without any columns
            fieldnames = reader.fieldnames or []
        except (UnicodeDecodeError, csv.Error):
            raise QueryParametersException(
                "Uploaded file is not a well-formed CSV or TAB file.")

        # check if all required fields are present
        required = ("id", "thread_id", "subject", "author", "body", "timestamp")
        missing = [field for field in required if field not in fieldnames]

        if missing:
            raise QueryParametersException(
                "The following required columns are not present in the csv file: %s"
                % ", ".join(missing))

        # only the first data row is checked; a file with just a header row
        # is accepted as is
        try:
            row = next(reader)
        except StopIteration:
            row = None
        except (UnicodeDecodeError, csv.Error):
            raise QueryParametersException(
                "Uploaded file is not a well-formed CSV or TAB file.")

        if row is not None:
            try:
                parse_datetime(row["timestamp"])
            except ValueError:
                raise QueryParametersException(
                    "Your 'timestamp' column does not use a recognisable format (yyyy-mm-dd hh:mm:ss is recommended)"
                )
    finally:
        # detach on every path, so that discarding the wrapper does not
        # close the underlying upload stream
        wrapped_file.detach()

    # Whether to strip the HTML tags
    strip_html = bool(query.get("strip_html"))

    # return metadata - the filename is sanitised and serves no purpose at
    # this point in time, but can be used to uniquely identify a dataset
    disallowed_characters = re.compile(r"[^a-zA-Z0-9._+-]")
    return {
        "filename": disallowed_characters.sub("", upload.filename),
        "time": time.time(),
        "datasource": "custom",
        "board": "upload",
        "strip_html": strip_html
    }
def validate_query(query, request, user):
    """
    Validate input for a hashtag or username search that requires a login

    Checks scope, query and post limit, verifies that the provided
    credentials can be used to log in, and returns the query parameters with
    the login details encrypted.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    :raises QueryParametersException:  If a parameter is missing or invalid,
    or the login attempt fails
    """
    # 'location' would be possible as well but apparently requires a login
    if query.get("search_scope", "") not in ("hashtag", "username"):
        raise QueryParametersException("Invalid search scope: must be hashtag or username")

    # no query 4 u
    if not query.get("query", "").strip():
        raise QueryParametersException("You must provide a search query.")

    # a missing field yields None, which cannot be stripped - treat it the
    # same as an empty value
    username = query.get("username")
    password = query.get("password")
    if not (username or "").strip() or not (password or "").strip():
        raise QueryParametersException("You need to provide a username and password")

    # do the cheap checks before attempting a login, so no login attempt is
    # made for a query that is going to be rejected anyway

    # 2500 is mostly arbitrary - may need tweaking
    max_posts = 2500
    if query.get("max_posts", ""):
        try:
            max_posts = min(abs(int(query.get("max_posts"))), max_posts)
        except (TypeError, ValueError):
            # int() raises ValueError for non-numeric strings and TypeError
            # for values that are not a string or number at all
            raise QueryParametersException("Provide a valid number of posts to query.")

    # reformat queries to be a comma-separated list with no wrapping
    # whitespace
    whitespace = re.compile(r"\s+")
    items = whitespace.sub("", query.get("query").replace("\n", ","))
    if len(items.split(",")) > 5:
        raise QueryParametersException("You cannot query more than 5 items at a time.")

    login_tester = instaloader.Instaloader()
    try:
        login_tester.login(username, password)
    except instaloader.TwoFactorAuthRequiredException:
        raise QueryParametersException(
            "Two-factor authentication with Instagram is not available via 4CAT at this time. Disable it for your Instagram account and try again.")
    except (instaloader.InvalidArgumentException, instaloader.BadCredentialsException):
        raise QueryParametersException("Invalid Instagram username or password.")

    # there are some fundamental limits to how safe we can make this, but
    # we can at least encrypt it so that if someone has access to the
    # database but not the 4CAT config file, they cannot use the login
    # details
    # we use the 4CAT anonymisation salt (which *should* be a long,
    # random string)
    # making sure the 4CAT config file is kept safe is left as an exercise
    # for the reader...
    key = SearchInstagram.salt_to_fernet_key()
    fernet = Fernet(key)
    obfuscated_login = fernet.encrypt(json.dumps([username, password]).encode("utf-8"))

    # simple!
    return {
        "login": obfuscated_login.decode("utf-8"),
        "items": max_posts,
        "query": items,
        "board": query.get("search_scope") + "s",  # used in web interface
        "search_scope": query.get("search_scope"),
        "scrape_comments": bool(query.get("scrape_comments", False))
    }
def validate_query(query, request, user):
    """
    Validate input for a dataset query on the 4chan data source.

    Will raise a QueryParametersException if invalid parameters are
    encountered. Mutually exclusive parameters may also be sanitised by
    ignoring either of the mutually exclusive options. A date range is
    mandatory and may span six months at most.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters (all fields except those ending in
    `_proxy`)
    :raises QueryParametersException:  If a parameter is missing or invalid
    """
    # this is the bare minimum, else we can't narrow down the full data set
    if not query.get("body_match", None) and not query.get("subject_match", None):
        raise QueryParametersException("Please provide a search query")

    # only one of two dense threads options may be chosen at the same time, and
    # it requires valid density and length parameters. full threads is implied,
    # so it is otherwise left alone here
    if query.get("search_scope", "") == "dense-threads":
        try:
            dense_density = int(query.get("scope_density", ""))
        except (TypeError, ValueError):
            raise QueryParametersException("Please provide a valid numerical density percentage.")

        if dense_density < 15 or dense_density > 100:
            raise QueryParametersException("Please provide a density percentage between 15 and 100.")

        try:
            dense_length = int(query.get("scope_length", ""))
        except (TypeError, ValueError):
            raise QueryParametersException("Please provide a valid numerical dense thread length.")

        if dense_length < 30:
            raise QueryParametersException("Please provide a dense thread length of at least 30.")

    # both dates need to be set, or none
    if query.get("min_date", None) and not query.get("max_date", None):
        raise QueryParametersException("When setting a date range, please provide both an upper and lower limit.")

    # a date range is required for this data source
    if not (query.get("min_date", None) and query.get("max_date", None)):
        raise QueryParametersException("You need to provide a date range for your query")

    # the dates need to make sense as a range to search within
    try:
        before = int(query.get("max_date", ""))
        after = int(query.get("min_date", ""))
    except (TypeError, ValueError):
        raise QueryParametersException("Please provide valid dates for the date range.")

    # 946684800 is 1 January 2000 (UTC) as a UNIX timestamp
    if after < 946684800:
        raise QueryParametersException("Please provide valid dates for the date range.")

    if before < after:
        raise QueryParametersException(
            "Please provide a valid date range where the start is before the end of the range.")

    # the length of the range is end minus start; the other way around is
    # never positive at this point and would never trigger the limit
    if before - after > (6 * 86400 * 30.25):
        raise QueryParametersException("The date range for this query can span 6 months at most.")

    query["min_date"] = after
    query["max_date"] = before

    # placeholder fields are an artifact of the web interface
    is_placeholder = re.compile("_proxy$")
    filtered_query = {field: query[field] for field in query if not is_placeholder.search(field)}

    # if we made it this far, the query can be executed
    return filtered_query
def parse_all(options, input, silently_correct=True):
    """
    Sanitise submitted form input against a set of option definitions

    Only fields that correspond to one of the defined options end up in the
    result; everything else in the input is discarded. Each retained value
    is parsed according to the option's settings. Options that were not
    submitted get their default value, or `None` if no default is defined.
    The result is thus a dictionary with only white-listed keys.

    :param dict options:  Options, as a name -> settings dictionary
    :param dict input:  Input, as a form field -> value dictionary
    :param bool silently_correct:  If true, replace invalid values with the
    given default value; else, raise a QueryParametersException if a value
    is invalid.
    :return dict:  Sanitised form input
    """
    # the web interface submits every parameter as option-[parameter ID];
    # the prefix carries no meaning here, so drop it
    unprefixed = {}
    for field in input:
        unprefixed[re.sub(r"^option-", "", field)] = input[field]
    input = unprefixed

    sanitised = {}
    for name, settings in options.items():
        kind = settings.get("type")

        # dividers and info blocks are purely structural and have no value
        if kind in (UserInput.OPTION_DIVIDER, UserInput.OPTION_INFO):
            continue

        if kind == UserInput.OPTION_DATERANGE:
            # a date range is submitted as two separate fields
            lower_field = name + "-min"
            upper_field = name + "-max"

            # the client normally fills these in; if it did not, fall back
            # to the proxy fields and try to make sense of those
            if input.get(lower_field, "-1") == "-1":
                lower_field += "_proxy"

            if input.get(upper_field, "-1") == "-1":
                upper_field += "_proxy"

            # both ends are unix timestamps (or None)
            after = UserInput.parse_value(settings, input.get(lower_field), silently_correct)
            before = UserInput.parse_value(settings, input.get(upper_field), silently_correct)

            if before and after and after > before:
                if silently_correct:
                    before = after
                else:
                    raise QueryParametersException("End of date range must be after beginning of date range.")

            sanitised[name] = (after, before)
            continue

        if kind == UserInput.OPTION_TOGGLE:
            # unchecked checkboxes are simply absent from the input, so
            # presence is the value
            sanitised[name] = name in input
            continue

        if name in input:
            # regular option: parse and sanitise the submitted value
            sanitised[name] = UserInput.parse_value(settings, input[name], silently_correct)
        else:
            # nothing submitted: fall back to the default
            sanitised[name] = settings.get("default", None)

    return sanitised
def validate_query(query, request, user):
    """
    Validate input for a board-based dataset query with a combined date
    range field.

    Will raise a QueryParametersException if invalid parameters are
    encountered. Mutually exclusive parameters may also be sanitised by
    ignoring either of the mutually exclusive options. The given query
    dictionary is modified in place and returned.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    :raises QueryParametersException:  If a parameter is missing or invalid
    """
    # we need a board! strip whitespace *before* removing the prefix, since
    # the pattern is anchored and would not match " r/board" otherwise
    r_prefix = re.compile(r"^/?r/")
    boards = [r_prefix.sub("", board.strip())
              for board in query.get("board", "").split(",") if board.strip()]

    if not boards:
        raise QueryParametersException(
            "Please provide a board or a comma-separated list of boards to query."
        )

    # ignore leading r/ for boards
    query["board"] = ",".join(boards)

    can_query_without_keyword = user.get_value("reddit.can_query_without_keyword", False)

    # this is the bare minimum, else we can't narrow down the full data set
    if not user.is_admin() and not can_query_without_keyword \
            and not query.get("body_match", "").strip() \
            and not query.get("subject_match", "").strip():
        raise QueryParametersException(
            "Please provide a body query or subject query.")

    # body query and full threads are incompatible, returning too many posts
    # in most cases
    if query.get("body_match", None):
        query.pop("full_threads", None)

    # Make sure no body or subject searches starting with just a minus sign
    # are possible, e.g. "-Trump"
    queries_to_check = []
    for field in ("body_match", "subject_match"):
        if query.get(field, None):
            queries_to_check += [part.strip() for part in query[field].split(" ")]

    if queries_to_check and all(part.startswith("-") for part in queries_to_check):
        raise QueryParametersException(
            "Please provide body queries that do not start with a minus sign."
        )

    # both dates need to be set, or none
    # NOTE(review): this looks at min_date/max_date as submitted, before they
    # are overwritten from the date range below - confirm this is intended
    if query.get("min_date", None) and not query.get("max_date", None):
        raise QueryParametersException(
            "When setting a date range, please provide both an upper and lower limit."
        )

    # the date range is a (start, end) pair; a missing range is treated as
    # an unbounded one instead of failing on unpacking
    query["min_date"], query["max_date"] = query.get("daterange") or (None, None)

    if "*" in query.get("body_match", "") and not can_query_without_keyword:
        raise QueryParametersException(
            "Wildcard queries are not allowed as they typically return too many results to properly process."
        )

    if "*" in query.get("board", "") and not can_query_without_keyword:
        raise QueryParametersException(
            "Wildcards are not allowed for boards as this typically returns too many results to properly process."
        )

    # the combined field has been split up and is no longer needed; the
    # fields may be absent, which is not an error
    query.pop("daterange", None)
    if query.get("search_scope") not in ("dense-threads", ):
        query.pop("scope_density", None)
        query.pop("scope_length", None)

    # if we made it this far, the query can be executed
    return query
def validate_query(query, request, user):
    """
    Validate custom data input

    Confirms that the uploaded file is a valid CSV or tab file and, if so,
    returns some metadata. Files with a `.tab` extension are read as
    tab-separated, anything else as comma-separated; the file is read as
    UTF-8.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    :raises QueryParametersException:  If no file was uploaded, the file
    could not be read, required columns are missing or the first row has a
    timestamp in the wrong format.
    """
    # do we have an uploaded file?
    if "data_upload" not in request.files:
        raise QueryParametersException("No file was offered for upload.")

    upload = request.files["data_upload"]
    if not upload:
        raise QueryParametersException("No file was offered for upload.")

    wrapped_upload = io.TextIOWrapper(upload, encoding="utf-8")

    try:
        if upload.filename.endswith(".tab"):
            # validate file as tab
            reader = csv.DictReader(wrapped_upload, delimiter="\t", quoting=csv.QUOTE_NONE)
        else:
            # validate file as csv
            reader = csv.DictReader(wrapped_upload)

        try:
            # fieldnames is None for an empty file; treat that as a file
            # without any columns
            fieldnames = reader.fieldnames or []
        except (UnicodeDecodeError, csv.Error):
            raise QueryParametersException(
                "Uploaded file is not a well-formed CSV or TAB file.")

        # check if all required fields are present
        required = ("id", "thread_id", "subject", "author", "body", "timestamp")
        missing = [field for field in required if field not in fieldnames]

        if missing:
            raise QueryParametersException(
                "The following required columns are not present in the csv file: %s"
                % ", ".join(missing))

        # only the first data row is checked; a file with just a header row
        # is accepted as is
        try:
            row = next(reader)
        except StopIteration:
            row = None
        except (UnicodeDecodeError, csv.Error):
            raise QueryParametersException(
                "Uploaded file is not a well-formed CSV or TAB file.")

        if row is not None:
            try:
                datetime.datetime.strptime(row["timestamp"], "%Y-%m-%d %H:%M:%S")
            except (TypeError, ValueError):
                # TypeError: the row is too short to have a timestamp value
                raise QueryParametersException(
                    "Your 'timestamp' column does not have the required format (YYYY-MM-DD hh:mm:ss)"
                )
    finally:
        # detach on every path, so that discarding the wrapper does not
        # close the underlying upload stream
        wrapped_upload.detach()

    # Whether to strip the HTML tags
    strip_html = bool(query.get("strip_html"))

    # return metadata - the filename is sanitised and serves no purpose at
    # this point in time, but can be used to uniquely identify a dataset
    disallowed_characters = re.compile(r"[^a-zA-Z0-9._+-]")
    return {
        "filename": disallowed_characters.sub("", upload.filename),
        "time": time.time(),
        "datasource": "custom",
        "board": "upload",
        "strip_html": strip_html
    }