def check_for_match(self, pattern: re.Pattern) -> bool:
    """Search this item's title and text for *pattern*.

    On a hit, stores the de-duplicated matched words in ``self.match_words``
    and logs the event; returns True.  Returns False when nothing matched.
    """
    hits = []
    for field in (self.title, self.text):
        if field:
            hits.extend(pattern.findall(field.lower()))
    if not hits:
        return False
    # logger and tools are module-level collaborators defined elsewhere.
    logger.info('{0} - {1}: Match!!!'.format(self.id, self.source_name))
    self.match_words = tools.delete_duplicates(hits)
    return True
def parse_cpp(input_string: str, regex: re.Pattern = FRAME_REGEX) -> ...:
    """Parse C++ code and build a Class object per struct/class found.

    :input_string: a string containing C++ code.
    :regex: pattern used to locate struct/class definitions.
    :return: a list of Class() objects.
    """
    parsed = []
    for match in regex.findall(input_string):
        klass = Class(match[1].strip(), [], [])
        # Doc-comment lines, taken only when the CLI flag marker precedes
        # the definition.
        flagged = CLI_FLAG_REGEX.findall(match[0])
        if flagged:
            klass.doc_string.extend(
                text.strip() for text in COMMENT_REGEX.findall(flagged[0])
            )
        # Member lines: trim trailing ';' or ',', skip blanks and // comments.
        for raw in match[2].split('\n'):
            member = raw.strip()
            if member.endswith((';', ',')):
                member = member[:-1]
            if member and not member.startswith('//'):
                klass.members.append(member)
        parsed.append(klass)
    return parsed
def resolve_path(
    self,
    path: str,
    *,
    _rx: re.Pattern = re.compile("{([^}{]+)}"),  # {xxx} or {xxx:int}
    _default_schema: t.Dict[str, t.Any] = {"type": "string"},
) -> t.Tuple[str, t.Optional[t.Dict[str, t.Dict[str, t.Any]]]]:
    """Normalise a URL template and extract per-placeholder schemas.

    ``"/u/{id:int}"`` becomes ``("/u/{id}", {"id": {"type": "int"}})``;
    a path without placeholders maps to ``(path, None)``.  Results are
    memoised in ``self._parsed_path_map``, keyed by the original path.

    Fixes: (1) the shared ``_default_schema`` dict was previously stored
    directly into results, so a caller mutating one returned schema would
    corrupt every later default; a copy is stored now.  (2) placeholder
    paths were looked up in the memo map but never written back to it.
    """
    cached = self._parsed_path_map.get(path)
    if cached is not None:
        return cached
    if "{" not in path:
        v = self._parsed_path_map[path] = (path, None)
        return v

    original = path  # `path` is rewritten in the loop; memoise by the input.
    schemas: t.Dict[str, t.Dict[str, t.Any]] = {}
    for pattern in _rx.findall(path):
        name = pattern.strip()
        # Copy so callers mutating a returned schema cannot corrupt the
        # shared default for later calls.
        schema = dict(_default_schema)
        if ":" in pattern:
            name, typ = name.split(":", 1)
            name = name.strip()
            typ = typ.strip()
            # todo: lookup type
            schema = {"type": typ}  # xxx
        # Rewrite "{name:typ}" to "{name}" by replacing the inner text.
        path = path.replace(pattern, name)
        schemas[name] = schema
    v = self._parsed_path_map[original] = (path, schemas)
    return v
def normalise_text_to_only_regex_matches(text: str, matcher: re.Pattern) -> str:
    """Keep only the parts of the text that the given matcher accepts.

    :param text: str
    :param matcher: re.Pattern
    :return: str — the concatenation of every match, in order
    """
    kept = matcher.findall(text)
    return "".join(kept)
def process_url(url: str, pattern: re.Pattern, target: str) -> str:
    """Fetch *url*, keep regex matches containing *target*, join with commas.

    Returns 'none' when no match qualifies, 'url not found' on HTTPError,
    and 'not a valid web address' on ValueError.
    """
    try:
        page = url_to_html(url)
        # Keep only the matches that mention the target string.
        found = [hit for hit in pattern.findall(page) if target in hit]
        return ','.join(found) if found else 'none'
    except HTTPError:
        return 'url not found'
    except ValueError:
        return 'not a valid web address'
def _find_inner(partial_xmp: str, pattern: re.Pattern) -> str: match = pattern.findall(partial_xmp) # If called on a string but no match was found, findall() returns an empty list: if match: # Returns the whole match return match[0] else: raise XMPTagNotFoundError( "A tag pattern did not match with the XMP string. The tag may not exist." )
def findall_in_files(pattern: re.Pattern, filenames: List[str],
                     encoding: str) -> "Iterator[str]":
    """Yield every match of *pattern* found in the given files (generator).

    Each file is read fully, decoded with *encoding*, and scanned with
    ``pattern.findall``; matches are yielded one at a time.

    :param pattern: compiled regex (group-less patterns yield plain strings)
    :param filenames: paths of the files to scan
    :param encoding: text encoding used to decode the file bytes

    Fix: the original annotation claimed ``-> re.Match`` although
    ``findall`` produces strings (or group tuples), never Match objects.
    """
    for filename in filenames:
        logging.debug('util.findall_in_files: input file %s', filename)
        with open(filename, 'rb') as ifile:
            text = ifile.read().decode(encoding)
        for match in pattern.findall(text):
            logging.debug(
                'util.findall_in_files(): match: file = %s, text = %s',
                filename, match)
            yield match
def _fuzzy_id(self, regex: re.Pattern, text: str) -> str: """transform a sample id into fuzzy mode according the regex pattern Args: regex (re.Pattern): The info retains are in the capture patterns text (str): input sample id Returns: str: fuzzy mode sample id """ matches = regex.findall(text) if matches: text = '_'.join(matches[0]) return text
def parse_single_match(words: list, compiled_re: re.Pattern) -> "float | None":
    """Return the first word matching *compiled_re*, parsed as a float.

    Decimal commas are normalised to dots before matching, and the matched
    word is removed from the original list in place.

    :param words: list of words (mutated: the matched word is popped)
    :param compiled_re: compiled pattern; its first match is converted
    :return: the parsed float, or None when no word matches

    Fix: the original annotation claimed ``-> float`` although the loop
    falls through to an implicit None when nothing matches; the None
    return is now explicit and the annotation honest.
    """
    for i, word in enumerate(words):
        candidate = word.replace(",", ".")
        found = compiled_re.findall(candidate)
        if found:
            val = float(found[0])
            words.pop(i)
            return val
    return None
def parse_dup_match(words: list, compiled_re: re.Pattern, units: dict) -> "float | None":
    """Return a volume as float if a volume word (1dl, 0.5l, …) is found.

    *compiled_re* must capture (amount, unit); the amount is multiplied by
    the unit's factor from *units*.  Decimal commas are normalised to dots,
    and the matched word is removed from the original list in place.

    :param words: list of words (mutated: the matched word is popped)
    :param compiled_re: pattern capturing (numeric amount, unit name)
    :param units: mapping of unit name -> multiplier
    :return: the computed volume, or None when no word matches

    Fix: the original annotation claimed ``-> float`` although the loop
    falls through to an implicit None; the None return is now explicit.
    """
    for i, word in enumerate(words):
        candidate = word.replace(",", ".")
        found = compiled_re.findall(candidate)
        if found:
            val = float(found[0][0]) * units[found[0][1]]
            words.pop(i)
            return val
    return None
def _get_latest(
    html: str,
    pattern: re.Pattern,
    sort_key: t.Optional[t.Callable] = None,
) -> t.Optional[t.Union[str, t.Tuple[str, ...]]]:
    """Return the 'latest' match of *pattern* in *html*, or None.

    With several matches and a *sort_key*, the maximum by that key is
    selected; otherwise the first match is returned.

    Bug fix: the original signature read ``sort_key=t.Optional[t.Callable]``,
    which made the typing object itself the *default value* — so
    ``sort_key is None`` was always False and ``max(..., key=<typing
    object>)`` raised TypeError whenever multiple matches were found.
    ``sort_key`` is now an annotation with a real ``None`` default.
    """
    match = pattern.findall(html)
    if not match:
        log.warning("%s did not match", pattern.pattern)
        return None
    if sort_key is None or len(match) == 1:
        result = match[0]
    else:
        log.debug("%s matched multiple times, selected latest", pattern.pattern)
        result = max(match, key=sort_key)
    log.debug("%s matched %s", pattern.pattern, result)
    return result
def replace(self, search_value: re.Pattern, replace_value: str, replace_replaced_words: bool = False):
    """Substitute *search_value* with *replace_value* inside ``self.word``.

    Fluent API: mutates ``self.word`` (and ``self.replaced_words``) and
    returns ``self``.  Unless *replace_replaced_words* is True, the call is
    a no-op when the pattern would touch previously replaced words
    (delegated to ``search_value_contains_replaced_words``, defined
    elsewhere on this class).
    """
    if not replace_replaced_words and self.search_value_contains_replaced_words(search_value, replace_value):
        return self
    replacing_word = self.word
    if search_value.search(self.word) is not None:
        # Regex substitution of every occurrence.
        replacing_word = search_value.sub(replace_value, self.word)
    collection = search_value.findall(self.word)
    replaced_words: List[str]
    # NOTE(review): s.replace(s, replace_value) always evaluates to
    # replace_value, so this builds one replace_value per match; and the
    # words are recorded only when the pattern matched MORE than once —
    # presumably intentional, confirm against callers.
    if len(collection) > 1:
        replaced_words = list(map(lambda s: s.replace(s, replace_value), collection))
    else:
        replaced_words = []
    # Commit the mutation only when something actually changed.
    if replacing_word != self.word:
        for word in replaced_words:
            self.replaced_words.add(word)
        self.word = replacing_word
    return self
def replace_with_func_single(self, search_value: re.Pattern, func: Callable[[], str], replace_replaced_words: bool = False):
    """Replace matches of *search_value* with the result of ``func()``.

    Fluent API: mutates ``self.word`` (and ``self.replaced_words``) and
    returns ``self``.  *func* takes no arguments and supplies the
    replacement text.  Unless *replace_replaced_words* is True, the call
    is a no-op when the pattern would touch previously replaced words.
    """
    replace_value = func()
    if not replace_replaced_words and self.search_value_contains_replaced_words(search_value, replace_value):
        return self
    replacing_word = self.word
    if search_value.search(self.word) is not None:
        # str.replace substitutes EVERY occurrence of the matched text,
        # not only the first regex match.
        match = search_value.search(self.word).group()
        replacing_word = self.word.replace(match, replace_value)
    collection = search_value.findall(self.word)
    replaced_words: List[str]
    # NOTE(review): s.replace(s, replace_value) always evaluates to
    # replace_value; words are recorded only when the pattern matched more
    # than once — confirm this asymmetry is intended.
    if len(collection) > 1:
        replaced_words = list(map(lambda s: s.replace(s, replace_value), collection))
    else:
        replaced_words = []
    # Commit the mutation only when something actually changed.
    if replacing_word != self.word:
        for word in replaced_words:
            self.replaced_words.add(word)
        self.word = replacing_word
    return self
def cut_part(
        text: str,
        split_pattern: re.Pattern,
        with_spliter: bool = True,
        with_offset: bool = False) -> list:
    """Cut text into parts on the given regex pattern.

    Parameters
    ----------
    text: raw text.
    split_pattern: how to split text.
    with_spliter: whether each part keeps its trailing spliter.
    with_offset: whether each part carries (start, end) offsets.

    Returns
    -------
    out: the cut parts — strings, or (part, start, end) tuples.
    """
    spliters = split_pattern.findall(text)
    n_spliters = len(spliters)
    parts = []
    cursor = 0
    for idx, piece in enumerate(split_pattern.split(text)):
        # Every piece except the last has a spliter after it; either glue
        # it onto the piece or account for its width in the offsets.
        extra = 0
        if idx < n_spliters:
            if with_spliter:
                piece = piece + spliters[idx]
            else:
                extra = len(spliters[idx])
        stop = cursor + len(piece) + extra
        if piece:  # empty fragments are dropped, offsets still advance
            parts.append((piece, cursor, stop) if with_offset else piece)
        cursor = stop
    return parts
def parseTimeFromRegex_(regex: re.Pattern, text: str) -> "float | None":
    """Parse an H:M:S(.ms)-style timestamp out of *text*, in seconds.

    *regex* is expected to expose capture groups for hours, minutes,
    seconds and milliseconds.  Returns None when nothing matches, or when
    the hour group matched empty.

    Fix: the original annotation said ``-> int`` although the function
    returns a float (or None).
    """
    results = regex.findall(text)
    if len(results) == 0:
        return None
    if results[0][0] == '':
        return None
    hours = 0
    minutes = 0
    seconds = 0
    milliseconds = 0
    try:
        hours = int(results[0][0])
        minutes = int(results[0][1])
        seconds = float(results[0][2])
        milliseconds = float(results[0][3])
    except Exception:
        # For timeColonFormatMilliseconds_ the milliseconds group is not
        # matched here; that is fine, because `seconds` is fractional there
        # and the final sum corrects for it.
        pass
    return (milliseconds / 1000) + seconds + (minutes * 60) + (hours * 3600)
def replace_with_func_multiple(self, search_value: re.Pattern, func: Callable[[str, str], str], replace_replaced_words: bool = False):
    """Replace a two-group match of *search_value* via ``func(g1, g2)``.

    Fluent API: mutates ``self.word`` (and ``self.replaced_words``) and
    returns ``self``.  *search_value* must capture two groups, which are
    passed to *func* to build the replacement text.  Unless
    *replace_replaced_words* is True, the call is a no-op when the pattern
    would touch previously replaced words.
    """
    if search_value.search(self.word) is None:
        return self
    word = self.word
    captures = search_value.search(word)
    replace_value = func(captures.group(1), captures.group(2))
    if not replace_replaced_words and self.search_value_contains_replaced_words(search_value, replace_value):
        return self
    # str.replace substitutes every occurrence of the matched text, not
    # just the first regex match.
    replacing_word = self.word.replace(captures.group(0), replace_value)
    collection = search_value.findall(self.word)
    # flatten() (defined elsewhere) unnests the per-match group tuples.
    collection = list(flatten(collection))
    replaced_words: List[str]
    # NOTE(review): s.replace(s, replace_value) always evaluates to
    # replace_value; words are recorded only when more than one captured
    # item exists — confirm this asymmetry is intended.
    if len(collection) > 1:
        replaced_words = list(map(lambda s: s.replace(s, replace_value), collection))
    else:
        replaced_words = []
    # Commit the mutation only when something actually changed.
    if replacing_word != self.word:
        for word in replaced_words:
            self.replaced_words.add(word)
        self.word = replacing_word
    return self
def main(imgur_id: str, imgur_secret: str, imgur_refresh: str,
         reddit_secret: str, reddit_id: str, reddit_password: str,
         reddit_agent: str, reddit_username: str, latex: re.Pattern,
         context: re.Pattern, hypercontext: re.Pattern) -> None:
    """Runs the bot

    :param imgur_id: The Imgur client ID
    :type imgur_id: str
    :param imgur_secret: The Imgur client secret
    :type imgur_secret: str
    :param imgur_refresh: The Imgur client refresh token
    :type imgur_refresh: str
    :param reddit_secret: The script secret
    :type reddit_secret: str
    :param reddit_id: The script ID
    :type reddit_id: str
    :param reddit_password: The bot account's password
    :type reddit_password: str
    :param reddit_agent: The script's user agent
    :type reddit_agent: str
    :param reddit_username: The bot account's username
    :type reddit_username: str
    :param latex: The pattern to match for a LaTeX expression
    :type latex: re.Pattern
    :param context: The pattern to match for a context
    :type context: re.Pattern
    :param hypercontext: The pattern to match for a hyperlink's context
    :type hypercontext: re.Pattern
    :raises ValueError: Any of the credentials are invalid
    """
    # Recursively starts bot in case of 503
    try:
        # Creates the Reddit client and the Imgur client
        # (reddit_client / authenticate are project helpers defined elsewhere)
        r = reddit_client(reddit_secret, reddit_id, reddit_password,
                          reddit_agent, reddit_username)
        i = authenticate(imgur_id, imgur_secret, imgur_refresh)
        while True:
            # Inbox records all mentions in Reddit
            # This will use Reddit inbox's read/unread feature to keep track of processed comments
            for comment in praw.models.util.stream_generator(r.inbox.unread):
                contexts = []
                formulae = []
                ctx = []
                hyperctx = []
                # For each formula found, add to the list
                formulae.extend(latex.findall(comment.body))
                # Add context for each formula to the list
                contexts.extend(re.split(latex, comment.body))
                for content in contexts:
                    # Add primary contexts to a list
                    ctx.extend(context.findall(content))
                    # Add hyperlink contexts to a list
                    hyperctx.extend(hypercontext.findall(content))
                if formulae != []:
                    try:
                        # timeout() (defined elsewhere) bounds rendering time.
                        with timeout(10):
                            form_comment(i, comment, formulae, ctx, hyperctx)
                    # This covers people making LaTeX renders that are too big
                    except Exception:
                        comment.mark_read()
    except Exception:
        # NOTE(review): broad catch plus unbounded recursion restarts the bot
        # on ANY error (not only HTTP 503); repeated failures will slowly grow
        # the call stack — consider a retry loop instead of recursion.
        time.sleep(60)
        main(imgur_id, imgur_secret, imgur_refresh, reddit_secret, reddit_id,
             reddit_password, reddit_agent, reddit_username, latex, context,
             hypercontext)
def find_matches(pattern: re.Pattern, input_str: str) -> bool:
    """Return True when *pattern* matches anywhere in *input_str*.

    Improvement: uses ``search`` instead of the original ``findall`` so
    scanning stops at the first hit rather than materialising every match;
    the result is identical.
    """
    return pattern.search(input_str) is not None
def _test_regex_findall_dict(self, regex: re.Pattern, dct: Dict[str, List[str]]): for test, matches in dct.items(): with self.subTest(test=test): self.assertEqual(matches, regex.findall(test))
def find_tier(x: str, compiled_re: re.Pattern):
    """Return the first match of *compiled_re* in *x*, or None when absent."""
    found = compiled_re.findall(x)
    return found[0] if found else None