def segment(inc_regex: Pattern, ex_regex: Pattern, **kwargs):
    """
    Process any files that need to be separated into segments,
    consistent with the include and exclude regular expressions.
    If the keyword argument 'force' is True, then files that have
    already been segmented are resegmented.
    """
    force = kwargs.get('force', False)
    for dfname in DigFile.all_dig_files():
        # Is this name consistent with the patterns?
        if not inc_regex.search(dfname):
            continue
        if ex_regex and ex_regex.search(dfname):
            continue
        df = DigFile(join(DigFile.dig_dir(), dfname))
        if not df.is_segment:
            n = df.has_segments
            if n > 0 and not force:
                continue
            # Now attempt to segment
            fids = Fiducials(df)
            splits = fids.values
            if len(splits):
                fids.split()
                print(
                    f"Split {df.filename} into {len(splits)} segments using Fiducials"
                )
                continue
            # If that didn't work, what else do we want to try?
            dt = kwargs.get('frame_length', 50e-6)  # 50 µs
            splitIntoEvenFrames(df, timeBetweenFrames=dt)
            print(f"Split {df.filename} into even frames")
def compare(
    a: Dict[HashableLessThan, FileProperties],
    b: Dict[HashableLessThan, FileProperties],
    hasher: Optional[Hasher],
    left: bool = True,
    right: bool = True,
    both: bool = True,
    ignore: Optional[re.Pattern] = None,
    file: IO[str] = stdout,
) -> None:
    aset = a.keys()
    bset = b.keys()

    # note: the key is usually the `relpath` or the `hash`
    if left:
        print("In left only", file=file)
        for key in sorted(aset - bset):
            if ignore and ignore.match(fspath(key)):
                continue
            print("lo:", key, a[key].relpath, file=file)

    if right:
        print("In right only", file=file)
        for key in sorted(bset - aset):
            if ignore and ignore.match(fspath(key)):
                continue
            print("ro:", key, b[key].relpath, file=file)

    if both:
        print("On both, but different", file=file)
        for key in sorted(aset & bset):
            if ignore and ignore.match(fspath(key)):
                continue
            aprops = a[key]
            bprops = b[key]
            if aprops.isdir != bprops.isdir:
                print("bo:", "one is dir, one is file", key, file=file)
            if not aprops.isdir:
                if aprops.size != bprops.size:
                    print("bo:", "size different", key, aprops.size, bprops.size, file=file)
                elif aprops.size == 0 and bprops.size == 0:
                    pass  # both empty: nothing to compare
                elif hasher is not None:  # same, non-zero size
                    if (aprops.hash or aprops.abspath) and (bprops.hash or bprops.abspath):
                        if not aprops.hash:
                            aprops.hash = hasher.get(Path(aprops.abspath))  # type: ignore [arg-type]
                        if not bprops.hash:
                            bprops.hash = hasher.get(Path(bprops.abspath))  # type: ignore [arg-type]
                        if aprops.hash != bprops.hash:
                            print("bo:", "hash different", key, aprops.hash, bprops.hash, file=file)
                        # else: same files
                    else:
                        print("bo:", "no hash or abspath for same size files", key, file=file)
def _splitByDate(pattern: Pattern, content: str) -> List[str]:
    '''
    Split the whole *.txt file content using the date-extraction
    regex we just built.
    '''
    def _getTimeFormatRegex() -> Pattern:
        r'''
        Returns a regular expression for extracting the AM/PM pattern
        from a chat timestamp, where AM/PM may be prefixed with
        whitespace ("\s").
        '''
        return reg_compile(r'^(\s?[ap]m)$', flags=IGNORECASE)

    _timeFormatRegex = _getTimeFormatRegex()

    splitted = list(filter(lambda v: not _timeFormatRegex.search(v),
                           filter(lambda v: len(v) != 0,
                                  filter(lambda v: v,
                                         pattern.split(content)))))

    # Skip any leading chunks before the first one that itself
    # matches the date pattern.
    index = -1
    for k, v in enumerate(splitted):
        if k != 0 and pattern.search(v):
            index = k
            break

    if index == -1:
        return splitted

    return splitted[index:]
def is_wanted_based_on_metadata(data: Iterable[Optional[str]],
                                allow_re: Optional[re.Pattern] = None,
                                block_re: Optional[re.Pattern] = None) -> bool:
    """Test each RE against each item in data (title, description, ...)."""
    if allow_re is None and block_re is None:
        return True

    # If an allow pattern is given, at least one item must match it;
    # if a block pattern is given, any match rejects the whole set.
    wanted = True
    blocked = False
    if allow_re is not None:
        wanted = False

    for item in data:
        if not item:
            continue
        if allow_re and allow_re.search(item):
            wanted = True
        if block_re and block_re.search(item):
            blocked = True

    if blocked:
        return False
    return wanted
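
# Usage sketch (hypothetical metadata): an item set is wanted when it
# matches the allow pattern, if any, and matches no block pattern.
import re

fields = ["Weekly news roundup", "Sponsored episode"]
print(is_wanted_based_on_metadata(fields, allow_re=re.compile(r"news")))       # True
print(is_wanted_based_on_metadata(fields, block_re=re.compile(r"Sponsored")))  # False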
def _get_regex_matches_in_scss_files(
    regex_pattern: re.Pattern, exclude_files: Optional[Iterable[str]] = None
) -> Iterable[Tuple[str, Iterable[Tuple[str, str]]]]:
    """Return a generator holding all matches of regex_pattern in scss_files (without exclude_files)

    Returned tuples hold the scss file's path and a list of line and match per match
    E.g.:
        (
            "git/check_mk/web/htdocs/themes/facelift/_main.scss",
            [
                ("Line 123", "rgb(0, 0, 0)"),
                ("Line 234", "rgb(255, 255, 255)"),
            ]
        )
    """
    for scss_file in scss_files():
        if exclude_files and scss_file.name in exclude_files:
            continue

        with open(scss_file) as f:
            file_matches: List[Tuple[str, str]] = []
            # 1-based line numbers, matching the docstring example.
            for i, l in enumerate(f, start=1):
                if match := regex_pattern.search(l):
                    file_matches.append((f"Line {i}", match.group()))
            if file_matches:
                yield (str(scss_file), file_matches)
def _minify_dir(name: str, regex: re.Pattern = re.compile(r'^(\W*\w)')) -> str:
    """Shorten a string to the first group that matches regex.

    :param name: the single name from the path that is being shrunk
    :param regex: the pattern used to minify the name (using group 0)
    :return: the minified name if possible, else the whole name
    """
    if match := regex.match(name):
        return cast(str, match[0])
    # No match: fall back to the whole name, as documented.
    return name
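
# Usage sketch (assumes the imports used above): the default pattern keeps
# any leading non-word characters plus the first word character, e.g. for
# abbreviating path components.
print(_minify_dir("documents"))  # "d"
print(_minify_dir(".config"))    # ".c"
print(_minify_dir("---"))        # "---" (no match, falls back to the full name)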
def display_aggregated_results(
    task_name: str,
    *,
    use_simplified_metric_name: bool = False,
    metrics_names: Optional[List[str]] = None,
    exclude_regex: Optional[Pattern] = None,
    include_regex: Optional[Pattern] = None,
    renames: Optional[List[Tuple[str, str]]] = None,
    n_steps: int,
):
    df = read_csv(get_aggregate_csv_file(task_name))
    if use_simplified_metric_name:
        df["metric"] = df["metric"].map(lambda s: s.replace(
            "/eval_phase/test_stream", "").replace("/Task000", ""))
    if exclude_regex:
        df = df[df["run_algo"].map(lambda s: exclude_regex.match(s) is None)]
    if include_regex:
        df = df[df["run_algo"].map(
            lambda s: include_regex.match(s) is not None)]
    for algo_name, replacement in renames or []:
        df["run_algo"] = df["run_algo"].map(
            lambda s: replacement if s == algo_name else s)
    algo_name2score = dict(df[(df["step"] == n_steps)
                              & (df["metric"] == "Top1_Acc_Stream")].groupby(
                                  "run_algo").mean()["value"].items())
    df["run_algo"] = df["run_algo"].map(
        lambda name: f"{name} ({algo_name2score[name]:.1%})")
    all_metrics_names = sorted(set(df["metric"]), key=_get_metric_name_priority)
    print(all_metrics_names)
    metrics_names = metrics_names or all_metrics_names
    g: FacetGrid = relplot(
        data=df,
        kind="line",
        x="step",
        y="value",
        hue="run_algo",
        col="metric",
        col_order=metrics_names,
        col_wrap=min(3, len(metrics_names)),
        facet_kws={
            "sharex": False,
            "sharey": False,
            "legend_out": False
        },
    )
    fig: Figure = g.fig
    fig.suptitle(task_name, fontsize=16)
    fig.tight_layout()
    fig.show()
def filter_regex(event: NewMessage.Event, pattern: re.Pattern) -> bool:
    text = event.message.text
    if text and pattern.search(text):
        return True
    if event.message.buttons:
        for button_row in event.message.buttons:
            for button in button_row:
                if button.text and pattern.search(button.text):
                    return True
                if button.url and pattern.search(button.url):
                    return True
    return False
def update_version(pattern: re.Pattern, v: str, file_path: str):
    print(f"Replacing {pattern.pattern} with {v} in {file_path}")
    with open(file_path, "r+") as f:
        file_content = f.read()
        if not pattern.search(file_content):
            raise Exception(
                f"Pattern {pattern!r} not found in {file_path!r} file")
        new_content = pattern.sub(fr'\g<1>{v}\g<2>', file_content)
        if file_content == new_content:
            return
        f.seek(0)
        f.truncate()
        f.write(new_content)
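
# Usage sketch (temporary file, hypothetical pattern): the pattern's two
# groups capture the text around the version number, matching the
# \g<1>{v}\g<2> replacement template update_version uses.
import os
import re
import tempfile

tmp = tempfile.NamedTemporaryFile("w", suffix=".toml", delete=False)
tmp.write('version = "1.2.3"\n')
tmp.close()
update_version(re.compile(r'(version = ")[\d.]+(")'), "1.3.0", tmp.name)
with open(tmp.name) as f:
    print(f.read())  # version = "1.3.0"
os.unlink(tmp.name)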
def process(
    path: Path,
    locale: str,
    re_download_link: re.Pattern,
    re_old_versions: re.Pattern,
    re_change_log: re.Pattern,
    change_log: str,
):
    print(f"Processing {path}")

    with open(path, "r") as fi:
        text = fi.read()

    mt = re_download_link.search(text)
    if mt is None:
        print(f"Download link not found in: {path}")
        return
    plugin_name = mt.groups()[0]
    major_version = mt.groups()[1]
    minor_version = mt.groups()[2]
    patch_version = mt.groups()[3]
    download_url = mt.groups()[4]

    # Add old download link to Old Versions section.
    old_version = f"{major_version}.{minor_version}.{patch_version}"
    old_version_link = f"- [{plugin_name} {old_version} - VST 3 (github.com)]({download_url})"
    text = re_old_versions.sub(
        lambda exp: f"{exp.group()}\n{old_version_link}", text, count=1)

    # Update download link. `release_name` is a module-level name defined elsewhere.
    new_version = f"{major_version}.{minor_version}.{int(patch_version) + 1}"
    new_download_url = f"https://github.com/ryukau/VSTPlugins/releases/download/{release_name}/{plugin_name}{new_version}.zip"
    new_link = compose_download_link(locale, plugin_name, new_version,
                                     new_download_url)
    if new_link is None:
        return
    text = re_download_link.sub(new_link, text, count=1)

    # Add change log.
    text = re_change_log.sub(
        lambda exp: f"{exp.group()}\n- {new_version}{change_log}",
        text,
        count=1)

    out_dir = Path("out") / Path(path.parts[-2])
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / Path(path.name), "w") as fi:
        fi.write(text)
def get_drug_names_by_suffix(drug_name: str, suffixes: List[str],
                             split_chars: re.Pattern,
                             remove_chars: re.Pattern):
    drug_name_token_list = []
    drug_name = drug_name.lower()
    drug_token = split_chars.split(drug_name)
    drug_token = [remove_chars.sub("", token) for token in drug_token]
    for token in drug_token:
        for suffix in suffixes:
            if token.endswith(suffix):
                drug_name_token_list.append(token)
                break
    return drug_name_token_list
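
# Usage sketch (hypothetical patterns and suffixes): tokens ending in a
# known drug suffix are collected after splitting and cleanup.
import re

print(get_drug_names_by_suffix(
    "Atorvastatin 20mg / Lisinopril",
    suffixes=["statin", "pril"],
    split_chars=re.compile(r"[\s/]+"),
    remove_chars=re.compile(r"[^a-z]")))
# ['atorvastatin', 'lisinopril']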
def passes_filter(self, account: re.Pattern, sender: re.Pattern,
                  conversation: re.Pattern, message: re.Pattern,
                  flags: re.Pattern):
    if account.fullmatch(self.account) is None:
        return False
    if sender.fullmatch(self.sender) is None:
        return False
    if conversation.fullmatch(self.conversation) is None:
        return False
    if message.fullmatch(self.message) is None:
        return False
    if flags.fullmatch(self.flags) is None:
        return False
    return True
def check_for_match(self, pattern: re.Pattern) -> bool:
    match = []
    if self.title:
        match += pattern.findall(self.title.lower())
    if self.text:
        match += pattern.findall(self.text.lower())
    if match:
        logger.info('{0} - {1}: Match!!!'.format(self.id, self.source_name))
        self.match_words = tools.delete_duplicates(match)
        return True
    else:
        return False
def samp(corpus: TextIO, samp_corpora: List[TextIO], samp_size: int,
         fd_removed: TextIO, valid_pwd: Pattern):
    for samp_corpus in samp_corpora:
        if not samp_corpus.writable():
            print("Training and Testing SHOULD be Writable!", file=sys.stderr)
            sys.exit(-1)
    if len(samp_corpora) < 1:
        print("At least one sample file!", file=sys.stderr)
        sys.exit(-1)
    pwd_set = []
    count_invalid = defaultdict(int)
    for line in corpus:
        line = line.strip("\r\n")
        if valid_pwd.match(line) is None:
            count_invalid[line] += 1
            continue
        pwd_set.append(line)
    samp_size = min(len(pwd_set), samp_size)
    for idx, samp_corpus in enumerate(samp_corpora):
        shuffle(pwd_set)
        for line in pwd_set[:samp_size]:
            samp_corpus.write(f"{line}\n")
        samp_corpus.flush()
        print(f"{idx + 1} sample file saved here: {samp_corpus.name}",
              file=sys.stderr)
        samp_corpus.close()
    if len(count_invalid) != 0 and fd_removed is not None:
        print(f"Removed invalid passwords saved in {fd_removed.name}",
              file=sys.stderr)
        for p, n in sorted(count_invalid.items(), key=lambda x: x[1],
                           reverse=True):
            fd_removed.write(f"{p}\t{n}\n")
        fd_removed.close()
    print("Done!", file=sys.stderr)
def process_logs(query: re.Pattern, args: argparse.Namespace) -> None:
    with open(args.input_file_name, 'r') as f:
        with open(args.output_file_name, 'a') as o:
            for line in f:
                match = query.match(line)
                if match is None:
                    continue
                named_matches = match.groupdict()
                # These threshold checks could be refactored into a helper,
                # but this is okay for now.
                if named_matches['start'] != '-' and args.start is not None and \
                        int(named_matches['start']) < args.start:
                    continue
                if named_matches['start'] != '-' and args.end is not None and \
                        int(named_matches['start']) > args.end:
                    continue
                if named_matches['bytes'] != '-' and args.bytes is not None and \
                        int(named_matches['bytes']) < args.bytes:
                    continue
                if named_matches['packets'] != '-' and args.packets is not None and \
                        int(named_matches['packets']) < args.packets:
                    continue
                # print(f"[FOUND] {line.rstrip()}")
                o.write(line)
def _parse_from_keywords(
    self,
    transaction_type: str,
    description: list[str],
    keywords: re.Pattern,
    *,
    bookdate: date,
    value_date: date,
    amount: Decimal,
) -> BaseTransaction:
    d = dict[str, str]()
    current_key = 'transaction_type'
    current_value = transaction_type
    for line in description[1:]:
        m = keywords.match(line)
        if m is None:
            current_value += line
        else:
            d[current_key] = current_value.rstrip()
            current_key = m.group(1)
            current_value = line[m.end():]
    d[current_key] = current_value.rstrip()
    omschrijving = d.get('Omschrijving')
    if omschrijving is None:
        omschrijving = d['Kenmerk']
    return Transaction(account=self.account,
                       description=omschrijving,
                       operation_date=bookdate,
                       value_date=value_date,
                       amount=amount,
                       currency=self.currency,
                       metadata=d)
def read_data_lines(self, dat_file, regex: re.Pattern, start_flag,
                    end_flag=None, split_data=False) -> list:
    """Read the file line by line with spaces stripped, so the start/end
    flags can be matched regardless of formatting."""
    read_data = False
    results = []
    with open(dat_file, "r") as f:
        for line in f.readlines():
            compact_str = line.replace(" ", "").strip().lower()
            if start_flag in compact_str:
                read_data = True
            if end_flag is not None and end_flag in compact_str:
                return results
            if read_data is False:
                continue
            res = regex.search(line)
            if res is not None:
                result_data = res.group(1)
                if split_data:
                    result_data = result_data.split()
                results.append(result_data)
    return results
def search_in_page(regex: re.Pattern, page: Page) -> List[dict]:
    """Search for `regex` in `page` and extract span metadata.

    Arguments
        regex: the compiled pattern to search for
        page: the page to search

    Returns a list of matching span dicts.
    """
    result = []
    page_meta = page.getTextPage().extractDICT()
    # we are using get(key, []) to bypass any missing-key errors
    for blk in page_meta.get('blocks', []):
        for ln in blk.get('lines', []):
            for spn in ln.get('spans', []):
                text = spn.get('text', "")
                # the current search algorithm is very naive and doesn't handle
                # line breaks and more complex layout. might want to take a
                # look at `page.searchFor`, but the current algorithm should be
                # enough for TeX-generated pdf
                if regex.search(text):
                    result.append(spn)
    return result
def _finditer_with_line_numbers(
        pattern: re.Pattern,
        string: str) -> ty.Iterator[ty.Tuple[re.Match, int]]:
    """
    A version of 're.finditer' that yields '(match, line_number)' pairs.
    """
    matches = list(pattern.finditer(string))
    if not matches:
        return

    end = matches[-1].start()
    # -1 so a failed 'rfind' maps to the first line.
    newline_table = {-1: 0}
    for i, m in enumerate(re.finditer(r"\n", string), 1):
        # Don't index newlines past our last match.
        offset = m.start()
        if offset > end:
            break
        newline_table[offset] = i

    # Failing to find the newline is OK, -1 maps to 0.
    for m in matches:
        newline_offset = string.rfind("\n", 0, m.start())
        # + 1 because line numbers don't start at 0.
        line_number = newline_table[newline_offset] + 1
        yield m, line_number
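
# Usage sketch: report 1-based line numbers for each match.
import re

text = "alpha\nbeta\nalpha again\n"
for m, line_no in _finditer_with_line_numbers(re.compile(r"alpha"), text):
    print(line_no, m.group())  # 1 alpha, then 3 alpha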
def re_replace(
        items: Iterable[str], regex: re.Pattern,
        subfunc: Callable[[re.Match], str]) -> Generator[str, None, None]:
    for item in items:
        m = regex.search(item)
        if m is not None:
            yield subfunc(m)
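
# Usage sketch: keep only items containing a digit, yielding the matched
# text via the substitution callback.
import re

items = ["order-42", "no number here", "id 7"]
print(list(re_replace(items, re.compile(r"\d+"), lambda m: m.group(0))))
# ['42', '7']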
def valid_fc(argument: str, *, _fc: re.Pattern = _friend_code) -> str:
    fc = argument.upper().strip('"')
    m = _fc.match(fc)
    if m is None:
        raise commands.BadArgument("Not a valid friend code!")
    return "{one}-{two}-{three}".format(**m.groupdict())
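
# Usage sketch: `_friend_code` is defined elsewhere; a plausible shape is a
# pattern with named groups `one`, `two`, `three` (the format string above
# requires those names). The pattern below is a hypothetical stand-in.
import re

_pat = re.compile(r"(?:SW-)?(?P<one>\d{4})-(?P<two>\d{4})-(?P<three>\d{4})")
print(valid_fc('"sw-1234-5678-9012"', _fc=_pat))  # 1234-5678-9012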
def check_pattern(arg_value: str, pattern: re.Pattern):
    if not pattern.match(arg_value):
        raise argparse.ArgumentTypeError(
            f"Invalid value provided! Must match regex pattern: {pattern.pattern}"
        )
    return arg_value
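
# Usage sketch: plug check_pattern into argparse as a custom `type=`
# validator via functools.partial (hypothetical --tag option).
import argparse
import re
from functools import partial

parser = argparse.ArgumentParser()
parser.add_argument("--tag",
                    type=partial(check_pattern, pattern=re.compile(r"^v\d+$")))
print(parser.parse_args(["--tag", "v2"]).tag)  # v2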
def password_match(line: str,
                   password_policy_and_password_regex: re.Pattern) -> bool:
    match = password_policy_and_password_regex.match(line)
    if match is None:
        return False
    min_letter_count = int(match.group(1))
    max_letter_count = int(match.group(2))
    letter = match.group(3)
    password = match.group(4)
    occurences_count = password.count(letter)
    result = min_letter_count <= occurences_count <= max_letter_count
    print(
        "{password:<30} {occurences_count:>3}{letter} {belongs} [{min_letter_count}, {max_letter_count}] {result}"
        .format(
            password=password,
            occurences_count=occurences_count,
            letter=letter,
            belongs=("∈" if result else "∉"),
            min_letter_count=min_letter_count,
            max_letter_count=max_letter_count,
            result=('\033[92mOK\033[0m' if result else '\033[91mKO\033[0m')))
    return result
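
# Usage sketch: the groups suggest lines of the form
# "<min>-<max> <letter>: <password>" (Advent of Code 2020 day 2 style);
# the pattern below is an assumed example.
import re

policy_re = re.compile(r"(\d+)-(\d+) (\w): (\w+)")
print(password_match("1-3 a: abcde", policy_re))  # True (one 'a', within [1, 3])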
def get_pr_number_from_commit_message(commit_message: str,
                                      pattern: re.Pattern) -> int:
    """
    Extract the PR number from a commit message.
    The PR number is expected to appear on the first line of the message.

    Parameters
    ----------
    commit_message : str
        The commit message.
    pattern : re.Pattern
        A regular expression describing the PR number;
        the first match group is used.

    Returns
    -------
    int
        The PR number, or 0 if it cannot be extracted.
    """
    first_row = commit_message.split("\n")[0]
    m = pattern.search(first_row)
    if not m:
        # The first line of the commit message contains no PR number.
        return 0
    pr_number = int(m.groups()[0])
    return pr_number
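
# Usage sketch (hypothetical pattern): the first group captures the PR
# number from a squash-merge style first line.
import re

msg = "Fix crash on startup (#123)\n\nDetails follow."
print(get_pr_number_from_commit_message(msg, re.compile(r"\(#(\d+)\)")))  # 123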
def match(self, regexp: Pattern) -> Optional[ParseResult[str]]:
    match = regexp.match(self.string, self.index)
    if match:
        value = match.group(0)
        source = Source(self.string, self.index + len(value))
        return ParseResult(value, source)
    return None
def applyRegex(filename, regex: re.Pattern):
    # Use a context manager so the file handle is always closed.
    with open(filename, "r") as infile:
        return [
            ":".join(str(elem) for elem in [filename, lineNumber, line])
            for lineNumber, line in enumerate(infile.readlines())
            if regex.search(line) is not None
        ]
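
# Usage sketch: grep a small temporary file for lines containing "TODO".
# Note that line numbers are 0-based and each entry keeps its newline.
import os
import re
import tempfile

tmp = tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False)
tmp.write("first line\nTODO: fix this\nlast line\n")
tmp.close()
print(applyRegex(tmp.name, re.compile(r"TODO")))  # ['<tmpname>:1:TODO: fix this\n']
os.unlink(tmp.name)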
def _resolve_version(version: str, regex: re.Pattern, value: str) -> str:
    """Extract the version from the match, using the matched group indicated
    by a string with format "\\1" or "\\1?value_1:value_2" (ternary version)
    in the \\;version field of the regex.
    """
    if not version:
        return version
    matches = regex.search(value)
    if not matches:
        return version
    resolved = version
    matches = [matches.group()] + list(matches.groups())
    for index, match in enumerate(matches):
        # Look for a ternary expression like "\<index>?value_1:value_2".
        ternary = re.search("\\\\{}\\?([^:]+):(.*)$".format(index), version)
        if ternary:
            ternary = [ternary.group()] + list(ternary.groups())
            if len(ternary) == 3:
                resolved = version.replace(
                    ternary[0], ternary[1] if match else ternary[2])
        resolved = resolved.strip().replace("\\{}".format(index), match or "")
    return resolved
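
# Usage sketch (Wappalyzer-style version expression, hypothetical pattern):
# "\\1" substitutes capture group 1; "\\1?a:b" would pick "a" when group 1
# matched and "b" otherwise.
import re

pat = re.compile(r"jquery-([\d.]+)\.min\.js")
print(_resolve_version("\\1", pat, "jquery-3.6.0.min.js"))  # 3.6.0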
def _verify_public_instance_jwt(
    cache: 'cg_cache.inter_request.Backend[str]',
    signature: str,
    allowed_hosts: re.Pattern,
) -> str:
    # First get the url from the jwt without verifying, then get the
    # public key and do the verification.
    unsafe_decoded = jwt.decode(signature, verify=False)
    if allowed_hosts.match(unsafe_decoded.get('url', None)) is None:
        raise PermissionException(401)

    try:
        decoded = cache.cached_call(
            key=unsafe_decoded['url'],
            get_value=lambda: _download_public_key(
                unsafe_decoded['url'],
                unsafe_decoded['id'],
            ),
            callback=lambda public_key: jwt.decode(
                signature,
                key=public_key,
                algorithms='RS256',
                verify=True,
            )
        )
        assert decoded == unsafe_decoded
    except BaseException as exc:  # pylint: disable=broad-except
        logger.error('Got unauthorized broker request', exc_info=True)
        raise PermissionException(401) from exc
    else:
        return decoded['url']
def consume(self, size: int = 1, regex: Optional[re.Pattern] = None,
            text: Optional[str] = None, regex_group: int = 0):
    at = self.pos
    if regex:
        if not isinstance(regex, re.Pattern):
            print("uncompiled regex passed to consume!")
            regex = re.compile(regex)
        match = regex.match(self.content[at:])
        if match is None:
            return None
        if regex_group != 0 and not match.group(0).startswith(
                match.group(regex_group)):
            print("Cannot consume regex group that does not start at match start!")
            return None
        self.pos += len(match.group(regex_group))
        return match.group(regex_group)
    if text:
        if self.content[at:].startswith(text):
            self.pos += len(text)
            return text
        return None
    self.pos += size
    return self.content[at:at + size]
def get_api_key(base_url: str, regex: re.Pattern) -> str:
    response = requests.get(base_url)
    # Check for a match before dereferencing it, so a missing key raises
    # ValueError instead of AttributeError.
    match = regex.search(response.text)
    if match is None:
        raise ValueError("Could not find the api key you were looking for.")
    # Drop the first 7 characters of the match (pattern-specific prefix).
    return match.group()[7:]