def __call__(self, i_url):
    for url, m in i_url:
        if url.startswith('http') and regex.match(self.http_regex, url):
            path = f"{config.hidden_folder}/pdfs/{urllib.parse.quote_plus(url)}.pdf"
            path = path.replace("(", "")
            path = path.replace(")", "")
            if not os.path.exists(path):
                os.system(f"chromium --headless \
                    --disable-gpu \
                    --disable-translate \
                    --disable-extensions \
                    --disable-background-networking \
                    --safebrowsing-disable-auto-update \
                    --disable-sync \
                    --metrics-recording-only \
                    --disable-default-apps \
                    --no-first-run \
                    --mute-audio \
                    --hide-scrollbars \
                    --disable-software-rasterizer "
                          f"--print-to-pdf={path} {url}")
            yield path, m
        elif os.path.exists(url) and regex.match(self.file_regex, url) is not None:
            yield url, m
        else:
            logging.error(f"{url} is not a valid url/path")
def get_suffix_and_law_name(self, string: str):
    """
    Returns:
        A tuple containing
        1. the length of the article between the numbers and the law name
           (e.g. " der "),
        2. the length of the law name as it appears in the given string,
        3. the type of the reference.
        If no law name is found, the lengths are 0.
    """
    suffix_match = regex.match(r"^,?\s+?de[sr]\s+", string)
    if suffix_match:
        suffix_len = suffix_match.end()
        law_test = string[suffix_len:suffix_len + 1000]

        dict_suffix_len = self.get_dict_law_name_len(law_test)
        if dict_suffix_len:
            return suffix_len, dict_suffix_len, "dict"

        sgb_suffix_len = self.get_sgb_law_name_len(law_test)
        if sgb_suffix_len:
            return suffix_len, sgb_suffix_len, "sgb"

        eu_suffix_len = self.get_eu_law_name_len(law_test)
        if eu_suffix_len:
            return suffix_len, eu_suffix_len, "eu"

        ignore_suffix_len = self.get_ignore_law_name_len(law_test)
        if ignore_suffix_len:
            return suffix_len, ignore_suffix_len, "ignore"

        return suffix_len, 0, "unknown"

    else:  # no der/des suffix
        suffix_match = regex.match(r"^[\s\n]+", string[:1000])
        if suffix_match:
            suffix_len = len(suffix_match[0])
            law_test = string[suffix_len:1000]

            dict_suffix_len = self.get_dict_law_name_len(law_test)
            if dict_suffix_len:
                return suffix_len, dict_suffix_len, "dict"

            sgb_suffix_len = self.get_sgb_law_name_len(law_test)
            if sgb_suffix_len:
                return suffix_len, sgb_suffix_len, "sgb"

            ignore_no_suffix_len = self.get_no_suffix_ignore_law_name_len(law_test)
            if ignore_no_suffix_len:
                return suffix_len, ignore_no_suffix_len, "ignore"

        return 0, 0, "internal"
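# A small illustration of the article/suffix regex used above, assuming `regex`
# is imported. It only shows how the ", des "/", der " part is matched; the
# law-name lookup helpers (get_dict_law_name_len etc.) are not exercised here.
import regex

m = regex.match(r"^,?\s+?de[sr]\s+", ", des Bürgerlichen Gesetzbuchs")
print(m.end())  # 6 -> suffix_len; law_test would start at "Bürgerlichen..."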
def extract_abp(content):
    """Extracts blocked and unblocked domains from ABP style content."""
    pattern_unsupported = re.compile(r"\S+(?>\/|\=)\S+", re.V1)
    pattern_supported_block = re.compile(
        r"^\|\|.+\^(?>$|.+(?:"
        r"\bfirst-party\b|"
        r"\b1p\b|"
        r"\bthird-party\b|"
        r"\b3p\b|"
        r"\bdocument\b|"
        r"\ball\b"
        # r"\ball\b|"
        # r"\bpopup\b"
        r"))",
        re.V1,
    )
    pattern_scrub_blocked_list = [
        r"^\|\|",
        r"\^($|.+(?>"
        r"\bfirst-party\b|"
        r"\b1p\b|"
        r"\bthird-party\b|"
        r"\b3p\b|\bdocument\b|"
        r"\ball\b|"
        r"\bpopup\b|"
        r"\S+))",
    ]
    pattern_scrub_blocked = re.compile(
        "|".join(f"(?:{p})" for p in pattern_scrub_blocked_list), re.V1
    )
    block_rules = [
        x
        for x in content
        if re.match(pattern_supported_block, x, concurrent=True)
        and not re.match(pattern_unsupported, x, concurrent=True)
    ]
    blocked_domains = [
        re.sub(pattern_scrub_blocked, "", x, concurrent=True) for x in block_rules
    ]
    blocked_domains = [x for x in blocked_domains if valid_domain(x)]
    pattern_supported_unblock = re.compile(r"@@\|\|.+\^$")
    unblock_rules = [
        x
        for x in content
        if re.match(pattern_supported_unblock, x, concurrent=True)
        and not re.match(pattern_unsupported, x, concurrent=True)
    ]
    unblocked_domains = [
        x.replace("@@||", "").replace("^", "").replace("$important", "")
        for x in unblock_rules
    ]
    regex_rules = []
    return blocked_domains, unblocked_domains, unblock_rules, regex_rules
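# A minimal usage sketch for extract_abp. It assumes `re` is the third-party
# `regex` module (needed for re.V1 and concurrent=True) and that valid_domain()
# is defined elsewhere in the project; the filter lines are illustrative only.
sample_lines = [
    "||ads.example.com^",          # supported block rule
    "@@||cdn.example.net^",        # supported unblock rule
    "||tracker.example.org/path",  # contains "/", treated as unsupported
]
blocked, unblocked, unblock_rules, regex_rules = extract_abp(sample_lines)
# blocked   -> ["ads.example.com"] (if valid_domain accepts it)
# unblocked -> ["cdn.example.net"]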
def find_references(soup, pattern, attrs):
    """
    Finds the references in the soup and marks them with a tag.
    """
    logs = []  # For debug

    text_tags = list(soup.find_all("text"))
    for text_tag in text_tags:
        for text_tag_string in list(text_tag.contents):
            if type(text_tag_string) is not bs4.element.NavigableString:
                continue

            tag_cursor = text_tag_string
            last_match_end = 0
            matches = pattern.finditer(text_tag_string)
            for match in list(matches):
                if regex.match(r"\s?,?of\b", text_tag_string[match.end():]):
                    continue

                ref_tag = soup.new_tag("reference", **attrs)
                pre_text, ref_tag, post_text = add_tag(
                    text_tag_string, match.start(), match.end(), ref_tag
                )
                pre_text = pre_text[last_match_end:]
                last_match_end = match.end()
                tag_cursor.replace_with(ref_tag)
                ref_tag.insert_before(pre_text)
                ref_tag.insert_after(post_text)
                tag_cursor = post_text

                logs.append(f"{post_text[:50]} --- {match[0]}")  # For debug
    return logs  # For debug
def worker_unmatched_item(item, pattern):
    """Worker for remove_redundant via ThreadPoolExecutor to get
    unmatched subdomains from subdomains.
    """
    if not re.match(pattern, item, concurrent=True):
        return item
    return None
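# A minimal sketch of how worker_unmatched_item might be driven from a
# ThreadPoolExecutor, as its docstring suggests. The pattern and domain list
# below are illustrative; `re` is assumed to be the `regex` module, which is
# required for the concurrent=True keyword.
from concurrent.futures import ThreadPoolExecutor

import regex as re

main_domains_pattern = re.compile(r"^(.+\.)?example\.com$", re.V1)
candidates = ["a.example.com", "example.com", "other.net"]

with ThreadPoolExecutor() as executor:
    results = executor.map(
        worker_unmatched_item, candidates, [main_domains_pattern] * len(candidates)
    )
unmatched = [x for x in results if x is not None]
# unmatched -> ["other.net"]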
async def show(self, ctx, url):
    result = regex.match(self.URL_REGEX, url)
    if result:
        url = result.group(0)
    else:
        raise commands.BadArgument("Invalid Instagram URL.")

    await self.show_media(ctx, url)
def extract_rules(content):
    """Splits filter-list lines into supported block, unblock, and regex rules."""
    pattern_supported_block = re.compile(r"^\|\|.+(\^|\^\$important)$")
    block_rules = [
        x for x in content if re.match(pattern_supported_block, x, concurrent=True)
    ]
    pattern_supported_unblock = re.compile(r"^@@.+(\^(\$important)?|\/)$")
    unblock_rules = [
        x for x in content if re.match(pattern_supported_unblock, x, concurrent=True)
    ]
    pattern_supported_regex = re.compile(r"^\/.*\/$")
    regex_rules = [
        x for x in content if re.match(pattern_supported_regex, x, concurrent=True)
    ]
    return block_rules, unblock_rules, regex_rules
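# A minimal usage sketch for extract_rules; the filter lines are illustrative
# only, and `re` is assumed to be the `regex` module (for concurrent=True).
sample_lines = [
    "||ads.example.com^",            # block rule
    "||ads.example.com^$important",  # block rule marked important
    "@@||cdn.example.net^",          # unblock rule
    "! a comment line",              # matches nothing, ignored
    r"/banner\d+/",                  # regex rule (delimited by "/")
]
block_rules, unblock_rules, regex_rules = extract_rules(sample_lines)
# block_rules   -> the first two entries
# unblock_rules -> ["@@||cdn.example.net^"]
# regex_rules   -> [r"/banner\d+/"]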
def extract_regex(content):
    """
    Extracts regex rules within two '/'.
    """
    pattern_if_regexp = re.compile(r"^\/.*\/$", re.V1)
    regex_rules = [
        x for x in content if re.match(pattern_if_regexp, x, concurrent=True)
    ]
    return regex_rules
def check_boxes(text_box: str, pattern: str):
    """
    Used for validation of regex text.

    :param text_box: Plain text to validate.
    :param pattern: Regex pattern for validation.
    :return: A truthy match object if text_box matches the pattern, otherwise None.
    """
    return regex.match(pattern, text_box)
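# A minimal usage sketch for check_boxes, assuming `regex` is imported at
# module level; the pattern and inputs are illustrative only.
email_pattern = r"^[^@\s]+@[^@\s]+\.[^@\s]+$"
if check_boxes("user@example.com", email_pattern):
    print("looks like an e-mail address")
if not check_boxes("not-an-email", email_pattern):
    print("rejected")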
def is_valid(self) -> bool:
    """
    :return: Whether this variable has valid properties in order for it to
        work properly with the Terraform API.
    """
    key_pattern = r"^[a-zA-Z0-9_-]+$"
    key_valid = regex.match(key_pattern, self.key) is not None
    category_valid = self.category in ["terraform", "env"]
    return key_valid and category_valid
def _extract_timestamps(video_id, content, word_to_extract):
    logger.info(
        f"Extract timestamps where the word {word_to_extract} is pronounced",
        prefix=f"{video_id} >> ")
    pattern = r"<(\d{2}:\d{2}:\d{2}.\d{3})>([^<]+)<(\d{2}:\d{2}:\d{2}.\d{3})>"
    res = [(start, word.lower().strip(), end)
           for start, word, end in regex.findall(pattern, content, overlapped=True)
           if regex.match(word_to_extract, word.lower().strip())]
    logger.debug(f"Extracted {len(res)} words")
    return res
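# A small illustration of the word-level timestamp pattern used above, assuming
# `regex` is imported; the content string mimics a caption track with timing
# tags around each word.
import regex

pattern = r"<(\d{2}:\d{2}:\d{2}.\d{3})>([^<]+)<(\d{2}:\d{2}:\d{2}.\d{3})>"
content = "<00:00:01.000> hello <00:00:01.400> world <00:00:02.100>"
print(regex.findall(pattern, content, overlapped=True))
# [('00:00:01.000', ' hello ', '00:00:01.400'),
#  ('00:00:01.400', ' world ', '00:00:02.100')]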
def concat_category(out_file):
    """Concatenate category README.md files"""
    files = glob(f"{DirPath.input}/*/*.md")
    files = sorted(files, key=lambda x: x)
    files = sorted(files, key=lambda x: x.__contains__("regional"))
    files = sorted(files, key=lambda x: x.__contains__("main"), reverse=True)
    for file in files:
        with open(file, encoding="utf-8") as file_input:
            with open(out_file, "a", encoding="utf-8") as file_output:
                lines = (
                    re.sub(r"^#", r"##", x) if re.match(r"^#{0,6}+\s", x) else x
                    for x in file_input
                )
                file_output.writelines(lines)
def bills():
    data_dir = '../../lab1/data'
    for directory in os.listdir(data_dir):
        if directory.endswith('txt'):
            # print("directory: " + directory)
            bill = open(os.path.join(data_dir, directory), encoding='UTF-8').read()
            text = regex.sub(r"[ \t\r\f\v ][ \t\r\f\v ]+", "", bill)
            # print(text[:400])
            r = regex.match(
                r'\s*(Dz\.U\.\s*z\s*(?P<journal_year>\d+)\s*r\.\s*(N|n)r\s*(?P<journal_number>\d+),?\s*?poz.\s*(?P<position>\d+).?\s*)?([a-żA-Ż \d\.\(\)]*\s?){0,4}\s*(ustawa|USTAWA|U S T A W A|Ustawa|ustawA|USTAWa)[^\n]*\n[^\n]*\s*z\s*dnia\s*\d{1,2}\s*[a-żA-Ź]*\s*(?P<year>\d{4})\s*r\.\s*(?P<title>[\s\S]*?)\n\s*(Rozdział\s*(1|I)|Art.\s*(1|l)[^\d]|TYTUŁ\s*I|Dział\s*I|część\s*ogólna)',
                text)
            if r is None:
                yield bill, "", "", "", "f"
            else:
                yield bill, r.group("title"), r.group("journal_year"), r.group("position"), directory.split('.')[0]
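# A minimal sketch of how this tuple-yielding bills() generator might be
# consumed; it assumes the data directory referenced above exists and that
# `os` and `regex` are imported. Field names follow the code above.
for bill_text, title, journal_year, position, name in bills():
    print(name, journal_year, position, (title or "")[:60])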
# print("directory: " + directory) yield open(os.path.join(data_dir, directory), encoding='UTF-8').read() if __name__ == '__main__': b = {} for year in range(1900, 2500): b[str(year)] = {} for bill in bills(): text = regex.sub(r"[ \t\r\f\v ][ \t\r\f\v ]+", "", bill) # print(text[:400]) r = regex.match(r'\s*(Dz\.U\.\s*z\s*(?P<journal_year>\d+)\s*r\.\s*(N|n)r\s*(?P<journal_number>\d+),?\s*?poz.\s*(?P<position>\d+).?\s*)?([a-żA-Ż \d\.\(\)]*\s?){0,4}\s*(ustawa|USTAWA|U S T A W A|Ustawa|ustawA|USTAWa)[^\n]*\n[^\n]*\s*z\s*dnia\s*\d{1,2}\s*[a-żA-Ź]*\s*(?P<year>\d{4})\s*r\.\s*(?P<title>[\s\S]*?)\n\s*(Rozdział\s*(1|I)|Art.\s*(1|l)[^\d]|TYTUŁ\s*I|Dział\s*I|część\s*ogólna)', text) # r = regex.match(r'\n*(Dz\.U\.z(?P<journal_year>\d+)r\.(N|n)r(?P<journal_number>\d+),?poz.(?P<position>\d+).?)?([a-żA-Ż\d\.\(\)]*\n?){0,4}\n*(ustawa|USTAWA|Ustawa|ustawA|USTAWa)[^\n]*\n[^\n]*\n*z\n?dnia\n?\d{1,2}\n?[a-żA-Ź]*\n?(?P<year>\d{4})\n?r\.\n*(?P<title>(.+\n)*?)\n*?(?P<title2>(.+\n)*?)\n*?(Rozdział(1|I)|Art.\n?(1|l)[^\d]|TYTUŁI|DziałI|częśćogólna)', text) # print(title.group()) position = r.group("position") year = r.group("journal_year") or r.group("year") b[year][position] = {} b[year][position]["counter"] = 0 b[year][position]["title"] = r.group("title") b[year][position]["journal_number"] = r.group("journal_number") b[year][position]["journal_year"] = r.group("journal_year") b[year][position]["year"] = r.group("year") b[year][position]["position"] = position counter = 0