def extract_abp(content):
    """Extracts blocked and unblocked domains from ABP style content."""
    pattern_unsupported = re.compile(r"\S+(?>\/|\=)\S+", re.V1)
    pattern_supported_block = re.compile(
        r"^\|\|.+\^(?>$|.+(?:"
        r"\bfirst-party\b|"
        r"\b1p\b|"
        r"\bthird-party\b|"
        r"\b3p\b|"
        r"\bdocument\b|"
        r"\ball\b"
        # r"\ball\b|"
        # r"\bpopup\b"
        r"))",
        re.V1,
    )
    pattern_scrub_blocked_list = [
        r"^\|\|",
        r"\^($|.+(?>"
        r"\bfirst-party\b|"
        r"\b1p\b|"
        r"\bthird-party\b|"
        r"\b3p\b|"
        r"\bdocument\b|"
        r"\ball\b|"
        r"\bpopup\b|"
        r"\S+))",
    ]
    pattern_scrub_blocked = re.compile(
        "|".join(f"(?:{p})" for p in pattern_scrub_blocked_list), re.V1
    )
    block_rules = [
        x
        for x in content
        if re.match(pattern_supported_block, x, concurrent=True)
        and not re.match(pattern_unsupported, x, concurrent=True)
    ]
    blocked_domains = [
        re.sub(pattern_scrub_blocked, "", x, concurrent=True) for x in block_rules
    ]
    blocked_domains = [x for x in blocked_domains if valid_domain(x)]
    pattern_supported_unblock = re.compile(r"@@\|\|.+\^$")
    unblock_rules = [
        x
        for x in content
        if re.match(pattern_supported_unblock, x, concurrent=True)
        and not re.match(pattern_unsupported, x, concurrent=True)
    ]
    unblocked_domains = [
        x.replace("@@||", "").replace("^", "").replace("$important", "")
        for x in unblock_rules
    ]
    regex_rules = []
    return blocked_domains, unblocked_domains, unblock_rules, regex_rules
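# Usage sketch for extract_abp; `sample` is made-up input, and the expected
# output assumes valid_domain() accepts syntactically valid hostnames (both
# are assumptions about the surrounding module, not taken from it).
sample = [
    "||ads.example.com^",                  # plain block rule
    "||tracker.example.net^$third-party",  # block rule with supported option
    "@@||cdn.example.org^",                # unblock rule
    "||example.com/path^",                 # path rules are unsupported
]
blocked, unblocked, unblock_rules, _ = extract_abp(sample)
# blocked   -> ["ads.example.com", "tracker.example.net"]
# unblocked -> ["cdn.example.org"]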
def __init__(self):
    ht_regex = r"#(\w+)"  # a single "#" followed by the hashtag text
    regex_builder = SingleWordRegexBuilder()
    regex_builder.add_option(ht_regex)
    tot_regex = regex_builder.build()
    matcher_regex = regex.compile(tot_regex, flags=regex.IGNORECASE)
    super().__init__(matcher_regex)
class TextHelper:
    FROM_CAMEL_TO_SNAKE_PATTERN: Pattern = regex.compile(
        r"((?<=[a-z])[A-Z]|(?!^)[A-Z](?=[a-z]))"
    )

    @classmethod
    def from_camel_to_snake_case(cls, text: str) -> str:
        snaked: str = cls.FROM_CAMEL_TO_SNAKE_PATTERN.sub(r"_\1", text).lower()
        return snaked
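# Illustration of the conversion above: the pattern inserts "_" before an
# uppercase letter that follows a lowercase one, or before an uppercase
# letter that starts a lowercase run inside an acronym.
assert TextHelper.from_camel_to_snake_case("camelCase") == "camel_case"
assert TextHelper.from_camel_to_snake_case("getHTTPResponse") == "get_http_response"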
from base64 import b64encode
from hashlib import md5


def gen_checksum(file_blocklist):
    """Not necessary: ABP no longer requires checksum validation.

    Adapted from
    https://github.com/adblockplus/adblockplus/blob/master/addChecksum.py
    """
    checksum_pattern = re.compile(
        r"^\s*!\s*checksum[\s\-:]+([\w\+\/=]+).*\n", re.I | re.M
    )

    def add_checksum(in_data):
        """Adds checksum."""
        checksum = calculate_checksum(in_data)
        in_data = re.sub(checksum_pattern, "", in_data)
        return re.sub(r"(\r?\n)", r"\1! Checksum: %s\1" % checksum, in_data, 1)

    def calculate_checksum(in_data):
        """Calculate checksum for the filterlist file."""
        digest = md5(normalize(in_data).encode("utf-8"))
        return b64encode(digest.digest()).decode("utf-8").rstrip("=")

    def normalize(in_data):
        """Cleans the filterlist file."""
        in_data = re.sub(r"\r", "", in_data)
        in_data = re.sub(r"\n+", "\n", in_data)
        return re.sub(checksum_pattern, "", in_data)

    with open(file_blocklist, encoding="utf-8") as file:
        read_data = file.read()
    data = add_checksum(read_data)
    write_file(data, file_blocklist)
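# A standalone sketch of the same checksum math (demo_checksum is a
# hypothetical helper mirroring normalize()/calculate_checksum() above,
# without touching a file); it omits the step that strips an existing
# "! Checksum:" header.
from base64 import b64encode
from hashlib import md5
import re as _re


def demo_checksum(data: str) -> str:
    data = _re.sub(r"\r", "", data)        # drop carriage returns
    data = _re.sub(r"\n+", "\n", data)     # collapse blank lines
    return b64encode(md5(data.encode("utf-8")).digest()).decode("utf-8").rstrip("=")


print(demo_checksum("[Adblock Plus 2.0]\n||ads.example.com^\n"))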
def __init__(self):
    mention_regex = r"@(\w+)"  # a single "@" followed by the username
    regex_builder = SingleWordRegexBuilder()
    regex_builder.add_option(mention_regex)
    tot_regex = regex_builder.build()
    matcher_regex = regex.compile(tot_regex, flags=regex.IGNORECASE)
    super().__init__(matcher_regex)
def remove_common_sub(domains):
    """Remove www. and m. subdomains."""
    pattern = re.compile(r"^(?>www\.|m\.)")
    domains = [re.sub(pattern, "", x, concurrent=True) for x in domains]
    return set(domains)
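# Usage sketch: the atomic group (?>www\.|m\.) is anchored at the start, so
# only one leading "www." or "m." label is stripped and deeper subdomains
# survive.
assert remove_common_sub(
    ["www.example.com", "m.example.com", "cdn.m.example.com"]
) == {"example.com", "cdn.m.example.com"}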
def find_new_drug_names(drug_names, model, top=100, min_length=6, max_length=40):
    """
    Find new potential drug names based on current drug names and word2vec model.

    :param drug_names: dict of currently analysed drug names
    :param model: current word2vec model
    :param top: number of most similar words to consider
    :param min_length: minimum accepted candidate length (exclusive)
    :param max_length: maximum accepted candidate length (exclusive)
    :return: list of strings containing potential new drug names
    """
    drug_names_in_vocab = [d for d in drug_names.keys() if d in model.vocab]
    whitelisted_chars = regex.compile(
        r"[a-zA-Z0-9\-\.,\_/ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]+"
    )
    new_drug_names = []
    for drug_candidate, _ in model.most_similar(
        positive=drug_names_in_vocab, topn=top
    ):
        if (
            drug_candidate not in drug_names
            and whitelisted_chars.fullmatch(drug_candidate) is not None
        ):
            new_drug_names.append(drug_candidate)
    bad_vowels = set("ąęóiouy")
    new_drug_names = [
        d
        for d in new_drug_names
        if max_length > len(d) > min_length and d[-1] not in bad_vowels
    ]
    return new_drug_names
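# Usage sketch, assuming a gensim word2vec model exposing the pre-4.0 API
# (model.vocab, model.most_similar) and a dict of known names; the names
# below are illustrative, not real data:
#   known = {"ibuprofen": 1, "paracetamol": 1}
#   candidates = find_new_drug_names(known, model, top=50)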
def extract_rules(content):
    pattern_supported_block = re.compile(r"^\|\|.+(\^|\^\$important)$")
    block_rules = [
        x for x in content if re.match(pattern_supported_block, x, concurrent=True)
    ]
    pattern_supported_unblock = re.compile(r"^@@.+(\^(\$important)?|\/)$")
    unblock_rules = [
        x for x in content if re.match(pattern_supported_unblock, x, concurrent=True)
    ]
    pattern_supported_regex = re.compile(r"^\/.*\/$")
    regex_rules = [
        x for x in content if re.match(pattern_supported_regex, x, concurrent=True)
    ]
    return block_rules, unblock_rules, regex_rules
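# Usage sketch for extract_rules on a few representative (made-up) lines:
lines = [
    "||ads.example.com^",            # block
    "||ads.example.com^$important",  # block, important
    "@@||cdn.example.org^",          # unblock
    "/banner[0-9]+/",                # regex rule
    "! just a comment",              # matches nothing
]
block, unblock, rx = extract_rules(lines)
# block -> first two entries, unblock -> third, rx -> fourth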
def match_regex(domains, regex_rules):
    """Match domains against regex rules."""
    if not regex_rules:
        # An empty alternation would match every domain; bail out early.
        return []
    regex_list = [x[1:-1] for x in regex_rules]  # strip the surrounding "/"
    pattern = re.compile("|".join(regex_list))
    matches = [x for x in domains if re.findall(pattern, x, concurrent=True)]
    return matches
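# Usage sketch: rules arrive with their surrounding slashes, which the
# function strips before joining everything into a single alternation.
assert match_regex(
    ["ads1.example.com", "safe.example.org"],
    [r"/ads[0-9]+\./"],
) == ["ads1.example.com"]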
def __init__(self):
    regex_builder = SingleWordRegexBuilder(
        word_sep_tokens=r"([\p{P}\s])"  # all punctuation and whitespace chars
    )
    regex_builder.add_list_options_as_regex(self.WORDS_TO_MATCH_LOWERCASED)
    tot_regex = regex_builder.build()
    matcher_regex = regex.compile(tot_regex, flags=regex.IGNORECASE)
    super().__init__(matcher_regex)
def __init__(self):
    # based on https://www.regular-expressions.info/email.html
    email_regex = r"[A-Z\d\.\_%\+\-]+@[A-Z\d\.\-]+\.[A-Z]{2,}"
    regex_builder = SingleWordRegexBuilder()
    regex_builder.add_option(email_regex)
    tot_regex = regex_builder.build()
    matcher_regex = regex.compile(tot_regex, flags=regex.IGNORECASE)
    super().__init__(matcher_regex)
def extract_regex(content):
    """Extracts regex rules enclosed between two '/'."""
    pattern_if_regexp = re.compile(r"^\/.*\/$", re.V1)
    regex_rules = [
        x for x in content if re.match(pattern_if_regexp, x, concurrent=True)
    ]
    return regex_rules
def __init__(self):
    dni_regex_1 = r"\d{8}[A-Z]"
    dni_regex_2 = r"\d\d\.\d{3}\.\d{3}-[A-Z]"
    regex_builder = SingleWordRegexBuilder()
    regex_builder.add_option(dni_regex_1)
    regex_builder.add_option(dni_regex_2)
    tot_regex = regex_builder.build()
    matcher_regex = regex.compile(tot_regex, flags=regex.IGNORECASE)
    super().__init__(matcher_regex)
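# A quick sanity check of the two DNI shapes using the regex module directly
# (the builder/matcher classes live elsewhere); the identifiers are made up:
import regex
assert regex.fullmatch(r"\d{8}[A-Z]", "12345678Z")
assert regex.fullmatch(r"\d\d\.\d{3}\.\d{3}-[A-Z]", "12.345.678-Z")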
def __init__(self):
    cif_regex_1 = r"[A-Z]\d{7}([A-Z]|\d)"
    cif_regex_2 = r"[A-Z]-\d\d\.\d{3}\.\d{3}"
    regex_builder = SingleWordRegexBuilder()
    regex_builder.add_option(cif_regex_1)
    regex_builder.add_option(cif_regex_2)
    tot_regex = regex_builder.build()
    matcher_regex = regex.compile(tot_regex, flags=regex.IGNORECASE)
    super().__init__(matcher_regex)
def _validate_pattern(pattern: Pattern, text: str, msg: str):
    if pattern.regex:
        r = _adjust_regex(pattern.regex)
        p = regex.compile(r)
        if not p.match(text):
            if pattern.message:
                msg = pattern.message
            raise InvalidPatternException(
                f"Invalid text: {text} | {msg} | Example: {pattern.example}"
            )
def __init__(self):
    file = pkg_resources.open_text(files, self.COMPANY_EXTENSIONS)
    file_lines = file.readlines()
    file.close()
    companies = []
    for line in file_lines:
        companies.append(line.strip().replace(".", r"\."))
    builder = SingleWordRegexBuilder()
    builder.add_list_options_as_regex(companies)
    comp_regex = builder.build()
    matcher_regex = regex.compile(comp_regex)
    super().__init__(matcher_regex)
def parse_posting_from_html(soup: Bs) -> dict:
    """
    Tries to extract the posting text from the HTML.

    :param soup: BS4 HTML object
    :return: dictionary
    """
    try:
        match_html = soup.find('div', {'class': re.compile(r'\Wmaincontent\W')})
        return {
            "description": match_html.text
            .replace('\r', "")
            .replace('\n', "")
            .replace('\t', "")
        }
    except AttributeError:
        return dict()
def extract_hosts(content, list_type):
    """Extracts blocked or unblocked domains from hosts/domains style content."""
    pattern_scrub = [
        r"(?>\#|\!|\s+\#|\s+\!).*",
        r"^\s",
        r".*\blocalhost\b.*",
        r"^\d*\.\d*\.\d*\.\d*\s*(?>\s|www\.|m\.)",
        r"^(?>www\.|m\.)",
    ]
    pattern = re.compile("|".join(f"(?:{p})" for p in pattern_scrub), re.V1)
    domains = [re.sub(pattern, "", x, concurrent=True) for x in content]
    domains = [x for x in domains if valid_domain(x)]
    blocked_domains, unblocked_domains = [], []
    if list_type == "unblock":
        unblocked_domains = domains
    if list_type == "block":
        blocked_domains = domains
    return blocked_domains, unblocked_domains
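# Usage sketch, again assuming valid_domain() performs a syntactic hostname
# check; the input lines are made up:
hosts_lines = [
    "0.0.0.0 www.ads.example.com",  # IP prefix and "www." are scrubbed
    "127.0.0.1 localhost",          # localhost lines are dropped entirely
    "# a comment",                  # comments are dropped
    "tracker.example.net",          # bare domains pass through
]
blocked, unblocked = extract_hosts(hosts_lines, "block")
# blocked -> ["ads.example.com", "tracker.example.net"], unblocked -> []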
def test_build(self):
    """Tests build() with zero options, then with options added."""
    rb = SingleWordRegexBuilder()
    assert rb.build() == ""
    rb.add_option("and")
    rb.add_option("BLABLABALBABLABLABA")
    and_regex_text = rb.build()
    and_regex = regex.compile(and_regex_text)
    matcher = RegexMatcher(and_regex)
    examples = ["and", "and ", " and", "this and that"]
    negative_examples = ["andy", " andy ", " rand "]
    for example in examples:
        res = matcher.match(example)
        assert len(res) == 1
    for example in negative_examples:
        assert len(matcher.match(example)) == 0
def get_leaf_texts_to_compare(
    graph_filename, G, source_text, source_text_reg, law_names_data, dataset
):
    """
    Get text for leaves of a hierarchy graph. Can be seqitem or supseqitem graph.
    Leaves are only seqitems or supseqitems.
    """
    leaf_keys = get_leaves(G)

    snapshot = graph_filename[:-len(".gpickle.gz")]

    if dataset == "us":
        files = [
            os.path.join(source_text, x)
            for x in list_dir(source_text, ".xml")
            if x.split(".")[0].split("_")[-1] == snapshot
        ]
        if source_text_reg:
            files += [
                os.path.join(source_text_reg, x)
                for x in list_dir(source_text_reg, ".xml")
                if x.split(".")[0].split("_")[-1] == snapshot
            ]
        files.sort()
    else:  # is DE
        files = get_snapshot_law_list(snapshot, law_names_data)
        files = [os.path.join(source_text, f) for f in files]

    whitespace_pattern = regex.compile(r"[\s\n]+")
    texts = {}
    for idx, file in enumerate(files):
        print(f"\r{idx} / {len(files)}", end="")
        soup = create_soup(file)
        tags = soup.find_all(["seqitem", "subseqitem"])
        for tag in tags:
            if tag["key"] in leaf_keys:
                text = tag.get_text(" ")
                text = whitespace_pattern.sub(" ", text).lower().strip()
                texts[tag["key"]] = text
    return texts
def tokenize(self, text):
    """
    :param text: str
    :rtype: list(str)
    :return: a tokenized list of strings; concatenating this list returns\
    the original string if `preserve_case=False`
    """
    # Fix HTML character entities:
    text = _replace_html_entities(text)
    # Remove username handles
    if self.strip_handles:
        text = remove_handles(text)
    # Normalize word lengthening
    if self.reduce_len:
        text = reduce_lengthening(text)
    # Shorten problematic sequences of characters
    safe_text = HANG_RE.sub(r"\1\1\1", text)
    # Tokenize: r"|<(?:[^\d>]+|:[A-Za-z0-9]+:)\w+>"
    custom_Re = regex.compile(
        r"""(%s)"""
        % "|".join(
            (
                r":[^:\s]+:",
                r"<:[^:\s]+:[0-9]+>",
                r"<a:[^:\s]+:[0-9]+>",
                r"<(?:[^\d>]+|:[A-Za-z0-9]+:)\w+>",
            )
            + REGEXPS
        ),
        regex.VERBOSE | regex.I | regex.UNICODE,
    )
    words = custom_Re.findall(safe_text)
    # Possibly alter the case, but avoid changing emoticons like :D into :d
    if not self.preserve_case:
        words = list(
            map((lambda x: x if EMOTICON_RE.search(x) else x.lower()), words)
        )
    return words
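# Usage sketch (expected output, not verified against a live instance): the
# four extra alternatives placed ahead of REGEXPS keep Discord-style emotes
# and mention tags intact instead of splitting them on punctuation, e.g.
#   tokenizer.tokenize("hey <@123> :smile: <:custom:456>")
#   -> ["hey", "<@123>", ":smile:", "<:custom:456>"]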
def get_texttags_to_compare(snapshot, source_texts, law_names_data, dataset):
    if dataset == "us":
        if type(source_texts) is str:
            source_texts = [source_texts]

        files = sorted([
            os.path.join(source_text, x)
            for source_text in source_texts
            for x in list_dir(source_text, ".xml")
            if x.split(".")[0].split("_")[-1] == snapshot
        ])
    else:  # is DE
        assert type(source_texts) is str
        files = get_snapshot_law_list(snapshot, law_names_data)
        files = [os.path.join(source_texts, f) for f in files]

    whitespace_pattern = regex.compile(r"[\s\n]+")

    for file in files:
        tree = etree.parse(file)
        for text_tag in tree.xpath("//text"):
            item = text_tag.getparent()
            text_elems = [e for e in item.getchildren() if e.tag == "text"]
            pos_in_item = text_elems.index(text_tag)
            text_key = item.attrib["key"] + f"_{pos_in_item}"
            seqitem = get_seqitem(item)
            if seqitem is not None:
                citekey = seqitem.attrib.get("citekey")
            else:
                citekey = None
            text = etree.tostring(
                text_tag, method="text", encoding="utf8"
            ).decode("utf-8")
            text = whitespace_pattern.sub(" ", text).lower().strip()
            yield text_key, citekey, text
def __init__(self):
    self.written_numbers = self.read_numbers_file()
    self.months = self.read_months_file()
    day_nrs_regex = r"(([1-9])|(1[0-9])|(2[0-9])|(3[0-1]))"
    de_regex = "(de)"
    year_regex = r"((19[0-9][0-9])|(20[0-9][0-9]))"
    b = MultiWordRegexBuilder()
    wb1 = SingleWordRegexBuilder()
    wb1.add_list_options_as_regex(self.written_numbers)
    wb1.add_option(day_nrs_regex)
    b.add_regex_word(wb1.build_as_part())
    b.add_regex_word(de_regex, optional=True)
    wb3 = SingleWordRegexBuilder()
    wb3.add_list_options_as_regex(self.months)
    b.add_regex_word(wb3.build_as_part())
    b.add_regex_word(de_regex, optional=True)
    b.add_regex_word(year_regex)
    tot_regex = b.build()
    matcher_regex = regex.compile(tot_regex, flags=regex.IGNORECASE)
    super().__init__(matcher_regex)
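# The assembled pattern is meant to match dates such as "3 de enero de 1999"
# or "veintiuno de marzo de 2005", where the written numbers and month names
# come from the data files read above (examples assume Spanish data files).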
def create_regex(rules):
    # Build one recursive pattern from a grammar: each rule k becomes a named
    # group (?<rk>...) inside a (?(DEFINE)...) block; "|" passes through,
    # quoted tokens become literals, and bare tokens become subroutine calls
    # (?P>rk). Matching starts at rule 0, anchored to the whole string.
    defines = ''.join(
        "(?<r{}>{})".format(
            k,
            ''.join(
                '|' if t == '|'
                else t[1] if t[0] == '"'
                else '(?P>r{})'.format(t)
                for t in v.split(' ')
            ),
        )
        for k, v in rules.items()
    )
    return regex.compile(r"(?V1)(?(DEFINE){})^(?P>r0)$".format(defines))
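# Usage sketch with a tiny grammar: rule "0" is the start symbol, quoted
# tokens are literals, bare numbers reference other rules (the grammar
# itself is made up):
import regex
grammar = {
    "0": '1 2 1',
    "1": '"a"',
    "2": '1 | "b"',
}
pattern = create_regex(grammar)
assert pattern.match("aaa")
assert pattern.match("aba")
assert not pattern.match("abb")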
def __init__(self, pattern, config):
    self.pattern = pattern
    self.config = config
    self.compiled_re = regex.compile(pattern, regex.DOTALL | regex.UNICODE)
    r'(?&sec)'
    r'((?&conn)(Sec(?:tions|\.)?|§§?|\b(sub)?Parts?)?\s*(?&sec)|(?&conn)(?&numb))*'
    r')'
    r'(?!\w*(\sApp\.)?\s(U\.?S\.?C\.?|C\.?F\.?R\.?|Stat\.))'
    r'\s*'
    r'('
    r'(of\sthis\s(sub\-?)?(title|chapter|part|section|division|paragraph))'
    r'|'
    r'(of\stitle\s\d+)'
    r')?'
    r'('
    r'\s+of\s+the\s+Code\s+of\s+Federal\s+Regulations'
    r'|'
    r'\s+of\s+the\s+Code\s+of\s+the\s+United\s+States'
    r')?'
)
usc_pattern = regex.compile(usc_pattern_string, flags=regex.IGNORECASE)

inline_pattern_string = regex_definitions + (
    r'(Sec(?:tion|\.)?|§§?|\b(sub)?parts?)\s*'
    r'(?&sec)'
    r'('
    r'(?&conn)'
    r'(Sec(?:tions?|\.)?|§§?)?'
    r'\s*'
    r'(?&sec)'
    r'|'
    r'(?&conn)(?&numb)'
    r')*'
    r'\s*'
    r'('
    r'(of\sthis\s(sub\-?)?(title|chapter|part|section|division|paragraph))'
ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))) cs = bs[:] n = 0 for b in range(2**8): if b not in bs: bs.append(b) cs.append(2**8 + n) n += 1 cs = [chr(n) for n in cs] return dict(zip(bs, cs)) SPLITTING_REGEX = re.compile( r"""'s|'t|'re|'ve|'m|'l l|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ) TOKEN_UNICODEIFIER = unicodify_bytes() TOKEN_CACHE = {} if not os.path.isfile("converter/vocab.bpe"): with open("converter/vocab.bpe", "w", encoding="utf-8") as f: f.write( urlopen( "https://raw.githubusercontent.com/latitudegames/GPT-3-Encoder/master/vocab.bpe" ).read().decode("utf-8")) with open("converter/vocab.bpe", "r", encoding="utf-8") as f: bpe_data = f.read() bpe_merges = [ tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]
}

class_kingdom_map = {
    "Animal_Fauna": "Animalia",
    "Archaea": "Archaea",
    "Bacteria": "Bacteria",
    "Chromista": "Chromista",
    "Fungi": "Fungi",
    "Lichen": None,
    "Plant_Flora": "Plantae",
    "Protozoa": "Protozoa",
    "Taxon": None,
    "Viruses": "Viruses",
}

vernacular_name_sepearator: regex.Pattern = regex.compile(", ?")
strip_author_pattern: regex.Pattern = regex.compile(
    r"^([\p{upper}]?[\P{upper}]+) (in)?.*$", flags=regex.UNICODE
)


def linecount(filename):
    with open(filename, 'r') as fin:
        count = sum(1 for _ in fin)
    return count


class DarwinCoreProcessor:
    def __init__(self, filtered_taxon_tsv, vernacular_name_tsv, base_path_name,
            doc[tok_idx].is_sent_start = sentence_idx == 0
            tok_idx += 1

    if labels:
        if len(labels) != len(all_tokens):
            raise ValueError(
                f"Number of labels ({len(labels)}) does not match "
                f"number of tokens ({len(all_tokens)})"
            )
        # Create entities after converting IOB (actually BIO) to BILUO
        doc.ents = spans_from_biluo_tags(doc, iob_to_biluo(labels))

    return doc


UPPERCASE_RE = regex.compile(r"[\p{Lu}\p{Lt}]")
LOWERCASE_RE = regex.compile(r"\p{Ll}")
DIGIT_RE = re.compile(r"\d")
PUNC_REPEAT_RE = regex.compile(r"\p{P}+")


# class FeatureExtractor(ABC):
#     @abstractmethod
#     def extract(
#         self,
#         token: str,
#         current_idx: int,
#         relative_idx: int,
#         tokens: Sequence[str],
#         features: Dict[str, float],
#     ) -> None:
r"S\b\.?|Satz|Sätze": "Satz", r"Ziffern?|Ziffn?\b\.?": "Ziffer", r"Buchstaben?|Buchst\b\.?": "Buchstabe", r"Halbsatz": "Halbsatz", r"Teilsatz": "Teilsatz", r"Abschnitte?|Abschn\b\.?": "Abschnitt", r"Alternativen?|Alt\b\.?": "Alternative", r"Anhang|Anhänge": "Anhang", } # fmt: off pre_numb_pattern = regex.compile( r"(" r"erste|" r"zweite|" r"dritte|" r"letzte" r")r?s?", flags=regex.IGNORECASE, ) numb_pattern = regex.compile( r"(" r"\d+(?>\.\d+)*[a-z]?|" r"[ivx]+|" r"[a-z]\)?" r")" r"(" r"ff?\.|" r"ff\b|" r"(?<=[a-z])\)|"