Example #1
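# NB: this snippet (and several below) assumes `import regex as re`; the
# re.V1 flag and the `concurrent=` keyword are features of the third-party
# `regex` module and do not exist in the stdlib `re`.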
def extract_abp(content):
    """Extracts blocked and unblocked domains from ABP style content."""
    pattern_unsupported = re.compile(r"\S+(?>\/|\=)\S+", re.V1)
    pattern_supported_block = re.compile(
        r"^\|\|.+\^(?>$|.+(?:"
        r"\bfirst-party\b|"
        r"\b1p\b|"
        r"\bthird-party\b|"
        r"\b3p\b|"
        r"\bdocument\b|"
        r"\ball\b"
        # r"\ball\b|"
        # r"\bpopup\b"
        r"))",
        re.V1,
    )
    pattern_scrub_blocked_list = [
        r"^\|\|",
        r"\^($|.+(?>"
        r"\bfirst-party\b|"
        r"\b1p\b|"
        r"\bthird-party\b|"
        r"\b3p\b|\bdocument\b|"
        r"\ball\b|"
        r"\bpopup\b|"
        r"\S+))",
    ]
    pattern_scrub_blocked = re.compile(
        "|".join(f"(?:{p})" for p in pattern_scrub_blocked_list), re.V1
    )
    block_rules = [
        x
        for x in content
        if re.match(pattern_supported_block, x, concurrent=True)
        and not re.match(pattern_unsupported, x, concurrent=True)
    ]

    blocked_domains = [
        re.sub(pattern_scrub_blocked, "", x, concurrent=True) for x in block_rules
    ]
    blocked_domains = [x for x in blocked_domains if valid_domain(x)]
    pattern_supported_unblock = re.compile(r"@@\|\|.+\^$")
    unblock_rules = [
        x
        for x in content
        if re.match(pattern_supported_unblock, x, concurrent=True)
        and not re.match(pattern_unsupported, x, concurrent=True)
    ]
    unblocked_domains = [
        x.replace("@@||", "").replace("^", "").replace("$important", "")
        for x in unblock_rules
    ]
    regex_rules = []
    return blocked_domains, unblocked_domains, unblock_rules, regex_rules
Example #2
 def __init__(self):
     ht_regex = r"[##]{1}(\w+)"
     regex_builder = SingleWordRegexBuilder()
     regex_builder.add_option(ht_regex)
     tot_regex = regex_builder.build()
     matcher_regex = regex.compile(tot_regex, flags=regex.IGNORECASE)
     super().__init__(matcher_regex)
Example #3
class TextHelper:
    FROM_CAMEL_TO_SNAKE_PATTERN: Pattern = regex.compile(r"((?<=[a-z])[A-Z]|(?!^)[A-Z](?=[a-z]))")

    @classmethod
    def from_camel_to_snake_case(cls, text: str) -> str:
        snaked: str = cls.FROM_CAMEL_TO_SNAKE_PATTERN.sub(r"_\1", text).lower()
        return snaked
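For illustration, a quick usage sketch of the helper above; the pattern splits before interior capitals while keeping acronym runs intact:

assert TextHelper.from_camel_to_snake_case("CamelCase") == "camel_case"
assert TextHelper.from_camel_to_snake_case("someHTTPValue") == "some_http_value"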
Example #4
def gen_checksum(file_blocklist):
    """Not necessary: ABP no longer requires checksum validation.
    adapted from https://github.com/adblockplus/adblockplus/blob/master/addChecksum.py
    """
    checksum_pattern = re.compile(
        r"^\s*!\s*checksum[\s\-:]+([\w\+\/=]+).*\n", re.I | re.M
    )

    def add_checksum(in_data):
        """Adds the checksum line after the first line of the list."""
        checksum = calculate_checksum(in_data)
        in_data = re.sub(checksum_pattern, "", in_data)  # drop any old checksum
        return re.sub(r"(\r?\n)", r"\1! Checksum: %s\1" % checksum, in_data, 1)

    def calculate_checksum(in_data):
        """Calculates the checksum for the filterlist file."""
        hasher = md5()  # reuse one hash object; md5().digest() would hash nothing
        hasher.update(normalize(in_data).encode("utf-8"))
        return b64encode(hasher.digest()).decode("utf-8").rstrip("=")

    def normalize(in_data):
        """Cleans the filterlist file."""
        in_data = re.sub(r"\r", "", in_data)  # re.sub returns a new string;
        in_data = re.sub(r"\n+", "\n", in_data)  # the result must be kept
        return re.sub(checksum_pattern, "", in_data)

    with open(file_blocklist, encoding="utf-8") as file:
        read_data = file.read()
    data = add_checksum(read_data)
    write_file(data, file_blocklist)
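Schematically, add_checksum() drops any existing checksum header and re-inserts a fresh one right after the first line of the list (illustrative layout, assuming a standard ABP header):

# [Adblock Plus 2.0]
# ! Checksum: <base64 of md5(normalized content), "=" padding stripped>
# ! Title: ...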
Example #5
 def __init__(self):
     mention_regex = r"[@@]{1}([\w_]+)"
     regex_builder = SingleWordRegexBuilder()
     regex_builder.add_option(mention_regex)
     tot_regex = regex_builder.build()
     matcher_regex = regex.compile(tot_regex, flags=regex.IGNORECASE)
     super().__init__(matcher_regex)
Example #6
def remove_common_sub(domains):
    """
    Remove www. and m. subdomains
    """
    pattern = re.compile(r"^(?>www\.|m\.)")
    domains = [re.sub(pattern, "", x, concurrent=True) for x in domains]
    return set(domains)
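For example:

# remove_common_sub(["www.example.com", "m.example.com", "example.com"])
# -> {"example.com"}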
Example #7
def find_new_drug_names(drug_names,
                        model,
                        top=100,
                        min_length=6,
                        max_length=40):
    """
    Find new potential drug names based on current drug names and a word2vec model.
    :param drug_names: mapping of currently analysed drug names
    :param model: current word2vec model
    :param top: number of most-similar words to fetch from the model
    :param min_length: candidates must be longer than this (exclusive)
    :param max_length: candidates must be shorter than this (exclusive)
    :return: list of strings containing potential new drug names
    """
    drug_names_in_vocab = [d for d in drug_names.keys() if d in model.vocab]
    whitelisted_chars = regex.compile(
        r"[a-zA-Z0-9\-\.,\_/ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]+")
    new_drug_names = []

    for drug_candidate, _ in model.most_similar(positive=drug_names_in_vocab,
                                                topn=top):
        if drug_candidate not in drug_names and whitelisted_chars.fullmatch(
                drug_candidate) is not None:
            new_drug_names.append(drug_candidate)

    bad_vowels = set('ąęóiouy')
    new_drug_names = [
        d for d in new_drug_names
        if max_length > len(d) > min_length and d[-1] not in bad_vowels
    ]

    return new_drug_names
Example #8
def extract_rules(content):
    pattern_supported_block = re.compile(r"^\|\|.+(\^|\^\$important)$")
    block_rules = [
        x for x in content
        if re.match(pattern_supported_block, x, concurrent=True)
    ]
    pattern_supported_unblock = re.compile(r"^@@.+(\^(\$important)?|\/)$")
    unblock_rules = [
        x for x in content
        if re.match(pattern_supported_unblock, x, concurrent=True)
    ]
    pattern_supported_regex = re.compile(r"^\/.*\/$")
    regex_rules = [
        x for x in content
        if re.match(pattern_supported_regex, x, concurrent=True)
    ]
    return block_rules, unblock_rules, regex_rules
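A quick sanity check with hypothetical rule lines (again assuming `import regex as re`):

content = ["||ads.example.com^", "@@||cdn.example.com^", r"/^ad\d+\./"]
blocks, unblocks, regexes = extract_rules(content)
# blocks   == ["||ads.example.com^"]
# unblocks == ["@@||cdn.example.com^"]
# regexes  == [r"/^ad\d+\./"]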
Example #9
def match_regex(domains, regex_rules):
    """
    Match domains against regex
    """
    regex_list = [x[1:-1] for x in regex_rules]
    pattern = re.compile("|".join(regex_list))
    matches = [x for x in domains if re.findall(pattern, x, concurrent=True)]
    return matches
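For example:

# match_regex(["ads1.example.com", "cdn.example.com"], [r"/^ads\d+\./"])
# -> ["ads1.example.com"]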
Example #10
 def __init__(self):
     regex_builder = SingleWordRegexBuilder(
         word_sep_tokens=r"([\p{P}\s])"
     )  # all punctuation and whitespace chars
     regex_builder.add_list_options_as_regex(self.WORDS_TO_MATCH_LOWERCASED)
     tot_regex = regex_builder.build()
     matcher_regex = regex.compile(tot_regex, flags=regex.IGNORECASE)
     super().__init__(matcher_regex)
Example #11
 def __init__(self):
     # based on https://www.regular-expressions.info/email.html
     email_regex = r"[A-Z\d\.\_%\+\-]+@[A-Z\d\.\-]+\.[A-Z]{2,}"
     regex_builder = SingleWordRegexBuilder()
     regex_builder.add_option(email_regex)
     tot_regex = regex_builder.build()
     matcher_regex = regex.compile(tot_regex, flags=regex.IGNORECASE)
     super().__init__(matcher_regex)
Example #12
def extract_regex(content):
    """
    Extracts regex rules within two '/'.
    """
    pattern_if_regexp = re.compile(r"^\/.*\/$", re.V1)
    regex_rules = [
        x for x in content if re.match(pattern_if_regexp, x, concurrent=True)
    ]
    return regex_rules
Example #13
 def __init__(self):
     dni_regex_1 = r"\d{8,8}[A-Z]"
     dni_regex_2 = r"\d\d\.\d{3,3}\.\d{3,3}-[A-Z]"
     regex_builder = SingleWordRegexBuilder()
     regex_builder.add_option(dni_regex_1)
     regex_builder.add_option(dni_regex_2)
     tot_regex = regex_builder.build()
     matcher_regex = regex.compile(tot_regex, flags=regex.IGNORECASE)
     super().__init__(matcher_regex)
Example #14
 def __init__(self):
     cif_regex_1 = r"[A-Z]\d{7,7}([A-Z]|\d)"
     cif_regex_2 = r"[A-Z]-\d\d\.\d{3,3}\.\d{3,3}"
     regex_builder = SingleWordRegexBuilder()
     regex_builder.add_option(cif_regex_1)
     regex_builder.add_option(cif_regex_2)
     tot_regex = regex_builder.build()
     matcher_regex = regex.compile(tot_regex, flags=regex.IGNORECASE)
     super().__init__(matcher_regex)
Example #15
def _validate_pattern(pattern: Pattern, text: str, msg: str):
    if pattern.regex:
        r = _adjust_regex(pattern.regex)
        p = regex.compile(r)

        if not p.match(text):
            if pattern.message:
                msg = pattern.message

            raise InvalidPatternException(
                f"Invalid text: {text} | {msg} | Example: {pattern.example}")
Example #16
 def __init__(self):
     file = pkg_resources.open_text(files, self.COMPANY_EXTENSIONS)
     file_lines = file.readlines()
     file.close()
     companies = []
     for line in file_lines:
         companies.append(line.strip().replace(".", r"\."))
     builder = SingleWordRegexBuilder()
     builder.add_list_options_as_regex(companies)
     comp_regex = builder.build()
     matcher_regex = regex.compile(comp_regex)
     super().__init__(matcher_regex)
Example #17
 def parse_posting_from_html(soup: Bs) -> dict:
     """
     Versucht Posting Text aus HTML zu gewinnen
     :param soup: BS4 HTML Objekt
     :return: Dictionary
     """
     try:
         match_html = soup.find('div',
                                {'class': re.compile(r'\Wmaincontent\W')})
         return {
             "description":
             match_html.text.replace(r'\r',
                                     "").replace(r'\n',
                                                 "").replace(r'\t', "")
         }
     except AttributeError:
         return dict()
Example #18
def extract_hosts(content, list_type):
    """Extracts blocked or unblocked domains from hosts/domains style content."""
    pattern_scrub = [
        r"(?>\#|\!|\s+\#|\s+\!).*",
        r"^\s",
        r".*\blocalhost\b.*",
        r"^\d*\.\d*\.\d*\.\d*\s*(?>\s|www\.|m\.)",
        r"^(?>www\.|m\.)",
    ]
    pattern = re.compile("|".join(f"(?:{p})" for p in pattern_scrub), re.V1)
    domains = [re.sub(pattern, "", x, concurrent=True) for x in content]
    domains = [x for x in domains if valid_domain(x)]
    blocked_domains, unblocked_domains = [], []
    if list_type == "unblock":
        unblocked_domains = domains
    if list_type == "block":
        blocked_domains = domains

    return blocked_domains, unblocked_domains
Example #19
    def test_build(self):
        """Tests code path with 0 options given"""
        rb = SingleWordRegexBuilder()
        assert rb.build() == ""
        rb.add_option("and")
        rb.add_option("BLABLABALBABLABLABA")

        and_regex_text = rb.build()
        and_regex = regex.compile(and_regex_text)
        matcher = RegexMatcher(and_regex)
        examples = ["and", "and ", " and", "this and that"]
        negative_examples = ["andy", " andy ", " rand "]

        for example in examples:
            res = matcher.match(example)
            assert len(res) == 1

        for example in negative_examples:
            assert len(matcher.match(example)) == 0
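For orientation, a minimal sketch of a builder that satisfies this test; it is NOT the project's actual SingleWordRegexBuilder, just one plausible shape (the real class also offers build_as_part and is used with custom word_sep_tokens above):

class SingleWordRegexBuilder:
    """Hypothetical stand-in: joins options and bounds them at word edges."""

    def __init__(self, word_sep_tokens=r"[\p{P}\s]"):
        self.word_sep_tokens = word_sep_tokens
        self.options = []

    def add_option(self, option_regex):
        self.options.append(option_regex)

    def add_list_options_as_regex(self, options):
        self.options.extend(options)

    def build(self):
        if not self.options:
            return ""  # zero options -> empty regex, as the test asserts
        body = "|".join(f"(?:{o})" for o in self.options)
        # accept a match only at string edges or next to a separator char
        return (rf"(?:^|(?<={self.word_sep_tokens}))"
                rf"(?:{body})"
                rf"(?:$|(?={self.word_sep_tokens}))")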
Example #20
def get_leaf_texts_to_compare(graph_filename, G, source_text, source_text_reg,
                              law_names_data, dataset):
    """
    Gets the text for the leaves of a hierarchy graph (a seqitem or subseqitem graph).
    Leaves are only seqitems or subseqitems.
    """
    leaf_keys = get_leaves(G)

    snapshot = graph_filename[:-len(".gpickle.gz")]

    if dataset == "us":
        files = [
            os.path.join(source_text, x)
            for x in list_dir(source_text, ".xml")
            if x.split(".")[0].split("_")[-1] == snapshot
        ]
        if source_text_reg:
            files += [
                os.path.join(source_text_reg, x)
                for x in list_dir(source_text_reg, ".xml")
                if x.split(".")[0].split("_")[-1] == snapshot
            ]
        files.sort()
    else:  # is DE
        files = get_snapshot_law_list(snapshot, law_names_data)
        files = [os.path.join(source_text, f) for f in files]

    whitespace_pattern = regex.compile(r"[\s\n]+")
    texts = {}
    for file in files:
        print(f"\r{files.index(file)} / {len(files)}", end="")
        soup = create_soup(file)
        tags = soup.find_all(["seqitem", "subseqitem"])
        for tag in tags:
            if tag["key"] in leaf_keys:
                text = tag.get_text(" ")
                text = whitespace_pattern.sub(" ", text).lower().strip()
                texts[tag["key"]] = text.lower()
    return texts
Example #21
 def tokenize(self, text):
     """
     :param text: str
     :rtype: list(str)
     :return: a tokenized list of strings; concatenating this list returns\
     the original string if `preserve_case=False`
     """
     # Fix HTML character entities:
     text = _replace_html_entities(text)
     # Remove username handles
     if self.strip_handles:
         text = remove_handles(text)
     # Normalize word lengthening
     if self.reduce_len:
         text = reduce_lengthening(text)
     # Shorten problematic sequences of characters
     safe_text = HANG_RE.sub(r"\1\1\1", text)
     # Tokenize:
     r"|<(?:[^\d>]+|:[A-Za-z0-9]+:)\w+>"
     custom_Re = regex.compile(
         r"""(%s)"""
         % "|".join(
             (
                 r":[^:\s]+:",
                 r"<:[^:\s]+:[0-9]+>",
                 r"<a:[^:\s]+:[0-9]+>",
                 r"<(?:[^\d>]+|:[A-Za-z0-9]+:)\w+>",
             )
             + REGEXPS
         ),
         regex.VERBOSE | regex.I | regex.UNICODE,
     )
     words = custom_Re.findall(safe_text)
     # Possibly alter the case, but avoid changing emoticons like :D into :d:
     if not self.preserve_case:
         words = list(
             map((lambda x: x if EMOTICON_RE.search(x) else x.lower()), words)
         )
     return words
Example #22
def get_texttags_to_compare(snapshot, source_texts, law_names_data, dataset):

    if dataset == "us":
        if type(source_texts) is str:
            source_texts = [source_texts]

        files = sorted([
            os.path.join(source_text, x) for source_text in source_texts
            for x in list_dir(source_text, ".xml")
            if x.split(".")[0].split("_")[-1] == snapshot
        ])
    else:  # is DE
        assert type(source_texts) is str
        files = get_snapshot_law_list(snapshot, law_names_data)
        files = [os.path.join(source_texts, f) for f in files]

    whitespace_pattern = regex.compile(r"[\s\n]+")

    for file in files:
        tree = etree.parse(file)
        for text_tag in tree.xpath("//text"):
            item = text_tag.getparent()

            text_elems = [e for e in item.getchildren() if e.tag == "text"]
            pos_in_item = text_elems.index(text_tag)
            text_key = item.attrib["key"] + f"_{pos_in_item}"

            seqitem = get_seqitem(item)
            if seqitem is not None:
                citekey = seqitem.attrib.get("citekey")
            else:
                citekey = None

            text = etree.tostring(text_tag, method="text",
                                  encoding="utf8").decode("utf-8")
            text = whitespace_pattern.sub(" ", text).lower().strip()

            yield text_key, citekey, text
Example #23
    def __init__(self):
        self.written_numbers = self.read_numbers_file()
        self.months = self.read_months_file()

        day_nrs_regex = r"(([1-9])|(1[0-9])|(2[0-9])|(3[0-1]))"
        de_regex = "(de)"
        year_regex = r"((19[0-9][0-9])|(20[0-9][0-9]))"

        b = MultiWordRegexBuilder()
        wb1 = SingleWordRegexBuilder()
        wb1.add_list_options_as_regex(self.written_numbers)
        wb1.add_option(day_nrs_regex)
        b.add_regex_word(wb1.build_as_part())
        b.add_regex_word(de_regex, optional=True)
        wb3 = SingleWordRegexBuilder()
        wb3.add_list_options_as_regex(self.months)
        b.add_regex_word(wb3.build_as_part())
        b.add_regex_word(de_regex, optional=True)
        b.add_regex_word(year_regex)
        tot_regex = b.build()

        matcher_regex = regex.compile(tot_regex, flags=regex.IGNORECASE)
        super().__init__(matcher_regex)
Example #24
def create_regex(rules):
    return regex.compile(
        r"(?V1)(?(DEFINE){})^(?P>r0)$".format(
            "".join(
                "(?<r{}>{})".format(
                    k,
                    "".join(
                        "|" if t == "|"
                        else t[1] if t[0] == '"'
                        else "(?P>r{})".format(t)
                        for t in v.split(" ")
                    ),
                )
                for k, v in rules.items()
            )
        )
    )
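This compiles a whole rule grammar into one recursive pattern: each rule becomes a named group inside (?(DEFINE)...), tokens are either quoted literals, "|" alternations, or (?P>rN) subroutine calls to other rules, and the input must fully match rule 0. A hypothetical rule set in that format:

rules = {"0": "1 2", "1": '"a"', "2": "1 3 | 3 1", "3": '"b"'}
pattern = create_regex(rules)
assert pattern.match("aab")      # rule 0 = a followed by (ab | ba)
assert pattern.match("aba")
assert not pattern.match("ab")   # rule 2 needs two characters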
Example #25
 def __init__(self, pattern, config):
     self.pattern = pattern
     self.config = config
     self.compiled_re = regex.compile(pattern, regex.DOTALL | regex.UNICODE)
Example #26
    r'(?&sec)'
    r'((?&conn)(Sec(?:tions|\.)?|§§?|\b(sub)?Parts?)?\s*(?&sec)|(?&conn)(?&numb))*'
    r')'
    r'(?!\w*(\sApp\.)?\s(U\.?S\.?C\.?|C\.?F\.?R\.?|Stat\.))'
    r'\s*'
    r'('
    r'(of\sthis\s(sub\-?)?(title|chapter|part|section|division|paragraph))'
    r'|'
    r'(of\stitle\s\d+)'
    r')?'
    r'('
    r'\s+of\s+the\s+Code\s+of\s+Federal\s+Regulations'
    r'|'
    r'\s+of\s+the\s+Code\s+of\s+the\s+United\s+States'
    r')?')
usc_pattern = regex.compile(usc_pattern_string, flags=regex.IGNORECASE)

inline_pattern_string = regex_definitions + (
    r'(Sec(?:tion|\.)?|§§?|\b(sub)?parts?)\s*'
    r'(?&sec)'
    r'('
    r'(?&conn)'
    r'(Sec(?:tions?|\.)?|§§?)?'
    r'\s*'
    r'(?&sec)'
    r'|'
    r'(?&conn)(?&numb)'
    r')*'
    r'\s*'
    r'('
    r'(of\sthis\s(sub\-?)?(title|chapter|part|section|division|paragraph))'
Example #27
                                                 ord("¬") + 1)) +
          list(range(ord("®"),
                     ord("ÿ") + 1)))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))


SPLITTING_REGEX = re.compile(
    r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
)
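# For illustration (assumes `import regex as re`, since \p{...} classes are
# regex-module syntax): this is the GPT-2-style pre-tokenizer split, e.g.
#   SPLITTING_REGEX.findall("Hello world, it's 2020!")
#   -> ['Hello', ' world', ',', ' it', "'s", ' 2020', '!']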
TOKEN_UNICODEIFIER = unicodify_bytes()
TOKEN_CACHE = {}

if not os.path.isfile("converter/vocab.bpe"):
    with open("converter/vocab.bpe", "w", encoding="utf-8") as f:
        f.write(
            urlopen(
                "https://raw.githubusercontent.com/latitudegames/GPT-3-Encoder/master/vocab.bpe"
            ).read().decode("utf-8"))

with open("converter/vocab.bpe", "r", encoding="utf-8") as f:
    bpe_data = f.read()
bpe_merges = [
    tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]
]
Example #28
}

class_kingdom_map = {
    "Animal_Fauna": "Animalia",
    "Archaea": "Archaea",
    "Bacteria": "Bacteria",
    "Chromista": "Chromista",
    "Fungi": "Fungi",
    "Lichen": None,
    "Plant_Flora": "Plantae",
    "Protozoa": "Protozoa",
    "Taxon": None,
    "Viruses": "Viruses"
}

vernacular_name_sepearator: regex.Pattern = regex.compile(", ?")
strip_author_pattern: regex.Pattern = regex.compile(
    r"^([\p{upper}]?[\P{upper}]+) (in)?.*$", flags=regex.UNICODE)


def linecount(filename):
    with open(filename, 'r') as fin:
        count = sum(1 for _ in fin)
    return count


class DarwinCoreProcessor:
    def __init__(self,
                 filtered_taxon_tsv,
                 vernacular_name_tsv,
                 base_path_name,
Example #29
            doc[tok_idx].is_sent_start = sentence_idx == 0
            tok_idx += 1

    if labels:
        if len(labels) != len(all_tokens):
            raise ValueError(
                f"Number of labels ({len(labels)}) does not match number of tokens ({len(all_tokens)})"
            )

        # Create entities after converting IOB (actually BIO) to BILUO
        doc.ents = spans_from_biluo_tags(doc, iob_to_biluo(labels))

    return doc


UPPERCASE_RE = regex.compile(r"[\p{Lu}\p{Lt}]")
LOWERCASE_RE = regex.compile(r"\p{Ll}")
DIGIT_RE = re.compile(r"\d")

PUNC_REPEAT_RE = regex.compile(r"\p{P}+")

# class FeatureExtractor(ABC):
#     @abstractmethod
#     def extract(
#         self,
#         token: str,
#         current_idx: int,
#         relative_idx: int,
#         tokens: Sequence[str],
#         features: Dict[str, float],
#     ) -> None:
    r"S\b\.?|Satz|Sätze": "Satz",
    r"Ziffern?|Ziffn?\b\.?": "Ziffer",
    r"Buchstaben?|Buchst\b\.?": "Buchstabe",
    r"Halbsatz": "Halbsatz",
    r"Teilsatz": "Teilsatz",
    r"Abschnitte?|Abschn\b\.?": "Abschnitt",
    r"Alternativen?|Alt\b\.?": "Alternative",
    r"Anhang|Anhänge": "Anhang",
}

# fmt: off
pre_numb_pattern = regex.compile(
    r"("
    r"erste|"
    r"zweite|"
    r"dritte|"
    r"letzte"
    r")r?s?",
    flags=regex.IGNORECASE,
)

numb_pattern = regex.compile(
    r"("
    r"\d+(?>\.\d+)*[a-z]?|"
    r"[ivx]+|"
    r"[a-z]\)?"
    r")"
    r"("
    r"ff?\.|"
    r"ff\b|"
    r"(?<=[a-z])\)|"