def ORTH(value, config, op=None):
    """
    Ignores the case-insensitive configuration and matches the word exactly
    as written, i.e. case-sensitively, even if the configuration is
    case-insensitive
    """
    new_op = ExtendedOp(op)
    new_op.case_sensitive_override = True
    return "orth", value, new_op
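# Usage sketch (illustrative; assumes a `config` SessionConfig from the host
# session). Even with a case-insensitive configuration, ORTH keeps exact casing:
#
#   ORTH("iPhone", config)
#   # -> ("orth", "iPhone", <ExtendedOp with case_sensitive_override=True>)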
def regex_parse(r, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern:
    if op.ignore_case(config):
        d = {"LOWER": {"REGEX": r.lower()}}
    else:
        d = {"TEXT": {"REGEX": r}}

    if not op.empty():
        d["OP"] = op.value
    yield d
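# Example of the emitted spaCy token pattern (illustrative; `config` is
# assumed to come from the host session):
#
#   list(regex_parse(r"^a", config, ExtendedOp()))
#   # case-insensitive config -> [{"LOWER": {"REGEX": "^a"}}]
#   # case-sensitive config   -> [{"TEXT": {"REGEX": "^a"}}]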
def REGEX(regex_pattern, config, op=None):
    """
    Matches words based on a regex pattern,
    e.g. all words that start with an "a" would be REGEX("^a")
    """
    new_op = ExtendedOp(op)
    new_op.local_regex_override = True
    return "regex", regex_pattern, new_op
def any_of_parse(lst, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern:
    if op.ignore_case(config):
        normalized = sorted([item.lower() for item in lst])
        base = {"LOWER": {"REGEX": r"^({0})$".format("|".join(normalized))}}
    else:
        base = {"TEXT": {"REGEX": r"^({0})$".format("|".join(sorted(lst)))}}

    if not op.empty():
        base["OP"] = op.value
    yield base
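# Example output (illustrative; a case-insensitive `config` assumed here).
# Items are lowercased and sorted so the generated alternation is stable:
#
#   list(any_of_parse(["Dog", "cat"], config, ExtendedOp()))
#   # -> [{"LOWER": {"REGEX": "^(cat|dog)$"}}]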
def generic_parse(tag, value, config: "SessionConfig",
                  op: ExtendedOp) -> SpacyPattern:
    d = {}

    if tag == "ORTH" and op.ignore_case(config):
        d["LOWER"] = value.lower()
    else:
        d[tag] = value

    if not op.empty():
        d["OP"] = op.value
    yield d
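# Example (illustrative): an ORTH tag is downgraded to LOWER when the
# effective operator is case-insensitive, otherwise kept verbatim:
#
#   list(generic_parse("ORTH", "Cat", config, ExtendedOp()))
#   # case-insensitive -> [{"LOWER": "cat"}]
#   # case-sensitive   -> [{"ORTH": "Cat"}]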
def NAMES(*args, config, op=None):
    if isinstance(args[0], list):
        initial_list = [resolve_value(arg, config=config)
                        for arg in flatten(args)]
    else:
        initial_list = [args[0]]

    names = [" ".join(filter(remove_empty, names))
             for names in generate_names(initial_list)]
    logger.debug("Generated list of names: {}".format(names))
    new_op = ExtendedOp(op)
    new_op.case_sensitive_override = True
    return "any_of", names, new_op
def TAG(tag, config, op=None):
    """
    For generating POS/TAG patterns based on a regex,
    e.g. TAG("^NN|^JJ") for nouns or adjectives
    """
    values = {"tag": tag}
    return "tag", values, ExtendedOp(op)
def apply_operator(syntax, op: ExtendedOp) -> str:
    if op.empty():
        return syntax
    elif str(op) == "!":
        # Negation: strip the outer parentheses and wrap the inner pattern
        # in a negative lookahead, so any word *not* matching it is accepted
        return (r"((?!{})\w+)".format(syntax
                                      .rstrip(")")
                                      .lstrip("(")))
    else:
        return syntax + str(op)
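# Example of the negation rewrite (illustrative sketch, assuming ExtendedOp
# stringifies to its operator character as the branch above implies):
#
#   apply_operator(r"(cat|dog)", ExtendedOp("!"))
#   # -> r"((?!cat|dog)\w+)"   i.e. any word that is not "cat" or "dog"
#
#   apply_operator(r"(cat|dog)", ExtendedOp("?"))
#   # -> r"(cat|dog)?"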
def rules_to_patterns(label: str, data: Patterns, config: "SessionConfig"):
    logger.debug(data)
    return {
        "label": label,
        "pattern": [p
                    for (t, d, op) in data
                    for p in PARSERS[t](d, config, ExtendedOp(op))],
    }
def phrase_parse(value, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern:
    """
    TODO: Does not support operators
    """
    splitter = next((s for s in ["-", " "] if s in value), None)

    if splitter:
        buff = value.split(splitter)
        yield next(generic_parse("ORTH", buff[0],
                                 config=config, op=ExtendedOp()))
        for b in buff[1:]:
            if splitter != " ":
                yield next(generic_parse("ORTH", splitter,
                                         config=config, op=ExtendedOp()))
            yield next(generic_parse("ORTH", b,
                                     config=config, op=ExtendedOp()))
    else:
        yield next(generic_parse("ORTH", value,
                                 config=config, op=ExtendedOp()))
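# Example (illustrative; a case-sensitive `config` assumed): a hyphenated
# phrase is split into one token pattern per part, with the separator kept
# as its own token; for a space-separated phrase the separator is omitted:
#
#   list(phrase_parse("black-box", config, ExtendedOp()))
#   # -> [{"ORTH": "black"}, {"ORTH": "-"}, {"ORTH": "box"}]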
def tag_parse(values, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern:
    """
    For generating POS/TAG patterns based on a regex,
    e.g. TAG("^NN|^JJ") for nouns or adjectives.
    Also handles TAG_WORD, i.e. a tag combined with a word or with a list
    """
    d = {"TAG": {"REGEX": values["tag"]}}

    if "word" in values:
        if op.ignore_case(config):
            d["LOWER"] = values["word"].lower()
        else:
            d["TEXT"] = values["word"]
    elif "list" in values:
        lst = values["list"]
        if op.ignore_case(config):
            normalized = sorted([item.lower() for item in lst])
            d["LOWER"] = {"REGEX": r"^({0})$".format("|".join(normalized))}
        else:
            d["TEXT"] = {"REGEX": r"^({0})$".format("|".join(sorted(lst)))}

    if not op.empty():
        d["OP"] = op.value
    yield d
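# Example output for a TAG_WORD rule (illustrative; a case-insensitive
# `config` assumed):
#
#   list(tag_parse({"tag": "^VB", "word": "Proposed"}, config, ExtendedOp()))
#   # -> [{"TAG": {"REGEX": "^VB"}, "LOWER": "proposed"}]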
def TAG_WORD(tag, value, config, op=None):
    """
    For generating TAG patterns with a word or a list,
    e.g. match "proposed" only when it appears in the sentence as a verb
    (and not as an adjective):
        TAG_WORD("^VB", "proposed")
    e.g. match a list of words only when they are verbs:
        words = {"perceived", "proposed"}
        {TAG_WORD("^VB", words)?}->MARK("LABEL")
    """
    values = {"tag": tag}

    if isinstance(value, list):
        values["list"] = value
    else:
        values["word"] = value
    return "tag", values, ExtendedOp(op)
def resolve_value(obj, config):
    logger.debug("Resolving value: {0}".format(obj))

    if isinstance(obj, (str, tuple, list)):
        return obj
    elif isinstance(obj, types.GeneratorType):
        return "either", list(obj), ExtendedOp(None)

    return obj(config=config)
def test_pattern_with_escaped_characters(config):
    p = RitaParser(config)
    p.build(debug=True)

    results = p.parse('''
    special = { '"', "*", "-" }

    IN_LIST(special)->MARK("TEST")
    ''')
    assert len(results) > 0

    rules = results[1]()
    assert {
        "label": "TEST",
        "data": [("any_of", ["\"", "*", "-"], ExtendedOp())]
    } == rules
def PLURALIZE(*args, config, op=None):
    """
    For a noun or a list of nouns, matches any singular or plural form
    of the word(s)

    Usage for a single word, e.g.:
        PLURALIZE("car")

    Usage for lists, e.g.:
        vehicles = {"car", "bicycle", "ship"}
        PLURALIZE(vehicles)

    Works even for regex patterns or when spaCy's lemmatizer makes an error.
    Depends on the Python inflect package:
    https://pypi.org/project/inflect/
    """
    if isinstance(args[0], list):
        initial_list = [resolve_value(arg, config=config)
                        for arg in flatten(args)]
    else:
        initial_list = [args[0]]

    return "any_of", pluralizing(initial_list), ExtendedOp(op)
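# Usage sketch (illustrative; assumes the inflect package is installed and
# `config` comes from the host session; the exact forms produced by
# pluralizing() are not shown here):
#
#   PLURALIZE("car", config=config)
#   # -> ("any_of", <forms including "car" and "cars">, ExtendedOp())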
def test_parser_assign_literal_and_use_it(config):
    p = RitaParser(config)
    p.build(debug=True)

    results = p.parse("""
    my_variable = "Test"

    {WORD(my_variable)} -> MARK("TEST")
    """)
    assert len(results) == 2

    rules = results[1]()
    print(rules)
    assert {
        "label": "TEST",
        "data": [("value", "Test", ExtendedOp())]
    } == rules
def PUNCT(config, op=None):
    return "punct", None, ExtendedOp(op)
def IN_LIST(*args, config, op=None):
    return ("any_of",
            [resolve_value(arg, config=config)
             for arg in flatten(args)],
            ExtendedOp(op))
def punct_parse(_, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern:
    d = {"IS_PUNCT": True}

    if not op.empty():
        d["OP"] = op.value
    yield d
def WORD(*args, config, op=None):
    if len(args) == 1:
        literal = resolve_value(args[0], config=config)
        return "value", literal, ExtendedOp(op)
    elif len(args) == 0:
        return "regex", r"((\w|['_-])+)", ExtendedOp(op)
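# Usage sketch (illustrative): with an argument, WORD matches that literal;
# with no arguments it falls back to a generic word regex:
#
#   WORD("cat", config=config)  # -> ("value", "cat", ExtendedOp())
#   WORD(config=config)         # -> ("regex", r"((\w|['_-])+)", ExtendedOp())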
def fuzzy_parse(r, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern:
    # TODO: build permutations
    d = {"LOWER": {"REGEX": "({0})[.,?;!]?".format("|".join(r))}}

    if not op.empty():
        d["OP"] = op.value
    yield d
def NUM(*args, config, op=None):
    if len(args) == 1:
        literal = resolve_value(args[0], config=config)
        return "value", literal, ExtendedOp(op)
    elif len(args) == 0:
        return "regex", r"((\d+[\.,]\d+)|(\d+))", ExtendedOp(op)
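# Usage sketch (illustrative): the no-argument form matches integers as well
# as decimals using either "." or "," as the separator:
#
#   NUM("42", config=config)  # -> ("value", "42", ExtendedOp())
#   NUM(config=config)        # -> ("regex", r"((\d+[\.,]\d+)|(\d+))", ExtendedOp())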
def POS(name, config, op=None):
    return "pos", resolve_value(name, config=config), ExtendedOp(op)
def orth_parse(value, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern:
    d = {"ORTH": value}

    if not op.empty():
        d["OP"] = op.value
    yield d
def LEMMA(name, config, op=None):
    return "lemma", resolve_value(name, config=config), ExtendedOp(op)
def ENTITY(name, config, op=None):
    return "entity", resolve_value(name, config=config), ExtendedOp(op)
def ANY(config, op=None):
    return "regex", r".*", ExtendedOp(op)
def gen():
    for p in pattern:
        yield p
    yield "value", "-", ExtendedOp("?")
def gen():
    for p in pattern:
        yield p
    yield "punct", None, ExtendedOp("?")
def PREFIX(name, config, op=None):
    return "prefix", resolve_value(name, config=config), ExtendedOp(op)