Example #1
def ORTH(value, config, op=None):
    """
    Ignores the case-insensitive configuration and matches the word exactly
    as written, i.e. case-sensitive even if the configuration is case-insensitive.
    """
    new_op = ExtendedOp(op)
    new_op.case_sensitive_override = True
    return "orth", value, new_op
Example #2
def regex_parse(r, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern:
    if op.ignore_case(config):
        d = {"LOWER": {"REGEX": r.lower()}}
    else:
        d = {"TEXT": {"REGEX": r}}

    if not op.empty():
        d["OP"] = op.value
    yield d
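A minimal sketch (not from the library) of what regex_parse yields for the regex "^a" when the configuration ignores case and no operator is set.

# Illustrative only: the regex is lowered and matched against the LOWER attribute.
expected = {"LOWER": {"REGEX": "^a"}}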
Example #3
def REGEX(regex_pattern, config, op=None):
    """
    Matches words based on a regex pattern,
    e.g. all words that start with an 'a' would be matched by
    REGEX("^a")
    """
    new_op = ExtendedOp(op)
    new_op.local_regex_override = True
    return "regex", regex_pattern, new_op
Example #4
def any_of_parse(lst, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern:
    if op.ignore_case(config):
        normalized = sorted([item.lower() for item in lst])
        base = {"LOWER": {"REGEX": r"^({0})$".format("|".join(normalized))}}
    else:
        base = {"TEXT": {"REGEX": r"^({0})$".format("|".join(sorted(lst)))}}

    if not op.empty():
        base["OP"] = op.value
    yield base
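A minimal sketch (not from the library) of the token pattern any_of_parse yields for the list ["Car", "bus"] under a case-insensitive configuration with an optional "?" operator.

# Illustrative only: items are lowered, sorted and joined into one anchored alternation.
expected = {"LOWER": {"REGEX": r"^(bus|car)$"}, "OP": "?"}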
Example #5
def generic_parse(tag, value, config: "SessionConfig",
                  op: ExtendedOp) -> SpacyPattern:
    d = {}
    if tag == "ORTH" and op.ignore_case(config):
        d["LOWER"] = value.lower()
    else:
        d[tag] = value

    if not op.empty():
        d["OP"] = op.value
    yield d
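A minimal sketch (not from the library) of two generic_parse results; the first assumes a case-insensitive configuration, and the values "Madrid" and "be" are purely illustrative.

# Illustrative only: ORTH values are lowered under LOWER when case is ignored,
# any other tag is passed through unchanged.
orth_pattern = {"LOWER": "madrid"}   # generic_parse("ORTH", "Madrid", ...)
lemma_pattern = {"LEMMA": "be"}      # generic_parse("LEMMA", "be", ...)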
Example #6
def NAMES(*args, config, op=None):
    if type(args[0]) == list:
        initial_list = [resolve_value(arg, config=config)
                        for arg in flatten(args)]
    else:
        initial_list = [args[0]]

    names = list([" ".join(filter(remove_empty, names))
                  for names in generate_names(initial_list)])
    logger.debug("Generated list of names: {}".format(names))
    new_op = ExtendedOp(op)
    new_op.case_sensitive_override = True
    return "any_of", names, new_op
Example #7
def TAG(tag, config, op=None):
    """
    For generating POS/TAG patterns based on a Regex
    e.g. TAG("^NN|^JJ") for nouns or adjectives
    """
    values = {"tag": tag}
    return "tag", values, ExtendedOp(op)
Example #8
def apply_operator(syntax, op: ExtendedOp) -> str:
    if op.empty():
        return syntax

    elif str(op) == "!":  # Negation is the trickier case: wrap the body in a negative lookahead
        return (r"((?!{})\w+)".format(syntax
                                      .rstrip(")")
                                      .lstrip("(")))
    else:
        return syntax + str(op)
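A minimal sketch (not from the library) of the negation branch: for the syntax "(cat|dog)" and an operator whose string form is "!", the outer parentheses are stripped and the body is wrapped in a negative lookahead so any other word token still matches.

# Illustrative only: expected result of apply_operator for the "!" case.
negated = r"((?!cat|dog)\w+)"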
Example #9
def rules_to_patterns(label: str, data: Patterns, config: "SessionConfig"):
    logger.debug(data)
    return {
        "label":
        label,
        "pattern": [
            p for (t, d, op) in data
            for p in PARSERS[t](d, config, ExtendedOp(op))
        ],
    }
Example #10
def phrase_parse(value, config: "SessionConfig",
                 op: ExtendedOp) -> SpacyPattern:
    """
    TODO: Does not support operators
    """
    splitter = next((s for s in ["-", " "] if s in value), None)
    if splitter:
        buff = value.split(splitter)
        yield next(
            generic_parse("ORTH", buff[0], config=config, op=ExtendedOp()))
        for b in buff[1:]:
            if splitter != " ":
                yield next(
                    generic_parse("ORTH",
                                  splitter,
                                  config=config,
                                  op=ExtendedOp()))
            yield next(generic_parse("ORTH", b, config=config,
                                     op=ExtendedOp()))
    else:
        yield next(generic_parse("ORTH", value, config=config,
                                 op=ExtendedOp()))
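A minimal sketch (not from the library) of the token patterns phrase_parse yields for "New-York", assuming a case-sensitive configuration; the splitter is kept as its own token.

# Illustrative only: one ORTH pattern per piece.
expected = [{"ORTH": "New"}, {"ORTH": "-"}, {"ORTH": "York"}]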
Example #11
def tag_parse(values, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern:
    """
    For generating POS/TAG patterns based on a Regex
    e.g. TAG("^NN|^JJ") for adjectives or nouns
    Also handles TAG_WORD, i.e. a tag combined with a word or with a list.
    """
    d = {"TAG": {"REGEX": values["tag"]}}
    if "word" in values:
        if op.ignore_case(config):
            d["LOWER"] = values["word"].lower()
        else:
            d["TEXT"] = values["word"]
    elif "list" in values:
        lst = values["list"]
        if op.ignore_case(config):
            normalized = sorted([item.lower() for item in lst])
            d["LOWER"] = {"REGEX": r"^({0})$".format("|".join(normalized))}
        else:
            d["TEXT"] = {"REGEX": r"^({0})$".format("|".join(sorted(lst)))}
    if not op.empty():
        d["OP"] = op.value
    yield d
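A minimal sketch (not from the library) of the pattern tag_parse yields for values = {"tag": "^VB", "word": "proposed"} under a case-insensitive configuration.

# Illustrative only: the TAG regex and the lowered word share one token pattern.
expected = {"TAG": {"REGEX": "^VB"}, "LOWER": "proposed"}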
Example #12
def TAG_WORD(tag, value, config, op=None):
    """
    For generating TAG patterns with a word or a list
    e.g. match "proposed" only when it is used in the sentence as a verb (and not as an adjective):
    TAG_WORD("^VB", "proposed")
    e.g. match a list of words only when they are verbs:
    words = {"perceived", "proposed"}
    {TAG_WORD("^VB", words)?}->MARK("LABEL")
    """
    values = {"tag": tag}
    if type(value) == list:
        values["list"] = value
    else:
        values["word"] = value
    return "tag", values, ExtendedOp(op)
Example #13
def resolve_value(obj, config):
    logger.debug("Resolving value: {0}".format(obj))

    if isinstance(obj, str):
        return obj

    elif isinstance(obj, tuple):
        return obj

    elif isinstance(obj, list):
        return obj

    elif isinstance(obj, types.GeneratorType):
        return "either", list(obj), ExtendedOp(None)

    return obj(config=config)
Example #14
def test_pattern_with_escaped_characters(config):
    p = RitaParser(config)
    p.build(debug=True)

    results = p.parse('''
        special = { '"', "*", "-" }
        IN_LIST(special)->MARK("TEST")
        ''')

    assert len(results) > 0

    rules = results[1]()

    assert {
        "label": "TEST",
        "data": [("any_of", ["\"", "*", "-"], ExtendedOp())]
    } == rules
Example #15
def PLURALIZE(*args, config, op=None):
    """
    For a noun or a list of nouns, matches both the singular and the plural form of each word.
    Usage for a single word, e.g.:
    PLURALIZE("car")
    Usage for lists, e.g.:
    vehicles = {"car", "bicycle", "ship"}
    PLURALIZE(vehicles)
    Works even for regex patterns or when spaCy's lemmatizer makes an error.
    Depends on the Python inflect package: https://pypi.org/project/inflect/
    """
    if type(args[0]) == list:
        initial_list = [
            resolve_value(arg, config=config) for arg in flatten(args)
        ]
    else:
        initial_list = [args[0]]
    return "any_of", pluralizing(initial_list), ExtendedOp(op)
Example #16
def test_parser_assign_literal_and_use_it(config):
    p = RitaParser(config)
    p.build(debug=True)

    results = p.parse("""
        my_variable = "Test"

        {WORD(my_variable)} -> MARK("TEST")
        """)
    assert len(results) == 2

    rules = results[1]()

    print(rules)
    assert {
        "label": "TEST",
        "data": [("value", "Test", ExtendedOp())]
    } == rules
Example #17
def PUNCT(config, op=None):
    return "punct", None, ExtendedOp(op)
Example #18
def IN_LIST(*args, config, op=None):
    return "any_of", [
        resolve_value(arg, config=config) for arg in flatten(args)
    ], ExtendedOp(op)
Example #19
def punct_parse(_, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern:
    d = dict()
    d["IS_PUNCT"] = True
    if not op.empty():
        d["OP"] = op.value
    yield d
Example #20
def WORD(*args, config, op=None):
    if len(args) == 1:
        literal = resolve_value(args[0], config=config)
        return "value", literal, ExtendedOp(op)
    elif len(args) == 0:
        return "regex", r"((\w|['_-])+)", ExtendedOp(op)
Example #21
def fuzzy_parse(r, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern:
    # TODO: build permutations
    d = {"LOWER": {"REGEX": "({0})[.,?;!]?".format("|".join(r))}}
    if not op.empty():
        d["OP"] = op.value
    yield d
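A minimal sketch (not from the library) of the pattern fuzzy_parse would yield for a variant list ["color", "colour"] with no operator set.

# Illustrative only: the matched token may carry trailing punctuation.
expected = {"LOWER": {"REGEX": "(color|colour)[.,?;!]?"}}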
Example #22
def NUM(*args, config, op=None):
    if len(args) == 1:
        literal = resolve_value(args[0], config=config)
        return "value", literal, ExtendedOp(op)
    elif len(args) == 0:
        return "regex", r"((\d+[\.,]\d+)|(\d+))", ExtendedOp(op)
Example #23
def POS(name, config, op=None):
    return "pos", resolve_value(name, config=config), ExtendedOp(op)
Example #24
def orth_parse(value, config: "SessionConfig", op: ExtendedOp) -> SpacyPattern:
    d = {}
    d["ORTH"] = value
    if not op.empty():
        d["OP"] = op.value
    yield d
Example #25
def LEMMA(name, config, op=None):
    return "lemma", resolve_value(name, config=config), ExtendedOp(op)
Example #26
def ENTITY(name, config, op=None):
    return "entity", resolve_value(name, config=config), ExtendedOp(op)
Example #27
def ANY(config, op=None):
    return "regex", r".*", ExtendedOp(op)
Example #28
def gen():
    # `pattern` comes from the enclosing scope; an optional "-" token
    # is emitted after each pattern element.
    for p in pattern:
        yield p
        yield "value", "-", ExtendedOp("?")
Example #29
def gen():
    # `pattern` comes from the enclosing scope; an optional punctuation
    # token is emitted after each pattern element.
    for p in pattern:
        yield p
        yield "punct", None, ExtendedOp("?")
Example #30
def PREFIX(name, config, op=None):
    return "prefix", resolve_value(name, config=config), ExtendedOp(op)