Beispiel #1
0
    def fuzzy_intents(self):
        """Build the sample/entity data for every intent in ``self.intents``.

        Sentences without a ``{`` are simply expanded with
        ``expand_options``.  For sentences containing a ``{``, every keyword
        reported by ``AutoRegex.get_unique_kwords`` for the intent is
        substituted into the expanded sentence: once per known value from
        ``self.entities`` and once with the wildcard ``*``.  Only the
        spaced placeholder form ``{ name }`` is substituted.

        Returns:
            dict mapping each intent name to a one-element list holding
            ``{"samples": [...], "entities": {...}}``.  Samples are
            de-duplicated, so their order is not significant.
        """
        intents = {}

        for intent_name in self.intents:
            sentences = self.intents[intent_name]
            samples = []
            ents = {}
            # keywords depend on the whole intent, not on a single sentence;
            # resolve them lazily (only if a templated sentence shows up) and
            # only once per intent
            kwords = None

            for sent in sentences:
                # expansion depends on the sentence only, so do it once
                # instead of once per entity / per entity value
                expanded = list(expand_options(sent, as_strings=True))

                if "{" not in sent:
                    samples += expanded
                    continue

                if kwords is None:
                    kwords = list(AutoRegex.get_unique_kwords(sentences))

                for ent in kwords:
                    placeholder = "{ " + ent + " }"
                    if ent in self.entities:
                        values = self.entities[ent]
                        ents[ent] = values
                        for valid in values:
                            samples += [
                                s.replace(placeholder, valid)
                                for s in expanded
                            ]
                    # wildcard variant is added whether or not the entity
                    # has registered values
                    samples += [s.replace(placeholder, "*") for s in expanded]

            intents[intent_name] = [{
                "samples": list(set(samples)),
                "entities": ents
            }]

        return intents
Beispiel #2
0
 def keywords2samples(keywords, resources):
     """Combine keyword vocabularies into a list of unique sample strings.

     ``keywords`` is an iterable of ``(kw_type, kw)`` pairs, processed in
     order.  Vocabulary entries are looked up in ``resources`` under
     ``"<kw>.voc"`` and expanded with ``expand_options``:

     * ``"required"`` - every sample so far is joined with every entry; if
       the vocabulary is missing, the placeholder ``{ kw }`` is joined
       instead (only when there already are samples).
     * ``"optional"`` - joined variants are added next to the samples that
       already exist; as the first keyword it contributes a blank
       alternative.
     * ``"one_of"`` - ``kw`` is an iterable of names whose vocabularies are
       pooled and then joined like a required keyword (without the
       placeholder fallback).

     A warning is printed for each missing vocabulary.  The result is
     stripped and de-duplicated.
     """

     def load_vocab(name):
         # expanded entries of "<name>.voc"; warns and yields [] when absent
         entries = resources.get(name + ".voc")
         if not entries:
             print(f"WARNING: missing {name}.voc")
             return []
         expanded = []
         for entry in entries:
             expanded += expand_options(entry, as_strings=True)
         return expanded

     def combine(heads, tails):
         # every head followed by every tail, separated by a single space
         return [f"{head} {tail}" for head in heads for tail in tails]

     samples = []
     for kw_type, kw in keywords:
         if kw_type == "required":
             vocab = load_vocab(kw)
             if not samples:
                 samples = vocab
             else:
                 samples = combine(samples, vocab or ["{ " + kw + " }"])
         elif kw_type == "optional":
             vocab = load_vocab(kw)
             if not samples:
                 # "\n" stands for "keyword absent"; it is stripped at the end
                 samples = vocab + ["\n"]
             elif vocab:
                 samples = combine(samples, vocab) + samples
         elif kw_type == "one_of":
             vocab = []
             for option in kw:
                 vocab += load_vocab(option)
             samples = combine(samples, vocab) if samples else vocab

     return list(set([s.strip() for s in samples]))
Beispiel #3
0
def keyword_split(samples, lang="en-us"):
    """Split samples into keywords, trying three strategies in order.

    The samples are expanded once, then handed to ``keyword_start_split``
    and ``keyword_end_split``; the first of those returning exactly two
    keywords wins.  If neither does, the result of
    ``keyword_entity_split(samples, lang)`` is returned as-is.
    """
    expanded = flatten([expand_options(s, as_strings=True) for s in samples])

    # preferred: a shared start keyword was detected
    by_start = keyword_start_split(expanded)
    if len(by_start) == 2:
        return by_start

    # next: a shared end keyword; otherwise fall back to the entity split
    by_end = keyword_end_split(expanded)
    if len(by_end) == 2:
        return by_end
    return keyword_entity_split(expanded, lang)
Beispiel #4
0
    def padaos_intents(self):
        """Build the sample/entity data for every intent in ``self.intents``.

        Returns:
            dict mapping each intent name to a one-element list holding
            ``{"samples": [...], "entities": [...]}`` where ``samples`` are
            the expanded sentences (duplicates kept, order preserved) and
            ``entities`` is a list of single-key ``{name: values}`` dicts,
            one per keyword that has a non-empty entry in ``self.entities``.
        """
        result = {}

        for name in self.intents:
            sentences = self.intents[name]

            expanded = []
            for sentence in sentences:
                expanded.extend(expand_options(sentence, as_strings=True))

            # keep only keywords with a non-empty entity entry
            known = [
                {kw: self.entities[kw]}
                for kw in AutoRegex.get_unique_kwords(sentences)
                if self.entities.get(kw)
            ]

            result[name] = [{"samples": expanded, "entities": known}]

        return result
Beispiel #5
0
def keyword_end_split(samples):
    """Split samples into a base keyword plus a shared trailing keyword.

    The samples are expanded and empty strings dropped.  Endings of three,
    then two, then one space-separated word(s) are tried; a length is
    accepted when

    * no ending is blank,
    * every distinct ending is shared by at least 35% of the samples, and
    * no ending is itself a complete sample.

    Endings are compared on word boundaries, so ``"light"`` is not treated
    as an ending of ``"take flight"``.

    Returns:
        list of keyword dicts (``name``, ``required``, ``samples``):
        ``required_kw`` first (when any sample text remains), followed by
        ``end_kw`` when a shared ending was detected.  Words of two
        characters or fewer are dropped from the ``end_kw`` samples.
    """
    samples = flatten([expand_options(s, as_strings=True) for s in samples])
    samples = [s for s in samples if s]
    keywords = []

    # detect shared utterance ends + split into own keyword
    ends = {}
    tails = []  # tails[i] is the ending that belongs to samples[i]
    if len(samples) > 1:
        total = len(samples)
        whole = set(samples)
        for size in (3, 2, 1):
            tails = [" ".join(s.split(" ")[-size:]) for s in samples]
            hits = {}
            for tail in tails:
                hits[tail] = hits.get(tail, 0) + 1
            # a blank ending (e.g. from a trailing space) carries no keyword
            # and would make the slice below erase the whole sample
            if "" not in hits and \
                    all(n / total >= 0.35 for n in hits.values()) and \
                    whole.isdisjoint(hits):
                ends = {tail: n / total for tail, n in hits.items()}
                break

    if ends:
        keywords.append({
            "name": "end_kw",
            "required": True,
            "samples": [
                " ".join([w for w in tail.split(" ") if len(w) > 2])
                for tail in ends
            ]
        })
        # update samples to remove ends: each sample loses exactly its own
        # ending, never a look-alike suffix of some other sample
        samples = [
            s[:len(s) - len(tail)].strip() for s, tail in zip(samples, tails)
        ]

    # create base keyword
    if samples:
        keywords.insert(0, {
            "name": "required_kw",
            "required": True,
            "samples": list(set(samples))
        })

    return keywords
Beispiel #6
0
    def sample2regex(sample):
        """Derive regex keywords and helper keywords from one sample.

        The sample is expanded with ``expand_options``; variants without a
        ``{`` are ignored.  For each remaining variant:

        * every keyword from ``AutoRegex.get_unique_kwords`` gets an entry
          named ``<kw>_rx`` of type ``"regex"`` that accumulates the
          expressions from ``AutoRegex.get_expressions``; it is marked
          required when the keyword text occurs in every expanded variant;
        * every non-blank stretch of text outside the braces gets an
          optional ``<text>_rx_helper`` entry (spaces in the name replaced
          by underscores) that accumulates the text itself.

        Returns:
            dict mapping keyword / helper text to its definition dict.
        """
        found = {}

        # expand parentheses into multiple samples
        variants = expand_options(sample, as_strings=True)

        for variant in variants:
            # only variants carrying a {variable} produce anything
            if "{" not in variant:
                continue

            # literal text around the variables, taken before the bracket
            # normalisation below
            literals = []
            for piece in variant.split("{"):
                text = piece.split("}")[-1].strip()
                if text:
                    literals.append(text)

            normalised = variant.replace("[", "(").replace("]", ")")
            expressions = list(AutoRegex.get_expressions(normalised))

            for name in AutoRegex.get_unique_kwords(normalised):
                if name not in found:
                    found[name] = {
                        "name": name + "_rx",
                        "samples": [],
                        "required": all(name in v for v in variants),
                        "type": "regex"
                    }
                found[name]["samples"].extend(expressions)

            for text in literals:
                if text not in found:
                    found[text] = {
                        "name": text.replace(" ", "_") + "_rx_helper",
                        "samples": [],
                        "required": False
                    }
                found[text]["samples"].append(text)

        return found
Beispiel #7
0
    def samples2keywords(samples, lang="en-us"):
        """Derive keyword definitions from a list of sample utterances.

        Steps:

        1. expand every sample with ``expand_options``;
        2. collect the ``"required"`` / ``"optional"`` parts reported by
           ``expand_keywords`` (parts containing ``{`` are dropped) and turn
           the optional ones into ``optional_kw``;
        3. segment the required parts with ``keyword_split``;
        4. for every regex keyword from ``IntentAssistant.samples2regex``
           add a regex keyword, plus helper keywords built from the text
           surrounding the ``{entity}`` placeholder.

        Args:
            samples: iterable of sample strings.
            lang: language code forwarded to ``keyword_split``.

        Returns:
            list of keyword dicts; keywords with empty ``samples`` are
            dropped.
        """
        keywords = []

        # expand samples
        samples = flatten(
            [expand_options(s, as_strings=True) for s in samples])

        # parse required/optional
        required_parts = []
        optional_parts = []
        for sample in samples:
            parsed = expand_keywords(sample)
            required_parts += parsed["required"]
            optional_parts += parsed["optional"]

        kw_samples = list(set([r for r in required_parts if "{" not in r]))
        opt_kw = list(set([r for r in optional_parts if "{" not in r]))

        # create base optional keyword
        if opt_kw:
            keywords.append({
                "name": "optional_kw",
                "required": False,
                "samples": opt_kw
            })

        # segment keywords
        keywords += keyword_split(kw_samples, lang=lang)

        # extract required samples for reference/deduplication
        rs = flatten([k["samples"] for k in keywords if k["required"]])

        # regex keywords
        rx_kw = IntentAssistant.samples2regex(samples)

        # TODO autoregex has a bug where _ in names are removed
        # this causes kw extraction in IntentAssistant.samples2regex to be
        # incorrect
        rx = flatten([chunk(s, ["{"]) for s in samples if "{" in s])
        rx = list(set([c.split("}")[0].strip() for c in rx if "}" in c]))
        kmap = {name.replace("_", ""): name for name in rx}
        # END TODO

        for k, v in rx_kw.items():
            if v.get("type", "") != "regex":
                continue

            # name as written in the samples; fall back to the regex name
            # itself when the underscore-stripped lookup has no entry
            entity = kmap.get(k, k)
            spaced = "{ " + entity + " }"
            tight = "{" + entity + "}"

            # if kw in all samples -> required
            required = all(
                spaced in sample or tight in sample for sample in samples)

            # add regex keyword
            keywords.append({
                "name": v["name"],
                "entity": k,
                "required": required,
                "regex": True,
                "samples": list(set(v["samples"]))
            })

            # use non regex chunks as helper_kws
            pieces = [chunk(sample, [spaced, tight]) for sample in samples]
            pieces = flatten(
                [[p for p in group if "{" not in p] for group in pieces])
            # words of two characters or fewer are dropped from the helpers
            helpers = list(
                set([
                    " ".join([w for w in p.split(" ") if len(w) > 2])
                    for p in pieces
                ]))

            # filter samples already in required_kw
            reqs = [
                h for h in helpers
                if all(h in sample
                       for sample in samples) and h.strip() and h not in rs
            ]
            opts = [
                h for h in helpers
                if h not in reqs and h.strip() and h not in rs
            ]

            # create helper keywords to boost regex matches
            if reqs:
                keywords.append({
                    "name": k + "_rx_helper",
                    "required": True,
                    "samples": list(set(reqs))
                })
            if opts:
                keywords.append({
                    "name": k + "_optional_rx_helper",
                    "required": False,
                    "samples": list(set(opts))
                })

        return [k for k in keywords if k["samples"]]