def fuzzy_intents(self):
    """Build fuzzy-matching sample data for every intent in ``self.intents``.

    Each sentence of an intent is expanded with ``expand_options``.
    A sentence without ``"{"`` contributes its expansions unchanged.
    A sentence containing ``"{"`` contributes, for every entity name that
    ``AutoRegex.get_unique_kwords`` reports for the intent's sentences:

    * one copy of each expansion per value found in ``self.entities`` for
      that name, with the ``"{ name }"`` placeholder replaced by the value;
    * one copy of each expansion with the placeholder replaced by ``"*"``.

    Returns:
        dict: intent name -> single-element list holding a dict with
        ``"samples"`` (de-duplicated, in no particular order) and
        ``"entities"`` (entity name -> ``self.entities[name]`` for the
        names present in ``self.entities``).
    """
    intents = {}
    for intent_name in self.intents:
        sentences = self.intents[intent_name]
        samples = []
        ents = {}
        # extracted at most once per intent, and only if some sentence
        # actually contains a placeholder
        entity_names = None
        for sent in sentences:
            # expand once per sentence instead of once per entity value
            expanded = list(expand_options(sent, as_strings=True))
            if "{" not in sent:
                samples += expanded
                continue
            if entity_names is None:
                entity_names = list(AutoRegex.get_unique_kwords(sentences))
            for ent in entity_names:
                placeholder = "{ " + ent + " }"
                if ent in self.entities:
                    ents[ent] = self.entities[ent]
                for valid in self.entities.get(ent, []):
                    samples += [s.replace(placeholder, valid)
                                for s in expanded]
                # wildcard variant so unknown entity values can still match
                samples += [s.replace(placeholder, "*") for s in expanded]
        intents[intent_name] = [{
            "samples": list(set(samples)),
            "entities": ents
        }]
    return intents
def keywords2samples(keywords, resources):
    """Generate sample utterances by chaining keyword vocabularies.

    Args:
        keywords: iterable of ``(kw_type, kw)`` pairs, processed in order.
            ``kw_type`` is ``"required"``, ``"optional"`` or ``"one_of"``.
            For ``"one_of"`` ``kw`` is an iterable of keyword names,
            otherwise it is a single keyword name.
        resources: mapping queried with ``.get(name + ".voc")``; each hit
            is an iterable of vocabulary entries that are expanded with
            ``expand_options``.

    Returns:
        list: unique, whitespace-stripped samples in no particular order.

    A missing vocabulary prints a warning. A missing ``required`` keyword
    that follows existing samples is represented by a ``"{ name }"``
    placeholder; a missing ``optional`` or ``one_of`` keyword leaves the
    samples built so far untouched.
    """

    def load_voc(name):
        # expanded entries of "<name>.voc", or [] (with a warning) if absent
        entries = resources.get(name + ".voc")
        if not entries:
            print(f"WARNING: missing {name}.voc")
            return []
        expanded = []
        for entry in entries:
            expanded += expand_options(entry, as_strings=True)
        return expanded

    def combine(prefixes, suffixes):
        # every prefix followed by every suffix
        return [f"{s} {s2}" for s in prefixes for s2 in suffixes]

    samples = []
    for kw_type, kw in keywords:
        if kw_type == "required":
            expanded_samples = load_voc(kw)
            if not samples:
                samples = expanded_samples
            else:
                if not expanded_samples:
                    expanded_samples = ["{ " + kw + " }"]
                samples = combine(samples, expanded_samples)
        elif kw_type == "optional":
            expanded_samples = load_voc(kw)
            if not samples:
                # "\n" is stripped to "" at the end: the "keyword absent"
                # variant that later keywords get appended to
                samples = expanded_samples + ["\n"]
            elif expanded_samples:
                samples = combine(samples, expanded_samples) + samples
        elif kw_type == "one_of":
            expanded_samples = []
            for kw2 in kw:
                expanded_samples += load_voc(kw2)
            if not samples:
                samples = expanded_samples
            elif expanded_samples:
                samples = combine(samples, expanded_samples)
            # else: no alternative has a vocabulary; combining with an
            # empty list would discard every sample built so far

    return list({s.strip() for s in samples})
def keyword_split(samples, lang="en-us"):
    """Segment samples into keywords, trying three strategies in order.

    The samples are expanded with ``expand_options`` and flattened, then
    handed to ``keyword_start_split`` and ``keyword_end_split`` in turn.
    The first of those to return exactly two keywords wins; otherwise the
    result of ``keyword_entity_split(samples, lang)`` is returned.
    """
    expanded = flatten(
        [expand_options(text, as_strings=True) for text in samples])

    # shared-start detection first, then shared-end detection
    for strategy in (keyword_start_split, keyword_end_split):
        parts = strategy(expanded)
        if len(parts) == 2:
            return parts

    # neither produced a two-keyword split: fall back to the entity split
    return keyword_entity_split(expanded, lang)
def padaos_intents(self):
    """Build per-intent sample/entity data from ``self.intents``.

    Returns:
        dict: intent name -> single-element list holding a dict with
        ``"samples"`` (every sentence expanded through ``expand_options``,
        in order, duplicates kept) and ``"entities"`` (a list of
        one-item ``{name: self.entities[name]}`` dicts for each name that
        ``AutoRegex.get_unique_kwords`` reports and that has a truthy
        entry in ``self.entities``).
    """
    result = {}
    for name in self.intents:
        sentences = self.intents[name]

        expanded = []
        for sentence in sentences:
            expanded.extend(expand_options(sentence, as_strings=True))

        known_entities = [
            {ent: self.entities[ent]}
            for ent in AutoRegex.get_unique_kwords(sentences)
            if self.entities.get(ent)
        ]

        result[name] = [{"samples": expanded, "entities": known_entities}]
    return result
def keyword_end_split(samples):
    """Split utterances into a base keyword and a shared-ending keyword.

    Samples are expanded with ``expand_options``, flattened, and empty
    strings dropped. When more than one utterance remains, trailing word
    groups of 3, then 2, then 1 words are tried; a group size is accepted
    when every candidate ending is the suffix of at least 35% of the
    utterances and no candidate is itself a whole utterance.

    Returns:
        list: ``required_kw`` (the utterances with any accepted ending
        removed, de-duplicated) followed, when an ending set was accepted,
        by ``end_kw`` (the endings with words of 2 characters or fewer
        removed). Empty if there are no utterances.
    """
    expanded = flatten(
        [expand_options(text, as_strings=True) for text in samples])
    utterances = [text for text in expanded if text]
    total = len(utterances)

    # detect shared utterance endings
    shared_ends = {}
    if total > 1:
        for width in (3, 2, 1):
            candidates = set(
                " ".join(text.split(" ")[-width:]) for text in utterances)
            ratios = {}
            for ending in candidates:
                hits = sum(1 for text in utterances if text.endswith(ending))
                ratios[ending] = hits / total
            if not ratios:
                continue
            if any(ratio < 0.35 for ratio in ratios.values()):
                continue
            if any(ending in utterances for ending in ratios):
                continue
            shared_ends = ratios
            break

    keywords = []
    if shared_ends:
        end_samples = []
        for ending in shared_ends:
            long_words = [w for w in ending.split(" ") if len(w) > 2]
            end_samples.append(" ".join(long_words))
        keywords.append({
            "name": "end_kw",
            "required": True,
            "samples": end_samples
        })

        # strip the detected endings from the utterances; every candidate
        # is tested against the untouched text, so the last match wins
        for idx, text in enumerate(utterances):
            for ending in shared_ends:
                if text.endswith(ending):
                    utterances[idx] = text[:-len(ending)].strip()

    # the base keyword always goes first
    if utterances:
        keywords.insert(0, {
            "name": "required_kw",
            "required": True,
            "samples": list(set(utterances))
        })
    return keywords
def sample2regex(sample):
    """Derive regex keywords and literal helper keywords from one sample.

    The sample is expanded with ``expand_options``; only variants that
    contain ``"{"`` are processed. For each such variant:

    * every name from ``AutoRegex.get_unique_kwords`` gets (or reuses) an
      entry named ``<name>_rx`` of type ``"regex"`` and collects the
      expressions from ``AutoRegex.get_expressions``; it is flagged
      required when the name occurs as a substring of every variant;
    * every non-empty piece of text outside the ``{...}`` placeholders
      gets (or reuses) an optional ``<text>_rx_helper`` entry and collects
      the text itself.

    Returns:
        dict: keyword/literal text -> keyword description dict.
    """
    found = {}

    # expand parentheses into multiple samples
    variants = expand_options(sample, as_strings=True)

    for variant in variants:
        if "{" not in variant:
            continue

        # text that follows each "}" (or precedes the first "{")
        literals = []
        for piece in variant.split("{"):
            text = piece.split("}")[-1].strip()
            if text:
                literals.append(text)

        # square brackets become parentheses before AutoRegex sees the text
        pattern_source = variant.replace("[", "(").replace("]", ")")
        expressions = list(AutoRegex.get_expressions(pattern_source))
        names = AutoRegex.get_unique_kwords(pattern_source)

        for name in names:
            if name not in found:
                found[name] = {
                    "name": name + "_rx",
                    "samples": [],
                    "required": all(name in other for other in variants),
                    "type": "regex"
                }
            found[name]["samples"] += expressions

        for text in literals:
            if text not in found:
                found[text] = {
                    "name": text.replace(" ", "_") + "_rx_helper",
                    "samples": [],
                    "required": False
                }
            found[text]["samples"] += [text]

    return found
def samples2keywords(samples, lang="en-us"):
    """Turn sample utterances into a list of keyword definitions.

    Steps, in order:

    1. expand every sample with ``expand_options`` and flatten;
    2. collect the ``"required"`` / ``"optional"`` parts reported by
       ``expand_keywords`` (parts containing ``"{"`` are dropped); the
       optional parts become one ``optional_kw`` keyword;
    3. segment the required parts with ``keyword_split``;
    4. for every regex keyword from ``IntentAssistant.samples2regex`` add
       a regex keyword, plus helper keywords built from the text around
       its placeholder that is not already a required-keyword sample.

    Args:
        samples: iterable of sample utterances.
        lang: language code forwarded to ``keyword_split``.

    Returns:
        list: keyword dicts, excluding any whose ``"samples"`` is empty.
    """
    keywords = []

    # expand samples
    samples = flatten(
        [expand_options(s, as_strings=True) for s in samples])

    # parse required/optional
    kw_samples = []
    opt_kw = []
    for sample in samples:
        parsed = expand_keywords(sample)
        kw_samples += parsed["required"]
        opt_kw += parsed["optional"]
    kw_samples = list({r for r in kw_samples if "{" not in r})
    opt_kw = list({r for r in opt_kw if "{" not in r})

    # create base optional keyword
    if opt_kw:
        keywords.append({
            "name": "optional_kw",
            "required": False,
            "samples": opt_kw
        })

    # segment keywords
    keywords += keyword_split(kw_samples, lang=lang)

    # extract required samples for reference/deduplication
    rs = flatten([k["samples"] for k in keywords if k["required"]])

    # regex keywords
    rx_kw = IntentAssistant.samples2regex(samples)

    # TODO autoregex has a bug where _ in names are removed
    # this causes kw extraction in IntentAssistant.samples2regex to be
    # incorrect
    rx = flatten([chunk(_, ["{"]) for _ in samples if "{" in _])
    rx = list({_.split("}")[0].strip() for _ in rx if "}" in _})
    kmap = {k.replace("_", ""): k for k in rx}
    # END TODO

    for k, v in rx_kw.items():
        if v.get("type", "") != "regex":
            continue

        # name as written in the samples; fall back to the AutoRegex name
        # when the underscore map has no entry instead of raising KeyError
        written_name = kmap.get(k, k)
        placeholders = ["{ " + written_name + " }",
                        "{" + written_name + "}"]

        # if kw in all samples -> required
        required = all(
            any(p in sample for p in placeholders) for sample in samples)

        # add regex keyword
        keywords.append({
            "name": v["name"],
            "entity": k,
            "required": required,
            "regex": True,
            "samples": list(set(v["samples"]))
        })

        # use non regex chunks as helper_kws
        pieces = [chunk(sample, placeholders) for sample in samples]
        texts = flatten(
            [[x for x in parts if "{" not in x] for parts in pieces])
        # drop words of 2 characters or fewer from each chunk
        texts = list({
            " ".join(w for w in text.split(" ") if len(w) > 2)
            for text in texts
        })

        # filter samples already in required_kw; a helper is required
        # when its text occurs in every sample
        reqs = [
            text for text in texts
            if all(text in sample for sample in samples)
            and text.strip() and text not in rs
        ]
        opts = [
            text for text in texts
            if text not in reqs and text.strip() and text not in rs
        ]

        # create helper keywords to boost regex matches
        if reqs:
            keywords.append({
                "name": k + "_rx_helper",
                "required": True,
                "samples": list(set(reqs))
            })
        if opts:
            keywords.append({
                "name": k + "_optional_rx_helper",
                "required": False,
                "samples": list(set(opts))
            })

    return [k for k in keywords if k["samples"]]