def apply_rules(cls, transcript, samples, rule=MatchRule.FUZZY, thresh=0.75):
    """Tell whether at least one sample matches ``transcript`` under ``rule``.

    Every sample is lower-cased and stripped before it is compared; the
    transcript is used exactly as passed in.

    Args:
        transcript: text the samples are compared against.
        samples: iterable of candidate strings.
        rule: ``MatchRule`` member selecting the comparison. The fuzzy rules
            score the pair with ``fuzzy_match`` and accept a score of at
            least ``thresh``; CONTAINS, EQUALS, STARTS and ENDS are plain
            string tests that ignore ``thresh``.
        thresh: minimum score accepted by the fuzzy rules.

    Returns:
        True as soon as one sample matches; False when none does or when
        ``rule`` is not one of the rules handled here.
    """

    def _accepts(candidate):
        # Plain FUZZY relies on fuzzy_match's own default strategy.
        if rule == MatchRule.FUZZY:
            return fuzzy_match(candidate, transcript) >= thresh
        if rule == MatchRule.TOKEN_SORT_RATIO:
            return fuzzy_match(
                candidate, transcript,
                strategy=MatchStrategy.TOKEN_SORT_RATIO) >= thresh
        if rule == MatchRule.TOKEN_SET_RATIO:
            return fuzzy_match(
                candidate, transcript,
                strategy=MatchStrategy.TOKEN_SET_RATIO) >= thresh
        if rule == MatchRule.PARTIAL_TOKEN_SORT_RATIO:
            return fuzzy_match(
                candidate, transcript,
                strategy=MatchStrategy.PARTIAL_TOKEN_SORT_RATIO) >= thresh
        if rule == MatchRule.PARTIAL_TOKEN_SET_RATIO:
            return fuzzy_match(
                candidate, transcript,
                strategy=MatchStrategy.PARTIAL_TOKEN_SET_RATIO) >= thresh
        if rule == MatchRule.CONTAINS:
            return candidate in transcript
        if rule == MatchRule.EQUALS:
            return candidate == transcript
        if rule == MatchRule.STARTS:
            return transcript.startswith(candidate)
        if rule == MatchRule.ENDS:
            return transcript.endswith(candidate)
        # Unhandled rule: nothing can match.
        return False

    # any() stops at the first accepted sample, like an early return.
    return any(_accepts(raw.lower().strip()) for raw in samples)
def calc_score(match): score = base_score # this will give score of 100 if query is included in video title score += 100 * fuzzy_match( phrase.lower(), match["title"].lower(), strategy=MatchStrategy.TOKEN_SET_RATIO) # small penalty to not return 100 and allow better disambiguation if media_type == CPSMatchType.GENERIC: score -= 10 if score >= 100: if media_type == CPSMatchType.AUDIO: score -= 20 # likely don't want to answer most of these elif media_type != CPSMatchType.VIDEO: score -= 10 elif media_type == CPSMatchType.MUSIC and not is_music(match): score -= 5 # youtube gives pretty high scores in general, so we allow it # to run as fallback mode, which assigns lower scores and gives # preference to matches from other skills if self.settings["fallback_mode"]: if not explicit_request: score -= 25 return min(100, score)
def CPS_search(self, phrase, media_type):
    """Score this skill's videos against ``phrase`` and return the hits.

    Args:
        phrase: the search query.
        media_type: requested ``CPSMatchType``; GENERIC requests start with
            a 20 point penalty.

    Returns:
        List of ``merge_dict(video, {...})`` results, one per video whose
        score is at least ``self.settings["min_score"]``, ordered by
        ``match_confidence`` (capped at 100) from highest to lowest.
    """
    starting_score = self.match_media_type(phrase, media_type)
    if media_type == CPSMatchType.GENERIC:
        # generic searches tend to overmatch, so they start lower
        starting_score -= 20

    # Rank by raw title similarity first; only the top
    # self.settings["search_depth"] entries get the full metadata matching.
    by_title = sorted(
        self.videos,
        key=lambda entry: fuzzy_match(entry["title"], phrase),
        reverse=True)
    shortlist = by_title[:self.settings["search_depth"]]

    # Each optional matcher is a method named after the setting enabling it.
    optional_matchers = ("match_tags", "match_title", "match_description")

    hits = []
    for video in shortlist:
        confidence = starting_score + fuzzy_match(video["title"], phrase) * 30
        for matcher in optional_matchers:
            if self.settings[matcher]:
                confidence += getattr(self, matcher)(
                    video, phrase, media_type)
        if confidence < self.settings["min_score"]:
            continue

        extras = {
            "match_confidence": min(100, confidence),
            "media_type": self.media_type,
            "playback": self.playback_type,
            "skill_icon": self.skill_icon,
            "skill_logo": self.skill_logo,
            "bg_image": video.get("logo") or self.default_bg,
            "image": video.get("logo") or self.default_image,
            "author": self.name
        }
        hits.append(merge_dict(video, extras))

    hits.sort(key=lambda hit: hit["match_confidence"], reverse=True)
    return hits
def match_title(self, videos, phrase, match):
    """Pick the video whose title best matches ``phrase``.

    Each title is scored with ``fuzzy_match`` on the normalized phrase and
    normalized title; 0.3 is added when the phrase occurs inside the title
    (raw lower-cased or normalized form).

    Args:
        videos: sequence of mappings, each with a ``"title"`` entry.
        phrase: the query text.
        match: match level to report when no video is selected.

    Returns:
        Tuple ``(match, best_score, best_video, leftover_text)``.
        ``leftover_text`` is ``phrase`` with the winning title removed.
        For an empty ``videos`` the result is ``(match, 0, None, phrase)``.
    """
    if not videos:
        # random.choice() raises IndexError on an empty sequence; report
        # "nothing found" instead of crashing.
        return match, 0, None, phrase

    clean_phrase = self.normalize_title(phrase)
    lowered_phrase = phrase.lower()  # loop-invariant, computed once
    leftover_text = phrase
    best_score = 0
    # Starting pick; replaced by any video whose score is >= best_score.
    best_video = random.choice(videos)
    for video in videos:
        title = video["title"]
        clean_title = self.normalize_title(title)
        score = fuzzy_match(clean_phrase, clean_title)
        if lowered_phrase in title.lower() or clean_phrase in clean_title:
            score += 0.3
        # ">=" means that on equal scores the later video wins.
        if score >= best_score:
            # TODO handle ties
            match = CPSMatchLevel.TITLE
            best_video = video
            best_score = score
            leftover_text = phrase.replace(title, "")
    return match, best_score, best_video, leftover_text
def find_duplicate_streams(cls):
    """Record channels whose titles closely match another channel's title.

    Every ordered pair of distinct entries in ``cls.channels`` is scored
    with ``fuzzy_match`` on the lower-cased titles. When the score reaches
    ``cls.duplicate_threshold``, the first channel's id is stored under the
    second channel's id in ``cls._duplicates``; the first time an id gets an
    entry the pair is logged. The stream URLs are read, but the exact-URL
    shortcut is switched off, so only the titles decide.
    """
    for prev_idx, reference in cls.channels.items():
        reference_stream = reference.get("stream")
        # The inner pass walks a copy of the channel mapping.
        for idx, candidate in dict(cls.channels).items():
            if idx == prev_idx:
                continue

            candidate_stream = candidate.get("stream")
            # The trailing "and False" keeps the URL shortcut disabled, so
            # the fuzzy title comparison is always the one that runs.
            score = (
                1.0
                if candidate_stream == reference_stream and False
                else fuzzy_match(candidate["title"].lower(),
                                 reference["title"].lower())
            )

            if score >= cls.duplicate_threshold:
                if idx in cls._duplicates:
                    # Already known: just make sure this partner is listed.
                    if prev_idx not in cls._duplicates[idx]:
                        cls._duplicates[idx].append(prev_idx)
                else:
                    LOG.info(f"Duplicate channel: {prev_idx}:{idx} - "
                             f"confidence: {score}")
                    cls._duplicates[idx] = [prev_idx]
def match_title(self, video, phrase, media_type):
    """Score how well ``phrase`` matches the title of ``video``.

    The base score is ``fuzzy_match`` of the normalized phrase and title,
    scaled by 100. Bonuses: +25 when the phrase is a substring of the title,
    +30 when it equals one of the title's space-separated words (each check
    is tried on the lower-cased and on the normalized text).

    Trailer and behind-the-scenes handling, applied in that order:
      * when that kind of media was requested, +20 if the title matches the
        corresponding vocabulary, otherwise -10;
      * otherwise, if the matching ``filter_*`` setting is enabled and the
        title looks like that kind of media, the score is reset to 0.

    Args:
        video: mapping with a ``"title"`` entry.
        phrase: the query text.
        media_type: requested ``CPSMatchType``.

    Returns:
        The resulting score. It is not clamped; note that a trailer reset to
        0 can still be adjusted by the behind-the-scenes step.
    """
    title = video["title"]
    lowered_title = title.lower()
    lowered_phrase = phrase.lower()
    clean_phrase = self.normalize_title(phrase)
    clean_title = self.normalize_title(title)

    # match video name
    score = fuzzy_match(clean_phrase, clean_title) * 100
    if lowered_phrase in lowered_title or clean_phrase in clean_title:
        score += 25
    if lowered_phrase in lowered_title.split(" ") or \
            clean_phrase in clean_title.split(" "):
        score += 30

    if media_type == CPSMatchType.TRAILER:
        if self.voc_match(title, "trailer"):
            score += 20
        else:
            score -= 10
    # The parentheses matter: without them "and" binds tighter than "or" and
    # the literal check would zero the score even with the filter disabled.
    elif self.settings["filter_trailers"] and (
            self.voc_match(title, "trailer") or
            "trailer" in lowered_title):
        # trailer in title, but not in media_type, let's skip it
        # TODO bundle trailer.voc in ovos_utils
        score = 0

    if media_type == CPSMatchType.BEHIND_THE_SCENES:
        if self.voc_match(title, "behind_scenes"):
            score += 20
        else:
            score -= 10
    elif self.settings["filter_behind_scenes"] and (
            self.voc_match(title, "behind_scenes") or
            "behind the scenes" in lowered_title):
        # behind the scenes in title, but not in media_type, let's skip it
        # TODO bundle behind_scenes.voc in ovos_utils
        score = 0
    return score