def apply_rules(cls,
                transcript,
                samples,
                rule=MatchRule.FUZZY,
                thresh=0.75):
    """Report whether any sample matches the transcript under ``rule``.

    Every sample is lowercased and stripped before it is compared; the
    transcript is used exactly as it was passed in.

    Args:
        transcript: text the samples are compared against.
        samples: iterable of candidate strings.
        rule: ``MatchRule`` member selecting the comparison.
        thresh: minimum ``fuzzy_match`` score that counts as a match;
            only consulted by the fuzzy rules.

    Returns:
        True as soon as one sample matches, otherwise False. A rule that
        is none of the handled members never matches.
    """
    # Rules that differ only in the strategy handed to fuzzy_match.
    scored_rules = (
        (MatchRule.TOKEN_SORT_RATIO, MatchStrategy.TOKEN_SORT_RATIO),
        (MatchRule.TOKEN_SET_RATIO, MatchStrategy.TOKEN_SET_RATIO),
        (MatchRule.PARTIAL_TOKEN_SORT_RATIO,
         MatchStrategy.PARTIAL_TOKEN_SORT_RATIO),
        (MatchRule.PARTIAL_TOKEN_SET_RATIO,
         MatchStrategy.PARTIAL_TOKEN_SET_RATIO),
    )
    # Rules decided by a plain string operation.
    exact_rules = (
        (MatchRule.CONTAINS, lambda text: text in transcript),
        (MatchRule.EQUALS, lambda text: text == transcript),
        (MatchRule.STARTS, lambda text: transcript.startswith(text)),
        (MatchRule.ENDS, lambda text: transcript.endswith(text)),
    )

    def pick_check():
        # FUZZY relies on fuzzy_match's own default strategy.
        if rule == MatchRule.FUZZY:
            return lambda text: fuzzy_match(text, transcript) >= thresh
        for known, strategy in scored_rules:
            if rule == known:
                return lambda text: fuzzy_match(
                    text, transcript, strategy=strategy) >= thresh
        for known, check in exact_rules:
            if rule == known:
                return check
        # Unhandled rule: samples are still normalised, none can match.
        return lambda text: False

    is_match = pick_check()
    return any(is_match(sample.lower().strip()) for sample in samples)
        def calc_score(match):
            """Score one search result against the query, capped at 100.

            Reads ``base_score``, ``phrase``, ``media_type``,
            ``explicit_request`` and ``self`` from the enclosing scope.
            ``match`` must provide a ``"title"`` entry.
            """
            title_ratio = fuzzy_match(
                phrase.lower(), match["title"].lower(),
                strategy=MatchStrategy.TOKEN_SET_RATIO)
            # a query that is included in the video title adds the full 100
            confidence = base_score + 100 * title_ratio

            # small penalty so generic queries do not return 100, which
            # allows better disambiguation
            if media_type == CPSMatchType.GENERIC:
                confidence -= 10

            if confidence >= 100:
                if media_type == CPSMatchType.AUDIO:
                    # likely don't want to answer most of these
                    confidence -= 20
                elif media_type != CPSMatchType.VIDEO:
                    confidence -= 10
                # NOTE(review): unreachable as written - anything that gets
                # past the "!= VIDEO" test above is VIDEO, so it cannot be
                # MUSIC; the intended ordering needs confirming
                elif media_type == CPSMatchType.MUSIC and not is_music(match):
                    confidence -= 5

            # youtube gives pretty high scores in general, so fallback mode
            # lowers them (unless explicitly requested) to give preference
            # to matches from other skills
            if self.settings["fallback_mode"] and not explicit_request:
                confidence -= 25
            return min(100, confidence)
Exemple #3
0
    def CPS_search(self, phrase, media_type):
        """Return this skill's videos that match ``phrase``, best first.

        Videos are ranked by title similarity, the top
        ``settings["search_depth"]`` are scored in detail, and those below
        ``settings["min_score"]`` are dropped. Each result is the video
        merged with playback metadata and a ``match_confidence`` capped
        at 100.
        """
        starting_score = self.match_media_type(phrase, media_type)
        if media_type == CPSMatchType.GENERIC:
            # generic searches tend to overmatch, so handicap them
            starting_score -= 20

        def title_similarity(entry):
            return fuzzy_match(entry["title"], phrase)

        # rank by title first, then match the remaining metadata only for
        # the best self.settings["search_depth"] candidates
        ranked = sorted(self.videos, key=title_similarity, reverse=True)
        shortlist = ranked[:self.settings["search_depth"]]

        matches = []
        for video in shortlist:
            confidence = starting_score + title_similarity(video) * 30
            if self.settings["match_tags"]:
                confidence += self.match_tags(video, phrase, media_type)
            if self.settings["match_title"]:
                confidence += self.match_title(video, phrase, media_type)
            if self.settings["match_description"]:
                confidence += self.match_description(video, phrase,
                                                     media_type)
            if confidence < self.settings["min_score"]:
                continue

            extras = {
                "match_confidence": min(100, confidence),
                "media_type": self.media_type,
                "playback": self.playback_type,
                "skill_icon": self.skill_icon,
                "skill_logo": self.skill_logo,
                "bg_image": video.get("logo") or self.default_bg,
                "image": video.get("logo") or self.default_image,
                "author": self.name
            }
            matches.append(merge_dict(video, extras))

        # highest confidence first; the sort is stable, so equal scores
        # keep their title-similarity order
        matches.sort(key=lambda found: found["match_confidence"],
                     reverse=True)
        return matches
Exemple #4
0
 def match_title(self, videos, phrase, match):
     """Find the video in ``videos`` whose title best matches ``phrase``.

     Returns a ``(match, best_score, best_video, leftover_text)`` tuple.
     ``match`` becomes ``CPSMatchLevel.TITLE`` once any video is selected,
     and ``leftover_text`` is ``phrase`` with the selected title removed.
     Ties go to the later video.
     """
     wanted = self.normalize_title(phrase)
     remainder = phrase
     top_score = 0
     # initial pick; random.choice raises IndexError on an empty sequence
     top_video = random.choice(videos)
     for candidate in videos:
         title = candidate["title"]
         similarity = fuzzy_match(wanted, self.normalize_title(title))
         # bonus when the phrase appears verbatim, raw or normalized
         phrase_in_title = (
             phrase.lower() in title.lower()
             or wanted in self.normalize_title(title)
         )
         if phrase_in_title:
             similarity += 0.3
         if similarity >= top_score:
             # TODO handle ties
             match = CPSMatchLevel.TITLE
             top_score = similarity
             top_video = candidate
             remainder = phrase.replace(title, "")
     return match, top_score, top_video, remainder
Exemple #5
0
    def find_duplicate_streams(cls):
        """Record channels whose titles look like duplicates of each other.

        Every ordered pair of distinct entries in ``cls.channels`` is
        compared; when the title similarity reaches
        ``cls.duplicate_threshold`` the other channel's id is added to
        ``cls._duplicates[idx]``. The first hit for an id is logged.
        """
        for prev_idx, prev_channel in cls.channels.items():
            prev_stream = prev_channel.get("stream")
            # the inner loop walks a copy of the channel mapping
            for idx, channel in dict(cls.channels).items():
                if idx == prev_idx:
                    continue
                stream = channel.get("stream")

                # NOTE: the trailing "and False" keeps the stream-url
                # shortcut switched off, so titles always decide the score
                same_stream = stream == prev_stream and False
                if same_stream:
                    score = 1.0
                else:
                    score = fuzzy_match(channel["title"].lower(),
                                        prev_channel["title"].lower())

                if score >= cls.duplicate_threshold:
                    if idx in cls._duplicates:
                        if prev_idx not in cls._duplicates[idx]:
                            cls._duplicates[idx].append(prev_idx)
                    else:
                        LOG.info(f"Duplicate channel: {prev_idx}:{idx} - "
                                 f"confidence: {score}")
                        cls._duplicates[idx] = [prev_idx]
Exemple #6
0
    def match_title(self, video, phrase, media_type):
        """Score how well ``video``'s title matches ``phrase``.

        The score starts at the fuzzy similarity of the normalized phrase
        and title scaled by 100. It gains 25 when the phrase is contained
        in the title and 30 more when it equals one space-separated word
        of the title.

        Trailer and behind-the-scenes titles are then handled the same
        way: when that media type was requested, a matching title gains
        20 and a non-matching one loses 10; otherwise, if the
        corresponding ``filter_*`` setting is on, a matching title resets
        the score to 0.

        Args:
            video: mapping with a ``"title"`` entry.
            phrase: the search phrase.
            media_type: requested ``CPSMatchType``.

        Returns:
            The numeric score; it is not clamped here.
        """
        # match video name
        clean_phrase = self.normalize_title(phrase)
        title = video["title"]
        clean_title = self.normalize_title(title)
        phrase_lower = phrase.lower()
        title_lower = title.lower()

        score = fuzzy_match(clean_phrase, clean_title) * 100
        if phrase_lower in title_lower or clean_phrase in clean_title:
            score += 25
        if phrase_lower in title_lower.split(" ") or \
                clean_phrase in clean_title.split(" "):
            score += 30

        if media_type == CPSMatchType.TRAILER:
            if self.voc_match(title, "trailer"):
                score += 20
            else:
                score -= 10
        # parenthesized so the literal check is only a fallback for the
        # vocabulary match and cannot bypass the filter setting
        elif self.settings["filter_trailers"] and (
                self.voc_match(title, "trailer") or
                "trailer" in title_lower):
            # trailer in title, but not in media_type, let's skip it
            # TODO bundle trailer.voc in ovos_utils
            score = 0

        if media_type == CPSMatchType.BEHIND_THE_SCENES:
            if self.voc_match(title, "behind_scenes"):
                score += 20
            else:
                score -= 10
        # same grouping as the trailer filter above
        elif self.settings["filter_behind_scenes"] and (
                self.voc_match(title, "behind_scenes") or
                "behind the scenes" in title_lower):
            # behind the scenes in title, but not in media_type, let's
            # skip it
            # TODO bundle behind_scenes.voc in ovos_utils
            score = 0

        return score