Python levenshtein Exemples, MythTV.utility.levenshtein Python Exemples

Exemple #1

0

Afficher le fichier

    def search(self, phrase, subtitle=None, tolerance=None, func=None):
        """
        obj.search(phrase, subtitle=None, tolerance=None) -> result generator

            Returns a generator of results matching the given search
                phrase.  A secondary phrase can be given through the
                'subtitle' parameter, and an optional levenshtein
                tolerance value can be given for filtering results.
        """
        if not func:
            if subtitle is not None:
                func = lambda p,s,r: levenshtein(r.subtitle, s)
            else:
                func = lambda p,s,r: levenshtein('%s : %s' % \
                                            (r.title,r.subtitle), p) \
                                        if r.subtitle is not None else \
                                            levenshtein(r.title, p)

        if tolerance is None:
            tolerance = int(self.db.settings.NULL.\
                                        get('MetadataLookupTolerance', 5))
        if subtitle is not None:
            res = self.command('-N', '"%s" "%s"' % (phrase, subtitle))
        else:
            res = self.command('-M', '"%s"' % phrase)

        for r in res:
            r.levenshtein = func(phrase, subtitle, r)
            if r.levenshtein > tolerance:
                continue
            yield r

Exemple #2

0

Afficher le fichier

Fichier : system.py Projet : Olti/mythtv

    def search(self, phrase, subtitle=None, tolerance=None, func=None):
        """
        obj.search(phrase, subtitle=None, tolerance=None) -> result generator

            Returns a generator of results matching the given search
                phrase.  A secondary phrase can be given through the 
                'subtitle' parameter, and an optional levenshtein
                tolerance value can be given for filtering results.
        """
        if not func:
            if subtitle is not None:
                func = lambda p,s,r: levenshtein(r.subtitle, s)
            else:
                func = lambda p,s,r: levenshtein('%s : %s' % \
                                            (r.title,r.subtitle), p) \
                                        if r.subtitle is not None else \
                                            levenshtein(r.title, p)

        if tolerance is None:
            tolerance = int(self.db.settings.NULL.\
                                        get('MetadataLookupTolerance', 5))
        if subtitle is not None:
            res = self.command('-N', '"%s" "%s"' % (phrase, subtitle))
        else:
            res = self.command('-M', '"%s"' % phrase)

        for r in res:
            r.levenshtein = func(phrase, subtitle, r)
            if r.levenshtein > tolerance:
                continue
            yield r

Exemple #3

0

Afficher le fichier

def _name_match_quality(name, tvtitle):
    distance = levenshtein(name.lower(), tvtitle.lower())
    if len(tvtitle) > len(name):
        match_quality = float(len(tvtitle) - distance) / len(tvtitle)
    else:
        match_quality = float(len(name) - distance) / len(name)
    return match_quality

Exemple #4

0

Afficher le fichier

    def _getSeries(self, series, by_id=False):
        """This searches TheTVDB.com for the series name,
        If a custom_ui UI is configured, it uses this to select the correct
        series. If not, and interactive == True, ConsoleUI is used, if not
        BaseUI is used to select the first result.
        """
        allSeries = self.search(series)

        if self.config['custom_ui'] is not None:
            log().debug("Using custom UI %s" %
                        (repr(self.config['custom_ui'])))
            ui = self.config['custom_ui'](config=self.config)
        else:
            if not self.config['interactive']:
                log().debug('Auto-selecting first search result using BaseUI')
                ui = BaseUI(config=self.config)
            else:
                log().debug('Interactively selecting show using ConsoleUI')
                ui = ConsoleUI(config=self.config)

        if self.config['sort_series']:
            series_lowercase = series.lower()
            for s in allSeries:
                s[u'match_similarity'] = levenshtein(series_lowercase,
                                                     s[u'seriesName'].lower())
            allSeries.sort(key=lambda s: s[u'match_similarity'])

        return ui.selectSeries(allSeries)

Exemple #5

0

Afficher le fichier

Fichier : tvmaze.py Projet : Pugwash1/mythtv

def buildNumbers(args, opts):
    # either option -N <inetref> <subtitle>   e.g. -N 69 "Elizabeth Keen"
    #    or  option -N <inetref> <date time>  e.g. -N 69 "2021-01-29 19:00:00"
    #    or  option -N <title> <subtitle>     e.g. -N "The Blacklist" "Elizabeth Keen"
    #    or  option -N <title> <date time>    e.g. -N "The Blacklist" "2021-01-29 19:00:00"
    from MythTV.utility import levenshtein
    from MythTV.utility.dt import posixtzinfo
    from MythTV.tvmaze import tvmaze_api as tvmaze
    from MythTV import datetime
    from lxml import etree
    from datetime import timedelta

    if opts.debug:
        print("Function 'buildNumbers' called with arguments: " +
              (" ".join(["'%s'" % i for i in args])))

    # set the session
    if opts.session:
        tvmaze.set_session(opts.session)

    dtInLocalZone = None
    # ToDo:
    # below check shows a deficiency of the MythTV grabber API itself:
    # TV-Shows or Movies with an integer as title are not recognized correctly.
    # see https://www.mythtv.org/wiki/MythTV_Universal_Metadata_Format
    # and https://code.mythtv.org/trac/ticket/11850
    try:
        inetref = int(args[0])
        tvsubtitle = args[1]
        inetrefList = [inetref]

    except ValueError:
        tvtitle = args[0]
        tvsubtitle = args[1]
        inetrefList = []  # inetrefs for shows with title matches
        best_show_quality = 0.5  # require at least this quality on string match

        showlist = tvmaze.search_show(tvtitle)

        # It's problematic to make decisions solely upon the Levenshtein distance.
        # If the strings are really long or really short, a simple rule, such as
        # "accept any distance < 6" can provide misleading results.
        # To establish a more useful measurement, we'll use the Levenshtein
        # distance to figure out the ratio (0 - 1) of matching characters in the
        # longer string, and call this 'match_quality'.
        #    "Risk", "Call" -> distance = 4
        #           match_quality = (4 - 4) / 4 = 0
        #    "In Sickness and in Health", "Sickness and Health" -> distance = 6
        #           match_quality = (25 - 6)/25 = .76

        for show_info in showlist:
            try:
                inetref = int(show_info.id)
                distance = levenshtein(show_info.name.lower(), tvtitle.lower())
                if len(tvtitle) > len(show_info.name):
                    match_quality = float(len(tvtitle) -
                                          distance) / len(tvtitle)
                else:
                    match_quality = float(len(show_info.name) -
                                          distance) / len(show_info.name)
                if match_quality >= best_show_quality:
                    #if opts.debug:
                    #print ('show_info =', show_info, ', match_quality =', match_quality)
                    if match_quality == best_show_quality:
                        inetrefList.append(inetref)
                    else:
                        # Any items previously appended for a lesser match need to be eliminated
                        inetrefList = [inetref]
                        best_show_quality = match_quality
            except (TypeError, ValueError):
                pass

    # check whether the 'subtitle' is really a timestamp
    try:
        dtInLocalZone = datetime.strptime(
            tvsubtitle, "%Y-%m-%d %H:%M:%S")  # defaults to local timezone
    except ValueError:
        dtInLocalZone = None

    matchesFound = 0
    best_ep_quality = 0.5  # require at least this quality on string match
    tree = etree.XML(u'<metadata></metadata>')
    for inetref in inetrefList:
        dtInTgtZone = None
        if dtInLocalZone:
            try:
                show_info = tvmaze.get_show(inetref)
                # Some cases have 'network' = None, but webChannel != None. If we
                # find such a case, we'll set show_network to the webChannel.
                show_network = show_info.network
                if show_network is None:
                    show_network = show_info.streaming_service
                show_country = show_network.get('country')
                show_tz = show_country.get('timezone')
                dtInTgtZone = dtInLocalZone.astimezone(posixtzinfo(show_tz))

            except (ValueError, AttributeError) as e:
                dtInTgtZone = None

        if dtInTgtZone:
            # get episode info based on inetref and datetime in target zone
            try:
                #print('get_show_episodes_by_date(', inetref, ',', dtInTgtZone, ')')
                episodes = tvmaze.get_show_episodes_by_date(
                    inetref, dtInTgtZone)
            except SystemExit:
                episodes = []
            time_match_list = []
            early_match_list = []
            minTimeDelta = timedelta(minutes=60)
            for i, ep in enumerate(episodes):
                epInTgtZone = datetime.fromIso(ep.timestamp,
                                               tz=posixtzinfo(show_tz))
                durationDelta = timedelta(minutes=ep.duration)

                # Consider it a match if the recording starts late, but within the duration of the show.
                if epInTgtZone <= dtInTgtZone < epInTgtZone + durationDelta:
                    # Recording start time is within the range of this episode
                    if opts.debug:
                        print('Recording in range of inetref %d, season %d, episode %d (%s ... %s)' \
                                % (inetref, ep.season, ep.number, epInTgtZone, epInTgtZone+durationDelta))
                    time_match_list.append(i)
                    minTimeDelta = timedelta(minutes=0)
                # Consider it a match if the recording is a little bit early. This helps cases
                # where you set up a rule to record, at say 9:00, and the broadcaster uses a
                # slightly odd start time, like 9:05.
                elif epInTgtZone - minTimeDelta <= dtInTgtZone < epInTgtZone:
                    # Recording started earlier than this episode, so see if it's the closest match
                    if epInTgtZone - dtInTgtZone == minTimeDelta:
                        if opts.debug:
                            print('adding episode to closest list',
                                  epInTgtZone - dtInTgtZone, '\n')
                        early_match_list.append(i)
                    elif epInTgtZone - dtInTgtZone < minTimeDelta:
                        if opts.debug:
                            print('this episode is new closest',
                                  epInTgtZone - dtInTgtZone, '\n')
                        minTimeDelta = epInTgtZone - dtInTgtZone
                        early_match_list = [i]

            if not time_match_list:
                # No exact matches found, so use the list of the closest episode(s)
                time_match_list = early_match_list

            if time_match_list:
                for ep_index in time_match_list:
                    season_nr = str(episodes[ep_index].season)
                    episode_id = episodes[ep_index].id
                    item = buildSingleItem(inetref, season_nr, episode_id)
                    if item is not None:
                        tree.append(item.toXML())
                        matchesFound += 1
        else:
            # get episode based on subtitle
            episodes = tvmaze.get_show_episode_list(inetref)

            min_dist_list = []
            for i, ep in enumerate(episodes):
                if 0 and opts.debug:
                    print("tvmaze.get_show_episode_list(%s) returned :" %
                          inetref)
                    for k, v in ep.__dict__.items():
                        print(k, " : ", v)
                distance = levenshtein(ep.name, tvsubtitle)
                if len(tvsubtitle) >= len(ep.name):
                    match_quality = float(len(tvsubtitle) -
                                          distance) / len(tvsubtitle)
                else:
                    match_quality = float(len(ep.name) - distance) / len(
                        ep.name)
                #if opts.debug:
                #print('inetref', inetref, 'episode =', ep.name, ', distance =', distance, ', match_quality =', match_quality)
                if match_quality >= best_ep_quality:
                    if match_quality == best_ep_quality:
                        min_dist_list.append(i)
                        if opts.debug:
                            print(
                                '"%s" added to best list, match_quality = %g' %
                                (ep.name, match_quality))
                    else:
                        # Any items previously appended for a lesser match need to be eliminated
                        tree = etree.XML(u'<metadata></metadata>')
                        min_dist_list = [i]
                        best_ep_quality = match_quality
                        if opts.debug:
                            print('"%s" is new best match_quality = %g' %
                                  (ep.name, match_quality))

            # The list is constructed in order of oldest season to newest.
            # If episodes with equivalent match quality show up in multiple
            # seasons, we want to list the most recent first. To accomplish
            # this, we'll process items starting at the end of the list, and
            # proceed to the beginning.
            while min_dist_list:
                ep_index = min_dist_list.pop()
                season_nr = str(episodes[ep_index].season)
                episode_id = episodes[ep_index].id
                if opts.debug:
                    episode_nr = str(episodes[ep_index].number)
                    print("tvmaze.get_show_episode_list(%s) returned :" %
                          inetref)
                    print("with season : %s and episode %s" %
                          (season_nr, episode_nr))
                    print(
                        "Chosen episode index '%d' based on match quality %g" %
                        (ep_index, best_ep_quality))

                # we have now inetref, season, episode_id
                item = buildSingleItem(inetref, season_nr, episode_id)
                if item is not None:
                    tree.append(item.toXML())
                    matchesFound += 1

    if matchesFound > 0:
        print_etree(
            etree.tostring(tree,
                           encoding='UTF-8',
                           pretty_print=True,
                           xml_declaration=True))
    else:
        if dtInLocalZone:
            raise Exception(
                "Cannot find any episode with timestamp matching '%s'." %
                tvsubtitle)
        else:
            # tvmaze.py -N 4711 "Episode 42"
            raise Exception("Cannot find any episode with subtitle '%s'." %
                            tvsubtitle)