Ejemplo n.º 1
0
    def _compare_to_track(self, track):
        """
        Compare file metadata to a MusicBrainz track.

        Title, artist and length are scored against the track node itself.
        Each release listed for the track is then scored through
        ``self.metadata.compare_to_release`` and the best-scoring release
        is returned.

        Weights (read from ``self.comparison_weights``):
          * title                = 13
          * artist name          = 4
          * release name         = 5
          * length               = 10
          * number of tracks     = 4
          * album type           = 20
          * release country      = 2
          * format               = 2

        Returns a ``(similarity, release_id)`` tuple. ``release_id`` is
        ``None`` when the track lists no releases; in that case the
        similarity is computed from the track-level parts only.
        """

        def weighted_similarity(scored_parts, weight_sum):
            # Sum of score * weight / weight_sum over all parts, accumulated
            # left to right. Empty parts yield 0.0 without dividing.
            result = 0.0
            for part in scored_parts:
                result += part[0] * part[1] / weight_sum
            return result

        total = 0.0
        parts = []
        w = self.comparison_weights

        if 'title' in self.metadata:
            a = self.metadata['title']
            b = track.title[0].text
            parts.append((similarity2(a, b), w["title"]))
            total += w["title"]

        if 'artist' in self.metadata:
            a = self.metadata['artist']
            b = artist_credit_from_node(track.artist_credit[0], self.config)[0]
            parts.append((similarity2(a, b), w["artist"]))
            total += w["artist"]

        a = self.metadata.length
        if a > 0 and 'length' in track.children:
            b = int(track.length[0].text)
            # Differences of 30000 or more score 0.0; identical lengths 1.0.
            score = 1.0 - min(abs(a - b), 30000) / 30000.0
            parts.append((score, w["length"]))
            total += w["length"]

        releases = []
        if "release_list" in track.children and "release" in track.release_list[0].children:
            releases = track.release_list[0].release

        if not releases:
            # Return a similarity on the same scale as the release branch
            # below, not the raw sum of weights.
            return (weighted_similarity(parts, total), None)

        scores = []
        for release in releases:
            t, p = self.metadata.compare_to_release(release, w, self.config)
            scores.append((weighted_similarity(parts + p, total + t), release.id))

        # max() keeps the first release among equally scored ones.
        return max(scores, key=lambda x: x[0])
Ejemplo n.º 2
0
    def _compare_to_release(self, release):
        """
        Compare cluster metadata to a MusicBrainz release.

        The cluster's ``albumartist`` is compared with the release's artist
        credit; the remaining parts and their total weight come from
        ``self.metadata.compare_to_release``. The result is the sum of
        ``score * weight / total`` over all parts.

        Weights (read from ``Cluster.comparison_weights``):
          * title                = 17
          * artist name          = 6
          * number of tracks     = 5
          * release country      = 2
          * format               = 2

        """
        weights = Cluster.comparison_weights

        cluster_artist = self.metadata['albumartist']
        release_artist = artist_credit_from_node(release.artist_credit[0], self.config)[0]
        scored = [(similarity2(cluster_artist, release_artist), weights["artist"])]
        weight_sum = 0.0 + weights["artist"]

        release_total, release_parts = self.metadata.compare_to_release(
            release, weights, self.config)
        weight_sum += release_total
        scored.extend(release_parts)

        similarity = 0.0
        for part in scored:
            similarity += part[0] * part[1] / weight_sum
        return similarity
Ejemplo n.º 3
0
    def compare(self, other):
        """
        Return a weighted similarity between this metadata and ``other``.

        Length contributes with a fixed weight of 8 when both sides have a
        truthy ``length``. Every ``(name, weight)`` pair in
        ``self.__weights`` contributes when both sides have a truthy value
        for that tag: ``tracknumber`` and ``totaltracks`` score 1.0 when
        equal and 0.0 otherwise, all other tags are scored with
        ``similarity2``.

        The result is the sum of ``score * weight / total`` over the
        collected parts, or 0.0 when nothing could be compared.
        """
        parts = []
        total = 0

        if self.length and other.length:
            # Differences of 30000 or more score 0.0; identical lengths 1.0.
            score = 1.0 - min(abs(self.length - other.length), 30000) / 30000.0
            parts.append((score, 8))
            total += 8

        for name, weight in self.__weights:
            a = self[name]
            b = other[name]
            if a and b:
                if name in ('tracknumber', 'totaltracks'):
                    try:
                        ia = int(a)
                        ib = int(b)
                    except ValueError:
                        # Non-numeric values are compared as given.
                        ia = a
                        ib = b
                    # ``!=`` instead of ``cmp``, which Python 3 removed.
                    score = 1.0 - int(ia != ib)
                else:
                    score = similarity2(a, b)
                parts.append((score, weight))
                total += weight

        # Explicit loop so the method does not rely on ``reduce`` being a
        # builtin.
        result = 0.0
        for part_score, part_weight in parts:
            result += part_score * part_weight / total
        return result
Ejemplo n.º 4
0
    def compare(self, other):
        """
        Return the weighted similarity between this metadata and ``other``.

        Collects ``(score, weight)`` parts and hands them to
        ``linear_combination_of_weights``:

          * length, weight 8, via ``self.length_score`` when both lengths
            are truthy;
          * every tag in ``self.__weights`` that is truthy on both sides
            (``tracknumber``/``totaltracks`` score 1.0 when equal, else
            0.0; other tags use ``similarity2``);
          * a 0 score for a tag present on one side and listed in the other
            side's ``deleted_tags``.
        """
        scored = []

        if self.length and other.length:
            scored.append((self.length_score(self.length, other.length), 8))

        numeric_tags = ('tracknumber', 'totaltracks')
        for tag, tag_weight in self.__weights:
            mine = self[tag]
            theirs = other[tag]
            if mine and theirs:
                if tag not in numeric_tags:
                    tag_score = similarity2(mine, theirs)
                else:
                    try:
                        left, right = int(mine), int(theirs)
                    except ValueError:
                        # Non-numeric values are compared as given.
                        left, right = mine, theirs
                    tag_score = 1.0 - int(left != right)
                scored.append((tag_score, tag_weight))
            elif ((mine and tag in other.deleted_tags)
                  or (theirs and tag in self.deleted_tags)):
                scored.append((0, tag_weight))
        return linear_combination_of_weights(scored)
Ejemplo n.º 5
0
    def compare(self, other):
        """
        Score how closely ``other`` matches this metadata.

        Builds a list of ``(score, weight)`` parts: length (weight 8, via
        ``self.length_score``) and one part per tag from
        ``self.__weights``. The list is combined by
        ``linear_combination_of_weights``.
        """
        scored = []

        if self.length and other.length:
            length_part = self.length_score(self.length, other.length)
            scored.append((length_part, 8))

        for tag, tag_weight in self.__weights:
            mine, theirs = self[tag], other[tag]

            if not (mine and theirs):
                # One side is empty: it only counts (as a mismatch) when the
                # tag is listed in the opposite side's deleted_tags.
                if (mine and tag in other.deleted_tags) or (theirs and tag in self.deleted_tags):
                    scored.append((0, tag_weight))
                continue

            if tag in ('tracknumber', 'totaltracks'):
                try:
                    left = int(mine)
                    right = int(theirs)
                except ValueError:
                    left = mine
                    right = theirs
                # 1.0 when equal, 0.0 otherwise.
                tag_score = 1.0 - int(left != right)
            else:
                tag_score = similarity2(mine, theirs)
            scored.append((tag_score, tag_weight))

        return linear_combination_of_weights(scored)
Ejemplo n.º 6
0
    def compare_to_track(self, track, weights):
        """
        Compare this metadata to a MusicBrainz track node.

        Title, artist and length are scored against the track. Each release
        under ``track.release_list[0]`` is scored with
        ``self.compare_to_release(..., return_parts=True)``.

        Returns ``(similarity, release_group, release, track)`` for the
        best-scoring release, or ``(similarity, None, None, track)`` when
        the track lists no releases.
        """
        weight_sum = 0.0
        scored = []

        if 'title' in self:
            title_score = similarity2(self['title'], track.title[0].text)
            scored.append((title_score, weights["title"]))
            weight_sum += weights["title"]

        if 'artist' in self:
            own_artist = self['artist']
            credited = artist_credit_from_node(track.artist_credit[0])[0]
            scored.append((similarity2(own_artist, credited), weights["artist"]))
            weight_sum += weights["artist"]

        own_length = self.length
        if own_length > 0 and 'length' in track.children:
            delta = abs(own_length - int(track.length[0].text))
            # Differences of 30000 or more score 0.0.
            scored.append((1.0 - min(delta, 30000) / 30000.0, weights["length"]))
            weight_sum += weights["length"]

        releases = []
        if "release_list" in track.children:
            if "release" in track.release_list[0].children:
                releases = track.release_list[0].release

        if not releases:
            similarity = 0.0
            for part in scored:
                similarity += part[0] * part[1] / weight_sum
            return (similarity, None, None, track)

        best = (-1, )
        for release in releases:
            release_total, release_parts = self.compare_to_release(
                release, weights, return_parts=True)
            combined = scored + release_parts
            similarity = 0.0
            for part in combined:
                similarity += part[0] * part[1] / (weight_sum + release_total)
            if similarity > best[0]:
                if "release_group" in release.children:
                    release_group = release.release_group[0]
                else:
                    release_group = None
                best = (similarity, release_group, release, track)

        return best
Ejemplo n.º 7
0
    def compare_to_track(self, track, weights):
        """
        Compare this metadata to a track given as a mapping.

        Title, artist and length parts are collected first. Every
        similarity is multiplied by ``get_score(track)``. When ``weights``
        has an ``isvideo`` entry, a part comparing ``self['~video'] == '1'``
        with ``track.get('video', False)`` is added before releases are
        scored.

        Returns a ``SimMatchTrack`` for the best-scoring release, or one
        with no release or release group when the track has no releases.
        """
        scored = []

        if 'title' in self:
            own_title = self['title']
            scored.append((similarity2(own_title, track.get('title', '')), weights["title"]))

        if 'artist' in self:
            own_artist = self['artist']
            credited = artist_credit_from_node(track.get('artist-credit', []))[0]
            scored.append((similarity2(own_artist, credited), weights["artist"]))

        own_length = self.length
        if own_length > 0 and 'length' in track:
            track_length = track['length']
            scored.append((self.length_score(own_length, track_length), weights["length"]))

        releases = track['releases'] if "releases" in track else []

        search_score = get_score(track)
        if not releases:
            return SimMatchTrack(
                similarity=linear_combination_of_weights(scored) * search_score,
                releasegroup=None,
                release=None,
                track=track)

        if 'isvideo' in weights:
            metadata_is_video = self['~video'] == '1'
            track_is_video = track.get('video', False)
            if metadata_is_video == track_is_video:
                video_score = 1
            else:
                video_score = 0
            scored.append((video_score, weights['isvideo']))

        best = SimMatchTrack(similarity=-1, releasegroup=None, release=None, track=None)
        for release in releases:
            release_parts = self.compare_to_release_parts(release, weights)
            candidate = linear_combination_of_weights(scored + release_parts) * search_score
            if candidate > best.similarity:
                if "release-group" in release:
                    release_group = release['release-group']
                else:
                    release_group = None
                best = SimMatchTrack(similarity=candidate, releasegroup=release_group,
                                     release=release, track=track)
        return best
Ejemplo n.º 8
0
    def compare_to_track(self, track, weights):
        """
        Compare this metadata to a MusicBrainz track node.

        Title, artist (resolved with ``QObject.config``) and length are
        scored against the track. Each release under
        ``track.release_list[0]`` is scored with
        ``self.compare_to_release(..., return_parts=True)``.

        Returns ``(similarity, release_group, release, track)`` for the
        best-scoring release, or ``(similarity, None, None, track)`` when
        the track lists no releases.
        """
        config = QObject.config
        weight_sum = 0.0
        scored = []

        if 'title' in self:
            title_score = similarity2(self['title'], track.title[0].text)
            scored.append((title_score, weights["title"]))
            weight_sum += weights["title"]

        if 'artist' in self:
            own_artist = self['artist']
            credited = artist_credit_from_node(track.artist_credit[0], config)[0]
            scored.append((similarity2(own_artist, credited), weights["artist"]))
            weight_sum += weights["artist"]

        own_length = self.length
        if own_length > 0 and 'length' in track.children:
            delta = abs(own_length - int(track.length[0].text))
            # Differences of 30000 or more score 0.0.
            scored.append((1.0 - min(delta, 30000) / 30000.0, weights["length"]))
            weight_sum += weights["length"]

        releases = []
        if "release_list" in track.children:
            if "release" in track.release_list[0].children:
                releases = track.release_list[0].release

        if not releases:
            similarity = 0.0
            for part in scored:
                similarity += part[0] * part[1] / weight_sum
            return (similarity, None, None, track)

        best = (-1,)
        for release in releases:
            release_total, release_parts = self.compare_to_release(
                release, weights, return_parts=True)
            combined = scored + release_parts
            similarity = 0.0
            for part in combined:
                similarity += part[0] * part[1] / (weight_sum + release_total)
            if similarity > best[0]:
                if "release_group" in release.children:
                    release_group = release.release_group[0]
                else:
                    release_group = None
                best = (similarity, release_group, release, track)

        return best
Ejemplo n.º 9
0
    def compare_to_track(self, track, weights):
        """
        Compare this metadata to a track given as a mapping.

        Title, artist and length parts are combined with the parts from
        ``self.compare_to_release_parts`` for each release. When the track
        has a ``score`` key, each release similarity is multiplied by
        ``track['score'] / 100``.

        Returns ``(similarity, release_group, release, track)`` for the
        best-scoring release, or ``(similarity, None, None, track)`` when
        the track has no releases.
        """
        scored = []

        if 'title' in self:
            own_title = self['title']
            scored.append((similarity2(own_title, track.get('title', '')), weights["title"]))

        if 'artist' in self:
            own_artist = self['artist']
            credited = artist_credit_from_node(track.get('artist-credit', []))[0]
            scored.append((similarity2(own_artist, credited), weights["artist"]))

        own_length = self.length
        if own_length > 0 and 'length' in track:
            track_length = track['length']
            scored.append((self.length_score(own_length, track_length), weights["length"]))

        releases = track['releases'] if "releases" in track else []

        if not releases:
            return (linear_combination_of_weights(scored), None, None, track)

        best = (-1, )

        for release in releases:
            release_parts = self.compare_to_release_parts(release, weights)
            candidate = linear_combination_of_weights(scored + release_parts)
            if 'score' in track:
                candidate *= track['score'] / 100
            if candidate > best[0]:
                if "release-group" in release:
                    release_group = release['release-group']
                else:
                    release_group = None
                best = (candidate, release_group, release, track)
        return best
Ejemplo n.º 10
0
    def compare_to_track(self, track, weights):
        """
        Compare this metadata to a track given as a mapping.

        Title, artist and length parts are combined with the parts from
        ``self.compare_to_release_parts`` for each release. When the track
        has a ``score`` key, each release similarity is multiplied by
        ``track['score'] / 100``.

        Returns a ``SimMatchTrack`` for the best-scoring release, or one
        with no release or release group when the track has no releases.
        """
        scored = []

        if 'title' in self:
            own_title = self['title']
            scored.append((similarity2(own_title, track.get('title', '')), weights["title"]))

        if 'artist' in self:
            own_artist = self['artist']
            credited = artist_credit_from_node(track.get('artist-credit', []))[0]
            scored.append((similarity2(own_artist, credited), weights["artist"]))

        own_length = self.length
        if own_length > 0 and 'length' in track:
            track_length = track['length']
            scored.append((self.length_score(own_length, track_length), weights["length"]))

        releases = track['releases'] if "releases" in track else []

        if not releases:
            return SimMatchTrack(
                similarity=linear_combination_of_weights(scored),
                releasegroup=None,
                release=None,
                track=track)

        best = SimMatchTrack(similarity=-1, releasegroup=None, release=None, track=None)
        for release in releases:
            release_parts = self.compare_to_release_parts(release, weights)
            candidate = linear_combination_of_weights(scored + release_parts)
            if 'score' in track:
                candidate *= track['score'] / 100
            if candidate > best.similarity:
                if "release-group" in release:
                    release_group = release['release-group']
                else:
                    release_group = None
                best = SimMatchTrack(similarity=candidate, releasegroup=release_group,
                                     release=release, track=track)
        return best
Ejemplo n.º 11
0
    def compare_to_release_parts(self, release, weights):
        """
        Build the list of ``(score, weight)`` parts for comparing this
        metadata to a release given as a mapping.

        Parts cover album title, album artist (only when ``weights`` has an
        ``albumartist`` entry), track count, preferred countries, preferred
        formats and, when ``weights`` has a ``releasetype`` entry, release
        type scores. A final ``(1.0, 6)`` part is added when the release id
        is in its release group's ``loaded_albums``.
        """
        scored = []
        if "album" in self:
            release_title = release['title']
            scored.append((similarity2(self["album"], release_title), weights["album"]))

        if "albumartist" in self and "albumartist" in weights:
            own_artist = self["albumartist"]
            credited = artist_credit_from_node(release['artist-credit'])[0]
            scored.append((similarity2(own_artist, credited), weights["albumartist"]))

        try:
            own_count = int(self["totaltracks"])
            release_count = release['track-count']
            if own_count > release_count:
                count_score = 0.0
            elif own_count < release_count:
                count_score = 0.3
            else:
                count_score = 1.0
            scored.append((count_score, weights["totaltracks"]))
        except (ValueError, KeyError):
            # A missing or non-numeric count leaves this part out.
            pass

        preferred_countries = config.setting["preferred_release_countries"]
        weights_from_preferred_countries(
            scored, release, preferred_countries, weights["releasecountry"])

        preferred_formats = config.setting["preferred_release_formats"]
        weights_from_preferred_formats(
            scored, release, preferred_formats, weights["format"])

        if "releasetype" in weights:
            type_scores = config.setting["release_type_scores"]
            weights_from_release_type_scores(
                scored, release, type_scores, weights["releasetype"])

        release_group = QObject.tagger.get_release_group_by_id(
            release['release-group']['id'])
        if release['id'] in release_group.loaded_albums:
            scored.append((1.0, 6))

        return scored
Ejemplo n.º 12
0
    def compare_to_track(self, track, weights):
        """
        Compare this metadata to a MusicBrainz track node.

        Title, artist and length parts are combined with the parts from
        ``self.compare_to_release_parts`` for each release under
        ``track.release_list[0]``, using
        ``linear_combination_of_weights``.

        Returns ``(similarity, release_group, release, track)`` for the
        best-scoring release, or ``(similarity, None, None, track)`` when
        the track lists no releases.
        """
        scored = []

        if 'title' in self:
            title_score = similarity2(self['title'], track.title[0].text)
            scored.append((title_score, weights["title"]))

        if 'artist' in self:
            own_artist = self['artist']
            credited = artist_credit_from_node(track.artist_credit[0])[0]
            scored.append((similarity2(own_artist, credited), weights["artist"]))

        own_length = self.length
        if own_length > 0 and 'length' in track.children:
            delta = abs(own_length - int(track.length[0].text))
            # Differences of 30000 or more score 0.0.
            scored.append((1.0 - min(delta, 30000) / 30000.0, weights["length"]))

        releases = []
        if "release_list" in track.children:
            if "release" in track.release_list[0].children:
                releases = track.release_list[0].release

        if not releases:
            return (linear_combination_of_weights(scored), None, None, track)

        best = (-1, )
        for release in releases:
            release_parts = self.compare_to_release_parts(release, weights)
            candidate = linear_combination_of_weights(scored + release_parts)
            if candidate > best[0]:
                if "release_group" in release.children:
                    release_group = release.release_group[0]
                else:
                    release_group = None
                best = (candidate, release_group, release, track)

        return best
Ejemplo n.º 13
0
    def compare_to_track(self, track, weights):
        """
        Compare this metadata to a track given as a mapping.

        Title, artist and length parts are combined with the parts from
        ``self.compare_to_release_parts`` for each entry in
        ``track['releases']``, using ``linear_combination_of_weights``.

        Returns ``(similarity, release_group, release, track)`` for the
        best-scoring release, or ``(similarity, None, None, track)`` when
        the track has no releases.
        """
        scored = []

        if 'title' in self:
            title_score = similarity2(self['title'], track['title'])
            scored.append((title_score, weights["title"]))

        if 'artist' in self:
            own_artist = self['artist']
            credited = artist_credit_from_node(track['artist-credit'])[0]
            scored.append((similarity2(own_artist, credited), weights["artist"]))

        own_length = self.length
        if own_length > 0 and 'length' in track:
            delta = abs(own_length - track['length'])
            # Differences of 30000 or more score 0.0.
            scored.append((1.0 - min(delta, 30000) / 30000.0, weights["length"]))

        releases = track['releases'] if "releases" in track else []

        if not releases:
            return (linear_combination_of_weights(scored), None, None, track)

        best = (-1,)

        for release in releases:
            release_parts = self.compare_to_release_parts(release, weights)
            candidate = linear_combination_of_weights(scored + release_parts)
            if candidate > best[0]:
                if "release-group" in release:
                    release_group = release['release-group']
                else:
                    release_group = None
                best = (candidate, release_group, release, track)
        return best
Ejemplo n.º 14
0
    def compare_to_track(self, track, weights):
        """
        Compare this metadata to a MusicBrainz track node.

        Collects title, artist and length parts, then scores every release
        under ``track.release_list[0]`` by adding the parts from
        ``self.compare_to_release_parts`` and combining them with
        ``linear_combination_of_weights``.

        Returns ``(similarity, release_group, release, track)`` for the
        best-scoring release, or ``(similarity, None, None, track)`` when
        the track lists no releases.
        """
        scored = []

        if 'title' in self:
            own_title = self['title']
            track_title = track.title[0].text
            scored.append((similarity2(own_title, track_title), weights["title"]))

        if 'artist' in self:
            own_artist = self['artist']
            credited = artist_credit_from_node(track.artist_credit[0])[0]
            scored.append((similarity2(own_artist, credited), weights["artist"]))

        own_length = self.length
        if own_length > 0 and 'length' in track.children:
            track_length = int(track.length[0].text)
            # Differences of 30000 or more score 0.0.
            capped = min(abs(own_length - track_length), 30000)
            scored.append((1.0 - capped / 30000.0, weights["length"]))

        has_releases = ("release_list" in track.children
                        and "release" in track.release_list[0].children)
        releases = track.release_list[0].release if has_releases else []

        if not releases:
            return (linear_combination_of_weights(scored), None, None, track)

        best = (-1,)
        for release in releases:
            release_parts = self.compare_to_release_parts(release, weights)
            candidate = linear_combination_of_weights(scored + release_parts)
            if candidate > best[0]:
                if "release_group" in release.children:
                    release_group = release.release_group[0]
                else:
                    release_group = None
                best = (candidate, release_group, release, track)

        return best
Ejemplo n.º 15
0
def get_close_matches(search_string, search_set, max_results, min_score):
    """Return up to ``max_results`` items of ``search_set`` ranked by similarity.

    Each item is scored with ``similarity2(search_string, item)``; only
    items scoring strictly above ``min_score`` are kept. The result is
    ordered from highest to lowest ``[score, item]`` pair.
    """
    scored = ([similarity2(search_string, candidate), candidate]
              for candidate in search_set)
    ascending = sorted(pair for pair in scored if pair[0] > min_score)

    best = []
    for _score, candidate in reversed(ascending):
        if len(best) >= max_results:
            break
        best.append(candidate)
    return best
Ejemplo n.º 16
0
    def compare(self, other):
        """
        Return the weighted similarity between this metadata and ``other``.

        Collects ``(score, weight)`` parts and combines them with
        ``linear_combination_of_weights``:

          * length, weight 8, when both lengths are truthy;
          * every tag in ``self.__weights`` that is truthy on both sides.
            ``tracknumber`` and ``totaltracks`` score 1.0 when equal and
            0.0 otherwise; other tags are scored with ``similarity2``.
        """
        parts = []

        if self.length and other.length:
            # Differences of 30000 or more score 0.0; identical lengths 1.0.
            score = 1.0 - min(abs(self.length - other.length), 30000) / 30000.0
            parts.append((score, 8))

        for name, weight in self.__weights:
            a = self[name]
            b = other[name]
            if a and b:
                if name in ('tracknumber', 'totaltracks'):
                    try:
                        ia = int(a)
                        ib = int(b)
                    except ValueError:
                        # Non-numeric values are compared as given.
                        ia = a
                        ib = b
                    # ``!=`` instead of ``cmp``, which Python 3 removed.
                    score = 1.0 - int(ia != ib)
                else:
                    score = similarity2(a, b)
                parts.append((score, weight))

        return linear_combination_of_weights(parts)
Ejemplo n.º 17
0
 def test_2(self):
     """Expect 1.0 for inputs differing only in letter case and separators."""
     first, second = "a b c", "A,B•C"
     self.assertEqual(similarity2(first, second), 1.0)
Ejemplo n.º 18
0
 def test_4(self):
     """Expect 1.0 when both inputs hold the same words in a different order."""
     first, second = "a b c", "c a b"
     self.assertEqual(similarity2(first, second), 1.0)
Ejemplo n.º 19
0
 def test_1(self):
     """Expect 1.0 when a string is compared with itself."""
     text = "a b c"
     self.assertEqual(similarity2(text, text), 1.0)
Ejemplo n.º 20
0
 def test_not_a(self):
     """Expect 0.0 when the first input is the empty string."""
     first, second = "", "def"
     self.assertEqual(similarity2(first, second), 0.0)
Ejemplo n.º 21
0
 def test_a_b_totally_different(self):
     """Expect 0.0 for "abc" against "def"."""
     first, second = "abc", "def"
     self.assertEqual(similarity2(first, second), 0.0)
Ejemplo n.º 22
0
 def test_empty_lists(self):
     """Expect 0.0 when both inputs contain only spaces."""
     first, second = " ", "  "
     self.assertEqual(similarity2(first, second), 0.0)
Ejemplo n.º 23
0
 def test_not_a_and_not_b(self):
     """Expect 0.0 when both inputs are empty strings."""
     first, second = "", ""
     self.assertEqual(similarity2(first, second), 0.0)
Ejemplo n.º 24
0
def similarity(a, b):
    """Return ``similarity2`` of the ``to_unicode`` forms of ``a`` and ``b``,
    multiplied by 100 and truncated with ``int()``."""
    ratio = similarity2(to_unicode(a), to_unicode(b))
    return int(ratio * 100)
Ejemplo n.º 25
0
 def test_5(self):
     """Expect about 0.6 (one decimal place) when one of three words differs."""
     first, second = "a b c", "a b d"
     self.assertAlmostEqual(similarity2(first, second), 0.6, 1)
Ejemplo n.º 26
0
def similarity(a, b):
    """Compare ``a`` and ``b`` with ``similarity2`` after ``to_unicode`` and
    return the result times 100, truncated with ``int()``."""
    left = to_unicode(a)
    right = to_unicode(b)
    return int(similarity2(left, right) * 100)
Ejemplo n.º 27
0
        if m:
            try:
                original = shs.lookup('recording', int(m.group(1)))
                if 'performer' in original:
                    shs_artists.append(original['performer']['artist'])
            except ValueError:
                pass
            except urllib2.HTTPError:
                pass
    for shs_artist in shs_artists:
        shs_artist_name = mangle_name(re.sub(' \[\d+\]$', '', shs_artist['commonName']))
        mb_artist_name = mangle_name(artist['name'])
        if shs_artist_name == mb_artist_name:
            artist_uri = shs_artist['uri']
            break
        elif similarity2(to_unicode(shs_artist_name), to_unicode(mb_artist_name)) > 0.85:
            print " * '%s' has a similarity of %.2f" % (shs_artist['commonName'], similarity2(to_unicode(shs_artist_name), to_unicode(mb_artist_name)))
            artist_uri = shs_artist['uri']
            break

    if artist_uri:
        matched_artists.add(artist['gid'])
        colored_out(bcolors.HEADER, ' * using %s, found artist SHS URL: %s' % (artist['shs_url'], artist_uri))
        edit_note = 'Guessing artist SecondHandSongs URL from work https://musicbrainz.org/work/%s linked to %s' % (artist['work_gid'], artist['shs_url'])
        out(' * edit note: %s' % (edit_note,))
        
        mb.add_url('artist', artist['gid'], str(307), artist_uri, edit_note)
    else:
        colored_out(bcolors.NONE, ' * using %s, no artist SHS URL has been found' % (artist['shs_url'],))

    if artist['processed'] is None and artist['gid'] not in seen_artists:
Ejemplo n.º 28
0
    def compare_to_release(self, release, weights, config):
        """
        Collect the weighted score parts for comparing this metadata to a
        MusicBrainz release node.

        Parameters:
          * release -- release node; read through ``title``,
            ``medium_list``, ``country``, ``release_group`` and ``children``
          * weights -- mapping from part name to weight. A ``title`` key
            selects which track count is read; a ``releasetype`` key
            enables the release type part.
          * config  -- object whose ``setting`` mapping provides
            ``preferred_release_countries``, ``preferred_release_formats``
            and ``release_type_scores``

        Returns ``(total, parts)``. ``parts`` is a list of
        ``(score, weight)`` tuples; ``total`` is the sum of the weights of
        the album, totaltracks and releasetype parts that were added.

        A ``totaltracks`` tag that cannot be parsed as an integer is left
        out instead of raising ``ValueError``.
        """
        total = 0.0
        parts = []

        if "album" in self:
            b = release.title[0].text
            parts.append((similarity2(self["album"], b), weights["album"]))
            total += weights["album"]

        if "totaltracks" in self:
            try:
                a = int(self["totaltracks"])
            except ValueError:
                # A free-form tag value cannot be compared numerically.
                pass
            else:
                if "title" in weights:
                    b = int(release.medium_list[0].medium[0].track_list[0].count)
                else:
                    b = int(release.medium_list[0].track_count[0].text)
                if a > b:
                    score = 0.0
                elif a < b:
                    score = 0.3
                else:
                    score = 1.0
                parts.append((score, weights["totaltracks"]))
                total += weights["totaltracks"]

        preferred_countries = config.setting["preferred_release_countries"].split("  ")
        preferred_formats = config.setting["preferred_release_formats"].split("  ")

        # NOTE(review): the releasecountry and format weights below are not
        # added to ``total`` -- confirm that this is intended.
        total_countries = len(preferred_countries)
        if total_countries:
            score = 0.0
            if "country" in release.children:
                try:
                    # Earlier entries in the preference list score higher.
                    i = preferred_countries.index(release.country[0].text)
                    score = float(total_countries - i) / float(total_countries)
                except ValueError:
                    pass
            parts.append((score, weights["releasecountry"]))

        total_formats = len(preferred_formats)
        if total_formats:
            score = 0.0
            subtotal = 0
            for medium in release.medium_list[0].medium:
                if "format" in medium.children:
                    try:
                        i = preferred_formats.index(medium.format[0].text)
                        score += float(total_formats - i) / float(total_formats)
                    except ValueError:
                        pass
                    subtotal += 1
            # Average over the media that declare a format.
            if subtotal > 0:
                score /= subtotal
            parts.append((score, weights["format"]))

        if "releasetype" in weights:
            type_scores = load_release_type_scores(config.setting["release_type_scores"])
            if 'release_group' in release.children and 'type' in release.release_group[0].attribs:
                release_type = release.release_group[0].type
                score = type_scores.get(release_type, type_scores.get('Other', 0.5))
            else:
                score = 0.0
            parts.append((score, weights["releasetype"]))
            total += weights["releasetype"]

        return (total, parts)
Ejemplo n.º 29
0
 def test_7(self):
     """Expect 0.0 for "abc" against "def"."""
     first, second = "abc", "def"
     self.assertEqual(similarity2(first, second), 0.0)
Ejemplo n.º 30
0
 def test_6(self):
     """Expect about 0.3 (one decimal place) when two of three words differ."""
     first, second = "a b c", "a f d"
     self.assertAlmostEqual(similarity2(first, second), 0.3, 1)
Ejemplo n.º 31
0
    def compare_to_release_parts(self, release, weights):
        """
        Build the list of ``(score, weight)`` parts for comparing this
        metadata to a MusicBrainz release node.

        Parts cover album title, album artist (only when ``weights`` has an
        ``albumartist`` entry), track count, preferred release country,
        preferred media formats and, when ``weights`` has a ``releasetype``
        entry, the release group type. A final ``(1.0, 6)`` part is added
        when the release id is in its release group's ``loaded_albums``.
        """
        scored = []

        if "album" in self:
            release_title = release.title[0].text
            scored.append((similarity2(self["album"], release_title), weights["album"]))

        if "albumartist" in self and "albumartist" in weights:
            own_artist = self["albumartist"]
            credited = artist_credit_from_node(release.artist_credit[0])[0]
            scored.append((similarity2(own_artist, credited), weights["albumartist"]))

        if "totaltracks" in self:
            try:
                own_count = int(self["totaltracks"])
            except ValueError:
                # A non-numeric tag leaves this part out.
                own_count = None
            if own_count is not None:
                if "title" in weights:
                    release_count = int(release.medium_list[0].medium[0].track_list[0].count)
                else:
                    release_count = int(release.medium_list[0].track_count[0].text)
                if own_count > release_count:
                    count_score = 0.0
                elif own_count < release_count:
                    count_score = 0.3
                else:
                    count_score = 1.0
                scored.append((count_score, weights["totaltracks"]))

        preferred_countries = config.setting["preferred_release_countries"]
        preferred_formats = config.setting["preferred_release_formats"]

        country_count = len(preferred_countries)
        if country_count:
            country_score = 0.0
            if "country" in release.children:
                try:
                    rank = preferred_countries.index(release.country[0].text)
                except ValueError:
                    pass
                else:
                    # Earlier entries in the preference list score higher.
                    country_score = float(country_count - rank) / float(country_count)
            scored.append((country_score, weights["releasecountry"]))

        format_count = len(preferred_formats)
        if format_count:
            format_score = 0.0
            media_with_format = 0
            for medium in release.medium_list[0].medium:
                if "format" not in medium.children:
                    continue
                try:
                    rank = preferred_formats.index(medium.format[0].text)
                except ValueError:
                    pass
                else:
                    format_score += float(format_count - rank) / float(format_count)
                media_with_format += 1
            # Average over the media that declare a format.
            if media_with_format > 0:
                format_score /= media_with_format
            scored.append((format_score, weights["format"]))

        if "releasetype" in weights:
            type_scores = dict(config.setting["release_type_scores"])
            has_type = ('release_group' in release.children
                        and 'type' in release.release_group[0].attribs)
            if has_type:
                release_type = release.release_group[0].type
                type_score = type_scores.get(release_type, type_scores.get('Other', 0.5))
            else:
                type_score = 0.0
            scored.append((type_score, weights["releasetype"]))

        release_group = QObject.tagger.get_release_group_by_id(release.release_group[0].id)
        if release.id in release_group.loaded_albums:
            scored.append((1.0, 6))

        return scored
Ejemplo n.º 32
0
    def compare_to_release_parts(self, release, weights):
        """
        Build the list of ``(score, weight)`` parts for comparing this
        metadata to a release given as a mapping.

        Parts cover album title, album artist (only when ``weights`` has an
        ``albumartist`` entry), track count, preferred release country,
        preferred media formats and, when ``weights`` has a ``releasetype``
        entry, the release group types. A final ``(1.0, 6)`` part is added
        when the release id is in its release group's ``loaded_albums``.
        """
        scored = []
        if "album" in self:
            release_title = release['title']
            scored.append((similarity2(self["album"], release_title), weights["album"]))

        if "albumartist" in self and "albumartist" in weights:
            own_artist = self["albumartist"]
            credited = artist_credit_from_node(release['artist-credit'])[0]
            scored.append((similarity2(own_artist, credited), weights["albumartist"]))

        try:
            own_count = int(self["totaltracks"])
        except (ValueError, KeyError):
            # A missing or non-numeric tag leaves this part out.
            pass
        else:
            try:
                if "title" in weights:
                    release_count = release['media'][0]['track-count']
                else:
                    release_count = release['track-count']
            except KeyError:
                release_count = 0
            if own_count > release_count:
                count_score = 0.0
            elif own_count < release_count:
                count_score = 0.3
            else:
                count_score = 1.0
            scored.append((count_score, weights["totaltracks"]))

        preferred_countries = config.setting["preferred_release_countries"]
        preferred_formats = config.setting["preferred_release_formats"]

        country_count = len(preferred_countries)
        if country_count:
            country_score = 0.0
            if "country" in release:
                try:
                    rank = preferred_countries.index(release['country'])
                except ValueError:
                    pass
                else:
                    # Earlier entries in the preference list score higher.
                    country_score = float(country_count - rank) / float(country_count)
            scored.append((country_score, weights["releasecountry"]))

        format_count = len(preferred_formats)
        if format_count and 'media' in release:
            format_score = 0.0
            media_with_format = 0
            for medium in release['media']:
                if "format" not in medium:
                    continue
                try:
                    rank = preferred_formats.index(medium['format'])
                except ValueError:
                    pass
                else:
                    format_score += float(format_count - rank) / float(format_count)
                media_with_format += 1
            # Average over the media that declare a format.
            if media_with_format > 0:
                format_score /= media_with_format
            scored.append((format_score, weights["format"]))

        if "releasetype" in weights:
            # Release type part: the configured scores of the primary type
            # and any secondary types are averaged. A type missing from the
            # configuration uses the 'Other' score, or 0.5 when 'Other' is
            # not configured. Without a release group carrying a primary
            # type the score stays 0.0.
            type_scores = dict(config.setting["release_type_scores"])
            type_score = 0.0
            fallback = type_scores.get('Other', 0.5)
            if 'release-group' in release and 'primary-type' in release['release-group']:
                found = [release['release-group']['primary-type']]
                if 'secondary-types' in release['release-group']:
                    found += release['release-group']['secondary-types']
                for type_name in found:
                    type_score += type_scores.get(type_name, fallback)
                type_score /= len(found)
            scored.append((type_score, weights["releasetype"]))

        release_group = QObject.tagger.get_release_group_by_id(release['release-group']['id'])
        if release['id'] in release_group.loaded_albums:
            scored.append((1.0, 6))

        return scored
Ejemplo n.º 33
0
 def test_full_match(self):
     """Comparing a string with itself must give a similarity of exactly 1.0."""
     text = "a b c"
     self.assertEqual(similarity2(text, text), 1.0)
Ejemplo n.º 34
0
    def compare_to_release(self, release, weights, return_parts=False):
        """
        Compare metadata to a MusicBrainz release. Produces a probability as a
        linear combination of weights that the metadata matches a certain album.

        :param release: release node, read through ``title``,
            ``artist_credit``, ``medium_list``, ``country``,
            ``release_group``, ``children`` and ``id``.
        :param weights: mapping from part name to weight. ``album`` and
            ``totaltracks`` are read when the matching tag is present,
            ``albumartist`` and ``releasetype`` only when they are keys of
            ``weights``, ``releasecountry`` and ``format`` only when the
            corresponding preference setting is non-empty. The presence of a
            ``title`` key selects where the release track count is read from.
        :param return_parts: when true, return the raw ``(total, parts)``
            instead of the combined score.
        :return: ``(total, parts)`` with ``parts`` a list of
            ``(score, weight)`` tuples, or ``(similarity, release)``.
        """
        total = 0.0
        parts = []

        if "album" in self:
            b = release.title[0].text
            parts.append((similarity2(self["album"], b), weights["album"]))
            total += weights["album"]

        if "albumartist" in self and "albumartist" in weights:
            a = self["albumartist"]
            b = artist_credit_from_node(release.artist_credit[0])[0]
            parts.append((similarity2(a, b), weights["albumartist"]))
            total += weights["albumartist"]

        if "totaltracks" in self:
            a = int(self["totaltracks"])
            if "title" in weights:
                b = int(release.medium_list[0].medium[0].track_list[0].count)
            else:
                b = int(release.medium_list[0].track_count[0].text)
            # More tracks locally than on the release rules it out; fewer is
            # only a weak match; equal is a full match.
            score = 0.0 if a > b else 0.3 if a < b else 1.0
            parts.append((score, weights["totaltracks"]))
            total += weights["totaltracks"]

        # The settings are strings separated by two spaces. Splitting an empty
        # setting yields [''], which would defeat the "no preference" guards
        # below, so blank entries are dropped.
        preferred_countries = [
            entry for entry
            in config.setting["preferred_release_countries"].split("  ")
            if entry
        ]
        preferred_formats = [
            entry for entry
            in config.setting["preferred_release_formats"].split("  ")
            if entry
        ]

        # Country, format and the loaded-album bonus are appended to parts but
        # never added to total, so they are not part of the normalisation.
        total_countries = len(preferred_countries)
        if total_countries:
            score = 0.0
            if "country" in release.children:
                try:
                    # Earlier positions in the preference list score higher.
                    i = preferred_countries.index(release.country[0].text)
                    score = float(total_countries - i) / float(total_countries)
                except ValueError:
                    pass
            parts.append((score, weights["releasecountry"]))

        total_formats = len(preferred_formats)
        if total_formats:
            score = 0.0
            subtotal = 0
            for medium in release.medium_list[0].medium:
                if "format" in medium.children:
                    try:
                        i = preferred_formats.index(medium.format[0].text)
                        score += float(total_formats -
                                       i) / float(total_formats)
                    except ValueError:
                        pass
                    subtotal += 1
            # Average over the media that declare a format.
            if subtotal > 0:
                score /= subtotal
            parts.append((score, weights["format"]))

        if "releasetype" in weights:
            type_scores = load_release_type_scores(
                config.setting["release_type_scores"])
            if 'release_group' in release.children and 'type' in release.release_group[
                    0].attribs:
                release_type = release.release_group[0].type
                score = type_scores.get(release_type,
                                        type_scores.get('Other', 0.5))
            else:
                score = 0.0
            parts.append((score, weights["releasetype"]))
            total += weights["releasetype"]

        rg = QObject.tagger.get_release_group_by_id(
            release.release_group[0].id)
        if release.id in rg.loaded_albums:
            parts.append((1.0, 6))

        if return_parts:
            return (total, parts)

        # Nothing normalising was compared: report no similarity instead of
        # dividing by zero.
        if not total:
            return (0.0, release)

        # Plain loop instead of reduce(), which is not a builtin on Python 3.
        similarity = 0.0
        for score, weight in parts:
            similarity += score * weight / total
        return (similarity, release)
        if m:
            try:
                original = shs.lookup('recording', int(m.group(1)))
                if 'performer' in original:
                    shs_artists.append(original['performer']['artist'])
            except ValueError:
                pass
            except urllib2.HTTPError:
                pass
    for shs_artist in shs_artists:
        shs_artist_name = mangle_name(re.sub(' \[\d+\]$', '', shs_artist['commonName']))
        mb_artist_name = mangle_name(artist['name'])
        if shs_artist_name == mb_artist_name:
            artist_uri = shs_artist['uri']
            break
        elif similarity2(to_unicode(shs_artist_name), to_unicode(mb_artist_name)) > 0.85:
            print " * '%s' has a similarity of %.2f" % (shs_artist['commonName'], similarity2(to_unicode(shs_artist_name), to_unicode(mb_artist_name)))
            artist_uri = shs_artist['uri']
            break

    if artist_uri:
        matched_artists.add(artist['gid'])
        colored_out(bcolors.HEADER, ' * using %s, found artist SHS URL: %s' % (artist['shs_url'], artist_uri))
        edit_note = 'Guessing artist SecondHandSongs URL from work http://musicbrainz.org/work/%s linked to %s' % (artist['work_gid'], artist['shs_url'])
        out(' * edit note: %s' % (edit_note,))
        
        mb.add_url('artist', artist['gid'], str(307), artist_uri, edit_note)
    else:
        colored_out(bcolors.NONE, ' * using %s, no artist SHS URL has been found' % (artist['shs_url'],))

    if artist['processed'] is None and artist['gid'] not in seen_artists:
Ejemplo n.º 36
0
 def test_a_longer_than_b(self):
     """A longer first string against a shorter one must match 0.88 to one decimal place."""
     longer, shorter = "a b c d", "a d c"
     self.assertAlmostEqual(similarity2(longer, shorter), 0.88, 1)
Ejemplo n.º 37
0
    def compare_to_release(self, release, weights, config):
        """
        Compare metadata to a MusicBrainz release and collect weighted scores.

        :param release: release node, read through ``title``, ``medium_list``,
            ``country``, ``release_group`` and ``children``.
        :param weights: mapping from part name to weight. ``album`` and
            ``totaltracks`` are read when the matching tag is present,
            ``releasetype`` only when it is a key of ``weights``,
            ``releasecountry`` and ``format`` only when the corresponding
            preference setting is non-empty. The presence of a ``title`` key
            selects where the release track count is read from.
        :param config: object whose ``setting`` mapping provides
            ``preferred_release_countries``, ``preferred_release_formats``
            (strings separated by two spaces) and ``release_type_scores``.
        :return: ``(total, parts)`` where ``parts`` is a list of
            ``(score, weight)`` tuples and ``total`` is the sum of the album,
            totaltracks and releasetype weights that were used.
        """
        total = 0.0
        parts = []

        if "album" in self:
            b = release.title[0].text
            parts.append((similarity2(self["album"], b), weights["album"]))
            total += weights["album"]

        if "totaltracks" in self:
            a = int(self["totaltracks"])
            if "title" in weights:
                b = int(release.medium_list[0].medium[0].track_list[0].count)
            else:
                b = int(release.medium_list[0].track_count[0].text)
            # More tracks locally than on the release rules it out; fewer is
            # only a weak match; equal is a full match.
            if a > b:
                score = 0.0
            elif a < b:
                score = 0.3
            else:
                score = 1.0
            parts.append((score, weights["totaltracks"]))
            total += weights["totaltracks"]

        # Splitting an empty setting yields [''], which would defeat the
        # "no preference" guards below, so blank entries are dropped.
        preferred_countries = [
            entry for entry
            in config.setting["preferred_release_countries"].split("  ")
            if entry
        ]
        preferred_formats = [
            entry for entry
            in config.setting["preferred_release_formats"].split("  ")
            if entry
        ]

        # Country and format parts are appended without being added to total.
        total_countries = len(preferred_countries)
        if total_countries:
            score = 0.0
            if "country" in release.children:
                try:
                    # Earlier positions in the preference list score higher.
                    i = preferred_countries.index(release.country[0].text)
                    score = float(total_countries - i) / float(total_countries)
                except ValueError:
                    pass
            parts.append((score, weights["releasecountry"]))

        total_formats = len(preferred_formats)
        if total_formats:
            score = 0.0
            subtotal = 0
            for medium in release.medium_list[0].medium:
                if "format" in medium.children:
                    try:
                        i = preferred_formats.index(medium.format[0].text)
                        score += float(total_formats - i) / float(total_formats)
                    except ValueError:
                        pass
                    subtotal += 1
            # Average over the media that declare a format.
            if subtotal > 0:
                score /= subtotal
            parts.append((score, weights["format"]))

        if "releasetype" in weights:
            type_scores = load_release_type_scores(config.setting["release_type_scores"])
            if 'release_group' in release.children and 'type' in release.release_group[0].attribs:
                release_type = release.release_group[0].type
                score = type_scores.get(release_type, type_scores.get('Other', 0.5))
            else:
                score = 0.0
            parts.append((score, weights["releasetype"]))
            total += weights["releasetype"]

        return (total, parts)
Ejemplo n.º 38
0
    def compare_to_release(self, release, weights, return_parts=False):
        """
        Compare metadata to a MusicBrainz release. Produces a probability as a
        linear combination of weights that the metadata matches a certain album.

        :param release: release node, read through ``title``,
            ``artist_credit``, ``medium_list``, ``country``,
            ``release_group``, ``children`` and ``id``.
        :param weights: mapping from part name to weight. ``album`` and
            ``totaltracks`` are read when the matching tag is present,
            ``albumartist`` and ``releasetype`` only when they are keys of
            ``weights``, ``releasecountry`` and ``format`` only when the
            corresponding preference setting is non-empty. The presence of a
            ``title`` key selects where the release track count is read from.
        :param return_parts: when true, return the raw ``(total, parts)``
            instead of the combined score.
        :return: ``(total, parts)`` with ``parts`` a list of
            ``(score, weight)`` tuples, or ``(similarity, release)``.
        """
        total = 0.0
        parts = []

        if "album" in self:
            b = release.title[0].text
            parts.append((similarity2(self["album"], b), weights["album"]))
            total += weights["album"]

        if "albumartist" in self and "albumartist" in weights:
            a = self["albumartist"]
            b = artist_credit_from_node(release.artist_credit[0])[0]
            parts.append((similarity2(a, b), weights["albumartist"]))
            total += weights["albumartist"]

        if "totaltracks" in self:
            a = int(self["totaltracks"])
            if "title" in weights:
                b = int(release.medium_list[0].medium[0].track_list[0].count)
            else:
                b = int(release.medium_list[0].track_count[0].text)
            # More tracks locally than on the release rules it out; fewer is
            # only a weak match; equal is a full match.
            score = 0.0 if a > b else 0.3 if a < b else 1.0
            parts.append((score, weights["totaltracks"]))
            total += weights["totaltracks"]

        preferred_countries = config.setting["preferred_release_countries"]
        preferred_formats = config.setting["preferred_release_formats"]

        # Country, format and the loaded-album bonus are appended to parts but
        # never added to total, so they are not part of the normalisation.
        total_countries = len(preferred_countries)
        if total_countries:
            score = 0.0
            if "country" in release.children:
                try:
                    # Earlier positions in the preference list score higher.
                    i = preferred_countries.index(release.country[0].text)
                    score = float(total_countries - i) / float(total_countries)
                except ValueError:
                    pass
            parts.append((score, weights["releasecountry"]))

        total_formats = len(preferred_formats)
        if total_formats:
            score = 0.0
            subtotal = 0
            for medium in release.medium_list[0].medium:
                if "format" in medium.children:
                    try:
                        i = preferred_formats.index(medium.format[0].text)
                        score += float(total_formats - i) / float(total_formats)
                    except ValueError:
                        pass
                    subtotal += 1
            # Average over the media that declare a format.
            if subtotal > 0:
                score /= subtotal
            parts.append((score, weights["format"]))

        if "releasetype" in weights:
            type_scores = load_release_type_scores(config.setting["release_type_scores"])
            if 'release_group' in release.children and 'type' in release.release_group[0].attribs:
                release_type = release.release_group[0].type
                score = type_scores.get(release_type, type_scores.get('Other', 0.5))
            else:
                score = 0.0
            parts.append((score, weights["releasetype"]))
            total += weights["releasetype"]

        rg = QObject.tagger.get_release_group_by_id(release.release_group[0].id)
        if release.id in rg.loaded_albums:
            parts.append((1.0, 6))

        if return_parts:
            return (total, parts)

        # Nothing normalising was compared: report no similarity instead of
        # dividing by zero.
        if not total:
            return (0.0, release)

        # Plain loop instead of reduce(), which is not a builtin on Python 3.
        similarity = 0.0
        for score, weight in parts:
            similarity += score * weight / total
        return (similarity, release)
Ejemplo n.º 39
0
    def compare_to_release_parts(self, release, weights):
        """
        Collect the ``(score, weight)`` parts that compare this metadata
        against a release given as a mapping.

        Parts are appended for album title, album artist (only when weighted),
        track count, release date, and then by the country / format /
        release-type helper functions. A final ``(1.0, 6)`` part is added when
        the release id is among its release group's ``loaded_albums``.
        """
        parts = []

        if "album" in self:
            release_title = release['title']
            parts.append((similarity2(self["album"], release_title), weights["album"]))

        if "albumartist" in self and "albumartist" in weights:
            tagged_artist = self["albumartist"]
            credited_artist = artist_credit_from_node(release['artist-credit'])[0]
            parts.append((similarity2(tagged_artist, credited_artist), weights["albumartist"]))

        # A missing/unparsable tag, a release without a track count, or a
        # missing weight all silently skip this part.
        try:
            tagged_count = int(self["totaltracks"])
            release_count = release['track-count']
            if tagged_count > release_count:
                track_score = 0.0
            elif tagged_count < release_count:
                track_score = 0.3
            else:
                track_score = 1.0
            parts.append((track_score, weights["totaltracks"]))
        except (ValueError, KeyError):
            pass

        # Date logic: pick which date-match factor applies; None keeps 0.0.
        factor_key = None
        if "date" not in release or release['date'] == '':
            # The release has no date; all else equal it is not preferred.
            factor_key = 'no_release_date'
        elif "date" not in self:
            # The release is dated but the metadata is not.
            factor_key = 'exists_vs_null'
        else:
            release_date = release['date']
            metadata_date = self['date']
            if release_date == metadata_date:
                # Both dates are identical.
                factor_key = 'exact'
            else:
                release_year = extract_year_from_date(release_date)
                metadata_year = None
                if release_year is not None:
                    metadata_year = extract_year_from_date(metadata_date)
                if metadata_year is not None:
                    if release_year == metadata_year:
                        # Same year.
                        factor_key = 'year'
                    elif abs(release_year - metadata_year) <= 2:
                        # Within two years of each other.
                        factor_key = 'close_year'
                    else:
                        # A differing date; an unknown date could still be
                        # right, a wrong one cannot.
                        factor_key = 'differed'

        if factor_key is None:
            date_match_factor = 0.0
        else:
            date_match_factor = self.__date_match_factors[factor_key]
        parts.append((date_match_factor, weights['date']))

        config = get_config()
        weights_from_preferred_countries(
            parts,
            release,
            config.setting["preferred_release_countries"],
            weights["releasecountry"],
        )
        weights_from_preferred_formats(
            parts,
            release,
            config.setting["preferred_release_formats"],
            weights["format"],
        )
        if "releasetype" in weights:
            weights_from_release_type_scores(
                parts,
                release,
                config.setting["release_type_scores"],
                weights["releasetype"],
            )

        release_group = QObject.tagger.get_release_group_by_id(release['release-group']['id'])
        if release['id'] in release_group.loaded_albums:
            parts.append((1.0, 6))

        return parts
        if m:
            try:
                original = shs.lookup('recording', int(m.group(1)))
                if 'performer' in original:
                    shs_artists.append(original['performer']['artist'])
            except ValueError:
                pass
            except urllib2.HTTPError:
                pass
    for shs_artist in shs_artists:
        shs_artist_name = mangle_name(re.sub(' \[\d+\]$', '', shs_artist['commonName']))
        mb_artist_name = mangle_name(artist['name'])
        if shs_artist_name == mb_artist_name:
            artist_uri = shs_artist['uri']
            break
        elif similarity2(to_unicode(shs_artist_name), to_unicode(mb_artist_name)) > 0.85:
            print "%s => similarity = %.2f" % (shs_artist['commonName'], similarity2(to_unicode(shs_artist_name), to_unicode(mb_artist_name)))
            artist_uri = shs_artist['uri']
            break

    if artist_uri:
        matched_artists.add(artist['gid'])
        colored_out(bcolors.HEADER, ' * using %s, found artist SHS URL: %s' % (artist['shs_url'], artist_uri))
        edit_note = 'Guessing artist SecondHandSongs URL from work http://musicbrainz.org/work/%s linked to %s' % (artist['work_gid'], artist['shs_url'])
        out(' * edit note: %s' % (edit_note,))
        
        mb.add_url('artist', artist['gid'], str(307), artist_uri, edit_note)
    else:
        colored_out(bcolors.NONE, ' * using %s, no artist SHS URL has been found' % (artist['shs_url'],))

    if artist['processed'] is None and artist['gid'] not in seen_artists:
Ejemplo n.º 41
0
 def test_match_various_separators_2(self):
     """Upper case, commas and bullet characters must still give a similarity of 1.0."""
     plain = "a b c"
     decorated = ",A, B •C•"
     self.assertEqual(similarity2(plain, decorated), 1.0)
Ejemplo n.º 42
0
 def test_3(self):
     """A plain string and its punctuated, upper-cased variant must score 1.0."""
     expected_score = 1.0
     left, right = "a b c", ",A, B •C•"
     self.assertEqual(similarity2(left, right), expected_score)
Ejemplo n.º 43
0
    def compare_to_release_parts(self, release, weights):
        """
        Collect the ``(score, weight)`` parts that compare this metadata
        against a release given as a mapping.

        :param release: release mapping; ``title``, ``artist-credit``,
            ``media``, ``track-count``, ``country``, ``release-group`` and
            ``id`` are read. Missing ``media`` / ``track-count`` entries are
            tolerated.
        :param weights: mapping from part name to weight. ``albumartist`` and
            ``releasetype`` are optional; the presence of a ``title`` key
            selects where the release track count is read from.
        :return: list of ``(score, weight)`` tuples.
        """
        parts = []
        if "album" in self:
            b = release['title']
            parts.append((similarity2(self["album"], b), weights["album"]))

        if "albumartist" in self and "albumartist" in weights:
            a = self["albumartist"]
            b = artist_credit_from_node(release['artist-credit'])[0]
            parts.append((similarity2(a, b), weights["albumartist"]))

        try:
            a = int(self["totaltracks"])
        except (ValueError, KeyError):
            pass
        else:
            # A release without media or track count information is treated
            # as having zero tracks instead of raising.
            try:
                if "title" in weights:
                    b = release['media'][0]['track-count']
                else:
                    b = release['track-count']
            except (KeyError, IndexError):
                b = 0
            # More tracks locally than on the release rules it out; fewer is
            # only a weak match; equal is a full match.
            score = 0.0 if a > b else 0.3 if a < b else 1.0
            parts.append((score, weights["totaltracks"]))

        preferred_countries = config.setting["preferred_release_countries"]
        preferred_formats = config.setting["preferred_release_formats"]

        total_countries = len(preferred_countries)
        if total_countries:
            score = 0.0
            if "country" in release:
                try:
                    # Earlier positions in the preference list score higher.
                    i = preferred_countries.index(release['country'])
                    score = float(total_countries - i) / float(total_countries)
                except ValueError:
                    pass
            parts.append((score, weights["releasecountry"]))

        total_formats = len(preferred_formats)
        # Skip the format part entirely when the release lists no media.
        if total_formats and 'media' in release:
            score = 0.0
            subtotal = 0
            for medium in release['media']:
                if "format" in medium:
                    try:
                        i = preferred_formats.index(medium['format'])
                        score += float(total_formats - i) / float(total_formats)
                    except ValueError:
                        pass
                    subtotal += 1
            # Average over the media that declare a format.
            if subtotal > 0:
                score /= subtotal
            parts.append((score, weights["format"]))

        if "releasetype" in weights:
            type_scores = dict(config.setting["release_type_scores"])
            if 'release-group' in release and 'primary-type' in release['release-group']:
                release_type = release['release-group']['primary-type']
                score = type_scores.get(release_type, type_scores.get('Other', 0.5))
            else:
                score = 0.0
            parts.append((score, weights["releasetype"]))

        rg = QObject.tagger.get_release_group_by_id(release['release-group']['id'])
        if release['id'] in rg.loaded_albums:
            parts.append((1.0, 6))

        return parts
Ejemplo n.º 44
0
    def compare_to_release_parts(self, release, weights):
        """
        Build the list of ``(score, weight)`` parts comparing this metadata
        with a release mapping.

        :param release: release mapping; ``title``, ``artist-credit``,
            ``media``, ``track-count``, ``country``, ``release-group`` and
            ``id`` are read. A release lacking ``media`` or ``track-count``
            no longer raises.
        :param weights: mapping from part name to weight. ``albumartist`` and
            ``releasetype`` are optional; the presence of a ``title`` key
            selects where the release track count is read from.
        :return: list of ``(score, weight)`` tuples.
        """
        parts = []
        if "album" in self:
            b = release['title']
            parts.append((similarity2(self["album"], b), weights["album"]))

        if "albumartist" in self and "albumartist" in weights:
            a = self["albumartist"]
            b = artist_credit_from_node(release['artist-credit'])[0]
            parts.append((similarity2(a, b), weights["albumartist"]))

        try:
            a = int(self["totaltracks"])
        except (ValueError, KeyError):
            pass
        else:
            # Fall back to a zero track count when the release carries no
            # media / track count information.
            try:
                if "title" in weights:
                    b = release['media'][0]['track-count']
                else:
                    b = release['track-count']
            except (KeyError, IndexError):
                b = 0
            # 0.0 when the tag claims more tracks than the release has,
            # 0.3 when it claims fewer, 1.0 when they agree.
            score = 0.0 if a > b else 0.3 if a < b else 1.0
            parts.append((score, weights["totaltracks"]))

        preferred_countries = config.setting["preferred_release_countries"]
        preferred_formats = config.setting["preferred_release_formats"]

        total_countries = len(preferred_countries)
        if total_countries:
            score = 0.0
            if "country" in release:
                try:
                    # The first preferred country scores 1.0, later ones less.
                    i = preferred_countries.index(release['country'])
                    score = float(total_countries - i) / float(total_countries)
                except ValueError:
                    pass
            parts.append((score, weights["releasecountry"]))

        total_formats = len(preferred_formats)
        # Without a 'media' entry there is nothing to rate, so no format part
        # is added.
        if total_formats and 'media' in release:
            score = 0.0
            subtotal = 0
            for medium in release['media']:
                if "format" in medium:
                    try:
                        i = preferred_formats.index(medium['format'])
                        score += float(total_formats - i) / float(total_formats)
                    except ValueError:
                        pass
                    subtotal += 1
            # Mean over the media that declare a format.
            if subtotal > 0:
                score /= subtotal
            parts.append((score, weights["format"]))

        if "releasetype" in weights:
            type_scores = dict(config.setting["release_type_scores"])
            if 'release-group' in release and 'primary-type' in release['release-group']:
                release_type = release['release-group']['primary-type']
                score = type_scores.get(release_type, type_scores.get('Other', 0.5))
            else:
                score = 0.0
            parts.append((score, weights["releasetype"]))

        rg = QObject.tagger.get_release_group_by_id(release['release-group']['id'])
        if release['id'] in rg.loaded_albums:
            parts.append((1.0, 6))

        return parts
Ejemplo n.º 45
0
    def compare_to_release_parts(self, release, weights):
        """
        Build the list of ``(score, weight)`` parts comparing this metadata
        with a release mapping.

        Parts cover album title, album artist (only when weighted), track
        count, preferred country, preferred format and release type (only when
        weighted). A final ``(1.0, 6)`` part is added when the release id is
        among its release group's ``loaded_albums``.
        """
        parts = []

        if "album" in self:
            release_title = release['title']
            parts.append((similarity2(self["album"], release_title), weights["album"]))

        if "albumartist" in self and "albumartist" in weights:
            tagged_artist = self["albumartist"]
            credited_artist = artist_credit_from_node(release['artist-credit'])[0]
            parts.append((similarity2(tagged_artist, credited_artist), weights["albumartist"]))

        try:
            tagged_count = int(self["totaltracks"])
        except (ValueError, KeyError):
            tagged_count = None
        if tagged_count is not None:
            # A release without the needed track count entry counts as 0.
            try:
                if "title" in weights:
                    release_count = release['media'][0]['track-count']
                else:
                    release_count = release['track-count']
            except KeyError:
                release_count = 0
            if tagged_count > release_count:
                track_score = 0.0
            elif tagged_count < release_count:
                track_score = 0.3
            else:
                track_score = 1.0
            parts.append((track_score, weights["totaltracks"]))

        preferred_countries = config.setting["preferred_release_countries"]
        preferred_formats = config.setting["preferred_release_formats"]

        country_count = len(preferred_countries)
        if country_count:
            country_score = 0.0
            if "country" in release and release['country'] in preferred_countries:
                # Earlier positions in the preference list score higher.
                rank = preferred_countries.index(release['country'])
                country_score = float(country_count - rank) / float(country_count)
            parts.append((country_score, weights["releasecountry"]))

        format_count = len(preferred_formats)
        if format_count and 'media' in release:
            format_score = 0.0
            media_with_format = 0
            for medium in release['media']:
                if "format" not in medium:
                    continue
                if medium['format'] in preferred_formats:
                    rank = preferred_formats.index(medium['format'])
                    format_score += float(format_count - rank) / float(format_count)
                media_with_format += 1
            # Average over the media that declare a format.
            if media_with_format > 0:
                format_score /= media_with_format
            parts.append((format_score, weights["format"]))

        if "releasetype" in weights:
            # Release-type score: the mean of the configured scores of the
            # primary type and any secondary types. A type name missing from
            # the configuration uses the 'Other' score, or 0.5 when 'Other'
            # is not configured either. A release without a primary type
            # scores 0.0.
            type_scores = dict(config.setting["release_type_scores"])
            type_score = 0.0
            fallback_score = type_scores.get('Other', 0.5)
            if 'release-group' in release and 'primary-type' in release['release-group']:
                group = release['release-group']
                type_names = [group['primary-type']]
                if 'secondary-types' in group:
                    type_names.extend(group['secondary-types'])
                for type_name in type_names:
                    type_score += type_scores.get(type_name, fallback_score)
                type_score /= len(type_names)
            parts.append((type_score, weights["releasetype"]))

        release_group = QObject.tagger.get_release_group_by_id(release['release-group']['id'])
        if release['id'] in release_group.loaded_albums:
            parts.append((1.0, 6))

        return parts
Ejemplo n.º 46
0
            try:
                original = shs.lookup('recording', int(m.group(1)))
                if 'performer' in original:
                    shs_artists.append(original['performer']['artist'])
            except ValueError:
                pass
            except urllib2.HTTPError:
                pass
    for shs_artist in shs_artists:
        shs_artist_name = mangle_name(
            re.sub(' \[\d+\]$', '', shs_artist['commonName']))
        mb_artist_name = mangle_name(artist['name'])
        if shs_artist_name == mb_artist_name:
            artist_uri = shs_artist['uri']
            break
        elif similarity2(to_unicode(shs_artist_name),
                         to_unicode(mb_artist_name)) > 0.85:
            print " * '%s' has a similarity of %.2f" % (
                shs_artist['commonName'],
                similarity2(to_unicode(shs_artist_name),
                            to_unicode(mb_artist_name)))
            artist_uri = shs_artist['uri']
            break

    if artist_uri:
        matched_artists.add(artist['gid'])
        colored_out(
            bcolors.HEADER, ' * using %s, found artist SHS URL: %s' %
            (artist['shs_url'], artist_uri))
        edit_note = 'Guessing artist SecondHandSongs URL from work http://musicbrainz.org/work/%s linked to %s' % (
            artist['work_gid'], artist['shs_url'])
        out(' * edit note: %s' % (edit_note, ))