Ejemplo n.º 1
0
def _fixup_range(start, end, mapping):
    """Return the body of a regex character class for the codepoint
    range [start, end], prefixed with escaped variant characters for
    any codepoint in the range that appears in `mapping`.
    """

    variants = []
    for codepoint in xrange(start, end + 1):
        char = unichr(codepoint)
        if char in mapping:
            variants.append(re_escape(u"".join(mapping[char])))

    first, last = re_escape(unichr(start)), re_escape(unichr(end))
    return u"%s%s-%s" % (u"".join(variants), first, last)
Ejemplo n.º 2
0
def diacritic_for_letters(regenerate=False):
    """Return a mapping from combining diacritic mark sequences to the
    base characters they combine with to form a single unicode char.

    (The base characters are not strictly ascii but come from the
    Lu/Ll/Lt unicode categories; in practice mostly ascii.)

    Computing this is expensive, so the cached table is returned
    unless regenerate is True; regenerate=True is used by the unit
    tests to validate the cache.
    """

    if not regenerate:
        return _DIACRITIC_CACHE

    combined = {}
    for codepoint in xrange(sys.maxunicode):
        char = unichr(codepoint)
        decomposed = unicodedata.normalize("NFKD", char)
        if len(decomposed) <= 1:
            continue
        if unicodedata.category(char) not in ("Lu", "Ll", "Lt"):
            continue
        marks = decomposed[1:]
        # only keep decompositions where everything after the base
        # char is a combining mark
        if not all(unicodedata.combining(m) for m in marks):
            continue
        combined.setdefault(marks, set()).add(decomposed[0])

    return dict((marks, u"".join(sorted(bases)))
                for marks, bases in combined.items())
Ejemplo n.º 3
0
def diacritic_for_letters(regenerate=False):
    """Map combining diacritic mark sequences to the characters they
    can combine with to produce a single unicode char.

    (Not actually ascii targets, but characters from the Lu/Ll/Lt
    categories -- mainly ascii in practice.)

    This is expensive to compute, so a cached table is returned unless
    regenerate is True (used by unittests to validate the cache).
    """

    if not regenerate:
        return _DIACRITIC_CACHE

    by_marks = {}
    for code in xrange(sys.maxunicode):
        letter = unichr(code)
        # restrict to letter categories first, then inspect the
        # NFKD decomposition
        if unicodedata.category(letter) not in ("Lu", "Ll", "Lt"):
            continue
        norm = unicodedata.normalize("NFKD", letter)
        if len(norm) <= 1:
            continue
        if not all(map(unicodedata.combining, norm[1:])):
            continue
        by_marks.setdefault(norm[1:], set()).add(norm[0])

    result = {}
    for marks, bases in by_marks.items():
        result[marks] = u"".join(sorted(bases))
    return result
Ejemplo n.º 4
0
def _fixup_literal(literal, in_seq, mapping):
    """Escape the codepoint `literal` for use in a regex, folding in
    any variant characters from `mapping`; when variants were added
    and we are not already inside a character class, wrap the result
    in one.
    """

    chars = unichr(literal)
    if chars in mapping:
        chars += u"".join(mapping[chars])
    multiple = len(chars) > 1
    escaped = re_escape(chars)
    if multiple and not in_seq:
        return u"[%s]" % escaped
    return escaped
Ejemplo n.º 5
0
def get_punctuation_mapping(regenerate=False):
    """This takes the unicode confusables set and extracts punctuation
    which looks similar to one or more ASCII punctuation.

    e.g. ' --> '

    Returns a dict mapping ascii punctuation string -> string of
    confusable unicode chars.  The cached table is returned unless
    regenerate is True (used by unittests to validate the cache).
    """

    if not regenerate:
        return _PUNCT_CONFUSABLES_CACHE

    # Helpers hoisted out of the parsing loop: previously they were
    # redefined on every iteration and, worse, referenced after the
    # loop -- a NameError if the file contained no data lines.
    def to_uni(x):
        # hex codepoint string -> unicode char
        return unichr(int(x, 16))

    def is_ascii(char):
        try:
            char.encode("ascii")
        except UnicodeEncodeError:
            return False
        return True

    def is_punct(char):
        return unicodedata.category(char).startswith("P")

    h = urlopen("http://www.unicode.org/Public/security/9.0.0/confusables.txt")
    data = h.read()
    mapping = {}
    for line in data.decode("utf-8-sig").splitlines():
        line = line.strip()
        if not line:
            continue
        if line.startswith(u"#"):
            continue

        # format: <source>; <target...>; <type> # comment
        char, repls = line.split(";", 2)[:2]
        char = to_uni(char.strip())
        repls = [to_uni(r) for r in repls.split()]

        if all(is_ascii(c) and is_punct(c) for c in repls) and char:
            repls = u"".join(repls)
            mapping[repls] = mapping.get(repls, u"") + char

    # if any of the equal chars is also ascii + punct we can replace
    # it as well; iterate a snapshot since the loop adds new keys
    for ascii_, uni in list(mapping.items()):
        also_ascii = [c for c in uni if is_ascii(c) and is_punct(c)]
        for c in also_ascii:
            mapping[c] = uni.replace(c, u"")

    return mapping
Ejemplo n.º 6
0
def get_punctuation_mapping(regenerate=False):
    """This takes the unicode confusables set and extracts punctuation
    which looks similar to one or more ASCII punctuation.

    e.g. ' --> '

    Returns a dict mapping ascii punctuation string -> string of
    confusable unicode chars.  The cached table is returned unless
    regenerate is True (used by unittests to validate the cache).
    """

    if not regenerate:
        return _PUNCT_CONFUSABLES_CACHE

    # Helpers hoisted out of the parsing loop: previously they were
    # redefined on every iteration and, worse, referenced after the
    # loop -- a NameError if the file contained no data lines.
    def to_uni(x):
        # hex codepoint string -> unicode char
        return unichr(int(x, 16))

    def is_ascii(char):
        try:
            char.encode("ascii")
        except UnicodeEncodeError:
            return False
        return True

    def is_punct(char):
        return unicodedata.category(char).startswith("P")

    h = urlopen("http://www.unicode.org/Public/security/9.0.0/confusables.txt")
    data = h.read()
    mapping = {}
    for line in data.decode("utf-8-sig").splitlines():
        line = line.strip()
        if not line:
            continue
        if line.startswith(u"#"):
            continue

        # format: <source>; <target...>; <type> # comment
        char, repls = line.split(";", 2)[:2]
        char = to_uni(char.strip())
        repls = [to_uni(r) for r in repls.split()]

        if all(is_ascii(c) and is_punct(c) for c in repls) and char:
            repls = u"".join(repls)
            mapping[repls] = mapping.get(repls, u"") + char

    # if any of the equal chars is also ascii + punct we can replace
    # it as well; iterate a snapshot since the loop adds new keys
    for ascii_, uni in list(mapping.items()):
        also_ascii = [c for c in uni if is_ascii(c) and is_punct(c)]
        for c in also_ascii:
            mapping[c] = uni.replace(c, u"")

    return mapping
Ejemplo n.º 7
0
def get_decomps_mapping(regenerate=False):
    """This takes the decomps.txt file of the Unicode UCA and gives us
    cases where a letter can be decomposed for collation and that
    mapping isn't in NFKD.

    Returns a dict mapping decomposed string -> string of source
    chars.  The cached table is returned unless regenerate is True.
    """

    if not regenerate:
        return _UCA_DECOMPS_CACHE

    # Hoisted out of the loop -- they were rebuilt for every line.
    def to_uni(x):
        # hex codepoint string -> unicode char
        return unichr(int(x, 16))

    def is_letter(char):
        return unicodedata.category(char) in ("Lu", "Ll", "Lt")

    mapping = {}

    h = urlopen("http://unicode.org/Public/UCA/8.0.0/decomps.txt")
    for line in h.read().splitlines():
        if line.startswith("#"):
            continue

        # format: <codepoint>;<tag>;<decomposition...># comment
        cp, line = line.split(";", 1)
        tag, line = line.split(";", 1)
        decomp, line = line.split("#", 1)
        decomp = map(to_uni, decomp.strip().split())
        cp = to_uni(cp)

        if not is_letter(cp):
            continue

        decomp = filter(is_letter, decomp)
        simple = "".join(decomp)
        if not simple:
            continue

        # skip anything we get from normalization
        # NOTE(review): only the first NFKD char is compared against
        # `simple` -- presumably intentional, but worth confirming
        if unicodedata.normalize("NFKD", cp)[0] == simple:
            continue

        mapping[simple] = mapping.get(simple, "") + cp

    return mapping
Ejemplo n.º 8
0
def get_decomps_mapping(regenerate=False):
    """This takes the decomps.txt file of the Unicode UCA and gives us
    cases where a letter can be decomposed for collation and that
    mapping isn't in NFKD.

    Returns a dict mapping decomposed string -> string of source
    chars.  The cached table is returned unless regenerate is True.
    """

    if not regenerate:
        return _UCA_DECOMPS_CACHE

    # Hoisted out of the loop -- they were rebuilt for every line.
    def to_uni(x):
        # hex codepoint string -> unicode char
        return unichr(int(x, 16))

    def is_letter(char):
        return unicodedata.category(char) in ("Lu", "Ll", "Lt")

    mapping = {}

    h = urlopen("http://unicode.org/Public/UCA/8.0.0/decomps.txt")
    for line in h.read().splitlines():
        if line.startswith("#"):
            continue

        # format: <codepoint>;<tag>;<decomposition...># comment
        cp, line = line.split(";", 1)
        tag, line = line.split(";", 1)
        decomp, line = line.split("#", 1)
        decomp = map(to_uni, decomp.strip().split())
        cp = to_uni(cp)

        if not is_letter(cp):
            continue

        decomp = filter(is_letter, decomp)
        simple = "".join(decomp)
        if not simple:
            continue

        # skip anything we get from normalization
        # NOTE(review): only the first NFKD char is compared against
        # `simple` -- presumably intentional, but worth confirming
        if unicodedata.normalize("NFKD", cp)[0] == simple:
            continue

        mapping[simple] = mapping.get(simple, "") + cp

    return mapping
Ejemplo n.º 9
0
def _fixup_not_literal(literal, mapping):
    """Build a negated regex character class excluding the codepoint
    `literal` and any of its variant characters from `mapping`.
    """

    char = unichr(literal)
    variants = u"".join(mapping.get(char, []))
    return u"[^%s]" % re_escape(char + variants)
Ejemplo n.º 10
0
def _remove_punctuation_trans():
    """Build a translate table that strips all Unicode punctuation."""

    punct_codepoints = (
        cp for cp in xrange(sys.maxunicode)
        if unicodedata.category(unichr(cp)).startswith('P'))
    return dict.fromkeys(punct_codepoints)
Ejemplo n.º 11
0
def _remove_punctuation_trans():
    """Map every Unicode punctuation codepoint to None, suitable for
    str.translate (translate drops None entries).
    """

    table = {}
    for cp in xrange(sys.maxunicode):
        if unicodedata.category(unichr(cp)).startswith('P'):
            table[cp] = None
    return table
Ejemplo n.º 12
0
class Duplicates(SongsMenuPlugin, PluginConfigMixin):
    """Finds and displays similarly tagged versions of songs.

    Songs are grouped by a configurable tag expression; the computed
    key can optionally be normalized (whitespace, diacritics,
    punctuation, case) before comparison.
    """

    PLUGIN_ID = 'Duplicates'
    PLUGIN_NAME = _('Duplicates Browser')
    PLUGIN_DESC = _('Finds and displays similarly tagged versions of songs.')
    PLUGIN_ICON = Icons.EDIT_SELECT_ALL

    # Groups with fewer members than this are not displayed.
    MIN_GROUP_SIZE = 2
    _CFG_KEY_KEY = "key_expression"
    __DEFAULT_KEY_VALUE = "~artist~title~version"

    _CFG_REMOVE_WHITESPACE = 'remove_whitespace'
    _CFG_REMOVE_DIACRITICS = 'remove_diacritics'
    _CFG_REMOVE_PUNCTUATION = 'remove_punctuation'
    _CFG_CASE_INSENSITIVE = 'case_insensitive'

    plugin_handles = any_song(is_finite)

    # Cached values
    key_expression = None
    __cfg_cache = {}

    # str.translate table mapping every Unicode punctuation codepoint
    # to None, i.e. translate() removes all punctuation.
    # (A stray `tbl =` alias that leaked an extra class attribute has
    # been removed.)
    __remove_punctuation_trans = dict.fromkeys(
        i for i in xrange(sys.maxunicode)
        if unicodedata.category(unichr(i)).startswith('P'))

    @classmethod
    def get_key_expression(cls):
        """Return the configured key expression, caching it on the class."""
        if not cls.key_expression:
            cls.key_expression = (cls.config_get(cls._CFG_KEY_KEY,
                                                 cls.__DEFAULT_KEY_VALUE))
        return cls.key_expression

    @classmethod
    def PluginPreferences(cls, window):
        """Build the preferences pane (key expression + matching toggles)."""
        def key_changed(entry):
            # Invalidate the cached expression and persist the new value.
            cls.key_expression = None
            cls.config_set(cls._CFG_KEY_KEY, entry.get_text().strip())

        vb = Gtk.VBox(spacing=10)
        vb.set_border_width(0)
        hbox = Gtk.HBox(spacing=6)
        # TODO: construct a decent validator and use ValidatingEntry
        e = UndoEntry()
        e.set_text(cls.get_key_expression())
        e.connect("changed", key_changed)
        e.set_tooltip_markup(
            _("Accepts QL tag expressions like "
              "<tt>~artist~title</tt> or <tt>musicbrainz_track_id</tt>"))
        lbl = Gtk.Label(label=_("_Group duplicates by:"))
        lbl.set_mnemonic_widget(e)
        lbl.set_use_underline(True)
        hbox.pack_start(lbl, False, True, 0)
        hbox.pack_start(e, True, True, 0)
        frame = qltk.Frame(label=_("Duplicate Key"), child=hbox)
        vb.pack_start(frame, True, True, 0)

        # Matching Option
        toggles = [
            (cls._CFG_REMOVE_WHITESPACE, _("Remove _Whitespace")),
            (cls._CFG_REMOVE_DIACRITICS, _("Remove _Diacritics")),
            (cls._CFG_REMOVE_PUNCTUATION, _("Remove _Punctuation")),
            (cls._CFG_CASE_INSENSITIVE, _("Case _Insensitive")),
        ]
        vb2 = Gtk.VBox(spacing=6)
        for key, label in toggles:
            ccb = ConfigCheckButton(label, 'plugins', cls._config_key(key))
            ccb.set_active(cls.config_get_bool(key))
            vb2.pack_start(ccb, True, True, 0)

        frame = qltk.Frame(label=_("Matching options"), child=vb2)
        vb.pack_start(frame, False, True, 0)

        vb.show_all()
        return vb

    @staticmethod
    def remove_accents(s):
        """Return `s` NFKD-decomposed with all combining marks stripped."""
        return "".join(c for c in unicodedata.normalize('NFKD', text_type(s))
                       if not unicodedata.combining(c))

    @classmethod
    def get_key(cls, song):
        """Compute the (optionally normalized) grouping key for a song."""
        key = song(cls.get_key_expression())
        if cls.config_get_bool(cls._CFG_REMOVE_DIACRITICS):
            key = cls.remove_accents(key)
        if cls.config_get_bool(cls._CFG_CASE_INSENSITIVE):
            key = key.lower()
        if cls.config_get_bool(cls._CFG_REMOVE_PUNCTUATION):
            key = key.translate(cls.__remove_punctuation_trans)
        if cls.config_get_bool(cls._CFG_REMOVE_WHITESPACE):
            key = "_".join(key.split())
        return key

    def plugin_songs(self, songs):
        """Group the selected songs (plus library matches) by key and
        show groups of duplicates in a dialog."""
        model = DuplicatesTreeModel()
        self.__cfg_cache = {}

        # Index all songs by our custom key
        # TODO: make this cache-friendly
        print_d("Calculating duplicates for %d song(s)..." % len(songs))
        groups = {}
        for song in songs:
            key = self.get_key(song)
            if key and key in groups:
                print_d("Found duplicate based on '%s'" % key)
                groups[key].add(song._song)
            elif key:
                groups[key] = {song._song}

        # Pull in matching songs from the whole library as well.
        for song in app.library:
            key = self.get_key(song)
            if key in groups:
                groups[key].add(song)

        # Now display the grouped duplicates
        for (key, children) in groups.items():
            if len(children) < self.MIN_GROUP_SIZE:
                continue
            # The parent (group) label
            model.add_group(key, children)

        dialog = DuplicateDialog(model)
        dialog.show()