import sys

# NOTE: `similarity`, `_MAPPINGS_SEP`, `_standardize`, `_global_lock`, and
# `_global_whitelist` are defined elsewhere in this module/package and are
# assumed to be in scope here.


def _read_artist_mappings_from_file(file_obj):
    """Read the artist mappings from a file.

    Args:
      file_obj: A file object to read the mappings from.

    Returns:
      A 2-tuple containing
        * A mapping dict, which maps canonicalized strings to
          canonicalized strings.
        * A "raw" mapping dict, which gives the mapping exactly as
          described in the file, without any canonicalization.
    """
    raw_mappings = {}
    mappings = {}
    for line in file_obj:
        line = line.strip()
        # Skip blank lines and comments.
        if not line or line.startswith("#"):
            continue
        before, sep, after = line.partition(_MAPPINGS_SEP)
        if not sep:
            sys.stderr.write("Skipping invalid mapping: \"%s\"\n"
                             % line.encode("utf-8"))
            continue
        before = before.strip()
        after = after.strip()
        raw_mappings[before] = after
        canon_before = similarity.canonicalize_string(before)
        canon_after = similarity.canonicalize_string(after)
        mappings[canon_before] = canon_after
    return mappings, raw_mappings
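# A minimal usage sketch (hypothetical, not part of the original module):
# driving the parser above from a UTF-8 mappings file on disk.  The helper
# name and its `path` argument are assumptions for illustration only.
def _example_load_mappings(path):
    import codecs
    # codecs.open yields unicode lines, which matches the unicode handling
    # (line.encode("utf-8")) in the error path above.
    file_obj = codecs.open(path, "r", "utf-8")
    try:
        return _read_artist_mappings_from_file(file_obj)
    finally:
        file_obj.close()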
def merge_whitelist_and_mappings(whitelist, raw_mappings):
    """Combine information from whitelist and mappings.

    Args:
      whitelist: A whitelist dict
      raw_mappings: A raw mappings dict

    Returns:
      A (whitelist, raw_mappings) pair that is equivalent to the args,
      but with certain normalizations applied that take information from
      the mappings and apply it back to the whitelist, thereby correcting
      any inconsistencies.
    """
    new_whitelist = dict(whitelist)
    inv_whitelist = dict((v, k) for k, v in whitelist.iteritems())
    new_raw_mappings = {}
    for before, after in raw_mappings.iteritems():
        std_before = _standardize(before, whitelist, {})
        std_after = _standardize(after, whitelist, {})
        # Every "after" should exactly match a whitelist item.
        if after != std_after:
            if std_after is not None:
                # Delete the whitelist entry that created the non-matching
                # standardization of "after".
                try:
                    del new_whitelist[inv_whitelist[std_after]]
                except KeyError:
                    pass
            # A "before" item in the mappings should never exactly match
            # an existing whitelist entry.  If it does, delete it from
            # the whitelist.
            canon_before = similarity.canonicalize_string(before)
            if canon_before in new_whitelist:
                del new_whitelist[canon_before]
            # Insert the "after" form into the new whitelist.
            canon_after = similarity.canonicalize_string(after)
            new_whitelist[canon_after] = after
        # If we can figure out a mapping based solely on the whitelist,
        # the mapping can be dropped.
        if std_before and std_before == std_after:
            continue
        new_raw_mappings[before] = after
    # If the whitelist and mappings remained stable under these
    # operations, return them.
    if new_whitelist == whitelist and new_raw_mappings == raw_mappings:
        return new_whitelist, new_raw_mappings
    # If something did change, call self recursively on the results.
    return merge_whitelist_and_mappings(new_whitelist, new_raw_mappings)
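# A worked example of the merge semantics above, on hypothetical data and
# assuming _standardize resolves a name only via an exact canonicalized
# whitelist lookup (returning None for unknown names):
#
#   whitelist = {u"beatles": u"The Beatles"}
#   raw_mappings = {u"Beetles": u"The Beatles"}
#   merge_whitelist_and_mappings(whitelist, raw_mappings)
#
# For the one mapping, std_after == after, so the whitelist is untouched;
# std_before is None, so the mapping is kept.  Nothing changed, so the
# recursion bottoms out on the first pass and returns the inputs as-is.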
def _seq_to_whitelist(seq_of_names):
    """Build a whitelist dict from a sequence of artist names.

    Returns None if two names collide under canonicalization.
    """
    new_whitelist = {}
    for name in seq_of_names:
        canon = similarity.canonicalize_string(name)
        if canon in new_whitelist:
            sys.stderr.write("Artist whitelist collision: \"%s\" and \"%s\"\n"
                             % (new_whitelist[canon], name))
            return None
        new_whitelist[canon] = name
    return new_whitelist
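# An end-to-end sketch (hypothetical) of how the helpers above fit
# together: build a whitelist from raw names, parse a mappings file, then
# reconcile the two.  `f` is assumed to be a file object yielding unicode
# lines.
#
#   whitelist = _seq_to_whitelist([u"The Beatles", u"Foo and Bar"])
#   mappings, raw_mappings = _read_artist_mappings_from_file(f)
#   whitelist, raw_mappings = merge_whitelist_and_mappings(
#       whitelist, raw_mappings)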
def test_basic(self):
    test_cases = (
        ("", u""),
        (" ", u""),
        (u"foo", u"foo"),
        ("foo. bar.", u"foobar"),
        ("foo & Bar ", u"foo&bar"),
        ("The Foo and Bar", u"foo&bar"),
        ("Foo!!!", u"foo"),
        ("!!!!", u"!!!!"),
    )
    for before, after in test_cases:
        self.assertEqual(after, similarity.canonicalize_string(before))
def _standardize_simple(artist_name, whitelist, mappings):
    """Attempt to standardize an artist name using only "simple" methods.

    Args:
      artist_name: A unicode string containing an artist's name
      whitelist: A whitelist dict that maps canonicalized names to names
      mappings: A mappings dict whose keys and values are both
        canonicalized artist names

    Returns:
      A string containing the standardized form of the artist name,
      or None if the name is not recognized.
    """
    canon_name = similarity.canonicalize_string(artist_name)
    # We just try to look up the canonicalized form of the artist name
    # in both the whitelist and mappings dicts.
    if canon_name in whitelist:
        return whitelist[canon_name]
    elif canon_name in mappings:
        return whitelist.get(mappings[canon_name])
    else:
        return None
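# A hypothetical, self-checking illustration of the two lookup paths
# above, using canonical forms taken from the test cases in this section
# (canon(u"The Foo and Bar") == u"foo&bar", canon(u"Foo. Bar.") ==
# u"foobar"):
def _example_standardize_simple():
    whitelist = {u"foo&bar": u"Foo and Bar"}
    mappings = {u"foobar": u"foo&bar"}
    # Direct whitelist hit.
    assert _standardize_simple(u"The Foo and Bar", whitelist,
                               mappings) == u"Foo and Bar"
    # Indirect hit via the mappings dict: u"foobar" -> u"foo&bar".
    assert _standardize_simple(u"Foo. Bar.", whitelist,
                               mappings) == u"Foo and Bar"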
def suggest(name):
    canon_name = similarity.canonicalize_string(name)
    _global_lock.acquire()
    try:
        canon_whitelist = list(_global_whitelist)
    finally:
        _global_lock.release()
    best_guess = None
    # We ignore any items that are more than 10 edits away from our
    # original name, or whose edit distance exceeds 25% of the average
    # length of the two strings being compared.
    MAX_DIST = 10
    MAX_NORM_DIST = 0.25
    best_dist = 1e+100
    for guess in canon_whitelist:
        # Normalize by the average of the two string lengths.
        normalizer = (len(guess) + len(canon_name)) / 2.0
        max_value = min(MAX_DIST, int(1 + normalizer * MAX_NORM_DIST))
        lev_dist = similarity.get_levenshtein_distance(
            canon_name, guess, max_value=max_value)
        if lev_dist < MAX_DIST:
            normalized_lev_dist = lev_dist / normalizer
            # Remember the guess with the smallest normalized distance
            # seen so far, provided it is under the threshold.
            if (normalized_lev_dist < MAX_NORM_DIST
                and normalized_lev_dist < best_dist):
                best_guess = guess
                best_dist = normalized_lev_dist
    return _global_whitelist.get(best_guess)
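# Threshold arithmetic, as an illustration: for a canon_name of length 7
# and a guess of length 9,
#
#   normalizer = (9 + 7) / 2.0                == 8.0
#   max_value  = min(10, int(1 + 8.0 * 0.25)) == 3
#
# so the Levenshtein computation can bail out early once the distance
# exceeds 3.  A distance of 2 yields normalized_lev_dist == 0.25, which
# fails the strict < MAX_NORM_DIST test; a distance of 1 yields 0.125 and
# becomes a candidate suggestion.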