Python cleanse_titleの例

プログラミング言語: Python

名前空間/パッケージ名: common

メソッド/関数: cleanse_title

hotexamples.comのコード掲載数: 4

Python cleanse_title - 4件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのcommon.cleanse_titleの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示

def Search(results, media, lang, manual, movie):
    ''' Search the AniDB titles database and assign an anidbid to a series or movie.

    Two passes are made over the module-level AniDBTitlesDB:
      1. Full title: every title element whose lower-cased text contains
         the lower-cased original title is scored and appended to results.
         If the best score is 90 or more the function returns right away.
      2. Keyword: otherwise the cleansed title is split into words, noise
         words are dropped, and every anime element owning a title that
         contains at least one remaining word is scored with WordsScore().

    Args:
        results: Search result container; each match is added through
            results.Append(MetadataSearchResult(...)).
        media:   Object providing .title (used when movie is true), .show
            (used otherwise) and .year.
        lang:    Passed through unchanged to MetadataSearchResult.
        manual:  Not used by this function.
        movie:   True to search with media.title, False for media.show.

    Returns:
        Tuple (best_score, n): best_score is the best score of the last pass
        that ran; n is the counter incremented in the full title pass each
        time a title scores 100 or more for an aid other than the current
        best one.
    '''
    Log.Info("=== AniDB.Search() ===".ljust(157, '='))

    # Words which only add noise when used as search keywords. Lowercase only.
    FILTER_SEARCH_WORDS = [
        # Jp
        'to', 'wa', 'ga', 'no', 'age', 'da', 'chou', 'super', 'yo', 'de',
        'chan', 'hime', 'ni', 'sekai',
        # En
        'a', 'of', 'an', 'the', 'motion', 'picture', 'special', 'oav', 'ova',
        'tv', 'eternal', 'final', 'last', 'one', 'movie', 'me', 'princess',
        'theater', 'and',
        # Fr (and a few more En)
        'le', 'la', 'un', 'les', 'nos', 'vos', 'des', 'ses', 'world', 'in',
        'another', 'this', 'story', 'life', 'name',
        # Roman digits
        'i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi',
        'xii', 'xiii', 'xiv', 'xv', 'xvi',
    ]
    # Space is implied; these are characters forbidden by os filename limitations.
    SPLIT_CHARS = [';', ':', '*', '?', ',', '.', '~', '-', '\\', '/']

    orig_title = media.title if movie else media.show
    orig_title_cleansed = common.cleanse_title(orig_title)
    Log.Info("orig_title: '{}', orig_title_cleansed: '{}'".format(
        orig_title, orig_title_cleansed))

    ### Full title search = 1.3s
    Log.Info("--- full title ---".ljust(157, '-'))
    best_aid, best_score, best_title, n = "", 0, "", 0
    start_time = time.time()
    Log.Info('len AniDBTitlesDB: {}'.format(len(AniDBTitlesDB)))
    # The quote is replaced by a space so the title cannot terminate the
    # single-quoted XPath string literal.
    full_title_query = (
        u"/animetitles/anime/title[text()[contains(lower-case(.), '%s')]]"
        % orig_title.lower().replace("'", " "))
    for element in AniDBTitlesDB.xpath(full_title_query):
        aid = element.getparent().get('aid', '')
        title = element.text
        # A perfect match was already recorded for this anime: skip its other titles.
        if aid == best_aid and best_score >= 100:
            continue
        if orig_title == title:
            title_cleansed, score = title, 100
        elif orig_title.lower() == title.lower():
            title_cleansed, score = title.lower(), 99
        else:  # search title is only contained in this title
            title_cleansed = common.cleanse_title(title)
            longest = max(len(title_cleansed), len(orig_title_cleansed))
            if longest:
                # Both scores are penalised by n.
                score1 = 100 * len(
                    String.LongestCommonSubstring(
                        orig_title_cleansed, title_cleansed)) / longest - n
                score2 = 100 - 100 * Util.LevenshteinDistance(
                    orig_title_cleansed, title_cleansed) / longest - n
            else:  # both cleansed titles are empty, avoid dividing by zero
                score1 = score2 = 0
            score = max(score1, score2)
        if score >= 100 and not aid == best_aid:
            n += 1
        results.Append(
            MetadataSearchResult(id="%s-%s" % ("anidb", aid),
                                 name="%s [%s-%s]" % (title, "anidb", aid),
                                 year=media.year,
                                 lang=lang,
                                 score=score))
        Log.Info(
            "[+] score: {:>3}, aid: {:>5}, title: '{}', title_cleansed: {}".
            format(score, aid, title, title_cleansed))
        if score > best_score:
            best_score, best_title, best_aid = score, title, aid
    if best_score:
        Log.Info(
            "[=] best_score: {:>3}, best_aid: {:>5}, best_title: {}".format(
                best_score, best_aid, best_title))
    Log.Info("elapsed_time: {:.3f}".format(time.time() - start_time))
    if best_score >= 90:
        return best_score, n

    ### Keyword match using Xpath
    Log.Info("--- keyword ---".ljust(157, '-'))
    words, words_skipped = [], []
    for split_char in SPLIT_CHARS:
        orig_title_cleansed = orig_title_cleansed.replace(split_char, " ")
    orig_title_cleansed = orig_title_cleansed.replace("'", '')
    for word in orig_title_cleansed.split():
        if word in FILTER_SEARCH_WORDS or len(word) <= 3:
            words_skipped.append(word)
        else:
            words.append(word)
    if not words:
        # Every word was filtered out (e.g. show 'No 6'): fall back to all of
        # them, otherwise the XPath predicate below would be empty and the
        # agent's search function fails with a CRITICAL exception.
        words, words_skipped = orig_title_cleansed.split(), []
    Log.Info("Keyword Search - Words: {}, skipped: {}".format(
        str(words), str(words_skipped)))
    type_order = ('main', 'official', 'syn', 'short', '')
    best_score, best_title, best_aid, best_type, best_lang = 0, "", "", "", ""
    last_chance, best_score_entry = [], 0
    start_time = time.time()

    # One predicate, shared by the anime-level and the title-level queries.
    words_predicate = " or ".join([
        "contains(lower-case(text()), '{}')".format(x.lower()) for x in words
    ])
    for anime in AniDBTitlesDB.xpath(
            u"/animetitles/anime[title[{}]]".format(words_predicate)):
        aid = anime.get('aid', '')
        best_score_entry, best_title_entry, best_type_entry, best_lang_entry = 0, "", "", ""
        # Keep the best scoring title of this anime.
        for title_element in anime.xpath(u"title[%s]" % words_predicate):
            title = title_element.text
            title_type = title_element.get('type', '')
            title_lang = title_element.get(
                '{http://www.w3.org/XML/1998/namespace}lang', '')
            title_cleansed = common.cleanse_title(title)
            if title_cleansed == orig_title_cleansed:
                score = 98 if ';' not in title else 100
            else:
                # No title type penalty: movies can have the same title.
                score = WordsScore(orig_title_cleansed.split(), title_cleansed)
            # On equal score the title type listed first in type_order wins.
            if score > best_score_entry or score == best_score_entry and (
                    not best_type_entry or type_order.index(title_type) <
                    type_order.index(best_type_entry)):
                best_score_entry, best_title_entry, best_type_entry, best_lang_entry = score, title, title_type, title_lang
        if best_score_entry < 25:
            # Too weak to list now; kept aside in case nothing better shows up.
            last_chance.append((best_score_entry, best_title_entry,
                                best_type_entry, best_lang_entry, aid))
            continue
        Log.Info('[-] score: {:>3}, aid: {:>5}, title: "{}"'.format(
            best_score_entry, aid, best_title_entry))
        results.Append(
            MetadataSearchResult(
                id="%s-%s" % ("anidb", aid),
                name="{title} [{Type}({Lang})] [anidb-{aid}]".format(
                    title=best_title_entry,
                    aid=aid,
                    Type=best_type_entry,
                    Lang=best_lang_entry),
                year=media.year,
                lang=lang,
                score=best_score_entry))
        if best_score_entry > best_score:
            best_score, best_title, best_type, best_lang, best_aid = best_score_entry, best_title_entry, best_type_entry, best_lang_entry, aid

    if best_score < 50:  # Add back the entries lower than 25 if nothing is above 50
        for best_score_entry, best_title_entry, best_type_entry, best_lang_entry, aid in last_chance:
            # Arguments follow the placeholder order: score, aid, title.
            Log.Info('[-] score: {:>3}, aid: {:>5}, title: "{}"'.format(
                best_score_entry, aid, best_title_entry))
            results.Append(
                MetadataSearchResult(
                    id="%s-%s" % ("anidb", aid),
                    name="{title} [{Type}({Lang}): {aid}]".format(
                        title=best_title_entry,
                        aid=aid,
                        Type=best_type_entry,
                        Lang=best_lang_entry),
                    year=media.year,
                    lang=lang,
                    score=best_score_entry))
            # Checked for every entry, so the best one wins, not merely the last.
            if best_score_entry > best_score:
                best_score, best_title, best_type, best_lang, best_aid = best_score_entry, best_title_entry, best_type_entry, best_lang_entry, aid
    #Log.Info('[=] score: {:>3}, aid: {:>5}, title: "{}"'.format(best_score, best_aid, best_title))
    Log.Info("elapsed_time: {:.3f}".format(time.time() - start_time))

    return best_score, n

コード例 #2

ファイルを表示

ファイル: AniDB.py プロジェクト: ZeroQI/Hama.bundle

def Search(results, media, lang, manual, movie):
  ''' Search the AniDB titles database and assign an anidbid to a series or movie.

  Two passes are made over the module-level AniDBTitlesDB:
    1. Full title: every title element whose lower-cased text contains the
       lower-cased original title is scored and appended to results.
       If the best score is 90 or more the function returns right away.
    2. Keyword: otherwise the cleansed title is split into words, noise words
       are dropped, and every anime element owning a title that contains at
       least one remaining word is scored with WordsScore().

  Args:
    results: Search result container; each match is added through results.Append(MetadataSearchResult(...)).
    media:   Object providing .title (used when movie is true), .show (used otherwise) and .year.
    lang:    Passed through unchanged to MetadataSearchResult.
    manual:  Not used by this function.
    movie:   True to search with media.title, False for media.show.

  Returns:
    Tuple (best_score, n): best_score is the best score of the last pass that
    ran; n is the counter incremented in the full title pass each time a title
    scores 100 or more for an aid other than the current best one.
  '''
  Log.Info("=== AniDB.Search() ===".ljust(157, '='))
  FILTER_SEARCH_WORDS  = [ ### Words which only add noise when used as search keywords, Lowercase only ###
    'to', 'wa', 'ga', 'no', 'age', 'da', 'chou', 'super', 'yo', 'de', 'chan', 'hime', 'ni', 'sekai',                                                                # Jp
    'a',  'of', 'an', 'the', 'motion', 'picture', 'special', 'oav', 'ova', 'tv', 'eternal', 'final', 'last', 'one', 'movie', 'me',  'princess', 'theater', 'and',  # En
    'le', 'la', 'un', 'les', 'nos', 'vos', 'des', 'ses', 'world', 'in', 'another', 'this', 'story', 'life', 'name',                                               # Fr (and a few more En)
    'i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi']                                                         # Roman digits
  SPLIT_CHARS          = [';', ':', '*', '?', ',', '.', '~', '-', '\\', '/' ] #Space is implied, characters forbidden by os filename limitations
  orig_title           = media.title if movie else media.show
  orig_title_cleansed  = common.cleanse_title(orig_title)
  Log.Info("orig_title: '{}', orig_title_cleansed: '{}'".format(orig_title, orig_title_cleansed))

  ### Full title search = 1.3s
  Log.Info("--- full title ---".ljust(157, '-'))
  best_aid, best_score, best_title, n = "", 0, "", 0
  start_time = time.time()
  Log.Info('len AniDBTitlesDB: {}'.format(len(AniDBTitlesDB)))
  # The quote is replaced by a space so the title cannot terminate the single-quoted XPath string literal
  full_title_query = u"/animetitles/anime/title[text()[contains(lower-case(.), '%s')]]" % orig_title.lower().replace("'", " ")
  for element in AniDBTitlesDB.xpath(full_title_query):
    aid   = element.getparent().get('aid',  '')
    title = element.text
    if aid==best_aid and best_score>=100:  continue  # perfect match already recorded for this anime, skip its other titles
    if orig_title == title:
      title_cleansed, score = title, 100
    elif orig_title.lower() == title.lower():
      title_cleansed, score = title.lower(), 99
    else: # search title is only contained in this title
      title_cleansed = common.cleanse_title(title)
      longest        = max(len(title_cleansed), len(orig_title_cleansed))
      if longest:  # both scores are penalised by n
        score1 = 100*len(String.LongestCommonSubstring(orig_title_cleansed, title_cleansed))/longest-n
        score2 = 100 - 100 * Util.LevenshteinDistance (orig_title_cleansed, title_cleansed) /longest-n
      else:        # both cleansed titles are empty, avoid dividing by zero
        score1 = score2 = 0
      score = max(score1, score2)
    if score>=100 and not aid==best_aid:  n+=1
    results.Append(MetadataSearchResult(id="%s-%s" % ("anidb", aid), name="%s [%s-%s]" % (title, "anidb", aid), year=media.year, lang=lang, score=score))
    Log.Info("[+] score: {:>3}, aid: {:>5}, title: '{}', title_cleansed: {}".format(score, aid, title, title_cleansed))
    if score > best_score:  best_score, best_title, best_aid = score, title, aid
  if best_score:  Log.Info("[=] best_score: {:>3}, best_aid: {:>5}, best_title: {}".format(best_score, best_aid, best_title))
  Log.Info("elapsed_time: {:.3f}".format(time.time() - start_time ))
  if best_score>=90:  return best_score, n

  ### Keyword match using Xpath
  Log.Info("--- keyword ---".ljust(157, '-'))
  words, words_skipped  = [], []
  for split_char in SPLIT_CHARS:  orig_title_cleansed = orig_title_cleansed.replace(split_char, " ")
  orig_title_cleansed = orig_title_cleansed.replace("'", '')
  for word in orig_title_cleansed.split():
    if word in FILTER_SEARCH_WORDS or len(word) <= 3:  words_skipped.append(word)
    else:                                              words.append(word)
  # Every word was filtered out (e.g. show 'No 6'): fall back to all of them, otherwise the
  # XPath predicate below would be empty and the agent's search function fails with a CRITICAL exception
  if not words:  words, words_skipped = orig_title_cleansed.split(), []
  Log.Info("Keyword Search - Words: {}, skipped: {}".format(str(words), str(words_skipped)))
  type_order=('main', 'official', 'syn', 'short', '')
  best_score, best_title, best_aid, best_type, best_lang = 0, "", "", "", ""
  last_chance, best_score_entry=[], 0
  start_time = time.time()

  # One predicate, shared by the anime-level and the title-level queries
  words_predicate = " or ".join(["contains(lower-case(text()), '{}')".format(x.lower()) for x in words])
  for anime in AniDBTitlesDB.xpath(u"/animetitles/anime[title[{}]]".format(words_predicate)):
    aid = anime.get('aid',  '')
    best_score_entry, best_title_entry, best_type_entry, best_lang_entry = 0, "", "", ""
    for title_element in anime.xpath(u"title[%s]" % words_predicate):  # keep the best scoring title of this anime
      title          = title_element.text
      title_type     = title_element.get('type', '')
      title_lang     = title_element.get('{http://www.w3.org/XML/1998/namespace}lang', '')
      title_cleansed = common.cleanse_title(title)
      if title_cleansed == orig_title_cleansed:  score = 98 if ';' not in title else 100
      else:                                      score = WordsScore(orig_title_cleansed.split(), title_cleansed)  # no title type penalty: movies can have same title
      # On equal score the title type listed first in type_order wins
      if score>best_score_entry or score==best_score_entry and (not best_type_entry or type_order.index(title_type)<type_order.index(best_type_entry)):
        best_score_entry, best_title_entry, best_type_entry, best_lang_entry = score, title, title_type, title_lang
    if best_score_entry<25:  # too weak to list now, kept aside in case nothing better shows up
      last_chance.append((best_score_entry, best_title_entry, best_type_entry, best_lang_entry, aid))
      continue
    Log.Info('[-] score: {:>3}, aid: {:>5}, title: "{}"'.format(best_score_entry, aid, best_title_entry))
    results.Append(MetadataSearchResult(id="%s-%s" % ("anidb", aid), name="{title} [{Type}({Lang})] [anidb-{aid}]".format(title=best_title_entry, aid=aid, Type=best_type_entry, Lang=best_lang_entry), year=media.year, lang=lang, score=best_score_entry))
    if best_score_entry > best_score:  best_score, best_title, best_type, best_lang, best_aid = best_score_entry, best_title_entry, best_type_entry, best_lang_entry, aid
  if best_score <50:  # Add back the entries lower than 25 if nothing is above 50
    for best_score_entry, best_title_entry, best_type_entry, best_lang_entry, aid in last_chance:
      # Arguments follow the placeholder order: score, aid, title
      Log.Info('[-] score: {:>3}, aid: {:>5}, title: "{}"'.format(best_score_entry, aid, best_title_entry))
      results.Append(MetadataSearchResult(id="%s-%s" % ("anidb", aid), name="{title} [{Type}({Lang}): {aid}]".format(title=best_title_entry, aid=aid, Type=best_type_entry, Lang=best_lang_entry), year=media.year, lang=lang, score=best_score_entry))
      # Checked for every entry, so the best one wins, not merely the last
      if best_score_entry > best_score:  best_score, best_title, best_type, best_lang, best_aid = best_score_entry, best_title_entry, best_type_entry, best_lang_entry, aid
  #Log.Info('[=] score: {:>3}, aid: {:>5}, title: "{}"'.format(best_score, best_aid, best_title))
  Log.Info("elapsed_time: {:.3f}".format(time.time() - start_time ))

  return best_score, n

コード例 #3

ファイルを表示

# Titles database queried through .xpath() by Search(); unset at import time.
# NOTE(review): where it gets loaded is not visible in this excerpt.
AniDBTitlesDB = None

# AniDB HTTP API endpoint; the anime id (aid) is appended to the URL.
# ANIDB_PROTOCOL is defined outside this excerpt.
ANIDB_API_DOMAIN = 'api.anidb.net:9001'
ANIDB_HTTP_API_URL = ANIDB_PROTOCOL + ANIDB_API_DOMAIN + '/httpapi?request=anime&client=hama&clientver=1&protover=1&aid='

# AniDB image host; the thumbnail URL keeps a '{name}' placeholder to fill in later.
ANIDB_IMAGE_DOMAIN = 'img7.anidb.net'
ANIDB_PIC_BASE_URL = ANIDB_PROTOCOL + ANIDB_IMAGE_DOMAIN + '/pics/anime/'  # AniDB picture directory
ANIDB_PIC_THUMB_URL = ANIDB_PROTOCOL + ANIDB_IMAGE_DOMAIN + '/pics/anime/thumbs/150/{name}.jpg-thumb.jpg'

# NOTE(review): presumably flags that AniDB banned this client - its use is not shown here, confirm.
AniDBBan = False

### Functions ###
# Define custom functions to be available in 'xpath' calls
# (registered in the default, un-prefixed function namespace).
ns = etree.FunctionNamespace(None)
# lower-case(x): lower-cases the first item of the argument; raises IndexError when it is empty.
ns['lower-case'] = lambda context, s: s[0].lower()
# clean-title(x): hands the whole argument to common.cleanse_title().
# NOTE(review): unlike 'lower-case' this does not take s[0] - confirm cleanse_title copes with that.
ns['clean-title'] = lambda context, s: common.cleanse_title(s)


def Search(results, media, lang, manual, movie):
    ''' AniDB Search assign an anidbid to a series or movie
  '''
    Log.Info("=== AniDB.Search() ===".ljust(157, '='))
    FILTER_SEARCH_WORDS = [  ### These are words which cause extra noise due to being uninteresting for doing searches on, Lowercase only ####################################
        'to',
        'wa',
        'ga',
        'no',
        'age',
        'da',
        'chou',
        'super',

コード例 #4

ファイルを表示

ファイル: AniDB.py プロジェクト: ZeroQI/Hama.bundle

### AniDB ###
#http://api.anidb.net:9001/httpapi?request=anime&client=hama&clientver=1&protover=1&aid=6481

### Imports ###
import common    # Functions: SaveFile, LoadFile, metadata_download, WriteLogs, cleanse_title, getImagesFromASS
import os        # Functions: 
import re        # Functions: re.search, re.match, re.sub, re.IGNORECASE
import string    # Functions: 
import datetime  # Functions: 
import time      # Functions: 
import AnimeLists
from common import GetXml, Dict, SaveDict, natural_sort_key, Log, DictString
from lxml   import etree
# Custom functions made available in 'xpath' calls (default, un-prefixed function namespace).
ns = etree.FunctionNamespace(None)
# lower-case(x): lower-cases the first item of the argument; raises IndexError when it is empty.
ns['lower-case' ] = lambda context, s: s[0].lower()
# clean-title(x): hands the whole argument to common.cleanse_title().
# NOTE(review): unlike 'lower-case' this does not take s[0] - confirm cleanse_title copes with that.
ns['clean-title'] = lambda context, s: common.cleanse_title(s)
  
### Variables ###
### Always on variables ###
# Titles database queried through .xpath() by Search(); unset at import time.
# NOTE(review): where it gets loaded is not visible in this excerpt.
AniDBTitlesDB = None

### Functions ###

def Search(results, media, lang, manual, movie):
  ''' AniDB Search assign an anidbid to a series or movie
  '''
  Log.Info("=== AniDB.Search() ===".ljust(157, '='))
  FILTER_SEARCH_WORDS  = [ ### These are words which cause extra noise due to being uninteresting for doing searches on, Lowercase only ####################################
  'to', 'wa', 'ga', 'no', 'age', 'da', 'chou', 'super', 'yo', 'de', 'chan', 'hime', 'ni', 'sekai',                                             # Jp
  'a',  'of', 'an', 'the', 'motion', 'picture', 'special', 'oav', 'ova', 'tv', 'special', 'eternal', 'final', 'last', 'one', 'movie', 'me',  'princess', 'theater', 'and', # En Continued
  'le', 'la', 'un', 'les', 'nos', 'vos', 'des', 'ses', 'world', 'in', 'another', 'this', 'story', 'life', 'name',                                                                                        # Fr