def validated_string_language(self, title, restrict=None):
    try:
        # Build the set of allowed codes only when a restriction was supplied;
        # iterating over None would otherwise raise a TypeError.
        restrict_codes = (
            {iso639.to_iso639_2(l) for l in restrict} if restrict is not None else set()
        )
        detected = detect(title)
        threealpha = iso639.to_iso639_2(detected)
        ok = True if restrict is None else (threealpha in restrict_codes)
        if ok:
            language = self.language_object_from_code(threealpha)
            if language is not None:
                # If a restricted set of languages was passed to this function,
                # we have confidence that we've matched the language of the title
                # because it is one of the restricted languages.
                # (If no restriction was supplied, we use whatever language
                # was detected in the title.)
                return language
        else:
            # The detected language of the title is not a member of the restricted
            # set, so we lack confidence to proceed.
            return None
    except iso639.NonExistentLanguageError as e:
        warnings.warn('*** Unrecognized language code detected: %s' % (e,))
    except KeyError as e:
        warnings.warn(
            '*** LANGUAGE: detected but unrecognized title language %r: %s'
            % (e.args[0], title)
        )
    except Exception as e:
        print('*** detect_title_language error: %r' % (e,))
    return None
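# Hedged usage sketch for the detect-and-validate flow above. It assumes `detect`
# comes from the langdetect package and that `iso639` is the PyPI `iso-639` module
# used in the method; the standalone helper name below is illustrative, not part of
# the original class.
import iso639
from langdetect import detect

def sketch_detect_language_code(title, restrict=None):
    """Return the ISO 639-2 code detected in `title`, or None if it is not allowed."""
    try:
        threealpha = iso639.to_iso639_2(detect(title))
    except iso639.NonExistentLanguageError:
        return None
    if restrict is not None:
        allowed = {iso639.to_iso639_2(code) for code in restrict}
        if threealpha not in allowed:
            return None
    return threealpha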
def test_language_coverage():
    """Check if WikiPron covers languages with a sufficient amount of data.

    If any warnings are raised, they should be suppressed by expanding the
    LANGUAGE_CODES dict to handle the relevant languages.
    """
    categories = _get_language_categories()
    sizes = _get_language_sizes(categories)
    for language, size in sizes.items():
        if size < _MIN_LANGUAGE_SIZE:
            continue
        if language in ("Mon", "Translingual"):
            # "mon" is the ISO 639 code for Mongolian, but there is also
            # the Mon language (ISO 639 code: "mnw").
            continue
        try:
            language_code = iso639.to_iso639_2(language)
        except iso639.NonExistentLanguageError:
            # Check if WikiPron can handle `language` directly.
            language_code = language
        try:
            language_inferred = wikipron.Config(key=language_code).language
        except iso639.NonExistentLanguageError:
            warnings.warn(f'WikiPron cannot handle "{language}".')
            continue
        if language_inferred != language:
            warnings.warn(
                f'WikiPron resolves the key "{language_code}" to '
                f'"{language_inferred}", '
                f'which is not "{language}" on Wiktionary.'
            )
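# The test above leans on two behaviours of the `iso-639` package: to_iso639_2()
# accepts either a code or an English language name, and unknown inputs raise
# iso639.NonExistentLanguageError. A small illustration (the expected values are an
# assumption about that library, not taken from the WikiPron test suite):
import iso639

print(iso639.to_iso639_2("en"))       # expected: 'eng'
print(iso639.to_iso639_2("Spanish"))  # expected: 'spa'
try:
    iso639.to_iso639_2("not a language")
except iso639.NonExistentLanguageError:
    print("unknown name raises NonExistentLanguageError, handled by the fallback above")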
def to_3_letter_lang(lang):
    if len(lang) == 2 and iso639.is_valid639_1(lang):
        return iso639.to_iso639_2(lang)
    if len(lang) == 3 and iso639.is_valid639_2(lang):
        return lang
    return False
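# Quick check of the helper above; the expected outputs assume the PyPI `iso-639`
# package that provides is_valid639_1/is_valid639_2/to_iso639_2.
print(to_3_letter_lang("en"))   # 'eng'  (valid ISO 639-1 code, converted)
print(to_3_letter_lang("eng"))  # 'eng'  (already a valid ISO 639-2 code)
print(to_3_letter_lang("zz"))   # False  (not a recognised code)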
def index_ebooks_from_library(site, _uuid="", library="", start=0, stop=0,
                              dir=".", num=1000, force_refresh=False):
    offset = 0 if not start else start - 1
    num = min(1000, num)
    server = site.rstrip('/')
    api = server + '/ajax/'
    lib = library
    library = '/' + library if library else library
    timeout = 15

    print(f"\nIndexing library: {lib} from server: {server}")

    url = api + 'search' + library + '?num=0'
    print(f"\nGetting ebooks count of library: {lib} from server: {server}")
    # print(url)
    try:
        r = requests.get(url, verify=False, timeout=(timeout, 30))
        r.raise_for_status()
    except requests.RequestException as e:
        print("Unable to open site:", url)
        return
    except Exception as e:
        print("Other issue:", e)
        return
    except:
        print("Wazza !!!!")
        sys.exit(1)

    total_num = int(r.json()["total_num"])
    total_num = total_num if not stop else stop
    print()
    print(f"Total count={total_num} from {server}")
    # library = r.json()["base_url"].split('/')[-1]
    # base_url = r.json()["base_url"]

    db = init_site_db(site, _uuid=_uuid, dir=dir)
    print()

    range = offset + 1  # running counter of processed ebooks (shadows the builtin)
    while offset < total_num:
        remaining_num = min(num, total_num - offset)
        print('\r {:180.180}'.format(
            f'Downloading ids: offset={offset} count={remaining_num} from {server}'
        ), end='')

        # url = server+base_url+'?num='+str(remaining_num)+'&offset='+str(offset)+'&sort=timestamp&sort_order=desc'
        url = (api + 'search' + library + '?num=' + str(remaining_num)
               + '&offset=' + str(offset) + '&sort=timestamp&sort_order=desc')
        try:
            r = requests.get(url, verify=False, timeout=(timeout, 30))
            r.raise_for_status()
        except requests.RequestException as e:
            print("Connection issue:", e)
            return
        except Exception as e:
            print("Other issue:", e)
            return
        except:
            print("Wazza !!!!")
            return

        print('\r {:180.180}'.format(
            f'Downloading metadata from {offset + 1} to {offset + remaining_num}/{total_num} from {server}'
        ), end='')

        books_s = ",".join(str(i) for i in r.json()['book_ids'])
        url = api + 'books' + library + '?ids=' + books_s
        # url = server+base_url+'/books?ids='+books_s
        try:
            r = requests.get(url, verify=False, timeout=(60, 60))
            r.raise_for_status()
        except requests.RequestException as e:
            print("Connection issue:", e)
            return
        except Exception as e:
            print("Other issue:", e)
            return
        except:
            print("Wazza !!!!")
            return

        print('\r {:180.180}'.format(f'{len(r.json())} received'), end='')

        books = []
        for id, r_book in r.json().items():
            uuid = r_book['uuid']
            if not uuid:
                print("No uuid for ebook: ignored")
                continue

            if r_book['authors']:
                desc = f"({r_book['title']} / {r_book['authors'][0]})"
            else:
                desc = f"({r_book['title']})"
            print('\r {:180.180} '.format(
                f'{range}/{total_num} ({server} : {uuid} --> {desc}'), end='')

            if not force_refresh:
                # print("Checking local metadata:", uuid)
                try:
                    book = load_metadata(dir, uuid)
                except:
                    print("Unable to get metadata from:", uuid)
                    range += 1
                    continue
                if book:
                    print("Metadata already present for:", uuid)
                    range += 1
                    continue

            if not r_book['formats']:
                # print("No format found for {}".format(r_book['uuid']))
                range += 1
                continue

            book = {}
            book['uuid'] = r_book['uuid']
            book['id'] = id
            book['library'] = lib
            book['title'] = r_book['title']
            book['authors'] = r_book['authors']
            book['desc'] = r_book['comments']
            book['series'] = r_book['series']
            s_i = r_book['series_index']
            if s_i:
                book['series_index'] = int(s_i)
            book['identifiers'] = r_book['identifiers']
            book['tags'] = r_book['tags']
            book['publisher'] = r_book['publisher']
            book['pubdate'] = r_book['pubdate']

            if not r_book['languages']:
                # No language declared: guess it from the comments (or the title).
                text = r_book['title'] + ". "
                if r_book['comments']:
                    text = r_book['comments']
                s_language, prob = identifier.classify(text)
                if prob >= 0.85:
                    book['language'] = iso639.to_iso639_2(s_language)
                else:
                    book['language'] = ''
            else:
                book['language'] = iso639.to_iso639_2(r_book['languages'][0])

            book['cover'] = bool(r_book['cover'])
            book['last_modified'] = r_book['last_modified']
            book['timestamp'] = r_book['timestamp']

            book['formats'] = []
            formats = r_book['formats']
            for f in formats:
                if 'size' in r_book['format_metadata'][f]:
                    size = int(r_book['format_metadata'][f]['size'])
                else:
                    # print(f"Size not found for format '{f}' uuid={uuid}: skipped")
                    size = None
                    # TODO query the size when the function to rebuild the full url is ready
                    # try:
                    #     size = get_file_size(s['url'])
                    # except:
                    #     print("Unable to access size for format '{}' : {} skipped".format(f, uuid))
                    #     continue
                book[f] = size
                book['formats'].append(f)

            if not book['formats']:
                # print(f"No format found for {book['uuid']} id={book['id']} : skipped")
                # continue
                pass

            books.append(book)
            range += 1

        print('\r {:180.180}'.format(f'Saving metadata from {server}'), end='')
        save_books_metadata_from_site(db, books)
        print('\r {:180.180}'.format(
            f'--> Saved {range-1}/{total_num} ebooks from {server}'), end='')
        print()
        print()
        # try:
        #     save_metadata(db, books)
        # except:
        #     print("Unable to save book metadata")

        offset = offset + num
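# Hedged usage sketch: how the indexer above might be invoked. The server URL and
# library name are placeholders, and init_site_db(), load_metadata() and
# save_books_metadata_from_site() are helpers from the surrounding project that the
# call relies on.
if __name__ == "__main__":
    index_ebooks_from_library(
        "http://calibre.example.org:8080",   # placeholder Calibre content server
        library="Library",                   # placeholder library name on that server
        num=500,                             # request metadata in batches of 500 ids
        force_refresh=False,                 # keep books whose metadata is already cached
    )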
def findSubtitlesNoneIso639(scanfolder, isoMode, disablelangdetect):
    """
    Detect subtitles that do not comply with ISO 639.

    TODO: Add more subtitle extensions (and read/parse them correctly for language detection)
    TODO: Separate language detection better into different functions
    TODO: Add percentage of certainty and possible other languages when certainty is low
    TODO: Handle unicode better to detect languages like German and Dutch better
    TODO: Use table
    """
    subtitleExts = ['.srt', '.sub', '.ass']
    total = 0
    incorrect = 0
    detectedlang = 0
    for subdir, dirnames, filenames in os.walk(scanfolder):
        for filename in filenames:
            incorrectSubtitle = False
            extension = os.path.splitext(filename)[1].lower()
            # subdirName = os.path.basename(os.path.normpath(subdir))
            if extension in subtitleExts:
                total = total + 1
                langcodeFromFilename = getIsoLanguageCodeFromFilename(filename)
                detectedLanguage = ""
                detectedIsoMode = False
                if is_valid639_1(langcodeFromFilename):
                    detectedIsoMode = "1"
                    detectedLanguage = iso639_to_name(langcodeFromFilename)
                if is_valid639_2(langcodeFromFilename):
                    detectedIsoMode = "2"
                    detectedLanguage = iso639_to_name(langcodeFromFilename)
                if detectedIsoMode != isoMode:
                    isoShouldBe = ""
                    if isoMode == "1" and detectedIsoMode == "2":
                        isoShouldBe = to_iso639_1(langcodeFromFilename)
                    if isoMode == "2" and detectedIsoMode == "1":
                        isoShouldBe = to_iso639_2(langcodeFromFilename)
                    filepath = subdir + os.sep + filename
                    incorrectSubtitle = True
                    incorrect = incorrect + 1
                    warning = "Incorrectly named subtitle found at "
                    warning += bold(filepath)
                    printNotificationWarning(warning)
                    if detectedIsoMode is not False:
                        info = "\t\tLang code " + bold(langcodeFromFilename)
                        info += " (ISO 639-" + str(detectedIsoMode) + ") "
                        info += "detected. The ISO 639-" + isoMode + " code"
                        info += " for " + detectedLanguage + " is "
                        info += bold(isoShouldBe) + "."
                        printNotificationInfo(info)
                if incorrectSubtitle and not disablelangdetect:
                    filepath = subdir + os.sep + filename
                    try:
                        with io.open(filepath, "r", encoding="utf-8") as mfile:
                            my_unicode_string = mfile.read()
                        possibleLanguage = "\tDetected language is likely to "
                        possibleLanguage += "be \"" + detect(my_unicode_string)
                        possibleLanguage += "\"\n"
                        detectedlang = detectedlang + 1
                    except Exception:
                        possibleLanguage = "\tLanguage detection failed\n"
                    # Show the language-detection result for this subtitle.
                    printNotificationInfo(possibleLanguage)
    info = "Found subtitle files " + bold(str(total)) + " of which "
    info += bold(str(incorrect)) + " are incorrectly named!"
    printNotificationInfo(info)
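# The scanner above relies on getIsoLanguageCodeFromFilename(), which is not shown
# here. A minimal sketch of such a helper, assuming the common naming convention
# "<name>.<langcode>.<ext>" (e.g. "movie.en.srt"); the name and behaviour below are
# an assumption, not the original implementation.
import os

def sketch_language_code_from_filename(filename):
    stem = os.path.splitext(filename)[0]                # drop the subtitle extension
    candidate = os.path.splitext(stem)[1].lstrip(".")   # token just before it, if any
    return candidate.lower()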