Beispiel #1
0
	def validated_string_language(self, title, restrict=None):
		"""Detect the language of `title` and return the matching language object.

		The title is run through `detect`, the result is normalised to a
		three-letter code with `iso639.to_iso639_2`, and that code is handed to
		`self.language_object_from_code`.

		Args:
			title: Text whose language should be detected.
			restrict: Optional iterable of language codes. When given, the
				detected language is only accepted if it is one of these codes
				(compared after normalising each with `iso639.to_iso639_2`).
				When None, whatever language was detected is used.

		Returns:
			The value returned by `self.language_object_from_code`, or None
			when the detected language is outside `restrict` or any step fails.
			Failures are reported (warning or printed message), never raised.
		"""
		try:
			# Only build the restricted set when a restriction was supplied;
			# iterating over the default None would raise TypeError and make
			# the unrestricted call always return None.
			restrict_codes = None
			if restrict is not None:
				restrict_codes = {iso639.to_iso639_2(code) for code in restrict}
			threealpha = iso639.to_iso639_2(detect(title))
			if restrict_codes is not None and threealpha not in restrict_codes:
				# The detected language of the title was not a member of the
				# restricted set, so we lack confidence to proceed.
				return None
			# With a restriction we trust the match because it is one of the
			# allowed languages; without one we use whatever was detected.
			# The lookup itself may still yield None.
			return self.language_object_from_code(threealpha)
		except iso639.NonExistentLanguageError as e:
			warnings.warn('*** Unrecognized language code detected: %s' % (e,))
		except KeyError as e:
			warnings.warn(
				'*** LANGUAGE: detected but unrecognized title language %r: %s' % (e.args[0], title)
			)
		except Exception as e:
			print('*** detect_title_language error: %r' % (e,))
		return None
Beispiel #2
0
def test_language_coverage():
    """Warn about well-populated Wiktionary languages WikiPron mishandles.

    Every language whose size reaches _MIN_LANGUAGE_SIZE is resolved
    through wikipron.Config. A warning is emitted when the key cannot be
    handled or when it resolves to a differently named language. Such
    warnings are meant to be silenced by extending the LANGUAGE_CODES dict.
    """
    # "mon" is the ISO 639 code for Mongolian, but there is also the Mon
    # language (ISO 639 code: "mnw"), so "Mon" is skipped, as is
    # "Translingual".
    skipped = ("Mon", "Translingual")
    sizes = _get_language_sizes(_get_language_categories())
    for language, size in sizes.items():
        if size < _MIN_LANGUAGE_SIZE or language in skipped:
            continue
        try:
            key = iso639.to_iso639_2(language)
        except iso639.NonExistentLanguageError:
            # No ISO code known: see whether WikiPron accepts the name as-is.
            key = language
        try:
            resolved = wikipron.Config(key=key).language
        except iso639.NonExistentLanguageError:
            warnings.warn(f'WikiPron cannot handle "{language}".')
            continue
        if resolved != language:
            message = (
                f'WikiPron resolves the key "{key}" to '
                f'"{resolved}", '
                f'which is not "{language}" on Wiktionary.'
            )
            warnings.warn(message)
def to_3_letter_lang(lang):
    """Return the three-letter code for `lang`, or False if it is not valid.

    A two-letter code accepted by iso639.is_valid639_1 is converted with
    iso639.to_iso639_2; a three-letter code accepted by
    iso639.is_valid639_2 is returned unchanged. Anything else yields False.
    """
    length = len(lang)
    if length == 2 and iso639.is_valid639_1(lang):
        return iso639.to_iso639_2(lang)
    if length == 3 and iso639.is_valid639_2(lang):
        return lang
    return False
Beispiel #4
0
def index_ebooks_from_library(site,
                              _uuid="",
                              library="",
                              start=0,
                              stop=0,
                              dir=".",
                              num=1000,
                              force_refresh=False):
    """Download ebook metadata of one library from a server's ``/ajax/`` API.

    The total ebook count is read from ``<site>/ajax/search[/<library>]?num=0``.
    The ids are then fetched page by page (sorted by timestamp, descending),
    the matching records are downloaded from
    ``<site>/ajax/books[/<library>]?ids=...``, each record is flattened into a
    dict, and every page of dicts is handed to
    ``save_books_metadata_from_site``. Progress is printed to stdout.

    Args:
        site: Base URL of the server; a trailing "/" is stripped.
        _uuid: Passed through to ``init_site_db``.
        library: Library name appended to the API paths. Empty means the
            paths carry no library segment.
        start: 1-based position of the first ebook to index (0: from the
            beginning).
        stop: Position of the last ebook to index (0: up to the total
            reported by the server). Values above that total are clamped.
        dir: Directory passed to ``init_site_db`` and ``load_metadata``.
        num: Number of ids requested per page, limited to 1..1000.
        force_refresh: When false, ebooks for which ``load_metadata`` already
            returns something are skipped.

    Returns:
        None. The function returns early, after printing a message, when an
        HTTP request fails. If the initial count request is interrupted by a
        non-``Exception`` ``BaseException`` the process exits with status 1.
    """
    offset = 0 if not start else start - 1
    # A page size below 1 would never advance ``offset`` and the paging loop
    # would not terminate, so keep it within 1..1000.
    num = max(1, min(1000, num))
    server = site.rstrip('/')
    api = server + '/ajax/'
    lib = library
    library = '/' + library if library else library

    timeout = 15

    print(f"\nIndexing library: {lib} from server: {server} ")
    url = api + 'search' + library + '?num=0'
    print(f"\nGetting ebooks count of library: {lib} from server:{server} ")

    # NOTE(review): verify=False disables TLS certificate verification for
    # every request in this function.
    try:
        r = requests.get(url, verify=False, timeout=(timeout, 30))
        r.raise_for_status()
    except requests.RequestException:
        print("Unable to open site:", url)
        return
    except Exception as e:
        print("Other issue:", e)
        return
    except BaseException:
        # Only non-Exception BaseExceptions (e.g. KeyboardInterrupt) get here.
        print("Wazza !!!!")
        sys.exit(1)

    total_num = int(r.json()["total_num"])
    if stop:
        # Never page past what the server actually holds.
        total_num = min(stop, total_num)
    print()
    print(f"Total count={total_num} from {server}")

    db = init_site_db(site, _uuid=_uuid, dir=dir)

    print()

    # 1-based progress counter shown to the user.
    position = offset + 1
    while offset < total_num:
        remaining_num = min(num, total_num - offset)
        print('\r {:180.180}'.format(
            f'Downloading ids: offset={str(offset)} count={str(remaining_num)} from {server}'
        ), end='')

        url = (api + 'search' + library + '?num=' + str(remaining_num)
               + '&offset=' + str(offset) + '&sort=timestamp&sort_order=desc')

        try:
            r = requests.get(url, verify=False, timeout=(timeout, 30))
            r.raise_for_status()
        except requests.RequestException as e:
            print("Connection issue:", e)
            return
        except Exception as e:
            print("Other issue:", e)
            return
        except BaseException:
            # Only non-Exception BaseExceptions (e.g. KeyboardInterrupt).
            print("Wazza !!!!")
            return

        print('\r {:180.180}'.format(
            f'Downloading metadata from {str(offset+1)} to {str(offset+remaining_num)}/{total_num} from {server}'
        ), end='')
        books_s = ",".join(str(i) for i in r.json()['book_ids'])
        url = api + 'books' + library + '?ids=' + books_s

        try:
            r = requests.get(url, verify=False, timeout=(60, 60))
            r.raise_for_status()
        except requests.RequestException as e:
            print("Connection issue:", e)
            return
        except Exception as e:
            print("Other issue:", e)
            return
        except BaseException:
            # Only non-Exception BaseExceptions (e.g. KeyboardInterrupt).
            print("Wazza !!!!")
            return

        # Parse the response once instead of on every use.
        received = r.json()
        print('\r {:180.180}'.format(f'{len(received)} received'), end='')

        books = []
        for book_id, r_book in received.items():
            uuid = r_book['uuid']
            if not uuid:
                print("No uuid for ebook: ignored")
                continue

            if r_book['authors']:
                desc = f"({r_book['title']} / {r_book['authors'][0]})"
            else:
                desc = f"({r_book['title']})"

            print('\r {:180.180} '.format(
                f'{position}/{total_num} ({server} : {uuid} --> {desc}'),
                  end='')

            if not force_refresh:
                try:
                    book = load_metadata(dir, uuid)
                except BaseException:
                    # Same breadth as the original bare ``except``.
                    print("Unable to get metadata from:", uuid)
                    position += 1
                    continue
                if book:
                    print("Metadata already present for:", uuid)
                    position += 1
                    continue

            if not r_book['formats']:
                position += 1
                continue

            book = {}
            book['uuid'] = r_book['uuid']
            book['id'] = book_id
            book['library'] = lib
            book['title'] = r_book['title']
            book['authors'] = r_book['authors']
            book['desc'] = r_book['comments']
            book['series'] = r_book['series']
            s_i = r_book['series_index']
            if s_i:
                book['series_index'] = int(s_i)
            book['identifiers'] = r_book['identifiers']
            book['tags'] = r_book['tags']
            book['publisher'] = r_book['publisher']
            book['pubdate'] = r_book['pubdate']

            if r_book['languages']:
                book['language'] = iso639.to_iso639_2(r_book['languages'][0])
            else:
                # No language supplied: guess it from the comments, or from
                # the title when there are no comments.
                # NOTE(review): the comments replace the title instead of
                # being appended to it, so the ". " suffix only matters when
                # there are no comments - confirm this is intended.
                text = r_book['title'] + ". "
                if r_book['comments']:
                    text = r_book['comments']
                s_language, prob = identifier.classify(text)
                # Only trust the guess when the classifier score is >= 0.85.
                if prob >= 0.85:
                    book['language'] = iso639.to_iso639_2(s_language)
                else:
                    book['language'] = ''

            book['cover'] = bool(r_book['cover'])

            book['last_modified'] = r_book['last_modified']
            book['timestamp'] = r_book['timestamp']

            book['formats'] = []
            for f in r_book['formats']:
                f_meta = r_book['format_metadata'].get(f, {})
                if 'size' not in f_meta:
                    # Without a size the format is skipped; previously an
                    # unbound or stale ``size`` from an earlier format/book
                    # was stored here.
                    # TODO query the size online when the function to rebuild
                    # the full url is ready
                    continue
                book[f] = int(f_meta['size'])
                book['formats'].append(f)

            if not book['formats']:
                # Nothing usable for this ebook: count it once and skip it.
                position += 1
                continue

            books.append(book)
            position += 1

        print('\r {:180.180}'.format(f'Saving metadata from {server}'), end='')
        save_books_metadata_from_site(db, books)
        print('\r {:180.180}'.format(
            f'--> Saved {position-1}/{total_num} ebooks from {server}'),
              end='')
        print()
        print()

        offset = offset + num
def findSubtitlesNoneIso639(scanfolder, isoMode, disablelangdetect):
    """
    Detect subtitles that do not comply with ISO-639.

    Walks `scanfolder` recursively and inspects every file ending in
    .srt, .sub or .ass. The language code taken from the file name is
    classified as ISO 639-1 ("1"), ISO 639-2 ("2") or neither (False).
    If that classification differs from `isoMode`, a warning is printed,
    followed by the expected code when the file name carried a valid code
    of the other part. Unless `disablelangdetect` is true, the content of
    each incorrectly named file is run through `detect` and the outcome is
    printed. A summary line is printed at the end.

    Args:
        scanfolder: Root directory to scan.
        isoMode: Wanted ISO 639 part, "1" or "2".
        disablelangdetect: When true, skip content-based language detection.

    TODO: Add more subtitle extensions (and read/parse them correctly for
          language detection)
    TODO: Separate language detection better in different functions
    TODO: Add percentage of certainty and possible other languages when
          low certainty
    TODO: Handle unicode better to detect languages like German and Dutch
          better
    TODO: Use table
    """
    subtitleExts = ('.srt', '.sub', '.ass')
    total = 0
    incorrect = 0
    for subdir, _dirnames, filenames in os.walk(scanfolder):
        for filename in filenames:
            extension = os.path.splitext(filename)[1].lower()
            if extension not in subtitleExts:
                continue
            total += 1
            langcodeFromFilename = getIsoLanguageCodeFromFilename(filename)
            detectedLanguage = ""
            detectedIsoMode = False
            if is_valid639_1(langcodeFromFilename):
                detectedIsoMode = "1"
                detectedLanguage = iso639_to_name(langcodeFromFilename)
            if is_valid639_2(langcodeFromFilename):
                detectedIsoMode = "2"
                detectedLanguage = iso639_to_name(langcodeFromFilename)

            # Compare by value: identity (`is not`) on strings only works by
            # accident of interning.
            if detectedIsoMode == isoMode:
                continue

            isoShouldBe = ""
            if isoMode == "1" and detectedIsoMode == "2":
                isoShouldBe = to_iso639_1(langcodeFromFilename)
            if isoMode == "2" and detectedIsoMode == "1":
                isoShouldBe = to_iso639_2(langcodeFromFilename)
            filepath = os.path.join(subdir, filename)
            incorrect += 1
            warning = "Incorrectly named subtitle found at "
            warning += bold(filepath)
            printNotificationWarning(warning)
            if detectedIsoMode is not False:
                info = "\t\tLang code " + bold(langcodeFromFilename)
                info += " (ISO 639-" + str(detectedIsoMode) + ") "
                info += "detected. The ISO 639-" + str(isoMode) + " code"
                info += " for " + detectedLanguage + " is "
                info += bold(isoShouldBe) + "."
                printNotificationInfo(info)

            if disablelangdetect:
                continue
            try:
                with io.open(filepath, "r", encoding="utf-8") as mfile:
                    my_unicode_string = mfile.read()
                possibleLanguage = "\tDetected language is likely to "
                possibleLanguage += "be \"" + detect(my_unicode_string)
                possibleLanguage += "\"\n"
            except Exception:
                # Best effort: unreadable/undecodable files or a detector
                # failure must not abort the scan.
                possibleLanguage = "\tLanguage detection failed\n"
            # The message used to be built and then discarded; report it.
            printNotificationInfo(possibleLanguage)
    info = "Found subtitle files " + bold(str(total)) + " of which "
    info += bold(str(incorrect)) + " are incorrectly named!"
    printNotificationInfo(info)