Exemple #1
0
def get_title(tika_result: dict, guess=False, uri=None):
    """Derive a title for a document from a Tika extraction result.

    Candidates are tried in this order:

    1. The ``dc:title`` metadata entry, then the ``title`` metadata entry
       (the first item is used when the entry is a list).
    2. If ``guess`` is true, the metadata title does not survive
       ``iscc.text_normalize`` and the media type maps to ``GMT.TEXT``:
       the first line of the extracted content, normalized and trimmed.
    3. If there is still no title and ``uri`` is given: the file name stem
       of the URI path with ``-`` and ``_`` replaced by spaces.

    Args:
        tika_result: Mapping with an optional ``"metadata"`` mapping and an
            optional ``"content"`` string.
        guess: Allow falling back to the first line of the content.
        uri: Optional URI used as a last-resort source for the title.

    Returns:
        The title string, or ``""`` when no candidate produced one.
    """
    title = ""
    # "metadata" may be absent or explicitly None; treat both as empty so the
    # lookups below never dereference None.
    meta = tika_result.get("metadata") or {}
    # NOTE(review): clean_mime receives None when "Content-Type" is absent -
    # confirm that it tolerates that input.
    mime_type = clean_mime(meta.get("Content-Type"))
    gmt = mime_to_gmt(mime_type)

    for key in ("dc:title", "title"):
        # `or ""` also covers an explicit None value and an empty list.
        value = meta.get(key) or ""
        if isinstance(value, list):
            value = value[0]
        title = value.strip()
        if title:
            break

    # See if string would survive normalization
    norm_title = iscc.text_normalize(title, keep_ws=True)

    if not norm_title and guess and gmt == GMT.TEXT:
        content = tika_result.get("content") or ""
        # Empty or whitespace-only content has no first line to use.
        lines = content.strip().splitlines()
        if lines:
            title = iscc.text_trim(
                iscc.text_normalize(lines[0], keep_ws=True))

    if not title and uri is not None:
        result = urlparse(uri)
        base = basename(result.path)
        title = splitext(base)[0]
        title = title.replace("-", " ")
        title = title.replace("_", " ")
    return title
Exemple #2
0
def test_text_normalize():
    """Check iscc.text_normalize output for a set of fixed inputs."""
    sample = "  Iñtërnâtiôn\nàlizætiøn☃💩 –  is a tric\t ky \u00A0 thing!\r"

    # (input text, keyword options, expected normalized output)
    cases = (
        (sample, {"keep_ws": False}, "internationalizætiøn☃💩isatrickything"),
        (sample, {"keep_ws": True},
         "internation alizætiøn☃💩 is a tric ky thing"),
        # No keep_ws given: exercises the function's default.
        (" ", {}, ""),
        ("  Hello  World ? ", {"keep_ws": True}, "hello world"),
        ("Hello\nWorld", {"keep_ws": True}, "hello world"),
    )

    for raw, options, expected in cases:
        assert iscc.text_normalize(raw, **options) == expected
Exemple #3
0
def test_text_normalize():
    """Check iscc.text_normalize with keep_ws off, on, and left at default."""

    def collapsed(value):
        # Normalize with keep_ws disabled.
        return iscc.text_normalize(value, keep_ws=False)

    def spaced(value):
        # Normalize with keep_ws enabled.
        return iscc.text_normalize(value, keep_ws=True)

    sample = '  Iñtërnâtiôn\nàlizætiøn☃💩 –  is a tric\t ky \u00A0 thing!\r'

    assert collapsed(sample) == 'internationalizætiøn☃💩isatrickything'
    assert spaced(sample) == 'internation alizætiøn☃💩 is a tric ky thing'

    # Whitespace-only input, default keep_ws.
    assert iscc.text_normalize(' ') == ''

    assert spaced('  Hello  World ? ') == 'hello world'
    assert spaced('Hello\nWorld') == 'hello world'