def _first_stripped(value) -> str:
    """Return *value*, or its first item when it is a list, as a stripped string.

    Empty lists and non-string values yield ``""`` instead of raising.
    """
    if isinstance(value, list):
        value = value[0] if value else ""
    return value.strip() if isinstance(value, str) else ""


def get_title(tika_result: dict, guess=False, uri=None) -> str:
    """Pick a title for a document from a Tika extraction result.

    Sources are tried in this order:

    1. ``metadata["dc:title"]``, then ``metadata["title"]`` (a list value
       contributes its first item).
    2. If the title does not survive ``iscc.text_normalize`` and ``guess`` is
       true and the media type maps to ``GMT.TEXT``: the first line of the
       extracted ``content``, normalized and passed through ``iscc.text_trim``.
    3. If there is still no title and ``uri`` is given: the file name stem of
       the URI path with ``-`` and ``_`` replaced by spaces.

    Args:
        tika_result: Mapping with optional ``"metadata"`` and ``"content"``
            entries.
        guess: Allow deriving the title from the first line of the content.
        uri: Optional URI used as the last-resort title source.

    Returns:
        The chosen title, or ``""`` when no source yields one.
    """
    title = ""
    gmt = None
    meta = tika_result.get("metadata")
    if meta is not None:
        # Only touch the metadata when it exists; a result without a
        # "metadata" entry previously raised AttributeError on None.
        gmt = mime_to_gmt(clean_mime(meta.get("Content-Type")))
        title = _first_stripped(meta.get("dc:title", ""))
        if not title:
            title = _first_stripped(meta.get("title", ""))

    # See if the string would survive normalization. An empty title has
    # nothing to normalize, so the call is skipped in that case.
    norm_title = iscc.text_normalize(title, keep_ws=True) if title else ""
    if not norm_title and guess and gmt == GMT.TEXT:
        content = tika_result.get("content") or ""
        lines = content.strip().splitlines()
        # Empty or whitespace-only content has no first line to use.
        if lines:
            title = iscc.text_trim(iscc.text_normalize(lines[0], keep_ws=True))

    if not title and uri is not None:
        stem = splitext(basename(urlparse(uri).path))[0]
        title = stem.replace("-", " ").replace("_", " ")
    return title
def test_text_normalize():
    """Check ``iscc.text_normalize`` against a table of inputs and expected outputs."""
    sample = " Iñtërnâtiôn\nà lizætiøn☃💩 – is a tric\t ky \u00A0 thing!\r"

    # Each entry: (input text, keyword arguments, expected result).
    # Order matters only in that the first failing case is the one reported.
    cases = [
        (sample, {"keep_ws": False}, "internationalizætiøn☃💩isatrickything"),
        (sample, {"keep_ws": True}, "internation alizætiøn☃💩 is a tric ky thing"),
        (" ", {}, ""),
        (" Hello World ? ", {"keep_ws": True}, "hello world"),
        ("Hello\nWorld", {"keep_ws": True}, "hello world"),
    ]
    for raw, kwargs, expected in cases:
        assert iscc.text_normalize(raw, **kwargs) == expected
def test_text_normalize():
    """Exercise ``iscc.text_normalize`` with and without ``keep_ws``."""

    def collapsed(value):
        # Normalization with whitespace dropped.
        return iscc.text_normalize(value, keep_ws=False)

    def spaced(value):
        # Normalization with whitespace kept.
        return iscc.text_normalize(value, keep_ws=True)

    mixed = ' Iñtërnâtiôn\nà lizætiøn☃💩 – is a tric\t ky \u00A0 thing!\r'
    assert collapsed(mixed) == 'internationalizætiøn☃💩isatrickything'
    assert spaced(mixed) == 'internation alizætiøn☃💩 is a tric ky thing'

    # Called without keep_ws: whitespace-only input normalizes to nothing.
    assert iscc.text_normalize(' ') == ''

    assert spaced(' Hello World ? ') == 'hello world'
    assert spaced('Hello\nWorld') == 'hello world'