import unicodedata

from ftfy import fixes


def fix_text_segment(text, *,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     fix_latin_ligatures=True,
                     fix_character_width=True,
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True,
                     normalization='NFC'):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount of
    text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    # Keep applying the fixes until the text stops changing.
    while True:
        origtext = text
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_encoding(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom and not remove_control_chars:
            # Skip this step if we've already done `remove_control_chars`,
            # because it would be redundant.
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text
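# A minimal usage sketch, assuming the function above behaves like ftfy's
# fix_text_segment; the sample strings follow ftfy's documented examples:
print(fix_text_segment('âœ” No problems'))       # expected: '✔ No problems'
print(fix_text_segment('ＬＯＵＤ　ＮＯＩＳＥＳ'))  # expected: 'LOUD NOISES'
print(fix_text_segment('ＬＯＵＤ　ＮＯＩＳＥＳ', fix_character_width=False))
# expected: the full-width characters are left as-is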
import re

import numpy as np

from ftfy import fixes

# `patterns` (a table of (name, val_type, pattern, norm_func) tuples) and
# `set_field` are assumed to be defined elsewhere in this module.


def parse(file_name: str):
    # Normalize full-width characters so the regex patterns match
    # consistently.
    file_name = fixes.fix_character_width(file_name)
    infos = {}
    # One flag per character; True marks characters consumed by a recognized
    # field, which excludes them from title extraction below.
    masks = np.zeros(len(file_name), dtype='bool')
    clean_name = file_name.replace('_', ' ')
    for name, val_type, pattern, norm_func in patterns:
        matches = re.findall(pattern, clean_name)
        if len(matches) == 0:
            continue
        for match in matches:
            index = clean_name.index(match[0])
            masks[index:index + len(match[0])] = True
            set_field(infos, name, val_type, norm_func(match[1]))

    def normalize(part: str):
        # Strip common decorations to recover a clean title fragment.
        title = part.split('(')[0]
        if title.startswith('- '):
            title = title[2:]
        if '.' in title and ' ' not in title:
            title = title.replace('.', ' ')
        title = title.replace('_', ' ')
        return title

    # Treat bracket and quote characters as separators as well.
    for i, c in enumerate(file_name):
        if c in '()[]()【】“”"':
            masks[i] = True

    # Collect the remaining unmasked runs of characters as candidate phrases.
    start = 0
    phrases = []
    while start < len(file_name):
        if not masks[start]:
            end = start + 1
            while end < len(masks) and not masks[end]:
                end += 1
            phrase = normalize(file_name[start:end])
            phrases.append(phrase.strip('._-@[]【 】()()'))
            start = end
        else:
            start += 1
    if len(phrases) > 0:
        # The longest remaining phrase is taken to be the title.
        set_field(infos, 'Title', str, max(phrases, key=lambda p: len(p)))
    return infos, [p for p in phrases if len(p) > 0]
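# A hedged usage sketch for `parse`: the extracted fields depend entirely on
# the module-level `patterns` table and `set_field` helper (not shown here),
# so the filename and outputs below are hypothetical:
infos, phrases = parse('[Group] Some.Show.Title (1080p).mkv')
print(infos.get('Title'))  # plausibly 'Some Show Title'
print(phrases)             # the leftover unmasked fragments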
from ftfy import fixes

# `tokenize` and `stopwords` are assumed to be defined elsewhere in this
# module.


def make_tsquery(query: str):
    # Normalize full-width characters before tokenizing, then drop stopwords
    # and non-alphanumeric tokens; '&' joins the terms into a PostgreSQL
    # tsquery string.
    words = tokenize(fixes.fix_character_width(query))
    return ' & '.join(
        [word for word in words if word not in stopwords and word.isalnum()])
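# A minimal usage sketch; `tokenize` and `stopwords` are assumed to come from
# the surrounding module, so the exact output is only plausible:
print(make_tsquery('ｔｈｅ ｑｕｉｃｋ ｂｒｏｗｎ ｆｏｘ'))
# plausibly 'quick & brown & fox' if 'the' is a stopword; the '&'-joined
# string is the AND-operator syntax accepted by PostgreSQL's to_tsquery().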