def handle_unicode(text):
    """Return only the transformed text from encode_string.handle_unicode_characters.

    The wrapped function returns a pair; its second element is discarded here.

    Args:
        text (string): string to transform

    Returns:
        string: unicode 'normalized' string
    """
    normalized_text, _unused = encode_string.handle_unicode_characters(text)
    return normalized_text
def sentence_based_info_annotation_dict(
        text,
        annotation_dict,
        process_unicode=True,
        replace_math=True,
        correct=True,
        corr_cite=True,
        is_preprocessed=False):
    """Split a BRAT annotated document into sentences and attach the annotations of each sentence.

    The text is run through a sequence of optional clean-up steps. After every
    step annotation_dict is handed to the matching adjustment helper and to
    _adjust_strings together with the changed text. The return values of
    these helpers are discarded, so annotation_dict is presumably updated in
    place -- verify against the helpers.

    Args:
        text (string): plain text of the BRAT annotation (content of .txt file)
        annotation_dict (dict): Result of annotation_to_dict based on BRAT annotation
        process_unicode (bool, optional): replace unicodes. Defaults to True.
        replace_math (bool, optional): replace math equations. Defaults to True.
        correct (bool, optional): replace string errors. Defaults to True.
        corr_cite (bool, optional): correct citation errors. Defaults to True.
        is_preprocessed (bool, optional): skip normalization and sentence splitting of the text. Defaults to False.

    Returns:
        list of dictionaries: one entry per non-empty line of the final text with the keys 'string', 'entities' and 'relations'
    """
    if process_unicode:
        text, removed = encode_string.handle_unicode_characters(text)
        _remove_characters(annotation_dict, removed)
        _adjust_strings(annotation_dict, text)

    if replace_math:
        text, math_segments = corrections.remove_math_expr(text)
        _replace_segments(annotation_dict, math_segments)
        _adjust_strings(annotation_dict, text)

    if correct:
        text, inserted = corrections.correct_with_index(text)
        _add_characters(annotation_dict, inserted)
        _adjust_strings(annotation_dict, text)

    if corr_cite:
        text, switched_segments = corrections.correct_citations(text)
        _switch_characters(annotation_dict, switched_segments)
        _adjust_strings(annotation_dict, text)

    # NOTE(review): normalization AND sentence splitting are both assumed to
    # be guarded by is_preprocessed -- confirm against the original layout.
    if not is_preprocessed:
        text, normalized_segments = sentenize.normalize(text)
        _replace_segments(annotation_dict, normalized_segments)
        _adjust_strings(annotation_dict, text)

        text, inserted = sentenize.sentenize_with_index(text)
        _add_characters(annotation_dict, inserted)
        _adjust_strings(annotation_dict, text)

    sentences = []
    # Every run of non-newline characters counts as one sentence.
    for line_match in re.finditer(r'[^\n]+', text):
        begin, end = line_match.span(0)
        entities = get_sentence_entities(begin, end, annotation_dict)
        relations = get_sentence_relations(annotation_dict, entities)
        sentences.append({
            'string': line_match.group(0),
            'entities': entities,
            'relations': relations
        })
    return sentences
def brat_to_bio(
        text,
        annotation,
        process_unicode=True,
        replace_math=True,
        correct=True,
        corr_cite=True):
    """Convert a BRAT annotated document into sentence based BIO information including relations.

    The annotation is first parsed with annotation_to_dict. The text is then
    run through the optional clean-up steps, normalized and split into
    sentences; after every step the parsed annotation is handed to the
    matching adjustment helper and to _adjust_strings together with the
    changed text. Finally each sentence is tokenized and passed to
    bio_annotate.

    Args:
        text (string): plain text of the BRAT annotation (content of .txt file)
        annotation (string): BRAT annotation (content of .ann file)
        process_unicode (bool, optional): replace unicodes. Defaults to True.
        replace_math (bool, optional): replace math equations. Defaults to True.
        correct (bool, optional): replace string errors. Defaults to True.
        corr_cite (bool, optional): correct citation errors. Defaults to True.

    Returns:
        list of dictionaries: one entry per non-empty line of the final text with the keys 'string', 'tokens', 'names', 'labels', 'entities' and 'relations'
    """
    annotation_dict = annotation_to_dict(annotation)

    if process_unicode:
        text, removed = encode_string.handle_unicode_characters(text)
        _remove_characters(annotation_dict, removed)
        _adjust_strings(annotation_dict, text)

    if replace_math:
        text, math_segments = corrections.remove_math_expr(text)
        _replace_segments(annotation_dict, math_segments)
        _adjust_strings(annotation_dict, text)

    if correct:
        text, inserted = corrections.correct_with_index(text)
        _add_characters(annotation_dict, inserted)
        _adjust_strings(annotation_dict, text)

    if corr_cite:
        text, switched_segments = corrections.correct_citations(text)
        _switch_characters(annotation_dict, switched_segments)
        _adjust_strings(annotation_dict, text)

    # Normalization and sentence splitting always run in this function.
    text, normalized_segments = sentenize.normalize(text)
    _replace_segments(annotation_dict, normalized_segments)
    _adjust_strings(annotation_dict, text)

    text, inserted = sentenize.sentenize_with_index(text)
    _add_characters(annotation_dict, inserted)
    _adjust_strings(annotation_dict, text)

    sentences = []
    # Every run of non-newline characters counts as one sentence.
    for line_match in re.finditer(r'[^\n]+', text):
        line = line_match.group(0)
        begin, end = line_match.span(0)
        entities = get_sentence_entities(begin, end, annotation_dict)
        raw_tokens = articlenizer.tokenize_text(line, 'spaces', False)
        # bio_annotate hands back the token list together with names and labels.
        tokens, names, labels = bio_annotate(raw_tokens, entities)
        relations = get_sentence_relations(annotation_dict, entities)
        sentences.append({
            'string': line,
            'tokens': tokens,
            'names': names,
            'labels': labels,
            'entities': entities,
            'relations': relations
        })
    return sentences
def test_replacement():
    """Accented letters in the input are expected to come back as plain letters."""
    raw = 'Sómè ünicôdè shóûld bè rèplâcéd.'
    expected = 'Some unicode should be replaced.'
    result, _ = encode_string.handle_unicode_characters(raw)
    assert result == expected
def test_quotations():
    """Guillemets in the input are expected to come back as curly double quotes."""
    raw = '«“Different quotes should be the same.»”'
    expected = '““Different quotes should be the same.””'
    result, _ = encode_string.handle_unicode_characters(raw)
    assert result == expected
def test_trademarks():
    """Copyright, trademark and registered signs are all expected to become the trademark sign."""
    raw = 'The following should all be the same: ©™®'
    expected = 'The following should all be the same: ™™™'
    result, _ = encode_string.handle_unicode_characters(raw)
    assert result == expected
def test_removal():
    """The special symbols in the input are expected to be dropped without replacement."""
    raw = '❨Some⁆ unicode ௵should be꜐ removed⑩.'
    expected = 'Some unicode should be removed.'
    result, _ = encode_string.handle_unicode_characters(raw)
    assert result == expected