예제 #1
0
def handle_unicode(text):
    """Return only the transformed text from encode_string.handle_unicode_characters.

    The wrapped function returns a pair; this convenience wrapper keeps the
    first element (the processed text) and discards the second one
    (presumably the record of replacements that were made -- verify against
    encode_string).

    Args:
        text (string): string to transform

    Returns:
        string: the text as returned by handle_unicode_characters
    """
    normalized_text, _unused_replacements = (
        encode_string.handle_unicode_characters(text))
    return normalized_text
예제 #2
0
def sentence_based_info_annotation_dict(text,
                                        annotation_dict,
                                        process_unicode=True,
                                        replace_math=True,
                                        correct=True,
                                        corr_cite=True,
                                        is_preprocessed=False):
    """Split a BRAT annotated document into sentences with their entities and relations.

    The text is passed through a configurable chain of clean-up steps. After
    every step the change record returned by that step is handed to a helper
    together with annotation_dict, followed by _adjust_strings, so that the
    annotations follow the rewritten text. The helpers' return values are
    discarded, so annotation_dict is presumably modified in place.

    Args:
        text (string): plain text of the BRAT annotation (content of .txt file)
        annotation_dict (dict): Result of annotation_to_dict based on BRAT annotation
        process_unicode (bool, optional): run encode_string.handle_unicode_characters. Defaults to True.
        replace_math (bool, optional): run corrections.remove_math_expr. Defaults to True.
        correct (bool, optional): run corrections.correct_with_index. Defaults to True.
        corr_cite (bool, optional): run corrections.correct_citations. Defaults to True.
        is_preprocessed (bool, optional): when True, skip sentenize.normalize and
            sentenize.sentenize_with_index and use the line breaks already present
            in text as sentence boundaries. Defaults to False.

    Returns:
        list of dictionaries: one dictionary per non-empty line of the final text
            with the keys 'string', 'entities' and 'relations'
    """
    # The order of the steps matters: every step works on the text (and the
    # annotation offsets) produced by the previous one.
    if process_unicode:
        text, removed = encode_string.handle_unicode_characters(text)
        _remove_characters(annotation_dict, removed)
        _adjust_strings(annotation_dict, text)

    if replace_math:
        text, math_segments = corrections.remove_math_expr(text)
        _replace_segments(annotation_dict, math_segments)
        _adjust_strings(annotation_dict, text)

    if correct:
        text, inserted = corrections.correct_with_index(text)
        _add_characters(annotation_dict, inserted)
        _adjust_strings(annotation_dict, text)

    if corr_cite:
        text, switched = corrections.correct_citations(text)
        _switch_characters(annotation_dict, switched)
        _adjust_strings(annotation_dict, text)

    if not is_preprocessed:
        text, normalized_segments = sentenize.normalize(text)
        _replace_segments(annotation_dict, normalized_segments)
        _adjust_strings(annotation_dict, text)

        text, sentence_breaks = sentenize.sentenize_with_index(text)
        _add_characters(annotation_dict, sentence_breaks)
        _adjust_strings(annotation_dict, text)

    # Every maximal run of non-newline characters is treated as one sentence.
    result = []
    for match in re.finditer(r'[^\n]+', text):
        begin, end = match.span(0)
        entities = get_sentence_entities(begin, end, annotation_dict)
        relations = get_sentence_relations(annotation_dict, entities)
        result.append({
            'string': match.group(0),
            'entities': entities,
            'relations': relations
        })

    return result
예제 #3
0
def brat_to_bio(text,
                annotation,
                process_unicode=True,
                replace_math=True,
                correct=True,
                corr_cite=True):
    """Transform a document annotated in BRAT format into a sentence based BIO format that also considers relations.

    The raw annotation is first converted with annotation_to_dict. The text is
    then passed through the optional clean-up steps and is always normalized
    and split into sentences. After every step the change record returned by
    that step is handed to a helper together with the annotation dictionary,
    followed by _adjust_strings, so that the annotations follow the rewritten
    text.

    Args:
        text (string): plain text of the BRAT annotation (content of .txt file)
        annotation (string): BRAT annotation (content of .ann file)
        process_unicode (bool, optional): run encode_string.handle_unicode_characters. Defaults to True.
        replace_math (bool, optional): run corrections.remove_math_expr. Defaults to True.
        correct (bool, optional): run corrections.correct_with_index. Defaults to True.
        corr_cite (bool, optional): run corrections.correct_citations. Defaults to True.

    Returns:
        list of dictionaries: one dictionary per non-empty line of the final text
            with the keys 'string', 'tokens', 'names', 'labels', 'entities'
            and 'relations'
    """
    annotation_dict = annotation_to_dict(annotation)

    # The order of the steps matters: every step works on the text (and the
    # annotation offsets) produced by the previous one.
    if process_unicode:
        text, removed = encode_string.handle_unicode_characters(text)
        _remove_characters(annotation_dict, removed)
        _adjust_strings(annotation_dict, text)

    if replace_math:
        text, math_segments = corrections.remove_math_expr(text)
        _replace_segments(annotation_dict, math_segments)
        _adjust_strings(annotation_dict, text)

    if correct:
        text, inserted = corrections.correct_with_index(text)
        _add_characters(annotation_dict, inserted)
        _adjust_strings(annotation_dict, text)

    if corr_cite:
        text, switched = corrections.correct_citations(text)
        _switch_characters(annotation_dict, switched)
        _adjust_strings(annotation_dict, text)

    # Unlike the optional steps above, normalization and sentence splitting
    # are always applied.
    text, normalized_segments = sentenize.normalize(text)
    _replace_segments(annotation_dict, normalized_segments)
    _adjust_strings(annotation_dict, text)

    text, sentence_breaks = sentenize.sentenize_with_index(text)
    _add_characters(annotation_dict, sentence_breaks)
    _adjust_strings(annotation_dict, text)

    # Every maximal run of non-newline characters is treated as one sentence.
    result = []
    for match in re.finditer(r'[^\n]+', text):
        line = match.group(0)
        begin, end = match.span(0)
        entities = get_sentence_entities(begin, end, annotation_dict)
        raw_tokens = articlenizer.tokenize_text(line, 'spaces', False)
        tokens, names, labels = bio_annotate(raw_tokens, entities)
        relations = get_sentence_relations(annotation_dict, entities)
        result.append({
            'string': line,
            'tokens': tokens,
            'names': names,
            'labels': labels,
            'entities': entities,
            'relations': relations
        })

    return result
예제 #4
0
def test_replacement():
    """Accented letters are expected to come back as their plain ASCII letters."""
    accented = 'Sómè ünicôdè shóûld bè rèplâcéd.'
    result, _ = encode_string.handle_unicode_characters(accented)
    assert result == 'Some unicode should be replaced.'
예제 #5
0
def test_quotations():
    """Different opening/closing quote characters are expected to be unified."""
    mixed_quotes = '«“Different quotes should be the same.»”'
    result, _ = encode_string.handle_unicode_characters(mixed_quotes)
    assert result == '““Different quotes should be the same.””'
예제 #6
0
def test_trademarks():
    """Copyright, trademark and registered signs are expected to map to one symbol."""
    symbols = 'The following should all be the same: ©™®'
    result, _ = encode_string.handle_unicode_characters(symbols)
    assert result == 'The following should all be the same: ™™™'
예제 #7
0
def test_removal():
    """Unicode characters without a replacement are expected to be dropped."""
    noisy = '❨Some⁆ unicode ௵should be꜐ removed⑩.'
    result, _ = encode_string.handle_unicode_characters(noisy)
    assert result == 'Some unicode should be removed.'