Python handle_unicode_characters 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: articlenizer.encode_string

메소드/함수: handle_unicode_characters

hotexamples.com에서의 예제들: 7

Python handle_unicode_characters - 7개의 예제를 찾았습니다. 오픈 소스 프로젝트에서 추출한 Python articlenizer.encode_string.handle_unicode_characters의 실제 사용 예제 가운데 평가가 가장 높은 것들입니다. 예제를 평가하면 예제의 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

파일: articlenizer.py 프로젝트: dave-s477/articlenizer

def handle_unicode(text):
    """Return `text` after unicode handling, without the replacement record.

    Convenience wrapper around encode_string.handle_unicode_characters, which
    returns a pair; only the first element (the transformed string) is passed
    on to the caller, the second element is ignored.

    Args:
        text (string): string to transform

    Returns:
        string: unicode 'normalized' string
    """
    # Unpack (rather than index) so a result that is not a pair still fails loudly.
    normalized, _unused = encode_string.handle_unicode_characters(text)
    return normalized

예제 #2

파일 보기

파일: formatting.py 프로젝트: dave-s477/articlenizer

def sentence_based_info_annotation_dict(text,
                                        annotation_dict,
                                        process_unicode=True,
                                        replace_math=True,
                                        correct=True,
                                        corr_cite=True,
                                        is_preprocessed=False):
    """Collect entities and relations of a BRAT annotated document sentence by sentence.

    The text passes through the enabled cleanup stages in a fixed order
    (unicode, math, string corrections, citations, then normalization and
    sentence splitting). After every stage the matching offset helper and
    _adjust_strings are called on annotation_dict; their return values are not
    used, so they are presumably applied for their effect on annotation_dict.

    Args:
        text (string): plain text of the BRAT annotation (content of .txt file)
        annotation_dict (dict): Result of annotation_to_dict based on BRAT annotation
        process_unicode (bool, optional): replace unicodes. Defaults to True.
        replace_math (bool, optional): replace math equations. Defaults to True.
        correct (bool, optional): replace string errors. Defaults to True.
        corr_cite (bool, optional): correct citation errors. Defaults to True.
        is_preprocessed (bool, optional): when True, skip sentenize.normalize and
            sentenize.sentenize_with_index and use the text's existing line
            breaks as sentence boundaries. Defaults to False.

    Returns:
        list of dictionaries: one entry per non-empty line of the final text,
        with the keys 'string', 'entities' and 'relations'
    """

    def realign(update_offsets, changes, current_text):
        # Every text transformation is followed by the same two bookkeeping calls.
        update_offsets(annotation_dict, changes)
        _adjust_strings(annotation_dict, current_text)

    if process_unicode:
        text, changes = encode_string.handle_unicode_characters(text)
        realign(_remove_characters, changes, text)
    if replace_math:
        text, changes = corrections.remove_math_expr(text)
        realign(_replace_segments, changes, text)
    if correct:
        text, changes = corrections.correct_with_index(text)
        realign(_add_characters, changes, text)
    if corr_cite:
        text, swapped = corrections.correct_citations(text)
        realign(_switch_characters, swapped, text)

    if not is_preprocessed:
        text, changes = sentenize.normalize(text)
        realign(_replace_segments, changes, text)
        text, changes = sentenize.sentenize_with_index(text)
        realign(_add_characters, changes, text)

    # Each maximal run of non-newline characters is treated as one sentence.
    result = []
    for line in re.finditer(r'[^\n]+', text):
        begin, end = line.span()
        entities = get_sentence_entities(begin, end, annotation_dict)
        relations = get_sentence_relations(annotation_dict, entities)
        result.append({
            'string': line.group(),
            'entities': entities,
            'relations': relations
        })

    return result

예제 #3

파일 보기

파일: formatting.py 프로젝트: dave-s477/articlenizer

def brat_to_bio(text,
                annotation,
                process_unicode=True,
                replace_math=True,
                correct=True,
                corr_cite=True):
    """Convert a BRAT annotated document into sentence-wise BIO data including relations.

    The annotation is parsed with annotation_to_dict, then the text passes
    through the enabled cleanup stages in a fixed order (unicode, math, string
    corrections, citations). Normalization and sentence splitting always run.
    After every stage the matching offset helper and _adjust_strings are
    called on the annotation dictionary.

    Args:
        text (string): plain text of the BRAT annotation (content of .txt file)
        annotation (string): BRAT annotation (content of .ann file)
        process_unicode (bool, optional): replace unicodes. Defaults to True.
        replace_math (bool, optional): replace math equations. Defaults to True.
        correct (bool, optional): replace string errors. Defaults to True.
        corr_cite (bool, optional): correct citation errors. Defaults to True.

    Returns:
        list of dictionaries: one entry per non-empty line of the final text,
        with the keys 'string', 'tokens', 'names', 'labels', 'entities' and
        'relations'
    """
    annotation_dict = annotation_to_dict(annotation)

    def realign(update_offsets, changes, current_text):
        # Every text transformation is followed by the same two bookkeeping calls.
        update_offsets(annotation_dict, changes)
        _adjust_strings(annotation_dict, current_text)

    if process_unicode:
        text, changes = encode_string.handle_unicode_characters(text)
        realign(_remove_characters, changes, text)
    if replace_math:
        text, changes = corrections.remove_math_expr(text)
        realign(_replace_segments, changes, text)
    if correct:
        text, changes = corrections.correct_with_index(text)
        realign(_add_characters, changes, text)
    if corr_cite:
        text, swapped = corrections.correct_citations(text)
        realign(_switch_characters, swapped, text)

    text, changes = sentenize.normalize(text)
    realign(_replace_segments, changes, text)
    text, changes = sentenize.sentenize_with_index(text)
    realign(_add_characters, changes, text)

    # Each maximal run of non-newline characters is treated as one sentence.
    result = []
    for line in re.finditer(r'[^\n]+', text):
        line_text = line.group()
        begin, end = line.span()
        entities = get_sentence_entities(begin, end, annotation_dict)
        raw_tokens = articlenizer.tokenize_text(line_text, 'spaces', False)
        tokens, names, labels = bio_annotate(raw_tokens, entities)
        relations = get_sentence_relations(annotation_dict, entities)
        result.append({
            'string': line_text,
            'tokens': tokens,
            'names': names,
            'labels': labels,
            'entities': entities,
            'relations': relations
        })

    return result

예제 #4

파일 보기

파일: test_encoding.py 프로젝트: dave-s477/articlenizer

def test_replacement():
    """Accented letters are expected to come back as their unaccented base letters."""
    original = 'Sómè ünicôdè shóûld bè rèplâcéd.'
    expected = 'Some unicode should be replaced.'
    result, _ = encode_string.handle_unicode_characters(original)
    assert result == expected

예제 #5

파일 보기

파일: test_encoding.py 프로젝트: dave-s477/articlenizer

def test_quotations():
    """Guillemets are expected to be unified with the curly double quotes."""
    original = '«“Different quotes should be the same.»”'
    expected = '““Different quotes should be the same.””'
    result, _ = encode_string.handle_unicode_characters(original)
    assert result == expected

예제 #6

파일 보기

파일: test_encoding.py 프로젝트: dave-s477/articlenizer

def test_trademarks():
    """Copyright and registered signs are expected to be unified with the trademark sign."""
    original = 'The following should all be the same: ©™®'
    expected = 'The following should all be the same: ™™™'
    result, _ = encode_string.handle_unicode_characters(original)
    assert result == expected

예제 #7

파일 보기

파일: test_encoding.py 프로젝트: dave-s477/articlenizer

def test_removal():
    """The unusual symbols scattered through the input are expected to be dropped entirely."""
    original = '❨Some⁆ unicode ௵should be꜐ removed⑩.'
    expected = 'Some unicode should be removed.'
    result, _ = encode_string.handle_unicode_characters(original)
    assert result == expected