# The snippets below assume the articlenizer modules are importable, e.g.
# "from articlenizer import articlenizer, formatting" and "import re";
# the exact import paths are an assumption.
def test_tokenization_without_spaces_application():
    s = "Several softwares and R packages are available for Rasch model analysis such as ConQuest (https://shop.acer.edu.au/group/CON3), RUMM (www.rummlab.com.au), ltm (cran.r-project.org/package=ltm) and eRM (cran.r-project.org/package=eRm)."
    s = articlenizer.tokenize_text(s)
    assert s == [
        'Several', 'softwares', 'and', 'R', 'packages', 'are', 'available',
        'for', 'Rasch', 'model', 'analysis', 'such', 'as', 'ConQuest', '(',
        'https://shop.acer.edu.au/group/CON3', ')', ',', 'RUMM', '(',
        'www.rummlab.com.au', ')', ',', 'ltm', '(',
        'cran.r-project.org/package=ltm', ')', 'and', 'eRM', '(',
        'cran.r-project.org/package=eRm', ')', '.'
    ]

def test_tokenization_without_spaces():
    s = 'Tokenization is tested with a single sentence, which requires an example such as the sentence: "Data processing and statistical analyses were conducted using IBM SPSS 22.0 (IBM Corp., Armonk, NY), MATLAB R2015a (The MathWorks, Natick, MA), R 3.3.2 (http://www.R-project.org/), and Python libraries for scientific computation (NumPy, and SciPy) [39]."'
    s = articlenizer.tokenize_text(s)
    assert s == [
        'Tokenization', 'is', 'tested', 'with', 'a', 'single', 'sentence', ',',
        'which', 'requires', 'an', 'example', 'such', 'as', 'the', 'sentence',
        ':', '"', 'Data', 'processing', 'and', 'statistical', 'analyses',
        'were', 'conducted', 'using', 'IBM', 'SPSS', '22.0', '(', 'IBM',
        'Corp', '.', ',', 'Armonk', ',', 'NY', ')', ',', 'MATLAB', 'R',
        '2015a', '(', 'The', 'MathWorks', ',', 'Natick', ',', 'MA', ')', ',',
        'R', '3.3.2', '(', 'http://www.R-project.org/', ')', ',', 'and',
        'Python', 'libraries', 'for', 'scientific', 'computation', '(',
        'NumPy', ',', 'and', 'SciPy', ')', '[39]', '.', '"'
    ]
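
Both tests exercise articlenizer.tokenize_text, which splits raw text into a flat list of token strings, separating punctuation and brackets while keeping URLs and version numbers (e.g. 'http://www.R-project.org/', '22.0') intact. A minimal standalone sketch; the import path is an assumption:

# Minimal usage sketch; the import below is an assumed module path.
from articlenizer import articlenizer

tokens = articlenizer.tokenize_text("We used SPSS 22.0 (IBM Corp.).")
# Based on the expected lists in the tests above, the result should resemble:
# ['We', 'used', 'SPSS', '22.0', '(', 'IBM', 'Corp', '.', ')', '.']
print(tokens)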
Example #3
def analyze_annotation(annotation_files, threshold=6):
    """Analyze list of BRAT annotation files with respect to the annotated entities and relations.

    Args:
        annotation_files (list): list of input annotation files
        threshold (int, optional): print annotations longer than threshold tokens. Defaults to 6.
    """
    entity_count = 0
    relation_count = 0
    max_entity_count = 0
    max_relation_count = 0
    # overall_entity_counts is filled by add_to_dictionary; keys 0 and 1 are
    # read below for entity strings and mention types respectively.
    overall_entity_counts = {}
    entity_counts = {}
    token_number = {}
    relation_counts = {}

    for f in annotation_files:
        with f.open() as in_anno:
            anno_text = in_anno.read()
            annotation_dict = formatting.annotation_to_dict(anno_text)
            entity_count += len(annotation_dict['entities'])
            relation_count += len(annotation_dict['relations'])
            max_entity_count = max(max_entity_count,
                                   len(annotation_dict['entities']))
            max_relation_count = max(max_relation_count,
                                     len(annotation_dict['relations']))

            for _, entity in annotation_dict['entities'].items():
                add_to_dictionary(entity_counts, entity, overall_entity_counts)

                # Track the distribution of entity lengths in tokens
                entity_tokens = articlenizer.tokenize_text(entity['string'])
                if len(entity_tokens) not in token_number:
                    token_number[len(entity_tokens)] = 0
                token_number[len(entity_tokens)] += 1
                if len(entity_tokens) > threshold:
                    print("Annotation of length {}: {}".format(
                        len(entity_tokens), entity_tokens))
                    print(f)

            for _, relation in annotation_dict['relations'].items():
                if relation['label'] not in relation_counts:
                    relation_counts[relation['label']] = 0
                relation_counts[relation['label']] += 1

    average_entities = entity_count / len(annotation_files)
    average_relations = relation_count / len(annotation_files)

    out_s = """
Base Statistics:
\tTotal number of entities/relations: {}/{}
\tAverage entities/relations:         {}/{}
\tMax entities/relations:             {}/{}
\tDistribution of token length: {}

Occurrences of Entities: 

{} 
Occurrences of Mention Types:

{}
Occurrences of Relation:

{}
Most Common Software:

{}
    """.format(entity_count, relation_count, round(average_entities, 2),
               round(average_relations, 2), max_entity_count,
               max_relation_count, sorted(token_number.items()),
               show_counts(overall_entity_counts[0]),
               show_counts(overall_entity_counts[1]),
               show_counts(relation_counts),
               show_most_common_entities(entity_counts))
    print(out_s)
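
The function expects annotation_files to hold pathlib.Path objects, since it calls f.open() on each element. A hedged usage sketch; the corpus directory and glob pattern are assumptions:

# Usage sketch; the directory and glob pattern are illustrative only.
from pathlib import Path

ann_files = sorted(Path('corpus').glob('*.ann'))
if ann_files:  # avoid dividing by zero in the averages
    analyze_annotation(ann_files, threshold=6)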
Example #4
def brat_to_bio(text,
                annotation,
                process_unicode=True,
                replace_math=True,
                correct=True,
                corr_cite=True):
    """Transform a document annotated in BRAT format into a sentence based BIO format that also considers relations. 

    Args:
        text (string): plain text of the BRAT annotation (content of .txt file)
        annotation (string): BRAT annotation (content of .ann file)
        process_unicode (bool, optional): replace unicodes. Defaults to True.
        replace_math (bool, optional): replace math equations. Defaults to True.
        correct (bool, optional): replace string errors. Defaults to True.
        corr_cite (bool, optional): correct citation errors. Defaults to True.

    Returns:
        list of dictionaries: sentences information for each sentence in text 
    """
    annotation_dict = annotation_to_dict(annotation)
    # Each normalization step below modifies the text, so the entity offsets
    # in annotation_dict are re-adjusted after every transformation.
    if process_unicode:
        text, replacements = encode_string.handle_unicode_characters(text)
        _remove_characters(annotation_dict, replacements)
        _adjust_strings(annotation_dict, text)
    if replace_math:
        text, replacements = corrections.remove_math_expr(text)
        _replace_segments(annotation_dict, replacements)
        _adjust_strings(annotation_dict, text)
    if correct:
        text, replacements = corrections.correct_with_index(text)
        _add_characters(annotation_dict, replacements)
        _adjust_strings(annotation_dict, text)
    if corr_cite:
        text, switched_segments = corrections.correct_citations(text)
        _switch_characters(annotation_dict, switched_segments)
        _adjust_strings(annotation_dict, text)

    # Normalize the text and split it into one sentence per line, keeping the
    # annotation offsets in sync.
    text, replacements = sentenize.normalize(text)
    _replace_segments(annotation_dict, replacements)
    _adjust_strings(annotation_dict, text)
    text, replacements = sentenize.sentenize_with_index(text)
    _add_characters(annotation_dict, replacements)
    _adjust_strings(annotation_dict, text)

    sentences = []
    sentence_match_objects = re.finditer(r'[^\n]+', text)
    for sentence in sentence_match_objects:
        sentence_string = sentence.group(0)
        sentence_entities = get_sentence_entities(sentence.start(),
                                                  sentence.end(),
                                                  annotation_dict)
        tokens = articlenizer.tokenize_text(sentence_string, 'spaces', False)
        tokens, names, labels = bio_annotate(tokens, sentence_entities)
        sentence_relations = get_sentence_relations(annotation_dict,
                                                    sentence_entities)
        sentences.append({
            'string': sentence_string,
            'tokens': tokens,
            'names': names,
            'labels': labels,
            'entities': sentence_entities,
            'relations': sentence_relations
        })

    return sentences
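
A hedged end-to-end sketch of brat_to_bio. The .txt/.ann pair below is invented for illustration (BRAT entity lines have the form 'T<id>\t<type> <start> <end>\t<string>'), and the entity type 'Application' is an assumption:

# End-to-end sketch with an invented BRAT text/annotation pair; the entity
# type and the character offsets 26-30 (the span of "SPSS") are illustrative.
txt = 'We analysed the data with SPSS.\n'
ann = 'T1\tApplication 26 30\tSPSS'

for sentence in brat_to_bio(txt, ann):
    print(sentence['tokens'], sentence['labels'])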