Example #1
# Imports assumed from EstNLTK (the fragment uses these names below):
from estnltk import Text
from estnltk.converters import json_to_layers, text_to_json


def tag_layer(data):
    text = Text(data['text'])
    text.meta = data['meta']
    layers = json_to_layers(text, data['layers'])

    for layer in Text.topological_sort(layers):
        text.add_layer(layer)

    text.tag_layer(layer_names=data['parameters']['layer_names'])
    return text_to_json(text)
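
For reference, a minimal round-trip sketch with the same converters used above; calling text_to_json without a file argument returns the JSON string (this is a sketch, not part of the original service code):

from estnltk import Text
from estnltk.converters import text_to_json, json_to_text

# Build a small Text, tag the default morphology layers, and
# serialize it to a JSON string (no file argument -> string result).
text = Text('Tere, maailm!')
text.meta['source'] = 'example'
text.tag_layer(layer_names=['morph_analysis'])

json_str = text_to_json(text)
# The string converts back into an equivalent Text object.
restored = json_to_text(json_str)
assert restored.text == text.text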
Example #2
            if isinstance(timex.annotations[0]['part_of_interval'],
                          OrderedDict):
                seen_implicit_timexes.add(
                    timex.annotations[0]['part_of_interval']['tid'])
            if timex.annotations[0]['comment'] is not None:
                commented += 1
        implicit_timex_count += len(seen_implicit_timexes)
        # Preprocess with EstNLTK (if required)
        if preprocess:
            preprocess_for_timex_tagger(text_obj)
            text_obj.tag_layer(['morph_analysis'])
        #print(text_obj.meta)
        converted += 1
        if not dry_run:
            # Write out results
            new_fname = fname.replace('.tml', '.json')
            fpath = os.path.join(output_dir, new_fname)
            text_to_json(text_obj, fpath)
    #if converted > 5:
    #    break

# Output statistics
print()
print(' Total processing time: {}'.format(datetime.now() - start))
print(' Docs converted:        ', converted)
print('    Original tokens:    ', original_tokens)
print('    Explicit  timexes:  ', timex_count)
print('    Implicit  timexes:  ', implicit_timex_count)
print('    Commented timexes:  ', commented)
print()
Example #3
                # 4) Convert morph analyses to GT format
                if add_gt_morph_analysis:
                    gt_converter.tag(text)

                # 5) Add syntax_ignore
                if add_syntax_ignore:
                    syntax_ignore_tagger.tag(text)

                # 6) Save results
                if not skip_saving:
                    if output_format == 'pickle':
                        with open(ofnm_pckl, 'wb') as fout:
                            pickle.dump(text, fout)
                    else:
                        text_to_json(text, file=ofnm_json)
                    new_files += 1

            # X) Log errors
            except Exception as err:
                write_error_log(in_file_name, err)
                errors += 1
            processed += 1

            # Report processing status and time elapsed
            if processed % 500 == 0:
                print(processed)
            time_diff = datetime.now() - startTime
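
For completeness, a hedged sketch of loading the saved result back; output_format, ofnm_pckl and ofnm_json are the same variables as in the fragment above, and json_to_text is assumed as the reading counterpart of text_to_json:

import pickle
from estnltk.converters import json_to_text

# Load the Text object back from whichever format was written.
if output_format == 'pickle':
    with open(ofnm_pckl, 'rb') as fin:
        text = pickle.load(fin)
else:
    text = json_to_text(file=ofnm_json)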
Example #4
            attribs['brat_id'] = rel_id
            attribs['rel_type'] = rel_type
            attribs['a_text'] = ' '.join([content[s:e] for s, e in arg1_loc])
            attribs['b_text'] = ' '.join([content[s:e] for s, e in arg2_loc])
            attribs['b_index'] = len(arg1_loc)
            arguments_layer.add_annotation(arg1_loc + arg2_loc, **attribs)
        text_obj.add_layer(arguments_layer)
        text_objects.append(text_obj)
    return text_objects


if __name__ == '__main__':
    input_folder = None
    output_folder = None
    if len(sys.argv) > 2:
        input_folder = sys.argv[1]
        assert os.path.isdir(input_folder), \
            '(!) Unexpected input folder: {!r}. Please give the name of the input folder as the first argument.'.format(input_folder)
        output_folder = sys.argv[2]
        assert os.path.isdir(output_folder), \
            '(!) Unexpected output folder: {!r}. Please give the name of the (existing) output folder as the second argument.'.format(output_folder)
        text_objects = import_from_brat_folder(input_folder)
        for text in text_objects:
            fpath = os.path.join(output_folder, text.meta['file'] + '.json')
            print('=>', fpath)
            text_to_json(text, file=fpath)
        print(f"{len(text_objects)} files converted.")
    else:
        print('(!) Missing command line arguments input_folder and output_folder.\n'
              f'Usage:  python  {sys.argv[0]}  [input_folder]  [output_folder]')
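
A hedged sketch of reading one converted document back and inspecting its relation annotations; the layer name 'arguments' and the file path are assumptions based on the variable names above:

from estnltk.converters import json_to_text

# Load one converted document (path is a placeholder).
text = json_to_text(file='output_folder/doc1.json')
print(text.layers)  # check which layers the converter actually created

# Layer name 'arguments' is assumed from the variable arguments_layer.
for span in text['arguments']:
    ann = span.annotations[0]
    print(ann['rel_type'], ':', ann['a_text'], '->', ann['b_text'])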
Example #5
        for text in parse_ettenten_corpus_file_iterator( in_file, encoding='utf-8', \
                                                         add_tokenization=add_tokenization, \
                                                         discard_empty_paragraphs=True, \
                                                         store_paragraph_attributes=store_paragraph_attributes ):
            if 'web_domain' not in text.meta:
                for k, v in text.meta.items():
                    logger.error('{0}: {1}'.format(k, v))
                raise Exception(
                    ' (!) Web domain name not found in the metadata of the text! '
                )
            # Construct name of the file (based on web domain name)
            domain_name = text.meta['web_domain']
            domain_name = domain_name.replace('.', '_')
            fnm = domain_name + '__' + str(document_id) + output_ext
            if not only_mimic_output:
                out_file_path = os.path.join(out_dir, fnm)
            else:
                out_file_path = fnm
            logger.debug(' Writing document {0}'.format(out_file_path))
            # Export in json format
            if not only_mimic_output:
                text_to_json(text, file=out_file_path)
            document_id += 1
            if convert_only_n_docs and convert_only_n_docs <= document_id:
                break
        logger.info(' {0} documents converted.'.format(document_id))
        time_diff = datetime.now() - startTime
        logger.info(' Total processing time: {}'.format(time_diff))
    else:
        arg_parser.print_help()
Example #6
def process(start_dir, out_dir, encoding='utf-8', \
            add_tokenization=False,\
            preserve_tokenization=False, \
            create_empty_docs=True, \
            sentence_separator='\n' ):
    """Traverses recursively start_dir to find XML TEI documents,
       converts found documents to EstNLTK Text objects, and saves 
       as JSON files (into the out_dir).
    
    Parameters
    ----------
    start_dir: str
        The root directory which is recursively traversed to find 
        XML files;
    out_dir: str
        The directory where results (EstNLTK Texts in JSON format)
        are to be saved;
    encoding: str
        Encoding of the XML files. (default: 'utf-8')
    add_tokenization: boolean
        If True, then tokenization layers 'tokens', 'compound_tokens',
        'words', 'sentences', 'paragraphs' will be added to all newly 
        created Text instances;
        If preserve_tokenization is set, then original tokenization in 
        the document will be preserved; otherwise, the tokenization will be
        created with EstNLTK's default tokenization tools;
        (Default: False)
    preserve_tokenization: boolean
        If True, then the created documents will have layers 'tokens', 
        'compound_tokens', 'words', 'sentences', 'paragraphs', which 
        follow the original segmentation in the XML file. 
        (In the XML, sentences are between <s> and </s>, paragraphs are
        between <p> and </p>, and words are separated by spaces);
        Note that the layer 'compound_tokens' will always remain empty
        because koondkorpus files do not contain information about token
        compounding.
        (default: False)
    create_empty_docs: boolean
        If True, then documents are also created if there is no textual
        content, only metadata.
        Note: an empty document may be a captioned table or a figure
        whose content has been removed from the XML file. Depending on
        the goals of the analysis, the caption may still be useful,
        so, by default, empty documents are preserved;
        (default: True)
    sentence_separator: str
        String to be used as a sentence separator during the reconstruction
        of the text. A value must always be provided; None is not
        allowed.
        (Default: '\n')
    """
    global logger
    xml_count = 0
    json_count = 0
    startTime = datetime.now()
    no_documents_created = []
    for dirpath, dirnames, filenames in os.walk(start_dir):
        if len(dirnames) > 0 or len(filenames) == 0 or 'bin' in dirpath:
            continue
        for fnm in filenames:
            full_fnm = os.path.join(dirpath, fnm)
            out_prefix = os.path.join(out_dir, fnm)
            target = get_div_target(full_fnm)
            if os.path.exists(out_prefix + '_0.' + output_ext):
                logger.debug(
                    'Skipping file {0}, because it seems to be already processed'
                    .format(full_fnm))
                continue
            logger.debug('Processing file {0} with target {1}'.format(
                full_fnm, target))
            docs = parse_tei_corpus(full_fnm, target=[target], encoding=encoding, \
                                    add_tokenization=add_tokenization, \
                                    preserve_tokenization=preserve_tokenization, \
                                    record_xml_filename=True, \
                                    sentence_separator=sentence_separator )
            xml_count += 1
            empty_docs = []
            for doc_id, doc in enumerate(docs):
                out_fnm = '{0}_{1}.{2}'.format(out_prefix, doc_id, output_ext)
                logger.debug('Writing document {0}'.format(out_fnm))
                if not create_empty_docs and len(doc.text) == 0:
                    # Skip creating an empty document
                    continue
                if len(doc.text) == 0:
                    empty_docs.append(out_fnm)
                text_to_json(doc, file=out_fnm)
                json_count += 1
            if empty_docs:
                logger.warning(
                    'Warning: empty documents created for {0}: {1}'.format(
                        fnm, empty_docs))
            elif not docs:
                logger.warning(
                    'Warning: no documents created for {0}'.format(fnm))
                no_documents_created.append(fnm)
    if no_documents_created:
        logger.warning('No documents created for XML files: {0}'.format(
            no_documents_created))
    logger.info(' Total {0} XML files processed '.format(xml_count))
    logger.info(' Total {0} JSON files created '.format(json_count))
    time_diff = datetime.now() - startTime
    logger.info(' Total processing time: {}'.format(time_diff))
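
A minimal usage sketch for process; the folder names are placeholders, and the module-level logger and output_ext are assumed to be configured as in the surrounding script:

if __name__ == '__main__':
    # Convert all XML TEI files found under 'koond_xml' to JSON files
    # in 'koond_json', keeping the corpus's original tokenization.
    process('koond_xml', 'koond_json',
            add_tokenization=True,
            preserve_tokenization=True)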