Beispiel #1
0
def parse_txt(
    parser: SpacyBISTParser,
    txt_path: Union[str, PathLike],
    out_dir: Union[str, PathLike, None] = None,
    show_tok=True,
    show_doc=True,
):
    """Parse raw documents given as the lines of a text file.

    Every line of ``txt_path`` is treated as one document. This is a
    generator: nothing is read, parsed or written until it is iterated.

    Args:
        parser (SpacyBISTParser): Parser whose ``parse`` method is called once
            per line.
        txt_path (str or PathLike): UTF-8 text file with one document per line.
        out_dir (str or PathLike, optional): If specified, each parsed document
            is also written to ``<out_dir>/<line number>.json`` (numbering
            starts at 1). The directory is created if it does not exist.
        show_tok (bool, optional): Specifies whether to include token text in
            output.
        show_doc (bool, optional): Specifies whether to include document text
            in output.

    Yields:
        CoreNLPDoc: the annotated document.
    """
    with open(txt_path, encoding="utf-8") as f:
        out_root = None
        if out_dir:
            # Resolve and create the output directory once, before any
            # parsing happens, so a missing directory cannot abort the run
            # on the first write.
            out_root = Path(out_dir)
            out_root.mkdir(parents=True, exist_ok=True)
            print("Writing parsed documents to {}".format(out_dir))

        progress = tqdm(f, total=line_count(txt_path), file=sys.stdout)
        for doc_num, doc_text in enumerate(progress, start=1):
            # Only the trailing newline is removed; other whitespace is kept.
            parsed_doc = parser.parse(doc_text.rstrip("\n"), show_tok,
                                      show_doc)

            if out_root is not None:
                out_path = out_root / (str(doc_num) + ".json")
                with open(out_path, "w", encoding="utf-8") as doc_file:
                    doc_file.write(parsed_doc.pretty_json())
            yield parsed_doc
 def _iterate_docs(data: PathLike) -> tuple:
     """Yield ``(identifier, document text)`` pairs from a directory or a text file.

     If ``data`` is a directory, the pairs produced by ``walk_directory(data)``
     are yielded unchanged. Otherwise ``data`` is opened as a UTF-8 text file
     and each line is yielded together with its 1-based line number as a
     string. Lines are yielded as read, including any trailing newline.
     """
     if not isdir(data):
         with open(data, encoding='utf-8') as lines:
             numbered = enumerate(lines, start=1)
             for line_no, text in tqdm(numbered, total=line_count(data)):
                 yield str(line_no), text
         return

     # Materialised up front so the progress bar knows the total.
     entries = list(walk_directory(data))
     for name, text in tqdm(entries):
         yield name, text