from argparse import FileType
from pathlib import Path

import spacy
from tqdm import tqdm


def main(input_folder: Path, output_file: FileType, trim: bool):
    # Keep the parser so doc.sents can yield sentence boundaries;
    # the tokenizer is not a pipeline component, so it is not listed here.
    nlp = spacy.load('en_core_web_lg',
                     disable=['tagger', 'ner', 'textcat'])
    nlp.max_length = 2000000
    text_to_write = []
    for file in tqdm(input_folder.glob("*.txt")):
        with open(file, 'r') as f:
            raw_text = f.read()
        # Join paragraphs so blank lines do not break sentence segmentation.
        text = raw_text.replace('\n\n', ' ')
        doc = nlp(text)
        if trim:
            # Drop very short fragments (15 characters or fewer).
            sentences = [
                sent.text.strip() for sent in doc.sents
                if len(sent.text.strip()) > 15
            ]
        else:
            sentences = [sent.text.strip() for sent in doc.sents]
        text_to_write.append('\n'.join(sentences))
    output_file.write('\n\n'.join(text_to_write))
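A minimal sketch of how main() might be wired up as a command-line script. The argument names and the --trim flag below are assumptions for illustration, not taken from the original example:

# Illustrative CLI wiring for main(); argument names are assumptions.
import argparse
from pathlib import Path

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Split plain-text files into one sentence per line.')
    parser.add_argument('input_folder', type=Path,
                        help='Folder containing .txt files to process.')
    parser.add_argument('output_file', type=argparse.FileType('w'),
                        help='File that receives the sentence-split text.')
    parser.add_argument('--trim', action='store_true',
                        help='Drop sentences of 15 characters or fewer.')
    args = parser.parse_args()
    main(args.input_folder, args.output_file, args.trim)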
Example #2
import argparse
from typing import List


def output_file(data_list: List, output_file: argparse.FileType,
                is_json: bool):
    if is_json:
        # to_json is expected to serialize data_list to a JSON string
        # (defined elsewhere in the original project).
        json_list = to_json(data_list)
        output_file.write(json_list)
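The to_json helper is not shown in this example. A minimal sketch of what it might look like, assuming it simply dumps the list to a JSON string; the implementation and the sample call below are illustrative, not the project's actual code:

import json
from typing import List

# Hypothetical sketch of the to_json helper referenced above;
# the real implementation is not shown in this example.
def to_json(data_list: List) -> str:
    return json.dumps(data_list, indent=2)

# Illustrative usage: write a small list of records as JSON.
with open('results.json', 'w') as fh:
    output_file([{'id': 1, 'text': 'hello'}], fh, is_json=True)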