from blingfire import text_to_sentences


def to_sentence_lines(lines):
    paragraph = []
    document = []
    num_sents = 0
    for line in lines:
        line = line.strip()
        if not line:
            if paragraph:  # flush the accumulated paragraph at a blank line
                sents = text_to_sentences(
                    " ".join(paragraph).strip().replace("\n", '')).split('\n')
                document.extend(sents)
                num_sents += len(sents)
                paragraph = []
            continue
        paragraph.append(line)
    if paragraph:  # flush the final paragraph
        sents = text_to_sentences(
            " ".join(paragraph).strip().replace("\n", '')).split('\n')
        document.extend(sents)
        num_sents += len(sents)
    return [sen for sen in document if sen != ''], num_sents

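# Minimal usage sketch for to_sentence_lines; the sample lines are
# illustrative. Paragraphs are separated by blank lines, as in a plain
# text dump.
lines = [
    "The first paragraph. It has",
    "two sentences split across lines.",
    "",
    "The second paragraph.",
]
sentences, count = to_sentence_lines(lines)
print(count)      # 3
print(sentences)  # one sentence per list entry
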
def get_sentence_boundaries(self, words, spaces):
    offset = 0
    reconstructed = ''
    sentence_offsetTokens = []
    # Rebuild the raw text from the tokens, inserting a space after token i
    # when spaces[i] is True.
    text = ''.join([
        words[i] + (' ' if spaces[i] else '') for i in range(len(words))
    ])
    for sent in text_to_sentences(text).split('\n'):
        start = offset
        for idx in range(offset, len(words)):
            reconstructed += words[idx]
            if spaces[idx]:
                reconstructed += ' '
            # Once the rebuilt prefix is as long as the sentence, we have
            # found the token span (start, end) of this sentence.
            if len(reconstructed.rstrip()) == len(sent):
                offset += 1
                end = offset
                sentence_offsetTokens.append((start, end))
                reconstructed = ''
                break
            offset += 1
    return sentence_offsetTokens

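# Minimal usage sketch for get_sentence_boundaries; words/spaces mimic a
# spaCy-style tokenization (token text plus a trailing-space flag). self is
# unused, so None is passed purely as scaffolding for the example.
words = ['Hello', 'world', '.', 'Bye', '.']
spaces = [True, False, True, False, False]
print(get_sentence_boundaries(None, words, spaces))
# [(0, 3), (3, 5)] -- token spans of the two sentences
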
def process_text_to_sentence(text):
    # `pattern` is assumed to be a module-level compiled regex that splits
    # the text into blocks (e.g., on paragraph boundaries).
    blocks = pattern.split(text)
    results = []
    for block in blocks:
        sents = text_to_sentences(
            block.strip("\n").replace("\n", " ")).split("\n")
        sents = [sent for sent in sents if sent != ""]
        results.extend(sents)
    return results

def convert_into_sentences(self, text_file):
    paragraphs = []
    stack = []
    for chunk in text_file:
        if not chunk.strip():
            if stack:
                sents = text_to_sentences(
                    " ".join(stack).strip().replace('\n', ' ')).split('\n')
                paragraphs.append(sents)
                stack = []
            continue
        stack.append(chunk.strip())
    if stack:
        sents = text_to_sentences(
            " ".join(stack).strip().replace('\n', ' ')).split('\n')
        paragraphs.append(sents)
    return paragraphs

import sys
from pathlib import Path


def main():
    wiki_dump_file_in = Path(sys.argv[1])
    wiki_dump_file_out = wiki_dump_file_in.parent / \
        f'{wiki_dump_file_in.stem}_preprocessed{wiki_dump_file_in.suffix}'
    print(f'Pre-processing {wiki_dump_file_in} to {wiki_dump_file_out}...')
    with open(wiki_dump_file_out, 'w', encoding='utf-8') as out_f:
        with open(wiki_dump_file_in, 'r', encoding='utf-8') as in_f:
            for line in in_f:
                sentences = text_to_sentences(line)
                out_f.write(sentences + '\n')
    print(f'Successfully pre-processed {wiki_dump_file_in} to {wiki_dump_file_out}...')

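# Invocation sketch; the script filename below is hypothetical:
#     python preprocess_wiki.py enwiki.txt
# writes enwiki_preprocessed.txt next to the input file.
if __name__ == '__main__':
    main()
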
def text2sentences(text: str) -> str:
    lines = [line.strip() for line in text.splitlines()]
    stack = []
    sentences = []
    for line in lines:
        if line:
            stack.append(line)
        elif stack:  # empty line and non-empty stack
            sentences += text_to_sentences(' '.join(stack).strip()).splitlines()
            stack = []
    if stack:  # flush the final paragraph (no trailing blank line)
        sentences += text_to_sentences(' '.join(stack).strip()).splitlines()
    return '\n'.join(sentences)

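# Minimal usage sketch for text2sentences; the sample text is illustrative.
doc = "A first line. A second\nsentence wrapped across lines.\n\nA new paragraph."
print(text2sentences(doc))
# A first line.
# A second sentence wrapped across lines.
# A new paragraph.
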
def convert_into_sentences(lines):
    stack = []
    sent_L = []
    n_sent = 0
    for chunk in lines:
        if not chunk.strip():
            if stack:
                sents = text_to_sentences(
                    " ".join(stack).strip().replace('\n', ' ')).split('\n')
                sent_L.extend(sents)
                n_sent += len(sents)
                # sent_L.append('\n')
                stack = []
            continue
        stack.append(chunk.strip())
    if stack:
        sents = text_to_sentences(
            " ".join(stack).strip().replace('\n', ' ')).split('\n')
        sent_L.extend(sents)
        n_sent += len(sents)
    return sent_L, n_sent

def process_to_sentence(lines):
    temp_sentences = []
    sentences = []
    for line in lines:
        if not line.strip():
            if temp_sentences:
                sents = text_to_sentences(
                    " ".join(temp_sentences).strip().replace("\n", " ")).split("\n")
                sentences.extend(sents)
                temp_sentences = []
        else:
            temp_sentences.append(line.strip())
    if temp_sentences:  # flush the final paragraph (no trailing blank line)
        sents = text_to_sentences(
            " ".join(temp_sentences).strip().replace("\n", " ")).split("\n")
        sentences.extend(sents)
    return sentences

from typing import List

import blingfire


def chunk_to_sentences(chunk: str) -> List[str]:
    """
    Takes a chunk of text from the file object, tokenizes it via BlingFire
    and returns the resulting sentences.

    TODO: why is the Bookcorpus equivalent function so convoluted? See:
    https://github.com/soskek/bookcorpus/blob/master/make_sentlines.py

    :param chunk: chunk of the input file, separated by the Python open iterator
    :return: list of sentences from the chunk
    """
    sentences = blingfire.text_to_sentences(
        chunk.strip().replace("\n", " ")).split("\n")
    return sentences

import os

from tqdm import tqdm


def main(args):
    assert not os.path.isfile(args.output_file), (
        f"Cannot overwrite {args.output_file}"
    )
    for filename in args.input_files:
        assert os.path.isfile(filename), f"Input file {filename} does not exist"
    with open(args.output_file, "w") as out_file:
        for filename in args.input_files:
            with open(filename) as in_file:
                for line in tqdm(in_file, desc="Transforming file"):
                    line = line.strip()
                    if line == "":
                        out_file.write(line + "\n")
                    else:
                        for sentence in blingfire.text_to_sentences(line).split("\n"):
                            sentence = sentence.strip()
                            if len(sentence) > 0:
                                out_file.write(sentence + "\n")

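# Hypothetical invocation sketch for main(); SimpleNamespace stands in for
# the argparse namespace with input_files / output_file attributes, and the
# filenames are illustrative.
from types import SimpleNamespace

main(SimpleNamespace(input_files=["corpus_a.txt", "corpus_b.txt"],
                     output_file="sentences.txt"))
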
def split_text_into_sentences(text_str: str) -> List[str]:
    """
    Given a text string, splits it into sentences, one per string.

    Args:
        text_str (str): The initial text string.

    Returns:
        List[str]: A list with the sentences.
    """
    # Strip newlines, tabs and trailing whitespace.
    text_str = " ".join(text_str.split())

    # If there is no content, return an empty list.
    if len(text_str) < 1:
        return []

    # Add a dot at the end if necessary.
    if text_str[-1] not in ["?", "!", "."]:
        text_str += "."

    # Use the blingfire module to split the text into sentences.
    sentences: List[str] = text_to_sentences(text_str).splitlines()

    # If the previous sentence doesn't end with a closing exclamation mark,
    # a question mark or a dot, join the current sentence to it.
    fixed_sentences: List[str] = [sentences[0]]
    for sentence in sentences[1:]:
        if fixed_sentences[-1][-1] not in ["?", "!", "."]:
            fixed_sentences[-1] += " " + sentence
        else:
            fixed_sentences.append(sentence)
    return fixed_sentences

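# Minimal usage sketch for split_text_into_sentences; the sample text is
# illustrative and exercises the dot appended to the final sentence.
print(split_text_into_sentences("One sentence here. And a second without a final dot"))
# ['One sentence here.', 'And a second without a final dot.']
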
def blingfire_tokenize(text):
    return blingfire.text_to_sentences(text).split('\n')

from typing import List, Optional

from nltk.tokenize import RegexpTokenizer


def sentence_tokenize(text: str,
                      tokenizer: Optional[RegexpTokenizer] = None) -> List[str]:
    r"""Divide the text into sentences.

    The steps followed are:

    * Remove characters such as '\n', '\t', etc.
    * Split the text into sentences, taking into account Named Entities and
      special cases such as:

      - "I was born in 02.26.1980 in New York", "As we can see in Figure
        1.1. the model will not fail.": despite the periods in the date and
        the Figure number, these texts will not be split into different
        sentences.
      - "Mr. Elster looked worried.", "London, capital of U.K., is famous
        for its red telephone boxes": the pre-processor applies Named
        Entity Recognition and does not split the previous sentences.
      - "Hello.Goodbye.", "Seriously??!That can't be true.": these
        sentences are split into: ['Hello.', 'Goodbye.'] and
        ['Seriously??!', 'That can't be true.'], respectively.

    Args:
        text (:obj:`str`):
            Text to be split in sentences.
        tokenizer (:obj:`nltk.tokenize.RegexpTokenizer`, `optional`, defaults to :obj:`None`):
            Regular expression to carry out a preliminary split (the text
            will be afterwards split once again by the :mod:`blingfire`
            :func:`text_to_sentences` function).
    """
    # punctuation that shouldn't be preceded by a whitespace
    PUNCT_NO_PREV_WHITESPACE = ".,;:!?"

    # Check if the text is empty or contains only non-printable
    # characters, e.g., whitespaces.
    if len(text.strip()) == 0:
        return []

    if tokenizer is None:
        # If the next letter after a period is lowercase, consider it part
        # of the same sentence, e.g., "As we can see in Figure 1.1. the
        # sentence will not be split." Also, take acronyms as groups,
        # e.g., U.K., U.S., B.C., D.C., etc.
        tokenizer = RegexpTokenizer(r'[^.!?]+(?:(?:[A-Z][.])+|[.!?]+)+[^A-Z]*')

    # If there's no final period, add it (this makes the assumption that
    # the last sentence is not interrogative or exclamative, i.e., does
    # not end with '?' or '!').
    if text[-1] not in ('.', '?', '!'):
        text += '.'

    text = ' '.join(text.split())  # remove '\n', '\t', etc.

    # Split sentences with the regexp and ensure there's 1 whitespace at most.
    sentences = ' '.join(tokenizer.tokenize(text)).replace('  ', ' ')

    # Remove whitespaces before PUNCT_NO_PREV_WHITESPACE.
    for punct in PUNCT_NO_PREV_WHITESPACE:
        sentences = sentences.replace(' ' + punct, punct)

    sentences = text_to_sentences(sentences).split('\n')

    final_sentences = [sentences[0]]
    for sent in sentences[1:]:
        if final_sentences[-1][-1] not in ('.', '!', '?'):
            # The previous sentence doesn't end with '.', '!' or '?', so
            # we concatenate the current sentence to it.
            final_sentences[-1] += (' ' + sent)
        elif not sent[0].isalpha() and not sent[0].isdigit():
            # The sentence doesn't start with a letter or a digit, so we
            # concatenate it to the previous one.
            final_sentences[-1] += sent
        else:
            final_sentences.append(sent)
    return final_sentences

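# Minimal usage sketch for sentence_tokenize with the default regexp
# tokenizer; the expected output follows the docstring's own examples.
print(sentence_tokenize("Hello.Goodbye. Mr. Elster looked worried."))
# ['Hello.', 'Goodbye.', 'Mr. Elster looked worried.']
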
def split_text_to_sentences(text):
    # PARAGRAPH_SEPARATOR_RE is assumed to be a module-level compiled regex
    # matching paragraph boundaries.
    paragraphs_result = []
    for paragraph in PARAGRAPH_SEPARATOR_RE.split(text):
        sentences = text_to_sentences(paragraph.strip()).split('\n')
        paragraphs_result.append([sentence.strip() for sentence in sentences])
    return paragraphs_result

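# Minimal usage sketch; the separator regex below is an assumption, since
# the original defines PARAGRAPH_SEPARATOR_RE elsewhere.
import re

PARAGRAPH_SEPARATOR_RE = re.compile(r'\n\s*\n')
print(split_text_to_sentences("Paragraph one. Two sentences.\n\nParagraph two."))
# [['Paragraph one.', 'Two sentences.'], ['Paragraph two.']]
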
def ssplit(text):
    sentences = text_to_sentences(text.strip()).split('\n')
    return sentences