Example #1
from blingfire import text_to_sentences


def to_sentence_lines(lines):
    paragraph = []
    document = []
    num_sents = 0

    for line in lines:
        line = line.strip()
        if not line:
            # A blank line closes the current paragraph: join its lines and
            # let BlingFire segment them (it returns one sentence per line),
            # dropping empty strings before counting.
            sents = text_to_sentences(
                " ".join(paragraph).strip().replace("\n", '')).split('\n')
            sents = [s for s in sents if s]
            document.extend(sents)
            num_sents += len(sents)
            paragraph = []
            continue

        paragraph.append(line)

    if paragraph:
        # Flush the final paragraph when the input has no trailing blank line.
        sents = text_to_sentences(
            " ".join(paragraph).strip().replace("\n", '')).split('\n')
        sents = [s for s in sents if s]
        document.extend(sents)
        num_sents += len(sents)

    return document, num_sents
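A minimal usage sketch for this helper, assuming blingfire is installed (the exact splits depend on BlingFire's model):

lines = ["First line of a paragraph.", "Second line, same paragraph?", "",
         "A new paragraph."]
sentences, n = to_sentence_lines(lines)
print(n)          # typically 3
print(sentences)  # typically ['First line of a paragraph.',
                  #            'Second line, same paragraph?', 'A new paragraph.']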
Example #2
    def get_sentence_boundaries(self, words, spaces):
        """Map each BlingFire sentence back to (start, end) token offsets."""
        offset = 0
        reconstructed = ''
        sentence_offset_tokens = []
        # Rebuild the raw text exactly as tokenized, so sentence lengths
        # from BlingFire line up with the reconstructed token stream.
        text = ''.join(word + (' ' if space else '')
                       for word, space in zip(words, spaces))

        for sent in text_to_sentences(text).split('\n'):
            start = offset

            for idx in range(offset, len(words)):  # avoid shadowing builtin `id`
                reconstructed += words[idx]
                if spaces[idx]:
                    reconstructed += ' '
                offset += 1
                if len(reconstructed.rstrip()) == len(sent):
                    # The accumulated tokens cover this sentence exactly.
                    sentence_offset_tokens.append((start, offset))
                    reconstructed = ''
                    break

        return sentence_offset_tokens
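Since this is a method, a quick sketch of the expected inputs may help; the `words`/`spaces` pairing mirrors spaCy's `token.text`/`token.whitespace_` convention (an assumption, as the original class is not shown):

words = ['Hello', 'world.', 'Bye.']
spaces = [True, True, False]   # a space follows the first two tokens
# Reconstructed text: "Hello world. Bye."
# If BlingFire yields "Hello world." and "Bye.", the method returns
# [(0, 2), (2, 3)]: start/end token offsets for each sentence.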
Example #3
def process_text_to_sentence(text):
    # `pattern` is a module-level compiled regex (defined elsewhere in the
    # source project) that splits the text into blocks, e.g. on blank lines.
    blocks = pattern.split(text)
    results = []
    for block in blocks:
        sents = text_to_sentences(
            block.strip("\n").replace("\n", " ")).split("\n")
        sents = [sent for sent in sents if sent != ""]
        results.extend(sents)
    return results
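The snippet leaves `pattern` undefined; a plausible stand-in is a blank-line paragraph separator (an assumption, not the original project's regex):

import re
from blingfire import text_to_sentences

pattern = re.compile(r"\n\s*\n")  # hypothetical: split on blank lines

text = "First block. It has two sentences.\n\nSecond block."
print(process_text_to_sentence(text))
# e.g. ['First block.', 'It has two sentences.', 'Second block.']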
Example #4
    def convert_into_sentences(self, text_file):
        paragraphs = []
        stack = []
        for chunk in text_file:
            if not chunk.strip():
                if stack:
                    # Blank line: segment the buffered paragraph into sentences.
                    sents = text_to_sentences(
                        " ".join(stack).strip().replace('\n', ' ')).split('\n')
                    paragraphs.append(sents)
                    stack = []
                continue
            stack.append(chunk.strip())

        if stack:
            # Flush the final paragraph.
            sents = text_to_sentences(
                " ".join(stack).strip().replace('\n', ' ')).split('\n')
            paragraphs.append(sents)

        return paragraphs
Example #5
import sys
from pathlib import Path

from blingfire import text_to_sentences


def main():
    wiki_dump_file_in = Path(sys.argv[1])
    wiki_dump_file_out = wiki_dump_file_in.parent / \
        f'{wiki_dump_file_in.stem}_preprocessed{wiki_dump_file_in.suffix}'

    print(f'Pre-processing {wiki_dump_file_in} to {wiki_dump_file_out}...')
    with open(wiki_dump_file_in, 'r', encoding='utf-8') as in_f, \
            open(wiki_dump_file_out, 'w', encoding='utf-8') as out_f:
        for line in in_f:
            # BlingFire returns the sentences of `line` separated by '\n'.
            sentences = text_to_sentences(line)
            out_f.write(sentences + '\n')
    print(f'Successfully pre-processed {wiki_dump_file_in} to {wiki_dump_file_out}.')
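The output path simply mirrors the input name with a `_preprocessed` suffix; the `pathlib` construction can be checked in isolation:

from pathlib import Path

p = Path('data/wiki_dump.txt')
print(p.parent / f'{p.stem}_preprocessed{p.suffix}')
# data/wiki_dump_preprocessed.txt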
Example #6
def text2sentences(text: str) -> str:
    lines = [line.strip() for line in text.splitlines()]
    stack = []
    sentences = []

    for line in lines:
        if line:
            stack.append(line)
        elif stack:  # blank line and non-empty stack: flush the paragraph
            sentences += text_to_sentences(' '.join(stack)).splitlines()
            stack = []

    if stack:  # flush the final paragraph (no trailing blank line)
        sentences += text_to_sentences(' '.join(stack)).splitlines()

    return '\n'.join(sentences)
Example #7
def convert_into_sentences(lines):
    stack = []
    sent_L = []
    n_sent = 0
    for chunk in lines:
        if not chunk.strip():
            if stack:
                sents = text_to_sentences(
                    " ".join(stack).strip().replace('\n', ' ')).split('\n')
                sent_L.extend(sents)
                n_sent += len(sents)
                stack = []
            continue
        stack.append(chunk.strip())

    if stack:
        sents = text_to_sentences(
            " ".join(stack).strip().replace('\n', ' ')).split('\n')
        sent_L.extend(sents)
        n_sent += len(sents)
    return sent_L, n_sent
Example #8
def process_to_sentence(lines):
    temp_sentences = []
    sentences = []
    for line in lines:
        if not line.strip():
            if temp_sentences:
                sents = text_to_sentences(
                    " ".join(temp_sentences).strip().replace("\n",
                                                             " ")).split("\n")
                sentences.extend(sents)
                temp_sentences = []
        else:
            temp_sentences.append(line.strip())

    if temp_sentences:
        # Flush the final paragraph when the input has no trailing blank line.
        sents = text_to_sentences(
            " ".join(temp_sentences).strip().replace("\n", " ")).split("\n")
        sentences.extend(sents)

    return sentences
Example #9
from typing import List

import blingfire


def chunk_to_sentences(chunk: str) -> List[str]:
    """
    Take a chunk of text from the file object, tokenize it via BlingFire, and
    return the resulting sentences.

    TODO: why is the Bookcorpus equivalent function so convoluted? See:
    https://github.com/soskek/bookcorpus/blob/master/make_sentlines.py

    :param chunk: chunk of the input file, separated by the Python open iterator
    :return: list of sentences from the chunk
    """

    sentences = blingfire.text_to_sentences(chunk.strip().replace(
        "\n", " ")).split("\n")

    return sentences
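The call above is the core of every example on this page: `blingfire.text_to_sentences` returns its input with one sentence per line, hence the ubiquitous split on "\n". A quick illustration (exact segmentation depends on BlingFire's model):

import blingfire

chunk = "BlingFire is fast. It was open-sourced\nby Microsoft."
print(chunk_to_sentences(chunk))
# typically: ['BlingFire is fast.', 'It was open-sourced by Microsoft.']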
Example #10
import os

import blingfire
from tqdm import tqdm


def main(args):
    assert not os.path.isfile(args.output_file), (
        f"Cannot overwrite {args.output_file}"
    )
    for filename in args.input_files:
        assert os.path.isfile(filename), f"Input file {filename} does not exist"

    with open(args.output_file, "w") as out_file:
        for filename in args.input_files:
            with open(filename) as in_file:
                for line in tqdm(in_file, desc="Transforming file"):
                    line = line.strip()
                    if line == "":
                        # Preserve blank lines (paragraph boundaries) as-is.
                        out_file.write(line + "\n")
                    else:
                        for sentence in blingfire.text_to_sentences(line).split("\n"):
                            sentence = sentence.strip()
                            if len(sentence) > 0:
                                out_file.write(sentence + "\n")
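`main` expects `args` with `input_files` and `output_file` attributes; a plausible argument parser, with names inferred from the attributes used above (not taken from the original project):

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Split each line of the input files into sentences.")
    parser.add_argument("input_files", nargs="+",
                        help="text files to transform")
    parser.add_argument("--output-file", dest="output_file", required=True,
                        help="destination file, one sentence per line")
    main(parser.parse_args())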
Example #11
    def split_text_into_sentences(text_str: str) -> List[str]:
        """Given a text string, split each sentence into its own string.

        Args:
            text_str (str): The initial text string.

        Returns:
            List[str]: A list with the sentences.
        """

        # Strip newlines, tabs and extra whitespace.
        text_str = " ".join(text_str.split())

        # If there is no content, return an empty list.
        if not text_str:
            return []

        # Add a final dot if necessary.
        if text_str[-1] not in ["?", "!", "."]:
            text_str += "."

        # Use blingfire to split the text into sentences (one per line).
        sentences: List[str] = text_to_sentences(text_str).splitlines()

        # If the previous sentence doesn't end with a closing exclamation
        # mark, a question mark or a dot, join the current sentence onto it.
        fixed_sentences: List[str] = [sentences[0]]

        for sentence in sentences[1:]:
            if fixed_sentences[-1][-1] not in ["?", "!", "."]:
                fixed_sentences[-1] += " " + sentence
            else:
                fixed_sentences.append(sentence)

        return fixed_sentences
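A short usage sketch, treating the method as a standalone function; the final dot is appended deterministically, while the merge step only fires when BlingFire emits a fragment without closing punctuation:

print(split_text_into_sentences("One sentence here. Another without an ending"))
# typically: ['One sentence here.', 'Another without an ending.']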
Example #12
def blingfire_tokenize(text):
    return blingfire.text_to_sentences(text).split('\n')
Example #13
from typing import List, Optional

from blingfire import text_to_sentences
from nltk.tokenize import RegexpTokenizer


def sentence_tokenize(text: str,
                      tokenizer: Optional[RegexpTokenizer] = None
                      ) -> List[str]:
    r"""Divide the text into sentences.

    The steps followed are:

        * Remove characters such as '\n', '\t', etc.
        * Split the text into sentences, taking into account Named Entities and
          special cases such as:

            - "I was born in 02.26.1980 in New York", "As we can see in Figure 1.1.
              the model will not fail.": despite the periods in the date and the
              Figure number, these texts will not be split into different sentences.
            - "Mr. Elster looked worried.", "London, capital of U.K., is famous
              for its red telephone boxes": the pre-processor applies Named Entity
              Recognition and does not split the previous sentences.
            - "Hello.Goodbye.", "Seriously??!That can't be true.": these sentences
              are split into: ['Hello.', 'Goodbye.'] and ['Seriously??!', "That can't
              be true."], respectively.

    Args:
        text (:obj:`str`):
            Text to be split into sentences.
        tokenizer (:obj:`nltk.tokenize.RegexpTokenizer`, `optional`, defaults to :obj:`None`):
            Regular expression to carry out a preliminary split (the text will
            afterwards be split once again by the :mod:`blingfire`
            :func:`text_to_sentences` function).
    """

    # punctuation that shouldn't be preceded by a whitespace
    PUNCT_NO_PREV_WHITESPACE = ".,;:!?"

    # Check if the text is empty or contains only non-printable
    # characters, e.g., whitespaces.
    if len(text.strip()) == 0:
        return []

    if tokenizer is None:
        # If the next letter after a period is lowercase, consider it part of
        # the same sentence, e.g., "As we can see in Figure 1.1. the sentence
        # will not be split." Also, take acronyms as groups, e.g., U.K., U.S.,
        # B.C., D.C., etc.
        tokenizer = RegexpTokenizer(r'[^.!?]+(?:(?:[A-Z][.])+|[.!?]+)+[^A-Z]*')

    text = ' '.join(text.split())  # remove '\n', '\t', etc.

    # If there's no final period, add it (this assumes the last sentence is
    # not interrogative or exclamative, i.e., does not end with '?' or '!').
    if text[-1] not in ('.', '?', '!'):
        text += '.'

    # Split sentences with the regexp and ensure there's at most one space.
    sentences = ' '.join(tokenizer.tokenize(text)).replace('  ', ' ')

    # Remove whitespaces before PUNCT_NO_PREV_WHITESPACE.
    for punct in PUNCT_NO_PREV_WHITESPACE:
        sentences = sentences.replace(' ' + punct, punct)

    sentences = text_to_sentences(sentences).split('\n')

    final_sentences = [sentences[0]]

    for sent in sentences[1:]:
        # If the previous sentence doesn't end with '.', '!' or '?',
        # concatenate the current sentence to it.
        if final_sentences[-1][-1] not in ('.', '!', '?'):
            final_sentences[-1] += (' ' + sent)
        # If the next sentence doesn't start with a letter or a digit,
        # concatenate it to the previous one.
        elif not sent[0].isalpha() and not sent[0].isdigit():
            final_sentences[-1] += sent
        else:
            final_sentences.append(sent)

    return final_sentences
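A usage sketch exercising the special cases described in the docstring (results come from the regexp pre-split combined with BlingFire):

print(sentence_tokenize("Hello.Goodbye."))
# expected, per the docstring: ['Hello.', 'Goodbye.']
print(sentence_tokenize("As we can see in Figure 1.1. the model will not fail"))
# expected: a single sentence, with a final '.' appended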
Example #14
def split_text_to_sentences(text):
    # PARAGRAPH_SEPARATOR_RE is a module-level compiled regex (defined
    # elsewhere in the source project) that splits the text into paragraphs.
    paragraphs_result = []
    for paragraph in PARAGRAPH_SEPARATOR_RE.split(text):
        sentences = text_to_sentences(paragraph.strip()).split('\n')
        paragraphs_result.append([sentence.strip() for sentence in sentences])
    return paragraphs_result
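As with `pattern` in Example #3, the separator regex is not shown; a plausible stand-in (an assumption, not the original definition):

import re

PARAGRAPH_SEPARATOR_RE = re.compile(r"\n\s*\n")  # hypothetical: blank-line separator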
Example #15
def ssplit(text):
    sentences = text_to_sentences(text.strip()).split('\n')
    return sentences