def index_text(text, correct_spaces: bool = False):
    """Segments contiguous (Icelandic) text into paragraphs and sentences
    and returns:
        a dictionary of paragraph indices to their constituent sentence indices
        a dictionary of sentence indices to sentence text"""
    text = prep_text_for_tokenizer(text)
    tok_stream = bintokenizer.tokenize(text)

    pgs = tokenizer.paragraphs(tok_stream)
    pg_idx_to_sent_idx = dict()
    sent_idx_to_sent = dict()
    curr_sent_idx = 0
    curr_pg_idx = 0

    for pg in pgs:
        sent_idxs = []
        for _, sent in pg:
            curr_sent = list(filter(BIN_Token.is_understood, sent))
            curr_sent = tokenizer.normalized_text_from_tokens(curr_sent)
            if correct_spaces:
                curr_sent = tokenizer.correct_spaces(curr_sent)
            sent_idxs.append(curr_sent_idx)
            sent_idx_to_sent[curr_sent_idx] = curr_sent
            curr_sent_idx += 1
        pg_idx_to_sent_idx[curr_pg_idx] = sent_idxs
        curr_pg_idx += 1
    return pg_idx_to_sent_idx, sent_idx_to_sent
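
A minimal usage sketch for the function above. It assumes the snippet's original context: prep_text_for_tokenizer is a project-local helper, tokenizer is the Icelandic Tokenizer package, and bintokenizer / BIN_Token come from the Greynir package. The sample text and the expectation that prep_text_for_tokenizer turns blank lines into paragraph breaks are assumptions for illustration only.

# Hypothetical usage; the sample text and paragraph handling are assumptions.
sample = "Hér er fyrsta setningin. Hér er önnur setningin.\n\nNý málsgrein."
pg_idx_to_sent_idx, sent_idx_to_sent = index_text(sample, correct_spaces=True)
for pg_idx, sent_idxs in pg_idx_to_sent_idx.items():
    for sent_idx in sent_idxs:
        print(pg_idx, sent_idx, sent_idx_to_sent[sent_idx])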
Example #2
from typing import Dict, List, Tuple

def index_text(text: str) -> Tuple[Dict[int, List[int]], Dict[int, str]]:
    """ Segments contiguous (Icelandic) text into paragraphs and sentences
        and returns:
            a dictionary of paragraph indices to their constituent sentence indices
            a dictionary of sentence indices to sentence text"""
    text = prep_text_for_tokenizer(text)
    tok_stream = bintokenizer.tokenize(text)

    pgs = tokenizer.paragraphs(tok_stream)
    pg_idx_to_sent_idx = dict()  # type: Dict[int, List[int]]
    sent_idx_to_sent = dict()  # type: Dict[int, str]
    curr_sent_idx = 0
    curr_pg_idx = 0

    for pg in pgs:
        sent_idxs = []
        for _, sent in pg:
            curr_sent = list(filter(BIN_Token.is_understood, sent))  # type: List[Tok]
            curr_sent_text = tokenizer.normalized_text_from_tokens(curr_sent)
            sent_idxs.append(curr_sent_idx)
            sent_idx_to_sent[curr_sent_idx] = curr_sent_text
            curr_sent_idx += 1
        pg_idx_to_sent_idx[curr_pg_idx] = sent_idxs
        curr_pg_idx += 1
    return pg_idx_to_sent_idx, sent_idx_to_sent
Example #3
def split_text(text):
    """Segments contiguous (Icelandic) text into paragraphs and sentences
    and returns a list of paragraphs, each a list of normalized sentence strings
    """
    text = prep_text_for_tokenizer(text)
    tok_stream = bintokenizer.tokenize(text)
    pgs = tokenizer.paragraphs(tok_stream)
    data = []
    for pg in pgs:
        pg_data = []
        for _, sentence in pg:
            sentence = list(filter(BIN_Token.is_understood, sentence))
            sentence = tokenizer.normalized_text_from_tokens(sentence)
            pg_data.append(sentence)
        data.append(pg_data)
    return data
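
A hypothetical usage sketch for split_text, under the same assumed context as above (project-local prep_text_for_tokenizer plus the Greynir and Tokenizer packages); the sample text is illustrative only.

# Each element of the result corresponds to one paragraph and holds
# that paragraph's sentences as normalized text strings.
for pg_sentences in split_text("Fyrsta setning. Önnur setning."):
    for sentence in pg_sentences:
        print(sentence)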
Example #4
def paragraphs(self):
    """ Yield the paragraphs from the token stream """
    for p in paragraphs(self._toklist):
        yield IncrementalParser._IncrementalParagraph(self, p)
Example #5
def paragraphs(self):
    """ Yield the paragraphs from the token stream """
    for p in paragraphs(self._tokens):
        yield _Paragraph(self, p)
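
Both methods wrap the same underlying pattern: the module-level paragraphs() function from the Tokenizer package groups a token stream into paragraphs, each yielded as a sequence of pairs whose second element is a sentence's token list (the first element is discarded in the snippets above). Below is a minimal sketch of consuming it directly, with an illustrative sample text; how paragraph boundaries are marked depends on how the input was prepared.

import tokenizer

tok_stream = tokenizer.tokenize("Fyrsta setning. Önnur setning.")
for pg in tokenizer.paragraphs(tok_stream):
    for _, sent_tokens in pg:
        # Reassemble each sentence's text from its tokens
        print(tokenizer.normalized_text_from_tokens(sent_tokens))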