Example #1
import logging

# sent_tokenize comes from NLTK; LabeledSentence is the pre-1.0 gensim class
# (later renamed TaggedDocument). FZArticleLibrary, CaseReportLibrary and
# RawTokenizer are project-specific classes assumed to be importable here.
from nltk.tokenize import sent_tokenize
from gensim.models.doc2vec import LabeledSentence


class RawSentenceStream(object):
    """Re-iterable stream of tokenized sentences drawn from a document library."""

    def __init__(self, extract_func=None, fz_docs=False, reshuffles=0):
        # Pick the document source: FZ articles or case reports.
        self.docs = FZArticleLibrary(reshuffles=reshuffles) if fz_docs else CaseReportLibrary(reshuffles=reshuffles)
        self.tokenizer = RawTokenizer()
        self.extract_func = extract_func

    def __iter__(self):
        doc_count = len(self.docs)
        count = 0
        for doc in self.docs:
            # Split each lower-cased document into sentences, then into tokens.
            for sentence in sent_tokenize(doc.get_text().lower()):
                tokens = self.tokenizer.tokenize(sentence)
                if self.extract_func is not None:
                    # Wrap the tokens with labels derived from the document,
                    # e.g. for Doc2Vec-style training.
                    yield LabeledSentence(words=tokens, labels=self.extract_func(doc))
                else:
                    yield tokens
            count += 1
            logging.info("%s/%s documents streamed", count, doc_count)
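Because the stream is re-iterable (the corpus is re-read on every pass over the object), it can be handed directly to a gensim model. A minimal usage sketch, assuming the pre-1.0 gensim API that matches LabeledSentence above; the doc.get_id() accessor is hypothetical and stands in for whatever label the document objects actually expose:

from gensim.models import Word2Vec, Doc2Vec

# Without extract_func the stream yields plain token lists, suitable for Word2Vec.
w2v = Word2Vec(RawSentenceStream(), size=100, min_count=5, workers=4)

# With an extract_func, each sentence is wrapped in a LabeledSentence for Doc2Vec.
# doc.get_id() is a hypothetical accessor, not part of the code above.
d2v = Doc2Vec(RawSentenceStream(extract_func=lambda doc: [doc.get_id()]),
              size=100, min_count=5, workers=4)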