class RawSentenceStream(object):
    """Iterable that streams tokenized sentences from a document library.

    Each iteration pass walks every document, sentence-splits its
    lower-cased text, and yields either a plain token list or — when
    ``extract_func`` is given — a ``LabeledSentence`` carrying labels
    derived from the source document (gensim doc2vec-style input).
    """

    def __init__(self, extract_func=None, fz_docs=False, reshuffles=0):
        """Set up the document source and tokenizer.

        :param extract_func: optional callable mapping a document to its
            labels; when set, ``__iter__`` yields ``LabeledSentence``
            objects instead of bare token lists.
        :param fz_docs: if True read from ``FZArticleLibrary``,
            otherwise from ``CaseReportLibrary``.
        :param reshuffles: passed through to the library; presumably the
            number of reshuffled passes over the corpus — TODO confirm
            against the library implementation.
        """
        if fz_docs:
            self.docs = FZArticleLibrary(reshuffles=reshuffles)
        else:
            self.docs = CaseReportLibrary(reshuffles=reshuffles)
        self.tokenizer = RawTokenizer()
        self.extract_func = extract_func

    def __iter__(self):
        """Yield one tokenized sentence at a time across all documents.

        Logs progress after each completed document.
        """
        doc_count = len(self.docs)
        count = 0
        for doc in self.docs:
            # Lower-case before sentence splitting so downstream tokens
            # are normalized consistently.
            for sentence in sent_tokenize(doc.get_text().lower()):
                tokens = self.tokenizer.tokenize(sentence)
                if self.extract_func is not None:
                    # Attach document-level labels for doc2vec-style training.
                    yield LabeledSentence(words=tokens, labels=self.extract_func(doc))
                else:
                    yield tokens
            count += 1
            # Lazy %-args: the message is only formatted if INFO is enabled.
            logging.info("%s/%s documents streamed", count, doc_count)
def __init__(self, extract_func=None, fz_docs=False, reshuffles=0):
    # NOTE(review): this repeats RawSentenceStream.__init__ above —
    # likely an extraction/duplication artifact; confirm and remove.
    library_cls = FZArticleLibrary if fz_docs else CaseReportLibrary
    self.docs = library_cls(reshuffles=reshuffles)
    self.tokenizer = RawTokenizer()
    self.extract_func = extract_func