def build_corpus(self, size=-1):
    """Build ``self.corpus`` from this object's Wikipedia records.

    Parameters
    ----------
    size : int
        Maximum number of records to pull; -1 presumably means "no limit"
        (TODO confirm against ``self.wp.records``).
    """
    records = self.wp.records(limit=size)
    doc_stream, meta_stream = split_records(records, 'text')
    logging.info('building corpus...')
    self.corpus = Corpus(
        self.lang, texts=doc_stream, metadatas=meta_stream)
def test_corpus_init_texts_and_metadatas():
    """A Corpus built from parallel text/metadata streams keeps them aligned."""
    n = 3
    text_stream, meta_stream = io.split_records(DATASET.records(limit=n), 'text')
    all_texts = list(text_stream)
    all_metas = list(meta_stream)
    corpus = Corpus('en', texts=all_texts, metadatas=all_metas)
    assert len(corpus.docs) == n
    # every doc must share the corpus-level spacy vocab
    assert all(doc.spacy_vocab is corpus.spacy_vocab for doc in corpus)
    for i, (text, meta) in enumerate(zip(all_texts, all_metas)):
        assert corpus[i].text == text
        assert corpus[i].metadata == meta
def test_corpus_init_docs():
    """A Corpus built from Doc objects preserves each doc's metadata;
    an explicit ``metadatas`` stream overrides it."""
    n = 3
    text_stream, meta_stream = io.split_records(DATASET.records(limit=n), 'text')
    docs = [Doc(text, lang='en', metadata=meta)
            for text, meta in zip(text_stream, meta_stream)]
    corpus = Corpus('en', docs=docs)
    assert len(corpus.docs) == n
    # every doc must share the corpus-level spacy vocab
    assert all(doc.spacy_vocab is corpus.spacy_vocab for doc in corpus)
    for i in range(n):
        assert corpus[i].metadata == docs[i].metadata
    # when metadatas accompany pre-built docs, they replace the docs' own
    override_stream = ({'foo': 'bar'} for _ in range(n))
    corpus = Corpus('en', docs=docs, metadatas=override_stream)
    for i in range(n):
        assert corpus[i].metadata == {'foo': 'bar'}
def test_corpus_init_docs():
    """A Corpus built from Doc objects preserves each doc's metadata;
    an explicit ``metadatas`` stream overrides it."""
    n = 3
    text_stream, meta_stream = io.split_records(DATASET.records(limit=n), "text")
    docs = [Doc(text, lang="en", metadata=meta)
            for text, meta in zip(text_stream, meta_stream)]
    corpus = Corpus("en", docs=docs)
    assert len(corpus.docs) == n
    # every doc must share the corpus-level spacy vocab
    assert all(doc.spacy_vocab is corpus.spacy_vocab for doc in corpus)
    for i in range(n):
        assert corpus[i].metadata == docs[i].metadata
    # when metadatas accompany pre-built docs, they replace the docs' own
    override_stream = ({"foo": "bar"} for _ in range(n))
    corpus = Corpus("en", docs=docs, metadatas=override_stream)
    for i in range(n):
        assert corpus[i].metadata == {"foo": "bar"}
def corpus():
    """Return a small English Corpus of Bernie Sanders records for tests."""
    spacy_lang = cache.load_spacy('en')
    records = DATASET.records(speaker_name={'Bernie Sanders'}, limit=10)
    texts, metadatas = io.split_records(records, 'text')
    return Corpus(spacy_lang, texts=texts, metadatas=metadatas)
def corpus(request):
    """Fixture: a three-record English Corpus built from DATASET."""
    records = DATASET.records(limit=3)
    text_stream, meta_stream = io.split_records(records, 'text')
    return Corpus('en', texts=text_stream, metadatas=meta_stream)
def corpus(request):
    """Fixture: a three-record English Corpus built from DATASET."""
    records = DATASET.records(limit=3)
    text_stream, meta_stream = io.split_records(records, "text")
    return Corpus("en", texts=text_stream, metadatas=meta_stream)