def build_corpus(self, size=-1):
    """Build ``self.corpus`` from the records exposed by ``self.wp``.

    Args:
        size: Forwarded unchanged as ``limit`` to ``self.wp.records``. How
            the default of -1 is interpreted is decided by that method
            (presumably "no limit" -- TODO confirm against the reader).

    Returns:
        None. The resulting ``Corpus`` is stored on ``self.corpus``.
    """
    records = self.wp.records(limit=size)
    # Records are split on the 'text' key into two streams, which are handed
    # to Corpus as-is (nothing is materialized into a list here).
    streams = split_records(records, 'text')
    text_stream, metadata_stream = streams
    logging.info('building corpus...')
    corpus_kwargs = {'texts': text_stream, 'metadatas': metadata_stream}
    self.corpus = Corpus(self.lang, **corpus_kwargs)
Beispiel #2
0
def test_corpus_init_texts_and_metadatas():
    """Check a Corpus built from parallel texts and metadatas.

    Asserts that the corpus holds ``limit`` docs, that every doc shares the
    corpus' ``spacy_vocab`` object, and that the i-th doc exposes the i-th
    input text and metadata.
    """
    limit = 3
    records = DATASET.records(limit=limit)
    text_stream, metadata_stream = io.split_records(records, 'text')
    # Materialize both streams so they can be indexed in the checks below.
    texts, metadatas = list(text_stream), list(metadata_stream)
    corpus = Corpus('en', texts=texts, metadatas=metadatas)

    assert len(corpus.docs) == limit
    for doc in corpus:
        assert doc.spacy_vocab is corpus.spacy_vocab
    for idx in range(limit):
        doc = corpus[idx]
        assert texts[idx] == doc.text
        assert metadatas[idx] == doc.metadata
Beispiel #3
0
def test_corpus_init_docs():
    """Check a Corpus built from pre-made ``Doc`` objects.

    First asserts that the corpus holds ``limit`` docs sharing the corpus'
    ``spacy_vocab`` and that each corpus doc's metadata equals the metadata
    of the corresponding input doc. Then rebuilds the corpus from the same
    docs with an explicit ``metadatas`` iterable and asserts that every doc's
    metadata equals the supplied value.
    """
    limit = 3
    records = DATASET.records(limit=limit)
    texts, metadatas = io.split_records(records, 'text')
    docs = []
    for text, metadata in zip(texts, metadatas):
        docs.append(Doc(text, lang='en', metadata=metadata))

    corpus = Corpus('en', docs=docs)
    assert len(corpus.docs) == limit
    for doc in corpus:
        assert doc.spacy_vocab is corpus.spacy_vocab
    for idx in range(limit):
        assert corpus[idx].metadata == docs[idx].metadata

    # A fresh dict is produced for every doc rather than one shared object.
    replacement_metadatas = ({'foo': 'bar'} for _ in range(limit))
    corpus = Corpus('en', docs=docs, metadatas=replacement_metadatas)
    for idx in range(limit):
        assert corpus[idx].metadata == {'foo': 'bar'}
Beispiel #4
0
def test_corpus_init_docs():
    """Check ``Corpus`` construction from a list of ``Doc`` objects.

    Two scenarios are asserted:
    1. Built from docs only: the corpus has ``limit`` docs, each sharing the
       corpus' ``spacy_vocab``, and the i-th corpus doc's metadata equals the
       i-th input doc's metadata.
    2. Built from the same docs plus a ``metadatas`` iterable: every corpus
       doc's metadata equals the value yielded by that iterable.
    """
    limit = 3
    text_stream, metadata_stream = io.split_records(
        DATASET.records(limit=limit), "text"
    )
    pairs = zip(text_stream, metadata_stream)
    docs = [Doc(text, lang="en", metadata=metadata) for text, metadata in pairs]

    # Scenario 1: docs only.
    corpus = Corpus("en", docs=docs)
    assert len(corpus.docs) == limit
    assert all(doc.spacy_vocab is corpus.spacy_vocab for doc in corpus)
    for position in range(limit):
        assert corpus[position].metadata == docs[position].metadata

    # Scenario 2: docs plus explicit metadatas (a new dict per doc).
    explicit_metadatas = ({"foo": "bar"} for _ in range(limit))
    corpus = Corpus("en", docs=docs, metadatas=explicit_metadatas)
    for position in range(limit):
        assert corpus[position].metadata == {"foo": "bar"}
Beispiel #5
0
def corpus():
    """Build and return a ``Corpus`` from a filtered slice of ``DATASET``.

    Records are requested with ``speaker_name={'Bernie Sanders'}`` and
    ``limit=10``, split on the 'text' key into text and metadata streams, and
    passed to ``Corpus`` together with the object returned by
    ``cache.load_spacy('en')``.
    """
    nlp = cache.load_spacy('en')
    speaker_records = DATASET.records(
        speaker_name={'Bernie Sanders'},
        limit=10,
    )
    texts, metadatas = io.split_records(speaker_records, 'text')
    return Corpus(nlp, texts=texts, metadatas=metadatas)
Beispiel #6
0
def corpus(request):
    """Build and return an 'en' ``Corpus`` from ``DATASET.records(limit=3)``.

    ``request`` is accepted but not used in the body (presumably the pytest
    fixture request object -- the decorator is not visible here).
    """
    records = DATASET.records(limit=3)
    text_stream, metadata_stream = io.split_records(records, 'text')
    return Corpus('en', texts=text_stream, metadatas=metadata_stream)
Beispiel #7
0
def corpus(request):
    """Return an "en" ``Corpus`` built from records requested with limit=3.

    The records are split on the "text" key; the resulting text and metadata
    streams are passed straight to ``Corpus``. ``request`` is not used in the
    body (presumably the pytest fixture request object -- TODO confirm).
    """
    sample_records = DATASET.records(limit=3)
    split = io.split_records(sample_records, "text")
    text_stream, metadata_stream = split
    return Corpus("en", texts=text_stream, metadatas=metadata_stream)