Ejemplo n.º 1
0
def test_fit_transform_creates_a_bag_of_word_bag_term_matrix():
    """Check vocabulary, term frequencies and bag-term matrix of the mock corpus."""
    corpus = mock_corpus()
    vectorizer = CorpusVectorizer()
    v_corpus = vectorizer.fit_transform(corpus, already_tokenized=True)
    expected_vocab = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
    expected_dtm = [[2, 1, 4, 1], [2, 2, 3, 0], [2, 3, 2, 0], [2, 4, 1, 1],
                    [2, 0, 1, 1]]
    expected_word_counts = {'a': 10, 'b': 10, 'c': 11, 'd': 3}
    # Expected counts laid out in token-id order.
    # NOTE(review): assumes term_frequency is array-like and indexed by
    # token id -- confirm against VectorizedCorpus.
    expected_term_frequency = [
        expected_word_counts[token]
        for token in sorted(expected_vocab, key=expected_vocab.get)
    ]
    # `assert x, y` only tests the truthiness of x (y is the failure message),
    # so the values have to be compared explicitly.
    assert expected_vocab == v_corpus.token2id
    assert (expected_term_frequency == v_corpus.term_frequency).all()
    assert (expected_dtm == v_corpus.bag_term_matrix.toarray()).all()
 def skip_test_chisquare(self):
     """Run chi-square and Ward linkage on the transposed bag-term matrix.

     The results of both calls are discarded; the final assertion compares
     two ``None`` placeholders.
     """
     source = self.create_corpus()
     vectorizer = CorpusVectorizer()
     vectorized = vectorizer.fit_transform(source, already_tokenized=True)
     v_corpus = vectorized.group_by_year().slice_by_tf(0)

     dense_term_rows = v_corpus.bag_term_matrix.T.todense()
     _ = scipy.stats.chisquare(dense_term_rows, f_exp=None, ddof=0, axis=0)  # pylint: disable=unused-variable

     sparse_term_rows = v_corpus.bag_term_matrix.T
     _ = linkage(sparse_term_rows, 'ward')  # pylint: disable=unused-variable

     expected, results = None, None
     self.assertEqual(expected, results)
Ejemplo n.º 3
0
def test_co_occurrence_given_windows_and_vocabulary_succeeds():
    """Co-occurrence counts from a windows corpus match the expected pair counts."""
    # The third field of every window item is fed to the vocabulary builder.
    window_tokens = [window[2] for window in SAMPLE_WINDOW_STREAM]
    vocabulary = generate_token2id(window_tokens)

    windows_corpus = WindowsCorpus(SAMPLE_WINDOW_STREAM, vocabulary=vocabulary)

    vectorizer = CorpusVectorizer()
    v_corpus = vectorizer.fit_transform(
        windows_corpus,
        already_tokenized=True,
        vocabulary=vocabulary,
    )

    coo_matrix = v_corpus.co_occurrence_matrix()

    b_a_count = coo_matrix.todense()[vocabulary['b'], vocabulary['a']]
    assert 10 == b_a_count

    d_c_count = coo_matrix.todense()[vocabulary['d'], vocabulary['c']]
    assert 1 == d_c_count
Ejemplo n.º 4
0
def to_co_occurrence_matrix(
    corpus_or_reader: Union[ICorpusReader, TokenizedCorpus], vocabulary: Union[Mapping[str, int], None] = None
) -> scipy.sparse.spmatrix:
    """Compute a term-term co-occurrence matrix for the documents in a corpus/reader.

    Parameters
    ----------
    corpus_or_reader : Union[ICorpusReader, TokenizedCorpus]
        Sequence of tokenized documents. An argument that is not an
        ``ITokenizedCorpus`` is wrapped in a ``TokenizedCorpus`` first.
    vocabulary : Mapping[str, int], optional
        Token-to-id mapping passed to the vectorizer. When omitted (or empty),
        the ``token2id`` of the corpus is used instead.

    Returns
    -------
    scipy.sparse.spmatrix
        Upper diagonal of term-term frequency matrix (TTM). Note that diagonal (wi, wi) is not returned
    """

    if not isinstance(corpus_or_reader, ITokenizedCorpus):
        corpus_or_reader = TokenizedCorpus(reader=corpus_or_reader)

    # Falsy vocabularies (None or empty) fall back to the corpus' own mapping.
    vocabulary = vocabulary or corpus_or_reader.token2id
    dtm_corpus: VectorizedCorpus = CorpusVectorizer().fit_transform(
        corpus_or_reader, already_tokenized=True, vocabulary=vocabulary
    )
    return dtm_corpus.co_occurrence_matrix()
Ejemplo n.º 5
0
    def test_to_dataframe_of_term_matrix_gives_expected_result(self):
        """Co-occurrence rows built from the term-term matrix hold the expected pair counts."""

        # Arrange
        reader = PandasCorpusReader(self.create_test_dataframe())
        corpus = TokenizedCorpus(
            reader,
            # Pre-compute transform options:
            transform_opts=TokensTransformOpts(
                only_any_alphanumeric=False,
                to_lower=False,
                remove_accents=False,
                min_len=1,
                max_len=None,
                keep_numerals=False,
            ),
        )

        term_term_matrix = CorpusVectorizer().fit_transform(corpus, already_tokenized=True).co_occurrence_matrix()

        # Act
        id2w = corpus.id2token.get
        co_occurrences = term_term_matrix_to_co_occurrences(term_term_matrix, threshold_count=1, ignore_ids=set())
        co_occurrences['w1'] = co_occurrences.w1_id.apply(id2w)
        co_occurrences['w2'] = co_occurrences.w2_id.apply(id2w)

        # Assert
        a_b_rows = co_occurrences[(co_occurrences.w1 == 'A') & (co_occurrences.w2 == 'B')]
        # Series.item() replaces the deprecated int(<one-element Series>) and
        # still fails unless exactly one (A, B) row exists.
        assert 2 == a_b_rows['value'].item()

        c_f_rows = co_occurrences[(co_occurrences.w1 == 'C') & (co_occurrences.w2 == 'F')]
        assert 0 == len(c_f_rows)
Ejemplo n.º 6
0
def test_co_occurrence_matrix_of_corpus_returns_correct_result():
    """The co-occurrence matrix of the simple five-document corpus matches the expected counts."""

    expected_token2id = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4}
    # Plain ndarray instead of the discouraged np.matrix; only the upper
    # triangle (excluding the diagonal) carries counts.
    expected_matrix = np.array([[0, 6, 4, 3, 3], [0, 0, 2, 1, 4],
                                [0, 0, 0, 2, 0], [0, 0, 0, 0, 0],
                                [0, 0, 0, 0, 0]])

    corpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_5DOCS)

    v_corpus = CorpusVectorizer().fit_transform(corpus,
                                                already_tokenized=True,
                                                vocabulary=corpus.token2id)

    term_term_matrix = v_corpus.co_occurrence_matrix()

    # array_equal also reports a shape mismatch as a plain False.
    assert np.array_equal(term_term_matrix.toarray(), expected_matrix)
    assert expected_token2id == v_corpus.token2id
Ejemplo n.º 7
0
def test_vectorizer(mary_had_a_little_lamb_corpus: textacy_api.Corpus):  # pylint: disable=redefined-outer-name
    """Vectorize extracted NOUN/PROPN lemmas and compare vocabulary and matrix with expected values."""

    expected_token2id = {
        'child': 0,
        'day': 1,
        'fleece': 2,
        'lamb': 3,
        'love': 4,
        'mary': 5,
        'rule': 6,
        'school': 7,
        'snow': 8,
        'teacher': 9,
    }
    expected_dtm = np.matrix([
        [0, 0, 1, 1, 0, 1, 0, 0, 1, 0],
        [0, 0, 0, 1, 0, 1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 1, 1, 0, 0],
        [1, 0, 0, 1, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
        [1, 0, 0, 1, 1, 1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 1, 0, 0, 0, 1],
    ])

    # Build the extraction pipeline one step at a time.
    extract_opts: textacy_api.ExtractPipeline.ExtractOpts = textacy_api.ExtractPipeline.ExtractOpts(
        include_pos=('NOUN', 'PROPN'), filter_nums=True, filter_punct=True)
    pipeline = textacy_api.ExtractPipeline(
        mary_had_a_little_lamb_corpus, target='lemma', extract_opts=extract_opts)
    pipeline = pipeline.remove_stopwords(extra_stopwords=[])
    pipeline = pipeline.min_character_filter(2)
    pipeline = pipeline.transform(transformer=lambda x: x.lower())
    terms = pipeline.process()

    # Lazily pair every token sequence with a synthetic document name.
    document_terms = ((f'document_{index}.txt', doc_tokens)
                      for index, doc_tokens in enumerate(terms))

    v_corpus: VectorizedCorpus = CorpusVectorizer().fit_transform(
        document_terms, already_tokenized=True)

    assert v_corpus is not None
    assert expected_token2id == v_corpus.token2id
    assert (expected_dtm == v_corpus.data.todense()).all()
Ejemplo n.º 8
0
    def test_fit_transform_gives_document_term_matrix(self):
        """Co-occurrence counts of the vectorized test dataframe match the expected pair counts."""
        # Arrange
        transform_opts = TokensTransformOpts(
            only_any_alphanumeric=False,
            to_lower=False,
            remove_accents=False,
            min_len=1,
            max_len=None,
            keep_numerals=False,
        )
        reader = PandasCorpusReader(self.create_test_dataframe())
        corpus = TokenizedCorpus(reader, transform_opts=transform_opts)

        # Act
        vectorizer = CorpusVectorizer()
        v_corpus = vectorizer.fit_transform(corpus)
        term_term_matrix = v_corpus.co_occurrence_matrix()
        token2id = v_corpus.token2id

        # Assert
        a_b_count = term_term_matrix.todense()[token2id['A'], token2id['B']]
        assert 2 == a_b_count
        c_f_count = term_term_matrix.todense()[token2id['C'], token2id['F']]
        assert 0 == c_f_count
Ejemplo n.º 9
0
def test_fit_transform_when_given_a_vocabulary_returns_same_vocabulary():
    """fit_transform keeps the corpus vocabulary by default and honours a supplied one."""

    corpus = TokenizedCorpus(
        reader=create_reader(),
        transform_opts=TokensTransformOpts(to_lower=True, min_len=10),
    )

    # Without an explicit vocabulary the corpus' own token2id is expected back.
    vocabulary = CorpusVectorizer().fit_transform(
        corpus, already_tokenized=True).token2id

    assert corpus.token2id == vocabulary

    # Mirror every id around the highest id rather than a hard-coded 5, so the
    # mapping stays a true reversal for any vocabulary size.
    # NOTE(review): assumes ids are contiguous from 0 -- confirm.
    highest_id = max(corpus.token2id.values(), default=0)
    expected_vocabulary_reversed = {
        token: highest_id - token_id
        for token, token_id in corpus.token2id.items()
    }

    vocabulary = (CorpusVectorizer().fit_transform(
        corpus,
        already_tokenized=True,
        vocabulary=expected_vocabulary_reversed).token2id)

    assert expected_vocabulary_reversed == vocabulary
Ejemplo n.º 10
0
def test_load_of_uncompressed_corpus(text_corpus):
    """An uncompressed dump loads back with the same term frequencies, index and vocabulary."""
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    dump_tag = 'dump_test'

    # Arrange
    vectorizer = CorpusVectorizer()
    corpus: VectorizedCorpus = vectorizer.fit_transform(text_corpus, already_tokenized=True)
    corpus.dump(tag=dump_tag, folder=OUTPUT_FOLDER, compressed=False)

    # Act
    loaded_corpus: VectorizedCorpus = VectorizedCorpus.load(tag=dump_tag, folder=OUTPUT_FOLDER)

    # Assert
    same_term_frequency = corpus.term_frequency == loaded_corpus.term_frequency
    assert same_term_frequency.all()

    original_index = corpus.document_index.to_dict()
    assert original_index == loaded_corpus.document_index.to_dict()

    assert corpus.token2id == loaded_corpus.token2id
Ejemplo n.º 11
0
def test_TTM_to_co_occurrence_DTM_using_LIL_matrix():
    """Feed one per-document TTM payload at a time into the builder and expect a corpus."""

    source_corpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_5DOCS)
    token2id = Token2Id(source_corpus.token2id)
    document_index: DocumentIndex = source_corpus.document_index

    def generate_payloads(numbered_docs):
        # Lazily vectorize each document on its own and wrap its TTM in a payload.
        for document_id, doc in numbered_docs:
            single_doc_corpus = CorpusVectorizer().fit_transform(
                [doc], already_tokenized=True, vocabulary=token2id.data)
            normal_ttm = VectorizedTTM(
                vectorize_type=VectorizeType.Normal,
                term_term_matrix=single_doc_corpus.co_occurrence_matrix(),
                term_window_counts={},
                document_id=document_id,
            )
            yield CoOccurrencePayload(
                document_id,
                document_name="-",
                ttm_data_map={VectorizeType.Normal: normal_ttm},
            )

    stream: Iterable[CoOccurrencePayload] = generate_payloads(enumerate(source_corpus))

    pair2id: Token2Id = Token2Id()

    builder: CoOccurrenceCorpusBuilder = CoOccurrenceCorpusBuilder(
        vectorize_type=VectorizeType.Normal,
        document_index=document_index,
        pair2id=pair2id,
        token2id=token2id,
    )

    for payload in stream:
        pairs_ingested = builder.ingest_pairs(payload)
        pairs_ingested.add(payload)

    corpus: VectorizedCorpus = builder.corpus

    assert corpus is not None
Ejemplo n.º 12
0
def test_term_frequency_are_absolute_word_of_entire_corpus():
    """Term frequencies equal the absolute token counts over the whole corpus."""

    source = create_corpus()
    v_corpus = CorpusVectorizer().fit_transform(source, already_tokenized=True)
    results = v_corpus.term_frequency

    # Absolute number of occurrences per token.
    expected = {
        'tre': 1,
        'svarta': 1,
        'ekar': 1,
        'ur': 2,
        'snön': 1,
        'så': 3,
        'grova': 1,
        'men': 2,
        'fingerfärdiga': 1,
        'deras': 1,
        'väldiga': 2,
        'flaskor': 1,
        'ska': 1,
        'grönskan': 1,
        'skumma': 1,
        'vår': 1,
        'på': 3,
        'väg': 1,
        'det': 3,
        'långa': 1,
        'mörkret': 2,
        'envist': 1,
        'skimrar': 1,
        'mitt': 1,
        'armbandsur': 1,
        'med': 2,
        'tidens': 1,
        'fångna': 1,
        'insekt': 1,
        'nordlig': 1,
        'storm': 1,
        'är': 5,
        'den': 3,
        'tid': 1,
        'när': 1,
        'rönnbärsklasar': 1,
        'mognar': 1,
        'vaken': 1,
        'hör': 1,
        'man': 2,
        'stjärnbilderna': 1,
        'stampa': 1,
        'sina': 1,
        'spiltor': 1,
        'högt': 1,
        'över': 1,
        'trädet': 1,
        'jag': 4,
        'ligger': 1,
        'sängen': 1,
        'armarna': 1,
        'utbredda': 1,
        'ett': 1,
        'ankare': 1,
        'som': 4,
        'grävt': 1,
        'ner': 1,
        'sig': 1,
        'ordentligt': 1,
        'och': 2,
        'håller': 1,
        'kvar': 1,
        'skuggan': 1,
        'flyter': 1,
        'där': 1,
        'ovan': 1,
        'stora': 1,
        'okända': 1,
        'en': 2,
        'del': 1,
        'av': 1,
        'säkert': 1,
        'viktigare': 1,
        'än': 1,
        'har': 2,
        'sett': 1,
        'mycket': 2,
        'verkligheten': 1,
        'tärt': 1,
        'här': 1,
        'sommaren': 1,
        'till': 1,
        'sist': 1,
    }

    # Re-order the expected counts by token id so they line up with `results`.
    expected_by_id = [
        expected[v_corpus.id2token[token_id]]
        for token_id in range(len(expected))
    ]
    assert (expected_by_id == results).all()
Ejemplo n.º 13
0
def test_fit_transform_creates_a_vocabulary_with_unique_tokens_with_an_id_sequence():
    """The vectorized corpus exposes the same token2id mapping as the source corpus."""
    source = create_corpus()
    vectorized = CorpusVectorizer().fit_transform(source, already_tokenized=True)
    assert source.token2id == vectorized.token2id
Ejemplo n.º 14
0
def vectorized_corpus(text_corpus: TokenizedCorpus) -> VectorizedCorpus:
    """Return `text_corpus` vectorized with a fresh CorpusVectorizer (input treated as already tokenized)."""
    vectorizer = CorpusVectorizer()
    return vectorizer.fit_transform(text_corpus, already_tokenized=True)