Example #1
0
def test_compute_and_store_bundle():
    """Storing a pipeline-created bundle writes the expected file to disk."""
    tag: str = str(uuid.uuid4())
    target_folder: str = jj(OUTPUT_FOLDER, tag)
    target_filename: str = co_occurrence.to_filename(folder=target_folder, tag=tag)

    os.makedirs(target_folder, exist_ok=True)

    # Build a bundle from a tiny three-document corpus via the pipeline helper.
    corpus = very_simple_corpus(SIMPLE_CORPUS_ABCDEFG_3DOCS)
    opts: co_occurrence.ContextOpts = co_occurrence.ContextOpts(
        concept={'g'}, ignore_concept=False, context_width=2
    )
    bundle: co_occurrence.Bundle = test_utils.create_simple_bundle_by_pipeline(
        data=corpus,
        context_opts=opts,
        folder=target_folder,
        tag=tag,
    )
    bundle.store()

    assert os.path.isfile(target_filename)

    # Clean up the unique scratch folder created for this test run.
    shutil.rmtree(target_folder, ignore_errors=True)
Example #2
0
def test_term_term_matrix_to_co_occurrences_with_ignore_ids():
    """Pairs involving an ignored token id ('*' padding) are excluded from the result.

    Builds a term-term matrix over a corpus that contains '*' pad tokens, converts
    it to co-occurrence pairs while ignoring the pad id, and asserts the pad id
    never appears on either side of a pair.
    """
    text_corpus = very_simple_corpus(data=[
        ('tran_2019_01_test.txt', ['*', 'b', 'c', 'c']),
        ('tran_2019_02_test.txt', ['a', '*', '*', 'd']),
        ('tran_2019_03_test.txt', ['a', 'e', 'e', 'b']),
        ('tran_2020_01_test.txt', ['*', 'c', 'd', 'a']),
        ('tran_2020_02_test.txt', ['a', 'b', '*', '*']),
    ])
    token2id: Token2Id = Token2Id(text_corpus.token2id)

    term_term_matrix = (dtm.CorpusVectorizer().fit_transform(
        text_corpus, already_tokenized=True,
        vocabulary=text_corpus.token2id).co_occurrence_matrix())

    pad_id = token2id['*']

    co_occurrences = term_term_matrix_to_co_occurrences(
        term_term_matrix=term_term_matrix,
        threshold_count=1,
        ignore_ids={pad_id},  # set literal instead of set([...]) (ruff C405)
    )

    # The pad id must not occur in either column of the resulting pairs.
    assert not (co_occurrences.w1_id == pad_id).any()
    assert not (co_occurrences.w2_id == pad_id).any()
Example #3
0
def test_to_co_occurrence_matrix():
    """`to_co_occurrence_matrix` matches the reference term-term matrix."""
    corpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_5DOCS)

    expected = very_simple_term_term_matrix(corpus)
    actual = co_occurrence.to_co_occurrence_matrix(corpus)

    # Sparse-matrix equality: the inequality matrix has no stored entries.
    assert (expected != actual).nnz == 0
Example #4
0
def test_co_occurrence_matrix_of_corpus_returns_correct_result():
    """Vectorized corpus yields the expected upper-triangular co-occurrence counts.

    Uses `np.array` instead of the deprecated `np.matrix` class (NumPy advises
    against `np.matrix` for new code); elementwise `==` with `.all()` behaves
    the same for a 2-D array.
    """
    expected_token2id = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4}
    # Upper-triangular pair counts for the 5-document a..e corpus.
    expected_matrix = np.array([[0, 6, 4, 3, 3],
                                [0, 0, 2, 1, 4],
                                [0, 0, 0, 2, 0],
                                [0, 0, 0, 0, 0],
                                [0, 0, 0, 0, 0]])

    corpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_5DOCS)

    v_corpus = CorpusVectorizer().fit_transform(corpus,
                                                already_tokenized=True,
                                                vocabulary=corpus.token2id)

    term_term_matrix = v_corpus.co_occurrence_matrix()

    assert (term_term_matrix.todense() == expected_matrix).all()
    assert expected_token2id == v_corpus.token2id
Example #5
0
def test_term_term_matrix_to_co_occurrences_with_no_ignore_ids():
    """With no ignored ids, all TTM mass is preserved and specific pair counts hold."""
    corpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_5DOCS)
    ttm: scipy.sparse.spmatrix = very_simple_term_term_matrix(corpus)

    co_occurrences = term_term_matrix_to_co_occurrences(
        term_term_matrix=ttm,
        threshold_count=1,
        ignore_ids=None,
    )

    token_id = corpus.token2id.get

    def pair_value(w1: str, w2: str) -> int:
        # Count recorded for the (w1, w2) token pair.
        mask = (co_occurrences.w1_id == token_id(w1)) & (co_occurrences.w2_id == token_id(w2))
        return int(co_occurrences[mask].value)

    # Total co-occurrence mass is preserved by the conversion.
    assert co_occurrences.value.sum() == ttm.sum()
    assert pair_value('a', 'c') == 4
    assert pair_value('b', 'd') == 1
Example #6
0
def test_TTM_to_co_occurrence_DTM_using_LIL_matrix():
    """Ingesting per-document TTM payloads into the builder produces a corpus."""
    source_corpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_5DOCS)
    token2id = Token2Id(source_corpus.token2id)
    document_index: DocumentIndex = source_corpus.document_index

    def make_payload(document_id: int, doc) -> CoOccurrencePayload:
        # One Normal-vectorized TTM per document, computed over the shared vocabulary.
        ttm = (
            CorpusVectorizer()
            .fit_transform([doc], already_tokenized=True, vocabulary=token2id.data)
            .co_occurrence_matrix()
        )
        return CoOccurrencePayload(
            document_id,
            document_name="-",
            ttm_data_map={
                VectorizeType.Normal: VectorizedTTM(
                    vectorize_type=VectorizeType.Normal,
                    term_term_matrix=ttm,
                    term_window_counts={},
                    document_id=document_id,
                )
            },
        )

    pair2id: Token2Id = Token2Id()

    builder: CoOccurrenceCorpusBuilder = CoOccurrenceCorpusBuilder(
        vectorize_type=VectorizeType.Normal,
        document_index=document_index,
        pair2id=pair2id,
        token2id=token2id,
    )

    # Same lazy, one-document-at-a-time flow the original generator expression had.
    for document_id, doc in enumerate(source_corpus):
        payload = make_payload(document_id, doc)
        builder.ingest_pairs(payload).add(payload)

    corpus: VectorizedCorpus = builder.corpus

    assert corpus is not None