Example No. 1
def compute_kmeans(corpus: VectorizedCorpus, tokens: List[str] = None, n_clusters: int = 8, **kwargs):
    """Computes KMeans clusters using `sklearn.cluster.KMeans`(https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html)"""
    data: scipy.sparse.spmatrix = corpus.data if tokens is None else corpus.data[:, corpus.token_indices(tokens)]

    km = sklearn.cluster.KMeans(n_clusters=n_clusters, **kwargs).fit(data.T)

    return KMeansCorpusClusters(corpus, tokens, KMeansResult(centroids=km.cluster_centers_, labels=km.labels_))
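Note that compute_kmeans fits on data.T, so it is the token vectors (the columns of the document-term matrix), not the documents, that are clustered. A minimal sketch of the same idea with plain scikit-learn on the small 5 x 4 matrix used by several fixtures below (not part of penelope; the cluster count and random state are arbitrary):

import numpy as np
from sklearn.cluster import KMeans

# Documents x tokens bag-of-terms matrix (5 documents, tokens 'a'..'d').
bag_term_matrix = np.array([[2, 1, 4, 1], [2, 2, 3, 0], [2, 3, 2, 0],
                            [2, 4, 1, 1], [2, 0, 1, 1]])

# Transposing makes each row a token vector, so KMeans groups tokens, not documents.
km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(bag_term_matrix.T)

print(km.labels_)           # one cluster label per token
print(km.cluster_centers_)  # one centroid per cluster, in document space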
Example No. 2
def co_occurrence_corpus_to_co_occurrence(
    *,
    coo_corpus: VectorizedCorpus,
    token2id: Token2Id,
) -> CoOccurrenceDataFrame:
    """Creates a co-occurrence data frame from a co-occurrence DTM corpus."""
    return coo_corpus.to_co_occurrences(token2id)
Example No. 3
def simple_corpus_with_pivot_keys():
    corpus = VectorizedCorpus(
        bag_term_matrix=np.array([
            [2, 1, 4, 1],
            [2, 2, 3, 0],
            [2, 3, 2, 0],
            [2, 4, 1, 1],
            [2, 0, 1, 1],
        ]),
        token2id={
            'a': 0,
            'b': 1,
            'c': 2,
            'd': 3
        },
        document_index=pd.DataFrame(
            {
                'year': [2009, 2013, 2014, 2017, 2017],
                'color_id': [0, 0, 1, 2, 3],
                'cov_id': [1, 1, 2, 2, 3],
                'document_id': [0, 1, 2, 3, 4],
                'document_name': [f'doc_{y}_{i}' for i, y in enumerate(range(0, 5))],
                'filename': [f'doc_{y}_{i}.txt' for i, y in enumerate(range(0, 5))],
            },
            dtype=np.int16,
        ),
    )
    return corpus
Example No. 4
def test_dump_and_store_of_corpus_with_empty_trailing_row():
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    bag_term_matrix = np.array([[2, 1, 4, 1], [2, 2, 3, 0], [0, 0, 0, 0]])
    token2id = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
    document_index = pd.DataFrame({'year': [2013, 2013, 2014]})
    corpus: VectorizedCorpus = VectorizedCorpus(bag_term_matrix,
                                                token2id=token2id,
                                                document_index=document_index)

    corpus.dump(tag="ZERO", folder="./tests/output")

    loaded_corpus = VectorizedCorpus.load(tag="ZERO", folder="./tests/output")

    assert corpus.data.shape == loaded_corpus.data.shape
Example No. 5
def test_co_occurrence_matrix(corpus: VectorizedCorpus):
    m = corpus.co_occurrence_matrix()
    assert m is not None
    assert (m == np.matrix([
        [0, 20, 22, 6],
        [0, 0, 20, 5],
        [0, 0, 0, 6],
        [0, 0, 0, 0],
    ])).all()
Example No. 6
def test_to_bag_of_terms(corpus: VectorizedCorpus):
    expected_docs = [
        ['a', 'a', 'b', 'c', 'c', 'c', 'c', 'd'],
        ['a', 'a', 'b', 'b', 'c', 'c', 'c'],
        ['a', 'a', 'b', 'b', 'b', 'c', 'c'],
        ['a', 'a', 'b', 'b', 'b', 'b', 'c', 'd'],
        ['a', 'a', 'c', 'd'],
    ]
    assert [list(x) for x in corpus.to_bag_of_terms()] == expected_docs
Example No. 7
def create_vectorized_corpus():
    bag_term_matrix = np.array([[2, 1, 4, 1], [2, 2, 3, 0], [2, 3, 2, 0],
                                [2, 4, 1, 1], [2, 0, 1, 1]])
    token2id = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
    document_index = pd.DataFrame({'year': [2013, 2013, 2014, 2014, 2014]})
    corpus = VectorizedCorpus(bag_term_matrix,
                              token2id=token2id,
                              document_index=document_index)
    return corpus
Example No. 8
def compute_kmeans2(corpus: VectorizedCorpus, tokens: List[str] = None, n_clusters: int = 8, **kwargs):
    """Computes KMeans clusters using `scipy.cluster.vq.kmeans2` (https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.vq.kmeans2.html"""
    data: scipy.sparse.spmatrix = corpus.data if tokens is None else corpus.data[:, corpus.token_indices(tokens)]
    data = data.T.todense()
    if not np.issubdtype(data.dtype, np.floating):
        data = data.astype(np.float64)
    centroids, labels = scipy.cluster.vq.kmeans2(data, n_clusters, **kwargs)

    return KMeansCorpusClusters(corpus, tokens, KMeansResult(centroids=centroids, labels=labels))
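As in the sklearn variant, the observations handed to kmeans2 are the token vectors. A minimal standalone call on toy data (plain SciPy, not part of penelope; minit='++' is just one reasonable initialisation choice):

import numpy as np
import scipy.cluster.vq

# Four token vectors in a five-document space, already dense and floating point.
observations = np.array([[2.0, 2.0, 2.0, 2.0, 2.0],
                         [1.0, 2.0, 3.0, 4.0, 0.0],
                         [4.0, 3.0, 2.0, 1.0, 1.0],
                         [1.0, 0.0, 0.0, 1.0, 1.0]])

centroids, labels = scipy.cluster.vq.kmeans2(observations, 2, minit='++')
print(labels)  # one cluster label per token vector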
Example No. 9
def test_bag_term_matrix_to_bag_term_docs(corpus: VectorizedCorpus):

    doc_ids = (
        0,
        1,
    )
    expected = [['a', 'a', 'b', 'c', 'c', 'c', 'c', 'd'],
                ['a', 'a', 'b', 'b', 'c', 'c', 'c']]
    docs = corpus.to_bag_of_terms(doc_ids)
    assert expected == ([list(d) for d in docs])

    expected = [
        ['a', 'a', 'b', 'c', 'c', 'c', 'c', 'd'],
        ['a', 'a', 'b', 'b', 'c', 'c', 'c'],
        ['a', 'a', 'b', 'b', 'b', 'c', 'c'],
        ['a', 'a', 'b', 'b', 'b', 'b', 'c', 'd'],
        ['a', 'a', 'c', 'd'],
    ]
    docs = corpus.to_bag_of_terms()
    assert expected == ([list(d) for d in docs])
Example No. 10
def test_find_matching_indices(corpus: VectorizedCorpus):

    corpus._token2id = {"bengt": 0, "bertil": 1, "eva": 2, "julia": 3}  # pylint: disable=protected-access

    assert set(corpus.find_matching_words_indices(["jens"], 4)) == set()
    assert set(corpus.find_matching_words_indices([], 4)) == set()
    assert set(corpus.find_matching_words_indices(["bengt"], 4)) == {0}
    assert set(corpus.find_matching_words_indices(["b*"], 4)) == {0, 1}
    assert set(corpus.find_matching_words_indices(["|.*a|"], 4)) == {2, 3}
    assert set(corpus.find_matching_words_indices(["*"], 4)) == {0, 1, 2, 3}
Example No. 11
    def corpus(self) -> VectorizedCorpus:
        shape: Tuple[int, int] = (len(self.document_index), len(self.pair2id))
        self.matrix = sp.coo_matrix((self.data, (self.row, self.col)),
                                    shape=shape)
        corpus: VectorizedCorpus = VectorizedCorpus(
            bag_term_matrix=self.matrix.tocsr(),
            token2id=dict(self.pair2id.data),
            document_index=self.document_index.set_index('document_id',
                                                         drop=False),
        )

        return corpus
Example No. 12
def create_smaller_vectorized_corpus():
    bag_term_matrix = np.array([[2, 1, 4, 1], [2, 2, 3, 0], [2, 3, 2, 0],
                                [2, 4, 1, 1], [2, 0, 1, 1]])
    token2id = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
    document_index = pd.DataFrame({
        'year': [2013, 2013, 2014, 2014, 2014],
        'filename': ['2013.txt', '2013.txt', '2014.txt', '2014.txt', '2014.txt'],
        'document_id': [0, 1, 2, 3, 4],
    })
    v_corpus = VectorizedCorpus(bag_term_matrix,
                                token2id=token2id,
                                document_index=document_index)
    return v_corpus
Example No. 13
def test_group_by_year_with_average():

    corpus = [
        "the house had a tiny little mouse",
        "the cat saw the mouse",
        "the mouse ran away from the house",
        "the cat finally ate the mouse",
        "the end of the mouse story",
    ]
    expected_bag_term_matrix = np.array([
        [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 2, 0],
        [0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 2, 0],
        [1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 2, 0],
    ])

    expected_bag_term_matrix_sums = np.array([
        expected_bag_term_matrix[[0, 1, 2], :].sum(axis=0),
        expected_bag_term_matrix[[3, 4], :].sum(axis=0)
    ])

    expected_bag_term_matrix_means = np.array([
        expected_bag_term_matrix[[0, 1, 2], :].sum(axis=0) / 3.0,
        expected_bag_term_matrix[[3, 4], :].sum(axis=0) / 2.0,
    ])

    document_index = pd.DataFrame({
        'year': [1, 1, 1, 2, 2],
        'document_id': range(0, 5)
    })

    vec = CountVectorizer()
    bag_term_matrix = vec.fit_transform(corpus)

    v_corpus: VectorizedCorpus = VectorizedCorpus(
        bag_term_matrix,
        token2id=vec.vocabulary_,
        document_index=document_index)

    assert np.allclose(expected_bag_term_matrix, bag_term_matrix.todense())

    y_sum_corpus = v_corpus.group_by_year(aggregate='sum', fill_gaps=True)
    y_mean_corpus = v_corpus.group_by_year(aggregate='mean', fill_gaps=True)

    assert np.allclose(expected_bag_term_matrix_sums,
                       y_sum_corpus.data.todense())
    assert np.allclose(expected_bag_term_matrix_means,
                       y_mean_corpus.data.todense())
Example No. 14
def test_normalize_by_raw_counts():

    corpus: VectorizedCorpus = VectorizedCorpus(
        bag_term_matrix=np.array([[4, 3, 7, 1], [6, 7, 4, 2]]),
        token2id={
            'a': 0,
            'b': 1,
            'c': 2,
            'd': 3
        },
        document_index=pd.DataFrame({'year': [2013, 2014]}),
    )

    n_corpus = corpus.normalize()
    t_corpus = corpus.normalize_by_raw_counts()
    assert np.allclose(t_corpus.data.todense(), n_corpus.data.todense())
Example No. 15
def compute_hca(
    corpus: VectorizedCorpus, tokens: List[str], linkage_method: str = 'ward', linkage_metric: str = 'euclidean'
) -> HCACorpusClusters:
    """Computes HCA clusters using `scipy.cluster.hierarchy.linkage` (https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html"""
    data = corpus.data if tokens is None else corpus.data[:, corpus.token_indices(tokens)]

    linkage_matrix = linkage(data.T.todense(), method=linkage_method, metric=linkage_metric)
    """ from documentation

        A (n-1) by 4 matrix Z is returned. At the i-th iteration, clusters with token_ids Z[i, 0] and Z[i, 1] are combined to form cluster n + i.
        A cluster with an index less than n corresponds to one of the original observations.
        The distance between clusters Z[i, 0] and Z[i, 1] is given by Z[i, 2].
        The fourth value Z[i, 3] represents the number of original observations in the newly formed cluster.

    """

    return HCACorpusClusters(corpus, tokens, linkage_matrix)
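The quoted documentation describes the structure of the linkage matrix itself; to assign tokens to a fixed number of flat clusters one would typically cut the resulting dendrogram, for instance with scipy.cluster.hierarchy.fcluster. A minimal sketch on toy data (plain SciPy, not part of penelope; the cluster count is arbitrary):

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

# Toy token vectors (rows are observations), matching the data.T layout used above.
token_vectors = np.array([[2.0, 2.0, 2.0, 2.0, 2.0],
                          [1.0, 2.0, 3.0, 4.0, 0.0],
                          [4.0, 3.0, 2.0, 1.0, 1.0],
                          [1.0, 0.0, 0.0, 1.0, 1.0]])

Z = linkage(token_vectors, method='ward', metric='euclidean')

# Cut the dendrogram into at most two flat clusters; one label per original row.
labels = fcluster(Z, t=2, criterion='maxclust')
print(labels)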
Example No. 16
def test_load_of_uncompressed_corpus(text_corpus):
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    # Arrange
    corpus: VectorizedCorpus = CorpusVectorizer().fit_transform(
        text_corpus, already_tokenized=True)

    corpus.dump(tag='dump_test', folder=OUTPUT_FOLDER, compressed=False)

    # Act
    loaded_corpus: VectorizedCorpus = VectorizedCorpus.load(
        tag='dump_test', folder=OUTPUT_FOLDER)

    # Assert
    assert (corpus.term_frequency == loaded_corpus.term_frequency).all()
    assert corpus.document_index.to_dict() == loaded_corpus.document_index.to_dict()
    assert corpus.token2id == loaded_corpus.token2id
Example No. 17
def store_corpus_bundle(corpus: VectorizedCorpus, args: interface.ComputeOpts):

    if VectorizedCorpus.dump_exists(tag=args.corpus_tag, folder=args.target_folder):
        VectorizedCorpus.remove(tag=args.corpus_tag, folder=args.target_folder)

    target_folder = args.target_folder

    if args.create_subfolder:
        if os.path.split(target_folder)[1] != args.corpus_tag:
            target_folder = os.path.join(target_folder, args.corpus_tag)
        os.makedirs(target_folder, exist_ok=True)

    corpus.dump(tag=args.corpus_tag, folder=target_folder)

    VectorizedCorpus.dump_options(
        tag=args.corpus_tag,
        folder=target_folder,
        options=args.props,
    )
Example No. 18
def test_group_by_time_period_aggregates_DTM_to_PTM():

    bag_term_matrix = np.array([[2, 1, 4, 1], [2, 2, 3, 0], [2, 3, 2, 0],
                                [2, 4, 1, 1], [2, 0, 1, 1]])
    token2id = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
    document_index = pd.DataFrame({
        'year': [2009, 2013, 2014, 2017, 2017],
        'filename': ['2009.txt', '2013.txt', '2014.txt', '2017.txt', '2017.txt'],
        'document_id': [0, 1, 2, 3, 4],
    })
    corpus = VectorizedCorpus(bag_term_matrix,
                              token2id=token2id,
                              document_index=document_index)

    grouped_corpus = corpus.group_by_time_period_optimized(
        time_period_specifier='year')
    expected_ytm = [[2, 1, 4, 1], [2, 2, 3, 0], [2, 3, 2, 0], [4, 4, 2, 2]]
    assert np.allclose(expected_ytm, grouped_corpus.bag_term_matrix.todense())

    grouped_corpus = corpus.group_by_time_period_optimized(
        time_period_specifier='lustrum')
    expected_ytm = [[2, 1, 4, 1], [4, 5, 5, 0], [4, 4, 2, 2]]
    assert np.allclose(expected_ytm, grouped_corpus.bag_term_matrix.todense())

    grouped_corpus = corpus.group_by_time_period_optimized(
        time_period_specifier='decade')
    expected_ytm = [[2, 1, 4, 1], [8, 9, 7, 2]]
    assert np.allclose(expected_ytm, grouped_corpus.bag_term_matrix.todense())

    grouped_corpus = corpus.group_by_time_period_optimized(
        time_period_specifier='year', fill_gaps=True)
    expected_ytm = np.matrix([
        [2, 1, 4, 1],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [2, 2, 3, 0],
        [2, 3, 2, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [4, 4, 2, 2],
    ])
    assert np.allclose(expected_ytm, grouped_corpus.bag_term_matrix.todense())
    assert len(grouped_corpus.document_index) == 9
    assert is_strictly_increasing(grouped_corpus.document_index.index,
                                  sort_values=False)
Example No. 19
def create_abc_corpus(dtm: List[List[int]],
                      document_years: List[int] = None,
                      token2id: dict = None) -> VectorizedCorpus:

    bag_term_matrix = np.array(dtm)
    token2id = token2id or {
        chr(ord('a') + i): i
        for i in range(0, bag_term_matrix.shape[1])
    }

    years: List[int] = (document_years if document_years is not None else
                        [2000 + i for i in range(0, bag_term_matrix.shape[0])])

    document_index = pd.DataFrame({
        'year': years,
        'filename': [f'{2000+i}_{i}.txt' for i in years],
        'document_id': [i for i in range(0, bag_term_matrix.shape[0])],
    })
    corpus: VectorizedCorpus = VectorizedCorpus(bag_term_matrix,
                                                token2id=token2id,
                                                document_index=document_index)
    return corpus
Example No. 20
def test_from_token_ids_stream():

    tokenized_corpus: MockedProcessedCorpus = mock_corpus()
    token2id: dict = tokenized_corpus.token2id
    id2token: dict = {v: k for k, v in tokenized_corpus.token2id.items()}
    """Arrange: simulate tagged ID frame payloads by turning corpus into a stream of document_id ✕ pd.Series"""
    document_index: pd.DataFrame = tokenized_corpus.document_index
    name2id = document_index.set_index('filename')['document_id'].to_dict().get
    tokens2series = lambda tokens: pd.Series([token2id[t] for t in tokens],
                                             dtype=np.int64)
    stream = [(name2id(filename), tokens2series(tokens))
              for filename, tokens in tokenized_corpus]
    assert [id2token[t] for t in stream[0][1]] == tokenized_corpus.data[0][1]
    """Act: create a vectorized corpus out of stream"""

    vectorized_corpus: VectorizedCorpus = VectorizedCorpus.from_token_id_stream(
        stream, token2id, document_index)

    assert vectorized_corpus is not None
    """Check results"""
    expected_dtm = np.matrix([[2, 1, 4, 1], [2, 2, 3, 0], [2, 3, 2, 0],
                              [2, 4, 1, 1], [2, 0, 1, 1]])

    assert (vectorized_corpus.data.todense() == expected_dtm).all()
Example No. 21
def test_pick_top_tf_map(corpus: VectorizedCorpus):
    assert corpus.pick_top_tf_map(2) == {'a': 10, 'c': 11}
Example No. 22
def test_step_by_step_llr_compute_corpus_keyness_alternative():
    bundle: Bundle = create_keyness_test_bundle(data=SIMPLE_CORPUS_ABCDE_3DOCS)
    opts: ComputeKeynessOpts = create_keyness_opts(keyness=KeynessMetric.HAL_cwr)

    corpus: VectorizedCorpus = bundle.corpus
    concept_corpus: VectorizedCorpus = bundle.concept_corpus
    token2id: Token2Id = bundle.token2id
    pivot_key: str = opts.pivot_column_name

    with inline_code(source=keyness.compute_weighed_corpus_keyness):

        zero_out_indices: Sequence[int] = corpus.zero_out_by_tf_threshold(3)
        concept_corpus.zero_out_by_indices(zero_out_indices)

        with inline_code(source=keyness.compute_corpus_keyness):

            corpus = corpus.group_by_time_period_optimized(
                time_period_specifier=opts.period_pivot,
                target_column_name=pivot_key,
                fill_gaps=opts.fill_gaps,
                aggregate='sum',
            )  # matrix([[3, 0, 0, 0, 3, 4, 8, 3, 5, 6, 3, 0, 3]])

            rows = []
            cols = []
            data = []
            pairs2token = (
                corpus.vocabs_mapping.get
            )  # {(0, 2): 0, (0, 3): 2, (0, 4): 3, (0, 5): 10, (0, 6): 7, (2, 3): 1, (2, 4): 5, (2, 6): 11, (3, 4): 4, (3, 6): 12, (4, 5): 6, (4, 6): 9, (5, 6): 8}
            for document_id, term_term_matrix in corpus.to_term_term_matrix_stream(token2id):  # 0,
                # matrix([[0, 0, 3, 0, 0, 3, 3],
                #         [0, 0, 0, 0, 0, 0, 0],
                #         [0, 0, 0, 0, 4, 0, 0],
                #         [0, 0, 0, 0, 3, 0, 3],
                #         [0, 0, 0, 0, 0, 8, 6],
                #         [0, 0, 0, 0, 0, 0, 5],
                #         [0, 0, 0, 0, 0, 0, 0]])
                meta_data = corpus.document_index[corpus.document_index.document_id == 0].to_dict('records')[0]
                weights, (w1_ids, w2_ids) = metrics.significance(
                    TTM=term_term_matrix,
                    metric=opts.keyness,
                    normalize=opts.normalize,
                    n_contexts=meta_data['n_documents'],
                    n_words=meta_data['n_tokens'],
                )
                # (array([-279.97270999,  -23.03480975, -120.70153416,  -85.94256279,
                #         -17.99472463, -182.2522578 ,  -20.19035001,  144.74677931]),
                # (array([0, 0, 2, 3, 3, 4, 4, 5], dtype=int32),
                # array([5, 6, 4, 4, 6, 5, 6, 6], dtype=int32)))
                token_ids = (pairs2token(p) for p in zip(w1_ids, w2_ids))
                rows.extend([document_id] * len(weights))
                cols.extend(token_ids)
                data.extend(weights)

            bag_term_matrix = scipy.sparse.csr_matrix(
                (data, (rows, cols)),
                shape=(len(corpus.document_index), len(corpus.token2id)),
                dtype=np.float64,
            )

            llr_corpus = VectorizedCorpus(
                bag_term_matrix=bag_term_matrix,
                token2id=corpus.token2id,
                document_index=corpus.document_index,
            ).remember(vocabs_mapping=corpus.vocabs_mapping)

            assert llr_corpus is not None

    pp(llr_corpus.data.todense())
Example No. 23
def test_LEGACY_step_by_step_llr_compute_corpus_keyness():

    bundle: Bundle = create_keyness_test_bundle(data=SIMPLE_CORPUS_ABCDE_3DOCS)
    opts: ComputeKeynessOpts = create_keyness_opts(keyness=KeynessMetric.LLR)

    corpus: VectorizedCorpus = bundle.corpus
    concept_corpus: VectorizedCorpus = bundle.concept_corpus
    token2id: Token2Id = bundle.token2id
    pivot_key: str = opts.pivot_column_name

    with inline_code(source=keyness.compute_weighed_corpus_keyness):

        zero_out_indices: Sequence[int] = corpus.zero_out_by_tf_threshold(3)
        concept_corpus.zero_out_by_indices(zero_out_indices)

        with inline_code(source=keyness.compute_corpus_keyness):

            corpus = corpus.group_by_time_period_optimized(
                time_period_specifier=opts.period_pivot,
                target_column_name=pivot_key,
                fill_gaps=opts.fill_gaps,
                aggregate='sum',
            )  # matrix([[3, 0, 0, 0, 3, 4, 8, 3, 5, 6, 3, 0, 3]])

            """Current implementation"""
            with inline_code(source=ttm_legacy.LegacyCoOccurrenceMixIn.to_keyness_co_occurrence_corpus):

                with inline_code(source=ttm_legacy.LegacyCoOccurrenceMixIn.to_keyness_co_occurrences):

                    co_occurrences: pd.DataFrame = corpus.to_co_occurrences(token2id)

                    with inline_code(source=metrics.partitioned_significances):
                        vocabulary_size: int = len(token2id)
                        co_occurrence_partitions = []
                        for period in co_occurrences[pivot_key].unique():
                            pivot_co_occurrences = co_occurrences[co_occurrences[pivot_key] == period]
                            term_term_matrix = scipy.sparse.csc_matrix(
                                (pivot_co_occurrences.value, (pivot_co_occurrences.w1_id, pivot_co_occurrences.w2_id)),
                                shape=(vocabulary_size, vocabulary_size),
                                dtype=np.float64,
                            )

                            n_contexts = metrics._get_documents_count(corpus.document_index, pivot_co_occurrences)
                            weights, (w1_ids, w2_ids) = metrics.significance(
                                TTM=term_term_matrix,
                                metric=opts.keyness,
                                normalize=opts.normalize,
                                n_contexts=n_contexts,
                            )
                            co_occurrence_partitions.append(
                                pd.DataFrame(
                                    data={pivot_key: period, 'w1_id': w1_ids, 'w2_id': w2_ids, 'value': weights}
                                )
                            )
                        keyness_co_occurrences = pd.concat(co_occurrence_partitions, ignore_index=True)

                    mg = corpus.get_token_ids_2_pair_id(token2id=token2id).get

                    keyness_co_occurrences['token_id'] = [
                        mg((x[0].item(), x[1].item()))
                        for x in keyness_co_occurrences[['w1_id', 'w2_id']].to_records(index=False)
                    ]

                with inline_code(source=ttm_legacy.LegacyCoOccurrenceMixIn._to_co_occurrence_matrix):
                    pg: Callable = {v: k for k, v in corpus.document_index[pivot_key].to_dict().items()}.get
                    llr_matrix: scipy.sparse.spmatrix = scipy.sparse.coo_matrix(
                        (
                            keyness_co_occurrences.value,
                            (
                                keyness_co_occurrences[pivot_key].apply(pg).astype(np.int32),
                                keyness_co_occurrences.token_id.astype(np.int32),
                            ),
                        ),
                        shape=corpus.data.shape,
                    )

                llr_corpus: VectorizedCorpus = VectorizedCorpus(
                    bag_term_matrix=llr_matrix,
                    token2id=corpus.token2id,
                    document_index=corpus.document_index,
                    vocabs_mapping=corpus.vocabs_mapping,
                )

    assert llr_corpus is not None
    pp(llr_corpus.data.todense())
Example No. 24
def test_to_n_top_dataframe(corpus: VectorizedCorpus):
    assert corpus.to_n_top_dataframe(1) is not None
Example No. 25
def test_token_indices(corpus: VectorizedCorpus):
    assert corpus.token_indices(['a', 'c', 'z']) == [0, 2]
Example No. 26
def test_tf_idf(corpus: VectorizedCorpus):
    assert corpus.tf_idf() is not None
Example No. 27
def test_get_top_n_words(corpus: VectorizedCorpus):
    assert corpus.get_top_n_words(n=2) == [('c', 11), ('a', 10)]
Example No. 28
import os

from penelope.common.curve_fit import pchip_spline
from penelope.common.keyness.metrics import KeynessMetric  # , rolling_average_smoother
from penelope.corpus import VectorizedCorpus
from penelope.notebook.word_trends.displayers import TopTokensDisplayer
from penelope.notebook.word_trends.interface import TrendsComputeOpts

# pylint: disable=protected-access

DEFAULT_SMOOTHERS = [pchip_spline]

folder = "/path/to/data"
tag = os.path.split(folder)[1]

corpus: VectorizedCorpus = VectorizedCorpus.load(folder=folder, tag=tag)
compute_opts: TrendsComputeOpts = TrendsComputeOpts(normalize=False,
                                                    keyness=KeynessMetric.TF,
                                                    temporal_key='year')

top_tokens = corpus.get_top_n_words(n=100000)
displayer: TopTokensDisplayer = TopTokensDisplayer()
displayer.setup()

indices = [x[1] for x in top_tokens]
smooth = False
plot_data = displayer._compile(corpus=corpus,
                               compute_opts=compute_opts,
                               indices=indices,
                               smoothers=DEFAULT_SMOOTHERS if smooth else [])
Example No. 29
def test_stats(corpus: VectorizedCorpus):
    assert corpus.stats() is not None
Example No. 30
def test_load_dumped_corpus(mode: str, vectorized_corpus: VectorizedCorpus):

    tag: str = f'{str(uuid.uuid1())[:6]}'
    folder: str = jj(OUTPUT_FOLDER, tag)

    os.makedirs(folder, exist_ok=True)

    vectorized_corpus.dump(tag=tag, folder=folder, compressed=True, mode=mode)

    assert VectorizedCorpus.dump_exists(tag=tag, folder=folder)
    assert VectorizedCorpus.find_tags(folder) == [tag]

    loaded_corpus: VectorizedCorpus = VectorizedCorpus.load(tag=tag,
                                                            folder=folder)
    assert (vectorized_corpus.term_frequency == loaded_corpus.term_frequency).all()
    assert vectorized_corpus.document_index.to_dict() == loaded_corpus.document_index.to_dict()
    assert vectorized_corpus.token2id == loaded_corpus.token2id

    loaded_options: dict = VectorizedCorpus.load_options(tag=tag,
                                                         folder=folder)
    assert loaded_options == dict()

    VectorizedCorpus.dump_options(tag=tag, folder=folder, options=dict(apa=1))
    loaded_options: dict = VectorizedCorpus.load_options(tag=tag,
                                                         folder=folder)
    assert loaded_options == dict(apa=1)

    VectorizedCorpus.remove(tag=tag, folder=folder)
    assert not VectorizedCorpus.dump_exists(tag=tag, folder=folder)
    assert not VectorizedCorpus.find_tags(folder)

    shutil.rmtree(folder)