def test_tm_adjust_expect_true(self):
        """Check the shape and sparse format returned by Term_Matrix_Adjust.

        The adjustment is run on two fresh ``docs_matrix`` objects: once after
        ``document_term_matrix`` (expected result: 9 x 5, 'csr') and once after
        ``term_document_matrix`` (expected result: 5 x 9, 'csc').
        """

        adjust_kwargs = dict(sparsity_thresh=0.8, to_array=False)

        # --- document-term-matrix --------------------------------------------

        dtm_builder = txtPY.docs_matrix()
        dtm_builder.Term_Matrix(**params_term_matrix.args_tm)

        # Result unused: the call only has to happen before Term_Matrix_Adjust.
        dtm_builder.document_term_matrix(to_array=False)

        adjusted_dtm = dtm_builder.Term_Matrix_Adjust(**adjust_kwargs)

        assert adjusted_dtm.shape == (9, 5)
        assert adjusted_dtm.getformat() == 'csr'

        # --- term-document-matrix --------------------------------------------

        tdm_builder = txtPY.docs_matrix()
        tdm_builder.Term_Matrix(**params_term_matrix.args_tm)

        # Result unused: the call only has to happen before Term_Matrix_Adjust.
        tdm_builder.term_document_matrix(to_array=False)

        adjusted_tdm = tdm_builder.Term_Matrix_Adjust(**adjust_kwargs)

        assert adjusted_tdm.shape == (5, 9)
        assert adjusted_tdm.getformat() == 'csc'
    def test_assoc_expect_true(self):
        """Check the result shapes of term_associations.

        Without ``keep_terms`` every result is expected to have shape (52, 2);
        with ``keep_terms=20`` the expected shape is (20, 2). When several
        terms are requested, the result is indexed by term.
        """

        full_shape = (52, 2)
        kept_shape = (20, 2)

        builder = txtPY.docs_matrix()
        builder.Term_Matrix(**params_term_matrix.args_tm)

        # single term, keep_terms not passed
        single_full = builder.term_associations(Terms=['the'])
        assert single_full.shape == full_shape

        # single term, keep_terms = 20
        single_kept = builder.term_associations(Terms=['the'], keep_terms=20)
        assert single_kept.shape == kept_shape

        query_terms = ['the', 'of', 'or']

        # multiple terms, keep_terms not passed
        multi_full = builder.term_associations(Terms=query_terms)
        for term in query_terms:
            assert multi_full[term].shape == full_shape

        # multiple terms, keep_terms = 20
        multi_kept = builder.term_associations(Terms=query_terms,
                                               keep_terms=20)
        for term in query_terms:
            assert multi_kept[term].shape == kept_shape
    def test_corpus_dtm_expect_true(self):
        """corpus_terms() is expected to give a numpy array of 53 entries."""

        builder = txtPY.docs_matrix()
        builder.Term_Matrix(**params_term_matrix.args_tm)

        terms = builder.corpus_terms()

        assert isinstance(terms, np.ndarray)
        assert len(terms) == 53
    def test_corpus_Sparsity_percentage_expect_true(self):
        """Sparsity() is expected to return a string after Term_Matrix has run."""

        # `basestring` is a Python 2 builtin that does not exist on Python 3.
        # Resolve the string type up front so the test checks the return type
        # instead of failing with a NameError. If `basestring` is available
        # (Python 2, or a compatibility alias defined elsewhere) it is used
        # unchanged.
        try:
            string_types = basestring  # noqa: F821
        except NameError:
            string_types = str

        tm = txtPY.docs_matrix()

        tm.Term_Matrix(**params_term_matrix.args_tm)

        spst = tm.Sparsity()

        assert isinstance(spst, string_types)
    def test_sparsity_error_handling(self):
        """Sparsity() on a fresh object must raise, asking for Term_Matrix first."""

        expected_msg = 'run first the Term_Matrix method'
        builder = txtPY.docs_matrix()

        with pytest.raises(Exception) as caught:
            builder.Sparsity()

        assert expected_msg in str(caught.value)
    def test_assoc_error_handling0(self):
        """term_associations() on a fresh object must raise, asking for Term_Matrix."""

        expected_msg = 'run first the Term_Matrix method'
        builder = txtPY.docs_matrix()

        with pytest.raises(Exception) as caught:
            builder.term_associations(Terms=['the'])

        assert expected_msg in str(caught.value)
    def test_corpus_error_handling(self):
        """corpus_terms() on a fresh object must raise with the expected hint."""

        expected_msg = (
            "run first one of the 'document_term_matrix', 'term_document_matrix' "
            "and/or 'Term_Matrix_Adjust' methods and then require the corpus terms"
        )
        builder = txtPY.docs_matrix()

        with pytest.raises(Exception) as caught:
            builder.corpus_terms()

        assert expected_msg in str(caught.value)
    def test_mft1_error_handling(self):
        """most_frequent_terms() must raise when built with the ``args_tm1`` setup.

        The expected message fragment is ``params_term_matrix.msg_mft1``.
        """

        builder = txtPY.docs_matrix()
        builder.Term_Matrix(**params_term_matrix.args_tm1)

        with pytest.raises(Exception) as caught:
            builder.most_frequent_terms()

        raised_text = str(caught.value)
        assert params_term_matrix.msg_mft1 in raised_text
    def test_term_error_handling(self):
        """Each invalid Term_Matrix argument set must raise its paired message.

        ``params_term_matrix.lst_term[i]`` holds the keyword arguments and
        ``params_term_matrix.msg_term[i]`` the message fragment expected for
        them. All cases run against the same ``docs_matrix`` object.
        """

        builder = txtPY.docs_matrix()

        invalid_kwargs = params_term_matrix.lst_term
        expected_msgs = params_term_matrix.msg_term

        # Pair by position so a missing message still surfaces as an error
        # rather than being silently skipped.
        for case_idx in range(len(invalid_kwargs)):
            with pytest.raises(Exception) as caught:
                builder.Term_Matrix(**invalid_kwargs[case_idx])

            assert expected_msgs[case_idx] in str(caught.value)
    def test_tm_adjust_error_handling0(self):
        """Term_Matrix_Adjust() must raise if no matrix method was run before it."""

        expected_msg = ("run first one of the 'document_term_matrix' or "
                        "'term_document_matrix' methods")

        builder = txtPY.docs_matrix()
        builder.Term_Matrix(**params_term_matrix.args_tm)

        with pytest.raises(Exception) as caught:
            builder.Term_Matrix_Adjust()

        assert expected_msg in str(caught.value)
    def test_mft_error_handling(self):
        """Each invalid most_frequent_terms argument set must raise its message.

        ``params_term_matrix.lst_mft[i]`` holds the keyword arguments and
        ``params_term_matrix.msg_mft[i]`` the message fragment expected for
        them.
        """

        builder = txtPY.docs_matrix()
        builder.Term_Matrix(**params_term_matrix.args_tm)

        invalid_kwargs = params_term_matrix.lst_mft
        expected_msgs = params_term_matrix.msg_mft

        # Pair by position so a missing message still surfaces as an error
        # rather than being silently skipped.
        for case_idx in range(len(invalid_kwargs)):
            with pytest.raises(Exception) as caught:
                builder.most_frequent_terms(**invalid_kwargs[case_idx])

            assert expected_msgs[case_idx] in str(caught.value)
    def test_dtm_expect_true(self):
        """Check both return forms of document_term_matrix.

        With ``to_array=False`` a 9 x 53 'csr' sparse matrix is expected; with
        ``to_array=True`` the result must be exactly a ``numpy.ndarray``.
        """

        builder = txtPY.docs_matrix()
        builder.Term_Matrix(**params_term_matrix.args_tm)

        sparse_dtm = builder.document_term_matrix(to_array=False)

        assert sparse_dtm.getformat() == 'csr'
        assert sparse_dtm.shape == (9, 53)

        dense_dtm = builder.document_term_matrix(to_array=True)

        # Exact type comparison on purpose: subclasses of ndarray do not pass.
        assert type(dense_dtm) is np.ndarray
    def test_corpus_sparsity_expect_true(self):
        """corpus_terms() must match the row count of the adjusted matrix.

        The matrix is built with ``term_document_matrix`` and then reduced with
        ``Term_Matrix_Adjust(sparsity_thresh=0.8)``; the corpus terms are
        expected to be a numpy array with one entry per remaining row.
        """

        builder = txtPY.docs_matrix()
        builder.Term_Matrix(**params_term_matrix.args_tm)

        # Result unused: the call only has to happen before Term_Matrix_Adjust.
        builder.term_document_matrix(to_array=False)

        adjusted = builder.Term_Matrix_Adjust(sparsity_thresh=0.8,
                                              to_array=False)
        remaining_terms = builder.corpus_terms()

        assert isinstance(remaining_terms, np.ndarray)
        assert len(remaining_terms) == adjusted.shape[0]
    def test_tm_adjust_error_handling(self):
        """Each invalid Term_Matrix_Adjust argument set must raise its message.

        ``params_term_matrix.lst_adj[i]`` holds the keyword arguments and
        ``params_term_matrix.msg_adj[i]`` the message fragment expected for
        them.
        """

        builder = txtPY.docs_matrix()
        builder.Term_Matrix(**params_term_matrix.args_tm)

        # Result unused: the call only has to happen before Term_Matrix_Adjust.
        builder.document_term_matrix(to_array=False)

        invalid_kwargs = params_term_matrix.lst_adj
        expected_msgs = params_term_matrix.msg_adj

        # Pair by position so a missing message still surfaces as an error
        # rather than being silently skipped.
        for case_idx in range(len(invalid_kwargs)):
            with pytest.raises(Exception) as caught:
                builder.Term_Matrix_Adjust(**invalid_kwargs[case_idx])

            assert expected_msgs[case_idx] in str(caught.value)
    def test_mft_expect_true(self):
        """Check the result shapes of most_frequent_terms.

        Without arguments the expected shape is (53, 2); with ``keep_terms``
        set, the expected row count equals ``keep_terms``.
        """

        builder = txtPY.docs_matrix()
        builder.Term_Matrix(**params_term_matrix.args_tm)

        all_terms = builder.most_frequent_terms()
        assert all_terms.shape == (53, 2)

        n_keep = 10
        top_terms = builder.most_frequent_terms(keep_terms=n_keep)
        assert top_terms.shape == (n_keep, 2)
    def test_mft_error_handling0(self):
        """most_frequent_terms() must raise in two situations.

        First on a fresh object (Term_Matrix not run yet), then again after
        Term_Matrix has been run with ``params_term_matrix.args_tm_mft``.
        """

        not_built_msg = 'run first the Term_Matrix method'
        invalid_setup_msg = (
            "the most_frequent_terms method is invalid if the normalize parameter "
            "is not None or the tf_idf parameter is TRUE"
        )

        builder = txtPY.docs_matrix()

        # Case 1: nothing has been built yet.
        with pytest.raises(Exception) as caught:
            builder.most_frequent_terms()

        assert not_built_msg in str(caught.value)

        # Case 2: built with the ``args_tm_mft`` configuration.
        builder.Term_Matrix(**params_term_matrix.args_tm_mft)

        with pytest.raises(Exception) as caught:
            builder.most_frequent_terms()

        assert invalid_setup_msg in str(caught.value)
    def test_dtm_term_error_handling(self):
        """document_term_matrix() must raise in two situations.

        First on a fresh object (Term_Matrix not run yet), then after
        Term_Matrix has run, because ``to_array`` is given as the string
        'False' instead of a boolean.
        """

        not_built_msg = 'run first the Term_Matrix method'
        bad_type_msg = 'the to_array parameter should be of type boolean'

        builder = txtPY.docs_matrix()

        # Case 1: nothing has been built yet.
        with pytest.raises(Exception) as caught:
            builder.document_term_matrix(to_array='False')

        assert not_built_msg in str(caught.value)

        # Case 2: matrix is built, so the non-boolean argument is reported.
        builder.Term_Matrix(**params_term_matrix.args_tm)

        with pytest.raises(Exception) as caught:
            builder.document_term_matrix(to_array='False')

        assert bad_type_msg in str(caught.value)