class TfTest:
    """Fixture bundling the options and expected output for a tf-idf test.

    ``use_freq`` and ``use_tf_idf`` are both switched on, the tf-idf norm
    option is ``l1``, and both culling options are left as ``None``.
    """

    # Three short documents keyed by file id (ids 0, 2 and 3).
    _test_option = MatrixTestOptions(
        file_id_content_map={
            0: "ha ha ha ha la ta ha",
            2: "la la ta ta da da ha",
            3: "ta da ha",
        },
        front_end_option=MatrixFrontEndOption(
            token_option=TokenOption(n_gram_size=1, token_type="word"),
            norm_option=NormOption(
                use_freq=True,
                use_tf_idf=True,
                tf_idf_norm_option="l1",
            ),
            culling_option=CullingOption(
                cull_least_seg=None,
                mfw_lowest_rank=None,
            ),
            id_temp_label_map={
                0: "test_label_1",
                2: "test_label_2",
                3: "test_label_3",
            },
        ),
    )

    model = MatrixModel(test_options=_test_option)

    # One row per file id, one column per word; every row sums to 1 up to
    # rounding of the six-decimal literals.
    expected_final_dtm = pd.DataFrame(
        data=[
            [0.0, 0.675177, 0.189788, 0.135035],
            [0.326024, 0.115984, 0.326024, 0.231968],
            [0.412709, 0.293646, 0.0, 0.293646],
        ],
        index=[0, 2, 3],
        columns=["da", "ha", "la", "ta"],
    )
class BasicTest:
    """Fixture bundling the options and expected output for a basic test.

    ``use_freq`` is on, ``use_tf_idf`` is off, and both culling options are
    left as ``None``.
    """

    # Three short documents keyed by file id (ids 0, 2 and 3).
    _test_option = MatrixTestOptions(
        file_id_content_map={
            0: "ha ha ha ha la ta ha",
            2: "la la ta ta da da ha",
            3: "ta da ha",
        },
        front_end_option=MatrixFrontEndOption(
            token_option=TokenOption(n_gram_size=1, token_type="word"),
            norm_option=NormOption(
                use_freq=True,
                use_tf_idf=False,
                tf_idf_norm_option="l1",
            ),
            culling_option=CullingOption(
                cull_least_seg=None,
                mfw_lowest_rank=None,
            ),
            id_temp_label_map={
                0: "test_label_1",
                2: "test_label_2",
                3: "test_label_3",
            },
        ),
    )

    model = MatrixModel(test_options=_test_option)

    # Word counts per document: one row per file id, one column per word.
    expected_raw_count_matrix = pd.DataFrame(
        data=[
            [0.0, 5.0, 1.0, 1.0],
            [2.0, 1.0, 2.0, 2.0],
            [1.0, 1.0, 0.0, 1.0],
        ],
        index=[0, 2, 3],
        columns=["da", "ha", "la", "ta"],
    )

    # The raw counts above divided by each document's total word count
    # (7, 7 and 3 respectively).
    expected_final_dtm = pd.DataFrame(
        data=[
            [0.0, 0.714285714286, 0.142857142857, 0.142857142857],
            [0.285714285714, 0.142857142857, 0.285714285714, 0.285714285714],
            [0.333333333333, 0.333333333333, 0.0, 0.333333333333],
        ],
        index=[0, 2, 3],
        columns=["da", "ha", "la", "ta"],
    )
class CullingTest:
    """Fixture bundling the options and expected output for a culling test.

    ``use_freq`` is on, ``use_tf_idf`` is off, and culling is configured
    with ``cull_least_seg=3`` and ``mfw_lowest_rank=1``.
    """

    # Three short documents keyed by file id (ids 1, 2 and 3).
    _test_option = MatrixTestOptions(
        file_id_content_map={
            1: "ha ha ha ha la ha",
            2: "la la ta ta da da ha ha ha",
            3: "la da ha",
        },
        front_end_option=MatrixFrontEndOption(
            token_option=TokenOption(n_gram_size=1, token_type="word"),
            norm_option=NormOption(
                use_freq=True,
                use_tf_idf=False,
                tf_idf_norm_option="l1",
            ),
            # NOTE(review): presumably "keep words found in at least 3
            # documents" and "keep only the single most frequent word" --
            # confirm against CullingOption.
            culling_option=CullingOption(
                cull_least_seg=3,
                mfw_lowest_rank=1,
            ),
            id_temp_label_map={
                1: "label_1",
                2: "test_2",
                3: "la_la_la_3",
            },
        ),
    )

    model = MatrixModel(test_options=_test_option)

    # Word counts per document: one row per file id, one column per word.
    expected_raw_count_matrix = pd.DataFrame(
        data=[
            [0.0, 5.0, 1.0, 0.0],
            [2.0, 3.0, 2.0, 2.0],
            [1.0, 1.0, 1.0, 0.0],
        ],
        index=[1, 2, 3],
        columns=["da", "ha", "la", "ta"],
    )

    # Only the "ha" column is expected in the final matrix, with value 1
    # for every document.
    expected_final_dtm = pd.DataFrame(
        data=[[1.0], [1.0], [1.0]],
        index=[1, 2, 3],
        columns=["ha"],
    )
Ejemplo n.º 4
0
 def _doc_term_matrix(self) -> pd.DataFrame:
     """:return: the document term matrix."""
     return self._test_dtm if self._test_dtm is not None \
         else MatrixModel().get_matrix()
Ejemplo n.º 5
0
 def _id_temp_label_map(self) -> IdTempLabelMap:
     """Give back the map that takes a file id to its temp label.

     :return: ``self._test_id_temp_label_map`` when it is not ``None``;
         otherwise the result of ``get_id_temp_label_map()`` on a newly
         created ``MatrixModel``.
     """
     injected_map = self._test_id_temp_label_map
     if injected_map is None:
         # Nothing was injected, so ask a fresh model for the map.
         return MatrixModel().get_id_temp_label_map()
     return injected_map