class TfTest:
    """Fixture bundling the inputs and expected output of a tf-idf test.

    Three short documents (file ids 0, 2 and 3) are handed to a
    ``MatrixModel`` configured for unigram word tokens with
    ``use_freq=True``, ``use_tf_idf=True`` and
    ``tf_idf_norm_option='l1'``. Both culling settings are ``None``.
    """

    # Options passed to ``MatrixModel`` through its ``test_options`` argument.
    _test_option = MatrixTestOptions(
        file_id_content_map={
            0: "ha ha ha ha la ta ha",
            2: "la la ta ta da da ha",
            3: "ta da ha",
        },
        front_end_option=MatrixFrontEndOption(
            token_option=TokenOption(n_gram_size=1, token_type="word"),
            norm_option=NormOption(
                use_freq=True,
                use_tf_idf=True,
                tf_idf_norm_option="l1",
            ),
            culling_option=CullingOption(
                cull_least_seg=None,
                mfw_lowest_rank=None,
            ),
            id_temp_label_map={
                0: "test_label_1",
                2: "test_label_2",
                3: "test_label_3",
            },
        ),
    )

    # Model built from the options above.
    model = MatrixModel(test_options=_test_option)

    # Expected final matrix: one row per file id, one column per term.
    # Every row sums to 1, which is consistent with the 'l1' norm option.
    # NOTE(review): the individual tf-idf weights were not re-derived here.
    expected_final_dtm = pd.DataFrame(
        data=[
            [0.0, 0.675177, 0.189788, 0.135035],
            [0.326024, 0.115984, 0.326024, 0.231968],
            [0.412709, 0.293646, 0.0, 0.293646],
        ],
        index=[0, 2, 3],
        columns=["da", "ha", "la", "ta"],
    )
class BasicTest:
    """Fixture bundling the options and expected matrices of a basic test.

    Three short documents (file ids 0, 2 and 3) are handed to a
    ``MatrixModel`` configured for unigram word tokens with
    ``use_freq=True`` and ``use_tf_idf=False``. Both culling settings are
    ``None``.
    """

    # Options passed to ``MatrixModel`` through its ``test_options`` argument.
    _test_option = MatrixTestOptions(
        file_id_content_map={
            0: "ha ha ha ha la ta ha",
            2: "la la ta ta da da ha",
            3: "ta da ha",
        },
        front_end_option=MatrixFrontEndOption(
            token_option=TokenOption(n_gram_size=1, token_type="word"),
            norm_option=NormOption(
                use_freq=True,
                use_tf_idf=False,
                tf_idf_norm_option="l1",
            ),
            culling_option=CullingOption(
                cull_least_seg=None,
                mfw_lowest_rank=None,
            ),
            id_temp_label_map={
                0: "test_label_1",
                2: "test_label_2",
                3: "test_label_3",
            },
        ),
    )

    # Model built from the options above.
    model = MatrixModel(test_options=_test_option)

    # Number of occurrences of each term (column) in each document (row).
    expected_raw_count_matrix = pd.DataFrame(
        data=[
            [0.0, 5.0, 1.0, 1.0],
            [2.0, 1.0, 2.0, 2.0],
            [1.0, 1.0, 0.0, 1.0],
        ],
        index=[0, 2, 3],
        columns=["da", "ha", "la", "ta"],
    )

    # The raw counts divided by their row total (e.g. 5/7 for 'ha' in doc 0).
    expected_final_dtm = pd.DataFrame(
        data=[
            [0.0, 0.714285714286, 0.142857142857, 0.142857142857],
            [0.285714285714, 0.142857142857, 0.285714285714, 0.285714285714],
            [0.333333333333, 0.333333333333, 0.0, 0.333333333333],
        ],
        index=[0, 2, 3],
        columns=["da", "ha", "la", "ta"],
    )
class CullingTest:
    """Fixture bundling the options and expected matrices of a culling test.

    Three short documents (file ids 1, 2 and 3) are handed to a
    ``MatrixModel`` configured for unigram word tokens with
    ``use_freq=True``, ``use_tf_idf=False`` and culling settings
    ``cull_least_seg=3`` and ``mfw_lowest_rank=1``.
    """

    # Options passed to ``MatrixModel`` through its ``test_options`` argument.
    # NOTE(review): presumably ``cull_least_seg=3`` keeps terms present in at
    # least 3 documents and ``mfw_lowest_rank=1`` keeps only the single most
    # frequent word -- confirm against ``CullingOption``.
    _test_option = MatrixTestOptions(
        file_id_content_map={
            1: "ha ha ha ha la ha",
            2: "la la ta ta da da ha ha ha",
            3: "la da ha",
        },
        front_end_option=MatrixFrontEndOption(
            token_option=TokenOption(n_gram_size=1, token_type="word"),
            norm_option=NormOption(
                use_freq=True,
                use_tf_idf=False,
                tf_idf_norm_option="l1",
            ),
            culling_option=CullingOption(
                cull_least_seg=3,
                mfw_lowest_rank=1,
            ),
            id_temp_label_map={
                1: "label_1",
                2: "test_2",
                3: "la_la_la_3",
            },
        ),
    )

    # Model built from the options above.
    model = MatrixModel(test_options=_test_option)

    # Number of occurrences of each term (column) in each document (row),
    # listing all four terms found in the documents.
    expected_raw_count_matrix = pd.DataFrame(
        data=[
            [0.0, 5.0, 1.0, 0.0],
            [2.0, 3.0, 2.0, 2.0],
            [1.0, 1.0, 1.0, 0.0],
        ],
        index=[1, 2, 3],
        columns=["da", "ha", "la", "ta"],
    )

    # Only the 'ha' column is expected in the final matrix, at 1.0 per row.
    expected_final_dtm = pd.DataFrame(
        data=[[1.0], [1.0], [1.0]],
        index=[1, 2, 3],
        columns=["ha"],
    )
def _doc_term_matrix(self) -> pd.DataFrame:
    """Provide the document term matrix to work with.

    :return: ``self._test_dtm`` when it is not ``None``; otherwise the
        matrix returned by ``get_matrix()`` on a new ``MatrixModel``.
    """
    if self._test_dtm is None:
        # Nothing was injected: ask a fresh matrix model for the matrix.
        return MatrixModel().get_matrix()

    return self._test_dtm
def _id_temp_label_map(self) -> IdTempLabelMap:
    """Provide the mapping from an id to its temp label.

    :return: ``self._test_id_temp_label_map`` when it is not ``None``;
        otherwise the map returned by ``get_id_temp_label_map()`` on a new
        ``MatrixModel``.
    """
    if self._test_id_temp_label_map is None:
        # Nothing was injected: ask a fresh matrix model for the map.
        return MatrixModel().get_id_temp_label_map()

    return self._test_id_temp_label_map