class TFIDFModelTest(unittest.TestCase):
    """Tests for ``TFIDFModel`` driven by the data held in ``DummyDataset``."""

    def setUp(self):
        """Build a fresh dataset and a non-verbose model for the test."""
        self.data = DummyDataset()
        self.model = TFIDFModel(verbose=False)

    def _assert_dense_equal(self, sparse_result, expected_rows):
        """Assert that the dense form of *sparse_result* equals *expected_rows*.

        The comparison goes through ``np.array_equal`` on the value returned
        by ``sparse_result.todense()``.
        """
        dense = sparse_result.todense()
        self.assertTrue(np.array_equal(dense, expected_rows))

    def test_tocsr(self):
        """Test item/tag matrix conversion."""
        expected_tf = [
            [1, 1, 1, 1, 1, 1],
            [1, 0, 4, 0, 1, 0],
            [1, 0, 2, 0, 2, 1],
            [1, 0, 0, 3, 2, 0],
            [1, 0, 0, 4, 0, 1],
        ]
        # Reaches the model's private helper through its mangled name.
        converted = self.model._TFIDFModel__convert_tocsr(self.data.item_tags)
        self._assert_dense_equal(converted, expected_tf)

    def test_extract(self):
        """Test fact extraction without a threshold."""
        expected_df = [
            [1, 1, 1, 1, 1, 1],
            [1, 0, 1, 0, 1, 0],
            [1, 0, 1, 0, 1, 1],
            [1, 0, 0, 1, 1, 0],
            [1, 0, 0, 1, 0, 1],
        ]
        facts = self.model._TFIDFModel__extract_facts(self.data.item_tags)
        self._assert_dense_equal(facts, expected_df)

    def test_extract_threshold(self):
        """Test preference extraction with a threshold."""
        expected_prefs = [
            [1, 0, 0, 1, 0],
            [1, 1, 1, 1, 1],
            [1, 0, 0, 0, 1],
        ]
        prefs = self.model._TFIDFModel__extract_facts(self.data.ratings, 3.5)
        self._assert_dense_equal(prefs, expected_prefs)

    def test_tfidf_profiles(self):
        """Test TFIDF item profile extraction after building the model."""
        expected_profiles = np.matrix([
            [0.0, 0.86991409, 0.27610534, 0.27610534, 0.12061088, 0.27610534],
            [0.0, 0.0, 0.9940897, 0.0, 0.10856185, 0.0],
            [0.0, 0.0, 0.83309624, 0.0, 0.36392077, 0.41654812],
            [0.0, 0.0, 0.0, 0.96011533, 0.27960428, 0.0],
            [0.0, 0.0, 0.0, 0.9701425, 0.0, 0.24253563],
        ])
        self.model.build(self.data)

        # Both sides go through stringify_matrix before being compared, so the
        # check is on their stringified forms rather than on the raw floats.
        actual_text = stringify_matrix(self.model.I().todense())
        expected_text = stringify_matrix(expected_profiles)
        self.assertTrue(actual_text == expected_text)
help="Classification Type , junk-class or sensitive class ") argp.add_argument("-s" , "--sample_interval_mode" , choices=[sampleIntervalMode.LINE_MODE , sampleIntervalMode.CRLF_MODE] , default=sampleIntervalMode.LINE_MODE , help="The mode with describes what is the inverval symbol between samples , default is LINE_MODE") argp.add_argument("-i" , "--input" , type=str , default="stdin" , help="'sysin' for using standard input ; else file path is needed.") args = argp.parse_args() logging.info("loadding segmentor") segmentor = Segmentor() segmentor.load(CWS_MODEL_PATH) logging.info("done") # loading model if args.classname == classname.JUNK : model = TFIDFModel() model.load_model(JUNK_MODEL_PATH) else : model = BOOLModel() model.load_model(SENSITIVE_MODEL_PATH) #process the input file if args.input == "stdin" : ifo = sys.stdin else : ifo = open(args.input) # if error , just quit logging.info("reading samples from %s" %ifo.name) #print sample interval mode logging.info("set sample interval mode as '%s'" %args.sample_interval_mode)
def setUp(self):
    """Create the dataset and the non-verbose model used by the tests.

    Stores a new ``DummyDataset`` on ``self.data`` and a new ``TFIDFModel``
    (constructed with ``verbose=False``) on ``self.model``.
    """
    self.data = DummyDataset()
    self.model = TFIDFModel(
        verbose=False,
    )