Exemple #1
0
class TFIDFModelTest(unittest.TestCase):
    """Unit tests for TFIDFModel, run against the DummyDataset fixture.

    The private helpers of TFIDFModel are declared with a double leading
    underscore, so the tests reach them through their name-mangled
    attributes (``_TFIDFModel__<name>``).

    Matrix comparisons use ``np.testing.assert_array_equal`` and
    ``assertEqual`` so that a failing test prints the mismatching values
    instead of the uninformative "False is not true" produced by
    ``assertTrue(...)``.
    """

    def setUp(self):
        """Create a fresh dataset and a non-verbose model before each test."""
        self.data = DummyDataset()
        self.model = TFIDFModel(verbose=False)

    def test_tocsr(self):
        """Converting the item tags must give the expected dense matrix."""
        # test item/tag matrix conversion
        TF_expected = [[1, 1, 1, 1, 1, 1],
                       [1, 0, 4, 0, 1, 0],
                       [1, 0, 2, 0, 2, 1],
                       [1, 0, 0, 3, 2, 0],
                       [1, 0, 0, 4, 0, 1]]
        TF = self.model._TFIDFModel__convert_tocsr(self.data.item_tags)
        np.testing.assert_array_equal(TF.todense(), TF_expected)

    def test_extract(self):
        """Extracting facts without a threshold must give the 0/1 matrix below."""
        # test fact extraction without a threshold
        DF_expected = [[1, 1, 1, 1, 1, 1],
                       [1, 0, 1, 0, 1, 0],
                       [1, 0, 1, 0, 1, 1],
                       [1, 0, 0, 1, 1, 0],
                       [1, 0, 0, 1, 0, 1]]
        DF = self.model._TFIDFModel__extract_facts(self.data.item_tags)
        np.testing.assert_array_equal(DF.todense(), DF_expected)

    def test_extract_threshold(self):
        """Extracting facts from the ratings with threshold 3.5 must match."""
        # test preference extraction with a threshold
        P_expected = [[1, 0, 0, 1, 0],
                      [1, 1, 1, 1, 1],
                      [1, 0, 0, 0, 1]]
        P = self.model._TFIDFModel__extract_facts(self.data.ratings, 3.5)
        np.testing.assert_array_equal(P.todense(), P_expected)

    def test_tfidf_profiles(self):
        """Building the model must yield the expected TFIDF item profiles."""
        I_expected = np.matrix([[ 0.,          0.86991409,  0.27610534,  0.27610534,  0.12061088,  0.27610534],
                                [ 0.,          0.        ,  0.9940897 ,  0.        ,  0.10856185,  0.        ],
                                [ 0.,          0.        ,  0.83309624,  0.        ,  0.36392077,  0.41654812],
                                [ 0.,          0.        ,  0.        ,  0.96011533,  0.27960428,  0.        ],
                                [ 0.,          0.        ,  0.        ,  0.9701425 ,  0.        ,  0.24253563]])
        self.model.build(self.data)
        # test TFIDF item profile extraction
        # Both matrices go through stringify_matrix before being compared,
        # exactly as before; assertEqual shows both strings on a mismatch.
        self.assertEqual(stringify_matrix(self.model.I().todense()),
                         stringify_matrix(I_expected))
                      help="Classification Type , junk-class or sensitive class ")
    argp.add_argument("-s" , "--sample_interval_mode" , choices=[sampleIntervalMode.LINE_MODE , sampleIntervalMode.CRLF_MODE] , 
                      default=sampleIntervalMode.LINE_MODE ,
                      help="The mode with describes what is the inverval symbol between samples , default is LINE_MODE")
    argp.add_argument("-i" , "--input" , type=str , default="stdin" , 
                      help="'sysin' for using standard input ; else file path is needed.")
    args = argp.parse_args()

    logging.info("loadding segmentor")
    segmentor = Segmentor()
    segmentor.load(CWS_MODEL_PATH)
    logging.info("done")

    # loading model
    if args.classname == classname.JUNK :
        model = TFIDFModel()
        model.load_model(JUNK_MODEL_PATH)
    else :
        model = BOOLModel()
        model.load_model(SENSITIVE_MODEL_PATH)
    
    #process the input file
    if args.input == "stdin" :
        ifo = sys.stdin
    else :
        ifo = open(args.input) # if error , just quit
    logging.info("reading samples from %s" %ifo.name)
    
    #print sample interval mode
    logging.info("set sample interval mode as '%s'" %args.sample_interval_mode)
    
Exemple #3
0
 def setUp(self):
     """Prepare the per-test fixtures: a dummy dataset and a non-verbose model."""
     self.data = DummyDataset()
     self.model = TFIDFModel(verbose=False)