def test_ProductRecognizer():
    """Test the ``ProductRecognizer`` class.

    Loads the example data and then exercises, in order:
    ``filter_trainig_samples``, ``filter_candidate_listings``,
    ``train_finder`` / ``compute_accuracy`` and ``contains_product``.
    """
    from itertools import islice

    from clair.textprocessing import ProductRecognizer, split_random
    from clair.coredata import DataStore

    # Every ``print`` below receives one pre-formatted string, so the output
    # is identical under the Python 2 print statement and the Python 3 print
    # function.
    print("start")

    data_dir = relative("../../example-data")
    data = DataStore()
    data.read_data(data_dir)

    finder = ProductRecognizer("nikon-d70")

    print("Test: filter_trainig_samples")
    # (sic) ``filter_trainig_samples`` is the method name used by the library.
    train_samples, _, _ = finder.filter_trainig_samples(data.listings)
    print("Number training samples: %s" % len(train_samples))
    # Test if search for training samples worked
    assert len(train_samples) > 100
    assert all(train_samples["training_sample"] == 1.0)
    # Each training sample must mention the product either in "products"
    # or in "products_absent".
    present = train_samples["products"].map(lambda l: "nikon-d70" in l)
    absent = train_samples["products_absent"].map(lambda l: "nikon-d70" in l)
    assert all(present | absent)

    print("\nTest: filter_candidate_listings")
    cand_samples = finder.filter_candidate_listings(data.listings)
    print("Number candidate samples: %s" % len(cand_samples))
    # Test if filter for candidate samples worked
    assert len(cand_samples) > 10
    assert all(cand_samples["training_sample"] != 1.0)
    expected = cand_samples["expected_products"].map(
        lambda l: "nikon-d70" in l)
    assert all(expected)

    print("\nTest: train_finder, compute_accuracy")
    train_set, test_set = split_random(train_samples, 0.8)
    finder.train_finder(train_set)
    finder.compute_accuracy(test_set)

    print("\nTest: contains_product")
    # Only classify the first 10 candidates; this is a smoke test whose
    # output is meant to be inspected by eye.
    for _, listing in islice(cand_samples.iterrows(), 10):
        contains = finder.contains_product(listing)
        print(listing["title"])
        print("Contains %s : %s" % (finder.product_id, contains))
        print("")

    print("finished")
def test_split_random():
    """Test splitting frame into two fractions randomly.

    Checks that ``split_random`` returns fractions of the requested sizes
    and that repeated calls do not always yield the same selection.
    """
    from clair.textprocessing import split_random

    df = pd.DataFrame({"aaa": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})

    frac1, frac2 = split_random(df, 0.3)
    assert len(frac1) == 3
    assert len(frac2) == 7

    frac1, frac2 = split_random(df, 0.8)
    assert len(frac1) == 8
    assert len(frac2) == 2

    # Simple test for randomness.
    # Single-argument ``print(...)`` behaves the same on Python 2 and 3.
    print(split_random(df, 0.4)[0].index)
    # Comparing only two draws fails spuriously whenever both happen to be
    # identical. Collect many draws instead and require at least two
    # different results, which makes a false failure practically impossible.
    draws = set(tuple(split_random(df, 0.4)[0].index) for _ in range(20))
    assert len(draws) > 1

    print("finished")