Example #1
0
def test_ProductRecognizer():
    """Test the ``ProductRecognizer`` class on the example data set.

    Exercises, in this order: ``filter_trainig_samples``,
    ``filter_candidate_listings``, ``train_finder`` together with
    ``compute_accuracy``, and finally ``contains_product`` on the first
    few candidate listings.

    Every ``print`` call takes exactly one pre-formatted argument, so the
    emitted text is the same whether ``print`` is a statement (Python 2)
    or a function (Python 3).
    """
    from itertools import islice

    from clair.textprocessing import ProductRecognizer, split_random
    from clair.coredata import DataStore
    print("start")

    product_id = "nikon-d70"
    max_shown = 10  # Only this many candidate listings are classified below.

    # ``relative`` is a helper defined elsewhere in this file; presumably it
    # resolves the path relative to this test module -- TODO confirm.
    data_dir = relative("../../example-data")
    data = DataStore()
    data.read_data(data_dir)

    finder = ProductRecognizer(product_id)

    print("Test: filter_trainig_samples")
    # The method returns three values; only the first is inspected here.
    train_samples, _, _ = finder.filter_trainig_samples(data.listings)
    print("Number training samples: %s" % len(train_samples))
    # Test if search for training samples worked
    assert len(train_samples) > 100
    assert all(train_samples["training_sample"] == 1.0)
    # Every training sample must mention the product either as present
    # or as explicitly absent.
    is_present = train_samples["products"].map(
        lambda ids: product_id in ids)
    is_absent = train_samples["products_absent"].map(
        lambda ids: product_id in ids)
    assert all(is_present | is_absent)

    print("\nTest: filter_candidate_listings")
    cand_samples = finder.filter_candidate_listings(data.listings)
    print("Number candidate samples: %s" % len(cand_samples))
    # Test if filter for candidate samples worked
    assert len(cand_samples) > 10
    assert all(cand_samples["training_sample"] != 1.0)
    is_expected = cand_samples["expected_products"].map(
        lambda ids: product_id in ids)
    assert all(is_expected)

    print("\nTest: train_finder, compute_accuracy")
    train_set, test_set = split_random(train_samples, 0.8)
    finder.train_finder(train_set)
    finder.compute_accuracy(test_set)

    print("\nTest: contains_product")
    # Smoke test only: the classification result is printed, not asserted.
    for _, listing in islice(cand_samples.iterrows(), max_shown):
        contains = finder.contains_product(listing)
        print(listing["title"])
        print("Contains %s : %s" % (finder.product_id, contains))
        print("")

    print("finished")
Example #2
0
def test_split_random():
    """Test splitting a frame into two fractions randomly.

    Checks that ``split_random`` divides a ten-row frame into parts of the
    expected sizes, and that repeated calls do not always select the same
    rows.

    Every ``print`` call takes exactly one argument, so the emitted text
    is the same on Python 2 (statement) and Python 3 (function).
    """
    from clair.textprocessing import split_random

    df = pd.DataFrame({"aaa": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})

    # (fraction, expected rows in first part, expected rows in second part)
    size_cases = [
        (0.3, 3, 7),
        (0.8, 8, 2),
    ]
    for fraction, n_first, n_second in size_cases:
        frac1, frac2 = split_random(df, fraction)
        assert len(frac1) == n_first and len(frac2) == n_second

    # Simple test for randomness.
    # Comparing just two draws fails spuriously whenever both happen to
    # pick the same rows, so several draws are collected and at least two
    # of them must differ.
    n_draws = 5
    draws = [split_random(df, 0.4)[0].index for _ in range(n_draws)]
    print(draws[0])
    assert len(set(tuple(index) for index in draws)) > 1

    print("finished")