Example #1
0
def run_ours(Xtr,
             Ytr,
             Xt,
             Yt,
             lb,
             nsample,
             lambda_mode,
             q,
             sample_mode,
             k=None,
             rerun=True,
             eps=0.01,
             min_recall_per_class=0.8,
             log=None):
    """Train a DecisionSet, predict on the test split and log run statistics.

    Args:
        Xtr, Ytr: Training features and labels, passed straight to
            ``DecisionSet.train``.
        Xt: Test features; must expose ``.values`` whose rows are converted
            with ``feat2item`` and wrapped in ``Transaction``.
        Yt: True test labels; must expose ``.values``.
        lb: Object whose ``transform`` is applied to both true and predicted
            labels before computing the ROC AUC.
        nsample: Forwarded to ``train`` as ``nsamp``.
        lambda_mode: Forwarded to ``train`` as ``lamb``.
        q: Forwarded to ``train`` as ``q``.
        sample_mode: Forwarded to ``train`` as ``mode``.
        k: Maximum number of rules; 100 is used when ``None``.
        rerun: Forwarded to ``train``; also part of the log-key prefix.
        eps: Constructor argument of ``DecisionSet``.
        min_recall_per_class: Forwarded to ``train``.
        log: Callable used for logging.  When ``None``, ``log`` is imported
            from the ``logger`` module.

    Returns:
        The predictions produced by ``dec.predict_all``.
    """
    name = 'ours{}'.format(int(rerun))
    if k is None:
        k = 100

    dec = DecisionSet(eps)
    train_options = dict(max_k=k,
                         nsamp=nsample,
                         lamb=lambda_mode,
                         q=q,
                         mode=sample_mode,
                         rerun=rerun,
                         min_recall_per_class=min_recall_per_class)
    dec.train(Xtr, Ytr, **train_options)
    print('default:', dec.default)

    test_transactions = [Transaction(feat2item(row)) for row in Xt.values]
    Y_pred = dec.predict_all(test_transactions)

    if log is None:
        from logger import log

    def key(suffix):
        # Every log entry is prefixed with the run name.
        return '{}-{}'.format(name, suffix)

    log(key('default'), dec.default)
    log(key('k'), len(dec.rules))
    log(key('maxk'), k)
    for index, rule in enumerate(dec.rules):
        log(key('nconds'), len(rule), index)
    log(key('q'), q)
    log(key('nsample'), nsample)
    log(key('lamb'), lambda_mode)
    log(key('seq'), dec.seq)
    auc = roc_auc_score(lb.transform(Yt.values), lb.transform(Y_pred))
    log(key('auc'), auc)
    log(key('bacc'), balanced_accuracy_score(Yt, Y_pred))
    log(key('disp'), dispersion(dec.rules, average=True))
    log(key('overlap'), overlap(dec.rules))
    log(key('mode'), sample_mode)
    for label, value in precision(dec).items():
        log(key('precisions-tr'), value, label)
    for label, value in recall(dec.rules).items():
        log(key('recall-tr'), value, label)
    print(confusion_matrix(Yt, Y_pred))

    return Y_pred
Example #2
0
def evaluate_mutator(mutator, threshold, min_percent, latex=True):
    """Yield one ``(label, coordinate)`` pair per sentiment label.

    The dev split is loaded through ``get_final_semeval_data``.  For each of
    ``POS``, ``NEU`` and ``NEG`` every dev tweet is predicted as the label
    when ``mutator.apply_filter(tweet, label)`` is truthy and as ``-1``
    otherwise.  If fewer than ``min_percent`` of the tweets received the
    label, the reported score is ``0.0``; otherwise it is
    ``precision(dev_y, pred_y, label)``.

    Args:
        mutator: Object providing ``apply_filter(tweet, label)``.
        threshold: Number rendered as the first coordinate.
        min_percent: Minimum fraction of tweets that must be assigned the
            label for a precision to be computed.
        latex: Not read by this function.

    Yields:
        Tuples ``(str(label), ' (threshold,score)')``.
    """
    labels = (POS, NEU, NEG)
    train_loc = root+'Data/twitterData/train_alternative.tsv'
    dev_loc = root+'Data/twitterData/dev_alternative.tsv'
    test_loc = root+'Data/twitterData/test_alternative.tsv'
    combined_labels = reduce(lambda x, y: x|y, labels)
    _, dev, _ = get_final_semeval_data(combined_labels, train_loc, dev_loc,
                                       test_loc)
    dev_x, dev_y = dev

    for label in labels:
        pred_y = [
            label if mutator.apply_filter(tweet, label) else -1
            for tweet in dev_x
        ]
        enough_hits = pred_y.count(label) >= min_percent*len(pred_y)
        if enough_hits:
            coordinate = ' (%.3f,%.4f)' % (threshold,
                                           precision(dev_y, pred_y, label))
        else:
            coordinate = ' (%.3f,0.0)' % threshold
        yield str(label), coordinate
Example #3
0
def calculateLSTMaccuracy(receipts, results):
    """Compare LSTM predictions with ground truth and print a metrics report.

    For every receipt the predicted fields in ``results[i]`` are lightly
    normalised, compared against ``receipt.groundTruth`` through the
    ``compare`` helpers and tallied as total / found / correct counts.  The
    report prints precision, recall and F1 per field, followed by the micro
    average and the macro average over the seven fields.

    A receipt counts as fully correct when every ground-truth field matched
    and every ground-truth product was matched by a predicted product.

    Args:
        receipts: Sequence of objects exposing a ``groundTruth`` mapping.
        results: Sequence aligned with ``receipts``; each entry is a mapping
            of predicted fields.  Predicted product dicts are mutated: their
            ``'amount'`` entry is set to 1.

    Returns:
        None.  All output goes to stdout.
    """

    def clean_price(raw):
        # Treat ',' as '.', then keep only digits and dots.
        text = raw.replace(',', '.')
        text = ''.join(ch for ch in text if util.isInt(ch) or ch == '.')
        dots = text.count('.')
        if dots == 2:
            # Two dots: drop the first one and keep the second.
            first = text.index('.')
            text = text[:first] + text[first + 1:]
        elif dots == 1 and len(text.split('.')[-1]) > 2:
            # More than two digits after the only dot: drop the dot.
            text = text.replace('.', '')
        return text

    def clean_currency(raw):
        # Keep alphabetic characters only.
        return ''.join(ch for ch in raw if ch.isalpha())

    def clean_date(raw):
        # With exactly two space-separated tokens keep the longer one
        # (the first token wins ties).
        parts = raw.split(' ')
        if len(parts) != 2:
            return raw
        return parts[1] if len(parts[1]) > len(parts[0]) else parts[0]

    def clean_tax(raw):
        # With exactly two space-separated tokens keep the first one.
        parts = raw.split(' ')
        return parts[0] if len(parts) == 2 else raw

    def report(title, total, found, correct):
        # Print one report section and return its (precision, recall).
        print('-----{}-----'.format(title))
        print(total, found, correct)
        precision = util.precision(correct, found)
        recall = util.recall(total, correct)
        print('Precision:', precision)
        print('Recall:', recall)
        print('F1:', util.fScore(precision, recall))
        return precision, recall

    # (field key, normaliser or None, comparison function), in check order.
    scalar_fields = (
        ('total_price', clean_price, compare.totalPrice),
        ('currency', clean_currency, compare.currency),
        ('date', clean_date, compare.date),
        ('vendor', None, compare.vendor),
        ('tax_rate', clean_tax, compare.taxRate),
        ('address', None, compare.address),
    )
    # (report title, field key), in print order.
    sections = (
        ('VENDORS', 'vendor'),
        ('DATES', 'date'),
        ('ADDRESSES', 'address'),
        ('TAX RATES', 'tax_rate'),
        ('PRICE', 'total_price'),
        ('CURRENCY', 'currency'),
        ('PRODUCTS', 'products'),
    )
    stats = {
        field: {'total': 0, 'found': 0, 'correct': 0}
        for _, field in sections
    }

    count = 0
    for i, receipt in enumerate(receipts):
        predicted = results[i]
        truth = receipt.groundTruth
        corr = True

        for field, normalise, matches in scalar_fields:
            value = None
            if field in predicted:
                value = predicted[field]
                if normalise is not None:
                    value = normalise(value)
            if value:
                stats[field]['found'] += 1
            if field in truth:
                stats[field]['total'] += 1
                if matches(truth[field], value):
                    stats[field]['correct'] += 1
                else:
                    corr = False

        real_products = truth['products'] if 'products' in truth else []
        # Default to an empty list so a missing key neither raises nor
        # reuses the previous receipt's products.
        products = predicted['products'] if 'products' in predicted else []
        stats['products']['total'] += len(real_products)
        matched = set()
        for product in products:
            product['amount'] = 1
            stats['products']['found'] += 1
            if 'name' not in product:
                continue
            for j, real_product in enumerate(real_products):
                if j in matched:
                    continue
                if compare.products(product, real_product):
                    matched.add(j)
                    stats['products']['correct'] += 1
                    break
        if len(matched) < len(real_products):
            corr = False
        if corr:
            count += 1

    totalDataPoints = sum(stats[field]['total'] for _, field in sections)
    totalDataPointsFound = sum(stats[field]['found'] for _, field in sections)
    totalCorrect = sum(stats[field]['correct'] for _, field in sections)

    print('-----TOTAL CORRECT RECEIPTS-----')
    print(count, 'of', len(receipts))

    total_precision = 0
    total_recall = 0
    for title, field in sections:
        counts = stats[field]
        precision, recall = report(title, counts['total'], counts['found'],
                                   counts['correct'])
        total_precision += precision
        total_recall += recall

    # The micro average is reported but deliberately NOT added to the macro
    # sums: the macro average is the mean over the per-field sections only.
    report('MICRO AVG', totalDataPoints, totalDataPointsFound, totalCorrect)

    print('-----MACRO AVG-----')
    print(totalDataPoints, totalDataPointsFound, totalCorrect)
    precision = total_precision / float(len(sections))
    recall = total_recall / float(len(sections))
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1:', util.fScore(precision, recall))
Example #4
0
def calculateRuleBasedAccuracy(receipts):
    """Compare rule-based predictions with ground truth and print a report.

    For every receipt the fields in ``receipt.ruleBasedPrediction`` are
    compared against ``receipt.groundTruth`` through the ``compare`` helpers
    and tallied as total / found / correct counts.  The report prints
    precision, recall and F1 per field, followed by the micro average and
    the macro average over the seven fields.

    A receipt counts as fully correct when every ground-truth field matched
    and every ground-truth product was matched by a predicted product.

    Args:
        receipts: Sequence of objects exposing ``ruleBasedPrediction`` and
            ``groundTruth`` mappings.

    Returns:
        None.  All output goes to stdout.
    """

    def report(title, total, found, correct):
        # Print one report section and return its (precision, recall).
        print('-----{}-----'.format(title))
        print(total, found, correct)
        precision = util.precision(correct, found)
        recall = util.recall(total, correct)
        print('Precision:', precision)
        print('Recall:', recall)
        print('F1:', util.fScore(precision, recall))
        return precision, recall

    # (field key, comparison function), in check order.
    scalar_fields = (
        ('total_price', compare.totalPrice),
        ('currency', compare.currency),
        ('date', compare.date),
        ('vendor', compare.vendor),
        ('tax_rate', compare.taxRate),
        ('address', compare.address),
    )
    # (report title, field key), in print order.
    sections = (
        ('VENDORS', 'vendor'),
        ('DATES', 'date'),
        ('ADDRESSES', 'address'),
        ('TAX RATES', 'tax_rate'),
        ('PRICE', 'total_price'),
        ('CURRENCY', 'currency'),
        ('PRODUCTS', 'products'),
    )
    stats = {
        field: {'total': 0, 'found': 0, 'correct': 0}
        for _, field in sections
    }

    count = 0
    for receipt in receipts:
        predicted = receipt.ruleBasedPrediction
        truth = receipt.groundTruth
        corr = True

        for field, matches in scalar_fields:
            value = predicted[field] if field in predicted else None
            if value:
                stats[field]['found'] += 1
            if field in truth:
                stats[field]['total'] += 1
                if matches(truth[field], value):
                    stats[field]['correct'] += 1
                else:
                    corr = False

        products = predicted['products'] if 'products' in predicted else []
        real_products = truth['products'] if 'products' in truth else []
        stats['products']['total'] += len(real_products)
        found = set()
        for product in products:
            stats['products']['found'] += 1
            for j, real_product in enumerate(real_products):
                if j in found:
                    continue
                if compare.products(product, real_product):
                    found.add(j)
                    stats['products']['correct'] += 1
                    break
        if len(found) < len(real_products):
            corr = False
        if corr:
            count += 1

    totalDataPoints = sum(stats[field]['total'] for _, field in sections)
    totalDataPointsFound = sum(stats[field]['found'] for _, field in sections)
    totalCorrect = sum(stats[field]['correct'] for _, field in sections)

    print('-----TOTAL CORRECT RECEIPTS-----')
    print(count, 'of', len(receipts))

    total_precision = 0
    total_recall = 0
    for title, field in sections:
        counts = stats[field]
        precision, recall = report(title, counts['total'], counts['found'],
                                   counts['correct'])
        total_precision += precision
        total_recall += recall

    # The micro average is reported but deliberately NOT added to the macro
    # sums: the macro average is the mean over the per-field sections only.
    report('MICRO AVG', totalDataPoints, totalDataPointsFound, totalCorrect)

    print('-----MACRO AVG-----')
    print(totalDataPoints, totalDataPointsFound, totalCorrect)
    precision = total_precision / float(len(sections))
    recall = total_recall / float(len(sections))
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1:', util.fScore(precision, recall))
def kmeans(eac, removeTerms, ngram):
    """Cluster tweets into two groups repeatedly and collect precisions.

    Tweets are read from ``dataset.csv``, filtered, truncated to the first
    6300 entries and vectorised with a ``HashingVectorizer``.  Clustering is
    repeated until 100 precision values accepted by ``isAdable`` have been
    gathered.

    Args:
        eac: When truthy, run ``EAC`` and then ``KMedoids`` on the resulting
            distance matrix; otherwise run ``KMeans`` directly on the
            vectorised text.
        removeTerms: When truthy, pass the hard-coded term list to
            ``filter.filter_tweets`` as ``terms_to_remove``.
        ngram: When truthy, hash word n-grams of length 1 to 3.

    Returns:
        List of the 100 accepted precision values.
    """
    terms = ['lazaro', 'lázaro', 'baez', 'báez', 'carlitos']

    print('Filtering tweets')
    tweets = util.read_from_file("dataset.csv")
    filter_options = {'terms_to_remove': terms} if removeTerms else {}
    tweets = filter.filter_tweets(tweets, **filter_options)

    # Reduce tweets list length
    tweets = tweets[:6300]

    data = [tw.text for tw in tweets]
    carlitos = sum(1 for tw in tweets if tw.tw_type == 'Carlitos')
    lazaro = len(tweets) - carlitos
    print(carlitos, lazaro)

    print("Transform Data...")
    hasher_options = dict(non_negative=True, norm='l2', binary=False)
    if ngram:
        hasher_options.update(ngram_range=(1, 3), analyzer='word')
    vectorizer = make_pipeline(HashingVectorizer(**hasher_options))
    X = vectorizer.fit_transform(data)

    precision_list = []
    while len(precision_list) < 100:
        # Each round is timed on its own.
        t0 = time()

        if eac:
            clustering = EAC(30, min_k=2, max_k=10)
            EAC_D = clustering.fit(X).distance_

            # Kmedoids over EAC_D
            kmed = KMedoids(2, init='random', distance_metric="precomputed")
            labels = kmed.fit(EAC_D).labels_
        else:
            km = KMeans(n_clusters=2, init='k-means++', n_init=1, max_iter=100)
            labels = km.fit(X).labels_

        # Assign labels to tweets
        for index, tw in enumerate(tweets):
            tw.label = labels[index]

        print("Precision: ")
        precision = util.precision(tweets)

        print("done in %0.3fs" % (time() - t0))

        # Only accepted precision values count towards the 100 rounds.
        if isAdable(precision):
            precision_list.append(precision)

    return precision_list
def minhash(eac, shingle, removeTerms):
    """Cluster tweets on a MinHash Jaccard distance and collect precisions.

    Tweets are read from ``dataset.csv``, filtered and truncated to the
    first 6300 entries.  A distance matrix is computed once with
    ``metrics.jaccard_minhash_distance_mp``; clustering is then repeated
    until 100 precision values accepted by ``isAdable`` have been gathered.

    Args:
        eac: When truthy, run ``EAC`` (with a ``KMedoids`` base clusterer)
            and then ``KMedoids`` on the resulting distance matrix;
            otherwise run ``KMedoids`` directly on the MinHash distances.
        shingle: Passed to the distance function as ``shingle_length``.
        removeTerms: When truthy, pass the hard-coded term list to
            ``filter.filter_tweets`` as ``terms_to_remove``.

    Returns:
        List of the 100 accepted precision values.
    """
    terms = ['lazaro', 'lázaro', 'baez', 'báez', 'carlitos']

    print('Filtering tweets')
    tweets = util.read_from_file("dataset.csv")
    filter_options = {'terms_to_remove': terms} if removeTerms else {}
    tweets = filter.filter_tweets(tweets, **filter_options)

    # Reduce tweets list length
    tweets = tweets[:6300]

    carlitos = sum(1 for tw in tweets if tw.tw_type == 'Carlitos')
    lazaro = len(tweets) - carlitos
    print(carlitos, lazaro)

    # Extract text from tweets
    X = [tw.text for tw in tweets]

    # The timer starts once, so each "done in" line reports the time
    # elapsed since this point, not the time of a single round.
    t0 = time()

    print("Calculating distance matrix...")
    D = metrics.jaccard_minhash_distance_mp(X, shingle_length=shingle)

    precision_list = []
    while len(precision_list) < 100:
        if eac:
            print("EAC clustering...")
            kmedoid = KMedoids(init='random', distance_metric='precomputed')
            clustering = EAC(30, min_k=2, max_k=10, clustering=kmedoid)
            EAC_D = clustering.fit(D).distance_

            # Kmedoids over EAC_D
            kmed = KMedoids(2, init='random', distance_metric="precomputed")
            labels = kmed.fit(EAC_D).labels_
        else:
            kmedoid = KMedoids(2, init='random', distance_metric='precomputed')

            print("Kmedoids clustering...")
            labels = kmedoid.fit(D).labels_

        # Assign labels to tweets
        for index, tw in enumerate(tweets):
            tw.label = labels[index]

        print("Precision: ")
        precision = util.precision(tweets)

        # Only accepted precision values count towards the 100 rounds.
        if isAdable(precision):
            print(len(precision_list))
            precision_list.append(precision)

        print("done in %0.3fs" % (time() - t0))

    return precision_list
Example #7
0
def calculateMetrics(reciepts, result, writeToFile=False, path=None):
    """Compare extracted receipt fields with ground truth and print a report.

    For every receipt the predicted fields in ``result[i]`` are compared
    with ``reciept.groundTruth`` and tallied as total / found / correct
    counts.  The report prints precision, recall and F1 per field, followed
    by the micro average and the macro average over the seven fields.

    A receipt counts as fully correct when every ground-truth field matched
    and every ground-truth product was matched by a predicted product.

    Args:
        reciepts: Sequence of objects exposing a ``groundTruth`` mapping
            and, when ``writeToFile`` is set, a ``path`` attribute.
        result: Sequence aligned with ``reciepts``; each entry must contain
            the keys ``vendor``, ``date``, ``address``, ``tax_rate``,
            ``total_price``, ``currency`` and ``products``.
        writeToFile: When truthy, dump the predicted fields of each receipt
            as JSON to ``os.path.join(path, reciept.path)``.
        path: Output directory used when ``writeToFile`` is truthy.

    Returns:
        None.  All output goes to stdout (and to files when requested).
    """

    def report(title, total, found, correct):
        # Print one report section and return its (precision, recall).
        print('-----{}-----'.format(title))
        print(total, found, correct)
        precision = util.precision(correct, found)
        recall = util.recall(total, correct)
        print('Precision:', precision)
        print('Recall:', recall)
        print('F1:', util.fScore(precision, recall))
        return precision, recall

    correctVendors = vendorsFound = vendors = 0
    correctDates = datesFound = dates = 0
    correctAddresses = addressesFound = addresses = 0
    correctTaxes = taxesFound = taxes = 0
    correctPrices = pricesFound = prices = 0
    correctCurrencies = currenciesFound = currencies = 0
    correctProducts = productsFound = products = 0

    result_dict = {}
    count = 0
    for i, reciept in enumerate(reciepts):
        predicted = result[i]
        truth = reciept.groundTruth
        corr = True

        vendor = predicted['vendor']
        result_dict['vendor'] = vendor
        if vendor:
            vendorsFound += 1
            vendor = vendor.lower()
        if 'vendor' in truth:
            vendors += 1
            if vendor and levenshtein_distance(
                    vendor, truth['vendor'].lower()) <= 0:
                correctVendors += 1
            else:
                corr = False

        date = predicted['date']
        result_dict['date'] = date
        if date:
            datesFound += 1
            date = date.lower()
        if 'date' in truth:
            dates += 1
            real_date = truth['date'].lower()
            # Accept the ground-truth date with or without its spaces.
            if date == real_date or date == real_date.replace(' ', ''):
                correctDates += 1
            else:
                corr = False

        address = predicted['address']
        result_dict['address'] = address
        if address:
            addressesFound += 1
            address = address.lower()
        if 'address' in truth:
            addresses += 1
            if address and levenshtein_distance(
                    address, truth['address'].lower()) <= 0:
                correctAddresses += 1
            else:
                corr = False

        tax = predicted['tax_rate']
        result_dict['tax_rate'] = tax
        # A tax rate of 0 is a valid prediction, hence the explicit None test.
        if tax is not None:
            taxesFound += 1
        if 'tax_rate' in truth:
            taxes += 1
            real_tax = int(float(truth['tax_rate'].lower().replace('%', '')))
            if tax == real_tax:
                correctTaxes += 1
            else:
                corr = False

        price = predicted['total_price']
        result_dict['total_price'] = price
        if price:
            pricesFound += 1
        if 'total_price' in truth:
            prices += 1
            real_price = float(truth['total_price'].lower())
            if price == real_price:
                correctPrices += 1
            else:
                corr = False

        currency = predicted['currency']
        result_dict['currency'] = currency
        if currency:
            currenciesFound += 1
            currency = currency.lower()
        if 'currency' in truth:
            currencies += 1
            if currency == truth['currency'].lower():
                correctCurrencies += 1
            else:
                corr = False

        productsList = predicted['products']
        result_dict['products'] = productsList
        # Receipts without ground-truth products are treated as having none
        # instead of raising KeyError.
        real_products = truth['products'] if 'products' in truth else []
        products += len(real_products)
        checkedIndexes = set()
        for product in productsList:
            productsFound += 1
            # ``j`` (not ``i``) so the receipt index is never clobbered.
            for j, real_product in enumerate(real_products):
                if j in checkedIndexes:
                    continue
                product_price = None
                if 'price' in product:
                    try:
                        product_price = float(
                            product['price'].replace(',', '.'))
                    except ValueError:
                        # Unparseable price text: compare as "no price".
                        product_price = None
                real_product_price = float(real_product['price'])
                if levenshtein_distance(product['name'].lower(),
                                        real_product['name'].lower()) <= 0:
                    if util.floatCompare(product_price, real_product_price):
                        if product['amount'] == real_product['amount']:
                            correctProducts += 1
                            checkedIndexes.add(j)
                            break
        if len(checkedIndexes) < len(real_products):
            # Some ground-truth product was not matched.
            corr = False
        if corr:
            count += 1

        if writeToFile:
            with open(os.path.join(path, reciept.path), 'w') as fp:
                json.dump(result_dict, fp, indent=1)

    totalDataPoints = vendors + dates + addresses + taxes + prices + currencies + products
    totalDataPointsFound = vendorsFound + datesFound + addressesFound + taxesFound + pricesFound + currenciesFound + productsFound
    totalCorrect = correctVendors + correctDates + correctAddresses + correctTaxes + correctPrices + correctCurrencies + correctProducts

    print('-----TOTAL CORRECT RECEIPTS-----')
    print(count, 'of', len(reciepts))

    sections = (
        ('VENDORS', vendors, vendorsFound, correctVendors),
        ('DATES', dates, datesFound, correctDates),
        ('ADDRESSES', addresses, addressesFound, correctAddresses),
        ('TAX RATES', taxes, taxesFound, correctTaxes),
        ('PRICE', prices, pricesFound, correctPrices),
        ('CURRENCY', currencies, currenciesFound, correctCurrencies),
        ('PRODUCTS', products, productsFound, correctProducts),
    )
    total_precision = 0
    total_recall = 0
    for title, total, found, correct in sections:
        precision, recall = report(title, total, found, correct)
        total_precision += precision
        total_recall += recall

    # The micro average is reported only; it is not part of the macro sums.
    report('MICRO AVG', totalDataPoints, totalDataPointsFound, totalCorrect)

    print('-----MACRO AVG-----')
    print(totalDataPoints, totalDataPointsFound, totalCorrect)
    precision = total_precision / float(len(sections))
    recall = total_recall / float(len(sections))
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1:', util.fScore(precision, recall))
Example #8
0
    def train(self,
              X,
              Y,
              max_k=100,
              nsamp=100,
              lamb=None,
              q='kl',
              mode=3,
              rerun=True,
              min_recall_per_class=0.5):
        """Greedily build the rule set from sampled candidate rules.

        The item database is reset and rebuilt from ``X`` and ``Y``.  Rules
        are then added one at a time: candidates are sampled for the labels
        still being sampled, the greedy selector picks one rule, and the
        rule is either added or, when ``self.enough(rule)`` holds, its label
        is dropped from further sampling.  Optionally a full greedy pass
        over all sampled candidates replaces the result when it scores
        higher under ``obj``.

        Args:
            X, Y: Training data, forwarded to ``prep_db``.
            max_k: Upper bound on the number of rules.
            nsamp: Number of samples requested per sampling call inside the
                main loop.
            lamb: Trade-off weight passed to ``GreedyDiv`` and ``obj``.
                A number is used as is.  ``'max'`` / ``'mean'`` derive it
                from the quality of an initial sample; any other string or
                ``None`` yields 0.
            q: Metric name passed to ``Rule.quality`` when deriving
                ``lamb``.
            mode: Sampling mode; ``0`` samples through ``sample_rn``, any
                other value through ``self.sample_from_each_label``.
            rerun: When truthy, run the full greedy pass described above.
            min_recall_per_class: Not read inside this method.

        Returns:
            None.  The instance is updated in place and progress is printed.
        """
        print('##### START #####')
        Itemset.clear_db()
        prep_db(X, Y)

        # Allow users to set lamb to a fixed number; strings and None are
        # resolved here.
        if lamb is None or isinstance(lamb, str):
            # NOTE(review): the sample size here is a literal 100, not
            # ``nsamp`` - confirm this is intended.
            samp = self.sample_from_each_label(set(Itemset.labels), 100, set(),
                                               mode)
            if lamb == 'max':
                lamb = np.max([Rule.quality([r], metric=q) for r in samp])
            elif lamb == 'mean':
                lamb = np.mean([Rule.quality([r], metric=q) for r in samp])
            else:
                lamb = 0
            print('lamb:', lamb)

        greed = GreedyDiv([], lamb)
        U_all = []
        labels_samp = set(Itemset.labels)
        while len(self) < max_k and len(labels_samp) > 0:
            if mode == 0:
                samps = []
                for label in labels_samp:
                    _, samp = sample_rn(nsamp, label)
                    # Very time-consuming
                    samps.extend([Rule(s, label) for s in list(samp)])
                U = set(samps)
            else:
                covered = {t for r in self.rules for t in r.trans()}
                U = self.sample_from_each_label(labels_samp, nsamp, covered,
                                                mode)
            print('nsamp (after):', len(U))
            if len(U) == 0:
                break
            U_all.extend(U)

            # Greedy
            greed.update_univ(U)
            r = greed.greedy_once()
            # Termination criteria. Also check zero sampling above.
            if self.enough(r):
                # Include at least one rule per class, except default class.
                labels_samp.remove(r.label)
                print('remove label:', r.label)
            else:
                # Print quality vs. dispersion.  Dedicated local names keep
                # the ``q`` metric parameter intact.
                quality_before, disp_before = obj(self.rules, lamb, sep=True)
                quality_after, disp_after = obj(self.rules + [r], lamb,
                                                sep=True)
                print('inc q vs. d: {}, {}'.format(
                    quality_after - quality_before, disp_after - disp_before))

                self.add(r)
                # Stop sampling a label once its recall reaches 1.
                if np.abs(recall(self.rules)[r.label] - 1.0) < 1e-8:
                    labels_samp.remove(r.label)
                print('#{} '.format(len(self.rules)), end='')
                printRules([r])

        # Consecutive greedy over all samples
        if rerun:
            greed.clear()
            greed.update_univ(list(set(U_all)))
            rules = greed.greedy(len(self.rules))
            full_score = obj(rules, lamb)
            incremental_score = obj(self.rules, lamb)
            if full_score > incremental_score:
                print('Full greedy wins: {} > {}'.format(
                    full_score, incremental_score))
                self.reset(rules)

        default = self.set_default()
        print('default:', default)

        self.build()

        print('precision: ', precision(self).items())
        print('recall (coverage): ', recall(self.rules).items())
        print('ave disp: ', dispersion(self.rules, average=True))
        print('##### END #####')