def main():
    """Plot recall of SVM, AdaBoost and the ``fc`` network over upsample rates.

    For every rate ``t`` in ``[start, n]`` the data is passed through
    ``util.needed_n`` / ``util.upsample``, split 70/30, normalized, and used
    to fit the three models. A confusion matrix per model and rate is
    collected, turned into a recall value with ``util.recall`` and plotted.

    ``n`` comes from ``opts.upsamplen`` (default 1). ``start`` comes from
    ``opts.upsamplestart``; when that option is missing only the single rate
    ``n`` is evaluated. The previous revision used the literal ``1`` instead
    of ``opts.upsamplestart``, which ignored the option's value and made the
    range check below unreachable for any ``n >= 1``.

    Exits with status 1 when ``start > n``.
    """
    opts = util.parse_args()
    X, y = util.data_load(opts.dataset)

    fc_nn_model = fc.create_model()
    ada_model = AdaBoostClassifier(n_estimators=100, random_state=0)
    svm_model = SVC(C=1000, gamma=0.1)

    n = opts.upsamplen if opts.upsamplen is not None else 1
    start = n if opts.upsamplestart is None else opts.upsamplestart

    if start > n:
        print("upsample range error")
        # Non-zero status so callers/shells can tell this was a failure.
        sys.exit(1)

    up_range = np.arange(start, n + 1)
    conf_fc, conf_ada, conf_svm = [], [], []
    for t in up_range:
        needed = util.needed_n(X, y, t)
        temp_X, temp_y = util.upsample(X, y, needed)
        X_train, X_test, y_train, y_test = train_test_split(
            temp_X, temp_y, test_size=0.3, random_state=42)
        X_train, X_test = util.normalize(X_train, X_test)

        # NOTE(review): shuffle() is applied after batch(), so whole batches
        # are shuffled rather than individual samples - confirm this is wanted.
        train_dset = tf.data.Dataset.from_tensor_slices(
            (X_train, y_train)).batch(
                64, drop_remainder=False).shuffle(buffer_size=10000)
        test_dset = tf.data.Dataset.from_tensor_slices(
            (X_test, y_test)).batch(64)

        ada_model.fit(X_train, y_train)
        svm_model.fit(X_train, y_train)
        # NOTE(review): the same network object is fitted again for every
        # rate, so it presumably keeps the weights learned at earlier rates -
        # confirm this is intended.
        fc_nn_model.fit(train_dset, epochs=10)

        conf_ada.append(confusion_matrix(y_test, ada_model.predict(X_test)))
        conf_svm.append(confusion_matrix(y_test, svm_model.predict(X_test)))

        # Confusion matrix for the network: rows are true labels, columns
        # are the arg-max of the model output.
        temp = np.zeros((2, 2), dtype=int)
        for d, labels in test_dset:
            predictions = fc_nn_model(d)
            for i in range(len(d)):
                temp[labels[i]][np.argmax(predictions[i])] += 1
        conf_fc.append(temp)

    recalls = {
        "SVM": [util.recall(c) for c in conf_svm],
        "Adaboost": [util.recall(c) for c in conf_ada],
        "FC_NN": [util.recall(c) for c in conf_fc],
    }
    # Labelling each line ties the legend entry to its curve instead of
    # relying on plot order matching a separate list.
    for model_name, values in recalls.items():
        plt.plot(up_range, values, label=model_name)
    plt.title("Recall Vs Upsample Graph")
    plt.xlabel("Upsample rate")
    plt.ylabel("Recall")
    plt.legend()
    plt.show()
Ejemplo n.º 2
0
def extract_and_upload(
    vid_path="vids",
    out_frame_skip=3,
    out_duration=4,
    use_roi=True,
    gif_color=False,
    gif_delay=8,
    quiet=False,
    remove_source=True,
    to_imgur=False,
    to_tumblr=False,
    to_snapchat=False,
):
    """Process the first pending video in ``vid_path`` and upload the result.

    The first directory entry that neither ends with ``"part"`` nor starts
    with ``"."`` is wrapped in a ``SpelunkyVideo``, passed to
    ``extract_death`` and then uploaded to each enabled service. The
    module-level ``last_imgur`` / ``last_tumblr`` / ``last_snapchat``
    timestamps are updated after each upload; Snapchat is only used when at
    least 8 hours have passed since ``last_snapchat``.

    On ``cv2.error``, ``OSError``, ``IOError``, ``TypeError`` or
    ``AttributeError`` the source file is moved to ``problems/``, the
    traceback is written to stderr and ``util.recall(extract_and_upload)``
    is called.

    Raises ``IndexError`` when ``vid_path`` holds no eligible file.

    Changes from the previous revision: the error handler no longer fails
    with ``NameError`` when ``SpelunkyVideo(...)`` itself raised, and the
    ``print`` calls use the single-argument form so the block parses under
    both Python 2 and Python 3 with the same output.
    """
    candidates = [
        name for name in os.listdir(vid_path)
        if not name.endswith("part") and not name.startswith(".")
    ]
    input_file = candidates[0]
    input_path = os.path.join(vid_path, input_file)
    # TEMP WORKAROUND:
    global last_snapchat
    global last_tumblr
    global last_imgur
    # Stays None if the constructor raises, so the handler can tell.
    vid = None
    try:
        vid = SpelunkyVideo(input_path)
        vid.templates = get_templates(vid.template_scale)
        extract_death(
            vid,
            out_frame_skip=out_frame_skip,
            out_duration=out_duration,
            use_roi=use_roi,
            gif_color=gif_color,
            gif_delay=gif_delay,
            quiet=quiet,
        )
        if to_imgur and time.time() - last_imgur > 0:
            upload_gif_imgur(vid)
            last_imgur = time.time()
        if to_tumblr and time.time() - last_tumblr > 0:
            upload_gif_tumblr(vid)
            last_tumblr = time.time()
        if to_snapchat and time.time() - last_snapchat >= 60 * 60 * 8:
            snapchat.login(SNAPCHAT_USER, SNAPCHAT_PASS)
            snapchat_followback()
            send_snapchat(vid)
            last_snapchat = time.time()
            os.remove(vid.out_mp4)
        if remove_source:
            os.remove(input_path)
    except (cv2.error, OSError, IOError, TypeError, AttributeError) as e:
        if vid is not None:
            failed_path = vid.input_file
            failed_tail = vid.input_file_tail
        else:
            # Constructor failed: fall back to the path we tried to open.
            failed_path = input_path
            failed_tail = os.path.basename(input_path)
        print(e)
        print("\nSkipping %s likely due to failure to extract" % (failed_path,))
        print("moving to problems/ %s" % (failed_tail,))
        os.rename(failed_path, "problems/" + failed_tail)
        traceback.print_tb(sys.exc_info()[2], limit=None, file=sys.stderr)
        sys.stderr.flush()
        # NOTE(review): presumably re-invokes this function for the next
        # file - confirm what util.recall does with a callable.
        util.recall(extract_and_upload)
Ejemplo n.º 3
0
def main():
    """Plot recall of the 2-layer and 3-layer networks over upsample rates.

    For every rate ``t`` in ``[start, n]`` the data is passed through
    ``util.needed_n`` / ``util.upsample``, split 70/30, normalized, and used
    to fit ``create_model()`` and ``create_model_more_layers()``. One 2x2
    confusion matrix per model and rate is collected, turned into a recall
    value with ``util.recall`` and plotted.

    ``n`` comes from ``opts.upsamplen`` (default 1). ``start`` comes from
    ``opts.upsamplestart``; when that option is missing only the single rate
    ``n`` is evaluated. The previous revision used the literal ``1`` instead
    of ``opts.upsamplestart``, which ignored the option's value, and its
    error message stated the opposite of the condition being reported.

    Exits with status 1 when ``start > n``.
    """
    opts = util.parse_args()
    X, y = util.data_load(opts.dataset)
    model = create_model()
    model_layers = create_model_more_layers()

    n = opts.upsamplen if opts.upsamplen is not None else 1
    start = n if opts.upsamplestart is None else opts.upsamplestart
    if start > n:
        print("Upsample start should not be larger than end")
        # Non-zero status so callers/shells can tell this was a failure.
        sys.exit(1)

    up_range = np.arange(start, n + 1)
    all_conf = []
    all_conf_layers = []
    for t in up_range:
        print("t", t)
        needed = util.needed_n(X, y, t)
        temp_X, temp_y = util.upsample(X, y, needed)
        X_train, X_test, y_train, y_test = train_test_split(
            temp_X, temp_y, test_size=0.3, random_state=42)
        X_train, X_test = util.normalize(X_train, X_test)

        # NOTE(review): shuffle() is applied after batch(), so whole batches
        # are shuffled rather than individual samples - confirm this is wanted.
        train_dset = tf.data.Dataset.from_tensor_slices(
            (X_train, y_train)).batch(
                64, drop_remainder=False).shuffle(buffer_size=10000)
        test_dset = tf.data.Dataset.from_tensor_slices(
            (X_test, y_test)).batch(64)

        # NOTE(review): both model objects are fitted again for every rate,
        # so they presumably keep weights learned at earlier rates - confirm.
        model.fit(train_dset, epochs=10)
        model_layers.fit(train_dset, epochs=10)

        # Rows are true labels, columns are the arg-max of the model output.
        conf_mat = np.zeros((2, 2), dtype=int)
        conf_mat_layers = np.zeros((2, 2), dtype=int)
        for d, labels in test_dset:
            predictions = model(d)
            predictions_layers = model_layers(d)
            for i in range(len(d)):
                conf_mat[labels[i]][np.argmax(predictions[i])] += 1
                conf_mat_layers[labels[i]][np.argmax(
                    predictions_layers[i])] += 1
        all_conf.append(conf_mat)
        all_conf_layers.append(conf_mat_layers)

    re_fc = [util.recall(c) for c in all_conf]
    re_fc_layer = [util.recall(c) for c in all_conf_layers]

    plt.plot(up_range, re_fc, label="2-layer")
    plt.plot(up_range, re_fc_layer, label="3-layer")
    plt.title("2-layer NN vs 3-layer NN")
    plt.legend()
    plt.xlabel("Upsample rate")
    plt.ylabel("Recall")
    plt.show()
Ejemplo n.º 4
0
def run_ours(Xtr, Ytr, Xt, Yt, lb, nsample, lambda_mode, q, sample_mode,
             k=None, rerun=True, eps=0.01, min_recall_per_class=0.8,
             log=None):
    """Train a ``DecisionSet``, predict the test rows and log the metrics.

    The model is built with ``eps`` and trained on ``Xtr`` / ``Ytr`` with at
    most ``k`` rules (100 when ``k`` is ``None``). Each row of ``Xt.values``
    is converted with ``feat2item`` and wrapped in a ``Transaction`` before
    prediction. All metrics are reported through ``log`` under keys prefixed
    with ``'ours0'`` or ``'ours1'`` depending on ``rerun``; when ``log`` is
    ``None`` the ``log`` function of the ``logger`` module is used.

    Returns the predictions produced by ``predict_all``.
    """
    name = 'ours{}'.format(int(rerun))
    if k is None:
        k = 100

    model = DecisionSet(eps)
    model.train(
        Xtr,
        Ytr,
        max_k=k,
        nsamp=nsample,
        lamb=lambda_mode,
        q=q,
        mode=sample_mode,
        rerun=rerun,
        min_recall_per_class=min_recall_per_class,
    )
    print('default:', model.default)

    transactions = [Transaction(feat2item(row)) for row in Xt.values]
    Y_pred = model.predict_all(transactions)

    if log is None:
        from logger import log

    # Model shape and training configuration.
    log('{}-default'.format(name), model.default)
    log('{}-k'.format(name), len(model.rules))
    log('{}-maxk'.format(name), k)
    for position, rule in enumerate(model.rules):
        log('{}-nconds'.format(name), len(rule), position)
    log('{}-q'.format(name), q)
    log('{}-nsample'.format(name), nsample)
    log('{}-lamb'.format(name), lambda_mode)
    log('{}-seq'.format(name), model.seq)

    # Test-set quality and rule-set statistics.
    auc = roc_auc_score(lb.transform(Yt.values), lb.transform(Y_pred))
    log('{}-auc'.format(name), auc)
    log('{}-bacc'.format(name), balanced_accuracy_score(Yt, Y_pred))
    log('{}-disp'.format(name), dispersion(model.rules, average=True))
    log('{}-overlap'.format(name), overlap(model.rules))
    log('{}-mode'.format(name), sample_mode)

    # Per-label values returned by precision() / recall().
    for label, value in precision(model).items():
        log('{}-precisions-tr'.format(name), value, label)
    for label, value in recall(model.rules).items():
        log('{}-recall-tr'.format(name), value, label)

    print(confusion_matrix(Yt, Y_pred))

    return Y_pred
Ejemplo n.º 5
0
 def set_default(self, label=None):
     """Store and return the default label.

     An explicit ``label`` is stored as-is. Otherwise the label from
     ``Itemset.labels`` with the smallest value in ``recall(self.rules)``
     is chosen (the most under-represented class).
     """
     if label is None:
         per_label = recall(self.rules)
         scores = [per_label[candidate] for candidate in Itemset.labels]
         self.default = Itemset.labels[np.argmin(scores)]
         return self.default
     self.default = label
     return label
Ejemplo n.º 6
0
def calculateLSTMaccuracy(receipts, results):
    """Print precision / recall / F1 of the LSTM predictions per field.

    ``results[i]`` is the prediction dict for ``receipts[i]`` and is scored
    against ``receipts[i].groundTruth``. For each of total price, currency,
    date, vendor, tax rate and address three counters are kept: how often
    the field is in the ground truth (total), how often a non-empty
    prediction exists (found) and how often the matching ``compare``
    function accepts it (correct). Products are matched one-to-one against
    the ground-truth products with ``compare.products``.

    Nothing is returned; all figures are printed.

    Fixes relative to the previous revision:
    * a result without a ``'products'`` key no longer raises ``NameError``
      on the first receipt or silently reuses the previous receipt's
      products;
    * fully correct receipts are now counted (the counter was never
      incremented, so the report always showed 0), using the same rule as
      ``calculateRuleBasedAccuracy``;
    * the macro average no longer includes the micro-average scores, which
      were previously summed in and then divided by 7.

    Note: ``'amount'`` is set to 1 on every predicted product, which
    mutates the dicts inside ``results``.
    """

    def clean_price(raw):
        # Use '.' as the only separator and keep digits (per util.isInt)
        # and dots only.
        price = ''.join(ch for ch in raw.replace(',', '.')
                        if util.isInt(ch) or ch == '.')
        dots = price.count('.')
        if dots == 2:
            # Drop the first dot, keep the second.
            first = price.index('.')
            price = price[:first] + price[first + 1:]
        elif dots == 1 and len(price.split('.')[-1]) > 2:
            # More than two characters after the dot: remove the dot.
            price = price.replace('.', '')
        return price

    def clean_currency(raw):
        # Keep alphabetic characters only.
        return ''.join(ch for ch in raw if ch.isalpha())

    def pick_date(raw):
        # For exactly two space-separated tokens keep the longer one
        # (the first on a tie); otherwise leave the string untouched.
        parts = raw.split(' ')
        if len(parts) != 2:
            return raw
        first, second = parts
        return second if len(second) > len(first) else first

    def clean_tax(raw):
        # For exactly two space-separated tokens keep the first one.
        parts = raw.split(' ')
        return parts[0] if len(parts) == 2 else raw

    def report(title, total, found, correct):
        # Print one report section and return its (precision, recall).
        print(title)
        print(total, found, correct)
        precision = util.precision(correct, found)
        recall = util.recall(total, correct)
        print('Precision:', precision)
        print('Recall:', recall)
        print('F1:', util.fScore(precision, recall))
        return precision, recall

    # (key, cleaner or None, comparison function), in evaluation order.
    fields = (
        ('total_price', clean_price, compare.totalPrice),
        ('currency', clean_currency, compare.currency),
        ('date', pick_date, compare.date),
        ('vendor', None, compare.vendor),
        ('tax_rate', clean_tax, compare.taxRate),
        ('address', None, compare.address),
    )
    tallies = {}
    for key, _, _ in fields:
        tallies[key] = {'total': 0, 'found': 0, 'correct': 0}
    tallies['products'] = {'total': 0, 'found': 0, 'correct': 0}

    count = 0
    for i, receipt in enumerate(receipts):
        predicted = results[i]
        truth = receipt.groundTruth
        corr = True

        for key, clean, matches in fields:
            tally = tallies[key]
            if key in predicted:
                value = predicted[key]
                if clean is not None:
                    value = clean(value)
            else:
                value = None
            if value:
                tally['found'] += 1
            if key in truth:
                tally['total'] += 1
                if matches(truth[key], value):
                    tally['correct'] += 1
                else:
                    corr = False

        tally = tallies['products']
        has_truth = 'products' in truth
        if has_truth:
            tally['total'] += len(truth['products'])
        products = predicted['products'] if 'products' in predicted else []
        # Indices of ground-truth products already paired with a prediction,
        # so each one can be matched at most once.
        matched = set()
        for product in products:
            product['amount'] = 1
            tally['found'] += 1
            if 'name' not in product or not has_truth:
                continue
            for j, real_product in enumerate(truth['products']):
                if j in matched:
                    continue
                if compare.products(product, real_product):
                    matched.add(j)
                    tally['correct'] += 1
                    break
        if has_truth and len(matched) < len(truth['products']):
            corr = False
        if corr:
            count += 1

    # Report order differs from evaluation order; kept as in the original.
    sections = (
        ('-----VENDORS-----', 'vendor'),
        ('-----DATES-----', 'date'),
        ('-----ADDRESSES-----', 'address'),
        ('-----TAX RATES-----', 'tax_rate'),
        ('-----PRICE-----', 'total_price'),
        ('-----CURRENCY-----', 'currency'),
        ('-----PRODUCTS-----', 'products'),
    )
    totalDataPoints = sum(tallies[key]['total'] for _, key in sections)
    totalDataPointsFound = sum(tallies[key]['found'] for _, key in sections)
    totalCorrect = sum(tallies[key]['correct'] for _, key in sections)

    print('-----TOTAL CORRECT RECEIPTS-----')
    print(count, 'of', len(receipts))

    total_precision = 0
    total_recall = 0
    for title, key in sections:
        tally = tallies[key]
        precision, recall = report(title, tally['total'], tally['found'],
                                   tally['correct'])
        total_precision += precision
        total_recall += recall

    # Micro average: computed from pooled counts, not part of the macro sums.
    report('-----MICRO AVG-----', totalDataPoints, totalDataPointsFound,
           totalCorrect)

    print('-----MACRO AVG-----')
    print(totalDataPoints, totalDataPointsFound, totalCorrect)
    precision = total_precision / float(len(sections))
    recall = total_recall / float(len(sections))
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1:', util.fScore(precision, recall))
Ejemplo n.º 7
0
def calculateRuleBasedAccuracy(receipts):
    """Print precision / recall / F1 of the rule-based predictions per field.

    Each receipt's ``ruleBasedPrediction`` dict is scored against its
    ``groundTruth`` dict. For each of total price, currency, date, vendor,
    tax rate and address three counters are kept: how often the field is in
    the ground truth (total), how often a truthy prediction exists (found)
    and how often the matching ``compare`` function accepts it (correct).
    Products are matched one-to-one against the ground-truth products with
    ``compare.products``. A receipt counts as fully correct when no field
    comparison failed and every ground-truth product was matched.

    Nothing is returned; all figures are printed.

    Fix relative to the previous revision: the macro average no longer
    includes the micro-average scores, which were previously summed in and
    then divided by 7.
    """

    def report(title, total, found, correct):
        # Print one report section and return its (precision, recall).
        print(title)
        print(total, found, correct)
        precision = util.precision(correct, found)
        recall = util.recall(total, correct)
        print('Precision:', precision)
        print('Recall:', recall)
        print('F1:', util.fScore(precision, recall))
        return precision, recall

    # (key, comparison function), in evaluation order.
    fields = (
        ('total_price', compare.totalPrice),
        ('currency', compare.currency),
        ('date', compare.date),
        ('vendor', compare.vendor),
        ('tax_rate', compare.taxRate),
        ('address', compare.address),
    )
    tallies = {}
    for key, _ in fields:
        tallies[key] = {'total': 0, 'found': 0, 'correct': 0}
    tallies['products'] = {'total': 0, 'found': 0, 'correct': 0}

    count = 0
    for receipt in receipts:
        predicted = receipt.ruleBasedPrediction
        truth = receipt.groundTruth
        corr = True

        for key, matches in fields:
            tally = tallies[key]
            value = predicted[key] if key in predicted else None
            if value:
                tally['found'] += 1
            if key in truth:
                tally['total'] += 1
                if matches(truth[key], value):
                    tally['correct'] += 1
                else:
                    corr = False

        tally = tallies['products']
        has_truth = 'products' in truth
        products = predicted['products'] if 'products' in predicted else []
        if has_truth:
            tally['total'] += len(truth['products'])
        # Indices of ground-truth products already paired with a prediction,
        # so each one can be matched at most once.
        matched = set()
        for product in products:
            tally['found'] += 1
            if not has_truth:
                continue
            for j, real_product in enumerate(truth['products']):
                if j in matched:
                    continue
                if compare.products(product, real_product):
                    matched.add(j)
                    tally['correct'] += 1
                    break
        if has_truth and len(matched) < len(truth['products']):
            corr = False
        if corr:
            count += 1

    # Report order differs from evaluation order; kept as in the original.
    sections = (
        ('-----VENDORS-----', 'vendor'),
        ('-----DATES-----', 'date'),
        ('-----ADDRESSES-----', 'address'),
        ('-----TAX RATES-----', 'tax_rate'),
        ('-----PRICE-----', 'total_price'),
        ('-----CURRENCY-----', 'currency'),
        ('-----PRODUCTS-----', 'products'),
    )
    totalDataPoints = sum(tallies[key]['total'] for _, key in sections)
    totalDataPointsFound = sum(tallies[key]['found'] for _, key in sections)
    totalCorrect = sum(tallies[key]['correct'] for _, key in sections)

    print('-----TOTAL CORRECT RECEIPTS-----')
    print(count, 'of', len(receipts))

    total_precision = 0
    total_recall = 0
    for title, key in sections:
        tally = tallies[key]
        precision, recall = report(title, tally['total'], tally['found'],
                                   tally['correct'])
        total_precision += precision
        total_recall += recall

    # Micro average: computed from pooled counts, not part of the macro sums.
    report('-----MICRO AVG-----', totalDataPoints, totalDataPointsFound,
           totalCorrect)

    print('-----MACRO AVG-----')
    print(totalDataPoints, totalDataPointsFound, totalCorrect)
    precision = total_precision / float(len(sections))
    recall = total_recall / float(len(sections))
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1:', util.fScore(precision, recall))
Ejemplo n.º 8
0
def _parse_product_price(product):
    """Return the price of an extracted product as a float.

    Decimal commas are accepted. Returns None when the product has no
    'price' entry or its value cannot be parsed as a number.
    """
    if 'price' not in product:
        return None
    try:
        return float(str(product['price']).replace(',', '.'))
    except ValueError:
        return None


def _print_metric_block(title, total, found, correct):
    """Print the counts, precision, recall and F1 of one category.

    Returns the (precision, recall) pair so the caller can accumulate
    the values for a macro average.
    """
    print(title)
    print(total, found, correct)
    precision = util.precision(correct, found)
    recall = util.recall(total, correct)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1:', util.fScore(precision, recall))
    return precision, recall


def calculateMetrics(reciepts, result, writeToFile=False, path=None):
    """Compare extraction results with ground truth and print the metrics.

    For each receipt the extracted vendor, date, address, tax rate, total
    price, currency and product list are compared with the receipt's
    ``groundTruth``. Per-category counts of ground-truth values, extracted
    values and correct values are accumulated, and precision / recall / F1
    are printed per category, as a micro average and as a macro average.
    The number of receipts for which every ground-truth data point was
    extracted correctly is printed as well.

    Args:
        reciepts: Sequence of receipt objects exposing ``groundTruth``
            (a mapping of the known values) and ``path``.
        result: Sequence aligned with ``reciepts``; ``result[i]`` is a
            mapping with the keys 'vendor', 'date', 'address', 'tax_rate',
            'total_price', 'currency' and 'products'.
        writeToFile: When true, the extracted values of every receipt are
            dumped as JSON to ``os.path.join(path, reciept.path)``.
        path: Output directory used when ``writeToFile`` is true.

    Returns:
        None. All metrics are written to stdout.
    """
    correctVendors = vendorsFound = vendors = 0
    correctDates = datesFound = dates = 0
    correctAddresses = addressesFound = addresses = 0
    correctTaxes = taxesFound = taxes = 0
    correctPrices = pricesFound = prices = 0
    correctCurrencies = currenciesFound = currencies = 0
    correctProducts = productsFound = products = 0

    count = 0
    for index, reciept in enumerate(reciepts):
        extracted = result[index]
        truth = reciept.groundTruth
        # Stays True only while every ground-truth data point is matched.
        corr = True
        result_dict = {}

        vendor = extracted['vendor']
        result_dict['vendor'] = vendor
        if vendor:
            vendorsFound += 1
            vendor = vendor.lower()
        if 'vendor' in truth:
            vendors += 1
            # A distance of at most 0 means the strings must match exactly.
            if vendor and levenshtein_distance(
                    vendor, truth['vendor'].lower()) <= 0:
                correctVendors += 1
            else:
                corr = False

        date = extracted['date']
        result_dict['date'] = date
        if date:
            datesFound += 1
            date = date.lower()
        if 'date' in truth:
            dates += 1
            real_date = truth['date'].lower()
            # The ground-truth date may contain spaces the extraction drops.
            if date == real_date or date == real_date.replace(' ', ''):
                correctDates += 1
            else:
                corr = False

        address = extracted['address']
        result_dict['address'] = address
        if address:
            addressesFound += 1
            address = address.lower()
        if 'address' in truth:
            addresses += 1
            if address and levenshtein_distance(
                    address, truth['address'].lower()) <= 0:
                correctAddresses += 1
            else:
                corr = False

        tax = extracted['tax_rate']
        result_dict['tax_rate'] = tax
        # A tax rate of 0 is a valid finding, so only None means "missing".
        if tax is not None:
            taxesFound += 1
        if 'tax_rate' in truth:
            taxes += 1
            real_tax = int(float(truth['tax_rate'].lower().replace('%', '')))
            if tax == real_tax:
                correctTaxes += 1
            else:
                corr = False

        price = extracted['total_price']
        result_dict['total_price'] = price
        if price:
            pricesFound += 1
        if 'total_price' in truth:
            prices += 1
            real_price = float(truth['total_price'].lower())
            if price == real_price:
                correctPrices += 1
            else:
                corr = False

        currency = extracted['currency']
        result_dict['currency'] = currency
        if currency:
            currenciesFound += 1
            currency = currency.lower()
        if 'currency' in truth:
            currencies += 1
            if currency == truth['currency'].lower():
                correctCurrencies += 1
            else:
                corr = False

        productsList = extracted['products']
        result_dict['products'] = productsList
        # Receipts without ground-truth products are treated as having none
        # instead of raising KeyError further down.
        real_products = truth['products'] if 'products' in truth else []
        products += len(real_products)

        # Indexes of ground-truth products already paired with an extracted
        # product; each ground-truth product can be matched only once.
        matched = set()
        for product in productsList:
            productsFound += 1
            item_price = _parse_product_price(product)
            for real_index, real_product in enumerate(real_products):
                if real_index in matched:
                    continue
                real_item_price = float(real_product['price'])
                if (levenshtein_distance(product['name'].lower(),
                                         real_product['name'].lower()) <= 0
                        and util.floatCompare(item_price, real_item_price)
                        and product['amount'] == real_product['amount']):
                    correctProducts += 1
                    matched.add(real_index)
                    break
        if len(matched) < len(real_products):
            # At least one ground-truth product was not extracted correctly.
            corr = False
        if corr:
            count += 1

        if writeToFile:
            with open(os.path.join(path, reciept.path), 'w') as fp:
                json.dump(result_dict, fp, indent=1)

    totalDataPoints = (vendors + dates + addresses + taxes + prices +
                       currencies + products)
    totalDataPointsFound = (vendorsFound + datesFound + addressesFound +
                            taxesFound + pricesFound + currenciesFound +
                            productsFound)
    totalCorrect = (correctVendors + correctDates + correctAddresses +
                    correctTaxes + correctPrices + correctCurrencies +
                    correctProducts)

    print('-----TOTAL CORRECT RECEIPTS-----')
    print(count, 'of', len(reciepts))

    categories = [
        ('-----VENDORS-----', vendors, vendorsFound, correctVendors),
        ('-----DATES-----', dates, datesFound, correctDates),
        ('-----ADDRESSES-----', addresses, addressesFound, correctAddresses),
        ('-----TAX RATES-----', taxes, taxesFound, correctTaxes),
        ('-----PRICE-----', prices, pricesFound, correctPrices),
        ('-----CURRENCY-----', currencies, currenciesFound,
         correctCurrencies),
        ('-----PRODUCTS-----', products, productsFound, correctProducts),
    ]
    total_precision = 0
    total_recall = 0
    for title, total, found, correct in categories:
        precision, recall = _print_metric_block(title, total, found, correct)
        total_precision += precision
        total_recall += recall

    # The micro average is computed from the pooled counts and is not part
    # of the macro-average sums.
    _print_metric_block('-----MICRO AVG-----', totalDataPoints,
                        totalDataPointsFound, totalCorrect)

    print('-----MACRO AVG-----')
    print(totalDataPoints, totalDataPointsFound, totalCorrect)
    precision = total_precision / float(len(categories))
    recall = total_recall / float(len(categories))
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1:', util.fScore(precision, recall))
Ejemplo n.º 9
0
    def train(self,
              X,
              Y,
              max_k=100,
              nsamp=100,
              lamb=None,
              q='kl',
              mode=3,
              rerun=True,
              min_recall_per_class=0.5):
        """Build the rule set by repeated sampling and greedy selection.

        The item database is cleared and rebuilt from ``X`` and ``Y``. Then,
        while fewer than ``max_k`` rules are held and labels remain to be
        sampled, candidate rules are sampled, added to the greedy selector's
        universe, and one rule is picked greedily. A picked rule for which
        ``self.enough`` is true is not added and its label is dropped from
        further sampling; otherwise the rule is added, and its label is
        dropped once the recall of that label reaches 1. Finally a default
        is set, ``self.build()`` is called and summary statistics are
        printed.

        Args:
            X: Passed to ``prep_db`` together with ``Y``.
            Y: Passed to ``prep_db`` together with ``X``.
            max_k: Upper bound on the number of rules collected by the loop.
            nsamp: Sample size requested in each sampling round.
            lamb: Trade-off value handed to ``GreedyDiv`` and ``obj``. When
                it is None or a string it is derived from an initial sample:
                'max' and 'mean' take the maximum / mean single-rule quality,
                anything else yields 0.
            q: Metric name forwarded to ``Rule.quality`` when deriving
                ``lamb``.
            mode: 0 samples per label via ``sample_rn``; any other value is
                forwarded to ``self.sample_from_each_label``.
            rerun: When true, run a full greedy pass over all samples drawn
                and keep its result if its objective is higher.
            min_recall_per_class: Not used by this method.
        """
        print('##### START #####')
        Itemset.clear_db()
        prep_db(X, Y)

        # Allow users to pin lamb to a number; otherwise derive it here.
        if lamb is None or isinstance(lamb, str):
            # NOTE(review): the sample size is fixed at 100 here rather than
            # taken from nsamp - confirm this is intended.
            samp = self.sample_from_each_label(set(Itemset.labels), 100, set(),
                                               mode)
            if lamb == 'max':
                lamb = np.max([Rule.quality([r], metric=q) for r in samp])
            elif lamb == 'mean':
                lamb = np.mean([Rule.quality([r], metric=q) for r in samp])
            else:
                lamb = 0
            print('lamb:', lamb)

        greed = GreedyDiv([], lamb)
        U_all = []
        labels_samp = set(Itemset.labels)
        while len(self) < max_k and len(labels_samp) > 0:
            if mode == 0:
                samps = []
                for label in labels_samp:
                    _, samp = sample_rn(nsamp, label)
                    # Very time-consuming
                    samps.extend(Rule(s, label) for s in samp)
                U = set(samps)
            else:
                covered = {t for r in self.rules for t in r.trans()}
                U = self.sample_from_each_label(labels_samp, nsamp, covered,
                                                mode)
            print('nsamp (after):', len(U))
            if len(U) == 0:
                break
            U_all.extend(U)

            # Greedy
            greed.update_univ(U)
            r = greed.greedy_once()
            # Termination criteria. Also check zero sampling above.
            if self.enough(r):
                # Include at least one rule per class, except default class.
                labels_samp.remove(r.label)
                print('remove label:', r.label)
            else:
                # Print quality vs. dispersion. Separate names are used so
                # the metric parameter ``q`` is not overwritten.
                q_cur, d_cur = obj(self.rules, lamb, sep=True)
                q_new, d_new = obj(self.rules + [r], lamb, sep=True)
                print('inc q vs. d: {}, {}'.format(q_new - q_cur,
                                                   d_new - d_cur))

                self.add(r)
                if np.abs(recall(self.rules)[r.label] - 1.0) < 1e-8:
                    labels_samp.remove(r.label)
                print('#{} '.format(len(self.rules)), end='')
                printRules([r])

        # Consecutive greedy over all samples
        if rerun:
            greed.clear()
            greed.update_univ(list(set(U_all)))
            rules = greed.greedy(len(self.rules))
            obj_full = obj(rules, lamb)
            obj_incremental = obj(self.rules, lamb)
            if obj_full > obj_incremental:
                print('Full greedy wins: {} > {}'.format(
                    obj_full, obj_incremental))
                self.reset(rules)

        default = self.set_default()
        print('default:', default)

        self.build()

        print('precision: ', precision(self).items())
        print('recall (coverage): ', recall(self.rules).items())
        print('ave disp: ', dispersion(self.rules, average=True))
        print('##### END #####')
Ejemplo n.º 10
0
 def enough(self, r: Rule) -> bool:
     """Tell whether adding ``r`` barely improves recall for its label.

     Returns True when the recall of ``r.label`` with ``r`` appended to
     the current rules exceeds the current recall of that label by no
     more than ``self.recall_eps``.
     """
     recall_before = recall(self.rules)
     recall_after = recall(self.rules + [r])
     gain = recall_after[r.label] - recall_before[r.label]
     return bool(gain <= self.recall_eps)