def run_ours(Xtr, Ytr, Xt, Yt, lb, nsample, lambda_mode, q, sample_mode,
             k=None, rerun=True, eps=0.01, min_recall_per_class=0.8,
             log=None):
    """Train a ``DecisionSet``, predict the test split and log run statistics.

    Args:
        Xtr, Ytr: training data, forwarded unchanged to ``DecisionSet.train``.
        Xt: test features; each row of ``Xt.values`` is converted with
            ``feat2item`` and wrapped in a ``Transaction``.
        Yt: test labels (``Yt.values`` is used for the AUC computation).
        lb: object whose ``transform`` encodes labels before ``roc_auc_score``.
        nsample, lambda_mode, q, sample_mode: forwarded to ``train`` as
            ``nsamp``, ``lamb``, ``q`` and ``mode``.
        k: maximum number of rules; 100 is used when ``None``.
        rerun: forwarded to ``train``; also selects the log-key prefix
            (``'ours0'`` or ``'ours1'``).
        eps: constructor argument of ``DecisionSet``.
        min_recall_per_class: forwarded to ``train``.
        log: logging callable; when ``None`` it is imported from ``logger``.

    Returns:
        The predictions returned by ``DecisionSet.predict_all``.
    """
    # name = 'ours' if k is None else 'oursk'
    name = 'ours{}'.format(int(rerun))
    if k is None:
        k = 100

    model = DecisionSet(eps)
    model.train(Xtr, Ytr, max_k=k, nsamp=nsample, lamb=lambda_mode, q=q,
                mode=sample_mode, rerun=rerun,
                min_recall_per_class=min_recall_per_class)
    print('default:', model.default)

    test_transactions = []
    for row in Xt.values:
        test_transactions.append(Transaction(feat2item(row)))
    Y_pred = model.predict_all(test_transactions)

    if log is None:
        # Fall back to the project's default logger.
        from logger import log

    log('{}-default'.format(name), model.default)
    log('{}-k'.format(name), len(model.rules))
    log('{}-maxk'.format(name), k)
    for position, rule in enumerate(model.rules):
        log('{}-nconds'.format(name), len(rule), position)
    log('{}-q'.format(name), q)
    log('{}-nsample'.format(name), nsample)
    log('{}-lamb'.format(name), lambda_mode)
    log('{}-seq'.format(name), model.seq)

    auc = roc_auc_score(lb.transform(Yt.values), lb.transform(Y_pred))
    log('{}-auc'.format(name), auc)
    balanced = balanced_accuracy_score(Yt, Y_pred)
    log('{}-bacc'.format(name), balanced)
    log('{}-disp'.format(name), dispersion(model.rules, average=True))
    log('{}-overlap'.format(name), overlap(model.rules))
    log('{}-mode'.format(name), sample_mode)

    # Per-label training precision and recall.
    for label, value in precision(model).items():
        log('{}-precisions-tr'.format(name), value, label)
    for label, value in recall(model.rules).items():
        log('{}-recall-tr'.format(name), value, label)

    print(confusion_matrix(Yt, Y_pred))
    return Y_pred
def evaluate_mutator(mutator, threshold, min_percent, latex=True):
    """Yield one ``(label, point)`` pair per label, scored on the dev split.

    For every label in ``(POS, NEU, NEG)`` each dev tweet is predicted as
    that label when ``mutator.apply_filter(tweet, label)`` is truthy and as
    ``-1`` otherwise. The yielded point is the string
    ``' (threshold,precision)'``; the precision is reported as ``0.0`` when
    fewer than ``min_percent * len(dev)`` tweets received the label.

    Args:
        mutator: object providing ``apply_filter(tweet, label)``.
        threshold: number formatted as the first coordinate of each point.
        min_percent: minimum fraction of dev tweets that must be labelled
            before a precision is computed.
        latex: accepted for interface compatibility; not read here.

    Yields:
        Tuples ``(str(label), point_string)``.
    """
    labels = (POS, NEU, NEG)
    train_loc = root+'Data/twitterData/train_alternative.tsv'
    dev_loc = root+'Data/twitterData/dev_alternative.tsv'
    test_loc = root+'Data/twitterData/test_alternative.tsv'

    # Combine the label flags into the single mask expected by the loader.
    label_mask = reduce(lambda acc, flag: acc | flag, labels)
    _, dev, _ = get_final_semeval_data(label_mask, train_loc, dev_loc,
                                       test_loc)
    dev_x, dev_y = dev

    for label in labels:
        pred_y = [label if mutator.apply_filter(tweet, label) else -1
                  for tweet in dev_x]
        too_few = pred_y.count(label) < min_percent*len(pred_y)
        if too_few:
            point = ' (%.3f,0.0)' % threshold
        else:
            point = ' (%.3f,%.4f)' % (threshold,
                                      precision(dev_y, pred_y, label))
        yield str(label), point
def _lstm_clean_price(raw):
    """Reduce a predicted total-price string to digits and decimal points."""
    price = raw.replace(',', '.')
    # Keep only digits and decimal points.
    price = ''.join(ch for ch in price if util.isInt(ch) or ch == '.')
    dots = price.count('.')
    if dots == 2:
        # Two separators: drop the first one and keep the second.
        price = price.replace('.', '', 1)
    elif dots == 1 and len(price.split('.')[-1]) > 2:
        # More than two digits follow the only separator: drop it as well.
        price = price.replace('.', '')
    return price


def _lstm_clean_currency(raw):
    """Keep only the alphabetic characters of a predicted currency."""
    return ''.join(ch for ch in raw if ch.isalpha())


def _lstm_clean_date(raw):
    """From a two-token prediction keep the longer token (first wins ties)."""
    parts = raw.split(' ')
    if len(parts) != 2:
        return raw
    return parts[1] if len(parts[1]) > len(parts[0]) else parts[0]


def _lstm_clean_tax(raw):
    """From a two-token tax-rate prediction keep the first token."""
    parts = raw.split(' ')
    return parts[0] if len(parts) == 2 else raw


def calculateLSTMaccuracy(receipts, results):
    """Compare LSTM predictions with ground truth and print a metrics report.

    For every receipt the predicted scalar fields in ``results[i]`` are
    normalised, counted as "found" when non-empty, and checked against
    ``receipt.groundTruth`` with the corresponding ``compare`` function.
    Predicted products are matched one-to-one against ground-truth products
    with ``compare.products``. A receipt counts as fully correct when no
    ground-truth field was missed and every ground-truth product was matched.

    The report prints, per field, the three counters followed by precision,
    recall and F1 (via ``util``), then the micro average over the pooled
    counters and the macro average over the seven fields.

    Args:
        receipts: sequence of objects exposing a ``groundTruth`` mapping.
        results: prediction mappings indexed like ``receipts``. Predicted
            products are modified in place: their ``'amount'`` is set to 1.

    Returns:
        None; all output is printed.
    """
    scalar_fields = (
        ('total_price', _lstm_clean_price, compare.totalPrice),
        ('currency', _lstm_clean_currency, compare.currency),
        ('date', _lstm_clean_date, compare.date),
        ('vendor', None, compare.vendor),
        ('tax_rate', _lstm_clean_tax, compare.taxRate),
        ('address', None, compare.address),
    )
    TOTAL, FOUND, CORRECT = 0, 1, 2
    # Per field: [ground-truth entries, predictions found, correct predictions]
    stats = {key: [0, 0, 0] for key, _, _ in scalar_fields}
    stats['products'] = [0, 0, 0]
    count = 0

    for i, receipt in enumerate(receipts):
        predicted = results[i]
        truth = receipt.groundTruth
        corr = True

        for key, clean, matches in scalar_fields:
            value = None
            if key in predicted:
                value = predicted[key]
                if clean is not None:
                    value = clean(value)
            if value:
                stats[key][FOUND] += 1
            if key in truth:
                stats[key][TOTAL] += 1
                if matches(truth[key], value):
                    stats[key][CORRECT] += 1
                else:
                    corr = False

        product_stats = stats['products']
        real_products = truth['products'] if 'products' in truth else None
        if real_products is not None:
            product_stats[TOTAL] += len(real_products)

        matched = set()  # ground-truth indexes already paired with a prediction
        predicted_products = predicted['products'] if 'products' in predicted else []
        for product in predicted_products:
            # NOTE(review): presumably the LSTM output carries no amount, so a
            # default of 1 is assigned before comparing - confirm.
            product['amount'] = 1
            product_stats[FOUND] += 1
            if 'name' not in product or real_products is None:
                continue
            for j, real_product in enumerate(real_products):
                if j in matched:
                    continue
                if compare.products(product, real_product):
                    matched.add(j)
                    product_stats[CORRECT] += 1
                    break

        # A receipt is only fully correct when every ground-truth product was
        # matched; without this tally the "correct receipts" line stays at 0.
        if real_products is not None and len(matched) < len(real_products):
            corr = False
        if corr:
            count += 1

    sections = (
        ('-----VENDORS-----', 'vendor'),
        ('-----DATES-----', 'date'),
        ('-----ADDRESSES-----', 'address'),
        ('-----TAX RATES-----', 'tax_rate'),
        ('-----PRICE-----', 'total_price'),
        ('-----CURRENCY-----', 'currency'),
        ('-----PRODUCTS-----', 'products'),
    )

    print('-----TOTAL CORRECT RECEIPTS-----')
    print(count, 'of', len(receipts))

    total_precision = 0
    total_recall = 0
    for title, key in sections:
        total, found, correct = stats[key]
        print(title)
        print(total, found, correct)
        precision = util.precision(correct, found)
        recall = util.recall(total, correct)
        total_precision += precision
        total_recall += recall
        print('Precision:', precision)
        print('Recall:', recall)
        print('F1:', util.fScore(precision, recall))

    totalDataPoints = sum(stats[key][TOTAL] for _, key in sections)
    totalDataPointsFound = sum(stats[key][FOUND] for _, key in sections)
    totalCorrect = sum(stats[key][CORRECT] for _, key in sections)

    print('-----MICRO AVG-----')
    print(totalDataPoints, totalDataPointsFound, totalCorrect)
    # The micro average is not added to the running sums: the macro average
    # below must be the mean of the seven per-field values only.
    precision = util.precision(totalCorrect, totalDataPointsFound)
    recall = util.recall(totalDataPoints, totalCorrect)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1:', util.fScore(precision, recall))

    print('-----MACRO AVG-----')
    print(totalDataPoints, totalDataPointsFound, totalCorrect)
    precision = total_precision / float(len(sections))
    recall = total_recall / float(len(sections))
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1:', util.fScore(precision, recall))
def calculateRuleBasedAccuracy(receipts):
    """Score rule-based predictions against ground truth and print a report.

    For every receipt each scalar field of ``receipt.ruleBasedPrediction`` is
    counted as "found" when truthy and checked against
    ``receipt.groundTruth`` with the corresponding ``compare`` function.
    Predicted products are matched one-to-one against ground-truth products
    with ``compare.products``. A receipt counts as fully correct when no
    ground-truth field was missed and every ground-truth product was matched.

    The report prints, per field, the three counters followed by precision,
    recall and F1 (via ``util``), then the micro average over the pooled
    counters and the macro average over the seven fields.

    Args:
        receipts: sequence of objects exposing ``ruleBasedPrediction`` and
            ``groundTruth`` mappings.

    Returns:
        None; all output is printed.
    """
    scalar_fields = (
        ('total_price', compare.totalPrice),
        ('currency', compare.currency),
        ('date', compare.date),
        ('vendor', compare.vendor),
        ('tax_rate', compare.taxRate),
        ('address', compare.address),
    )
    TOTAL, FOUND, CORRECT = 0, 1, 2
    # Per field: [ground-truth entries, predictions found, correct predictions]
    stats = {key: [0, 0, 0] for key, _ in scalar_fields}
    stats['products'] = [0, 0, 0]
    count = 0

    for receipt in receipts:
        predicted = receipt.ruleBasedPrediction
        truth = receipt.groundTruth
        corr = True

        for key, matches in scalar_fields:
            value = predicted[key] if key in predicted else None
            if value:
                stats[key][FOUND] += 1
            if key in truth:
                stats[key][TOTAL] += 1
                if matches(truth[key], value):
                    stats[key][CORRECT] += 1
                else:
                    corr = False

        ## Check products
        product_stats = stats['products']
        products = predicted['products'] if 'products' in predicted else []
        real_products = truth['products'] if 'products' in truth else None
        if real_products is not None:
            product_stats[TOTAL] += len(real_products)

        found = set()  # ground-truth indexes already paired with a prediction
        for product in products:
            product_stats[FOUND] += 1
            if real_products is None:
                continue
            for j, real_product in enumerate(real_products):
                if j in found:
                    continue
                if compare.products(product, real_product):
                    found.add(j)
                    product_stats[CORRECT] += 1
                    break

        if real_products is not None and len(found) < len(real_products):
            corr = False
        if corr:
            count += 1

    sections = (
        ('-----VENDORS-----', 'vendor'),
        ('-----DATES-----', 'date'),
        ('-----ADDRESSES-----', 'address'),
        ('-----TAX RATES-----', 'tax_rate'),
        ('-----PRICE-----', 'total_price'),
        ('-----CURRENCY-----', 'currency'),
        ('-----PRODUCTS-----', 'products'),
    )

    print('-----TOTAL CORRECT RECEIPTS-----')
    print(count, 'of', len(receipts))

    total_precision = 0
    total_recall = 0
    for title, key in sections:
        total, n_found, correct = stats[key]
        print(title)
        print(total, n_found, correct)
        precision = util.precision(correct, n_found)
        recall = util.recall(total, correct)
        total_precision += precision
        total_recall += recall
        print('Precision:', precision)
        print('Recall:', recall)
        print('F1:', util.fScore(precision, recall))

    totalDataPoints = sum(stats[key][TOTAL] for _, key in sections)
    totalDataPointsFound = sum(stats[key][FOUND] for _, key in sections)
    totalCorrect = sum(stats[key][CORRECT] for _, key in sections)

    print('-----MICRO AVG-----')
    print(totalDataPoints, totalDataPointsFound, totalCorrect)
    # The micro average is not added to the running sums: the macro average
    # below must be the mean of the seven per-field values only.
    precision = util.precision(totalCorrect, totalDataPointsFound)
    recall = util.recall(totalDataPoints, totalCorrect)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1:', util.fScore(precision, recall))

    print('-----MACRO AVG-----')
    print(totalDataPoints, totalDataPointsFound, totalCorrect)
    precision = total_precision / float(len(sections))
    recall = total_recall / float(len(sections))
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1:', util.fScore(precision, recall))
def kmeans(eac, removeTerms, ngram):
    """Cluster hashed tweet vectors repeatedly and collect the precisions.

    Tweets are read from ``dataset.csv``, filtered, truncated to the first
    6300 entries and vectorised with a ``HashingVectorizer``. Clustering is
    then repeated until 100 precision values accepted by ``isAdable`` have
    been collected.

    Args:
        eac: when truthy, run ``EAC`` on the vectors and then 2-medoid
            clustering on its distance matrix; otherwise run 2-cluster
            ``KMeans`` directly on the vectors.
        removeTerms: when truthy, pass the hard-coded term list to the
            tweet filter as ``terms_to_remove``.
        ngram: when truthy, hash word n-grams of length 1 to 3.

    Returns:
        List of the 100 collected precision values.
    """
    terms = ['lazaro', 'lázaro', 'baez', 'báez', 'carlitos']

    print('Filtering tweets')
    tweets = util.read_from_file("dataset.csv")
    if removeTerms:
        tweets = filter.filter_tweets(tweets, terms_to_remove=terms)
    else:
        tweets = filter.filter_tweets(tweets)

    # Reduce tweets list length
    tweets = tweets[:6300]

    carlitos = sum(1 for tw in tweets if tw.tw_type == 'Carlitos')
    lazaro = len(tweets) - carlitos
    data = [tw.text for tw in tweets]
    print(carlitos, lazaro)

    print("Transform Data...")
    # Transform data
    hasher_options = dict(non_negative=True, norm='l2', binary=False)
    if ngram:
        hasher_options.update(ngram_range=(1, 3), analyzer='word')
    vectorizer = make_pipeline(HashingVectorizer(**hasher_options))
    X = vectorizer.fit_transform(data)

    precision_list = []
    # Runs whose precision is rejected by isAdable do not count towards 100.
    while len(precision_list) < 100:
        # Start timer
        t0 = time()
        if eac:
            ensemble = EAC(30, min_k=2, max_k=10)
            EAC_D = ensemble.fit(X).distance_
            # Kmedoids over EAC_D
            medoids = KMedoids(2, init='random', distance_metric="precomputed")
            labels = medoids.fit(EAC_D).labels_
        else:
            estimator = KMeans(n_clusters=2, init='k-means++', n_init=1,
                               max_iter=100)
            labels = estimator.fit(X).labels_

        # Assign labels to tweets
        for position, tw in enumerate(tweets):
            tw.label = labels[position]

        print("Precision: ")
        # Print precision
        precision = util.precision(tweets)
        print("done in %0.3fs" % (time() - t0))

        if isAdable(precision):
            precision_list.append(precision)

    return precision_list
def minhash(eac, shingle, removeTerms):
    """Cluster tweets on a minhash Jaccard distance matrix, 100 times over.

    Tweets are read from ``dataset.csv``, filtered and truncated to the
    first 6300 entries. The distance matrix is computed once; clustering is
    then repeated until 100 precision values accepted by ``isAdable`` have
    been collected.

    Args:
        eac: when truthy, run ``EAC`` (with a ``KMedoids`` base clusterer)
            on the distance matrix and then 2-medoid clustering on the EAC
            distances; otherwise run 2-medoid clustering directly.
        shingle: shingle length passed to the distance computation.
        removeTerms: when truthy, pass the hard-coded term list to the
            tweet filter as ``terms_to_remove``.

    Returns:
        List of the 100 collected precision values.
    """
    terms = ['lazaro', 'lázaro', 'baez', 'báez', 'carlitos']

    print('Filtering tweets')
    tweets = util.read_from_file("dataset.csv")
    if removeTerms:
        tweets = filter.filter_tweets(tweets, terms_to_remove=terms)
    else:
        tweets = filter.filter_tweets(tweets)

    # Reduce tweets list length
    tweets = tweets[:6300]

    carlitos = sum(1 for tw in tweets if tw.tw_type == 'Carlitos')
    lazaro = len(tweets) - carlitos
    print(carlitos, lazaro)

    # Extract text from tweets
    X = [tw.text for tw in tweets]

    # Start timer
    t0 = time()
    print("Calculating distance matrix...")
    D = metrics.jaccard_minhash_distance_mp(X, shingle_length=shingle)

    precision_list = []
    # Runs whose precision is rejected by isAdable do not count towards 100.
    while len(precision_list) < 100:
        if eac:
            print("EAC clustering...")
            # EAC clustering
            base = KMedoids(init='random', distance_metric='precomputed')
            ensemble = EAC(30, min_k=2, max_k=10, clustering=base)
            EAC_D = ensemble.fit(D).distance_
            # Kmedoids over EAC_D
            final = KMedoids(2, init='random', distance_metric="precomputed")
            labels = final.fit(EAC_D).labels_
        else:
            direct = KMedoids(2, init='random', distance_metric='precomputed')
            print("Kmedoids clustering...")
            labels = direct.fit(D).labels_

        # Assign labels to tweets
        for position, tw in enumerate(tweets):
            tw.label = labels[position]

        # Print precision
        print("Precision: ")
        precision = util.precision(tweets)
        if isAdable(precision):
            print(len(precision_list))
            precision_list.append(precision)

    # The timer started before the distance matrix, so this is the total time.
    print("done in %0.3fs" % (time() - t0))
    return precision_list
def _metrics_product_price(product):
    """Return the predicted product price as a float, or None if unusable."""
    if 'price' not in product or product['price'] is None:
        return None
    try:
        return float(str(product['price']).replace(',', '.'))
    except ValueError:
        return None


def calculateMetrics(reciepts, result, writeToFile=False, path=None):
    """Score extraction results against ground truth and print a report.

    For every receipt the predicted fields in ``result[i]`` are counted as
    "found" when present and compared with ``reciept.groundTruth``:

    * vendor / address: case-insensitive, Levenshtein distance of 0;
    * date: case-insensitive, also accepted against the ground truth with
      spaces removed;
    * tax_rate: equal to the ground-truth percentage truncated to ``int``;
    * total_price: equal to the ground-truth value parsed as ``float``;
    * currency: case-insensitive equality;
    * products: one-to-one match on name, price (``util.floatCompare``) and
      amount; only evaluated when the ground truth lists products.

    A receipt counts as fully correct when no ground-truth field was missed
    and every ground-truth product was matched. The report prints, per
    field, the counters followed by precision, recall and F1 (via ``util``),
    then the micro average and the macro average over the seven fields.

    Args:
        reciepts: sequence of objects exposing ``groundTruth`` and, when
            ``writeToFile`` is set, ``path``.
        result: prediction mappings indexed like ``reciepts``; each must
            contain all seven field keys.
        writeToFile: when truthy, dump each receipt's predictions as JSON to
            ``os.path.join(path, reciept.path)``.
        path: output directory used when ``writeToFile`` is truthy.

    Returns:
        None; all output is printed (and optionally written to files).
    """
    def text_equal(value, real):
        return bool(value) and levenshtein_distance(value, real.lower()) <= 0

    def date_equal(value, real):
        real = real.lower()
        return value == real or value == real.replace(' ', '')

    def tax_equal(value, real):
        return value == int(float(real.lower().replace('%', '')))

    def price_equal(value, real):
        return value == float(real.lower())

    def currency_equal(value, real):
        return value == real.lower()

    def is_truthy(value):
        return bool(value)

    def is_present(value):
        # A tax rate of 0 is a valid prediction, so only None means "missing".
        return value is not None

    # (key, lowercase the prediction?, "found" predicate, matcher)
    scalar_fields = (
        ('vendor', True, is_truthy, text_equal),
        ('date', True, is_truthy, date_equal),
        ('address', True, is_truthy, text_equal),
        ('tax_rate', False, is_present, tax_equal),
        ('total_price', False, is_truthy, price_equal),
        ('currency', True, is_truthy, currency_equal),
    )
    TOTAL, FOUND, CORRECT = 0, 1, 2
    # Per field: [ground-truth entries, predictions found, correct predictions]
    stats = {key: [0, 0, 0] for key, _, _, _ in scalar_fields}
    stats['products'] = [0, 0, 0]
    count = 0

    for idx, reciept in enumerate(reciepts):
        predicted = result[idx]
        truth = reciept.groundTruth
        corr = True

        # Raw predictions, in report-independent field order, for the JSON dump.
        result_dict = {key: predicted[key] for key, _, _, _ in scalar_fields}

        for key, lowercase, was_found, matches in scalar_fields:
            value = predicted[key]
            if was_found(value):
                stats[key][FOUND] += 1
                if lowercase:
                    value = value.lower()
            if key in truth:
                stats[key][TOTAL] += 1
                if matches(value, truth[key]):
                    stats[key][CORRECT] += 1
                else:
                    corr = False

        productsList = predicted['products']
        result_dict['products'] = productsList
        if 'products' in truth:
            real_products = truth['products']
            product_stats = stats['products']
            product_stats[TOTAL] += len(real_products)

            checkedIndexes = set()  # ground-truth indexes already matched
            for product in productsList or []:
                product_stats[FOUND] += 1
                for j, real_product in enumerate(real_products):
                    if j in checkedIndexes:
                        continue
                    price = _metrics_product_price(product)
                    real_price = float(real_product['price'])
                    same_name = levenshtein_distance(
                        product['name'].lower(),
                        real_product['name'].lower()) <= 0
                    if (same_name
                            and util.floatCompare(price, real_price)
                            and product['amount'] == real_product['amount']):
                        product_stats[CORRECT] += 1
                        checkedIndexes.add(j)
                        break

            # Any unmatched ground-truth product makes the receipt incorrect.
            if len(checkedIndexes) < len(real_products):
                corr = False

        if corr:
            count += 1

        if writeToFile:
            with open(os.path.join(path, reciept.path), 'w') as fp:
                json.dump(result_dict, fp, indent=1)

    sections = (
        ('-----VENDORS-----', 'vendor'),
        ('-----DATES-----', 'date'),
        ('-----ADDRESSES-----', 'address'),
        ('-----TAX RATES-----', 'tax_rate'),
        ('-----PRICE-----', 'total_price'),
        ('-----CURRENCY-----', 'currency'),
        ('-----PRODUCTS-----', 'products'),
    )

    print('-----TOTAL CORRECT RECEIPTS-----')
    print(count, 'of', len(reciepts))

    total_precision = 0
    total_recall = 0
    for title, key in sections:
        total, found, correct = stats[key]
        print(title)
        print(total, found, correct)
        precision = util.precision(correct, found)
        recall = util.recall(total, correct)
        total_precision += precision
        total_recall += recall
        print('Precision:', precision)
        print('Recall:', recall)
        print('F1:', util.fScore(precision, recall))

    totalDataPoints = sum(stats[key][TOTAL] for _, key in sections)
    totalDataPointsFound = sum(stats[key][FOUND] for _, key in sections)
    totalCorrect = sum(stats[key][CORRECT] for _, key in sections)

    print('-----MICRO AVG-----')
    print(totalDataPoints, totalDataPointsFound, totalCorrect)
    precision = util.precision(totalCorrect, totalDataPointsFound)
    recall = util.recall(totalDataPoints, totalCorrect)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1:', util.fScore(precision, recall))

    print('-----MACRO AVG-----')
    print(totalDataPoints, totalDataPointsFound, totalCorrect)
    precision = total_precision / float(len(sections))
    recall = total_recall / float(len(sections))
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1:', util.fScore(precision, recall))
def train(self, X, Y, max_k=100, nsamp=100, lamb=None, q='kl', mode=3,
          rerun=True, min_recall_per_class=0.5):
    """Build the rule set by repeated sampling and greedy selection.

    The item database is reset and rebuilt from ``X`` and ``Y``. Each round
    samples candidate rules for the labels still being sampled, lets
    ``GreedyDiv`` pick one rule, and either adds it or - when
    ``self.enough`` accepts it as a termination signal - stops sampling that
    rule's label. Afterwards the default label is set and ``self.build`` is
    called. Progress and final statistics are printed.

    Args:
        X, Y: training data handed to ``prep_db``.
        max_k: the loop stops once ``len(self)`` reaches this value.
        nsamp: sample size requested per round.
        lamb: trade-off weight given to ``GreedyDiv`` and ``obj``. A number
            is used as is; ``'max'`` / ``'mean'`` derive it from the
            qualities of an initial sample of 100; any other string or
            ``None`` gives 0.
        q: quality metric name passed to ``Rule.quality``.
        mode: sampling mode; 0 uses ``sample_rn`` per label, anything else
            uses ``self.sample_from_each_label`` with the covered
            transactions.
        rerun: when truthy, rerun the greedy selection over all sampled
            candidates and keep that result if its objective is higher.
        min_recall_per_class: accepted but not referenced in this method.
    """
    print('##### START #####')
    Itemset.clear_db()
    prep_db(X, Y)

    # Allow specify lamb to a certain number by users
    if type(lamb) == str or lamb is None:
        initial = self.sample_from_each_label(set(Itemset.labels), 100,
                                              set(), mode)
        reducer = {'max': np.max, 'mean': np.mean}.get(lamb)
        if reducer is None:
            lamb = 0
        else:
            lamb = reducer([Rule.quality([rule], metric=q)
                            for rule in initial])
    print('lamb:', lamb)

    selector = GreedyDiv([], lamb)
    candidate_pool = []
    labels_samp = set(Itemset.labels)

    while len(self) < max_k and len(labels_samp) > 0:
        if mode == 0:
            drawn = []
            for label in labels_samp:
                _, itemsets = sample_rn(nsamp, label)
                # Very time-consuming
                drawn.extend([Rule(s, label) for s in list(itemsets)])
            universe = set(drawn)
        else:
            covered = {t for rule in self.rules for t in rule.trans()}
            universe = self.sample_from_each_label(labels_samp, nsamp,
                                                   covered, mode)
        print('nsamp (after):', len(universe))
        if len(universe) == 0:
            break
        candidate_pool.extend(universe)

        # Greedy
        selector.update_univ(universe)
        chosen = selector.greedy_once()

        # Termination criteria. Also check zero sampling above.
        if self.enough(chosen):
            # Include at least one rule per class, except default class.
            labels_samp.remove(chosen.label)
            print('remove label:', chosen.label)
            continue

        # Print quality vs. dispersion
        q_before, d_before = obj(self.rules, lamb, sep=True)
        q_after, d_after = obj(self.rules + [chosen], lamb, sep=True)
        print('inc q vs. d: {}, {}'.format(q_after - q_before,
                                           d_after - d_before))
        self.add(chosen)
        # Stop sampling a label once its recall has reached 1.
        if np.abs(recall(self.rules)[chosen.label] - 1.0) < 1e-8:
            labels_samp.remove(chosen.label)
        print('#{} '.format(len(self.rules)), end='')
        printRules([chosen])

    # Consecutive greedy over all samples
    if rerun:
        selector.clear()
        selector.update_univ(list(set(candidate_pool)))
        rules = selector.greedy(len(self.rules))
        if obj(rules, lamb) > obj(self.rules, lamb):
            print('Full greedy wins: {} > {}'.format(
                obj(rules, lamb), obj(self.rules, lamb)))
            self.reset(rules)

    default = self.set_default()
    print('default:', default)
    self.build()

    print('precision: ', precision(self).items())
    print('recall (coverage): ', recall(self.rules).items())
    print('ave disp: ', dispersion(self.rules, average=True))
    print('##### END #####')