def main(outfile=None, min_count=None, aggregate=None):
    """ Main function for training and evaluating AAE methods on MPD data """
    print("Loading data from", DATA_PATH)
    playlists = playlists_from_slices(DATA_PATH, n_jobs=4)
    print("Unpacking json data...")
    if aggregate is not None:
        aggregate = ['artist_name', 'track_name', 'album_name']
        print("Using aggregated metadata {}".format(aggregate))
    else:
        print("Aggregate={}".format(aggregate))
        print("Using title only")
    bags_of_tracks, pids, side_info = unpack_playlists(playlists, aggregate)
    del playlists
    bags = Bags(bags_of_tracks, pids, side_info)
    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)
    train_set, dev_set, y_test = prepare_evaluation(bags, n_items=N_ITEMS,
                                                    min_count=min_count)
    log("Train set:", logfile=outfile)
    log(train_set, logfile=outfile)
    log("Dev set:", logfile=outfile)
    log(dev_set, logfile=outfile)

    # The gold standard, put into a sparse matrix
    y_test = lists2sparse(y_test, dev_set.size(1)).tocsr(copy=False)
    # The known items in the dev set, kept to avoid recomputation
    x_test = lists2sparse(dev_set.data, dev_set.size(1)).tocsr(copy=False)

    for model in MODELS:
        log('=' * 78, logfile=outfile)
        log(model, logfile=outfile)

        # Training
        model.train(train_set)

        # Prediction
        y_pred = model.predict(dev_set)

        # Sanity fix #1: densify sparse predictions, expect an array
        if sp.issparse(y_pred):
            y_pred = y_pred.toarray()
        else:
            y_pred = np.asarray(y_pred)

        # Sanity fix #2: remove predictions for items already present
        y_pred = remove_non_missing(y_pred, x_test, copy=False)

        # Evaluate metrics
        results = evaluate(y_test, y_pred, METRICS, batch_size=1000)

        log("-" * 78, logfile=outfile)
        for metric, stats in zip(METRICS, results):
            log("* {}: {} ({})".format(metric, *stats), logfile=outfile)
        log('=' * 78, logfile=outfile)
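
# Reference sketch (hypothetical): `remove_non_missing` is imported from the
# evaluation utilities and its body is not shown in this file. Assuming
# `y_pred` is a dense score array and `x_test` a binary CSR matrix of items
# already known in the dev set, the masking it performs could look like this:
def _remove_non_missing_sketch(y_pred, x_test, copy=True):
    """ Zero out scores for items that are already present in x_test """
    if copy:
        y_pred = y_pred.copy()
    # (1 - x_test) is 0 exactly where an item is already known,
    # so known items can never appear among the top predictions
    y_pred *= 1 - x_test.toarray()
    return y_pred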
def test_batching():
    """ Test whether batched evaluation yields the same results as non-batched """
    n_samples = 120
    n_classes = 10
    X, Y = make_multilabel_classification(n_samples, 20, n_classes)
    predictions = np.random.rand(n_samples, n_classes)
    metrics = [MRR(), MAP(), P(1), P(5)]
    results = evaluate(Y, predictions, metrics, batch_size=None)
    results_batched = evaluate(Y, predictions, metrics, batch_size=25)
    results_mean, results_std = zip(*results)
    results_batched_mean, results_batched_std = zip(*results_batched)
    # Compare absolute deviations, so that large negative differences
    # cannot slip through the tolerance check
    assert (np.abs(np.array(results_batched_mean)
                   - np.array(results_mean)) < EPS).all()
    assert (np.abs(np.array(results_batched_std)
                   - np.array(results_std)) < EPS).all()
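
# The test above only holds if `evaluate` aggregates per-sample metric values
# identically in both modes. A minimal sketch of such an evaluator, assuming
# each metric object is callable per batch and returns one value per sample
# (the actual `evaluate` in this repo may differ in details):
def _evaluate_sketch(y_true, y_pred, metrics, batch_size=None):
    n = y_true.shape[0]
    batch_size = batch_size or n
    results = []
    for metric in metrics:
        # Concatenate per-sample scores across batches, then reduce once,
        # so mean and std are independent of the batch size
        values = np.concatenate([
            metric(y_true[i:i + batch_size], y_pred[i:i + batch_size])
            for i in range(0, n, batch_size)
        ])
        results.append((values.mean(), values.std()))
    return results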
def main(outfile=None, min_count=None):
    """ Main function for training and evaluating AAE methods on Reuters data """
    print("Loading data from", DATA_PATH)
    bags = Bags.load_tabcomma_format(DATA_PATH, unique=True)
    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)
    train_set, dev_set, y_test = prepare_evaluation(bags, min_count=min_count)
    log("Train set:", logfile=outfile)
    log(train_set, logfile=outfile)
    log("Dev set:", logfile=outfile)
    log(dev_set, logfile=outfile)

    # The gold standard, put into a sparse matrix
    y_test = lists2sparse(y_test, dev_set.size(1)).tocsr(copy=False)
    # The known items in the dev set, kept to avoid recomputation
    x_test = lists2sparse(dev_set.data, dev_set.size(1)).tocsr(copy=False)

    for model in MODELS:
        log('=' * 78, logfile=outfile)
        log(model, logfile=outfile)

        # Training
        model.train(train_set)

        # Prediction
        y_pred = model.predict(dev_set)

        # Sanity fix #1: densify sparse predictions, expect an array
        if sp.issparse(y_pred):
            y_pred = y_pred.toarray()
        else:
            y_pred = np.asarray(y_pred)

        # Sanity fix #2: remove predictions for items already present
        y_pred = remove_non_missing(y_pred, x_test, copy=False)

        # Evaluate metrics
        results = evaluate(y_test, y_pred, METRICS)

        log("-" * 78, logfile=outfile)
        for metric, stats in zip(METRICS, results):
            log("* {}: {} ({})".format(metric, *stats), logfile=outfile)
        log('=' * 78, logfile=outfile)
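
# Reference sketch (hypothetical): `lists2sparse` converts lists of item
# indices into a binary sparse matrix with one row per bag. A minimal
# reimplementation under that assumption; the real helper is imported
# from the dataset/evaluation utilities:
def _lists2sparse_sketch(lists_of_items, n_items):
    rows, cols = [], []
    for row, items in enumerate(lists_of_items):
        rows.extend([row] * len(items))
        cols.extend(items)
    data = np.ones(len(rows), dtype=np.float32)
    return sp.coo_matrix((data, (rows, cols)),
                         shape=(len(lists_of_items), n_items))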
def main(outfile=None, min_count=None, aggregate=None):
    """ Main function for training and evaluating AAE methods on MPD data """
    print("Loading data from", DATA_PATH)
    playlists = playlists_from_slices(DATA_PATH, n_jobs=4)
    print("Unpacking json data...")
    bags_of_tracks, pids, side_info = unpack_playlists_for_models_concatenated(playlists)
    del playlists
    bags = Bags(data=bags_of_tracks, owners=pids, owner_attributes=side_info)

    if args.compute_mi:
        from sklearn.metrics import mutual_info_score
        print("Computing MI")
        X = bags.build_vocab(min_count=args.min_count, max_features=None).tocsr()
        C = X.T @ X
        print("(Pairwise) mutual information:",
              mutual_info_score(None, None, contingency=C))
        # Exit in this case
        print("Bye.")
        exit(0)

    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)
    train_set, dev_set, y_test = prepare_evaluation(bags, n_items=N_ITEMS,
                                                    min_count=min_count)
    log("Train set:", logfile=outfile)
    log(train_set, logfile=outfile)
    log("Dev set:", logfile=outfile)
    log(dev_set, logfile=outfile)

    # The gold standard, put into a sparse matrix
    y_test = lists2sparse(y_test, dev_set.size(1)).tocsr(copy=False)
    # The known items in the dev set, kept to avoid recomputation
    x_test = lists2sparse(dev_set.data, dev_set.size(1)).tocsr(copy=False)

    for model in MODELS:
        log('=' * 78, logfile=outfile)
        log(model, logfile=outfile)
        log(model.model_params, logfile=outfile)

        # Training
        model.train(train_set)
        print("Training finished")

        # Prediction
        y_pred = model.predict(dev_set)
        print("Prediction finished")
        print("Prediction sparse?:", sp.issparse(y_pred))

        # Sanity fix #1: densify sparse predictions, expect an array
        if sp.issparse(y_pred):
            y_pred = y_pred.toarray()
        else:
            y_pred = np.asarray(y_pred)

        # Sanity fix #2: remove predictions for items already present
        print("Removing non-missing items...")
        y_pred = remove_non_missing(y_pred, x_test, copy=False)

        # Evaluate metrics
        print("Evaluating...")
        results = evaluate(y_test, y_pred, METRICS, batch_size=500)

        log("-" * 78, logfile=outfile)
        for metric, stats in zip(METRICS, results):
            log("* {}: {} ({})".format(metric, *stats), logfile=outfile)
        log('=' * 78, logfile=outfile)
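
# Sketch of the MI shortcut used above: for a binary bag matrix X
# (playlists x items), X.T @ X is the item-item co-occurrence matrix, which
# sklearn's mutual_info_score accepts directly as a contingency table.
# Toy check with hypothetical data:
def _pairwise_mi_sketch():
    from sklearn.metrics import mutual_info_score
    X = sp.csr_matrix(np.array([[1, 0, 1],
                                [1, 1, 0],
                                [0, 1, 1]], dtype=np.int64))
    C = X.T @ X  # item-item co-occurrence counts
    return mutual_info_score(None, None, contingency=C)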
def main(outfile=None, min_count=None, drop=1):
    """ Main function for training and evaluating AAE methods on Reuters data """
    print("Loading data from", DATA_PATH)
    bags = Bags.load_tabcomma_format(DATA_PATH, unique=True)

    if args.compute_mi:
        from aaerec.utils import compute_mutual_info
        print("[MI] Dataset: Reuters")
        print("[MI] min_count:", min_count)
        tmp = bags.build_vocab(min_count=min_count, max_features=None)
        mi = compute_mutual_info(tmp, conditions=None, include_labels=True,
                                 normalize=True)
        with open('mi.csv', 'a') as mifile:
            print('Reuters', min_count, mi, sep=',', file=mifile)
        print("=" * 78)
        exit(0)

    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)
    train_set, dev_set, y_test = prepare_evaluation(bags, min_count=min_count,
                                                    drop=drop)
    log("Train set:", logfile=outfile)
    log(train_set, logfile=outfile)
    log("Dev set:", logfile=outfile)
    log(dev_set, logfile=outfile)

    # The gold standard, put into a sparse matrix
    y_test = lists2sparse(y_test, dev_set.size(1)).tocsr(copy=False)
    # The known items in the dev set, kept to avoid recomputation
    x_test = lists2sparse(dev_set.data, dev_set.size(1)).tocsr(copy=False)

    for model in MODELS:
        log('=' * 78, logfile=outfile)
        log(model, logfile=outfile)

        # Training
        model.train(train_set)

        # Prediction
        y_pred = model.predict(dev_set)

        # Sanity fix #1: densify sparse predictions, expect an array
        if sp.issparse(y_pred):
            y_pred = y_pred.toarray()
        else:
            y_pred = np.asarray(y_pred)

        # Sanity fix #2: remove predictions for items already present
        y_pred = remove_non_missing(y_pred, x_test, copy=False)

        # Evaluate metrics
        results = evaluate(y_test, y_pred, METRICS)

        log("-" * 78, logfile=outfile)
        for metric, stats in zip(METRICS, results):
            log("* {}: {} ({})".format(metric, *stats), logfile=outfile)
        log('=' * 78, logfile=outfile)
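
# Sketch (hypothetical): `prepare_evaluation` with `drop` appears to withhold
# items per bag, so the dropped items become the gold standard y_test while
# the rest stay visible in dev_set. Assuming `drop` is the number of items
# withheld per bag, the per-bag split could look like this (the real split
# also applies min_count filtering and vocabulary building):
def _drop_split_sketch(items, drop=1, rng=np.random):
    items = list(items)
    rng.shuffle(items)
    # First part stays visible to the model, second part is the gold
    return items[drop:], items[:drop]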