def main(year, min_count=None, outfile=None, drop=1):
    """Train and evaluate AAE methods on IREON (fiv) data.

    :param year: split year handed to ``Evaluation`` (train = before, test = after).
    :param min_count: pruning parameter for vocabulary construction.
    :param outfile: path of the logfile that results are appended to.
    :param drop: fraction/number of items dropped per test instance.
    """
    # Cleaning mode: preprocess the raw dump once, write it out, and stop.
    # Fix: `if (CLEAN == True)` replaced by the idiomatic truthiness test (PEP 8).
    if CLEAN:
        print("Loading data from", DATA_PATH)
        papers = load(DATA_PATH)
        print("Cleaning data...")
        clean(CLEAN_DATA_PATH, papers)
        print("Clean data in {}".format(CLEAN_DATA_PATH))
        return

    print("Loading data from", CLEAN_DATA_PATH)
    papers = load(CLEAN_DATA_PATH)
    print("Unpacking IREON data...")
    # Variant without conditions kept for reference:
    # bags_of_papers, ids, side_info = unpack_papers(papers)
    bags_of_papers, ids, side_info = unpack_papers_conditions(papers)
    # Raw records are large; free them as soon as the unpacked views exist.
    del papers
    bags = Bags(bags_of_papers, ids, side_info)

    # Optional mutual-information analysis only; appends to mi.csv and exits.
    # NOTE(review): relies on module-level `args` — assumed parsed at file scope.
    if args.compute_mi:
        from aaerec.utils import compute_mutual_info
        print("[MI] Dataset: IREON (fiv)")
        print("[MI] min Count:", min_count)
        tmp = bags.build_vocab(min_count=min_count, max_features=None)
        mi = compute_mutual_info(tmp, conditions=None, include_labels=True,
                                 normalize=True)
        with open('mi.csv', 'a') as mifile:
            print('IREON', min_count, mi, sep=',', file=mifile)
        print("=" * 78)
        exit(0)

    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)
    evaluation = Evaluation(bags, year, logfile=outfile)
    evaluation.setup(min_count=min_count, min_elements=2, drop=drop)

    # Use only partial citations/labels list (no additional metadata)
    with open(outfile, 'a') as fh:
        print("~ Partial List", "~" * 42, file=fh)
    evaluation(BASELINES + RECOMMENDERS)

    # Use additional metadata (as defined in CONDITIONS for all models but SVD,
    # which uses only titles)
    with open(outfile, 'a') as fh:
        print("~ Conditioned Models", "~" * 42, file=fh)
    evaluation(CONDITIONED_MODELS)
def main(year, dataset, min_count=None, outfile=None, drop=1):
    """ Main function for training and evaluating AAE methods on DBLP data """
    # Resolve the on-disk location of the requested corpus.
    if dataset == "dblp":
        src = DATA_PATH + "dblp-ref/"
    else:
        src = DATA_PATH + "acm.txt"
    print("Loading data from", src)
    raw_papers = papers_from_files(src, dataset, n_jobs=4)
    print("Unpacking {} data...".format(dataset))
    paper_bags, paper_ids, metadata = unpack_papers(raw_papers)
    # Raw records are no longer needed once unpacked; reclaim the memory.
    del raw_papers
    bags = Bags(paper_bags, paper_ids, metadata)

    # Optional mutual-information analysis only; appends to mi.csv and exits.
    if args.compute_mi:
        from aaerec.utils import compute_mutual_info
        print("[MI] Dataset:", dataset)
        print("[MI] min Count:", min_count)
        pruned = bags.build_vocab(min_count=min_count, max_features=None)
        mi = compute_mutual_info(pruned, conditions=None,
                                 include_labels=True, normalize=True)
        with open('mi.csv', 'a') as mi_out:
            print(dataset, min_count, mi, sep=',', file=mi_out)
        print("=" * 78)
        exit(0)

    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)
    evaluation = Evaluation(bags, year, logfile=outfile)
    evaluation.setup(min_count=min_count, min_elements=2, drop=drop)

    # Alternative runs kept here for reference:
    #   baselines + recommenders without metadata:
    #     evaluation(BASELINES + RECOMMENDERS)
    #     evaluation(RECOMMENDERS, batch_size=1000)
    #   SVD with titles:
    #     evaluation(TITLE_ENHANCED)
    with open(outfile, 'a') as out:
        print("~ Partial List + Titles + Author + Venue", "~" * 42, file=out)
    evaluation(CONDITIONED_MODELS, batch_size=1000)
# Standalone script: compute mutual information for a tab/comma dataset
# and append the result to mi.csv.
# Fix: `argparse` was used below without being imported in this block.
import argparse

import numpy as np
from sklearn.metrics import mutual_info_score

from aaerec.datasets import Bags
from aaerec.condition import ConditionList, CountCondition
from aaerec.utils import compute_mutual_info

PARSER = argparse.ArgumentParser()
PARSER.add_argument('dataset', type=str, help='path to dataset')
PARSER.add_argument('-m', '--min-count', type=int, help='Pruning parameter', default=None)
PARSER.add_argument('-M', '--max-features', type=int, help='Max features', default=None)
ARGS = PARSER.parse_args()

# Title-count condition kept for reference; None means labels only.
# MI_CONDITIONS = ConditionList([('title', CountCondition(max_features=100000))])
MI_CONDITIONS = None

print("Computing Mutual Info with args")
print(ARGS)

# With no metadata or just titles
BAGS = Bags.load_tabcomma_format(ARGS.dataset, unique=True)\
    .build_vocab(min_count=ARGS.min_count, max_features=ARGS.max_features)
mi = compute_mutual_info(BAGS, MI_CONDITIONS, include_labels=True, normalize=True)
with open('mi.csv', 'a') as mifile:
    print('CITREC', ARGS.min_count, mi, sep=',', file=mifile)
def main(outfile=None, min_count=None, drop=1):
    """ Main function for training and evaluating AAE methods on Reuters data """
    print("Loading data from", DATA_PATH)
    bags = Bags.load_tabcomma_format(DATA_PATH, unique=True)

    # Optional mutual-information analysis only; appends to mi.csv and exits.
    if args.compute_mi:
        from aaerec.utils import compute_mutual_info
        print("[MI] Dataset: Reuters")
        print("[MI] min Count:", min_count)
        pruned = bags.build_vocab(min_count=min_count, max_features=None)
        mi = compute_mutual_info(pruned, conditions=None,
                                 include_labels=True, normalize=True)
        with open('mi.csv', 'a') as mi_out:
            print('Reuters', min_count, mi, sep=',', file=mi_out)
        print("=" * 78)
        exit(0)

    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)
    training_data, dev_data, gold_lists = prepare_evaluation(bags,
                                                             min_count=min_count,
                                                             drop=drop)
    log("Train set:", logfile=outfile)
    log(training_data, logfile=outfile)
    log("Dev set:", logfile=outfile)
    log(dev_data, logfile=outfile)

    # THE GOLD (put into sparse matrix)
    gold = lists2sparse(gold_lists, dev_data.size(1)).tocsr(copy=False)
    # the known items in the test set, just to not recompute
    known = lists2sparse(dev_data.data, dev_data.size(1)).tocsr(copy=False)

    for model in MODELS:
        log('=' * 78, logfile=outfile)
        log(model, logfile=outfile)
        # Training
        model.train(training_data)
        # Prediction
        predictions = model.predict(dev_data)
        # Sanity-fix #1: densify sparse output, otherwise coerce to ndarray
        predictions = (predictions.toarray() if sp.issparse(predictions)
                       else np.asarray(predictions))
        # Sanity-fix: drop predictions for items already present in the input
        predictions = remove_non_missing(predictions, known, copy=False)
        # Evaluate metrics
        scores = evaluate(gold, predictions, METRICS)
        log("-" * 78, logfile=outfile)
        for metric, stats in zip(METRICS, scores):
            log("* {}: {} ({})".format(metric, *stats), logfile=outfile)
        log('=' * 78, logfile=outfile)
def _build_conditions(all_metadata):
    """Build the ConditionList used by conditioned models.

    V2 (``all_metadata=True``): titles + venue + authors.
    V1 (``all_metadata=False``): titles only.
    """
    if all_metadata:
        # V2 - all metadata
        return ConditionList([
            ('title', PretrainedWordEmbeddingCondition(VECTORS)),
            ('venue', PretrainedWordEmbeddingCondition(VECTORS)),
            ('author', CategoricalCondition(embedding_dim=32,
                                            reduce="sum",
                                            # vocab_size=0.01,
                                            sparse=False,
                                            embedding_on_gpu=True))
        ])
    # V1 - only title metadata
    return ConditionList([
        ('title', PretrainedWordEmbeddingCondition(VECTORS))
    ])


def _assemble_models(conditions, baselines, autoencoders,
                     conditioned_autoencoders, all_metadata):
    """Collect the model instances selected by the boolean flags.

    Returns them in the same order the original inline code produced:
    baselines, plain autoencoders, conditioned autoencoders.
    """
    models = []
    if baselines:
        # Models without metadata
        models += [
            # RandomBaseline(),
            # MostPopular(),
            Countbased(),
            SVDRecommender(1000, use_title=False)
        ]
        if not all_metadata:
            # SVD can use only titles not generic conditions
            models += [SVDRecommender(1000, use_title=True)]
    if autoencoders:
        models += [
            AAERecommender(adversarial=False, conditions=None, lr=0.001,
                           **AE_PARAMS),
            AAERecommender(adversarial=True, conditions=None, gen_lr=0.001,
                           reg_lr=0.001, **AE_PARAMS),
            VAERecommender(conditions=None, **AE_PARAMS),
            DAERecommender(conditions=None, **AE_PARAMS)
        ]
    if conditioned_autoencoders:
        # Models with metadata (metadata used as set in `conditions`)
        models += [
            AAERecommender(adversarial=False, conditions=conditions, lr=0.001,
                           **AE_PARAMS),
            AAERecommender(adversarial=True, conditions=conditions,
                           gen_lr=0.001, reg_lr=0.001, **AE_PARAMS),
            DecodingRecommender(conditions, n_epochs=100, batch_size=1000,
                                optimizer='adam', n_hidden=100, lr=0.001,
                                verbose=True),
            VAERecommender(conditions=conditions, **AE_PARAMS),
            DAERecommender(conditions=conditions, **AE_PARAMS)
        ]
    return models


def main(year, dataset, min_count=None, outfile=None, drop=1, baselines=False,
         autoencoders=False, conditioned_autoencoders=False, all_metadata=True):
    """Train and evaluate AAE methods on DBLP/ACM data.

    :param year: split year handed to ``Evaluation``.
    :param dataset: "dblp" or anything else for the ACM dump.
    :param min_count: pruning parameter for vocabulary construction.
    :param outfile: logfile that results are appended to.
    :param drop: fraction/number of items dropped per test instance.
    :param baselines: include metadata-free baseline models.
    :param autoencoders: include unconditioned autoencoder models.
    :param conditioned_autoencoders: include metadata-conditioned autoencoders.
    :param all_metadata: condition on title+venue+author (else titles only).
    """
    assert baselines or autoencoders or conditioned_autoencoders, "Please specify what to run"
    CONDITIONS = _build_conditions(all_metadata)
    ALL_MODELS = _assemble_models(CONDITIONS, baselines, autoencoders,
                                  conditioned_autoencoders, all_metadata)
    print("Finished preparing models:", *ALL_MODELS, sep='\n\t')

    path = DATA_PATH + ("dblp-ref/" if dataset == "dblp" else "acm.txt")
    print("Loading data from", path)
    papers = papers_from_files(path, dataset, n_jobs=4)
    print("Unpacking {} data...".format(dataset))
    bags_of_papers, ids, side_info = unpack_papers(papers)
    # Raw records are large; free them once the unpacked views exist.
    del papers
    bags = Bags(bags_of_papers, ids, side_info)

    # Optional mutual-information analysis only; appends to mi.csv and exits.
    # NOTE(review): relies on module-level `args` — assumed parsed at file scope.
    if args.compute_mi:
        from aaerec.utils import compute_mutual_info
        print("[MI] Dataset:", dataset)
        print("[MI] min Count:", min_count)
        tmp = bags.build_vocab(min_count=min_count, max_features=None)
        mi = compute_mutual_info(tmp, conditions=None, include_labels=True,
                                 normalize=True)
        with open('mi.csv', 'a') as mifile:
            print(dataset, min_count, mi, sep=',', file=mifile)
        print("=" * 78)
        exit(0)

    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)
    evaluation = Evaluation(bags, year, logfile=outfile)
    evaluation.setup(min_count=min_count, min_elements=2, drop=drop)
    with open(outfile, 'a') as fh:
        print("~ Partial List + Titles + Author + Venue", "~" * 42, file=fh)
    evaluation(ALL_MODELS, batch_size=1000)