def main():
    """Evaluate the IRGAN recommender on a chosen citation dataset.

    Reads the dataset key ('pub' or 'eco') from the command line, builds
    the title-embedding condition, and runs a single-configuration
    evaluation, logging to a per-dataset logfile.
    """
    # (path, first-test-year, min-count) per dataset key
    dataset_config = {
        'pub': ('/data21/lgalke/datasets/citations_pmc.tsv', 2011, 50),
        'eco': ('/data21/lgalke/datasets/econbiz62k.tsv', 2012, 1),
    }

    print("Loading pre-trained embedding", W2V_PATH)
    vectors = KeyedVectors.load_word2vec_format(W2V_PATH, binary=W2V_IS_BINARY)
    # NOTE(review): dim=0 differs from the sibling VAE driver, which omits
    # this argument -- confirm the intended embedding dimension handling.
    conditions = ConditionList([
        ('title', PretrainedWordEmbeddingCondition(vectors, dim=0)),
    ])

    parser = argparse.ArgumentParser()
    parser.add_argument('data', type=str, choices=['pub', 'eco'])
    args = parser.parse_args()

    path, first_year, min_count = dataset_config[args.data]
    logfile = '/data22/ivagliano/test-irgan/' + args.data + '-decoder.log'

    bags = Bags.load_tabcomma_format(path)
    evaluate = Evaluation(bags, year=first_year,
                          logfile=logfile).setup(min_count=min_count,
                                                 min_elements=2)

    # User count spans both splits; item count comes from the training split.
    n_users = evaluate.train_set.size()[0] + evaluate.test_set.size()[0]
    n_items = evaluate.train_set.size()[1]
    models = [IRGANRecommender(n_users, n_items, g_epochs=1, d_epochs=1,
                               n_epochs=1, conditions=conditions)]
    evaluate(models)
def main():
    """Evaluate the VAE recommender over a small hyper-parameter grid.

    Reads the dataset key ('pub' or 'eco') from the command line and
    trains one VAERecommender per (hidden-size, code-size) x epochs
    combination, logging results to a per-dataset logfile.
    """
    # (path, first-test-year, min-count) per dataset key
    dataset_config = {
        'pub': ('/data21/lgalke/datasets/citations_pmc.tsv', 2011, 50),
        'eco': ('/data21/lgalke/datasets/econbiz62k.tsv', 2012, 1),
    }

    parser = argparse.ArgumentParser()
    parser.add_argument('data', type=str, choices=['pub', 'eco'])
    args = parser.parse_args()

    path, first_year, min_count = dataset_config[args.data]
    logfile = '/data22/ivagliano/test-vae/' + args.data + '-hyperparams-opt.log'

    bags = Bags.load_tabcomma_format(path)
    evaluate = Evaluation(bags, year=first_year,
                          logfile=logfile).setup(min_count=min_count,
                                                 min_elements=2)

    print("Loading pre-trained embedding", W2V_PATH)
    vectors = KeyedVectors.load_word2vec_format(W2V_PATH, binary=W2V_IS_BINARY)

    # Fixed training settings; the grid below varies architecture and epochs.
    params = {
        # 'n_epochs': 10,
        'batch_size': 100,
        'optimizer': 'adam',
        # 'normalize_inputs': True,
    }
    conditions = ConditionList(
        [('title', PretrainedWordEmbeddingCondition(vectors))])

    # Notes from earlier runs, kept for reference:
    # 100 hidden units, 200 epochs, bernoulli prior, normalized inputs -> 0.174
    # dropouts (.2, .2) were best of those tried; gauss prior beat the others.
    hidden_code_sizes = [(100, 50), (300, 100)]
    epoch_choices = [50, 100, 200, 500]

    import itertools
    models = [
        VAERecommender(conditions=conditions, **params,
                       n_hidden=n_hidden, n_code=n_code, n_epochs=n_epochs)
        for (n_hidden, n_code), n_epochs
        in itertools.product(hidden_code_sizes, epoch_choices)
    ]
    evaluate(models)
def main(outfile=None, min_count=None):
    """Train and evaluate each model in MODELS on the Reuters data.

    Args:
        outfile: optional logfile path forwarded to ``log``.
        min_count: pruning threshold forwarded to ``prepare_evaluation``.
    """
    print("Loading data from", DATA_PATH)
    bags = Bags.load_tabcomma_format(DATA_PATH, unique=True)
    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)

    train_set, dev_set, y_test = prepare_evaluation(bags, min_count=min_count)

    log("Train set:", logfile=outfile)
    log(train_set, logfile=outfile)
    log("Dev set:", logfile=outfile)
    log(dev_set, logfile=outfile)

    # THE GOLD: put the held-out items into a sparse matrix for the metrics.
    y_test = lists2sparse(y_test, dev_set.size(1)).tocsr(copy=False)
    # Known items of the dev set, precomputed once so the loop can reuse them.
    x_test = lists2sparse(dev_set.data, dev_set.size(1)).tocsr(copy=False)

    for model in MODELS:
        log('=' * 78, logfile=outfile)
        log(model, logfile=outfile)

        model.train(train_set)
        y_pred = model.predict(dev_set)

        # Sanity-fix #1: normalize predictions to a dense ndarray.
        y_pred = y_pred.toarray() if sp.issparse(y_pred) else np.asarray(y_pred)
        # Sanity-fix #2: never recommend items the user already has.
        y_pred = remove_non_missing(y_pred, x_test, copy=False)

        results = evaluate(y_test, y_pred, METRICS)

        log("-" * 78, logfile=outfile)
        for metric, stats in zip(METRICS, results):
            log("* {}: {} ({})".format(metric, *stats), logfile=outfile)
        log('=' * 78, logfile=outfile)
import numpy as np

from aaerec.datasets import Bags

# path = '../Data/Economics/econbiz62k.tsv'
path = '../Data/PMC/citations_pmc.tsv'

bags = Bags.load_tabcomma_format(path, unique=True)
bags = bags.build_vocab(apply=True)
csr = bags.tocsr()
print("N ratings:", csr.sum())

# Marginal sums: per-item (column) and per-document (row) counts.
column_sums = csr.sum(0).flatten()
row_sums = csr.sum(1).flatten()
print(column_sums.shape)
print(row_sums.shape)

# NOTE(review): missing comma after "Max={}" -- cosmetic, left as-is.
FMT = "N={}, Min={}, Max={} Median={}, Mean={}, Std={}"


def compute_stats(A):
    """Return (N, min, max, median, mean, std) for a 1xN matrix of counts."""
    median = np.median(A, axis=1)[0, 0]
    return A.shape[1], A.min(), A.max(), median, A.mean(), A.std()


print("Items per document")
print(FMT.format(*compute_stats(row_sums)))
print("Documents per item")
print(FMT.format(*compute_stats(column_sums)))
W2V_PATH = "/data21/lgalke/vectors/GoogleNews-vectors-negative300.bin.gz" W2V_IS_BINARY = True PARSER = argparse.ArgumentParser() PARSER.add_argument('dataset', type=str, help='path to dataset') PARSER.add_argument('year', type=int, help='First year of the testing set.') PARSER.add_argument('-m', '--min-count', type=int, help='Pruning parameter', default=50) PARSER.add_argument('-o', '--outfile', type=str, default=None) ARGS = PARSER.parse_args() DATASET = Bags.load_tabcomma_format(ARGS.dataset, unique=True) EVAL = Evaluation(DATASET, ARGS.year, logfile=ARGS.outfile) EVAL.setup(min_count=ARGS.min_count, min_elements=2) print("Loading pre-trained embedding", W2V_PATH) VECTORS = KeyedVectors.load_word2vec_format(W2V_PATH, binary=W2V_IS_BINARY) BASELINES = [ # RandomBaseline(), # MostPopular(), Countbased(), SVDRecommender(1000, use_title=False), ] ae_params = { 'n_code': 50,
"owner_id": "document", "fields": ["descriptor"], "target_names": ["mesh"], "path": os.path.join( "/data22/ggerstenkorn/citation_data_preprocessing/final_data/", "mesh.csv") } # With no metadata or just titles # bags = Bags.load_tabcomma_format(path, unique=True) # With more metadata for PubMed (generic conditions) bags = Bags.load_tabcomma_format(path, unique=True, owner_str="pmId", set_str="cited", meta_data_dic=mtdt_dic) else: bags = Bags.load_tabcomma_format(path, unique=True) # only papers with min min_x_cit and max max_x_cit citations citations = from_to_key(citations, min_x_cit, max_x_cit) citations = collections.OrderedDict(sorted(citations.items())) print("Total documents/labels (citation/occurrence distribution): {}".format( np.array(list(citations.values())).sum())) if dataset == "pubmed" or dataset == "acm" or dataset == "dblp": x_dim = "Citations" else: x_dim = "Occurrences"
import numpy as np
from sklearn.metrics import mutual_info_score

from aaerec.datasets import Bags
from aaerec.condition import ConditionList, CountCondition
from aaerec.utils import compute_mutual_info

PARSER = argparse.ArgumentParser()
PARSER.add_argument('dataset', type=str, help='path to dataset')
PARSER.add_argument('-m', '--min-count', type=int,
                    help='Pruning parameter', default=None)
PARSER.add_argument('-M', '--max-features', type=int,
                    help='Max features', default=None)
ARGS = PARSER.parse_args()

# Conditions to include in the MI computation; None means labels only.
# MI_CONDITIONS = ConditionList([('title', CountCondition(max_features=100000))])
MI_CONDITIONS = None

print("Computing Mutual Info with args")
print(ARGS)

# With no metadata or just titles
BAGS = Bags.load_tabcomma_format(ARGS.dataset, unique=True)\
    .build_vocab(min_count=ARGS.min_count, max_features=ARGS.max_features)

mi = compute_mutual_info(BAGS, MI_CONDITIONS,
                         include_labels=True, normalize=True)

# Append one CSV row per run so repeated invocations accumulate results.
with open('mi.csv', 'a') as mifile:
    print('CITREC', ARGS.min_count, mi, sep=',', file=mifile)
def main(outfile=None, min_count=None, drop=1):
    """Train and evaluate each model in MODELS on the Reuters data.

    When the command-line flag ``--compute-mi`` is set, only computes the
    mutual-information score, appends it to ``mi.csv``, and exits.

    Args:
        outfile: optional logfile path forwarded to ``log``.
        min_count: pruning threshold forwarded to ``prepare_evaluation``.
        drop: drop parameter forwarded to ``prepare_evaluation``.
    """
    print("Loading data from", DATA_PATH)
    bags = Bags.load_tabcomma_format(DATA_PATH, unique=True)

    # NOTE(review): `args` is a module-level namespace not visible in this
    # chunk -- confirm it is always parsed before main() is called.
    if args.compute_mi:
        from aaerec.utils import compute_mutual_info
        print("[MI] Dataset: Reuters")
        print("[MI] min Count:", min_count)
        tmp = bags.build_vocab(min_count=min_count, max_features=None)
        mi = compute_mutual_info(tmp, conditions=None,
                                 include_labels=True, normalize=True)
        with open('mi.csv', 'a') as mifile:
            print('Reuters', min_count, mi, sep=',', file=mifile)
        print("=" * 78)
        exit(0)

    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)

    train_set, dev_set, y_test = prepare_evaluation(bags,
                                                    min_count=min_count,
                                                    drop=drop)

    log("Train set:", logfile=outfile)
    log(train_set, logfile=outfile)
    log("Dev set:", logfile=outfile)
    log(dev_set, logfile=outfile)

    # THE GOLD: put the held-out items into a sparse matrix for the metrics.
    y_test = lists2sparse(y_test, dev_set.size(1)).tocsr(copy=False)
    # Known items of the dev set, precomputed once so the loop can reuse them.
    x_test = lists2sparse(dev_set.data, dev_set.size(1)).tocsr(copy=False)

    for model in MODELS:
        log('=' * 78, logfile=outfile)
        log(model, logfile=outfile)

        model.train(train_set)
        y_pred = model.predict(dev_set)

        # Sanity-fix #1: normalize predictions to a dense ndarray.
        y_pred = y_pred.toarray() if sp.issparse(y_pred) else np.asarray(y_pred)
        # Sanity-fix #2: never recommend items the user already has.
        y_pred = remove_non_missing(y_pred, x_test, copy=False)

        results = evaluate(y_test, y_pred, METRICS)

        log("-" * 78, logfile=outfile)
        for metric, stats in zip(METRICS, results):
            log("* {}: {} ({})".format(metric, *stats), logfile=outfile)
        log('=' * 78, logfile=outfile)