Example #1
# Assumed imports for this snippet; the module path for IRGANRecommender is
# inferred and may differ in the actual project. W2V_PATH and W2V_IS_BINARY
# are module-level constants (see Example #5).
import argparse

from gensim.models import KeyedVectors

from aaerec.condition import ConditionList, PretrainedWordEmbeddingCondition
from aaerec.datasets import Bags
from aaerec.evaluation import Evaluation
from irgan.recommender import IRGANRecommender


def main():

    CONFIG = {
        'pub': ('/data21/lgalke/datasets/citations_pmc.tsv', 2011, 50),
        'eco': ('/data21/lgalke/datasets/econbiz62k.tsv', 2012, 1)
    }

    print("Loading pre-trained embedding", W2V_PATH)
    vectors = KeyedVectors.load_word2vec_format(W2V_PATH, binary=W2V_IS_BINARY)

    CONDITIONS = ConditionList([
        ('title', PretrainedWordEmbeddingCondition(vectors, dim=0))
    ])

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument('data', type=str, choices=['pub', 'eco'])
    args = PARSER.parse_args()
    DATA = CONFIG[args.data]
    logfile = '/data22/ivagliano/test-irgan/' + args.data + '-decoder.log'
    bags = Bags.load_tabcomma_format(DATA[0])
    c_year = DATA[1]

    evaluate = Evaluation(bags,
                          year=c_year,
                          logfile=logfile).setup(min_count=DATA[2],
                                                 min_elements=2)
    user_num = evaluate.train_set.size()[0] + evaluate.test_set.size()[0]
    item_num = evaluate.train_set.size()[1]
    models = [IRGANRecommender(user_num, item_num,
                               g_epochs=1, d_epochs=1, n_epochs=1,
                               conditions=CONDITIONS)]
    evaluate(models)
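
A minimal sketch of how this script would be launched, assuming it is saved as eval_irgan.py (the filename is hypothetical):

    # Evaluate IRGAN on the PubMed citation data with the 'pub' configuration
    python3 eval_irgan.py pub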
Example #2
# Assumed imports for this snippet; the module path for VAERecommender is
# inferred and may differ in the actual project. W2V_PATH and W2V_IS_BINARY
# are module-level constants (see Example #5).
import argparse
import itertools

from gensim.models import KeyedVectors

from aaerec.condition import ConditionList, PretrainedWordEmbeddingCondition
from aaerec.datasets import Bags
from aaerec.evaluation import Evaluation
from aaerec.vae import VAERecommender


def main():
    """ Evaluates the VAE Recommender """
    CONFIG = {
        'pub': ('/data21/lgalke/datasets/citations_pmc.tsv', 2011, 50),
        'eco': ('/data21/lgalke/datasets/econbiz62k.tsv', 2012, 1)
    }

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument('data', type=str, choices=['pub', 'eco'])
    args = PARSER.parse_args()
    DATA = CONFIG[args.data]
    logfile = '/data22/ivagliano/test-vae/' + args.data + '-hyperparams-opt.log'
    bags = Bags.load_tabcomma_format(DATA[0])
    c_year = DATA[1]

    evaluate = Evaluation(bags, year=c_year,
                          logfile=logfile).setup(min_count=DATA[2],
                                                 min_elements=2)
    print("Loading pre-trained embedding", W2V_PATH)
    vectors = KeyedVectors.load_word2vec_format(W2V_PATH, binary=W2V_IS_BINARY)

    params = {
        #'n_epochs': 10,
        'batch_size': 100,
        'optimizer': 'adam',
        # 'normalize_inputs': True,
    }

    CONDITIONS = ConditionList([('title',
                                 PretrainedWordEmbeddingCondition(vectors))])

    # 100 hidden units, 200 epochs, bernoulli prior, normalized inputs -> 0.174
    # activations = ['ReLU','SELU']
    # lrs = [(0.001, 0.0005), (0.001, 0.001)]
    hcs = [(100, 50), (300, 100)]
    epochs = [50, 100, 200, 500]

    # dropouts = [(.2,.2), (.1,.1), (.1, .2), (.25, .25), (.3,.3)] # .2,.2 is best
    # priors = ['categorical'] # gauss is best
    # normal = [True, False]
    # bernoulli was good, let's see if categorical is better... No
    models = [
        VAERecommender(conditions=CONDITIONS,
                       **params,
                       n_hidden=hc[0],
                       n_code=hc[1],
                       n_epochs=e) for hc, e in itertools.product(hcs, epochs)
    ]
    # models = [VAERecommender(conditions=CONDITIONS, **params)]
    evaluate(models)
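
With the two (n_hidden, n_code) pairs and four epoch budgets above, itertools.product yields 2 × 4 = 8 VAERecommender configurations, all scored in a single evaluation run. Assuming the file is saved as eval_vae.py (a hypothetical name), a run on the EconBiz data would look like:

    python3 eval_vae.py eco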
Example #3
# Assumed imports for this snippet; helpers such as log, lists2sparse,
# remove_non_missing, evaluate and prepare_evaluation, plus the module-level
# DATA_PATH, MODELS and METRICS, come from the surrounding script.
import numpy as np
import scipy.sparse as sp

from aaerec.datasets import Bags


def main(outfile=None, min_count=None):
    """ Main function for training and evaluating AAE methods on Reuters data """
    print("Loading data from", DATA_PATH)
    bags = Bags.load_tabcomma_format(DATA_PATH, unique=True)
    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)
    train_set, dev_set, y_test = prepare_evaluation(bags,
                                                    min_count=min_count)

    log("Train set:", logfile=outfile)
    log(train_set, logfile=outfile)

    log("Dev set:", logfile=outfile)
    log(dev_set, logfile=outfile)

    # The gold standard (as a sparse matrix)
    y_test = lists2sparse(y_test, dev_set.size(1)).tocsr(copy=False)

    # The items already known in the test set, cached to avoid recomputation
    x_test = lists2sparse(dev_set.data, dev_set.size(1)).tocsr(copy=False)

    for model in MODELS:
        log('=' * 78, logfile=outfile)
        log(model, logfile=outfile)

        # Training
        model.train(train_set)

        # Prediction
        y_pred = model.predict(dev_set)

        # Sanity fix #1: densify sparse predictions; downstream code expects an array
        if sp.issparse(y_pred):
            y_pred = y_pred.toarray()
        else:
            y_pred = np.asarray(y_pred)

        # Sanity fix #2: drop predictions for items that are already present
        y_pred = remove_non_missing(y_pred, x_test, copy=False)

        # Evaluate metrics
        results = evaluate(y_test, y_pred, METRICS)

        log("-" * 78, logfile=outfile)
        for metric, stats in zip(METRICS, results):
            log("* {}: {} ({})".format(metric, *stats), logfile=outfile)

        log('=' * 78, logfile=outfile)
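
For reference, a minimal sketch of what the lists2sparse helper could look like, assuming it maps lists of item indices to a binary sparse matrix (hypothetical; the project's actual implementation may differ):

    import scipy.sparse as sp

    def lists2sparse(lists, n_items):
        # One row per bag; a 1 marks that the item index occurs in that bag
        rows, cols = [], []
        for row, items in enumerate(lists):
            for item in items:
                rows.append(row)
                cols.append(item)
        data = [1] * len(rows)
        return sp.coo_matrix((data, (rows, cols)),
                             shape=(len(lists), n_items))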
Example #4
import numpy as np
from aaerec.datasets import Bags
# path = '../Data/Economics/econbiz62k.tsv'
path = '../Data/PMC/citations_pmc.tsv'
bags = Bags.load_tabcomma_format(path, unique=True)
bags = bags.build_vocab(apply=True)

csr = bags.tocsr()
print("N ratings:", csr.sum())

column_sums = csr.sum(0).flatten()
row_sums = csr.sum(1).flatten()

print(column_sums.shape)
print(row_sums.shape)


FMT = "N={}, Min={}, Max={}, Median={}, Mean={}, Std={}"

def compute_stats(A):
    # A is a 1xN numpy matrix (csr.sum(...).flatten() stays two-dimensional),
    # so shape[1] is the count and the median must be indexed with [0, 0]
    return A.shape[1], A.min(), A.max(), np.median(A, axis=1)[0, 0], A.mean(), A.std()


print("Items per document")
print(FMT.format(*compute_stats(row_sums)))
print("Documents per item")
print(FMT.format(*compute_stats(column_sums)))
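
As a quick sanity check on the shapes involved, a minimal sketch using a toy SciPy CSR matrix:

    import numpy as np
    import scipy.sparse as sp

    m = sp.csr_matrix(np.array([[1, 0, 1],
                                [0, 1, 1]]))
    print(m.sum(0).flatten().shape)  # (1, 3): one column sum per item
    print(m.sum(1).flatten().shape)  # (1, 2): one row sum per document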

Example #5
# Assumed imports for this snippet; module paths for Evaluation, Countbased
# and SVDRecommender are inferred and may differ in the actual project.
import argparse

from gensim.models import KeyedVectors

from aaerec.baselines import Countbased
from aaerec.datasets import Bags
from aaerec.evaluation import Evaluation
from aaerec.svd import SVDRecommender

W2V_PATH = "/data21/lgalke/vectors/GoogleNews-vectors-negative300.bin.gz"
W2V_IS_BINARY = True

PARSER = argparse.ArgumentParser()
PARSER.add_argument('dataset', type=str, help='path to dataset')
PARSER.add_argument('year', type=int, help='First year of the testing set.')
PARSER.add_argument('-m',
                    '--min-count',
                    type=int,
                    help='Pruning parameter',
                    default=50)
PARSER.add_argument('-o', '--outfile', type=str, default=None)

ARGS = PARSER.parse_args()

DATASET = Bags.load_tabcomma_format(ARGS.dataset, unique=True)

EVAL = Evaluation(DATASET, ARGS.year, logfile=ARGS.outfile)
EVAL.setup(min_count=ARGS.min_count, min_elements=2)
print("Loading pre-trained embedding", W2V_PATH)
VECTORS = KeyedVectors.load_word2vec_format(W2V_PATH, binary=W2V_IS_BINARY)

BASELINES = [
    # RandomBaseline(),
    # MostPopular(),
    Countbased(),
    SVDRecommender(1000, use_title=False),
]

ae_params = {
    'n_code': 50,
    # (further autoencoder hyperparameters are truncated in the original excerpt)
}
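
A minimal sketch of a call to this script, assuming it is saved as eval_baselines.py (the filename is hypothetical; the dataset path and year follow Example #1):

    python3 eval_baselines.py /data21/lgalke/datasets/citations_pmc.tsv 2011 -m 50 -o baselines.log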
Example #6
            "owner_id":
            "document",
            "fields": ["descriptor"],
            "target_names": ["mesh"],
            "path":
            os.path.join(
                "/data22/ggerstenkorn/citation_data_preprocessing/final_data/",
                "mesh.csv")
        }

        # With no metadata or just titles
        # bags = Bags.load_tabcomma_format(path, unique=True)
        # With more metadata for PubMed (generic conditions)
        bags = Bags.load_tabcomma_format(path,
                                         unique=True,
                                         owner_str="pmId",
                                         set_str="cited",
                                         meta_data_dic=mtdt_dic)
    else:
        bags = Bags.load_tabcomma_format(path, unique=True)

# Keep only papers with at least min_x_cit and at most max_x_cit citations
citations = from_to_key(citations, min_x_cit, max_x_cit)
citations = collections.OrderedDict(sorted(citations.items()))
print("Total documents/labels (citation/occurrence distribution): {}".format(
    np.array(list(citations.values())).sum()))

if dataset in ("pubmed", "acm", "dblp"):
    x_dim = "Citations"
else:
    x_dim = "Occurrences"
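
The from_to_key helper is not shown in this excerpt; a minimal sketch of what it could do, assuming citations is a dict mapping citation counts to document counts (hypothetical; the project's implementation may differ):

    def from_to_key(dist, lo, hi):
        # Keep only entries whose key (here: a citation count) lies in [lo, hi]
        return {k: v for k, v in dist.items() if lo <= k <= hi}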
Example #7
import argparse

import numpy as np
from sklearn.metrics import mutual_info_score

from aaerec.datasets import Bags
from aaerec.condition import ConditionList, CountCondition
from aaerec.utils import compute_mutual_info

PARSER = argparse.ArgumentParser()
PARSER.add_argument('dataset', type=str,
                    help='path to dataset')
PARSER.add_argument('-m', '--min-count', type=int,
                    help='Pruning parameter', default=None)
PARSER.add_argument('-M', '--max-features', type=int,
                    help='Max features', default=None)
ARGS = PARSER.parse_args()


# MI_CONDITIONS = ConditionList([('title', CountCondition(max_features=100000))])
MI_CONDITIONS = None

print("Computing Mutual Info with args")
print(ARGS)

# With no metadata or just titles
BAGS = Bags.load_tabcomma_format(ARGS.dataset, unique=True)\
    .build_vocab(min_count=ARGS.min_count, max_features=ARGS.max_features)

mi = compute_mutual_info(BAGS, MI_CONDITIONS, include_labels=True, normalize=True)
with open('mi.csv', 'a') as mifile:
    print('CITREC', ARGS.min_count, mi, sep=',', file=mifile)
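
Assuming the file is saved as compute_mi.py (a hypothetical name), a run on the PubMed citation data would look like:

    python3 compute_mi.py /data21/lgalke/datasets/citations_pmc.tsv -m 50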
Example #8
# Assumed imports, as in Example #3; `args` is expected to be parsed at module
# level by the surrounding script, and helpers such as log, lists2sparse,
# remove_non_missing, evaluate and prepare_evaluation come from the project.
import sys

import numpy as np
import scipy.sparse as sp

from aaerec.datasets import Bags


def main(outfile=None, min_count=None, drop=1):
    """ Main function for training and evaluating AAE methods on Reuters data """
    print("Loading data from", DATA_PATH)
    bags = Bags.load_tabcomma_format(DATA_PATH, unique=True)
    if args.compute_mi:
        from aaerec.utils import compute_mutual_info
        print("[MI] Dataset: Reuters")
        print("[MI] min Count:", min_count)
        tmp = bags.build_vocab(min_count=min_count, max_features=None)
        mi = compute_mutual_info(tmp,
                                 conditions=None,
                                 include_labels=True,
                                 normalize=True)
        with open('mi.csv', 'a') as mifile:
            print('Reuters', min_count, mi, sep=',', file=mifile)
        print("=" * 78)
        sys.exit(0)
    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)
    train_set, dev_set, y_test = prepare_evaluation(bags,
                                                    min_count=min_count,
                                                    drop=drop)

    log("Train set:", logfile=outfile)
    log(train_set, logfile=outfile)

    log("Dev set:", logfile=outfile)
    log(dev_set, logfile=outfile)

    # The gold standard (as a sparse matrix)
    y_test = lists2sparse(y_test, dev_set.size(1)).tocsr(copy=False)

    # The items already known in the test set, cached to avoid recomputation
    x_test = lists2sparse(dev_set.data, dev_set.size(1)).tocsr(copy=False)

    for model in MODELS:
        log('=' * 78, logfile=outfile)
        log(model, logfile=outfile)

        # Training
        model.train(train_set)

        # Prediction
        y_pred = model.predict(dev_set)

        # Sanity fix #1: densify sparse predictions; downstream code expects an array
        if sp.issparse(y_pred):
            y_pred = y_pred.toarray()
        else:
            y_pred = np.asarray(y_pred)

        # Sanity fix #2: drop predictions for items that are already present
        y_pred = remove_non_missing(y_pred, x_test, copy=False)

        # Evaluate metrics
        results = evaluate(y_test, y_pred, METRICS)

        log("-" * 78, logfile=outfile)
        for metric, stats in zip(METRICS, results):
            log("* {}: {} ({})".format(metric, *stats), logfile=outfile)

        log('=' * 78, logfile=outfile)
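
For reference, a minimal sketch of the remove_non_missing step, assuming its job is to zero out scores for items already known in the test bags (hypothetical; the project's helper may differ):

    import numpy as np

    def remove_non_missing(y_pred, x_test, copy=True):
        # Zero the predictions at positions already known in x_test, so the
        # evaluation only scores genuinely missing items
        if copy:
            y_pred = y_pred.copy()
        mask = x_test.toarray() > 0
        y_pred[mask] = 0.0
        return y_pred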