Example #1
def main(argv):
    experiment_id = None
    names_pkl = None

    conf = util.configuration()

    usage_str = ('Usage: %s [-h] [--experiment=<id>] [--names=<file.pkl>]' % (sys.argv[0]))
    # Parse arguments
    try:
        opts, args = getopt.getopt(argv, 'h', ['experiment=', 'names='])
    except getopt.GetoptError:
        logging.warning(usage_str)
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            logging.info(usage_str)
            logging.info('\t--experiment=<id> (default: %s)' % (experiment_id))
            logging.info('\t--names=<file.pkl> (default: %s)' % (names_pkl))
            return

        if opt == '--experiment':
            experiment_id = arg
        if opt == '--names':
            names_pkl = arg

    name = conf.get('System', 'name')
    version = conf.get('System', 'version')

    # Allocating the persistence layer ..
    layer = persistence.PickleLayer(dir=conf.get('Persistence', 'path'))
    experiment = layer.get(experiment_id)

    names = None

    if names_pkl is not None:
        names = pickle.load(open(names_pkl, 'rb'))
        names = {name[1:-1]:code for (name, code) in names.items()}

    shell = ExplorationShell(name, version, experiment, names)
    shell.cmdloop()
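
The `--names` option expects a pickled dict mapping identifiers to codes; the comprehension above strips the first and last character of each key (for instance the angle brackets around a URI, though the exact key format is an assumption, not something stated in the code). A minimal sketch of producing a compatible `names.pkl` under that assumption:

import pickle

# Hypothetical keys wrapped in angle brackets, which name[1:-1] would strip.
names = {'<http://example.org/Alice>': 0, '<http://example.org/Bob>': 1}

with open('names.pkl', 'wb') as f:
    pickle.dump(names, f)

# After loading, {name[1:-1]: code for (name, code) in names.items()} yields
# {'http://example.org/Alice': 0, 'http://example.org/Bob': 1}
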
Example #2
def main(argv):
    experiment_id, classes_pkl, use_rbf = None, None, False

    conf = util.configuration()

    usage_str = ('Usage: %s [-h] [--experiment=<id>] [--classes=<classes.pkl>] [--rbf]'
                 % (sys.argv[0]))

    # Parse arguments
    try:
        opts, args = getopt.getopt(argv, 'h', ['experiment=', 'classes=', 'rbf'])
    except getopt.GetoptError:
        logging.warning(usage_str)
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            logging.info(usage_str)
            logging.info('\t--experiment=<id> (default: %s)' % (experiment_id))
            logging.info('\t--classes=<classes.pkl> (default: %s)' % (classes_pkl))
            logging.info('\t--rbf (consider the RBF kernel during hyperparameter selection)')
            return

        if opt == '--experiment':
            experiment_id = arg
        if opt == '--classes':
            classes_pkl = arg
        if opt == '--rbf':
            use_rbf = True

    layer = persistence.PickleLayer(dir=conf.get('Persistence', 'path'))

    # Get the dict containing the details of the experiment (including the learned parameters)
    experiment = layer.get(experiment_id)

    # Get a { class: set([ elements ]) } dict
    classes_dict = pickle.load(open(classes_pkl, 'rb'))

    # If a class has less than 10 elements, remove it
    # (iterate over a copy of the keys, since entries are deleted while looping)
    for _class in list(classes_dict.keys()):
        if len(classes_dict[_class]) < 10:
            logging.info('Removing class %s' % _class)
            del classes_dict[_class]

    # Get the best parameters learned so far
    best = experiment['best_on_validation']

    entities, predicates = best['entities'], best['predicates']
    variables = ['Eemb']

    # Turn the { class: set([ elements ]) } dict into two arrays: [ classes ] and [ elements ]
    classes, elements = [], []
    for (_class, _elements) in classes_dict.items():
        for _element in _elements:
            classes += [_class]
            elements += [_element]

    # Turn [ classes ] (containing a class name for each element in elements) into a list of integers,
    # e.g. [ 'Class1', 'Class2', 'Class1' ] becomes [ 0, 1, 0 ]
    indexes = [entities.index(element) for element in elements]
    class_idx = {
        _class: _idx
        for (_idx, _class) in enumerate(classes_dict.keys())
    }
    classes_numeric = [class_idx[_class] for _class in classes]

    parameter = best['parameters']['Eemb']
    Xt = np.asarray(parameter['value'])
    _X = np.transpose(Xt)
    X = _X[np.asarray(indexes), :]

    if normalize:  # 'normalize' is not defined in this function (presumably a module-level flag)
        X = preprocessing.normalize(X)

    y = np.asarray(classes_numeric)

    # Split the dataset in two equal parts
    #X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.1, random_state=0)

    for it in range(iterations):
        kf = cross_validation.KFold(len(elements), shuffle=True, n_folds=10)
        accuracies = []

        for train_index, test_index in kf:

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            Cs = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
            gammas = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]

            # Tune hyper-parameters by cross-validation
            tuned_parameters = [{'kernel': ['linear'], 'C': Cs}]

            if use_rbf:
                tuned_parameters += [{
                    'kernel': ['rbf'],
                    'gamma': gammas,
                    'C': Cs
                }]

            scoring_function = 'accuracy'  # ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'log_loss', 'mean_absolute_error', 'mean_squared_error', 'precision', 'r2', 'recall', 'roc_auc']

            logging.debug("# Tuning hyper-parameters for %s" %
                          scoring_function)

            model = svm.SVC()
            clf = grid_search.GridSearchCV(model,
                                           tuned_parameters,
                                           cv=10,
                                           scoring=scoring_function)
            clf.fit(X_train, y_train)

            logging.debug("Best parameters set found on development set:")
            logging.debug(clf.best_estimator_)

            logging.debug("Grid scores on development set:")
            for params, mean_score, scores in clf.grid_scores_:
                logging.debug('%0.3f (+/-%0.03f) for %r' %
                              (mean_score, scores.std() / 2, params))

            logging.debug("Detailed classification report:")

            y_true, y_pred = y_test, clf.predict(X_test)

            accuracy = metrics.accuracy_score(y_true, y_pred)
            accuracies += [accuracy]

            logging.info('Accuracy: %s' %
                         metrics.accuracy_score(y_true, y_pred))
            #logging.debug(metrics.classification_report(y_true, y_pred))

        logging.info('Accuracy: %f +/- %f' %
                     (np.mean(accuracies), np.var(accuracies)))
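
The example above relies on the pre-0.20 `cross_validation` and `grid_search` modules, which were removed from scikit-learn in release 0.20. A minimal sketch of the same K-fold plus grid-search loop using `sklearn.model_selection`, with synthetic `X` and `y` standing in for the embedding matrix and numeric class labels:

import numpy as np
from sklearn import svm, metrics
from sklearn.model_selection import KFold, GridSearchCV

# Synthetic stand-ins for the embedding rows X and the numeric labels y
X, y = np.random.randn(100, 8), np.random.randint(0, 2, 100)

Cs = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
tuned_parameters = [{'kernel': ['linear'], 'C': Cs}]

accuracies = []
for train_index, test_index in KFold(n_splits=10, shuffle=True).split(X):
    clf = GridSearchCV(svm.SVC(), tuned_parameters, cv=3, scoring='accuracy')
    clf.fit(X[train_index], y[train_index])
    accuracies.append(metrics.accuracy_score(y[test_index], clf.predict(X[test_index])))

print('Accuracy: %f +/- %f' % (np.mean(accuracies), np.std(accuracies)))
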
def learn(state):
    np.random.seed(state.seed)

    c = util.configuration()
    layer, exp, exp_id = None, {}, None

    dataset = util.ExpDataSet(train_path=state.train_path,
                              valid_path=state.valid_path,
                              test_path=state.test_path)

    # Training set
    trainl, trainr, traino = dataset.train()

    # Validation set
    if dataset.has_valid is True:
        validl, validr, valido = dataset.valid()

    # Test set
    if dataset.has_test is True:
        testl, testr, testo = dataset.test()

    logging.info('Shape for training set: %s' % (str(trainl.shape)))

    if state.use_db:
        # if the dataset is not small-sized (> 10M triples), switch to fast mode
        is_fast = trainl.shape[1] > 10000000
        layer = persistence.PickleLayer(dir=c.get('Persistence', 'path'),
                                        is_fast=is_fast)
        exp = {'start_time': datetime.datetime.utcnow()}
        exp_id = layer.create(state.name, exp)

    NE, NP = len(dataset.entities), len(dataset.predicates)
    state.Nrel = NP
    state.Nent = NE + NP

    if state.Nsyn is None:
        state.Nsyn = NE

    if dataset.specs is not None:
        state.Nleft = dataset.specs['Nleft']
        state.Nright = dataset.specs['Nright']
        state.Nshared = dataset.specs['Nshared']

    exp['best'] = {}  # use the validation set (if available) to pick the best model
    exp['state'] = {
        k: (state[k].__name__
            if isinstance(state[k], types.FunctionType) else str(state[k]))
        for k in state.keys()
    }
    exp['producer'] = util.producer(c)

    # Show experiment parameters
    logging.info('State: %s', exp['state'])

    if state.op in (['SE'] + base_vers + semixi_vers + xi_vers + xiscaltrans_vers
                    + lc_vers + scaltrans_vers + aff_vers + xiaff_vers):
        traino = traino[-state.Nrel:, :]  # last elements of traino
        if dataset.has_valid is True:
            valido = valido[-state.Nrel:, :]
        if dataset.has_test is True:
            testo = testo[-state.Nrel:, :]

    logging.debug('Converting sparse matrices to indexes ..')

    # Convert sparse matrices to indexes
    trainlidx = util.convert2idx(trainl)
    trainridx = util.convert2idx(trainr)
    trainoidx = util.convert2idx(traino)

    if dataset.has_valid is True:
        validlidx = util.convert2idx(validl)
        validridx = util.convert2idx(validr)
        validoidx = util.convert2idx(valido)

    if dataset.has_test is True:
        testlidx = util.convert2idx(testl)
        testridx = util.convert2idx(testr)
        testoidx = util.convert2idx(testo)

    true_triples = None
    if dataset.has_valid is True and dataset.has_test is True and state.filtered:
        n_triples = testlidx.shape[0] + validlidx.shape[0] + trainlidx.shape[0]
        true_triples = np.concatenate([
            testlidx, validlidx, trainlidx,
            testoidx, validoidx, trainoidx,
            testridx, validridx, trainridx
        ]).reshape(3, n_triples).T

    # Operators
    leftop, rightop = model.op(state.op, state.ndim, state.nhid)

    logging.debug('Initializing the embeddings ..')

    # Embeddings
    embeddings = learning.Embeddings(np.random,
                                     state.Nent,
                                     state.ndim,
                                     tag='emb')

    relationVec = None

    if (state.op in ['SE']) and type(embeddings) is not list:
        relationl = learning.Embeddings(np.random,
                                        state.Nrel,
                                        state.ndim * state.nhid,
                                        tag='rell')
        relationr = learning.Embeddings(np.random,
                                        state.Nrel,
                                        state.ndim * state.nhid,
                                        tag='relr')
        embeddings = [embeddings, relationl, relationr]

    elif (state.op in base_vers + lc_vers) and type(embeddings) is not list:
        relationVec = learning.Embeddings(np.random,
                                          state.Nrel,
                                          state.ndim,
                                          tag='relvec')
        embeddings = [embeddings, relationVec, relationVec]

    elif (state.op in xi_vers) and type(embeddings) is not list:
        relationVec = learning.Embeddings(np.random,
                                          state.Nrel,
                                          state.ndim * 2,
                                          tag='relvec')
        embeddings = [embeddings, relationVec, relationVec]

    elif (state.op in scaltrans_vers) and type(embeddings) is not list:
        scaleTranslateVec = learning.Embeddings(np.random,
                                                state.Nrel,
                                                state.ndim * 2,
                                                tag='scaleTranslateVec')
        embeddings = [embeddings, scaleTranslateVec,
                      scaleTranslateVec]  # x, w, d

    elif (state.op in xiscaltrans_vers) and type(embeddings) is not list:
        scaleTranslateVec = learning.Embeddings(np.random,
                                                state.Nrel,
                                                state.ndim * 4,
                                                tag='scaleTranslateVec')
        embeddings = [embeddings, scaleTranslateVec,
                      scaleTranslateVec]  # x, w, d

    elif (state.op in semixi_vers) and type(embeddings) is not list:
        scaleTranslateVec = learning.Embeddings(np.random,
                                                state.Nrel,
                                                state.ndim * 3,
                                                tag='scaleTranslateVec')
        embeddings = [embeddings, scaleTranslateVec,
                      scaleTranslateVec]  # x, w, d

    elif (state.op in aff_vers) and type(embeddings) is not list:
        affineVec = learning.Embeddings(np.random,
                                        state.Nrel, (state.ndim * state.nhid),
                                        tag='affineVec')
        embeddings = [embeddings, affineVec, affineVec]

    elif (state.op in xiaff_vers) and type(embeddings) is not list:
        affineVec = learning.Embeddings(np.random,
                                        state.Nrel,
                                        (state.ndim * state.nhid) * 2,
                                        tag='affineVec')
        embeddings = [embeddings, affineVec, affineVec]

    simfn = state.simfn

    logging.debug('Initializing the training function ..')

    # Functions compilation
    trainfunc = learning.TrainFn1Member(
        simfn,
        embeddings,
        leftop,
        rightop,
        rel=False,
        method=state.method,
        op=state.op,
        loss=loss.hinge,
        loss_margin=state.loss_margin,
        decay=state.decay,
        epsilon=state.epsilon,
        max_learning_rate=state.max_lr,
        weight_L1_param_regularizer=state.l1_param_weight,
        weight_L2_param_regularizer=state.l2_param_weight)

    # FB has some specific parameters for RankRightFnIdx:
    l_subtensorspec = state.Nsyn
    r_subtensorspec = state.Nsyn
    if dataset.specs is not None:
        r_subtensorspec = dataset.specs['Nright'] + dataset.specs['Nshared']

    ranklfunc = evaluation.RankLeftFnIdx(simfn,
                                         embeddings,
                                         leftop,
                                         rightop,
                                         subtensorspec=l_subtensorspec)
    rankrfunc = evaluation.RankRightFnIdx(simfn,
                                          embeddings,
                                          leftop,
                                          rightop,
                                          subtensorspec=r_subtensorspec)

    ranklfunc_filtered = evaluation.RankLeftFnIdx_filtered(
        simfn, embeddings, leftop, rightop, subtensorspec=l_subtensorspec)
    rankrfunc_filtered = evaluation.RankRightFnIdx_filtered(
        simfn, embeddings, leftop, rightop, subtensorspec=r_subtensorspec)

    out, outb = [], []

    # Mean Rank and Hits@10, recorded every state.test_all epochs
    train_mrs, train_hits = [], []
    valid_mrs, valid_hits = [], []
    test_mrs, test_hits = [], []

    state.bestvalid, state.besttest = None, None
    state.bestepoch = None

    batchsize = trainl.shape[1] // state.nbatches  # integer division, used for slicing below

    logging.info("Starting the Experiment ..")
    timeref = time.time()

    average_costs_per_epoch = []  # X
    ratios_violating_examples_per_epoch = []  # X

    for epoch_count in range(1, state.totepochs + 1):

        logging.debug('Running epoch %d of %d ..' %
                      (epoch_count, state.totepochs))

        # Shuffling
        order = np.random.permutation(trainl.shape[1])

        # Note: this is painfully slow when (trainl, trainr, traino) are lil_matrix
        trainl, trainr, traino = trainl[:, order], trainr[:, order], traino[:, order]

        logging.debug('Creating negative examples ..')

        trainln_arange = np.arange(state.Nsyn)
        trainrn_arange = np.arange(state.Nsyn)
        # the FB dataset has some specific settings
        if dataset.specs is not None:
            trainln_arange = np.arange(dataset.specs['Nright'] +
                                       dataset.specs['Nshared'])
            trainrn_arange = np.arange(
                dataset.specs['Nright'], dataset.specs['Nright'] +
                dataset.specs['Nshared'] + dataset.specs['Nleft'])

        trainln = util.create_random_mat(trainl.shape, trainln_arange)
        trainrn = util.create_random_mat(trainr.shape, trainrn_arange)

        epoch_average_costs = []  # X
        epoch_ratios_violating_examples = []  # X

        for i in range(state.nbatches):  # Iterate over Batches

            logging.debug('Running on batch %d of %d ..' % (i, state.nbatches))

            tmpl = trainl[:, i * batchsize:(i + 1) * batchsize]
            tmpr = trainr[:, i * batchsize:(i + 1) * batchsize]
            tmpo = traino[:, i * batchsize:(i + 1) * batchsize]

            tmpln = trainln[:, i * batchsize:(i + 1) * batchsize]
            tmprn = trainrn[:, i * batchsize:(i + 1) * batchsize]

            logging.debug('Executing the training function ..')

            _lrparam = state.lrparam / float(batchsize)
            if state.no_rescaling is True:
                _lrparam = state.lrparam

            # training iteration
            outtmp = trainfunc(state.lremb, _lrparam, tmpl, tmpr, tmpo, tmpln,
                               tmprn)

            out += [outtmp[0] / float(batchsize)]
            outb += [outtmp[1]]

            average_cost = outtmp[0]  # X
            ratio_violating_examples = outtmp[1]  # X

            epoch_average_costs += [average_cost]  # X
            epoch_ratios_violating_examples += [ratio_violating_examples]  # X

            logging.debug('Normalizing the embeddings ..')

            # embeddings normalization
            if type(embeddings) is list:
                embeddings[0].normalize()  # normalize e
            else:
                embeddings.normalize()

        # End of Epoch

        logging.info("-- EPOCH %s (%s seconds):" %
                     (epoch_count, round(time.time() - timeref, 3)))

        average_costs_per_epoch += [epoch_average_costs]  # X
        ratios_violating_examples_per_epoch += [
            epoch_ratios_violating_examples
        ]  # X

        exp['average_costs_per_epoch'] = average_costs_per_epoch  # X
        exp['ratios_violating_examples_per_epoch'] = ratios_violating_examples_per_epoch  # X

        # Model Evaluation

        logging.info("COST >> %s +/- %s, %% updates: %s%%" %
                     (round(np.mean(out), 4), round(
                         np.std(out), 4), round(np.mean(outb) * 100, 3)))

        # Check if NaN
        if np.isnan(np.mean(out)):
            logging.error('NaN propagation detected!')
            return

        out, outb = [], []

        # Evaluate the Ranking Score each test_all epochs
        if (state.test_all is not None) and ((epoch_count % state.test_all)
                                             == 0):

            valid_summary = None
            state.valid = None

            # Evaluation on the Validation Set
            if dataset.has_valid is True:

                if state.ranking_score_right:
                    resvalid = evaluation.RankingScoreRightIdx(
                        rankrfunc, validlidx, validridx, validoidx)
                    valid_summary = evaluation.ranking_summary_right(
                        resvalid, idxo=validoidx, tag='raw valid')
                    state.valid = np.mean(resvalid)
                else:
                    resvalid = evaluation.RankingScoreIdx(
                        ranklfunc, rankrfunc, validlidx, validridx, validoidx)
                    valid_summary = evaluation.ranking_summary(resvalid,
                                                               idxo=validoidx,
                                                               tag='raw valid')
                    state.valid = np.mean(resvalid[0] + resvalid[1])

                    if (state.filtered):
                        resvalid_filtered = evaluation.FilteredRankingScoreIdx(
                            ranklfunc, rankrfunc, validlidx, validridx,
                            validoidx, true_triples)
                        valid_summary_filtered = evaluation.ranking_summary(
                            resvalid_filtered,
                            idxo=validoidx,
                            tag='filtered valid')

            test_summary = None
            state.test = None

            # Evaluation on the Test Set
            if dataset.has_test is True:

                if state.ranking_score_right:
                    restest = evaluation.RankingScoreRightIdx(
                        rankrfunc, testlidx, testridx, testoidx)
                    test_summary = evaluation.ranking_summary_right(
                        restest, idxo=testoidx, tag='raw test')
                    state.test = np.mean(restest)
                else:
                    restest = evaluation.RankingScoreIdx(
                        ranklfunc, rankrfunc, testlidx, testridx, testoidx)
                    test_summary = evaluation.ranking_summary(restest,
                                                              idxo=testoidx,
                                                              tag='raw test')
                    state.test = np.mean(restest[0] + restest[1])

                    if (state.filtered):
                        restest_filtered = evaluation.FilteredRankingScoreIdx(
                            ranklfunc, rankrfunc, testlidx, testridx, testoidx,
                            true_triples)
                        test_summary_filtered = evaluation.ranking_summary(
                            restest_filtered,
                            idxo=testoidx,
                            tag='filtered test')

            save_model = True
            if dataset.has_valid is True:
                save_model = False
                if state.bestvalid is None or state.valid < state.bestvalid:
                    save_model = True

            if save_model is True:
                if dataset.has_valid is True:
                    state.bestvalid = state.valid
                    exp['best_valid'] = state.bestvalid

                if dataset.has_test is True:
                    state.besttest = state.test
                    exp['best_test'] = state.besttest

                state.bestepoch = epoch_count
                exp['best_epoch'] = state.bestepoch

                # Save the Best Model (on the Validation Set) using the Persistence Layer
                embs = [e.E for e in embeddings] if type(embeddings) is list else [embeddings.E]
                model_params = embs + leftop.params + rightop.params + (
                    simfn.params if hasattr(simfn, 'params') else [])

                model_param_values = {}
                for model_param in set(model_params):
                    value = {
                        'value': model_param.get_value().tolist(),
                        'shape': model_param.get_value().shape
                    }
                    model_param_values[str(model_param)] = value

                best_model = {
                    'parameters': model_param_values,
                    'epoch': epoch_count,
                    'entities': dataset.entities,
                    'predicates': dataset.predicates,
                    'valid_summary': valid_summary,
                    'test_summary': test_summary
                }

                if dataset.resources is not None:
                    best_model['resources'] = dataset.resources
                    best_model['bnodes'] = dataset.bnodes
                    best_model['literals'] = dataset.literals

                exp['best'] = best_model

        if state.use_db:
            layer.update(exp_id, exp)

        timeref = time.time()
    return
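
`learn` delegates the ranking evaluation to `evaluation.RankingScoreIdx` and `ranking_summary`, whose internals are not shown here. As a rough, hypothetical illustration of what a raw mean rank and Hits@10 computation looks like (the function, array names, and shapes below are assumptions, not the repo's API):

import numpy as np

def mean_rank_and_hits(scores, true_idx, k=10):
    # scores: (n_triples, n_candidates) similarity scores, higher = better
    # true_idx: (n_triples,) index of the correct candidate for each triple
    order = np.argsort(-scores, axis=1)  # candidates sorted best-first
    ranks = np.array([int(np.where(order[i] == true_idx[i])[0][0]) + 1
                      for i in range(scores.shape[0])])
    return ranks.mean(), float(np.mean(ranks <= k))

# Example: 2 triples, 5 candidate entities each
scores = np.array([[0.1, 0.9, 0.3, 0.2, 0.0],
                   [0.5, 0.1, 0.2, 0.8, 0.4]])
print(mean_rank_and_hits(scores, np.array([1, 3])))  # -> (1.0, 1.0)
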
def main(argv):
    experiment_id = None
    classes_pkl = 'data/aifb/aifb_d2s_group_affiliates.pkl'
    is_heat = False

    k = None

    save_file = None

    manifold_name = 'TSNE'  # 'TSNE'
    cluster_name = None  # 'KMeans'

    conf = util.configuration()

    usage_str = (
        'Usage: %s [-h] [--experiment=<id>] [--classes=<classes.pkl>] [--manifold=<manifold>] [--cluster=<cluster>] [--top=<k>] [--heat] [--save=<out.png>]'
        % (sys.argv[0]))
    # Parse arguments
    try:
        opts, args = getopt.getopt(argv, 'h', [
            'experiment=', 'classes=', 'manifold=', 'cluster=', 'top=', 'heat',
            'save='
        ])
    except getopt.GetoptError:
        logging.warning(usage_str)
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            logging.info(usage_str)

            logging.info('\t--experiment=<id> (default: %s)' % (experiment_id))
            logging.info('\t--classes=<classes.pkl> (default: %s)' %
                         (classes_pkl))

            logging.info('\t--manifold=<manifold> (default: %s)' %
                         (manifold_name))
            logging.info('\t--cluster=<cluster> (default: %s)' %
                         (cluster_name))
            logging.info('\t--top=<k> (default: %s)' % (k))

            logging.info('\t--heat (show a heat map)')

            logging.info('\t--save=<out.png> (default: %s)' % (save_file))
            return

        if opt == '--experiment':
            experiment_id = arg
        if opt == '--classes':
            classes_pkl = arg

        if opt == '--manifold':
            manifold_name = arg
        if opt == '--cluster':
            cluster_name = arg
        if opt == '--top':
            k = int(arg)
        if opt == '--heat':
            is_heat = True

        if opt == '--save':
            save_file = arg

    # Allocating the persistence layer ..
    layer = persistence.PickleLayer(dir=conf.get('Persistence', 'path'))

    manifold_method, cluster_method = None, None

    if experiment_id is not None:
        manifold_class = getattr(manifold, manifold_name)
        manifold_method = manifold_class()

        if (cluster_name is not None):
            cluster_class = getattr(cluster, cluster_name)
            cluster_method = cluster_class()

        experiment = layer.get(experiment_id)
        classes_dict = pickle.load(open(classes_pkl, 'rb'))

        keys = list(classes_dict.keys())

        logging.info('Removing elements in the intersection ..')

        for i, _class in enumerate(keys):
            _albums = classes_dict[_class]
            _other_genres = keys[i + 1:]
            for _other_genre in _other_genres:
                classes_dict[_class] = classes_dict[_class] - classes_dict[_other_genre]

        logging.info('.. done.')

        _classes = list(classes_dict.keys())
        _cardinalities = [len(classes_dict[_class]) for _class in _classes]

        if k is not None:
            _included_class_indices = heapq.nlargest(
                k, range(len(_cardinalities)), _cardinalities.__getitem__)
            _included_classes = [
                _classes[idx] for idx in _included_class_indices
            ]
            classes_dict = {
                _class: classes_dict[_class]
                for _class in _included_classes
            }

        logging.info('Classes: %s' % classes_dict.keys())

        if is_heat:
            show_heat(experiment, classes_dict, save=save_file)
        else:
            show_points(experiment,
                        classes_dict,
                        manifold_method,
                        cluster_method,
                        save=save_file)
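
`show_points` and `show_heat` are not reproduced here. A minimal sketch of what a 2-D manifold projection plus optional clustering of the learned embeddings could look like; the random data and every name below are placeholders, not the repo's implementation:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

X = np.random.randn(200, 50)                  # stand-in for the learned entity embeddings
X_2d = TSNE(n_components=2).fit_transform(X)  # project to 2-D, as --manifold=TSNE would
labels = KMeans(n_clusters=4).fit_predict(X)  # optional clustering, as --cluster=KMeans would

plt.scatter(X_2d[:, 0], X_2d[:, 1], c=labels, s=10)
plt.savefig('out.png')                        # mirrors the --save=<out.png> option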