def main(argv):
    experiment_id = None
    names_pkl = None

    conf = util.configuration()

    usage_str = ('Usage: %s [-h] [--experiment=<id>] [--names=<file.pkl>]'
                 % (sys.argv[0]))

    # Parse arguments
    try:
        opts, args = getopt.getopt(argv, 'h', ['experiment=', 'names='])
    except getopt.GetoptError:
        logging.warn(usage_str)
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            logging.info(usage_str)
            logging.info('\t--experiment=<id> (default: %s)' % (experiment_id))
            logging.info('\t--names=<file.pkl> (default: %s)' % (names_pkl))
            return
        if opt == '--experiment':
            experiment_id = arg
        if opt == '--names':
            names_pkl = arg

    name = conf.get('System', 'name')
    version = conf.get('System', 'version')

    # Allocating the persistence layer ..
    layer = persistence.PickleLayer(dir=conf.get('Persistence', 'path'))
    experiment = layer.get(experiment_id)

    names = None
    if names_pkl is not None:
        with open(names_pkl, 'rb') as f:
            names = pickle.load(f)
        # Drop the first and last character of each key (e.g. enclosing brackets)
        names = {_name[1:-1]: code for (_name, code) in names.items()}

    shell = ExplorationShell(name, version, experiment, names)
    shell.cmdloop()
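
# A minimal sketch of the module-level imports and entry point this script appears
# to assume, based on the names referenced above. util and persistence are
# project-local modules; the module providing ExplorationShell is a guess.
import sys
import getopt
import logging
import pickle

import util
import persistence
from shell import ExplorationShell  # hypothetical module path

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main(sys.argv[1:])
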
def main(argv):
    experiment_id, classes_pkl, use_rbf = None, None, False

    conf = util.configuration()

    usage_str = ('Usage: %s [-h] [--experiment=<id>] [--classes=<classes.pkl>] [--rbf]'
                 % (sys.argv[0]))

    # Parse arguments
    try:
        opts, args = getopt.getopt(argv, 'h', ['experiment=', 'classes=', 'rbf'])
    except getopt.GetoptError:
        logging.warn(usage_str)
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            logging.info(usage_str)
            logging.info('\t--experiment=<id> (default: %s)' % (experiment_id))
            logging.info('\t--classes=<classes.pkl> (default: %s)' % (classes_pkl))
            logging.info('\t--rbf (consider the RBF kernel during hyper-parameter selection)')
            return
        if opt == '--experiment':
            experiment_id = arg
        if opt == '--classes':
            classes_pkl = arg
        if opt == '--rbf':
            use_rbf = True

    layer = persistence.PickleLayer(dir=conf.get('Persistence', 'path'))

    # Get the dict containing the details of the experiment (including the learned parameters)
    experiment = layer.get(experiment_id)

    # Get a { class: set([ elements ]) } dict
    with open(classes_pkl, 'rb') as f:
        classes_dict = pickle.load(f)

    # If a class has less than 10 elements, remove it
    # (iterate over a copy of the keys, since entries are deleted while iterating)
    for _class in list(classes_dict.keys()):
        if len(classes_dict[_class]) < 10:
            logging.info('Removing class %s' % _class)
            del classes_dict[_class]

    # Get the best parameters learned so far
    best = experiment['best_on_validation']
    entities, predicates = best['entities'], best['predicates']

    variables = ['Eemb']

    # Turn the { class: set([ elements ]) } dict into two aligned lists:
    # [ classes ] and [ elements ]
    classes, elements = [], []
    for (_class, _elements) in classes_dict.items():
        for _element in _elements:
            classes += [_class]
            elements += [_element]

    # Turn [ classes ] (containing a class name for each element in elements) into a
    # list of integers, e.g. [ 'Class1', 'Class2', 'Class1' ] becomes [ 0, 1, 0 ]
    indexes = [entities.index(element) for element in elements]
    class_idx = {_class: _idx for (_idx, _class) in enumerate(classes_dict.keys())}
    classes_numeric = [class_idx[_class] for _class in classes]

    parameter = best['parameters']['Eemb']
    Xt = np.asarray(parameter['value'])
    _X = np.transpose(Xt)

    X = _X[np.asarray(indexes), :]
    if normalize:
        # preprocessing.normalize() returns a new array: keep the result
        X = preprocessing.normalize(X)
    y = np.asarray(classes_numeric)

    # Split the dataset in two equal parts
    # X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.1, random_state=0)

    for it in range(iterations):
        kf = cross_validation.KFold(len(elements), shuffle=True, n_folds=10)
        accuracies = []

        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            Cs = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
            gammas = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]

            # Tune hyper-parameters by cross-validation
            tuned_parameters = [{'kernel': ['linear'], 'C': Cs}]
            if use_rbf:
                tuned_parameters += [{'kernel': ['rbf'], 'gamma': gammas, 'C': Cs}]

            # Other options: 'adjusted_rand_score', 'average_precision', 'f1', 'log_loss',
            # 'mean_absolute_error', 'mean_squared_error', 'precision', 'r2', 'recall', 'roc_auc'
            scoring_function = 'accuracy'

            logging.debug('# Tuning hyper-parameters for %s' % scoring_function)

            model = svm.SVC()
            clf = grid_search.GridSearchCV(model, tuned_parameters, cv=10,
                                           scoring=scoring_function)
            clf.fit(X_train, y_train)

            logging.debug('Best parameter set found on development set:')
            logging.debug(clf.best_estimator_)

            logging.debug('Grid scores on development set:')
            for params, mean_score, scores in clf.grid_scores_:
                logging.debug('%0.3f (+/-%0.03f) for %r'
                              % (mean_score, scores.std() / 2, params))

            logging.debug('Detailed classification report:')
            y_true, y_pred = y_test, clf.predict(X_test)

            accuracy = metrics.accuracy_score(y_true, y_pred)
            accuracies += [accuracy]

            logging.info('Accuracy: %s' % accuracy)
            # logging.debug(metrics.classification_report(y_true, y_pred))

        logging.info('Accuracy: %f +/- %f' % (np.mean(accuracies), np.std(accuracies)))
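
# Module-level setup this classifier script appears to rely on. The flag values
# below (normalize, iterations) are assumptions -- the function uses them but does
# not define them -- and the import layout is a sketch; cross_validation and
# grid_search are the pre-0.18 scikit-learn modules matching the calls above.
import sys
import getopt
import logging
import pickle

import numpy as np
from sklearn import cross_validation, grid_search, svm, metrics, preprocessing

import util
import persistence

normalize = True    # assumed: whether to L2-normalize the embedding rows
iterations = 10     # assumed: number of repeated 10-fold cross-validation runs

if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    main(sys.argv[1:])
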
def learn(state):
    np.random.seed(state.seed)

    c = util.configuration()
    layer, exp, exp_id = None, {}, None

    dataset = util.ExpDataSet(train_path=state.train_path,
                              valid_path=state.valid_path,
                              test_path=state.test_path)

    # Training set
    trainl, trainr, traino = dataset.train()

    # Validation set
    if dataset.has_valid is True:
        validl, validr, valido = dataset.valid()

    # Test set
    if dataset.has_test is True:
        testl, testr, testo = dataset.test()

    logging.info('Shape for training set: %s' % (str(trainl.shape)))

    if state.use_db:
        # If the dataset is not small-sized (> 10M triples), switch to fast mode
        is_fast = trainl.shape[1] > 10000000
        layer = persistence.PickleLayer(dir=c.get('Persistence', 'path'),
                                        is_fast=is_fast)

        exp = {'start_time': datetime.datetime.utcnow()}
        exp_id = layer.create(state.name, exp)

    NE, NP = len(dataset.entities), len(dataset.predicates)

    state.Nrel = NP
    state.Nent = NE + NP

    if state.Nsyn is None:
        state.Nsyn = NE

    if dataset.specs is not None:
        state.Nleft, state.Nright, state.Nshared = (dataset.specs['Nleft'],
                                                    dataset.specs['Nright'],
                                                    dataset.specs['Nshared'])

    # Use the validation set (if available) to pick the best model
    exp['best'] = {}

    exp['state'] = {k: (state[k].__name__ if isinstance(state[k], types.FunctionType)
                        else str(state[k]))
                    for k in state.keys()}
    exp['producer'] = util.producer(c)

    # Show experiment parameters
    logging.info('State: %s', exp['state'])

    if state.op in (['SE'] + base_vers + semixi_vers + xi_vers + xiscaltrans_vers
                    + lc_vers + scaltrans_vers + aff_vers + xiaff_vers):
        traino = traino[-state.Nrel:, :]  # last elements of traino
        if dataset.has_valid is True:
            valido = valido[-state.Nrel:, :]
        if dataset.has_test is True:
            testo = testo[-state.Nrel:, :]

    logging.debug('Converting sparse matrices to indexes ..')

    # Convert sparse matrices to indexes
    trainlidx, trainridx, trainoidx = (util.convert2idx(trainl),
                                       util.convert2idx(trainr),
                                       util.convert2idx(traino))
    if dataset.has_valid is True:
        validlidx, validridx, validoidx = (util.convert2idx(validl),
                                           util.convert2idx(validr),
                                           util.convert2idx(valido))
    if dataset.has_test is True:
        testlidx, testridx, testoidx = (util.convert2idx(testl),
                                        util.convert2idx(testr),
                                        util.convert2idx(testo))

    true_triples = None
    if (dataset.has_valid is True) and (dataset.has_test is True) and state.filtered:
        true_triples = np.concatenate([testlidx, validlidx, trainlidx,
                                       testoidx, validoidx, trainoidx,
                                       testridx, validridx, trainridx]).reshape(
            3, testlidx.shape[0] + validlidx.shape[0] + trainlidx.shape[0]).T

    # Operators
    leftop, rightop = model.op(state.op, state.ndim, state.nhid)

    logging.debug('Initializing the embeddings ..')

    # Embeddings
    embeddings = learning.Embeddings(np.random, state.Nent, state.ndim, tag='emb')

    relationVec = None
    if (state.op in ['SE']) and type(embeddings) is not list:
        relationl = learning.Embeddings(np.random, state.Nrel,
                                        state.ndim * state.nhid, tag='rell')
        relationr = learning.Embeddings(np.random, state.Nrel,
                                        state.ndim * state.nhid, tag='relr')
        embeddings = [embeddings, relationl, relationr]
    elif (state.op in base_vers + lc_vers) and type(embeddings) is not list:
        relationVec = learning.Embeddings(np.random, state.Nrel, state.ndim,
                                          tag='relvec')
        embeddings = [embeddings, relationVec, relationVec]
    elif (state.op in xi_vers) and type(embeddings) is not list:
        relationVec = learning.Embeddings(np.random, state.Nrel, state.ndim * 2,
                                          tag='relvec')
        embeddings = [embeddings, relationVec, relationVec]
    elif (state.op in scaltrans_vers) and type(embeddings) is not list:
        scaleTranslateVec = learning.Embeddings(np.random, state.Nrel,
                                                state.ndim * 2,
                                                tag='scaleTranslateVec')
        embeddings = [embeddings, scaleTranslateVec, scaleTranslateVec]  # x, w, d
    elif (state.op in xiscaltrans_vers) and type(embeddings) is not list:
        scaleTranslateVec = learning.Embeddings(np.random, state.Nrel,
                                                state.ndim * 4,
                                                tag='scaleTranslateVec')
        embeddings = [embeddings, scaleTranslateVec, scaleTranslateVec]  # x, w, d
    elif (state.op in semixi_vers) and type(embeddings) is not list:
        scaleTranslateVec = learning.Embeddings(np.random, state.Nrel,
                                                state.ndim * 3,
                                                tag='scaleTranslateVec')
        embeddings = [embeddings, scaleTranslateVec, scaleTranslateVec]  # x, w, d
    elif (state.op in aff_vers) and type(embeddings) is not list:
        affineVec = learning.Embeddings(np.random, state.Nrel,
                                        (state.ndim * state.nhid),
                                        tag='affineVec')
        embeddings = [embeddings, affineVec, affineVec]
    elif (state.op in xiaff_vers) and type(embeddings) is not list:
        affineVec = learning.Embeddings(np.random, state.Nrel,
                                        (state.ndim * state.nhid) * 2,
                                        tag='affineVec')
        embeddings = [embeddings, affineVec, affineVec]

    simfn = state.simfn

    logging.debug('Initializing the training function ..')

    # Functions compilation
    trainfunc = learning.TrainFn1Member(
        simfn, embeddings, leftop, rightop, rel=False,
        method=state.method, op=state.op,
        loss=loss.hinge, loss_margin=state.loss_margin,
        decay=state.decay, epsilon=state.epsilon,
        max_learning_rate=state.max_lr,
        weight_L1_param_regularizer=state.l1_param_weight,
        weight_L2_param_regularizer=state.l2_param_weight)

    # FB has some specific parameters for RankRightFnIdx:
    l_subtensorspec = state.Nsyn
    r_subtensorspec = state.Nsyn
    if dataset.specs is not None:
        r_subtensorspec = dataset.specs['Nright'] + dataset.specs['Nshared']

    ranklfunc = evaluation.RankLeftFnIdx(simfn, embeddings, leftop, rightop,
                                         subtensorspec=l_subtensorspec)
    rankrfunc = evaluation.RankRightFnIdx(simfn, embeddings, leftop, rightop,
                                          subtensorspec=r_subtensorspec)

    ranklfunc_filtered = evaluation.RankLeftFnIdx_filtered(
        simfn, embeddings, leftop, rightop, subtensorspec=l_subtensorspec)
    rankrfunc_filtered = evaluation.RankRightFnIdx_filtered(
        simfn, embeddings, leftop, rightop, subtensorspec=r_subtensorspec)

    out, outb = [], []

    # Mean Rank and Hits@10 for every state.test_all epochs
    train_mrs, train_hits = [], []
    valid_mrs, valid_hits = [], []
    test_mrs, test_hits = [], []

    state.bestvalid, state.besttest = None, None
    state.bestepoch = None

    batchsize = trainl.shape[1] // state.nbatches

    logging.info('Starting the Experiment ..')
    timeref = time.time()

    average_costs_per_epoch = []
    ratios_violating_examples_per_epoch = []

    for epoch_count in range(1, state.totepochs + 1):
        logging.debug('Running epoch %d of %d ..' % (epoch_count, state.totepochs))

        # Shuffling
        order = np.random.permutation(trainl.shape[1])

        # Note: this is painfully slow when (trainl, trainr, traino) are lil_matrix
        trainl, trainr, traino = trainl[:, order], trainr[:, order], traino[:, order]

        logging.debug('Creating negative examples ..')

        trainln_arange = np.arange(state.Nsyn)
        trainrn_arange = np.arange(state.Nsyn)

        # The FB dataset has some specific settings
        if dataset.specs is not None:
            trainln_arange = np.arange(dataset.specs['Nright'] + dataset.specs['Nshared'])
            trainrn_arange = np.arange(dataset.specs['Nright'],
                                       dataset.specs['Nright'] + dataset.specs['Nshared']
                                       + dataset.specs['Nleft'])

        trainln = util.create_random_mat(trainl.shape, trainln_arange)
        trainrn = util.create_random_mat(trainr.shape, trainrn_arange)

        epoch_average_costs = []
        epoch_ratios_violating_examples = []

        # Iterate over batches
        for i in range(state.nbatches):
            logging.debug('Running on batch %d of %d ..' % (i, state.nbatches))

            tmpl = trainl[:, i * batchsize:(i + 1) * batchsize]
            tmpr = trainr[:, i * batchsize:(i + 1) * batchsize]
            tmpo = traino[:, i * batchsize:(i + 1) * batchsize]

            tmpln = trainln[:, i * batchsize:(i + 1) * batchsize]
            tmprn = trainrn[:, i * batchsize:(i + 1) * batchsize]

            logging.debug('Executing the training function ..')

            _lrparam = state.lrparam / float(batchsize)
            if state.no_rescaling is True:
                _lrparam = state.lrparam

            # Training iteration
            outtmp = trainfunc(state.lremb, _lrparam, tmpl, tmpr, tmpo, tmpln, tmprn)

            out += [outtmp[0] / float(batchsize)]
            outb += [outtmp[1]]

            average_cost = outtmp[0]
            ratio_violating_examples = outtmp[1]

            epoch_average_costs += [average_cost]
            epoch_ratios_violating_examples += [ratio_violating_examples]

            logging.debug('Normalizing the embeddings ..')

            # Embeddings normalization
            if type(embeddings) is list:
                embeddings[0].normalize()  # normalize e
            else:
                embeddings.normalize()

        # End of epoch
        logging.info("-- EPOCH %s (%s seconds):"
                     % (epoch_count, round(time.time() - timeref, 3)))

        average_costs_per_epoch += [epoch_average_costs]
        ratios_violating_examples_per_epoch += [epoch_ratios_violating_examples]

        exp['average_costs_per_epoch'] = average_costs_per_epoch
        exp['ratios_violating_examples_per_epoch'] = ratios_violating_examples_per_epoch

        # Model evaluation
        logging.info("COST >> %s +/- %s, %% updates: %s%%"
                     % (round(np.mean(out), 4), round(np.std(out), 4),
                        round(np.mean(outb) * 100, 3)))

        # Check for NaNs
        if np.isnan(np.mean(out)):
            logging.error('NaN propagation detected!')
            return

        out, outb = [], []

        # Evaluate the ranking score every test_all epochs
        if (state.test_all is not None) and ((epoch_count % state.test_all) == 0):

            valid_summary = None
            state.valid = None

            # Evaluation on the validation set
            if dataset.has_valid is True:
                if state.ranking_score_right:
                    resvalid = evaluation.RankingScoreRightIdx(rankrfunc, validlidx,
                                                               validridx, validoidx)
                    valid_summary = evaluation.ranking_summary_right(resvalid,
                                                                     idxo=validoidx,
                                                                     tag='raw valid')
                    state.valid = np.mean(resvalid)
                else:
                    resvalid = evaluation.RankingScoreIdx(ranklfunc, rankrfunc,
                                                          validlidx, validridx, validoidx)
                    valid_summary = evaluation.ranking_summary(resvalid, idxo=validoidx,
                                                               tag='raw valid')
                    state.valid = np.mean(resvalid[0] + resvalid[1])

                if state.filtered:
                    resvalid_filtered = evaluation.FilteredRankingScoreIdx(
                        ranklfunc, rankrfunc, validlidx, validridx, validoidx,
                        true_triples)
                    valid_summary_filtered = evaluation.ranking_summary(
                        resvalid_filtered, idxo=validoidx, tag='filtered valid')

            test_summary = None
            state.test = None

            # Evaluation on the test set
            if dataset.has_test is True:
                if state.ranking_score_right:
                    restest = evaluation.RankingScoreRightIdx(rankrfunc, testlidx,
                                                              testridx, testoidx)
                    test_summary = evaluation.ranking_summary_right(restest,
                                                                    idxo=testoidx,
                                                                    tag='raw test')
                    state.test = np.mean(restest)
                else:
                    restest = evaluation.RankingScoreIdx(ranklfunc, rankrfunc,
                                                         testlidx, testridx, testoidx)
                    test_summary = evaluation.ranking_summary(restest, idxo=testoidx,
                                                              tag='raw test')
                    state.test = np.mean(restest[0] + restest[1])

                if state.filtered:
                    restest_filtered = evaluation.FilteredRankingScoreIdx(
                        ranklfunc, rankrfunc, testlidx, testridx, testoidx,
                        true_triples)
                    test_summary_filtered = evaluation.ranking_summary(
                        restest_filtered, idxo=testoidx, tag='filtered test')

            save_model = True
            if dataset.has_valid is True:
                save_model = False
                if state.bestvalid is None or state.valid < state.bestvalid:
                    save_model = True

            if save_model is True:
                if dataset.has_valid is True:
                    state.bestvalid = state.valid
                    exp['best_valid'] = state.bestvalid
                if dataset.has_test is True:
                    state.besttest = state.test
                    exp['best_test'] = state.besttest

                state.bestepoch = epoch_count
                exp['best_epoch'] = state.bestepoch

                # Save the best model (on the validation set) using the persistence layer
                embs = ([e.E for e in embeddings] if (type(embeddings) == list)
                        else [embeddings.E])
                model_params = (embs + leftop.params + rightop.params
                                + (simfn.params if hasattr(simfn, 'params') else []))

                model_param_values = {}
                for model_param in set(model_params):
                    value = {'value': model_param.get_value().tolist(),
                             'shape': model_param.get_value().shape}
                    model_param_values[str(model_param)] = value

                best_model = {'parameters': model_param_values,
                              'epoch': epoch_count,
                              'entities': dataset.entities,
                              'predicates': dataset.predicates,
                              'valid_summary': valid_summary,
                              'test_summary': test_summary}

                if dataset.resources is not None:
                    best_model['resources'] = dataset.resources
                    best_model['bnodes'] = dataset.bnodes
                    best_model['literals'] = dataset.literals

                exp['best'] = best_model

            if state.use_db:
                layer.update(exp_id, exp)

        timeref = time.time()

    return
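
# Module-level context learn() appears to assume, inferred from the names it
# references. util, persistence, learning, evaluation, model and loss are
# project-local modules, and the operator-family lists (base_vers, xi_vers,
# semixi_vers, scaltrans_vers, xiscaltrans_vers, lc_vers, aff_vers, xiaff_vers)
# are expected to be defined alongside them; the import layout is a sketch only.
import datetime
import logging
import time
import types

import numpy as np

import util
import persistence
import learning
import evaluation
import model
import loss
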
def main(argv):
    experiment_id = None
    classes_pkl = 'data/aifb/aifb_d2s_group_affiliates.pkl'

    is_heat = False
    k = None
    save_file = None

    manifold_name = 'TSNE'
    cluster_name = None  # e.g. 'KMeans'

    conf = util.configuration()

    usage_str = ('Usage: %s [-h] [--experiment=<id>] [--classes=<classes.pkl>] '
                 '[--manifold=<manifold>] [--cluster=<cluster>] [--top=<k>] '
                 '[--heat] [--save=<out.png>]' % (sys.argv[0]))

    # Parse arguments
    try:
        opts, args = getopt.getopt(argv, 'h', ['experiment=', 'classes=', 'manifold=',
                                               'cluster=', 'top=', 'heat', 'save='])
    except getopt.GetoptError:
        logging.warn(usage_str)
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            logging.info(usage_str)
            logging.info('\t--experiment=<id> (default: %s)' % (experiment_id))
            logging.info('\t--classes=<classes.pkl> (default: %s)' % (classes_pkl))
            logging.info('\t--manifold=<manifold> (default: %s)' % (manifold_name))
            logging.info('\t--cluster=<cluster> (default: %s)' % (cluster_name))
            logging.info('\t--top=<k> (default: %s)' % (k))
            logging.info('\t--heat (show a heat map)')
            logging.info('\t--save=<out.png> (default: %s)' % (save_file))
            return
        if opt == '--experiment':
            experiment_id = arg
        if opt == '--classes':
            classes_pkl = arg
        if opt == '--manifold':
            manifold_name = arg
        if opt == '--cluster':
            cluster_name = arg
        if opt == '--top':
            k = int(arg)
        if opt == '--heat':
            is_heat = True
        if opt == '--save':
            save_file = arg

    # Allocating the persistence layer ..
    layer = persistence.PickleLayer(dir=conf.get('Persistence', 'path'))

    manifold_method, cluster_method = None, None
    if experiment_id is not None:
        manifold_class = getattr(manifold, manifold_name)
        manifold_method = manifold_class()
        if cluster_name is not None:
            cluster_class = getattr(cluster, cluster_name)
            cluster_method = cluster_class()

    experiment = layer.get(experiment_id)

    classes_dict = pickle.load(open(classes_pkl, 'rb'))
    keys = list(classes_dict.keys())

    # Make the classes disjoint: remove elements shared with later classes
    logging.info('Removing elements in the intersection ..')
    for i, _class in enumerate(keys):
        _other_classes = keys[i + 1:]
        for _other_class in _other_classes:
            classes_dict[_class] = classes_dict[_class] - classes_dict[_other_class]
    logging.info('.. done.')

    _classes = list(classes_dict.keys())
    _cardinalities = [len(classes_dict[_class]) for _class in _classes]

    # Keep only the k largest classes, if requested
    if k is not None:
        _included_class_indices = heapq.nlargest(k, range(len(_cardinalities)),
                                                 _cardinalities.__getitem__)
        _included_classes = [_classes[idx] for idx in _included_class_indices]
        classes_dict = {_class: classes_dict[_class] for _class in _included_classes}

    logging.info('Classes: %s' % classes_dict.keys())

    if is_heat:
        show_heat(experiment, classes_dict, save=save_file)
    else:
        show_points(experiment, classes_dict, manifold_method, cluster_method,
                    save=save_file)
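
# Imports and entry point this visualization script appears to assume, inferred
# from the names above; show_heat() and show_points() are expected to be defined
# elsewhere in the same script, and the exact import layout is a sketch.
import sys
import getopt
import heapq
import logging
import pickle

from sklearn import manifold, cluster

import util
import persistence

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main(sys.argv[1:])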