Example 1
def write_vectors(vectors, output_filename):
    logger.info("Writing vectors to %s" % output_filename)
    with codecs.open(output_filename, "w") as output_file:
        for t in vectors:
            output_file.write('\t'.join(map(str, t)))
            output_file.write('\n')
    logger.info("Done writing vectors to %s" % output_filename)
Example 2
    def strings2ids_from_file(self,
                              input_filename,
                              output_filename=None,
                              column_offset=0):
        logger.info("encode triples from %s \n\t save to %s" %
                    (input_filename, output_filename))
        self.strings2ids(FileTriplesSource(input_filename, column_offset),
                         output_filename)
Example 3
    def adapt(self, new_triples):
        super().adapt(new_triples)
        logger.info("Retrain model and save to %s" %
                    self.get_current_model_folder())
        self.curr_model = self.train(
            JointTriplesSource(self.kg_triples, new_triples))
        logger.info("Done retraining model and saving to %s!" %
                    self.get_current_model_folder())
Example 4
def dump_dict(dict_to_dump, dict_filename, overwrite):
    if overwrite or not os.path.exists(dict_filename):
        logger.info("Dump to %s" % dict_filename)
        with codecs.open(dict_filename, "w") as output_file:
            output_file.write(str(len(dict_to_dump)) + '\n')
            for k, v in dict_to_dump.items():
                output_file.write(str(k) + '\t' + str(v) + '\n')
        return True
    return False
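A small usage sketch, assuming dump_dict above is in scope; the mapping and file name are illustrative. The first line of the output file holds the number of entries, followed by one tab-separated key/value pair per line.

entity2id = {"Berlin": 0, "Paris": 1, "Rome": 2}
# Writes the file only if it does not exist yet (overwrite=False).
written = dump_dict(entity2id, "entity2id.txt", overwrite=False)
print("dictionary written:", written)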
Example 5
def write_triples_as_predicates(triples_iteratable, output_filename, surround=False):
    logger.info("Writing triples to %s" % output_filename)
    with codecs.open(output_filename, "w") as output_file:
        for t in triples_iteratable:
            if surround:
                # map() returns an iterator; materialize it so t stays indexable
                t = list(map(add_arrows, t))
            output_file.write('%s(%s,%s)' % (t[1], t[0], t[2]))
            output_file.write('\n')
    logger.info("Done writing triples to %s" % output_filename)
Example 6
    def train(self, triples, is_update=False):
        logger.info("Train a model using OpenKE TF!")
        self._prepare_training_data(triples)
        new_model = train_model(self._get_data_folder(),
                                self.get_current_model_folder(),
                                self.model_name)
        logger.info("Done Training a model using OpenKE TF! (%s)" %
                    self.model_name)
        return new_model
Example 7
    def end_process(self):
        """
        Perform basic operations at the end of the process.

        :return:
        """
        logger.info("Plotting Results!")
        evals = [i.eval for i in self.iterations]
        if all(evals):
            eval_utils.plot_iterations(evals, self.itr_stats_plot_file)
        logger.info("Done Plotting Results!")
Example 8
    def adapt(self, new_feedback_triples: TriplesSource):
        """
        Adapt the emebdding model using the new set of Auxilary triples

        :param new_feedback_triples: the auxilary triples
        :return:
        """
        self.iteration_num += 1
        self.feedback_triples_history.append(new_feedback_triples)
        logger.info('Triples used for adaptation: %s' %
                    new_feedback_triples.get_name())
Example 9
def load_saved_model(input_data_folder, model_tf_filepath, model='TransE'):
    input_data_folder += '/'
    logger.info("loading trained model from %s" % model_tf_filepath)
    con = config.Config()
    con.set_in_path(input_data_folder)
    con.set_import_files(model_tf_filepath)
    con.init()
    con.set_model(getattr(models, model))  # e.g. models.TransE
    embeddings = con.get_parameters("numpy")
    logger.info("Done loading trained model!")
    return embeddings
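A hedged usage sketch for load_saved_model, assuming an OpenKE-style data folder and a TensorFlow checkpoint path; the paths below are illustrative, and the 'ent_embeddings' key is taken from the other examples in this listing.

# Hypothetical paths; adjust to your OpenKE benchmark layout.
embeddings = load_saved_model("./benchmarks/my_kg",
                              "./models/my_kg/model.vec.tf",
                              model="TransE")
entity_matrix = embeddings["ent_embeddings"]
print(len(entity_matrix), "entity vectors loaded")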
Example 10
    def explain(self):
        logger.info("Explaining clusters !")
        # self.clusters_explainer.prepare_data(self.current_itr.entity_clusters_triples)
        # clusters = self.current_itr.entity_clusters_triples.get_uniq_labels()
        # explanations_dict = self.clusters_explainer.explain(clusters)
        clusters2explanations_dict = self.clusters_explainer.explain(self.current_itr.entity_clusters_triples)

        if self.save_steps:
            output_file = os.path.join(self.get_current_itr_directory(), 'explanations.txt')
            dump_explanations_to_file(clusters2explanations_dict, output_file)

        logger.info("Done Explaining clusters!")
        return clusters2explanations_dict
Example 11
    def initialize(self):
        """
        Load the base model, or train a new one if it does not exist.

        The base model is loaded into both the curr_model and base_model variables.

        :return: none
        """

        # flag_file = os.path.join(self._get_current_model_folder(), 'model.vec.tf.index')
        embedding_model_file = self.get_current_model_filepath()
        if not self.is_trained():
            logger.warning("File %s does not exist!" % embedding_model_file)
            logger.warning("Training the model will start from scratch!")
            self.base_model = self.train(self.kg_triples)
            logger.warning("Done training the model will start from scratch!")

        else:
            logger.info('Loading Model from %s' %
                        self.get_current_model_filepath())
            # loaded_model =
            # if not self.base_model:
            self.base_model = self.load_from_file()
            logger.info('Done Loading Model!')

            logger.info('Copying model to new folder: %s' % self.embedding_folder)
            copytree(self.get_current_model_folder(),
                     os.path.join(self.embedding_folder, 'base'))
            logger.info('Done copying model to new folder: %s' %
                        self.embedding_folder)

        self.curr_model = self.base_model
Example 12
    def get_entities_embedding(self, entities):
        entities_ids = self.kg_encoder.conv_entities2ids(entities)
        logger.info("Get entities ids!")
        # with open('/GW/D5data-11/gadelrab/ExDEC/results/yago_old/entities_encoded.tsv', 'w') as tmp_file:
        #     for e,i in zip(entities,entities_ids):
        #         tmp_file.write('%s\t%s\n'%(e,i))

        entities_vectors = self.curr_model['ent_embeddings']
        target_embedding_vec = np.array(
            [entities_vectors[i] for i in entities_ids])
        # logger.info("Data type ... "+str(target_embedding_vec.dtype)+" "+str(target_embedding_vec.shape))
        # np.savetxt('/GW/D5data-11/gadelrab/ExDEC/results/yago_old/data_embedding.tsv', target_embedding_vec,
        # delimiter="\t")
        print("embedding shape from adapter " +
              str(target_embedding_vec.shape))
        return target_embedding_vec
Example 13
    def construct_feedback(self):
        logger.info("Construct Feedback Triples!")

        output_file = None
        if self.save_steps:
            output_file = os.path.join(self.get_current_itr_directory(), 'feedback_triples.tsv')

        triples = self.augmentation_strategy.get_augmentation_triples(
            descriptions=self.current_itr.clusters_explanations_dict,
            target_entities=self.current_itr.entity_clusters_triples,
            output_file=output_file,
            iter_num=self.current_itr.id)
        
        if self.save_steps:
            write_triples(triples, output_file)

        logger.info("Done Constructing Feedback Triples!")
        return triples
Example 14
def plot_iterations(results_list, output_file=None):
    """
    Plot the clustering and explanation quality results over the iterations.

    :param results_list:
    :param output_file:
    :return:
    """
    markers = ['.', '*', 'x', 's', 'p', '+', 'h', 'o']
    plt.rcParams.update({'font.size': 16})

    if len(results_list) == 0:
        print("NO Results skip plotting!")
        return
    df = pd.DataFrame(results_list)
    plt.clf()
    ax = plt.gca()

    names = {'x_coverage': 'Exc', 'wr_acc': 'WRA'}

    for met, mar in zip(metrics_to_plot, markers):
        lb = met.replace('@1', '')
        lb = names[lb] if lb in names else lb

        if met in df.columns:
            df.plot(kind='line',
                    x='itr_num',
                    y=met,
                    ax=ax,
                    marker=mar,
                    label=lb)
    plt.xlabel(None)
    plt.grid(b=True, axis='y')
    # plt.legend(loc=9)
    plt.yticks([a / 10 for a in range(0, 11)])
    plt.xticks(range(0, 10))
    plt.ylim(0, 1.05)
    plt.xlim(0, 9.5)
    if output_file:
        logger.info("Saving plot to %s" % output_file)
        plt.tight_layout()
        plt.savefig(output_file)
    else:
        plt.show()
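A usage sketch for plot_iterations, assuming the function, its logger, and the module-level metrics_to_plot list are in scope; the metric names and values below are illustrative and must appear in metrics_to_plot to be drawn.

# Hypothetical per-iteration results.
results = [
    {"itr_num": 0, "ACC": 0.52, "x_coverage": 0.31},
    {"itr_num": 1, "ACC": 0.61, "x_coverage": 0.38},
    {"itr_num": 2, "ACC": 0.66, "x_coverage": 0.41},
]
plot_iterations(results, output_file="iterations_quality.pdf")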
Example 15
    def index_triples(self,
                      triples_source: TriplesSource,
                      prefix='',
                      safe_urls=False,
                      drop_old=False):
        if drop_old:
            logger.info("Drop %s " % self.identifier)
            self.drop()

        if self.store != 'SPARQLUpdateStore' and not self.graph:
            self.graph = Graph(store=self.store, identifier=self.identifier)
            # print(self.graph.store)

        # if self.store == 'SPARQLUpdateStore':
        #     self.graph.open(self.endpoint)

        # self._index(triples_source, prefix, safe_urls)
        self._index_np(triples_source)  # , prefix, safe_urls)
        return self.graph
Example 16
    def cluster(self):
        logger.info("Start clustering")
        entity_vectors = self.current_itr.target_entities_embeddings
        logger.debug(entity_vectors)
        logger.info("size of the data " + str(entity_vectors.shape))

        y_pred = self.clustering_method.cluster(entity_vectors, clustering_params=self.clustering_params,
                                                output_folder=self.get_current_itr_directory())
        triples = EntityLabelsToTriples(np.column_stack((self.target_entities.get_entities(), y_pred.reshape(-1, 1))),
                                        iter_id=self.current_itr.id)

        if self.save_steps:
            output_file = os.path.join(self.get_current_itr_directory(), 'clustering.tsv')
            output_vecs_file = os.path.join(self.get_current_itr_directory(), 'embeddings_vecs.tsv')
            write_triples(triples, output_file)
            write_vectors(entity_vectors, output_vecs_file)
            output_labels_file = os.path.join(self.get_current_itr_directory(), 'clustering_labels_only.tsv')
            write_vectors(y_pred.reshape(-1, 1), output_labels_file)

        return triples
Example 17
    def _prepare_training_data(self, triples, overwrite=False):
        logger.info("Prepare embedding training data!")

        triples_filepath = os.path.join(self._get_data_folder(),
                                        'triple2id.txt')
        if overwrite or not os.path.exists(triples_filepath):
            self.kg_encoder.strings2ids(triples, triples_filepath)

        training_data_filepath = os.path.join(self._get_data_folder(),
                                              'train2id.txt')
        if overwrite or not os.path.exists(training_data_filepath):
            convert_ere2eer(triples_filepath, training_data_filepath)

        self.kg_encoder.dump_dicts(self._get_data_folder())

        # files required by openKE
        create_dummy_valid_test_files(self._get_data_folder())
        generate_cosntraints_openKE(self._get_data_folder())

        logger.info("Done preparing training data!")
Example 18
def write_triples(triples_iteratable, output_filename, surround=False, unquote_urls=True):
    """
    Exports triples from a source to a file as TSV.

    :param unquote_urls:
    :param triples_iteratable:
    :param output_filename:
    :param surround:
    :return:
    """
    logger.info("Writing triples to %s" % output_filename)
    with codecs.open(output_filename, "w") as output_file:
        for t in triples_iteratable:

            if unquote_urls:
                t = map(unquote, t)
            if surround:
                t = map(add_arrows, t)
            output_file.write('\t'.join(t))
            output_file.write('\n')
    logger.info("Done writing triples to %s" % output_filename)
Example 19
    def __init__(self,
                 filepath,
                 column_offset=0,
                 prefix='',
                 safe_urls=False,
                 delimiter=r'\s+'):
        super(FileTriplesSource, self).__init__(name=filepath)
        self.prefix = prefix
        self.safe_urls = safe_urls
        self.column_offset = column_offset
        self.filepath = filepath

        s = time.process_time()
        self.data = pd.read_csv(self.filepath,
                                header=None,
                                delimiter=delimiter,
                                dtype=str).values
        self.data = self.data[:, :3]
        en = time.process_time()
        logger.info("Done loading data! size: %s  time: %f s" %
                    (str(self.data.shape), (en - s)))
        valid_id = np.vectorize(data_formating.valid_id)
        # invalid_rows=np.where( np.bitwise_not(valid_id(self.data[:])) )[0]
        # print(invalid_rows)
        self.data = np.delete(
            self.data,
            np.unique(np.where(np.bitwise_not(valid_id(self.data[:])))[0]),
            axis=0)
        if safe_urls or prefix:
            self.data = parallel_apply_along_axis(format_triple,
                                                  1,
                                                  self.data,
                                                  prefix=self.prefix,
                                                  quote_it=self.safe_urls)
        logger.info(
            "Done fixing data formatting and filtering! size: %s  time: %f s" %
            (str(self.data.shape), (time.process_time() - en)))
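A usage sketch for FileTriplesSource, assuming the class and its dependencies are importable and that a whitespace-separated triples file exists at the (illustrative) path below.

# Hypothetical input file with one triple per line.
source = FileTriplesSource("data/kg_triples.tsv",
                           column_offset=0,
                           prefix="http://example.org/",
                           safe_urls=True)
print("loaded triples:", source.data.shape)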
Example 20
    def explain(self, clusters=None, in_file=None, output_file=None):
        logger.info("Explaining using Dedalo for " + in_file)

        for cl in clusters:
            for explanation in ddl.explain(self.kg_hdt_file,
                                           in_file,
                                           minimum_score=0.2,
                                           groupid=cl):
                logger.info(explanation)
        logger.info("Done Explaining!")
Example 21
    def adapt(self, new_feedback_triples):
        super().adapt(new_feedback_triples)
        logger.info("Retrain model and save to %s" %
                    self.get_current_model_folder())
        update_data = self.prepare_adaptation_data()
        logger.info("Update Data: %s  size: %i" %
                    (update_data.get_name(), update_data.size()))
        is_update = self.update_mode != UpdateMode.RETRAIN
        self.curr_model = self.train(update_data, is_update=is_update)
        logger.info("Done retraining model and saving to %s!" %
                    self.get_current_model_folder())
Example 22
    def _index_np(self, triples_source, prefix='', safe_urls=False):
        logger.info("Start indexing " + triples_source.get_name())

        data = triples_source.as_numpy_array()
        data_size = triples_source.size()

        number_splits = math.ceil(data_size / self.batch_size)
        logger.info("data size %i" % data_size)
        logger.info("chunks %i" % number_splits)

        # ch=0
        chunks = np.array_split(data, number_splits)
        for chunk in tqdm(chunks):
            if self.store == 'SPARQLUpdateStore':
                self.insert_sparql(chunk)
            else:
                self.insert_memory(chunk)

        logger.info("Done indexing " + triples_source.get_name())
Example 23
    def train(self, triples, is_update=False):
        logger.warning("Training may take long time!")
        training_array = self._prepare_training_data(triples)
        logger.info("Start Training!")

        if not is_update:
            logger.info("Fitting from scratch!")
            trained_model = self._get_model(is_update=False)
        else:
            logger.info("Continuous training!")
            trained_model = self._get_model(is_update=True)

            if self.update_mode == UpdateMode.ADAPT_RESTART:
                trained_model.copy_old_model_params(self.base_model)
            elif self.update_mode == UpdateMode.ADAPT_PROGRESSIVE:
                trained_model.copy_old_model_params(self.curr_model)

        trained_model.fit(training_array, continue_training=is_update)
        save_model(trained_model,
                   model_name_path=self.get_current_model_filepath())
        logger.info("Done Training model!")
        return trained_model
Example 24
    def strings2ids(self, triples_source, output_filename=None):
        logger.info("Encode triples and save to %s" % output_filename)
        with codecs.open(output_filename, "w") as output_file:
            # triples_as_list = list(triples_source.triples())
            # output_file.write(str(len(triples_as_list)) + '\n')
            output_file.write(str(triples_source.size()) + '\n')
            logger.info("Encoding %i triples!" % triples_source.size())
            for t in triples_source:
                encoded_t = (self._entity2id(t[0]), self._relation2id(t[1]),
                             self._entity2id(t[2]))
                output_file.write('\t'.join([str(c)
                                             for c in encoded_t]) + '\n')
        logger.info("Done encoding triples!")
        return output_filename
Example 25
def maximum_assignments(y_true_str, y_pred, show_confusion_matrix):
    w = consruct_confusion_matrix(y_true_str, y_pred)
    row_ind, col_ind = linear_sum_assignment(w.max() - w)
    max_cells = list(zip(row_ind, col_ind))
    max_cells.sort(key=lambda tup: tup[1])

    maximum_values = np.array([w[c[0], c[1]] for c in max_cells])

    gt_groups_sizes = np.sum(w, axis=0)
    pred_groups_sizes = np.sum(w, axis=1)
    if show_confusion_matrix:
        logger.info('Confusion Matrix \n %r' % w)
        logger.info('Maximum Cells %r' % max_cells)
        logger.info('Maximum Values %r' % maximum_values)

        # print("Max values %r" % maximum_values)
        # print("group sizes %r" % gt_groups_sizes)
        # print("Predicted cls sizes %r" % pred_groups_sizes)
    return maximum_values, gt_groups_sizes, pred_groups_sizes
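The function above matches predicted clusters to ground-truth labels with the Hungarian algorithm. A self-contained sketch of the same idea, using sklearn's contingency_matrix in place of the project's consruct_confusion_matrix helper; the labels are illustrative.

import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics.cluster import contingency_matrix

y_true = ["person", "person", "city", "city", "film"]
y_pred = [0, 0, 1, 1, 1]

# Rows: true labels, columns: predicted clusters.
w = contingency_matrix(y_true, y_pred)
# Maximize the matched counts by minimizing (max - w).
row_ind, col_ind = linear_sum_assignment(w.max() - w)
matched = w[row_ind, col_ind].sum()
print("cluster accuracy:", matched / len(y_true))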
Example 26
                out_file.write('\n')

    with open(out_file_parsable + ('.%s' % quality if len(quality) > 0 else ''), 'w') as out_file:
        out_file.write('\n'.join(
            map(str, chain.from_iterable(map(lambda l: l[:topk] if topk > 0 else l, per_var_predictions.values())))))

    return out_filepath_with_type


if __name__ == '__main__':
    vos_executer = EndPointKGQueryInterface('http://halimede:8890/sparql',
                                            ['http://yago-expr.org', 'http://yago-expr.org.types'])
    descriptions = load_from_file(
        '/scratch/GW/pool0/gadelrab/ExDEC/results/baseline_data/imdb/TransE/ADAPT_PROGRESSIVE/ASSIGNMENTS_SUBGRAPH/x_coverage_Kmeans_direct_h3_correlation/run_21022020_170650/steps/itr_0/explanations.txt.parsable')

    logger.info("Descriptions #: %i", (len(descriptions)))

    ded = SparqlBasedDeductionEngine(vos_executer)
    per_var_predictions = ded.infer(descriptions)

    # print(per_var_predictions)

    # print(per_var_predictions.items()[:10])

    logger.info("Total variables with predictions subjects: %i", len(per_var_predictions))

    # labelsGraph = Graph(store='SPARQLStore', identifier='http://yago-labels-encoded.org')
    # labelsGraph.open('http://badr:8890/sparql')
    #
    # gt_facts = map(
    #     lambda t: Prediction((str(t[0]), str(t[1]), str(t[2])),
Example 27
def restore_model(model_name_path=None,
                  module_name="ampligraph.latent_features"):
    """Restore a saved model from disk.

        See also :meth:`save_model`.

        Parameters
        ----------
        model_name_path: string
            The name of saved model to be restored. If not specified,
            the library will try to find the default model in the working directory.

        Returns
        -------
        model: EmbeddingModel
            the neural knowledge graph embedding model restored from disk.

    """
    if model_name_path is None:
        logger.warning("There is no model name specified. \
                        We will try to lookup \
                        the latest default saved model...")
        default_models = glob.glob("*.model.pkl")
        if len(default_models) == 0:
            raise Exception("No default model found. Please specify \
                             model_name_path...")
        else:
            model_name_path = default_models[-1]
            logger.info("Will load the model: {0} in your \
                         current dir...".format(model_name_path))

    model = None
    logger.info('Will load model {}.'.format(model_name_path))

    try:
        with open(model_name_path, 'rb') as fr:
            restored_obj = pickle.load(fr)

        logger.debug('Restoring model ...')
        module = importlib.import_module(module_name)
        class_ = getattr(module,
                         restored_obj['class_name'].replace('Continue', ''))
        model = class_(**restored_obj['hyperparams'])
        model.is_fitted = restored_obj['is_fitted']
        model.ent_to_idx = restored_obj['ent_to_idx']
        model.rel_to_idx = restored_obj['rel_to_idx']

        try:
            model.is_calibrated = restored_obj['is_calibrated']
        except KeyError:
            model.is_calibrated = False

        model.restore_model_params(restored_obj)
    except pickle.UnpicklingError as e:
        msg = 'Error unpickling model {} : {}.'.format(model_name_path, e)
        logger.debug(msg)
        raise Exception(msg)
    except (IOError, FileNotFoundError):
        msg = 'No model found: {}.'.format(model_name_path)
        logger.debug(msg)
        raise FileNotFoundError(msg)

    return model
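A usage sketch for restore_model, assuming an AmpliGraph-style pickle produced by the matching save_model call; the file name is illustrative.

# Hypothetical path to a model pickled with save_model(...).
model = restore_model(model_name_path="my_kg_transe.model.pkl")
print("fitted:", model.is_fitted, "entities:", len(model.ent_to_idx))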
Example 28
    def fit(self,
            X,
            early_stopping=False,
            early_stopping_params={},
            continue_training=False):
        """Train an EmbeddingModel (with optional early stopping).

                The model is trained on a training set X using the training protocol
                described in :cite:`trouillon2016complex`.

                Parameters
                ----------
                X : ndarray (shape [n, 3]) or object of AmpligraphDatasetAdapter
                    Numpy array of training triples OR handle of Dataset adapter which would help retrieve data.
                early_stopping: bool
                    Flag to enable early stopping (default:``False``)
                early_stopping_params: dictionary
                    Dictionary of hyperparameters for the early stopping heuristics.

                    The following string keys are supported:

                        - **'x_valid'**: ndarray (shape [n, 3]) or object of AmpligraphDatasetAdapter :
                                         Numpy array of validation triples OR handle of Dataset adapter which
                                         would help retrieve data.
                        - **'criteria'**: string : criteria for early stopping 'hits10', 'hits3', 'hits1' or 'mrr'(default).
                        - **'x_filter'**: ndarray, shape [n, 3] : Positive triples to use as filter if a 'filtered' early
                                          stopping criteria is desired (i.e. filtered-MRR if 'criteria':'mrr').
                                          Note this will affect training time (no filter by default).
                                          If the filter has already been set in the adapter, pass True
                        - **'burn_in'**: int : Number of epochs to pass before kicking in early stopping (default: 100).
                        - **'check_interval'**: int : Early stopping interval after burn-in (default: 10).
                        - **'stop_interval'**: int : Stop if criteria is performing worse over n consecutive checks (default: 3)
                        - **'corruption_entities'**: List of entities to be used for corruptions. If 'all',
                          it uses all entities (default: 'all')
                        - **'corrupt_side'**: Specifies which side to corrupt. 's', 'o', 's+o' (default)

                        Example: ``early_stopping_params={x_valid=X['valid'], 'criteria': 'mrr'}``

                """
        self.train_dataset_handle = None
        # try-except block is mainly to handle clean up in case of exception or manual stop in jupyter notebook

        # TODO change 0: Update the mapping if there are new entities.
        if continue_training:
            self.update_mapping(X)

        try:
            if isinstance(X, np.ndarray):
                # Adapt the numpy data in the internal format - to generalize
                self.train_dataset_handle = NumpyDatasetAdapter()
                self.train_dataset_handle.set_data(X, "train")
            elif isinstance(X, AmpligraphDatasetAdapter):
                self.train_dataset_handle = X
            else:
                msg = 'Invalid type for input X. Expected ndarray/AmpligraphDataset object, got {}'.format(
                    type(X))
                logger.error(msg)
                raise ValueError(msg)

            # create internal IDs mappings
            # TODO Change 1: first change to reuse the existing mappings rel_to_idx and ent_to_idx
            if not continue_training:
                self.rel_to_idx, self.ent_to_idx = self.train_dataset_handle.generate_mappings(
                )
            else:
                self.train_dataset_handle.use_mappings(self.rel_to_idx,
                                                       self.ent_to_idx)

            prefetch_batches = 1

            if len(self.ent_to_idx) > ENTITY_THRESHOLD:
                self.dealing_with_large_graphs = True

                logger.warning(
                    'Your graph has a large number of distinct entities. '
                    'Found {} distinct entities'.format(len(self.ent_to_idx)))

                logger.warning(
                    'Changing the variable initialization strategy.')
                logger.warning(
                    'Changing the strategy to use lazy loading of variables...'
                )

                if early_stopping:
                    raise Exception(
                        'Early stopping not supported for large graphs')

                if not isinstance(self.optimizer, SGDOptimizer):
                    raise Exception(
                        "This mode works well only with SGD optimizer with decay (read docs for details).\
         Kindly change the optimizer and restart the experiment")

            if self.dealing_with_large_graphs:
                prefetch_batches = 0
                # CPU matrix of embeddings
                # TODO Change 2.1: do not initialize if continuing training
                if not continue_training:
                    self.ent_emb_cpu = self.initializer.get_np_initializer(
                        len(self.ent_to_idx), self.internal_k)

            self.train_dataset_handle.map_data()

            # This is useful when we re-fit the same model (e.g. retraining in model selection)
            if self.is_fitted:
                tf.reset_default_graph()
                self.rnd = check_random_state(self.seed)
                tf.random.set_random_seed(self.seed)

            self.sess_train = tf.Session(config=self.tf_config)

            #  change 2.2 : Do not change batch size with new training data, just use the old (for large KGs)
            # if not continue_training:
            batch_size = int(
                np.ceil(
                    self.train_dataset_handle.get_size("train") /
                    self.batches_count))
            # else:
            #     batch_size = self.batch_size

            logger.info("Batch Size: %i" % batch_size)
            # dataset = tf.data.Dataset.from_tensor_slices(X).repeat().batch(batch_size).prefetch(2)

            if len(self.ent_to_idx) > ENTITY_THRESHOLD:
                logger.warning(
                    'Only {} embeddings would be loaded in memory per batch...'
                    .format(batch_size * 2))

            self.batch_size = batch_size

            # TODO change 3: load model from trained params if continue instead of re_initialize the ent_emb and rel_emb
            if not continue_training:
                self._initialize_parameters()
            else:
                self._load_model_from_trained_params()

            dataset = tf.data.Dataset.from_generator(
                self._training_data_generator,
                output_types=(tf.int32, tf.int32, tf.float32),
                output_shapes=((None, 3), (None, 1), (None, self.internal_k)))

            dataset = dataset.repeat().prefetch(prefetch_batches)

            dataset_iterator = tf.data.make_one_shot_iterator(dataset)
            # init tf graph/dataflow for training
            # init variables (model parameters to be learned - i.e. the embeddings)

            if self.loss.get_state('require_same_size_pos_neg'):
                batch_size = batch_size * self.eta

            loss = self._get_model_loss(dataset_iterator)

            train = self.optimizer.minimize(loss)

            # Entity embeddings normalization
            normalize_ent_emb_op = self.ent_emb.assign(
                tf.clip_by_norm(self.ent_emb, clip_norm=1, axes=1))

            self.early_stopping_params = early_stopping_params

            # early stopping
            if early_stopping:
                self._initialize_early_stopping()

            self.sess_train.run(tf.tables_initializer())
            self.sess_train.run(tf.global_variables_initializer())
            try:
                self.sess_train.run(self.set_training_true)
            except AttributeError:
                pass

            normalize_rel_emb_op = self.rel_emb.assign(
                tf.clip_by_norm(self.rel_emb, clip_norm=1, axes=1))

            if self.embedding_model_params.get(
                    'normalize_ent_emb',
                    constants.DEFAULT_NORMALIZE_EMBEDDINGS):
                self.sess_train.run(normalize_rel_emb_op)
                self.sess_train.run(normalize_ent_emb_op)

            epoch_iterator_with_progress = tqdm(range(1, self.epochs + 1),
                                                disable=(not self.verbose),
                                                unit='epoch')

            # print("before epochs!")
            # print(self.sess_train.run(self.ent_emb))
            # print(self.sess_train.run(self.rel_emb))

            for epoch in epoch_iterator_with_progress:
                losses = []
                for batch in range(1, self.batches_count + 1):
                    feed_dict = {}
                    self.optimizer.update_feed_dict(feed_dict, batch, epoch)
                    if self.dealing_with_large_graphs:
                        loss_batch, unique_entities, _ = self.sess_train.run(
                            [loss, self.unique_entities, train],
                            feed_dict=feed_dict)
                        self.ent_emb_cpu[np.squeeze(unique_entities), :] = \
                            self.sess_train.run(self.ent_emb)[:unique_entities.shape[0], :]
                    else:
                        loss_batch, _ = self.sess_train.run(
                            [loss, train], feed_dict=feed_dict)

                    if np.isnan(loss_batch) or np.isinf(loss_batch):
                        msg = 'Loss is {}. Please change the hyperparameters.'.format(
                            loss_batch)
                        logger.error(msg)
                        raise ValueError(msg)

                    losses.append(loss_batch)
                    if self.embedding_model_params.get(
                            'normalize_ent_emb',
                            constants.DEFAULT_NORMALIZE_EMBEDDINGS):
                        self.sess_train.run(normalize_ent_emb_op)

                if self.verbose:
                    msg = 'Average Loss: {:10f}'.format(
                        sum(losses) / (batch_size * self.batches_count))
                    if early_stopping and self.early_stopping_best_value is not None:
                        msg += ' — Best validation ({}): {:5f}'.format(
                            self.early_stopping_criteria,
                            self.early_stopping_best_value)

                    logger.debug(msg)
                    epoch_iterator_with_progress.set_description(msg)

                if early_stopping:

                    try:
                        self.sess_train.run(self.set_training_false)
                    except AttributeError:
                        pass

                    if self._perform_early_stopping_test(epoch):
                        self._end_training()
                        return

                    try:
                        self.sess_train.run(self.set_training_true)
                    except AttributeError:
                        pass

            self._save_trained_params()
            self._end_training()
        except BaseException as e:
            self._end_training()
            raise e
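A hedged usage sketch of the fit variant above, assuming 'model' is an instance of the continue-training subclass of an AmpliGraph 1.x embedding model (constructed elsewhere); the triples are illustrative.

import numpy as np

# Initial training on the base KG.
X_base = np.array([["a", "likes", "b"],
                   ["b", "locatedIn", "c"],
                   ["a", "locatedIn", "c"]])
model.fit(X_base)

# Later: adapt the same model with new feedback triples, reusing the existing
# ent_to_idx / rel_to_idx mappings instead of regenerating them.
X_new = np.array([["a", "memberOf", "d"]])
model.fit(X_new, continue_training=True)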
Example 29
    def update_embedding(self):
        logger.info("Start Updating embedding stage")
        self.embedding.adapt(self.current_itr.augmentation_triples)
        logger.info("Done Updating embedding stage")
Example 30
    def end_process(self):
        super().end_process()
        logger.info("Cleaning explanations temp graphs")
        self.clusters_explainer.clear_data()
        logger.info("Done cleaning explanations temp graphs!")