Example #1
    def print_eval_results(self, eval_results=None, specs=None, to_csv=False):
        '''
        Print the results produced by the evaluate method, optionally
        appending them to the CSV file at self._output_path.
        '''
        if eval_results is None: eval_results = self._last_eval_results
        L().log.info(
            "\n-----------------------------------\n  Results of evaluation \n-----------------------------------"
        )

        # 1. CSV Writer
        if to_csv:

            mode = 'w'
            if self._append_csv: mode = 'a'

            with open(self._output_path, mode) as csvfile:
                fieldnames = ["model_name"] + list(
                    self._settings_variables.keys()) + list(self._metrics)
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                if mode == "w": writer.writeheader()
                result = dict(self._settings_variables)  # copy: rows add model/metric keys

                for model_name in eval_results:
                    result["model_name"] = model_name
                    for metric in eval_results[model_name]:
                        result[metric] = eval_results[model_name][metric]
                    writer.writerow(result)

        for model_name in eval_results:
            L().log.info("\n-----> Model name: \t\t%s" % str(model_name))
            for metric in eval_results[model_name]:
                metric_result = eval_results[model_name][metric]
                L().log.info(
                    "%s: \t\t%s" %
                    (self._readable_metric(metric), str(metric_result)))
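
# --- Illustrative sketch (not part of the original example) ---
# Minimal stand-alone demo of the CSV pattern used above: write the header
# only when the file is opened in write mode, then one row per model.
# The file name and fields are hypothetical.
import csv

rows = [{"model_name": "m1", "runtime": 1.2}]
mode = "w"  # 'a' would append without rewriting the header
with open("demo_eval.csv", mode, newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=["model_name", "runtime"])
    if mode == "w":
        writer.writeheader()
    for row in rows:
        writer.writerow(row)
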
    def _log_cpds_emph_given(self, leaves):
        L().log.info(
            "---------------------------------------------------------------------------------------------------------------------------------------------------"
        )
        L().log.info("     New CPDs")
        L().log.info(
            "---------------------------------------------------------------------------------------------------------------------------------------------------"
        )
        for n in self.tbn.Vdata:
            if n.startswith("dL_"): continue
            if n not in leaves:
                # ignore non-leaves: paths are only valid if they end at a leaf
                continue
            if isinstance(self.tbn.Vdata[n]["cprob"], dict):
                L().log.info("\n\n")

                for k in self.tbn.Vdata[n]["cprob"]:
                    L().log.info(
                        "\n\n\t------- Case: %s = %s \n\t\t\tVals: %s------- \n\t\t\tConditions:"
                        % (n, str(self.tbn.Vdata[n]["cprob"][k]),
                           self.tbn.Vdata[n]["vals"]))
                    con = eval(k)  # keys are string-encoded tuples; eval assumes trusted model data
                    remember = [
                        (n,
                         str(
                             list(
                                 np.array(self.tbn.Vdata[n]["vals"])[
                                     self.tbn.Vdata[n]["cprob"][k] != 0])))
                    ]
                    tmp = dict()
                    for i in range(len(self.tbn.Vdata[n]["parents"])):
                        if con[i] == "Never": continue
                        tmp[self.tbn.Vdata[n]["parents"][i]] = con[i]
                        if not self.tbn.Vdata[n]["parents"][i].endswith("_0"):
                            remember += [(self.tbn.Vdata[n]["parents"][i],
                                          con[i])]
                            continue
                        L().log.info("\t\t%s = %s" %
                                     (self.tbn.Vdata[n]["parents"][i], con[i]))
                    remember.sort()
                    L().log.info("\n\t\t\tWhat happened:")
                    for r in remember:
                        prev_tv = "_".join(r[0].split("_")[:-1] +
                                           [str(int(r[0].split("_")[-1]) - 1)])
                        if prev_tv[0] == "_": prev_tv = prev_tv[1:]
                        comes_from = tmp[prev_tv]

                        L().log.info("\t\t%s = %s (prev: %s)" %
                                     (r[0], r[1], comes_from))
            else:
                L().log.info("\n\n%s = %s" %
                             (n, str(self.tbn.Vdata[n]["cprob"])))
        L().log.info("\n\n")
    def _set_uniform_prior(self):
        if self._first_iteration:
            L().log.debug("Set priors: ")
            for n in self.tbn.Vdata:
                if n.startswith("dL_"): continue
                if isinstance(self.tbn.Vdata[n]["cprob"], dict):
                    for k in self.tbn.Vdata[n]["cprob"]:
                        num_states = len(self.tbn.Vdata[n]["cprob"][k])
                        self.tbn.Vdata[n]["cprob"][k] = np.full(num_states, 1.0 / num_states)
                        L().log.debug("%s | %s = %s" % (n, k, str(self.tbn.Vdata[n]["cprob"][k])))
                else:
                    num_states = len(self.tbn.Vdata[n]["cprob"])
                    self.tbn.Vdata[n]["cprob"] = np.full(num_states, 1.0 / num_states)
                    L().log.debug("%s = %s" % (n, str(self.tbn.Vdata[n]["cprob"])))
            self._first_iteration = False
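
# --- Illustrative sketch (not part of the original example) ---
# What the uniform-prior reset above does to a single node, assuming each
# "cprob" entry is a 1-D numpy array of state probabilities.
import numpy as np

cprob = {"['A']": np.array([0.9, 0.1]), "['B']": np.array([0.2, 0.8])}
for key, dist in cprob.items():
    cprob[key] = np.full(len(dist), 1.0 / len(dist))
print(cprob)  # both entries become [0.5, 0.5]
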
Example #4
    def discover_structure_from_statistics(self, data, nodes):
        """
        Implements the PC algorithm.
        :param nodes: all signal_occurrence values that are in the data set
        :param data: ADtree or pandas dataframe that contains the dataset counts
        :return: list of edges
        """
        skeleton, sep_set = self.estimate_skeleton(data, nodes)
        pdag = self.estimate_cpdag(skeleton, sep_set)

        # orient remaining undirected edges according to occurrence number
        for scc in nx.strongly_connected_components(pdag):
            if len(scc) == 1:
                continue
            scc_nodes = sorted(scc, key=lambda node: int(node.rsplit('_')[-1]))
            for (parent, child) in combinations(scc_nodes, 2):
                if int(parent.rsplit('_')[-1]) <= int(
                        child.rsplit('_')[-1]) and (child,
                                                    parent) in pdag.edges:
                    pdag.remove_edge(child, parent)

        edges = [list(edge) for edge in pdag.edges]
        L().log.debug('Edges: ' + str(edges))
        return edges
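
# --- Illustrative sketch (not part of the original example) ---
# The tie-breaking step above on a toy PDAG: an undirected edge is encoded
# as a 2-cycle, and only the edge pointing from the lower occurrence index
# ("A_0") to the higher one ("A_1") survives.
import networkx as nx
from itertools import combinations

pdag = nx.DiGraph([("A_0", "A_1"), ("A_1", "A_0")])
for scc in list(nx.strongly_connected_components(pdag)):
    if len(scc) == 1:
        continue
    scc_nodes = sorted(scc, key=lambda node: int(node.rsplit('_')[-1]))
    for parent, child in combinations(scc_nodes, 2):
        if (child, parent) in pdag.edges:
            pdag.remove_edge(child, parent)
print(list(pdag.edges))  # [('A_0', 'A_1')]
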
Example #5
def logging_setup(log_path, number_parallel):
    if number_parallel > 10:
        print(
            "You chose to run more than 10 processes in parallel. Be aware that your machine needs sufficient computational power for this; otherwise choose fewer parallel processes.\n"
        )

    print("Starting Experiments...")
    #sys.stderr = open(os.devnull, 'w') # disable broken pipe error
    time_str = strftime("%Y_%m_%d-%H_%M_%S", localtime())
    log_path = os.path.join(log_path, "logging_bay_" + time_str + ".log")

    FORMAT = '%(asctime)-15s %(message)s'
    logging.basicConfig(format=FORMAT, datefmt="%H:%M:%S  ")
    open(log_path, 'w').close()
    file_handler = logging.FileHandler(log_path)
    file_handler.setFormatter(logging.Formatter(FORMAT, "%H:%M:%S  "))

    L().log = logging.getLogger("tscbn_eval")
    L().log.addHandler(file_handler)
    L().log.setLevel(logging.INFO)
    L().log.parent.handlers = []
    L().log.info("Logger initialized...")
    def _log_cpds(self):
        L().log.info("---------------------------------------------------------------------------------------------------------------------------------------------------")
        L().log.info("     New CPDs")
        L().log.info("---------------------------------------------------------------------------------------------------------------------------------------------------")
        for n in self.tbn.Vdata:
            if n.startswith("dL_"): continue
            if isinstance(self.tbn.Vdata[n]["cprob"], dict):
                for k in self.tbn.Vdata[n]["cprob"]:
                    L().log.info("%s | %s = %s" % (n, k, str(self.tbn.Vdata[n]["cprob"][k])))
            else:
                L().log.info("%s = %s" % (n, str(self.tbn.Vdata[n]["cprob"])))
        L().log.info("\n\n")
    def new_iteration(self, first, _debug_time):
        ''' New iteration of the EM Algorithm '''
        self._em_iteration += 1

        L().log.info("\n\nHistogram Updates \n\t\t\t\tTV %s \n\t\t\t\tsequence: %s" % (self._tv_name, str(self._full_sequence)))
        L().log.info("Anzahl Knoten: "+ str(self._len_nodes))

        # Reset histograms
        if len(self._symbol_histograms) == 0: L().log.info("No histograms - as no ambiguity\n")
        for k in range(len(self._symbol_histograms)):
            if self._em_iteration > 1:
                # smooth and renormalise this symbol's histogram
                self._symbol_histograms[k] += self.histogram_smoothing
                self._symbol_histograms[k] /= np.sum(self._symbol_histograms[k])
            try:
                if isinstance(self._full_sequence[k][0], list): ll = self._full_sequence[k][0][0]
                else: ll = self._full_sequence[k][0]
                L().log.info("Symbol %s - distribution: %s" % (str(ll), self._symbol_histograms[k]))
            except Exception:
                L().log.error(traceback.format_exc())

        if _debug_time: L().log.info("Check Time out: \n%s" % str(self.delta_t_for_debug))
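
# --- Illustrative sketch (not part of the original example) ---
# The per-histogram update above in isolation: additive smoothing followed
# by renormalisation so the histogram is a distribution again.
import numpy as np

histogram = np.array([4.0, 0.0, 1.0])
smoothing = 0.1
histogram += smoothing            # avoid zero probabilities
histogram /= np.sum(histogram)    # renormalise to sum to 1
print(histogram)                  # approx. [0.774, 0.019, 0.208]
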
def run_vary_sample_number(number_TVs, parallel_processes, result_path, print_sequences, plot_model, print_true_distribution, estimators):

        if number_TVs not in [3, 5, 10]:
            print("No model is stored for this number_TV value. Please set number_TV to 3, 5 or 10.")
            return

        # Settings
        state_change_prob = 0.8
        pe_debug_mode = False
        cpd_smoothing = 0.1
        object_nr = number_TVs
        nodes_per_tv = 5
        states_per_tv = 4
        edges_per_tv = 2
        percentage_inter = 0.5
        per_object_gap = 0.5 # mean gap between two intra-nodes
        intra_gap_range = 0.1 # variance-like range: the gap is drawn from [per_object_gap, per_object_gap + intra_gap_range], e.g. [0.5, 0.6]
        t_variance_tscbn = 0.02 # variance of the resulting TSCBN (after parameter estimation)
        dbn_tolerance = 0.02 # DBN tolerance
        train_test_split = 0.9  # percentage of training data
        id = "_"+ "_".join([str(v) for v in [object_nr, nodes_per_tv, states_per_tv, edges_per_tv, percentage_inter, per_object_gap, intra_gap_range, t_variance_tscbn, dbn_tolerance, state_change_prob, train_test_split]])
        append_csv = False
        id_time = datetime.datetime.now().strftime("%I_%M%p_%d_%B_%Y_%H_%M_%S")
        out_path = os.path.join(result_path, r"evaluation_%s.csv" % id_time)

        # Iteration options
        grid_sample_sequences_from_tscbn =  [100, 200, 300, 400, 500, 1000, 2000, 3000, 4000, 5000, 7500, 10000, 15000] # below 1000 makes little sense with this much data, e.g. 228 samples here - with 100 sequences not even a fraction is visible
        grid_em_sampling_frequency = [1000]
        grid_em_iterations = [5]

        # init
        sg = StructureGenerator(test_type = 1)
        sg.add_base_structure_models([TSCBNStructureModel, DBNStructureModel, CTBNStructureModel])  #  TNBNStructureModel, DBNStructureModel])
        sg.reference_model = TSCBNStructureModel # this model is used to generate sample data

        # Load sequences
        sequences_in = json.load(open('store/sequences%s.txt' % id))
        in_seq_in = json.load(open('store/in_seq%s.txt' % id))

        first = True

        if print_sequences:
            k = 0
            for sequence in sequences_in:
                k += 1
                print(sequence)
                if k % 100 == 0:
                    r = input("To load more sequences type 'y' ")
                    if not r == "y": break

        # print True distribution of model
        if print_true_distribution:
            print("Actual distribution: ")
            with open('store/models%s.txt' % id, 'rb') as infile:
                real_models = dill.load(infile)
            act_tscbn = real_models[sg.reference_model.__name__]

            for n in act_tscbn.Vdata:
                try:
                    if isinstance(act_tscbn.Vdata[n]["cprob"], dict):
                        for k in act_tscbn.Vdata[n]["cprob"]:
                            print("%s | %s = %s" % (n, k, str(act_tscbn.Vdata[n]["cprob"][k])))
                    else:
                        print("%s = %s" % (n, str(act_tscbn.Vdata[n]["cprob"])))
                except KeyError:  # continuous nodes store their parameters in "hybcprob"
                    for k in act_tscbn.Vdata[n]["hybcprob"]:
                        print("%s | %s = mean: %s var: %s" % (n, k, str(act_tscbn.Vdata[n]["hybcprob"][k]["mean_base"]), str(act_tscbn.Vdata[n]["hybcprob"][k]["variance"])))
            print("\n\n")

        for estimator_id in estimators:
            for sample_sequences_from_tscbn in grid_sample_sequences_from_tscbn:
                for em_sampling_frequency in grid_em_sampling_frequency:
                    for em_iterations in grid_em_iterations:
                        print("\n-------------------------------\nDo: "+str(object_nr) +" "+ str(nodes_per_tv) +" "+ str(states_per_tv) +" "+ str(edges_per_tv) +" "+ str(percentage_inter) +" "+ str(per_object_gap) +" "+ str(t_variance_tscbn) +" "+ str(dbn_tolerance) +" "+ str(state_change_prob) +" "+ str(sample_sequences_from_tscbn) +" "+ str(em_sampling_frequency) +" "+ str(em_iterations))

                        # Load reference model
                        with open('store/models%s.txt' % id, 'rb') as infile:
                            real_models = dill.load(infile)
                        with open('store/specifications%s.txt' % id, 'rb') as infile:
                            specifications = dill.load(infile)
                        models = copy.deepcopy(real_models)
                        models["CTBNStructureModel"] = CTBNStructureModel()

                        # Parameter Estimation
                        pe = create_estimator(estimator_id)
                        ctbn_estimator = CTBNEstimator()
                        pe.original_tbn = copy.deepcopy(models[sg.reference_model.__name__])
                        original_tbn = copy.deepcopy(models[sg.reference_model.__name__])
                        if plot_model and first:
                            pe.original_tbn.draw("ext")
                            first = False

                        # Initialize Estimator and Evaluator
                        ev = ParameterEvaluator(append_csv)
                        append_csv = True  # append to the CSV from the second run onwards
                        ev.add_setting("estimator", str(estimator_id))
                        ev.add_setting("object_nr", object_nr)
                        ev.add_setting("nodes_per_tv", nodes_per_tv)
                        ev.add_setting("states_per_tv", states_per_tv)
                        ev.add_setting("edges_per_tv", edges_per_tv)
                        ev.add_setting("percentage_inter", percentage_inter)
                        ev.add_setting("per_tv_gap", per_object_gap)
                        ev.add_setting("tscbn_variance", t_variance_tscbn)
                        ev.add_setting("dbn_tolerance", dbn_tolerance)
                        ev.add_setting("sc_probability", state_change_prob)
                        ev.add_setting("sample_sequences_from_tscbn", sample_sequences_from_tscbn)
                        ev.add_setting("em_sampling_frequency", em_sampling_frequency)
                        ev.add_setting("em_iterations", em_iterations)
                        ev.set_output_path(out_path)
                        ev.rmse_tscb_variance = 0.1  # variance assumed per node - does not require parameter estimation
                        ev.rmse_mean_range = 0.2  # drift of mean will be within this range e.g. 0.1 means it will be drawn from correct +- drift*correct
                        ev.add_metric("runtime")
                        ev.add_metric("log-likelihood")
                        ev.add_metric("relative-entropy")
                        ev.add_metric("temp-log-likelihood")
                        pe.cpd_smoothing = cpd_smoothing
                        pe.sampling_frequency = em_sampling_frequency # sampling frequency for the MCMC simulation
                        pe.iteration_frequency = em_iterations # EM Iterations
                        pe.set_parallel_processes(parallel_processes)
                        evidence = {}  # evidence when sampling
                        sg.set_model_visualization(plot = False, console_out = False)
                        Printos.print_settings(sg, pe, ev, 1, train_test_split, sample_sequences_from_tscbn, evidence, [])

                        # --------------------------------------------------------------------------------------------
                        #       Run tests
                        # --------------------------------------------------------------------------------------------
                        L().log.info("------------------ Running Test ------------------" )
                        if not ev._append_csv: eval_result = ev.write_header(True)
                        sequences = sequences_in[:sample_sequences_from_tscbn + 1]
                        in_seq = in_seq_in[:sample_sequences_from_tscbn + 1]

                        # choose random train and test data; alias the import so it does not
                        # shadow the train_test_split fraction defined above
                        from sklearn.model_selection import train_test_split as sk_train_test_split
                        train_sequences, test_sequences, train_tscbn_sequences, test_tscbn_sequences = sk_train_test_split(sequences, in_seq, test_size=0.1, random_state=0)

                        # ----------------------------------------------------------------------------------------
                        #       ESTIMATE PARAMETERS
                        # ----------------------------------------------------------------------------------------
                        for m in list(set(models)):
                            print("\nEstimating: %s ---" % str(m))
                            L().log.info("Parameter Estimation %s..." %(str(m)))


                            if m == 'CTBNStructureModel':
                                models[m] = ctbn_estimator.estimateStructureAndParameter(train_sequences,original_tbn)
                                continue

                            # Clear Models
                            pe.tbn = copy.deepcopy(models[m])
                            pe.original_tbn = copy.deepcopy(models[m])
                            if m == sg.reference_model.__name__:
                                pe.tbn.clear_parameters() # copy model structure only

                            # Estimate Parameters
                            ping = time.perf_counter()  # time.clock() was removed in Python 3.8
                            pe.estimateParameter(train_sequences, m, pe_debug_mode, ev, pe.original_tbn) # computes KL divergence per run
                            models[m] = pe.tbn
                            models[m].parameter_execution_time = time.perf_counter() - ping # execution time
                            print("Finished: %s ---" % str(m))

                        # ----------------------------------------------------------------------------------------
                        #       EVALUATION
                        # ----------------------------------------------------------------------------------------
                        try:
                            eval_result = ev.evaluate(models, reference = pe._reference, test_sequences = test_sequences, test_tscbn_sequences = test_tscbn_sequences)
                            ev.print_eval_results(eval_results = eval_result, specs = specifications, to_csv = True)
                        except Exception as e:
                            print("Evaluation failed: %s" % e)
class Constant(object):
    LOCK = threading.Lock()
    LOCK2 = threading.Lock()
    JOE = L()
    Never = 0
    def get_next_symbol(self, parent_starts, parent_outcome, condition):
        '''
            Each symbol is followed by a distribution which is optimized,
            e.g. ABC would have dist[0] = [0,1,2] - depending on the number of outcomes:
                dist[0] = "number of A occurrences"
                dist[1] = "number of B occurrences"
                ...

            Then, for each next symbol:
                - draw the number of next symbols and the next symbol
                - while next symbols remain, return them
                - optimize the distributions, i.e. forbid outcomes that are no longer
                  possible: e.g. with 4 positions and AAB already drawn, C can occur
                  at most once instead of twice
                - counts of the draws can be kept at the same time, e.g. after
                  AABBCCC the next distribution is known: dist[0] = [0,1,0]
        '''
        self._overall_index += 1


        # ------------------------------------------------------------------------------------
        #   BORDER CASES
        # ------------------------------------------------------------------------------------
        is_border = self._return_border_case()
        if is_border:
            self._cur_seen += [(condition, self._last_symbol[0])]
            return self._last_symbol

        # ------------------------------------------------------------------------------------
        #   Normal Run
        # ------------------------------------------------------------------------------------
        if self._initial:
            # draw whole distribution
            good = False
            while not good:
                try:
                    self._whole_sequence = self._draw_whole_distribution()
                    good = True
                except Exception:
                    pass  # retry until a draw satisfies all constraints

            self._initial = False
            # first element was already passed as initial - thus can remove it here
            self._whole_sequence = self._whole_sequence[1:]


        self._last_symbol = self._whole_sequence[0]
        self._whole_sequence = self._whole_sequence[1:]

        if not self._satisfies_parent_conditions(parent_starts, parent_outcome):
            Constant.JOE.log.debug("INVALID SAMPLE %s" % str(self._full_sequence))
            return None

        self._cur_seen += [(condition, self._last_symbol[0])]
        return self._last_symbol




        # This is how the next symbol is fetched.
        # NOTE: the code below is unreachable - the method already returned above.

        # ------------------------------------------------------------------------------------
        #   Draw next until None left
        # ------------------------------------------------------------------------------------
        if self._next_symbols != 0: # still did not reach the last symbol
            self._next_symbols -= 1 # return until done
            self.number_nevers -= 1


            #L().log.debug("%s - I return: %s %s" % (str(self._id), self._next_symbols + 1, str(self._last_symbol)))
            # HERE check whether the returned sample is legit!
            if not self._satisfies_parent_conditions(parent_starts, parent_outcome):
                L().log.debug("_______________ RETRY HARD _______________ ")
                # try again - then return not possible
                if len(self.sequence) == 0:  # reached last element - but there is no next element - so not possible
                    return None
                # if this will not work - then return invalid sample
                self._last_symbol = self.sequence[0]
                self.sequence = self.sequence[1:]
                if self.number_nevers > 0:
                    self._draw_next_elements()
                if not self._satisfies_parent_conditions(parent_starts, parent_outcome):
                    return None

            self._cur_seen += [(condition, self._last_symbol[0])]
            return self._last_symbol

        else: # get next
            self._last_symbol = self.sequence[0]
            self.sequence = self.sequence[1:]

        # ------------------------------------------------------------------------------------
        #   Draw next elements
        # ------------------------------------------------------------------------------------
        if self.number_nevers > 0:
            self._draw_next_elements()
        #L().log.debug("%s - I return: %s %s" % (str(self._id), self._next_symbols + 1, str(self._last_symbol)))

        if not self._satisfies_parent_conditions(parent_starts, parent_outcome):
            L().log.debug("_______________ RETRY HARD _______________ ")
            # try again - then return not possible
            if len(self.sequence) == 0:  # reached last element - but there is no next element - so not possible
                return None
            # if this will not work - then return invalid sample
            self._last_symbol = self.sequence[0]
            self.sequence = self.sequence[1:]
            if self.number_nevers > 0:
                self._draw_next_elements()
            if not self._satisfies_parent_conditions(parent_starts, parent_outcome):
                return None


        return self._last_symbol
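
# --- Illustrative sketch (not part of the original example) ---
# The retry pattern used in get_next_symbol, in isolation: keep drawing
# until a draw succeeds, assuming the draw raises when its constraints
# cannot be satisfied. The draw function here is hypothetical.
import random

def draw_until_valid(draw, max_tries=100):
    for _ in range(max_tries):
        try:
            return draw()
        except ValueError:
            continue
    raise RuntimeError("no valid draw found")

sample = draw_until_valid(lambda: random.choice(["A", "B", "C"]))
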
Example #11
def run_structure_experiment(target_path, parameter_temp_nodes_experiment=False, parameter_signals_experiment=False,
                             comparison_experiment_temp_nodes=False, comparison_experiment_signals=False,
                             comparison_experiment_scp=False):
    # number of iterations per experiment
    iterations = 25
    # number of sequences per experiment
    sample_size = 5000

    # ----------------------------------------------------------------------------------------
    #      Structure Generator Setup
    # ----------------------------------------------------------------------------------------
    sg = StructureGenerator(test_type=TestStructureEnum.SPECIFICATION)
    sg.add_base_structure_models([TSCBNStructureModel])
    sg.reference_model = TSCBNStructureModel

    # TIME SETTINGS (fixed for all experiments)
    sg.set_temporal_range(min_per_object_gap=0.5, max_per_object_gap=1.0)
    sg.set_temporal_variance(0.001)
    sg.set_dbn_tolerance(0.1)

    # PROBABILITY SETTINGS (fixed for all experiments)
    sg.set_state_change_probability(min_probability=0.95, max_probability=0.95)

    # ----------------------------------------------------------------------------------------
    #      Experiment with different parameters of the SBTreeDiscoverer
    # ----------------------------------------------------------------------------------------
    if parameter_temp_nodes_experiment or parameter_signals_experiment:
        sd = SBTreeDiscoverer(min_out_degree=0.1, k_infrequent=0.1, approach='parent_graph', parallel=False)
        # filtering parameters fixed at 0.1
        # the parent-graph approach means exact score optimization (but not exhaustive)
        # structure optimization is not run in parallel

        for edges_per_object in [1, 3]:
            print('edges_per_object: ' + str(edges_per_object) + '...')
            L().log.info('edges_per_object: ' + str(edges_per_object) + '...')

            # EDGE SETTINGS
            sg.set_connection_ranges(min_edges_per_object=edges_per_object, max_edges_per_object=edges_per_object,
                                     min_percent_inter=1.0, max_percent_inter=1.0)

            if parameter_temp_nodes_experiment:
                # 1st experiment: Increase number of temporal variables per signal

                # EVALUATOR SETUP
                ev = StructureEvaluator(True)
                ev.set_output_path(os.path.join(target_path, r"structure_eval_%s.csv" % strftime("%Y_%m_%d-%H_%M_%S", localtime())))
                metrics = ["add-edges", "del-edges", "num-add-edges", "num-del-edges", "shd", "add-edges-skel",
                           "del-edges-skel", "num-add-edges-skel", "num-del-edges-skel", "shd-skel", "kld",
                           "execution-time", "psi-execution-time", "so-execution-time"]
                for metric in metrics:
                    ev.add_metric(metric)
                eval_results = dict()
                discovery_algorithms = set()

                for number_of_signals in [2, 3, 4]:
                    print('number_of_signals: ' + str(number_of_signals) + '...')
                    L().log.info('number_of_signals: ' + str(number_of_signals) + '...')

                    if edges_per_object >= number_of_signals:
                        continue

                    numbers_of_temp_nodes = [1, 2, 3, 4, 5, 6, 7]
                    for number_of_temp_nodes in numbers_of_temp_nodes:
                        print('number_of_temp_nodes: ' + str(number_of_temp_nodes) + '...')
                        L().log.info('number_of_temp_nodes: ' + str(number_of_temp_nodes) + '...')

                        # NODE SETTINGS
                        sg.set_node_range(min_objects=number_of_signals, max_objects=number_of_signals,
                                          min_temp_nodes=number_of_temp_nodes, max_temp_nodes=number_of_temp_nodes,
                                          min_states=3, max_states=3)

                        eval_results.update({number_of_temp_nodes: dict()})

                        for iteration in range(0, iterations):
                            print('iteration: ' + str(iteration) + '...')
                            L().log.info('iteration: ' + str(iteration) + '...')

                            # SAMPLE DATA
                            models, specifications = sg.run_next_testcase()
                            in_seq = models[sg.reference_model.__name__].randomsample(sample_size, {})
                            sequences = sequences_to_intervals(
                                in_seq, models[sg.reference_model.__name__].Vdata, False)[0]

                            # additional information for evaluation
                            additional_infos = dict()
                            additional_infos[sg.reference_model.__name__] = {'execution_time': 0.0, 'data': None}

                            for score in ['BIC', 'AIC', 'Bdeu', 'K2']:
                                print('score: ' + str(score) + '...')
                                L().log.info('score: ' + str(score) + '...')

                                for temporal_threshold in np.arange(0.0, 2.5, 0.5):
                                    print('temporal_threshold: ' + str(temporal_threshold) + '...')
                                    L().log.info('temporal_threshold: ' + str(temporal_threshold) + '...')

                                    # STRUCTURE DISCOVERER SETUP
                                    sd.score = score
                                    sd.max_time_difference = temporal_threshold

                                    sd_name = 'SBTreeDiscoverer_' + score + '_TH_' + str(temporal_threshold)
                                    if sd_name not in eval_results.get(number_of_temp_nodes):  # initialise metrics_dict
                                        metrics_dict = dict((metric, []) for metric in metrics)
                                        eval_results.get(number_of_temp_nodes).update({sd_name: metrics_dict})
                                        discovery_algorithms.add(sd_name)
                                    model_name = sd_name + ' (' + str(iteration) + ')'

                                    # RUN ALGORITHM
                                    L().log.info('----------------------------------------------------------')
                                    print('Run approach ' + model_name + '.')
                                    L().log.info('Run approach ' + model_name + '.')
                                    ping = clock()
                                    nodes, edges = sd.discover_structure(sequences)
                                    L().log.info('Nodes: ' + str(nodes))
                                    L().log.info('Edges: ' + str(edges))
                                    execution_time = clock() - ping
                                    additional_infos[model_name] = {'execution_time': execution_time, 'data': sd.data,
                                                                    'psi_execution_time': sd.parent_set_identification_time,
                                                                    'so_execution_time': sd.structure_optimization_time}
                                    L().log.info('Execution time: ' + str(execution_time))
                                    L().log.info('----------------------------------------------------------')

                                    # CREATE TSCBN
                                    skel = GraphSkeleton()
                                    skel.V = nodes
                                    skel.E = edges
                                    skel.toporder()
                                    model = TSCBN("", skel, models[sg.reference_model.__name__].Vdata, unempty=True,
                                                  forbid_never=True, discrete_only=True)

                                    # EVALUATION
                                    eval_result = ev.evaluate(model_dict={model_name: model},
                                                              reference=models[sg.reference_model.__name__],
                                                              additional_infos=additional_infos)
                                    ev.print_eval_results(eval_results=eval_result, specs=specifications, to_csv=True)
                                    for metric, value in eval_result[model_name].items():
                                        eval_results[number_of_temp_nodes][sd_name][metric].append(value)
                    experiment_name = 'ParameterTmpNodesExperiment_EPO_' + str(edges_per_object) + '_Sig_' + \
                                      str(number_of_signals)
                    relevant_metrics = ["num-add-edges", "num-del-edges", "shd", "num-add-edges-skel",
                                        "num-del-edges-skel", "shd-skel", "kld", "execution-time", "psi-execution-time",
                                        "so-execution-time"]
                    write_pgfplots_data(experiment_name, eval_results, relevant_metrics, discovery_algorithms,
                                        numbers_of_temp_nodes, 'number_of_temp_nodes', target_path)

            if parameter_signals_experiment:
                # 2nd experiment: Increase number of signals

                if edges_per_object == 3:
                    continue  # TODO: remove this when choosing a maximal number of signals larger than 5

                # EVALUATOR SETUP
                ev = StructureEvaluator(True)
                ev.set_output_path(os.path.join(target_path, r"structure_eval_%s.csv" % strftime("%Y_%m_%d-%H_%M_%S", localtime())))
                metrics = ["add-edges", "del-edges", "num-add-edges", "num-del-edges", "shd", "add-edges-skel",
                           "del-edges-skel", "num-add-edges-skel", "num-del-edges-skel", "shd-skel", "kld",
                           "execution-time", "psi-execution-time", "so-execution-time"]
                for metric in metrics:
                    ev.add_metric(metric)
                eval_results = dict()
                discovery_algorithms = set()

                for number_of_temp_nodes in [3, 5]:
                    print('number_of_temp_nodes: ' + str(number_of_temp_nodes) + '...')
                    L().log.info('number_of_temp_nodes: ' + str(number_of_temp_nodes) + '...')

                    numbers_of_signals = [2, 3, 4, 5]
                    evaluated_numbers_of_signals = copy.deepcopy(numbers_of_signals)
                    for number_of_signals in numbers_of_signals:
                        print('number_of_signals: ' + str(number_of_signals) + '...')
                        L().log.info('number_of_signals: ' + str(number_of_signals) + '...')

                        if edges_per_object >= number_of_signals:
                            evaluated_numbers_of_signals.remove(number_of_signals)
                            continue

                        # NODE SETTINGS
                        sg.set_node_range(min_objects=number_of_signals, max_objects=number_of_signals,
                                          min_temp_nodes=number_of_temp_nodes, max_temp_nodes=number_of_temp_nodes,
                                          min_states=3, max_states=3)

                        eval_results.update({number_of_signals: dict()})

                        for iteration in range(iterations):
                            print('iteration: ' + str(iteration) + '...')
                            L().log.info('iteration: ' + str(iteration) + '...')

                            # SAMPLE DATA
                            models, specifications = sg.run_next_testcase()
                            in_seq = models[sg.reference_model.__name__].randomsample(1000, {})
                            sequences = sequences_to_intervals(
                                in_seq, models[sg.reference_model.__name__].Vdata, False)[0]

                            # additional information for evaluation
                            additional_infos = dict()
                            additional_infos[sg.reference_model.__name__] = {'execution_time': 0.0, 'data': None}

                            for score in ['BIC', 'AIC', 'Bdeu', 'K2']:
                                print('score: ' + str(score) + '...')
                                L().log.info('score: ' + str(score) + '...')

                                for temporal_threshold in np.arange(0.0, 2.5, 0.5):
                                    print('temporal_threshold: ' + str(temporal_threshold) + '...')
                                    L().log.info('temporal_threshold: ' + str(temporal_threshold) + '...')

                                    # STRUCTURE DISCOVERER SETUP
                                    sd.score = score
                                    sd.max_time_difference = temporal_threshold

                                    sd_name = 'SBTreeDiscoverer_' + score + '_TH_' + str(temporal_threshold)
                                    if sd_name not in eval_results.get(number_of_signals):  # initialise metrics_dict
                                        metrics_dict = dict((metric, []) for metric in metrics)
                                        eval_results.get(number_of_signals).update({sd_name: metrics_dict})
                                        discovery_algorithms.add(sd_name)
                                    model_name = sd_name + ' (' + str(iteration) + ')'

                                    # RUN ALGORITHM
                                    L().log.info('----------------------------------------------------------')
                                    print('Run approach ' + model_name + '.')
                                    L().log.info('Run approach ' + model_name + '.')
                                    ping = clock()
                                    nodes, edges = sd.discover_structure(sequences)
                                    L().log.info('Nodes: ' + str(nodes))
                                    L().log.info('Edges: ' + str(edges))
                                    execution_time = clock() - ping
                                    additional_infos[model_name] = {'execution_time': execution_time, 'data': sd.data,
                                                                    'psi_execution_time': sd.parent_set_identification_time,
                                                                    'so_execution_time': sd.structure_optimization_time}
                                    L().log.info('Execution time: ' + str(execution_time))
                                    L().log.info('----------------------------------------------------------')

                                    # CREATE TSCBN
                                    skel = GraphSkeleton()
                                    skel.V = nodes
                                    skel.E = edges
                                    skel.toporder()
                                    model = TSCBN("", skel, models[sg.reference_model.__name__].Vdata, unempty=True,
                                                  forbid_never=True, discrete_only=True)

                                    # EVALUATION
                                    eval_result = ev.evaluate(model_dict={model_name: model},
                                                              reference=models[sg.reference_model.__name__],
                                                              additional_infos=additional_infos)
                                    ev.print_eval_results(eval_results=eval_result, specs=specifications, to_csv=True)
                                    for metric, value in eval_result[model_name].items():
                                        eval_results[number_of_signals][sd_name][metric].append(value)
                    experiment_name = 'ParameterSignalsExperiment_EPO_' + str(edges_per_object) + '_TmpNodes_' + \
                                      str(number_of_temp_nodes)
                    relevant_metrics = ["num-add-edges", "num-del-edges", "shd", "num-add-edges-skel",
                                        "num-del-edges-skel", "shd-skel", "kld", "execution-time", "psi-execution-time",
                                        "so-execution-time"]
                    write_pgfplots_data(experiment_name, eval_results, relevant_metrics, discovery_algorithms,
                                        evaluated_numbers_of_signals, 'num_signals', target_path)

    # ----------------------------------------------------------------------------------------
    #      Experiments with all algorithms
    # ----------------------------------------------------------------------------------------
    # 1st experiment: increase number of temporal nodes
    if comparison_experiment_temp_nodes:
        # EDGE SETTINGS
        sg.set_connection_ranges(min_edges_per_object=2, max_edges_per_object=2,
                                 min_percent_inter=1.0, max_percent_inter=1.0)

        # EVALUATOR SETUP
        ev = StructureEvaluator(True)
        ev.set_output_path(os.path.join(target_path, r"structure_eval_%s.csv" % strftime("%Y_%m_%d-%H_%M_%S", localtime())))
        metrics = ["add-edges", "del-edges", "num-add-edges", "num-del-edges", "shd", "add-edges-skel",
                   "del-edges-skel", "num-add-edges-skel", "num-del-edges-skel", "shd-skel", "kld", "execution-time"]
        for metric in metrics:
            ev.add_metric(metric)
        eval_results = dict()

        for number_of_signals in [3, 4]:
            print('number_of_signals: ' + str(number_of_signals) + '...')
            L().log.info('number_of_signals: ' + str(number_of_signals) + '...')

            discovery_algorithms = set()

            numbers_of_temp_nodes = [2, 3, 4, 5, 6, 7, 8]
            for number_of_temp_nodes in numbers_of_temp_nodes:
                print('number_of_temp_nodes: ' + str(number_of_temp_nodes) + '...')
                L().log.info('number_of_temp_nodes: ' + str(number_of_temp_nodes) + '...')

                # NODE SETTINGS
                sg.set_node_range(min_objects=number_of_signals, max_objects=number_of_signals,
                                  min_temp_nodes=number_of_temp_nodes, max_temp_nodes=number_of_temp_nodes,
                                  min_states=3, max_states=3)

                eval_results.update({number_of_temp_nodes: dict()})
                metrics_dict = dict((metric, []) for metric in metrics)

                # ---------------------------------------------------
                #   RUN Structure Discovery several times
                # ---------------------------------------------------
                for iteration in range(iterations):
                    print('iteration: ' + str(iteration) + '...')
                    L().log.info('iteration: ' + str(iteration) + '...')

                    # SAMPLE DATA
                    models, specifications = sg.run_next_testcase()
                    in_seq = models[sg.reference_model.__name__].randomsample(sample_size, {})
                    sequences = sequences_to_intervals(in_seq, models[sg.reference_model.__name__].Vdata, False)[0]

                    additional_infos = dict()
                    additional_infos[sg.reference_model.__name__] = {'execution_time': 0.0, 'data': None}

                    # ---------------------------------------------------
                    #   Discovery Algorithm
                    # ---------------------------------------------------
                    for sd_name, sd in get_structure_discovery_algorithms():

                        # LIMITATIONS DUE TO RUNTIME PROBLEMS
                        # TODO: run all algorithms for all networks on a better hardware
                        if str.startswith(sd_name, 'Astar') and number_of_signals * number_of_temp_nodes > 16:
                            print('Network too large for A* algorithm.')
                            continue
                        if str.startswith(sd_name, 'PC') and number_of_signals * number_of_temp_nodes > 24:
                            print('Network too large for PC algorithm.')
                            continue

                        discovery_algorithms.add(sd_name)
                        if sd_name not in eval_results.get(number_of_temp_nodes):
                            eval_results.get(number_of_temp_nodes).update({sd_name: copy.deepcopy(metrics_dict)})

                        model_name = sd_name + ' (' + str(iteration) + ')'
                        L().log.info('----------------------------------------------------------')
                        print('Run approach ' + model_name + '.')
                        L().log.info('Run approach ' + model_name + '.')

                        ping = clock()
                        nodes, edges = sd.discover_structure(sequences)
                        L().log.info('Nodes: ' + str(nodes))
                        L().log.info('Edges: ' + str(edges))
                        execution_time = clock() - ping
                        additional_infos[model_name] = {'execution_time': execution_time, 'data': sd.data}
                        L().log.info('Execution time: ' + str(execution_time))
                        L().log.info('----------------------------------------------------------')

                        # create TSCBN
                        skel = GraphSkeleton()
                        skel.V = nodes
                        skel.E = edges
                        skel.toporder()
                        model = TSCBN("", skel, models[sg.reference_model.__name__].Vdata, unempty=True,
                                      forbid_never=True, discrete_only=True)

                        # ----------------------------------------------------------------------------------------
                        #       EVALUATION
                        # ----------------------------------------------------------------------------------------
                        eval_result = ev.evaluate(model_dict={model_name: model},
                                                  reference=models[sg.reference_model.__name__],
                                                  additional_infos=additional_infos)
                        ev.print_eval_results(eval_results=eval_result, specs=specifications, to_csv=True)
                        for metric, value in eval_result[model_name].items():
                            eval_results[number_of_temp_nodes][sd_name][metric].append(value)
            experiment_name = 'TempNodesExperiment_Sig_' + str(number_of_signals)
            relevant_metrics = ["num-add-edges", "num-del-edges", "shd", "num-add-edges-skel", "num-del-edges-skel",
                                "shd-skel", "kld", "execution-time"]
            write_pgfplots_data(experiment_name, eval_results, relevant_metrics, discovery_algorithms,
                                numbers_of_temp_nodes, 'number_of_temp_nodes', target_path)

    # 2nd experiment: increase number of signals
    if comparison_experiment_signals:
        # EDGE SETTINGS
        sg.set_connection_ranges(min_edges_per_object=2, max_edges_per_object=2,
                                 min_percent_inter=1.0, max_percent_inter=1.0)

        # EVALUATOR SETUP
        ev = StructureEvaluator(True)
        ev.set_output_path(os.path.join(target_path, r"structure_eval_%s.csv" % strftime("%Y_%m_%d-%H_%M_%S", localtime())))
        metrics = ["add-edges", "del-edges", "num-add-edges", "num-del-edges", "shd", "add-edges-skel",
                   "del-edges-skel", "num-add-edges-skel", "num-del-edges-skel", "shd-skel", "kld", "execution-time",
                   "psi-execution-time", "so-execution-time"]
        for metric in metrics:
            ev.add_metric(metric)
        eval_results = dict()

        for number_of_temp_nodes in [3]:  # TODO: run with larger numbers on better hardware
            print('number_of_temp_nodes: ' + str(number_of_temp_nodes) + '...')
            L().log.info('number_of_temp_nodes: ' + str(number_of_temp_nodes) + '...')

            discovery_algorithms = set()

            numbers_of_signals = [3, 4, 5, 6, 7, 8]
            for number_of_signals in numbers_of_signals:
                print('number_of_signals: ' + str(number_of_signals) + '...')
                L().log.info('number_of_signals: ' + str(number_of_signals) + '...')

                # NODE SETTINGS
                sg.set_node_range(min_objects=number_of_signals, max_objects=number_of_signals,
                                  min_temp_nodes=number_of_temp_nodes, max_temp_nodes=number_of_temp_nodes,
                                  min_states=3, max_states=3)

                eval_results.update({number_of_signals: dict()})
                metrics_dict = dict((metric, []) for metric in metrics)

                # ---------------------------------------------------
                #   RUN Structure Discovery several times
                # ---------------------------------------------------
                for iteration in range(iterations):
                    print('iteration: ' + str(iteration) + '...')
                    L().log.info('iteration: ' + str(iteration) + '...')

                    # SAMPLE DATA
                    models, specifications = sg.run_next_testcase()
                    in_seq = models[sg.reference_model.__name__].randomsample(sample_size, {})
                    sequences = sequences_to_intervals(in_seq, models[sg.reference_model.__name__].Vdata, False)[0]

                    additional_infos = dict()
                    additional_infos[sg.reference_model.__name__] = {'execution_time': 0.0, 'data': None,
                                                                     'psi_execution_time': 0.0,
                                                                     'so_execution_time': 0.0}

                    # ---------------------------------------------------
                    #   Discovery Algorithm
                    # ---------------------------------------------------
                    for sd_name, sd in get_structure_discovery_algorithms():

                        # LIMITATIONS DUE TO RUNTIME PROBLEMS
                        # TODO: run all algorithms for all networks on a better hardware
                        if str.startswith(sd_name, 'Astar') and number_of_signals * number_of_temp_nodes > 16:
                            print('Network too large for A* algorithm.')
                            continue
                        if str.startswith(sd_name, 'PC') and number_of_signals * number_of_temp_nodes > 24:
                            print('Network too large for PC algorithm.')
                            continue
                        if str.startswith(sd_name, 'sbPTM') and number_of_signals * number_of_temp_nodes > 30:
                            print('Network too large for PTM algorithm.')
                            continue
                        if str.startswith(sd_name, 'cbPTM') and number_of_signals * number_of_temp_nodes > 30:
                            print('Network too large for PTM algorithm.')
                            continue

                        discovery_algorithms.add(sd_name)
                        if sd_name not in eval_results.get(number_of_signals):
                            eval_results.get(number_of_signals).update({sd_name: copy.deepcopy(metrics_dict)})

                        model_name = sd_name + ' (' + str(iteration) + ')'
                        L().log.info('----------------------------------------------------------')
                        print('Run approach ' + model_name + '.')
                        L().log.info('Run approach ' + model_name + '.')

                        ping = clock()
                        nodes, edges = sd.discover_structure(sequences)
                        L().log.info('Nodes: ' + str(nodes))
                        L().log.info('Edges: ' + str(edges))
                        execution_time = clock() - ping
                        additional_infos[model_name] = {'execution_time': execution_time, 'data': sd.data,
                                                        'psi_execution_time': 0.0, 'so_execution_time': 0.0}
                        if sd.parent_set_identification_time and sd.structure_optimization_time:
                            additional_infos[model_name].update(
                                {'psi_execution_time': sd.parent_set_identification_time,
                                 'so_execution_time': sd.structure_optimization_time})
                        L().log.info('Execution time: ' + str(execution_time))
                        L().log.info('----------------------------------------------------------')

                        # create TSCBN
                        skel = GraphSkeleton()
                        skel.V = nodes
                        skel.E = edges
                        skel.toporder()
                        model = TSCBN("", skel, models[sg.reference_model.__name__].Vdata, unempty=True,
                                      forbid_never=True, discrete_only=True)

                        # ----------------------------------------------------------------------------------------
                        #       EVALUATION
                        # ----------------------------------------------------------------------------------------
                        eval_result = ev.evaluate(model_dict={model_name: model},
                                                  reference=models[sg.reference_model.__name__],
                                                  additional_infos=additional_infos)
                        ev.print_eval_results(eval_results=eval_result, specs=specifications, to_csv=True)
                        for metric, value in eval_result[model_name].items():
                            eval_results[number_of_signals][sd_name][metric].append(value)
            experiment_name = 'SignalExperiment_TmpNodes_' + str(number_of_temp_nodes)
            relevant_metrics = ["num-add-edges", "num-del-edges", "shd", "num-add-edges-skel", "num-del-edges-skel",
                                "shd-skel", "kld", "execution-time", "psi-execution-time", "so-execution-time"]
            write_pgfplots_data(experiment_name, eval_results, relevant_metrics, discovery_algorithms,
                                numbers_of_signals, 'number_of_signals', target_path)

    # 3rd experiment: different values for the state change probability
    if comparison_experiment_scp:
        # EDGE SETTINGS
        sg.set_connection_ranges(min_edges_per_object=2, max_edges_per_object=2,
                                 min_percent_inter=1.0, max_percent_inter=1.0)

        # EVALUATOR SETUP
        ev = StructureEvaluator(True)
        ev.set_output_path(os.path.join(target_path, r"structure_eval_%s.csv" % strftime("%Y_%m_%d-%H_%M_%S", localtime())))
        metrics = ["add-edges", "del-edges", "num-add-edges", "num-del-edges", "shd", "add-edges-skel",
                   "del-edges-skel", "num-add-edges-skel", "num-del-edges-skel", "shd-skel", "kld",
                   "execution-time"]
        for metric in metrics:
            ev.add_metric(metric)
        eval_results = dict()

        for number_of_temp_nodes in [3, 4]:
            print('number_of_temp_nodes: ' + str(number_of_temp_nodes) + '...')
            L().log.info('number_of_temp_nodes: ' + str(number_of_temp_nodes) + '...')

            # NODE SETTINGS
            sg.set_node_range(min_objects=3, max_objects=3,
                              min_temp_nodes=number_of_temp_nodes, max_temp_nodes=number_of_temp_nodes,
                              min_states=2, max_states=4)
            sg.set_connection_ranges(min_edges_per_object=2, max_edges_per_object=3, min_percent_inter=0.5,
                                     max_percent_inter=1.0)

            discovery_algorithms = set()

            state_change_probabilities = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
            for state_change_probability in state_change_probabilities:
                print('state_change_probability: ' + str(state_change_probability) + '...')
                L().log.info('state_change_probability: ' + str(state_change_probability) + '...')

                sg.set_state_change_probability(min_probability=state_change_probability,
                                                max_probability=state_change_probability)

                eval_results.update({state_change_probability: dict()})
                metrics_dict = dict((metric, []) for metric in metrics)

                # ---------------------------------------------------
                #   RUN Structure Discovery several times
                # ---------------------------------------------------
                for iteration in range(iterations):
                    print('iteration: ' + str(iteration) + '...')
                    L().log.info('iteration: ' + str(iteration) + '...')

                    # SAMPLE DATA
                    models, specifications = sg.run_next_testcase()
                    in_seq = models[sg.reference_model.__name__].randomsample(sample_size, {})
                    sequences = sequences_to_intervals(in_seq, models[sg.reference_model.__name__].Vdata, False)[0]

                    additional_infos = dict()
                    additional_infos[sg.reference_model.__name__] = {'execution_time': 0.0, 'data': None}

                    # ---------------------------------------------------
                    #   Discovery Algorithm
                    # ---------------------------------------------------
                    for sd_name, sd in get_structure_discovery_algorithms():

                        # LIMITATIONS DUE TO RUNTIME PROBLEMS
                        # TODO: run all algorithms for all networks on a better hardware
                        if sd_name.startswith('Astar') and 3 * number_of_temp_nodes > 16:
                            print('Network too large for A* algorithm.')
                            continue

                        discovery_algorithms.add(sd_name)
                        if sd_name not in eval_results[state_change_probability]:
                            eval_results[state_change_probability][sd_name] = copy.deepcopy(metrics_dict)

                        model_name = sd_name + ' (' + str(iteration) + ')'
                        L().log.info('----------------------------------------------------------')
                        print('Run approach ' + model_name + '.')
                        L().log.info('Run approach ' + model_name + '.')

                        ping = clock()
                        nodes, edges = sd.discover_structure(sequences)
                        execution_time = clock() - ping
                        L().log.info('Nodes: ' + str(nodes))
                        L().log.info('Edges: ' + str(edges))
                        additional_infos[model_name] = {'execution_time': execution_time, 'data': sd.data}
                        L().log.info('Execution time: ' + str(execution_time))
                        L().log.info('----------------------------------------------------------')

                        # create TSCBN
                        skel = GraphSkeleton()
                        skel.V = nodes
                        skel.E = edges
                        skel.toporder()
                        model = TSCBN("", skel, models[sg.reference_model.__name__].Vdata, unempty=True,
                                      forbid_never=True, discrete_only=True)

                        # ----------------------------------------------------------------------------------------
                        #       EVALUATION
                        # ----------------------------------------------------------------------------------------
                        eval_result = ev.evaluate(model_dict={model_name: model},
                                                  reference=models[sg.reference_model.__name__],
                                                  additional_infos=additional_infos)
                        ev.print_eval_results(eval_results=eval_result, specs=specifications, to_csv=True)
                        for metric, value in eval_result[model_name].items():
                            eval_results[state_change_probability][sd_name][metric].append(value)
            experiment_name = 'SCP_Experiment_Sig_3_TmpNodes_' + str(number_of_temp_nodes)
            relevant_metrics = ["num-add-edges", "num-del-edges", "shd", "num-add-edges-skel", "num-del-edges-skel",
                                "shd-skel", "kld", "execution-time"]
            write_pgfplots_data(experiment_name, eval_results, relevant_metrics, discovery_algorithms,
                                state_change_probabilities, 'state_change_probability', target_path)
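
# A hedged sketch (hypothetical numbers) of the aggregation structure built above:
# eval_results maps state_change_probability -> algorithm -> metric -> per-iteration
# values, which write_pgfplots_data can then reduce (e.g. average) per x-axis value.
eval_results_demo = {
    0.5: {'PC': {'shd': [3, 4, 2]}},
    1.0: {'PC': {'shd': [0, 1, 0]}},
}
for scp, per_algorithm in sorted(eval_results_demo.items()):
    for algorithm, per_metric in per_algorithm.items():
        mean_shd = sum(per_metric['shd']) / len(per_metric['shd'])
        print('scp=%.1f %s mean shd=%.2f' % (scp, algorithm, mean_shd))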
    def _likelihood(self, tscbn, seen, tv):
        '''
        Computes the likelihood of the whole observed sequence of temporal
        variable tv under the given TSCBN: the product of the conditional
        probabilities of every observed symbol. Also returns idx, a list of
        [symbol_idx, symbol_nr] pairs describing the observed symbol blocks.
        '''
        L().log.debug("\n\n----------------------------------------------------------------------")
        idx = []  # list of pairs: [symbol_idx, symbol_nr]
        p_tot = 1.0
        i = -1
        symbol_idx = 0
        symbol_nr = -2  # 0 means 1, 1 means 2, ...
        prev_symbol = seen[0][1]
        first = True
        L().log.debug("\n\nSeen: %s" % (str(seen)))
        for s in seen:
            symbol_nr += 1
            i += 1
            cond = str(s[0])
            symbol = s[1]
            n = tv + "_" + str(i)
            L().log.debug("Symbol nr: %s, Symbol index: %s" % (str(symbol_nr), str(symbol_idx)))
            L().log.debug("\n\nprev_symbol: %s, i: %s, cond: %s, symbol %s n: %s" % (str(prev_symbol), str(i), str(cond), str(symbol), str(n)))
            if s[0] is not None:
                p_tot *= tscbn[n]["cprob"][str(cond)][tscbn[n]["vals"].index(symbol)]
                L().log.debug("\n\np_cond: %s" %(str(tscbn[n]["cprob"][str(cond)][tscbn[n]["vals"].index(symbol)])))
            else:
                p_tot *= tscbn[n]["cprob"][tscbn[n]["vals"].index(symbol)]
                L().log.debug("\n\np_: %s" % (str(tscbn[n]["cprob"][tscbn[n]["vals"].index(symbol)])))

            if not first and prev_symbol != symbol:
                idx.append([symbol_idx, symbol_nr])
                symbol_idx += 1
                symbol_nr = -1
                L().log.debug("idx: %s" % str(idx))

            prev_symbol = symbol
            first = False
        symbol_nr += 1
        idx.append([symbol_idx, symbol_nr])
        L().log.debug("idx: %s" % str(idx))
        '''
        for i in range(0, len(pars.keys())):

            n = tv + "_" + str(i)
            if n not in pars: break
            symbol = set_values[n]

            if pars[n]["parents"] is None:
                cond = []
            else:
                cond = [set_values[k] for k in pars[n]["parents"]]

            # get prob given cond
            if not cond:
                p = pars[n]["cprob"][pars[n]["tbn_vals"].index(symbol)]
            else:
                p = pars[n]["cprob"][str(cond)][pars[n]["tbn_vals"].index(symbol)]
            p_tot *= p

        '''
        L().log.debug("\n\n----------------------------------------------------------------------")

        return p_tot, idx
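
# A minimal usage sketch of the likelihood computation above with a toy Vdata-style
# dict (node names and tables are made up): the sequence likelihood is the product
# of P(symbol | parent condition) over all nodes of the temporal variable.
toy_vdata = {
    'TV_0': {'vals': ['a', 'b'], 'cprob': [0.7, 0.3]},             # root: plain distribution
    'TV_1': {'vals': ['a', 'b'], 'cprob': {"['a']": [0.1, 0.9]}},  # conditioned on parent = 'a'
}
seen = [(None, 'a'), (['a'], 'b')]  # per node: (parent condition, observed symbol)
p_tot = 1.0
for i, (cond, symbol) in enumerate(seen):
    node = 'TV_%d' % i
    if cond is None:
        p_tot *= toy_vdata[node]['cprob'][toy_vdata[node]['vals'].index(symbol)]
    else:
        p_tot *= toy_vdata[node]['cprob'][str(cond)][toy_vdata[node]['vals'].index(symbol)]
print(p_tot)  # 0.7 * 0.9 = 0.63 (up to float rounding)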
Example #13
0
def run_vary_structure(target_path):
    # ----------------------------------------------
    # GRID
    # ----------------------------------------------
    object_nr = [5, 10, 20, 30, 40]  # explicit list of object counts
    nodes_per_tv = [2, 50, 2]  # (from, to, steps)
    states_per_tv = [2, 6, 2]  # (from, to, steps)

    edges_per_tv = [3, 3, 2]
    percentage_inter = [0.8, 0.8, 0.2]
    per_object_gap = [0.5, 0.5, 0.01]  # range is still within selected and selected + 0.5

    t_variance_tscbn = [0.1, 0.1, 0.02]
    dbn_tolerance = [0.02, 0.02, 0.02]

    state_change_prob = [1.0, 1.0, 0.01]
    append_csv = False
    eval_models = [CTBNStructureModel, DBNStructureModel, TSCBNStructureModel]

    id_time = datetime.datetime.now().strftime("%I_%M%p_%d_%B_%Y")
    out_path = os.path.join(target_path, r"model_evaluation_%s.csv" % id_time)
    print("store to %s" % out_path)

    run = 1
    expected_runs = 1
    expected_runs *= len(object_nr)
    expected_runs *= len(
        list(range(nodes_per_tv[0], nodes_per_tv[1] + 1, nodes_per_tv[2])))
    expected_runs *= len(
        list(range(states_per_tv[0], states_per_tv[1] + 1, states_per_tv[2])))
    expected_runs *= len(
        list(range(edges_per_tv[0], edges_per_tv[1] + 1, edges_per_tv[2])))
    expected_runs *= len(
        list(
            np.arange(percentage_inter[0], percentage_inter[1] + 0.000001,
                      percentage_inter[2])))
    expected_runs *= len(
        list(
            np.arange(per_object_gap[0], per_object_gap[1] + 0.00000001,
                      per_object_gap[2])))
    expected_runs *= len(
        list(
            np.arange(t_variance_tscbn[0], t_variance_tscbn[1] + 0.00000001,
                      t_variance_tscbn[2])))
    expected_runs *= len(
        list(
            np.arange(dbn_tolerance[0], dbn_tolerance[1] + 0.00000001,
                      dbn_tolerance[2])))
    expected_runs *= len(
        list(
            np.arange(state_change_prob[0], state_change_prob[1] + 0.00000001,
                      state_change_prob[2])))

    for n_p_t in range(nodes_per_tv[0], nodes_per_tv[1] + 1, nodes_per_tv[2]):
        for s_p_t in range(states_per_tv[0], states_per_tv[1] + 1,
                           states_per_tv[2]):
            for e_p_t in range(edges_per_tv[0], edges_per_tv[1] + 1,
                               edges_per_tv[2]):
                if n_p_t < e_p_t: continue
                for per_iter in np.arange(percentage_inter[0],
                                          percentage_inter[1] + 0.000001,
                                          percentage_inter[2]):
                    for p_o_gap in np.arange(per_object_gap[0],
                                             per_object_gap[1] + 0.00000001,
                                             per_object_gap[2]):
                        for tscbn_var in np.arange(
                                t_variance_tscbn[0],
                                t_variance_tscbn[1] + 0.00000001,
                                t_variance_tscbn[2]):
                            for dbn_tol in np.arange(
                                    dbn_tolerance[0],
                                    dbn_tolerance[1] + 0.00000001,
                                    dbn_tolerance[2]):
                                for sc_prob in np.arange(
                                        state_change_prob[0],
                                        state_change_prob[1] + 0.00000001,
                                        state_change_prob[2]):
                                    for o_nr in object_nr:
                                        print(
                                            "\n----------------------------------\nobj_nr: %s\nnodes_p_t: %s\nstates_pt: %s\nedges_pt: %s\nper_iter: %s\np_o_gap: %s\ntscbn_var: %s\ndbn_tol: %s\nsc_prob: %s"
                                            % (o_nr, n_p_t, s_p_t, e_p_t,
                                               per_iter, p_o_gap, tscbn_var,
                                               dbn_tol, sc_prob))
                                        print("Remaining:  %s" %
                                              (str(expected_runs - run)))
                                        run += 1

                                        sg = StructureGenerator(
                                            test_type=TestStructureEnum.SPECIFICATION)
                                        ev = StructureEvaluator(append_csv)
                                        append_csv = True

                                        # Evaluation Parameters
                                        ev.add_setting("object_nr", o_nr)
                                        ev.add_setting("nodes_per_tv", n_p_t)
                                        ev.add_setting("states_per_tv", s_p_t)
                                        ev.add_setting("edges_per_tv", e_p_t)
                                        ev.add_setting("percentage_inter",
                                                       per_iter)
                                        ev.add_setting("per_tv_gap", p_o_gap)
                                        ev.add_setting("tscbn_variance",
                                                       tscbn_var)
                                        ev.add_setting("dbn_tolerance",
                                                       dbn_tol)
                                        ev.add_setting("sc_probability",
                                                       sc_prob)

                                        ev.set_output_path(out_path)
                                        ev.add_metric("num-edges")
                                        ev.add_metric("num-nodes")
                                        ev.add_metric("num-states")
                                        ev.add_metric("num-cpds")

                                        # ----------------------------------------------
                                        # Settings
                                        # ----------------------------------------------
                                        # Models
                                        sg.add_base_structure_models(
                                            eval_models
                                        )  # DBNStructureModel  TNBNStructureModel, TSCBNStructureModel
                                        if DBNStructureModel in sg.get_generator_models():
                                            dbn_model = [
                                                f for f in sg._generator_models
                                                if isinstance(f, DBNStructureModel)
                                            ][0]
                                            dbn_model.EXPLICIT_DISABLING = True  # set setting for DBN

                                        # Structure Generation Settings
                                        # NODE SETTINGS
                                        sg.set_node_range(
                                            min_objects=o_nr,
                                            max_objects=o_nr,  # number of temporal variables
                                            min_temp_nodes=n_p_t,
                                            max_temp_nodes=n_p_t,  # number of nodes per temporal variable
                                            min_states=s_p_t,
                                            max_states=s_p_t)  # number of states per node
                                        # EDGE SETTINGS
                                        sg.set_connection_ranges(
                                            min_edges_per_object=e_p_t,
                                            max_edges_per_object=e_p_t,
                                            # number of temporal variables that are connected to each other - each has x edges between objects
                                            min_percent_inter=per_iter,
                                            max_percent_inter=per_iter)  # random range - percentage of cross-connections per TV relative to the node count
                                        # TIME SETTINGS
                                        sg.set_temporal_range(
                                            min_per_object_gap=p_o_gap,
                                            max_per_object_gap=p_o_gap + 0.5)
                                        sg.set_temporal_variance(tscbn_var)
                                        sg.set_dbn_tolerance(dbn_tol)

                                        # PROBABILITY SETTINGS
                                        sg.set_state_change_probability(
                                            min_probability=sc_prob,
                                            max_probability=sc_prob
                                        )  # probability of state change - at 1.0  parameter estimation should be exact

                                        # Generator Execution settings
                                        test_size = 1

                                        # Visualization parameters
                                        sg.set_model_visualization(
                                            plot=True, console_out=False)

                                        # ----------------------------------------------
                                        # Run tests
                                        # ----------------------------------------------
                                        for i in range(test_size):
                                            #print("\n\n------------------ Running Test %s ------------------" % (str(i + 1)))
                                            # Return test case
                                            try:
                                                models, specifications = sg.run_next_testcase()
                                            except Exception:
                                                print("Invalid sample")
                                                if not ev._append_csv:
                                                    eval_result = ev.write_header(True)
                                                continue
                                            # evaluate result
                                            eval_result = ev.evaluate(
                                                models,
                                                specifications=specifications)

                                            # output
                                            ev.print_eval_results(
                                                eval_results=eval_result,
                                                specs=specifications,
                                                to_csv=True)
    L().log.info("-------------------- DONE -------------------------")
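
# Sketch of the grid-size trick used above: np.arange excludes its stop value, so the
# code adds a tiny epsilon to keep the intended upper bound inside the grid.
import numpy as np
values = np.arange(0.4, 0.8 + 1e-8, 0.2)  # includes 0.8; np.arange(0.4, 0.8, 0.2) would not
print(list(values))                        # [0.4, 0.6, 0.8] up to float rounding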
Example #14
0
    def estimate_skeleton(self, data, nodes):
        def create_max_skeleton(nodes):
            skeleton = nx.Graph()
            skeleton.add_nodes_from(nodes)  # create nodes
            edges = set()
            for node in nodes:
                for neigh in nodes:
                    if node != neigh:
                        edges.add((node, neigh))
            skeleton.add_edges_from(edges)  # add edges
            return skeleton

        max_skeleton = create_max_skeleton(nodes)

        if isinstance(data, ADTree):
            cb_estimator = GSquareEstimator(adtree=data)
        else:
            cb_estimator = BaseEstimator(data=data,
                                         complete_samples_only=False)
        # procedure similar to PC algorithm
        skeleton = max_skeleton.copy()
        condition_set_size = 0
        sep_set = {}
        L().log.debug('---------------------------------------------------')
        L().log.debug('---- Conditional Independence Tests ---------------')
        L().log.debug('---------------------------------------------------')
        while True:
            cont = False
            remove_edges = []
            for (source, target) in permutations(nodes, 2):
                neighbors = list(skeleton.neighbors(source))
                if target not in neighbors:
                    continue
                else:
                    neighbors.remove(target)
                if len(neighbors) >= condition_set_size:
                    L().log.debug('testing ' + source + ' --> ' + target)
                    L().log.debug('neighbors of ' + source + ' are ' +
                                  str(neighbors))
                    for condition_set in combinations(neighbors,
                                                      condition_set_size):
                        L().log.debug('independence test of ' + source +
                                      ' and ' + target + ' with subset ' +
                                      str(condition_set))
                        _, p_val, _ = cb_estimator.test_conditional_independence(
                            source, target, list(condition_set))
                        if isnan(p_val):  # pgmpy CI test returns NaN instead of 1
                            p_val = 1
                        L().log.debug('p_val = ' + str(p_val))
                        if p_val > self.alpha:
                            if skeleton.has_edge(source, target):
                                L().log.debug('remove edge ' +
                                              str((source, target)))
                                remove_edges.append((source, target))
                            key = tuple(sorted((source, target)))
                            if key in sep_set:
                                sep_set[key] |= set(condition_set)
                            else:
                                sep_set[key] = set(condition_set)
                            break
                    cont = True
            condition_set_size += 1
            skeleton.remove_edges_from(remove_edges)
            if cont is False:
                break
            if condition_set_size > self.max_reach:
                break
        return skeleton, sep_set
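
# A self-contained sketch of the skeleton phase above with a stubbed CI test (the
# real code uses GSquareEstimator or pgmpy's BaseEstimator): an edge is dropped as
# soon as some conditioning set renders its endpoints independent.
import networkx as nx
from itertools import combinations, permutations

def stub_ci_test(source, target, condition_set):  # hypothetical oracle: A _|_ C given B
    return 1.0 if {source, target} == {'A', 'C'} and 'B' in condition_set else 0.0

skeleton = nx.complete_graph(['A', 'B', 'C'])
for (source, target) in permutations(skeleton.nodes, 2):
    neighbors = [n for n in skeleton.neighbors(source) if n != target]
    for condition_set in combinations(neighbors, 1):
        if stub_ci_test(source, target, condition_set) > 0.05 and skeleton.has_edge(source, target):
            skeleton.remove_edge(source, target)
print(sorted(skeleton.edges))  # [('A', 'B'), ('B', 'C')]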
    def _estimate_tscbn(self,
                        sequences,
                        debug,
                        leaves,
                        target='Aktiv_Funktion_Fahrerassistenzsystem_LDM'):
        cnt_s = 0
        tot_s = len(sequences)

        for sequence in sequences:
            if cnt_s % 50 == 0:
                L().log.info("Processing %s / %s" % (str(cnt_s), str(tot_s)))
            cnt_s += 1
            cur_seq = {}
            # collect node states and find the node with the latest state change
            largest = None
            max_val = 0
            for tv in sequence:
                i = 0
                for lst in sequence[tv]:
                    [state, start, end] = lst
                    node_name = tv + "_" + str(i)
                    if start > max_val and tv != target:
                        max_val = start
                        largest = node_name
                    i += 1
                    cur_seq[node_name] = state
            # the latest element in the sequence must get an underscore prefix "_..."
            if largest is None: largest = target + "_1"
            cur_seq["_" + largest] = cur_seq[largest]
            del cur_seq[largest]

            # count all up in tree
            for node in cur_seq:
                if self.tbn.Vdata[node]["parents"] is not None:
                    o = list(set(list(self.tbn.Vdata[node]["parents"])))
                    o.sort()
                    self.tbn.Vdata[node]["parents"] = o

                state = cur_seq[node]

                if self.tbn.Vdata[node]["parents"] is None:
                    idx = self.tbn.Vdata[node]["vals"].index(state)
                    if not "cprob" in self.tbn.Vdata[node]:
                        self.tbn.Vdata[node]["vals"] += ["Never"]
                        self.tbn.Vdata[node]["cprob"] = np.zeros(
                            len(self.tbn.Vdata[node]["vals"]))
                    self.tbn.Vdata[node]["cprob"][idx] += 1.0

                else:
                    # get condition
                    cond = []
                    for p in self.tbn.Vdata[node]["parents"]:
                        if p not in cur_seq:
                            cond += ["Never"]  # it did not occur
                        else:
                            cond += [cur_seq[p]]
                    idx = self.tbn.Vdata[node]["vals"].index(state)

                    if not "cprob" in self.tbn.Vdata[node]:
                        self.tbn.Vdata[node]["vals"] += ["Never"]
                        self.tbn.Vdata[node]["cprob"] = dict()
                    if str(cond) not in self.tbn.Vdata[node]["cprob"]:
                        self.tbn.Vdata[node]["cprob"][str(cond)] = np.zeros(
                            len(self.tbn.Vdata[node]["vals"]))

                    self.tbn.Vdata[node]["cprob"][str(cond)][idx] += 1

        # drop not existing cpds:
        for node in self.tbn.Vdata:

            if self.tbn.Vdata[node]["parents"] is not None and not node.startswith("dL_"):
                keep = dict()
                for cond in self.tbn.Vdata[node]["cprob"]:
                    if not np.all(self.tbn.Vdata[node]["cprob"][cond] == 0):
                        keep[cond] = self.tbn.Vdata[node]["cprob"][cond]
                self.tbn.Vdata[node]["cprob"] = keep

        # Plot all distributions
        if self.tbn.show_plot_generated:
            self._visual.plot_histograms_from_bn(self.tbn, self.tbn)
        self._log_cpds_emph_given(leaves)
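
# Toy sketch of the counting scheme above: conditional probabilities are estimated by
# counting state occurrences per parent condition and normalizing the count vectors.
import numpy as np
vals = ['on', 'off', 'Never']
observations = [(['on'], 'off'), (['on'], 'off'), (['on'], 'on')]  # (parent condition, state)
cprob = {}
for cond, state in observations:
    cprob.setdefault(str(cond), np.zeros(len(vals)))[vals.index(state)] += 1
for cond in cprob:
    cprob[cond] = cprob[cond] / cprob[cond].sum()  # normalize counts to probabilities
print(cprob)  # {"['on']": array([0.33..., 0.66..., 0.])}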
Example #16
0
    def estimate_cpdag(self, skel_graph, sep_set):
        dag = skel_graph.to_directed()
        nodes = skel_graph.nodes()
        for (source, target) in combinations(nodes, 2):
            source_neighbors = set(dag.successors(source))
            if target in source_neighbors:
                continue
            target_neighbors = set(dag.successors(target))
            if source in target_neighbors:
                continue
            common_neighbors = source_neighbors & target_neighbors
            key = tuple(sorted((source, target)))
            for k in common_neighbors:
                if k not in sep_set[key]:
                    if dag.has_edge(k, source):
                        dag.remove_edge(k, source)
                        L().log.debug('S: remove edge (' + k + ', ' + source + ')')
                    if dag.has_edge(k, target):
                        dag.remove_edge(k, target)
                        L().log.debug('S: remove edge (' + k + ', ' + target + ')')

        def _has_both_edges(dag, i, j):
            return dag.has_edge(i, j) and dag.has_edge(j, i)

        def _has_any_edge(dag, i, j):
            return dag.has_edge(i, j) or dag.has_edge(j, i)

        # For all the combination of nodes source and target, apply the following
        # rules.
        for (source, target) in combinations(nodes, 2):
            # Rule 1: Orient source-target into source->target whenever there is an arrow k->source
            # such that k and target are nonadjacent.
            #
            # Check if source-target.
            if _has_both_edges(dag, source, target):
                # Look at all the predecessors of source.
                for k in dag.predecessors(source):
                    # Skip if there is an arrow source->k.
                    if dag.has_edge(source, k):
                        continue
                    # Skip if k and target are adjacent.
                    if _has_any_edge(dag, k, target):
                        continue
                    # Make source-target into source->target
                    dag.remove_edge(target, source)
                    L().log.debug('R1: remove edge (' + target + ', ' + source + ')')
                    break

            # Rule 2: Orient source-target into source->target whenever there is a chain
            # source->k->target.
            #
            # Check if source-target.
            if _has_both_edges(dag, source, target):
                # Find nodes k where source->k.
                succs_i = set()
                for k in dag.successors(source):
                    if not dag.has_edge(k, source):
                        succs_i.add(k)
                # Find nodes k where k->target.
                preds_j = set()
                for k in dag.predecessors(target):
                    if not dag.has_edge(target, k):
                        preds_j.add(k)
                # Check if there is any node k where source->k->target.
                if len(succs_i & preds_j) > 0:
                    # Make source-target into source->target
                    dag.remove_edge(target, source)
                    L().log.debug('R2: remove edge (' + target + ', ' + source + ')')

            # Rule 3: Orient source-target into source->target whenever there are two chains
            # source-k->target and source-l->target such that k and l are nonadjacent.
            #
            # Check if source-target.
            if _has_both_edges(dag, source, target):
                # Find nodes k where source-k (both arcs present).
                source_neighbors = set()
                for k in dag.successors(source):
                    if dag.has_edge(k, source):
                        source_neighbors.add(k)
                # For all pairs of nodes in source_neighbors,
                for (k, l) in combinations(source_neighbors, 2):
                    # Skip if k and l are adjacent.
                    if _has_any_edge(dag, k, l):
                        continue
                    # Skip if not k->target.
                    if dag.has_edge(target, k) or (not dag.has_edge(k, target)):
                        continue
                    # Skip if not l->target.
                    if dag.has_edge(target, l) or (not dag.has_edge(l, target)):
                        continue
                    # Make source-target into source->target.
                    dag.remove_edge(target, source)
                    L().log.debug('R3: remove edge (' + target + ', ' + source + ')')
                    break

        return dag
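
# Minimal sketch of orientation Rule 1 above on a toy graph: an undirected edge is
# represented by both arcs; with k->source and k nonadjacent to target, the
# source-target edge is oriented by dropping the target->source arc.
import networkx as nx
dag = nx.DiGraph([('k', 'source'), ('source', 'target'), ('target', 'source')])
for k in list(dag.predecessors('source')):
    if dag.has_edge('source', k):                               # skip if also source->k
        continue
    if dag.has_edge(k, 'target') or dag.has_edge('target', k):  # skip if k, target adjacent
        continue
    dag.remove_edge('target', 'source')                         # orient source->target
    break
print(sorted(dag.edges))  # [('k', 'source'), ('source', 'target')]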
    def _single_run(self, initial_states, trees, seq_count, len_sequences, debug, disable_out=True):
        '''
        Runs the sampling for a single sequence: draws self.sampling_frequency
        samples from the current model and records, per node, the drawn symbols
        and the observed delta-t values for each condition.
        '''

        # get last state
        #L().log.debug("-----------------> SEQUENCE %s of %s" % (str(seq_count + 1), str(len_sequences)))
        results = []
        delta_t_distribution = {} # key: node - value: dict keyed by condition (incl. the node itself) mapping to the list of observed delta t values

        # --------- SAMPLING -----------
        pars = {}
        Constant.LOCK.acquire()
        initial_set = [n for n in self.tbn.nodes.keys() if self.tbn.Vdata[n]["parents"] is None]
        Constant.LOCK.release()
        for tz in range(self.sampling_frequency):

            # Initialize
            #if debug: L().log.debug("Sequence %s - Run %s/%s" % (str(seq_count), str(tz + 1), str(self.sampling_frequency)))
            for t in trees: trees[t].reset(initial_states)
            #Constant.LOCK.acquire()
            node_set = copy.deepcopy(initial_set)#[n for n in self.tbn.nodes.keys() if self.tbn.Vdata[n]["parents"] == None]
            #Constant.LOCK.release()
            parents_set, set_values, i, current_sample_initial = [], {}, 0, []
            current_sample, sample_legid, t_abs, t_abs_end = [], True, {}, {}

            # Iterate tree - starting from parent

            done = []
            while node_set:

                # 1. next node
                i, n = self._next_node(node_set, i)

                # 2. copy parent information - to omit parallel access
                if n not in pars:
                    Constant.LOCK.acquire()
                    par = {}
                    par["parents"] = copy.deepcopy(self.tbn.Vdata[n]["parents"])
                    par["dL_parents"] = copy.deepcopy(self.tbn.Vdata["dL_" + n]["parents"])
                    par["tbn_vals"] = copy.deepcopy(self.tbn.Vdata[n]["vals"])
                    par["children"] = copy.deepcopy(self.tbn.Vdata[n]["children"])
                    par["cprob"] = copy.deepcopy(self.tbn.Vdata[n]["cprob"])
                    pars[n] = par
                    Constant.LOCK.release()

                # 3. if initial states - draw it from there
                if n.split("_")[-1] == "0":

                    # DRAW LEAF NODE INITIAL SAMPLE
                    val = initial_states["_".join(n.split("_")[:-1])][0] #L().log.debug("%s - I return: %s " % (str(n), str(val)))
                    current_sample_initial.append([n, pars[n]["tbn_vals"].index(val)])  # info, info
                    delta_t_distribution["dL_" + n] = {}
                    if self._debug_time: trees["_".join(n.split("_")[:-1])].delta_t_for_debug["dL_" + n] = {}
                    if self._debug_time: trees["_".join(n.split("_")[:-1])].delta_t_for_debug["dL_" + n][str([val])] = 0

                    t_abs[n], t_abs_end[n], delta_t_distribution["dL_" + n][str([val])], set_values[n] = 0.0, initial_states["_".join(n.split("_")[:-1])][2], [0.0], val

                else:

                    # 4. if not initial states - draw conditioned on parents
                    # check if all parents given - else continue
                    if not set(pars[n]["parents"]).issubset(parents_set):
                        i += 1
                        continue

                    # get conditions
                    cond = [set_values[k] for k in pars[n]["parents"]]

                    # DRAW AND STORE NEXT SYMBOL
                    parent_starts = [ [self._is_never(k, set_values), t_abs[k]] for k in pars[n]["parents"]]
                    #parent_ends = [ [self._is_never(k, set_values), t_abs_end[k]] for k in pars[n]["parents"]]
                    val = trees["_".join(n.split("_")[:-1])].get_next_symbol(parent_starts, self._parent_outcome(n, set_values), cond)


                    if val is None:
                        if debug: L().log.debug("Sample NOT LEGIT - None - BREAK")
                        print("Sample not legit")
                        break

                    set_values[n] = val[0]
                    t_abs[n] = val[1]
                    t_abs_end[n] = val[2]

                    # IF DRAWN SAMPLE LEGIT RECORD IT
                    current_sample.append([n, str(cond), pars[n]["tbn_vals"].index(val[0])])
                    if debug: L().log.debug("NEXT: %s = %s" % (str(n), val[0]))
                    if debug: L().log.debug("nodes: %s" % str(node_set))

                    # RECORD DELTA T DISTRIBUTION
                    cond_dL = [set_values[k] for k in pars[n]["dL_parents"]] # [set_values[k] for k in self.tbn.Vdata["dL_" + n]["parents"]]


                    # DEBUG - only used when _debug_time is set
                    if self._debug_time:
                        if "dL_" + n  not in trees["_".join(n.split("_")[:-1])].delta_t_for_debug:
                            trees["_".join(n.split("_")[:-1])].delta_t_for_debug["dL_" + n] = {}
                        if str(cond_dL) not in trees["_".join(n.split("_")[:-1])].delta_t_for_debug["dL_" + n]:
                            trees["_".join(n.split("_")[:-1])].delta_t_for_debug["dL_" + n][str(cond_dL)] = []  # collected delta-t values
                    #trees[n.split("_")[0]].delta_t_for_debug["dL_" + n][str(cond_dL)] += [t_abs[n] - max([t_abs[k] for k in pars[n]["parents"]])]

                    # END DEBUG

                    if "dL_" + n  not in delta_t_distribution:
                        delta_t_distribution["dL_" + n] = {}
                    if not str(cond_dL) in delta_t_distribution["dL_" + n]:
                        delta_t_distribution["dL_" + n][str(cond_dL)] = []  # Summe, Anzahl
                    delta_t_distribution["dL_" + n][str(cond_dL)] += [t_abs[n] - max([t_abs[k] for k in pars[n]["parents"]])]

                # GET NEXT NODES
                parents_set.append(n)
                node_set.remove(n)
                done += [n]
                node_set += [o for o in pars[n]["children"] if not o in done and not str.startswith(o, "dL_")]
                node_set = list(set(node_set))


            results.append([current_sample_initial, current_sample])

        # fit a normal distribution on the last run, then aggregate the Gaussians - note that this must be done per condition
        for k in delta_t_distribution:
            for j in delta_t_distribution[k]:
                mean, std = norm.fit(delta_t_distribution[k][j])
                var = std * std
                if var == 0: var = 0.02  # a zero variance would make every value except the exact mean impossible
                mean_scale = [1] * len(self.tbn.Vdata[k]["parents"])
                delta_t_distribution[k][j] = {'variance': var, 'mean_base': mean, 'mean_scal': mean_scale}

                if self._debug_time: trees["_".join(k.replace("dL_", "").split("_")[:-1])].delta_t_for_debug[k][j] = {'variance': var, 'mean_base': mean}


        return results, delta_t_distribution, trees, seq_count
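
# Sketch of the delta-t fitting step above: per condition, the collected time gaps
# are summarized by a Gaussian via scipy's norm.fit, and a zero variance is replaced
# by 0.02 so that values other than the exact mean stay possible.
from scipy.stats import norm
samples = [0.48, 0.52, 0.50, 0.49]
mean, std = norm.fit(samples)  # maximum-likelihood fit: sample mean and std
var = std * std
if var == 0:
    var = 0.02
print({'variance': var, 'mean_base': mean})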
Example #18
0
    def print_settings(sg, pe, ev, test_size, train_test_split,
                       sample_sequences_from_tscbn, evidence, testmode_models):
        L().log.info(
            "---------------------------------------------------------------------------------"
        )
        L().log.info(
            "                            SETTINGS                                             "
        )
        L().log.info(
            "---------------------------------------------------------------------------------\n"
        )
        ("\n\t\t\t\t\t\t ---> Execution Settings<---")
        L().log.info("Test size: \t\t\t\t\t\t%s" % str(test_size))
        #L().log.info("Traintest split percentage: \t%s per cent" % str(train_test_split * 100))
        L().log.info("Number of reference Samples: \t%s" %
                     str(sample_sequences_from_tscbn))
        L().log.info("Evidence: \t\t\t\t\t\t%s" % str(evidence))
        L().log.info("Testmode Models : \t\t\t\t%s" % str(testmode_models))

        L().log.info("\n\t\t\t\t\t\t ---> Parameter Estimation <---")
        L().log.info("E-Step Sampling Frequency: \t\t%s" %
                     str(pe.sampling_frequency)
                     )  # sampling frequency for the MC MC Simulation
        L().log.info("EM Iterations: \t\t\t\t\t%s" %
                     str(pe.iteration_frequency))  # EM Iterations
        L().log.info("Parallel Processes: \t\t\t%s" %
                     str(pe._parallel_processes))

        L().log.info("\n\t\t\t\t\t\t ---> TSCBN Infos <---")
        '''L().log.info("Object Range: \t\t\t\t\t%s" % str(sg._object_range))
        L().log.info("Models: \t\t\t\t\t\t%s" % str([m.__class__.__name__ for m in sg._generator_models]))
        L().log.info("Number TVs: \t\t\t\t\t%s" % str(sg._temp_node_range))
        L().log.info("Number States: \t\t\t\t\t%s" % str(sg._state_range))
        L().log.info("Number Inter-TV: \t\t\t\t%s" % str(sg._edges_inter_object_range))
        L().log.info("Percentage Inter-TV: \t\t\t%s" % str(sg._percentage_inter_edges))
        L().log.info("Intra Object Range: \t\t\t%s" % str(sg._intra_object_temp_range))
        L().log.info("TSCBN Temporal Variance: \t\t%s" % str(sg._temporal_variance))
        L().log.info("State Change Probability: \t\t%s" % str(sg._sc_probability))

        L().log.info("\n\t\t\t\t\t\t ---> Evaluation Settings <---")
        L().log.info("DBN Tolerance: \t\t\t\t\t%s" % str(sg._dbn_tolerance))'''
        L().log.info(
            "RMSE TSCBN Variance: \t\t\t%s" % str(ev.rmse_tscb_variance)
        )  # variance assumed per node - does not require parameter estimation
        L().log.info("RMSE TSCBN MEAN DRIFT: \t\t\t%s" %
                     str(ev.rmse_mean_range))
        L().log.info("Evaluation Metrics")
        for m in ev._metrics:
            L().log.info("\t\t%s" % str(m))
        L().log.info(
            "---------------------------------------------------------------------------------"
        )
        L().log.info(
            "                            END SETTINGS                                             "
        )
        L().log.info(
            "---------------------------------------------------------------------------------\n\n\n"
        )
        L().log.info(
            "---------------------------------------------------------------------------------"
        )
        L().log.info("                                RUN ")
        L().log.info(
            "---------------------------------------------------------------------------------\n\n"
        )
    def _estimate_tscbn(self, sequences, debug):

        # set uniform priors ------- BUT ONLY ON FIRST ITERATION
        self._set_uniform_prior()

        # FOR TEST - if given
        try:
            kl_div = self._evaluator._compute_kl_divergence(self.tbn, self._reference, print_it=False)
            if kl_div != "N.A.": EMAlgorithmParameterEstimator.LAST_KL_DIVERGENCE = kl_div
        except Exception:
            pass  # no evaluator for the per-iteration KL divergence is set

        # get sample trees
        per_seq_trees, per_seq_initial_states = self._extract_sample_trees(sequences)

        # set parallel processes
        if len(sequences) <= self._parallel_processes: self._parallel_processes = len(sequences)

        '''cnt = 0
        tot_cnt = 0
        for i in per_seq_trees:
            for j in per_seq_trees[i]:
                if per_seq_trees[i][j].number_nevers == 0:
                    cnt += 1
                tot_cnt +=1

        print(str(cnt))
        print(str(tot_cnt))
        print(str(float(cnt)/float(tot_cnt)))
        import sys
        sys.exit(0)'''

        # EM: Iterations
        L().log.info("\n")
        L().log.info("Start EM Iterations")
        for opo in range(self.iteration_frequency):
            print("\n%sIteration:%s %s" % (PNT.BOLD, PNT.END, str(opo+1)))
            L().log.info("------------------------------------------------------------> EMIteration: %s ------------------------------------------------------------" % str(opo+1))

            # Update to new histograms
            L().log.debug("---------------------------------------------------------------------------------------------------------------------------------------------------")
            L().log.debug("     Histogram Update")
            L().log.debug("---------------------------------------------------------------------------------------------------------------------------------------------------")
            self._log_cpds()
            for k in per_seq_trees:
                trees = per_seq_trees[k]
                L().log.debug( "------------------------------------------------------------> Sequence " + str(k) + " <------------------------------------------------------------")
                [trees[t].new_iteration(opo == 0, self._debug_time) for t in trees]


            # per sequence create sample
            list_input = self._em_input(sequences, per_seq_trees, per_seq_initial_states, debug)
            print("Training size: %s" % str(len(sequences)))

            # split this input_list - to avoid memory overload
            split_size = 2001
            list_inputs = [list_input[i:i + split_size] for i in range(0, len(list_input), split_size)]
            final = False
            for i in range(len(list_inputs)):
                if i == (len(list_inputs) - 1): final = True
                l_input = list_inputs[i]

                # parallel execution of simulation
                output_list = self._parallel_em(debug, l_input)

                # Update CPD and trees + normalize all + set all distribution parameters
                self._update_CPD(output_list, per_seq_trees, final)
                per_seq_trees = self._update_trees(output_list, per_seq_trees)

                del output_list

            # print evaluation
            try:
                kl_div = self._evaluator._compute_kl_divergence(self.tbn, self._reference, print_it=False)
                if kl_div != "N.A.": EMAlgorithmParameterEstimator.LAST_KL_DIVERGENCE = kl_div
            except Exception:
                print("No Evaluation for kl per em iteration set")

        # Plot all distributions
        if self.tbn.show_plot_generated:
            self._visual.plot_histograms_from_bn(self.tbn, self.original_tbn)

        L().log.info(
            "------------------------------------------------------------> EM Finished ------------------------------------------------------------")
        self._log_cpds()
        return self.tbn
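
# Sketch of the memory-bounded chunking used in the EM loop above: the per-sequence
# inputs are processed in slices of split_size, and only the last slice triggers the
# final CPD normalization.
split_size = 3
list_input = list(range(8))
chunks = [list_input[i:i + split_size] for i in range(0, len(list_input), split_size)]
for i, chunk in enumerate(chunks):
    final = (i == len(chunks) - 1)
    print(final, chunk)  # False [0, 1, 2] / False [3, 4, 5] / True [6, 7]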
Example #20
0
    def test_conditional_independence(self, source, target, condition_set):
        adtree = self.adtree
        number_samples = adtree.count()
        source_table = adtree.table(source)
        source_values = [source_entry[0] for source_entry in source_table]
        target_table = adtree.table(target)
        target_values = [target_entry[0] for target_entry in target_table]
        dof = ((len(source_table) - 1) * (len(target_table) - 1) *
               np.prod(list(map(lambda x: len(adtree.table(x)), condition_set))))  # degrees of freedom
        if dof == 0:  # this is the case when source or target is constant
            L().log.warning('Zero degrees of freedom: Either source or target is constant!')
            return 0, 1, True  # p-value is 1
        row_size_required = 10 * dof  # test results are not really reliable if there are less than 10*dof samples
        sufficient_data = True
        if number_samples < row_size_required:
            L().log.warning('Not enough samples. ' + str(number_samples) +
                            ' is too small. Need ' + str(row_size_required) +
                            '. G^2-Test may not be reliable.')
            sufficient_data = False
        g2 = 0

        # first case: empty condition set
        if len(condition_set) == 0:
            nij = pd.DataFrame(0,
                               index=[entry[0] for entry in source_table],
                               columns=[entry[0] for entry in target_table])
            kwargs = {}  # collect arguments for ADtree lookup
            for source_value in source_values:
                for target_value in target_values:
                    kwargs.update({source: source_value, target: target_value})
                    nij.loc[source_value, target_value] = adtree.count(**kwargs)
            n_j = np.array([nij.sum(axis=1)]).T  # fix first variable and compute frequencies
            ni_ = np.array([nij.sum(axis=0)])  # fix second variable and compute frequencies
            expected_nij = n_j.dot(ni_) / number_samples  # expectation of nij
            ln_argument = nij.divide(expected_nij)  # compute argument for ln()
            ln_results = np.log(ln_argument)  # compute ln()
            g2 = np.nansum(nij.multiply(2 * ln_results))  # compute sum of the ln terms

        # second case: non-empty condition set
        if len(condition_set) > 0:
            # calculate number of possible combinations of the values in the condition set
            prod_levels = np.prod(
                list(map(lambda x: len(adtree.table(x)), condition_set)))
            condition_set_values = [
                list([entry[0] for entry in adtree.table(node)])
                for node in condition_set
            ]
            cs_value_combinations = list(product(*condition_set_values))
            nij_ = [
                pd.DataFrame(0,
                             index=[entry[0] for entry in source_table],
                             columns=[entry[0] for entry in target_table])
                for _ in cs_value_combinations
            ]
            nijk = pd.concat(nij_,
                             keys=cs_value_combinations)  # type: pd.DataFrame

            # fill in frequencies
            kwargs = {}  # collect arguments for ADtree lookup
            for source_value in source_values:
                for target_value in target_values:
                    for cs_value_combination in cs_value_combinations:
                        kwargs.update({
                            source: source_value,
                            target: target_value
                        })
                        kwargs.update(zip(condition_set, cs_value_combination))
                        nijk.xs(cs_value_combination).loc[source_value, target_value] = adtree.count(**kwargs)

            ni__ = np.ndarray((len(source_table), prod_levels))
            n_j_ = np.ndarray((len(target_table), prod_levels))
            for value_combination in cs_value_combinations:
                index = cs_value_combinations.index(value_combination)
                ni__[:, index] = nijk.xs(value_combination).sum(axis=1)
                n_j_[:, index] = nijk.xs(value_combination).sum(axis=0)
            n__k = n_j_.sum(axis=0)
            for value_combination in cs_value_combinations:
                index = cs_value_combinations.index(value_combination)
                ni_k = np.array([ni__[:, index]]).T  # fix condition set and compute source frequencies
                n_jk = np.array([n_j_[:, index]])  # fix condition set and compute target frequencies
                expected_nijk = ni_k.dot(n_jk) / n__k[index]  # expected frequencies for nijk
                ln_argument = nijk.xs(value_combination) / expected_nijk  # argument for ln()
                ln_results = np.log(ln_argument)  # compute ln()
                g2 += np.nansum(nijk.xs(value_combination).multiply(2 * ln_results))

        p_val = chi2.sf(g2, dof)  # compute p-value using the chi^2 distribution
        return g2, p_val, sufficient_data
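
# Worked sketch of the G^2 statistic above for an unconditional 2x2 table:
# G^2 = 2 * sum_ij n_ij * ln(n_ij / e_ij) with e_ij from the row/column margins,
# and the p-value comes from a chi^2 distribution with (r-1)(c-1) degrees of freedom.
import numpy as np
from scipy.stats import chi2
nij = np.array([[30., 10.], [20., 40.]])
expected = nij.sum(axis=1, keepdims=True) * nij.sum(axis=0, keepdims=True) / nij.sum()
g2 = np.nansum(2 * nij * np.log(nij / expected))
dof = (nij.shape[0] - 1) * (nij.shape[1] - 1)
print(g2, chi2.sf(g2, dof))  # large G^2, tiny p-value: the variables are dependent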
Example #21
0
    def discover_structure_from_pops(self, pops, data):
        """
        This method takes the potential parents of all nodes and the ADtree with all the data. An approach similar to
        PC algorithm is performed to determine the parent set for each node.
        :param pops: map from nodes to their potential parents
        :param data: ADtree or pandas dataframe
        :return nodes: list of nodes
        :return edges: list of inter edges
        """

        def create_maximal_pgm(pops):
            pgm = nx.DiGraph()
            pgm.add_nodes_from(pops)  # create nodes
            for node in pops:
                edges = [(parent, node) for parent in pops.get(node) if
                         node.rsplit('_', 1)[0] != parent.rsplit('_', 1)[0]]
                pgm.add_edges_from(edges)  # add edges
            return pgm

        def markov_blanket(graph, parent_node, node):
            mb = set(pa for pa in graph.predecessors(node))  # add parent nodes
            mb |= set(ch for ch in graph.successors(node))  # add child nodes
            for child in graph.successors(node):  # add parents of children
                mb |= set(pa for pa in graph.predecessors(child))
            if node in mb:  # remove node
                mb.remove(node)
            if parent_node in mb:  # remove parent_node
                mb.remove(parent_node)
            return mb

        max_pgm = create_maximal_pgm(pops)

        if self.draw:
            plt.title('Maximal PGM (only intra-edges)')
            signal_pos_map = {}
            pos = {}
            for node in max_pgm.nodes:
                if node.rsplit('_', 1)[0] not in signal_pos_map:
                    signal_pos_map.update({node.rsplit('_', 1)[0]: len(signal_pos_map)})
                x_coordinate = int(node.rsplit('_', 1)[1])
                y_coordinate = signal_pos_map.get(node.rsplit('_', 1)[0])
                pos.update({node: [x_coordinate, y_coordinate]})
            nx.draw(max_pgm, pos=pos, with_labels=True)
            plt.show()

        if isinstance(data, ADTree):
            cb_estimator = GSquareEstimator(adtree=data)
        else:
            cb_estimator = BaseEstimator(data=data, complete_samples_only=False)
        # procedure similar to PC algorithm
        pgm = max_pgm.copy()
        condition_set_size = 0
        L().log.debug('---------------------------------------------------')
        L().log.debug('---- Conditional Independence Tests ---------------')
        L().log.debug('---------------------------------------------------')

        # optional fast path: screen each edge with a plain chi-square test instead of the full PC-style search
        if self.optimization_chi_square:
            import scipy.stats as scs

            def chi_square_of_df_cols(df, col1, col2):
                df_col1, df_col2 = df[col1], df[col2]
                categories_2 = list(df_col2.unique())
                categories_1 = list(df_col1.unique())
                result = [[sum((df_col1 == cat1) & (df_col2 == cat2))
                           for cat2 in categories_2]
                          for cat1 in categories_1]
                return scs.chi2_contingency(result)

            remove_edges = []
            for (source, target) in pgm.edges():
                # check how correlated source and target are (regardless of the Markov blanket)
                chi2_stat, p, dof, _ = chi_square_of_df_cols(self.data, source, target)
                if chi2_stat < self.chi_square_thresh and pgm.has_edge(source, target):
                    L().log.debug('remove edge ' + str((source, target)))
                    remove_edges.append((source, target))
            pgm.remove_edges_from(remove_edges)


            # additionally remove edges which are conditionally independent:
            # e.g. given a->b and c->b, if c is independent of b given a, the edge c->b can be removed
            remove_edges = []
            for (source, target) in pgm.edges():
                condition_set = [a for a in pgm.predecessors(target) if a != source]
                if not condition_set: continue
                _, p_val, _ = cb_estimator.test_conditional_independence(source, target, list(condition_set))
                if p_val > self.alpha:
                    if pgm.has_edge(source, target):
                        L().log.debug('remove edge ' + str((source, target)))
                        remove_edges.append((source, target))
            pgm.remove_edges_from(remove_edges)

        else:
            while True:
                cont = False
                remove_edges = []
                for (source, target) in pgm.edges():
                    mb = markov_blanket(pgm, target, source)
                    if len(mb) >= condition_set_size:
                        L().log.debug('testing ' + source + ' --> ' + target)
                        L().log.debug('markov blanket of ' + source + ' is ' + str(mb))
                        for condition_set in combinations(mb, condition_set_size):
                            L().log.debug(
                                'independence test of ' + source + ' and ' + target + ' with subset ' + str(condition_set))
                            _, p_val, _ = cb_estimator.test_conditional_independence(source, target, list(condition_set))
                            #if isnan(p_val):  # pgmpy CI test returns NaN instead of 1
                            #    p_val = 1
                            L().log.debug('p_val = ' + str(p_val))
                            if p_val > self.alpha:
                                if pgm.has_edge(source, target):
                                    L().log.debug('remove edge ' + str((source, target)))
                                    remove_edges.append((source, target))
                                break
                        cont = True
                condition_set_size += 1
                pgm.remove_edges_from(remove_edges)
                if cont is False:
                    break
                if condition_set_size > self.max_reach:
                    break

        if self.draw:
            plt.title('PGM after CI tests (only inter-edges)')
            signal_pos_map = {}
            pos = {}
            for node in pgm.nodes:
                if node.rsplit('_', 1)[0] not in signal_pos_map:
                    signal_pos_map.update({node.rsplit('_', 1)[0]: len(signal_pos_map)})
                x_coordinate = int(node.rsplit('_', 1)[1])
                y_coordinate = signal_pos_map.get(node.rsplit('_', 1)[0])
                pos.update({node: [x_coordinate, y_coordinate]})
            nx.draw(pgm, pos=pos, with_labels=True)
            plt.show()

        nodes = list(pops.keys())
        edges = [list(edge) for edge in pgm.edges]
        return nodes, edges
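
# Hedged sketch of the chi-square screening path above (hypothetical column data):
# the contingency table of two discrete columns is fed to scipy's chi2_contingency,
# and weakly associated pairs would fall below chi_square_thresh and lose their edge.
import pandas as pd
import scipy.stats as scs
df = pd.DataFrame({'A_0': ['x'] * 20 + ['y'] * 20,
                   'B_0': ['u'] * 20 + ['v'] * 20})
table = pd.crosstab(df['A_0'], df['B_0'])
chi2_stat, p, dof, expected = scs.chi2_contingency(table)
print(chi2_stat, p)  # strong association here, so the edge between A_0 and B_0 would be kept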