Python DataExtractor.get_data_vectors Beispiele

Programmiersprache: Python

Namespace / Paketname: data_extractor

Klasse / Typ: DataExtractor

Methode / Funktion: get_data_vectors

Beispiele auf hotexamples.com: 4

Python DataExtractor.get_data_vectors - 4 Beispiele gefunden. Dies sind die am besten bewerteten Python Beispiele für die data_extractor.DataExtractor.get_data_vectors, die aus Open Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

DataExtractor(30)

location_mapping(10)

get_movie_actor_data(7)

get_mlmovies_data(6)

choose_versions(5)

get_data_vectors(4)

extract(4)

get_variable_values_sets(4)

get_mltags_data(4)

get_genome_tags_data(4)

get_imdb_actor_info_data(3)

get_bugged_files_path(3)

save(2)

read(2)

location_title_to_id_mapping(2)

checkout_version(2)

get_selected_versions(2)

extract_data(2)

get_data(1)

prepare_dataset_for_task1(1)

load_train_labels(1)

load_valid_data(1)

load_valid_labels(1)

extractTextFromTagList(1)

extractText(1)

prepare_dataset_for_task3(1)

load_test_labels(1)

prepare_dataset_for_task6(1)

dropna(1)

create_dataset(1)

save_csv(1)

set_selected_config(1)

to_json(1)

load_train_data(1)

load_test_data(1)

get_details_from_results(1)

init_jira_commits(1)

import_matches(1)

get_versions_by_type(1)

get_task4_feedback_data(1)

get_task2_feedback_data(1)

collect_data(1)

get_race(1)

append_givenloc_to_list(1)

findAllTags(1)

get_mlratings_data(1)

getData(1)

get_all_files_prefixed_with(1)

get_bugged_methods_path(1)

get_files_bugged(1)

Beispiel #1

Datei anzeigen

Datei: estimate_params.py Projekt: hbarthwal/course_projects

class ParameterEstimation:
    def __init__(self, network):
        self._network = network
        self._data = DataExtractor(network.name)

    def _get_probabilities(self, X, S, S_combinations):
        data_vectors = self._data.get_data_vectors()
        N = len(data_vectors[data_vectors.keys()[0]])
        X_values = data_vectors[X]
        observed_prob_dict = {}
        #  Now we look for the  value x of the variable X
        for x in self._values_dict[X]:
            #  finding matches for x
            x_indices = set([element_index for (element_index, element) in enumerate(X_values) if element == x])
            observed_prob_dict['P(' + X + '=' + x + ')'] = (len(x_indices) / N) + 0.001

            for S_combination in S_combinations:
                z_indices = self._get_z_indices(S, S_combination)
                z = z_indices
                x_z = x_indices.intersection(z)
                observed_prob_dict['P(' + X + '=' + x + '|' + ','.join(S) + '=' + ','.join(S_combination) + ')'] = (len(x_z) / float(len(z))) + 0.001
        return observed_prob_dict

    def get_estimated_cpds(self):
        values_dict = self._data.get_variable_values_sets()
        cpds = []
        for node in self._network:
            parents = self._network.predecessors(node)
            value_combinations = PGMUtils.get_combinations(parents, values_dict)
            probability_dict = self._get_probabilities(node, parents, value_combinations)
            cpds.append(probability_dict)
        return cpds

Beispiel #2

Datei anzeigen

Datei: greedy_hill_climbing.py Projekt: hbarthwal/course_projects

class GreedyHillClimber:
    _max_change_count = 20

    def __init__(self, hyperparameter, initial_bayesian_network, tabu_list_size, max_change_count):
        self._bayesian_network = initial_bayesian_network
        self._best_score = -float('inf')
        self._best_solution = initial_bayesian_network
        self._actions_list = ['add', 'remove', 'reverse']
        self._tabu_list = OrderedDict()
        self._tabulist_size = tabu_list_size
        self._max_change_count = max_change_count
        self._data = DataExtractor(initial_bayesian_network.name)
        self._node_names = self._data.get_variable_values_sets().keys()
        values_sets = self._data.get_variable_values_sets()
        data_vectors = self._data.get_data_vectors()
        self._score_util = BDeuScoreUtil(hyperparameter, self._bayesian_network, data_vectors, values_sets)

    def _get_score(self, action = None, edge = None):
        #  We calculate the score using the BDeu score
        #  calculator
        return self._score_util.get_score(action, edge)

    def _equals(self, bayesian_network_A, bayesian_network_B):
        #  Return true if two bayesian network with identical nodes
        #  also have identical edges.
        signature_A = self._get_bn_signature(bayesian_network_A)
        signature_B = self._get_bn_signature(bayesian_network_B)
        return signature_A == signature_B

    def _tabu_list_contains(self, bayesian_network):
        #  Returns true if the tabu list contains the given
        #  bayesian network
        solution_signature = self._get_bn_signature(bayesian_network)
        has_solution = solution_signature in self._tabu_list
        if has_solution:
            pass    #  print 'solution is  contained in  tabulist(length = ', len(self._tabu_list), ')'
        else:
            pass    #  print  'solution is  not contained in  tabulist'
        return has_solution

    def _get_bn_signature(self, bayesian_network):
        #  Generate a string from the edge set of the given bayesian
        #  network which is unique for a given edge set
        edge_string_list = []
        for edge in bayesian_network.edges():
            edge_string = str(edge[0]) + '-' + str(edge[1])
            edge_string_list.append(edge_string)
        signature = ' '.join(edge_string_list)
        return signature

    def _add_solution_to_tabu_list(self, bayesian_network):
        #  Adds the given bayesian network to the tabu list
        if len(self._tabu_list) == self._tabulist_size:
            first_key = self._tabu_list.keys()[0]
            self._tabu_list.pop(first_key)

        solution_signature = self._get_bn_signature(bayesian_network)
        self._tabu_list[solution_signature] = 'dummy'

    def _get_feasible_local_solutions(self, bayesian_network, undirected_graph, edge):
        local_solutions_action_pairs = []
        #  Calculate all possible local solutions by applying
        #  all the possible actions.
        temp_bn = deepcopy(bayesian_network)
        temp_graph = deepcopy(undirected_graph)
        for action in self._actions_list:
            #  print action + 'ing', edge, ' in ', bayesian_network.edges()
            is_feasible = GraphUtils.apply_action(temp_bn, temp_graph, (edge), action, 2)

            if not is_feasible:
                #  If the action was not feasible  then try again
                #  print 'Infeasible action.. trying with different action'
                continue

            if self._tabu_list_contains(temp_bn):
                #  If generated solution is already in the tabu list then try again
                #  print 'Solution already in tabu list trying again'
                continue
            #  print 'Got ', temp_bn.edges()
            local_solutions_action_pairs.append((temp_bn, action))
            temp_bn = deepcopy(bayesian_network)
            temp_graph = deepcopy(undirected_graph)
        return local_solutions_action_pairs

    def _get_best_local_solution(self, bayesian_network, undirected_graph, edge):
        local_solutions_action_pairs = self._get_feasible_local_solutions(bayesian_network, undirected_graph, edge)
        if len(local_solutions_action_pairs) == 0:
            return self._get_score(bayesian_network), bayesian_network
        scores = [self._get_score(solution_action_pair[1], edge) for solution_action_pair in local_solutions_action_pairs]
        #  The solution with maximum score is the most optimal one
        sorted_scores = sorted(scores, reverse = True)
        #  print 'Scores: ', scores
        best_local_solution_score = sorted_scores[0]
        best_solution_index = scores.index(best_local_solution_score)
        #  print local_solutions_action_pairs[best_solution_index][1], ' action is the best action'
        best_local_solution = local_solutions_action_pairs[best_solution_index][0]
        return best_local_solution_score, best_local_solution

    def perform_GHC(self):
        current_solution = self._bayesian_network
        self._best_score = current_score = self._get_score(current_solution)
        #  draw(self._bayesian_network)
        #  plt.show()
        print 'Initial score :', self._best_score
        undirected_graph = current_solution.to_undirected()
        change_count = 0
        max_count = self._max_change_count
        print max_count
        while True:
            #  Pick a random edge and decide the best action to be
            #  applied on the edge
            random_edge = GraphUtils.get_random_edge(self._node_names)
            #  print random_edge, ' is the edge selected'
            current_score, current_solution = \
            self._get_best_local_solution(current_solution,
                                              undirected_graph, random_edge)
            undirected_graph = current_solution.to_undirected()

            if current_score > self._best_score:
                change_count = 0
                #  Update the new best solution
                self._best_solution = deepcopy(current_solution)
                self._best_score = current_score
                print '-----------', self._best_score , '------------------'

            else:
                change_count += 1

            self._add_solution_to_tabu_list(current_solution)

            if change_count == max_count:
                break

    def get_solution(self):
        return self._best_solution, self._best_score

Beispiel #3

Datei anzeigen

Datei: pgmlib_learner.py Projekt: hbarthwal/course_projects

@author: himanshu
'''
import json
from networkx import DiGraph, draw
from libpgm.nodedata import NodeData
from libpgm.graphskeleton import GraphSkeleton
from libpgm.discretebayesiannetwork import DiscreteBayesianNetwork
from libpgm.pgmlearner import PGMLearner
import matplotlib.pyplot as plt
from data_extractor import DataExtractor


#  generate some data to use
data_ext = DataExtractor('genome', format = 'json')
data = data_ext.get_data_vectors()
print 'Got data with ', len(data), ' vectors'
#  instantiate my learner
learner = PGMLearner()

print 'learning the structure'
#  estimate structure
result = learner.discrete_constraint_estimatestruct(data, pvalparam = 0.02)

#  output
print json.dumps(result.E, indent = 2)
graph = DiGraph()
graph.add_edges_from(result.E)
draw(graph)
plt.show()

Beispiel #4

Datei anzeigen

Datei: bn_skeleton.py Projekt: hbarthwal/course_projects

class PC:

    _mutual_info_thresholds = [0.0005, 0.005, 0.025, 0.025]

    def __init__(self, network_name):
        self._network_name = network_name
        self._data = DataExtractor(network_name)
        self._values_dict = self._data.get_variable_values_sets()
        self._node_names = self._values_dict.keys()
        self._graph = None
        self._nmis = {}

    def _get_probabilities(self, X, Y, S, S_combinations):
        data_vectors = self._data.get_data_vectors()
        N = len(data_vectors[data_vectors.keys()[0]])
        X_values = data_vectors[X]
        Y_values = data_vectors[Y]
        observed_prob_dict = {}
        #  Now we look for the  value x of the variable X, and value y of the variable Y
        for x in self._values_dict[X]:    #  finding matches for x
            x_indices = set([element_index for (element_index, element) in enumerate(X_values) if element == x])
            observed_prob_dict['P(' + X + '=' + x + ')'] = len(x_indices) / float(N)
            for y in self._values_dict[Y]:
                #  finding matches for y
                y_indices = set([element_index for (element_index, element) in enumerate(Y_values) if element == y])
                observed_prob_dict['P(' + Y + '=' + y + ')'] = len(y_indices) / float(N)
                xy = x_indices.intersection(y_indices)
                observed_prob_dict['P(' + X + '=' + x + ',' + Y + '=' + y + ')'] = len(xy) / float(N)

                for S_combination in S_combinations:
                    z_indices = PGMUtils.get_z_indices(S, S_combination, data_vectors)
                    z = z_indices
                    y_z = y_indices.intersection(z)
                    x_z = x_indices.intersection(z)
                    xyz = xy.intersection(z)
                    observed_prob_dict['P(' + X + '=' + x + '|' + ','.join(S) + '=' + ','.join(S_combination) + ')'] = len(x_z) / float(len(z))
                    observed_prob_dict['P(' + ','.join(S) + '=' + ','.join(S_combination) + ')'] = len(z) / float(N)
                    observed_prob_dict['P(' + Y + '=' + y + '|' + ','.join(S) + '=' + ','.join(S_combination) + ')'] = len(y_z) / float(len(z))
                    observed_prob_dict['P(' + X + '=' + x + ',' + Y + '=' + y + ',' + ','.join(S) + '=' + ','.join(S_combination) + ')'] = len(xyz) / float(N)
        return observed_prob_dict

    def _are_dseparated(self, X, Y, S, n):
        H_X = H_Y = H_XY = 0
        S_combinations = PGMUtils.get_combinations(S, self._values_dict)
        probability_dict = self._get_probabilities(X, Y, S, S_combinations)
        for x in self._values_dict[X]:
            p_x = probability_dict['P(' + X + '=' + x + ')']
            for y in self._values_dict[Y]:
                p_y = probability_dict['P(' + Y + '=' + y + ')']
                #  in case we are looking for zero order conditional dependency
                if len(S_combinations) == 0:
                    H_Y += -log(p_y + 0.001)
                    H_X += -log(p_x + 0.001)
                    p_xy = probability_dict['P(' + X + '=' + x + ',' + Y + '=' + y + ')']
                    H_XY += -log(p_xy + 0.001)
                else:
                    for S_combination in S_combinations:
                        p_y_z = probability_dict['P(' + Y + '=' + y + '|' + ','.join(S) + '=' + ','.join(S_combination) + ')']
                        p_x_z = probability_dict['P(' + X + '=' + x + '|' + ','.join(S) + '=' + ','.join(S_combination) + ')']
                        p_xyz = probability_dict['P(' + X + '=' + x + ',' + Y + '=' + y + ',' + ','.join(S) + '=' + ','.join(S_combination) + ')']
                        p_z = probability_dict['P(' + ','.join(S) + '=' + ','.join(S_combination) + ')']
                        H_X += -log(p_x_z + 0.001)
                        H_Y += -log(p_y_z + 0.001)
                        H_XY += -log(p_xyz * p_z + 0.001)

        #  If mutual information is greater than certain threshhold
        #  then X and Y are dependent otherwise not
        n_X = 2 * len(self._values_dict[X])
        n_Y = 2 * len(self._values_dict[Y])
        n_XY = 4 * len(S_combinations)
        if n_XY == 0:
            n_XY = 4
        MI = abs((H_X / n_X) + (H_Y / n_Y) - (H_XY / n_XY))
        #  print 'MI(', X + ',' + Y + '|' + ','.join(S), ') = ', MI
        self._nmis[X + ',' + Y + '|' + ','.join(S)] = MI
        if MI < self._mutual_info_thresholds[n]:
            return True
        return False

    def _eliminate_edges(self, Sep):
        num_nodes = len(self._values_dict.keys())
        graph = complete_graph(num_nodes, Graph())
        self._graph = GraphUtils.rename_nodes(graph, self._node_names)
        n = 0
        max_allowed_degree = settings.networks_settings['genome']['max_allowed_degree']
        while n <= 3:
            print '--------------------------------------------------------'
            #  We repeat the iterations unless each node X has
            #  less than or equal to n neighbors
            for X in self._graph:
                for Y in self._graph:
                    if X != Y and GraphUtils.is_degree_greater(self._graph, max_allowed_degree) \
                    and is_connected(self._graph):
                        #  all the neighbors of X excluding Y
                        neighbors = self._graph.neighbors(X)
                        if Y in neighbors:
                            neighbors.remove(Y)
                            #  We only consider X,Y if #neighbors of X excluding Y are more than
                            #  or equal to n
                            if len(neighbors) >= n:
                                #  Combinations of all the adjacent nodes of X excluding Y
                                #  each subset in the observed_sets has cardinality 'n'
                                observed_subsets = combinations(neighbors, n)
                                for S in observed_subsets:
                                    #  We only consider the subsets which have exactly
                                    S = [s for s in sorted(S)]
                                    are_deseparated = self._are_dseparated(X, Y, S, n)
                                    if are_deseparated:
                                        if self._graph.has_edge(X, Y):
                                            self._graph.remove_edge(X, Y)
                                            print 'Removed', X, '-', Y
                                            Sep[X + ',' + Y] = S
                                            Sep[Y + ',' + X] = S
            n += 1

    def _has_directed_path(self, A, B):
        has_directed_path = False
        paths = all_simple_paths(self._graph, A, B)
        for path in paths:
            has_directed_path = has_directed_path or has_directed_path
            if has_directed_path:
                break
            i = 0
            while i < len(path) - 1:
                src_node = path[i]
                next_node = path[i + 1]
                edge = self._graph.edge[src_node] [next_node]
                if 'direction' in edge:
                    if edge['direction'] == src_node + '->' + next_node:
                        has_directed_path = True
                else:
                    has_directed_path = False
                    break
                i += 1
        return has_directed_path

    def _all_edges_oriented(self):
        '''
        for edge in self._graph.edges():
            if 'direction'in self._graph[edge[0]][edge[1]]:
                print self._graph[edge[0]][edge[1]]['direction']
        '''
        for edge in self._graph.edges():
            if 'direction'not in self._graph[edge[0]][edge[1]]:
                return False
        return True

    def _orient_edges(self, Sep):
        triplets = []
        for source in self._graph.nodes():
            for target in self._graph.nodes():
                if source != target:
                    if not self._graph.has_edge(source, target):
                        #  Each element in triplets lists will be a list of three nodes
                        #  [X, Y , Z] such that X and Z are not adjacent in the graph
                        #  while X,Y and Y,Z are adjacent
                        triplets.append(list(all_simple_paths(self._graph, source, target, 2)))

        for triplet in triplets:
            if triplet != []:
                X, Y , Z = triplet[0][0], triplet[0][1], triplet[0][2]
                if Y not in Sep[X + ',' + Z]:
                    #  We dont have partially connected graphs in networkx library
                    #  so we attach a direction attribute to all the edges which we
                    #  want to be directed
                    edgeXY = self._graph.edge[X][Y]
                    edgeXY['direction'] = X + '->' + Y
                    edgeZY = self._graph.edge[Z][Y]
                    edgeZY['direction'] = Z + '->' + Y

        while not self._all_edges_oriented():
            for edge in self._graph.edges():
                A = edge[0]
                B = edge[1]
                edgeAB = self._graph.edge[A][B]
                if 'direction' in edgeAB:
                    if edgeAB['direction'] == A + '->' + 'B':
                        for C in self._graph.neighbors(B):
                            #  A & C are not adjacent
                            if not self._graph.has_edge(A, C):
                                edgeBC = self._graph.edge[B][C]
                                if 'direction' not in edgeBC:
                                    edgeBC['direction'] = B + '->' + C

                elif self._has_directed_path(A, B):
                    edgeAB['direction'] = A + '->' + B

    def perform_PC(self):
        #  Implementation of the PC algorithm given here:
        #  http://www.lowcaliber.org/influence/spirtes-causation-prediction-search.pdf
        Sep = {}
        self._eliminate_edges(Sep)
        pprint(sorted(self._nmis.iteritems(), key = itemgetter(1), reverse = True))
        print self._graph.edges()
        draw(self._graph)
        plt.show()
        if is_connected(self._graph):
            print 'The graph is connected'
        else:
            print 'The graph is not connected'

        self._orient_edges(Sep)
        pprint (self._graph.edges())

    def get_skeleton(self):
        self._graph = GraphUtils.convert_to_directed(self._graph)
        self._graph.name = self._network_name
        return self._graph