def _transform_vertex_pair_base(self, G, v, u, distance, feature_list):
    # for all radii
    for radius in range(self.min_r, self.r + 2, 2):
        for label_index in range(G.graph['label_size']):
            if radius < len(G.node[v]['neighborhood_graph_hash'][label_index]) and \
                    radius < len(G.node[u]['neighborhood_graph_hash'][label_index]):
                # feature as a pair of neighbourhoods at a radius, distance
                # canonicalization of the pair of neighbourhoods
                v_hash = G.node[v]['neighborhood_graph_hash'][label_index][radius]
                u_hash = G.node[u]['neighborhood_graph_hash'][label_index][radius]
                if v_hash < u_hash:
                    first_hash = v_hash
                    second_hash = u_hash
                else:
                    first_hash = u_hash
                    second_hash = v_hash
                t = [first_hash, second_hash, radius, distance]
                feature = fast_hash(t, self.bitmask)
                key = fast_hash([radius, distance], self.bitmask)
                if G.graph.get('weighted', False) is False:
                    feature_list[key][feature] += 1
                else:
                    feature_list[key][feature] += \
                        G.node[v]['neighborhood_graph_weight'][radius] + \
                        G.node[u]['neighborhood_graph_weight'][radius]
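# Hedged illustration (not from the library): why _transform_vertex_pair_base
# sorts the two neighborhood hashes before hashing. toy_hash below is a
# stand-in for eden's fast_hash, used only to keep this sketch self-contained.
def toy_hash(items, bitmask=2 ** 20 - 1):
    # deterministic integer code for a list of ints, clipped by the bitmask
    return hash(tuple(items)) & bitmask


def toy_pair_feature(v_hash, u_hash, radius, distance):
    # canonical ordering makes the feature independent of the (v, u) order
    first_hash, second_hash = min(v_hash, u_hash), max(v_hash, u_hash)
    return toy_hash([first_hash, second_hash, radius, distance])


# the same feature code is produced no matter which endpoint is "v"
assert toy_pair_feature(7, 3, radius=1, distance=2) == \
    toy_pair_feature(3, 7, radius=1, distance=2)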
def graph_hash(graph, hash_bitmask, node_name_label=lambda id, node: node['hlabel']):
    """
    Calculate a hash of a graph.
    """
    node_names = {n: calc_node_name(graph, n, hash_bitmask, node_name_label)
                  for n in graph.nodes()}
    tmp_fast_hash = lambda a, b: fast_hash([(a ^ b) + (a + b), min(a, b), max(a, b)])
    l = [tmp_fast_hash(node_names[a], node_names[b]) for (a, b) in graph.edges()]
    l.sort()
    # isolates are isolated nodes
    isolates = [n for (n, d) in graph.degree_iter() if d == 0]
    z = [node_name_label(node_id, graph.node[node_id]) for node_id in isolates]
    z.sort()
    return fast_hash(l + z, hash_bitmask)
def _compute_neighborhood_graph_hash(self, root, graph):
    # list all hashed labels at increasing distances
    hash_list = []
    # for all distances
    root_dist_dict = graph.node[root]['remote_neighbours']
    for node_set in root_dist_dict.itervalues():
        # create a list of hashed labels
        hash_label_list = []
        for v in node_set:
            # compute the vertex hashed label by hashing the hlabel field
            # with the degree of the vertex (obtained as the size of the
            # adjacency dictionary for the vertex v) or, in case positional
            # is set, using the relative position of the vertex v w.r.t.
            # the root vertex
            if self.positional:
                vhlabel = fast_hash_2(graph.node[v]['hlabel'], root - v)
            else:
                vhlabel = fast_hash_2(graph.node[v]['hlabel'], len(graph[v]))
            hash_label_list.append(vhlabel)
        # sort it
        hash_label_list.sort()
        # hash it
        hashed_nodes_at_distance_d_in_neighborhood = fast_hash(hash_label_list)
        hash_list.append(hashed_nodes_at_distance_d_in_neighborhood)
    # hash the sequence of hashes of the node set at increasing
    # distances into a list of features
    hash_neighborhood = fast_hash_vec(hash_list)
    graph.node[root]['neigh_graph_hash'] = hash_neighborhood
def graph_hash(graph, hash_bitmask, node_name_label=None):
    """
    Calculate a hash of a graph.
    """
    l = []
    node_name_cache = {}
    all_nodes = set(graph.nodes())
    visited = set()
    # all the edges
    for (a, b) in graph.edges():
        visited.add(a)
        visited.add(b)
        ha = node_name_cache.get(a, -1)
        if ha == -1:
            ha = calc_node_name(graph, a, hash_bitmask, node_name_label)
            node_name_cache[a] = ha
        hb = node_name_cache.get(b, -1)
        if hb == -1:
            hb = calc_node_name(graph, b, hash_bitmask, node_name_label)
            node_name_cache[b] = hb
        l.append((ha ^ hb) + (ha + hb))
    l.sort()
    # nodes that don't have edges
    if node_name_label is None:
        z = [graph.node[node_id]['hlabel'][0] for node_id in all_nodes - visited]
    else:
        z = [graph.node[node_id][node_name_label] for node_id in all_nodes - visited]
    z.sort()
    ihash = fast_hash(l + z, hash_bitmask)
    return ihash
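# Hedged usage sketch (not from the library): assumes NetworkX 1.x (graph.node,
# degree_iter) and that graph_hash, fast_hash and the calc_node_name variant
# that accepts node_name_label=None (shown further below) are all in scope.
import networkx as nx

g = nx.Graph()
g.add_node(0, hlabel=[3])
g.add_node(1, hlabel=[5])
g.add_node(2, hlabel=[7])  # isolated node, contributes through the `z` list
g.add_edge(0, 1)

# deterministic integer fingerprint of the labelled graph
print(graph_hash(g, hash_bitmask=2 ** 20 - 1))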
def _compute_neighborhood_graph_hash(self, root, G):
    hash_neighborhood_list = []
    # for all labels
    for label_index in range(G.graph['label_size']):
        # list all hashed labels at increasing distances
        hash_list = []
        # for all distances
        root_dist_dict = G.node[root]['remote_neighbours']
        for node_set in root_dist_dict.itervalues():
            # create a list of hashed labels
            hash_label_list = []
            for v in node_set:
                vhlabel = G.node[v]['hlabel'][label_index]
                hash_label_list.append(vhlabel)
            # sort it
            hash_label_list.sort()
            # hash it
            hashed_nodes_at_distance_d_in_neighborhood_set = fast_hash(
                hash_label_list, self.bitmask)
            hash_list.append(hashed_nodes_at_distance_d_in_neighborhood_set)
        # hash the sequence of hashes of the node set at increasing
        # distances into a list of features
        hash_neighborhood = fast_hash_vec(hash_list, self.bitmask)
        hash_neighborhood_list.append(hash_neighborhood)
    G.node[root]['neighborhood_graph_hash'] = hash_neighborhood_list
def calc_node_name(interfacegraph, node, hash_bitmask,
                   node_name_label=lambda id, node: node['hlabel']):
    '''
    part of generating the hash for a graph is calculating the hash of a node in the graph

    # the case that n has no neighbors is currently untested...
    '''
    d = nx.single_source_shortest_path_length(interfacegraph, node, 20)
    l = [node_name_label(nid, interfacegraph.node[nid]) + dis for nid, dis in d.items()]
    l.sort()
    return fast_hash(l, hash_bitmask)
def _transform(self, instance_id, seq):
    if seq is None or len(seq) == 0:
        raise Exception('ERROR: something went wrong, empty instance at position %d.' % instance_id)
    # extract kmer hash codes for all kmers up to r in all positions in seq
    seq_len = len(seq)
    neighborhood_hash_cache = [self._compute_neighborhood_hash(seq, pos)
                               for pos in range(seq_len)]
    # construct features as pairs of kmers up to distance d for all radii up to r
    feature_list = defaultdict(lambda: defaultdict(float))
    for pos in range(seq_len):
        for radius in range(self.min_r, self.r + 1):
            if radius < len(neighborhood_hash_cache[pos]):
                feature = [neighborhood_hash_cache[pos][radius], radius]
                for distance in range(self.min_d, self.d + 1):
                    if pos + distance + radius < seq_len:
                        dfeature = feature + [distance,
                                              neighborhood_hash_cache[pos + distance][radius]]
                        feature_code = fast_hash(dfeature, self.bitmask)
                        key = fast_hash([radius, distance], self.bitmask)
                        feature_list[key][feature_code] += 1
    return self._normalization(feature_list, instance_id)
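# Hedged, self-contained sketch (not from the library) of the feature scheme in
# _transform: pair the k-mer code at `pos` with the k-mer code at `pos + distance`,
# keyed by (radius, distance). Python's built-in hash stands in for fast_hash,
# the two-level dict (key -> code -> count) is flattened into one dict, and
# min_r = min_d = 0 is assumed.
from collections import defaultdict


def toy_kmer_code(seq, pos, radius):
    # integer code for the substring of length radius + 1 starting at pos
    return hash(seq[pos:pos + radius + 1])


def toy_sequence_features(seq, r=2, d=3):
    feats = defaultdict(float)
    for pos in range(len(seq)):
        for radius in range(r + 1):
            for distance in range(d + 1):
                if pos + distance + radius < len(seq):
                    code = hash((toy_kmer_code(seq, pos, radius), radius, distance,
                                 toy_kmer_code(seq, pos + distance, radius)))
                    feats[(radius, distance, code)] += 1
    return feats


# e.g. toy_sequence_features('ACGTACGT') -> sparse dict of co-occurrence counts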
def _label_preprocessing(self, G):
    try:
        G.graph['label_size'] = self.label_size
        for n, d in G.nodes_iter(data=True):
            # for dense or sparse vectors
            if isinstance(d['label'], list) or isinstance(d['label'], dict):
                node_entity, data = self._extract_entity_and_label(d)
                if isinstance(d['label'], dict):
                    data = self._convert_dict_to_sparse_vector(data)
                # create a list of integer codes of size: label_size
                # each integer code is determined as follows:
                # for each entity, use the corresponding
                # discretization_model_dict[node_entity] to extract the id of
                # the nearest cluster centroid, return the centroid id as the
                # integer code
                hlabel = []
                for i in range(self.label_size):
                    if len(self.discretization_model_dict[node_entity]) < i:
                        raise Exception(
                            'Error: discretization_model_dict for node entity: %s has length: %d but component %d was required' %
                            (node_entity, len(self.discretization_model_dict[node_entity]), i))
                    predictions = self.discretization_model_dict[node_entity][i].predict(data)
                    if len(predictions) != 1:
                        raise Exception(
                            'Error: discretizer has not returned an individual prediction but %d predictions' %
                            len(predictions))
                    discretization_code = predictions[0] + 1
                    code = fast_hash([hash(node_entity), discretization_code], self.bitmask)
                    hlabel.append(code)
                G.node[n]['hlabel'] = hlabel
            elif isinstance(d['label'], basestring):
                # copy a hashed version of the string for a number of times
                # equal to self.label_size; in this way qualitative (i.e.
                # string) labels can be compared to the discretized labels
                hlabel = int(hash(d['label']) & self.bitmask) + 1
                G.node[n]['hlabel'] = [hlabel] * self.label_size
            else:
                raise Exception(
                    'ERROR: something went wrong, type of node label is unknown: %s' % d['label'])
    except Exception as e:
        import datetime
        curr_time = datetime.datetime.now().strftime("%A, %d. %B %Y %I:%M%p")
        print("Program run failed on %s" % curr_time)
        print(e.__doc__)
        print(e.message)
def _compute_vertex_based_features(self, seq):
    if seq is None or len(seq) == 0:
        raise Exception('ERROR: something went wrong, empty instance.')
    # extract kmer hash codes for all kmers up to r in all positions in seq
    feature_dict = {}
    seq_len = len(seq)
    neighborhood_hash_cache = [self._compute_neighborhood_hash(seq, pos)
                               for pos in range(seq_len)]
    for pos in range(seq_len):
        # construct features as pairs of kmers up to distance d for all radii up to r
        feature_list = defaultdict(lambda: defaultdict(float))
        for radius in range(self.min_r, self.r + 1):
            if radius < len(neighborhood_hash_cache[pos]):
                feature = [neighborhood_hash_cache[pos][radius], radius]
                for distance in range(self.min_d, self.d + 1):
                    if pos + distance + radius < seq_len:
                        dfeature = feature + [distance,
                                              neighborhood_hash_cache[pos + distance][radius]]
                        feature_code = fast_hash(dfeature, self.bitmask)
                        key = fast_hash([radius, distance], self.bitmask)
                        feature_list[key][feature_code] += 1
        feature_dict.update(self._normalization(feature_list, pos))
    X = self._convert_dict_to_sparse_matrix(feature_dict)
    return X
def calc_node_name(interfacegraph, node, hash_bitmask, node_name_label):
    '''
    part of generating the hash for a graph is calculating the hash of a node in the graph
    '''
    d = nx.single_source_shortest_path_length(interfacegraph, node, 20)
    # d is now node:dist
    # l is a list of hash(label, distance)
    # l=[ func([interfacegraph.node[nid]['intlabel'],dis]) for nid,dis in d.items()]
    if node_name_label is None:
        l = [interfacegraph.node[nid]['hlabel'][0] + dis for nid, dis in d.items()]
    else:
        l = [interfacegraph.node[nid][node_name_label] + dis for nid, dis in d.items()]
    l.sort()
    l = fast_hash(l, hash_bitmask)
    return l
def calc_node_name2(interfacegraph, node, hash_bitmask, node_name_label):
    '''
    part of generating the hash for a graph is calculating the hash of a node in the graph
    '''
    d = nx.single_source_shortest_path_length(interfacegraph, node, 20)
    # d is now node:dist
    # l is a list of hash(label, distance)
    if node_name_label is None:
        l = []
        for nid, dis in d.items():
            l.extend(interfacegraph.node[nid]['hlabel'])
    else:
        l = [interfacegraph.node[nid][node_name_label] + dis for nid, dis in d.items()]
    l.sort()
    l = fast_hash(l, hash_bitmask)
    return l
def _fhash(stuff):
    return eden.fast_hash(stuff, 2 ** 20 - 1)
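# Hedged usage note (assumes eden is importable and that eden.fast_hash masks
# its result with the given bitmask): _fhash maps a list of ints to a stable
# code in [0, 2**20 - 1]; the same list always yields the same code.
code_a = _fhash([42, 7, 1])
code_b = _fhash([42, 7, 1])
assert code_a == code_b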