def _compute_neighborhood_graph_hash(self, root, graph):
    # list all hashed labels at increasing distances
    hash_list = []
    # for all distances
    root_dist_dict = graph.node[root]['remote_neighbours']
    for node_set in root_dist_dict.itervalues():
        # create a list of hashed labels
        hash_label_list = []
        for v in node_set:
            # compute the vertex hashed label by hashing the hlabel field
            # with the degree of the vertex (obtained as the size of
            # the adjacency dictionary for the vertex v)
            # or, in case positional is set, using the relative
            # position of the vertex v w.r.t. the root vertex
            if self.positional:
                vhlabel = fast_hash_2(graph.node[v]['hlabel'], root - v)
            else:
                vhlabel = fast_hash_2(graph.node[v]['hlabel'], len(graph[v]))
            hash_label_list.append(vhlabel)
        # sort it
        hash_label_list.sort()
        # hash it
        hashed_nodes_at_distance_d_in_neighborhood = fast_hash(
            hash_label_list)
        hash_list.append(hashed_nodes_at_distance_d_in_neighborhood)
    # hash the sequence of hashes of the node set at increasing
    # distances into a list of features
    hash_neighborhood = fast_hash_vec(hash_list)
    graph.node[root]['neigh_graph_hash'] = hash_neighborhood
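
# The fast_hash helpers used throughout (fast_hash, fast_hash_2, fast_hash_3,
# fast_hash_4, fast_hash_vec) are assumed to come from the library's hashing
# utilities and are not shown in this section. A minimal sketch consistent
# with how they are called here (hash a few integers, fold into the feature
# space with a bitmask; fast_hash_vec returns one code per prefix, so the
# entry at index r summarizes the neighborhood up to radius r) could look
# like this — an illustration, not the library code:

def _fold(value, bitmask=2 ** 20 - 1):
    # fold an arbitrary integer hash into the feature space via the bitmask
    return int(value & bitmask) + 1

def fast_hash_2_sketch(a, b, bitmask=2 ** 20 - 1):
    return _fold(hash((a, b)), bitmask)

def fast_hash_4_sketch(a, b, c, d, bitmask=2 ** 20 - 1):
    return _fold(hash((a, b, c, d)), bitmask)

def fast_hash_sketch(values, bitmask=2 ** 20 - 1):
    # hash a whole list of integers into a single code
    return _fold(hash(tuple(values)), bitmask)

def fast_hash_vec_sketch(values, bitmask=2 ** 20 - 1):
    # hash each prefix of the list, returning one code per prefix length
    return [fast_hash_sketch(values[:i + 1], bitmask)
            for i in range(len(values))]
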
def _transform(self, instance_id, seq):
    if seq is None or len(seq) == 0:
        raise Exception('ERROR: something went wrong, '
                        'empty instance # %d.' % instance_id)
    if len(seq) == 2 and len(seq[1]) > 0:
        # assume the instance is a pair (header, seq) and extract only seq
        seq = seq[1]
    # extract kmer hash codes for all kmers up to r in all positions in seq
    seq_len = len(seq)
    neighborhood_hash_cache = [self._compute_neighborhood_hash(seq, pos)
                               for pos in range(seq_len)]
    # construct features as pairs of kmers up to distance d
    # for all radii up to r
    feature_list = defaultdict(lambda: defaultdict(float))
    for pos in range(seq_len):
        for radius in range(self.min_r, self.r + 1):
            if radius < len(neighborhood_hash_cache[pos]):
                for distance in range(self.min_d, self.d + 1):
                    second_endpoint = pos + distance
                    if second_endpoint + radius < seq_len:
                        feature_code = fast_hash_4(
                            neighborhood_hash_cache[pos][radius],
                            radius,
                            distance,
                            neighborhood_hash_cache[second_endpoint][radius],
                            self.bitmask)
                        key = fast_hash_2(radius, distance, self.bitmask)
                        feature_list[key][feature_code] += 1
    return self._normalization(feature_list, instance_id)
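
# A self-contained toy run of the pairing loop above, using Python's built-in
# hash in place of the fast_hash_* helpers and hard-coded parameters
# (r=1, d=2, min_r=min_d=0); all names below are illustrative only:

from collections import defaultdict

def toy_paired_kmer_features(seq, r=1, d=2, bitmask=2 ** 10 - 1):
    # cache one hash per (position, radius): here simply the hash of the kmer
    cache = [[hash(seq[pos:pos + radius + 1]) & bitmask
              for radius in range(r + 1)] for pos in range(len(seq))]
    features = defaultdict(lambda: defaultdict(float))
    for pos in range(len(seq)):
        for radius in range(r + 1):
            for distance in range(d + 1):
                end = pos + distance
                if end + radius < len(seq):
                    code = hash((cache[pos][radius], radius, distance,
                                 cache[end][radius])) & bitmask
                    key = hash((radius, distance)) & bitmask
                    features[key][code] += 1
    return features

# e.g. toy_paired_kmer_features('ACGTACGT') yields a dict keyed by the hashed
# (radius, distance) pair, mapping feature codes to counts
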
def _transform_distance(self, feature_list=None, pos=None, radius=None,
                        seq_len=None, neigh_hash_cache=None,
                        neighborhood_weight_cache=None):
    # consider both forward and backward offsets; note that the backward
    # range(-self.d, -self.min_d) stops one short of -self.min_d
    distances = list(range(self.min_d, self.d + 1))
    distances += list(range(-self.d, -self.min_d))
    for distance in distances:
        end = pos + distance
        # Note: after having computed end, we now treat
        # distance as the positive value only
        distance = abs(distance)
        cond1 = self.weights_dict is None
        if cond1 or self.weights_dict.get((radius, distance), 0) != 0:
            if end >= 0 and end + radius < seq_len:
                if self.use_only_context:
                    # ignore the root kmer: replace its hash with a
                    # constant placeholder so only the context matters
                    pfeat = 42
                else:
                    pfeat = neigh_hash_cache[pos][radius]
                efeat = neigh_hash_cache[end][radius]
                feature_code = fast_hash_4(pfeat, efeat, radius, distance,
                                           self.bitmask)
                key = fast_hash_2(radius, distance, self.bitmask)
                if neighborhood_weight_cache:
                    pw = neighborhood_weight_cache[pos][radius]
                    feature_list[key][feature_code] += pw
                    ew = neighborhood_weight_cache[end][radius]
                    feature_list[key][feature_code] += ew
                else:
                    feature_list[key][feature_code] += 1
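
# A quick check of the offsets the loop above visits; for min_d=1, d=3 the
# forward list is [1, 2, 3] and the backward list is [-3, -2], so -1 is
# never visited (range(-d, -min_d) excludes -min_d itself):

min_d, d = 1, 3
distances = list(range(min_d, d + 1)) + list(range(-d, -min_d))
print(distances)  # [1, 2, 3, -3, -2]
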
def _compute_vertex_based_features(self, seq):
    if seq is None or len(seq) == 0:
        raise Exception("ERROR: something went wrong, empty instance.")
    # extract kmer hash codes for all kmers up to r in all positions in seq
    feature_dict = {}
    seq_len = len(seq)
    neighborhood_hash_cache = [self._compute_neighborhood_hash(seq, pos)
                               for pos in range(seq_len)]
    for pos in range(seq_len):
        # construct features as pairs of kmers up to distance d
        # for all radii up to r
        feature_list = defaultdict(lambda: defaultdict(float))
        for radius in range(self.min_r, self.r + 1):
            if radius < len(neighborhood_hash_cache[pos]):
                for distance in range(self.min_d, self.d + 1):
                    if pos + distance + radius < seq_len:
                        feature_code = fast_hash_4(
                            neighborhood_hash_cache[pos][radius],
                            radius,
                            distance,
                            neighborhood_hash_cache[pos + distance][radius],
                            self.bitmask,
                        )
                        key = fast_hash_2(radius, distance, self.bitmask)
                        feature_list[key][feature_code] += 1
        feature_dict.update(self._normalization(feature_list, pos))
    data_matrix = self._convert_dict_to_sparse_matrix(feature_dict)
    return data_matrix
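
# _convert_dict_to_sparse_matrix is not shown in this section; a plausible
# sketch, assuming feature_dict maps (instance_id, feature_code) pairs to
# values and the feature space has 2**20 columns (the default bitmask).
# This is an assumption about the helper, not its confirmed implementation:

import numpy as np
from scipy.sparse import csr_matrix

def convert_dict_to_sparse_matrix_sketch(feature_dict, feature_size=2 ** 20):
    rows, cols, vals = [], [], []
    for (instance_id, feature_code), value in feature_dict.items():
        rows.append(instance_id)
        cols.append(feature_code)
        vals.append(value)
    n_rows = max(rows) + 1 if rows else 0
    return csr_matrix((vals, (rows, cols)),
                      shape=(n_rows, feature_size), dtype=np.float64)
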
def _transform_vertex_pair_base(self, graph, vertex_v, vertex_u, distance,
                                feature_list, connection_weight=1):
    # for all radii
    for radius in range(self.min_r, self.r + 2, 2):
        for label_index in range(graph.graph['label_size']):
            if radius < len(graph.node[vertex_v]['neighborhood_graph_hash'][label_index]) and \
                    radius < len(graph.node[vertex_u]['neighborhood_graph_hash'][label_index]):
                # feature as a pair of neighborhoods at a radius, distance
                # canonicalization of pair of neighborhoods
                vertex_v_hash = graph.node[vertex_v][
                    'neighborhood_graph_hash'][label_index][radius]
                vertex_u_hash = graph.node[vertex_u][
                    'neighborhood_graph_hash'][label_index][radius]
                if vertex_v_hash < vertex_u_hash:
                    first_hash, second_hash = (vertex_v_hash, vertex_u_hash)
                else:
                    first_hash, second_hash = (vertex_u_hash, vertex_v_hash)
                feature = fast_hash_4(first_hash, second_hash,
                                      radius, distance, self.bitmask)
                key = fast_hash_2(radius, distance, self.bitmask)
                if graph.graph.get('weighted', False) is False:
                    feature_list[key][feature] += 1
                else:
                    feature_list[key][feature] += connection_weight * \
                        (graph.node[vertex_v]['neighborhood_graph_weight'][radius] +
                         graph.node[vertex_u]['neighborhood_graph_weight'][radius])
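
# The canonical ordering above makes the pair feature symmetric in its two
# endpoints: swapping vertex_v and vertex_u yields the same feature code.
# A standalone illustration with plain Python hashing (names are ad hoc):

def canonical_pair_code(hash_a, hash_b, radius, distance, bitmask=2 ** 20 - 1):
    first, second = (hash_a, hash_b) if hash_a < hash_b else (hash_b, hash_a)
    return hash((first, second, radius, distance)) & bitmask

assert canonical_pair_code(17, 42, 1, 3) == canonical_pair_code(42, 17, 1, 3)
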
def _label_preprocessing(graph, label_size=1, key_label='label',
                         key_entity='entity', discretizers={'entity': []},
                         bitmask=2 ** 20 - 1):
    try:
        graph.graph['label_size'] = label_size
        for n, d in graph.nodes_iter(data=True):
            # for dense or sparse vectors
            is_list = isinstance(d[key_label], list)
            is_dict = isinstance(d[key_label], dict)
            if is_list or is_dict:
                node_entity, data = _extract_entity_and_label(
                    d, key_entity, key_label)
                if is_list:
                    data = np.array(data, dtype=np.float64).reshape(1, -1)
                if is_dict:
                    data = _convert_dict_to_sparse_vector(data)
                # create a list of integer codes of size: label_size
                # each integer code is determined as follows:
                # for each entity, use the corresponding
                # discretizers[node_entity] to extract
                # the id of the nearest cluster centroid, return the
                # centroid id as the integer code
                hlabel = []
                for i in range(label_size):
                    if len(discretizers[node_entity]) <= i:
                        raise Exception(
                            'Error: discretizers for node entity: %s '
                            'has length: %d but component %d was required' %
                            (node_entity, len(discretizers[node_entity]), i))
                    predictions = discretizers[node_entity][i].predict(data)
                    if len(predictions) != 1:
                        raise Exception(
                            'Error: discretizer has not returned an '
                            'individual prediction but %d predictions' %
                            len(predictions))
                    discretization_code = predictions[0] + 1
                    code = fast_hash_2(hash(node_entity),
                                       discretization_code, bitmask)
                    hlabel.append(code)
                graph.node[n]['hlabel'] = hlabel
            elif isinstance(d[key_label], basestring):
                # copy a hashed version of the string for a number of times
                # equal to label_size; in this way qualitative
                # (i.e. string) labels can be compared to the
                # discretized labels
                hlabel = int(hash(d[key_label]) & bitmask) + 1
                graph.node[n]['hlabel'] = [hlabel] * label_size
            else:
                raise Exception(
                    'ERROR: something went wrong, type of node label '
                    'is unknown: %s' % d[key_label])
    except Exception as e:
        logger.debug('Failed iteration. Reason: %s' % e)
        logger.debug('Exception', exc_info=True)
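
# The discretizers are assumed to be fitted clustering models exposing a
# scikit-learn style predict(); a minimal setup, purely illustrative:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

# fit one discretizer per label component for the 'entity' node type
train_vectors = np.random.rand(100, 4)
discretizer = MiniBatchKMeans(n_clusters=8).fit(train_vectors)
discretizers = {'entity': [discretizer]}

# predict() on a single 1 x 4 vector returns one centroid id, which the
# preprocessing above turns into the integer code for that label component
print(discretizers['entity'][0].predict(train_vectors[:1]))
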
def _label_preprocessing(self, graph):
    try:
        graph.graph['label_size'] = self.label_size
        for n, d in graph.nodes_iter(data=True):
            # for dense or sparse vectors
            if isinstance(d[self.key_label], list) or \
                    isinstance(d[self.key_label], dict):
                node_entity, data = self._extract_entity_and_label(d)
                if isinstance(d[self.key_label], list):
                    data = np.array(data, dtype=np.float64).reshape(1, -1)
                if isinstance(d[self.key_label], dict):
                    data = self._convert_dict_to_sparse_vector(data)
                # create a list of integer codes of size: label_size
                # each integer code is determined as follows:
                # for each entity, use the corresponding
                # discretization_models[node_entity] to extract
                # the id of the nearest cluster centroid, return the
                # centroid id as the integer code
                hlabel = []
                for i in range(self.label_size):
                    if len(self.discretization_models[node_entity]) <= i:
                        raise Exception(
                            'Error: discretization_models for node entity: %s '
                            'has length: %d but component %d was required' %
                            (node_entity,
                             len(self.discretization_models[node_entity]), i))
                    predictions = self.discretization_models[node_entity][
                        i].predict(data)
                    if len(predictions) != 1:
                        raise Exception(
                            'Error: discretizer has not returned an '
                            'individual prediction but %d predictions' %
                            len(predictions))
                    discretization_code = predictions[0] + 1
                    code = fast_hash_2(hash(node_entity),
                                       discretization_code, self.bitmask)
                    hlabel.append(code)
                graph.node[n]['hlabel'] = hlabel
            elif isinstance(d[self.key_label], basestring):
                # copy a hashed version of the string for a number of times
                # equal to self.label_size; in this way qualitative
                # (i.e. string) labels can be compared to the
                # discretized labels
                hlabel = int(hash(d[self.key_label]) & self.bitmask) + 1
                graph.node[n]['hlabel'] = [hlabel] * self.label_size
            else:
                raise Exception(
                    'ERROR: something went wrong, type of node label '
                    'is unknown: %s' % d[self.key_label])
    except Exception as e:
        import datetime
        curr_time = datetime.datetime.now().strftime("%A, %d. %B %Y %I:%M%p")
        print("Program run failed on %s" % curr_time)
        print(e.__doc__)
        print(e)
def _add_sparse_vector_labes(self, graph, vertex_v, node_feature_list):
    # combine each feature from the discrete labeled graph encoding with
    # every component of the sparse vector attached to the vertex: the new
    # feature code hashes the two ids together and the two values are
    # multiplied
    svec = graph.node[vertex_v].get(self.key_svec, None)
    if svec:
        vec_feature_list = defaultdict(lambda: defaultdict(float))
        for radius_dist_key in node_feature_list:
            for feature in node_feature_list[radius_dist_key]:
                val = node_feature_list[radius_dist_key][feature]
                for i in svec:
                    vec_val = svec[i]
                    key = fast_hash_2(feature, i, self.bitmask)
                    vec_feature_list[radius_dist_key][key] += val * vec_val
        node_feature_list = vec_feature_list
    return node_feature_list
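
# A standalone illustration of the combination step: a toy feature dict
# crossed with a sparse vector {component_id: value}; names are ad hoc:

from collections import defaultdict

def cross_with_sparse_vector(features, svec, bitmask=2 ** 20 - 1):
    out = defaultdict(float)
    for feature, val in features.items():
        for i, vec_val in svec.items():
            out[hash((feature, i)) & bitmask] += val * vec_val
    return out

print(cross_with_sparse_vector({101: 1.0, 202: 2.0}, {7: 0.5, 9: -1.0}))
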
def _transform_vertex_pair(self, graph, vertex_v, vertex_u, distance,
                           feature_list, connection_weight=1):
    # for all radii
    for radius in range(self.min_r, self.r + 2, 2):
        for label_index in range(graph.graph['label_size']):
            if radius < len(graph.node[vertex_v]['neigh_graph_hash'][label_index]) and \
                    radius < len(graph.node[vertex_u]['neigh_graph_hash'][label_index]):
                # feature as a pair of neighborhoods at a radius, distance
                # canonicalization of pair of neighborhoods
                vertex_v_hash = graph.node[vertex_v]['neigh_graph_hash'][
                    label_index][radius]
                vertex_u_hash = graph.node[vertex_u]['neigh_graph_hash'][
                    label_index][radius]
                if vertex_v_hash < vertex_u_hash:
                    first_hash, second_hash = (vertex_v_hash, vertex_u_hash)
                else:
                    first_hash, second_hash = (vertex_u_hash, vertex_v_hash)
                feature = fast_hash_4(first_hash, second_hash,
                                      radius, distance, self.bitmask)
                # half features are those that ignore the central vertex v;
                # the reason to have those is to help model the context
                # independently from the identity of the vertex itself
                half_feature = fast_hash_3(vertex_u_hash,
                                           radius, distance, self.bitmask)
                key = fast_hash_2(radius, distance)
                if graph.graph.get('weighted', False) is False:
                    feature_list[key][feature] += 1
                    feature_list[key][half_feature] += 1
                else:
                    val = connection_weight * \
                        (graph.node[vertex_v]['neigh_graph_weight'][radius] +
                         graph.node[vertex_u]['neigh_graph_weight'][radius])
                    feature_list[key][feature] += val
                    half_val = connection_weight * \
                        graph.node[vertex_u]['neigh_graph_weight'][radius]
                    feature_list[key][half_feature] += half_val
def _compute_vertex_based_features(self, seq, weights=None):
    if seq is None or len(seq) == 0:
        raise Exception('ERROR: something went wrong, empty instance.')
    # extract kmer hash codes for all kmers up to r in all positions in seq
    vertex_based_features = []
    seq_len = len(seq)
    if weights:
        if len(weights) != seq_len:
            raise Exception('ERROR: sequence and weights '
                            'must be same length.')
        neighborhood_weight_cache = \
            [self._compute_neighborhood_weight(weights, pos)
             for pos in range(seq_len)]
    neigh_hash_cache = [self._compute_neighborhood_hash(seq, pos)
                        for pos in range(seq_len)]
    for pos in range(seq_len):
        # construct features as pairs of kmers up to distance d
        # for all radii up to r
        local_features = defaultdict(lambda: defaultdict(float))
        for radius in range(self.min_r, self.r + 1):
            if radius < len(neigh_hash_cache[pos]):
                for distance in range(self.min_d, self.d + 1):
                    end = pos + distance
                    if end + radius < seq_len:
                        feature_code = \
                            fast_hash_4(neigh_hash_cache[pos][radius],
                                        neigh_hash_cache[end][radius],
                                        radius, distance, self.bitmask)
                        key = fast_hash_2(radius, distance, self.bitmask)
                        if weights:
                            local_features[key][feature_code] += \
                                neighborhood_weight_cache[pos][radius]
                            local_features[key][feature_code] += \
                                neighborhood_weight_cache[end][radius]
                        else:
                            local_features[key][feature_code] += 1
        vertex_based_features.append(
            self._normalization(local_features,
                                inner_normalization=False,
                                normalization=self.normalization))
    data_matrix = self._convert_dict_to_sparse_matrix(vertex_based_features)
    return data_matrix
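
# _compute_neighborhood_weight is not shown in this section; a plausible
# sketch, assuming the weight of the kmer of radius r at position pos is the
# cumulative weight of the positions it spans (an assumption, not the
# confirmed implementation):

def compute_neighborhood_weight_sketch(weights, pos):
    # weight_list[r] is the total weight of positions pos .. pos + r
    weight_list = []
    total = 0.0
    for w in weights[pos:]:
        total += w
        weight_list.append(total)
    return weight_list
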
def _transform(self, orig_seq):
    seq, weights = self._get_sequence_and_weights(orig_seq)
    # extract kmer hash codes for all kmers up to r in all positions in seq
    seq_len = len(seq)
    neigh_hash_cache = [self._compute_neighborhood_hash(seq, pos)
                        for pos in range(seq_len)]
    if weights:
        if len(weights) != seq_len:
            raise Exception('ERROR: sequence and weights '
                            'must be same length.')
        neighborhood_weight_cache = \
            [self._compute_neighborhood_weight(weights, pos)
             for pos in range(seq_len)]
    # construct features as pairs of kmers up to distance d
    # for all radii up to r
    feature_list = defaultdict(lambda: defaultdict(float))
    for pos in range(seq_len):
        for radius in range(self.min_r, self.r + 1):
            if radius < len(neigh_hash_cache[pos]):
                for distance in range(self.min_d, self.d + 1):
                    end = pos + distance
                    if end + radius < seq_len:
                        feature_code = \
                            fast_hash_4(neigh_hash_cache[pos][radius],
                                        neigh_hash_cache[end][radius],
                                        radius, distance, self.bitmask)
                        key = fast_hash_2(radius, distance, self.bitmask)
                        if weights:
                            feature_list[key][feature_code] += \
                                neighborhood_weight_cache[pos][radius]
                            feature_list[key][feature_code] += \
                                neighborhood_weight_cache[end][radius]
                        else:
                            feature_list[key][feature_code] += 1
    return self._normalization(feature_list,
                               inner_normalization=self.inner_normalization,
                               normalization=self.normalization)
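
# _normalization is called with several signatures across these snippets; a
# plausible sketch of the variant taking the flags above, assuming
# inner_normalization rescales each (radius, distance) block to unit L2 norm
# and normalization rescales the final vector (an assumption about the
# helper's semantics, not its confirmed implementation):

import math

def normalization_sketch(feature_list, inner_normalization=True,
                         normalization=True):
    out = {}
    for key, features in feature_list.items():
        # per-block L2 norm over all feature codes sharing this
        # (radius, distance) key
        norm = math.sqrt(sum(v * v for v in features.values())) or 1.0
        for code, v in features.items():
            out[(key, code)] = v / norm if inner_normalization else v
    if normalization:
        total = math.sqrt(sum(v * v for v in out.values())) or 1.0
        out = {k: v / total for k, v in out.items()}
    return out
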