Example #1
 def _compute_neighborhood_graph_hash(self, root, graph):
     # list all hashed labels at increasing distances
     hash_list = []
     # for all distances
     root_dist_dict = graph.node[root]['remote_neighbours']
     for node_set in root_dist_dict.itervalues():
         # create a list of hashed labels
         hash_label_list = []
         for v in node_set:
             # compute the vertex hashed label by hashing the hlabel
             # field with the degree of the vertex (obtained as the
             # size of the adjacency dictionary for the vertex v) or,
             # in case positional is set, with the relative position
             # of the vertex v w.r.t. the root vertex
             if self.positional:
                 vhlabel = fast_hash_2(
                     graph.node[v]['hlabel'], root - v)
             else:
                 vhlabel = fast_hash_2(
                     graph.node[v]['hlabel'], len(graph[v]))
             hash_label_list.append(vhlabel)
         # sort it
         hash_label_list.sort()
         # hash it
         hashed_nodes_at_distance_d_in_neighborhood = fast_hash(
             hash_label_list)
         hash_list.append(hashed_nodes_at_distance_d_in_neighborhood)
     # hash the sequence of hashes of the node set at increasing
     # distances into a list of features
     hash_neighborhood = fast_hash_vec(hash_list)
     graph.node[root]['neigh_graph_hash'] = hash_neighborhood
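The fast_hash helpers used throughout these examples come from the host library and are not part of the listing. For readers who want to run a snippet standalone, a minimal stand-in with compatible call signatures might look as follows; this is an assumption for illustration, not the library's actual implementation (it also assumes the hashed items are themselves hashable):

    def fast_hash_2(a, b, bitmask=2 ** 32 - 1):
        # hypothetical stand-in: hash a pair into a bounded positive integer
        return int(hash((a, b)) & bitmask) + 1

    def fast_hash(vec, bitmask=2 ** 32 - 1):
        # hypothetical stand-in: hash a whole sequence into a single code
        return int(hash(tuple(vec)) & bitmask) + 1

    def fast_hash_vec(vec, bitmask=2 ** 32 - 1):
        # hypothetical stand-in: one code per prefix of the sequence,
        # matching the 'list of features' produced above
        codes, prefix = [], []
        for item in vec:
            prefix.append(item)
            codes.append(int(hash(tuple(prefix)) & bitmask) + 1)
        return codes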
Example #2
 def _compute_neighborhood_graph_hash(self, root, graph):
     # list all hashed labels at increasing distances
     hash_list = []
     # for all distances
     root_dist_dict = graph.node[root]['remote_neighbours']
     for node_set in root_dist_dict.itervalues():
         # create a list of hashed labels
         hash_label_list = []
         for v in node_set:
             # compute the vertex hashed label by hashing the hlabel
             # field with the degree of the vertex (obtained as the
             # size of the adjacency dictionary for the vertex v) or,
             # in case positional is set, with the relative position
             # of the vertex v w.r.t. the root vertex
             if self.positional:
                 vhlabel = fast_hash_2(
                     graph.node[v]['hlabel'],
                     root - v)
             else:
                 vhlabel = \
                     fast_hash_2(graph.node[v]['hlabel'],
                                 len(graph[v]))
             hash_label_list.append(vhlabel)
         # sort it
         hash_label_list.sort()
         # hash it
         hashed_nodes_at_distance_d_in_neighborhood = fast_hash(
             hash_label_list)
         hash_list.append(hashed_nodes_at_distance_d_in_neighborhood)
     # hash the sequence of hashes of the node set at increasing
     # distances into a list of features
     hash_neighborhood = fast_hash_vec(hash_list)
     graph.node[root]['neigh_graph_hash'] = hash_neighborhood
Example #3
 def _transform(self, instance_id, seq):
     if seq is None or len(seq) == 0:
         raise Exception('ERROR: something went wrong, empty instance # %d.' % instance_id)
     if len(seq) == 2 and len(seq[1]) > 0:
         # assume the instance is a pair (header,seq) and extract only seq
         seq = seq[1]
     # extract kmer hash codes for all kmers up to r in all positions in seq
     seq_len = len(seq)
     neighborhood_hash_cache = [self._compute_neighborhood_hash(seq, pos) for pos in range(seq_len)]
     # construct features as pairs of kmers up to distance d for all radii up to r
     feature_list = defaultdict(lambda: defaultdict(float))
     for pos in range(seq_len):
         for radius in range(self.min_r, self.r + 1):
             if radius < len(neighborhood_hash_cache[pos]):
                 for distance in range(self.min_d, self.d + 1):
                     second_endpoint = pos + distance
                     if second_endpoint + radius < seq_len:
                         feature_code = fast_hash_4(neighborhood_hash_cache[pos][radius],
                                                    radius,
                                                    distance,
                                                    neighborhood_hash_cache[second_endpoint][radius],
                                                    self.bitmask)
                         key = fast_hash_2(radius, distance, self.bitmask)
                         feature_list[key][feature_code] += 1
     return self._normalization(feature_list, instance_id)
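_compute_neighborhood_hash is referenced but not shown in this listing. Judging from how the cache is indexed by radius, a plausible reading is that it returns one hash code per kmer length starting at a position; the sketch below rests on that assumption (standalone function, hypothetical bitmask default) and is not the library's code:

    def compute_neighborhood_hash(seq, pos, max_r=3, bitmask=2 ** 32 - 1):
        # sketch: codes[radius] summarizes the kmer seq[pos:pos + radius + 1]
        codes, prefix = [], []
        for radius in range(min(max_r + 1, len(seq) - pos)):
            prefix.append(seq[pos + radius])
            codes.append(int(hash(tuple(prefix)) & bitmask) + 1)
        return codes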
Example #4
 def _transform_distance(self,
                         feature_list=None,
                         pos=None,
                         radius=None,
                         seq_len=None,
                         neigh_hash_cache=None,
                         neighborhood_weight_cache=None):
     distances = list(range(self.min_d, self.d + 1))
     distances += list(range(-self.d, -self.min_d))
     for distance in distances:
         end = pos + distance
         # Note: once end has been computed, distance is
         # used as its absolute value only
         distance = abs(distance)
         cond1 = self.weights_dict is None
         if cond1 or self.weights_dict.get((radius, distance), 0) != 0:
             if end >= 0 and end + radius < seq_len:
                 if self.use_only_context:
                     pfeat = 42
                 else:
                     pfeat = neigh_hash_cache[pos][radius]
                 efeat = neigh_hash_cache[end][radius]
                 feature_code = fast_hash_4(pfeat, efeat, radius, distance,
                                            self.bitmask)
                 key = fast_hash_2(radius, distance, self.bitmask)
                 if neighborhood_weight_cache:
                     pw = neighborhood_weight_cache[pos][radius]
                     feature_list[key][feature_code] += pw
                     ew = neighborhood_weight_cache[end][radius]
                     feature_list[key][feature_code] += ew
                 else:
                     feature_list[key][feature_code] += 1
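The enumeration above scans contexts on both sides of pos and then folds distance back to its absolute value, so a neighborhood at offset -k is hashed like one at +k. Note that range(-self.d, -self.min_d) stops before -self.min_d, so the offset -min_d itself is never generated; whether that is intended cannot be told from the snippet alone. A toy illustration with min_d=1 and d=2:

    min_d, d = 1, 2   # toy parameters for illustration
    offsets = list(range(min_d, d + 1)) + list(range(-d, -min_d))
    print(offsets)                      # [1, 2, -2] -- offset -1 is absent
    print([abs(o) for o in offsets])    # [1, 2, 2]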
Example #5
 def _transform(self, instance_id, seq):
     if seq is None or len(seq) == 0:
         raise Exception("ERROR: something went wrong, empty instance # %d." % instance_id)
     if len(seq) == 2 and len(seq[1]) > 0:
         # assume the instance is a pair (header,seq) and extract only seq
         seq = seq[1]
     # extract kmer hash codes for all kmers up to r in all positions in seq
     seq_len = len(seq)
     neighborhood_hash_cache = [self._compute_neighborhood_hash(seq, pos) for pos in range(seq_len)]
     # construct features as pairs of kmers up to distance d for all radii up to r
     feature_list = defaultdict(lambda: defaultdict(float))
     for pos in range(seq_len):
         for radius in range(self.min_r, self.r + 1):
             if radius < len(neighborhood_hash_cache[pos]):
                 for distance in range(self.min_d, self.d + 1):
                     second_endpoint = pos + distance
                     if second_endpoint + radius < seq_len:
                         feature_code = fast_hash_4(
                             neighborhood_hash_cache[pos][radius],
                             radius,
                             distance,
                             neighborhood_hash_cache[second_endpoint][radius],
                             self.bitmask,
                         )
                         key = fast_hash_2(radius, distance, self.bitmask)
                         feature_list[key][feature_code] += 1
     return self._normalization(feature_list, instance_id)
Example #6
 def _compute_vertex_based_features(self, seq):
     if seq is None or len(seq) == 0:
         raise Exception("ERROR: something went wrong, empty instance.")
     # extract kmer hash codes for all kmers up to r in all positions in seq
     feature_dict = {}
     seq_len = len(seq)
     neighborhood_hash_cache = [self._compute_neighborhood_hash(seq, pos) for pos in range(seq_len)]
     for pos in range(seq_len):
         # construct features as pairs of kmers up to distance d for all radii up to r
         feature_list = defaultdict(lambda: defaultdict(float))
         for radius in range(self.min_r, self.r + 1):
             if radius < len(neighborhood_hash_cache[pos]):
                 for distance in range(self.min_d, self.d + 1):
                     if pos + distance + radius < seq_len:
                         feature_code = fast_hash_4(
                             neighborhood_hash_cache[pos][radius],
                             radius,
                             distance,
                             neighborhood_hash_cache[pos + distance][radius],
                             self.bitmask,
                         )
                         key = fast_hash_2(radius, distance, self.bitmask)
                         feature_list[key][feature_code] += 1
         feature_dict.update(self._normalization(feature_list, pos))
     data_matrix = self._convert_dict_to_sparse_matrix(feature_dict)
     return data_matrix
Example #7
 def _transform_distance(self,
                         feature_list=None,
                         pos=None,
                         radius=None,
                         seq_len=None,
                         neigh_hash_cache=None,
                         neighborhood_weight_cache=None):
     distances = list(range(self.min_d, self.d + 1))
     distances += list(range(-self.d, -self.min_d))
     for distance in distances:
         end = pos + distance
         # Note: once end has been computed, distance is
         # used as its absolute value only
         distance = abs(distance)
         cond1 = self.weights_dict is None
         if cond1 or self.weights_dict.get((radius, distance), 0) != 0:
             if end >= 0 and end + radius < seq_len:
                 pfeat = neigh_hash_cache[pos][radius]
                 efeat = neigh_hash_cache[end][radius]
                 feature_code = fast_hash_4(pfeat,
                                            efeat,
                                            radius,
                                            distance,
                                            self.bitmask)
                 key = fast_hash_2(radius, distance, self.bitmask)
                 if neighborhood_weight_cache:
                     pw = neighborhood_weight_cache[pos][radius]
                     feature_list[key][feature_code] += pw
                     ew = neighborhood_weight_cache[end][radius]
                     feature_list[key][feature_code] += ew
                 else:
                     feature_list[key][feature_code] += 1
Example #8
 def _transform_vertex_pair_base(self,
                                 graph,
                                 vertex_v,
                                 vertex_u,
                                 distance,
                                 feature_list,
                                 connection_weight=1):
     # for all radii
     for radius in range(self.min_r, self.r + 2, 2):
         for label_index in range(graph.graph['label_size']):
             if radius < len(graph.node[vertex_v]['neighborhood_graph_hash'][label_index]) and \
                     radius < len(graph.node[vertex_u]['neighborhood_graph_hash'][label_index]):
                 # feature as a pair of neighbourhoods at a radius,distance
                 # canonicalization of pair of neighborhoods
                 vertex_v_hash = graph.node[vertex_v][
                     'neighborhood_graph_hash'][label_index][radius]
                 vertex_u_hash = graph.node[vertex_u][
                     'neighborhood_graph_hash'][label_index][radius]
                 if vertex_v_hash < vertex_u_hash:
                     first_hash, second_hash = (vertex_v_hash,
                                                vertex_u_hash)
                 else:
                     first_hash, second_hash = (vertex_u_hash,
                                                vertex_v_hash)
                 feature = fast_hash_4(first_hash, second_hash, radius,
                                       distance, self.bitmask)
                 key = fast_hash_2(radius, distance, self.bitmask)
                 # if self.weighted == False :
                 if graph.graph.get('weighted', False) is False:
                     feature_list[key][feature] += 1
                 else:
                     feature_list[key][feature] += connection_weight * \
                         (graph.node[vertex_v]['neighborhood_graph_weight'][radius] +
                          graph.node[vertex_u]['neighborhood_graph_weight'][radius])
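The ordering step above canonicalizes the unordered endpoint pair, so that swapping vertex_v and vertex_u yields the same feature code. The same idea in isolation:

    def canonical_pair(h1, h2):
        # order-independent form of an unordered pair of hash codes
        return (h1, h2) if h1 < h2 else (h2, h1)

    assert canonical_pair(1729, 42) == canonical_pair(42, 1729)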
Example #9
def _label_preprocessing(graph,
                         label_size=1,
                         key_label='label',
                         key_entity='entity',
                         discretizers={'entity': []},
                         bitmask=2**20 - 1):
    try:
        graph.graph['label_size'] = label_size
        for n, d in graph.nodes_iter(data=True):
            # for dense or sparse vectors
            is_list = isinstance(d[key_label], list)
            is_dict = isinstance(d[key_label], dict)
            if is_list or is_dict:
                node_entity, data = _extract_entity_and_label(
                    d, key_entity, key_label)
                if isinstance(d[key_label], list):
                    data = np.array(data, dtype=np.float64).reshape(1, -1)
                if isinstance(d[key_label], dict):
                    data = _convert_dict_to_sparse_vector(data)
                # create a list of integer codes of size: label_size
                # each integer code is determined as follows:
                # for each entity, use the correspondent
                # discretizers[node_entity] to extract
                # the id of the nearest cluster centroid, return the
                # centroid id as the integer code
                hlabel = []
                for i in range(label_size):
                    if len(discretizers[node_entity]) < i:
                        len_mod = \
                            len(discretizers[node_entity])
                        raise Exception(
                            'Error: discretizers for node entity: %s \
                            has length: %d but component %d was required' %
                            (node_entity, len_mod, i))
                    predictions = \
                        discretizers[node_entity][i].predict(data)
                    if len(predictions) != 1:
                        raise Exception('Error: discretizer has not \
                            returned an individual prediction but\
                            %d predictions' % len(predictions))
                    discretization_code = predictions[0] + 1
                    code = fast_hash_2(hash(node_entity), discretization_code,
                                       bitmask)
                    hlabel.append(code)
                graph.node[n]['hlabel'] = hlabel
            elif isinstance(d[key_label], basestring):
                # copy a hashed version of the string label_size times;
                # in this way qualitative (i.e. string) labels can be
                # compared to the discretized labels
                hlabel = int(hash(d[key_label]) & bitmask) + 1
                graph.node[n]['hlabel'] = [hlabel] * label_size
            else:
                raise Exception(
                    'ERROR: something went wrong, type of node label is unknown: \
                    %s' % d[key_label])
    except Exception as e:
        logger.debug('Failed iteration. Reason: %s' % e)
        logger.debug('Exception', exc_info=True)
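The discretizers argument maps each node entity to a list of fitted clustering models, one per label component, and predict is expected to return the nearest centroid id for a single sample. A hypothetical setup using scikit-learn's KMeans; the actual models used with this code may differ:

    import numpy as np
    from sklearn.cluster import KMeans

    rng = np.random.RandomState(0)
    vectors = rng.rand(100, 8)    # toy dense node labels
    discretizers = {'entity': [KMeans(n_clusters=4, n_init=10).fit(vectors)]}
    # nearest-centroid id for one sample, shifted to 1-based as above
    code = discretizers['entity'][0].predict(vectors[:1])[0] + 1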
Example #10
def _label_preprocessing(graph, label_size=1,
                         key_label='label',
                         key_entity='entity',
                         discretizers={'entity': []},
                         bitmask=2 ** 20 - 1):
    try:
        graph.graph['label_size'] = label_size
        for n, d in graph.nodes_iter(data=True):
            # for dense or sparse vectors
            is_list = isinstance(d[key_label], list)
            is_dict = isinstance(d[key_label], dict)
            if is_list or is_dict:
                node_entity, data = _extract_entity_and_label(d,
                                                              key_entity,
                                                              key_label)
                if isinstance(d[key_label], list):
                    data = np.array(data, dtype=np.float64).reshape(1, -1)
                if isinstance(d[key_label], dict):
                    data = _convert_dict_to_sparse_vector(data)
                # create a list of integer codes of size: label_size
                # each integer code is determined as follows:
                # for each entity, use the correspondent
                # discretizers[node_entity] to extract
                # the id of the nearest cluster centroid, return the
                # centroid id as the integer code
                hlabel = []
                for i in range(label_size):
                    if len(discretizers[node_entity]) < i:
                        len_mod = \
                            len(discretizers[node_entity])
                        raise Exception('Error: discretizers for node entity: %s \
                            has length: %d but component %d was required'
                                        % (node_entity, len_mod, i))
                    predictions = \
                        discretizers[node_entity][i].predict(data)
                    if len(predictions) != 1:
                        raise Exception('Error: discretizer has not \
                            returned an individual prediction but\
                            %d predictions' % len(predictions))
                    discretization_code = predictions[0] + 1
                    code = fast_hash_2(hash(node_entity),
                                       discretization_code,
                                       bitmask)
                    hlabel.append(code)
                graph.node[n]['hlabel'] = hlabel
            elif isinstance(d[key_label], basestring):
                # copy a hashed version of the string label_size times;
                # in this way qualitative (i.e. string) labels can be
                # compared to the discretized labels
                hlabel = int(hash(d[key_label]) & bitmask) + 1
                graph.node[n]['hlabel'] = [hlabel] * label_size
            else:
                raise Exception('ERROR: something went wrong, type of node label is unknown: \
                    %s' % d[key_label])
    except Exception as e:
        logger.debug('Failed iteration. Reason: %s' % e)
        logger.debug('Exception', exc_info=True)
Example #11
 def _label_preprocessing(self, graph):
     try:
         graph.graph['label_size'] = self.label_size
         for n, d in graph.nodes_iter(data=True):
             # for dense or sparse vectors
             if isinstance(d[self.key_label], list) or isinstance(
                     d[self.key_label], dict):
                 node_entity, data = self._extract_entity_and_label(d)
                 if isinstance(d[self.key_label], list):
                     data = np.array(data, dtype=np.float64).reshape(1, -1)
                 if isinstance(d[self.key_label], dict):
                     data = self._convert_dict_to_sparse_vector(data)
                 # create a list of integer codes of size: label_size
                 # each integer code is determined as follows:
                 # for each entity, use the correspondent discretization_models[node_entity] to extract
                 # the id of the nearest cluster centroid, return the centroid id as the integer code
                 hlabel = []
                 for i in range(self.label_size):
                     if len(self.discretization_models[node_entity]) < i:
                         raise Exception(
                             'Error: discretization_models for node entity: %s \
                             has length: %d but component %d was required' %
                             (node_entity,
                              len(self.discretization_models[node_entity]),
                              i))
                     predictions = self.discretization_models[node_entity][
                         i].predict(data)
                     if len(predictions) != 1:
                         raise Exception(
                             'Error: discretizer has not returned an individual prediction but\
                             %d predictions' % len(predictions))
                     discretization_code = predictions[0] + 1
                     code = fast_hash_2(hash(node_entity),
                                        discretization_code, self.bitmask)
                     hlabel.append(code)
                 graph.node[n]['hlabel'] = hlabel
             elif isinstance(d[self.key_label], basestring):
                 # copy a hashed version of the string for a number of times equal to self.label_size
                 # in this way qualitative ( i.e. string ) labels can be compared to the discretized labels
                 hlabel = int(hash(d[self.key_label]) & self.bitmask) + 1
                 graph.node[n]['hlabel'] = [hlabel] * self.label_size
             else:
                 raise Exception(
                     'ERROR: something went wrong, type of node label is unknown: \
                     %s' % d[self.key_label])
     except Exception as e:
         import datetime
         curr_time = datetime.datetime.now().strftime(
             "%A, %d. %B %Y %I:%M%p")
         print("Program run failed on %s" % curr_time)
         print(e.__doc__)
         print(e.message)
Example #12
 def _add_sparse_vector_labes(self, graph, vertex_v, node_feature_list):
     # combine each discrete feature with each component of the node's
     # sparse vector: hash the feature code with the component index
     # and multiply the corresponding values.
     svec = graph.node[vertex_v].get(self.key_svec, None)
     if svec:
         vec_feature_list = defaultdict(lambda: defaultdict(float))
         for radius_dist_key in node_feature_list:
             for feature in node_feature_list[radius_dist_key]:
                 val = node_feature_list[radius_dist_key][feature]
                 for i in svec:
                     vec_val = svec[i]
                     key = fast_hash_2(feature, i, self.bitmask)
                     vec_feature_list[radius_dist_key][key] += val * vec_val
         node_feature_list = vec_feature_list
     return node_feature_list
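Each discrete feature code is crossed with every component of the node's sparse vector, and the two values multiply. The inner loop reduced to a runnable toy, with a hypothetical stand-in for fast_hash_2:

    from collections import defaultdict

    def fast_hash_2(a, b, bitmask=2 ** 32 - 1):   # hypothetical stand-in
        return int(hash((a, b)) & bitmask) + 1

    node_feature_list = {7: {101: 2.0}}   # {radius_dist_key: {feature: val}}
    svec = {3: 0.5, 9: 1.5}               # {component index: value}
    vec_feature_list = defaultdict(lambda: defaultdict(float))
    for rd_key, feats in node_feature_list.items():
        for feature, val in feats.items():
            for i, vec_val in svec.items():
                vec_feature_list[rd_key][fast_hash_2(feature, i)] += val * vec_val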
Example #13
 def _transform_vertex_pair(self,
                            graph,
                            vertex_v,
                            vertex_u,
                            distance,
                            feature_list,
                            connection_weight=1):
     # for all radii
     for radius in range(self.min_r, self.r + 2, 2):
         for label_index in range(graph.graph['label_size']):
             if radius < len(graph.node[vertex_v]
                             ['neigh_graph_hash'][label_index]) and \
                     radius < len(graph.node[vertex_u]['neigh_graph_hash'][label_index]):
                 # feature as a pair of neighborhoods at a radius,distance
                 # canonicalization of pair of neighborhoods
                 vertex_v_hash = graph.node[vertex_v]['neigh_graph_hash'][
                     label_index][radius]
                 vertex_u_hash = graph.node[vertex_u]['neigh_graph_hash'][
                     label_index][radius]
                 if vertex_v_hash < vertex_u_hash:
                     first_hash, second_hash = (vertex_v_hash,
                                                vertex_u_hash)
                 else:
                     first_hash, second_hash = (vertex_u_hash,
                                                vertex_v_hash)
                 feature = fast_hash_4(first_hash, second_hash, radius,
                                       distance, self.bitmask)
                 # half features are those that ignore the central vertex v
                 # the reason to have those is to help model the context
                 # independently from the identity of the vertex itself
                 half_feature = fast_hash_3(vertex_u_hash, radius, distance,
                                            self.bitmask)
                 key = fast_hash_2(radius, distance)
                 if graph.graph.get('weighted', False) is False:
                     feature_list[key][feature] += 1
                     feature_list[key][half_feature] += 1
                 else:
                     val = connection_weight * \
                         (graph.node[vertex_v]['neigh_graph_weight'][radius] +
                          graph.node[vertex_u]['neigh_graph_weight'][radius])
                     feature_list[key][feature] += val
                     half_val = \
                         connection_weight * \
                         graph.node[vertex_u]['neigh_graph_weight'][radius]
                     feature_list[key][half_feature] += half_val
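Half features hash only the second endpoint's neighborhood together with the radius and distance, so every central vertex paired with the same vertex_u at the same radius and distance collides on them; that collision is what models the context independently of the vertex identity. In isolation, with a hypothetical stand-in hash:

    def fast_hash_3(a, b, c, bitmask=2 ** 32 - 1):   # hypothetical stand-in
        return int(hash((a, b, c)) & bitmask) + 1

    u_hash, radius, distance = 99, 1, 2
    # the central vertex's hash is simply not an argument here, so any
    # central vertex sharing this context gets the same half feature
    half_feature = fast_hash_3(u_hash, radius, distance)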
Example #14
 def _compute_vertex_based_features(self, seq, weights=None):
     if seq is None or len(seq) == 0:
         raise Exception('ERROR: something went wrong, empty instance.')
     # extract kmer hash codes for all kmers up to r in all positions in seq
     vertex_based_features = []
     seq_len = len(seq)
     if weights:
         if len(weights) != seq_len:
             raise Exception('ERROR: sequence and weights \
                 must be same length.')
         neighborhood_weight_cache = \
             [self._compute_neighborhood_weight(weights, pos)
              for pos in range(seq_len)]
     neigh_hash_cache = [self._compute_neighborhood_hash(seq, pos)
                         for pos in range(seq_len)]
     for pos in range(seq_len):
         # construct features as pairs of kmers up to distance d
         # for all radii up to r
         local_features = defaultdict(lambda: defaultdict(float))
         for radius in range(self.min_r, self.r + 1):
             if radius < len(neigh_hash_cache[pos]):
                 for distance in range(self.min_d, self.d + 1):
                     end = pos + distance
                     if end + radius < seq_len:
                         feature_code = \
                             fast_hash_4(neigh_hash_cache[pos][radius],
                                         neigh_hash_cache[end][radius],
                                         radius,
                                         distance,
                                         self.bitmask)
                         key = fast_hash_2(radius, distance, self.bitmask)
                         if weights:
                             local_features[key][feature_code] += \
                                 neighborhood_weight_cache[pos][radius]
                             local_features[key][feature_code] += \
                                 neighborhood_weight_cache[end][radius]
                         else:
                             local_features[key][feature_code] += 1
         vertex_based_features.append(self._normalization(local_features,
                                                          inner_normalization=False,
                                                          normalization=self.normalization))
     data_matrix = self._convert_dict_to_sparse_matrix(vertex_based_features)
     return data_matrix
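_convert_dict_to_sparse_matrix is not shown either. Given that the loop above appends one normalized feature dict per position, a plausible sketch stacks them into a CSR matrix with one row per position; both the dict shape and the folding into a fixed feature space are assumptions:

    import numpy as np
    from scipy.sparse import csr_matrix

    def convert_dicts_to_sparse_matrix(feature_dicts, n_features=2 ** 20):
        # sketch: one row per position, codes folded into a fixed space
        rows, cols, vals = [], [], []
        for row_idx, feats in enumerate(feature_dicts):
            for code, val in feats.items():
                rows.append(row_idx)
                cols.append(code % n_features)
                vals.append(val)
        return csr_matrix((np.array(vals), (np.array(rows), np.array(cols))),
                          shape=(len(feature_dicts), n_features))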
Example #15
 def _transform_vertex_pair(self,
                            graph,
                            vertex_v,
                            vertex_u,
                            distance,
                            feature_list,
                            connection_weight=1):
     # for all radii
     for radius in range(self.min_r, self.r + 2, 2):
         for label_index in range(graph.graph['label_size']):
             if radius < len(graph.node[vertex_v]
                             ['neigh_graph_hash'][label_index]) and \
                     radius < len(graph.node[vertex_u]['neigh_graph_hash'][label_index]):
                 # feature as a pair of neighborhoods at a radius,distance
                 # canonicalization of pair of neighborhoods
                 vertex_v_hash = graph.node[vertex_v]['neigh_graph_hash'][label_index][radius]
                 vertex_u_hash = graph.node[vertex_u]['neigh_graph_hash'][label_index][radius]
                 if vertex_v_hash < vertex_u_hash:
                     first_hash, second_hash = (vertex_v_hash,
                                                vertex_u_hash)
                 else:
                     first_hash, second_hash = (vertex_u_hash,
                                                vertex_v_hash)
                 feature = fast_hash_4(first_hash, second_hash,
                                       radius, distance, self.bitmask)
                 # half features are those that ignore the central vertex v
                 # the reason to have those is to help model the context
                 # independently from the identity of the vertex itself
                 half_feature = fast_hash_3(vertex_u_hash,
                                            radius, distance, self.bitmask)
                 key = fast_hash_2(radius, distance)
                 if graph.graph.get('weighted', False) is False:
                     feature_list[key][feature] += 1
                     feature_list[key][half_feature] += 1
                 else:
                     val = connection_weight * \
                         (graph.node[vertex_v]['neigh_graph_weight'][radius] +
                          graph.node[vertex_u]['neigh_graph_weight'][radius])
                     feature_list[key][feature] += val
                     half_val = \
                         connection_weight * \
                         graph.node[vertex_u]['neigh_graph_weight'][radius]
                     feature_list[key][half_feature] += half_val
Example #16
 def _transform(self, orig_seq):
     seq, weights = self._get_sequence_and_weights(orig_seq)
     # extract kmer hash codes for all kmers up to r in all positions in seq
     seq_len = len(seq)
     neigh_hash_cache = [self._compute_neighborhood_hash(seq, pos)
                         for pos in range(seq_len)]
     if weights:
         if len(weights) != seq_len:
             raise Exception('ERROR: sequence and weights \
                 must be same length.')
         neighborhood_weight_cache = \
             [self._compute_neighborhood_weight(weights, pos)
              for pos in range(seq_len)]
     # construct features as pairs of kmers up to distance d
     # for all radii up to r
     feature_list = defaultdict(lambda: defaultdict(float))
     for pos in range(seq_len):
         for radius in range(self.min_r, self.r + 1):
             if radius < len(neigh_hash_cache[pos]):
                 for distance in range(self.min_d, self.d + 1):
                     end = pos + distance
                     if end + radius < seq_len:
                         feature_code = \
                             fast_hash_4(neigh_hash_cache[pos][radius],
                                         neigh_hash_cache[end][radius],
                                         radius,
                                         distance,
                                         self.bitmask)
                         key = fast_hash_2(radius, distance, self.bitmask)
                         if weights:
                             feature_list[key][feature_code] += \
                                 neighborhood_weight_cache[pos][radius]
                             feature_list[key][feature_code] += \
                                 neighborhood_weight_cache[end][radius]
                         else:
                             feature_list[key][feature_code] += 1
     return self._normalization(feature_list,
                                inner_normalization=self.inner_normalization,
                                normalization=self.normalization)
Example #17
 def _compute_vertex_based_features(self, seq):
     if seq is None or len(seq) == 0:
         raise Exception('ERROR: something went wrong, empty instance.')
     # extract kmer hash codes for all kmers up to r in all positions in seq
     feature_dict = {}
     seq_len = len(seq)
     neighborhood_hash_cache = [self._compute_neighborhood_hash(seq, pos) for pos in range(seq_len)]
     for pos in range(seq_len):
         # construct features as pairs of kmers up to distance d for all radii up to r
         feature_list = defaultdict(lambda: defaultdict(float))
         for radius in range(self.min_r, self.r + 1):
             if radius < len(neighborhood_hash_cache[pos]):
                 for distance in range(self.min_d, self.d + 1):
                     if pos + distance + radius < seq_len:
                         feature_code = fast_hash_4(neighborhood_hash_cache[pos][radius],
                                                    radius,
                                                    distance,
                                                    neighborhood_hash_cache[pos + distance][radius],
                                                    self.bitmask)
                         key = fast_hash_2(radius, distance, self.bitmask)
                         feature_list[key][feature_code] += 1
         feature_dict.update(self._normalization(feature_list, pos))
     data_matrix = self._convert_dict_to_sparse_matrix(feature_dict)
     return data_matrix