Example #1
def get_coords(axes='gene',
               rows=None,
               time_val=None,
               spatial_idxs=None,
               ids=None):
    # array and diag are assumed to come from a numpy star import in this module.
    bdnet = nio.getBDTNP()
    gene_matrix = array([v['vals'][:, time_val]
                         for v in bdnet.values()
                         if str(time_val + 1) in v['steps']])
    gene_matrix_keys = [k
                        for k, v in bdnet.items()
                        if str(time_val + 1) in v['steps']]

    if axes == 'gene':
        import scipy.sparse as ssp
        import scipy.sparse.linalg as las
        import scipy.sparse.lil as ll
        adj = ssp.csr_matrix(gene_matrix.T)
        n_c = 3
        U, s, Vh = las.svds(adj, n_c)
        # Rank-n_c reconstruction; computed here but not used below.
        filtered_genes = ll.lil_matrix(U) * ll.lil_matrix(diag(s)) * ll.lil_matrix(Vh)
        xs_gene = U[ids, 0]
        ys_gene = U[ids, 1]
        zs_gene = U[ids, 2]
        
    elif axes == 'space':
        space_space = array([[[r[idxs] for idxs in sidxs]
                              for sidxs in spatial_idxs]
                             for r in rows])
        space_space = space_space[:, :, time_val]

        xs_gene = space_space[ids, 0]
        ys_gene = space_space[ids, 1]
        zs_gene = space_space[ids, 2]
    return xs_gene, ys_gene, zs_gene
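# Added illustration (not part of the original example): the 'gene' branch
# above projects samples onto the top three left-singular vectors. A minimal,
# self-contained sketch of that projection with made-up data:
import numpy as np
import scipy.sparse as ssp
import scipy.sparse.linalg as las

rng = np.random.default_rng(0)
cells_by_genes = ssp.csr_matrix(rng.random((200, 50)))   # stand-in for gene_matrix.T

U, s, Vh = las.svds(cells_by_genes, 3)                   # top-3 singular triplets
xs, ys, zs = U[:, 0], U[:, 1], U[:, 2]                   # 3-D coordinates per sample
print(xs.shape, ys.shape, zs.shape)                      # (200,) (200,) (200,)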
def sparse_matrix_to_hdf(sparse_matrix, name_to_store, hdf_file_path):
    nonzero_indices = np.nonzero(sparse_matrix > 0)
    if len(nonzero_indices[0]) == 0:
        raise Exception("can't store empty sparse matrix!")

    if issparse(sparse_matrix):
        nonzero_values = sparse_matrix.tocsr()[nonzero_indices].A1
    else:
        nonzero_values = np.array(sparse_matrix[nonzero_indices])


#     print(sparse_matrix.__class__,'=',name_to_store,sparse_matrix.shape,len(nonzero_values))

    matrix_dataframe = pd.DataFrame({
        "row_indexes": nonzero_indices[0],
        "col_indexes": nonzero_indices[1],
        "data": nonzero_values
    })
    matrix_shape_series = pd.Series(sparse_matrix.shape)

    matrix_dataframe.to_hdf(hdf_file_path, name_to_store)
    matrix_shape_series.to_hdf(hdf_file_path, "%s_shape" % name_to_store)

    del nonzero_indices, nonzero_values, matrix_dataframe, matrix_shape_series
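# Added sketch (an assumption, not shown in the original source): a possible
# counterpart to sparse_matrix_to_hdf that rebuilds the matrix from the two
# HDF keys written above. The hdf_to_sparse_matrix helper used further down
# this page presumably does something similar.
import pandas as pd
from scipy.sparse import coo_matrix

def sparse_matrix_from_hdf(stored_name, hdf_file_path):
    frame = pd.read_hdf(hdf_file_path, stored_name)
    shape = tuple(pd.read_hdf(hdf_file_path, "%s_shape" % stored_name))
    return coo_matrix(
        (frame["data"].values,
         (frame["row_indexes"].values, frame["col_indexes"].values)),
        shape=shape).tocsr()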
Example #3
    def _sentence_graph_from_ptb_str(ptb_str, num_tokens):
        # We need to have num_tokens provided here, or else we won't know for
        # sure how big the graph should be. (There can be tokens missing from
        # the graph, and even if there aren't it would take more processing
        # than it's worth to find the max node index in the PTB tree.)
        tree = ImmutableParentedTree.fromstring(ptb_str)
        edge_graph = lil_matrix((num_tokens, num_tokens), dtype='float')
        edge_labels = {}
        excluded_edges = []

        def convert_node(parent_index, node):
            # Node index is whatever's after the last underscore.
            node_label = node.label()
            node_index = int(node_label[node_label.rindex('_') + 1:])
            edge_label = node[0]  # 0th child is always edge label
            if edge_label in StanfordParsedSentence.DEPTH_EXCLUDED_EDGE_LABELS:
                excluded_edges.append((parent_index, node_index))
            else:
                edge_graph[parent_index, node_index] = 1.0
            edge_labels[parent_index, node_index] = edge_label

            for child in node[2:]:  # Skip edge label (child 0) & POS (child 1).
                convert_node(node_index, child)

        for root_child in tree:
            convert_node(0, root_child)  # initial parent index is 0 for root
        return edge_graph.tocsr(), edge_labels, excluded_edges
def construct_hierarchy_matrix(hierarchy, node2index):
    N = len(hierarchy)
    hier_mat = lil_matrix(np.eye(N), dtype=bool)
    for child, parent in hierarchy.items():
        if parent is None:
            continue

        hier_mat[node2index[child], node2index[parent]] = 1.

    return csr_matrix(hier_mat)
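# Added illustration: a toy call of construct_hierarchy_matrix (names are made
# up; assumes numpy, lil_matrix and csr_matrix are imported in the defining
# module). Each row marks the node itself plus its direct parent.
hierarchy = {'root': None, 'a': 'root', 'b': 'root', 'c': 'a'}
node2index = {'root': 0, 'a': 1, 'b': 2, 'c': 3}
print(construct_hierarchy_matrix(hierarchy, node2index).toarray().astype(int))
# [[1 0 0 0]
#  [1 1 0 0]
#  [1 0 1 0]
#  [0 1 0 1]]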
    def tolil(self, copy=False):
        from scipy.sparse.lil import lil_matrix
        lil = lil_matrix(self.shape, dtype=self.dtype)

        self.sum_duplicates()
        ptr, ind, dat = self.indptr, self.indices, self.data
        rows, data = lil.rows, lil.data

        for n in range(self.shape[0]):
            start = ptr[n]
            end = ptr[n + 1]
            rows[n] = ind[start:end].tolist()
            data[n] = dat[start:end].tolist()

        return lil
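# Added sanity check (illustrative only): the same CSR -> LIL loop as in the
# tolil method above, run against a small SciPy CSR matrix and compared with
# the dense result.
import scipy.sparse as ssp

csr = ssp.random(5, 7, density=0.3, format='csr', random_state=0)
csr.sum_duplicates()
csr.sort_indices()
lil = ssp.lil_matrix(csr.shape, dtype=csr.dtype)
ptr, ind, dat = csr.indptr, csr.indices, csr.data
for row in range(csr.shape[0]):
    start, end = ptr[row], ptr[row + 1]
    lil.rows[row] = ind[start:end].tolist()
    lil.data[row] = dat[start:end].tolist()
assert (lil.toarray() == csr.toarray()).all()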
Example #6
    def graph(self):
        """
        Return the k-nearest-neighbour graph with self.k neighbours.

        Optionally the minimum_spanning_tree is added in, according to
        self.include_mst.
        """
        if getattr(self, '_graph', None) is None:
            D = self.manifold_corrected_distance_matrix.toarray()
            idxs = np.argsort(D)
            r = range(D.shape[0])
            idx = idxs[:, :self.k]
            self._graph = lil_matrix(D.shape)
            for neighbours in idx.T:
                self._graph[r, neighbours] = D[r, neighbours]
            if self.include_mst:
                mst = self.minimal_spanning_tree
                for i, j, v in zip(*find(mst)):
                    if self._graph[i, j] == 0:
                        self._graph[i, j] = v
        return self._graph
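# Added standalone sketch of the same idea as the graph property above: a
# k-nearest-neighbour graph built in a lil_matrix, with the minimum spanning
# tree mixed in via scipy.sparse.csgraph (data and k are arbitrary).
import numpy as np
from scipy.sparse import lil_matrix, find
from scipy.sparse.csgraph import minimum_spanning_tree

rng = np.random.default_rng(0)
X = rng.random((20, 2))
D = np.sqrt(((X[:, None, :] - X[None, :, :]) ** 2).sum(-1))

k = 3
r = np.arange(D.shape[0])
neighbours = np.argsort(D)[:, :k]      # first column is each point itself (distance 0)
graph = lil_matrix(D.shape)
for col in neighbours.T:
    graph[r, col] = D[r, col]
for i, j, v in zip(*find(minimum_spanning_tree(D))):
    if graph[i, j] == 0:
        graph[i, j] = v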
Example #8
def filter_sparse(g1, n_c=5,
                  max_edges=-1,
                  last_component=False):
    '''
    Filter a sparse version of the network by PCA (a truncated SVD of the
    adjacency matrix).

    g1:  The input network graph.
    n_c: The number of principal components to compute.
    max_edges: The maximum number of edges to keep. -1 => keep all.
    last_component: Keep only the final principal component.
    '''

    import scipy.sparse.linalg as las
    import scipy.sparse.lil as ll
    import scipy.sparse as ssp

    adj = ssp.csr_matrix(nx.to_scipy_sparse_matrix(g1))
    nodes = list(g1.nodes())

    # Truncated SVD of the adjacency matrix (scipy.sparse.linalg provides svds,
    # not svd); less/abs/diag/argsort are assumed to come from a numpy star import.
    U, s, Vh = las.svds(adj, n_c)

    U[less(abs(U), .001)] = 0
    Vh[less(abs(Vh), .001)] = 0

    if last_component:
        s_last = s.copy()
        s_last[1:] = 0
        filtered = ll.lil_matrix(U) * ll.lil_matrix(diag(s_last)) * ll.lil_matrix(Vh)
    else:
        filtered = ll.lil_matrix(U) * ll.lil_matrix(diag(s)) * ll.lil_matrix(Vh)

    if max_edges != -1:
        # lil matrices have no flat .data array or eliminate_zeros();
        # threshold the weakest edges on a CSR copy instead.
        filtered = filtered.tocsr()
        filtered.data[argsort(abs(filtered.data))[:-1 * max_edges]] = 0
        filtered.eliminate_zeros()


    g = nx.DiGraph()
    g.add_nodes_from(nodes)
    g.add_weighted_edges_from([(nodes[nz[0]], nodes[nz[1]], nz[2])
                               for nz in zip(*ssp.find(filtered))])

    return g
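# Added hypothetical usage of filter_sparse (assumes its module has networkx
# imported as nx and numpy star-imported, as the bare array/diag/less/argsort
# names above suggest): keep the 30 strongest SVD-filtered edges.
import numpy as np
import networkx as nx

rng = np.random.default_rng(1)
g_demo = nx.DiGraph()
g_demo.add_weighted_edges_from(
    (i, j, float(rng.random()))
    for i in range(30) for j in range(30)
    if i != j and rng.random() < 0.2)

g_filtered = filter_sparse(g_demo, n_c=5, max_edges=30)
print(g_filtered.number_of_nodes(), g_filtered.number_of_edges())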
Example #9
def local_scaling_sample(D:np.ndarray, k:int=7, metric:str='distance',
                         train_ind:np.ndarray=None, test_ind:np.ndarray=None):
    """Transform a distance matrix with Local Scaling.

    --- DRAFT version ---

    Transforms the given distance matrix into new one using local scaling [1]_
    with the given `k`-th nearest neighbor. There are two types of local
    scaling methods implemented. The original one and NICDM, both reduce
    hubness in distance spaces, similarly to Mutual Proximity.

    Parameters
    ----------
    D : ndarray or csr_matrix
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 7)
        Neighborhood radius for local scaling.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define whether matrix `D` is a distance or similarity matrix.

        NOTE: self similarities in sparse `D_ls` are set to ``np.inf``

    train_ind : ndarray, optional
        If given, use only these data points as neighbors for rescaling.

    test_ind : ndarray, optional (default: None)
        Define data points to be held out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    Returns
    -------
    D_ls : ndarray
        Secondary distance LocalScaling matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """
    log = ConsoleLogging()
    # Checking input
    io.check_sample_shape_fits(D, train_ind)
    io.check_valid_metric_parameter(metric)
    sparse = issparse(D)
    n = D.shape[0]
    if metric == 'similarity':
        if train_ind is not None:
            raise NotImplementedError
        kth = n - k
        exclude = -np.inf
        self_value = 1.
        log.warning("Similarity matrix support for LS is experimental.")
    else: # metric == 'distance':
        kth = k - 1
        exclude = np.inf
        self_value = 0
        if sparse:
            log.error("Sparse distance matrices are not supported.")
            raise NotImplementedError(
                "Sparse distance matrices are not supported.")

    D = np.copy(D)
    if test_ind is None:
        train_set_ind = slice(0, n) #take all
        n_ind = range(n)
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_ind)
        n_ind = test_ind
    # Exclude self distances
    for j, sample in enumerate(train_ind):
        D[sample, j] = exclude
    r = np.zeros(n)
    for i in range(n):
        if train_ind is None:
            if sparse:
                di = D[i, train_set_ind].toarray()
            else:
                di = D[i, train_set_ind]
        else:
            di = D[i, :] # all columns are training in this case
        r[i] = np.partition(di, kth=kth)[kth]

    if sparse:
        D_ls = lil_matrix(D.shape)
        # Number of nonzero cells per row
        nnz = D.getnnz(axis=1)
    else:
        D_ls = np.zeros_like(D)

    if metric == 'similarity':
        for i in n_ind:
            if sparse and nnz[i] <= k: # Don't rescale if there are too few 
                D_ls[i, :] = D[i, :]   # neighbors in the current row
            else:
                D_ls[i, :] = np.exp(-1 * D[i, :]**2 / (r[i] * r[train_ind]))
    else:
        for i in n_ind:
            D_ls[i, :] = 1 - np.exp(-1 * D[i, :]**2 / (r[i] * r[train_ind]))

    if test_ind is None:
        if sparse:
            return D_ls.tocsr()
        else:
            np.fill_diagonal(D_ls, self_value)
            return D_ls
    else:
        # Ensure correct self distances
        for j, sample in enumerate(train_ind):
            D_ls[sample, j] = self_value
        return D_ls[test_ind]
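# Added restatement of the core transform documented above, written out for a
# plain dense distance matrix with no hold-out handling (this is a simplified
# sketch, not the library function): r[i] is the distance to the k-th nearest
# neighbour and every entry is rescaled by 1 - exp(-d_ij**2 / (r_i * r_j)).
import numpy as np

def local_scaling_dense_sketch(D, k=7):
    D = np.array(D, dtype=float, copy=True)
    np.fill_diagonal(D, np.inf)                      # exclude self-distances
    r = np.partition(D, kth=k - 1, axis=1)[:, k - 1]
    D_ls = 1. - np.exp(-D**2 / (r[:, None] * r[None, :]))
    np.fill_diagonal(D_ls, 0.)
    return D_ls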
Example #10
      break

  fh.seek(node_pos)
  print "Getting number of nodes ..."
  # Get number of nodes
  node_lines = fh.readline().replace(" ", "").strip()
  while not node_lines.endswith("</node>"):
    node_lines += fh.readline().replace(" ", "").strip()

  try:
    NUM_NODES = int(re.search(r"(?<=id=['\"]n)\d+", node_lines).group(0)) + 1  # +1 for 0-based indexing
  except Exception as msg:
    print("Cannot determine number of nodes from input file. Check graphml <node> syntax")

  # Got the nodes
  g = lil_matrix((NUM_NODES, NUM_NODES))

  # Put back file handle iterator
  fh.seek(pos)

  print "Getting edges ..."
  line = ""
  while True:
    line += fh.readline().replace(" ", "").strip() # remove if inefficient

    if line.endswith("</edge>"):
      edge = get_edge(line)
      g[edge[0], edge[1]] = edge[2] # Naive i.e slow. TODO: Optimize
      line = ""

    elif line.endswith("</graphml>"):
Example #11
    def shortest_path_kernel(self,graph_db, hashed_attributes, param):
        label_name = param.get('label_name', None)

        num_vertices = 0
        for g in graph_db:
            num_vertices += g.number_of_nodes()

        offset = 0
        graph_indices = []
        colors_0 = np.zeros(num_vertices, dtype=np.int64)

        # Get labels (colors) from all graph instances
        offset = 0
        for g in graph_db:
            graph_indices.append((offset, offset + g.number_of_nodes() - 1))

            if label_name:
                for i, label in enumerate(nx.get_node_attributes(g,label_name).values()):
                    colors_0[i + offset] = label

            offset += g.number_of_nodes()
        _, colors_0 = np.unique(colors_0, return_inverse=True)

        colors_1 = hashed_attributes

        triple_indices = []
        triple_offset = 0
        triples = []

        # Solve APSP problem for every graphs in graph data base
        for i, g in enumerate(graph_db):
            M = dict(nx.all_pairs_shortest_path_length(g))

            # index is a tuple giving index of first and last node for graph h
            index = graph_indices[i]

            if label_name:
                l = colors_0[index[0]:index[1] + 1]
                h = colors_1[index[0]:index[1] + 1]
            else:
                h = colors_1[index[0]:index[1] + 1]
            d = len(M)
            # For each pair of vertices collect labels, hashed attributes, and shortest-path distance
            pairs = list(it.product(range(d), repeat=2))
            if label_name:
                t = [hash((l[k], h[k], l[j], h[j], M[k][j])) for (k, j) in pairs if (k != j and ~np.isinf(M[k].get(j, np.inf)))]
            else:
                t = [hash((h[k], h[j], M[k][j])) for (k, j) in pairs if (k != j and ~np.isinf(M[k].get(j, np.inf)))]

            triples.extend(t)

            triple_indices.append((triple_offset, triple_offset + len(t) - 1))
            triple_offset += len(t)

        _, colors = np.unique(triples, return_inverse=True)
        m = np.amax(colors) + 1

        # Compute feature vectors
        feature_vectors = []
        for i, index in enumerate(triple_indices):
            feature_vectors.append(np.bincount(colors[index[0]:index[1] + 1], minlength=m))

        return lil.lil_matrix(feature_vectors, dtype=np.float64) # each feature vector will be row
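# Added toy illustration of the np.unique/np.bincount trick used above:
# arbitrary hash values are first compressed to the contiguous range [0, m)
# and then counted per graph slice to form the feature-vector rows.
import numpy as np
from scipy.sparse import lil_matrix

triples = [hash(('a', 1, 3)), hash(('b', 2, 1)), hash(('a', 1, 3)), hash(('c', 0, 2))]
_, colors = np.unique(triples, return_inverse=True)   # values now in [0, m)
m = np.amax(colors) + 1
slices = [(0, 1), (2, 3)]                              # (first, last) triple per graph
feature_vectors = [np.bincount(colors[a:b + 1], minlength=m) for a, b in slices]
print(lil_matrix(feature_vectors, dtype=np.float64).toarray())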
Example #12
def local_scaling(D:np.ndarray, k:int=7, metric:str='distance',
                  test_ind:np.ndarray=None, n_jobs:int=1):
    """Transform a distance matrix with Local Scaling.

    Transforms the given distance matrix into new one using local scaling [1]_
    with the given `k`-th nearest neighbor. There are two types of local
    scaling methods implemented. The original one and NICDM, both reduce
    hubness in distance spaces, similarly to Mutual Proximity.

    Parameters
    ----------
    D : ndarray or csr_matrix
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 7)
        Neighborhood radius for local scaling.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define whether matrix `D` is a distance or similarity matrix.

        NOTE: self similarities in sparse `D_ls` are set to ``np.inf``

    test_ind : ndarray, optional (default: None)
        Define data points to be held out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    n_jobs : int, optional, default: 1
        Number of processes for parallel computations.

        - `1`: Don't use multiprocessing.
        - `-1`: Use all CPUs

    Returns
    -------
    D_ls : ndarray
        Secondary distance LocalScaling matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """
    log = ConsoleLogging()
    # Checking input
    io.check_distance_matrix_shape(D)
    io.check_valid_metric_parameter(metric)
    sparse = issparse(D)
    n = D.shape[0]
    if n_jobs == -1:
        n_jobs = cpu_count()
    if metric == 'similarity':
        kth = n - k
        exclude = -np.inf
        self_tmp_value = np.inf
        self_value = 1.
        log.warning("Similarity matrix support for LS is experimental.")
        if sparse and n_jobs != 1:
            log.warning("Parallel processing not implemented for sparse "
                        "matrices. Using single process instead.")
            n_jobs = 1
    else: # metric == 'distance':
        kth = k - 1
        exclude = np.inf
        self_value = 0
        self_tmp_value = self_value
        if sparse:
            log.error("Sparse distance matrices are not supported.")
            raise NotImplementedError(
                "Sparse distance matrices are not supported.")
    D = np.copy(D)

    if test_ind is None:
        train_ind = slice(0, n) #take all        
    else:
        train_ind = np.setdiff1d(np.arange(n), test_ind)
    if sparse:
        r = np.zeros(n)
        for i in range(n):
            di = D[i, train_ind].toarray()
            di[i] = exclude
            r[i] = np.partition(di, kth=kth)[kth]
        D_ls = lil_matrix(D.shape)
        # Number of nonzero cells per row
        nnz = D.getnnz(axis=1)
    else:
        np.fill_diagonal(D, exclude)
        if n_jobs > 1:
            r_ctype = RawArray(ctypes.c_double, n)
            r = np.frombuffer(r_ctype, dtype=np.float64)
            with Pool(processes=n_jobs,
                      initializer=_ls_load_shared_data,
                      initargs=(D, train_ind, r, r_ctype)) as pool:
                for _ in pool.imap(func=partial(_ls_calculate_r, kth=kth),
                                   iterable=range(n)):
                    pass # results handled within func
        else:
            r = np.partition(D[:, train_ind], kth=kth)[:, kth]

    if sparse or n_jobs == 1:
        D_ls = np.zeros_like(D)
        for i in range(n):
            # vectorized inner loop: calc only triu part
            tmp = np.empty(n-i)
            tmp[0] = self_tmp_value
            if metric == 'similarity':
                if sparse and nnz[i] <= k:  # Don't rescale if there are
                    tmp[1:] = np.nan        # too few neighbors in row
                else:
                    tmp[1:] = np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
            else:
                tmp[1:] = 1 - np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
            D_ls[i, i:] = tmp
        # copy triu to tril -> symmetric matrix (diag=zeros)
        # NOTE: does not affect self values, since inf+inf=inf and 0+0=0
        D_ls += D_ls.T
    else:
        D_ls_ctype = RawArray(ctypes.c_double, D.size)
        D_ls = np.frombuffer(D_ls_ctype, dtype=np.float64).reshape(D.shape)
        with Pool(processes=n_jobs,
                  initializer=_ls_load_shared_data,
                  initargs=(D, train_ind, r, r_ctype, D_ls, D_ls_ctype)) as pool:
            for _ in pool.imap(func=partial(_ls_calculate_sec_dist,
                                  n=n, metric=metric,
                                  self_tmp_value=self_tmp_value),
                               iterable=range(n)):
                pass # results handled within func
        # triu is copied to tril within func
    if sparse:
        for i, nz in enumerate(nnz):
            if nz <= k: # too few neighbors
                D_ls[i, :] = D[i, :]
        return D_ls.tocsr()
    else:
        np.fill_diagonal(D_ls, self_value)
        return D_ls
def shortest_path_kernel(graph_db, hashed_attributes, *kwargs):
    compute_gram_matrix = kwargs[0]
    normalize_gram_matrix = kwargs[1]
    use_labels = kwargs[2]

    num_vertices = 0
    for g in graph_db:
        num_vertices += g.num_vertices()

    offset = 0
    graph_indices = []
    colors_0 = np.zeros(num_vertices, dtype=np.int64)

    # Get labels (colors) from all graph instances
    offset = 0
    for g in graph_db:
        graph_indices.append((offset, offset + g.num_vertices() - 1))

        if use_labels == 1:
            for i, v in enumerate(g.vertices()):
                colors_0[i + offset] = g.vp.nl[v]
        if use_labels == 2:
            for i, v in enumerate(g.vertices()):
                colors_0[i + offset] = v.out_degree()

        offset += g.num_vertices()
    _, colors_0 = np.unique(colors_0, return_inverse=True)

    colors_1 = hashed_attributes

    triple_indices = []
    triple_offset = 0
    triples = []

    # Solve APSP problem for every graphs in graph data base
    for i, g in enumerate(graph_db):
        a = gt.adjacency(g)
        M = csg.shortest_path(a, method='J', directed=False, unweighted=True)

        index = graph_indices[i]

        if use_labels:
            l = colors_0[index[0]:index[1] + 1]
            h = colors_1[index[0]:index[1] + 1]
        else:
            h = colors_1[index[0]:index[1] + 1]
        d = M.shape[0]

        # For each pair of vertices collect labels, hashed attributes, and shortest-path distance
        pairs = list(it.product(range(d), repeat=2))
        if use_labels:
            t = [
                hash((l[k], h[k], l[j], h[j], M[k][j])) for (k, j) in pairs
                if (k != j or ~np.isinf(M[k][j]))
            ]
        else:
            t = [
                hash((h[k], h[j], M[k][j])) for (k, j) in pairs
                if (k != j or ~np.isinf(M[k][j]))
            ]

        triples.extend(t)

        triple_indices.append((triple_offset, triple_offset + len(t) - 1))
        triple_offset += len(t)

    _, colors = np.unique(triples, return_inverse=True)
    m = np.amax(colors) + 1

    # Compute feature vectors
    feature_vectors = []
    for i, index in enumerate(triple_indices):
        feature_vectors.append(
            np.bincount(colors[index[0]:index[1] + 1], minlength=m))

    if not compute_gram_matrix:
        return lil.lil_matrix(feature_vectors, dtype=np.float64)
    else:
        # Make feature vectors sparse
        gram_matrix = csr.csr_matrix(feature_vectors, dtype=np.float64)
        # Compute gram matrix
        gram_matrix = gram_matrix.dot(gram_matrix.T)

        gram_matrix = gram_matrix.toarray()

        if normalize_gram_matrix:
            return aux.normalize_gram_matrix(gram_matrix)
        else:
            return gram_matrix
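# aux.normalize_gram_matrix is not shown on this page; the usual kernel
# normalisation K[i, j] / sqrt(K[i, i] * K[j, j]) is sketched below as a
# stand-in (an assumption, not necessarily the project's implementation).
import numpy as np

def normalize_gram_matrix_sketch(gram_matrix):
    diag = np.sqrt(np.diag(gram_matrix))
    diag[diag == 0] = 1.0                 # guard against zero self-similarity
    return gram_matrix / np.outer(diag, diag)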
Example #14
def load_as_ir_task():
    '''
        5 source documents (index) X 57 suspicious documents (queries)
        
        1 relevant document (source) for each query!
    '''

    path = datasets_extractors['DATASETS_PATH'][
        'short_plagiarised_answers_dataset']
    files_path = os.path.join(path, "ir_task_short_plagiarised_answers.h5")

    if os.path.exists(files_path):
        #load and return
        queries = pd.read_hdf(files_path, 'queries')
        documents = pd.read_hdf(files_path, 'documents')
        dataset_target = pd.read_hdf(files_path, 'targets')
        data = dataset_target.loc[:, 'data'].values
        row = dataset_target.loc[:, 'index'].values
        col = dataset_target.loc[:, 'col'].values
        dataset_target = coo_matrix(
            (data, (row, col)), shape=(queries.shape[0], documents.shape[0]))
        dataset_target = dataset_target.tolil()
        dataset_encoding = __DATASET_ENCODING

    else:
        spa_original, dataset_encoding = load_to_pandas()
        queries_dataframe, documents_dataframe = spa_original[
            0:95], spa_original[95:100]
        dataset_target = lil_matrix((100, 100))
        del spa_original

        queries = []
        queries_dataframe_indexes = []
        documents = documents_dataframe['content'].tolist()
        documents_dataframe_indexes = documents_dataframe.index.values.tolist()

        for rowi_index, rowi in queries_dataframe[
                queries_dataframe.plag_type != "non"].iterrows():
            i = len(queries)
            for j, source_rowj in documents_dataframe.iterrows():
                j -= 95
                dataset_target[i, j] = source_rowj["task"] == rowi['task']
            queries.append(rowi['content'])
            queries_dataframe_indexes.append(rowi_index)

        non_plagiarism = queries_dataframe[queries_dataframe.plag_type ==
                                           "non"]
        documents = documents + non_plagiarism['content'].values.tolist()
        documents_dataframe_indexes = documents_dataframe_indexes + non_plagiarism.index.values.tolist(
        )
        dataset_target = dataset_target[:len(queries), :len(documents)]

        #     print(len(queries),len(documents),dataset_target.shape)
        del queries_dataframe, documents_dataframe

        queries = pd.DataFrame({
            'content': queries,
            'original_indexes': queries_dataframe_indexes
        })
        documents = pd.DataFrame({
            'content':
            documents,
            'original_indexes':
            documents_dataframe_indexes
        })

        queries.to_hdf(files_path, 'queries', append=True)
        documents.to_hdf(files_path, 'documents', append=True)
        '''
            storing scipy sparse matrix on dataframe to dump on hdf5
        '''
        coo = dataset_target.tocoo()
        pd.DataFrame({
            'index': coo.row,
            'col': coo.col,
            'data': coo.data
        })[['index', 'col',
            'data']].sort_values(['index', 'col'
                                  ]).reset_index(drop=True).to_hdf(files_path,
                                                                   'targets',
                                                                   append=True)

    return queries, documents, dataset_target, dataset_encoding
Example #15
    def WL_kernel(self,graph_db, hashed_attributes, param):
        label_name = param.get('label_name', None)
        wl_iterations = param.get('wl_iterations')

        # Create one empty feature vector for each graph
        feature_vectors = []
        for _ in graph_db:
            feature_vectors.append(np.zeros(0, dtype=np.float64))

        # Construct block diagonal matrix of all adjacency matrices
        adjacency_matrices = []
        for g in graph_db:
            adjacency_matrices.append(np.array(nx.adjacency_matrix(g).todense()))
        M = sp.sparse.block_diag(tuple(adjacency_matrices), dtype=np.float64, format="csr")
        num_vertices = M.shape[0]

        # Load list of precalculated logarithms of prime numbers
        log_primes = log_pl.log_primes[0:num_vertices]

        # Color vector representing labels
        colors_0 = np.zeros(num_vertices, dtype=np.float64)
        # Color vector representing hashed attributes
        colors_1 = hashed_attributes

        # Get labels (colors) from all graph instances
        offset = 0
        graph_indices = []


        for g in graph_db:
            if label_name:
                for i, label in enumerate(nx.get_node_attributes(g,label_name).values()):
                    colors_0[i + offset] = label

            graph_indices.append((offset, offset + g.number_of_nodes() - 1))
            offset += g.number_of_nodes()

        # Map labels to [0, number_of_colors)
        if label_name:
            _, colors_0 = np.unique(colors_0, return_inverse=True)

        for it in range(0, wl_iterations + 1):

            if label_name:
                # Map colors into a single color vector
                colors_all = np.array([colors_0, colors_1])
                colors_all = [hash(tuple(row)) for row in colors_all.T]
                _, colors_all = np.unique(colors_all, return_inverse=True)
                max_all = int(np.amax(colors_all) + 1)
                # max_all = int(np.amax(colors_0) + 1)

                feature_vectors = [
                    np.concatenate((feature_vectors[i], np.bincount(colors_all[index[0]:index[1] + 1], minlength=max_all)))
                    for i, index in enumerate(graph_indices)]

                # Avoid coloring computation in last iteration
                if it < wl_iterations:
                    colors_0 = self.wl_coloring(M, colors_0, log_primes[0:len(colors_0)])
                    colors_1 = self.wl_coloring(M, colors_1, log_primes[0:len(colors_1)])
            else:
                max_1 = int(np.amax(colors_1) + 1)

                feature_vectors = [
                    np.concatenate((feature_vectors[i], np.bincount(colors_1[index[0]:index[1] + 1], minlength=max_1))) for
                    i, index in enumerate(graph_indices)]

                # Avoid coloring computation in last iteration
                if it < wl_iterations:
                    colors_1 = self.wl_coloring(M, colors_1, log_primes[0:len(colors_1)])

        return lil.lil_matrix(feature_vectors, dtype=np.float64) # each feature vector will be row
def __nearest_neighbors_search(pipe_to_exec, source_file_path, file_path):
    '''
        runs "pipe_to_exec" nearest neighbors search estimator
            
        parameters: 
        
            * source_file_path : hdf file in which input documents, queries and targets are stored
            * file_path: hdf filename where nns results will be stored
    '''

    #     print(linei.describe)

    d = hdf_to_sparse_matrix('documents', source_file_path)
    pipe_to_exec.fit(d, None)
    d_mean_time = pipe_to_exec.steps[0][1].fit_time

    print("fitted in %f s" % (d_mean_time))

    del d

    q = hdf_to_sparse_matrix('queries', source_file_path)
    d_indices, qd_distances, q_mean_time = pipe_to_exec.transform(q)

    #     print("mean retrieval time %f s"%(q_mean_time))

    time_dataframe = pd.DataFrame({
        'documents_mean_time': [d_mean_time],
        'queries_mean_time': [q_mean_time],
    })
    '''
        storing nearest neighbors search results
    '''
    time_dataframe.to_hdf(file_path.replace('results.h5', 'time.h5'),
                          'time_dataframe')
    sparse_matrix_to_hdf(d_indices, 'retrieved_docs', file_path)
    sparse_matrix_to_hdf(lil_matrix(qd_distances), 'qd_distances', file_path)

    del q, d_mean_time, q_mean_time, qd_distances, time_dataframe
    '''
        Evaluating results in terms of Precision, Recalls and MAP.
    '''

    t = hdf_to_sparse_matrix('targets', source_file_path)

    retrieved_relevants = []
    for q_index in range(d_indices.shape[0]):
        q_retrieved_relevants = np.cumsum(t[q_index, d_indices[q_index, :]].A,
                                          axis=1)
        retrieved_relevants.append(q_retrieved_relevants)

    retrieved_relevants = vstack(retrieved_relevants)
    '''
        broadcasting
    '''
    approachi_recalls = np.divide(retrieved_relevants,
                                  np.matrix(t.sum(axis=1)))
    ranking_sum = np.multiply(
        np.ones(retrieved_relevants.shape),
        np.matrix(range(1, retrieved_relevants.shape[1] + 1)))
    approachi_precisions = np.divide(retrieved_relevants, ranking_sum)

    average_precision = np.zeros((d_indices.shape[0], 1))
    for q_index in range(d_indices.shape[0]):
        relevants_precision = np.multiply(approachi_precisions[q_index, :],
                                          t[q_index, d_indices[q_index, :]].A)
        average_precision[q_index, 0] = relevants_precision.mean(axis=1)


#         print(q_index,'.MAP =',average_precision[q_index,0])

#     print(t.sum(axis=1))
#     print(retrieved_relevants)
    del d_indices, retrieved_relevants

    #     print("MAP=",average_precision.mean(),average_precision.std(),'precision.sum=',average_precision.sum())
    #     print("recalls.sum = ",approachi_recalls.sum(),'| mean = ',approachi_recalls.sum()/(approachi_recalls.shape[0]*approachi_recalls.shape[1]))

    for to_store, to_store_name in [(approachi_precisions, 'precisions'),
                                    (approachi_recalls, 'recalls'),
                                    (average_precision, 'average_precisions')]:
        if not issparse(to_store):
            to_store = csr_matrix(to_store)
        sparse_matrix_to_hdf(
            to_store, to_store_name,
            file_path.replace('results', 'results_evaluation'))

        del to_store
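# Added numeric check of the evaluation arithmetic above for a single query,
# with a made-up relevance vector over the ranked documents: cumulative hits
# divided by rank give precisions, divided by the number of relevant documents
# give recalls, and the mean of precision*relevance over the ranking is the
# per-query value stored in average_precision.
import numpy as np

relevance = np.array([[0, 1, 0, 0, 1]])                    # hypothetical ranking
retrieved_relevants = np.cumsum(relevance, axis=1)
ranks = np.arange(1, relevance.shape[1] + 1)
precisions = retrieved_relevants / ranks
recalls = retrieved_relevants / relevance.sum(axis=1, keepdims=True)
average_precision = np.multiply(precisions, relevance).mean(axis=1)
print(precisions, recalls, average_precision)              # AP here: 0.18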
def local_scaling(D:np.ndarray, k:int=7, metric:str='distance',
                  test_set_ind:np.ndarray=None):
    """Transform a distance matrix with Local Scaling.
    
    Transforms the given distance matrix into new one using local scaling [1]_
    with the given `k`-th nearest neighbor. There are two types of local
    scaling methods implemented. The original one and NICDM, both reduce
    hubness in distance spaces, similarly to Mutual Proximity.
    
    Parameters
    ----------
    D : ndarray or csr_matrix
        The ``n x n`` symmetric distance (similarity) matrix.
    
    k : int, optional (default: 7)
        Neighborhood radius for local scaling.
    
    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define whether matrix `D` is a distance or similarity matrix.
        
        NOTE: self similarities in sparse `D_ls` are set to ``np.inf``
        
    test_set_ind : ndarray, optional (default: None)
        Define data points to be held out as part of a test set. Can be:
        
        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set. 
        
    Returns
    -------
    D_ls : ndarray
        Secondary distance LocalScaling matrix.
    
    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). 
           Local and global scaling reduce hubs in space. The Journal of Machine 
           Learning Research, 13(1), 2871–2902.
    """
    log = Logging.ConsoleLogging()
    # Checking input
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    if metric == 'similarity':
        sort_order = -1
        exclude = -np.inf
        self_tmp_value = np.inf
        self_value = 1.
        log.warning("Similarity matrix support for LS is experimental.")
    else: # metric == 'distance':
        sort_order = 1
        exclude = np.inf
        self_value = 0
        self_tmp_value = self_value
        if issparse(D):
            log.error("Sparse distance matrices are not supported.")
            raise NotImplementedError(
                "Sparse distance matrices are not supported.") 
            
    D = np.copy(D)
    n = D.shape[0]
    if test_set_ind is None:
        train_set_ind = slice(0, n) #take all        
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)
    
    r = np.zeros(n)
    for i in range(n):
        if issparse(D):
            di = D[i, train_set_ind].toarray()
        else:
            di = D[i, train_set_ind]
        di[i] = exclude
        nn = np.argsort(di)[::sort_order]
        r[i] = di[nn[k-1]] #largest similarities or smallest distances
    
    if issparse(D):
        D_ls = lil_matrix(D.shape)
    else:
        D_ls = np.zeros_like(D)
        
    for i in range(n):
        # vectorized inner loop: calc only triu part
        tmp = np.empty(n-i)
        tmp[0] = self_tmp_value
        if metric == 'similarity':
            tmp[1:] = np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
        else:
            tmp[1:] = 1 - np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
        D_ls[i, i:] = tmp
    # copy triu to tril -> symmetric matrix (diag=zeros)
    # NOTE: does not affect self values, since inf+inf=inf and 0+0=0
    D_ls += D_ls.T
    
    if issparse(D):
        return D_ls.tocsr()
    else:
        np.fill_diagonal(D_ls, self_value)
        return D_ls
Example #18
def mock_vec_transform(X):
    ret = lil_matrix((len(X), 5000))
    for idx, x in enumerate(X):
        ret[idx] = len(x)
    return ret
def weisfeiler_lehman_subtree_kernel(graph_db, hashed_attributes, *kwargs):
    iterations = kwargs[0]
    compute_gram_matrix = kwargs[1]
    normalize_gram_matrix = kwargs[2]
    use_labels = kwargs[3]

    # Create one empty feature vector for each graph
    feature_vectors = []
    for _ in graph_db:
        feature_vectors.append(np.zeros(0, dtype=np.float64))

    # Construct block diagonal matrix of all adjacency matrices
    adjacency_matrices = []
    for g in graph_db:
        adjacency_matrices.append(gt.adjacency(g))
    M = sp.sparse.block_diag(tuple(adjacency_matrices),
                             dtype=np.float64,
                             format="csr")
    num_vertices = M.shape[0]

    # Load list of precalculated logarithms of prime numbers
    log_primes = log_pl.log_primes[0:num_vertices]

    # Color vector representing labels
    colors_0 = np.zeros(num_vertices, dtype=np.float64)
    # Color vector representing hashed attributes
    colors_1 = hashed_attributes

    # Get labels (colors) from all graph instances
    offset = 0
    graph_indices = []

    for g in graph_db:
        if use_labels == 1:
            for i, v in enumerate(g.vertices()):
                colors_0[i + offset] = g.vp.nl[v]
        if use_labels == 2:
            for i, v in enumerate(g.vertices()):
                colors_0[i + offset] = v.out_degree()

        graph_indices.append((offset, offset + g.num_vertices() - 1))
        offset += g.num_vertices()

    # Map labels to [0, number_of_colors)
    if use_labels:
        _, colors_0 = np.unique(colors_0, return_inverse=True)

    for it in range(0, iterations + 1):

        if use_labels:
            # Map colors into a single color vector
            colors_all = np.array([colors_0, colors_1])
            colors_all = [hash(tuple(row)) for row in colors_all.T]
            _, colors_all = np.unique(colors_all, return_inverse=True)
            max_all = int(np.amax(colors_all) + 1)
            # max_all = int(np.amax(colors_0) + 1)

            feature_vectors = [
                np.concatenate((feature_vectors[i],
                                np.bincount(colors_all[index[0]:index[1] + 1],
                                            minlength=max_all)))
                for i, index in enumerate(graph_indices)
            ]

            # Avoid coloring computation in last iteration
            if it < iterations:
                colors_0 = compute_coloring(M, colors_0,
                                            log_primes[0:len(colors_0)])
                colors_1 = compute_coloring(M, colors_1,
                                            log_primes[0:len(colors_1)])
        else:
            max_1 = int(np.amax(colors_1) + 1)

            feature_vectors = [
                np.concatenate((feature_vectors[i],
                                np.bincount(colors_1[index[0]:index[1] + 1],
                                            minlength=max_1)))
                for i, index in enumerate(graph_indices)
            ]

            # Avoid coloring computation in last iteration
            if it < iterations:
                colors_1 = compute_coloring(M, colors_1,
                                            log_primes[0:len(colors_1)])

    if not compute_gram_matrix:
        return lil.lil_matrix(feature_vectors, dtype=np.float64)
    else:
        # Make feature vectors sparse
        gram_matrix = csr.csr_matrix(feature_vectors, dtype=np.float64)
        # Compute gram matrix
        gram_matrix = gram_matrix.dot(gram_matrix.T)

        gram_matrix = gram_matrix.toarray()

        if normalize_gram_matrix:
            return aux.normalize_gram_matrix(gram_matrix)
        else:
            return gram_matrix
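# Added generic Weisfeiler-Lehman relabelling step for comparison with the
# log-prime-based compute_coloring used above (an illustration of the idea,
# not a reimplementation of that function): each vertex's new colour hashes
# its own colour together with the sorted multiset of neighbour colours.
import numpy as np

def wl_relabel(adjacency_csr, colors):
    indptr, indices = adjacency_csr.indptr, adjacency_csr.indices
    new_colors = np.empty(len(colors), dtype=np.int64)
    for v in range(adjacency_csr.shape[0]):
        neigh = sorted(colors[indices[indptr[v]:indptr[v + 1]]].tolist())
        new_colors[v] = hash((colors[v], tuple(neigh)))
    # Compress the hash values back to the contiguous range [0, number_of_colors).
    _, new_colors = np.unique(new_colors, return_inverse=True)
    return new_colors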