def _fill_bpscore_matrix(self, verbose=True):
        """Pre-compute the pairwise BPScore between all annotated genes.

        A gene counts as annotated when at least one of the terms associated
        with it in ``self.assoc`` is also present in ``self.go_dag.terms``.
        Only annotated genes receive a row/column in the matrix.

        Args:
            verbose: when true, progress information is printed to stdout.

        Returns:
            A tuple ``(bpscore_matrix, row_sums, gene_mapping)``:

            * ``bpscore_matrix`` -- symmetric ``float32`` array of shape
              ``(nGenes, nGenes)`` holding the pairwise scores.
            * ``row_sums`` -- ``float32`` array of length ``nGenes``; entry
              ``i`` is the sum of row ``i`` without the diagonal element.
            * ``gene_mapping`` -- dict mapping each annotated gene to its
              row/column index in the matrix.
        """
        # Testing disjointness avoids building a throw-away intersection set
        # for every gene.
        annotated_genes = {
            gene for gene, terms in self.assoc.items()
            if not terms.isdisjoint(self.go_dag.terms)
        }

        if verbose:
            print("Number of annotated genes: {}/{}".format(
                len(annotated_genes), len(self.assoc)))

        # precompute only for annotated genes, others get a score of 0.0 anyway
        # (per the original author's note).
        # Map genes to the numerical range [0, nGenes - 1]; the list gives the
        # reverse lookup (index -> gene).
        idx_2_gene = list(annotated_genes)
        gene_mapping = {gene: idx for idx, gene in enumerate(idx_2_gene)}
        n_genes = len(idx_2_gene)

        if verbose:
            print("Pre-calculating BPScore between all {} genes...".format(
                n_genes))
        # float32's precision is sufficient (this way all gene combinations
        # can easily be kept in memory)
        bpscore_matrix = numpy.zeros((n_genes, n_genes), dtype=numpy.float32)

        # The score is stored symmetrically, so only the upper triangle
        # (including the diagonal) has to be computed.
        for i, g1 in enumerate(idx_2_gene):
            if verbose and i % 10 == 0:
                print("filling row {}/{}".format(i, n_genes))
            for j in range(i, n_genes):
                # NOTE(review): called explicitly on GoFastSimilarity instead
                # of via self -- presumably to bypass an override in this
                # class; confirm before changing.
                bpscore = GoFastSimilarity.gene_pairwise_score(
                    self, g1, idx_2_gene[j])
                bpscore_matrix[i, j] = bpscore
                bpscore_matrix[j, i] = bpscore

        # Row sums, ignoring each gene's score with itself (the diagonal).
        row_sums = bpscore_matrix.sum(axis=1) - bpscore_matrix.diagonal()

        # return the score matrix but also the gene mapping
        return (bpscore_matrix, row_sums, gene_mapping)
Exemple #2
0
    def _fill_bpscore_matrix(self, verbose=True):
        """Pre-compute the pairwise BPScore between all annotated genes.

        A gene counts as annotated when at least one of the terms associated
        with it in ``self.assoc`` is also present in ``self.go_dag.terms``.
        Only annotated genes receive a row/column in the matrix.

        Args:
            verbose: when true, progress information is printed to stdout.

        Returns:
            A tuple ``(bpscore_matrix, row_sums, gene_mapping)``:

            * ``bpscore_matrix`` -- symmetric ``float32`` array of shape
              ``(nGenes, nGenes)`` holding the pairwise scores.
            * ``row_sums`` -- ``float32`` array of length ``nGenes``; entry
              ``i`` is the sum of row ``i`` without the diagonal element.
            * ``gene_mapping`` -- dict mapping each annotated gene to its
              row/column index in the matrix.
        """
        # Testing disjointness avoids building a throw-away intersection set
        # for every gene.
        annotated_genes = {
            gene for gene, terms in self.assoc.items()
            if not terms.isdisjoint(self.go_dag.terms)
        }

        if verbose:
            print("Number of annotated genes: {}/{}".format(
                len(annotated_genes), len(self.assoc)))

        # precompute only for annotated genes, others get a score of 0.0 anyway
        # (per the original author's note).
        # Map genes to the numerical range [0, nGenes - 1]; the list gives the
        # reverse lookup (index -> gene).
        idx_2_gene = list(annotated_genes)
        gene_mapping = {gene: idx for idx, gene in enumerate(idx_2_gene)}
        n_genes = len(idx_2_gene)

        if verbose:
            print("Pre-calculating BPScore between all {} genes...".format(
                n_genes))
        # float32's precision is sufficient (this way all gene combinations
        # can easily be kept in memory)
        bpscore_matrix = numpy.zeros((n_genes, n_genes), dtype=numpy.float32)

        # The score is stored symmetrically, so only the upper triangle
        # (including the diagonal) has to be computed.
        for i, g1 in enumerate(idx_2_gene):
            if verbose and i % 10 == 0:
                print("filling row {}/{}".format(i, n_genes))
            for j in range(i, n_genes):
                # NOTE(review): called explicitly on GoFastSimilarity instead
                # of via self -- presumably to bypass an override in this
                # class; confirm before changing.
                bpscore = GoFastSimilarity.gene_pairwise_score(
                    self, g1, idx_2_gene[j])
                bpscore_matrix[i, j] = bpscore
                bpscore_matrix[j, i] = bpscore

        # Row sums, ignoring each gene's score with itself (the diagonal).
        row_sums = bpscore_matrix.sum(axis=1) - bpscore_matrix.diagonal()

        # return the score matrix but also the gene mapping
        return (bpscore_matrix, row_sums, gene_mapping)
Exemple #3
0
    def __init__(self, obo_file, sim_file, mapping_file, sql_conn, verbose=True):
        """Initialise the base similarity object, then load or build the data.

        Args:
            obo_file: forwarded to ``GoFastSimilarity.__init__``.
            sim_file: forwarded to ``self._load_or_create`` (presumably the
                path of the pre-computed score file -- TODO confirm).
            mapping_file: forwarded to ``self._load_or_create`` (presumably
                the path of the gene-to-index mapping file -- TODO confirm).
            sql_conn: forwarded to ``GoFastSimilarity.__init__``.
            verbose: forwarded to ``GoFastSimilarity.__init__``.
        """
        # initialize the super class (called explicitly on GoFastSimilarity,
        # not through super())
        GoFastSimilarity.__init__(self, obo_file, sql_conn, verbose)

        # load the data file or create and fill it
        self._load_or_create(sim_file, mapping_file)