def _fill_bpscore_matrix(self, verbose=True):
    """Pre-compute the pairwise BPScore for every annotated gene.

    A gene counts as annotated when at least one of its associated terms
    in ``self.assoc`` also appears in ``self.go_dag.terms``. Only those
    genes get a row/column in the matrix.

    Args:
        verbose: when true, print progress information to stdout.

    Returns:
        A tuple ``(bpscore_matrix, row_sums, gene_mapping)`` where
        ``bpscore_matrix`` is a symmetric ``float32`` array of shape
        ``(nGenes, nGenes)``, ``row_sums`` is a ``float32`` vector holding
        each row's sum without its diagonal entry, and ``gene_mapping``
        maps every annotated gene to its row/column index.
    """
    every_gene = set(self.assoc.keys())

    # Keep only genes that share at least one term with the GO DAG.
    annotated = {
        gene
        for gene, terms in self.assoc.items()
        if terms.intersection(self.go_dag.terms)
    }

    if verbose:
        print("Number of annotated genes: " + str(len(annotated)) + "/" + str(len(every_gene)))

    # Index -> gene and gene -> index lookups, both following the set's
    # iteration order.
    index_to_gene = list(annotated)
    gene_mapping = {gene: idx for idx, gene in enumerate(index_to_gene)}
    n_genes = len(index_to_gene)

    if verbose:
        print("Pre-calculating BPScore between all " + str(n_genes) + " genes...")

    # float32 keeps the full gene-by-gene matrix small enough for memory.
    bpscore_matrix = numpy.zeros((n_genes, n_genes), dtype=numpy.float32)

    # Score only the upper triangle (diagonal included) and mirror each
    # value into the lower triangle.
    for row, gene_a in enumerate(index_to_gene):
        if verbose and row % 10 == 0:
            print("filling row " + str(row) + "/" + str(n_genes))
        for col in range(row, n_genes):
            gene_b = index_to_gene[col]
            score = GoFastSimilarity.gene_pairwise_score(self, gene_a, gene_b)
            bpscore_matrix[row, col] = score
            bpscore_matrix[col, row] = score

    # Row totals that leave out each gene's score against itself.
    row_sums = numpy.zeros(n_genes, dtype=numpy.float32)
    for row, values in enumerate(bpscore_matrix):
        row_sums[row] = numpy.sum(values) - values[row]

    return (bpscore_matrix, row_sums, gene_mapping)
def __init__(self, obo_file, sim_file, mapping_file, sql_conn, verbose=True):
    """Set up the base similarity object, then load or build the score data.

    Args:
        obo_file: forwarded to ``GoFastSimilarity.__init__``.
        sim_file: forwarded to ``self._load_or_create``; presumably the
            file holding the pre-computed scores -- confirm against
            ``_load_or_create``.
        mapping_file: forwarded to ``self._load_or_create``; presumably
            the file holding the gene-to-index mapping -- confirm against
            ``_load_or_create``.
        sql_conn: forwarded to ``GoFastSimilarity.__init__``.
        verbose: forwarded to ``GoFastSimilarity.__init__``.
    """
    # The base-class initializer runs first, before any data is loaded.
    GoFastSimilarity.__init__(self, obo_file, sql_conn, verbose)

    # Load the data file, or create and fill it.
    self._load_or_create(sim_file, mapping_file)