def parse_gene_prior(genePriorFile, mode):
    #assume that genePriorFile is like:
    #gene\trank\tp-value
    dGene = {}
    fp = lib_utils.open2(genePriorFile, 'r')
    scoreSum = 0.
    maxScore = 0.
    fp.next()  #skip the header line
    for i in fp:
        gene, rank, score = i.rstrip().split('\t')
        if mode == 1:
            dGene[gene] = float(rank)
        elif mode == 2:
            dGene[gene] = float(score)
        scoreSum += dGene[gene]
        if dGene[gene] > maxScore:
            maxScore = dGene[gene]
    fp.close()

    #normalize
    if mode == 1:
        for gene, val in dGene.iteritems():
            dGene[gene] = (maxScore - val + 1.) / scoreSum
    return dGene
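# Usage sketch (illustrative only; 'gene_prior.tsv' is a hypothetical
# tab-delimited file with a header line and gene/rank/p-value columns):
#
#   dPrior = parse_gene_prior('gene_prior.tsv', mode=1)
#   # mode=1 rescales ranks so that top-ranked genes receive the largest
#   # weights; mode=2 keeps the p-value column as the per-gene score.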
def disease_to_genes_sum(self, gene_norm=True):
    '''
    objective: from the hit scores of the query HPO terms against diseases,
        associate each disease with its genes
    input:
        -hpo2disease_fn: a file generated by hpo_to_diseases()
            '#query(file_name)\tomim\tgenes\tscore\n'
        -gene_norm: normalize the accumulated phenotype score per gene? [True]
    '''
    job_name = 'disease_to_genes'
    if self.hpo2disease_fn is None:
        self.hpo_to_diseases()

    msg = 'aggregating HPO hit scores of diseases to each gene [%s;%s] ...' % \
        (job_name, self.hpo2disease_fn)
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    fp = lib_utils.open2(self.hpo2disease_fn, 'r')

    #accumulate the phenotype-matching score into genes associated with each disease
    pheno_genes = {}
    pheno_genes_cnt = {}
    for i in fp:  #for each disease
        if i.startswith('#'):
            continue
        i = i.rstrip()
        _, omim, geneStr, funsimMatAvg = i.rstrip().split('\t')
        genes = geneStr.split(',')
        funsimMatAvg = float(funsimMatAvg)
        for gene in genes:  #for each gene
            if funsimMatAvg > 0.:
                if gene not in pheno_genes:
                    pheno_genes[gene] = 0.
                    pheno_genes_cnt[gene] = 0.
                pheno_genes[gene] += funsimMatAvg
                pheno_genes_cnt[gene] += 1.
    fp.close()

    if gene_norm:
        msg = 'normalizing a bipartite graph between diseases and genes ...'
        lib_utils.msgout('notice', msg, job_name)
        self.logger.info(msg)
        for gene in pheno_genes.keys():
            pheno_genes[gene] /= pheno_genes_cnt[gene]

    self.pheno_dmg = lib_utils.normalize_dic(pheno_genes, 'sum')

    #print phenotypic damage scores
    self.rank_pheno_gene()

    msg = 'done. [%s]' % job_name
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    #clean up variables
    pheno_genes = None
    pheno_genes_cnt = None
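# A minimal sketch of the final sum-normalization step above, assuming
# lib_utils.normalize_dic(d, 'sum') divides each value by the total so the
# per-gene phenotype scores sum to 1 (illustrative only; not the library code).
def _sum_normalize_sketch(d):
    total = sum(d.values())
    if total <= 0.:
        return dict(d)
    return dict((gene, score / total) for gene, score in d.items())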
def protein_to_gene(esp2geneFile):
    dProtein2gene = {}
    fp = lib_utils.open2(esp2geneFile, 'r')
    fp.next()  #skip the header line
    for i in fp:
        gene, protein = i[:-1].split('\t')
        if gene and protein:
            if protein not in dProtein2gene:
                dProtein2gene[protein] = gene
    fp.close()
    return dProtein2gene
def gene_to_protein(self):
    '''
    to create a map from ref gene symbol to protein
    '''
    dGene2Prot = {}
    fp = lib_utils.open2(self.dv.entries['esp_to_gene'], 'r')
    fp.next()  #skip the header line
    for i in fp:
        gene, protein = i[:-1].split('\t')
        if gene and protein:
            if gene not in dGene2Prot:
                dGene2Prot[gene] = protein
    fp.close()
    return dGene2Prot
def protein_to_gene(self):
    '''
    to create a map from protein to ref gene symbol
    '''
    dProt2Gene = {}
    fp = lib_utils.open2(self.dv.entries['esp_to_gene'], 'r')
    fp.next()  #skip the header line
    for i in fp:
        gene, protein = i[:-1].split('\t')
        if gene and protein:
            if protein not in dProt2Gene:
                dProt2Gene[protein] = gene
    fp.close()
    return dProt2Gene
def get_sparse_elements(proteinLinkFile, min_edge_weight):
    '''
    to store the ppi network
    input: dProtein2gene, dGenes (whether the gene is in the ppi or not) - protein-gene relation;
        proteinLinkFile - ppi links
    output: update dProtein2gene, dGenes when add_dangled is enabled; store ppi and lnkProteins
    '''
    #read the STRING DB and assign an integer to each protein symbol
    fp = lib_utils.open2(proteinLinkFile, 'r')
    nNodes = 0
    linked = [-1, -1]
    dProtein2num = {}
    lnkProteins = []
    ppi = [[], [], []]  #from protein, to protein, link weight

    lib_utils.msgout(
        'notice',
        'preparing a genetic network matrix. Please be patient ...',
        'pagerank|heat_diffusion')

    #store col, row, weight from the ppi file
    fp.next()  #skip the header line
    for i in fp:
        #print '%s' % i  #debug
        linked[0], linked[1], weight = i.rstrip().split(' ')
        weight = float(weight)
        if weight < min_edge_weight:
            continue
        for c in range(2):
            protein = extract_ensembl_protein(linked[c])
            #register a protein node
            if not protein in dProtein2num:
                dProtein2num[protein] = nNodes
                #item index corresponds to the node number of the protein
                lnkProteins.append(protein)
                nNodes += 1
            ppi[c].append(dProtein2num[protein])
        ppi[2].append(weight)
    fp.close()
    dProtein2num = None

    return nNodes, ppi, lnkProteins
def get_sparse_elements(self):
    '''
    to store the ppi network
    input: self.dProt2Gene, dGenes (whether the gene is in the ppi or not) - protein-gene relation;
        proteinLinkFile - ppi links
    output: update self.dProt2Gene, dGenes when add_dangled is enabled; store ppi and Prots
    '''
    #read the STRING DB and assign an integer to each protein symbol
    fp = lib_utils.open2(self.dv.entries['string_link'], 'r')
    linked = [-1, -1]
    self.nNodes = 0
    self.Prots = []
    self.dProt2Idx = {}

    lib_utils.msgout(
        'notice',
        'preparing a genetic network matrix. Please be patient ...',
        'pagerank|heat_diffusion')

    #store col, row, weight from the ppi file
    fp.next()  #skip the header line
    for i in fp:
        #print '%s' % i  #debug
        linked[0], linked[1], weight = i.rstrip().split()
        weight = float(weight)
        if weight < self.min_edge_weight:
            continue
        for c in range(2):
            protein = extract_ensembl_protein(linked[c])
            #register a protein node
            if not protein in self.dProt2Idx:
                self.dProt2Idx[protein] = self.nNodes
                #item index corresponds to the node number of the protein
                self.Prots.append(protein)
                self.nNodes += 1
            self.ppi[c].append(self.dProt2Idx[protein])
        self.ppi[2].append(weight)
    fp.close()
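# A minimal, illustrative sketch (not part of the pipeline) showing how the
# COO-style triplets collected above (self.ppi[0]=source node indices,
# self.ppi[1]=target node indices, self.ppi[2]=edge weights) could be turned
# into a scipy sparse adjacency matrix for the downstream pagerank /
# heat-diffusion step; the names below are placeholders.
def _ppi_to_sparse_sketch(ppi, nNodes):
    from scipy.sparse import coo_matrix
    rows, cols, weights = ppi
    return coo_matrix((weights, (rows, cols)), shape=(nNodes, nNodes)).tocsr()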
def convert_node2gene(FinalNodeScores, PerturbedGenes, dProtein2gene, lnkProteins, rank_fn):
    nodeScores, dangledScores = FinalNodeScores
    cPerturbedGenes, dangledGenes = PerturbedGenes

    rank_fn2 = lib_utils.file_tag2(rank_fn, 'tmp', None)
    fp2 = lib_utils.open2(rank_fn2, 'w')
    fp2.write('#gene\tpredicted_score[-1/log10(x)]\tseed_score\n')

    for n, protein in enumerate(lnkProteins):
        seed_score = 0.
        gene = protein
        genetic_dmg_score = 0.
        if protein in dProtein2gene:
            gene = dProtein2gene[protein]
            if gene in cPerturbedGenes:
                seed_score = cPerturbedGenes[gene].score
                genetic_dmg_score = cPerturbedGenes[gene].gdmg
        pred_score = 0.
        if nodeScores[n] > 0:
            pred_score = -1. / math.log10(nodeScores[n])
        if genetic_dmg_score > 0.:
            fp2.write('%s\t%g\t%g\n' % (gene, pred_score, seed_score))

    #add dangled node scores
    for n, gene in enumerate(dangledGenes):
        pred_score = 0.
        if dangledScores[n] > 0:
            pred_score = -1. / math.log10(dangledScores[n])
        if cPerturbedGenes[gene].gdmg > 0.:
            fp2.write('%s\t%g\t%g\n' % (gene, pred_score, cPerturbedGenes[gene].score))
    fp2.close()

    #sort by score
    lib_utils.sort_tsv_by_col2(rank_fn2, [2], ['gr'], False, rank_fn)
    os.unlink(rank_fn2)
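# For reference, a self-contained sketch of the score transform used in the
# rank file above: -1/log10(x) maps a node probability x in (0, 1)
# monotonically to a positive score (e.g. x=0.01 -> 0.5, x=0.1 -> 1.0,
# x=0.5 -> ~3.32), which spreads out the very small diffusion values.
def _transform_node_score_sketch(x):
    import math
    if x <= 0. or x >= 1.:
        #outside the open interval (0, 1) the transform is not defined; return 0
        return 0.
    return -1. / math.log10(x)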
def main():
    parser = argparse.ArgumentParser(description="training cadd cli")
    parser.add_argument('--hgmd', action='store_const', dest='hgmd',
                        required=False, default=False, const=True,
                        help='want to include HGMD for training? It requires a license. [False]')
    parser.add_argument('-r', action='store', dest='kg_sample_rate',
                        required=False, default=.3, type=float,
                        help='sampling rate for benign variants in 1kG')
    parser.add_argument('-o', action='store', dest='out_dir', required=True,
                        help='output dir')
    parser.add_argument('--debug', action='store_const', dest='debug',
                        required=False, default=False, const=True,
                        help='debug? [False]')
    args = parser.parse_args()

    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    training = []  #list of (chrom, pos, id, ref, alt, clisig) tuples

    #load HgmdDB (requires a license!)
    if args.hgmd:
        try:
            hgmd = HgmdDB()
            training = hgmd.select_all_hgmd()
        except:
            print 'error: check if the HGMD database is available!'
            sys.exit(1)

    if False:
        #load ClinvarDB
        clnvar = ClinvarDB()
        training.extend(clnvar.select_all_clinvar())

    #load KGDB
    kgdb = KGDB()
    training.extend(kgdb.select_snps(0.1, 0.5,
                                     sample_rate=args.kg_sample_rate,
                                     snp_tag='benign_1kMAF'))

    #convert to vcf
    tr_vcfs = []
    clitags = ['benign', 'pathogenic']
    for c in range(2):
        tr_vcf = os.path.join(args.out_dir, 'clin_%s_tr.vcf' % clitags[c])
        if not args.debug or not os.path.exists(tr_vcf):
            tr_vcf_body = tr_vcf + '.body'
            fp2 = lib_utils.open2(tr_vcf_body, 'w')
            printed = {}
            for chrom, pos, id, ref, alt, clisig in training:
                if clitags[c] in clisig:
                    prim_key = '%s_%s_%s_%s' % (chrom, pos, ref, alt)
                    if prim_key in printed:
                        continue
                    printed[prim_key] = clisig
                    fp2.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\tCLINSIG_CLASS=%s\n' %
                              (chrom, pos, id, ref, alt, '100', 'PASS', clisig))
            fp2.close()

            #sort by chromosome and position
            tr_vcf_body_so = tr_vcf_body + '.sorted'
            cmd = 'sort -k1,1 -k2,2n %s > %s' % (tr_vcf_body, tr_vcf_body_so)
            p = sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)
            output, err = p.communicate()
            rc = p.returncode

            #append the header
            tr_vcf_header = tr_vcf + '.head'
            fp2 = lib_utils.open2(tr_vcf_header, 'w')
            fp2.write('##fileformat=VCFv4.2\n')
            fp2.write('##INFO=<ID=CLINSIG_CLASS,Number=7,Type=String,Description="benign_CLINVARDB,pathogenic_CLINVARDB,vus_CLINVARDB,benign_HGMDDB,pathogenic_HGMDDB,vus_HGMDDB,benign_1kMAF",Source="CLINVAR,HGMD",Version="03/01/2015">\n')
            fp2.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n')
            fp2.close()

            cmd = 'cat %s %s > %s' % (tr_vcf_header, tr_vcf_body_so, tr_vcf)
            p = sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)
            output, err = p.communicate()
            rc = p.returncode

            os.unlink(tr_vcf_header)
            os.unlink(tr_vcf_body)
            os.unlink(tr_vcf_body_so)
        tr_vcfs.append(tr_vcf)

    training = None

    #run gcn
    gcn_dir = os.environ.get('GCN', None)
    if gcn_dir:
        annotpipe_bin = os.path.join(gcn_dir, 'gcn', 'bin', 'annotpipe.py')
    else:
        print 'error: cannot find annotpipe.py!'
        sys.exit(1)

    tr_varant_vcfs = []
    cadd_trset_params = []
    mnp_cadd_trset_params = []
    gerp_trset_params = []
    cadd_trsets = []
    mnp_cadd_trsets = []
    gerp_trsets = []
    filter_bin = os.path.join(gcn_dir, 'gcn', 'lib', 'utils', 'filter_cj.py')

    for c, tr_vcf in enumerate(tr_vcfs):
        tr_varant_vcf = os.path.join(args.out_dir, 'clin_%s_tr_varant.vcf' % clitags[c])
        cmd = 'python %s -i %s -o %s' % (annotpipe_bin, tr_vcf, tr_varant_vcf)
        if not args.debug or not os.path.exists(tr_varant_vcf):
            print cmd
            p = sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)
            output, err = p.communicate()
            rc = p.returncode

        #filter the varant vcf
        tr_varant_filt_vcf = os.path.join(args.out_dir, 'clin_%s_tr_varant_filt.vcf' % clitags[c])
        filterconf_tr = os.path.join(gcn_dir, 'gcn', 'config', 'filter_tr_%s.conf' % clitags[c])
        cmd = 'python %s -i %s -o %s -f %s --no_genotype' % \
            (filter_bin, tr_varant_vcf, tr_varant_filt_vcf, filterconf_tr)
        if not args.debug or not os.path.exists(tr_varant_filt_vcf):
            print cmd
            p = sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)
            output, err = p.communicate()
            rc = p.returncode

        cadd_trset, mnp_cadd_trset, gerp_trset = \
            train_conservation_coeff(tr_varant_filt_vcf, args.hgmd)

        #get the parameters fitted to a beta distribution
        cadd_trset_param, mnp_cadd_trset_param, gerp_trset_param = \
            run_beta_fit(cadd_trset, mnp_cadd_trset, gerp_trset)

        cadd_trset_params.append(cadd_trset_param)
        mnp_cadd_trset_params.append(mnp_cadd_trset_param)
        gerp_trset_params.append(gerp_trset_param)

        cadd_trsets.append(cadd_trset)
        mnp_cadd_trsets.append(mnp_cadd_trset)
        gerp_trsets.append(gerp_trset)

    print 'done.'

    pyv = os.path.join(args.out_dir, 'clin_tr.pyv')
    fp2 = open(pyv, 'wb')
    pickle.dump([cadd_trset_params, mnp_cadd_trset_params, gerp_trset_params], fp2)
    fp2.close()
def convert_node2gene(self):
    '''
    for gt_dmg genes only, print the gene, harmonic score, and seed score
    '''
    rank_fn_tmp = '%s.tmp' % self.dv.gene_rank_fn
    fp2 = lib_utils.open2(rank_fn_tmp, 'w')
    fp2.write(
        '#gene\tpredicted_score\tseed_score\tgt_dmg_score\tpheno_score\tcontain_known_pathogenic\n'
    )

    genes_printed = {}
    #browse each node in the whole (original) ppi network
    for n, protein in enumerate(self.Prots):
        seed_score = 0.
        gene = protein
        #check if this node (restart value) was assigned previously
        if protein in self.dProt2Gene:
            gene = self.dProt2Gene[protein]
            if gene in self.dv.gene_dmg:
                seed_score = self.dv.gene_dmg[gene][0]

        #get the harmonic score and save it into dv.gene_dmg
        pred_score = 0.
        if self.harmonic_sc[n][0] > 0.:
            pred_score = self.harmonic_sc[n][0]
        if gene in self.dv.gene_dmg:
            self.dv.gene_dmg[gene][1] = pred_score

        #NOTE: print only genes having at least one mutation
        if (not self.dv.gt_dmg) or \
                (gene in self.dv.gt_dmg and self.dv.gt_dmg[gene].score > 0.):
            pheno_sc = 0.
            if gene in self.dv.pheno_dmg:
                pheno_sc = self.dv.pheno_dmg[gene].score
            if self.dv.vknown:
                if gene in self.dv.vknown_genes:
                    is_vknown = 'Y'
                else:
                    is_vknown = 'N'
            else:
                is_vknown = 'NA'
            if gene in genes_printed:
                gene2 = '%s|%s' % (gene, protein)
            else:
                gene2 = gene
            genes_printed[gene] = True
            fp2.write('%s\t%g\t%g\t%g\t%g\t%s\n' %
                      (gene2, pred_score, seed_score,
                       self.dv.gt_dmg[gene].score, pheno_sc, is_vknown))

    #repeat the same procedure for dangled nodes
    for n, gene in enumerate(self.dangledGenes):
        self.dv.gene_dmg[gene][1] = self.harmonic_dng_sc[n][0]
        if (not self.dv.gt_dmg) or \
                (gene in self.dv.gt_dmg and self.dv.gt_dmg[gene].score > 0.):
            pheno_sc = 0.
            if gene in self.dv.pheno_dmg:
                pheno_sc = self.dv.pheno_dmg[gene].score
            if self.dv.vknown:
                if gene in self.dv.vknown_genes:
                    is_vknown = 'Y'
                else:
                    is_vknown = 'N'
            else:
                is_vknown = 'NA'
            fp2.write('%s\t%g\t%g\t%s\t%g\t%s\n' %
                      (gene, self.dv.gene_dmg[gene][1], self.dv.gene_dmg[gene][0],
                       self.dv.gt_dmg[gene].score, pheno_sc, is_vknown))
    fp2.close()

    #sort by score
    lib_utils.sort_tsv_by_col2(
        rank_fn_tmp, [2], ['gr'], False, self.dv.gene_rank_fn)
    os.unlink(rank_fn_tmp)
def append_annotation_to_vcf2(vcf_fn, vars_to_summuary, submissions, out_vcf):
    print 'appending annotation to the clinvar VCF file ...'

    v = vcf.VCFParser(vcf_fn)
    ostream = open2(out_vcf, 'w')

    v.add_meta_info("REFTX", "1", "String", "RefSeq Transcript Name")
    v.add_meta_info("HGVSc", "1", "String", "HGVSc change in HGVS nomenclature")
    v.add_meta_info("HGVSp", "1", "String", "AA change in HGVS nomenclature")
    v.add_meta_info("SPLOC", "1", "Integer", "Distance from the predicted splice site")
    v.add_meta_info("DATE", "1", "String", "Last evaluated date")
    v.add_meta_info("REV", "1", "String", "Review status")
    v.add_meta_info("CLNMETHOD", "1", "String", "Collection methods")
    v.writeheader(ostream)

    for rec in v:
        v.parseinfo(rec)

        # clnacc = re.split('[|,]', rec.info.CLNACC)
        # rec.info.CLNACC = '|'.join(list(set(clnacc)))
        uniq_rcv_ids = []
        for rcv_id_str in rec.info.CLNACC:
            for rcv_id in rcv_id_str.split('|'):
                if rcv_id in uniq_rcv_ids:
                    continue
                uniq_rcv_ids.append(rcv_id)

        # print 'rec.info.CLNACC:', rec.info.CLNACC  #cj_debug
        for rcv_id in uniq_rcv_ids:
            rcv_id = rcv_id.split('.')[0]
            if rcv_id in vars_to_summuary:
                rec.info.REFTX = vars_to_summuary[rcv_id].REFTX
                if vars_to_summuary[rcv_id].HGVSc:
                    rec.info.HGVSc = vars_to_summuary[rcv_id].HGVSc
                    #extract the intronic offset (e.g. '+2' in 'c.123+2T>G')
                    mObj = re.search(r'c\.(.*)([\+\-]\d+)\D+', rec.info.HGVSc)
                    if mObj:
                        SPLOC = mObj.group(2)
                        if abs(int(SPLOC)) < 3:
                            rec.info.SPLOC = SPLOC
                if vars_to_summuary[rcv_id].HGVSp:
                    rec.info.HGVSp = vars_to_summuary[rcv_id].HGVSp
                if vars_to_summuary[rcv_id].DATE:
                    rec.info.DATE = vars_to_summuary[rcv_id].DATE
                if vars_to_summuary[rcv_id].REV:
                    rec.info.REV = vars_to_summuary[rcv_id].REV
                if vars_to_summuary[rcv_id].variation_id in submissions:
                    cmethods = list(
                        set(submissions[vars_to_summuary[rcv_id].variation_id].
                            collection_methods))
                    # print 'cmethods:', cmethods  #cj_debug
                    rec.info.CLNMETHOD = '|'.join(cmethods)
                found = True
                break

        rec.info.CLNACC = uniq_rcv_ids
        for j, clndbn in enumerate(rec.info.CLNDBN):
            rec.info.CLNDBN[j] = clndbn.replace('\\x2c_', ',').replace('\\x2c', ',')

        v.write(ostream, rec)

    ostream.close()
    v.stream.close()
    print 'Done.'
def reformat_go_sim_fns(go_sim_fns, out_fn, method_id=1):
    #method_id (1) means SimRel
    suflabs = ['BP', 'MF', 'CC']
    fp2 = lib_utils.open2(out_fn, 'w')
    v = 0
    for key, go_sim_fn in go_sim_fns.iteritems():  #BP, MF, CC
        print 'appending the GO root node at the end of [%s]' % go_sim_fn
        suflab = suflabs[v]
        fp = lib_utils.open2(go_sim_fn, 'r')
        go_sim_fn2 = lib_utils.file_tag2(go_sim_fn, 'category', None)
        fp.next()  #skip the header
        for i in fp:
            j = i.rstrip().split('\t')
            if len(j) == 2:
                j.append('-1.')
            #uniprot1, uniprot2, score, BP
            fp2.write('%s\t%s\n' % ('\t'.join(j), suflab))
        fp.close()
        print 'done.'
        v += 1
    fp2.close()

    print 'sorting ...'
    #temporary file for sorting
    out_fn2 = lib_utils.file_tag2(out_fn, 'sort', None)
    temp_sort_dir, _, _, _ = lib_utils.separateDirFn2(out_fn)
    lib_utils.sort_tsv_by_col2(out_fn, [1, 2, 4], ['V', 'V', 'V'],
                               False, out_fn2, temp_dir=temp_sort_dir)
    os.rename(out_fn2, out_fn)
    print 'done.'

    #grouping
    print 'collapsing GO sim scores into a denser format that is easier to import into SQL [%s] ...' % out_fn
    out_fn2 = lib_utils.file_tag2(out_fn, 'dense', None)
    fp2 = lib_utils.open2(out_fn2, 'w')
    #heads = '#uniprot1\tuniprot2\tscore_mode\tBP\tMF\tCC\tmethod_id'
    #fp2.write('%s\n' % heads)
    fp = lib_utils.open2(out_fn, 'r')
    visit1 = True
    idx = {'BP': 0, 'MF': 1, 'CC': 2}
    prev_key = None
    gosim_holder = ['-1', '-1', '-1']  #-1 means N/A
    for i in fp:
        prot1, prot2, score, go_class = i.rstrip().split('\t')
        key = '%s\t%s' % (prot1, prot2)
        if key != prev_key:
            if visit1:
                visit1 = False
            else:
                #wrap up the previous protein pair
                fp2.write('%s\t%s\t%d\n' %
                          (prev_key, '\t'.join(gosim_holder), method_id))
            gosim_holder = ['-1', '-1', '-1']
            gosim_holder[idx[go_class]] = score
            prev_key = key
        else:
            #keep storing values
            gosim_holder[idx[go_class]] = score
    fp.close()

    #don't forget the last entry
    fp2.write('%s\t%s\t%d\n' % (prev_key, '\t'.join(gosim_holder), method_id))
    fp2.close()
    os.rename(out_fn2, out_fn)
    print 'done.'
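# For reference, the collapsing step above turns consecutive per-category rows
# for the same protein pair (values below are illustrative only), e.g.
#   P12345  Q67890  0.81  BP
#   P12345  Q67890  0.64  MF
# into a single dense row, with -1 for categories that have no score:
#   P12345  Q67890  0.81  0.64  -1  1
# where the trailing 1 is method_id (SimRel).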