def _create_failed_intron_gff(introndata, assessed_interfaces, intron2label):
    """
    Create GFF tuples for organisms that were assessed on an intron
    interface but for which no intron is listed at that position.

    @type  introndata: dict
    @param introndata: (start, end) tuple -> list of intron objects; of each
                       object only the `_reference` attribute and the class
                       name are read

    @type  assessed_interfaces: dict
    @param assessed_interfaces: label -> iterable of organism identifiers
                       that were assessed for that label

    @type  intron2label: dict
    @param intron2label: (start, end) tuple -> label (a key of
                       `assessed_interfaces`)

    @rtype:  list
    @return: 9-element tuples (None, 'ABGPfailed', fmethod, start + 1, end,
             ".", "+", ".", column9string); one tuple per combination of a
             kept intron and an assessed organism that is not the
             `_reference` of any intron stored under that (start, end)
    """
    # Select the introns to report. Introns that share a start (or, failing
    # that, an end) with an already kept intron compete with it; the one with
    # the smallest end (respectively the largest start) survives.
    failed_introns = {}
    for kk in introndata:
        start, end = kk
        value = assessed_interfaces[intron2label[kk]]
        # Only kept introns sharing this very start / end are competitors;
        # comparing against the extremes of *all* kept introns would discard
        # shorter introns whenever an unrelated intron has a smaller end or
        # a larger start.
        rival_ends = [e for (s, e) in failed_introns.keys() if s == start]
        rival_starts = [s for (s, e) in failed_introns.keys() if e == end]
        if rival_ends:
            smallest_end = min(rival_ends)
            if end < smallest_end:
                # replace the kept intron on this start by the shorter one
                del failed_introns[(start, smallest_end)]
                failed_introns[kk] = value
        elif rival_starts:
            largest_start = max(rival_starts)
            if start > largest_start:
                # replace the kept intron on this end by the shorter one
                del failed_introns[(largest_start, end)]
                failed_introns[kk] = value
        else:
            # new key -> store
            failed_introns[kk] = value

    # create `failed` intron gff lines
    gfflines = []
    for (start, end), assessed_org_list in failed_introns.items():
        introns = introndata[(start, end)]
        references = [intron._reference for intron in introns]
        for orgid in assessed_org_list:
            if orgid in references:
                # this organism has an intron here -> not failed
                continue
            # if here, a failed organism
            gclass = "failedintron"
            gname = "%s_%s_%s" % (orgid, start, end)
            orgSfullname = _get_organism_full_name(orgid)
            # create gclass/gname and GFF decoration string
            lastcol = """%s %s; Reference %s; Note '%s'""" % (
                gclass, gname, orgid, orgSfullname)
            # get fmethod/class from 1 of the intron(s)
            fmethod = introns[0].__class__.__name__
            # NOTE(review): start + 1 presumably converts a 0-based start
            # into a 1-based GFF coordinate -- confirm against the callers.
            gfflines.append((None, "ABGPfailed", fmethod, start + 1, end,
                             ".", "+", ".", lastcol))

    # return gfflines list
    return gfflines
def create_projected_signalp_for_informants(organism, PCG, input, minimal_aa_overlap=None, verbose=False): """ """ # return list with ProjectedSignalPeptides return_projsignalpeptides = [] for orgS in PCG.organism_set(): if organism == orgS: continue if verbose: print orgS, organism # dict in order to create unique projections projections = {} for pacbporf in PCG.get_pacbps_by_organisms(organism, orgS): if not (pacbporf.orfS._has_signalp_sites_predicted and\ pacbporf.orfS._signalp_sites): # no SignalPeptides on this Orf of the informant continue if minimal_aa_overlap and not pacbporf.orfQ._signalp_sites: # overlap required AND no SignalPeptides on the target Orf continue else: # calculate DNA range of target gene's SignalPeptides qdnarange = [] for spQ in pacbporf.orfQ._signalp_sites: qdnarange.extend(range(spQ.start, spQ.end)) # get full name tag of informant orgSfullname = "%s [%s]" % (_get_organism_full_name( orgS, truncate=True), input[orgS]['proteinfref']) # loop over the informant's SignalPeptides for spS in pacbporf.orfS._signalp_sites: prjQSPstart = pacbporf.dnapos_sbjct2query(spS.start) prjQSPend = pacbporf.dnapos_sbjct2query(spS.end) prjQTSSstart = pacbporf.dnapos_sbjct2query(spS.tss.start) prjQTSSend = pacbporf.dnapos_sbjct2query(spS.tss.end) coords = [prjQSPstart, prjQSPend, prjQTSSstart, prjQTSSend] if CoordinateOutOfRange in coords: ############################################################ if verbose: print "OUT-OF-RANGE:", spS ############################################################ # projection on query NOT possible (out of range of Orf) continue ################################################################ if verbose: print orgS, pacbporf, len(pacbporf.orfQ._signalp_sites), print len(pacbporf.orfS._signalp_sites) ################################################################ # calculate AA overlap with target organism SignalPeptides sdnarange = Set(range(prjQSPstart, prjQSPend)) dna_overlap = sdnarange.intersection(qdnarange) aa_overlap 
= len(dna_overlap) / 3 if minimal_aa_overlap and aa_overlap < minimal_aa_overlap: # minimal overlap required and not achieved continue # create a ProjectedSignalPSignalPeptide prjTSS = TranslationalStartSite(prjQTSSstart, "n" * 19, pssm_score=spS.tss.pssm_score) prjTSS._gff['fmethod'] = 'projectedTSSpssm' prjTSS._gff['column9data'] = {'Informant': orgSfullname} prjSignalP_gff = { 'fmethod': 'projectedSignalPeptide', 'column9data': { 'Informant': orgSfullname }, 'gname': "%s_%s_%s" % (orgS, prjQSPstart, prjQSPend) } prjSignalP = ProjectedSignalPSignalPeptide(prjQSPstart, prjQSPend, spS.pssm_score, tss=prjTSS, gff=prjSignalP_gff) prjSignalP._gff['fmethod'] = 'projectedSignalPeptide' prjSignalP._gff['column9data'] = {'Informant': orgSfullname} # store to prjections dict projections[(prjSignalP.start, prjSignalP.end)] = prjSignalP ################################################################ if verbose: print prjSignalP, aa_overlap ################################################################ # store unique projections to return_projsignalpeptides for sgp in projections.values(): return_projsignalpeptides.append(sgp) # return list of ProjectedSignalPeptides return return_projsignalpeptides
def _create_failed_intron_gff(introndata, assessed_interfaces, intron2label):
    """
    Create GFF tuples for organisms that were assessed on an intron
    interface but for which no intron is listed at that position.

    @type  introndata: dict
    @param introndata: (start, end) tuple -> list of intron objects; of each
                       object only the `_reference` attribute and the class
                       name are read

    @type  assessed_interfaces: dict
    @param assessed_interfaces: label -> iterable of organism identifiers
                       that were assessed for that label

    @type  intron2label: dict
    @param intron2label: (start, end) tuple -> label (a key of
                       `assessed_interfaces`)

    @rtype:  list
    @return: 9-element tuples (None, 'ABGPfailed', fmethod, start + 1, end,
             ".", "+", ".", column9string); one tuple per combination of a
             kept intron and an assessed organism that is not the
             `_reference` of any intron stored under that (start, end)
    """
    # Select the introns to report. Introns that share a start (or, failing
    # that, an end) with an already kept intron compete with it; the one with
    # the smallest end (respectively the largest start) survives.
    failed_introns = {}
    for kk in introndata:
        start, end = kk
        value = assessed_interfaces[intron2label[kk]]
        # Only kept introns sharing this very start / end are competitors;
        # comparing against the extremes of *all* kept introns would discard
        # shorter introns whenever an unrelated intron has a smaller end or
        # a larger start.
        rival_ends = [e for (s, e) in failed_introns.keys() if s == start]
        rival_starts = [s for (s, e) in failed_introns.keys() if e == end]
        if rival_ends:
            smallest_end = min(rival_ends)
            if end < smallest_end:
                # replace the kept intron on this start by the shorter one
                del failed_introns[(start, smallest_end)]
                failed_introns[kk] = value
        elif rival_starts:
            largest_start = max(rival_starts)
            if start > largest_start:
                # replace the kept intron on this end by the shorter one
                del failed_introns[(largest_start, end)]
                failed_introns[kk] = value
        else:
            # new key -> store
            failed_introns[kk] = value

    # create `failed` intron gff lines
    gfflines = []
    for (start, end), assessed_org_list in failed_introns.items():
        introns = introndata[(start, end)]
        references = [intron._reference for intron in introns]
        for orgid in assessed_org_list:
            if orgid in references:
                # this organism has an intron here -> not failed
                continue
            # if here, a failed organism
            gclass = "failedintron"
            gname = "%s_%s_%s" % (orgid, start, end)
            orgSfullname = _get_organism_full_name(orgid)
            # create gclass/gname and GFF decoration string
            lastcol = """%s %s; Reference %s; Note '%s'""" % (
                gclass, gname, orgid, orgSfullname)
            # get fmethod/class from 1 of the intron(s)
            fmethod = introns[0].__class__.__name__
            # NOTE(review): start + 1 presumably converts a 0-based start
            # into a 1-based GFF coordinate -- confirm against the callers.
            gfflines.append((None, 'ABGPfailed', fmethod, start + 1, end,
                             ".", "+", ".", lastcol))

    # return gfflines list
    return gfflines
def create_projected_signalp_for_informants(organism,PCG,input, minimal_aa_overlap=None,verbose=False): """ """ # return list with ProjectedSignalPeptides return_projsignalpeptides = [] for orgS in PCG.organism_set(): if organism == orgS: continue if verbose: print orgS, organism # dict in order to create unique projections projections = {} for pacbporf in PCG.get_pacbps_by_organisms(organism,orgS): if not (pacbporf.orfS._has_signalp_sites_predicted and\ pacbporf.orfS._signalp_sites): # no SignalPeptides on this Orf of the informant continue if minimal_aa_overlap and not pacbporf.orfQ._signalp_sites: # overlap required AND no SignalPeptides on the target Orf continue else: # calculate DNA range of target gene's SignalPeptides qdnarange = [] for spQ in pacbporf.orfQ._signalp_sites: qdnarange.extend( range(spQ.start,spQ.end) ) # get full name tag of informant orgSfullname = "%s [%s]" % ( _get_organism_full_name(orgS,truncate=True), input[orgS]['proteinfref'] ) # loop over the informant's SignalPeptides for spS in pacbporf.orfS._signalp_sites: prjQSPstart = pacbporf.dnapos_sbjct2query(spS.start) prjQSPend = pacbporf.dnapos_sbjct2query(spS.end) prjQTSSstart= pacbporf.dnapos_sbjct2query(spS.tss.start) prjQTSSend = pacbporf.dnapos_sbjct2query(spS.tss.end) coords = [ prjQSPstart, prjQSPend, prjQTSSstart,prjQTSSend ] if CoordinateOutOfRange in coords: ############################################################ if verbose: print "OUT-OF-RANGE:", spS ############################################################ # projection on query NOT possible (out of range of Orf) continue ################################################################ if verbose: print orgS, pacbporf, len(pacbporf.orfQ._signalp_sites), print len(pacbporf.orfS._signalp_sites) ################################################################ # calculate AA overlap with target organism SignalPeptides sdnarange = Set(range(prjQSPstart,prjQSPend)) dna_overlap = sdnarange.intersection(qdnarange) aa_overlap = 
len(dna_overlap)/3 if minimal_aa_overlap and aa_overlap < minimal_aa_overlap: # minimal overlap required and not achieved continue # create a ProjectedSignalPSignalPeptide prjTSS = TranslationalStartSite( prjQTSSstart,"n"*19, pssm_score=spS.tss.pssm_score ) prjTSS._gff['fmethod'] = 'projectedTSSpssm' prjTSS._gff['column9data'] = {'Informant': orgSfullname} prjSignalP_gff = { 'fmethod' : 'projectedSignalPeptide', 'column9data': {'Informant': orgSfullname}, 'gname' : "%s_%s_%s" % (orgS,prjQSPstart,prjQSPend) } prjSignalP = ProjectedSignalPSignalPeptide( prjQSPstart,prjQSPend, spS.pssm_score,tss=prjTSS, gff=prjSignalP_gff ) prjSignalP._gff['fmethod'] = 'projectedSignalPeptide' prjSignalP._gff['column9data'] = {'Informant': orgSfullname} # store to prjections dict projections[(prjSignalP.start,prjSignalP.end)] = prjSignalP ################################################################ if verbose: print prjSignalP, aa_overlap ################################################################ # store unique projections to return_projsignalpeptides for sgp in projections.values(): return_projsignalpeptides.append(sgp) # return list of ProjectedSignalPeptides return return_projsignalpeptides