def test_find_orf_neg(self):
    """find_orf on subject accession Si001539m should return 7662777."""
    accession = self.sallbed.accn("Si001539m")
    predicted = find_orf(self.sallbed, accession)
    self.assertEqual(predicted, 7662777)
def main(qbed,sbed,missed_pairs, ncpu):
    """Run bl2seq blastn on missed pairs and print one CSV row per hit.

    Pairs are pulled from ``get_pair`` in batches of ``ncpu``; one bl2seq
    command is built per pair and the batch is executed in parallel through
    a process pool.  Each blast result is grouped with ``group_cds``,
    filtered with ``remove_crossing_hits`` and summarised with ``find_orf``
    and ``protein_parse``.

    Args:
        qbed: query bed object; handed to ``split_fastas``, ``get_pair``,
            ``find_orf`` and ``protein_parse``.
        sbed: subject bed object; handed to ``get_mask_non_cds``,
            ``get_pair`` and ``protein_parse``.
        missed_pairs: argument forwarded to ``get_pairs_file``.
        ncpu: number of worker processes (anything ``int()`` accepts); also
            the number of pairs blasted per batch.

    Returns:
        None.  A ``#``-prefixed header line and one comma-separated row per
        reported hit are written to stdout; a ``CMD:`` progress line per
        blasted pair is written to stderr.
    """
    ncpu = int(ncpu)
    # Start the workers before anything is written to stdout so that, on
    # fork-based platforms, the children do not inherit buffered output.
    pool = Pool(ncpu)
    try:
        pairs_file = get_pairs_file(missed_pairs)
        print >>sys.stdout, "#hit,ref_gene,blastn_introns,blastx_hits, blastx_gene_hits, blastx_frame, blastn_gaps, blastx_gaps,orf_perdiction,orf_blastx,frame_shift"
        # NOTE(review): the bl2seq location is hard-coded to one user's
        # install; consider making it configurable.
        blastn = "/Users/gturco/blast-2.2.25/bin/bl2seq -p blastn -G 5 -E 2 -W 7 -q -2 -e 0.001 -D 1 -i {0} -j {1} -I {2},{3} -J {4},{5} | grep -v '#' | grep -v 'WARNING' | grep -v 'ERROR' "
        qfastas = split_fastas(qbed)  # MASK CODING
        sfastas = get_mask_non_cds(sbed)  # mask noncoding
        pair_iter = get_pair(pairs_file, "pair", qbed, sbed)

        def get_blastn_cmd(pair):
            """Return (blastn command, hit, gene) for one (hit, gene) pair."""
            hit, gene = pair
            # Search window: 3000 on either side of the hit.
            # NOTE(review): abs() does not clamp at the sequence start when
            # hit['start'] < 3000 -- confirm this is the intended window.
            hstart, hstop = abs(3000 - hit['start']), (3000 + hit['end'])
            # double check fasta to make sure i dont need to add or remove one
            # checks the entire gene...
            gstart, gstop = gene['start'], gene['end']
            query_file = qfastas[hit['seqid']]
            subject_file = sfastas[gene['seqid']]
            blastn_cmd = blastn.format(query_file, subject_file,
                                       hstart, hstop, gstart, gstop)
            return blastn_cmd, hit, gene

        while True:
            # next(it, None) pads the final, short batch with None.
            pairs = [next(pair_iter, None) for _ in range(ncpu)]
            if not any(pairs):
                break
            cmds = [get_blastn_cmd(pair) for pair in pairs if pair]
            results = pool.map(commands.getoutput, [c[0] for c in cmds])
            for res, (_cmd, hit, gene) in zip(results, cmds):
                print >>sys.stderr, "CMD: {0},{1}".format(gene['accn'],hit['accn'])
                d, no_res = group_cds(res, gene)
                gap_list = []
                intron_list = []
                # Reset before the no-result check so hit['locs'] is always
                # replaced, even for pairs that are skipped.
                hit['locs'] = []
                if no_res:
                    continue
                for group_key, exon_hits in d.items():
                    non_crossing = remove_crossing_hits(exon_hits, hit, gene)
                    n_hits = len(non_crossing)
                    if n_hits < 1:
                        continue
                    if n_hits > 1:
                        gaps, hstart, hend = bites(non_crossing)
                        gap_list.append(sum(gaps))
                    else:
                        [(hstart, hend, _sstart, _send, _evalue)] = non_crossing
                    intron_list.append(group_key[0])
                    hit['locs'].append((hstart, hend))
                hit['locs'].sort()
                if not hit['locs']:
                    continue
                orf_prediction = find_orf(qbed, hit)
                introns = "{0}/{1}".format(len(intron_list),len(gene['locs']))
                gap_totaln = sum(gap_list)
                # new hit locs made from blastn res
                (hit_percent, gene_percent, frame_percent, frame_shift,
                 _best_frame, gap_total, orf_start) = protein_parse(hit, gene, sbed, qbed)
                orf_start = abs(min(hit['locs'][0]) + int(orf_start))
                w ="{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10}".format(hit['accn'],gene['accn'],introns,hit_percent,gene_percent, frame_percent,gap_totaln,gap_total,orf_prediction,orf_start,frame_shift)
                print >>sys.stdout, w
    finally:
        # Release the worker processes instead of leaving them to the GC.
        pool.close()
        pool.join()
def test_find_orf(self):
    """find_orf on query accession Os01g01295, plus one, should equal 141084."""
    accession = self.qallbed.accn("Os01g01295")
    predicted = find_orf(self.qallbed, accession)
    self.assertEqual(predicted + 1, 141084)