Example #1
0
 def test_find_orf_neg(self):
     saccn = self.sallbed.accn("Si001539m")
     orf = find_orf(self.sallbed, saccn)
     self.assertEqual(orf, 7662777)
Example #2
0
def main(qbed,sbed,missed_pairs, ncpu):
    """run tblastx on missed pairs..."""
    #print >>sys.stderr,ncpu
    ncpu = int(ncpu)
    pool = Pool(ncpu)
    pairs_file = get_pairs_file(missed_pairs)
    print >>sys.stdout, "#hit,ref_gene,blastn_introns,blastx_hits, blastx_gene_hits, blastx_frame, blastn_gaps, blastx_gaps,orf_perdiction,orf_blastx,frame_shift"
    blastn = "/Users/gturco/blast-2.2.25/bin/bl2seq -p blastn -G 5 -E 2 -W 7 -q -2 -e 0.001 -D 1 -i {0} -j {1} -I {2},{3} -J {4},{5} | grep -v '#' | grep -v 'WARNING' | grep -v 'ERROR' "
    qfastas = split_fastas(qbed)#MASK CODING
    sfastas = get_mask_non_cds(sbed) #mask noncoding

    pairs = [True]
    _get_pair_gen = get_pair(pairs_file,"pair", qbed,sbed)

    def get_pair_gen():
        try: return _get_pair_gen.next()
        except StopIteration: return None
        
    while any(pairs):
        pairs = [get_pair_gen() for i in range(ncpu)]
        
        def get_blastn_cmd(pair):
            """creates the dictionary values used to fill in blast cmd"""
            if pair is None: return None
            hit, gene = pair
            hstart, hstop = abs(3000 - hit['start']), (3000 + hit['end'])
            # double check fasta to make sure i dont need to add or remove one
            gstart,gstop = gene['start'],gene['end']
            # checks the entire gene...
            query_file = qfastas[hit['seqid']]
            subject_file = sfastas[gene['seqid']]

            blastn_cmd = blastn.format(query_file, subject_file, hstart, hstop, gstart, gstop)
            #print >> sys.stderr,'{0},{1},{2}'.format(hit['accn'],gene['accn'],cmd)
            
            return blastn_cmd,hit, gene
        
        cmds = [c for c in map(get_blastn_cmd, [l for l in pairs if l]) if c]
        #print >>sys.stderr, "results: {0}".format(cmds[0][0])
        results = (r for r in pool.map(commands.getoutput,[c[0] for c in cmds]))
        for res, (cmd, hit, gene) in zip(results,cmds):
            print >>sys.stderr, "CMD: {0},{1}".format(gene['accn'],hit['accn'])
            d,no_res = group_cds(res, gene)
            gap_list =[]
            intron_list = []
            hit['locs'] = []
            if no_res == True: continue
            for group_key in d.keys():
                exon_hits = d[group_key]
                non_crossing = remove_crossing_hits(exon_hits,hit,gene)
                if len(non_crossing) > 1:
                    gaps,hstart,hend =bites(non_crossing)
                    gap_list.append(sum(gaps))
                elif len(non_crossing) == 1:
                   # print >>sys.stderr, non_crossing
                    [(hstart,hend,sstart,send,evalue)] = non_crossing
                if len(non_crossing) >= 1:
                    intron_list.append(group_key[0])
                    hit['locs'].append((hstart,hend))
            hit['locs'].sort()
            #print >>sys.stderr, "hit_loc : {0}".format(hit['locs'])
            if len(hit['locs']) < 1: continue
            orf_prediction = find_orf(qbed,hit)
            introns = "{0}/{1}".format(len(intron_list),len(gene['locs']))
            gap_totaln = sum(gap_list)
            # new hit locs made from blastn res
            hit_percent, gene_percent, frame_percent,frame_shift, best_frame, gap_total,orf_start= protein_parse(hit,gene,sbed,qbed)
            orf_start = abs(min(hit['locs'][0]) + int(orf_start))
            w ="{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10}".format(hit['accn'],gene['accn'],introns,hit_percent,gene_percent, frame_percent,gap_totaln,gap_total,orf_prediction,orf_start,frame_shift)
            print >>sys.stdout, w
Example #3
0
 def test_find_orf(self):
     qaccn = self.qallbed.accn("Os01g01295")
     orf = find_orf(self.qallbed, qaccn)
     self.assertEqual(orf + 1, 141084)