Esempio n. 1
0
def make_pair_maps(pair_file, fmt, qbed, sbed):
    """Collect every pair from `pair_file` as name tuples in both orientations.

    Each item yielded by `get_pair(pair_file, fmt, qbed, sbed)` is unpacked
    as `(sname, qname)`. For every such item two tuples are appended to the
    result, in this order: `(qname, sname)` followed by `(sname, qname)`.
    Reading stops at the first `None` item.

    Returns a flat list of 2-tuples (not a dict).
    """
    both_ways = []
    for entry in get_pair(pair_file, fmt, qbed, sbed):
        if entry is None:
            break
        subject, query = entry
        both_ways.extend([(query, subject), (subject, query)])
    return both_ways
Esempio n. 2
0
def main(qbed, sbed, pairs_file, qpad, spad, unmasked_fasta, pair_fmt, blast_path, mask='F', ncpu=8):
    """Main runner for finding CNSs: blast every pair and print the hits as CSV.

    Pairs are read from `get_pair(pairs_file, pair_fmt, qbed, sbed)` in
    batches of `ncpu`; each batch is turned into shell commands by `get_cmd`
    and executed in parallel through a process pool. Non-empty outputs are
    handed to `parse_blast`, and every pair with at least one CNS is written
    to stdout as one comma-separated line. Progress ("<qaccn> <saccn> (<n>)")
    goes to stderr.

    Parameters:
        qbed, sbed:      query / subject bed objects (must expose `.filename`).
        pairs_file:      path handed to `get_pair`.
        qpad, spad:      padding values forwarded to `get_cmd` and `parse_blast`.
        unmasked_fasta:  forwarded unchanged to `parse_blast`.
        pair_fmt:        pair-file format name handed to `get_pair`.
        blast_path:      path of the bl2seq executable.
        mask:            value for bl2seq's `-F` option (default 'F').
        ncpu:            number of worker processes and batch size.

    Returns None.
    """
    # Command template: blast_path and mask are filled in here, the named
    # %(...)s fields are left for get_cmd to fill in per pair.
    bl2seq = "%s " % blast_path + \
            "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
            " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
            -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
            | grep -v 'WARNING' | grep -v 'ERROR' "

    pool = Pool(ncpu)
    try:
        fcnss = sys.stdout
        # The header must be part of the print statement itself; as a
        # separate line it is a no-op expression and is never written.
        print >> fcnss, "#qseqid,qaccn,sseqid,saccn,[qstart,qend,sstart,send,bitscore...]"

        qfastas = get_masked_fastas(qbed)
        # Reuse the query fastas when both beds come from the same file.
        sfastas = get_masked_fastas(sbed) if qbed.filename != sbed.filename else qfastas

        pair_iter = iter(get_pair(pairs_file, pair_fmt, qbed, sbed))
        pairs = [True]  # sentinel so the loop body runs at least once
        while any(pairs):
            # Pull one batch; exhausted slots are filled with None.
            pairs = [next(pair_iter, None) for _ in range(ncpu)]

            # Only real pairs are turned into commands; get_cmd may still
            # return a falsy value for a pair it rejects.
            candidates = [get_cmd(pair, bl2seq, qfastas, sfastas, qpad, spad)
                          for pair in pairs if pair]
            cmds = [c for c in candidates if c]
            results = pool.map(commands.getoutput, [c[0] for c in cmds])

            for res, (_cmd, qfeat, sfeat) in zip(results, cmds):
                if not res.strip():
                    continue
                print >>sys.stderr, "%s %s" % (qfeat["accn"], sfeat['accn']),
                orient = 1 if qfeat['strand'] == sfeat['strand'] else -1
                cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed, qpad, spad, unmasked_fasta)
                print >>sys.stderr, "(%i)" % len(cnss)
                if len(cnss) == 0:
                    continue

                qname, sname = qfeat['accn'], sfeat['accn']
                flat_cnss = ",".join(",".join(map(str, cns)) for cns in cnss)
                print >> fcnss, "%s,%s,%s,%s,%s" % (qfeat['seqid'], qname,
                                                    sfeat['seqid'], sname, flat_cnss)
    finally:
        # Release the worker processes even if a batch fails.
        pool.close()
        pool.join()

    return None
Esempio n. 3
0
def make_pair_maps(pair_file, fmt, qbed, sbed):
    """
    Index the pairs from `pair_file` in both directions.

    Every item yielded by `get_pair(pair_file, fmt, qbed, sbed)` is unpacked
    as `(qname, sname)`; reading stops at the first `None` item. The pair
    file path is echoed to stderr before reading.

    Returns a `(qmap, smap)` tuple of `defaultdict(list)`:
    `qmap[qname]` lists the subjects paired with that query and
    `smap[sname]` lists the queries paired with that subject.
    """
    by_query = collections.defaultdict(list)
    by_subject = collections.defaultdict(list)
    print >>sys.stderr, "pair file:", pair_file
    for entry in get_pair(pair_file, fmt, qbed, sbed):
        if entry is None:
            break
        query, subject = entry
        by_query[query].append(subject)
        by_subject[subject].append(query)
    return by_query, by_subject
Esempio n. 4
0
def make_pair_maps(pair_file, fmt, qbed, sbed):
    """
    Build the query -> subjects and subject -> queries lookup tables.

    Items come from `get_pair(pair_file, fmt, qbed, sbed)` and are unpacked
    as `(qname, sname)`. A `None` item ends the scan. The path of the pair
    file is reported on stderr first.

    Returns the two `defaultdict(list)` tables as `(qmap, smap)`.
    """
    subjects_of = collections.defaultdict(list)  # query name -> subject hits
    queries_of = collections.defaultdict(list)   # subject name -> query hits
    print >> sys.stderr, "pair file:", pair_file
    for item in get_pair(pair_file, fmt, qbed, sbed):
        if item is None:
            break
        q_name, s_name = item
        subjects_of[q_name].append(s_name)
        queries_of[s_name].append(q_name)
    return subjects_of, queries_of
Esempio n. 5
0
def get_homeolog(qfeat, pairsfile, sbed, qbed):
    """Return the 'ORG2_qfeat' entry of the first pair whose 'sfeat' equals `qfeat[3]`.

    Pairs are read with `get_pair(pairsfile, 'pck', sbed, qbed)`; each item
    is unpacked into two values and only the first one is inspected.
    Returns None when no pair matches.
    """
    # NOTE(review): sbed is passed before qbed here, the reverse of the
    # parameter names used by other get_pair callers -- confirm this is intended.
    matches = (
        region['ORG2_qfeat']
        for region, _ in get_pair(pairsfile, 'pck', sbed, qbed)
        if region['sfeat'] == qfeat[3]
    )
    return next(matches, None)
Esempio n. 6
0
def main(qbed, sbed, missed_pairs, ncpu,
         blast_path="/Users/gturco/blast-2.2.25/bin/bl2seq", pad=3000):
    """Run bl2seq blastn on missed hit/gene pairs and print one CSV row per hit.

    Pairs are read from `get_pair(pairs_file, "pair", qbed, sbed)` in batches
    of `ncpu`, blasted in parallel through a process pool, grouped with
    `group_cds`, and summarised with `find_orf` and `protein_parse`. A header
    line and one comma-separated row per reported hit are written to stdout;
    a "CMD: <gene>,<hit>" progress line per pair goes to stderr.

    Parameters:
        qbed, sbed:    query / subject bed objects.
        missed_pairs:  input handed to `get_pairs_file` to obtain the pair file.
        ncpu:          number of worker processes and batch size (cast to int).
        blast_path:    path of the bl2seq executable (defaults to the
                       previously hard-coded location).
        pad:           padding applied around the hit when building the
                       `-I` query range (default 3000).

    Side effect: `hit['locs']` is replaced on every processed hit with the
    sorted list of (start, end) locations derived from the blastn result.
    """
    ncpu = int(ncpu)
    pool = Pool(ncpu)
    try:
        pairs_file = get_pairs_file(missed_pairs)
        print >>sys.stdout, "#hit,ref_gene,blastn_introns,blastx_hits, blastx_gene_hits, blastx_frame, blastn_gaps, blastx_gaps,orf_perdiction,orf_blastx,frame_shift"
        blastn = ("{blast} -p blastn -G 5 -E 2 -W 7 -q -2 -e 0.001 -D 1 "
                  "-i {0} -j {1} -I {2},{3} -J {4},{5} "
                  "| grep -v '#' | grep -v 'WARNING' | grep -v 'ERROR' ")
        qfastas = split_fastas(qbed)  # original author's note: MASK CODING
        sfastas = get_mask_non_cds(sbed)  # original author's note: mask noncoding

        def get_blastn_cmd(pair):
            """Build the blast command for one (hit, gene) pair; None for a None pair."""
            if pair is None:
                return None
            hit, gene = pair
            # NOTE(review): abs() yields a start *after* the hit start when
            # hit['start'] < pad / 2; a clamped `hit['start'] - pad` may have been
            # intended. Left unchanged because downstream coordinate handling
            # is not visible here -- confirm before altering.
            hstart, hstop = abs(pad - hit['start']), (pad + hit['end'])
            # The subject range covers the entire gene.
            gstart, gstop = gene['start'], gene['end']
            query_file = qfastas[hit['seqid']]
            subject_file = sfastas[gene['seqid']]
            blastn_cmd = blastn.format(query_file, subject_file,
                                       hstart, hstop, gstart, gstop,
                                       blast=blast_path)
            return blastn_cmd, hit, gene

        pair_iter = iter(get_pair(pairs_file, "pair", qbed, sbed))
        pairs = [True]  # sentinel so the loop body runs at least once
        while any(pairs):
            # Pull one batch; exhausted slots are filled with None.
            pairs = [next(pair_iter, None) for _ in range(ncpu)]
            candidates = [get_blastn_cmd(pair) for pair in pairs if pair]
            cmds = [c for c in candidates if c]
            results = pool.map(commands.getoutput, [c[0] for c in cmds])

            for res, (_cmd, hit, gene) in zip(results, cmds):
                print >>sys.stderr, "CMD: {0},{1}".format(gene['accn'], hit['accn'])
                d, no_res = group_cds(res, gene)
                gap_list = []
                intron_list = []
                # Reset before the early exit so a skipped hit also ends up
                # with an empty location list.
                hit['locs'] = []
                if no_res == True: continue
                for group_key, exon_hits in d.items():
                    non_crossing = remove_crossing_hits(exon_hits, hit, gene)
                    if len(non_crossing) > 1:
                        gaps, hstart, hend = bites(non_crossing)
                        gap_list.append(sum(gaps))
                    elif len(non_crossing) == 1:
                        # A single hit must be a 5-tuple; only its first two
                        # fields are used.
                        [(hstart, hend, _sstart, _send, _evalue)] = non_crossing
                    if len(non_crossing) >= 1:
                        intron_list.append(group_key[0])
                        hit['locs'].append((hstart, hend))
                hit['locs'].sort()
                if not hit['locs']:
                    continue
                orf_prediction = find_orf(qbed, hit)
                introns = "{0}/{1}".format(len(intron_list), len(gene['locs']))
                gap_totaln = sum(gap_list)
                # protein_parse sees the new hit locs made from the blastn result.
                (hit_percent, gene_percent, frame_percent, frame_shift,
                 best_frame, gap_total, orf_start) = protein_parse(hit, gene, sbed, qbed)
                orf_start = abs(min(hit['locs'][0]) + int(orf_start))
                w = "{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10}".format(
                    hit['accn'], gene['accn'], introns, hit_percent, gene_percent,
                    frame_percent, gap_totaln, gap_total, orf_prediction,
                    orf_start, frame_shift)
                print >>sys.stdout, w
    finally:
        # Release the worker processes even if a batch fails.
        pool.close()
        pool.join()