def write_new_pairs(pair_file_path,new_pairs,qbed_file_new,sbed_path_new):
    """Append the new pairs to the pair file, then convert it to qa format.

    One tab-separated "qaccn saccn" line is appended to ``pair_file_path``
    for every item of ``new_pairs``; the updated file is then handed to
    ``pairs_to_qa`` together with the two bed paths.

    Args:
        pair_file_path: path of the pair file, opened in append mode.
        new_pairs: iterable of mappings supporting ``['qaccn']`` and
            ``['saccn']`` lookups.
        qbed_file_new: forwarded unchanged to ``pairs_to_qa``.
        sbed_path_new: forwarded unchanged to ``pairs_to_qa``.
    """
    # ``with`` guarantees the handle is closed (and the data flushed) even
    # when a pair is missing a key or a write fails part-way through.
    with open(pair_file_path, 'a') as write_file:
        for pair in new_pairs:
            write_file.write(
                "{0}\t{1}\n".format(pair['qaccn'], pair['saccn']))
    # Everything before the FIRST dot of the path names the output file.
    # NOTE(review): a dot in a directory component (e.g. "./x.pairs")
    # truncates the path there -- confirm callers never pass such paths.
    header = pair_file_path.split(".")[0]
    raw_file = "{0}.raw.with_new.filtered".format(header)
    pairs_to_qa(pair_file_path,'pair',qbed_file_new,sbed_path_new,raw_file)
def write_new_pairs(pair_file_path, new_pairs, qbed_file_new, sbed_path_new):
    """Append the new pairs to the end of the pair file and convert it to qa.

    Every item of ``new_pairs`` contributes one line holding its 'qaccn' and
    'saccn' values separated by a tab. Once the file has been written and
    closed it is passed to ``pairs_to_qa`` with format 'pair'.

    Args:
        pair_file_path: path of the pair file; it is opened for appending.
        new_pairs: iterable of mappings with 'qaccn' and 'saccn' keys.
        qbed_file_new: passed through to ``pairs_to_qa``.
        sbed_path_new: passed through to ``pairs_to_qa``.
    """
    # Context manager instead of open()/close(): the file is released even
    # if a key lookup or a write raises inside the loop.
    with open(pair_file_path, 'a') as write_file:
        for pair in new_pairs:
            new_line = "{0}\t{1}\n".format(pair['qaccn'], pair['saccn'])
            write_file.write(new_line)
    # The output name reuses the part of the path before its first dot.
    # NOTE(review): paths containing a dot in a directory name are cut at
    # that dot -- verify against the callers.
    header = pair_file_path.split(".")[0]
    raw_file = "{0}.raw.with_new.filtered".format(header)
    pairs_to_qa(pair_file_path, 'pair', qbed_file_new, sbed_path_new, raw_file)
def main(cns_file,
         qdups_path,
         sdups_path,
         pair_file,
         fmt,
         qbed,
         sbed,
         qpad,
         spad,
         blast_path,
         unmasked_fasta,
         mask='F',
         ncpu=8):
    """Pick, per duplicate parent pair, the candidate pair with most hits.

    For every (qparent, sparent) returned by ``get_pairs`` that ``skip_pair``
    does not reject, the candidate feature pairs produced by ``get_dups`` are
    compared with bl2seq in batches of ``ncpu`` commands. Each output is
    parsed by ``parse_blast`` and one tuple per candidate is recorded:
    ``(-hit_count, qstart, sstart, qaccn, saccn, flattened_hits)``.

    Pairs whose query or subject parent is in ``rdups`` are collected and
    resolved afterwards through ``best_repeats``; all others are resolved
    immediately by sorting the tuples (most hits first) and calling
    ``write_new_dups``. Finally the ".nolocaldups.bed.local" files and the
    ".raw.filtered.local" qa file are written.

    Args:
        cns_file: path copied with ``make_copy_of_file``; the copy is what
            ``write_new_dups`` receives.
        qdups_path: passed to ``parse_dups`` for the query side.
        sdups_path: passed to ``parse_dups`` for the subject side.
        pair_file: path given to ``get_pairs`` and copied with
            ``make_copy_of_file``.
        fmt: pair-file format, forwarded to ``get_pairs``.
        qbed: query bed object; ``.path``, ``.filename`` and ``.accn()``
            are used here.
        sbed: subject bed object, used like ``qbed``.
        qpad: forwarded to ``get_cmd`` and ``parse_blast``.
        spad: forwarded to ``get_cmd`` and ``parse_blast``.
        blast_path: executable placed at the start of the command line.
        unmasked_fasta: forwarded to ``parse_blast``.
        mask: value of the ``-F`` option in the command line.
        ncpu: worker-pool size and number of commands per batch.
    """
    pool = Pool(ncpu)
    # Command template: blast_path and mask are filled in here; the named
    # %(...)s placeholders are left in place (presumably for get_cmd).
    bl2seq = "%s " % blast_path + \
            "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
            " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
             -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
             | grep -v 'WARNING' | grep -v 'ERROR' "

    try:
        qfastas = get_masked_fastas(qbed)
        # Self-comparison: reuse the query fastas instead of masking twice.
        sfastas = (get_masked_fastas(sbed)
                   if qbed.filename != sbed.filename else qfastas)

        ################# file paths #####################
        qlocaldups_path = qbed.path.split(".")[0] + ".localdups"
        slocaldups_path = sbed.path.split(".")[0] + ".localdups"
        npair_file, nqlocaldups, nslocaldups, ncns_file = [
            make_copy_of_file(src) for src in
            (pair_file, qlocaldups_path, slocaldups_path, cns_file)]
        ##########################################

        qdups = parse_dups(qdups_path)
        sdups = parse_dups(sdups_path)
        dups, rdups = get_pairs(pair_file, fmt, qdups, sdups)
        sys.stdout.write("%s %s\n" % (len(dups), len(rdups)))
        ldups = get_large_dups(dups, qdups, sdups)

        rdups_dic = defaultdict(dict)
        rdups_both = [(qparent, sparent) for qparent, sparent in dups
                      if qparent in rdups and sparent in rdups]
        for (qparent, sparent) in dups:
            if skip_pair(qparent, sparent, rdups, rdups_both, ldups):
                continue
            cnss_size = []
            qfeat_dups = get_all_dups(qdups, qparent)
            sfeat_dups = get_all_dups(sdups, sparent)
            pairs = [True]
            _get_dups_gen = get_dups(qfeat_dups, sfeat_dups, qbed, sbed)

            def get_dups_gen():
                # Next candidate pair, or None once the source is exhausted.
                try:
                    return next(_get_dups_gen)
                except StopIteration:
                    return None

            while any(pairs):
                # One batch of ncpu candidates; trailing entries are None
                # when the generator runs dry inside the batch.
                pairs = [get_dups_gen() for i in range(ncpu)]
                ###this is for parallelization#########
                spad_map = [spad] * len(pairs)
                qpad_map = [qpad] * len(pairs)
                sfastas_map = [sfastas] * len(pairs)
                qfastas_map = [qfastas] * len(pairs)
                bl2seq_map = [bl2seq] * len(pairs)
                ###################################
                # NOTE(review): under Python 2, map() pads the shorter
                # live-pair list with None, so get_cmd is also called with
                # pair=None; the `if c` filter discards falsy results --
                # confirm get_cmd tolerates a None pair.
                cmds = [
                    c for c in map(get_cmd, [l for l in pairs if l],
                                   bl2seq_map, qfastas_map, sfastas_map,
                                   qpad_map, spad_map)
                    if c
                ]
                results = pool.map(commands.getoutput, [c[0] for c in cmds])
                for res, (cmd, qfeat, sfeat) in zip(results, cmds):
                    orient = 1 if qfeat['strand'] == sfeat['strand'] else -1
                    if not res.strip():
                        cnss = []
                    else:
                        cnss = parse_blast(res, orient, qfeat, sfeat, qbed,
                                           sbed, qpad, spad, unmasked_fasta)
                    sys.stderr.write("(%i)\n" % len(cnss))
                    cnss_fmt = ",".join(
                        ",".join(map(str, cns)) for cns in cnss)
                    # Negated count so an ascending sort puts the candidate
                    # with the most hits first.
                    cnss_size.append(
                        (len(cnss) * -1, qfeat["start"], sfeat["start"],
                         qfeat["accn"], sfeat["accn"], cnss_fmt))
                # Keep going only if the last slot of this batch was filled.
                pairs = [pairs[-1]]
            ##################################################################
            if qparent in rdups:
                if (qparent, sparent) in rdups_dic[qparent]:
                    logging.info((qparent, sparent))
                rdups_dic[qparent][(qparent, sparent)] = cnss_size
            elif sparent in rdups:
                if (qparent, sparent) in rdups_dic[sparent]:
                    logging.info((qparent, sparent))
                rdups_dic[sparent][(qparent, sparent)] = cnss_size
            else:
                if not cnss_size:
                    # Nothing was compared for this pair; indexing [0] below
                    # would raise IndexError and abort the whole run.
                    logging.warning("no candidate pairs for %s,%s; skipped",
                                    qparent, sparent)
                    continue
                cnss_size.sort()
                cns_number, _, _, qaccn, saccn, _ = cnss_size[0]
                qfeat = qbed.accn(qaccn)
                sfeat = sbed.accn(saccn)
                sys.stderr.write("FINAL: {0},{1},{2}\n".format(
                    qaccn, saccn, cns_number))
                write_new_dups(npair_file, ncns_file, nqlocaldups,
                               nslocaldups, cnss_size, qparent, sparent,
                               qfeat, sfeat, qdups, sdups)
    finally:
        # The workers are only needed for the bl2seq runs above; release
        # them whether or not the loop finished cleanly.
        pool.close()
        pool.join()

    best_reps = best_repeats(rdups_dic)
    for dparents in best_reps.keys():
        qparent, sparent = dparents
        # NOTE(review): original author's open question -- a single best
        # tuple is wrapped in a list here; confirm write_new_dups expects it.
        best = best_reps[dparents]
        _, _, _, qaccn, saccn, _ = best
        qfeat = qbed.accn(qaccn)
        sfeat = sbed.accn(saccn)
        write_new_dups(npair_file, ncns_file, nqlocaldups, nslocaldups,
                       [best], qparent, sparent, qfeat, sfeat, qdups, sdups)

    qlocal_bed = "{0}.nolocaldups.bed.local".format(qbed.path.split(".")[0])
    slocal_bed = "{0}.nolocaldups.bed.local".format(sbed.path.split(".")[0])
    write_nolocaldups(qbed.path, nqlocaldups, qlocal_bed)
    write_nolocaldups(sbed.path, nslocaldups, slocal_bed)
    # NOTE(review): `options` is not a parameter; this reads a module-level
    # global and fails with NameError if main() is called without it --
    # presumably options.pairs is the same path as pair_file; confirm.
    pairs_to_qa(npair_file, 'pair', qlocal_bed, slocal_bed,
                "{0}.raw.filtered.local".format(options.pairs.split(".")[0]))
# Example #4
# 0
def main(cns_file,qdups_path,sdups_path,pair_file,fmt,qbed,sbed,qpad,spad,blast_path,mask='F',ncpu=8):
    """Pick, per duplicate parent pair, the candidate pair with most hits.

    Each (qparent, sparent) from ``get_pairs`` that passes ``skip_pair`` has
    its candidate feature pairs (from ``get_dups``) run through bl2seq,
    ``ncpu`` commands at a time. ``parse_blast`` turns each output into a
    hit list, and a tuple ``(-hit_count, qstart, sstart, qaccn, saccn,
    flattened_hits)`` is stored per candidate.

    When either parent is in ``rdups`` the tuples are kept for a later
    ``best_repeats`` pass; otherwise they are sorted (most hits first) and
    given to ``write_new_dups`` straight away. The function ends by writing
    the ".all.nolocaldups.bed.local" files and the ".raw.filtered.local"
    qa file.

    Args:
        cns_file: path copied with ``make_copy_of_file``; ``write_new_dups``
            receives the copy.
        qdups_path: passed to ``parse_dups`` (query side).
        sdups_path: passed to ``parse_dups`` (subject side).
        pair_file: path read by ``get_pairs`` and copied with
            ``make_copy_of_file``.
        fmt: pair-file format handed to ``get_pairs``.
        qbed: query bed object; ``.path``, ``.filename``, ``.accn()`` are
            used here.
        sbed: subject bed object, used like ``qbed``.
        qpad: forwarded to ``get_cmd`` and ``parse_blast``.
        spad: forwarded to ``get_cmd`` and ``parse_blast``.
        blast_path: executable that starts the command line.
        mask: value of the ``-F`` option in the command line.
        ncpu: worker-pool size and number of commands per batch.
    """
    pool = Pool(ncpu)
    # Command template: only blast_path and mask are substituted here; the
    # named %(...)s placeholders stay (presumably filled in by get_cmd).
    bl2seq = "%s " % blast_path + \
            "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
            " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
             -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
             | grep -v 'WARNING' | grep -v 'ERROR' "

    try:
        qfastas = get_masked_fastas(qbed)
        # Same file on both sides: share the masked fastas.
        sfastas = get_masked_fastas(sbed) if qbed.filename != sbed.filename else qfastas

        ################# file paths #####################
        qlocaldups_path = qbed.path.split(".")[0] + ".all.localdups"
        slocaldups_path = sbed.path.split(".")[0] + ".all.localdups"
        npair_file, nqlocaldups, nslocaldups, ncns_file = [
            make_copy_of_file(src) for src in
            (pair_file, qlocaldups_path, slocaldups_path, cns_file)]
        ##########################################

        qdups = parse_dups(qdups_path)
        sdups = parse_dups(sdups_path)
        dups, rdups = get_pairs(pair_file, fmt, qdups, sdups)
        sys.stdout.write("%s %s\n" % (len(dups), len(rdups)))
        ldups = get_large_dups(dups, qdups, sdups)

        rdups_dic = defaultdict(dict)
        rdups_both = [(qparent, sparent) for qparent, sparent in dups
                      if qparent in rdups and sparent in rdups]
        for (qparent, sparent) in dups:
            if skip_pair(qparent, sparent, rdups, rdups_both, ldups):
                continue
            cnss_size = []
            qfeat_dups = get_all_dups(qdups, qparent)
            sfeat_dups = get_all_dups(sdups, sparent)
            pairs = [True]
            _get_dups_gen = get_dups(qfeat_dups, sfeat_dups, qbed, sbed)

            def get_dups_gen():
                # Next candidate pair, or None when there are no more.
                try:
                    return next(_get_dups_gen)
                except StopIteration:
                    return None

            while any(pairs):
                # Pull ncpu candidates; slots past the end of the generator
                # hold None.
                pairs = [get_dups_gen() for i in range(ncpu)]
                ###this is for parallelization#########
                spad_map = [spad] * len(pairs)
                qpad_map = [qpad] * len(pairs)
                sfastas_map = [sfastas] * len(pairs)
                qfastas_map = [qfastas] * len(pairs)
                bl2seq_map = [bl2seq] * len(pairs)
                ###################################
                # NOTE(review): Python 2 map() pads the shorter live-pair
                # list with None, so get_cmd also sees pair=None; `if c`
                # drops falsy results -- confirm get_cmd accepts None.
                cmds = [c for c in map(get_cmd, [l for l in pairs if l],
                    bl2seq_map, qfastas_map, sfastas_map, qpad_map, spad_map) if c]
                results = pool.map(commands.getoutput, [c[0] for c in cmds])
                for res, (cmd, qfeat, sfeat) in zip(results, cmds):
                    orient = 1 if qfeat['strand'] == sfeat['strand'] else -1
                    if not res.strip():
                        cnss = []
                    else:
                        cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed, qpad, spad)
                    sys.stderr.write("(%i)\n" % len(cnss))
                    cnss_fmt = ",".join(",".join(map(str, cns)) for cns in cnss)
                    # Count is negated so that sorting ascending ranks the
                    # candidate with the most hits first.
                    cnss_size.append((len(cnss) * -1, qfeat["start"], sfeat["start"],
                                      qfeat["accn"], sfeat["accn"], cnss_fmt))
                # Another batch is fetched only if the last slot was filled.
                pairs = [pairs[-1]]
            ##################################################################
            if qparent in rdups:
                if (qparent, sparent) in rdups_dic[qparent]:
                    logging.info((qparent, sparent))
                rdups_dic[qparent][(qparent, sparent)] = cnss_size
            elif sparent in rdups:
                if (qparent, sparent) in rdups_dic[sparent]:
                    logging.info((qparent, sparent))
                rdups_dic[sparent][(qparent, sparent)] = cnss_size
            else:
                if not cnss_size:
                    # No candidate was compared; cnss_size[0] would raise
                    # IndexError and stop the whole run.
                    logging.warning("no candidate pairs for %s,%s; skipped",
                                    qparent, sparent)
                    continue
                cnss_size.sort()
                cns_number, _, _, qaccn, saccn, _ = cnss_size[0]
                qfeat = qbed.accn(qaccn)
                sfeat = sbed.accn(saccn)
                sys.stderr.write("FINAL: {0},{1},{2}\n".format(qaccn, saccn, cns_number))
                write_new_dups(npair_file, ncns_file, nqlocaldups, nslocaldups,
                               cnss_size, qparent, sparent, qfeat, sfeat, qdups, sdups)
    finally:
        # Workers are needed only for the bl2seq runs; always release them.
        pool.close()
        pool.join()

    best_reps = best_repeats(rdups_dic)
    for dparents in best_reps.keys():
        qparent, sparent = dparents
        # NOTE(review): original author's open question -- one best tuple is
        # wrapped in a list here; confirm write_new_dups expects that.
        best = best_reps[dparents]
        _, _, _, qaccn, saccn, _ = best
        qfeat = qbed.accn(qaccn)
        sfeat = sbed.accn(saccn)
        write_new_dups(npair_file, ncns_file, nqlocaldups, nslocaldups,
                       [best], qparent, sparent, qfeat, sfeat, qdups, sdups)

    qlocal_bed = "{0}.all.nolocaldups.bed.local".format(qbed.path.split(".")[0])
    slocal_bed = "{0}.all.nolocaldups.bed.local".format(sbed.path.split(".")[0])
    write_nolocaldups(qbed.path, nqlocaldups, qlocal_bed)
    write_nolocaldups(sbed.path, nslocaldups, slocal_bed)
    # NOTE(review): `options` is a module-level global, not a parameter;
    # main() raises NameError without it -- presumably options.pairs equals
    # pair_file; confirm before replacing.
    pairs_to_qa(npair_file, 'pair', qlocal_bed, slocal_bed,
                "{0}.raw.filtered.local".format(options.pairs.split(".")[0]))