Esempio n. 1
0
 def test_parse_balse(self):
     """Run ``parse_blast`` on the fixture data stored on ``self`` and print the result.

     This is a smoke test: whatever ``parse_blast`` returns is printed and
     nothing is asserted about it.
     """
     # Positional arguments, in call order: the BLAST text, an orientation of
     # -1, the query/subject features, the query/subject beds, the literals
     # 12000 and 26000 (presumably query/subject padding -- confirm against
     # parse_blast's signature) and the unmasked fasta.
     blast_args = (
         self.blast_str, -1, self.qfeat, self.sfeat, self.qbed, self.sbed,
         12000, 26000, self.unmasked_fasta,
     )
     print(parse_blast(*blast_args))
Esempio n. 2
0
def _collect_cnss(pool, dup_pairs, bl2seq, qfastas, sfastas, qbed, sbed, qpad,
                  spad, unmasked_fasta, ncpu):
    """Blast every feature pair produced by *dup_pairs* and summarise the hits.

    Pairs are consumed ``ncpu`` at a time so that a single ``pool.map`` call
    can run one shell command per worker.

    Args:
        pool: worker pool used to run the shell commands.
        dup_pairs: iterator of feature pairs; falsy items are ignored.
        bl2seq: command template forwarded to ``get_cmd``.
        qfastas, sfastas, qpad, spad: forwarded unchanged to ``get_cmd``.
        qbed, sbed, unmasked_fasta: forwarded unchanged to ``parse_blast``
            (together with ``qpad``/``spad``).
        ncpu: number of pairs pulled from *dup_pairs* per batch.

    Returns:
        A list with one tuple per executed command:
        ``(-len(cnss), qfeat["start"], sfeat["start"], qfeat["accn"],
        sfeat["accn"], cnss_fmt)`` where ``cnss_fmt`` is every value of every
        CNS joined with commas.  The count is negated so that an ascending
        sort puts the pair with the most CNSs first.
    """
    from itertools import islice

    cnss_size = []
    while True:
        batch = [pair for pair in islice(dup_pairs, ncpu) if pair]
        if not batch:
            break

        # get_cmd yields (command, qfeat, sfeat) or a falsy value, which is
        # dropped.  It is only called for real pairs, so there is no reliance
        # on Python 2's ``map`` padding short argument lists with None.
        cmds = []
        for pair in batch:
            cmd = get_cmd(pair, bl2seq, qfastas, sfastas, qpad, spad)
            if cmd:
                cmds.append(cmd)

        outputs = pool.map(commands.getoutput, [cmd[0] for cmd in cmds])
        for res, (_, qfeat, sfeat) in zip(outputs, cmds):
            orient = 1 if qfeat['strand'] == sfeat['strand'] else -1
            if res.strip():
                cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed,
                                   qpad, spad, unmasked_fasta)
            else:
                # Nothing but whitespace came back: no hits to parse.
                cnss = []
            sys.stderr.write("(%i)\n" % len(cnss))
            cnss_fmt = ",".join(",".join(map(str, cns)) for cns in cnss)
            cnss_size.append(
                (-len(cnss), qfeat["start"], sfeat["start"],
                 qfeat["accn"], sfeat["accn"], cnss_fmt))
    return cnss_size


def main(cns_file,
         qdups_path,
         sdups_path,
         pair_file,
         fmt,
         qbed,
         sbed,
         qpad,
         spad,
         blast_path,
         unmasked_fasta,
         mask='F',
         ncpu=8):
    """Pick, for each parent pair, the local-duplicate combination with the most CNSs.

    For every ``(qparent, sparent)`` in the pairs file that ``skip_pair`` does
    not reject, all duplicate combinations are blasted against each other and
    the outcome is written through ``write_new_dups``.  Pairs whose query or
    subject parent is listed in ``rdups`` are deferred and resolved together
    through ``best_repeats`` after the main loop.  Finally the
    ``*.nolocaldups.bed.local`` files and the ``*.raw.filtered.local`` pairs
    file are produced.

    Args:
        cns_file: path copied with ``make_copy_of_file``; the copy is handed
            to ``write_new_dups``.
        qdups_path, sdups_path: inputs for ``parse_dups``.
        pair_file: pairs file; read by ``get_pairs``, copied with
            ``make_copy_of_file`` and used to name the final pairs output.
        fmt: pair-file format forwarded to ``get_pairs``.
        qbed, sbed: bed objects; ``.filename``, ``.path`` and ``.accn()`` are
            used here.
        qpad, spad: forwarded to ``get_cmd`` and ``parse_blast``.
        blast_path: executable placed at the front of the bl2seq command.
        unmasked_fasta: forwarded to ``parse_blast``.
        mask: value of the ``-F`` option in the bl2seq command.
        ncpu: worker-pool size and number of commands run per batch.
    """
    pool = Pool(ncpu)
    # The %(name)s placeholders are left unexpanded here and travel with the
    # template into get_cmd.  The continuation lines sit inside the string
    # literal, so their leading whitespace is part of the command: keep it.
    bl2seq = "%s " % blast_path + \
            "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
            " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
             -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
             | grep -v 'WARNING' | grep -v 'ERROR' "

    try:
        qfastas = get_masked_fastas(qbed)
        # Reuse the query fastas when both beds come from the same file.
        sfastas = (get_masked_fastas(sbed)
                   if qbed.filename != sbed.filename else qfastas)

        # NOTE(review): split(".")[0] cuts at the first dot anywhere in the
        # path (directory names included); left as is so that output file
        # names do not change.
        qstem = qbed.path.split(".")[0]
        sstem = sbed.path.split(".")[0]
        qnolocal_bed = qstem + ".nolocaldups.bed.local"
        snolocal_bed = sstem + ".nolocaldups.bed.local"
        npair_file, nqlocaldups, nslocaldups, ncns_file = [
            make_copy_of_file(path)
            for path in (pair_file, qstem + ".localdups",
                         sstem + ".localdups", cns_file)
        ]

        qdups = parse_dups(qdups_path)
        sdups = parse_dups(sdups_path)
        dups, rdups = get_pairs(pair_file, fmt, qdups, sdups)
        print("%d %d" % (len(dups), len(rdups)))
        ldups = get_large_dups(dups, qdups, sdups)

        rdups_dic = defaultdict(dict)
        rdups_both = [(qparent, sparent) for qparent, sparent in dups
                      if qparent in rdups and sparent in rdups]

        for qparent, sparent in dups:
            if skip_pair(qparent, sparent, rdups, rdups_both, ldups):
                continue

            dup_pairs = get_dups(get_all_dups(qdups, qparent),
                                 get_all_dups(sdups, sparent), qbed, sbed)
            cnss_size = _collect_cnss(pool, dup_pairs, bl2seq, qfastas,
                                      sfastas, qbed, sbed, qpad, spad,
                                      unmasked_fasta, ncpu)

            parents = (qparent, sparent)
            if qparent in rdups or sparent in rdups:
                # Deferred: filed under the query parent when it is in rdups,
                # otherwise under the subject parent.
                repeat = qparent if qparent in rdups else sparent
                if parents in rdups_dic[repeat]:
                    logging.info(parents)
                rdups_dic[repeat][parents] = cnss_size
                continue

            if not cnss_size:
                # No command was run for this pair, so there is no best
                # candidate to unpack below.
                logging.warning("no blast results for %s, %s; pair skipped",
                                qparent, sparent)
                continue

            cnss_size.sort()
            cns_number, _, _, qaccn, saccn, _ = cnss_size[0]
            qfeat = qbed.accn(qaccn)
            sfeat = sbed.accn(saccn)
            sys.stderr.write("FINAL: {0},{1},{2}\n".format(
                qaccn, saccn, cns_number))
            write_new_dups(npair_file, ncns_file, nqlocaldups, nslocaldups,
                           cnss_size, qparent, sparent, qfeat, sfeat, qdups,
                           sdups)
    finally:
        # Every pool.map call above is synchronous, so no task is in flight
        # here; shut the workers down instead of leaving them to linger.
        pool.close()
        pool.join()

    for (qparent, sparent), best in best_repeats(rdups_dic).items():
        _, _, _, qaccn, saccn, _ = best
        qfeat = qbed.accn(qaccn)
        sfeat = sbed.accn(saccn)
        write_new_dups(npair_file, ncns_file, nqlocaldups, nslocaldups,
                       [best], qparent, sparent, qfeat, sfeat, qdups, sdups)

    write_nolocaldups(qbed.path, nqlocaldups, qnolocal_bed)
    write_nolocaldups(sbed.path, nslocaldups, snolocal_bed)
    # Name the output after the pair_file argument; the previous code read an
    # undefined global ``options`` here.
    pairs_to_qa(npair_file, 'pair', qnolocal_bed, snolocal_bed,
                "{0}.raw.filtered.local".format(pair_file.split(".")[0]))
Esempio n. 3
0
 def test_parse_balse(self):
     """Smoke-test ``parse_blast`` with the fixture attributes held on ``self``.

     The return value is only printed; the test makes no assertions.
     """
     orientation = -1  # value passed as parse_blast's second argument
     parsed = parse_blast(
         self.blast_str,
         orientation,
         self.qfeat,
         self.sfeat,
         self.qbed,
         self.sbed,
         12000,
         26000,
         self.unmasked_fasta,
     )
     print(parsed)