Example #1
0
def task_scatter_quiver(self):
    """Scatter the quiver step: describe one job per contig that has reads.

    Loads the primary (``self.p_ctg_fa``) and haplotig (``self.h_ctg_fa``)
    contigs.  For every contig id that has a ``<ctg_id>.bam`` in the
    directory holding the ``track_reads_h_done`` sentinel, it

    * creates the work dir ``<cwd>/<ctg_id up to the first '-'>``,
    * writes ``<ctg_id>_ref.fa`` there unless the file already exists,
    * records a job dict with the keys ``ctg_id``, ``ctg_types``,
      ``smrt_bin``, ``sge_option``, ``ref_fasta`` and ``read_bam``.

    The list of jobs is written as JSON to ``self.scattered_quiver_json``.
    Contigs without a BAM file are skipped silently.
    """
    p_ctg_fn = fn(self.p_ctg_fa)
    h_ctg_fn = fn(self.h_ctg_fa)
    out_json = fn(self.scattered_quiver_json)
    track_reads_h_done_fn = fn(self.track_reads_h_done)
    bam_dir = os.path.dirname(track_reads_h_done_fn)
    config = self.parameters['config']

    ref_seq_data = {}
    ctg_types = {}

    # Primary contigs first, haplotigs second: on an id clash the haplotig
    # record overwrites the primary one, exactly as the two separate loops did.
    for fasta_fn, ctg_type in ((p_ctg_fn, 'p'), (h_ctg_fn, 'h')):
        # I think this will crash if the file is empty. Maybe that is ok.
        for r in FastaReader(fasta_fn):
            rid = r.name.split()[0]
            ref_seq_data[rid] = r.sequence
            ctg_types[rid] = ctg_type

    jobs = []
    for ctg_id in sorted(ref_seq_data):
        sequence = ref_seq_data[ctg_id]
        m_ctg_id = ctg_id.split('-')[0]
        wd = os.path.join(os.getcwd(), m_ctg_id)
        ref_fasta = os.path.join(wd, '{ctg_id}_ref.fa'.format(ctg_id=ctg_id))
        read_bam = os.path.join(bam_dir, '{ctg_id}.bam'.format(ctg_id=ctg_id))

        if not os.path.exists(read_bam):
            continue

        # *.sam are created in task_track_reads, fc_select_reads_from_bam.py
        # Network latency should not matter because we have already waited for the 'done' file.
        mkdir(wd)
        if not os.path.exists(ref_fasta):
            # TODO(CD): Up to 50MB of seq data. Should do this on remote host.
            #   See https://github.com/PacificBiosciences/FALCON_unzip/issues/59
            with open(ref_fasta, 'w') as f:
                f.write('>%s\n%s\n' % (ctg_id, sequence))

        jobs.append({
            'ctg_id': ctg_id,
            # NOTE: the complete id -> type map is embedded in every job.
            'ctg_types': ctg_types,
            'smrt_bin': config['smrt_bin'],
            'sge_option': config['sge_quiver'],
            'ref_fasta': ref_fasta,
            'read_bam': read_bam,
        })

    # Close the handle explicitly so the JSON is flushed before returning.
    with open(out_json, 'w') as out:
        out.write(json.dumps(jobs))
Example #2
0
def add_tiling_paths_to_gfa(p_ctg_fasta, a_ctg_fasta,
                            p_ctg_tiling_path, a_ctg_tiling_path,
                            min_p_len, min_a_len, gfa_graph):
    """Add primary and associate contig tiling paths to ``gfa_graph``.

    Primary paths are loaded from ``p_ctg_tiling_path``, filtered with
    ``min_p_len`` and all added.  Associate paths are loaded from
    ``a_ctg_tiling_path``, filtered with ``min_a_len`` and added only when
    their contig id is the name of a record in ``a_ctg_fasta``.

    ``p_ctg_fasta`` is accepted for interface compatibility but is not read.
    """
    # Associate tiling paths are not deduplicated.
    # We need the headers of the final haplotigs to filter
    # out the unnecessary tiling paths.
    # (This scan used to be performed twice; once is sufficient.)
    a_ctg_headers = set(r.name for r in FastaReader(a_ctg_fasta))

    # Load and filter primary contig paths.
    p_paths, _ = load_tiling_paths(p_ctg_tiling_path, 'P')
    _, p_ctg_len = calc_tiling_paths_len(p_paths)
    p_paths = filter_tiling_paths_by_len(p_paths, p_ctg_len, min_p_len)
    for ctg_id, path in p_paths.iteritems():
        gfa_graph.add_tiling_path(path, ctg_id)

    # Load and filter associate contig paths.
    a_paths, _ = load_tiling_paths(a_ctg_tiling_path, 'A')
    _, a_ctg_len = calc_tiling_paths_len(a_paths)
    a_paths = filter_tiling_paths_by_len(a_paths, a_ctg_len, min_a_len)
    for ctg_id, path in a_paths.iteritems():
        if ctg_id in a_ctg_headers:
            gfa_graph.add_tiling_path(path, ctg_id)
Example #3
0
 def build_p_rdb_task(self):
     """Normalize the p-reads into one FASTA file and build the `preads` DB.

     Reads FASTA paths from the FOFN at ``self.pread_fofn``, drops reads
     shorter than ``config["length_cutoff_pr"]`` and writes the rest to
     ``<pread_dir>/preads_norm.fasta`` under serial ``prolog/<n>/0_<len>``
     headers, with the sequence wrapped at 80 columns.  It then runs
     fasta2DB, DBsplit and HPCdaligner inside ``pread_dir`` and finally
     touches ``rdb_build_done``.
     """
     config = self.parameters["config"]
     pread_dir = self.parameters["pread_dir"]
     with open("%s/preads_norm.fasta" % pread_dir, "w") as p_norm:
         c = 0
         with open(fn(self.pread_fofn)) as fofn:
             fa_fns = [line.strip() for line in fofn]
         for fa_fn in fa_fns:
             for r in FastaReader(fa_fn):
                 seq = r.sequence
                 if len(seq) < config["length_cutoff_pr"]:
                     continue
                 p_norm.write(">prolog/%d/%d_%d\n" % (c, 0, len(seq)))
                 # Step by 80 so the final partial chunk is covered by the
                 # loop itself.  The old code emitted the tail with the loop
                 # variable after the loop, which was undefined (or left
                 # over from the previous read) for reads under 80 bp and
                 # produced an empty line for exact multiples of 80.
                 for start in range(0, len(seq), 80):
                     p_norm.write(seq[start:start + 80] + "\n")
                 c += 1
     os.system("cd %s; fasta2DB preads preads_norm.fasta" % pread_dir)
     os.system("cd %s; DBsplit %s preads" %
               (pread_dir, config["ovlp_DBsplit_option"]))
     os.system("cd %s; HPCdaligner %s preads > run_jobs.sh" %
               (pread_dir, config["ovlp_HPCdaligner_option"]))
     os.system("cd %s; touch rdb_build_done" % pread_dir)
Example #4
0
    def build_p_rdb_task(self):
        """Normalize the p-reads, build the `preads` DB and stage it on nodes.

        Reads FASTA paths from the FOFN at ``self.pread_fofn``, drops reads
        shorter than ``config["length_cutoff_pr"]`` and writes the rest to
        ``<pread_dir>/preads_norm.fasta`` under serial ``prolog/<n>/0_<len>``
        headers, wrapped at 80 columns.  It then runs fasta2DB, DBsplit and
        HPCdaligner inside ``pread_dir``, copies the three DB files to
        ``config["tmpdir_for_daligner_input"]`` on every host listed in
        ``config["node_template"]`` over ssh, and touches ``rdb_build_done``.
        """
        config = self.parameters["config"]
        pread_dir = self.parameters["pread_dir"]
        with open("%s/preads_norm.fasta" % pread_dir, "w") as p_norm:
            c = 0
            with open(fn(self.pread_fofn)) as fofn:
                fa_fns = [line.strip() for line in fofn]
            for fa_fn in fa_fns:
                for r in FastaReader(fa_fn):
                    seq = r.sequence
                    if len(seq) < config["length_cutoff_pr"]:
                        continue
                    p_norm.write(">prolog/%d/%d_%d\n" % (c, 0, len(seq)))
                    # Step by 80 so the final partial chunk is covered by
                    # the loop itself.  The old code emitted the tail with
                    # the loop variable after the loop, which was undefined
                    # (or stale) for reads under 80 bp and produced an empty
                    # line for exact multiples of 80.
                    for start in range(0, len(seq), 80):
                        p_norm.write(seq[start:start + 80] + "\n")
                    c += 1
        input_db = os.path.join(pread_dir, "preads.db")
        input_idx = os.path.join(pread_dir, ".preads.idx")
        input_bps = os.path.join(pread_dir, ".preads.bps")
        os.system("cd %s; fasta2DB preads preads_norm.fasta" % pread_dir)
        os.system("cd %s; DBsplit %s preads" % (pread_dir, config["ovlp_DBsplit_option"]))
        os.system("cd %s; HPCdaligner %s preads > run_jobs.sh" % (pread_dir, config["ovlp_HPCdaligner_option"]))
        # Copy the DB files to tmpdir_for_daligner_input, just to reduce the
        # IO burden on the storage node.  '| true' lets a command time out
        # silently in case some nodes are not responsive.
        # (This section was tab-indented; it now uses spaces like the rest.)
        ssh_timeout = config["ssh_timeout"]
        tmpdir = config["tmpdir_for_daligner_input"]
        for node in config["node_template"]:
            os.system("timeout %s ssh -f %s mkdir %s | true" % (ssh_timeout, node, tmpdir))
            for db_file in (input_db, input_idx, input_bps):
                os.system("timeout %s ssh -f %s cp %s %s | true" % (ssh_timeout, node, db_file, tmpdir))
        os.system("cd %s; touch rdb_build_done" % pread_dir)
Example #5
0
    def build_p_rdb_task(self):
        """Normalize the p-reads file by file and add each to the `preads` DB.

        For the k-th FASTA listed in the FOFN at ``self.pread_fofn`` (k
        starts at 1) this writes ``<pread_dir>/preads_norm_<k>.fasta`` and
        runs fasta2DB on it.  A read is kept only if it is at least
        ``config["length_cutoff_pr"]`` long and consists solely of the
        upper-case letters A, C, G and T.  Kept reads get the header
        ``prolog_<k>/<n>/0_<len>`` (``n`` restarts at 0 for every input file)
        and are wrapped at 80 columns.  Afterwards DBsplit and HPCdaligner
        are run in ``pread_dir`` and ``rdb_build_done`` is touched.
        """
        config = self.parameters["config"]
        pread_dir = self.parameters["pread_dir"]
        with open(fn(self.pread_fofn)) as fofn:
            fa_fns = [line.strip() for line in fofn]
        for fa_serial, fa_fn in enumerate(fa_fns, 1):
            c = 0
            with open("%s/preads_norm_%05d.fasta" % (pread_dir, fa_serial), "w") as p_norm:
                for r in FastaReader(fa_fn):
                    seq = r.sequence
                    if len(seq) < config["length_cutoff_pr"]:
                        continue
                    # Ignore reads holding anything but upper-case A/C/G/T.
                    if not set(seq).issubset("ACGT"):
                        continue
                    p_norm.write(">prolog_%05d/%d/%d_%d\n" % (fa_serial, c, 0, len(seq)))
                    # Step by 80 so the final partial chunk is covered by
                    # the loop itself.  The old code emitted the tail with
                    # the loop variable after the loop, which was undefined
                    # (or stale) for reads under 80 bp and produced an empty
                    # line for exact multiples of 80.
                    for start in range(0, len(seq), 80):
                        p_norm.write(seq[start:start + 80] + "\n")
                    c += 1
            os.system("cd %s; fasta2DB preads preads_norm_%05d.fasta" % (pread_dir, fa_serial))

        os.system("cd %s; DBsplit %s preads" % (pread_dir, config["ovlp_DBsplit_option"]))
        os.system("cd %s; HPCdaligner %s preads > run_jobs.sh" % (pread_dir, config["ovlp_HPCdaligner_option"]))
        os.system("cd %s; touch rdb_build_done" % pread_dir)
Example #6
0
def load_sg_seq(all_read_ids, fasta_fn):
    """Return ``{read name: upper-cased sequence}`` for the wanted reads.

    Every record of ``fasta_fn`` is scanned; only records whose full name is
    contained in ``all_read_ids`` are kept.  If a name occurs more than once
    the last record wins.
    """
    return {
        rec.name: rec.sequence.upper()
        for rec in FastaReader(fasta_fn)
        if rec.name in all_read_ids
    }
Example #7
0
def main(*argv):
    ctg_g = nx.DiGraph()
    ctg_path = {}
    with open("p_ctg_tiling_path") as f:
        for row in f:
            row = row.strip().split()
            ctg_id, v, w, edge_rid, b, e = row[:6]
            ctg_path.setdefault(ctg_id, [])
            ctg_path[ctg_id].append((v, w))
            ctg_g.add_edge(v, w)

    padding_read_ids = set()
    for ctg_id in ctg_path:
        left_end = ctg_path[ctg_id][0][0]
        if ctg_g.in_degree(left_end) == 0:
            left_read = left_end.split(":")[0]
            padding_read_ids.add(left_read)

    f = FastaReader("preads4falcon.fasta")
    padding_reads = {}
    for r in f:
        if r.name not in padding_read_ids:
            continue
        else:
            padding_reads[r.name] = r.sequence

    p_ctg_seq = {}
    f = FastaReader("p_ctg.fa")
    for r in f:
        p_id = r.name.split()[0]
        p_ctg_seq[p_id] = r.sequence
        left_end = ctg_path[p_id][0][0]
        left_read, end = left_end.split(":")
        if left_read in padding_reads:
            seq = padding_reads[left_read]
            if end == "B":
                seq = rc_seq(seq)
            print ">" + p_id + "_p"
            print seq + r.sequence
        else:
            print ">" + p_id
            print r.sequence
Example #8
0
def main(argv=sys.argv):
    """Copy ``a_ctg_all.fa`` to ``a_ctg.fa`` without near-duplicate contigs.

    Each record name must consist of exactly nine whitespace-separated
    fields.  A record is dropped when its identity (in percent) exceeds
    ``args.max_idt``, its coverage (in percent) exceeds
    ``args.max_aln_cov`` and the absolute length difference is below
    ``args.min_len_diff``; every other record is written out unchanged.
    """
    args = parse_args(argv)
    reads = FastaReader("a_ctg_all.fa")
    with open("a_ctg.fa", "w") as out:
        for rec in reads:
            (tig_id, v, w, len_, ovl, ne,
             delta_l, idt, cov) = rec.name.split()
            # Keep the `and` chain: later fields are only parsed when the
            # earlier tests pass.
            is_duplicate = (
                100 * float(idt) > args.max_idt
                and 100 * float(cov) > args.max_aln_cov
                and abs(int(delta_l)) < args.min_len_diff
            )
            if not is_duplicate:
                print >>out, ">" + rec.name
                print >>out, rec.sequence
Example #9
0
def load_seqs(fasta_fn, store_only_seq_len):
    """Load a FASTA file into ``{first word of name: (length, sequence)}``.

    When ``store_only_seq_len`` equals ``False`` the upper-cased sequence is
    stored; for any other value the sequence is replaced by the placeholder
    ``'*'`` and only its length is kept.
    """
    reader = FastaReader(fasta_fn)
    # An `==` comparison on purpose: only values equal to False (False, 0)
    # keep the sequence; None or '' do not.
    keep_sequence = (store_only_seq_len == False)
    seqs = {}
    for rec in reader:
        if keep_sequence:
            entry = (len(rec.sequence), rec.sequence.upper())
        else:
            entry = (len(rec.sequence), '*')
        seqs[rec.name.split()[0]] = entry
    return seqs
Example #10
0
def main(argv=None):
    p_ctg_coor_map = {}
    with open("p_ctg_tiling_path") as f:
        for row in f:
            row = row.strip().split()
            ctg_id, v, w, edge_rid, b, e = row[:6]
            if ctg_id not in p_ctg_coor_map:
                coor = 0  # the p_ctg_tiling_path should be sorted by contig the order of the edges in the tiling path
                p_ctg_coor_map[ctg_id] = {}
                p_ctg_coor_map[ctg_id][v] = 0
                coor += abs(int(b) - int(e))
                p_ctg_coor_map[ctg_id][w] = coor
                continue
            else:
                coor += abs(int(b) - int(e))
                p_ctg_coor_map[ctg_id][w] = coor

    a_ctg_fasta = FastaReader("a_ctg.fa")
    for r in a_ctg_fasta:
        rid = r.name.split()
        rid, v, w = rid[:3]
        pid = rid.split("-")[0]
        print rid, p_ctg_coor_map[pid][v], p_ctg_coor_map[pid][w]
Example #11
0
from falcon_kit.FastaReader import FastaReader

import sys

f = FastaReader(sys.argv[1])
rl = set(open(sys.argv[2]).read().split())
for r in f:
    rid = r.name.split()[0]
    if rid not in rl:
        continue
    print ">" + rid
    print r.sequence
Example #12
0
def task_scatter_quiver(self):
    """Scatter the quiver step: list the contigs that have reads to polish.

    Loads the primary (``self.p_ctg_fa``) and haplotig (``self.h_ctg_fa``)
    contigs.  For every contig id that has ``./4-quiver/reads/<ctg_id>.sam``
    under the current working directory, it

    * creates ``./4-quiver/<ctg_id up to the first '-'>``,
    * writes ``<ctg_id>_ref.fa`` there unless the file already exists,
    * records the job ``{'ctg_id': <ctg_id>}``.

    The list of jobs is written as JSON to ``self.scattered_quiver_json``.
    Contigs without a SAM file are skipped silently.
    """
    p_ctg_fn = fn(self.p_ctg_fa)
    h_ctg_fn = fn(self.h_ctg_fa)
    out_json = fn(self.scattered_quiver_json)

    ref_seq_data = {}

    # Primary contigs first, haplotigs second: on an id clash the haplotig
    # sequence overwrites the primary one, exactly as the two loops did.
    for fasta_fn in (p_ctg_fn, h_ctg_fn):
        # I think this will crash if the file is empty. Maybe that is ok.
        for r in FastaReader(fasta_fn):
            ref_seq_data[r.name.split()[0]] = r.sequence

    jobs = []
    for ctg_id in sorted(ref_seq_data):
        sequence = ref_seq_data[ctg_id]
        m_ctg_id = ctg_id.split('-')[0]
        wd = os.path.join(os.getcwd(), './4-quiver/', m_ctg_id)
        ref_fasta = os.path.join(wd, '{ctg_id}_ref.fa'.format(ctg_id=ctg_id))
        read_sam = os.path.join(
            os.getcwd(),
            './4-quiver/reads/{ctg_id}.sam'.format(ctg_id=ctg_id))

        # TODO(CD): Ask Jason what we should do if missing SAM. And what about network latency?
        if not os.path.exists(read_sam):
            continue

        mkdir(wd)
        if not os.path.exists(fn(ref_fasta)):
            with open(fn(ref_fasta), 'w') as f:
                f.write('>%s\n%s\n' % (ctg_id, sequence))
        jobs.append({'ctg_id': ctg_id})

    # Close the handle explicitly so the JSON is flushed before returning.
    with open(out_json, 'w') as out:
        out.write(json.dumps(jobs))
Example #13
0
def fetch_ref_and_reads(base_dir, fofn, ctg_id, out_dir, min_ctg_lenth):
    """Split the references and the raw reads into per-contig FASTA files.

    Inputs are taken from ``<base_dir>/2-asm-falcon`` (``p_ctg.fa`` and the
    ``read_maps`` directory) and from the read FASTA files listed in
    ``fofn``.  Written into ``out_dir``:

    * ``<id>_ref.fa`` for each primary contig that is selected and at least
      ``min_ctg_lenth`` long,
    * ``ctg_list`` with the selected contig ids that have at least 5 mapped
      reads and end in 'F' or 'R',
    * ``<id>_reads.fa`` with the reads assigned to each selected contig, and
      ``unassigned_reads.fa`` for all remaining reads.

    Args:
        base_dir: root directory of the assembly run.
        fofn: path of a file listing the read FASTA files, one per line.
        ctg_id: a single contig id to process, or 'all'.
        out_dir: output directory; ``None`` means ``<base_dir>/3-unzip/reads``.
        min_ctg_lenth: minimum contig length for a contig to be selected.
    """
    read_fofn = fofn
    if out_dir is None:
        out_dir = os.path.join(base_dir, '3-unzip/reads')

    ctg_fa = os.path.join(base_dir, '2-asm-falcon/p_ctg.fa')
    read_map_dir = os.path.join(base_dir, '2-asm-falcon/read_maps')

    rawread_id_file = os.path.join(read_map_dir, 'dump_rawread_ids', 'rawread_ids')
    pread_id_file = os.path.join(read_map_dir, 'dump_pread_ids', 'pread_ids')

    with open(rawread_id_file) as id_file:
        rid_to_oid = id_file.read().split('\n')  # daligner raw read id to the original ids
    with open(pread_id_file) as id_file:
        pid_to_fid = id_file.read().split('\n')  # daligner pread id to the fake ids

    def pid_to_oid(pid):
        """Map a daligner pread id to the original id of its raw read."""
        fid = pid_to_fid[int(pid)]
        # Floor division keeps the Python 2 integer semantics of the old '/'.
        rid = int(fid.split('/')[1]) // 10
        return rid_to_oid[rid]

    all_ctg_ids = set()
    for s in FastaReader(ctg_fa):
        s_id = s.name.split()[0]
        if ctg_id != 'all' and s_id != ctg_id:
            continue

        if len(s.sequence) < min_ctg_lenth:
            continue

        # When a single contig was requested, s_id == ctg_id at this point,
        # so the file can always be named after s_id.
        with open(os.path.join(out_dir, '%s_ref.fa' % s_id), 'w') as ref_out:
            ref_out.write('>%s\n' % s_id)
            ref_out.write(s.sequence + '\n')
        all_ctg_ids.add(s_id)

    read_set = {}      # original read id -> contig id
    ctg_id_hits = {}   # contig id -> number of reads assigned

    map_fn = os.path.join(read_map_dir, 'rawread_to_contigs')
    with open(map_fn, 'r') as f:
        for row in f:
            row = row.strip().split()
            hit_ctg = row[1].split('-')[0]
            if int(row[3]) == 0:
                o_id = rid_to_oid[int(row[0])]
                read_set[o_id] = hit_ctg
                ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1

    map_fn = os.path.join(read_map_dir, 'pread_to_contigs')
    with open(map_fn, 'r') as f:
        for row in f:
            row = row.strip().split()
            hit_ctg = row[1].split('-')[0]
            # NOTE(review): read_set is keyed by read id, yet the test below
            # uses the contig id; possibly `o_id not in read_set` was meant.
            # Behaviour is left unchanged - confirm the intent.
            if hit_ctg not in read_set and int(row[3]) == 0:
                o_id = pid_to_oid(row[0])
                read_set[o_id] = hit_ctg
                ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1

    with open(os.path.join(out_dir, 'ctg_list'), 'w') as f:
        for listed_id in sorted(all_ctg_ids):
            if ctg_id_hits.get(listed_id, 0) < 5:
                continue
            if listed_id[-1] not in ['F', 'R']:  # ignore small circle contigs, they need different approach
                continue
            f.write(listed_id + '\n')

    # Output files are re-opened for every read rather than kept open, so
    # the number of simultaneously open files does not grow with the number
    # of contigs.  The first write truncates, later writes append.
    started = set()
    with open(read_fofn, 'r') as f:
        for r_fn in f:
            r_fn = r_fn.strip()
            for r in FastaReader(r_fn):
                rid = r.name.split()[0]
                target = read_set.get(rid, 'unassigned')
                if target == 'NA' or target not in all_ctg_ids:
                    target = 'unassigned'

                mode = 'a' if target in started else 'w'
                started.add(target)
                with open(os.path.join(out_dir, '%s_reads.fa' % target), mode) as read_out:
                    read_out.write('>' + rid + '\n')
                    read_out.write(r.sequence + '\n')
Example #14
0
                        default="./2-asm-falcon/read_maps",
                        help='path to the read-contig map directory')
    parser.add_argument(
        '--base_dir',
        default="./3-unzip/reads",
        type=str,
        help='the output base_dir, default to current working directory')

    args = parser.parse_args()
    read_fofn = args.fofn
    ctg_fa = args.ctg_fa
    ctg_id = args.ctg_id
    read_map_dir = args.read_map_dir
    base_dir = args.base_dir

    ref_fasta = FastaReader(ctg_fa)
    all_ctg_ids = set()
    for s in ref_fasta:
        s_id = s.name.split()[0]
        if ctg_id != "all" and s_id != ctg_id:
            continue

        if len(s.sequence) < 20000:
            continue
        if ctg_id != "all":
            ref_out = open(os.path.join(base_dir, "%s_ref.fa" % ctg_id), "w")
        else:
            ref_out = open(os.path.join(base_dir, "%s_ref.fa" % s_id), "w")

        print >> ref_out, ">%s" % s_id
        print >> ref_out, s.sequence
# Open the per-category output files; all names share the filename_root prefix.
missing_monomer_file = open(filename_root + "_missing_monomer.fa", 'w')
regular_pattern_file = open(filename_root + "_regularHORs_pattern.txt", 'w')
irregular_pattern_file = \
    open(filename_root + "_irregularHORs_pattern.txt", 'w')
inversions_pattern_file = open(filename_root + "_inversions_pattern.txt", 'w')
stats_file = open(filename_root + "_stats.txt", 'w')
# The stats file starts with the header line.
stats_file.write(header + "\n")

# Print the run parameters to stdout.
print "Average monomer length: ", avg_monomer_len
print "Max head-to-tail distance: ", allowed_max_head_to_tail
print "Shortest read length: ", len_threshold
print "Clustering thresolds: ", identity_thresholds

# IMPORT FASTA FILES #
for r in FastaReader(pread_filename):
    # Load all reads from the pread file into seq_db.
    # seq_db[Read_ID] = sequence
    # Reads shorter than len_threshold are written to the too-short-reads
    # file as two-line FASTA records and are not stored in seq_db.
    if len(r.sequence) < len_threshold:
        too_short_reads_file.write(">" + r.name + "\n" + r.sequence + "\n")
        continue
    seq_db[r.name] = r.sequence

# Load all monomers found in preads into monomer_db.
# monomer_db[Read_ID] = [(start, end), sequence]
for r in FastaReader(inferred_monomer_filename):
    # Parse the read tag.
    # Tag Format: ReadID/RangeStart_RangeEnd/Orientation
    rid, rng, orientation = r.name.split("/")
    # Skip if the read doesn't have any monomers.
    if rid not in seq_db:
Example #16
0
def main(args):
    """Write h_ctg.<ctg_id>.fa, dropping haplotigs that duplicate the primary.

    The contig id is taken from ``sys.argv[1]``; the ``args`` parameter is
    not used.  All files are read from and written to the current directory.

    Steps:
      1. If ``h_ctg_all.<ctg_id>.fa`` exists, align it against
         ``p_ctg.<ctg_id>.fa`` with nucmer and dump the alignments with
         show-coords into ``hp_aln.coor``; otherwise exit with status 0.
      2. Collect into ``filter_out`` the last column of every alignment row
         with q_cov > 99 and idt > 99.9.
      3. Count edges per (b_id, ph_id) pair for each contig listed in
         ``p_ctg_path.<ctg_id>`` and ``h_ctg_path.<ctg_id>``.
      4. Copy the records of ``h_ctg_all.<ctg_id>.fa`` to
         ``h_ctg.<ctg_id>.fa`` and list their names in
         ``h_ctg_ids.<ctg_id>``, skipping records shorter than 500 bases and
         records in ``filter_out`` that have fewer than 5 edges phased
         differently from the primary contig.
    """

    ctg_id = sys.argv[1]

    if os.path.exists("h_ctg_all.{ctg_id}.fa".format(ctg_id=ctg_id)):
        os.system(
            "nucmer -mum p_ctg.{ctg_id}.fa h_ctg_all.{ctg_id}.fa -p hp_aln".
            format(ctg_id=ctg_id))
        os.system("show-coords -T -H -l -c hp_aln.delta > hp_aln.coor")
    else:
        sys.exit(
            0
        )  # it is ok if there is no h_ctg_all.{ctg_id}.fa, we don't want to interrupt the workflow

    if os.path.exists("hp_aln.coor"):
        # Names (last column) of alignments that cover almost the whole
        # query at near-perfect identity.
        filter_out = set()
        with open("hp_aln.coor") as f:
            for row in f:
                row = row.strip().split()
                q_cov = float(row[10])
                idt = float(row[6])
                if q_cov > 99 and idt > 99.9:
                    filter_out.add(row[-1])

        # p_ctg_to_phase[contig][(b_id, ph_id)] = number of path rows;
        # b_id and ph_id are the last two columns of each row.
        p_ctg_to_phase = {}
        with open("p_ctg_path.%s" % ctg_id) as f:
            for row in f:
                row = row.strip().split()
                b_id, ph_id = (int(row[-2]), int(row[-1]))
                p_ctg_to_phase.setdefault(row[0], {})
                p_ctg_to_phase[row[0]].setdefault((b_id, ph_id), 0)
                p_ctg_to_phase[row[0]][(b_id, ph_id)] += 1

        # Same counting for the haplotig paths.
        h_ctg_to_phase = {}
        with open("h_ctg_path.%s" % ctg_id) as f:
            for row in f:
                row = row.strip().split()
                b_id, ph_id = (int(row[-2]), int(row[-1]))
                h_ctg_to_phase.setdefault(row[0], {})
                h_ctg_to_phase[row[0]].setdefault((b_id, ph_id), 0)
                h_ctg_to_phase[row[0]][(b_id, ph_id)] += 1

        h_ids = open("h_ctg_ids.%s" % ctg_id, "w")
        with open("h_ctg.%s.fa" % ctg_id, "w") as f:
            h_tig_all = FastaReader("h_ctg_all.%s.fa" % ctg_id)
            for r in h_tig_all:
                # The primary contig is looked up by the part of the
                # haplotig name before the first "_".
                p_ctg_phase = p_ctg_to_phase.get(r.name.split("_")[0], {})

                # Drop very short haplotigs.
                if len(r.sequence) < 500:
                    continue

                if r.name in filter_out:
                    # NOTE: raises KeyError if r.name has no row in the
                    # h_ctg_path file.
                    edge_count = sum(h_ctg_to_phase[r.name].values())
                    # Edges whose (block, phase) also occurs on the primary
                    # contig; block id -1 is not counted here.
                    same_phase_to_p_ctg_count = 0
                    for b_id, ph_id in h_ctg_to_phase[r.name]:
                        if b_id != -1:
                            if (b_id, ph_id) in p_ctg_phase:
                                same_phase_to_p_ctg_count += h_ctg_to_phase[
                                    r.name][(b_id, ph_id)]
                    # Edges recorded with block -1 / phase 0.
                    unphased_edge_count = h_ctg_to_phase[r.name].get((-1, 0),
                                                                     0)

                    print r.name, edge_count, unphased_edge_count, same_phase_to_p_ctg_count
                    if edge_count - unphased_edge_count - same_phase_to_p_ctg_count < 5:  # fewer than 5 edges carry a phase not seen on the p_ctg: drop this haplotig; with 5 or more such edges it is kept
                        continue

                print >> f, ">" + r.name
                print >> f, r.sequence
                print >> h_ids, r.name
        h_ids.close()
    reads_in_layout = set()
    with open(edge_data_file) as f:
        for l in f:
            l = l.strip().split()
            """001039799:E 000333411:E 000333411 17524 20167 17524 99.62"""
            v, w, rid, s, t, aln_score, idt, type_ = l
            if type_ != "G":
                continue
            r1 = v.split(":")[0]
            reads_in_layout.add(r1)
            r2 = w.split(":")[0]
            reads_in_layout.add(r2)

    seqs = {}
    # load all p-read name into memory
    f = FastaReader(read_fasta)
    for r in f:
        if r.name not in reads_in_layout:
            continue
        seqs[r.name] = r.sequence.upper()


    edge_data = {}
    with open(edge_data_file) as f:
        for l in f:
            l = l.strip().split()
            """001039799:E 000333411:E 000333411 17524 20167 17524 99.62"""
            v, w, rid, s, t, aln_score, idt, type_ = l

            if type_ != "G":
                continue
Example #18
0
                        help='contig identifier in the bam file',
                        required=True)
    parser.add_argument(
        '--base_dir',
        type=str,
        default="./",
        help='the output base_dir, default to current working directory')

    args = parser.parse_args()
    bam_fn = args.bam
    fasta_fn = args.fasta
    ctg_id = args.ctg_id
    base_dir = args.base_dir

    ref_seq = ""
    for r in FastaReader(fasta_fn):
        rid = r.name.split()[0]
        if rid != ctg_id:
            continue
        ref_seq = r.sequence.upper()

    PypeThreadWorkflow.setNumThreadAllowed(1, 1)
    wf = PypeThreadWorkflow()

    bam_file = makePypeLocalFile(bam_fn)
    vmap_file = makePypeLocalFile(os.path.join(base_dir, ctg_id,
                                               "variant_map"))
    vpos_file = makePypeLocalFile(os.path.join(base_dir, ctg_id,
                                               "variant_pos"))
    q_id_map_file = makePypeLocalFile(
        os.path.join(base_dir, ctg_id, "q_id_map"))
Example #19
0
                    dest='min_monomer_length',
                    help='Minimum monomer length')

parser.add_argument('--version', action='version', version='%(prog)s 0.2')

# Parse the command line and pull out the individual settings.
results = parser.parse_args()
in_seq_file = results.fasta_file
hmm_model_fwd = results.hmm_file_fwd
hmm_model_rev = results.hmm_file_rev
mono_len_threshold = results.min_monomer_length
# NOTE(review): monomers_file is not used in the visible part of this
# script; str.replace also rewrites every ".fa" occurrence in the path,
# not only the extension.
monomers_file = in_seq_file.replace(".fa", "_inferred_monomers.fa")

# Run hmmsearch once per orientation (forward and reverse model) against the
# input reads.  Stale outputs are removed first; results go to hmmout{F,R}.tbl
# and hmmout{F,R}.out in the current directory.
os.system(
    "rm -f hmmoutF.tbl hmmoutF.out; hmmsearch --cpu 8 --tblout hmmoutF.tbl -o hmmoutF.out  --notextw %s %s"
    % (hmm_model_fwd, in_seq_file))
os.system(
    "rm -f hmmoutR.tbl hmmoutR.out; hmmsearch --cpu 8 --tblout hmmoutR.tbl -o hmmoutR.out  --notextw %s %s"
    % (hmm_model_rev, in_seq_file))

# Load every input read into memory: seq_db[read name] = sequence.
seq_db = {}
for r in FastaReader(in_seq_file):
    seq = r.sequence
    seq_db[r.name] = seq

# Parse both hmmsearch outputs into temporary files, then concatenate them
# into inferred_monomers.fa and delete the temporaries.
parseHMMout("hmmoutF.out", "inferred_monomers_F.zzz", "F")
parseHMMout("hmmoutR.out", "inferred_monomers_R.zzz", "R")
os.system(
    "cat inferred_monomers_F.zzz inferred_monomers_R.zzz > inferred_monomers.fa; rm inferred_monomers_F.zzz inferred_monomers_R.zzz"
)
Example #20
0
def fetch_ref_and_reads(base_dir, fofn, ctg_id, out_dir, min_ctg_lenth):
    """Split the references and the raw reads into per-contig FASTA files.

    Inputs are taken from ``<base_dir>/2-asm-falcon`` (``p_ctg.fa`` and the
    ``read_maps`` directory) and from the read FASTA files listed in
    ``fofn``.  Written into ``out_dir``:

    * ``<id>_ref.fa`` for each primary contig that is selected and at least
      ``min_ctg_lenth`` long,
    * ``ctg_list`` with the selected contig ids that have at least 5 mapped
      reads and end in "F" or "R",
    * ``<id>_reads.fa`` with the reads assigned to each selected contig, and
      ``unassigned_reads.fa`` for all remaining reads.

    Args:
        base_dir: root directory of the assembly run.
        fofn: path of a file listing the read FASTA files, one per line.
        ctg_id: a single contig id to process, or "all".
        out_dir: output directory; ``None`` means ``<base_dir>/3-unzip/reads``.
        min_ctg_lenth: minimum contig length for a contig to be selected.
    """
    read_fofn = fofn
    if out_dir is None:
        out_dir = os.path.join(base_dir, "3-unzip/reads")

    ctg_fa = os.path.join(base_dir, "2-asm-falcon/p_ctg.fa")
    read_map_dir = os.path.join(base_dir, "2-asm-falcon/read_maps")

    rawread_id_file = os.path.join(read_map_dir, "raw_read_ids")
    pread_id_file = os.path.join(read_map_dir, "pread_ids")

    with open(rawread_id_file) as id_file:
        rid_to_oid = id_file.read().split("\n")  # daligner raw read id to the original ids
    with open(pread_id_file) as id_file:
        pid_to_fid = id_file.read().split("\n")  # daligner pread id to the fake ids

    def pid_to_oid(pid):
        """Map a daligner pread id to the original id of its raw read."""
        fid = pid_to_fid[int(pid)]
        # Floor division keeps the Python 2 integer semantics of the old "/".
        rid = int(fid.split("/")[1]) // 10
        return rid_to_oid[rid]

    all_ctg_ids = set()
    for s in FastaReader(ctg_fa):
        s_id = s.name.split()[0]
        if ctg_id != "all" and s_id != ctg_id:
            continue

        if len(s.sequence) < min_ctg_lenth:
            continue

        # When a single contig was requested, s_id == ctg_id at this point,
        # so the file can always be named after s_id.
        with open(os.path.join(out_dir, "%s_ref.fa" % s_id), "w") as ref_out:
            ref_out.write(">%s\n" % s_id)
            ref_out.write(s.sequence + "\n")
        all_ctg_ids.add(s_id)

    read_set = {}      # original read id -> contig id
    ctg_id_hits = {}   # contig id -> number of reads assigned

    map_fn = os.path.join(read_map_dir, "rawread_to_contigs")
    with open(map_fn, "r") as f:
        for row in f:
            row = row.strip().split()
            hit_ctg = row[1].split("-")[0]
            if int(row[3]) == 0:
                o_id = rid_to_oid[int(row[0])]
                read_set[o_id] = hit_ctg
                ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1

    map_fn = os.path.join(read_map_dir, "pread_to_contigs")
    with open(map_fn, "r") as f:
        for row in f:
            row = row.strip().split()
            hit_ctg = row[1].split("-")[0]
            # NOTE(review): read_set is keyed by read id, yet the test below
            # uses the contig id; possibly `o_id not in read_set` was meant.
            # Behaviour is left unchanged - confirm the intent.
            if hit_ctg not in read_set and int(row[3]) == 0:
                o_id = pid_to_oid(row[0])
                read_set[o_id] = hit_ctg
                ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1

    with open(os.path.join(out_dir, "ctg_list"), "w") as f:
        for listed_id in sorted(all_ctg_ids):
            if ctg_id_hits.get(listed_id, 0) < 5:
                continue
            if listed_id[-1] not in ["F", "R"]:  # ignore small circle contigs, they need different approach
                continue
            f.write(listed_id + "\n")

    # Output files are re-opened for every read rather than kept open, so
    # the number of simultaneously open files does not grow with the number
    # of contigs.  The first write truncates, later writes append.
    started = set()
    with open(read_fofn, "r") as f:
        for r_fn in f:
            r_fn = r_fn.strip()
            for r in FastaReader(r_fn):
                rid = r.name.split()[0]
                target = read_set.get(rid, "unassigned")
                if target == "NA" or target not in all_ctg_ids:
                    target = "unassigned"

                mode = "a" if target in started else "w"
                started.add(target)
                with open(os.path.join(out_dir, "%s_reads.fa" % target), mode) as read_out:
                    read_out.write(">" + rid + "\n")
                    read_out.write(r.sequence + "\n")
Example #21
0
def phasing(args):
    """Run the four-step phasing workflow for a single contig.

    Reads ``args.bam``, ``args.fasta``, ``args.ctg_id`` and ``args.base_dir``.
    Four tasks are registered in order (het calling, association table,
    phased blocks, phased reads); each consumes files produced by an earlier
    one under ``<base_dir>/<ctg_id>/``.  The workflow is limited to one
    thread and is executed by the final ``refreshTargets()`` call.
    """
    bam_path = args.bam
    fasta_path = args.fasta
    contig = args.ctg_id
    out_root = args.base_dir

    # Upper-cased sequence of the record whose first name token equals the
    # contig id.  Stays "" when nothing matches; the last match wins.
    ref_seq = ""
    for record in FastaReader(fasta_path):
        if record.name.split()[0] == contig:
            ref_seq = record.sequence.upper()

    def ctg_file(leaf):
        # All intermediate files sit directly in the contig's directory.
        return makePypeLocalFile(os.path.join(out_root, contig, leaf))

    PypeThreadWorkflow.setNumThreadAllowed(1, 1)
    workflow = PypeThreadWorkflow()

    # Step 1: heterozygous variant calling from the BAM file.
    bam_file = makePypeLocalFile(bam_path)
    vmap_file = ctg_file("variant_map")
    vpos_file = ctg_file("variant_pos")
    q_id_map_file = ctg_file("q_id_map")
    het_call_task = PypeTask(
        inputs={"bam_file": bam_file},
        outputs={
            "vmap_file": vmap_file,
            "vpos_file": vpos_file,
            "q_id_map_file": q_id_map_file,
        },
        parameters={
            "ctg_id": contig,
            "ref_seq": ref_seq,
            "base_dir": out_root,
        },
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/het_call")(make_het_call)
    workflow.addTasks([het_call_task])

    # Step 2: association table built from the variant map.
    atable_file = ctg_file("atable")
    atable_task = PypeTask(
        inputs={"vmap_file": vmap_file},
        outputs={"atable_file": atable_file},
        parameters={"ctg_id": contig, "base_dir": out_root},
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/g_atable")(generate_association_table)
    workflow.addTasks([atable_task])

    # Step 3: phased blocks from the variant map plus association table.
    phased_variant_file = ctg_file("phased_variants")
    blocks_task = PypeTask(
        inputs={"vmap_file": vmap_file, "atable_file": atable_file},
        outputs={"phased_variant_file": phased_variant_file},
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/get_phased_blocks")(get_phased_blocks)
    workflow.addTasks([blocks_task])

    # Step 4: assign reads to phases.
    phased_read_file = ctg_file("phased_reads")
    reads_task = PypeTask(
        inputs={
            "vmap_file": vmap_file,
            "q_id_map_file": q_id_map_file,
            "phased_variant_file": phased_variant_file,
        },
        outputs={"phased_read_file": phased_read_file},
        parameters={"ctg_id": contig},
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/get_phased_reads")(get_phased_reads)
    workflow.addTasks([reads_task])

    workflow.refreshTargets()
Example #22
0
    config = {
        "job_type": job_type,
        "sge_quiver": sge_quiver,
        "smrt_bin": smrt_bin
    }

    support.job_type = "SGE"  #tmp hack until we have a configuration parser

    ctg_ids = []

    PypeThreadWorkflow.setNumThreadAllowed(quiver_concurrent_jobs,
                                           quiver_concurrent_jobs)
    wf = PypeThreadWorkflow()

    ref_seq_data = {}
    p_ctg_fa = FastaReader("./3-unzip/all_p_ctg.fa")
    ctg_types = {}
    for r in p_ctg_fa:
        rid = r.name.split()[0]
        ref_seq_data[rid] = r.sequence
        ctg_types[rid] = "p"

    h_ctg_fa = FastaReader("./3-unzip/all_h_ctg.fa")
    for r in h_ctg_fa:
        rid = r.name.split()[0]
        ref_seq_data[rid] = r.sequence
        ctg_types[rid] = "h"

    ctg_ids = sorted(ref_seq_data.keys())
    p_ctg_out = []
    h_ctg_out = []
Example #23
0
def _emit_pileup(pileup, ref_seq, min_cov, threshold, out_f, before=None):
    """Write and discard finished pileup columns in ascending position order.

    Every column whose position is below ``before`` (all columns when
    ``before`` is None) is passed to ``output_count``; a non-None result is a
    ``(total_count, line)`` pair whose line is written to ``out_f``.  Each
    visited column is removed from ``pileup`` whether or not it produced
    output, so the dict only ever holds the still-open window.
    """
    for pos in sorted(pileup):
        if before is not None and pos >= before:
            break  # sorted order: no later position can qualify either
        out = output_count(pos, pileup[pos].items(), ref_seq[pos], min_cov,
                           threshold)
        if out is not None:
            _total_count, out_line = out
            out_f.write("%s\n" % (out_line,))
        del pileup[pos]


def make_variant_candidates(args):
    """Stream ``samtools view`` output into a pileup and write candidate sites.

    Reads ``bam_file_fn``, ``pm_count_fn``, ``threshold``, ``min_cov``,
    ``ctg_name``, ``samtools`` and ``ref_fasta_fn`` from ``args``.  Alignments
    to ``ctg_name`` are walked through their CIGAR strings and the observed
    base at every aligned reference position is counted.  Columns that lie
    before the start of the alignment just processed are handed to
    ``output_count`` and written to ``pm_count_fn``; the remaining columns are
    flushed after the last alignment.

    Exits with status 1 when ``ref_fasta_fn`` has no record named exactly
    ``ctg_name``.
    """
    bam_file_fn = args.bam_file_fn
    pm_count_fn = args.pm_count_fn
    threshold = args.threshold
    min_cov = args.min_cov
    ctg_name = args.ctg_name
    samtools = args.samtools
    ref_fasta_fn = args.ref_fasta_fn

    # The reference record is selected by exact name match; if several
    # records share the name, the last one is used.
    ref_seq = None
    for r in FastaReader(ref_fasta_fn):
        if r.name == ctg_name:
            ref_seq = r.sequence

    if ref_seq is None:
        sys.stderr.write("Can't get reference sequence\n")
        sys.exit(1)

    # maybe we should check if the samtools path is valid
    p = subprocess.Popen(
        shlex.split("%s view %s" % (samtools, bam_file_fn)),
        stdout=subprocess.PIPE)
    pileup = {}

    try:
        with open(pm_count_fn, "w") as pm_count_f:
            for l in p.stdout:
                l = l.strip().split()
                if not l or l[0][0] == "@":
                    continue  # blank line or SAM header line

                if l[2] != ctg_name:  # RNAME
                    continue

                POS = int(l[3]) - 1  # zero based, to match sequence indexing
                CIGAR = l[5]
                SEQ = l[9]

                ops = [(int(m.group(1)), m.group(2))
                       for m in re.finditer(cigar_re, CIGAR)]
                total_aln_pos = sum(adv for adv, _ in ops)
                skip_base = sum(adv for adv, op in ops if op == "S")

                # if a read is less than 50% aligned, skip
                if 1.0 - 1.0 * skip_base / (total_aln_pos + 1) < 0.50:
                    continue

                rp = POS  # reference cursor
                qp = 0    # query (read) cursor
                for adv, op in ops:
                    if op in ("M", "=", "X"):
                        for _ in range(adv):
                            column = pileup.setdefault(
                                rp, {"A": 0, "C": 0, "G": 0, "T": 0})
                            # NOTE(review): a read base other than A/C/G/T
                            # (e.g. "N") raises KeyError here, as it always
                            # has -- confirm whether it should be skipped.
                            column[SEQ[qp]] += 1
                            rp += 1
                            qp += 1
                    elif op in ("S", "I"):
                        qp += adv  # consumes read bases only
                    elif op == "D":
                        rp += adv  # consumes reference bases only

                # Columns left of this alignment's start are emitted now.
                # NOTE(review): this is only final for those columns if the
                # input is coordinate sorted -- confirm with the caller.
                _emit_pileup(pileup, ref_seq, min_cov, threshold, pm_count_f,
                             before=POS)

            # for the last one: flush whatever is still open
            _emit_pileup(pileup, ref_seq, min_cov, threshold, pm_count_f)
    finally:
        # Reap the child so no zombie process or open pipe is left behind.
        p.stdout.close()
        p.wait()
Example #24
0
def phasing(args):
    """Set up and run the per-contig phasing tasks on a process-watcher workflow.

    Uses ``args.bam``, ``args.fasta``, ``args.ctg_id``, ``args.base_dir`` and
    ``args.samtools``.  The het-call, association-table and phased-block
    outputs each go into their own sub-directory of ``<base_dir>/<ctg_id>/``;
    the phased-read file is written directly into that directory.  At most
    one job runs at a time.
    """
    bam_fn = args.bam
    fasta_fn = args.fasta
    ctg_id = args.ctg_id
    base_dir = args.base_dir
    samtools = args.samtools

    # Upper-cased sequence of the matching FASTA record ("" if none matches).
    ref_seq = ""
    for rec in FastaReader(fasta_fn):
        if rec.name.split()[0] == ctg_id:
            ref_seq = rec.sequence.upper()

    wf = PypeProcWatcherWorkflow(max_jobs=1)
    ctg_dir = os.path.join(base_dir, ctg_id)

    # --- het_call -----------------------------------------------------
    bam_file = makePypeLocalFile(bam_fn)
    het_dir = os.path.join(ctg_dir, 'het_call')
    vmap_file = makePypeLocalFile(os.path.join(het_dir, "variant_map"))
    vpos_file = makePypeLocalFile(os.path.join(het_dir, "variant_pos"))
    q_id_map_file = makePypeLocalFile(os.path.join(het_dir, "q_id_map"))
    het_params = {
        "ctg_id": ctg_id,
        "ref_seq": ref_seq,
        "base_dir": base_dir,
        "samtools": samtools,
    }
    het_outputs = {
        "vmap_file": vmap_file,
        "vpos_file": vpos_file,
        "q_id_map_file": q_id_map_file,
    }
    make_het_call_task = PypeTask(
        inputs={"bam_file": bam_file},
        outputs=het_outputs,
        parameters=het_params,
    )(make_het_call)
    wf.addTasks([make_het_call_task])

    # --- g_atable -----------------------------------------------------
    atable_file = makePypeLocalFile(
        os.path.join(ctg_dir, 'g_atable', "atable"))
    generate_association_table_task = PypeTask(
        inputs={"vmap_file": vmap_file},
        outputs={"atable_file": atable_file},
        parameters={"ctg_id": ctg_id, "base_dir": base_dir},
    )(generate_association_table)
    wf.addTasks([generate_association_table_task])

    # --- get_phased_blocks --------------------------------------------
    phased_variant_file = makePypeLocalFile(
        os.path.join(ctg_dir, 'get_phased_blocks', "phased_variants"))
    get_phased_blocks_task = PypeTask(
        inputs={"vmap_file": vmap_file, "atable_file": atable_file},
        outputs={"phased_variant_file": phased_variant_file},
    )(get_phased_blocks)
    wf.addTasks([get_phased_blocks_task])

    # --- get_phased_reads ---------------------------------------------
    phased_read_file = makePypeLocalFile(
        os.path.join(ctg_dir, "phased_reads"))
    get_phased_reads_task = PypeTask(
        inputs={
            "vmap_file": vmap_file,
            "q_id_map_file": q_id_map_file,
            "phased_variant_file": phased_variant_file,
        },
        outputs={"phased_read_file": phased_read_file},
        parameters={"ctg_id": ctg_id},
    )(get_phased_reads)
    wf.addTasks([get_phased_reads_task])

    wf.refreshTargets()
def output_aln_tensor(args):
    """Print one alignment-count tensor line per candidate position.

    Reads ``bam_file_fn``, ``pm_count_fn``, ``ctg_name``, ``samtools`` and
    ``ref_fasta_fn`` from ``args``.  The first column of every row in
    ``pm_count_fn`` is a candidate position; a window of 8 reference bases on
    either side of it is tracked.  For each alignment reported by
    ``samtools view`` that passes over a window's first base, the aligned
    (ref_pos, query_pos, ref_base, read_base) tuples inside the window are
    collected.  Once an alignment starts beyond a window, that window is
    turned into a line by ``generate_aln_count_tensor`` and printed.

    Windows still pending after the last alignment are printed as well.
    Previously the final loop repeated the in-loop condition
    ``center + 8 < POS``, which could never hold again at that point, so
    those trailing windows were silently dropped.

    Exits with status 1 when ``ref_fasta_fn`` has no record named exactly
    ``ctg_name``.
    """
    bam_file_fn = args.bam_file_fn
    pm_count_fn = args.pm_count_fn
    ctg_name = args.ctg_name
    samtools = args.samtools
    ref_fasta_fn = args.ref_fasta_fn

    # The reference record is selected by exact name match.
    ref_seq = None
    for r in FastaReader(ref_fasta_fn):
        if r.name != ctg_name:
            continue
        ref_seq = r.sequence

    if ref_seq is None:
        sys.stderr.write("Can't get reference sequence\n")
        sys.exit(1)

    # window start -> (window end, window center)
    begin2end = {}
    with open(pm_count_fn) as f:
        for row in f:
            row = row.strip().split()
            pos = int(row[0])
            begin2end[pos - 8] = (pos + 8, pos)

    # maybe we should check if the samtools path is valid
    p = subprocess.Popen(shlex.split("%s view %s" % (samtools, bam_file_fn)),
                         stdout=subprocess.PIPE)

    # center -> list of per-read alignment fragments covering its window
    center_to_aln = {}

    try:
        for l in p.stdout:
            l = l.strip().split()
            if l[0][0] == "@":
                continue

            # NOTE(review): unlike make_variant_candidates, RNAME (l[2]) is
            # not compared with ctg_name here -- confirm the BAM only holds
            # alignments to this contig.
            POS = int(l[3]) - 1  # make it zero base to match sequence index
            CIGAR = l[5]
            SEQ = l[9]
            rp = POS  # reference cursor
            qp = 0    # query (read) cursor

            end_to_center = {}   # windows opened by this read, keyed by end
            active_set = set()   # centers whose window this read is inside

            for m in re.finditer(cigar_re, CIGAR):
                adv = int(m.group(1))
                op = m.group(2)
                if op == "S":
                    qp += adv
                if op in ("M", "=", "X"):
                    for _ in range(adv):
                        # open the window first so its first base is recorded
                        if rp in begin2end:
                            r_end, r_center = begin2end[rp]
                            end_to_center[r_end] = r_center
                            active_set.add(r_center)
                            center_to_aln.setdefault(r_center, [])
                            center_to_aln[r_center].append([])

                        for center in active_set:
                            center_to_aln[center][-1].append(
                                (rp, qp, ref_seq[rp], SEQ[qp]))

                        # close after recording so the last base is included
                        if rp in end_to_center:
                            center = end_to_center[rp]
                            active_set.remove(center)

                        rp += 1
                        qp += 1

                elif op == "I":
                    for _ in range(adv):
                        for center in active_set:
                            center_to_aln[center][-1].append(
                                (rp, qp, "-", SEQ[qp]))
                        qp += 1

                elif op == "D":
                    for _ in range(adv):
                        # For deletions the base is recorded BEFORE a window
                        # starting here is opened (original ordering kept).
                        for center in active_set:
                            center_to_aln[center][-1].append(
                                (rp, qp, ref_seq[rp], "-"))

                        if rp in begin2end:
                            r_end, r_center = begin2end[rp]
                            end_to_center[r_end] = r_center
                            active_set.add(r_center)
                            center_to_aln.setdefault(r_center, [])
                            center_to_aln[r_center].append([])

                        if rp in end_to_center:
                            center = end_to_center[rp]
                            active_set.remove(center)

                        rp += 1

            # Emit windows that end before this alignment starts.  Iterate
            # over a copy of the keys because entries are deleted on the fly.
            for center in list(center_to_aln):
                if center + 8 < POS:
                    t_line = generate_aln_count_tensor(center_to_aln[center],
                                                       center, ref_seq)
                    sys.stdout.write("%s\n" % (t_line,))
                    del center_to_aln[center]
    finally:
        # Reap the child so no zombie process or open pipe is left behind.
        p.stdout.close()
        p.wait()

    # Flush every window that is still pending after the last alignment.
    for center in sorted(center_to_aln):
        t_line = generate_aln_count_tensor(center_to_aln[center], center,
                                           ref_seq)
        sys.stdout.write("%s\n" % (t_line,))
Example #26
0
def main(*argv):
    """Write primary and associated contig sequences and tiling paths.

    Inputs are taken from the names ``edge_data_file``, ``read_fasta``,
    ``utg_data_file`` and ``ctg_data_file``, which are not defined in this
    function; ``argv`` is not used.  Six files are written to the current
    directory: ``p_ctg.fa``, ``a_ctg_all.fa``, ``a_ctg_base.fa`` and the
    matching ``*_tiling_path`` files.
    """
    # Pass 1 over the edge file: collect the ids of all reads that take part
    # in a "G" edge so that only their sequences need to be kept in memory.
    reads_in_layout = set()
    with open(edge_data_file) as f:
        for l in f:
            l = l.strip().split()
            # NOTE(review): the sample row below shows 7 fields, but 8 are
            # unpacked (the last one being the edge type).
            """001039799:E 000333411:E 000333411 17524 20167 17524 99.62"""
            v, w, rid, s, t, aln_score, idt, type_ = l
            if type_ != "G":
                continue
            r1 = v.split(":")[0]
            reads_in_layout.add(r1)
            r2 = w.split(":")[0]
            reads_in_layout.add(r2)

    seqs = {}
    # load all p-read name into memory
    f = FastaReader(read_fasta)
    for r in f:
        if r.name not in reads_in_layout:
            continue
        seqs[r.name] = r.sequence.upper()

    # Pass 2 over the edge file: for every "G" edge store its attributes and
    # the piece of read sequence it contributes, keyed by (v, w).
    edge_data = {}
    with open(edge_data_file) as f:
        for l in f:
            l = l.strip().split()
            """001039799:E 000333411:E 000333411 17524 20167 17524 99.62"""
            v, w, rid, s, t, aln_score, idt, type_ = l

            if type_ != "G":
                continue
            r1 = v.split(":")[0]
            reads_in_layout.add(r1)
            r2 = w.split(":")[0]
            reads_in_layout.add(r2)

            s = int(s)
            t = int(t)
            aln_score = int(aln_score)
            idt = float(idt)

            if s < t:
                e_seq = seqs[rid][s:t]
            else:
                # s >= t: walk the read backwards and map each base through
                # RCMAP (presumably the complement table -- defined elsewhere)
                e_seq = "".join([RCMAP[c] for c in seqs[rid][s:t:-1]])
            edge_data[(v, w)] = (rid, s, t, aln_score, idt, e_seq)

    # Unitigs keyed by (s, v, t).  "simple"/"contained" entries hold a list of
    # nodes split on "~"; "compound" entries hold a list of (s, v, t) tuples
    # referring to other unitigs.
    utg_data = {}
    with open(utg_data_file) as f:
        for l in f:
            l = l.strip().split()
            s, v, t, type_, length, score, path_or_edges = l
            if type_ not in ["compound", "simple", "contained"]:
                continue
            length = int(length)
            score = int(score)
            if type_ in ("simple", "contained"):
                path_or_edges = path_or_edges.split("~")
            else:
                path_or_edges = [
                    tuple(e.split("~")) for e in path_or_edges.split("|")
                ]
            utg_data[(s, v, t)] = type_, length, score, path_or_edges

    p_ctg_out = open("p_ctg.fa", "w")
    a_ctg_out = open("a_ctg_all.fa", "w")
    a_ctg_base_out = open("a_ctg_base.fa", "w")
    p_ctg_t_out = open("p_ctg_tiling_path", "w")
    a_ctg_t_out = open("a_ctg_tiling_path", "w")
    a_ctg_base_t_out = open("a_ctg_base_tiling_path", "w")
    # (start, end) pairs of contigs already written
    layout_ctg = set()

    with open(ctg_data_file) as f:
        for l in f:
            l = l.strip().split()
            ctg_id, c_type_, i_utig, t0, length, score, utgs = l
            ctg_id = ctg_id
            s0 = i_utig.split("~")[0]

            # Skip a contig whose reverse_end()-mapped (end, start) pair has
            # already been laid out.
            if (reverse_end(t0), reverse_end(s0)) in layout_ctg:
                continue
            else:
                layout_ctg.add((s0, t0))

            ctg_label = i_utig + "~" + t0
            length = int(length)
            utgs = utgs.split("|")
            one_path = []  # node path of the primary contig
            total_score = 0
            total_length = 0

            #a_ctg_data = []
            # (s, t) of each compound unitig -> its alternative paths
            a_ctg_group = {}

            for utg in utgs:
                s, v, t = utg.split("~")
                type_, length, score, path_or_edges = utg_data[(s, v, t)]
                total_score += score
                total_length += length
                if type_ == "simple":
                    # drop the first node when appending: it repeats the last
                    # node already in one_path
                    if len(one_path) != 0:
                        one_path.extend(path_or_edges[1:])
                    else:
                        one_path.extend(path_or_edges)
                if type_ == "compound":

                    # Graph of all sub-unitig edges, weighted by alignment
                    # score (edge_data[...][3]).
                    c_graph = nx.DiGraph()

                    all_alt_path = []
                    for ss, vv, tt in path_or_edges:
                        # NOTE: this rebinds type_, length and score of the
                        # enclosing loop; they are not read again before the
                        # next unitig reassigns them.
                        type_, length, score, sub_path = utg_data[(ss, vv, tt)]

                        v1 = sub_path[0]
                        for v2 in sub_path[1:]:
                            c_graph.add_edge(v1,
                                             v2,
                                             e_score=edge_data[(v1, v2)][3])
                            v1 = v2

                    shortest_path = nx.shortest_path(c_graph, s, t, "e_score")
                    score = nx.shortest_path_length(c_graph, s, t, "e_score")
                    all_alt_path.append((score, shortest_path))

                    #a_ctg_data.append( (s, t, shortest_path) ) #first path is the same as the one used in the primary contig
                    # Repeatedly remove the edges of the path just found and
                    # search again, until s and t are no longer connected.
                    while 1:
                        n0 = shortest_path[0]
                        for n1 in shortest_path[1:]:
                            c_graph.remove_edge(n0, n1)
                            n0 = n1
                        try:
                            shortest_path = nx.shortest_path(
                                c_graph, s, t, "e_score")
                            score = nx.shortest_path_length(
                                c_graph, s, t, "e_score")
                            #a_ctg_data.append( (s, t, shortest_path) )
                            all_alt_path.append((score, shortest_path))

                        except nx.exception.NetworkXNoPath:
                            break
                        #if len(shortest_path) < 2:
                        #    break
                    # Highest total score first; that path goes into the
                    # primary contig.
                    all_alt_path.sort()
                    all_alt_path.reverse()
                    shortest_path = all_alt_path[0][1]
                    if len(one_path) != 0:
                        one_path.extend(shortest_path[1:])
                    else:
                        one_path.extend(shortest_path)

                    a_ctg_group[(s, t)] = all_alt_path

            if len(one_path) == 0:
                continue

            one_path_edges = zip(one_path[:-1], one_path[1:])

            # Primary contig: one tiling-path row per edge, then the FASTA
            # record built from the concatenated edge sequences.
            sub_seqs = []
            for vv, ww in one_path_edges:
                rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                sub_seqs.append(e_seq)
                print >> p_ctg_t_out, "%s %s %s %s %d %d %d %0.2f" % (
                    ctg_id, vv, ww, rid, s, t, aln_score, idt)
            print >> p_ctg_out, ">%s %s %s %d %d" % (
                ctg_id, ctg_label, c_type_, total_length, total_score)
            print >> p_ctg_out, "".join(sub_seqs)

            # Associated contigs: one group per compound unitig.
            a_id = 1
            for v, w, in a_ctg_group:
                #get the base sequence used in the primary contig
                #count = len( [x for x in a_ctg_group[ (v, w) ] if len(x[1]) > 3] )
                #if count < 2:
                #    continue
                atig_output = []

                # Entry 0 is the path that was used in the primary contig.
                score, atig_path = a_ctg_group[(v, w)][0]
                atig_path_edges = zip(atig_path[:-1], atig_path[1:])
                sub_seqs = []
                total_length = 0
                total_score = 0
                for vv, ww in atig_path_edges:
                    rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                    sub_seqs.append(e_seq)
                    total_length += abs(s - t)
                    total_score += aln_score

                base_seq = "".join(sub_seqs)
                atig_output.append((v, w, atig_path, total_length, total_score,
                                    base_seq, atig_path_edges, 0, 1, 1))

                # Remaining entries are the alternative paths.
                for score, atig_path in a_ctg_group[(v, w)][1:]:
                    atig_path_edges = zip(atig_path[:-1], atig_path[1:])
                    sub_seqs = []
                    total_length = 0
                    total_score = 0
                    for vv, ww in atig_path_edges:
                        rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                        sub_seqs.append(e_seq)
                        total_length += abs(s - t)
                        total_score += aln_score

                    seq = "".join(sub_seqs)

                    delta_len = len(seq) - len(base_seq)
                    idt = 0.0
                    cov = 0.0
                    # Identity and coverage against the base sequence are
                    # only computed when both sequences exceed 2000 bases;
                    # otherwise they stay 0.0.
                    if len(base_seq) > 2000 and len(seq) > 2000:
                        aln_data, x, y = get_aln_data(base_seq, seq)
                        if len(aln_data) != 0:
                            idt = 1.0 - 1.0 * aln_data[-1][-1] / aln_data[-1][
                                -2]
                            cov = 1.0 * (aln_data[-1][3] -
                                         aln_data[-1][2]) / aln_data[-1][4]

                    atig_output.append(
                        (v, w, atig_path, total_length, total_score, seq,
                         atig_path_edges, delta_len, idt, cov))

                # Only the base path exists: nothing to report.
                if len(atig_output) == 1:
                    continue

                # sub_id 0 (the base path) goes to the "base" files, all
                # other sub_ids go to the a_ctg files.
                sub_id = 0
                for data in atig_output:
                    v0, w0, tig_path, total_length, total_score, seq, atig_path_edges, delta_len, a_idt, cov = data
                    for vv, ww in atig_path_edges:
                        rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                        if sub_id != 0:
                            print >> a_ctg_t_out, "%s-%03d-%02d %s %s %s %d %d %d %0.2f" % (
                                ctg_id, a_id, sub_id, vv, ww, rid, s, t,
                                aln_score, idt)
                        else:
                            print >> a_ctg_base_t_out, "%s-%03d-%02d %s %s %s %d %d %d %0.2f" % (
                                ctg_id, a_id, sub_id, vv, ww, rid, s, t,
                                aln_score, idt)

                    if sub_id != 0:
                        print >> a_ctg_out, ">%s-%03d-%02d %s %s %d %d %d %d %0.2f %0.2f" % (
                            ctg_id, a_id, sub_id,
                            v0, w0, total_length, total_score,
                            len(atig_path_edges), delta_len, a_idt, cov)
                        print >> a_ctg_out, seq
                    else:
                        print >> a_ctg_base_out, ">%s-%03d-%02d %s %s %d %d %d %d %0.2f %0.2f" % (
                            ctg_id, a_id, sub_id,
                            v0, w0, total_length, total_score,
                            len(atig_path_edges), delta_len, a_idt, cov)
                        print >> a_ctg_base_out, seq

                    sub_id += 1

                a_id += 1

    # NOTE(review): a_ctg_t_out is closed twice below; the second call is
    # redundant.
    a_ctg_out.close()
    a_ctg_base_out.close()
    p_ctg_out.close()
    a_ctg_t_out.close()
    a_ctg_base_t_out.close()
    a_ctg_t_out.close()
    p_ctg_t_out.close()
Example #27
0
def main(argv=sys.argv):
    """Track reads, run quiver per contig and concatenate the consensus output.

    Args:
        argv: Command-line vector; ``argv[1]`` is the path of the
            configuration file.  Defaults to ``sys.argv``.  (The parameter
            used to be ignored in favour of ``sys.argv``.)

    Optional settings are read from the ``General`` section (``job_type``)
    and the ``Unzip`` section (``sge_track_reads``, ``sge_quiver``,
    ``smrt_bin``, ``input_bam_fofn``, ``quiver_concurrent_jobs``); each has a
    built-in default.  One quiver task is scheduled for every contig in
    ``./3-unzip/all_p_ctg.fa`` / ``./3-unzip/all_h_ctg.fa`` that has a
    ``./4-quiver/reads/<ctg_id>.sam`` file.  The gzipped consensus files are
    then concatenated into ``./4-quiver/cns_output/``.

    Exits with status 1 when no configuration file is given.
    """
    global fc_run_logger
    fc_run_logger = support.setup_logger(None)

    if len(argv) < 2:
        sys.stdout.write(
            "you need to provide a configuration file to specific a couple cluster running environment\n"
        )
        sys.exit(1)

    config_fn = argv[1]

    parser = ConfigParser.ConfigParser()
    parser.read(config_fn)

    def option(section, key, default):
        # Value from the config file if present, else the built-in default.
        if parser.has_option(section, key):
            return parser.get(section, key)
        return default

    job_type = option('General', 'job_type', "SGE")
    sge_track_reads = option('Unzip', 'sge_track_reads',
                             " -pe smp 12 -q bigmem")
    sge_quiver = option('Unzip', 'sge_quiver', " -pe smp 24 -q bigmem ")
    smrt_bin = option(
        'Unzip', 'smrt_bin',
        "/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/"
    )
    input_bam_fofn = option('Unzip', 'input_bam_fofn', "input_bam.fofn")

    quiver_concurrent_jobs = 8
    if parser.has_option('Unzip', 'quiver_concurrent_jobs'):
        quiver_concurrent_jobs = parser.getint('Unzip',
                                               'quiver_concurrent_jobs')

    # Plain dict handed to the tasks (kept separate from the parser object).
    config = {
        "job_type": job_type,
        "sge_quiver": sge_quiver,
        "sge_track_reads": sge_track_reads,
        "input_bam_fofn": input_bam_fofn,
        "smrt_bin": smrt_bin
    }

    support.job_type = "SGE"  #tmp hack until we have a configuration parser

    PypeThreadWorkflow.setNumThreadAllowed(quiver_concurrent_jobs,
                                           quiver_concurrent_jobs)
    wf = PypeThreadWorkflow()

    # Stage 1: read tracking, run to completion before anything else.
    parameters = {"wd": os.path.abspath("."), "config": config}
    hasm_done = makePypeLocalFile("./3-unzip/1-hasm/hasm_done")
    job_done = makePypeLocalFile(
        os.path.join(parameters["wd"], "track_reads_h_done"))
    make_track_reads_task = PypeTask(inputs={"hasm_done": hasm_done},
                                     outputs={"job_done": job_done},
                                     parameters=parameters,
                                     TaskType=PypeThreadTaskBase,
                                     URL="task://localhost/track_reads_h")
    track_reads_task = make_track_reads_task(task_track_reads)

    wf.addTask(track_reads_task)
    wf.refreshTargets()  #force refresh now, will put proper dependence later

    # Stage 2: load every primary ("p") and haplotig ("h") contig sequence.
    ref_seq_data = {}
    ctg_types = {}
    for fasta_fn, ctg_type in (("./3-unzip/all_p_ctg.fa", "p"),
                               ("./3-unzip/all_h_ctg.fa", "h")):
        for r in FastaReader(fasta_fn):
            rid = r.name.split()[0]
            ref_seq_data[rid] = r.sequence
            ctg_types[rid] = ctg_type

    ctg_ids = sorted(ref_seq_data.keys())
    p_ctg_out = []
    h_ctg_out = []
    for ctg_id in ctg_ids:
        sequence = ref_seq_data[ctg_id]
        # the working directory is named after the part before the first "-"
        m_ctg_id = ctg_id.split("-")[0]
        wd = os.path.join(os.getcwd(), "./4-quiver/", m_ctg_id)
        mkdir(wd)
        ref_fasta = makePypeLocalFile(
            os.path.join(wd, "{ctg_id}_ref.fa".format(ctg_id=ctg_id)))
        read_sam = makePypeLocalFile(
            os.path.join(
                os.getcwd(),
                "./4-quiver/reads/{ctg_id}.sam".format(ctg_id=ctg_id)))
        cns_fasta = makePypeLocalFile(
            os.path.join(wd, "cns-{ctg_id}.fasta.gz".format(ctg_id=ctg_id)))
        cns_fastq = makePypeLocalFile(
            os.path.join(wd, "cns-{ctg_id}.fastq.gz".format(ctg_id=ctg_id)))
        job_done = makePypeLocalFile(
            os.path.join(wd, "{ctg_id}_quiver_done".format(ctg_id=ctg_id)))

        # Contigs without a reads file get no quiver task.
        if not os.path.exists(fn(read_sam)):
            continue

        if ctg_types[ctg_id] == "p":
            p_ctg_out.append((cns_fasta, cns_fastq))
        if ctg_types[ctg_id] == "h":
            h_ctg_out.append((cns_fasta, cns_fastq))
        if not os.path.exists(fn(ref_fasta)):
            with open(fn(ref_fasta), "w") as f:
                f.write(">" + ctg_id + "\n")
                f.write(sequence + "\n")
        parameters = {
            "job_uid": "q-" + ctg_id,
            "wd": wd,
            "config": config,
            "ctg_id": ctg_id
        }
        make_quiver_task = PypeTask(
            inputs={
                "ref_fasta": ref_fasta,
                "read_sam": read_sam
            },
            outputs={
                "cns_fasta": cns_fasta,
                "cns_fastq": cns_fastq,
                "job_done": job_done
            },
            parameters=parameters,
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/q_{ctg_id}".format(ctg_id=ctg_id))
        quiver_task = make_quiver_task(task_run_quiver)
        wf.addTask(quiver_task)

    wf.refreshTargets()
    # NOTE(review): presumably gives a shared filesystem time to settle
    # before the outputs are read -- confirm it is still needed.
    os.system("sleep 30")

    # Stage 3: concatenate the per-contig consensus files.
    mkdir("./4-quiver/cns_output")

    def start_fresh(path):
        # The zcat calls below append, so stale output must be removed.
        # Checking first avoids the "rm: cannot remove" noise on a clean run.
        if os.path.lexists(path):
            os.remove(path)

    for label, outputs in (("p", p_ctg_out), ("h", h_ctg_out)):
        merged_fasta = "./4-quiver/cns_output/cns_%s_ctg.fasta" % label
        merged_fastq = "./4-quiver/cns_output/cns_%s_ctg.fastq" % label
        start_fresh(merged_fasta)
        start_fresh(merged_fastq)
        # NOTE(review): the paths are pasted into a shell command unquoted;
        # fine for the generated names used here, unsafe for arbitrary ones.
        for cns_fasta, cns_fastq in sorted(outputs):
            os.system("zcat {cns_fasta} >> {merged}".format(
                cns_fasta=fn(cns_fasta), merged=merged_fasta))
            os.system("zcat {cns_fastq} >> {merged}".format(
                cns_fastq=fn(cns_fastq), merged=merged_fastq))
            o2 = "-"
        overlap_length = int(edge_data[1])
        overlap_idt = float(edge_data[2])
        ctg_id = edge_to_ctg.get((v, w), ("NA", "NA"))
        link_lines.append("\t".join([
            "L", r1, o1, r2, o2, "*",
            "ol:i:%d" % overlap_length,
            "oi:f:%.1f" % overlap_idt,
            "ci:A:%s-%s" % ctg_id
        ]))

#f = FastaReader("../1-preads_ovl/db2falcon/preads4falcon.fasta")

# The preads FASTA lives in a different place depending on the FALCON
# release; try each known location in turn.
f = None
for preads_fn in (
        "../1-preads_ovl/db2falcon/preads4falcon.fasta",  # Works with v1.7.5
        "../1-preads_ovl/preads4falcon.fasta",  # Works with v1.8.2
):
    try:
        f = FastaReader(preads_fn)
        break
    except Exception:
        # Not a bare "except:", so Ctrl-C and SystemExit still propagate.
        continue
if f is None:
    # Stop here with the explanation instead of failing later when "f" is
    # iterated.
    raise SystemExit(
        "Bummer, this code does not work with your version of FALCON.")

seq_len = {}
my_seq = {}
for r in f:
    if r.name not in read_in_graph:
        continue
    seq_len[r.name] = len(r.sequence)

    # Store the sequences in a dictionary
Example #29
0
            p_ctg_to_phase[row[0]].setdefault( ( b_id, ph_id ), 0)
            p_ctg_to_phase[row[0]][ ( b_id, ph_id ) ] += 1


    h_ctg_to_phase = {}
    with open("h_ctg_path.%s" % ctg_id) as f:
        for row in f:
            row = row.strip().split()
            b_id, ph_id = (int(row[-2]), int(row[-1]) )
            h_ctg_to_phase.setdefault( row[0], {} )
            h_ctg_to_phase[row[0]].setdefault( ( b_id, ph_id ), 0)
            h_ctg_to_phase[row[0]][ ( b_id, ph_id ) ] += 1

    h_ids = open("h_ctg_ids.%s" % ctg_id,"w")
    with open("h_ctg.%s.fa" % ctg_id, "w") as f:
        h_tig_all = FastaReader("h_ctg_all.%s.fa" % ctg_id)
        for r in h_tig_all:
            p_ctg_phase = p_ctg_to_phase.get(r.name.split("_")[0], {})

            if len(r.sequence) < 500:
                continue

            if r.name in filter_out:
                edge_count = sum(h_ctg_to_phase[ r.name ].values())
                same_phase_to_p_ctg_count = 0
                for  b_id, ph_id in h_ctg_to_phase[ r.name ]:
                    if b_id != -1:
                        if (b_id, ph_id) in p_ctg_phase:
                            same_phase_to_p_ctg_count += h_ctg_to_phase[ r.name ][ (b_id, ph_id) ]
                unphased_edge_count = h_ctg_to_phase[ r.name ] .get( (-1, 0), 0 )
Example #30
0
# Jason Chin
# Pacific Biosciences
# 2016

from falcon_kit.FastaReader import FastaReader
import os


# primary contig name -> (sequence, [(haplotig name, haplotig sequence), ...])
primary_contigs = {}
f = FastaReader("cns_p_ctg.fasta")
for r in f:
    rname = r.name.split("|")[0]
    # the first record seen for a name is kept; later duplicates are ignored
    if rname not in primary_contigs:
        primary_contigs[rname] = (r.sequence, [])
# "NA" collects haplotigs whose primary contig is not in the file
if "NA" not in primary_contigs:
    primary_contigs["NA"] = ("", [])

all_h_ctg = set()
f = FastaReader("cns_h_ctg.fasta")
for r in f:
    rname = r.name.split("|")[0]
    # a haplotig's primary contig is the part of its name before the first "_"
    p_name = rname.split("_")[0]
    all_h_ctg.add(rname)
    if p_name not in primary_contigs:
        primary_contigs["NA"][1].append((rname, r.sequence))
        continue
    primary_contigs[p_name][1].append((rname, r.sequence))


data = []
place_h_ctg = set()
for ctg in primary_contigs: