def task_scatter_quiver(self):
    """Scatter quiver work: describe one job per contig that has a read BAM.

    Loads every record of the primary-contig and haplotig FASTA files,
    then, for each contig id whose ``<ctg_id>.bam`` exists in the directory
    holding the ``track_reads_h_done`` file, makes the contig work directory,
    writes a single-record reference FASTA there (unless it already exists)
    and appends a job description.  The job list is written as JSON to
    ``self.scattered_quiver_json``.
    """
    p_ctg_fn = fn(self.p_ctg_fa)
    h_ctg_fn = fn(self.h_ctg_fa)
    out_json = fn(self.scattered_quiver_json)
    track_reads_h_done_fn = fn(self.track_reads_h_done)
    bam_dir = os.path.dirname(track_reads_h_done_fn)
    config = self.parameters['config']

    ref_seq_data = {}
    ctg_types = {}
    # Primary contigs are loaded first, so a haplotig with the same id
    # replaces the primary entry.
    # I think this will crash if the file is empty. Maybe that is ok.
    for ctg_fn, ctg_type in ((p_ctg_fn, 'p'), (h_ctg_fn, 'h')):
        for r in FastaReader(ctg_fn):
            rid = r.name.split()[0]
            ref_seq_data[rid] = r.sequence
            ctg_types[rid] = ctg_type

    jobs = []
    for ctg_id in sorted(ref_seq_data):
        sequence = ref_seq_data[ctg_id]
        # Contigs sharing the prefix before the first '-' share a directory.
        m_ctg_id = ctg_id.split('-')[0]
        wd = os.path.join(os.getcwd(), m_ctg_id)
        ref_fasta = os.path.join(wd, '{ctg_id}_ref.fa'.format(ctg_id=ctg_id))
        read_bam = os.path.join(bam_dir, '{ctg_id}.bam'.format(ctg_id=ctg_id))
        if not os.path.exists(read_bam):
            continue
        # *.sam are created in task_track_reads, fc_select_reads_from_bam.py
        # Network latency should not matter because we have already waited
        # for the 'done' file.
        mkdir(wd)
        if not os.path.exists(ref_fasta):
            # TODO(CD): Up to 50MB of seq data. Should do this on remote host.
            # See https://github.com/PacificBiosciences/FALCON_unzip/issues/59
            with open(ref_fasta, 'w') as f:
                f.write('>' + ctg_id + '\n')
                f.write(sequence + '\n')
        new_job = {}
        new_job['ctg_id'] = ctg_id
        # Every job carries the complete id -> 'p'/'h' mapping.
        new_job['ctg_types'] = ctg_types
        new_job['smrt_bin'] = config['smrt_bin']
        new_job['sge_option'] = config['sge_quiver']
        new_job['ref_fasta'] = ref_fasta
        new_job['read_bam'] = read_bam
        jobs.append(new_job)

    # Close the handle explicitly so the JSON is flushed before returning.
    with open(out_json, 'w') as f:
        f.write(json.dumps(jobs))
def add_tiling_paths_to_gfa(p_ctg_fasta, a_ctg_fasta, p_ctg_tiling_path, a_ctg_tiling_path, min_p_len, min_a_len, gfa_graph):
    """Add primary and associate contig tiling paths to ``gfa_graph``.

    Primary paths are loaded from ``p_ctg_tiling_path``, filtered with
    ``min_p_len`` and all added.  Associate paths are loaded from
    ``a_ctg_tiling_path``, filtered with ``min_a_len`` and added only when
    their contig id is the name of a record in ``a_ctg_fasta``.
    """
    # Associate tiling paths are not deduplicated.
    # We need the headers of the final haplotigs to filter
    # out the unnecessary tiling paths.  (Read the FASTA once only.)
    a_ctg_headers = set()
    for r in FastaReader(a_ctg_fasta):
        a_ctg_headers.add(r.name)

    # Load and filter primary contig paths.
    p_paths, _ = load_tiling_paths(p_ctg_tiling_path, 'P')
    _, p_ctg_len = calc_tiling_paths_len(p_paths)
    p_paths = filter_tiling_paths_by_len(p_paths, p_ctg_len, min_p_len)
    for ctg_id, path in p_paths.iteritems():
        gfa_graph.add_tiling_path(path, ctg_id)

    # Load and filter associate contig paths.
    a_paths, _ = load_tiling_paths(a_ctg_tiling_path, 'A')
    _, a_ctg_len = calc_tiling_paths_len(a_paths)
    a_paths = filter_tiling_paths_by_len(a_paths, a_ctg_len, min_a_len)
    for ctg_id, path in a_paths.iteritems():
        if ctg_id in a_ctg_headers:
            gfa_graph.add_tiling_path(path, ctg_id)
def build_p_rdb_task(self):
    """Normalise the p-reads into one FASTA and build the ``preads`` DB.

    Every FASTA listed in ``self.pread_fofn`` is read; records shorter than
    ``config["length_cutoff_pr"]`` are dropped and the rest are written to
    ``<pread_dir>/preads_norm.fasta`` with ``prolog/<n>/0_<len>`` headers
    and 80-column sequence lines.  ``fasta2DB``, ``DBsplit`` and
    ``HPCdaligner`` are then run in ``pread_dir`` and ``rdb_build_done`` is
    touched.  Exit codes of the shell commands are not checked.
    """
    config = self.parameters["config"]
    pread_dir = self.parameters["pread_dir"]

    with open("%s/preads_norm.fasta" % pread_dir, "w") as p_norm:
        c = 0
        with open(fn(self.pread_fofn)) as fofn:
            # Blank lines in the fofn are ignored.
            fa_fns = [line.strip() for line in fofn if line.strip()]
        for fa_fn in fa_fns:
            for r in FastaReader(fa_fn):
                seq = r.sequence
                if len(seq) < config["length_cutoff_pr"]:
                    continue
                p_norm.write(">prolog/%d/%d_%d\n" % (c, 0, len(seq)))
                # Full 80-column lines, then the remainder.  The remainder
                # offset is computed from the length, not from a leftover
                # loop variable, so reads shorter than 80 bp are written
                # correctly.  An exact multiple of 80 still gets a final
                # empty line, as before.
                n_full = len(seq) // 80
                for i in range(n_full):
                    p_norm.write(seq[i * 80:(i + 1) * 80] + "\n")
                p_norm.write(seq[n_full * 80:] + "\n")
                c += 1

    os.system("cd %s; fasta2DB preads preads_norm.fasta" % pread_dir)
    os.system("cd %s; DBsplit %s preads" % (pread_dir, config["ovlp_DBsplit_option"]))
    os.system("cd %s; HPCdaligner %s preads > run_jobs.sh" % (pread_dir, config["ovlp_HPCdaligner_option"]))
    os.system("cd %s; touch rdb_build_done" % pread_dir)
def build_p_rdb_task(self):
    """Normalise the p-reads, build the ``preads`` DB and copy it to nodes.

    Every FASTA listed in ``self.pread_fofn`` is read; records shorter than
    ``config["length_cutoff_pr"]`` are dropped and the rest are written to
    ``<pread_dir>/preads_norm.fasta`` with ``prolog/<n>/0_<len>`` headers
    and 80-column sequence lines.  ``fasta2DB``, ``DBsplit`` and
    ``HPCdaligner`` are run in ``pread_dir``, the three DB files are copied
    over ssh to ``config["tmpdir_for_daligner_input"]`` on each host in
    ``config["node_template"]``, and ``rdb_build_done`` is touched.  Exit
    codes of the shell commands are not checked.
    """
    config = self.parameters["config"]
    pread_dir = self.parameters["pread_dir"]

    with open("%s/preads_norm.fasta" % pread_dir, "w") as p_norm:
        c = 0
        with open(fn(self.pread_fofn)) as fofn:
            # Blank lines in the fofn are ignored.
            fa_fns = [line.strip() for line in fofn if line.strip()]
        for fa_fn in fa_fns:
            for r in FastaReader(fa_fn):
                seq = r.sequence
                if len(seq) < config["length_cutoff_pr"]:
                    continue
                p_norm.write(">prolog/%d/%d_%d\n" % (c, 0, len(seq)))
                # Remainder offset comes from the length rather than from a
                # leftover loop variable, so reads shorter than 80 bp are
                # written correctly.  An exact multiple of 80 still gets a
                # final empty line, as before.
                n_full = len(seq) // 80
                for i in range(n_full):
                    p_norm.write(seq[i * 80:(i + 1) * 80] + "\n")
                p_norm.write(seq[n_full * 80:] + "\n")
                c += 1

    input_db = os.path.join(pread_dir, "preads.db")
    input_idx = os.path.join(pread_dir, ".preads.idx")
    input_bps = os.path.join(pread_dir, ".preads.bps")

    os.system("cd %s; fasta2DB preads preads_norm.fasta" % pread_dir)
    os.system("cd %s; DBsplit %s preads" % (pread_dir, config["ovlp_DBsplit_option"]))
    os.system("cd %s; HPCdaligner %s preads > run_jobs.sh" % (pread_dir, config["ovlp_HPCdaligner_option"]))

    # copy the DB files to tmpdir_for_daligner_input, just to reduce IO
    # burden on storage node
    # use '| true' to let it time out silently in case some nodes are not
    # responsive
    ssh_timeout = config["ssh_timeout"]
    tmpdir = config["tmpdir_for_daligner_input"]
    for node in config["node_template"]:
        os.system("timeout %s ssh -f %s mkdir %s | true" % (ssh_timeout, node, tmpdir))
        for db_file in (input_db, input_idx, input_bps):
            os.system("timeout %s ssh -f %s cp %s %s | true" % (ssh_timeout, node, db_file, tmpdir))

    os.system("cd %s; touch rdb_build_done" % pread_dir)
def build_p_rdb_task(self):
    """Normalise the p-reads file by file and build the ``preads`` DB.

    For the n-th FASTA listed in ``self.pread_fofn`` a file
    ``<pread_dir>/preads_norm_<n>.fasta`` is written and immediately added
    to the DB with ``fasta2DB``.  Records shorter than
    ``config["length_cutoff_pr"]`` or containing any character other than
    upper-case A/C/G/T are dropped; the rest get
    ``prolog_<n>/<i>/0_<len>`` headers and 80-column sequence lines.
    ``DBsplit`` and ``HPCdaligner`` are then run and ``rdb_build_done`` is
    touched.  Exit codes of the shell commands are not checked.
    """
    config = self.parameters["config"]
    pread_dir = self.parameters["pread_dir"]
    valid_bases = frozenset("ACGT")

    with open(fn(self.pread_fofn)) as fofn:
        # Blank lines in the fofn are ignored.
        fa_fns = [line.strip() for line in fofn if line.strip()]

    fa_serial = 0
    for fa_fn in fa_fns:
        c = 0  # read counter restarts for every input file
        fa_serial += 1
        with open("%s/preads_norm_%05d.fasta" % (pread_dir, fa_serial), "w") as p_norm:
            for r in FastaReader(fa_fn):
                seq = r.sequence
                if len(seq) < config["length_cutoff_pr"]:
                    continue
                if not valid_bases.issuperset(seq):
                    continue
                p_norm.write(">prolog_%05d/%d/%d_%d\n" % (fa_serial, c, 0, len(seq)))
                # Remainder offset comes from the length rather than from a
                # leftover loop variable, so reads shorter than 80 bp are
                # written correctly.  An exact multiple of 80 still gets a
                # final empty line, as before.
                n_full = len(seq) // 80
                for i in range(n_full):
                    p_norm.write(seq[i * 80:(i + 1) * 80] + "\n")
                p_norm.write(seq[n_full * 80:] + "\n")
                c += 1
        os.system("cd %s; fasta2DB preads preads_norm_%05d.fasta" % (pread_dir, fa_serial))

    os.system("cd %s; DBsplit %s preads" % (pread_dir, config["ovlp_DBsplit_option"]))
    os.system("cd %s; HPCdaligner %s preads > run_jobs.sh" % (pread_dir, config["ovlp_HPCdaligner_option"]))
    os.system("cd %s; touch rdb_build_done" % pread_dir)
def load_sg_seq(all_read_ids, fasta_fn):
    """Return ``{read name: upper-cased sequence}`` for the wanted reads.

    Only records of ``fasta_fn`` whose full name is in ``all_read_ids`` are
    kept; if a name occurs more than once the last record wins.
    """
    wanted = {}
    for record in FastaReader(fasta_fn):
        read_name = record.name
        if read_name in all_read_ids:
            wanted[read_name] = record.sequence.upper()
    return wanted
def main(*argv):
    """Print the primary contigs, left-padded with their first read.

    Reads ``p_ctg_tiling_path`` from the current directory and builds a
    directed graph of all tiling edges.  For every contig whose first
    tiling node has no incoming edge, the read named by that node is
    fetched from ``preads4falcon.fasta``.  Each record of ``p_ctg.fa`` is
    then printed to stdout: with the (possibly reverse-complemented) read
    prepended and ``_p`` appended to its id when such a read was found,
    unchanged otherwise.
    """
    ctg_g = nx.DiGraph()
    ctg_path = {}
    with open("p_ctg_tiling_path") as f:
        for row in f:
            row = row.strip().split()
            # Rows need at least six columns; only the first three are used.
            ctg_id, v, w, _edge_rid, _b, _e = row[:6]
            ctg_path.setdefault(ctg_id, []).append((v, w))
            ctg_g.add_edge(v, w)

    # Reads that start a contig and are not reached by any other edge.
    padding_read_ids = set()
    for ctg_id in ctg_path:
        left_end = ctg_path[ctg_id][0][0]
        if ctg_g.in_degree(left_end) == 0:
            padding_read_ids.add(left_end.split(":")[0])

    padding_reads = {}
    for r in FastaReader("preads4falcon.fasta"):
        if r.name in padding_read_ids:
            padding_reads[r.name] = r.sequence

    # Contigs are streamed straight to stdout; the sequences are not kept.
    for r in FastaReader("p_ctg.fa"):
        p_id = r.name.split()[0]
        left_end = ctg_path[p_id][0][0]
        left_read, end = left_end.split(":")
        if left_read in padding_reads:
            seq = padding_reads[left_read]
            if end == "B":
                seq = rc_seq(seq)
            print(">" + p_id + "_p")
            print(seq + r.sequence)
        else:
            print(">" + p_id)
            print(r.sequence)
def main(argv=sys.argv):
    """Copy ``a_ctg_all.fa`` to ``a_ctg.fa`` without near-duplicate contigs.

    Each record name must split into exactly nine whitespace-separated
    fields.  A record is dropped when its identity and coverage fields
    (scaled by 100) both exceed ``args.max_idt`` / ``args.max_aln_cov`` and
    the absolute length-difference field is below ``args.min_len_diff``.
    """
    args = parse_args(argv)
    records = FastaReader("a_ctg_all.fa")
    with open("a_ctg.fa", "w") as out_f:
        for record in records:
            _tig_id, _v, _w, _len, _ovl, _ne, delta_l, idt, cov = record.name.split()
            too_similar = (100 * float(idt) > args.max_idt
                           and 100 * float(cov) > args.max_aln_cov
                           and abs(int(delta_l)) < args.min_len_diff)
            if not too_similar:
                out_f.write(">" + record.name + "\n")
                out_f.write(record.sequence + "\n")
def load_seqs(fasta_fn, store_only_seq_len):
    """
    Map the first word of every record name to ``(length, sequence)``.

    If store_only_seq_len compares equal to False the upper-cased sequence
    is stored; otherwise the sequence is discarded and ``'*'`` is stored in
    its place, so only its length is kept.
    """
    keep_sequence = (store_only_seq_len == False)
    seqs = {}
    for record in FastaReader(fasta_fn):
        seq_id = record.name.split()[0]
        stored = record.sequence.upper() if keep_sequence else '*'
        seqs[seq_id] = (len(record.sequence), stored)
    return seqs
def main(argv=None):
    """Print where each associate contig starts and ends on its primary.

    Walks ``p_ctg_tiling_path`` and records, per primary contig, the
    cumulative coordinate reached at every tiling node (the first node of a
    contig is at 0).  For every record of ``a_ctg.fa`` the first three
    words of its name are taken as id, start node and end node, and the id
    plus the two node coordinates are printed.
    """
    coor_by_ctg = {}
    with open("p_ctg_tiling_path") as tiling_f:
        for line in tiling_f:
            ctg_id, v, w, edge_rid, b, e = line.strip().split()[:6]
            step = abs(int(b) - int(e))
            if ctg_id in coor_by_ctg:
                coor += step
            else:
                # the p_ctg_tiling_path should be sorted by contig the
                # order of the edges in the tiling path
                coor_by_ctg[ctg_id] = {v: 0}
                coor = step
            coor_by_ctg[ctg_id][w] = coor

    for record in FastaReader("a_ctg.fa"):
        rid, v, w = record.name.split()[:3]
        node_coor = coor_by_ctg[rid.split("-")[0]]
        print("%s %s %s" % (rid, node_coor[v], node_coor[w]))
import sys

from falcon_kit.FastaReader import FastaReader

# Usage: <script> <reads.fasta> <id_list>
# Prints, in FASTA form, every record of the first file whose id (first
# word of the name) appears in the whitespace-separated id list.
f = FastaReader(sys.argv[1])
# Close the id list as soon as it has been read.
with open(sys.argv[2]) as id_file:
    rl = set(id_file.read().split())
for r in f:
    rid = r.name.split()[0]
    if rid not in rl:
        continue
    print(">" + rid)
    print(r.sequence)
def task_scatter_quiver(self):
    """Scatter quiver work: list the contigs that have a read SAM.

    Loads every record of the primary-contig and haplotig FASTA files,
    then, for each contig id whose ``./4-quiver/reads/<ctg_id>.sam`` exists
    under the current directory, makes the contig work directory, writes a
    single-record reference FASTA there (unless it already exists) and
    appends ``{'ctg_id': ...}`` to the job list.  The list is written as
    JSON to ``self.scattered_quiver_json``.
    """
    p_ctg_fn = fn(self.p_ctg_fa)
    h_ctg_fn = fn(self.h_ctg_fa)
    out_json = fn(self.scattered_quiver_json)

    ref_seq_data = {}
    ctg_types = {}
    # Primary contigs are loaded first, so a haplotig with the same id
    # replaces the primary entry.
    # I think this will crash if the file is empty. Maybe that is ok.
    for ctg_fn, ctg_type in ((p_ctg_fn, 'p'), (h_ctg_fn, 'h')):
        for r in FastaReader(ctg_fn):
            rid = r.name.split()[0]
            ref_seq_data[rid] = r.sequence
            ctg_types[rid] = ctg_type

    jobs = []
    for ctg_id in sorted(ref_seq_data):
        sequence = ref_seq_data[ctg_id]
        # Contigs sharing the prefix before the first '-' share a directory.
        m_ctg_id = ctg_id.split('-')[0]
        wd = os.path.join(os.getcwd(), './4-quiver/', m_ctg_id)
        ref_fasta = os.path.join(wd, '{ctg_id}_ref.fa'.format(ctg_id=ctg_id))
        read_sam = os.path.join(
            os.getcwd(),
            './4-quiver/reads/{ctg_id}.sam'.format(ctg_id=ctg_id))
        # TODO(CD): Ask Jason what we should do if missing SAM. And what
        # about network latency?
        if not os.path.exists(read_sam):
            continue
        mkdir(wd)
        if not os.path.exists(fn(ref_fasta)):
            with open(fn(ref_fasta), 'w') as f:
                f.write('>' + ctg_id + '\n')
                f.write(sequence + '\n')
        new_job = {}
        new_job['ctg_id'] = ctg_id
        jobs.append(new_job)

    # Close the handle explicitly so the JSON is flushed before returning.
    with open(out_json, 'w') as f:
        f.write(json.dumps(jobs))
def fetch_ref_and_reads(base_dir, fofn, ctg_id, out_dir, min_ctg_lenth):
    """Split references and raw reads into per-contig FASTA files.

    Args:
        base_dir: run directory containing ``2-asm-falcon``.
        fofn: file listing the raw-read FASTA files, one per line.
        ctg_id: a contig id, or ``'all'`` to process every contig.
        out_dir: output directory; ``None`` means ``<base_dir>/3-unzip/reads``.
        min_ctg_lenth: contigs shorter than this are skipped.

    Writes ``<ctg>_ref.fa`` for each selected contig, ``ctg_list`` naming
    the selected contigs that have at least 5 mapped reads and an id ending
    in ``F`` or ``R``, and ``<ctg>_reads.fa`` for every read (reads without
    a selected contig go to ``unassigned_reads.fa``).
    """
    read_fofn = fofn
    if out_dir is None:
        out_dir = os.path.join(base_dir, '3-unzip/reads')

    ctg_fa = os.path.join(base_dir, '2-asm-falcon/p_ctg.fa')
    read_map_dir = os.path.join(base_dir, '2-asm-falcon/read_maps')

    rawread_id_file = os.path.join(read_map_dir, 'dump_rawread_ids', 'rawread_ids')
    pread_id_file = os.path.join(read_map_dir, 'dump_pread_ids', 'pread_ids')

    with open(rawread_id_file) as f:
        rid_to_oid = f.read().split('\n')  # daligner raw read id to the original ids
    with open(pread_id_file) as f:
        pid_to_fid = f.read().split('\n')  # daligner pread id to the fake ids

    def pid_to_oid(pid):
        fid = pid_to_fid[int(pid)]
        # Second '/'-field of the fake id, integer-divided by 10, indexes
        # the raw-read id table.
        rid = int(fid.split('/')[1]) // 10
        return rid_to_oid[int(rid)]

    all_ctg_ids = set()
    for s in FastaReader(ctg_fa):
        s_id = s.name.split()[0]
        if ctg_id != 'all' and s_id != ctg_id:
            continue
        if len(s.sequence) < min_ctg_lenth:
            continue
        # When one contig was requested s_id == ctg_id at this point, so
        # the file name can always be built from s_id.
        with open(os.path.join(out_dir, '%s_ref.fa' % s_id), 'w') as ref_out:
            ref_out.write('>%s\n' % s_id)
            ref_out.write(s.sequence + '\n')
        all_ctg_ids.add(s_id)

    read_set = {}      # original read id -> contig id
    ctg_id_hits = {}   # contig id -> number of reads assigned

    map_fn = os.path.join(read_map_dir, 'rawread_to_contigs')
    with open(map_fn, 'r') as f:
        for row in f:
            row = row.strip().split()
            hit_ctg = row[1].split('-')[0]
            if int(row[3]) == 0:
                o_id = rid_to_oid[int(row[0])]
                read_set[o_id] = hit_ctg
                ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1

    map_fn = os.path.join(read_map_dir, 'pread_to_contigs')
    with open(map_fn, 'r') as f:
        for row in f:
            row = row.strip().split()
            hit_ctg = row[1].split('-')[0]
            # NOTE(review): read_set is keyed by read id, yet the contig id
            # is what gets tested here; kept as is -- confirm the intent.
            if hit_ctg not in read_set and int(row[3]) == 0:
                o_id = pid_to_oid(row[0])
                read_set[o_id] = hit_ctg
                ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1

    with open(os.path.join(out_dir, 'ctg_list'), 'w') as f:
        for listed_id in sorted(all_ctg_ids):
            if ctg_id_hits.get(listed_id, 0) < 5:
                continue
            # ignore small circle contigs, they need different approach
            if listed_id[-1] not in ['F', 'R']:
                continue
            f.write(listed_id + '\n')

    started = set()  # targets whose reads file was already created
    with open(read_fofn, 'r') as f:
        for r_fn in f:
            r_fn = r_fn.strip()
            for r in FastaReader(r_fn):
                rid = r.name.split()[0]
                target = read_set.get(rid, 'unassigned')
                if target == 'NA' or target not in all_ctg_ids:
                    target = 'unassigned'
                # Truncate on first use, append afterwards; the file is
                # reopened per read so few handles are open at once.
                mode = 'a' if target in started else 'w'
                started.add(target)
                with open(os.path.join(out_dir, '%s_reads.fa' % target), mode) as read_out:
                    read_out.write('>' + rid + '\n')
                    read_out.write(r.sequence + '\n')
default="./2-asm-falcon/read_maps", help='path to the read-contig map directory') parser.add_argument( '--base_dir', default="./3-unzip/reads", type=str, help='the output base_dir, default to current working directory') args = parser.parse_args() read_fofn = args.fofn ctg_fa = args.ctg_fa ctg_id = args.ctg_id read_map_dir = args.read_map_dir base_dir = args.base_dir ref_fasta = FastaReader(ctg_fa) all_ctg_ids = set() for s in ref_fasta: s_id = s.name.split()[0] if ctg_id != "all" and s_id != ctg_id: continue if len(s.sequence) < 20000: continue if ctg_id != "all": ref_out = open(os.path.join(base_dir, "%s_ref.fa" % ctg_id), "w") else: ref_out = open(os.path.join(base_dir, "%s_ref.fa" % s_id), "w") print >> ref_out, ">%s" % s_id print >> ref_out, s.sequence
missing_monomer_file = open(filename_root + "_missing_monomer.fa", 'w') regular_pattern_file = open(filename_root + "_regularHORs_pattern.txt", 'w') irregular_pattern_file = \ open(filename_root + "_irregularHORs_pattern.txt", 'w') inversions_pattern_file = open(filename_root + "_inversions_pattern.txt", 'w') stats_file = open(filename_root + "_stats.txt", 'w') stats_file.write(header + "\n") # Print parameters print "Average monomer length: ", avg_monomer_len print "Max head-to-tail distance: ", allowed_max_head_to_tail print "Shortest read length: ", len_threshold print "Clustering thresolds: ", identity_thresholds # IMPORT FASTA FILES # for r in FastaReader(pread_filename): # Load all reads from the pread file seq_db. # seq_db[Read_ID] = sequence if len(r.sequence) < len_threshold: too_short_reads_file.write(">" + r.name + "\n" + r.sequence + "\n") continue seq_db[r.name] = r.sequence # Load all monomers found in preads into monomer_db. # monomer_db[Read_ID] = [(start, end), sequence] for r in FastaReader(inferred_monomer_filename): # Parse the read tag. # Tag Format: ReadID/RangeStart_RangeEnd/Orientation rid, rng, orientation = r.name.split("/") # Skip if the read doesn't have any monomers. if rid not in seq_db:
def main(args):
    """Drop haplotigs that duplicate the primary contig.

    The contig id is taken from ``sys.argv[1]``.  If
    ``h_ctg_all.<ctg_id>.fa`` is missing the process exits with status 0.
    Otherwise the haplotigs are aligned to ``p_ctg.<ctg_id>.fa`` with
    nucmer; a haplotig is a removal candidate when an alignment row has
    column 11 above 99 and column 7 above 99.9.  A candidate is dropped
    when fewer than 5 of its path edges carry a phase that is neither
    unphased ``(-1, 0)`` nor shared with the primary contig.  Haplotigs
    shorter than 500 bp are always dropped.  Survivors go to
    ``h_ctg.<ctg_id>.fa`` and their names to ``h_ctg_ids.<ctg_id>``.
    """
    ctg_id = sys.argv[1]

    if not os.path.exists("h_ctg_all.{ctg_id}.fa".format(ctg_id=ctg_id)):
        # it is ok if there is no h_ctg_all.{ctg_id}.fa, don't want to
        # interrupt the workflow
        sys.exit(0)

    os.system(
        "nucmer -mum p_ctg.{ctg_id}.fa h_ctg_all.{ctg_id}.fa -p hp_aln".
        format(ctg_id=ctg_id))
    os.system("show-coords -T -H -l -c hp_aln.delta > hp_aln.coor")

    if not os.path.exists("hp_aln.coor"):
        return

    filter_out = set()
    with open("hp_aln.coor") as f:
        for row in f:
            row = row.strip().split()
            q_cov = float(row[10])
            idt = float(row[6])
            if q_cov > 99 and idt > 99.9:
                filter_out.add(row[-1])

    def load_phase_counts(path_fn):
        """Count path rows per contig and per (block id, phase id)."""
        counts = {}
        with open(path_fn) as path_f:
            for row in path_f:
                row = row.strip().split()
                key = (int(row[-2]), int(row[-1]))
                per_ctg = counts.setdefault(row[0], {})
                per_ctg[key] = per_ctg.get(key, 0) + 1
        return counts

    p_ctg_to_phase = load_phase_counts("p_ctg_path.%s" % ctg_id)
    h_ctg_to_phase = load_phase_counts("h_ctg_path.%s" % ctg_id)

    # Both outputs are closed even if a record raises part-way through.
    with open("h_ctg_ids.%s" % ctg_id, "w") as h_ids:
        with open("h_ctg.%s.fa" % ctg_id, "w") as f:
            for r in FastaReader("h_ctg_all.%s.fa" % ctg_id):
                p_ctg_phase = p_ctg_to_phase.get(r.name.split("_")[0], {})
                if len(r.sequence) < 500:
                    continue
                if r.name in filter_out:
                    h_phase = h_ctg_to_phase[r.name]
                    edge_count = sum(h_phase.values())
                    same_phase_to_p_ctg_count = 0
                    for b_id, ph_id in h_phase:
                        if b_id != -1 and (b_id, ph_id) in p_ctg_phase:
                            same_phase_to_p_ctg_count += h_phase[(b_id, ph_id)]
                    unphased_edge_count = h_phase.get((-1, 0), 0)
                    print("%s %s %s %s" % (r.name, edge_count,
                                           unphased_edge_count,
                                           same_phase_to_p_ctg_count))
                    # Kept only when there are many non-p_ctg phase
                    # segments; with fewer than 5 the haplotig is dropped.
                    if edge_count - unphased_edge_count - same_phase_to_p_ctg_count < 5:
                        continue
                f.write(">" + r.name + "\n")
                f.write(r.sequence + "\n")
                h_ids.write(r.name + "\n")
reads_in_layout = set() with open(edge_data_file) as f: for l in f: l = l.strip().split() """001039799:E 000333411:E 000333411 17524 20167 17524 99.62""" v, w, rid, s, t, aln_score, idt, type_ = l if type_ != "G": continue r1 = v.split(":")[0] reads_in_layout.add(r1) r2 = w.split(":")[0] reads_in_layout.add(r2) seqs = {} # load all p-read name into memory f = FastaReader(read_fasta) for r in f: if r.name not in reads_in_layout: continue seqs[r.name] = r.sequence.upper() edge_data = {} with open(edge_data_file) as f: for l in f: l = l.strip().split() """001039799:E 000333411:E 000333411 17524 20167 17524 99.62""" v, w, rid, s, t, aln_score, idt, type_ = l if type_ != "G": continue
help='contig identifier in the bam file', required=True) parser.add_argument( '--base_dir', type=str, default="./", help='the output base_dir, default to current working directory') args = parser.parse_args() bam_fn = args.bam fasta_fn = args.fasta ctg_id = args.ctg_id base_dir = args.base_dir ref_seq = "" for r in FastaReader(fasta_fn): rid = r.name.split()[0] if rid != ctg_id: continue ref_seq = r.sequence.upper() PypeThreadWorkflow.setNumThreadAllowed(1, 1) wf = PypeThreadWorkflow() bam_file = makePypeLocalFile(bam_fn) vmap_file = makePypeLocalFile(os.path.join(base_dir, ctg_id, "variant_map")) vpos_file = makePypeLocalFile(os.path.join(base_dir, ctg_id, "variant_pos")) q_id_map_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, "q_id_map"))
dest='min_monomer_length', help='Minimum monomer length') parser.add_argument('--version', action='version', version='%(prog)s 0.2') results = parser.parse_args() in_seq_file = results.fasta_file hmm_model_fwd = results.hmm_file_fwd hmm_model_rev = results.hmm_file_rev mono_len_threshold = results.min_monomer_length monomers_file = in_seq_file.replace(".fa", "_inferred_monomers.fa") # Call hmmsearch, build hmms based on consensus alignments os.system( "rm -f hmmoutF.tbl hmmoutF.out; hmmsearch --cpu 8 --tblout hmmoutF.tbl -o hmmoutF.out --notextw %s %s" % (hmm_model_fwd, in_seq_file)) os.system( "rm -f hmmoutR.tbl hmmoutR.out; hmmsearch --cpu 8 --tblout hmmoutR.tbl -o hmmoutR.out --notextw %s %s" % (hmm_model_rev, in_seq_file)) seq_db = {} for r in FastaReader(in_seq_file): seq = r.sequence seq_db[r.name] = seq parseHMMout("hmmoutF.out", "inferred_monomers_F.zzz", "F") parseHMMout("hmmoutR.out", "inferred_monomers_R.zzz", "R") os.system( "cat inferred_monomers_F.zzz inferred_monomers_R.zzz > inferred_monomers.fa; rm inferred_monomers_F.zzz inferred_monomers_R.zzz" )
def fetch_ref_and_reads(base_dir, fofn, ctg_id, out_dir, min_ctg_lenth):
    """Split references and raw reads into per-contig FASTA files.

    Args:
        base_dir: run directory containing ``2-asm-falcon``.
        fofn: file listing the raw-read FASTA files, one per line.
        ctg_id: a contig id, or ``"all"`` to process every contig.
        out_dir: output directory; ``None`` means ``<base_dir>/3-unzip/reads``.
        min_ctg_lenth: contigs shorter than this are skipped.

    Writes ``<ctg>_ref.fa`` for each selected contig, ``ctg_list`` naming
    the selected contigs that have at least 5 mapped reads and an id ending
    in ``F`` or ``R``, and ``<ctg>_reads.fa`` for every read (reads without
    a selected contig go to ``unassigned_reads.fa``).
    """
    read_fofn = fofn
    if out_dir is None:
        out_dir = os.path.join(base_dir, "3-unzip/reads")

    ctg_fa = os.path.join(base_dir, "2-asm-falcon/p_ctg.fa")
    read_map_dir = os.path.join(base_dir, "2-asm-falcon/read_maps")

    rawread_id_file = os.path.join(read_map_dir, "raw_read_ids")
    pread_id_file = os.path.join(read_map_dir, "pread_ids")

    with open(rawread_id_file) as f:
        rid_to_oid = f.read().split("\n")  # daligner raw read id to the original ids
    with open(pread_id_file) as f:
        pid_to_fid = f.read().split("\n")  # daligner pread id to the fake ids

    def pid_to_oid(pid):
        fid = pid_to_fid[int(pid)]
        # Second "/"-field of the fake id, integer-divided by 10, indexes
        # the raw-read id table.
        rid = int(fid.split("/")[1]) // 10
        return rid_to_oid[int(rid)]

    all_ctg_ids = set()
    for s in FastaReader(ctg_fa):
        s_id = s.name.split()[0]
        if ctg_id != "all" and s_id != ctg_id:
            continue
        if len(s.sequence) < min_ctg_lenth:
            continue
        # When one contig was requested s_id == ctg_id at this point, so
        # the file name can always be built from s_id.
        with open(os.path.join(out_dir, "%s_ref.fa" % s_id), "w") as ref_out:
            ref_out.write(">%s\n" % s_id)
            ref_out.write(s.sequence + "\n")
        all_ctg_ids.add(s_id)

    read_set = {}      # original read id -> contig id
    ctg_id_hits = {}   # contig id -> number of reads assigned

    map_fn = os.path.join(read_map_dir, "rawread_to_contigs")
    with open(map_fn, "r") as f:
        for row in f:
            row = row.strip().split()
            hit_ctg = row[1].split("-")[0]
            if int(row[3]) == 0:
                o_id = rid_to_oid[int(row[0])]
                read_set[o_id] = hit_ctg
                ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1

    map_fn = os.path.join(read_map_dir, "pread_to_contigs")
    with open(map_fn, "r") as f:
        for row in f:
            row = row.strip().split()
            hit_ctg = row[1].split("-")[0]
            # NOTE(review): read_set is keyed by read id, yet the contig id
            # is what gets tested here; kept as is -- confirm the intent.
            if hit_ctg not in read_set and int(row[3]) == 0:
                o_id = pid_to_oid(row[0])
                read_set[o_id] = hit_ctg
                ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1

    with open(os.path.join(out_dir, "ctg_list"), "w") as f:
        for listed_id in sorted(all_ctg_ids):
            if ctg_id_hits.get(listed_id, 0) < 5:
                continue
            # ignore small circle contigs, they need different approach
            if listed_id[-1] not in ["F", "R"]:
                continue
            f.write(listed_id + "\n")

    started = set()  # targets whose reads file was already created
    with open(read_fofn, "r") as f:
        for r_fn in f:
            r_fn = r_fn.strip()
            for r in FastaReader(r_fn):
                rid = r.name.split()[0]
                target = read_set.get(rid, "unassigned")
                if target == "NA" or target not in all_ctg_ids:
                    target = "unassigned"
                # Truncate on first use, append afterwards; the file is
                # reopened per read so few handles are open at once.
                mode = "a" if target in started else "w"
                started.add(target)
                with open(os.path.join(out_dir, "%s_reads.fa" % target), mode) as read_out:
                    read_out.write(">" + rid + "\n")
                    read_out.write(r.sequence + "\n")
def phasing(args):
    """Build and run the four-step phasing workflow for one contig.

    Reads ``args.bam``, ``args.fasta``, ``args.ctg_id`` and
    ``args.base_dir``.  The upper-cased sequence of the FASTA record whose
    id equals ``ctg_id`` (empty string if there is none) is handed to the
    het-call task.  Tasks are chained through files under
    ``<base_dir>/<ctg_id>/``: het call -> association table -> phased
    blocks -> phased reads.
    """
    bam_fn = args.bam
    fasta_fn = args.fasta
    ctg_id = args.ctg_id
    base_dir = args.base_dir

    # The last matching record wins if the id is repeated.
    ref_seq = ""
    for record in FastaReader(fasta_fn):
        if record.name.split()[0] == ctg_id:
            ref_seq = record.sequence.upper()

    PypeThreadWorkflow.setNumThreadAllowed(1, 1)
    wf = PypeThreadWorkflow()

    def ctg_file(leaf):
        """Register a workflow file located in this contig's directory."""
        return makePypeLocalFile(os.path.join(base_dir, ctg_id, leaf))

    # Step 1: heterozygous variant calling from the BAM.
    bam_file = makePypeLocalFile(bam_fn)
    vmap_file = ctg_file("variant_map")
    vpos_file = ctg_file("variant_pos")
    q_id_map_file = ctg_file("q_id_map")
    het_call_parameters = {
        "ctg_id": ctg_id,
        "ref_seq": ref_seq,
        "base_dir": base_dir,
    }
    make_het_call_task = PypeTask(
        inputs={"bam_file": bam_file},
        outputs={
            "vmap_file": vmap_file,
            "vpos_file": vpos_file,
            "q_id_map_file": q_id_map_file,
        },
        parameters=het_call_parameters,
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/het_call")(make_het_call)
    wf.addTasks([make_het_call_task])

    # Step 2: association table from the variant map.
    atable_file = ctg_file("atable")
    atable_parameters = {"ctg_id": ctg_id, "base_dir": base_dir}
    generate_association_table_task = PypeTask(
        inputs={"vmap_file": vmap_file},
        outputs={"atable_file": atable_file},
        parameters=atable_parameters,
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/g_atable")(generate_association_table)
    wf.addTasks([generate_association_table_task])

    # Step 3: phased blocks.
    phased_variant_file = ctg_file("phased_variants")
    get_phased_blocks_task = PypeTask(
        inputs={"vmap_file": vmap_file, "atable_file": atable_file},
        outputs={"phased_variant_file": phased_variant_file},
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/get_phased_blocks")(get_phased_blocks)
    wf.addTasks([get_phased_blocks_task])

    # Step 4: phased reads.
    phased_read_file = ctg_file("phased_reads")
    get_phased_reads_task = PypeTask(
        inputs={
            "vmap_file": vmap_file,
            "q_id_map_file": q_id_map_file,
            "phased_variant_file": phased_variant_file,
        },
        outputs={"phased_read_file": phased_read_file},
        parameters={"ctg_id": ctg_id},
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/get_phased_reads")(get_phased_reads)
    wf.addTasks([get_phased_reads_task])

    wf.refreshTargets()
config = { "job_type": job_type, "sge_quiver": sge_quiver, "smrt_bin": smrt_bin } support.job_type = "SGE" #tmp hack until we have a configuration parser ctg_ids = [] PypeThreadWorkflow.setNumThreadAllowed(quiver_concurrent_jobs, quiver_concurrent_jobs) wf = PypeThreadWorkflow() ref_seq_data = {} p_ctg_fa = FastaReader("./3-unzip/all_p_ctg.fa") ctg_types = {} for r in p_ctg_fa: rid = r.name.split()[0] ref_seq_data[rid] = r.sequence ctg_types[rid] = "p" h_ctg_fa = FastaReader("./3-unzip/all_h_ctg.fa") for r in h_ctg_fa: rid = r.name.split()[0] ref_seq_data[rid] = r.sequence ctg_types[rid] = "h" ctg_ids = sorted(ref_seq_data.keys()) p_ctg_out = [] h_ctg_out = []
def make_variant_candidates(args):
    """Pile up aligned bases per reference position and write candidates.

    Streams ``<samtools> view <bam>``, keeps the records aligned to
    ``args.ctg_name`` whose soft-clipped share is at most about half, and
    counts A/C/G/T per reference position from the CIGAR M/=/X runs.
    After each read, positions left of that read's start are passed to
    ``output_count`` and any line it returns is written to
    ``args.pm_count_fn``; the remaining positions are written at the end.
    Exits with status 1 if ``ctg_name`` is not in the reference FASTA.
    """
    bam_file_fn = args.bam_file_fn
    pm_count_fn = args.pm_count_fn
    threshold = args.threshold
    min_cov = args.min_cov
    ctg_name = args.ctg_name
    samtools = args.samtools
    ref_fasta_fn = args.ref_fasta_fn

    # The record whose full name equals ctg_name is used (the last one if
    # the name is repeated).
    ref_seq = None
    for r in FastaReader(ref_fasta_fn):
        if r.name == ctg_name:
            ref_seq = r.sequence

    if ref_seq is None:
        sys.stderr.write("Can't get reference sequence\n")
        sys.exit(1)

    # maybe we should check if the samtools path is valid
    p = subprocess.Popen(shlex.split("%s view %s" % (samtools, bam_file_fn)),
                         stdout=subprocess.PIPE)
    pileup = {}

    with open(pm_count_fn, "w") as pm_count_f:

        def flush(limit):
            """Write and drop pileup columns below ``limit`` (all if None)."""
            for pos in sorted(pileup):
                if limit is not None and pos >= limit:
                    break  # keys are sorted; nothing further qualifies
                base_count = list(pileup[pos].items())
                ref_base = ref_seq[pos]
                out = output_count(pos, base_count, ref_base, min_cov, threshold)
                if out is not None:
                    total_count, out_line = out
                    pm_count_f.write("%s\n" % (out_line,))
                del pileup[pos]

        for l in p.stdout:
            l = l.strip().split()
            if l[0][0] == "@":
                continue
            RNAME = l[2]
            if RNAME != ctg_name:
                continue
            POS = int(l[3]) - 1  # make it zero base to match sequence index
            CIGAR = l[5]
            SEQ = l[9]

            ops = [(int(m.group(1)), m.group(2))
                   for m in re.finditer(cigar_re, CIGAR)]
            total_aln_pos = sum(adv for adv, _ in ops)
            skip_base = sum(adv for adv, op in ops if op == "S")
            # if a read is less than 50% aligned, skip
            if 1.0 - 1.0 * skip_base / (total_aln_pos + 1) < 0.50:
                continue

            rp = POS  # reference cursor
            qp = 0    # read cursor
            for adv, op in ops:
                if op == "S":
                    qp += adv
                elif op in ("M", "=", "X"):
                    for _ in range(adv):
                        column = pileup.setdefault(
                            rp, {"A": 0, "C": 0, "G": 0, "T": 0})
                        # NOTE(review): a base other than A/C/G/T raises
                        # KeyError here, as it always did.
                        column[SEQ[qp]] += 1
                        rp += 1
                        qp += 1
                elif op == "I":
                    qp += adv
                elif op == "D":
                    rp += adv

            # output pileup information before POS which is the current
            # head of the ref
            flush(POS)

        # for the last one
        flush(None)

    # Reap the samtools child now that its output has been consumed.
    p.stdout.close()
    p.wait()
def phasing(args):
    """Build and run the four-step phasing workflow for one contig.

    Reads ``args.bam``, ``args.fasta``, ``args.ctg_id``, ``args.base_dir``
    and ``args.samtools``.  The upper-cased sequence of the FASTA record
    whose id equals ``ctg_id`` (empty string if there is none) is handed to
    the het-call task.  Tasks are chained through files under
    ``<base_dir>/<ctg_id>/``: het call -> association table -> phased
    blocks -> phased reads.
    """
    bam_fn = args.bam
    fasta_fn = args.fasta
    ctg_id = args.ctg_id
    base_dir = args.base_dir
    samtools = args.samtools

    # The last matching record wins if the id is repeated.
    ref_seq = ""
    for record in FastaReader(fasta_fn):
        if record.name.split()[0] == ctg_id:
            ref_seq = record.sequence.upper()

    wf = PypeProcWatcherWorkflow(
        max_jobs=1,
    )

    def ctg_file(*parts):
        """Register a workflow file located under this contig's directory."""
        return makePypeLocalFile(os.path.join(base_dir, ctg_id, *parts))

    # Step 1: heterozygous variant calling from the BAM.
    bam_file = makePypeLocalFile(bam_fn)
    vmap_file = ctg_file('het_call', "variant_map")
    vpos_file = ctg_file('het_call', "variant_pos")
    q_id_map_file = ctg_file('het_call', "q_id_map")
    het_call_parameters = {
        "ctg_id": ctg_id,
        "ref_seq": ref_seq,
        "base_dir": base_dir,
        "samtools": samtools,
    }
    make_het_call_task = PypeTask(
        inputs={"bam_file": bam_file},
        outputs={
            "vmap_file": vmap_file,
            "vpos_file": vpos_file,
            "q_id_map_file": q_id_map_file,
        },
        parameters=het_call_parameters,
    )(make_het_call)
    wf.addTasks([make_het_call_task])

    # Step 2: association table from the variant map.
    atable_file = ctg_file('g_atable', "atable")
    atable_parameters = {"ctg_id": ctg_id, "base_dir": base_dir}
    generate_association_table_task = PypeTask(
        inputs={"vmap_file": vmap_file},
        outputs={"atable_file": atable_file},
        parameters=atable_parameters,
    )(generate_association_table)
    wf.addTasks([generate_association_table_task])

    # Step 3: phased blocks.
    phased_variant_file = ctg_file('get_phased_blocks', "phased_variants")
    get_phased_blocks_task = PypeTask(
        inputs={"vmap_file": vmap_file, "atable_file": atable_file},
        outputs={"phased_variant_file": phased_variant_file},
    )(get_phased_blocks)
    wf.addTasks([get_phased_blocks_task])

    # Step 4: phased reads.
    phased_read_file = ctg_file("phased_reads")
    get_phased_reads_task = PypeTask(
        inputs={
            "vmap_file": vmap_file,
            "q_id_map_file": q_id_map_file,
            "phased_variant_file": phased_variant_file,
        },
        outputs={"phased_read_file": phased_read_file},
        parameters={"ctg_id": ctg_id},
    )(get_phased_reads)
    wf.addTasks([get_phased_reads_task])

    wf.refreshTargets()
def output_aln_tensor(args):
    """Print one alignment-count tensor line per candidate position.

    ``args`` must provide:

    * ``ref_fasta_fn`` / ``ctg_name`` -- the reference FASTA and the exact
      record name to use (the last record with that name wins).
    * ``pm_count_fn`` -- text file whose first whitespace-delimited column is
      an integer candidate position; each one defines a window spanning 8
      reference positions on either side of it.
    * ``samtools`` / ``bam_file_fn`` -- used to run ``samtools view`` on the
      BAM file; its text output is parsed record by record.

    For every read the CIGAR string is walked with ``cigar_re`` (defined
    elsewhere in this file; group 1 is used as the operation length and group
    2 as the operation letter).  While the reference cursor is inside a
    window, tuples ``(ref_pos, query_pos, ref_base, read_base)`` are appended
    to that window's per-read list, with ``"-"`` standing in for the missing
    base of an insertion or deletion.  After each read, every window whose
    centre lies more than 8 bases before the read start is converted with
    ``generate_aln_count_tensor`` and written to stdout.

    Exits with status 1 (after a message on stderr) when no FASTA record is
    named ``ctg_name``.
    """
    bam_file_fn = args.bam_file_fn
    pm_count_fn = args.pm_count_fn
    ctg_name = args.ctg_name
    samtools = args.samtools
    ref_fasta_fn = args.ref_fasta_fn

    flank = 8  # half-width of the window around each candidate position

    # Only a record whose name equals ctg_name is used as the reference.
    ref_seq = None
    for record in FastaReader(ref_fasta_fn):
        if record.name == ctg_name:
            ref_seq = record.sequence

    if ref_seq is None:
        sys.stderr.write("Can't get reference sequence\n")
        sys.exit(1)

    # window begin -> (window end, window centre)
    begin2end = {}
    with open(pm_count_fn) as f:
        for row in f:
            fields = row.split()
            if not fields:
                continue  # tolerate blank lines
            pos = int(fields[0])
            begin2end[pos - flank] = (pos + flank, pos)

    def emit(center):
        # Convert the collected per-read alignments of one window and print.
        t_line = generate_aln_count_tensor(center_to_aln[center], center,
                                           ref_seq)
        sys.stdout.write("%s\n" % (t_line,))

    # maybe we should check if the samtools path is valid
    # universal_newlines=True makes the pipe yield text on Python 3 as well.
    p = subprocess.Popen(shlex.split("%s view %s" % (samtools, bam_file_fn)),
                         stdout=subprocess.PIPE, universal_newlines=True)

    center_to_aln = {}
    read_start = None
    try:
        for line in p.stdout:
            fields = line.strip().split()
            if not fields or fields[0][0] == "@":
                continue

            # Make it zero-based to match the sequence index.
            read_start = int(fields[3]) - 1
            cigar = fields[5]
            seq = fields[9]
            rp = read_start  # cursor on the reference
            qp = 0           # cursor on the read

            end_to_center = {}
            active_set = set()  # centres of windows currently being filled

            for m in re.finditer(cigar_re, cigar):
                adv = int(m.group(1))
                op = m.group(2)
                if op == "S":
                    qp += adv
                elif op in ("M", "=", "X"):
                    for _ in range(adv):
                        if rp in begin2end:
                            r_end, r_center = begin2end[rp]
                            end_to_center[r_end] = r_center
                            active_set.add(r_center)
                            center_to_aln.setdefault(r_center, []).append([])
                        for center in active_set:
                            center_to_aln[center][-1].append(
                                (rp, qp, ref_seq[rp], seq[qp]))
                        if rp in end_to_center:
                            active_set.remove(end_to_center[rp])
                        rp += 1
                        qp += 1
                elif op == "I":
                    for _ in range(adv):
                        for center in active_set:
                            center_to_aln[center][-1].append(
                                (rp, qp, "-", seq[qp]))
                        qp += 1
                elif op == "D":
                    for _ in range(adv):
                        # Note the order: the deleted base is recorded before
                        # a window starting at this position is opened.
                        for center in active_set:
                            center_to_aln[center][-1].append(
                                (rp, qp, ref_seq[rp], "-"))
                        if rp in begin2end:
                            r_end, r_center = begin2end[rp]
                            end_to_center[r_end] = r_center
                            active_set.add(r_center)
                            center_to_aln.setdefault(r_center, []).append([])
                        if rp in end_to_center:
                            active_set.remove(end_to_center[rp])
                        rp += 1

            # Flush windows that lie entirely before the current read start.
            # Iterate over a snapshot because entries are deleted on the way.
            for center in list(center_to_aln.keys()):
                if center + flank < read_start:
                    emit(center)
                    del center_to_aln[center]
    finally:
        # Release the pipe and reap the child process.
        p.stdout.close()
        p.wait()

    # NOTE(review): this final pass keeps the original filter against the
    # start of the LAST read, so windows that are not left of it are never
    # printed.  That looks unintended for an end-of-input flush -- confirm
    # before changing, since it alters the output.
    for center in list(center_to_aln.keys()):
        if center + flank < read_start:
            emit(center)
def _iter_layout_edges(edge_file):
    """Yield the first seven columns of every type-"G" row of the edge file."""
    with open(edge_file) as f:
        for line in f:
            # Example of the leading columns of a row:
            # 001039799:E 000333411:E 000333411 17524 20167 17524 99.62
            v, w, rid, s, t, aln_score, idt, type_ = line.strip().split()
            if type_ != "G":
                continue
            yield v, w, rid, s, t, aln_score, idt


def _rank_compound_paths(s, t, sub_utgs, utg_data, edge_data):
    """Return [(score, path), ...] from s to t, sorted by descending score.

    A graph is built from the sub-unitigs of a compound unitig.  The weighted
    shortest path is taken repeatedly, removing its edges after each round,
    until s and t are no longer connected.
    """
    c_graph = nx.DiGraph()
    for ss, vv, tt in sub_utgs:
        _type, _length, _score, sub_path = utg_data[(ss, vv, tt)]
        v1 = sub_path[0]
        for v2 in sub_path[1:]:
            c_graph.add_edge(v1, v2, e_score=edge_data[(v1, v2)][3])
            v1 = v2

    shortest_path = nx.shortest_path(c_graph, s, t, "e_score")
    score = nx.shortest_path_length(c_graph, s, t, "e_score")
    all_alt_path = [(score, shortest_path)]
    while True:
        n0 = shortest_path[0]
        for n1 in shortest_path[1:]:
            c_graph.remove_edge(n0, n1)
            n0 = n1
        try:
            shortest_path = nx.shortest_path(c_graph, s, t, "e_score")
            score = nx.shortest_path_length(c_graph, s, t, "e_score")
        except nx.exception.NetworkXNoPath:
            break
        all_alt_path.append((score, shortest_path))

    all_alt_path.sort()
    all_alt_path.reverse()
    return all_alt_path


def _build_primary_path(utgs, utg_data, edge_data):
    """Return (node path, total length, total score, alt-path groups)."""
    one_path = []
    total_score = 0
    total_length = 0
    a_ctg_group = {}
    for utg in utgs:
        s, v, t = utg.split("~")
        type_, length, score, path_or_edges = utg_data[(s, v, t)]
        total_score += score
        total_length += length

        if type_ == "simple":
            segment = path_or_edges
        elif type_ == "compound":
            all_alt_path = _rank_compound_paths(s, t, path_or_edges,
                                                utg_data, edge_data)
            # The top-ranked path is the one spliced into the primary contig.
            segment = all_alt_path[0][1]
            a_ctg_group[(s, t)] = all_alt_path
        else:
            # Other types only contribute to the totals.
            continue

        # Consecutive segments share their junction node; add it only once.
        one_path.extend(segment[1:] if one_path else segment)
    return one_path, total_length, total_score, a_ctg_group


def _tile_path(path, edge_data):
    """Return (edges, sequence, total_length, total_score) for a node path."""
    edges = list(zip(path[:-1], path[1:]))
    sub_seqs = []
    total_length = 0
    total_score = 0
    for vv, ww in edges:
        _rid, s, t, aln_score, _idt, e_seq = edge_data[(vv, ww)]
        sub_seqs.append(e_seq)
        total_length += abs(s - t)
        total_score += aln_score
    return edges, "".join(sub_seqs), total_length, total_score


def _write_associated_contigs(ctg_id, a_ctg_group, edge_data, a_ctg_out,
                              a_ctg_t_out, a_ctg_base_out, a_ctg_base_t_out):
    """Write base and alternative contigs for every compound region."""
    a_id = 1
    for v, w in a_ctg_group:
        ranked = a_ctg_group[(v, w)]

        # Entry 0 is the base: the path that is also in the primary contig.
        base_path = ranked[0][1]
        edges, base_seq, total_length, total_score = _tile_path(base_path,
                                                                edge_data)
        atig_output = [(v, w, base_path, total_length, total_score, base_seq,
                        edges, 0, 1, 1)]

        for _score, atig_path in ranked[1:]:
            edges, seq, total_length, total_score = _tile_path(atig_path,
                                                               edge_data)
            delta_len = len(seq) - len(base_seq)
            a_idt = 0.0
            cov = 0.0
            # Only pairs where both sequences exceed 2000 bases are aligned.
            if len(base_seq) > 2000 and len(seq) > 2000:
                aln_data, _x, _y = get_aln_data(base_seq, seq)
                if len(aln_data) != 0:
                    last = aln_data[-1]
                    a_idt = 1.0 - 1.0 * last[-1] / last[-2]
                    cov = 1.0 * (last[3] - last[2]) / last[4]
            atig_output.append((v, w, atig_path, total_length, total_score,
                                seq, edges, delta_len, a_idt, cov))

        if len(atig_output) == 1:
            # No alternative path: nothing is written and a_id is not used up.
            continue

        for sub_id, data in enumerate(atig_output):
            (v0, w0, _tig_path, total_length, total_score, seq, edges,
             delta_len, a_idt, cov) = data
            # sub_id 0 (the base) goes to the "base" files, the rest to the
            # alternative-contig files.
            if sub_id != 0:
                tiling_out, seq_out = a_ctg_t_out, a_ctg_out
            else:
                tiling_out, seq_out = a_ctg_base_t_out, a_ctg_base_out

            for vv, ww in edges:
                rid, s, t, aln_score, idt, _e_seq = edge_data[(vv, ww)]
                tiling_out.write(
                    "%s-%03d-%02d %s %s %s %d %d %d %0.2f\n" % (
                        ctg_id, a_id, sub_id, vv, ww, rid, s, t, aln_score,
                        idt))

            seq_out.write(
                ">%s-%03d-%02d %s %s %d %d %d %d %0.2f %0.2f\n" % (
                    ctg_id, a_id, sub_id, v0, w0, total_length, total_score,
                    len(edges), delta_len, a_idt, cov))
            seq_out.write("%s\n" % seq)

        a_id += 1


def main(*argv):
    """Write primary and associated contigs plus their tiling paths.

    Inputs come from names defined elsewhere in this file:
    ``edge_data_file``, ``read_fasta``, ``utg_data_file`` and
    ``ctg_data_file``.  ``argv`` is accepted but not used.

    Output files, created in the current directory: ``p_ctg.fa``,
    ``a_ctg_all.fa``, ``a_ctg_base.fa``, ``p_ctg_tiling_path``,
    ``a_ctg_tiling_path`` and ``a_ctg_base_tiling_path``.  They are closed
    even if processing fails part-way.
    """
    # Pass 1 over the edges: which reads are needed at all.
    reads_in_layout = set()
    for row in _iter_layout_edges(edge_data_file):
        reads_in_layout.add(row[0].split(":")[0])
        reads_in_layout.add(row[1].split(":")[0])

    # Keep only the sequences of those reads in memory.
    seqs = {}
    for r in FastaReader(read_fasta):
        if r.name in reads_in_layout:
            seqs[r.name] = r.sequence.upper()

    # Pass 2 over the edges: attach the sequence segment to every edge.
    edge_data = {}
    for v, w, rid, s, t, aln_score, idt in _iter_layout_edges(edge_data_file):
        s = int(s)
        t = int(t)
        aln_score = int(aln_score)
        idt = float(idt)
        if s < t:
            e_seq = seqs[rid][s:t]
        else:
            # Backwards range: walk the read in reverse and map every base
            # through RCMAP.
            e_seq = "".join([RCMAP[c] for c in seqs[rid][s:t:-1]])
        edge_data[(v, w)] = (rid, s, t, aln_score, idt, e_seq)

    utg_data = {}
    with open(utg_data_file) as f:
        for l in f:
            s, v, t, type_, length, score, path_or_edges = l.strip().split()
            if type_ not in ["compound", "simple", "contained"]:
                continue
            length = int(length)
            score = int(score)
            if type_ in ("simple", "contained"):
                path_or_edges = path_or_edges.split("~")
            else:
                path_or_edges = [
                    tuple(e.split("~")) for e in path_or_edges.split("|")
                ]
            utg_data[(s, v, t)] = type_, length, score, path_or_edges

    out_names = ("p_ctg.fa", "a_ctg_all.fa", "a_ctg_base.fa",
                 "p_ctg_tiling_path", "a_ctg_tiling_path",
                 "a_ctg_base_tiling_path")
    out_files = []
    try:
        for name in out_names:
            out_files.append(open(name, "w"))
        (p_ctg_out, a_ctg_out, a_ctg_base_out,
         p_ctg_t_out, a_ctg_t_out, a_ctg_base_t_out) = out_files

        layout_ctg = set()
        with open(ctg_data_file) as f:
            for l in f:
                (ctg_id, c_type_, i_utig, t0,
                 _length, _score, utgs) = l.strip().split()
                s0 = i_utig.split("~")[0]

                # Skip a contig whose reversed end pair was already laid out
                # (presumably its reverse-complement twin -- confirm).
                if (reverse_end(t0), reverse_end(s0)) in layout_ctg:
                    continue
                layout_ctg.add((s0, t0))
                ctg_label = i_utig + "~" + t0

                (one_path, total_length, total_score,
                 a_ctg_group) = _build_primary_path(utgs.split("|"),
                                                    utg_data, edge_data)
                if len(one_path) == 0:
                    continue

                sub_seqs = []
                for vv, ww in zip(one_path[:-1], one_path[1:]):
                    rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
                    sub_seqs.append(e_seq)
                    p_ctg_t_out.write("%s %s %s %s %d %d %d %0.2f\n" % (
                        ctg_id, vv, ww, rid, s, t, aln_score, idt))
                p_ctg_out.write(">%s %s %s %d %d\n" % (
                    ctg_id, ctg_label, c_type_, total_length, total_score))
                p_ctg_out.write("%s\n" % "".join(sub_seqs))

                _write_associated_contigs(ctg_id, a_ctg_group, edge_data,
                                          a_ctg_out, a_ctg_t_out,
                                          a_ctg_base_out, a_ctg_base_t_out)
    finally:
        for handle in out_files:
            handle.close()
def main(argv=sys.argv):
    """Run read tracking and per-contig quiver jobs, then merge the results.

    ``argv[1]`` is the path of a configuration file.  The option
    ``General/job_type`` and the ``Unzip`` options ``sge_track_reads``,
    ``sge_quiver``, ``smrt_bin``, ``input_bam_fofn`` and
    ``quiver_concurrent_jobs`` are read from it, each with a built-in
    default.  Exits with status 1 when no configuration file is given.

    Steps:
      1. Run the ``track_reads_h`` task and wait for it.
      2. For every contig in ``./3-unzip/all_p_ctg.fa`` and
         ``./3-unzip/all_h_ctg.fa`` that has ``./4-quiver/reads/<ctg>.sam``,
         write its reference FASTA if missing and schedule a quiver task.
      3. After the tasks finish, rebuild the ``cns_p_ctg`` / ``cns_h_ctg``
         ``.fasta`` and ``.fastq`` files in ``./4-quiver/cns_output`` by
         appending the ``zcat`` output of every per-contig result.
    """
    import time  # local import: only needed for the settle delay below

    global fc_run_logger
    fc_run_logger = support.setup_logger(None)

    # Use the argv that was passed in rather than reading sys.argv directly.
    if len(argv) < 2:
        sys.stdout.write(
            "you need to provide a configuration file to specific a couple cluster running environment\n")
        sys.exit(1)

    config_fn = argv[1]
    parser = ConfigParser.ConfigParser()
    parser.read(config_fn)

    def option(section, name, default, as_int=False):
        # Return the configured value, or the default when it is absent.
        if not parser.has_option(section, name):
            return default
        if as_int:
            return parser.getint(section, name)
        return parser.get(section, name)

    job_type = option('General', 'job_type', "SGE")
    sge_track_reads = option('Unzip', 'sge_track_reads',
                             " -pe smp 12 -q bigmem")
    sge_quiver = option('Unzip', 'sge_quiver', " -pe smp 24 -q bigmem ")
    smrt_bin = option(
        'Unzip', 'smrt_bin',
        "/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/")
    input_bam_fofn = option('Unzip', 'input_bam_fofn', "input_bam.fofn")
    quiver_concurrent_jobs = option('Unzip', 'quiver_concurrent_jobs', 8,
                                    as_int=True)

    config = {
        "job_type": job_type,
        "sge_quiver": sge_quiver,
        "sge_track_reads": sge_track_reads,
        "input_bam_fofn": input_bam_fofn,
        "smrt_bin": smrt_bin,
    }

    support.job_type = "SGE"  # tmp hack until we have a configuration parser

    PypeThreadWorkflow.setNumThreadAllowed(quiver_concurrent_jobs,
                                           quiver_concurrent_jobs)
    wf = PypeThreadWorkflow()

    parameters = {"wd": os.path.abspath("."), "config": config}
    hasm_done = makePypeLocalFile("./3-unzip/1-hasm/hasm_done")
    job_done = makePypeLocalFile(
        os.path.join(parameters["wd"], "track_reads_h_done"))
    make_track_reads_task = PypeTask(inputs={"hasm_done": hasm_done},
                                     outputs={"job_done": job_done},
                                     parameters=parameters,
                                     TaskType=PypeThreadTaskBase,
                                     URL="task://localhost/track_reads_h")
    track_reads_task = make_track_reads_task(task_track_reads)
    wf.addTask(track_reads_task)
    wf.refreshTargets()  # force refresh now, will put proper dependence later

    # Load primary ("p") then haplotig ("h") contigs; a haplotig record with
    # the same id as a primary one replaces it.
    ref_seq_data = {}
    ctg_types = {}
    for fasta_path, ctg_type in (("./3-unzip/all_p_ctg.fa", "p"),
                                 ("./3-unzip/all_h_ctg.fa", "h")):
        for r in FastaReader(fasta_path):
            rid = r.name.split()[0]
            ref_seq_data[rid] = r.sequence
            ctg_types[rid] = ctg_type

    # Consensus outputs that are expected, grouped by contig type.
    cns_outputs = {"p": [], "h": []}

    for ctg_id in sorted(ref_seq_data.keys()):
        sequence = ref_seq_data[ctg_id]
        # Contigs are grouped in a directory named after the id prefix that
        # precedes the first "-".
        m_ctg_id = ctg_id.split("-")[0]
        wd = os.path.join(os.getcwd(), "./4-quiver/", m_ctg_id)
        mkdir(wd)
        ref_fasta = makePypeLocalFile(
            os.path.join(wd, "{ctg_id}_ref.fa".format(ctg_id=ctg_id)))
        read_sam = makePypeLocalFile(
            os.path.join(
                os.getcwd(),
                "./4-quiver/reads/{ctg_id}.sam".format(ctg_id=ctg_id)))
        cns_fasta = makePypeLocalFile(
            os.path.join(wd, "cns-{ctg_id}.fasta.gz".format(ctg_id=ctg_id)))
        cns_fastq = makePypeLocalFile(
            os.path.join(wd, "cns-{ctg_id}.fastq.gz".format(ctg_id=ctg_id)))
        job_done = makePypeLocalFile(
            os.path.join(wd, "{ctg_id}_quiver_done".format(ctg_id=ctg_id)))

        # Contigs without a reads file get no quiver job.
        if not os.path.exists(fn(read_sam)):
            continue

        cns_outputs[ctg_types[ctg_id]].append((cns_fasta, cns_fastq))

        if not os.path.exists(fn(ref_fasta)):
            with open(fn(ref_fasta), "w") as f:
                f.write(">%s\n%s\n" % (ctg_id, sequence))

        parameters = {
            "job_uid": "q-" + ctg_id,
            "wd": wd,
            "config": config,
            "ctg_id": ctg_id,
        }
        make_quiver_task = PypeTask(
            inputs={"ref_fasta": ref_fasta, "read_sam": read_sam},
            outputs={
                "cns_fasta": cns_fasta,
                "cns_fastq": cns_fastq,
                "job_done": job_done,
            },
            parameters=parameters,
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/q_{ctg_id}".format(ctg_id=ctg_id))
        quiver_task = make_quiver_task(task_run_quiver)
        wf.addTask(quiver_task)

    wf.refreshTargets()
    # Fixed delay before collecting the outputs (presumably to let a shared
    # file system settle -- confirm).
    time.sleep(30)

    mkdir("./4-quiver/cns_output")
    for ctg_type in ("p", "h"):
        merged = {
            "fasta": "./4-quiver/cns_output/cns_%s_ctg.fasta" % ctg_type,
            "fastq": "./4-quiver/cns_output/cns_%s_ctg.fastq" % ctg_type,
        }
        # Start from a clean slate because the results are appended below.
        # Best effort, like the `rm` call this replaces.
        for path in (merged["fasta"], merged["fastq"]):
            try:
                os.remove(path)
            except OSError:
                pass

        for cns_fasta, cns_fastq in sorted(cns_outputs[ctg_type]):
            os.system("zcat {src} >> {dst}".format(
                src=fn(cns_fasta), dst=merged["fasta"]))
            os.system("zcat {src} >> {dst}".format(
                src=fn(cns_fastq), dst=merged["fastq"]))
o2 = "-" overlap_length = int(edge_data[1]) overlap_idt = float(edge_data[2]) ctg_id = edge_to_ctg.get((v, w), ("NA", "NA")) link_lines.append("\t".join([ "L", r1, o1, r2, o2, "*", "ol:i:%d" % overlap_length, "oi:f:%.1f" % overlap_idt, "ci:A:%s-%s" % ctg_id ])) #f = FastaReader("../1-preads_ovl/db2falcon/preads4falcon.fasta") try: # Works with v1.7.5 f = FastaReader("../1-preads_ovl/db2falcon/preads4falcon.fasta") except: try: # Works with v1.8.2 f = FastaReader("../1-preads_ovl/preads4falcon.fasta") except: print "Bummer, this code does not work with your version of FALCON." seq_len = {} my_seq = {} for r in f: if r.name not in read_in_graph: continue seq_len[r.name] = len(r.sequence) # Store the sequences in a dictionary
p_ctg_to_phase[row[0]].setdefault( ( b_id, ph_id ), 0) p_ctg_to_phase[row[0]][ ( b_id, ph_id ) ] += 1 h_ctg_to_phase = {} with open("h_ctg_path.%s" % ctg_id) as f: for row in f: row = row.strip().split() b_id, ph_id = (int(row[-2]), int(row[-1]) ) h_ctg_to_phase.setdefault( row[0], {} ) h_ctg_to_phase[row[0]].setdefault( ( b_id, ph_id ), 0) h_ctg_to_phase[row[0]][ ( b_id, ph_id ) ] += 1 h_ids = open("h_ctg_ids.%s" % ctg_id,"w") with open("h_ctg.%s.fa" % ctg_id, "w") as f: h_tig_all = FastaReader("h_ctg_all.%s.fa" % ctg_id) for r in h_tig_all: p_ctg_phase = p_ctg_to_phase.get(r.name.split("_")[0], {}) if len(r.sequence) < 500: continue if r.name in filter_out: edge_count = sum(h_ctg_to_phase[ r.name ].values()) same_phase_to_p_ctg_count = 0 for b_id, ph_id in h_ctg_to_phase[ r.name ]: if b_id != -1: if (b_id, ph_id) in p_ctg_phase: same_phase_to_p_ctg_count += h_ctg_to_phase[ r.name ][ (b_id, ph_id) ] unphased_edge_count = h_ctg_to_phase[ r.name ] .get( (-1, 0), 0 )
# Jason Chin # Pacific Biosciences # 2016 from falcon_kit.FastaReader import FastaReader import os primary_contigs = {} f = FastaReader("cns_p_ctg.fasta") for r in f: rname = r.name.split("|")[0] primary_contigs.setdefault( rname, (r.sequence, [])) primary_contigs.setdefault( "NA", ("", [])) f = FastaReader("cns_h_ctg.fasta") all_h_ctg = set() for r in f: rname = r.name.split("|")[0] p_name = rname.split("_")[0] all_h_ctg.add(rname) if p_name in primary_contigs: primary_contigs[p_name][1].append( (rname, r.sequence) ) #print rname, p_name else: primary_contigs["NA"][1].append( (rname, r.sequence) ) data = [] place_h_ctg = set() for ctg in primary_contigs: