def remove_contained(alignments):
    '''Removes alignments contained within other alignments.

    @param alignments iterable of alignments (sg/eg give start/end)
    @return list of alignments with contained alignments removed
    '''
    logf = compose(log, partial(add, "remove_contained: "))
    alignments = make_list(alignments)
    logf("Starting Alignments: %d" % len(alignments))
    logf("Sorting")
    # Stable double sort: start ascending, ties broken by end descending.
    end_sorted = sorted(alignments, key=eg, reverse=True)
    alignments = sorted(end_sorted, key=sg)
    logf("Searching")
    # After the sort every earlier alignment j has sg(j) <= sg(i), so
    # alignment i is contained in some earlier alignment exactly when the
    # running maximum end seen so far is >= eg(i).  This single pass
    # replaces the original O(n^2) pairwise scan with identical results
    # (ties: the first of identical alignments is kept, later ones removed,
    # matching the original's >=/<= containment test).
    filtered = []
    max_end = None
    for aln in alignments:
        end = eg(aln)
        if max_end is not None and end <= max_end:
            continue  # contained in a previously seen alignment
        filtered.append(aln)
        max_end = end
    logf("Filtered Alignments: %d" % len(filtered))
    return filtered
def greedy_repeat_filter(alignment_iterable, final_sort_key=None):
    '''Takes alignments; if two alignments have the same start or end
    position, keep only the first after sorting (the longest, since ties
    are ordered end-descending).

    @param alignment_iterable iterable of alignments
    @param final_sort_key optional key to break ties; larger values are
           better (sorted first, so they win the greedy pick)
    @return list of filtered alignments
    '''
    logf = compose(log, partial(add, "greedy_repeat_filter: "))
    s_sorted = make_list(alignment_iterable)
    logf("Starting Alignments: %d" % len(s_sorted))
    if final_sort_key:
        # BUG FIX: this previously sorted alignment_iterable again, which
        # is already consumed by make_list when the caller passes an
        # iterator (yielding an empty result).  Sort the materialized
        # list instead.
        s_sorted = sorted(s_sorted, key=final_sort_key, reverse=True)
    s_sorted = sorted(s_sorted, key=eg, reverse=True)
    s_sorted = sorted(s_sorted, key=sg)
    # Keep the first alignment of every group sharing a start position...
    filtered_alignments = imap(itemgetter(0), group(sg, s_sorted))
    e_sorted = sorted(filtered_alignments, key=eg, reverse=True)
    # ...then the first of every group sharing an end position.
    filtered = map(itemgetter(0), group(eg, e_sorted))
    logf("Filtered Alignments %d " % len(filtered))
    return filtered
def group(key_func, alignment_iterable):
    '''Group consecutive alignments that share the same key_func value.

    Yields each group as a list; the keys themselves are dropped.
    Input must already be sorted/clustered by key_func (groupby contract).
    '''
    for _, members in groupby(alignment_iterable, key=key_func):
        yield list(members)
def disabled_test_LIS(config):
    '''Builds (but does not run) an LIS pass over a bundled blast6 file.

    Returns a zero-argument callable so the test stays disabled until
    explicitly invoked.
    '''
    data_dir = config.get("test_data_path")
    aln_path = os.path.join(
        data_dir, "channel_286_read_45_1406145606_2D.blast6.gz")
    read_alignments = compose(blast_record_iterator, ioffe)
    funcs = alignment_functions(attrgetter("sstart"), attrgetter("send"))

    def run():
        return funcs.LIS(funcs.score_getter_matching_consensus_estimated,
                         read_alignments(aln_path))
    return run
def coverage_from_blast6():
    '''Command-line entry point: reads a blast6 file (sys.argv[1]), keeps
    the best-scoring non-overlapping alignments per query, then, per
    reference, prints per-base coverage and computes zero-coverage and
    low-identity (<95 mean pctid) base masks.
    '''
    if not len(sys.argv) == 2:
        sys.exit("coverage_from_blast6 in.blast6")
    raw_alignment_it = blast_record_iterator(iterator_over_file(sys.argv[1]))
    # Per qname group, keep the best-scoring non-overlapping alignments
    # (intervals from qstart/qend, scored by bitscore).
    lno = partial(best_scoring_non_overlapping, attrgetter("qstart"), attrgetter("qend"), attrgetter("bitscore"))
    q_filt_alignment_it = chain.from_iterable( imap(compose(lno, itemgetter(1)), groupby(raw_alignment_it, attrgetter("qname"))))
    #read all alignments into memory
    ref_sorted_alignments = sorted(q_filt_alignment_it, key=attrgetter("sname"))
    for reference,alignments in groupby(ref_sorted_alignments, attrgetter("sname")):
        alignments = list(alignments)
        # Reference length taken from the first record's .slen -- assumes
        # all records in the group agree (TODO confirm).
        ref_len = alignments[0].slen
        # blast coordinates are 1-based; convert to 0-based.
        blast_start_getter = lambda a: a.sstart-1
        blast_end_getter = lambda a: a.send-1
        cov_arr = coverage_array_from_ranges(alignments, ref_len, blast_start_getter, blast_end_getter)
        # Print (position, coverage) tuples; filter() is used only to
        # drive the iteration (print returns None, so nothing is kept).
        filter(print, izip(count(1),cov_arr))
        #mark the regions with 0 coverage
        zerocov = map(lambda x: 1 if x==0 else 0, cov_arr)
        zerocov_regions = get_marked_ranges(zerocov)
        region_printer = compose(print,lambda (x,(y,z)) : "\t".join(map(str,[x,y,z])))
        # NOTE(review): zerocov_regions/region_printer are computed but
        # their printing call below is commented out.
        #filter(region_printer, izip(repeat(reference), zerocov_regions))
        ##Get Low ID regions
        # NOTE(review): ranges_w_id is computed but never used below.
        ranges_w_id = imap(compose(lambda (x,y,i) : (x-1,y-1,i), attrgetter("sstart","send","pctid")), alignments)
        # Accumulate (sum of pctid, alignment count) per base.
        pct_arr = coverage_array_from_ranges(alignments, ref_len, blast_start_getter, blast_end_getter, lambda r, (o_pid,o_cnt): (r.pctid+o_pid, o_cnt+1), (0,0))
        # Mark bases whose mean identity falls below 95%.
        lowid = map(lambda (c_pid,cnt): 1 if cnt != 0 and c_pid/cnt < 95.0 else 0, pct_arr)
        # NOTE(review): lowid_regions is computed but never printed or
        # returned -- looks like unfinished work; confirm intent.
        lowid_regions = get_marked_ranges(lowid)
def blast6filter_main(cmdline_args = None):
    '''Command-line driver: filters a blast6 file down to a best set of
    alignments and prints them as tab-separated records on stdout.

    @param cmdline_args argv-style list [prog, task, infile]; defaults to
           sys.argv.  task starting with "r" groups by reference (sname)
           and filters in subject coordinates (variants: r_noover,
           r_experimental); anything else groups by query (qname).
    @return usage string on bad arguments, otherwise None

    NOTE(review): this function is defined a second time later in this
    file; the later definition shadows this one at import time.
    '''
    if not cmdline_args:
        import sys
        cmdline_args = sys.argv
    if not len(cmdline_args) == 3:
        return "blast6filter q/r_cons/r_noover input.blast6 -- Make sure input is sorted by q/r first"
    task,infile = cmdline_args[1:3]
    fileit = iterator_over_file(infile)
    alignment_getter = blast_record_iterator(fileit)
    if task.startswith("r"):
        # Reference-grouped: filter in subject (sstart/send) coordinates.
        grouped_alns = alignment_grouper(attrgetter("sname"), alignment_getter)
        aln_funcs = alignment_functions(attrgetter("sstart"), attrgetter("send"))
        score_func = aln_funcs.score_getter_matching_consensus_estimated
        greedy_repeat_filt = partial(aln_funcs.greedy_repeat_filter,
                                     final_sort_key=attrgetter("pctid"))
        def remove_self(alns):
            # Drop self-hits (query aligned to itself).
            a = list(alns)
            # BUG FIX: len() was taken on 'alns', which may be an already
            # consumed iterator (groupby groups are); measure the
            # materialized list instead.
            log("Remove Self: Working on %d" % len(a))
            filtered = filter(lambda y: not y.qname == y.sname, a)
            log("Remove Self: Filtered alignments: %d" % len(filtered))
            return filtered
        # NOTE(review): for the plain "r"/"r_cons" task, LIS yields
        # (flag, LIS_t, alignment) triples that record_to_string below
        # cannot format -- confirm the intended task names.
        lis = compose(partial(aln_funcs.LIS, score_func),
                      aln_funcs.remove_contained,
                      greedy_repeat_filt,
                      remove_self)
        best = imap(lis, grouped_alns)
        if task == "r_noover":
            score_func = aln_funcs.score_getter_penalize_overlap_estimated
            lis = compose(partial(aln_funcs.LIS, score_func),
                          aln_funcs.remove_contained)
            # itemgetter(2) pulls the alignment out of each LIS triple.
            best = imap(compose(partial(map, itemgetter(2)), lis), grouped_alns)
        if task == "r_experimental":
            lis = compose(greedy_repeat_filt, remove_self)
            best = imap(lis, grouped_alns)
    else:
        # Query-grouped: filter in query (qstart/qend) coordinates.
        grouped_alns = alignment_grouper(attrgetter("qname"), alignment_getter)
        aln_funcs = alignment_functions(attrgetter("qstart"), attrgetter("qend"))
        lis = compose(partial(aln_funcs.LIS,
                              aln_funcs.score_getter_penalize_overlap_estimated),
                      aln_funcs.remove_contained)
        best = imap(compose(partial(map, itemgetter(2)), lis), grouped_alns)
    # filter() drives the iteration; print returns None so nothing is kept.
    filter(print, imap(record_to_string, chain.from_iterable(best)))
def LIS(score_getter, alignments):
    '''Dynamic-programming best-chain selection over alignments.

    score_getter(prev, cur) returns the score of appending cur after prev
    (prev is None when cur starts a chain); pick one of the scoring
    functions defined above.

    @return list of (kept_flag, LIS_t, alignment) triples for the members
            of the best-scoring chain (kept_flag is True for every entry)
    '''
    logf = compose(log, partial(add, "LIS:"))
    alignments = make_list(alignments)
    logf("Starting Alignments: %d" % len(alignments))
    if len(alignments) == 0:
        return []
    logf("Sorting")
    # Stable double sort: start ascending, ties broken by end descending.
    by_end = sorted(alignments, key=eg, reverse=True)
    alns = sorted(by_end, key=sg)
    logf("Starting DP")
    # Each DP cell holds (best chain score ending here, predecessor index).
    lis = [LIS_t(score_getter(None, aln), -1) for aln in alns]
    for cur in xrange(len(alns)):
        for prev in xrange(cur):
            ##Score getter needs to know about how it's being used (Oh well)
            cand = lis[prev].score + score_getter(alns[prev], alns[cur])
            if cand > lis[cur].score:
                lis[cur] = LIS_t(cand, prev)
    # Trace back from the best-scoring cell, marking chain members.
    best_idx, _ = max(enumerate(lis), key=itemgetter(1))
    keep = [False] * len(alns)
    pos = best_idx
    while True:
        keep[pos] = True
        pos = lis[pos].prev
        if pos == -1:
            break
    filtered = [triple for triple in zip(keep, lis, alns) if triple[0]]
    logf("Filtered Alignments: %d" % len(filtered))
    return filtered
def correct_oxford(reads_fn=None, alignments_fn=None):
    '''Corrects oxford (nanopore) reads: builds an alignment graph per raw
    read and prints each consensus as FASTA on stdout.

    @param reads_fn fasta of raw reads; taken from sys.argv if omitted
    @param alignments_fn blast6 alignments, sorted by sname (the raw read
           name, second column) so groupby sees each read contiguously
    '''
    log = logger(sys.stderr)
    # BUG FIX: the original tested the undefined name
    # 'alignments_blast6_fn' (the parameter is 'alignments_fn'), raising
    # NameError whenever both arguments were supplied.
    if not reads_fn or not alignments_fn:
        if not len(sys.argv) == 3:
            sys.exit("correct.py raw_reads.fa alignments.blast6")
        (reads_fn, alignments_fn) = sys.argv[1:3]
    log("Reading raw reads into memory")
    #just put all reads in memory
    fastas = compose(fasta_iterator, iterator_over_file)(reads_fn)
    raw_reads = dict(map(attrgetter("name", "seq"), fastas))
    log("Reading raw reads DONE :)")
    #The alignments need to be sorted by the long read name (second column)
    alignment_it = line_record_iterator(Blast6SeqRecord, Blast6SeqTypes,
                                        iterator_over_file(alignments_fn))
    important_field_getter = attrgetter("qname", "sname", "qstart", "qend",
                                        "sstart", "send", "qseq", "sseq")
    for readname, alignments in groupby(alignment_it, attrgetter("sname")):
        log("Working on %s" % readname)
        raw_read_seq = raw_reads.get(readname)
        if not raw_read_seq:
            log("Can not find sequence for %s" % readname)
            continue
        log("Raw Read Length: %d" % len(raw_read_seq))
        g = AlnGraph(raw_read_seq)
        alignments = imap(important_field_getter, alignments)
        num_alignments = 0
        for qname, sname, qstart, qend, sstart, send, qseq, sseq in alignments:
            #blast alignments are one based, convert to 0 based
            (qstart, qend) = (qstart - 1, qend - 1)
            (sstart, send) = (sstart - 1, send - 1)
            #reverse complement, must switch the alignment strings
            if send < sstart:
                (qseq, sseq) = tuple(map(reverse_complement, [qseq, sseq]))
                send, sstart = sstart, send
            (qseq, sseq) = convert_mismatches(qseq, sseq)
            try:
                alignment_tuple = ((qstart, qend, qseq),
                                   (sstart, send, sseq), qname)
                g.add_alignment(alignment_tuple)
            except Exception as e:
                # Best effort: one malformed alignment must not kill the
                # whole run -- log and move on. (Typo "Alignmented" fixed.)
                log("Add Alignment Error: %s" % e)
                continue
            if num_alignments > TOO_MANY_ALIGNMENTS:
                break
            num_alignments += 1
        log("Processed Alignments: %d" % num_alignments)
        if num_alignments > TOO_MANY_ALIGNMENTS:
            log("Too Many Alignments, Skipping")
            continue
        log("Generating Consensus")
        consensus = g.generate_all_consensus(min_cov=0)[0]
        log("Consensus Length %d" % len(consensus[0]))
        log("%s Done\n\n" % readname)
        #log("Output dag info")
        #output_dag_info(g, "g.info")
        # Parenthesized print: same output as the original py2 print
        # statement, and also valid if print_function is in effect
        # (the module elsewhere uses print as a function).
        print(">" + readname + "_consensus")
        print(consensus[0])
def blast6filter_main(cmdline_args=None):
    '''Command-line driver: filters a blast6 file down to a best set of
    alignments and prints them as tab-separated records on stdout.

    @param cmdline_args argv-style list [prog, task, infile]; defaults to
           sys.argv.  task starting with "r" groups by reference (sname)
           and filters in subject coordinates (variants: r_noover,
           r_experimental); anything else groups by query (qname).
    @return usage string on bad arguments, otherwise None

    NOTE(review): this is a near-verbatim duplicate of an earlier
    definition in this file; being later, this one wins at import time.
    Consider deleting one copy.
    '''
    if not cmdline_args:
        import sys
        cmdline_args = sys.argv
    if not len(cmdline_args) == 3:
        return "blast6filter q/r_cons/r_noover input.blast6 -- Make sure input is sorted by q/r first"
    task, infile = cmdline_args[1:3]
    fileit = iterator_over_file(infile)
    alignment_getter = blast_record_iterator(fileit)
    if task.startswith("r"):
        # Reference-grouped: filter in subject (sstart/send) coordinates.
        grouped_alns = alignment_grouper(attrgetter("sname"), alignment_getter)
        aln_funcs = alignment_functions(attrgetter("sstart"), attrgetter("send"))
        score_func = aln_funcs.score_getter_matching_consensus_estimated
        greedy_repeat_filt = partial(aln_funcs.greedy_repeat_filter,
                                     final_sort_key=attrgetter("pctid"))
        def remove_self(alns):
            # Drop self-hits (query aligned to itself).
            a = list(alns)
            # BUG FIX: len() was taken on 'alns', which may be an already
            # consumed iterator (groupby groups are); measure the
            # materialized list instead.
            log("Remove Self: Working on %d" % len(a))
            filtered = filter(lambda y: not y.qname == y.sname, a)
            log("Remove Self: Filtered alignments: %d" % len(filtered))
            return filtered
        lis = compose(partial(aln_funcs.LIS, score_func),
                      aln_funcs.remove_contained,
                      greedy_repeat_filt,
                      remove_self)
        best = imap(lis, grouped_alns)
        if task == "r_noover":
            score_func = aln_funcs.score_getter_penalize_overlap_estimated
            lis = compose(partial(aln_funcs.LIS, score_func),
                          aln_funcs.remove_contained)
            # itemgetter(2) pulls the alignment out of each LIS triple.
            best = imap(compose(partial(map, itemgetter(2)), lis), grouped_alns)
        if task == "r_experimental":
            lis = compose(greedy_repeat_filt, remove_self)
            best = imap(lis, grouped_alns)
    else:
        # Query-grouped: filter in query (qstart/qend) coordinates.
        grouped_alns = alignment_grouper(attrgetter("qname"), alignment_getter)
        aln_funcs = alignment_functions(attrgetter("qstart"), attrgetter("qend"))
        lis = compose(
            partial(aln_funcs.LIS, aln_funcs.score_getter_penalize_overlap_estimated),
            aln_funcs.remove_contained)
        best = imap(compose(partial(map, itemgetter(2)), lis), grouped_alns)
    # filter() drives the iteration; print returns None so nothing is kept.
    filter(print, imap(record_to_string, chain.from_iterable(best)))
def record_to_string(record, delim="\t"):
    '''Render a namedtuple-style record as its field values joined by
    delim (each value passed through str()).'''
    values = (str(getattr(record, field)) for field in record._fields)
    return delim.join(values)
def line_record_iterator(record, types, iterable):
    '''Convert an iterable of lines into records of the given type.

    Each line is whitespace-split, the fields are passed through
    zipmap(types, fields), and the result fed to record._make.
    Yields records lazily, one per input line.
    '''
    for line in iterable:
        yield record._make(zipmap(types, line.split()))