def screen_maf(qa_file, maf_file):
    """
    Screen the .maf file based on the cluster info in the qa_file.

    Clusters are loaded from ``qa_file`` with ``read_clusters``; every anchor
    whose last field (its score) is non-zero is kept as a "screened" anchor.
    Each record of ``maf_file`` is turned into a list of
    ``(src, start, end, strand, score)`` tuples, passed through
    ``alignment_to_cluster``, and copied to ``maf_file + ".filtered"`` when
    the first element of the result is one of the screened anchors.

    A one-line summary is written to stderr.  Nothing is returned.
    """
    clusters = read_clusters(qa_file)
    filtered_maf = maf_file + ".filtered"

    # Anchors with a zero score are the ones to drop.
    screened_alignments = {
        anchor
        for cluster in clusters
        for anchor in cluster
        if anchor[-1] != 0
    }

    # Context managers make sure the output is flushed and both handles are
    # released even if a record fails to parse.  The output file is opened
    # only after the reader has been created, as before.
    with open(maf_file) as fp:
        reader = maf.Reader(fp)
        with open(filtered_maf, "w") as fw:
            writer = maf.Writer(fw)
            for rec in reader:
                alignment = [
                    (c.src, c.forward_strand_start, c.forward_strand_end,
                     c.strand, rec.score)
                    for c in rec.components
                ]
                cluster = alignment_to_cluster(alignment)
                if cluster[0] in screened_alignments:
                    writer.write(rec)

    # NOTE(review): the count reported is the number of screened anchors,
    # not the number of records actually written -- confirm this is intended.
    sys.stderr.write(
        "write (%d) alignments to '%s'\n"
        % (len(screened_alignments), filtered_maf)
    )
def build_index(self, filename, indexfile):
    """
    Build an interval index for the MAF file ``filename``.

    Recipe from Brad Chapman's blog
    <http://bcbio.wordpress.com/2009/07/26/sorting-genomic-alignments-using-python/>

    For every record, each component's ``(src, forward_strand_start,
    forward_strand_end)`` is registered together with the file offset at
    which the record starts.  The resulting index is written to
    ``indexfile``.

    ``self`` is accepted but not used by this function.
    """
    indexes = interval_index_file.Indexes()

    with open(filename) as in_handle:
        reader = maf.Reader(in_handle)
        while True:
            # Remember where the record starts *before* reading it.
            pos = reader.file.tell()
            rec = next(reader)
            if rec is None:
                break
            for c in rec.components:
                indexes.add(
                    c.src,
                    c.forward_strand_start,
                    c.forward_strand_end,
                    pos,
                    max=c.src_size,
                )

    # Binary mode: the bx-python interval index is a packed binary format
    # (see bx.interval_index_file); a text-mode handle rejects bytes on
    # Python 3 and can mangle them on Windows.
    with open(indexfile, "wb") as index_handle:
        indexes.write(index_handle)
def maf_to_blast8(f):
    """
    Print one tab-separated line per record of the MAF file ``f``.

    Each record must have exactly two components (query, subject).  The
    twelve fields printed to stdout are: query src, subject src, percent
    identity, alignment text length, mismatches, gaps, query start, query
    end, subject start, subject end, e-value (``%.2g``) and bit score
    (``%.1f``).  Identity/mismatch/gap figures come from
    ``alignment_details``; the e-value and bit score are derived from the
    record score with ``blastz_score_to_ncbi_expectation`` and
    ``blastz_score_to_ncbi_bits``.

    If the file cannot be opened or the reader cannot be created, a warning
    is written to stderr and the function returns ``None`` without printing
    any records.
    """
    sys.stderr.write("reading %s\n" % f)

    handle = None
    try:
        handle = open(f)
        reader = maf.Reader(handle)
    except Exception:
        # Best effort: skip unreadable inputs, but unlike a bare ``except``
        # let KeyboardInterrupt / SystemExit propagate.
        if handle is not None:
            handle.close()
        sys.stderr.write("[warning] %s not readable\n" % f)
        return

    try:
        for rec in reader:
            a, b = rec.components
            evalue = blastz_score_to_ncbi_expectation(rec.score)
            bits = blastz_score_to_ncbi_bits(rec.score)
            pctid, nmismatch, ngaps = alignment_details(a.text, b.text)
            fields = (
                a.src, b.src, pctid, len(a.text), nmismatch, ngaps,
                a.forward_strand_start, a.forward_strand_end,
                b.forward_strand_start, b.forward_strand_end,
                "%.2g" % evalue, "%.1f" % bits,
            )
            print("\t".join(str(x) for x in fields))
    finally:
        handle.close()
def main(in_file):
    """
    Write the records of MAF file ``in_file`` to ``<base>-sorted<ext>``,
    ordered by decreasing ``text_size``.

    An index file ``in_file + ".index"`` is built with ``build_index`` if it
    does not exist yet.  The input is scanned once to collect
    ``(text_size, offset)`` for every record; records are then fetched by
    offset through ``maf.Indexed`` and written in sorted order.
    """
    base, ext = os.path.splitext(in_file)
    out_file = "%s-sorted%s" % (base, ext)
    index_file = in_file + ".index"
    if not os.path.exists(index_file):
        build_index(in_file, index_file)

    # Pull out the size and file position of each record.
    rec_info = []
    with open(in_file) as in_handle:
        reader = maf.Reader(in_handle)
        while True:
            pos = reader.file.tell()
            # Builtin next() works on both Python 2 and 3 readers.
            rec = next(reader)
            if rec is None:
                break
            rec_info.append((rec.text_size, pos))
    # Tuples sort by size first, then by offset; both descending.
    rec_info.sort(reverse=True)

    # Write the records in order, pulling each one from the index.
    index = maf.Indexed(in_file, index_file)
    with open(out_file, "w") as out_handle:
        writer = maf.Writer(out_handle)
        for _size, pos in rec_info:
            writer.write(index.get_at_offset(pos))
def __main__():
    """
    Convert a MAF file to FASTA-like output, one group of sequences per block.

    Command-line arguments (``sys.argv``):
      1. input MAF filename
      2. output filename
      3. comma-separated species list; if the list contains ``"None"`` the
         blocks are not restricted to any species
      4. ``"partial_disallowed"`` to skip blocks that have fewer components
         than the number of listed species; any other value keeps them

    For each kept block, every component is written as a header line
    ``>src(strand):start-end|species_blocknumber`` followed by its text, and
    the block is terminated by an empty line.  Block numbers count every
    block read, including skipped ones.
    """
    print("Restricted to species: %s" % sys.argv[3])

    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    species = sys.argv[3].split(',')
    partial = sys.argv[4]
    num_species = len(species)
    restrict = "None" not in species

    # Context managers close both files even if a block fails to parse.
    with open(input_filename, 'r') as file_in:
        maf_reader = maf.Reader(file_in)
        with open(output_filename, 'w') as file_out:
            for block_num, block in enumerate(maf_reader):
                if restrict:
                    block = block.limit_to_species(species)
                components = block.components
                if (len(components) < num_species
                        and partial == "partial_disallowed"):
                    continue
                for c in components:
                    spec, _chrom = maf.src_split(c.src)
                    file_out.write(
                        ">%s(%s):%s-%s|%s_%s\n"
                        % (c.src, c.strand, c.start, c.end, spec, block_num)
                    )
                    file_out.write(c.text + "\n")
                file_out.write("\n")
def main(options, args):
    """
    Filter the MAF file ``args[0]`` and write the surviving records to
    ``<base>-filtered<ext>``.

    An index ``args[0] + ".index"`` is built with ``build_index`` if missing.
    Every component of every record becomes a ``Weighted_interval`` (weight =
    record score) and contributes a left and a right endpoint to its
    chromosome's endpoint list.  ``interval_chain`` is run per chromosome (in
    sorted order) and the ids it returns are unioned into the set of records
    to keep; those records are then fetched by file offset and written out.

    ``options`` is accepted but not used here.
    """
    in_file = args[0]
    base, ext = os.path.splitext(in_file)
    out_file = "%s-filtered%s" % (base, ext)
    index_file = in_file + ".index"
    if not os.path.exists(index_file):
        build_index(in_file, index_file)
    index = maf.Indexed(in_file, index_file)

    intervals = []  # one entry per component, in reading order
    endpoints = collections.defaultdict(list)  # chromosome => list of endpoints
    rec_info = []  # (record number, file offset) for every record read
    j = 0  # running component counter, used as the interval id

    with open(in_file) as fp:
        reader = maf.Reader(fp)
        while True:
            pos = reader.file.tell()
            rec = next(reader)
            if rec is None:
                break
            # Recorded only for real records: appending before the EOF check
            # left a trailing entry pointing at the end of the file.
            # NOTE(review): j // 2 is the record number only if every record
            # has exactly two components -- confirm against the inputs.
            rec_info.append((j // 2, pos))
            for c in rec.components:
                chromosome = c.src
                left = c.forward_strand_start
                right = c.forward_strand_end
                weight = rec.score
                intervals.append(
                    Weighted_interval(chromosome, left, right, weight))
                endpoints[chromosome].append((left, j, -weight))  # left end
                endpoints[chromosome].append((right, j, weight))  # right end
                j += 1

    filtered_rec = set()
    for chromosome in sorted(endpoints):
        v = endpoints[chromosome]
        # Two endpoints per interval; // keeps this an int on Python 3 too.
        print("%s : start with %d intervals" % (chromosome, len(v) // 2))
        filtered_rec |= interval_chain(intervals, v)

    print("filtered alignment size %d" % len(filtered_rec))

    with open(out_file, "w") as fw:
        writer = maf.Writer(fw)
        for rec_id, pos in rec_info:
            if rec_id in filtered_rec:
                writer.write(index.get_at_offset(pos))
def __main__():
    """
    Convert a MAF file into FASTA-style output, block by block.

    Reads from ``sys.argv``: input MAF path, output path, a species option
    (parsed by ``maf_utilities.parse_species_option``) and a "keep partial"
    flag.  Any failure while obtaining one of these is reported through
    ``maf_utilities.tool_fail``.

    When a species list is given, each block is limited to those species and,
    if the flag equals ``"partial_disallowed"``, blocks containing fewer
    species than requested are skipped.  Every component of a kept block is
    written as a header from ``maf_utilities.get_fasta_header`` followed by
    its text; an empty line ends each block.
    """
    try:
        maf_reader = maf.Reader(open(sys.argv[1]))
    except Exception as e:
        maf_utilities.tool_fail("Error opening input MAF: %s" % e)

    try:
        file_out = open(sys.argv[2], 'w')
    except Exception as e:
        maf_utilities.tool_fail("Error opening file for output: %s" % e)

    try:
        species = maf_utilities.parse_species_option(sys.argv[3])
        num_species = len(species) if species else 0
    except Exception as e:
        maf_utilities.tool_fail("Error determining species value: %s" % e)

    try:
        partial = sys.argv[4]
    except Exception as e:
        maf_utilities.tool_fail("Error determining keep partial value: %s" % e)

    if not species:
        print("Not restricted to species.")
    else:
        print("Restricted to species: %s" % ', '.join(species))

    for block_num, block in enumerate(maf_reader):
        if species:
            block = block.limit_to_species(species)
            found = len(maf_utilities.get_species_in_block(block))
            if found < num_species and partial == "partial_disallowed":
                continue

        # Per-block counter: 0 for the first sequence of a species, then 1, 2, ...
        spec_counts = {}
        for component in block.components:
            spec, chrom = maf_utilities.src_split(component.src)
            spec_counts[spec] = spec_counts.get(spec, -1) + 1
            seq_index = spec_counts[spec]

            header_fields = {
                'block_index': block_num,
                'species': spec,
                'sequence_index': seq_index,
            }
            header = maf_utilities.get_fasta_header(
                component,
                header_fields,
                suffix="%s_%i_%i" % (spec, block_num, seq_index),
            )
            file_out.write("%s\n" % header)
            file_out.write("%s\n" % component.text)
        file_out.write("\n")

    file_out.close()
def main(options, args):
    """
    Keep only the MAF records that align ``chr1`` against ``chr2``.

    ``args`` must hold exactly three items: the input MAF path, ``chr1`` and
    ``chr2``.  A record is copied to
    ``<base>.<chr1>_vs_<chr2>_filtered<ext>`` when the ``src`` of its first
    two components equals ``(chr1, chr2)`` in either order.

    ``options`` is accepted but not used here.
    """
    in_file, chr1, chr2 = args
    base, ext = os.path.splitext(in_file)
    out_file = "%s.%s_vs_%s_filtered%s" % (base, chr1, chr2, ext)

    wanted = ((chr1, chr2), (chr2, chr1))

    # Both handles were previously left open, so the output was only flushed
    # whenever the interpreter happened to collect it.
    with open(in_file) as fp, open(out_file, "w") as fw:
        reader = maf.Reader(fp)
        writer = maf.Writer(fw)
        for rec in reader:
            pair = (rec.components[0].src, rec.components[1].src)
            if pair in wanted:
                writer.write(rec)
def main():
    """
    Scan a MAF file for positions where a motif scores above threshold.

    Command-line arguments (``sys.argv``): the target motif, the input MAF
    path and a comma-separated species list.  With fewer than three
    arguments a usage line is printed to stderr and the process exits with
    status 0.

    For every block and every ``(scoremax, width, headers)`` produced by
    ``MafMotifScorer``, each offset in ``range(width)`` at which at least one
    species scores above 0 yields one output line: chromosome (second
    dot-separated field of the first header's src), reference start,
    reference end, the quoted motif, and the per-species scores formatted
    with ``%.2f``.  The reference start is the header start plus the offset,
    minus the number of ``-`` characters in the first component's text
    before that offset.
    """
    if len(sys.argv) < 4:
        print("%s motif inmaf spec1,spec2,... " % sys.argv[0], file=sys.stderr)
        sys.exit(0)

    targmotif = sys.argv[1]
    species = sys.argv[3].split(',')
    n_species = len(species)
    threshold = 0

    # The context manager closes the input file, which used to stay open.
    with open(sys.argv[2]) as inmaf:
        for block in align_maf.Reader(inmaf):
            reftext = block.components[0].text

            # maf block scores for each matrix
            for scoremax, width, headers in MafMotifScorer(
                    species, block, targmotif):
                # Chromosome and start come from the scorer's first header;
                # the values read from the block itself were always
                # overwritten, so they are no longer computed.
                mafsrc, mafstart, _mafend = headers[0]
                mafchrom = mafsrc.split('.')[1]

                for offset in range(width):
                    # Scores of all species at this offset.
                    column = [scoremax[i][offset] for i in range(n_species)]
                    if not any(score > threshold for score in column):
                        continue
                    # Gap characters do not consume reference coordinates.
                    refstart = mafstart + offset - reftext.count('-', 0, offset)
                    refend = refstart + len(targmotif)
                    data = " ".join("%.2f" % score for score in column)
                    # quote the motif
                    print(mafchrom, refstart, refend,
                          "'" + targmotif + "'", data)