def test_writer(): val = StringIO() writer = maf.Writer(val, {'scoring': 'foobar'}) a = align.Alignment() a.score = 7009 a.components.append( align.Component(src="human_hoxa", start=100, size=9, strand="+", src_size=1000257, text="ACA-TTACT")) a.components.append( align.Component(src="horse_hoxa", start=120, size=10, strand="-", src_size=98892, text="ACAATTGCT")) check_component(a.components[0], "human_hoxa", 100, 9, "+", 1000257, "ACA-TTACT") check_component(a.components[1], "horse_hoxa", 120, 10, "-", 98892, "ACAATTGCT") writer.write(a) assert val.getvalue() == """##maf version=1 scoring=foobar
def test_slice():
    """Check `slice_by_component` on a hand-built alignment and on a parsed one."""

    def expect_components(alignment, expected_rows):
        # Index explicitly (rather than zip) so that a missing component
        # still raises instead of being silently skipped.
        for position, expected in enumerate(expected_rows):
            check_component(alignment.components[position], **expected)

    # Hand-built two-row alignment, sliced by the first component.
    built = align.Alignment()
    built.score = "7009"
    human = align.Component(
        src="human_hoxa", start=100, size=9, strand="+",
        src_size=100257, text="ACA-TTACT")
    horse = align.Component(
        src="horse_hoxa", start=120, size=10, strand="-",
        src_size=98892, text="ACAATTGCT")
    built.components.append(human)
    built.components.append(horse)
    expect_components(
        built.slice_by_component(0, 101, 105),
        [
            dict(src="human_hoxa", start=101, size=4, strand="+",
                 src_size=100257, text="CA-TT"),
            dict(src="horse_hoxa", start=121, size=5, strand="-",
                 src_size=98892, text="CAATT"),
        ],
    )

    # test slicing with + strand src
    reader = maf.Reader(StringIO(test_maf_3))
    parsed = next(reader)
    expect_components(
        parsed.slice_by_component(0, 40, 62),
        [
            dict(src="apple", start=40, size=22, strand="+",
                 src_size=110, text="TTCGTCACT------GTCGTAAGGGTTC"),
            dict(src="orange", start=28, size=22, strand="-",
                 src_size=100, text="TT--TCACTGCTATCGTCGTA----TTC"),
        ],
    )

    # test slicing with - strand src
    expect_components(
        parsed.slice_by_component(1, 30, 68),
        [
            dict(src="apple", start=46, size=41, strand="+",
                 src_size=110,
                 text="ACT------GTCGTAAGGGTTCAGA--CTGTCTATGTATACACAAGTTG"),
            dict(src="orange", start=32, size=38, strand="-",
                 src_size=100,
                 text="ACTGCTATCGTCGTA----TTCAGACTTCG-CTATCT------GAGTTG"),
        ],
    )

    # The reader is expected to hand back None once the input is exhausted.
    leftover = next(reader)
    assert leftover is None
def do_interval(sources, index, out, ref_src, start, end, seq_db, missing_data):
    """
    Tile the alignment blocks overlapping ``[start, end)`` on ``ref_src``
    into a single alignment and write it to ``out``.

    Blocks are laid down from lowest to highest score, so wherever blocks
    overlap in the reference the highest scoring one is used. Reference
    positions covered by no block are read from the nib file
    ``seq_db[ref_src]``.

    sources      -- source names to emit, reference first (asserted).
    index        -- object whose ``get(src, start, end)`` returns the list of
                    overlapping blocks.
    out          -- writer; receives one alignment through ``write``.
    ref_src      -- reference source name.
    start, end   -- interval on the reference.
    seq_db       -- mapping from source name to nib file path.
    missing_data -- when true, pad absent rows with ``*`` instead of ``-``.
    """
    assert sources[0].split('.')[0] == ref_src.split('.')[0], "%s != %s" % (
        sources[0].split('.')[0], ref_src.split('.')[0])

    base_len = end - start
    blocks = index.get(ref_src, start, end)
    # From low to high score. A key function replaces the Python-2-only
    # ``cmp`` comparator; both sorts are stable so the ordering is the same.
    blocks.sort(key=lambda block: block.score)

    # mask[k] is the position in ``blocks`` of the block that supplies
    # reference position start + k, or -1 when no block covers it. Later
    # (higher scoring) blocks overwrite earlier ones.
    mask = [-1] * base_len
    ref_src_size = None
    for i, block in enumerate(blocks):
        ref = block.get_component_by_src_start(ref_src)
        ref_src_size = ref.src_size
        assert ref.strand == "+"
        slice_start = max(start, ref.start)
        slice_end = min(end, ref.end)
        for j in range(slice_start, slice_end):
            mask[j - start] = i

    # One list of text chunks per source, joined at the end.
    tiled = [[] for _ in sources]
    pad_char = "*" if missing_data else "-"

    # The loop variable is deliberately not called ``index`` so that it does
    # not shadow the ``index`` parameter.
    for ss, ee, block_index in intervals_from_mask(mask):
        if block_index < 0:
            # No block covers this stretch: take the reference bases from the
            # nib file and pad every other row.
            # NOTE(review): the file is opened in the default mode as before;
            # confirm NibFile is happy with that on the interpreter in use.
            with open(seq_db[ref_src]) as nib_handle:
                ref_text = bx.seq.nib.NibFile(nib_handle).get(
                    start + ss, ee - ss)
            tiled[0].append(ref_text)
            for row in tiled[1:]:
                row.append(pad_char * (ee - ss))
        else:
            slice_start = start + ss
            slice_end = start + ee
            block = blocks[block_index]
            ref = block.get_component_by_src_start(ref_src)
            sliced = block.slice_by_component(ref, slice_start, slice_end)
            sliced = sliced.limit_to_species(sources)
            sliced.remove_all_gap_columns()
            for i, src in enumerate(sources):
                comp = sliced.get_component_by_src_start(src)
                if comp:
                    tiled[i].append(comp.text)
                else:
                    tiled[i].append(pad_char * sliced.text_size)

    a = align.Alignment()
    for i, name in enumerate(sources):
        text = "".join(tiled[i])
        size = len(text) - text.count("-")
        if i == 0:
            if ref_src_size is None:
                # No block was seen, so fall back to the nib file's length.
                with open(seq_db[ref_src]) as nib_handle:
                    ref_src_size = bx.seq.nib.NibFile(nib_handle).length
            c = align.Component(ref_src, start, end - start, "+",
                                ref_src_size, text)
        else:
            c = align.Component(name + ".fake", 0, size, "?", size, text)
        a.add_component(c)
    out.write(a)
def do_interval(sources, index, out, ref_src, start, end, seq_db, missing_data, strand):
    """
    Join together alignment blocks to create a semi human projected local
    alignment (small reference sequence deletions are kept as supported by
    the local alignment).

    Blocks overlapping ``[start, end)`` on ``ref_src`` are fetched from
    ``index`` and concatenated in the order returned. Reference stretches
    not covered by any block are read from the nib file ``seq_db[ref_src]``.
    Every other row is padded over those stretches, and over blocks in which
    that source has no component, with a fill character derived from the
    synteny status of the neighbouring components. One alignment is written
    to ``out``.

    sources      -- source names to emit, reference first (asserted).
    index        -- object whose ``get(src, start, end)`` returns the blocks.
    out          -- writer; receives one alignment through ``write``.
    ref_src      -- reference source name.
    start, end   -- interval on the reference.
    seq_db       -- mapping from source name to nib file path.
    missing_data -- accepted for interface compatibility; not used here.
    strand       -- when ``'-'`` the result is reverse complemented before
                    being written.
    """
    ref_src_size = None
    # Make sure the reference component is also the first in the source list
    assert sources[0].split('.')[0] == ref_src.split('.')[0], "%s != %s" \
        % (sources[0].split('.')[0], ref_src.split('.')[0])
    # Counter for the last reference species base we have processed
    last_stop = start
    # Rows in maf blocks come in in arbitrary order, we'll convert things
    # to the desired order of the tiled block
    source_to_index = dict((name, i) for (i, name) in enumerate(sources))
    # This gets all the maf blocks overlapping our interval of interest
    # NOTE: Unlike maf_tile we're expecting
    #       things to be single coverage in the reference species, so we won't
    #       sort by score and lay down.
    blocks = index.get(ref_src, start, end)
    # The last component seen for each species onto which we are tiling
    last_components = [None] * len(sources)
    last_status = [None] * len(sources)
    cols_needing_fill = [0] * len(sources)
    # The list of strings in which we build up the tiled alignment
    tiled_rows = ["" for _ in sources]
    # Walk the (ordered) list of blocks
    for block in blocks:
        # Check for overlap in reference species
        ref = block.get_component_by_src_start(ref_src)
        if ref.start < last_stop:
            if ref.end < last_stop:
                # Entirely behind what has already been tiled
                continue
            block = block.slice_by_component(ref, last_stop, min(end, ref.end))
            ref = block.get_component_by_src_start(ref_src)
        # Clip the block to the requested interval
        block = block.slice_by_component(
            ref, max(start, ref.start), min(end, ref.end))
        ref = block.get_component_by_src_start(ref_src)
        assert last_components[0] is None or ref.start >= last_components[0].end, \
            "MAF must be sorted and single coverage in reference species!"
        assert ref.strand == "+", \
            "MAF must have all reference species blocks on the plus strand"
        # Store the size of the reference sequence for building fake block
        if ref_src_size is None:
            ref_src_size = ref.src_size
        # Handle the reference component separately, it has no synteny status
        # but we will try to fill in missing sequence
        if ref.start > last_stop:
            # Need to fill in some reference sequence
            chunk_len = ref.start - last_stop
            # NOTE(review): the file is opened in the default mode as before;
            # confirm NibFile is happy with that on the interpreter in use.
            with open(seq_db[ref_src]) as nib_handle:
                text = bx.seq.nib.NibFile(nib_handle).get(last_stop, chunk_len)
            tiled_rows[0] += text
            for source in sources[1:]:
                cols_needing_fill[source_to_index[source]] += chunk_len
        # Do reference component
        chunk_len = len(ref.text)
        tiled_rows[0] += ref.text
        # Do each other component
        for source in sources[1:]:
            source_index = source_to_index[source]
            comp = block.get_component_by_src_start(source)
            if comp:
                # Only the status half of each synteny pair is used; the
                # pair is still unpacked so a malformed value fails here.
                if comp.synteny_left is None:
                    left_status = None
                else:
                    left_status, _ = comp.synteny_left
                if comp.synteny_right is None:
                    right_status = None
                else:
                    right_status, _ = comp.synteny_right
                # We have a component, do we need to do some filling?
                cols_to_fill = cols_needing_fill[source_index]
                if cols_to_fill > 0:
                    if left_status is None:
                        fill_char = guess_fill_char(
                            last_components[source_index], comp)
                    else:
                        fill_char = get_fill_char(left_status)
                    tiled_rows[source_index] += (fill_char * cols_to_fill)
                    cols_needing_fill[source_index] = 0
                # Okay, filled up to current position, now append the text
                tiled_rows[source_index] += comp.text
                assert len(tiled_rows[source_index]) == len(tiled_rows[0]), \
                    "length of tiled row should match reference row"
                last_components[source_index] = comp
                last_status[source_index] = right_status
            else:
                # No component, we'll have to fill this region when we know
                # the status
                cols_needing_fill[source_index] += chunk_len
        last_stop = ref.end
    # No more components, clean up the ends
    if last_stop < end:
        # Need to fill in some reference sequence
        chunk_len = end - last_stop
        with open(seq_db[ref_src]) as nib_handle:
            tiled_rows[0] += bx.seq.nib.NibFile(nib_handle).get(
                last_stop, chunk_len)
        for source in sources[1:]:
            cols_needing_fill[source_to_index[source]] += chunk_len
    # Any final filling that needs to be done?
    for source in sources[1:]:
        source_index = source_to_index[source]
        fill_needed = cols_needing_fill[source_index]
        if fill_needed > 0:
            if last_components[source_index] is None:
                # Never saw any component for this source
                fill_char = '@'
            elif last_status[source_index] is None:
                fill_char = '*'
            else:
                fill_char = get_fill_char(last_status[source_index])
            tiled_rows[source_index] += fill_char * fill_needed
        assert len(tiled_rows[source_index]) == len(tiled_rows[0]), \
            "length of tiled row should match reference row"
    # Okay, now make up the fake alignment from the tiled rows.
    tiled_rows = remove_all_gap_columns(tiled_rows)
    a = align.Alignment()
    for i, name in enumerate(sources):
        text = "".join(tiled_rows[i])
        size = len(text) - text.count("-")
        if i == 0:
            if ref_src_size is None:
                # No block was seen, so fall back to the nib file's length.
                with open(seq_db[ref_src]) as nib_handle:
                    ref_src_size = bx.seq.nib.NibFile(nib_handle).length
            c = align.Component(ref_src, start, end - start, "+",
                                ref_src_size, text)
        else:
            c = align.Component(name + ".fake", 0, size, "?", size, text)
        a.add_component(c)
    if strand == '-':
        a = a.reverse_complement()
    out.write(a)