Beispiel #1
0
def test_writer():
    """Write a hand-built two-component alignment through ``maf.Writer``.

    The alignment is assembled component by component, each component is
    verified with ``check_component``, and the block is then written into
    an in-memory ``StringIO`` buffer so the produced text can be compared
    against the expected MAF output.
    """

    # In-memory sink for the writer; the dict passed alongside it carries
    # the 'scoring' attribute.
    val = StringIO()
    writer = maf.Writer(val, {'scoring': 'foobar'})

    # Empty alignment block with a numeric score.
    a = align.Alignment()
    a.score = 7009

    # First row: plus-strand component whose text contains one gap column.
    a.components.append(
        align.Component(src="human_hoxa",
                        start=100,
                        size=9,
                        strand="+",
                        src_size=1000257,
                        text="ACA-TTACT"))
    # Second row: minus-strand component without gaps.
    # NOTE(review): size=10 is declared here although the text holds only
    # 9 bases -- confirm the mismatch is intentional.
    a.components.append(
        align.Component(src="horse_hoxa",
                        start=120,
                        size=10,
                        strand="-",
                        src_size=98892,
                        text="ACAATTGCT"))

    # The stored components must echo exactly the values they were built with.
    check_component(a.components[0], "human_hoxa", 100, 9, "+", 1000257,
                    "ACA-TTACT")
    check_component(a.components[1], "horse_hoxa", 120, 10, "-", 98892,
                    "ACAATTGCT")

    # Serialize the block into the buffer.
    writer.write(a)

    assert val.getvalue() == """##maf version=1 scoring=foobar
Beispiel #2
0
def test_slice():
    """Exercise ``Alignment.slice_by_component`` on built and parsed blocks.

    Three slices are checked: one on a hand-built two-row alignment, and
    two on the first block read from ``test_maf_3`` -- once slicing by the
    plus-strand component and once by the minus-strand component.  After
    that block, the next read from the reader must return ``None``.
    """
    # Hand-built alignment: slice by coordinates of the first component.
    manual = align.Alignment()
    manual.score = "7009"
    human = align.Component(
        src="human_hoxa", start=100, size=9, strand="+",
        src_size=100257, text="ACA-TTACT",
    )
    manual.components.append(human)
    horse = align.Component(
        src="horse_hoxa", start=120, size=10, strand="-",
        src_size=98892, text="ACAATTGCT",
    )
    manual.components.append(horse)

    piece = manual.slice_by_component(0, 101, 105)

    check_component(
        piece.components[0],
        src="human_hoxa", start=101, size=4, strand="+",
        src_size=100257, text="CA-TT",
    )
    check_component(
        piece.components[1],
        src="horse_hoxa", start=121, size=5, strand="-",
        src_size=98892, text="CAATT",
    )

    # Parsed alignment: slice by the component that lies on the + strand.
    reader = maf.Reader(StringIO(test_maf_3))
    parsed = next(reader)
    piece = parsed.slice_by_component(0, 40, 62)
    check_component(
        piece.components[0],
        src="apple", start=40, size=22, strand="+",
        src_size=110, text="TTCGTCACT------GTCGTAAGGGTTC",
    )
    check_component(
        piece.components[1],
        src="orange", start=28, size=22, strand="-",
        src_size=100, text="TT--TCACTGCTATCGTCGTA----TTC",
    )

    # Same parsed alignment: slice by the component on the - strand.
    piece = parsed.slice_by_component(1, 30, 68)
    check_component(
        piece.components[0],
        src="apple", start=46, size=41, strand="+",
        src_size=110,
        text="ACT------GTCGTAAGGGTTCAGA--CTGTCTATGTATACACAAGTTG",
    )
    check_component(
        piece.components[1],
        src="orange", start=32, size=38, strand="-",
        src_size=100,
        text="ACTGCTATCGTCGTA----TTCAGACTTCG-CTATCT------GAGTTG",
    )

    # Nothing may follow the first block.
    assert next(reader) is None
Beispiel #3
0
def do_interval(sources, index, out, ref_src, start, end, seq_db,
                missing_data):
    """Tile the blocks covering ``[start, end)`` of ``ref_src`` into one block.

    The blocks returned by ``index.get`` are laid down from lowest to
    highest score, so each reference position ends up assigned to the
    highest scoring block that covers it.  Stretches no block covers take
    their reference bases from the nib file ``seq_db[ref_src]``; the other
    rows are padded there.  The tiled alignment is handed to ``out.write``.

    Args:
        sources: Names of the rows to emit, in output order.  The part of
            ``sources[0]`` before the first ``.`` must equal that of
            ``ref_src``.
        index: Object whose ``get(ref_src, start, end)`` returns a sortable
            list of alignment blocks (presumably those overlapping the
            interval -- confirm against the index implementation).
        out: Writer; receives the finished alignment through ``write``.
        ref_src: Source name of the reference component.
        start: First reference position of the interval (inclusive).
        end: End of the interval (exclusive).
        seq_db: Mapping from source name to the path of its nib file.
        missing_data: If true, rows without aligned sequence are padded
            with ``*``; otherwise with ``-``.

    Raises:
        AssertionError: If ``sources[0]`` and ``ref_src`` name different
            species, or a reference component is not on the ``+`` strand.
    """
    first_species = sources[0].split('.')[0]
    ref_species = ref_src.split('.')[0]
    assert first_species == ref_species, \
        "%s != %s" % (first_species, ref_species)

    def read_reference(offset, length):
        # nib is a binary format, so the file is opened in binary mode, and
        # the ``with`` block closes the handle once the bases are fetched
        # (the handle used to be left open for every uncovered stretch).
        with open(seq_db[ref_src], "rb") as nib_handle:
            return bx.seq.nib.NibFile(nib_handle).get(offset, length)

    blocks = index.get(ref_src, start, end)
    # From low to high score.  ``key=`` replaces the former ``cmp``
    # comparator, which Python 3 rejects; both sorts are stable, so the
    # resulting order is the same.
    blocks.sort(key=lambda block: block.score)

    # mask[k] holds the position (within ``blocks``) of the block that
    # supplies reference base ``start + k``; -1 marks an uncovered base.
    # Later (higher scoring) blocks overwrite earlier ones.
    mask = [-1] * (end - start)
    ref_src_size = None
    for block_pos, block in enumerate(blocks):
        ref = block.get_component_by_src_start(ref_src)
        ref_src_size = ref.src_size
        assert ref.strand == "+"
        for pos in range(max(start, ref.start), min(end, ref.end)):
            mask[pos - start] = block_pos

    pad_char = "*" if missing_data else "-"

    # One list of text chunks per output row, joined at the very end.
    tiled = [[] for _ in sources]

    # ``ss``/``ee`` are offsets relative to ``start``.  The third value is
    # named ``block_pos`` so it no longer shadows the ``index`` parameter.
    for ss, ee, block_pos in intervals_from_mask(mask):
        if block_pos < 0:
            # Uncovered stretch: raw reference bases, padding elsewhere.
            tiled[0].append(read_reference(start + ss, ee - ss))
            for row in tiled[1:]:
                row.append(pad_char * (ee - ss))
        else:
            block = blocks[block_pos]
            ref = block.get_component_by_src_start(ref_src)
            sliced = block.slice_by_component(ref, start + ss, start + ee)
            sliced = sliced.limit_to_species(sources)
            sliced.remove_all_gap_columns()
            for row, src in zip(tiled, sources):
                comp = sliced.get_component_by_src_start(src)
                if comp:
                    row.append(comp.text)
                else:
                    row.append(pad_char * sliced.text_size)

    a = align.Alignment()
    for i, name in enumerate(sources):
        text = "".join(tiled[i])
        size = len(text) - text.count("-")
        if i == 0:
            if ref_src_size is None:
                # No block was seen at all: take the length from the nib.
                with open(seq_db[ref_src], "rb") as nib_handle:
                    ref_src_size = bx.seq.nib.NibFile(nib_handle).length
            c = align.Component(ref_src, start, end - start, "+",
                                ref_src_size, text)
        else:
            # Non-reference rows are stitched from several blocks, so they
            # get placeholder coordinates and a ".fake" source suffix.
            c = align.Component(name + ".fake", 0, size, "?", size, text)
        a.add_component(c)

    out.write(a)
Beispiel #4
0
def do_interval(sources, index, out, ref_src, start, end, seq_db,
                missing_data, strand):
    """
    Join together alignment blocks to create a semi human projected local
    alignment (small reference sequence deletions are kept as supported by
    the local alignment).

    Blocks from ``index.get(ref_src, start, end)`` are processed in the
    order returned; parts that overlap reference sequence already emitted
    are trimmed away.  Reference gaps between blocks are read from the nib
    file ``seq_db[ref_src]``.  For every other source, columns without an
    aligned component are filled with a character derived from the synteny
    status of the neighbouring components.  The result is written with
    ``out.write``, reverse-complemented first when ``strand`` is ``'-'``.

    Args:
        sources: Names of the rows to emit, in output order.  The part of
            ``sources[0]`` before the first ``.`` must equal that of
            ``ref_src``.
        index: Object whose ``get(ref_src, start, end)`` returns the blocks
            overlapping the interval.
        out: Writer; receives the finished alignment through ``write``.
        ref_src: Source name of the reference component.
        start: First reference position of the interval (inclusive).
        end: End of the interval (exclusive).
        seq_db: Mapping from source name to the path of its nib file.
        missing_data: Not read by this function.
        strand: ``'-'`` to reverse-complement the tiled alignment.

    Raises:
        AssertionError: If ``sources[0]`` and ``ref_src`` name different
            species, a reference component is not on the ``+`` strand, or
            a tiled row gets out of step with the reference row.
    """
    ref_src_size = None
    # Make sure the reference component is also the first in the source list
    assert sources[0].split('.')[0] == ref_src.split('.')[0], "%s != %s" \
        % (sources[0].split('.')[0], ref_src.split('.')[0])

    def read_reference(offset, length):
        # nib is a binary format, so the file is opened in binary mode, and
        # the ``with`` block closes the handle once the bases are fetched
        # (the handle used to be left open on every call).
        with open(seq_db[ref_src], "rb") as nib_handle:
            return bx.seq.nib.NibFile(nib_handle).get(offset, length)

    # Counter for the last reference species base we have processed
    last_stop = start
    # Rows in maf blocks come in arbitrary order, we'll convert things
    # to the desired order of the tiled block
    source_to_index = dict((name, i) for (i, name) in enumerate(sources))
    # This gets all the maf blocks overlapping our interval of interest
    # NOTE: Unlike maf_tile we're expecting things to be single coverage in
    # the reference species, so we won't sort by score and lay down.
    blocks = index.get(ref_src, start, end)
    # The last component seen for each species onto which we are tiling
    last_components = [None] * len(sources)
    # Right-hand synteny status of that last component
    last_status = [None] * len(sources)
    # Columns emitted for the reference but not yet filled for each species
    cols_needing_fill = [0] * len(sources)
    # The list of strings in which we build up the tiled alignment
    tiled_rows = ["" for _ in sources]
    # Walk the (ordered) list of blocks
    for block in blocks:
        # Check for overlap in reference species
        ref = block.get_component_by_src_start(ref_src)
        if ref.start < last_stop:
            if ref.end < last_stop:
                # Block lies entirely within sequence already emitted
                continue
            # Keep only the part beyond what has been emitted so far
            block = block.slice_by_component(
                ref, last_stop, min(end, ref.end))
            ref = block.get_component_by_src_start(ref_src)
        # Clip the block to the requested interval
        block = block.slice_by_component(
            ref, max(start, ref.start), min(end, ref.end))
        ref = block.get_component_by_src_start(ref_src)
        # NOTE(review): last_components[0] is never assigned below (only
        # the entries for sources[1:] are), so the first clause is always
        # true and this check cannot fire -- confirm whether that is
        # intended.
        assert last_components[0] is None \
            or ref.start >= last_components[0].end, \
            "MAF must be sorted and single coverage in reference species!"
        assert ref.strand == "+", \
            "MAF must have all reference species blocks on the plus strand"
        # Store the size of the reference sequence for building fake block
        if ref_src_size is None:
            ref_src_size = ref.src_size
        # Handle the reference component separately, it has no synteny
        # status but we will try to fill in missing sequence
        if ref.start > last_stop:
            # Need to fill in some reference sequence
            chunk_len = ref.start - last_stop
            tiled_rows[0] += read_reference(last_stop, chunk_len)
            for source in sources[1:]:
                cols_needing_fill[source_to_index[source]] += chunk_len
        # Do reference component
        chunk_len = len(ref.text)
        tiled_rows[0] += ref.text
        # Do each other component
        for source in sources[1:]:
            source_index = source_to_index[source]
            comp = block.get_component_by_src_start(source)
            if not comp:
                # No component, we'll have to fill this region when we
                # know the status
                cols_needing_fill[source_index] += chunk_len
                continue
            # synteny_left / synteny_right are (status, length) pairs or
            # None; only the status is used here.
            if comp.synteny_left is None:
                left_status = None
            else:
                left_status, _ = comp.synteny_left
            if comp.synteny_right is None:
                right_status = None
            else:
                right_status, _ = comp.synteny_right
            # We have a component, do we need to do some filling?
            cols_to_fill = cols_needing_fill[source_index]
            if cols_to_fill > 0:
                # Adjacent components should have matching status (left
                # status here vs. right status of the previous component);
                # that check is currently disabled.
                if left_status is None:
                    fill_char = guess_fill_char(
                        last_components[source_index], comp)
                else:
                    fill_char = get_fill_char(left_status)
                tiled_rows[source_index] += fill_char * cols_to_fill
                cols_needing_fill[source_index] = 0
            # Okay, filled up to current position, now append the text
            tiled_rows[source_index] += comp.text
            assert len(tiled_rows[source_index]) == len(tiled_rows[0]), \
                "length of tiled row should match reference row"
            last_components[source_index] = comp
            last_status[source_index] = right_status
        last_stop = ref.end
    # No more components, clean up the ends
    if last_stop < end:
        # Need to fill in some reference sequence
        chunk_len = end - last_stop
        tiled_rows[0] += read_reference(last_stop, chunk_len)
        for source in sources[1:]:
            cols_needing_fill[source_to_index[source]] += chunk_len
    # Any final filling that needs to be done?
    for source in sources[1:]:
        source_index = source_to_index[source]
        fill_needed = cols_needing_fill[source_index]
        if fill_needed > 0:
            if last_components[source_index] is None:
                # Never saw any component for this source
                fill_char = '@'
            elif last_status[source_index] is None:
                fill_char = '*'
            else:
                fill_char = get_fill_char(last_status[source_index])
            tiled_rows[source_index] += fill_char * fill_needed
        assert len(tiled_rows[source_index]) == len(tiled_rows[0]), \
            "length of tiled row should match reference row"
    # Okay, now make up the fake alignment from the tiled rows.
    tiled_rows = remove_all_gap_columns(tiled_rows)
    a = align.Alignment()
    for i, name in enumerate(sources):
        text = "".join(tiled_rows[i])
        size = len(text) - text.count("-")
        if i == 0:
            if ref_src_size is None:
                # No block was processed: take the length from the nib file
                with open(seq_db[ref_src], "rb") as nib_handle:
                    ref_src_size = bx.seq.nib.NibFile(nib_handle).length
            c = align.Component(ref_src, start, end - start, "+",
                                ref_src_size, text)
        else:
            # Non-reference rows are stitched from several blocks, so they
            # get placeholder coordinates and a ".fake" source suffix.
            c = align.Component(name + ".fake", 0, size, "?", size, text)
        a.add_component(c)
    if strand == '-':
        a = a.reverse_complement()
    out.write(a)