Ejemplo n.º 1
0
def build_ivals(fp, genome_db, ests_db):
    """Generate aligned intervals based on the GMAP/GFF3 data in 'fp'.

    Every row parsed from 'fp' is walked through the (type, length)
    operations returned by gaps(row).  Each 'M' operation contributes one
    (src_ival, dst_ival) pair, sliced from the row's source and
    destination sequences at the current offsets; 'I' and 'D' operations
    only advance the destination or source offset, respectively.

    Yields (pathname, ivals) where pathname is the row's ID attribute and
    ivals is a list of interval pairs.

    Raises Exception, while the generator is being iterated, if an
    operation type other than 'M', 'I' or 'D' is encountered.
    """

    for row in gff_parser.read(fp):
        ivals = []
        src_seq = get_src_sequence(genome_db, row)
        dest_seq = get_dest_sequence(ests_db, row)

        # Offsets into src_seq / dest_seq; both restart at 0 for each row.
        src_i = 0
        dst_i = 0
        for typ, m in gaps(row):
            if typ == 'M':              # full match
                src_ival = src_seq[src_i:src_i + m]
                dst_ival = dest_seq[dst_i:dst_i + m]
                ivals.append((src_ival, dst_ival))

                src_i += m
                dst_i += m
            elif typ == 'I':            # insertion on dest seq
                dst_i += m
            elif typ == 'D':            # insertion on src seq
                src_i += m
            else:
                # Call form of raise: valid on both Python 2 and Python 3
                # (the old "raise Exc, msg" statement is a syntax error on 3).
                raise Exception("unknown char in Gap attr: %s%d" % (typ, m))

        yield row.attributes.ID, ivals
Ejemplo n.º 2
0
import sys
import gff_parser
import bincount

###

# Accumulators filled in a single pass over the GFF rows named on the
# command line (sys.argv[1]).
scores = bincount.BinCount()
paths_by_contig = {}    # row.seqid           -> set of row.attributes.ID
cdna_by_contig = {}     # row.seqid           -> set of row.attributes.Name
paths_by_cdna = {}      # row.attributes.Name -> set of row.attributes.ID

# The context manager closes the input handle as soon as parsing is done
# (or fails), instead of leaving it to the garbage collector.
with open(sys.argv[1]) as gff_fp:
    for row in gff_parser.read(gff_fp):
        scores.add(row.score)

        contig = row.seqid
        path = row.attributes.ID
        cdna = row.attributes.Name

        paths_by_contig.setdefault(contig, set()).add(path)
        cdna_by_contig.setdefault(contig, set()).add(cdna)
        paths_by_cdna.setdefault(cdna, set()).add(path)

###

# Bin the scores first so that a failure here does not create or truncate
# the output file, then write with a handle that is closed (and therefore
# flushed) deterministically.
binned_scores = scores.bin(1)
with open('scores.bin', 'w') as out_fp:
    binned_scores.write(out_fp, center=False)