def build_ivals(fp, genome_db, ests_db):
    """Generate aligned intervals based on the GMAP/GFF3 data in 'fp'.

    Each row read from 'fp' by gff_parser.read() is looked up in both
    databases (get_src_sequence() on 'genome_db', get_dest_sequence() on
    'ests_db'), and the (type, length) pairs returned by gaps(row) are
    walked to slice out the matching regions of the two sequences.

    Yields (row.attributes.ID, ivals) once per row, where ivals is a list
    of (src_ival, dst_ival) pairs, one pair per 'M' operation.

    Raises Exception if gaps(row) produces a type other than 'M', 'I'
    or 'D'.
    """
    for row in gff_parser.read(fp):
        ivals = []

        src_seq = get_src_sequence(genome_db, row)
        dest_seq = get_dest_sequence(ests_db, row)

        # Running offsets into the source and destination sequences.
        src_i = 0
        dst_i = 0

        for typ, m in gaps(row):
            if typ == 'M':              # full match: consumes both sequences
                src_ival = src_seq[src_i:src_i + m]
                dst_ival = dest_seq[dst_i:dst_i + m]
                ivals.append((src_ival, dst_ival))

                src_i += m
                dst_i += m
            elif typ == 'I':            # insertion on dest seq: skip dest only
                dst_i += m
            elif typ == 'D':            # insertion on src seq: skip src only
                src_i += m
            else:
                # Call form of raise: valid on both Python 2 and Python 3,
                # unlike the old "raise Exception, msg" statement form.
                raise Exception(
                    "unknown char in Gap attr: %s%d" % (typ, m))

        yield row.attributes.ID, ivals
import sys

import gff_parser
import bincount

# Tally every row's score and index the rows three ways:
#   paths_by_contig: contig (row.seqid)           -> set of path IDs
#   cdna_by_contig:  contig (row.seqid)           -> set of cDNA names
#   paths_by_cdna:   cDNA (row.attributes.Name)   -> set of path IDs

scores = bincount.BinCount()
paths_by_contig = {}
cdna_by_contig = {}
paths_by_cdna = {}

# Open the input inside a 'with' block so the handle is closed as soon as
# parsing finishes (or fails) instead of being left to the garbage collector.
with open(sys.argv[1]) as gff_fp:
    for row in gff_parser.read(gff_fp):
        scores.add(row.score)

        contig = row.seqid
        path = row.attributes.ID
        cdna = row.attributes.Name

        paths_by_contig.setdefault(contig, set()).add(path)
        cdna_by_contig.setdefault(contig, set()).add(cdna)
        paths_by_cdna.setdefault(cdna, set()).add(path)

# Write the binned scores out.

# Bin before opening the output, matching the original evaluation order, so a
# failure in bin() does not leave behind an empty 'scores.bin'.
binned_scores = scores.bin(1)
with open('scores.bin', 'w') as out_fp:
    binned_scores.write(out_fp, center=False)