Ejemplo n.º 1
0
def get_gff3_features(gff3_file, assemblies=None):
    '''
    Parses the passed GFF3 file and returns two dicts, loaded with biocode.biothings objects:

    1. The first dict are the Assembly objects, keyed on assembly ID.  Each Assembly has all of the
       children populated, so you can fully recover gene, RNA, exon and CDS features iterating on
       the assembly.
    2. The second dist is a flat structure of all the descendent feature objects of the Assemblies
       keyed by the feature IDs.  

    See the documentation for each feature type in biocode.biothings for more info
    '''

    if assemblies is None:
        assemblies = dict()

    features = dict()

    # these are related to parsing any embedded FASTA
    in_fasta_section = False
    is_assembly_fasta = False
    current_fasta_id = None
    lnum = 0
    FASTA_RE = re.compile(r'^\#\#FASTA\s*$')

    for line in open(gff3_file):
        lnum = lnum + 1

        if in_fasta_section == True:
            m = re.search('>(\S+)\s*(.*)', line)
            if m:
                current_fasta_id = m.group(1)

                if current_fasta_id in assemblies:
                    is_assembly_fasta = True
                else:
                    is_assembly_fasta = False

            else:
                if current_fasta_id is None:
                    if (len(str(line.rstrip())) > 0):
                        raise Exception("FASTA parse error - sequence appears without preceding fasta id at line " + str(lnum))
                if is_assembly_fasta == True:
                    # must be a sequence line for an assembly
                    # python 2.6+ makes string concatenation amortized O(n)
                    #  http://stackoverflow.com/a/4435752/1368079
                    assemblies[current_fasta_id].residues += str(line.rstrip())
                    assemblies[current_fasta_id].length = len( assemblies[current_fasta_id].residues )

            continue

        elif FASTA_RE.match(line):
            # all data to the end of the file must be FASTA
            in_fasta_section = True
            continue

        # ignore all other comments
        if line.startswith('#'):
            continue

        cols = line.split("\t")

        if len(cols) != 9:
            continue

        mol_id = cols[0]

        # initialize this assembly if we haven't seen it yet
        if mol_id not in assemblies:
            assemblies[mol_id] = biothings.Assembly( id=mol_id, residues='' )

        current_assembly = assemblies[mol_id]
        rfmin = int(cols[3]) - 1
        rfmax = int(cols[4])
        rstrand = None
        atts = column_9_dict(cols[8])
        feat_id = atts.get('ID')
        parent_id = atts.get('Parent')
        parent_feat = None

        # sanity check
        if rfmin > rfmax:
            raise Exception("ERROR: Coordinates in GFF for feature id {0} appear to be reversed and violate GFF3 specification: {1} > {2}".format(feat_id, cols[3], cols[4]))

        if 'locus_tag' in atts:
            locus_tag = atts['locus_tag']
        else:
            locus_tag = None

        # shared features are not yet supported
        if isinstance(parent_id, list):
            raise Exception("This line contains a shared feature with multiple parents.  This isn't yet supported:\n{0}".format(line))

        if parent_id is not None:
            if parent_id in features:
                parent_feat = features[parent_id]
            else:
                raise Exception("Error in GFF3: Parent {0} referenced by a child feature before it was defined".format(parent_id) )

        if cols[6] == '-':
            rstrand = -1
        elif cols[6] == '+':
            rstrand = 1
        else:
            rstrand = 0

        phase = cols[7]

        if cols[2] == 'gene':
            gene = biothings.Gene(id=feat_id, locus_tag=locus_tag)
            gene.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            features[feat_id] = gene
            current_assembly.add_gene(gene)

        elif cols[2] == 'mRNA':
            mRNA = biothings.mRNA(id=feat_id, parent=parent_feat, locus_tag=locus_tag)
            mRNA.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            parent_feat.add_mRNA(mRNA)
            features[feat_id] = mRNA

        elif cols[2] == 'rRNA':
            rRNA = biothings.rRNA(id=feat_id, parent=parent_feat, locus_tag=locus_tag)
            rRNA.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            parent_feat.add_rRNA(rRNA)
            features[feat_id] = rRNA

        elif cols[2] == 'tRNA':
            tRNA = biothings.tRNA(id=feat_id, parent=parent_feat, locus_tag=locus_tag)
            tRNA.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            parent_feat.add_tRNA(tRNA)
            features[feat_id] = tRNA

        elif cols[2] == 'exon':
            exon = biothings.Exon(id=feat_id, parent=parent_feat)
            exon.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            parent_feat.add_exon(exon)
            features[feat_id] = exon

        elif cols[2] == 'CDS':
            if phase == '.':
                phase = 0
            else:
                phase = int(phase)

            CDS = biothings.CDS(id=feat_id, parent=parent_feat, phase=phase)
            CDS.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand, phase=phase)
            parent_feat.add_CDS(CDS)
            features[feat_id] = CDS

        elif cols[2] == 'polypeptide':
            polypeptide = biothings.Polypeptide(id=feat_id, parent=parent_feat)
            polypeptide.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            parent_feat.add_polypeptide(polypeptide)
            polypeptide.annotation = parse_annotation_from_column_9(cols[8])
            features[feat_id] = polypeptide

        elif cols[2] == 'five_prime_UTR':
            utr = biothings.FivePrimeUTR(id=feat_id, parent=parent_feat)
            utr.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            parent_feat.add_five_prime_UTR(utr)
            features[feat_id] = utr

        elif cols[2] == 'three_prime_UTR':
            utr = biothings.ThreePrimeUTR(id=feat_id, parent=parent_feat)
            utr.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            parent_feat.add_three_prime_UTR(utr)
            features[feat_id] = utr

        else:
            sys.stderr.write( "Skipping feature {0} with type {1}\n".format(feat_id, cols[2]) )
            continue

        features[feat_id].length = rfmax - rfmin

    return (assemblies, features)
Ejemplo n.º 2
0
def get_gff3_features(gff3_file):
    '''
    Parses the passed GFF3 file and returns two dicts, loaded with biocode.biothings objects:

    1. The first dict are the Assembly objects, keyed on assembly ID.  Each Assembly has all of the
       children populated, so you can fully recover gene, RNA, exon and CDS features iterating on
       the assembly.
    2. The second dist is a flat structure of all the descendent feature objects of the Assemblies
       keyed by the feature IDs.

    See the documentation for each feature type in biocode.biothings for more info
    '''
    
    assemblies = dict()
    features   = dict()

    # these are related to parsing any embedded FASTA
    in_fasta_section = False
    is_assembly_fasta = False
    current_fasta_id = None
    
    
    for line in open(gff3_file):
        if in_fasta_section == True:
            m = re.search('>(\S+)\s*(.*)', line)
            if m:
                current_fasta_id = m.group(1)

                if current_fasta_id in assemblies:
                    is_assembly_fasta = True
                else:
                    is_assembly_fasta = False
                    
            else:
                if is_assembly_fasta == True:
                    # must be a sequence line for an assembly
                    # python 2.6+ makes string concatenation amortized O(n)
                    #  http://stackoverflow.com/a/4435752/1368079
                    assemblies[current_fasta_id].residues += str(line.rstrip())
                    assemblies[current_fasta_id].length = len( assemblies[current_fasta_id].residues )

            continue
        
        elif line.startswith("##FASTA"):
            # all data to the end of the file must be FASTA
            in_fasta_section = True
            continue

        
        cols = line.split("\t")

        if len(cols) != 9:
            continue

        mol_id = cols[0]
        
        # initialize this assembly if we haven't seen it yet
        if mol_id not in assemblies:
            assemblies[mol_id] = biothings.Assembly( id=mol_id, residues='' )

        current_assembly = assemblies[mol_id]
        rfmin = int(cols[3]) - 1
        rfmax = int(cols[4])
        rstrand = None
        feat_id = column_9_value(cols[8], 'ID')
        parent_id = column_9_value(cols[8], 'Parent')
        parent_feat = None

        # shared features is not yet supported
        if isinstance(parent_id, list):
            raise Exception("This line contains a shared feature with multiple parents.  This isn't yet supported:\n{0}".format(line))

        if parent_id is not None:
            if parent_id in features:
                parent_feat = features[parent_id]
            else:
                raise Exception("Error in GFF3: Parent {0} referenced by a child feature before it was defined".format(parent_id) )

        if cols[6] == '-':
            rstrand = -1
        elif cols[6] == '+':
            rstrand = 1
        else:
            rstrand = 0
            
        if cols[2] == 'gene':
            gene = biothings.Gene(id=feat_id)
            gene.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            features[feat_id] = gene
            current_assembly.add_gene(gene)
        
        elif cols[2] == 'mRNA':
            mRNA = biothings.mRNA(id=feat_id, parent=parent_feat)
            mRNA.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            parent_feat.add_mRNA(mRNA)
            features[feat_id] = mRNA

        elif cols[2] == 'rRNA':
            rRNA = biothings.rRNA(id=feat_id, parent=parent_feat)
            rRNA.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            parent_feat.add_rRNA(rRNA)
            features[feat_id] = rRNA
            
        elif cols[2] == 'tRNA':
            tRNA = biothings.tRNA(id=feat_id, parent=parent_feat)
            tRNA.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            parent_feat.add_tRNA(tRNA)
            features[feat_id] = tRNA

        elif cols[2] == 'exon':
            exon = biothings.Exon(id=feat_id, parent=parent_feat)
            exon.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            parent_feat.add_exon(exon)
            features[feat_id] = exon

        elif cols[2] == 'CDS':
            CDS = biothings.CDS(id=feat_id, parent=parent_feat)
            CDS.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            parent_feat.add_CDS(CDS)
            features[feat_id] = CDS

    return (assemblies, features)
Ejemplo n.º 3
0
def main():
    parser = argparse.ArgumentParser( description='Convert GenBank flat files to GFF3 format')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GBK file' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output GFF file to be created' )
    parser.add_argument('--with_fasta', dest='fasta', action='store_true', help='Include the FASTA section with genomic sequence at end of file.  (default)' )
    parser.add_argument('--no_fasta', dest='fasta', action='store_false' )
    parser.set_defaults(fasta=True)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    ofh.write("##gff-version 3\n")

    assemblies = dict()
    current_assembly = None
    current_gene = None
    current_RNA = None

    rna_count_by_gene = defaultdict(int)
    exon_count_by_RNA = defaultdict(int)

    seqs_pending_writes = False

    features_skipped_count = 0

    # each gb_record is a SeqRecord object
    for gb_record in SeqIO.parse(open(args.input_file, "r"), "genbank"):
        mol_id = gb_record.name

        if mol_id not in assemblies:
            assemblies[mol_id] = biothings.Assembly( id=mol_id )

        if len(str(gb_record.seq)) > 0:
            seqs_pending_writes = True
            assemblies[mol_id].residues = str(gb_record.seq)
            assemblies[mol_id].length = len(str(gb_record.seq))

        current_assembly = assemblies[mol_id]
            
        # each feat is a SeqFeature object
        for feat in gb_record.features:
            #print(feat)
            fmin = int(feat.location.start)
            fmax = int(feat.location.end)

            if feat.location.strand == 1:
                strand = '+'
            elif feat.location.strand == -1:
                strand = '-'
            else:
                raise Exception("ERROR: unstranded feature encountered: {0}".format(feat))

            #print("{0} located at {1}-{2} strand:{3}".format( locus_tag, fmin, fmax, strand ) )
            if feat.type == 'source':
                continue
            
            if feat.type == 'gene':
                # print the previous gene (if there is one)
                if current_gene is not None:
                    gene.print_as(fh=ofh, source='GenBank', format='gff3')
                
                locus_tag = feat.qualifiers['locus_tag'][0]
                gene = biothings.Gene( id=locus_tag )
                gene.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                current_gene = gene

            elif feat.type == 'mRNA':
                locus_tag = feat.qualifiers['locus_tag'][0]
                rna_count_by_gene[locus_tag] += 1
                feat_id = "{0}.mRNA.{1}".format( locus_tag, rna_count_by_gene[locus_tag] )
                
                mRNA = biothings.mRNA( id=feat_id, parent=current_gene )
                mRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                gene.add_mRNA(mRNA)
                current_RNA = mRNA

                if feat_id in exon_count_by_RNA:
                    raise Exception( "ERROR: two different RNAs found with same ID: {0}".format(feat_id) )
                else:
                    exon_count_by_RNA[feat_id] = 0

            elif feat.type == 'tRNA':
                locus_tag = feat.qualifiers['locus_tag'][0]
                rna_count_by_gene[locus_tag] += 1
                feat_id = "{0}.tRNA.{1}".format( locus_tag, rna_count_by_gene[locus_tag] )
                
                tRNA = biothings.tRNA( id=feat_id, parent=current_gene )
                tRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                gene.add_tRNA(tRNA)
                current_RNA = tRNA

                if feat_id in exon_count_by_RNA:
                    raise Exception( "ERROR: two different RNAs found with same ID: {0}".format(feat_id) )
                else:
                    exon_count_by_RNA[feat_id] = 0

            elif feat.type == 'rRNA':
                locus_tag = feat.qualifiers['locus_tag'][0]
                rna_count_by_gene[locus_tag] += 1
                feat_id = "{0}.rRNA.{1}".format( locus_tag, rna_count_by_gene[locus_tag] )
                
                rRNA = biothings.rRNA( id=feat_id, parent=current_gene )
                rRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                gene.add_rRNA(rRNA)
                current_RNA = rRNA

                if feat_id in exon_count_by_RNA:
                    raise Exception( "ERROR: two different RNAs found with same ID: {0}".format(feat_id) )
                else:
                    exon_count_by_RNA[feat_id] = 0
            
            elif feat.type == 'CDS':
                locus_tag = feat.qualifiers['locus_tag'][0]
                exon_count_by_RNA[current_RNA.id] += 1
                cds_id = "{0}.CDS.{1}".format( current_RNA.id, exon_count_by_RNA[current_RNA.id] )
                current_CDS_phase = 0
                
                for loc in feat.location.parts:
                    subfmin = int(loc.start)
                    subfmax = int(loc.end)
                    
                    CDS = biothings.CDS( id=cds_id, parent=current_RNA )
                    CDS.locate_on( target=current_assembly, fmin=subfmin, fmax=subfmax, strand=strand, phase=current_CDS_phase )
                    current_RNA.add_CDS(CDS)

                    # calculate the starting phase for the next CDS feature (in case there is one)
                    # 0 + 6 = 0     TTGCAT
                    # 0 + 7 = 2     TTGCATG
                    # 1 + 6 = 1     TTGCAT
                    # 2 + 7 = 1     TTGCATG
                    # general: 3 - ((length - previous phase) % 3)
                    current_CDS_phase = 3 - (((subfmax - subfmin) - current_CDS_phase) % 3)
                    if current_CDS_phase == 3:
                        current_CDS_phase = 0

                    exon_id = "{0}.exon.{1}".format( current_RNA.id, exon_count_by_RNA[current_RNA.id] )
                    exon = biothings.Exon( id=exon_id, parent=current_RNA )
                    exon.locate_on( target=current_assembly, fmin=subfmin, fmax=subfmax, strand=strand )
                    current_RNA.add_exon(exon)
                    exon_count_by_RNA[current_RNA.id] += 1
                
                product = feat.qualifiers['product'][0]

            else:
                print("WARNING: The following feature was skipped:\n{0}".format(feat))
                features_skipped_count += 1

        # don't forget to do the last gene, if there were any
        if current_gene is not None:
            gene.print_as(fh=ofh, source='GenBank', format='gff3')

    if args.fasta is True:
        if seqs_pending_writes is True:
            ofh.write("##FASTA\n")
            for assembly_id in assemblies:
                ofh.write(">{0}\n".format(assembly_id))
                ofh.write("{0}\n".format(biocodeutils.wrapped_fasta(assemblies[assembly_id].residues)))

    if features_skipped_count > 0:
        print("Warning: {0} unsupported feature types were skipped".format(features_skipped_count))
Ejemplo n.º 4
0
def get_gff3_features(gff3_file, assemblies=None):
    '''
    Parses the passed GFF3 file and returns two dicts, loaded with biocode.biothings objects:

    1. The first dict are the Assembly objects, keyed on assembly ID.  Each Assembly has all of the
       children populated, so you can fully recover gene, RNA, exon and CDS features iterating on
       the assembly.
    2. The second dist is a flat structure of all the descendent feature objects of the Assemblies
       keyed by the feature IDs.

    See the documentation for each feature type in biocode.biothings for more info
    '''

    if assemblies is None:
        assemblies = dict()

    features = dict()

    # these are related to parsing any embedded FASTA
    in_fasta_section = False
    is_assembly_fasta = False
    current_fasta_id = None

    for line in open(gff3_file):
        #print("INFO: processing line: {0}".format(line))

        if in_fasta_section == True:
            m = re.search('>(\S+)\s*(.*)', line)
            if m:
                current_fasta_id = m.group(1)

                if current_fasta_id in assemblies:
                    is_assembly_fasta = True
                else:
                    is_assembly_fasta = False

            else:
                if is_assembly_fasta == True:
                    # must be a sequence line for an assembly
                    # python 2.6+ makes string concatenation amortized O(n)
                    #  http://stackoverflow.com/a/4435752/1368079
                    assemblies[current_fasta_id].residues += str(line.rstrip())
                    assemblies[current_fasta_id].length = len(
                        assemblies[current_fasta_id].residues)

            continue

        elif line.startswith("##FASTA"):
            # all data to the end of the file must be FASTA
            in_fasta_section = True
            continue

        # ignore all other comments
        if line.startswith('#'):
            continue

        cols = line.split("\t")

        if len(cols) != 9:
            continue

        mol_id = cols[0]

        # initialize this assembly if we haven't seen it yet
        if mol_id not in assemblies:
            assemblies[mol_id] = biothings.Assembly(id=mol_id, residues='')

        current_assembly = assemblies[mol_id]
        rfmin = int(cols[3]) - 1
        rfmax = int(cols[4])
        rstrand = None
        atts = column_9_dict(cols[8])
        feat_id = atts.get('ID')
        parent_id = atts.get('Parent')
        parent_feat = None

        # sanity check
        if rfmin > rfmax:
            raise Exception(
                "ERROR: Coordinates in GFF for feature id {0} appear to be reversed and violate GFF3 specification: {1} > {2}"
                .format(feat_id, cols[3], cols[4]))

        if 'locus_tag' in atts:
            locus_tag = atts['locus_tag']
        else:
            locus_tag = None

        # shared features is not yet supported
        if isinstance(parent_id, list):
            raise Exception(
                "This line contains a shared feature with multiple parents.  This isn't yet supported:\n{0}"
                .format(line))

        if parent_id is not None:
            if parent_id in features:
                parent_feat = features[parent_id]
            else:
                raise Exception(
                    "Error in GFF3: Parent {0} referenced by a child feature before it was defined"
                    .format(parent_id))

        if cols[6] == '-':
            rstrand = -1
        elif cols[6] == '+':
            rstrand = 1
        else:
            rstrand = 0

        if cols[2] == 'gene':
            gene = biothings.Gene(id=feat_id, locus_tag=locus_tag)
            gene.locate_on(target=current_assembly,
                           fmin=rfmin,
                           fmax=rfmax,
                           strand=rstrand)
            features[feat_id] = gene
            current_assembly.add_gene(gene)

        elif cols[2] == 'mRNA':
            mRNA = biothings.mRNA(id=feat_id,
                                  parent=parent_feat,
                                  locus_tag=locus_tag)
            mRNA.locate_on(target=current_assembly,
                           fmin=rfmin,
                           fmax=rfmax,
                           strand=rstrand)
            parent_feat.add_mRNA(mRNA)
            features[feat_id] = mRNA

        elif cols[2] == 'rRNA':
            rRNA = biothings.rRNA(id=feat_id,
                                  parent=parent_feat,
                                  locus_tag=locus_tag)
            rRNA.locate_on(target=current_assembly,
                           fmin=rfmin,
                           fmax=rfmax,
                           strand=rstrand)
            parent_feat.add_rRNA(rRNA)
            features[feat_id] = rRNA

        elif cols[2] == 'tRNA':
            tRNA = biothings.tRNA(id=feat_id,
                                  parent=parent_feat,
                                  locus_tag=locus_tag)
            tRNA.locate_on(target=current_assembly,
                           fmin=rfmin,
                           fmax=rfmax,
                           strand=rstrand)
            parent_feat.add_tRNA(tRNA)
            features[feat_id] = tRNA

        elif cols[2] == 'exon':
            exon = biothings.Exon(id=feat_id, parent=parent_feat)
            exon.locate_on(target=current_assembly,
                           fmin=rfmin,
                           fmax=rfmax,
                           strand=rstrand)
            parent_feat.add_exon(exon)
            features[feat_id] = exon

        elif cols[2] == 'CDS':
            CDS = biothings.CDS(id=feat_id, parent=parent_feat)
            CDS.locate_on(target=current_assembly,
                          fmin=rfmin,
                          fmax=rfmax,
                          strand=rstrand)
            parent_feat.add_CDS(CDS)
            features[feat_id] = CDS

        elif cols[2] == 'polypeptide':
            polypeptide = biothings.Polypeptide(id=feat_id, parent=parent_feat)
            polypeptide.locate_on(target=current_assembly,
                                  fmin=rfmin,
                                  fmax=rfmax,
                                  strand=rstrand)
            parent_feat.add_polypeptide(polypeptide)
            polypeptide.annotation = parse_annotation_from_column_9(cols[8])
            features[feat_id] = polypeptide

        elif cols[2] == 'five_prime_UTR':
            utr = biothings.FivePrimeUTR(id=feat_id, parent=parent_feat)
            utr.locate_on(target=current_assembly,
                          fmin=rfmin,
                          fmax=rfmax,
                          strand=rstrand)
            parent_feat.add_five_prime_UTR(utr)
            features[feat_id] = utr

        elif cols[2] == 'three_prime_UTR':
            utr = biothings.ThreePrimeUTR(id=feat_id, parent=parent_feat)
            utr.locate_on(target=current_assembly,
                          fmin=rfmin,
                          fmax=rfmax,
                          strand=rstrand)
            parent_feat.add_three_prime_UTR(utr)
            features[feat_id] = utr

        else:
            sys.stderr.write("Skipping feature {0} with type {1}\n".format(
                feat_id, cols[2]))
            continue

        features[feat_id].length = rfmax - rfmin

    return (assemblies, features)
Ejemplo n.º 5
0
def main():
    parser = argparse.ArgumentParser( description='Convert GenBank flat files to GFF3 format')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GBK file' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output GFF file to be created' )
    parser.add_argument('--with_fasta', dest='fasta', action='store_true', help='Include the FASTA section with genomic sequence at end of file.  (default)' )
    parser.add_argument('--no_fasta', dest='fasta', action='store_false' )
    parser.set_defaults(fasta=True)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    ofh.write("##gff-version 3\n")

    assemblies = dict()
    current_assembly = None
    current_gene = None
    current_RNA = None

    rna_count_by_gene = defaultdict(int)
    exon_count_by_RNA = defaultdict(int)

    seqs_pending_writes = False

    features_skipped_count = 0

    # each gb_record is a SeqRecord object
    for gb_record in SeqIO.parse(open(args.input_file, "r"), "genbank"):
        mol_id = gb_record.name

        if mol_id not in assemblies:
            assemblies[mol_id] = biothings.Assembly( id=mol_id )

        if len(str(gb_record.seq)) > 0:
            seqs_pending_writes = True
            assemblies[mol_id].residues = str(gb_record.seq)
            assemblies[mol_id].length = len(str(gb_record.seq))

        current_assembly = assemblies[mol_id]
            
        # each feat is a SeqFeature object
        for feat in gb_record.features:
            #print(feat)
            fmin = int(feat.location.start)
            fmax = int(feat.location.end)

            if feat.location.strand == 1:
                strand = '+'
            elif feat.location.strand == -1:
                strand = '-'
            else:
                raise Exception("ERROR: unstranded feature encountered: {0}".format(feat))

            #print("{0} located at {1}-{2} strand:{3}".format( locus_tag, fmin, fmax, strand ) )
            if feat.type == 'source':
                continue
            
            if feat.type == 'gene':
                # print the previous gene (if there is one)
                if current_gene is not None:
                    gene.print_as(fh=ofh, source='GenBank', format='gff3')
                
                locus_tag = feat.qualifiers['locus_tag'][0]
                gene = biothings.Gene( id=locus_tag )
                gene.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                current_gene = gene
                current_RNA = None

            elif feat.type == 'mRNA':
                locus_tag = feat.qualifiers['locus_tag'][0]
                rna_count_by_gene[locus_tag] += 1
                feat_id = "{0}.mRNA.{1}".format( locus_tag, rna_count_by_gene[locus_tag] )
                
                mRNA = biothings.mRNA( id=feat_id, parent=current_gene )
                mRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                gene.add_mRNA(mRNA)
                current_RNA = mRNA

                if feat_id in exon_count_by_RNA:
                    raise Exception( "ERROR: two different RNAs found with same ID: {0}".format(feat_id) )
                else:
                    exon_count_by_RNA[feat_id] = 0

            elif feat.type == 'tRNA':
                locus_tag = feat.qualifiers['locus_tag'][0]
                rna_count_by_gene[locus_tag] += 1
                feat_id = "{0}.tRNA.{1}".format( locus_tag, rna_count_by_gene[locus_tag] )
                
                tRNA = biothings.tRNA( id=feat_id, parent=current_gene )
                tRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                gene.add_tRNA(tRNA)
                current_RNA = tRNA

                if feat_id in exon_count_by_RNA:
                    raise Exception( "ERROR: two different RNAs found with same ID: {0}".format(feat_id) )
                else:
                    exon_count_by_RNA[feat_id] = 0

            elif feat.type == 'rRNA':
                locus_tag = feat.qualifiers['locus_tag'][0]
                rna_count_by_gene[locus_tag] += 1
                feat_id = "{0}.rRNA.{1}".format( locus_tag, rna_count_by_gene[locus_tag] )
                
                rRNA = biothings.rRNA( id=feat_id, parent=current_gene )
                rRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                gene.add_rRNA(rRNA)
                current_RNA = rRNA

                if feat_id in exon_count_by_RNA:
                    raise Exception( "ERROR: two different RNAs found with same ID: {0}".format(feat_id) )
                else:
                    exon_count_by_RNA[feat_id] = 0
            
            elif feat.type == 'CDS':
                locus_tag = feat.qualifiers['locus_tag'][0]
                # If processing a prokaryotic GBK, we'll encounter CDS before mRNA, so we have to
                #  manually make one
                if current_RNA is None:
                    feat_id = "{0}.mRNA.{1}".format( locus_tag, rna_count_by_gene[locus_tag] )
                    mRNA = biothings.mRNA( id=feat_id, parent=current_gene )
                    mRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                    gene.add_mRNA(mRNA)
                    current_RNA = mRNA
                
                exon_count_by_RNA[current_RNA.id] += 1
                cds_id = "{0}.CDS.{1}".format( current_RNA.id, exon_count_by_RNA[current_RNA.id] )
                current_CDS_phase = 0
                
                for loc in feat.location.parts:
                    subfmin = int(loc.start)
                    subfmax = int(loc.end)
                    
                    CDS = biothings.CDS( id=cds_id, parent=current_RNA )
                    CDS.locate_on( target=current_assembly, fmin=subfmin, fmax=subfmax, strand=strand, phase=current_CDS_phase )
                    current_RNA.add_CDS(CDS)

                    # calculate the starting phase for the next CDS feature (in case there is one)
                    # 0 + 6 = 0     TTGCAT
                    # 0 + 7 = 2     TTGCATG
                    # 1 + 6 = 1     TTGCAT
                    # 2 + 7 = 1     TTGCATG
                    # general: 3 - ((length - previous phase) % 3)
                    current_CDS_phase = 3 - (((subfmax - subfmin) - current_CDS_phase) % 3)
                    if current_CDS_phase == 3:
                        current_CDS_phase = 0

                    exon_id = "{0}.exon.{1}".format( current_RNA.id, exon_count_by_RNA[current_RNA.id] )
                    exon = biothings.Exon( id=exon_id, parent=current_RNA )
                    exon.locate_on( target=current_assembly, fmin=subfmin, fmax=subfmax, strand=strand )
                    current_RNA.add_exon(exon)
                    exon_count_by_RNA[current_RNA.id] += 1
                
            else:
                print("WARNING: The following feature was skipped:\n{0}".format(feat))
                features_skipped_count += 1

        # don't forget to do the last gene, if there were any
        if current_gene is not None:
            gene.print_as(fh=ofh, source='GenBank', format='gff3')

    if args.fasta is True:
        if seqs_pending_writes is True:
            ofh.write("##FASTA\n")
            for assembly_id in assemblies:
                ofh.write(">{0}\n".format(assembly_id))
                ofh.write("{0}\n".format(biocodeutils.wrapped_fasta(assemblies[assembly_id].residues)))

    if features_skipped_count > 0:
        print("Warning: {0} unsupported feature types were skipped".format(features_skipped_count))
            gene.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            print("DEBUG: locating gene {0} on {1} at coordinates fmin:{2}-fmax:{3} strand:{4}".format(feat_id, mol_id, rfmin, rfmax, rstrand) )
            features[feat_id] = gene
            current_assembly.add_gene(gene)

            last_gene = gene
        
        elif cols[2] == 'mRNA':
            mRNA = biothings.mRNA(id=feat_id, parent=parent_feat)
            mRNA.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            print("DEBUG: attaching mRNA:{0} to parent gene:{1}".format(feat_id, parent_feat.id) )
            parent_feat.add_mRNA(mRNA)
            features[feat_id] = mRNA

        elif cols[2] == 'rRNA':
            rRNA = biothings.rRNA(id=feat_id, parent=parent_feat)
            rRNA.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            parent_feat.add_rRNA(rRNA)
            features[feat_id] = rRNA
=======
        ## then mark any that align to it (except self)
        for sbj_gene in things:
            if qry_gene.id == sbj_gene.id:
                continue
            elif qry_gene.overlaps_with(sbj_gene):
                handled_ids[sbj_gene.id] = 1
>>>>>>> .r101
            
    return nonoverlapping_set