Exemple #1
0
def initialize_polypeptides(log_fh, fasta_file):
    '''
    Builds a dict of biothings.Polypeptide objects, keyed by sequence ID, from the
    entries of a FASTA file.

    The file is loaded with biocodeutils.fasta_dict_from_file(); the 's' value of each
    entry is used as the residues of the Polypeptide.  Every Polypeptide gets a fresh
    bioannotation.FunctionalAnnotation whose product name is DEFAULT_PRODUCT_NAME, and
    one INFO line per sequence is written to log_fh recording that assignment.
    '''
    fasta_entries = biocodeutils.fasta_dict_from_file(fasta_file)
    by_id = {}

    for seq_id in fasta_entries:
        residues = fasta_entries[seq_id]['s']

        protein = biothings.Polypeptide(id=seq_id, length=len(residues), residues=residues)
        default_annotation = bioannotation.FunctionalAnnotation(product_name=DEFAULT_PRODUCT_NAME)

        log_fh.write("INFO: {0}: Set initial product name to '{1}'\n".format(seq_id, DEFAULT_PRODUCT_NAME))

        protein.annotation = default_annotation
        by_id[seq_id] = protein

    return by_id
Exemple #2
0
def get_gff3_features(gff3_file, assemblies=None):
    '''
    Parses the passed GFF3 file and returns two dicts, loaded with biocode.biothings objects:

    1. The first dict holds the Assembly objects, keyed on assembly ID.  Each Assembly has all
       of the children populated, so you can fully recover gene, RNA, exon and CDS features
       iterating on the assembly.
    2. The second dict is a flat structure of all the descendant feature objects of the
       Assemblies, keyed by the feature IDs.

    Supported feature types are gene, mRNA, rRNA, tRNA, exon, CDS, polypeptide,
    five_prime_UTR and three_prime_UTR.  Rows of any other type are reported on STDERR and
    skipped, as are rows which don't have exactly 9 tab-delimited columns.

    If the file has an embedded ##FASTA section, the sequence of any entry whose ID matches a
    known assembly is appended to that assembly's residues (and its length updated).

    Args:
        gff3_file: Path to the GFF3 file to read.
        assemblies: Optional dict of existing Assembly objects keyed by ID.  It is updated in
            place; a new dict is created when omitted.

    Returns:
        A tuple (assemblies, features).

    Raises:
        Exception: if a feature's coordinates are reversed, if a feature has multiple parents
            (not yet supported), if a Parent is referenced before it is defined, or if a
            non-gene feature of a supported type has no Parent at all.

    See the documentation for each feature type in biocode.biothings for more info
    '''

    if assemblies is None:
        assemblies = dict()

    features = dict()

    # Supported child feature types, as:
    #   GFF3 type -> (biothings class name, parent's add-method name, pass locus_tag?)
    # Names are resolved lazily with getattr() so a class is only looked up when a row of
    # that type is actually encountered.
    child_types = {
        'mRNA': ('mRNA', 'add_mRNA', True),
        'rRNA': ('rRNA', 'add_rRNA', True),
        'tRNA': ('tRNA', 'add_tRNA', True),
        'exon': ('Exon', 'add_exon', False),
        'CDS': ('CDS', 'add_CDS', False),
        'polypeptide': ('Polypeptide', 'add_polypeptide', False),
        'five_prime_UTR': ('FivePrimeUTR', 'add_five_prime_UTR', False),
        'three_prime_UTR': ('ThreePrimeUTR', 'add_three_prime_UTR', False),
    }

    # column 7 symbol -> numeric strand; anything else is treated as unstranded (0)
    strand_values = {'-': -1, '+': 1}

    fasta_header_re = re.compile(r'>(\S+)\s*(.*)')

    # these are related to parsing any embedded FASTA
    in_fasta_section = False
    is_assembly_fasta = False
    current_fasta_id = None

    with open(gff3_file) as gff3_fh:
        for line in gff3_fh:
            if in_fasta_section:
                m = fasta_header_re.search(line)
                if m:
                    current_fasta_id = m.group(1)
                    is_assembly_fasta = current_fasta_id in assemblies
                elif is_assembly_fasta:
                    # must be a sequence line for an assembly
                    # python 2.6+ makes string concatenation amortized O(n)
                    #  http://stackoverflow.com/a/4435752/1368079
                    assembly = assemblies[current_fasta_id]
                    assembly.residues += line.rstrip()
                    assembly.length = len(assembly.residues)

                continue

            if line.startswith("##FASTA"):
                # all data to the end of the file must be FASTA
                in_fasta_section = True
                continue

            # ignore all other comments
            if line.startswith('#'):
                continue

            cols = line.split("\t")

            if len(cols) != 9:
                continue

            mol_id = cols[0]
            feat_type = cols[2]

            # initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = biothings.Assembly(id=mol_id, residues='')

            current_assembly = assemblies[mol_id]

            # GFF3 coordinates are 1-based inclusive; stored here as 0-based start, exclusive end
            rfmin = int(cols[3]) - 1
            rfmax = int(cols[4])
            atts = column_9_dict(cols[8])
            feat_id = atts.get('ID')
            parent_id = atts.get('Parent')
            locus_tag = atts.get('locus_tag')
            parent_feat = None

            # sanity check
            if rfmin > rfmax:
                raise Exception(
                    "ERROR: Coordinates in GFF for feature id {0} appear to be reversed and violate GFF3 specification: {1} > {2}"
                    .format(feat_id, cols[3], cols[4]))

            # shared features is not yet supported
            if isinstance(parent_id, list):
                raise Exception(
                    "This line contains a shared feature with multiple parents.  This isn't yet supported:\n{0}"
                    .format(line))

            if parent_id is not None:
                if parent_id in features:
                    parent_feat = features[parent_id]
                else:
                    raise Exception(
                        "Error in GFF3: Parent {0} referenced by a child feature before it was defined"
                        .format(parent_id))

            rstrand = strand_values.get(cols[6], 0)

            if feat_type == 'gene':
                feature = biothings.Gene(id=feat_id, locus_tag=locus_tag)
                feature.locate_on(target=current_assembly,
                                  fmin=rfmin,
                                  fmax=rfmax,
                                  strand=rstrand)
                features[feat_id] = feature
                current_assembly.add_gene(feature)

            elif feat_type in child_types:
                # every supported non-gene feature is attached to its parent, so a missing
                # Parent can't be handled; fail with a clear message rather than on None
                if parent_feat is None:
                    raise Exception(
                        "Error in GFF3: {0} feature {1} has no Parent attribute"
                        .format(feat_type, feat_id))

                class_name, add_method_name, pass_locus_tag = child_types[feat_type]
                init_args = {'id': feat_id, 'parent': parent_feat}

                if pass_locus_tag:
                    init_args['locus_tag'] = locus_tag

                feature = getattr(biothings, class_name)(**init_args)
                feature.locate_on(target=current_assembly,
                                  fmin=rfmin,
                                  fmax=rfmax,
                                  strand=rstrand)
                getattr(parent_feat, add_method_name)(feature)

                # polypeptides are the only features which carry functional annotation
                if feat_type == 'polypeptide':
                    feature.annotation = parse_annotation_from_column_9(cols[8])

                features[feat_id] = feature

            else:
                sys.stderr.write("Skipping feature {0} with type {1}\n".format(
                    feat_id, feat_type))
                continue

            feature.length = rfmax - rfmin

    return (assemblies, features)
def main():
    '''
    Converts a GFF file created by Metagenemark into GFF3.

    Each 'gene' or 'CDS' input row (depending on Metagenemark version) is expanded into a
    gene with one mRNA, CDS, exon and polypeptide, all sharing the row's coordinates and
    strand, and the gene graph is printed to the output file with source 'GeneMark.hmm'.
    Feature IDs are built as <prefix>.<type>.<N>, where N is the gene_id number from the
    row's 9th column.

    Comment lines from the input are carried over to the output (the old ##gff-version
    header is dropped, and ##FASTA is written as ###FASTA).  Buffered comments are written
    after the next gene is printed, and any left over after the last gene are written at
    the end of the file.

    If --protein_fasta is given, the sequence lines between each '##Protein N' and
    '##end-Protein' comment are also written to that file as FASTA, using the polypeptide
    ID generated for gene N as the header.

    Raises:
        Exception: if a feature row isn't of type 'gene' or 'CDS', if its 9th column has no
            gene_id, or if a '##Protein ' line isn't followed by a number.
    '''
    # function-scope import so this entry point doesn't depend on the file's import section
    import contextlib

    parser = argparse.ArgumentParser(
        description='Metagenemark GFF -> GFF3 conversion script')

    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to a GFF file created by Metagenemark')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')
    parser.add_argument('-p',
                        '--prefix',
                        type=str,
                        required=True,
                        help='Prefix to use in ID generation')
    parser.add_argument('-pf',
                        '--protein_fasta',
                        type=str,
                        required=False,
                        help='Optional protein FASTA to be written')
    args = parser.parse_args()

    assemblies = dict()

    # key like 2 = SRS014890.polypeptide.2
    polypeptide_lookup = dict()
    writing_protein = False

    # comment lines seen since the last gene was printed
    current_gene_comment_lines = list()

    # the ExitStack closes every file opened below, even if an exception is raised
    with contextlib.ExitStack() as stack:
        fout = stack.enter_context(
            open(args.output, mode='wt', encoding='utf-8'))
        fout.write("##gff-version 3\n")

        # stays None when no protein FASTA was requested, so protein lines are only
        #  carried over as comments
        protein_out = None

        if args.protein_fasta is not None:
            protein_out = stack.enter_context(
                open(args.protein_fasta, mode='wt', encoding='utf-8'))

        gff_in = stack.enter_context(open(args.input))

        for line in gff_in:
            if line.startswith("#"):
                if line.startswith("##FASTA"):
                    # an extra '#' is prepended, so this is written as ###FASTA
                    current_gene_comment_lines.append("#{0}".format(line))

                elif line.startswith("##end-Protein"):
                    writing_protein = False
                    current_gene_comment_lines.append(line)

                # since we're already doing our own header, don't duplicate the old one
                elif line.startswith("##gff-version"):
                    continue
                else:
                    if line.startswith("##Protein "):
                        m = re.match(r"##Protein (\d+)", line)

                        if not m:
                            raise Exception(
                                "ERROR: Expected line to match: ##Protein N")

                        writing_protein = True

                        if protein_out is not None:
                            protein_out.write(">{0}\n".format(
                                polypeptide_lookup[m.group(1)]))

                    elif writing_protein and protein_out is not None:
                        # strip the leading '##' from the commented sequence line
                        protein_out.write(line[2:])

                    current_gene_comment_lines.append(line)

                continue

            cols = line.split("\t")

            if len(cols) != 9:
                continue

            # only keep the molecule ID up to the first space, if there is one
            mol_id = cols[0]
            mol_id_m = re.match(r'^(\S+) ', mol_id)

            if mol_id_m:
                mol_id = mol_id_m.group(1)

            feat_type = cols[2]

            ## we expect only gene types here
            if feat_type not in ['gene', 'CDS']:
                raise Exception(
                    "ERROR: expected only 'gene' or 'CDS' feature types as input (depending on metagenemark version)."
                )

            m_gene = re.match(r'gene_id[ =](\d+)', cols[8])

            if m_gene:
                gene_num = m_gene.group(1)
            else:
                raise Exception(
                    "ERROR: expected 9th column to have gene ids like: gene_id 5"
                )

            ## initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = biothings.Assembly(id=mol_id)

            current_assembly = assemblies[mol_id]

            # all generated features share the coordinates of the input row
            #  (1-based inclusive in the file -> 0-based start, exclusive end)
            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]

            gene = biothings.Gene(
                id="{0}.gene.{1}".format(args.prefix, gene_num))
            gene.locate_on(target=current_assembly,
                           fmin=fmin,
                           fmax=fmax,
                           strand=strand)

            mRNA = biothings.mRNA(
                id="{0}.mRNA.{1}".format(args.prefix, gene_num),
                parent=gene.id)
            mRNA.locate_on(target=current_assembly,
                           fmin=fmin,
                           fmax=fmax,
                           strand=strand)
            gene.add_mRNA(mRNA)

            CDS = biothings.CDS(
                id="{0}.CDS.{1}".format(args.prefix, gene_num),
                parent=mRNA.id)
            CDS.locate_on(target=current_assembly,
                          fmin=fmin,
                          fmax=fmax,
                          strand=strand,
                          phase=int(cols[7]))
            mRNA.add_CDS(CDS)

            exon = biothings.Exon(
                id="{0}.exon.{1}".format(args.prefix, gene_num),
                parent=mRNA.id)
            exon.locate_on(target=current_assembly,
                           fmin=fmin,
                           fmax=fmax,
                           strand=strand)
            mRNA.add_exon(exon)

            polypeptide_id = "{0}.polypeptide.{1}".format(
                args.prefix, gene_num)
            polypeptide = biothings.Polypeptide(id=polypeptide_id,
                                                parent=mRNA.id)
            polypeptide.locate_on(target=current_assembly,
                                  fmin=fmin,
                                  fmax=fmax,
                                  strand=strand)
            mRNA.add_polypeptide(polypeptide)
            polypeptide_lookup[gene_num] = polypeptide_id

            gene.print_as(fh=fout, source='GeneMark.hmm', format='gff3')
            fout.write("".join(current_gene_comment_lines))
            current_gene_comment_lines = list()

        # comments following the last gene would otherwise be dropped
        fout.write("".join(current_gene_comment_lines))
def main():
    '''
    Writes a corrected copy of 'canonical.flawed.gff3' to 'canonical.corrected.gff3', in
    which the CDS features of each mRNA are adjusted against the polypeptide coordinates
    found in the ILRI GFF file.

    The gene graph is loaded with biocodegff.get_gff3_features() and the polypeptide rows
    of the ILRI file are indexed by their Parent attribute, which is matched against mRNA
    IDs.  For every mRNA with a polypeptide, each CDS is compared with that polypeptide
    using the biothings comparison operators and is either deleted from the mRNA or has
    its fmin/fmax set to the polypeptide's.  (The exact meaning of <, <=, > and >= is
    defined by biothings and not visible here - presumably 'entirely before', 'overlaps
    the start', etc.  TODO confirm.)

    mRNAs with no polypeptide are reported on STDOUT and left untouched.  Every gene is
    printed to the output file as GFF3 with source 'GenBank'.
    '''
    flawed_gff_file = 'canonical.flawed.gff3'
    ilri_gff = 'Theileria-all-Theileria1_ourids.gff'
    source = 'GenBank'
    out_gff = 'canonical.corrected.gff3'

    with open(out_gff, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        (assemblies, features) = biocodegff.get_gff3_features(flawed_gff_file)

        print("INFO: loaded {0} assemblies and {1} features".format(
            len(assemblies), len(features)))

        # polypeptides from the ILRI file, keyed by the ID of their parent
        polypeptides = dict()

        with open(ilri_gff) as ilri_fh:
            for line in ilri_fh:
                cols = line.split("\t")

                if len(cols) != 9 or cols[2] != 'polypeptide':
                    continue

                polypeptide_id = biocodegff.column_9_value(cols[8], 'ID')
                parent = biocodegff.column_9_value(cols[8], 'Parent')
                polypeptides[parent] = biothings.Polypeptide(
                    id=polypeptide_id, parent=parent)
                polypeptides[parent].locate_on(target=assemblies[cols[0]],
                                               fmin=int(cols[3]) - 1,
                                               fmax=int(cols[4]),
                                               strand=cols[6])

        print("DEBUG: loaded {0} polypeptides from ILRI file".format(
            len(polypeptides)))

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    if mRNA.id not in polypeptides:
                        print(
                            "DEBUG: {0} not found as a parent to any polypeptide".
                            format(mRNA.id))
                        # nothing to compare against; without this the CDSs would be
                        #  adjusted using the polypeptide of a previous mRNA
                        continue

                    polypeptide = polypeptides[mRNA.id]

                    # iterate over a copy since the loop may delete CDSs from the mRNA
                    for CDS in list(mRNA.CDSs()):
                        if CDS < polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS <= polypeptide:
                            CDS.location().fmin = polypeptide.location().fmin

                        if CDS > polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS >= polypeptide:
                            CDS.location().fmax = polypeptide.location().fmax

                gene.print_as(fh=fout, source=source, format='gff3')