Beispiel #1
0
def main(args):
    """Print the protein of every transcript in a GFF3 annotation as FASTA.

    ``args`` is the list of command-line tokens handed to argparse.
    Required options are ``-g/--gff3`` and ``-f/--fasta``; ``--no_stop``
    strips trailing ``*`` characters from each protein before printing.
    Records are written to standard output, nothing is returned.
    """
    # Help formatter that shows defaults and uses a wider option column.
    class WideHelpFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super().__init__(prog, max_help_position=48)

    cli = argparse.ArgumentParser(
        prog='gff2prot.py',
        description='''Script to convert GFF3 and FASTA proteins.''',
        epilog="""Written by Jon Palmer (2018) [email protected]""",
        formatter_class=WideHelpFormatter,
    )
    cli.add_argument(
        '-g', '--gff3', required=True,
        help='Genome annotation GFF3 format',
    )
    cli.add_argument(
        '-f', '--fasta', required=True,
        help='Genome in FASTA format',
    )
    cli.add_argument(
        '--no_stop', action='store_true',
        help='Dont print stop codon',
    )
    opts = cli.parse_args(args)

    # lib.gff2dict is given an empty dict and its return value is used as
    # the gene mapping (gene name -> annotation record).
    gene_models = lib.gff2dict(opts.gff3, opts.fasta, {})

    # Emit one FASTA record per transcript, genes in natural sort order.
    for gene_name, record in natsorted(gene_models.items()):
        for idx, transcript_id in enumerate(record['ids']):
            # The protein list is indexed in parallel with the 'ids' list.
            sequence = record['protein'][idx]
            if args_no_stop(opts):
                sequence = sequence.rstrip('*')
            sys.stdout.write('>%s %s\n%s\n' % (transcript_id, gene_name, sequence))


def args_no_stop(opts):
    """Return True when the parsed options request stop-codon removal."""
    return bool(opts.no_stop)
Beispiel #2
0
def main(args):
    """Load a GFF3 annotation and hand the sorted gene models to dicts2tbl.

    ``args`` is the list of command-line tokens handed to argparse.
    Required options are ``-g/--gff3`` and ``-f/--fasta``. Nothing is
    returned; the output is whatever ``dicts2tbl`` produces.
    """
    # Help formatter that shows defaults and uses a wider option column.
    class WideHelpFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super().__init__(prog, max_help_position=48)

    cli = argparse.ArgumentParser(
        prog='gff2prot.py',
        description='''Script to convert GFF3 and FASTA to tbl, proteins, transcripts.''',
        epilog="""Written by Jon Palmer (2018) [email protected]""",
        formatter_class=WideHelpFormatter,
    )
    cli.add_argument('-g', '--gff3', required=True,
                     help='Genome annotation GFF3 format')
    cli.add_argument('-f', '--fasta', required=True,
                     help='Genome in FASTA format')
    opts = cli.parse_args(args)

    # lib.gff2dict is given an empty dict and its return value is used as
    # the gene mapping (gene name -> annotation record).
    gene_models = lib.gff2dict(opts.gff3, opts.fasta, {})

    # Order the genes by the first two entries of their 'location' value;
    # the contig is not part of the sort key.
    ordered_items = sorted(
        gene_models.items(),
        key=lambda item: (item[1]['location'][0], item[1]['location'][1]),
    )
    sortedGenes = OrderedDict(ordered_items)

    # Group gene names per contig, keeping the sorted order inside each group.
    scaff2genes = {}
    for gene_name, record in sortedGenes.items():
        scaff2genes.setdefault(record['contig'], []).append(gene_name)

    # get length of scaffolds
    scaffLen = scaffold2Dict(opts.fasta)

    # Write the table; 'CFMR' and '12345' are fixed positional arguments
    # whose meaning is defined by dicts2tbl.
    dicts2tbl(sortedGenes, scaff2genes, scaffLen, 'CFMR', '12345',
              annotations=True)
Beispiel #3
0
def main(args):
    """Sort the gene models of a GFF3 file and rename them sequentially.

    ``args`` is the list of command-line tokens handed to argparse.
    Required options are ``-g/--gff3``, ``-f/--fasta`` and ``-o/--out``;
    ``-l/--locus_tag`` (default ``FUN``) is the name prefix and
    ``-n/--numbering`` (default 1) the first number used. New gene names
    look like ``<locus_tag>_<6-digit number>`` and transcript ids like
    ``<gene name>-T<n>``. The renamed models are passed to
    ``lib.dict2gff3`` together with the output path.
    """
    # Help formatter that shows defaults and uses a wider option column.
    class WideHelpFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super().__init__(prog, max_help_position=48)

    cli = argparse.ArgumentParser(
        prog='gff_reformat.py',
        description='''Script to rename gene models GFF3 file.''',
        epilog="""Written by Jon Palmer (2020) [email protected]""",
        formatter_class=WideHelpFormatter,
    )
    cli.add_argument('-g', '--gff3', required=True,
                     help='Genome annotation GFF3 format')
    cli.add_argument('-f', '--fasta', required=True,
                     help='Genome in FASTA format')
    cli.add_argument('-l', '--locus_tag', default='FUN',
                     help='Basename of gene names')
    cli.add_argument('-n', '--numbering', default=1, type=int,
                     help='Start numbering at')
    cli.add_argument('-o', '--out', required=True, help='Output GFF3')
    opts = cli.parse_args(args)

    # lib.gff2dict is given an empty dict and its return value is used as
    # the gene mapping (gene name -> annotation record).
    gene_models = lib.gff2dict(opts.gff3, opts.fasta, {})
    print('Parsed {:,} gene models from {}'.format(len(gene_models), opts.gff3))

    # Natural-sort the genes by contig, then by the first 'location' entry.
    ordered = OrderedDict(
        natsorted(
            gene_models.items(),
            key=lambda item: (item[1]['contig'], item[1]['location'][0]),
        )
    )

    # A trailing underscore on the prefix is dropped so that names do not
    # end up with a doubled separator.
    prefix = opts.locus_tag.rstrip('_')
    renamedGenes = {}
    transcript_total = 0
    for number, (old_name, record) in enumerate(ordered.items(),
                                                start=opts.numbering):
        locusTag = prefix + '_' + str(number).zfill(6)
        renamedGenes[locusTag] = record
        # The previous gene name is kept as a synonym on the same record.
        record['gene_synonym'].append(old_name)
        fresh_ids = ['{}-T{}'.format(locusTag, n)
                     for n in range(1, len(record['ids']) + 1)]
        transcript_total += len(fresh_ids)
        record['ids'] = fresh_ids

    # write to gff3
    lib.dict2gff3(renamedGenes, opts.out)
    print('Sorted and renamed {:,} gene models {:,} transcripts: {}'.format(
        len(renamedGenes), transcript_total, opts.out))