Example #1
def meropsDB(info, force=False):
    fasta = os.path.join(FUNDB, 'merops_scan.lib')
    filtered = os.path.join(FUNDB, 'merops.formatted.fa')
    database = os.path.join(FUNDB, 'merops.dmnd')
    if os.path.isfile(fasta) and args.update and not force:
        if check4newDB('merops', info):
            force=True
    if not os.path.isfile(fasta) or force:
        lib.log.info('Downloading Merops database')
        download(lib.DBURL.get('merops'), fasta)
        md5 = calcmd5(fasta)
        #reformat fasta headers
        with open(filtered, 'w') as filtout:
            with open(fasta, 'rU') as infile:
                for line in infile:
                    if line.startswith('>'):
                        line = line.rstrip()
                        ID = line.split()[0]
                        family = line.split('#')[1]
                        filtout.write('{:} {:}\n'.format(ID, family))
                    else:
                        filtout.write(line)
        lib.log.info('Building diamond database')
        cmd = ['diamond', 'makedb', '--in', 'merops.formatted.fa', '--db', 'merops']
        lib.runSubprocess(cmd, FUNDB, lib.log)
        num_records = lib.countfasta(filtered)
        info['merops'] = ('diamond', database, '12.0', '2017-10-04', num_records, md5)
    type, name, version, date, records, checksum = info.get('merops')
    lib.log.info('MEROPS Database: version={:} date={:} records={:,}'.format(version, date, records))
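The header rewrite above collapses each MEROPS scan-library header to '>ID family'. A minimal standalone sketch of that transformation, using a hypothetical header laid out the way the split on '#' assumes (family code between the first pair of '#' marks):

def reformat_merops_header(line):
    #keep the first whitespace-delimited token and the text between the first '#' pair
    ID = line.rstrip().split()[0]
    family = line.rstrip().split('#')[1]
    return '{:} {:}\n'.format(ID, family)

print(reformat_merops_header('>MER0000001 peptidase description #S01A#\n'))
#expected output: '>MER0000001 S01A'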
Example #2
def uniprotDB(info, force=False):
    '''
    download swissprot/uniprot database, format for diamond, and output date of database
    '''
    fasta = os.path.join(FUNDB, 'uniprot_sprot.fasta')
    database = os.path.join(FUNDB, 'uniprot.dmnd')
    versionfile = os.path.join(FUNDB, 'uniprot.release-date.txt')
    if os.path.isfile(fasta) and args.update and not force:
        if check4newDB('uniprot-release', info):
            force=True
    if not os.path.isfile(fasta) or force:
        lib.log.info('Downloading UniProtKB/SwissProt database')
        download(lib.DBURL.get('uniprot'), fasta+'.gz')
        subprocess.call(['gunzip', '-f', 'uniprot_sprot.fasta.gz'], cwd=FUNDB)
        download(lib.DBURL.get('uniprot-release'), versionfile)
        md5 = calcmd5(versionfile)
        unidate = ''
        univers = ''
        with open(versionfile, 'rU') as infile:
            for line in infile:
                if line.startswith('UniProtKB/Swiss-Prot Release'):
                    rest, datepart = line.split(' of ')
                    unidate = datetime.datetime.strptime(datepart.rstrip(), "%d-%b-%Y").strftime("%Y-%m-%d") 
                    univers = rest.split(' ')[-1]
        lib.log.info('Building diamond database')
        cmd = ['diamond', 'makedb', '--in', 'uniprot_sprot.fasta', '--db', 'uniprot']
        lib.runSubprocess(cmd, FUNDB, lib.log)
        num_records = lib.countfasta(os.path.join(FUNDB, 'uniprot_sprot.fasta'))
        info['uniprot'] = ('diamond', database, univers, unidate, num_records, md5)
    type, name, version, date, records, checksum = info.get('uniprot')
    lib.log.info('UniProtKB Database: version={:} date={:} records={:,}'.format(version, date, records))
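The release file parsed above is expected to contain a line of the form 'UniProtKB/Swiss-Prot Release <version> of <DD-Mon-YYYY>' (layout inferred from the split on ' of '); the extraction in isolation, on a made-up line:

import datetime

#made-up release line following the 'Release <version> of <DD-Mon-YYYY>' layout assumed above
line = 'UniProtKB/Swiss-Prot Release 2017_10 of 25-Oct-2017\n'
rest, datepart = line.split(' of ')
unidate = datetime.datetime.strptime(datepart.rstrip(), '%d-%b-%Y').strftime('%Y-%m-%d')
univers = rest.split(' ')[-1]
print('version=%s date=%s' % (univers, unidate))
#version=2017_10 date=2017-10-25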
Example #3
def repeatDB(info, force=False):
    fasta = os.path.join(FUNDB, 'funannotate.repeat.proteins.fa')
    filtered = os.path.join(FUNDB, 'funannotate.repeats.reformat.fa')
    database = os.path.join(FUNDB, 'repeats.dmnd')
    if os.path.isfile(fasta) and args.update and not force:
        if check4newDB('repeats', info):
            force=True
    if not os.path.isfile(fasta) or force:
        lib.log.info('Downloading Repeat database')
        download(lib.DBURL.get('repeats'), fasta+'.tar.gz')
        md5 = calcmd5(fasta+'.tar.gz')
        subprocess.call(['tar', '-zxf', 'funannotate.repeat.proteins.fa.tar.gz'], cwd=FUNDB)
        with open(filtered, 'w') as out:
            with open(fasta, 'rU') as infile:
                for line in infile:
                    #this repeat fasta file has messed up headers....
                    if line.startswith('>'):
                        line = line.replace('#', '_')
                        line = line.replace('/', '-')
                        line = line.replace('&', '')
                    out.write(line)
        lib.log.info('Building diamond database')
        cmd = ['diamond', 'makedb', '--in', 'funannotate.repeats.reformat.fa', '--db', 'repeats']
        lib.runSubprocess(cmd, FUNDB, lib.log)
        num_records = lib.countfasta(filtered)
        info['repeats'] = ('diamond', database, '1.0', today, num_records, md5)
    type, name, version, date, records, checksum = info.get('repeats')
    lib.log.info('Repeat Database: version={:} date={:} records={:,}'.format(version, date, records))
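The cleanup above only rewrites characters in the repeat headers that confuse downstream parsers; the same three substitutions run on a hypothetical messy header:

#hypothetical messy repeat header run through the same three substitutions as above
line = '>rnd-1_family-22#LINE/L1\n'
line = line.replace('#', '_').replace('/', '-').replace('&', '')
print(line.rstrip())
#>rnd-1_family-22_LINE-L1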
Example #4
def runPASAtrain(genome, transcripts, cleaned_transcripts, stranded, intronlen, cpus, dbname, output):
    '''
    function will run PASA align assembly and then choose best gene models for training
    '''
    if cpus > 2:
        pasa_cpus = cpus // 2
    else:
        pasa_cpus = 2
    #create tmpdir
    folder = os.path.join(tmpdir, 'pasa')
    if not os.path.isdir(folder):
        os.makedirs(folder)
    
    #get config files and edit
    alignConfig = os.path.join(folder, 'alignAssembly.txt')
    pasaDBname = dbname.replace('-', '_')
    if args.pasa_db == 'sqlite':
        pasaDBname_path = os.path.abspath(os.path.join(folder, pasaDBname))
    else:
        pasaDBname_path = pasaDBname
    with open(alignConfig, 'w') as config1:
        with open(os.path.join(PASA, 'pasa_conf', 'pasa.alignAssembly.Template.txt'), 'rU') as template1:
            for line in template1:
                line = line.replace('<__DATABASE__>', pasaDBname_path)
                line = line.replace('<__MYSQLDB__>', pasaDBname_path)
                config1.write(line)
    if not os.path.isfile(os.path.join(folder, pasaDBname+'.assemblies.fasta')):
        #now run first PASA step, note this will dump any database with same name 
        lib.log.info("Running PASA alignment step using {:,} transcripts".format(lib.countfasta(cleaned_transcripts)))
        cmd = [LAUNCHPASA, '-c', os.path.abspath(alignConfig), '-r', '-C', '-R', '-g', os.path.abspath(genome), '--ALIGNERS', 'blat,gmap', '-T','-t', os.path.abspath(cleaned_transcripts), '-u', os.path.abspath(transcripts), '--stringent_alignment_overlap', args.pasa_alignment_overlap, '--TRANSDECODER', '--ALT_SPLICE', '--MAX_INTRON_LENGTH', str(intronlen), '--CPU', str(pasa_cpus)]
        if stranded != 'no':
            cmd = cmd + ['--transcribed_is_aligned_orient']
        lib.runSubprocess(cmd, folder, lib.log)
    else:
        lib.log.info('Existing PASA assemblies found: {:}'.format(os.path.join(folder, pasaDBname+'.assemblies.fasta')))
    #generate TSV gene-transcripts
    Loci = []
    numTranscripts = 0
    with open(os.path.join(folder, 'pasa.gene2transcripts.tsv'), 'w') as gene2transcripts:
        with open(os.path.join(folder, pasaDBname+'.pasa_assemblies_described.txt'), 'rU') as description:
            for line in description:
                if not line.startswith('#'):
                    cols = line.split('\t')
                    gene2transcripts.write('g_%s\t%s\n' % (cols[1], cols[2]))
                    numTranscripts += 1
                    if cols[1] not in Loci:
                        Loci.append(cols[1])
    lib.log.info("PASA assigned {:,} transcipts to {:,} loci (genes)".format(numTranscripts, len(Loci)))
    lib.log.info("Getting PASA models for training with TransDecoder")
    pasa_training_gff = os.path.join(folder, pasaDBname+'.assemblies.fasta.transdecoder.genome.gff3') 
    cmd = [os.path.join(PASA, 'scripts', 'pasa_asmbls_to_training_set.dbi'), '--pasa_transcripts_fasta', pasaDBname+'.assemblies.fasta', '--pasa_transcripts_gff3', pasaDBname+'.pasa_assemblies.gff3']
    lib.runSubprocess(cmd, folder, lib.log)
    #grab final result
    shutil.copyfile(pasa_training_gff, output)
    lib.log.info('PASA finished. PASAweb accessible via: localhost:port/cgi-bin/index.cgi?db=%s' % pasaDBname_path)
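A standalone sketch of the database-naming rule at the top of runPASAtrain: SQLite wants an absolute file path inside the PASA working folder, while MySQL just wants a database name, with dashes replaced up front because MySQL database names cannot contain them:

import os

#sketch of the pasaDBname logic above; 'pasa_db' mirrors args.pasa_db
def pasa_db_target(dbname, pasa_db, folder):
    name = dbname.replace('-', '_')  #dashes are not safe in MySQL database names
    if pasa_db == 'sqlite':
        return os.path.abspath(os.path.join(folder, name))  #sqlite wants a file path
    return name  #mysql just wants the database name

print(pasa_db_target('my-species', 'sqlite', 'pasa'))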
Example #5
def mibigDB(info, force=False):
    fasta = os.path.join(args.database, 'mibig.fa')
    database = os.path.join(args.database, 'mibig.dmnd')
    if not os.path.isfile(fasta) or force:
        lib.log.info('Downloading MiBIG Secondary Metabolism database')
        download(URL.get('mibig'), fasta)
        version = os.path.basename(URL.get('mibig')).split('_')[-1].replace(
            '.fasta', '')
        lib.log.info('Building diamond database')
        cmd = ['diamond', 'makedb', '--in', 'mibig.fa', '--db', 'mibig']
        lib.runSubprocess(cmd, args.database, lib.log)
        num_records = lib.countfasta(fasta)
        info['mibig'] = ('diamond', database, version, today, num_records)
    type, name, version, date, records = info.get('mibig')
    lib.log.info('MiBIG Database: version={:} date={:} records={:,}'.format(
        version, date, records))
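The version string above comes straight from the download URL's basename; the parse in isolation, on a hypothetical URL of the 'mibig_<version>.fasta' shape the code assumes:

import os

#hypothetical URL of the 'mibig_<version>.fasta' shape the parse above assumes
url = 'https://example.org/databases/mibig_1.4.fasta'
version = os.path.basename(url).split('_')[-1].replace('.fasta', '')
print(version)
#1.4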
Example #6
                                      os.path.basename(query)))
        os.rename(scaffold,
                  os.path.join(tmpdir, 'failed', os.path.basename(scaffold)))
    else:
        for y in [query, scaffold]:
            try:
                os.remove(y)
            except OSError:
                lib.log.debug("Error removing %s" % (y))
    #check filesize of exonerate output, no hits still have some output data in them, should be safe dropping anything smaller than 500 bytes
    if lib.getSize(exonerate_out) < 500:
        os.remove(exonerate_out)


#count number of proteins to look for
total = lib.countfasta(args.proteins)
lib.log.info('Using {0:,}'.format(total) + ' proteins as queries')

#make tmpdir
tmpdir = 'p2g_' + str(os.getpid())
if not os.path.isdir(tmpdir):
    os.makedirs(tmpdir)
    os.makedirs(os.path.join(tmpdir, 'failed'))
    os.makedirs(os.path.join(tmpdir, 'scaffolds'))

if args.filter == 'tblastn':
    lib.log.debug("BLAST v%s; Exonerate v%s" % (blast_version, exo_version))
    #check for tblastn input
    if args.tblastn:
        lib.log.info("Using pre-calculated tBLASTN result")
        BlastResult = args.tblastn
Example #7
def removeAntiSense(input, readTuple, output):
    '''
    function will map reads to the input transcripts, determine strandedness, and then filter
    out transcripts that were assembled in antisense orientation. The idea is that antisense
    transcripts, while potentially valid, will not help update the gene models and could even
    hurt the annotation effort.
    '''
    lib.log.info("Running anti-sense filtering of Trinity transcripts")
    bamthreads = (args.cpus + 1) // 2  #use half the threads for bam compression
    aligner = choose_aligner()
    if aligner == 'hisat2':
        bowtie2bam = os.path.join(tmpdir, 'hisat2.transcripts.coordSorted.bam')
        if not os.path.isfile(bowtie2bam):
            lib.log.info("Building Hisat2 index of " +
                         "{0:,}".format(lib.countfasta(input)) +
                         " trinity transcripts")
            cmd = [
                'hisat2-build', input,
                os.path.join(tmpdir, 'hisat2.transcripts')
            ]
            lib.runSubprocess4(cmd, '.', lib.log)

            #now launch the aligner
            lib.log.info("Aligning reads to trinity transcripts with Hisat2")
            hisat2cmd = [
                'hisat2', '-p',
                str(args.cpus), '-k', '50', '--max-intronlen',
                str(args.max_intronlen), '-x',
                os.path.join(tmpdir, 'hisat2.transcripts')
            ]
            if readTuple[2]:
                hisat2cmd = hisat2cmd + ['-U', readTuple[2]]
            if readTuple[0] and readTuple[1]:
                hisat2cmd = hisat2cmd + [
                    '-1', readTuple[0], '-2', readTuple[1]
                ]
            cmd = [
                os.path.join(parentdir, 'util', 'sam2bam.sh'),
                " ".join(hisat2cmd),
                str(bamthreads), bowtie2bam
            ]
            lib.runSubprocess4(cmd, '.', lib.log)

    elif aligner == 'bowtie2':
        #using bowtie2
        bowtie2bam = os.path.join(tmpdir,
                                  'bowtie2.transcripts.coordSorted.bam')
        if not os.path.isfile(bowtie2bam):
            lib.log.info("Building Bowtie2 index of " +
                         "{0:,}".format(lib.countfasta(input)) +
                         " trinity transcripts")
            cmd = [
                'bowtie2-build', input,
                os.path.join(tmpdir, 'bowtie2.transcripts')
            ]
            lib.runSubprocess4(cmd, '.', lib.log)
            #now launch the subprocess commands in order
            lib.log.info("Aligning reads to trinity transcripts with Bowtie2")
            bowtie2cmd = [
                'bowtie2', '-p',
                str(args.cpus), '-k', '50', '--local', '--no-unal', '-x',
                os.path.join(tmpdir, 'bowtie2.transcripts')
            ]
            if readTuple[2]:
                bowtie2cmd = bowtie2cmd + ['-U', readTuple[2]]
            if readTuple[0] and readTuple[1]:
                bowtie2cmd = bowtie2cmd + [
                    '-1', readTuple[0], '-2', readTuple[1]
                ]
            cmd = [
                os.path.join(parentdir, 'util', 'sam2bam.sh'),
                " ".join(bowtie2cmd),
                str(bamthreads), bowtie2bam
            ]
            lib.runSubprocess4(cmd, '.', lib.log)

    elif aligner == 'rapmap':
        #using rapmap
        bowtie2bam = os.path.join(tmpdir, 'rapmap.transcripts.coordSorted.bam')
        if not os.path.isfile(bowtie2bam):
            lib.log.info("Building RapMap index of " +
                         "{0:,}".format(lib.countfasta(input)) +
                         " trinity transcripts")
            cmd = [
                'rapmap', 'quasiindex', '-t', input, '-i',
                os.path.join(tmpdir, 'rapmap_index')
            ]
            lib.runSubprocess4(cmd, '.', lib.log)
            #now launch the subprocess commands in order
            lib.log.info("Aligning reads to trinity transcripts with RapMap")
            rapmapcmd = [
                'rapmap', 'quasimap', '-t',
                str(args.cpus), '-i',
                os.path.join(tmpdir, 'rapmap_index'), '-1', readTuple[0], '-2',
                readTuple[1]
            ]
            cmd = [
                os.path.join(parentdir, 'util', 'sam2bam.sh'),
                " ".join(rapmapcmd),
                str(bamthreads), bowtie2bam
            ]
            lib.runSubprocess(cmd, '.', lib.log)

    #now run Trinity examine strandeness tool
    lib.log.info("Examining strand specificity")
    cmd = [
        os.path.join(TRINITY, 'util', 'misc', 'examine_strand_specificity.pl'),
        bowtie2bam,
        os.path.join(tmpdir, 'strand_specific')
    ]
    lib.runSubprocess(cmd, '.', lib.log)
    #parse output dat file and get list of transcripts to remove
    removeList = []
    with open(os.path.join(tmpdir, 'strand_specific.dat'), 'rU') as infile:
        for line in infile:
            line = line.replace('\n', '')
            if line.startswith('#'):
                continue
            cols = line.split('\t')
            if args.stranded == 'RF':  #then we want to keep negative ratios in cols[4]
                if not cols[4].startswith('-'):
                    removeList.append(cols[0])
            elif args.stranded == 'FR':  #keep + values
                if cols[4].startswith('-'):
                    removeList.append(cols[0])

    #now parse the input fasta file removing records in list
    with open(output, 'w') as outfile:
        with open(input, 'rU') as infile:
            for record in SeqIO.parse(infile, 'fasta'):
                if record.id not in removeList:
                    outfile.write(">%s\n%s\n" %
                                  (record.description, str(record.seq)))
    lib.log.info("Removing %i antisense transcripts" % (len(removeList)))
Example #8
                        break
    else:
        lib.log.error(
            "No species name given will cause problems downstream, please pass a name to -s,--species"
        )
        sys.exit(1)
else:
    organism = args.species
    if not args.isolate:
        isolate = '???'
    else:
        isolate = args.isolate

############################################################################
#start workflow here
ProtCount = lib.countfasta(Proteins)
lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded')

#run PFAM-A search
lib.log.info("Running HMMer search of PFAM domains")
pfam_results = os.path.join(outputdir, 'annotate_misc', 'annotations.pfam.txt')
if not lib.checkannotations(pfam_results):
    lib.PFAMsearch(Proteins, args.cpus, 1e-50,
                   os.path.join(outputdir, 'annotate_misc'), pfam_results)
num_annotations = lib.line_count(pfam_results)
lib.log.info('{0:,}'.format(num_annotations) + ' annotations added')
#run SwissProt Blast search
lib.log.info("Running Blastp search of UniProt DB")
blast_out = os.path.join(outputdir, 'annotate_misc',
                         'annotations.swissprot.txt')
if not lib.checkannotations(blast_out):
Example #9
lib.setupLogging(log_name)
FNULL = open(os.devnull, 'w')
cmd_args = " ".join(sys.argv)+'\n'
lib.log.debug(cmd_args)
print "-------------------------------------------------------"
lib.SystemInfo()

#get version of funannotate
version = lib.get_version()
lib.log.info("Running %s" % version)

#check buscos, download if necessary
if not os.path.isdir(os.path.join(parentdir, 'DB', args.busco_db)):
    lib.download_buscos(args.busco_db)

ProtCount = lib.countfasta(args.input)
lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded')  

#convert to proteins and screen with busco
lib.log.info("Looking for BUSCO models with %s DB" % args.busco_db)
BUSCODB = os.path.join(parentdir, 'DB', args.busco_db)
BUSCO = os.path.join(parentdir, 'util', 'funannotate-BUSCO2.py')
cmd = [sys.executable, BUSCO, '-i', os.path.abspath(args.input), '-m', 'proteins', '--lineage', BUSCODB, '-o', species, '--cpu', str(args.cpus), '-f']
lib.runSubprocess(cmd, '.', lib.log)

#check that it ran correctly
busco_results = os.path.join('run_'+species, 'full_table_'+species+'.tsv')
if not lib.checkannotations(busco_results):
    lib.log.error("BUSCO failed, check logfile")
    sys.exit(1)
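lib.checkannotations is funannotate-internal; in these snippets it is used in the sense of 'results file exists and is non-empty'. A hedged stand-in built only on the standard library (an assumption about its behavior, not its real implementation):

import os

#hedged stand-in for lib.checkannotations: treat a results file as valid only if it
#exists and is non-empty (assumed behavior; the real helper is internal to funannotate)
def check_annotations(path):
    return os.path.isfile(path) and os.path.getsize(path) > 0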
nameChange = {}
parser = argparse.ArgumentParser(
    description='''Script that sorts input by length and then renames contig headers.''',
    epilog="""Written by Jon Palmer (2016) [email protected]""",
    formatter_class = MyFormatter)
parser.add_argument('-i','--input', required=True, help='Multi-fasta genome file')
parser.add_argument('-o','--out', required=True, help='Cleaned output (FASTA)')
parser.add_argument('-b','--base', default='scaffold', help='Basename of contig header')
args=parser.parse_args()

def SortRenameHeaders(input, basename, output):
    #sort records and write temp file
    with open(output, 'w') as output:
        with open(input, 'rU') as input:
            records = list(SeqIO.parse(input, 'fasta'))
            records.sort(cmp=lambda x,y: cmp(len(y),len(x)))
            counter = 1
            for rec in records:
                rec.name = ''
                rec.description = ''
                rec.id = basename + '_' + str(counter)
                if len(rec.id) > 16:
                    print "Error. Fasta header too long %s.  Choose a different --base name. Max is 16 characters" % rec.id
                    os._exit(1)
                counter +=1
            SeqIO.write(records, output, 'fasta')

Count = lib.countfasta(args.input)
print('{0:,}'.format(Count) + ' contig records loaded')
print("Sorting and renaming contig headers")
SortRenameHeaders(args.input, args.base, args.out)
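records.sort(cmp=...) in SortRenameHeaders is Python-2-only; under Python 3 the same longest-first ordering can be expressed with a key function, since Biopython's SeqRecord supports len():

from Bio import SeqIO

#Python-3-safe variant of the cmp-based sort above: SeqRecord supports len(),
#so a key function gives the same longest-first order
def sorted_by_length(path):
    records = list(SeqIO.parse(path, 'fasta'))
    records.sort(key=len, reverse=True)
    return records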

                    organism = f.qualifiers.get("organism", ["???"])[0]
                    if not args.isolate:
                        isolate = f.qualifiers.get("isolate", ["???"])[0]
                    else:
                        isolate = args.isolate
                    break
else:
    organism = args.species
    if not args.isolate:
        isolate = '???'
    else:
        isolate = args.isolate

############################################################################
#start workflow here
ProtCount = lib.countfasta(Proteins)
lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded')  
 
#run PFAM-A search
lib.log.info("Running HMMer search of PFAM domains")
pfam_results = os.path.join(outputdir, 'annotate_misc', 'annotations.pfam.txt')
if not lib.checkannotations(pfam_results):
    lib.PFAMsearch(Proteins, args.cpus, 1e-50, os.path.join(outputdir, 'annotate_misc'), pfam_results)
num_annotations = lib.line_count(pfam_results)
lib.log.info('{0:,}'.format(num_annotations) + ' annotations added')
#run SwissProt Blast search
lib.log.info("Running Blastp search of UniProt DB")
blast_out = os.path.join(outputdir, 'annotate_misc', 'annotations.swissprot.txt')
if not lib.checkannotations(blast_out):
    lib.SwissProtBlast(Proteins, args.cpus, 1e-5, os.path.join(outputdir, 'annotate_misc'), blast_out)
num_annotations = lib.line_count(blast_out)
def runPASAtrain(genome, transcripts, stranded, intronlen, cpus, dbname,
                 output):
    '''
    function will run PASA align assembly and then choose best gene models for training
    '''
    if cpus > 2:
        pasa_cpus = cpus // 2
    else:
        pasa_cpus = 2
    #create tmpdir
    folder = os.path.join(tmpdir, 'pasa')
    if not os.path.isdir(folder):
        os.makedirs(folder)

    #create pasa and transdecoder logfiles
    pasa_log = os.path.join(folder, 'pasa.log')
    transdecoder_log = os.path.join(folder, 'transdecoder.log')

    #get config files and edit
    alignConfig = os.path.join(folder, 'alignAssembly.txt')
    pasaDBname = dbname.replace('-', '_')
    with open(alignConfig, 'w') as config1:
        with open(
                os.path.join(PASA, 'pasa_conf',
                             'pasa.alignAssembly.Template.txt'),
                'rU') as template1:
            for line in template1:
                line = line.replace('<__MYSQLDB__>', pasaDBname)
                config1.write(line)
    if not os.path.isfile(
            os.path.join(folder, pasaDBname + '.assemblies.fasta')):
        #now run first PASA step, note this will dump any database with same name
        lib.log.info(
            "Running PASA alignment step using {:,} transcripts".format(
                lib.countfasta(transcripts)))
        cmd = [
            os.path.join(PASA, 'scripts', 'Launch_PASA_pipeline.pl'), '-c',
            os.path.abspath(alignConfig), '-r', '-C', '-R', '-g',
            os.path.abspath(genome), '--ALIGNERS', 'blat,gmap', '-t',
            os.path.abspath(transcripts), '--stringent_alignment_overlap',
            args.pasa_alignment_overlap, '--TRANSDECODER',
            '--MAX_INTRON_LENGTH',
            str(intronlen), '--CPU',
            str(pasa_cpus)
        ]
        if stranded != 'no':
            cmd = cmd + ['--transcribed_is_aligned_orient']
        lib.runSubprocess(cmd, folder, lib.log)
    else:
        lib.log.info('Existing PASA assemblies found {:}'.format(
            os.path.join(folder, pasaDBname + '.assemblies.fasta')))
    #generate TSV gene-transcripts
    numLoci = getPASAtranscripts2genes(
        os.path.join(folder, pasaDBname + '.pasa_assemblies.gff3'),
        os.path.join(folder, 'pasa.gene2transcripts.tsv'))
    numTranscripts = lib.countfasta(
        os.path.join(folder, pasaDBname + '.assemblies.fasta'))
    lib.log.info(
        "Assigned {:,} transcipts to {:,} loci using {:}% overlap threshold".
        format(numTranscripts, numLoci, args.pasa_alignment_overlap))

    lib.log.info("Getting PASA models for training with TransDecoder")
    pasa_training_gff = os.path.join(
        folder, pasaDBname + '.assemblies.fasta.transdecoder.genome.gff3')
    if lib.which('TransDecoder.LongOrfs') and lib.which(
            'TransDecoder.Predict'):
        cmd = [
            'TransDecoder.LongOrfs', '-t', pasaDBname + '.assemblies.fasta',
            '--gene_trans_map', 'pasa.gene2transcripts.tsv'
        ]
        lib.runSubprocess(cmd, folder, lib.log)
        cmd = [
            'TransDecoder.Predict', '-t', pasaDBname + '.assemblies.fasta',
            '--single_best_only'
        ]
        lib.runSubprocess(cmd, folder, lib.log)
        cmd = [
            os.path.join(PASA, 'pasa-plugins', 'transdecoder',
                         'cdna_alignment_orf_to_genome_orf.pl'),
            pasaDBname + '.assemblies.fasta.transdecoder.gff3',
            pasaDBname + '.pasa_assemblies.gff3',
            pasaDBname + '.assemblies.fasta'
        ]
        lib.runSubprocess2(cmd, folder, lib.log, pasa_training_gff)
    else:
        cmd = [
            os.path.join(PASA, 'scripts', 'pasa_asmbls_to_training_set.dbi'),
            '--pasa_transcripts_fasta', pasaDBname + '.assemblies.fasta',
            '--pasa_transcripts_gff3', pasaDBname + '.pasa_assemblies.gff3'
        ]
        lib.runSubprocess(cmd, folder, lib.log)
    #grab final result
    shutil.copyfile(pasa_training_gff, output)
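The branch near the end of this runPASAtrain prefers a standalone TransDecoder when both executables are on PATH and only falls back to the PASA-bundled script otherwise. A minimal sketch of that selection, with distutils.spawn.find_executable standing in for the internal lib.which:

from distutils.spawn import find_executable

#sketch of the PATH-based tool selection above; find_executable stands in for lib.which
if find_executable('TransDecoder.LongOrfs') and find_executable('TransDecoder.Predict'):
    print('using standalone TransDecoder')
else:
    print('falling back to pasa_asmbls_to_training_set.dbi')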