Ejemplo n.º 1
0
def showAll(dir):
    """Print a summary table of trained ab initio predictors per species.

    Scans *dir* for subdirectories containing a readable ``info.json``;
    each one contributes a row: species name, the training source recorded
    for each supported predictor, and the date stored for Augustus.
    Rows are natural-sorted by species name before printing.
    """
    Table = []
    TableHeader = [
        'Species', 'Augustus', 'GeneMark', 'Snap', 'GlimmerHMM',
        'CodingQuarry', 'Date'
    ]
    for f in os.listdir(dir):
        ff = os.path.join(dir, f)
        if os.path.isdir(ff) and lib.checkannotations(
                os.path.join(ff, 'info.json')):
            with open(os.path.join(ff, 'info.json')) as infile:
                data = json.load(infile)
            sources = [f]
            for x in [
                    'augustus', 'genemark', 'snap', 'glimmerhmm',
                    'codingquarry'
            ]:
                if x in data:
                    if len(data[x][0]) < 1:
                        sources.append('None')
                    else:
                        sourceFile = data[x][0]['source']
                        if ': ' in sourceFile:
                            sourceFile = sourceFile.split(':')[0]
                        sources.append(sourceFile)
            sources.append(data['augustus'][0]['date'])
            # BUGFIX: append only for directories that actually yielded a
            # row; previously this ran for every entry, raising NameError on
            # the first non-matching entry or re-adding the last parsed row.
            Table.append(sources)
    Table = natsorted(Table, key=lambda x: x[0])
    Table.insert(0, TableHeader)
    lib.print_table(Table, max_col_width=40)
Ejemplo n.º 2
0
def runGOenrichment(input):
    """Run goatools' find_enrichment.py on a gene list file.

    Results are written to <args.out>/<basename>.go.enrichment.txt; the
    run is skipped entirely when that output already exists.  Tool output
    is discarded (sent to FNULL).
    """
    prefix = os.path.basename(input).replace('.txt', '')
    goa_out = os.path.join(args.out, '{}.go.enrichment.txt'.format(prefix))
    if lib.checkannotations(goa_out):
        return
    enrichment_cmd = [
        'find_enrichment.py',
        '--obo', os.path.join(FUNDB, 'go.obo'),
        '--pval', '0.001',
        '--alpha', '0.001',
        '--method', 'fdr',
        '--outfile', goa_out,
        input,
        os.path.join(args.input, 'population.txt'),
        os.path.join(args.input, 'associations.txt'),
    ]
    subprocess.call(enrichment_cmd, stdout=FNULL, stderr=FNULL)
Ejemplo n.º 3
0
def speciesAvailable(dir):
    """Return a dict mapping species folder name -> parsed info.json data.

    Only subdirectories of *dir* that contain a readable info.json (as
    judged by lib.checkannotations) are included.
    """
    Results = {}
    for entry in os.listdir(dir):
        species_dir = os.path.join(dir, entry)
        info_json = os.path.join(species_dir, 'info.json')
        if not os.path.isdir(species_dir):
            continue
        if not lib.checkannotations(info_json):
            continue
        with open(info_json) as handle:
            Results[entry] = json.load(handle)
    return Results
Ejemplo n.º 4
0
def runGOenrichment(input):
    """Run goatools' find_enrichment.py on a gene list, logging the command.

    Writes enrichment results to <args.out>/<basename>.go.enrichment.txt
    and records the exact command plus the tool's stdout/stderr in a
    sibling .go.enrichment.log file.  Skipped if the output already exists.
    """
    basename = os.path.basename(input).replace('.txt', '')
    goa_out = os.path.join(args.out, basename + '.go.enrichment.txt')
    go_log = os.path.join(args.out, basename + '.go.enrichment.log')
    if not lib.checkannotations(goa_out):
        cmd = [
            'find_enrichment.py', '--obo',
            os.path.join(FUNDB, 'go.obo'), '--pval', '0.001', '--alpha',
            '0.001', '--method', 'fdr', '--outfile', goa_out, input,
            os.path.join(args.input, 'population.txt'),
            os.path.join(args.input, 'associations.txt')
        ]
        # Open the log once (previously opened twice, 'w' then 'a').
        # Flush the header before handing the descriptor to the child so
        # the buffered header cannot land after the tool's own output.
        with open(go_log, 'w') as outfile:
            outfile.write('{}\n'.format(' '.join(cmd)))
            outfile.flush()
            subprocess.call(cmd, stdout=outfile, stderr=outfile)
Ejemplo n.º 5
0
# Run Augustus over scaffold chunks in parallel, capping the worker count
# at the number of chunks so we never spawn idle workers.
if args.cpus > len(scaffolds):
    num = len(scaffolds)
else:
    num = args.cpus
lib.log.debug("Running Augustus on %i chunks, using %i CPUs" %
              (len(scaffolds), num))
lib.runMultiProgress(runAugustus, scaffolds, num)

lib.log.debug("Augustus prediction is finished, now concatenating results")
# Concatenate every per-scaffold prediction into one combined GFF3.
with open(os.path.join(tmpdir, 'augustus_all.gff3'), 'w') as output:
    for file in scaffolds:  # NOTE: 'file' shadows the Python 2 builtin
        file = os.path.join(tmpdir, file + '.augustus.gff3')
        with open(file) as input:  # NOTE: 'input' also shadows a builtin
            output.write(input.read())

# NOTE(review): this check only emits a debug message; the join below runs
# even when the combined GFF3 is empty or missing -- confirm intended.
if lib.checkannotations(os.path.join(tmpdir, 'augustus_all.gff3')):
    lib.log.debug('Augustus finished, now joining results')
# Prefer join_aug_pred.pl from PATH; fall back to the AUGUSTUS install tree.
if lib.which_path('join_aug_pred.pl'):
    join_script = 'join_aug_pred.pl'
else:
    join_script = os.path.join(AUGUSTUS_BASE, 'scripts', 'join_aug_pred.pl')

# This shell-style command string is built for the debug log only; the
# actual execution below uses subprocess with explicit stdin/stdout
# redirection instead of a shell.
cmd = '{:} < {:} > {:}'.format(join_script,
                               os.path.join(tmpdir, 'augustus_all.gff3'),
                               args.out)
lib.log.debug(cmd)

with open(args.out, 'w') as finalout:
    with open(os.path.join(tmpdir, 'augustus_all.gff3'), 'r') as infile:
        subprocess.call([join_script], stdin=infile, stdout=finalout)
Ejemplo n.º 6
0
def main(args):
    """Command-line entry point for funannotate-remote.

    Parses arguments, normalizes the input (either a funannotate results
    folder or an annotated GenBank file), regenerates the derived FASTA/GFF
    files for consistent gene naming, then runs the requested remote
    analyses (Phobius and/or the antiSMASH webserver) and stores results
    under <outputdir>/annotate_misc for later auto-detection.
    """
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='funannotate-remote.py',
        description=
        '''Script that adds functional annotation to a genome using remote searches.''',
        epilog="""Written by Jon Palmer (2016-2017) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--input',
                        help='Folder from funannotate predict.')
    parser.add_argument('-g',
                        '--genbank',
                        help='Annotated genome in GenBank format')
    parser.add_argument('-m',
                        '--methods',
                        required=True,
                        nargs='+',
                        choices=['all', 'phobius', 'antismash'],
                        help='Method to run')
    parser.add_argument('-o', '--out', help='Basename of output files')
    parser.add_argument('-e',
                        '--email',
                        required=True,
                        help='Email address for IPRSCAN server')
    parser.add_argument('--force',
                        action='store_true',
                        help='Over-write output folder')
    parser.add_argument('-a',
                        '--antismash',
                        default='fungi',
                        choices=['fungi', 'plants'],
                        help='antiSMASH server')
    args = parser.parse_args(args)

    global parentdir, RUNIPRSCAN, XMLCombine
    parentdir = os.path.join(os.path.dirname(__file__))
    RUNIPRSCAN = os.path.join(parentdir, 'aux_scripts', 'runIPRscan.py')
    XMLCombine = os.path.join(parentdir, 'aux_scripts', 'xmlcombine.py')

    # create log file
    log_name = 'funannotate-remote.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    # initialize script, log system info and cmd issue at runtime
    lib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    lib.log.debug(cmd_args)
    # print() with a single string works identically on Python 2 and 3
    # (was a Py2-only 'print' statement)
    print("-------------------------------------------------------")
    lib.SystemInfo()

    # get version of funannotate
    version = lib.get_version()
    lib.log.info("Running %s" % version)

    # need to do some checks here of the input
    genbank = ''
    Proteins = ''
    tablefile = ''
    Fastafile = ''
    if not args.input:
        # did not parse folder of funannotate results, so need either gb + gff or fasta + proteins, + gff and also need to have args.out for output folder
        if not args.out:
            lib.log.error(
                "If you are not providing funannotate predict input folder, then you need to provide an output folder (--out)"
            )
            sys.exit(1)
        else:
            outputdir = args.out
            # create outputdir and subdirs
            if not os.path.isdir(outputdir):
                os.makedirs(outputdir)
                os.makedirs(os.path.join(outputdir, 'annotate_misc'))
                os.makedirs(os.path.join(outputdir, 'annotate_results'))
                os.makedirs(os.path.join(outputdir, 'logfiles'))
        if not args.genbank:
            lib.log.error(
                "You did not specifiy the apropriate input files, either: \n1) Funannotate input \n2) GenBank"
            )
            sys.exit(1)
        else:
            # create output directories
            if not os.path.isdir(outputdir):
                os.makedirs(outputdir)
                os.makedirs(os.path.join(outputdir, 'annotate_misc'))
                os.makedirs(os.path.join(outputdir, 'annotate_results'))
                os.makedirs(os.path.join(outputdir, 'logfiles'))
            else:
                lib.log.error("Output directory %s already exists" %
                              (outputdir))
                if not os.path.isdir(os.path.join(outputdir, 'annotate_misc')):
                    os.makedirs(os.path.join(outputdir, 'annotate_misc'))
                if not os.path.isdir(
                        os.path.join(outputdir, 'annotate_results')):
                    os.makedirs(os.path.join(outputdir, 'annotate_results'))
                if not os.path.isdir(os.path.join(outputdir, 'logfiles')):
                    os.makedirs(os.path.join(outputdir, 'logfiles'))
            genbank = args.genbank
            Scaffolds = os.path.join(outputdir, 'annotate_misc',
                                     'genome.scaffolds.fasta')
            Proteins = os.path.join(outputdir, 'annotate_misc',
                                    'genome.proteins.fasta')
            Transcripts = os.path.join(outputdir, 'annotate_misc',
                                       'genome.transcripts.fasta')
            GFF = os.path.join(outputdir, 'annotate_misc', 'genome.gff3')
            lib.log.info("Checking GenBank file for annotation")
            if not lib.checkGenBank(genbank):
                lib.log.error("Found no annotation in GenBank file, exiting")
                sys.exit(1)
            lib.gb2allout(genbank, GFF, Proteins, Transcripts, Scaffolds)

    else:
        # should be a folder, with funannotate files, thus store results there, no need to create output folder
        if not os.path.isdir(args.input):
            lib.log.error("%s directory does not exist" % args.input)
            sys.exit(1)
        # funannotate results should be here
        if os.path.isdir(os.path.join(args.input, 'update_results')):
            inputdir = os.path.join(args.input, 'update_results')
            outputdir = args.input
        elif os.path.isdir(os.path.join(args.input, 'predict_results')):
            inputdir = os.path.join(args.input, 'predict_results')
            outputdir = args.input
        else:
            # here user specified the predict_results folder, or it is a custom folder
            inputdir = os.path.join(args.input)

        # get files that you need
        for file in os.listdir(inputdir):
            if file.endswith('.gbk'):
                genbank = os.path.join(inputdir, file)
            elif file.endswith('.tbl'):
                tablefile = os.path.join(inputdir, file)
            elif file.endswith('.scaffolds.fa'):
                Fastafile = os.path.join(inputdir, file)
        # now create the files from genbank input file for consistency in gene naming, etc
        if not genbank:
            lib.log.error(
                "Properly formatted 'funannotate predict' files do no exist in this directory"
            )
            sys.exit(1)
        else:
            # if user gave predict_results folder, then set output to up one directory
            if 'predict_results' in inputdir or 'update_results' in inputdir:
                outputdir = lib.get_parent_dir(inputdir)
            else:
                if not args.out:
                    outputdir = inputdir  # output the results in the input directory
                else:
                    outputdir = args.out
                    if not os.path.isdir(outputdir):
                        os.makedirs(outputdir)
            # create output directories
            if not os.path.isdir(os.path.join(outputdir, 'annotate_misc')):
                os.makedirs(os.path.join(outputdir, 'annotate_misc'))
                os.makedirs(os.path.join(outputdir, 'annotate_results'))
            else:
                lib.log.error(
                    "Output directory %s already exists, will use any existing data.  If this is not what you want, exit, and provide a unique name for output folder"
                    % (outputdir))
            lib.log.info("Parsing input files")
            Scaffolds = os.path.join(outputdir, 'annotate_misc',
                                     'genome.scaffolds.fasta')
            Proteins = os.path.join(outputdir, 'annotate_misc',
                                    'genome.proteins.fasta')
            Transcripts = os.path.join(outputdir, 'annotate_misc',
                                       'genome.mrna-transcripts.fasta')
            CDSTranscripts = os.path.join(outputdir, 'annotate_misc',
                                          'genome.cds-transcripts.fasta')
            GFF = os.path.join(outputdir, 'annotate_misc', 'genome.gff3')
            if tablefile and Fastafile:
                lib.log.debug("Generating files from %s" % tablefile)
                lib.tbl2allout(tablefile, Fastafile, GFF, Proteins,
                               Transcripts, CDSTranscripts, Scaffolds)
            else:
                lib.log.debug("Generating files from %s" % genbank)
                lib.gb2allout(genbank, GFF, Proteins, Transcripts, Scaffolds)

    # make sure logfiles directory is present, will need later
    if not os.path.isdir(os.path.join(outputdir, 'logfiles')):
        os.makedirs(os.path.join(outputdir, 'logfiles'))

    # get absolute path for all input so there are no problems later, not using Transcripts yet could be error? so take out here
    Proteins = os.path.abspath(Proteins)
    genbank = os.path.abspath(genbank)

    if 'phobius' in args.methods or 'all' in args.methods:
        # run Phobius to predict secreted proteins and membrane, default is local if installed, otherwise remote
        phobius_out = os.path.join(outputdir, 'annotate_misc',
                                   'phobius.results.txt')
        phobiusLog = os.path.join(outputdir, 'logfiles', 'phobius.log')
        lib.log.info(
            "Predicting secreted and transmembrane proteins using Phobius")
        if not lib.checkannotations(phobius_out):
            if args.email:
                subprocess.call([
                    os.path.join(parentdir, 'aux_scripts',
                                 'phobius-multiproc.py'), '-i', Proteins, '-o',
                    phobius_out, '-e',
                    str(args.email), '-l', phobiusLog
                ])
            else:
                subprocess.call([
                    os.path.join(parentdir, 'aux_scripts',
                                 'phobius-multiproc.py'), '-i', Proteins, '-o',
                    phobius_out, '-l', phobiusLog
                ])

    if 'antismash' in args.methods or 'all' in args.methods:
        if args.antismash == 'fungi':
            base_address = "https://fungismash.secondarymetabolites.org"
            job_parameters = {
                'email': args.email,
                'ncbi': '',
                'smcogs': 'on',
                'knownclusterblast': 'on',
                'activesitefinder': 'on',
                'subclusterblast': 'on',
                'jobtype': 'antismash5',
                'hmmdetection_strictness': 'relaxed'
            }
        elif args.antismash == 'plants':
            base_address = "https://plantismash.secondarymetabolites.org"
            job_parameters = {
                'email': args.email,
                'knownclusterblast': 'on',
                'subclusterblast': 'on'
            }
        version = requests.get(base_address + "/api/v1.0/version")
        as_vers = version.json()['antismash_generation']
        tax = version.json()['taxon']
        as_status = requests.get(base_address + "/api/v1.0/stats")
        queue = as_status.json()['queue_length']
        running = as_status.json()['running']
        lib.log.info("Connecting to antiSMASH %s v%s webserver" %
                     (tax, as_vers))
        lib.log.info("Queue Length: %s; Jobs Running: %s" % (queue, running))
        # message text fixed (was "PLEASE to not abuse")
        lib.log.info("Please do not abuse the webserver, be considerate!")
        if int(queue) > 10 and not args.force:
            lib.log.error(
                "There are more than 10 antiSMASH jobs in queue, use --force to submit anyway"
            )
            sys.exit(1)
        job_files = {'seq': open(genbank, 'rb')}

        lib.log.info("Uploading %s to webserver" % genbank)
        postjob = requests.post(base_address + "/api/v1.0/submit",
                                files=job_files,
                                data=job_parameters)
        jobid = postjob.json()['id']
        # now we can query the job every so often, not sure what is reasonable here, start with 2 minutes?
        lib.log.info("Waiting for results from job: %s" % jobid)
        while True:
            job_status = requests.get(base_address + "/api/v1.0/status/" +
                                      jobid)
            if job_status.json()['status'] == 'done':
                break
            time.sleep(60)  # check every minute
        result_url = job_status.json()['result_url']
        base_url = result_url.replace('index.html', '')
        lib.log.info("antiSMASH v%s job finished" % (as_vers))
        lib.log.debug("%s" % job_status.json())
        # need to retrieve results, have to find link, seems like this might be first scaffold name?
        # after asking Kai Blin - there is no "easy" way to identify the output name, however, think I can grab the html file and parse it
        job_html = requests.get(base_address + result_url)
        link = None
        # BUGFIX: initialize so a missing 'Download all results' marker line
        # cannot raise NameError in the loop below
        cols = []
        for line in job_html.iter_lines():
            if 'Download all results' in line:
                cols = line.split('a href="')
        for x in cols:
            if '.zip' in x:
                link = x.split('"')[0]
        if not link:
            lib.log.error('Error parsing output zip file from antismash')
            sys.exit(1)
        baselink = link.replace('.zip', '')
        download_url = base_address + base_url + link
        download(download_url, 'antiSMASH.zip')
        # now unzip and move folder
        zipref = zipfile.ZipFile('antiSMASH.zip', 'r')
        zipref.extractall(os.path.join(outputdir, jobid))
        zipref.close()
        os.remove('antiSMASH.zip')
        lib.log.info("Results folder: %s/%s" % (outputdir, jobid))
        # now grab the GBK files from folder as you will need just that for annotation, place in annotate_misc folder for auto-detection
        anti_GBK = os.path.join(outputdir, jobid, os.path.basename(genbank))
        final = os.path.join(outputdir, 'annotate_misc',
                             'antiSMASH.results.gbk')
        shutil.copyfile(anti_GBK, final)
        lib.log.info("Results GBK: %s" % final)

    lib.log.info("Remote searches complete")
    # move logfile
    if os.path.isfile(log_name):
        shutil.copyfile(log_name, os.path.join(outputdir, 'logfiles',
                                               log_name))
        os.remove(log_name)
Ejemplo n.º 7
0
def runExonerate(input):
    """Align one protein against one genome region using exonerate (p2g).

    *input* is a ':::'-delimited string: 'ProtID:::ScaffID:::start:::end'.
    The protein record is fetched from the global protein_dict, the
    scaffold FASTA from tmpdir/scaffolds/, and exonerate GFF output is
    written to tmpdir/exonerate.<name>.out.  Outputs smaller than 500
    bytes are treated as no-hit and removed; failed inputs are moved to
    tmpdir/failed/ for inspection.
    """
    s = input.split(':::')
    ProtID = s[0]
    ScaffID = s[1]
    ScaffStart = int(s[2])
    ScaffEnd = int(s[3])
    # get the protein model
    query = os.path.join(tmpdir, ProtID + '.' + str(os.getpid()) + '.fa')
    with open(query, 'w') as output:
        SeqIO.write(protein_dict[ProtID], output, 'fasta')
    # now get the genome region, use different variable names for SeqRecords to avoid collision
    scaffold = os.path.join(
        tmpdir, ScaffID + '.' + ProtID + '.' + str(ScaffStart) + '-' +
        str(ScaffEnd) + '.fa')
    with open(scaffold, 'w') as output2:
        with open(os.path.join(tmpdir, 'scaffolds', ScaffID + '.fa'),
                  'rU') as fullscaff:
            for header, Sequence in SimpleFastaParser(fullscaff):
                # grab a 3 kb cushion on either side of hit region, careful of scaffold ends
                start = ScaffStart - 3000
                if start < 1:
                    start = 1
                end = ScaffEnd + 3000
                if end > len(Sequence):
                    end = len(Sequence)
                output2.write('>%s\n%s\n' % (header, Sequence[start:end]))
    # NOTE(review): 'start' (and 'header'/'Sequence'/'end' used in the debug
    # branch below) leak out of the loop above; if the scaffold FASTA had no
    # records this raises NameError -- presumably each per-scaffold file
    # holds exactly one record; confirm.
    exoname = ProtID + '.' + ScaffID + '__' + str(start) + '__'
    # check that input files are created and valid
    exonerate_out = os.path.join(tmpdir, 'exonerate.' + exoname + '.out')
    ryo = "AveragePercentIdentity: %pi\n"
    cmd = [
        'exonerate', '--model', 'p2g', '--showvulgar', 'no', '--showalignment',
        'no', '--showquerygff', 'no', '--showtargetgff', 'yes', '--maxintron',
        str(args.maxintron), '--percent', '80', '--ryo', ryo, query, scaffold
    ]
    if lib.checkannotations(query) and lib.checkannotations(scaffold):
        # run exonerate, capture errors
        with open(exonerate_out, 'w') as output3:
            proc = subprocess.Popen(cmd,
                                    stdout=output3,
                                    stderr=subprocess.PIPE)
        # the parent's handle to exonerate_out is closed once the 'with'
        # exits; the child keeps writing through its own inherited
        # descriptor, and communicate() below waits for it to finish
        stderr = proc.communicate()
        # stderr is the (stdout, stderr) tuple; stdout slot is None here
        if 'WARNING' in stderr[1]:
            lib.log.debug('Error in input:{:}'.format(input))
            lib.log.debug(
                '%s, Len=%i, %i-%i; %i-%i' %
                (header, len(Sequence), ScaffStart, ScaffEnd, start, end))
            # keep the failing query/scaffold pair for debugging
            os.rename(query,
                      os.path.join(tmpdir, 'failed', os.path.basename(query)))
            os.rename(
                scaffold,
                os.path.join(tmpdir, 'failed', os.path.basename(scaffold)))
        else:
            for y in [query, scaffold]:
                try:
                    lib.SafeRemove(y)
                except OSError:
                    lib.log.debug("Error removing %s" % (y))
        # check filesize of exonerate output, no hits still have some output data in them, should be safe dropping anything smaller than 500 bytes
        if lib.getSize(exonerate_out) < 500:
            os.remove(exonerate_out)
    else:
        lib.log.debug('Error in query or scaffold:{:}'.format(input))
        lib.SafeRemove(query)
        lib.SafeRemove(scaffold)
Ejemplo n.º 8
0
def runTrinityGG(genome, readTuple, longReads, shortBAM, output, args=False):
    '''
    function will run genome guided Trinity. First step will be to run hisat2 to align reads
    to the genome, then pass that BAM file to Trinity to generate assemblies

    NOTE(review): the default args=False is unusable -- args.cpus etc. are
    accessed unconditionally, so callers must always pass a real namespace.
    '''
    if not lib.checkannotations(shortBAM):
        # build hisat2 index, using exons and splice sites
        lib.log.info("Building Hisat2 genome index")
        cmd = ['hisat2-build', '-p',
               str(args.cpus), genome, os.path.join(tmpdir, 'hisat2.genome')]
        lib.runSubprocess4(cmd, '.', lib.log)
        # align reads using hisat2
        lib.log.info("Aligning reads to genome using Hisat2")
        # use bash wrapper for samtools piping for SAM -> BAM -> sortedBAM
        # use half number of threads for bam compression threads
        # NOTE(review): '2 // 2' binds before '+', so this evaluates to
        # (args.cpus + 1) // 2, not (args.cpus + 2) // 2 -- confirm intended
        bamthreads = (args.cpus + 2 // 2) // 2
        if args.stranded != 'no' and not readTuple[2]:
            # stranded library and no single-end reads: pass strandness flag
            hisat2cmd = ['hisat2', '-p', str(args.cpus), '--max-intronlen', str(args.max_intronlen),
                         '--dta', '-x', os.path.join(tmpdir, 'hisat2.genome'), '--rna-strandness', args.stranded]
        else:
            hisat2cmd = ['hisat2', '-p', str(args.cpus), '--max-intronlen', str(args.max_intronlen),
                         '--dta', '-x', os.path.join(tmpdir, 'hisat2.genome')]
        if readTuple[0] and readTuple[1]:
            # paired-end reads
            hisat2cmd = hisat2cmd + ['-1', readTuple[0], '-2', readTuple[1]]
        if readTuple[2]:
            # unpaired/single-end reads
            hisat2cmd = hisat2cmd + ['-U', readTuple[2]]
        cmd = [os.path.join(parentdir, 'sam2bam.sh'), " ".join(
            hisat2cmd), str(bamthreads), shortBAM]
        lib.runSubprocess(cmd, '.', lib.log)
    else:
        # NOTE(review): 'Existig' typo is in the original log message
        lib.log.info('Existig Hisat2 alignments found: {:}'.format(shortBAM))

    # now launch Trinity genome guided
    TrinityLog = os.path.join(tmpdir, 'Trinity-gg.log')
    lib.log.info("Running genome-guided Trinity, logfile: %s" % TrinityLog)
    lib.log.info(
        "Clustering of reads from BAM and preparing assembly commands")
    jaccard_clip = []
    if args.jaccard_clip:
        jaccard_clip = ['--jaccard_clip']
    if args.stranded != 'no':
        cmd = ['Trinity', '--SS_lib_type', args.stranded, '--no_distributed_trinity_exec',
               '--genome_guided_bam', shortBAM, '--genome_guided_max_intron', str(
                   args.max_intronlen),
               '--CPU', str(args.cpus), '--max_memory', args.memory, '--output', os.path.join(tmpdir, 'trinity_gg')]
    else:
        cmd = ['Trinity', '--no_distributed_trinity_exec', '--genome_guided_bam', shortBAM,
               '--genome_guided_max_intron', str(
                   args.max_intronlen), '--CPU', str(args.cpus),
               '--max_memory', args.memory, '--output', os.path.join(tmpdir, 'trinity_gg')]
    cmd = cmd + jaccard_clip
    if longReads and lib.checkannotations(longReads):
        cmd = cmd + ['--long_reads', os.path.realpath(longReads)]
    lib.runSubprocess2(cmd, '.', lib.log, TrinityLog)
    # Trinity (with --no_distributed_trinity_exec) emits the list of
    # per-cluster assembly commands here instead of running them itself
    commands = os.path.join(tmpdir, 'trinity_gg', 'trinity_GG.cmds')

    # this will create all the Trinity commands, will now run these in parallel using multiprocessing
    # in Python (seems to be much faster than Parafly on my system)
    file_list = []
    with open(commands, 'r') as cmdFile:
        for line in cmdFile:
            line = line.replace('\n', '')
            # don't think this should be appended to every command....
            line = line.replace('--no_distributed_trinity_exec', '')
            line = line.replace('"', '')  # don't need these double quotes
            file_list.append(line)
    lib.log.info("Assembling "+"{0:,}".format(len(file_list)) +
                 " Trinity clusters using %i CPUs" % (args.cpus-1))
    lib.runMultiProgress(safe_run, file_list, args.cpus-1)

    # collected output files and clean
    outputfiles = os.path.join(
        tmpdir, 'trinity_gg', 'trinity_output_files.txt')
    with open(outputfiles, 'w') as fileout:
        for filename in find_files(os.path.join(tmpdir, 'trinity_gg'), '*inity.fasta'):
            fileout.write('%s\n' % filename)
    # now grab them all using Trinity script
    cmd = ['perl', os.path.abspath(os.path.join(
        TRINITY, 'util', 'support_scripts', 'GG_partitioned_trinity_aggregator.pl')), 'Trinity_GG']
    lib.runSubprocess5(cmd, '.', lib.log, outputfiles, output)
    lib.log.info('{:,} transcripts derived from Trinity'.format(
        lib.countfasta(output)))
Ejemplo n.º 9
0
def RepeatModelMask(input, cpus, tmpdir, output, repeatlib, species, debug):
    """Build a RepeatModeler library for *input* and soft-mask with RepeatMasker.

    Runs BuildDatabase + RepeatModeler in tmpdir/RepeatModeler, copies the
    classified consensus library to *repeatlib* if one was produced, then
    runs RepeatMasker (with that library, or with *species* as fallback)
    in tmpdir/RepeatMasker and copies the .masked genome to *output*.
    All tool output is appended to the *debug* log file.

    NOTE(review): in the fallback branch ('-species', species) the caller
    may pass species=None (the repeatmodeler code path supplies
    args.repeatmasker_species, which is None there); subprocess.call would
    then raise TypeError on the None argument -- confirm callers.
    """
    lib.log.info("Loading sequences and soft-masking genome")
    outdir = os.path.join(tmpdir, 'RepeatModeler')
    input = os.path.abspath(input)
    output = os.path.abspath(output)
    # lets run RepeatModeler here to get repeat library
    if os.path.exists(outdir):
        shutil.rmtree(outdir)
    os.makedirs(outdir)
    lib.log.info("Soft-masking: building RepeatModeler database")
    with open(debug, 'a') as debug_log:
        subprocess.call(
            ['BuildDatabase', '-engine', 'ncbi', '-name', 'Repeats', input],
            cwd=outdir,
            stdout=debug_log,
            stderr=debug_log)
    lib.log.info("Soft-masking: generating repeat library using RepeatModeler")
    with open(debug, 'a') as debug_log:
        subprocess.call(
            ['RepeatModeler', '-database', 'Repeats', '-pa',
             str(cpus)],
            cwd=outdir,
            stdout=debug_log,
            stderr=debug_log)
    # find name of folder
    # RepeatModeler writes its results into a run-specific RM_* directory
    RP_folder = '.'
    for i in os.listdir(outdir):
        if i.startswith('RM_'):
            RP_folder = i
    library = os.path.abspath(repeatlib)
    if lib.checkannotations(
            os.path.join(outdir, RP_folder, 'consensi.fa.classified')):
        shutil.copyfile(
            os.path.join(outdir, RP_folder, 'consensi.fa.classified'), library)
    # now soft-mask the genome for gene predictors
    outdir2 = os.path.join(tmpdir, 'RepeatMasker')
    if os.path.isdir(outdir2):
        shutil.rmtree(outdir2)
    os.makedirs(outdir2)
    if not os.path.isfile(library):
        # RepeatModeler produced no models; fall back to a species library
        lib.log.info(
            "Soft-masking: running RepeatMasker with default library (RepeatModeler found 0 models)"
        )
        with open(debug, 'a') as debug_log:
            subprocess.call([
                'RepeatMasker', '-e', 'ncbi', '-gff', '-species', species,
                '-pa',
                str(cpus), '-xsmall', '-dir', '.', input
            ],
                            cwd=outdir2,
                            stdout=debug_log,
                            stderr=debug_log)
    else:
        lib.log.info("Soft-masking: running RepeatMasker with custom library")
        with open(debug, 'a') as debug_log:
            subprocess.call([
                'RepeatMasker', '-e', 'ncbi', '-gff', '-lib', library, '-pa',
                str(cpus), '-xsmall', '-dir', '.', input
            ],
                            cwd=outdir2,
                            stdout=debug_log,
                            stderr=debug_log)
    # copy the soft-masked genome RepeatMasker produced to the final output
    for file in os.listdir(outdir2):
        if file.endswith('.masked'):
            shutil.copyfile(os.path.join(outdir2, file), output)
Ejemplo n.º 10
0
def main(args):
    """Command-line entry point for funannotate-mask.

    Parses arguments, soft-masks the input genome with the chosen method
    (tantan, RepeatModeler+RepeatMasker, or RepeatMasker with a species or
    pre-computed library), then reports masking statistics and cleans up
    temporary data unless --debug is set.
    """
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='funannotate-mask.py',
        description='''Wrapper for RepeatModeler/RepeatMasker''',
        epilog="""Written by Jon Palmer (2018) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--input',
                        required=True,
                        help='genome assembly FASTA format')
    parser.add_argument('-o',
                        '--out',
                        required=True,
                        help='Output softmasked FASTA file')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep intermediate files')
    parser.add_argument('-m',
                        '--method',
                        default='tantan',
                        choices=['repeatmodeler', 'repeatmasker', 'tantan'],
                        help='Method to mask repeats with')
    parser.add_argument('-s',
                        '--repeatmasker_species',
                        help='RepeatMasker species, will skip repeatmodeler')
    parser.add_argument(
        '-l',
        '--repeatmodeler_lib',
        help='Pre-computed RepeatModeler (or other) repetitive elements')
    parser.add_argument('--cpus',
                        default=2,
                        type=int,
                        help='Number of CPUs to use')
    args = parser.parse_args(args)

    # create log file for Repeats(capture stderr)
    log_name = 'funannotate-mask.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    # initialize script, log system info and cmd issue at runtime
    lib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    lib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    lib.SystemInfo()

    # get version of funannotate (typo fixed: was 'funanotate')
    version = lib.get_version()
    lib.log.info("Running funannotate v{:}".format(version))

    repeats = None
    tmpdir = None
    if args.method == 'tantan':
        programs = ['tantan']
        lib.CheckDependencies(programs)
        lib.log.info('Soft-masking simple repeats with tantan')
        runTanTan(args.input, args.out)
    else:
        programs = ['RepeatMasker']
        if args.method == 'repeatmodeler':
            programs += ['BuildDatabase', 'RepeatModeler']
        lib.CheckDependencies(programs)

        # create tmpdir
        pid = uuid.uuid4()
        tmpdir = 'mask_' + str(pid)
        os.makedirs(tmpdir)

        # parse options which dictates how repeatmodeler/masker are run
        if not args.repeatmodeler_lib:  # no fasta file given, so
            if not args.repeatmasker_species:  # no species given, so run entire repeatmodler + repeat masker
                repeats = 'repeatmodeler-library.' + str(pid) + '.fasta'
                RepeatModelMask(args.input, args.cpus, tmpdir, args.out,
                                repeats, args.repeatmasker_species, log_name)
            else:
                RepeatMaskSpecies(args.input, args.repeatmasker_species,
                                  args.cpus, tmpdir, args.out, log_name)
        else:
            if lib.checkannotations(args.repeatmodeler_lib):
                RepeatMask(args.input, args.repeatmodeler_lib, args.cpus,
                           tmpdir, args.out, log_name)
            else:
                lib.log.error(
                    'ERROR: repeat library is not a valid file: {:}'.format(
                        args.repeatmodeler_lib))
                sys.exit(1)

    # output some stats on %reads masked.
    scaffolds = 0
    maskedSize = 0
    GenomeLength = 0
    # renamed from 'input' -- that name shadows the builtin
    with open(args.out, 'r') as infile:
        for rec, Seq in SimpleFastaParser(infile):
            scaffolds += 1
            GenomeLength += len(Seq)
            maskedSize += lib.n_lower_chars(Seq)

    # guard against an empty assembly to avoid ZeroDivisionError
    percentMask = maskedSize / float(GenomeLength) if GenomeLength else 0.0
    lib.log.info(
        'Repeat soft-masking finished: \nMasked genome: {:}\nnum scaffolds: {:,}\nassembly size: {:,} bp\nmasked repeats: {:,} bp ({:.2f}%)'
        .format(os.path.abspath(args.out), scaffolds, GenomeLength, maskedSize,
                percentMask * 100))
    if repeats:
        lib.log.info('RepeatModeler library: {:}'.format(repeats))
    # clean up
    if not args.debug:
        if tmpdir:
            lib.SafeRemove(tmpdir)
    print("-------------------------------------------------------")
Ejemplo n.º 11
0
def main(args):
    """Convert a TBL + FASTA annotation pair into a GenBank file via tbl2asn.

    Stages the inputs in a scratch directory, runs tbl2asn with an sbt
    template, screens the validation output for NCBI errors, and copies the
    resulting .gbf out as <basename>.gbk. The scratch directory is removed
    only when no errors were reported.
    """
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        # Widen the help column so long option strings stay on one line.
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='gbk2parts.py',
        description='''Script to convert GBK file to its components.''',
        epilog="""Written by Jon Palmer (2018) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--tbl', required=True,
                        help='Genome annotation in tbl format')
    parser.add_argument('-f', '--fasta', required=True,
                        help='Genome in FASTA format')
    parser.add_argument(
        '-s', '--species', required=True,
        help=
        'Species name (e.g. "Aspergillus fumigatus") use quotes if there is a space'
    )
    parser.add_argument('--isolate', help='Isolate name (e.g. Af293)')
    parser.add_argument('--strain', help='Strain name (e.g. CEA10)')
    parser.add_argument(
        '-t', '--tbl2asn',
        help='Custom parameters for tbl2asn, example: linkage and gap info')
    parser.add_argument('--sbt', help='tbl2asn template file')
    parser.add_argument('-o', '--output', help='Output basename')
    args = parser.parse_args(args)

    parentdir = os.path.dirname(lib.__file__)

    # Organism name: prefer the explicit -s/--species value; otherwise fall
    # back to the TBL file's basename (defensive — --species is required).
    if args.species:
        organism = args.species
    else:
        organism = os.path.basename(args.tbl).split('.t')[0]
    # Append strain or isolate (strain wins when both are given) and make the
    # result filesystem-safe.
    if args.strain:
        organism_name = '{}_{}'.format(organism, args.strain)
    elif args.isolate:
        organism_name = '{}_{}'.format(organism, args.isolate)
    else:
        organism_name = organism
    organism_name = organism_name.replace(' ', '_')
    outputname = args.output if args.output else organism_name

    # tbl2asn expects its inputs staged together in one working directory.
    tmp = outputname + '_tmp'
    if not os.path.exists(tmp):
        os.makedirs(tmp)

    # Validate both inputs before copying them into the scratch directory.
    if not lib.checkannotations(args.fasta):
        print(('FASTA genome file not found: {:}'.format(args.fasta)))
        sys.exit(1)
    if not lib.checkannotations(args.tbl):
        print(('TBL annotations file not found: {:}'.format(args.tbl)))
        sys.exit(1)
    shutil.copyfile(args.fasta, os.path.join(tmp, 'genome.fsa'))
    shutil.copyfile(args.tbl, os.path.join(tmp, 'genome.tbl'))

    # Run tbl2asn using the supplied template, or the bundled test template.
    SBT = args.sbt if args.sbt else os.path.join(parentdir, 'config', 'test.sbt')
    discrep = outputname + '.discrepency.txt'
    version = 1
    runtbl2asn(tmp, SBT, discrep, organism, args.isolate, args.strain,
               args.tbl2asn, version)

    # Screen the tbl2asn validation reports for errors NCBI would reject.
    final_fixes = os.path.join(tmp, 'models-need-fixing.txt')
    prefix = locustagGB(os.path.join(tmp, 'genome.gbf'))
    errors = ncbiCheckErrors(os.path.join(tmp, 'errorsummary.val'),
                             os.path.join(tmp, 'genome.val'), prefix,
                             final_fixes)

    # Publish the GenBank file; keep the scratch dir around for debugging
    # whenever validation reported problems.
    gbkout = outputname + '.gbk'
    shutil.copyfile(os.path.join(tmp, 'genome.gbf'), gbkout)
    if errors < 1:
        lib.SafeRemove(tmp)
Ejemplo n.º 12
0
def main(args):
    """Add a proteome to the funannotate outgroups database.

    Runs BUSCO (protein mode) against the chosen lineage database, extracts
    the complete single-copy models from the input proteome, and writes them
    to $FUNANNOTATE_DB/outgroups/<species>.<busco_db>_buscos.fa.
    """
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        # Widen the help column so long option strings stay on one line.
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='funannotate-predict.py',
        usage="%(prog)s [options] -i genome.fasta",
        description='''Script that adds a proteome to the outgroups.''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--input',
                        required=True,
                        help='Proteome in FASTA format')
    parser.add_argument('-s',
                        '--species',
                        required=True,
                        help='Species name "binomial in quotes"')
    parser.add_argument(
        '-b',
        '--busco_db',
        default='dikarya',
        choices=[
            'fungi', 'microsporidia', 'dikarya', 'ascomycota',
            'pezizomycotina', 'eurotiomycetes', 'sordariomycetes',
            'saccharomycetes', 'saccharomycetales', 'basidiomycota',
            'eukaryota', 'protists', 'alveolata_stramenophiles', 'metazoa',
            'nematoda', 'arthropoda', 'insecta', 'endopterygota',
            'hymenoptera', 'diptera', 'vertebrata', 'actinopterygii',
            'tetrapoda', 'aves', 'mammalia', 'euarchontoglires',
            'laurasiatheria', 'embryophyta'
        ],
        help='BUSCO database to use')
    parser.add_argument('-c',
                        '--cpus',
                        default=2,
                        type=int,
                        help='Number of CPUs to use')
    parser.add_argument('-d',
                        '--database',
                        help='Path to funannotate database, $FUNANNOTATE_DB')
    args = parser.parse_args(args)

    # Locate the funannotate database: CLI flag wins over the env variable.
    if args.database:
        FUNDB = args.database
    else:
        try:
            FUNDB = os.environ["FUNANNOTATE_DB"]
        except KeyError:
            lib.log.error(
                'Funannotate database not properly configured, run funannotate setup.'
            )
            sys.exit(1)

    parentdir = os.path.join(os.path.dirname(__file__))

    # Base name for all outputs: "genus_species.lineage"
    species = args.species.replace(' ', '_').lower() + '.' + args.busco_db
    OUTGROUPS = os.path.join(FUNDB, 'outgroups')

    # create log file (always start fresh)
    log_name = species + '-add2outgroups.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    # initialize script, log system info and cmd issue at runtime
    lib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    lib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    lib.SystemInfo()

    # get version of funannotate
    version = lib.get_version()
    lib.log.info("Running %s" % version)

    # check buscos, download if necessary
    if not os.path.isdir(os.path.join(FUNDB, args.busco_db)):
        lib.log.error(
            "%s busco database is missing, install with funannotate setup -b %s"
            % (args.busco_db, args.busco_db))
        sys.exit(1)

    ProtCount = lib.countfasta(args.input)
    lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded')

    # convert to proteins and screen with busco
    lib.log.info("Looking for BUSCO models with %s DB" % args.busco_db)
    BUSCODB = os.path.join(FUNDB, args.busco_db)
    BUSCO = os.path.join(parentdir, 'aux_scripts', 'funannotate-BUSCO2.py')
    cmd = [
        sys.executable, BUSCO, '-i',
        os.path.abspath(args.input), '-m', 'proteins', '--lineage', BUSCODB,
        '-o', species, '--cpu',
        str(args.cpus), '-f'
    ]
    lib.runSubprocess(cmd, '.', lib.log)

    # check that it ran correctly
    busco_results = os.path.join('run_' + species,
                                 'full_table_' + species + '.tsv')
    if not lib.checkannotations(busco_results):
        lib.log.error("BUSCO failed, check logfile")
        sys.exit(1)

    # Map protein ID -> BUSCO model ID for complete single-copy hits only.
    # Track IDs already flagged as duplicates so a third (or later)
    # occurrence cannot silently re-enter the results.
    nameChange = {}
    duplicates = set()
    # NOTE: 'rU' mode was removed in Python 3.11; plain 'r' is equivalent.
    with open(busco_results, 'r') as infile:
        for line in infile:
            if line.startswith('#'):
                continue
            cols = line.split('\t')
            if cols[1] == 'Complete':
                if cols[2] in duplicates:
                    continue
                if not cols[2] in nameChange:
                    nameChange[cols[2]] = cols[0]
                else:
                    lib.log.error(
                        "Duplicate ID found: %s %s. Removing from results" %
                        (cols[2], cols[0]))
                    del nameChange[cols[2]]
                    duplicates.add(cols[2])

    # output counts
    lib.log.info('{0:,}'.format(len(nameChange)) + ' BUSCO models found')

    # index the proteome for parsing
    SeqRecords = SeqIO.to_dict(SeqIO.parse(args.input, 'fasta'))

    # Write the selected proteins renamed to their BUSCO model IDs.
    busco_out = os.path.join(OUTGROUPS, species + '_buscos.fa')
    with open(busco_out, 'w') as output:
        for k, v in list(nameChange.items()):
            rec = SeqRecords[k]
            output.write('>%s\n%s\n' % (v, rec.seq))
    lib.log.info("Results written to: %s" % busco_out)

    # clean up your mess
    shutil.rmtree('run_' + species)
    shutil.rmtree('tmp')
Ejemplo n.º 13
0
                    default=1,
                    type=int,
                    help='location of HMM database')
parser.add_argument('-o', '--out', required=True, help='output file')
args = parser.parse_args()

# Shared by the worker functions run in subprocesses: the funannotate DB
# path and a sink for suppressed subprocess output.
global FUNDB, FNULL
FUNDB = args.db
FNULL = open(os.devnull, 'w')

# now loop through each genome comparing to population
# Collect per-genome GO term files, skipping the shared population and
# association files and any empty/missing inputs.
file_list = []
for f in os.listdir(args.input):
    if f.startswith('associations'):
        continue
    if f.startswith('population'):
        continue
    file = os.path.join(args.input, f)
    if lib.checkannotations(file):
        file_list.append(file)
    else:
        print('  WARNING: skipping {} as no GO terms'.format(f))

# run over multiple CPUs
# Never spawn more workers than there are input files.
if len(file_list) > args.cpus:
    procs = args.cpus
else:
    procs = len(file_list)

# Dispatch the enrichment worker over all files in parallel.
# NOTE(review): GO_safe_run is defined elsewhere in this file — presumably a
# wrapper around runGOenrichment; verify before refactoring.
lib.runMultiNoProgress(GO_safe_run, file_list, procs)