# --- extraction marker: Example #1 ---
    Hits = parseDiamond(BlastResult)

lib.log.info('Found {0:,}'.format(len(Hits)) + ' preliminary alignments')

#index the genome and proteins
# SeqIO.index reads records lazily from disk instead of loading the whole
# protein FASTA into memory
protein_dict = SeqIO.index(os.path.abspath(args.proteins),
                           'fasta')  #do index here in case memory problems?

#split genome fasta into individual scaffolds
# NOTE(review): 'rU' open mode was removed in Python 3.11 -- confirm the
# supported interpreter version for this script
with open(os.path.abspath(args.genome), 'rU') as input:
    for record in SeqIO.parse(input, "fasta"):
        SeqIO.write(record, os.path.join(tmpdir, 'scaffolds',
                                         record.id + ".fa"), "fasta")

#run multiprocessing exonerate
lib.runMultiProgress(runExonerate, Hits, args.cpus)

#now need to loop through and offset exonerate predictions back to whole scaffolds
exonerate_raw = os.path.join(tmpdir, 'exonerate.out.combined')
with open(exonerate_raw, 'w') as output:
    for file in os.listdir(tmpdir):
        if file.endswith('.out'):
            with open(os.path.join(tmpdir, file), 'rU') as exoresult:
                # the chunk start coordinate is encoded as the second
                # '__'-delimited field of the result filename
                offset = int(file.split('__')[1])
                # skip the first 3 header lines of each exonerate result
                for line in itertools.islice(exoresult, 3, None):
                    # pass comment/summary lines through unchanged
                    if line.startswith('#') or line.startswith(
                            'Average') or line.startswith('-- completed'):
                        output.write(line)
                    else:
                        cols = line.split('\t')
                        # shift start coordinate back to whole-scaffold space
                        cols[3] = str(int(cols[3]) + offset)
                        # NOTE(review): the adjusted line is never written to
                        # `output` here -- this span looks truncated; verify
                        # against the original script before relying on it
def runTrinityGG(genome, readTuple, output):
    '''
    Run genome-guided Trinity.

    First aligns reads to the genome with hisat2, producing a
    coordinate-sorted BAM, then passes that BAM to Trinity to generate
    per-cluster assemblies which are aggregated into a single FASTA.

    genome    -- path to the genome FASTA file
    readTuple -- (forward, reverse, single) read files; any slot may be falsy
    output    -- path for the final combined Trinity-GG FASTA
    '''
    #build hisat2 index, using exons and splice sites
    lib.log.info("Starting Trinity genome guided")
    lib.log.info("Building Hisat2 genome index")
    cmd = ['hisat2-build', genome, os.path.join(tmpdir, 'hisat2.genome')]
    lib.runSubprocess4(cmd, '.', lib.log)
    #align reads using hisat2
    lib.log.info("Aligning reads to genome using Hisat2")
    hisat2bam = os.path.join(tmpdir, 'hisat2.coordSorted.bam')
    #use bash wrapper for samtools piping for SAM -> BAM -> sortedBAM
    #use roughly half the CPUs for BAM compression threads (ceil of cpus/2);
    #original expression '(args.cpus + 2 // 2) // 2' evaluates identically
    bamthreads = (args.cpus + 1) // 2
    hisat2cmd = [
        'hisat2', '-p',
        str(args.cpus), '--max-intronlen',
        str(args.max_intronlen), '--dta', '-x',
        os.path.join(tmpdir, 'hisat2.genome')
    ]
    #only pass strandedness for stranded paired-end libraries
    if args.stranded != 'no' and not readTuple[2]:
        hisat2cmd += ['--rna-strandness', args.stranded]
    if readTuple[0] and readTuple[1]:
        hisat2cmd += ['-1', readTuple[0], '-2', readTuple[1]]
    if readTuple[2]:
        hisat2cmd += ['-U', readTuple[2]]

    cmd = [
        os.path.join(parentdir, 'util', 'sam2bam.sh'), " ".join(hisat2cmd),
        str(bamthreads), hisat2bam
    ]
    lib.runSubprocess(cmd, '.', lib.log)

    #now launch Trinity genome guided
    TrinityLog = os.path.join(tmpdir, 'Trinity-gg.log')
    lib.log.info("Running genome-guided Trinity, logfile: %s" % TrinityLog)
    lib.log.info(
        "Clustering of reads from BAM and preparing assembly commands")
    jaccard_clip = []
    if args.jaccard_clip:
        jaccard_clip = ['--jaccard_clip']
    cmd = ['Trinity']
    #mirror the hisat2 strandedness decision for Trinity's --SS_lib_type
    if args.stranded != 'no' and not readTuple[2]:
        cmd += ['--SS_lib_type', args.stranded]
    cmd += [
        '--no_distributed_trinity_exec', '--genome_guided_bam', hisat2bam,
        '--genome_guided_max_intron',
        str(args.max_intronlen), '--CPU',
        str(args.cpus), '--max_memory', args.memory, '--output',
        os.path.join(tmpdir, 'trinity_gg')
    ]
    cmd = cmd + jaccard_clip
    lib.runSubprocess2(cmd, '.', lib.log, TrinityLog)
    commands = os.path.join(tmpdir, 'trinity_gg', 'trinity_GG.cmds')

    #Trinity wrote one assembly command per read cluster; run those in
    #parallel with multiprocessing (seems faster than ParaFly here)
    file_list = []
    #'rU' was removed in Python 3.11; 'r' text mode already handles
    #universal newlines
    with open(commands, 'r') as cmdFile:
        for line in cmdFile:
            line = line.replace('\n', '')
            #distributed exec was already suppressed at the top-level run
            line = line.replace(
                '--no_distributed_trinity_exec',
                '')  #don't think this should be appended to every command....
            line = line.replace('"', '')  #don't need these double quotes
            file_list.append(line)
    lib.log.info("Assembling " + "{0:,}".format(len(file_list)) +
                 " Trinity clusters using %i CPUs" % (args.cpus - 1))
    lib.runMultiProgress(safe_run, file_list, args.cpus - 1)

    #collect output files and clean
    outputfiles = os.path.join(tmpdir, 'trinity_gg',
                               'trinity_output_files.txt')
    with open(outputfiles, 'w') as fileout:
        for filename in find_files(os.path.join(tmpdir, 'trinity_gg'),
                                   '*inity.fasta'):
            fileout.write('%s\n' % filename)
    #now grab them all using Trinity helper script
    cmd = [
        os.path.join(TRINITY, 'util', 'support_scripts',
                     'GG_partitioned_trinity_aggregator.pl'), 'Trinity_GG'
    ]
    lib.runSubprocess5(cmd, '.', lib.log, outputfiles, output)
# --- extraction marker: Example #3 ---
        '{0:,}'.format(num_prots) + ' proteins')
    #build in a check before running (in case script gets stopped and needs to restart
    # collect proteins whose InterProScan XML result already exists so
    # those queries can be skipped on restart
    finished = []
    for file in os.listdir(IPROUT):
        if file.endswith('.xml'):
            base = file.split('.xml')[0]
            fasta_file = os.path.join(PROTS, base + '.fa')
            finished.append(fasta_file)

    finished = set(finished)  #make sure no duplicates
    # proteins still needing an InterProScan query
    runlist = [x for x in proteins if x not in finished]
    if len(runlist) < num_prots:
        lib.log.info("Results found, querying remaining %i proteins" %
                     len(runlist))
    #start up the list, max 25 at a time
    lib.runMultiProgress(runIPRpython, runlist, 25)
    #clean up protein fasta files
    shutil.rmtree(PROTS)
    #now convert to single file and then clean up
    with open(IPRCombined, 'w') as output:
        subprocess.call([sys.executable, XMLCombine, IPROUT], stdout=output)
    # only delete the raw XML directory once the combined file verifiably exists
    if lib.checkannotations(IPRCombined):
        shutil.rmtree(IPROUT)

if 'antismash' in args.methods or 'all' in args.methods:
    if args.antismash == 'fungi':
        base_address = "https://fungismash.secondarymetabolites.org"
        job_parameters = {
            'email': args.email,
            'smcogs': 'on',
            'knownclusterblast': 'on',
# --- extraction marker: Example #4 ---
            with open(os.path.join(tmpdir, i+'.hints.gff'), 'w') as output:
                with open(args.hints, 'rU') as hintsfile:
                    for line in hintsfile:
                        cols = line.split('\t')
                        if cols[0] == i:
                            output.write(line)
'''

#now loop through each scaffold running augustus
#never spawn more workers than there are scaffolds to process
num = min(args.cpus, len(scaffolds))
lib.log.debug("Running Augustus on %i chunks, using %i CPUs" %
              (len(scaffolds), num))
lib.runMultiProgress(runAugustus, scaffolds, num)

lib.log.debug("Augustus prediction is finished, now concatenating results")
#concatenate the per-scaffold GFF3 outputs into one combined file
with open(os.path.join(tmpdir, 'augustus_all.gff3'), 'w') as output:
    for file in scaffolds:
        file = os.path.join(tmpdir, file + '.augustus.gff3')
        with open(file) as input:
            output.write(input.read())

#merge split/overlapping predictions with the AUGUSTUS helper script
join_script = os.path.join(AUGUSTUS_BASE, 'scripts', 'join_aug_pred.pl')
with open(args.out, 'w') as finalout:
    #'rU' was removed in Python 3.11; plain 'r' already does universal newlines
    with open(os.path.join(tmpdir, 'augustus_all.gff3'), 'r') as input:
        subprocess.call([join_script], stdin=input, stdout=finalout)
if not args.debug:
    shutil.rmtree(tmpdir)
lib.log.info("Found %i gene models" % countGFFgenes(args.out))
# --- extraction marker: Example #5 ---
            else:
                eggs = 'None'
            if len(buscos) > 0:
                buscos = set(buscos)
                buscos = ', '.join(str(v) for v in buscos)
            else:
                buscos = 'None'
            #write now to file
            output.write("%s\t%s\t%s\t%s\n" % (ID, eggs, buscos, ', '.join(proteins)))

if args.run_dnds:
    #multiprocessing dN/dS on list of folders
    dNdSList = lib.get_subdirs(ortho_folder)
    if args.run_dnds == 'estimate':
        lib.log.debug("Running simple dN/dS estimate")
        lib.runMultiProgress(lib.rundNdSestimate, dNdSList, args.cpus)
    else:
        # NOTE(review): 'exhasitve' is a typo ('exhaustive') in the log
        # message below -- left untouched since it is a runtime string
        lib.log.debug("Running exhasitve dN/dS ratio with Likelihood Ratio Tests")
        lib.runMultiProgress(lib.rundNdSexhaustive, dNdSList, args.cpus)

    #after all data is run, then parse result log files, return dictionary
    dNdSresults = lib.parsedNdS(ortho_folder)
if len(args.input) > 1:
    orthologs = os.path.join(args.out, 'orthology','orthology_groups.txt')
    with open(orthologs, 'w') as output:
        with open(orthologstmp, 'rU') as input:
            for line in input:
                line = line.replace('\n', '')
                cols = line.split('\t')
                if args.run_dnds:
                    if cols[0] in dNdSresults:
#create tmpdir to store fasta files and output files
# PID-suffixed name avoids collisions when multiple runs share a directory
TMPDIR = 'phobius_' + str(os.getpid())

#split fasta
# writes the input proteins as individual FASTA files into TMPDIR
lib.splitFASTA(args.input, TMPDIR)

#now get list of files in tmpdir
proteins = []
for file in os.listdir(TMPDIR):
    if file.endswith('.fa'):
        proteins.append(file)

#now run the script
# prefer a local phobius.pl install; otherwise fall back to the remote service
if lib.which('phobius.pl'):
    lib.runMultiProgress(runPhobiusLocal, proteins, multiprocessing.cpu_count())
else:
    lib.runMultiProgress(runPhobiusRemote, proteins, 29) #max is 30 jobs at a time

#collect all results
phobius = []
for file in os.listdir(TMPDIR):
    if file.endswith('.phobius'):
        phobius.append(os.path.join(TMPDIR,file))

#write output
# running totals of transmembrane-domain and signal-peptide predictions
TMdomain = 0
SigPep = 0
with open(args.out, 'w') as output:
    output.write("%s\t%s\t%s\t%s\n" % ('ID', 'TM', 'SP', 'Prediction'))
    for x in phobius: