def _diamond_version_tuple(version):
    '''
    Convert a dotted version string such as '2.0.11' into a tuple of ints.

    Only the leading digits of each dot-separated field are used and parsing
    stops at the first field without leading digits, so '2.0.5-beta' gives
    (2, 0, 5) and an unparsable value gives an empty tuple.
    '''
    numbers = []
    for field in str(version).strip().lstrip('vV').split('.'):
        digits = ''
        for char in field:
            if char not in '0123456789':
                break
            digits += char
        if not digits:
            break
        numbers.append(int(digits))
    return tuple(numbers)


def runDiamond(input, query, cpus, output, premade_db=None):
    '''
    Run a `diamond blastx` search of `input` against a protein database.

    The database is built from `query` with `diamond makedb` unless
    `premade_db` is given, in which case that file is symlinked to
    `<output>/diamond.dmnd`. Hits are requested in tabular format 6 and sent
    to `diamond.matches.tab`.

    Diamond >= 2.0.5 is run in frameshift mode (`-F 15`); older versions are
    run without it and limited to 8 threads.

    Args:
        input: nucleotide FASTA passed to `diamond blastx -q`.
        query: protein FASTA passed to `diamond makedb --in`.
        cpus: thread count (int or numeric string).
        output: directory handed to `lib.runSubprocess4` (presumably used as
            the working directory -- confirm against lib) and in which the
            premade database link is created.
        premade_db: optional path to an existing Diamond database.
    '''
    # tabular output columns shared by both search modes
    out_columns = ['-f', '6', 'sseqid', 'slen', 'sstart', 'send', 'qseqid',
                   'qlen', 'qstart', 'qend', 'pident', 'length', 'evalue',
                   'score', 'qcovhsp', 'qframe']

    # Compare versions numerically: as plain strings '2.0.10' sorts before
    # '2.0.5', which would send newer releases down the legacy path.
    installed = _diamond_version_tuple(lib.getDiamondVersion())
    if installed >= (2, 0, 5):
        # run in frameshift mode
        mode_options = ['--unal', '0', '-c', '1', '-F', '15']
    else:
        if int(cpus) > 8:
            cpus = 8
        mode_options = []

    cmd = ['diamond', 'blastx', '--threads', str(cpus), '-q', input,
           '--db', 'diamond', '-o', 'diamond.matches.tab', '-e', '1e-10',
           '-k', '0', '--more-sensitive'] + mode_options + out_columns

    if premade_db is None:
        # create DB of protein sequences
        db_cmd = ['diamond', 'makedb', '--threads', str(cpus),
                  '--in', query, '--db', 'diamond']
        lib.runSubprocess4(db_cmd, output, lib.log)
    else:
        lib.log.debug('Using premade Diamond database: {}'.format(premade_db))
        link_path = os.path.join(output, 'diamond.dmnd')
        # a link left behind by an earlier run would make os.symlink raise
        # FileExistsError; only symlinks are replaced, never regular files
        if os.path.islink(link_path):
            os.remove(link_path)
        os.symlink(os.path.abspath(premade_db), link_path)

    # now run search
    lib.runSubprocess4(cmd, output, lib.log)
def runDiamond(input, query, cpus, output):
    '''
    Build a Diamond protein database from `query`, then search `input`
    against it with `diamond blastx`.

    Both commands are handed to `lib.runSubprocess4` together with `output`
    and `lib.log` (presumably `output` is the working directory -- confirm
    against lib). Results are requested in tabular format 6 and sent to
    `diamond.matches.tab`.

    Args:
        input: nucleotide FASTA passed to `diamond blastx -q`.
        query: protein FASTA passed to `diamond makedb --in`.
        cpus: thread count for both Diamond invocations.
        output: directory forwarded to `lib.runSubprocess4`.
    '''
    threads = str(cpus)
    db_name = 'diamond'

    # columns reported for every hit in the tabular output
    columns = ('sseqid', 'slen', 'sstart', 'send', 'qseqid', 'qlen',
               'qstart', 'qend', 'pident', 'length', 'evalue', 'score',
               'qcovhsp', 'qframe')

    # step 1: create DB of protein sequences
    makedb = ['diamond', 'makedb']
    makedb.extend(['--threads', threads, '--in', query, '--db', db_name])
    lib.runSubprocess4(makedb, output, lib.log)

    # step 2: translated search of the nucleotide input against that DB
    blastx = ['diamond', 'blastx']
    blastx.extend(['--threads', threads, '-q', input, '--db', db_name])
    blastx.extend(['-o', 'diamond.matches.tab', '-e', '1e-10', '-k', '0'])
    blastx.extend(['--more-sensitive', '-f', '6'])
    blastx.extend(columns)
    lib.runSubprocess4(blastx, output, lib.log)
def runDiamond(input, query, cpus, output, premade_db=None):
    '''
    Run a `diamond blastx` search of `input` against a protein database.

    The database is built from `query` with `diamond makedb` unless
    `premade_db` is given, in which case that file is symlinked to
    `<output>/diamond.dmnd`. Hits are requested in tabular format 6 and sent
    to `diamond.matches.tab`. Diamond is never given more than 8 threads.

    Args:
        input: nucleotide FASTA passed to `diamond blastx -q`.
        query: protein FASTA passed to `diamond makedb --in`.
        cpus: thread count (int or numeric string); capped at 8.
        output: directory handed to `lib.runSubprocess4` (presumably used as
            the working directory -- confirm against lib) and in which the
            premade database link is created.
        premade_db: optional path to an existing Diamond database.
    '''
    if int(cpus) > 8:
        cpus = 8

    if premade_db is None:
        # create DB of protein sequences
        cmd = ['diamond', 'makedb', '--threads', str(cpus),
               '--in', query, '--db', 'diamond']
        lib.runSubprocess4(cmd, output, lib.log)
    else:
        lib.log.info("Using premade Diamond database at:" + premade_db)
        link_path = os.path.join(output, 'diamond.dmnd')
        # a link left behind by an earlier run would make os.symlink raise
        # FileExistsError; only symlinks are replaced, never regular files
        if os.path.islink(link_path):
            os.remove(link_path)
        # A symlink target is stored verbatim and resolved relative to the
        # link's own directory, so a relative premade_db must be made
        # absolute or the link would dangle.
        os.symlink(os.path.abspath(premade_db), link_path)

    # now run search
    lib.log.info("Now running diamond search...")
    cmd = ['diamond', 'blastx', '--threads', str(cpus), '-q', input,
           '--db', 'diamond', '-o', 'diamond.matches.tab', '-e', '1e-10',
           '-k', '0', '--more-sensitive', '-f', '6', 'sseqid', 'slen',
           'sstart', 'send', 'qseqid', 'qlen', 'qstart', 'qend', 'pident',
           'length', 'evalue', 'score', 'qcovhsp', 'qframe']
    lib.runSubprocess4(cmd, output, lib.log)
def runTrinityGG(genome, readTuple, longReads, shortBAM, output, args=False):
    '''
    Run genome-guided Trinity.

    Steps:
      1. Unless `lib.checkannotations(shortBAM)` is true, build a hisat2
         index for `genome` and align the reads, writing `shortBAM` through
         the `sam2bam.sh` wrapper.
      2. Run Trinity with `--no_distributed_trinity_exec` so it only writes
         the per-cluster assembly commands (`trinity_GG.cmds`).
      3. Run those commands with `lib.runMultiProgress` / `safe_run`.
      4. List the resulting `*inity.fasta` files and aggregate them into
         `output` with Trinity's `GG_partitioned_trinity_aggregator.pl`.

    Args:
        genome: genome FASTA to index and align against.
        readTuple: indexable of read files; [0] and [1] are used as the
            paired `-1`/`-2` inputs and [2] as the unpaired `-U` input.
            Falsy entries are skipped.
        longReads: optional long-read file passed to Trinity `--long_reads`.
        shortBAM: path of the sorted BAM to create or reuse.
        output: path of the aggregated transcript FASTA.
        args: options object providing `cpus`, `max_intronlen`, `stranded`,
            `jaccard_clip` and `memory`. The `False` default is not usable;
            callers must pass a real object.

    Relies on the module-level names `tmpdir`, `parentdir`, `TRINITY`,
    `safe_run` and `find_files`.
    '''
    if not lib.checkannotations(shortBAM):
        # build hisat2 index, using exons and splice sites
        lib.log.info("Building Hisat2 genome index")
        hisat2_index = os.path.join(tmpdir, 'hisat2.genome')
        cmd = ['hisat2-build', '-p', str(args.cpus), genome, hisat2_index]
        lib.runSubprocess4(cmd, '.', lib.log)

        # align reads using hisat2
        lib.log.info("Aligning reads to genome using Hisat2")
        # use bash wrapper for samtools piping for SAM -> BAM -> sortedBAM
        # use half number of threads for bam compression threads
        # (`2 // 2` is 1, so this is cpus / 2 rounded up to the nearest int)
        bamthreads = (args.cpus + 2 // 2) // 2
        hisat2cmd = ['hisat2', '-p', str(args.cpus),
                     '--max-intronlen', str(args.max_intronlen),
                     '--dta', '-x', hisat2_index]
        # strandness is only passed when there are no unpaired reads
        if args.stranded != 'no' and not readTuple[2]:
            hisat2cmd += ['--rna-strandness', args.stranded]
        if readTuple[0] and readTuple[1]:
            hisat2cmd += ['-1', readTuple[0], '-2', readTuple[1]]
        if readTuple[2]:
            hisat2cmd += ['-U', readTuple[2]]
        # the wrapper receives the whole hisat2 command as a single argument
        cmd = [os.path.join(parentdir, 'sam2bam.sh'), " ".join(hisat2cmd),
               str(bamthreads), shortBAM]
        lib.runSubprocess(cmd, '.', lib.log)
    else:
        lib.log.info('Existig Hisat2 alignments found: {:}'.format(shortBAM))

    # now launch Trinity genome guided
    trinity_dir = os.path.join(tmpdir, 'trinity_gg')
    TrinityLog = os.path.join(tmpdir, 'Trinity-gg.log')
    lib.log.info("Running genome-guided Trinity, logfile: %s" % TrinityLog)
    lib.log.info(
        "Clustering of reads from BAM and preparing assembly commands")
    cmd = ['Trinity']
    if args.stranded != 'no':
        cmd += ['--SS_lib_type', args.stranded]
    cmd += ['--no_distributed_trinity_exec',
            '--genome_guided_bam', shortBAM,
            '--genome_guided_max_intron', str(args.max_intronlen),
            '--CPU', str(args.cpus),
            '--max_memory', args.memory,
            '--output', trinity_dir]
    if args.jaccard_clip:
        cmd.append('--jaccard_clip')
    if longReads and lib.checkannotations(longReads):
        cmd += ['--long_reads', os.path.realpath(longReads)]
    lib.runSubprocess2(cmd, '.', lib.log, TrinityLog)

    # this will create all the Trinity commands, will now run these in
    # parallel using multiprocessing in Python (seems to be much faster than
    # Parafly on my system)
    commands = os.path.join(trinity_dir, 'trinity_GG.cmds')
    with open(commands, 'r') as cmdFile:
        # strip the newline, the flag that should not be repeated on every
        # per-cluster command, and the unneeded double quotes
        file_list = [
            line.replace('\n', '')
                .replace('--no_distributed_trinity_exec', '')
                .replace('"', '')
            for line in cmdFile
        ]

    # One CPU is held back, but never drop below a single worker: with
    # cpus == 1 the original `cpus - 1` asked for 0 workers, which
    # multiprocessing pools reject (assuming runMultiProgress uses one --
    # confirm against lib).
    workers = max(1, args.cpus - 1)
    lib.log.info("Assembling "+"{0:,}".format(len(file_list)) +
                 " Trinity clusters using %i CPUs" % workers)
    lib.runMultiProgress(safe_run, file_list, workers)

    # collected output files and clean
    outputfiles = os.path.join(trinity_dir, 'trinity_output_files.txt')
    with open(outputfiles, 'w') as fileout:
        for filename in find_files(trinity_dir, '*inity.fasta'):
            fileout.write('%s\n' % filename)

    # now grab them all using Trinity script
    aggregator = os.path.abspath(os.path.join(
        TRINITY, 'util', 'support_scripts',
        'GG_partitioned_trinity_aggregator.pl'))
    cmd = ['perl', aggregator, 'Trinity_GG']
    lib.runSubprocess5(cmd, '.', lib.log, outputfiles, output)
    lib.log.info('{:,} transcripts derived from Trinity'.format(
        lib.countfasta(output)))