def divide_values(file, ref_scores):
    """Divide each BSR value in a row by that row's reference score.

    The first line of ``file`` is treated as a header and copied to the
    output.  Every following line is split on whitespace into a row name
    followed by numeric values; each value is divided by
    ``ref_scores[row name]``.  The normalised rows are written, formatted to
    two decimals, to ``BSR_matrix_values.txt`` in the current directory.

    Rows whose name has no usable (missing, non-numeric or zero) reference
    score are written as zeros and reported once through
    ``logging.logPrint``.

    Args:
        file: path of the whitespace-delimited input matrix.
        ref_scores: mapping of row name -> reference score.

    Returns:
        A list holding one list of (unrounded) floats per data row.

    Raises:
        TypeError: if a value column cannot be parsed as a float.
    """
    errors = []
    outdata = []
    # ``with`` closes both handles even when a malformed row aborts the run.
    # Default text mode is used because the "U" flag is rejected by
    # Python 3.11+ (universal newlines are the default there anyway).
    with open(file) as infile:
        with open("BSR_matrix_values.txt", "w") as outfile:
            header = infile.readline().split()
            outfile.write("\t".join(header) + "\n")
            for line in infile:
                all_fields = line.split()
                try:
                    # Convert eagerly: a lazy map() would defer the failure
                    # into the division step, where it would be mistaken for
                    # a name mismatch and silently zeroed.
                    numbers = [float(x) for x in all_fields[1:]]
                except ValueError:
                    raise TypeError("abnormal number of fields observed")
                values = []
                if numbers:
                    name = all_fields[0]
                    try:
                        reference = float(ref_scores.get(name))
                        values = [x / reference for x in numbers]
                    except (TypeError, ValueError, ZeroDivisionError):
                        # if a mismatch error in names is encountered,
                        # change the whole row to 0
                        errors.append(name)
                        values = [0.0] * len(numbers)
                outfile.write("\t".join(["%.2f" % elem for elem in values]) + "\n")
                outdata.append(values)
    if errors:
        # Report each offending name once, in first-seen order
        seen = set()
        unique = []
        for name in errors:
            if name not in seen:
                seen.add(name)
                unique.append(name)
        logging.logPrint("The following genes had no hits in datasets or are too short, values changed to 0, check names and output: %s" % "\n".join(unique))
    return outdata
def translate_genes(genes,outfile,min_len):
    """Translate nucleotide FASTA records into peptides with BioPython.

    Every record in ``genes`` is trimmed to a whole number of codons,
    translated with genetic code table 11 up to the first stop codon, and
    written to ``outfile`` in FASTA format when the peptide has at least
    ``min_len`` residues.  Records yielding shorter peptides are skipped and
    reported through ``logging.logPrint``.

    Args:
        genes: path of the nucleotide FASTA file.
        outfile: path of the peptide FASTA file to create.
        min_len: minimum peptide length (anything ``int()`` accepts).

    Returns:
        ``str`` of the first peptide written, or ``None`` when no record
        passed the length filter.

    Raises:
        TypeError: if a record cannot be processed (the message names the
            record id).
    """
    output = []
    too_short = []
    # Default text mode: the "U" flag is rejected by Python 3.11+.
    with open(genes) as infile:
        with open(outfile, "w") as output_handle:
            for record in SeqIO.parse(infile, "fasta"):
                try:
                    min_pep_len = int(min_len)
                    # Drop the one or two trailing bases that do not fill a
                    # codon so translate() always sees complete codons.
                    overhang = len(record.seq) % 3
                    if overhang:
                        coding = record.seq[:-overhang]
                    else:
                        coding = record.seq
                    pep_seq = coding.translate(to_stop=True, table=11)
                    if len(pep_seq) >= min_pep_len:
                        output_handle.write(">" + record.id + "\n")
                        output_handle.write(str(pep_seq) + "\n")
                        output.append(pep_seq)
                    else:
                        too_short.append(record.id)
                except Exception:
                    raise TypeError("odd characters observed in sequence %s" % record.id)
    # Log before returning: previously the early return below made this
    # report unreachable whenever at least one peptide had been written.
    if too_short:
        logging.logPrint("The following sequences were too short and will not be processed: %s" % "\n".join(too_short))
    if output:
        return str(output[0])
    return None
def monitorDownload(pr, downloaderChan, baseDir, url, minRate):
    """Watch a running download and kill it when it becomes too slow.

    Every SAMPLE_RATE seconds the growth of the download's files is sampled.
    Returns True once ``pr`` reports an exit code.  Returns False after
    sending SIGTERM to the downloader because the average of the last
    MAX_SAMPLE_SIZE samples fell below ``minRate``.
    """
    rates = []
    while True:
        sizeBefore = getSizeOfFiles(getDownloadFilenames(baseDir, url))
        time.sleep(SAMPLE_RATE)

        # The downloader has exited: collect its result and stop monitoring
        if pr.exitCode is not None:
            downloaderChan.receive()
            return True

        # Still running: record how much was written during this interval
        grown = getSizeOfFiles(getDownloadFilenames(baseDir, url)) - sizeBefore
        logging.debugPrint(lambda: "Download rate: %8d - %s" % (grown / SAMPLE_RATE, getUrlFilename(url)))
        rate = grown / SAMPLE_RATE
        # Shrinking files are counted as no progress
        rates.append(0 if rate < 0 else rate)

        # Keep only the most recent MAX_SAMPLE_SIZE samples
        if len(rates) > MAX_SAMPLE_SIZE:
            rates.pop(0)

        # No verdict until the window is full
        if len(rates) < MAX_SAMPLE_SIZE:
            continue
        if not (sum(rates) / len(rates) < minRate):
            continue

        logging.logPrint(
            "Average Rate: %8d - %s - KILLING" % (sum(rates) / len(rates), getUrlFilename(url))
        )
        os.kill(pr.pipe.pid, signal.SIGTERM)
        # Give it a second to finish up whatever it's doing
        time.sleep(2)
        try:
            downloaderChan.receive()
        except:
            pass
        return False
def translate_genes(genes):
    """Translate nucleotide FASTA records into peptides with BioPython.

    Each record in ``genes`` is translated with genetic code table 11 up to
    the first stop codon.  Peptides of at least 30 residues are written to
    ``genes.pep`` (in the current directory) in FASTA format; shorter ones
    are skipped and reported through ``logging.logPrint``.

    Args:
        genes: path of the nucleotide FASTA file.

    Returns:
        ``str`` of the first peptide written, or an empty list when no
        record passed the length filter.

    Raises:
        TypeError: if a record cannot be translated.
    """
    output = []
    too_short = []
    # ``with`` guarantees both files are closed; the old early return left
    # them open whenever at least one peptide had been produced.  Default
    # text mode is used because the "U" flag is rejected by Python 3.11+.
    with open(genes) as infile:
        with open("genes.pep", "w") as output_handle:
            for record in SeqIO.parse(infile, "fasta"):
                try:
                    # Translate once and reuse the result
                    pep_seq = record.seq.translate(to_stop=True, table=11)
                    if len(pep_seq) >= 30:
                        output_handle.write(">" + record.id + "\n")
                        output_handle.write(str(pep_seq) + "\n")
                        output.append(pep_seq)
                    else:
                        too_short.append(record.id)
                except Exception:
                    raise TypeError("odd characters observed in sequence")
    # Report skipped records before returning so the message is not lost
    if too_short:
        logging.logPrint(
            "The following sequences were too short and will not be processed: %s" % "\n".join(too_short))
    if output:
        return str(output[0])
    return output
def monitorDownload(pr, downloaderChan, baseDir, url, minRate):
    """Supervise a download, terminating it if its average rate is too low.

    Samples the size of the download's files every SAMPLE_RATE seconds.
    The result is True when ``pr`` has an exit code, and False when the
    downloader was sent SIGTERM because the mean of the last
    MAX_SAMPLE_SIZE samples dropped below ``minRate``.
    """
    def bytesOnDisk():
        # Combined size of every file belonging to this download
        return getSizeOfFiles(getDownloadFilenames(baseDir, url))

    window = []
    outcome = None
    while outcome is None:
        startBytes = bytesOnDisk()
        time.sleep(SAMPLE_RATE)
        if pr.exitCode is not None:
            # The program exited, nothing left to monitor
            downloaderChan.receive()
            outcome = True
        else:
            delta = bytesOnDisk() - startBytes
            logging.debugPrint(lambda : 'Download rate: %8d - %s' % (delta/SAMPLE_RATE, getUrlFilename(url)))
            rate = delta/SAMPLE_RATE
            # A negative sample counts as zero progress
            window.append(0 if rate < 0 else rate)
            # Discard the oldest sample once the window overflows
            if len(window) > MAX_SAMPLE_SIZE:
                window.pop(0)
            windowFull = len(window) >= MAX_SAMPLE_SIZE
            if windowFull and sum(window)/len(window) < minRate:
                logging.logPrint('Average Rate: %8d - %s - KILLING' % (sum(window)/len(window), getUrlFilename(url)))
                os.kill(pr.pipe.pid, signal.SIGTERM)
                # Give it a second to finish up whatever it's doing
                time.sleep(2)
                try:
                    downloaderChan.receive()
                except:
                    pass
                outcome = False
    return outcome
def runDownloader(chan):
    """Run one download job taken from ``chan`` and report its outcome.

    Receives a ``(pr, rchan)`` pair from ``chan`` and runs ``pr`` through
    ``commands.runProgramRunnerEx``.  On success ``None`` is sent on
    ``rchan``; on any ``Exception`` the failure is logged and forwarded with
    ``rchan.sendError``.
    """
    pr, rchan = chan.receive()
    try:
        commands.runProgramRunnerEx(pr)
        logging.debugPrint(lambda : 'Successfully completed download')
        rchan.send(None)
    # ``as`` form is valid on Python 2.6+ and Python 3; the comma form is a
    # syntax error on Python 3.
    except Exception as err:
        logging.logPrint('Download failed for unknown reason: ' + str(err))
        rchan.sendError(err)
def runDownloader(chan):
    """Execute a queued download and signal success or failure.

    A ``(pr, rchan)`` pair is read from ``chan``; ``pr`` is executed with
    ``commands.runProgramRunnerEx``.  ``rchan`` receives ``None`` when the
    run completes, or the raised exception (via ``sendError``) when it
    fails.
    """
    pr, rchan = chan.receive()
    try:
        commands.runProgramRunnerEx(pr)
        logging.debugPrint(lambda: "Successfully completed download")
        rchan.send(None)
    # "except X as name" parses on Python 2.6+ and 3, unlike "except X, name"
    except Exception as err:
        logging.logPrint("Download failed for unknown reason: " + str(err))
        rchan.sendError(err)
def _perform_workflow(data):
    """Build the table for one sample and append it to the shared results.

    ``data`` is a two-item pair whose second item is the file handed to
    ``make_table_dev``.  ``clusters``, ``names``, ``table_list`` and
    ``debug`` are read from the enclosing scope.
    """
    _, sample_file = data
    sample_name, sample_values = make_table_dev(sample_file, "F", clusters)
    names.append(sample_name)
    table_list.append(sample_values)
    # Progress is only reported in debug mode
    if debug != "T":
        return
    logging.logPrint("sample %s processed" % sample_file)
def _perform_workflow_nl(data):
    """Build the table for one sample using state carried inside ``data``.

    ``data`` is indexed as: [0] a two-item pair whose second item is the
    sample file, [1] clusters, [2] the list collecting names, [3] the list
    collecting table values, [4] the debug flag ("T" enables logging).
    """
    sample = data[0]
    _, sample_file = sample
    clusters = data[1]
    collected_names = data[2]
    collected_tables = data[3]
    debug_flag = data[4]
    sample_name, sample_values = make_table_dev(sample_file, "F", clusters)
    collected_names.append(sample_name)
    collected_tables.append(sample_values)
    if debug_flag != "T":
        return
    logging.logPrint("sample %s processed" % sample_file)
def run_raxml(fasta_in, tree, out_class_file, insertion_method, parameters, model, suffix):
    """Run raxmlHPC-SSE3 to insert sequences into a tree, then tidy its output.

    Untested function, system calls.

    Args:
        fasta_in: alignment passed with ``-s``.
        tree: starting tree passed with ``-t``.
        out_class_file: file that the classification likelihood weights are
            appended to.
        insertion_method: value passed with ``-f``.
        parameters: value passed with ``-R``; the string "NULL" omits it.
        model: value passed with ``-m``; "ASC_GTRGAMMA" additionally adds
            ``--asc-corr=lewis``.
        suffix: run name passed with ``-n`` and used in output file names.

    Returns:
        ``suffix``, unchanged.

    Raises:
        subprocess.CalledProcessError: if renaming the labelled tree fails.
    """
    # Build the command once; only the -R option and the ascertainment
    # correction vary between the four original variants.
    args = ['raxmlHPC-SSE3', '-f', '%s' % insertion_method, '-s', '%s' % fasta_in,
            '-m', '%s' % model, '-n', '%s' % suffix]
    if "NULL" != parameters:
        args.extend(['-R', parameters])
    args.extend(['-t', '%s' % tree])
    if "ASC_GTRGAMMA" == model:
        args.append('--asc-corr=lewis')
    # No '>' / '/dev/null 2>&1' entries: Popen is not run through a shell, so
    # they would reach RAxML as literal arguments.  Output is captured in the
    # two files opened below instead.
    args.append('--no-bfgs')

    vcf_fh = None
    log_fh = None
    try:
        try:
            vcf_fh = open('%s.raxml.out' % suffix, 'w')
        except (IOError, OSError):
            log_isg.logPrint('could not open raxml file')
        try:
            log_fh = open('%s.raxml.log' % suffix, 'w')
        except (IOError, OSError):
            log_isg.logPrint('could not open log file')
        log_isg.logPrint("inserting sequence into tree")
        try:
            if vcf_fh is None or log_fh is None:
                raise IOError("raxml output files are unavailable")
            raxml_run = Popen(args, stderr=log_fh, stdout=vcf_fh)
            raxml_run.wait()
            log_isg.logPrint("sequence(s) inserted into tree")
        except (IOError, OSError, ValueError):
            # Best effort, as before: report and carry on with the clean-up
            log_isg.logPrint("sequence(s) were not inserted into tree!!!!!")
    finally:
        for handle in (vcf_fh, log_fh):
            if handle is not None:
                handle.close()

    # Strip the bracketed edge labels from the labelled tree
    os.system(r"sed 's/\[[^]]*\]//g' RAxML_labelledTree.%s > %s.tree_including_unknowns_noedges.tree" % (suffix, suffix))
    subprocess.check_call("mv RAxML_labelledTree.%s %s_tree_including_unknowns_edges.tree" % (suffix, suffix) , shell=True)
    try:
        subprocess.check_call("cat RAxML_classificationLikelihoodWeights.%s >> %s" % (suffix, out_class_file), shell=True)
    except (subprocess.CalledProcessError, OSError):
        # The weights file is optional; its absence is not an error
        pass
    os.system("rm RAxML_*.%s" % suffix)
    return suffix
def raxml_calculate_base_tree(in_fasta, model, name):
    """Infer a base tree with raxmlHPC-SSE3.

    Not tested, all system calls.

    Runs ``raxmlHPC-SSE3 -f d -p 12345`` on ``in_fasta`` with the given
    ``model`` (``-m``) and run ``name`` (``-n``).  Standard output goes to
    ``raxml.out`` and standard error to ``raxml.log`` in the current
    directory.  If the program cannot be started the process exits via
    ``sys.exit()``.
    """
    # No '>' / '/dev/null 2>&1' entries: Popen is not run through a shell, so
    # they would be handed to RAxML as literal arguments.
    args = ['raxmlHPC-SSE3', '-f', 'd', '-p', '12345', '-s', '%s' % in_fasta,
            '-m', '%s' % model, '-n', '%s' % name, "--no-bfgs"]
    vcf_fh = None
    log_fh = None
    try:
        try:
            vcf_fh = open('raxml.out', 'w')
        except (IOError, OSError):
            log_isg.logPrint('could not open raxml file')
        try:
            log_fh = open('raxml.log', 'w')
        except (IOError, OSError):
            log_isg.logPrint('could not open log file')
        try:
            if vcf_fh is None or log_fh is None:
                raise IOError("raxml output files are unavailable")
            raxml_run = Popen(args, stderr=log_fh, stdout=vcf_fh)
            raxml_run.wait()
        except (IOError, OSError, ValueError):
            print("could not infer base pruned tree")
            sys.exit()
    finally:
        # Runs on the sys.exit() path too, so the handles never leak
        for handle in (vcf_fh, log_fh):
            if handle is not None:
                handle.close()
def run(options):
    """Run a batch of pipelines described by ``options``.

    Loads the batch config (``options.configFile``) and ``/tmp/machine.conf``,
    builds the ``State``, queues any incomplete work and, if the pipelines
    queue has work, yields the value of
    ``defer_work_queue.waitForCompletion``.  This is a generator function.

    Raises:
        JobFailed: if any batch state lacks a 'state' entry or is 'failed'.
    """
    logging.logPrint('Starting')

    batchConfig = config.configFromStream(open(options.configFile), lazy=True)
    machineConf = config.configFromStream(open('/tmp/machine.conf'))

    # Gather the State arguments one by one, in their positional order
    workflowConfig = options.workflowConfig
    batchStatesFile = options.batchStatesFile
    templateKey = 'batch_pipeline.pipeline.PIPELINE_TEMPLATE'
    wrapper = _validateWrapper(
        batchConfig(templateKey),
        pipeline_misc.determineWrapper(machineConf, batchConfig(templateKey)))
    batches = _interpretBatchFile(options.batchFile)
    innerConfig = _extractInnerPipelineConfig(batchConfig)
    wrapperName = batchConfig('pipeline.PIPELINE_WRAPPER_NAME')
    concurrentPrerun = int(batchConfig('batch.options.CONCURRENT_PRERUN'))
    concurrentPipelines = int(batchConfig('batch.options.CONCURRENT_PIPELINES'))
    concurrentPostrun = int(batchConfig('batch.options.CONCURRENT_POSTRUN'))

    state = State(workflowConfig,
                  batchStatesFile,
                  wrapper,
                  batches,
                  innerConfig,
                  wrapperName,
                  concurrentPrerun,
                  concurrentPipelines,
                  concurrentPostrun)

    logging.logPrint('Queuing any incomplete work')
    queueCount = _queueIncompleteWork(state)
    logging.logPrint('Queued: %d' % queueCount)

    if state.pipelinesQueue.hasWork():
        yield defer_work_queue.waitForCompletion(state.pipelinesQueue)

    # A batch with no recorded state is treated the same as a failed one
    if any('state' not in batchState or batchState['state'] == 'failed'
           for batchState in state.batchStates.values()):
        raise JobFailed()
def run(options):
    """Execute the batch described by ``options`` (generator function).

    Reads the batch configuration from ``options.configFile`` and the
    machine configuration from ``/tmp/machine.conf``, constructs ``State``,
    queues incomplete work, and yields the result of
    ``defer_work_queue.waitForCompletion`` when the pipelines queue has
    work.  Raises ``JobFailed`` when a batch state has no 'state' key or its
    'state' is 'failed'.
    """
    logging.logPrint('Starting')

    batchConfig = config.configFromStream(open(options.configFile), lazy=True)
    machineConf = config.configFromStream(open('/tmp/machine.conf'))

    # Positional arguments for State, appended in call order
    stateArgs = [options.workflowConfig, options.batchStatesFile]
    stateArgs.append(
        _validateWrapper(batchConfig('batch_pipeline.pipeline.PIPELINE_TEMPLATE'),
                         pipeline_misc.determineWrapper(machineConf,
                                                        batchConfig('batch_pipeline.pipeline.PIPELINE_TEMPLATE'))))
    stateArgs.append(_interpretBatchFile(options.batchFile))
    stateArgs.append(_extractInnerPipelineConfig(batchConfig))
    stateArgs.append(batchConfig('pipeline.PIPELINE_WRAPPER_NAME'))
    for optionKey in ('batch.options.CONCURRENT_PRERUN',
                      'batch.options.CONCURRENT_PIPELINES',
                      'batch.options.CONCURRENT_POSTRUN'):
        stateArgs.append(int(batchConfig(optionKey)))
    state = State(*stateArgs)

    logging.logPrint('Queuing any incomplete work')
    queueCount = _queueIncompleteWork(state)
    logging.logPrint('Queued: %d' % queueCount)

    if state.pipelinesQueue.hasWork():
        yield defer_work_queue.waitForCompletion(state.pipelinesQueue)

    for batchState in state.batchStates.values():
        # A missing status counts as a failure
        status = batchState['state'] if 'state' in batchState else 'failed'
        if status == 'failed':
            raise JobFailed()
def __call__(self):
    """
    Start self.cmd in a shell and return what is needed to consume it:

    (onComplete, [(stream1, func1), .. (streamn, funcn)])

    onComplete is the cleanup to run once all the streams are consumed;
    each stream is paired with the function to call when data arrives on
    it (stdout with self.stdoutf, stderr with self.stderrf).  The started
    process is also stored on self.pipe.
    """
    if self.log:
        logPrint(self.cmd)

    if not self.addEnv:
        childEnv = self.env
    else:
        # Always update a copy: either of self.env when one was given, or
        # of the current environment otherwise
        baseEnv = self.env if self.env else os.environ
        childEnv = functional.updateDict(dict(baseEnv), self.addEnv)

    self.pipe = subprocess.Popen(self.cmd,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 shell=True,
                                 env=childEnv)
    streams = [(self.pipe.stdout, self.stdoutf),
               (self.pipe.stderr, self.stderrf)]
    return (self.onComplete, streams)
def run_gatk(reference, processors, name, gatk, tmp_dir):
    """Run the GATK UnifiedGenotyper on ``<name>_renamed_header.bam``.

    gatk controller, mbq used to be set to 17, but was recently changed -
    untested, system call.

    Args:
        reference: reference FASTA passed with ``-R``.
        processors: thread count passed with ``-nt``.
        name: sample name; selects the input BAM and the output file names.
        gatk: path of the GATK jar.
        tmp_dir: directory passed to Java as ``java.io.tmpdir``.

    Standard output is written to ``<name>.vcf.out`` and standard error to
    ``<name>.vcf.log``.  Failures are logged through ``log_isg.logPrint``
    and never raised.
    """
    args = ['java', '-Djava.io.tmpdir=%s' % tmp_dir, '-jar', '%s' % gatk, '-T', 'UnifiedGenotyper',
            '-R', '%s' % reference, '-nt', '%s' % processors, '-S', 'silent',
            '-ploidy', '1', '-out_mode', 'EMIT_ALL_CONFIDENT_SITES',
            '-stand_call_conf', '30', '-stand_emit_conf', '30',
            '-I', '%s_renamed_header.bam' % name, '-rf', 'BadCigar']
    vcf_fh = None
    log_fh = None
    try:
        try:
            vcf_fh = open('%s.vcf.out' % name, 'w')
        except (IOError, OSError):
            log_isg.logPrint('could not open vcf file')
        try:
            log_fh = open('%s.vcf.log' % name, 'w')
        except (IOError, OSError):
            log_isg.logPrint('could not open log file')
        try:
            # Without both files there is nowhere to capture the output;
            # report it the same way as a failed launch.
            if vcf_fh is None or log_fh is None:
                raise IOError("GATK output files are unavailable")
            gatk_run = Popen(args, stderr=log_fh, stdout=vcf_fh)
            gatk_run.wait()
        except (IOError, OSError, ValueError):
            log_isg.logPrint("GATK encountered problems and did not run")
    finally:
        for handle in (vcf_fh, log_fh):
            if handle is not None:
                handle.close()
def _log(batchState, msg):
    """Log ``msg`` prefixed with the batch number in ``batchState['batch_num']``."""
    batchNum = batchState['batch_num']
    line = 'BATCH_NUM %d - %s' % (batchNum, msg)
    logging.logPrint(line)
def updateBatchState(self):
    """Log the number of batch states, then dump them to self.batchStatesFile.

    Returns whatever ``batch_state.dump`` returns.
    """
    entryCount = len(self.batchStates)
    logging.logPrint('Dumping states file with %d entries' % entryCount)
    dumped = batch_state.dump(self.batchStatesFile, self.batchStates)
    return dumped
def main(directory, id, filter, processors, genes, usearch, blast, penalty, reward, length, max_plog, min_hlog, f_plog, keep, debug_table):
    """Drive a complete run: gene set preparation, alignment, and BSR matrix.

    NOTE(review): this function was recovered from a whitespace-collapsed
    source, so the nesting below is a reconstruction - confirm it against
    the upstream file before relying on it.

    Steps visible in the code:
      1. Check with ``which`` that the requested aligner is on the PATH.
      2. Create ``<directory>/joined`` and copy every ``*.fasta`` genome into
         it as ``<name>.new``.
      3. When "null" is in ``genes``: predict genes with Prodigal, cluster
         them with USEARCH at identity ``id`` and align the consensus
         against itself and against each genome.  Otherwise use ``genes`` as
         a pre-compiled gene set (``.pep`` or ``.fasta``).
      4. Parse the reports, build the table ("old" or "new" ``debug_table``
         code path) and write ``bsr_matrix_values.txt`` into the directory
         the program was started from.
      5. Optionally filter paralogs (``f_plog`` contains "T"), copy result
         files back, and delete ``joined`` unless ``keep`` is "T".

    Args:
        directory: directory holding the ``*.fasta`` genomes.
        id: clustering identity handed to run_usearch / uclust_cluster.
        filter: passed through to the blast_against_* helpers.
        processors: worker/thread count passed to the helpers.
        genes: gene file path, or a value containing "null" to predict genes.
        usearch: path of the usearch executable.
        blast: one of "blastn", "tblastn" or "blat".
        penalty, reward: passed through to the BLAST helpers.
        length, max_plog, min_hlog: passed through to find_dups.
        f_plog: paralog filtering is run when this contains "T".
        keep: "T" keeps the ``joined`` working directory.
        debug_table: "old" or "new"; any other value exits.

    The process is terminated with ``sys.exit()`` on missing dependencies,
    an existing ``joined`` directory, or unsupported input.
    """
    start_dir = os.getcwd()
    ap=os.path.abspath("%s" % start_dir)
    dir_path=os.path.abspath("%s" % directory)
    # --- dependency checks -------------------------------------------------
    logging.logPrint("Testing paths of dependencies")
    if blast=="blastn" or blast=="tblastn":
        ab = subprocess.call(['which', 'blastall'])
        if ab == 0:
            print "citation: Altschul SF, Madden TL, Schaffer AA, Zhang J, Zhang Z, Miller W, and Lipman DJ. 1997. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. Nucleic Acids Res 25:3389-3402"
        else:
            print "blastall isn't in your path, but needs to be!"
            sys.exit()
    # --- working directory: refuse to reuse an old one ---------------------
    try:
        os.makedirs('%s/joined' % dir_path)
    except:
        print "old run directory exists in your genomes directory (%s/joined). Delete and run again" % dir_path
        sys.exit()
    for infile in glob.glob(os.path.join(dir_path, '*.fasta')):
        name=get_seq_name(infile)
        os.system("cp %s %s/joined/%s.new" % (infile,dir_path,name))
    if "null" in genes:
        # --- de novo branch: predict genes, cluster, align -----------------
        rc = subprocess.call(['which', 'prodigal'])
        if rc == 0:
            pass
        else:
            # NOTE(review): no sys.exit() follows this message, so the run
            # continues without prodigal - confirm that is intended.
            print "prodigal is not in your path, but needs to be!"
        print "citation: Hyatt D, Chen GL, Locascio PF, Land ML, Larimer FW, and Hauser LJ. 2010. Prodigal: prokaryotic gene recognition and translation initiation site identification. BMC Bioinformatics 11:119"
        try:
            if os.path.exists(usearch):
                print "citation: Edgar RC. 2010. Search and clustering orders of magnitude faster than BLAST. Bioinformatics 26:2460-2461"
        except:
            raise TypeError("-u usearch flag must be set for use with prodigal")
            # unreachable: the raise above always leaves this handler first
            sys.exc_clear()
        if blast=="blat":
            ac = subprocess.call(['which', 'blat'])
            if ac == 0:
                print "citation: W.James Kent. 2002. BLAT - The BLAST-Like Alignment Tool. Genome Research 12:656-664"
            else:
                print "You have requested blat, but it is not in your PATH"
                sys.exit()
        logging.logPrint("predicting genes with Prodigal")
        predict_genes(dir_path, processors)
        logging.logPrint("Prodigal done")
        # Pool every predicted gene into one file, then filter and rename
        os.system("cat *genes.seqs > all_gene_seqs.out")
        filter_scaffolds("all_gene_seqs.out")
        os.system("mv tmp.out all_gene_seqs.out")
        rename_fasta_header("all_gene_seqs.out", "all_sorted.txt")
        # Split the pooled file into 200000-line pieces inside split_files/
        os.system("mkdir split_files")
        os.system("cp all_sorted.txt split_files/")
        os.system("rm all_sorted.txt")
        os.chdir("split_files/")
        os.system("split -l 200000 all_sorted.txt")
        logging.logPrint("clustering with USEARCH at an ID of %s" % id)
        sort_usearch(usearch)
        run_usearch(usearch, id)
        # Merge the per-piece results and cluster them back in joined/
        os.system("cat *.usearch.out > all_sorted.txt")
        os.system("mv all_sorted.txt %s/joined" % dir_path)
        os.chdir("%s/joined" % dir_path)
        uclust_cluster(usearch, id)
        logging.logPrint("USEARCH clustering finished")
        clusters = get_cluster_ids("consensus.fasta")
        subprocess.check_call("formatdb -i consensus.fasta -p F", shell=True)
        # Self alignment: the results are parsed into ref_scores below
        if "tblastn" == blast:
            translate_consensus("consensus.fasta")
            filter_seqs("tmp.pep")
            blast_against_self("consensus.fasta", "consensus.pep", "tmp_blast.out", filter, blast, penalty, reward, processors)
        elif "blastn" == blast:
            blast_against_self("consensus.fasta", "consensus.fasta", "tmp_blast.out", filter, blast, penalty, reward, processors)
        elif "blat" == blast:
            blat_against_self("consensus.fasta", "consensus.fasta", "tmp_blast.out", processors)
        else:
            pass
        # Keep one line per distinct value of the first column
        subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
        ref_scores=parse_self_blast(open("self_blast.out", "U"))
        subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
        os.system("rm *new_genes.*")
        if blast == "tblastn" or blast == "blastn":
            logging.logPrint("starting BLAST")
        else:
            logging.logPrint("starting BLAT")
        # Align the consensus set against every genome
        if "tblastn" == blast:
            blast_against_each_genome(dir_path, processors, filter, "consensus.pep", blast, penalty, reward)
        elif "blastn" == blast:
            blast_against_each_genome(dir_path, processors, filter, "consensus.fasta", blast, penalty, reward)
        elif "blat" == blast:
            blat_against_each_genome(dir_path, "consensus.fasta",processors)
        else:
            pass
        find_dups(ref_scores, length, max_plog, min_hlog)
    else:
        # --- pre-compiled gene set branch ----------------------------------
        logging.logPrint("Using pre-compiled set of predicted genes")
        files = glob.glob(os.path.join(dir_path, "*.fasta"))
        if len(files)==0:
            print "no usable reference genomes found!"
            sys.exit()
        else:
            pass
        gene_path=os.path.abspath("%s" % genes)
        clusters = get_cluster_ids(gene_path)
        os.system("cp %s %s/joined/" % (gene_path,dir_path))
        os.chdir("%s/joined" % dir_path)
        if gene_path.endswith(".pep"):
            # Peptide input: blastp against itself, tblastn against genomes
            logging.logPrint("using tblastn on peptides")
            try:
                subprocess.check_call("formatdb -i %s" % gene_path, shell=True)
            except:
                logging.logPrint("problem encountered with BLAST database")
                sys.exit()
            blast_against_self(gene_path, gene_path, "tmp_blast.out", filter, "blastp", penalty, reward, processors)
            subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
            ref_scores=parse_self_blast(open("self_blast.out", "U"))
            subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
            logging.logPrint("starting BLAST")
            blast_against_each_genome(dir_path, processors, filter, gene_path, "tblastn", penalty, reward)
        elif gene_path.endswith(".fasta"):
            if "tblastn" == blast:
                logging.logPrint("using tblastn")
                # genes.pep is used as the query set in the calls below
                translate_genes(gene_path,)
                try:
                    subprocess.check_call("formatdb -i %s -p F" % gene_path, shell=True)
                except:
                    logging.logPrint("problem encountered with BLAST database")
                    sys.exit()
                blast_against_self(gene_path, "genes.pep", "tmp_blast.out", filter, blast, penalty, reward, processors)
                subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
                ref_scores=parse_self_blast(open("self_blast.out", "U"))
                subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
                logging.logPrint("starting BLAST")
                blast_against_each_genome(dir_path, processors, filter, "genes.pep", blast, penalty, reward)
                os.system("cp genes.pep %s" % start_dir)
            elif "blastn" == blast:
                logging.logPrint("using blastn")
                try:
                    subprocess.check_call("formatdb -i %s -p F" % gene_path, shell=True)
                except:
                    logging.logPrint("BLAST not found")
                    sys.exit()
                blast_against_self(gene_path, gene_path, "tmp_blast.out", filter, blast, penalty, reward, processors)
                subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
                ref_scores=parse_self_blast(open("self_blast.out", "U"))
                subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
                logging.logPrint("starting BLAST")
                blast_against_each_genome(dir_path, processors, filter, gene_path, blast, penalty, reward)
            elif "blat" == blast:
                logging.logPrint("using blat")
                blat_against_self(gene_path, gene_path, "tmp_blast.out", processors)
                subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
                ref_scores=parse_self_blast(open("self_blast.out", "U"))
                subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
                logging.logPrint("starting BLAT")
                blat_against_each_genome(dir_path,gene_path,processors)
            else:
                pass
        else:
            print "input file format not supported"
            sys.exit()
    # --- report parsing and table construction -----------------------------
    if blast=="blat":
        logging.logPrint("BLAT done")
    else:
        logging.logPrint("BLAST done")
    parse_blast_report()
    get_unique_lines()
    if debug_table == "old":
        make_table(processors, "F", clusters)
    elif debug_table == "new":
        curr_dir=os.getcwd()
        table_files = glob.glob(os.path.join(curr_dir, "*.filtered.unique"))
        files_and_temp_names = [(str(idx), os.path.join(curr_dir, f)) for idx, f in enumerate(table_files)]
        names=[]
        # Worker run through p_func.pmap; it appends to ``names`` defined
        # just above.
        def _perform_workflow(data):
            tn, f = data
            name=make_table_dev(f, "F", clusters)
            names.append(name)
        results = set(p_func.pmap(_perform_workflow, files_and_temp_names, num_workers=processors))
        nr_sorted=sorted(clusters)
        # ref.list gets a blank first line, then the sorted cluster ids
        open("ref.list", "a").write("\n")
        for x in nr_sorted:
            open("ref.list", "a").write("%s\n" % x)
        names_out = open("names.txt", "w")
        # Flatten the per-sample results into a single list
        names_redux = [val for subl in names for val in subl]
        for x in names_redux:
            print >> names_out, "".join(x)
        names_out.close()
    else:
        print "incorrect debug option selected, exiting"
        sys.exit()
    # --- assemble the matrix and normalise it by the self scores -----------
    subprocess.check_call("paste ref.list *.matrix > bsr_matrix", shell=True)
    divide_values("bsr_matrix", ref_scores)
    subprocess.check_call("paste ref.list BSR_matrix_values.txt > %s/bsr_matrix_values.txt" % start_dir, shell=True)
    if "T" in f_plog:
        filter_paralogs("%s/bsr_matrix_values.txt" % start_dir, "paralog_ids.txt")
        os.system("cp bsr_matrix_values_filtered.txt %s" % start_dir)
    else:
        pass
    # Best-effort copy: a failing cp is ignored and its stderr is discarded
    try:
        subprocess.check_call("cp names.txt consensus.pep consensus.fasta duplicate_ids.txt paralog_ids.txt %s" % start_dir, shell=True, stderr=open(os.devnull, 'w'))
    except:
        sys.exc_clear()
    logging.logPrint("all Done")
    # --- clean up the working directory unless asked to keep it ------------
    os.chdir("%s" % dir_path)
    if "T" == keep:
        pass
    else:
        os.system("rm -rf joined")
def _perform_workflow(data):
    """Process one sample from reads to a temporary matrix.

    idx is the sample name, f is the file dictionary.

    ``data`` unpacks to ``(idx, f)``; ``f[0]`` (and ``f[1]`` when there is
    more than one entry) are the read files.  Nothing is done when
    ``<idx>.tmp.xyx.matrix`` already exists.  Otherwise the reads are
    trimmed with Trimmomatic (or hard-linked when trimming is off), aligned
    with run_bwa, given read groups, genotyped with run_gatk, optionally
    run through DepthOfCoverage, and finally passed to process_vcf and
    make_temp_matrix.

    Reads from the enclosing scope: trim, trim_path, wgfast_path, reference,
    processors, picard, gatk, tmp_dir, doc, lock, ap, ref_coords, coverage,
    proportion and matrix.

    NOTE(review): recovered from a whitespace-collapsed source; the nesting
    below is a reconstruction - confirm it against the upstream file.
    """
    idx, f = data
    # An existing temporary matrix means this sample was already processed
    if os.path.isfile("%s.tmp.xyx.matrix" % idx):
        pass
    else:
        if len(f)>1:
            if "T" in trim:
                """paired end sequences - Hardcoded the number of processors per job to 2"""
                # NOTE(review): the two unpaired outputs carry no sample
                # prefix, so they look shared by all samples - confirm.
                args=['java','-jar','%s' % trim_path,'PE', '-threads', '2', '%s' % f[0], '%s' % f[1], '%s.F.paired.fastq.gz' % idx, 'F.unpaired.fastq.gz', '%s.R.paired.fastq.gz' % idx, 'R.unpaired.fastq.gz', 'ILLUMINACLIP:%s/bin/illumina_adapters_all.fasta:2:30:10' % wgfast_path, 'MINLEN:%s' % int(get_sequence_length(f[0])/2)]
                try:
                    vcf_fh = open('%s.trimmomatic.out' % idx, 'w')
                except:
                    log_isg.logPrint('could not open trimmomatic file')
                try:
                    log_fh = open('%s.trimmomatic.log' % idx, 'w')
                except:
                    log_isg.logPrint('could not open log file')
                # Skip trimming when the trimmed forward reads already exist
                if os.path.isfile("%s.F.paired.fastq.gz" % idx):
                    pass
                else:
                    try:
                        # stderr goes to the .out file and stdout to the .log file
                        trim_cmd = Popen(args, stderr=vcf_fh, stdout=log_fh)
                        trim_cmd.wait()
                    except:
                        log_isg.logPrint('problem enountered trying to run trimmomatic')
            else:
                # No trimming: expose the raw reads under the trimmed names
                os.link(f[0], "%s.F.paired.fastq.gz" % idx)
                os.link(f[1], "%s.R.paired.fastq.gz" % idx)
            if os.path.isfile("%s_renamed_header.bam" % idx):
                pass
            else:
                run_bwa(reference, '%s.F.paired.fastq.gz' % idx, '%s.R.paired.fastq.gz' % idx, processors, idx)
        else:
            if "T" in trim:
                """single end support"""
                args=['java','-jar','%s' % trim_path,'SE', '-threads', '2', '%s' % f[0], '%s.single.fastq.gz' % idx, 'ILLUMINACLIP:%s/bin/illumina_adapters_all.fasta:2:30:10' % wgfast_path, 'MINLEN:%s' % int(get_sequence_length(f[0])/2)]
                try:
                    vcf_fh = open('%s.trimmomatic.out' % idx, 'w')
                except:
                    log_isg.logPrint('could not open trimmomatic file')
                try:
                    log_fh = open('%s.trimmomatic.log' % idx, 'w')
                except:
                    log_isg.logPrint('could not open log file')
                # Skip trimming when the trimmed reads already exist
                if os.path.isfile("%s.single.fastq.gz" % idx):
                    pass
                else:
                    try:
                        # stderr goes to the .out file and stdout to the .log file
                        trim_cmd = Popen(args, stderr=vcf_fh, stdout=log_fh)
                        trim_cmd.wait()
                    except:
                        log_isg.logPrint("problem encountered with trimmomatic")
            else:
                os.link(f[0], "%s.single.fastq.gz" % idx)
            if os.path.isfile("%s_renamed_header.bam" % idx):
                pass
            else:
                # "NULL" is passed in place of a second read file
                run_bwa(reference, '%s.single.fastq.gz' % idx, "NULL", processors, idx)
        # The read-group BAM is only rebuilt when it does not exist yet
        if os.path.isfile("%s_renamed_header.bam" % idx):
            pass
        else:
            process_sam("%s.sam" % idx, idx)
            """inserts read group information, required by new versions of GATK"""
            os.system("java -jar %s INPUT=%s.bam OUTPUT=%s_renamed_header.bam SORT_ORDER=coordinate RGID=%s RGLB=%s RGPL=illumina RGSM=%s RGPU=name CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT > /dev/null 2>&1" % (picard,idx,idx,idx,idx,idx))
            os.system("samtools index %s_renamed_header.bam > /dev/null 2>&1" % idx)
        run_gatk(reference, processors, idx, gatk, tmp_dir)
        if "T" == doc:
            # The depth-of-coverage run is done while holding ``lock``.
            # NOTE(review): the lock is not released if a call in between
            # raises - confirm whether try/finally is wanted here.
            lock.acquire()
            os.system("echo %s_renamed_header.bam > %s.bam.list" % (idx,idx))
            os.system("java -Djava.io.tmpdir=%s -jar %s -R %s/scratch/reference.fasta -T DepthOfCoverage -o %s_coverage -I %s.bam.list -rf BadCigar > /dev/null 2>&1" % (tmp_dir,gatk,ap,idx,idx))
            lock.release()
            process_coverage(idx)
        else:
            pass
        process_vcf("%s.vcf.out" % idx, ref_coords, coverage, proportion, idx)
        make_temp_matrix("%s.filtered.vcf" % idx, matrix, idx)
def main(directory,id,filter,processors,genes,cluster_method,blast,length,
         max_plog,min_hlog,f_plog,keep,filter_peps,filter_scaffolds,prefix,temp_dir,min_pep_length,debug):
    """Run the LS-BSR workflow and write the BSR matrix plus companion files.

    Arguments, as they are used below:
      directory        -- directory searched for ``*.fasta`` genomes.
      id               -- identity threshold handed to the clustering tools.
      filter           -- passed through to the BLAST helpers.
      processors       -- passed through to the parallel helpers.
      genes            -- contains "null": predict genes with Prodigal and
                          cluster them; otherwise a ``.pep``/``.fasta`` file.
      cluster_method   -- "usearch", "vsearch" or "cd-hit" ("NULL" exits).
      blast            -- "tblastn", "blastn" or "blat".
      length, max_plog, min_hlog -- passed to ``find_dups_dev``.
      f_plog           -- contains "T": also write a paralog-filtered matrix.
      keep             -- "T" keeps the working directory.
      filter_peps      -- "T" runs ``filter_seqs`` on the translated peptides.
      filter_scaffolds -- "T" runs the scaffold filter on predicted genes.
      prefix           -- output prefix; "NULL" uses a timestamp instead.
      temp_dir         -- working directory; "NULL" creates a temporary one.
      min_pep_length   -- passed to ``translate_genes``.
      debug            -- passed to ``new_loop``.

    Exits through ``sys.exit()`` when a required program or input is missing.
    """
    start_dir = os.getcwd()
    ap = os.path.abspath("%s" % start_dir)
    dir_path = os.path.abspath("%s" % directory)
    # The filter_scaffolds argument ("T"/"F") hides the helper of the same
    # name inside this function, so calling it here used to call a string.
    # NOTE(review): assumes the helper is bound at module level -- confirm.
    scaffold_filter = globals().get("filter_scaffolds")
    logging.logPrint("Testing paths of dependencies")
    if blast == "blastn" or blast == "tblastn":
        if subprocess.call(['which', 'blastn']) == 0:
            print("citation: Altschul SF, Madden TL, Schaffer AA, Zhang J, Zhang Z, Miller W, and Lipman DJ. 1997. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. Nucleic Acids Res 25:3389-3402")
        else:
            print("blastn isn't in your path, but needs to be!")
            sys.exit()
    if "NULL" in temp_dir:
        fastadir = tempfile.mkdtemp()
    else:
        fastadir = os.path.abspath("%s" % temp_dir)
        if os.path.exists('%s' % temp_dir):
            print("old run directory exists in your genomes directory (%s). Delete and run again" % temp_dir)
            sys.exit()
        else:
            os.makedirs('%s' % temp_dir)
    for infile in glob.glob(os.path.join(dir_path, '*.fasta')):
        name = get_seq_name(infile)
        os.link("%s" % infile, "%s/%s.new" % (fastadir,name))
    if "null" in genes:
        if subprocess.call(['which', 'prodigal']) != 0:
            print("prodigal is not in your path, but needs to be!")
            sys.exit()
        print("citation: Hyatt D, Chen GL, Locascio PF, Land ML, Larimer FW, and Hauser LJ. 2010. Prodigal: prokaryotic gene recognition and translation initiation site identification. BMC Bioinformatics 11:119")
        if "usearch" in cluster_method:
            print("citation: Edgar RC. 2010. Search and clustering orders of magnitude faster than BLAST. Bioinformatics 26:2460-2461")
        elif "cd-hit" in cluster_method:
            print("citation: Li, W., Godzik, A. 2006. Cd-hit: a fast program for clustering and comparing large sets of protein or nuceltodie sequences. Bioinformatics 22(13):1658-1659")
        elif "vsearch" in cluster_method:
            print("citation: Rognes, T., Flouri, T., Nichols, B., Qunice, C., Mahe, Frederic. 2016. VSEARCH: a versatile open source tool for metagenomics. PeerJ Preprints. DOI: https://doi.org/10.7287/peerj.preprints.2409v1")
        if blast == "blat":
            if subprocess.call(['which', 'blat']) == 0:
                print("citation: W.James Kent. 2002. BLAT - The BLAST-Like Alignment Tool. Genome Research 12:656-664")
            else:
                print("You have requested blat, but it is not in your PATH")
                sys.exit()
        logging.logPrint("predicting genes with Prodigal")
        predict_genes_dev(fastadir, processors)
        logging.logPrint("Prodigal done")
        # This function produces locus tags
        genbank_hits = process_genbank_files(dir_path)
        has_genbank = genbank_hits is not None and len(genbank_hits) > 0
        if has_genbank:
            logging.logPrint("Converting genbank files")
        # First combine all of the prodigal files into one file
        os.system("cat *genes.seqs > all_gene_seqs.out")
        if filter_scaffolds == "T":
            scaffold_filter("all_gene_seqs.out")
            os.system("mv tmp.out all_gene_seqs.out")
        if has_genbank:
            # This combines the locus tags with the Prodigal prediction
            os.system("cat *locus_tags.fasta all_gene_seqs.out > tmp.out")
            os.system("mv tmp.out all_gene_seqs.out")
            # I also need to convert the GenBank file to a FASTA file
            for hit in genbank_hits:
                reduced_hit = hit.replace(".gbk","")
                SeqIO.convert("%s/%s" % (dir_path, hit), "genbank", "%s.fasta.new" % reduced_hit, "fasta")
        if "NULL" in cluster_method:
            print("Clustering chosen, but no method selected...exiting")
            sys.exit()
        elif "usearch" in cluster_method:
            if subprocess.call(['which', 'usearch']) == 0:
                os.system("mkdir split_files")
                os.system("cp all_gene_seqs.out split_files/all_sorted.txt")
                os.chdir("split_files/")
                logging.logPrint("Splitting FASTA file for use with USEARCH")
                split_files("all_sorted.txt")
                logging.logPrint("clustering with USEARCH at an ID of %s" % id)
                run_usearch_dev(id,4)
                os.system("cat *.usearch.out > all_sorted.txt")
                os.system("mv all_sorted.txt %s" % fastadir)
                os.chdir("%s" % fastadir)
                uclust_cluster(id)
                logging.logPrint("USEARCH clustering finished")
            else:
                print("usearch must be in your path as usearch...exiting")
                sys.exit()
        elif "vsearch" in cluster_method:
            if subprocess.call(['which', 'vsearch']) == 0:
                logging.logPrint("clustering with VSEARCH at an ID of %s, using %s processors" % (id,processors))
                run_vsearch(id, processors)
                os.system("mv vsearch.out consensus.fasta")
                logging.logPrint("VSEARCH clustering finished")
            else:
                print("vsearch must be in your path as vsearch...exiting")
                sys.exit()
        elif "cd-hit" in cluster_method:
            if subprocess.call(['which', 'cd-hit-est']) == 0:
                logging.logPrint("clustering with cd-hit at an ID of %s, using %s processors" % (id,processors))
                subprocess.check_call("cd-hit-est -i all_gene_seqs.out -o consensus.fasta -M 0 -T %s -c %s > /dev/null 2>&1" % (processors, id), shell=True)
            else:
                print("cd-hit must be in your path as cd-hit-est...exiting")
                sys.exit()
        # need to check for dups here
        dup_ids = test_duplicate_header_ids("consensus.fasta")
        if dup_ids == "False":
            print("duplicate headers identified, renaming..")
            rename_fasta_header("consensus.fasta", "tmp.txt")
            os.system("mv tmp.txt consensus.fasta")
        if "tblastn" == blast:
            subprocess.check_call("makeblastdb -in consensus.fasta -dbtype nucl > /dev/null 2>&1", shell=True)
            translate_genes("consensus.fasta","tmp.pep",min_pep_length)
            if filter_peps == "T":
                filter_seqs("tmp.pep")
                os.system("rm tmp.pep")
            else:
                os.system("mv tmp.pep consensus.pep")
            clusters = get_cluster_ids("consensus.pep")
            blast_against_self_tblastn("tblastn", "consensus.fasta", "consensus.pep", "tmp_blast.out", processors, filter)
        elif "blastn" == blast:
            subprocess.check_call("makeblastdb -in consensus.fasta -dbtype nucl > /dev/null 2>&1", shell=True)
            blast_against_self_blastn("blastn", "consensus.fasta", "consensus.fasta", "tmp_blast.out", filter, processors)
            clusters = get_cluster_ids("consensus.fasta")
        elif "blat" == blast:
            blat_against_self("consensus.fasta", "consensus.fasta", "tmp_blast.out", processors)
            clusters = get_cluster_ids("consensus.fasta")
        subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
        ref_scores = parse_self_blast(open("self_blast.out", "U"))
        os.system("cp tmp_blast.out ref.scores")
        subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
        os.system("rm *new_genes.*")
        if blast == "tblastn" or blast == "blastn":
            logging.logPrint("starting BLAST")
        else:
            logging.logPrint("starting BLAT")
        if "tblastn" == blast:
            blast_against_each_genome_tblastn(processors, "consensus.pep", filter)
        elif "blastn" == blast:
            blast_against_each_genome_blastn(processors, filter, "consensus.fasta")
        elif "blat" == blast:
            blat_against_each_genome("consensus.fasta",processors)
    else:
        logging.logPrint("Using pre-compiled set of predicted genes")
        files = glob.glob(os.path.join(dir_path, "*.fasta"))
        if len(files) == 0:
            print("no usable reference genomes found!")
            sys.exit()
        gene_path = os.path.abspath("%s" % genes)
        dup_ids = test_duplicate_header_ids(gene_path)
        if dup_ids == "False":
            print("duplicate headers identified, exiting..")
            sys.exit()
        clusters = get_cluster_ids(gene_path)
        os.system("cp %s %s" % (gene_path,fastadir))
        os.chdir("%s" % fastadir)
        if gene_path.endswith(".pep"):
            logging.logPrint("using tblastn on peptides")
            try:
                subprocess.check_call("makeblastdb -in %s -dbtype prot > /dev/null 2>&1" % gene_path, shell=True)
            except Exception:
                logging.logPrint("problem encountered with BLAST database")
                sys.exit()
            blast_against_self_tblastn("blastp", gene_path, gene_path, "tmp_blast.out", processors, filter)
            # The self-BLAST scores are extracted by the shared step below.
            # This branch used to sort and delete tmp_blast.out itself, which
            # made that shared "sort" fail on the missing file.
            logging.logPrint("starting BLAST")
            blast_against_each_genome_tblastn(processors, gene_path, filter)
        elif gene_path.endswith(".fasta"):
            if "tblastn" == blast:
                logging.logPrint("using tblastn")
                translate_genes(gene_path,"genes.pep",min_pep_length)
                try:
                    subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % gene_path, shell=True)
                except Exception:
                    logging.logPrint("problem encountered with BLAST database")
                    sys.exit()
                blast_against_self_tblastn("tblastn", gene_path, "genes.pep", "tmp_blast.out", processors, filter)
                logging.logPrint("starting BLAST")
                blast_against_each_genome_tblastn(processors, "genes.pep", filter)
                os.system("cp genes.pep %s" % start_dir)
            elif "blastn" == blast:
                logging.logPrint("using blastn")
                try:
                    subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % gene_path, shell=True)
                except Exception:
                    logging.logPrint("Database not formatted correctly...exiting")
                    sys.exit()
                try:
                    blast_against_self_blastn("blastn", gene_path, gene_path, "tmp_blast.out", filter, processors)
                except Exception:
                    print("problem with blastn, exiting")
                    sys.exit()
                logging.logPrint("starting BLAST")
                try:
                    blast_against_each_genome_blastn(processors, filter, gene_path)
                except Exception:
                    print("problem with blastn, exiting")
                    sys.exit()
            elif "blat" == blast:
                logging.logPrint("using blat")
                blat_against_self(gene_path, gene_path, "tmp_blast.out", processors)
                logging.logPrint("starting BLAT")
                blat_against_each_genome(gene_path,processors)
        else:
            print("input file format not supported")
            sys.exit()
        # shared by every pre-compiled branch: extract the self-hit scores
        subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
        os.system("cp self_blast.out ref.scores")
        ref_scores = parse_self_blast(open("self_blast.out", "U"))
        subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
    find_dups_dev(ref_scores, length, max_plog, min_hlog, clusters, processors)
    if blast == "blat":
        logging.logPrint("BLAT done")
    else:
        logging.logPrint("BLAST done")
    parse_blast_report("false")
    get_unique_lines()
    curr_dir = os.getcwd()
    table_files = glob.glob(os.path.join(curr_dir, "*.filtered.unique"))
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f)) for idx, f in enumerate(table_files)]
    nr_sorted = sorted(clusters)
    # first row of the matrix: a blank corner cell followed by the sorted ids
    table_list = [[" "] + nr_sorted]
    logging.logPrint("starting matrix building")
    new_names,new_table = new_loop(files_and_temp_names, processors, clusters, debug)
    new_table_list = table_list+new_table
    logging.logPrint("matrix built")
    # one handle for the whole list instead of a leaked handle per line
    with open("ref.list", "a") as ref_list:
        ref_list.write("\n")
        for x in nr_sorted:
            ref_list.write("%s\n" % x)
    names_out = open("names.txt", "w")
    names_redux = [val for subl in new_names for val in subl]
    for x in names_redux:
        names_out.write("".join(x)+"\n")
    names_out.close()
    create_bsr_matrix_dev(new_table_list)
    divide_values("bsr_matrix", ref_scores)
    subprocess.check_call("paste ref.list BSR_matrix_values.txt > %s/bsr_matrix_values.txt" % start_dir, shell=True)
    devnull = open(os.devnull, 'w')
    try:
        subprocess.check_call("cp dup_matrix.txt names.txt consensus.pep duplicate_ids.txt consensus.fasta paralog_ids.txt %s" % ap, shell=True, stderr=devnull)
    except Exception:
        # best effort: not every run produces all of these files
        # (sys.exc_clear(), used here before, does not exist on Python 3)
        pass
    finally:
        devnull.close()
    # new code to rename files according to a prefix
    import datetime
    timestamp = datetime.datetime.now()
    rename = str(timestamp.year), str(timestamp.month), str(timestamp.day), str(timestamp.hour), str(timestamp.minute), str(timestamp.second)
    # outputs are labelled with the timestamp unless a prefix was supplied
    if "NULL" in prefix:
        label = "".join(rename)
    else:
        label = prefix
    if "T" in f_plog:
        filter_paralogs("%s/bsr_matrix_values.txt" % start_dir, "paralog_ids.txt")
        os.system("cp bsr_matrix_values_filtered.txt %s/%s_paralogs_filtered_bsr_matrix_values.txt" % (start_dir,label))
    os.chdir("%s" % ap)
    for produced in ("dup_matrix", "names", "duplicate_ids", "paralog_ids"):
        os.system("mv %s.txt %s_%s.txt" % (produced, label, produced))
    os.system("mv bsr_matrix_values.txt %s_bsr_matrix.txt" % label)
    for sequence_file in ("consensus.fasta", "consensus.pep"):
        if os.path.isfile(sequence_file):
            os.system("mv %s %s_%s" % (sequence_file, label, sequence_file))
    run_parameters = (
        ("-d", directory), ("-i", id), ("-f", filter), ("-p", processors),
        ("-g", genes), ("-c", cluster_method), ("-b", blast), ("-l", length),
        ("-m", max_plog), ("-n", min_hlog), ("-t", f_plog), ("-k", keep),
        ("-s", filter_peps), ("-e", filter_scaffolds), ("-x", prefix),
    )
    outfile = open("%s_run_parameters.txt" % label, "w")
    for flag, value in run_parameters:
        outfile.write("%s %s \\\n" % (flag, value))
    outfile.write("-z %s\n" % debug)
    outfile.write("temp data stored here if kept: %s" % fastadir)
    outfile.close()
    logging.logPrint("all Done")
    if "T" != keep:
        os.system("rm -rf %s" % fastadir)
    os.chdir("%s" % ap)
# NOTE(review): these statements read like part of a function body whose
# header is not visible here -- confirm the intended nesting before merging.
# Stage a copy of every genome FASTA under <dir_path>/joined as "<name>.new".
try:
    os.makedirs('%s/joined' % dir_path)
except OSError as e:
    # an already-existing directory is fine; anything else is re-raised
    if e.errno != errno.EEXIST:
        raise
for infile in glob.glob(os.path.join(dir_path, '*.fasta')):
    name = get_seq_name(infile)
    os.system("cp %s %s/joined/%s.new" % (infile,dir_path,name))
if "null" in genes:
    try:
        # only probing that ``usearch`` holds something usable as a path
        os.path.exists(usearch)
    except Exception:
        # (an unreachable sys.exc_clear() used to follow this raise)
        raise TypeError("-u usearch flag must be set for use with prodigal")
    logging.logPrint("predicting genes with Prodigal")
    predict_genes(dir_path, processors)
    logging.logPrint("Prodigal done")
    os.system("cat *genes.seqs > all_gene_seqs.out")
    uclust_sort(usearch)
    rename_fasta_header("tmp_sorted.txt", "all_sorted.txt")
    uclust_cluster(usearch, id)
    translate_consensus("consensus.fasta")
    filter_seqs("tmp.pep")
    subprocess.check_call("formatdb -i consensus.fasta -p F", shell=True)
    blast_against_self("consensus.fasta", "consensus.pep", "tmp_blast.out", filter, blast, penalty, reward)
    # keep one hit per query before extracting the self-hit scores
    subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
    ref_scores = parse_self_blast(open("self_blast.out", "U"))
    subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
    os.system("rm *new_genes.*")
    logging.logPrint("starting BLAST")
def main(matrix,tree,reference,directory,parameters,processors,coverage,proportion,keep,subsample,subnums,doc,tmp_dir,insertion_method,fudge,only_subs,model,trim):
    """Check WG-FAST's external programs, echo the invocation, create scratch.

    Exits through ``sys.exit()`` when ``raxmlHPC-SSE3``, ``samtools`` or
    ``bwa`` cannot be found with ``which``.  Otherwise prints the citations
    and every argument next to its command-line flag, then makes sure
    ``<current directory>/scratch`` exists.

    Written with print() calls and ``except ... as`` so the module parses on
    Python 3 as well as Python 2; the printed output is unchanged.
    """
    ref_path = os.path.abspath("%s" % reference)
    dir_path = os.path.abspath("%s" % directory)
    # check for binary dependencies
    log_isg.logPrint('testing the paths of all dependencies')
    ap = os.path.abspath("%s" % os.getcwd())
    if subprocess.call(['which', 'raxmlHPC-SSE3']) != 0:
        print("RAxML must be in your path as raxmlHPC-SSE3")
        sys.exit()
    print("*citation: 'Stamatakis, A. RAxML version 8: a tool for phylogenetic analysis and post-analysis of large phylogenies. Bioinformatics (2014).'")
    print("*citation: 'Berger SA, Krompass D, Stamatakis A. Performance, accuracy, and Web server for evolutionary placement of short sequence reads under maximum likelihood. Syst Biol. 2011;60(3):291-302'")
    if subprocess.call(['which', 'samtools']) != 0:
        print("samtools must be in your path")
        sys.exit()
    print("*citation: 'Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R, Genome Project Data Processing S. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009;25(16):2078-9'")
    if subprocess.call(['which', 'bwa']) != 0:
        print("bwa must be in your path")
        sys.exit()
    print("*citation: 'Li H. Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. arXivorg. 2013(arXiv:1303.3997 [q-bio.GN])'")
    print("Patristic distances calculated with DendroPy")
    print("*citation: 'Sukumaran J, Holder MT. DendroPy: a Python library for phylogenetic computing. Bioinformatics. 2010;26(12):1569-71. Epub 2010/04/28. doi: 10.1093/bioinformatics/btq228. PubMed PMID: 20421198'")
    print("Also uses GATK for variant calling")
    print("*citation: 'McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA. The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome research. 2010;20(9):1297-303'")
    print("Uses trimmomatic for read trimming")
    print("*citation: Bolger A.M., Lohse M., Usadel B. Trimmomatic: A flexible trimmer for Illumina Sequence Data. Bioinformatics. 2014. Doi:10.1093/bioinformatics/btu170")
    print("Uses BioPython for FASTA parsing")
    print("*citation :C**k PJ, Antao T, Chang JT, Chapman BA, Cox CJ, Dalke A, Friedberg I, Hamelryck T, Kauff F, Wilczynski B, de Hoon MJ. Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics. 2009;25(11):1422-3")
    print("")
    # done checking for dependencies
    log_isg.logPrint('WG-FAST pipeline starting')
    log_isg.logPrint("WG-FAST was invoked with the following parameters:")
    invocation = (
        ("-m", matrix), ("-t", tree), ("-r", reference), ("-d", directory),
        ("-x", parameters), ("-p", processors), ("-c", coverage),
        ("-o", proportion), ("-k", keep), ("-s", subsample), ("-n", subnums),
        ("-g", doc), ("-e", tmp_dir), ("-z", insertion_method), ("-f", fudge),
        ("-y", only_subs), ("-j", model), ("-i", trim),
    )
    # every echoed line except the last ends with a trailing backslash
    for flag, value in invocation[:-1]:
        print("%s %s \\" % (flag, value))
    print("%s %s" % invocation[-1])
    try:
        os.makedirs('%s/scratch' % ap)
    except OSError as e:
        # an already-existing scratch directory is fine
        if e.errno != errno.EEXIST:
            raise
# NOTE(review): the two try blocks read like the tail of a function whose
# header is not visible here -- confirm the intended nesting before merging.
# Package the converted VMware image once the conversion has reported back.
try:
    convertChannel.receive()
    vmWareDir = 'clovr-vmware.%s' % options('general.version')
    runSystemEx('mkdir -p ' + vmWareDir, log=True)
    runSystemEx('mv VMware_conversion/shared/converted_img.vmdk %s' % os.path.join(vmWareDir, 'clovr.9-04.x86-64.%s.vmdk' % options('general.version')))
    runSystemEx('mkdir -p %s %s' % (os.path.join(vmWareDir, 'keys'), os.path.join(vmWareDir, 'user_data')), log=True)
    runSystemEx('cp -rv /usr/local/projects/clovr/shared ' + vmWareDir, log=True)
    # both files are now closed (and the output flushed) as soon as the copy
    # is done; previously neither handle was ever closed
    with open(os.path.join(vmWareDir, 'start_clovr.vmx'), 'w') as fout:
        clovrConf = config.configFromMap(dict(version=options('general.version')))
        with open('/usr/local/projects/clovr/start_clovr.vmx') as template:
            for line in template:
                fout.write(config.replaceStr(line, clovrConf))
except Exception as err:
    errorPrint('Converting image failed. Error message:')
    errorPrint(str(err))
# Collect the AMI id from the bundling step; a failure is reported, not raised.
try:
    amiId = bundleChannel.receive()
    logPrint('AMI: ' + amiId)
except Exception as err:
    amiId = None
    errorPrint('Bundling AMI failed for some reason. Error message:')
    errorPrint(str(err))

if __name__ == '__main__':
    main(*buildConfigN(OPTIONS))
def main(directory, id, filter, processors, genes, cluster_method, blast,
         length, max_plog, min_hlog, f_plog, keep, filter_peps,
         filter_scaffolds, prefix, temp_dir, debug):
    """Run the LS-BSR workflow and write the BSR matrix plus companion files.

    Arguments, as they are used below:
      directory        -- directory searched for ``*.fasta`` genomes.
      id               -- identity threshold handed to the clustering tools.
      filter           -- passed through to the BLAST helpers.
      processors       -- passed through to the parallel helpers.
      genes            -- contains "null": predict genes with Prodigal and
                          cluster them; otherwise a ``.pep``/``.fasta`` file.
      cluster_method   -- "usearch", "vsearch" or "cd-hit" ("NULL" exits).
      blast            -- "tblastn", "blastn" or "blat".
      length, max_plog, min_hlog -- passed to ``find_dups_dev``.
      f_plog           -- contains "T": also write a paralog-filtered matrix.
      keep             -- "T" keeps the working directory.
      filter_peps      -- "T" runs ``filter_seqs`` on the translated peptides.
      filter_scaffolds -- "T" runs the scaffold filter on predicted genes.
      prefix           -- output prefix; "NULL" uses a timestamp instead.
      temp_dir         -- working directory; "NULL" creates a temporary one.
      debug            -- passed to ``new_loop``.

    Exits through ``sys.exit()`` when a required program or input is missing.
    Written with print()/write() calls so the module parses on Python 3 as
    well as Python 2; the emitted text is unchanged.
    """
    start_dir = os.getcwd()
    ap = os.path.abspath("%s" % start_dir)
    dir_path = os.path.abspath("%s" % directory)
    # The filter_scaffolds argument ("T"/"F") hides the helper of the same
    # name inside this function, so calling it here used to call a string.
    # NOTE(review): assumes the helper is bound at module level -- confirm.
    scaffold_filter = globals().get("filter_scaffolds")
    logging.logPrint("Testing paths of dependencies")
    if blast == "blastn" or blast == "tblastn":
        if subprocess.call(['which', 'blastn']) == 0:
            print("citation: Altschul SF, Madden TL, Schaffer AA, Zhang J, Zhang Z, Miller W, and Lipman DJ. 1997. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. Nucleic Acids Res 25:3389-3402")
        else:
            print("blastn isn't in your path, but needs to be!")
            sys.exit()
    if "NULL" in temp_dir:
        fastadir = tempfile.mkdtemp()
    else:
        fastadir = os.path.abspath("%s" % temp_dir)
        if os.path.exists('%s' % temp_dir):
            print("old run directory exists in your genomes directory (%s). Delete and run again" % temp_dir)
            sys.exit()
        else:
            os.makedirs('%s' % temp_dir)
    for infile in glob.glob(os.path.join(dir_path, '*.fasta')):
        name = get_seq_name(infile)
        os.link("%s" % infile, "%s/%s.new" % (fastadir, name))
    if "null" in genes:
        if subprocess.call(['which', 'prodigal']) != 0:
            print("prodigal is not in your path, but needs to be!")
            sys.exit()
        print("citation: Hyatt D, Chen GL, Locascio PF, Land ML, Larimer FW, and Hauser LJ. 2010. Prodigal: prokaryotic gene recognition and translation initiation site identification. BMC Bioinformatics 11:119")
        if "usearch" in cluster_method:
            print("citation: Edgar RC. 2010. Search and clustering orders of magnitude faster than BLAST. Bioinformatics 26:2460-2461")
        elif "cd-hit" in cluster_method:
            print("citation: Li, W., Godzik, A. 2006. Cd-hit: a fast program for clustering and comparing large sets of protein or nuceltodie sequences. Bioinformatics 22(13):1658-1659")
        elif "vsearch" in cluster_method:
            print("citation: Rognes, T., Flouri, T., Nichols, B., Qunice, C., Mahe, Frederic. 2016. VSEARCH: a versatile open source tool for metagenomics. PeerJ Preprints. DOI: https://doi.org/10.7287/peerj.preprints.2409v1")
        if blast == "blat":
            if subprocess.call(['which', 'blat']) == 0:
                print("citation: W.James Kent. 2002. BLAT - The BLAST-Like Alignment Tool. Genome Research 12:656-664")
            else:
                print("You have requested blat, but it is not in your PATH")
                sys.exit()
        logging.logPrint("predicting genes with Prodigal")
        predict_genes(fastadir, processors)
        logging.logPrint("Prodigal done")
        # This function produces locus tags
        genbank_hits = process_genbank_files(dir_path)
        has_genbank = genbank_hits is not None and len(genbank_hits) > 0
        if has_genbank:
            logging.logPrint("Converting genbank files")
        # First combine all of the prodigal files into one file
        os.system("cat *genes.seqs > all_gene_seqs.out")
        if filter_scaffolds == "T":
            scaffold_filter("all_gene_seqs.out")
            os.system("mv tmp.out all_gene_seqs.out")
        if has_genbank:
            # This combines the locus tags with the Prodigal prediction
            os.system("cat *locus_tags.fasta all_gene_seqs.out > tmp.out")
            os.system("mv tmp.out all_gene_seqs.out")
            # I also need to convert the GenBank file to a FASTA file
            for hit in genbank_hits:
                reduced_hit = hit.replace(".gbk", "")
                SeqIO.convert("%s/%s" % (dir_path, hit), "genbank", "%s.fasta.new" % reduced_hit, "fasta")
        if "NULL" in cluster_method:
            print("Clustering chosen, but no method selected...exiting")
            sys.exit()
        elif "usearch" in cluster_method:
            if subprocess.call(['which', 'usearch']) == 0:
                os.system("mkdir split_files")
                os.system("cp all_gene_seqs.out split_files/all_sorted.txt")
                os.chdir("split_files/")
                logging.logPrint("Splitting FASTA file for use with USEARCH")
                split_files("all_sorted.txt")
                logging.logPrint("clustering with USEARCH at an ID of %s" % id)
                run_usearch(id)
                os.system("cat *.usearch.out > all_sorted.txt")
                os.system("mv all_sorted.txt %s" % fastadir)
                os.chdir("%s" % fastadir)
                uclust_cluster(id)
                logging.logPrint("USEARCH clustering finished")
            else:
                print("usearch must be in your path as usearch...exiting")
                sys.exit()
        elif "vsearch" in cluster_method:
            if subprocess.call(['which', 'vsearch']) == 0:
                logging.logPrint("clustering with VSEARCH at an ID of %s, using %s processors" % (id, processors))
                run_vsearch(id, processors)
                os.system("mv vsearch.out consensus.fasta")
                logging.logPrint("VSEARCH clustering finished")
            else:
                print("vsearch must be in your path as vsearch...exiting")
                sys.exit()
        elif "cd-hit" in cluster_method:
            if subprocess.call(['which', 'cd-hit-est']) == 0:
                logging.logPrint("clustering with cd-hit at an ID of %s, using %s processors" % (id, processors))
                subprocess.check_call("cd-hit-est -i all_gene_seqs.out -o consensus.fasta -M 0 -T %s -c %s > /dev/null 2>&1" % (processors, id), shell=True)
            else:
                print("cd-hit must be in your path as cd-hit-est...exiting")
                sys.exit()
        # need to check for dups here
        dup_ids = test_duplicate_header_ids("consensus.fasta")
        if dup_ids == "False":
            print("duplicate headers identified, renaming..")
            rename_fasta_header("consensus.fasta", "tmp.txt")
            os.system("mv tmp.txt consensus.fasta")
        if "tblastn" == blast:
            subprocess.check_call("makeblastdb -in consensus.fasta -dbtype nucl > /dev/null 2>&1", shell=True)
            translate_consensus("consensus.fasta")
            if filter_peps == "T":
                filter_seqs("tmp.pep")
                os.system("rm tmp.pep")
            else:
                os.system("mv tmp.pep consensus.pep")
            clusters = get_cluster_ids("consensus.pep")
            blast_against_self_tblastn("tblastn", "consensus.fasta", "consensus.pep", "tmp_blast.out", processors, filter)
        elif "blastn" == blast:
            subprocess.check_call("makeblastdb -in consensus.fasta -dbtype nucl > /dev/null 2>&1", shell=True)
            blast_against_self_blastn("blastn", "consensus.fasta", "consensus.fasta", "tmp_blast.out", filter, processors)
            clusters = get_cluster_ids("consensus.fasta")
        elif "blat" == blast:
            blat_against_self("consensus.fasta", "consensus.fasta", "tmp_blast.out", processors)
            clusters = get_cluster_ids("consensus.fasta")
        subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
        ref_scores = parse_self_blast(open("self_blast.out", "U"))
        subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
        os.system("rm *new_genes.*")
        if blast == "tblastn" or blast == "blastn":
            logging.logPrint("starting BLAST")
        else:
            logging.logPrint("starting BLAT")
        if "tblastn" == blast:
            blast_against_each_genome_tblastn(dir_path, processors, "consensus.pep", filter)
        elif "blastn" == blast:
            blast_against_each_genome_blastn(dir_path, processors, filter, "consensus.fasta")
        elif "blat" == blast:
            blat_against_each_genome(dir_path, "consensus.fasta", processors)
    else:
        logging.logPrint("Using pre-compiled set of predicted genes")
        files = glob.glob(os.path.join(dir_path, "*.fasta"))
        if len(files) == 0:
            print("no usable reference genomes found!")
            sys.exit()
        gene_path = os.path.abspath("%s" % genes)
        dup_ids = test_duplicate_header_ids(gene_path)
        if dup_ids == "False":
            print("duplicate headers identified, exiting..")
            sys.exit()
        clusters = get_cluster_ids(gene_path)
        os.system("cp %s %s" % (gene_path, fastadir))
        os.chdir("%s" % fastadir)
        if gene_path.endswith(".pep"):
            logging.logPrint("using tblastn on peptides")
            try:
                subprocess.check_call("makeblastdb -in %s -dbtype prot > /dev/null 2>&1" % gene_path, shell=True)
            except Exception:
                logging.logPrint("problem encountered with BLAST database")
                sys.exit()
            blast_against_self_tblastn("blastp", gene_path, gene_path, "tmp_blast.out", processors, filter)
            subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
            ref_scores = parse_self_blast(open("self_blast.out", "U"))
            subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
            logging.logPrint("starting BLAST")
            blast_against_each_genome_tblastn(dir_path, processors, gene_path, filter)
        elif gene_path.endswith(".fasta"):
            if "tblastn" == blast:
                logging.logPrint("using tblastn")
                translate_genes(gene_path)
                try:
                    subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % gene_path, shell=True)
                except Exception:
                    logging.logPrint("problem encountered with BLAST database")
                    sys.exit()
                blast_against_self_tblastn("tblastn", gene_path, "genes.pep", "tmp_blast.out", processors, filter)
                subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
                ref_scores = parse_self_blast(open("self_blast.out", "U"))
                subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
                logging.logPrint("starting BLAST")
                blast_against_each_genome_tblastn(dir_path, processors, "genes.pep", filter)
                os.system("cp genes.pep %s" % start_dir)
            elif "blastn" == blast:
                logging.logPrint("using blastn")
                try:
                    subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % gene_path, shell=True)
                except Exception:
                    logging.logPrint("Database not formatted correctly...exiting")
                    sys.exit()
                try:
                    blast_against_self_blastn("blastn", gene_path, gene_path, "tmp_blast.out", filter, processors)
                except Exception:
                    print("problem with blastn, exiting")
                    sys.exit()
                subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
                os.system("cp self_blast.out tmp.out")
                ref_scores = parse_self_blast(open("self_blast.out", "U"))
                subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
                logging.logPrint("starting BLAST")
                try:
                    blast_against_each_genome_blastn(dir_path, processors, filter, gene_path)
                except Exception:
                    print("problem with blastn, exiting")
                    sys.exit()
            elif "blat" == blast:
                logging.logPrint("using blat")
                blat_against_self(gene_path, gene_path, "tmp_blast.out", processors)
                subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
                ref_scores = parse_self_blast(open("self_blast.out", "U"))
                subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
                logging.logPrint("starting BLAT")
                blat_against_each_genome(dir_path, gene_path, processors)
        else:
            print("input file format not supported")
            sys.exit()
    find_dups_dev(ref_scores, length, max_plog, min_hlog, clusters, processors)
    if blast == "blat":
        logging.logPrint("BLAT done")
    else:
        logging.logPrint("BLAST done")
    parse_blast_report("false")
    get_unique_lines()
    curr_dir = os.getcwd()
    table_files = glob.glob(os.path.join(curr_dir, "*.filtered.unique"))
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f)) for idx, f in enumerate(table_files)]
    nr_sorted = sorted(clusters)
    # first row of the matrix: a blank corner cell followed by the sorted ids
    table_list = [[" "] + nr_sorted]
    logging.logPrint("starting matrix building")
    new_names, new_table = new_loop(files_and_temp_names, processors, clusters, debug)
    new_table_list = table_list + new_table
    logging.logPrint("matrix built")
    # one handle for the whole list instead of a leaked handle per line
    with open("ref.list", "a") as ref_list:
        ref_list.write("\n")
        for x in nr_sorted:
            ref_list.write("%s\n" % x)
    names_out = open("names.txt", "w")
    names_redux = [val for subl in new_names for val in subl]
    for x in names_redux:
        names_out.write("".join(x) + "\n")
    names_out.close()
    create_bsr_matrix_dev(new_table_list)
    divide_values("bsr_matrix", ref_scores)
    subprocess.check_call("paste ref.list BSR_matrix_values.txt > %s/bsr_matrix_values.txt" % start_dir, shell=True)
    if "T" in f_plog:
        filter_paralogs("%s/bsr_matrix_values.txt" % start_dir, "paralog_ids.txt")
        os.system("cp bsr_matrix_values_filtered.txt %s" % start_dir)
    devnull = open(os.devnull, 'w')
    try:
        subprocess.check_call("cp dup_matrix.txt names.txt consensus.pep consensus.fasta duplicate_ids.txt paralog_ids.txt %s" % ap, shell=True, stderr=devnull)
    except Exception:
        # best effort: not every run produces all of these files
        # (sys.exc_clear(), used here before, does not exist on Python 3)
        pass
    finally:
        devnull.close()
    # new code to rename files according to a prefix
    import datetime
    timestamp = datetime.datetime.now()
    rename = str(timestamp.year), str(timestamp.month), str(timestamp.day), str(timestamp.hour), str(timestamp.minute), str(timestamp.second)
    # outputs are labelled with the timestamp unless a prefix was supplied
    if "NULL" in prefix:
        label = "".join(rename)
    else:
        label = prefix
    os.chdir("%s" % ap)
    for produced in ("dup_matrix", "names", "duplicate_ids", "paralog_ids"):
        os.system("mv %s.txt %s_%s.txt" % (produced, label, produced))
    os.system("mv bsr_matrix_values.txt %s_bsr_matrix.txt" % label)
    for sequence_file in ("consensus.fasta", "consensus.pep"):
        if os.path.isfile(sequence_file):
            os.system("mv %s %s_%s" % (sequence_file, label, sequence_file))
    run_parameters = (
        ("-d", directory), ("-i", id), ("-f", filter), ("-p", processors),
        ("-g", genes), ("-c", cluster_method), ("-b", blast), ("-l", length),
        ("-m", max_plog), ("-n", min_hlog), ("-t", f_plog), ("-k", keep),
        ("-s", filter_peps), ("-e", filter_scaffolds), ("-x", prefix),
    )
    outfile = open("%s_run_parameters.txt" % label, "w")
    for flag, value in run_parameters:
        outfile.write("%s %s \\\n" % (flag, value))
    outfile.write("-z %s\n" % debug)
    outfile.write("temp data stored here if kept: %s\n" % fastadir)
    outfile.close()
    logging.logPrint("all Done")
    if "T" != keep:
        os.system("rm -rf %s" % fastadir)
    os.chdir("%s" % ap)
# NOTE(review): fragment -- the `try:` that the first `except Exception, err:` below belongs to is not visible here, and the statement-per-line layout has been lost; restore both before relying on this code.
runSystemEx('mkdir -p ' + vmWareDir, log=True) runSystemEx( 'mv VMware_conversion/shared/converted_img.vmdk %s' % os.path.join( vmWareDir, 'clovr.9-04.x86-64.%s.vmdk' % options('general.version'))) runSystemEx('mkdir -p %s %s' % (os.path.join( vmWareDir, 'keys'), os.path.join(vmWareDir, 'user_data')), log=True) runSystemEx('cp -rv /usr/local/projects/clovr/shared ' + vmWareDir, log=True) fout = open(os.path.join(vmWareDir, 'start_clovr.vmx'), 'w') clovrConf = config.configFromMap( dict(version=options('general.version'))) for line in open('/usr/local/projects/clovr/start_clovr.vmx'): fout.write(config.replaceStr(line, clovrConf)) except Exception, err: errorPrint('Converting image failed. Error message:') errorPrint(str(err)) try: amiId = bundleChannel.receive() logPrint('AMI: ' + amiId) except Exception, err: amiId = None errorPrint('Bundling AMI failed for some reason. Error message:') errorPrint(str(err)) if __name__ == '__main__': main(*buildConfigN(OPTIONS))
def _require_dependency(binary, missing_message, citations):
    """Abort the run unless *binary* is on PATH; otherwise print its citations.

    `which` is run without capturing output, so the resolved path of the
    binary is echoed to stdout exactly as before.
    """
    if subprocess.call(['which', binary]) != 0:
        print(missing_message)
        # Exit non-zero so calling shells/pipelines can see the failure.
        sys.exit(1)
    for citation in citations:
        print(citation)


def main(matrix, tree, reference, directory, parameters, processors, coverage,
         proportion, keep, subsample, subnums, doc, tmp_dir, insertion_method,
         fudge, only_subs, model, trim, gatk_method):
    """Check external dependencies, report the run parameters and create scratch.

    Verifies that raxmlHPC-SSE3, samtools and bwa can be found with `which`
    (exiting with status 1 on the first one that is missing), prints the
    citations of the tools used, echoes every command-line parameter, and
    creates ``<cwd>/scratch`` if it does not already exist.

    Raises:
        OSError: if the scratch directory cannot be created for any reason
            other than it already existing.
    """
    # NOTE(review): ref_path and dir_path are not used in this portion of
    # main; they are retained because later pipeline stages may rely on them.
    ref_path = os.path.abspath("%s" % reference)
    dir_path = os.path.abspath("%s" % directory)
    # check for binary dependencies
    log_isg.logPrint('testing the paths of all dependencies')
    ap = os.path.abspath("%s" % os.getcwd())
    _require_dependency(
        'raxmlHPC-SSE3',
        "RAxML must be in your path as raxmlHPC-SSE3",
        [
            "*citation: 'Stamatakis, A. RAxML version 8: a tool for phylogenetic analysis and post-analysis of large phylogenies. Bioinformatics (2014).'",
            "*citation: 'Berger SA, Krompass D, Stamatakis A. Performance, accuracy, and Web server for evolutionary placement of short sequence reads under maximum likelihood. Syst Biol. 2011;60(3):291-302'",
        ],
    )
    _require_dependency(
        'samtools',
        "samtools must be in your path",
        [
            "*citation: 'Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R, Genome Project Data Processing S. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009;25(16):2078-9'",
        ],
    )
    _require_dependency(
        'bwa',
        "bwa must be in your path",
        [
            "*citation: 'Li H. Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. arXivorg. 2013(arXiv:1303.3997 [q-bio.GN])'",
        ],
    )
    # These libraries/tools are only cited here; their presence is not tested.
    print("Patristic distances calculated with DendroPy")
    print("*citation: 'Sukumaran J, Holder MT. DendroPy: a Python library for phylogenetic computing. Bioinformatics. 2010;26(12):1569-71. Epub 2010/04/28. doi: 10.1093/bioinformatics/btq228. PubMed PMID: 20421198'")
    print("Also uses GATK for variant calling")
    print("*citation: 'McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA. The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome research. 2010;20(9):1297-303'")
    print("Uses trimmomatic for read trimming")
    print("*citation: Bolger A.M., Lohse M., Usadel B. Trimmomatic: A flexible trimmer for Illumina Sequence Data. Bioinformatics. 2014. Doi:10.1093/bioinformatics/btu170")
    print("Uses BioPython for FASTA parsing")
    print("*citation :Cock PJ, Antao T, Chang JT, Chapman BA, Cox CJ, Dalke A, Friedberg I, Hamelryck T, Kauff F, Wilczynski B, de Hoon MJ. Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics. 2009;25(11):1422-3")
    print("")
    # done checking for dependencies
    log_isg.logPrint('WG-FAST pipeline starting')
    log_isg.logPrint("WG-FAST was invoked with the following parameters:")
    invocation = [
        ("-m", matrix),
        ("-t", tree),
        ("-r", reference),
        ("-d", directory),
        ("-x", parameters),
        ("-p", processors),
        ("-c", coverage),
        ("-o", proportion),
        ("-k", keep),
        ("-s", subsample),
        ("-n", subnums),
        ("-g", doc),
        ("-e", tmp_dir),
        ("-z", insertion_method),
        ("-f", fudge),
        ("-y", only_subs),
        ("-j", model),
        ("-i", trim),
        ("-q", gatk_method),
    ]
    # Every flag but the last ends in a shell line-continuation backslash so
    # the printed block can be pasted back as a single command.
    for flag, value in invocation[:-1]:
        print("%s %s \\" % (flag, value))
    print("%s %s" % invocation[-1])
    try:
        os.makedirs('%s/scratch' % ap)
    except OSError as e:
        # A scratch directory left over from an earlier run is fine.
        if e.errno != errno.EEXIST:
            raise
def _self_blast_ref_scores():
    """Return the reference scores parsed from ``tmp_blast.out``.

    Keeps one line per query (``sort -u -k 1,1``), hands the result to
    ``parse_self_blast`` and then removes both temporary files. The handle
    is closed before the files are deleted instead of being leaked.
    """
    subprocess.check_call("sort -u -k 1,1 tmp_blast.out > self_blast.out", shell=True)
    with open("self_blast.out", "U") as handle:
        ref_scores = parse_self_blast(handle)
    subprocess.check_call("rm tmp_blast.out self_blast.out", shell=True)
    return ref_scores


def main(
    directory,
    id,
    filter,
    processors,
    genes,
    usearch,
    vsearch,
    blast,
    penalty,
    reward,
    length,
    max_plog,
    min_hlog,
    f_plog,
    keep,
    filter_peps,
    debug,
):
    """Run the BSR pipeline on the ``*.fasta`` genomes found in *directory*.

    Work happens in ``<directory>/joined``, which must not already exist.
    When *genes* contains ``"null"`` genes are predicted with Prodigal and
    clustered with USEARCH (preferred) or VSEARCH; otherwise *genes* is the
    path of a ``.pep`` or ``.fasta`` file of pre-compiled genes. The genes are
    aligned against themselves and against every genome with the aligner
    named by *blast* (``"tblastn"``, ``"blastn"`` or ``"blat"``), and the
    resulting matrix is written to ``bsr_matrix_values.txt`` in the directory
    the program was started from.

    ``id`` and ``filter`` shadow builtins; the names are kept because they are
    part of the caller-visible signature.

    Fatal configuration problems print a message and exit with status 1.
    """
    start_dir = os.getcwd()
    ap = os.path.abspath("%s" % start_dir)
    dir_path = os.path.abspath("%s" % directory)
    joined_dir = "%s/joined" % dir_path
    logging.logPrint("Testing paths of dependencies")
    if blast == "blastn" or blast == "tblastn":
        if subprocess.call(["which", "blastn"]) == 0:
            print("citation: Altschul SF, Madden TL, Schaffer AA, Zhang J, Zhang Z, Miller W, and Lipman DJ. 1997. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. Nucleic Acids Res 25:3389-3402")
        else:
            print("blastn isn't in your path, but needs to be!")
            sys.exit(1)
    try:
        os.makedirs(joined_dir)
    except OSError:
        # Only report a stale run when that really is the cause; anything
        # else (permissions, missing parent, ...) should surface unchanged.
        if not os.path.exists(joined_dir):
            raise
        print("old run directory exists in your genomes directory (%s/joined). Delete and run again" % dir_path)
        sys.exit(1)
    for infile in glob.glob(os.path.join(dir_path, "*.fasta")):
        name = get_seq_name(infile)
        os.link("%s" % infile, "%s/joined/%s.new" % (dir_path, name))
    if "null" in genes:
        if subprocess.call(["which", "prodigal"]) != 0:
            print("prodigal is not in your path, but needs to be!")
            sys.exit(1)
        print("citation: Hyatt D, Chen GL, Locascio PF, Land ML, Larimer FW, and Hauser LJ. 2010. Prodigal: prokaryotic gene recognition and translation initiation site identification. BMC Bioinformatics 11:119")
        if os.path.exists(usearch):
            print("citation: Edgar RC. 2010. Search and clustering orders of magnitude faster than BLAST. Bioinformatics 26:2460-2461")
        if blast == "blat":
            if subprocess.call(["which", "blat"]) == 0:
                print("citation: W.James Kent. 2002. BLAT - The BLAST-Like Alignment Tool. Genome Research 12:656-664")
            else:
                print("You have requested blat, but it is not in your PATH")
                sys.exit(1)
        logging.logPrint("predicting genes with Prodigal")
        predict_genes(dir_path, processors)
        logging.logPrint("Prodigal done")
        os.system("cat *genes.seqs > all_gene_seqs.out")
        filter_scaffolds("all_gene_seqs.out")
        os.system("mv tmp.out all_gene_seqs.out")
        rename_fasta_header("all_gene_seqs.out", "all_sorted.txt")
        if os.path.exists(usearch) and os.path.exists(vsearch):
            print("usearch and vsearch both selected, only usearch will be used")
        if os.path.exists(usearch):
            # Cluster in 200000-line pieces inside split_files/, then move
            # the concatenated result back to the joined directory.
            os.system("mkdir split_files")
            os.system("cp all_sorted.txt split_files/")
            os.system("rm all_sorted.txt")
            os.chdir("split_files/")
            os.system("split -l 200000 all_sorted.txt")
            logging.logPrint("clustering with USEARCH at an ID of %s" % id)
            run_usearch(usearch, id)
            os.system("cat *.usearch.out > all_sorted.txt")
            os.system("mv all_sorted.txt %s/joined" % dir_path)
            os.chdir("%s/joined" % dir_path)
            uclust_cluster(usearch, id)
            logging.logPrint("USEARCH clustering finished")
        elif os.path.exists(vsearch):
            logging.logPrint("clustering with VSEARCH at an ID of %s" % id)
            run_vsearch(vsearch, id, processors)
            os.system("mv vsearch.out consensus.fasta")
            logging.logPrint("VSEARCH clustering finished")
        else:
            print("neither usearch or vsearch selected for use with Prodigal!, exiting.")
            sys.exit(1)
        if "tblastn" == blast:
            subprocess.check_call("makeblastdb -in consensus.fasta -dbtype nucl > /dev/null 2>&1", shell=True)
            translate_consensus("consensus.fasta")
            if filter_peps == "T":
                filter_seqs("tmp.pep")
                os.system("rm tmp.pep")
            else:
                os.system("mv tmp.pep consensus.pep")
            clusters = get_cluster_ids("consensus.pep")
            blast_against_self_tblastn("tblastn", "consensus.fasta", "consensus.pep", "tmp_blast.out", processors)
        elif "blastn" == blast:
            subprocess.check_call("makeblastdb -in consensus.fasta -dbtype nucl > /dev/null 2>&1", shell=True)
            blast_against_self_blastn(
                "blastn", "consensus.fasta", "consensus.fasta", "tmp_blast.out", filter, penalty, reward, processors
            )
            clusters = get_cluster_ids("consensus.fasta")
        elif "blat" == blast:
            blat_against_self("consensus.fasta", "consensus.fasta", "tmp_blast.out", processors)
            clusters = get_cluster_ids("consensus.fasta")
        ref_scores = _self_blast_ref_scores()
        os.system("rm *new_genes.*")
        if blast == "tblastn" or blast == "blastn":
            logging.logPrint("starting BLAST")
        else:
            logging.logPrint("starting BLAT")
        if "tblastn" == blast:
            blast_against_each_genome_tblastn(dir_path, processors, "consensus.pep")
        elif "blastn" == blast:
            blast_against_each_genome_blastn(dir_path, processors, filter, "consensus.fasta", penalty, reward)
        elif "blat" == blast:
            blat_against_each_genome(dir_path, "consensus.fasta", processors)
        find_dups(ref_scores, length, max_plog, min_hlog)
    else:
        logging.logPrint("Using pre-compiled set of predicted genes")
        files = glob.glob(os.path.join(dir_path, "*.fasta"))
        if not files:
            print("no usable reference genomes found!")
            sys.exit(1)
        gene_path = os.path.abspath("%s" % genes)
        clusters = get_cluster_ids(gene_path)
        os.system("cp %s %s/joined/" % (gene_path, dir_path))
        os.chdir("%s/joined" % dir_path)
        if gene_path.endswith(".pep"):
            logging.logPrint("using tblastn on peptides")
            try:
                subprocess.check_call("makeblastdb -in %s -dbtype prot > /dev/null 2>&1" % gene_path, shell=True)
            except (subprocess.CalledProcessError, OSError):
                logging.logPrint("problem encountered with BLAST database")
                sys.exit(1)
            blast_against_self_tblastn("blastp", gene_path, gene_path, "tmp_blast.out", processors)
            ref_scores = _self_blast_ref_scores()
            logging.logPrint("starting BLAST")
            blast_against_each_genome_tblastn(dir_path, processors, gene_path)
        elif gene_path.endswith(".fasta"):
            if "tblastn" == blast:
                logging.logPrint("using tblastn")
                # NOTE(review): the translate_genes defined at the top of this
                # file takes (genes, outfile, min_len); this one-argument call
                # presumably targets an older signature that wrote genes.pep.
                # Confirm which version is in use before relying on it.
                translate_genes(gene_path)
                try:
                    subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % gene_path, shell=True)
                except (subprocess.CalledProcessError, OSError):
                    logging.logPrint("problem encountered with BLAST database")
                    sys.exit(1)
                blast_against_self_tblastn("tblastn", gene_path, "genes.pep", "tmp_blast.out", processors)
                ref_scores = _self_blast_ref_scores()
                logging.logPrint("starting BLAST")
                blast_against_each_genome_tblastn(dir_path, processors, "genes.pep")
                os.system("cp genes.pep %s" % start_dir)
            elif "blastn" == blast:
                logging.logPrint("using blastn")
                try:
                    subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % gene_path, shell=True)
                except (subprocess.CalledProcessError, OSError):
                    logging.logPrint("Database not formatted correctly...exiting")
                    sys.exit(1)
                try:
                    blast_against_self_blastn(
                        "blastn", gene_path, gene_path, "tmp_blast.out", filter, penalty, reward, processors
                    )
                except Exception:
                    print("problem with blastn, exiting")
                    sys.exit(1)
                ref_scores = _self_blast_ref_scores()
                logging.logPrint("starting BLAST")
                blast_against_each_genome_blastn(dir_path, processors, filter, gene_path, penalty, reward)
            elif "blat" == blast:
                logging.logPrint("using blat")
                blat_against_self(gene_path, gene_path, "tmp_blast.out", processors)
                ref_scores = _self_blast_ref_scores()
                logging.logPrint("starting BLAT")
                blat_against_each_genome(dir_path, gene_path, processors)
        else:
            print("input file format not supported")
            sys.exit(1)
    if blast == "blat":
        logging.logPrint("BLAT done")
    else:
        logging.logPrint("BLAST done")
    parse_blast_report("false")
    get_unique_lines()
    curr_dir = os.getcwd()
    table_files = glob.glob(os.path.join(curr_dir, "*.filtered.unique"))
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f)) for idx, f in enumerate(table_files)]
    nr_sorted = sorted(clusters)
    # First matrix row: a blank corner cell followed by the sorted cluster ids.
    table_list = [[" "] + nr_sorted]
    logging.logPrint("starting matrix building")
    new_names, new_table = new_loop(files_and_temp_names, processors, clusters, debug)
    new_table_list = table_list + new_table
    logging.logPrint("matrix built")
    # Append mode is kept so any existing ref.list content is preserved; the
    # file is now opened once instead of once per cluster id.
    with open("ref.list", "a") as ref_list:
        ref_list.write("\n")
        for x in nr_sorted:
            ref_list.write("%s\n" % x)
    with open("names.txt", "w") as names_out:
        for sublist in new_names:
            for x in sublist:
                names_out.write("".join(x) + "\n")
    create_bsr_matrix_dev(new_table_list)
    divide_values("bsr_matrix", ref_scores)
    subprocess.check_call("paste ref.list BSR_matrix_values.txt > %s/bsr_matrix_values.txt" % start_dir, shell=True)
    if "T" in f_plog:
        filter_paralogs("%s/bsr_matrix_values.txt" % start_dir, "paralog_ids.txt")
        os.system("cp bsr_matrix_values_filtered.txt %s" % start_dir)
    # Best-effort copy: not every run produces all of these files, so a
    # failing cp is deliberately ignored and its stderr discarded.
    with open(os.devnull, "w") as devnull:
        try:
            subprocess.check_call(
                "cp names.txt consensus.pep consensus.fasta duplicate_ids.txt paralog_ids.txt %s" % start_dir,
                shell=True,
                stderr=devnull,
            )
        except (subprocess.CalledProcessError, OSError):
            pass
    logging.logPrint("all Done")
    os.chdir("%s" % dir_path)
    if "T" != keep:
        os.system("rm -rf joined")
    os.chdir("%s" % ap)
write_reduced_matrix(matrix) ref_name=get_seq_name(reference) if only_subs == "T": pass else: fileSets=read_file_sets(dir_path) ref_coords = get_all_snps(matrix) run_loop(fileSets, dir_path,"%s/scratch/reference.fasta" % ap , processors, GATK_PATH, ref_coords, coverage, proportion, matrix, ap,doc,tmp_dir,ADD_GROUPS,TRIM_PATH,WGFAST_PATH,trim) """will subsample based on the number of SNPs reported by the following function""" used_snps=find_used_snps() #Outnames is required for the sub-sampling routine, even with -y T outnames=grab_names() for name in outnames: for k,v in used_snps.iteritems(): if name==k: log_isg.logPrint("number of callable positions in genome %s = %s" % (k,v)) if only_subs == "T": try: os.system("rm RAxML*") except: pass pass else: create_merged_vcf() subprocess.check_call("paste temp.matrix merged.vcf > combined.matrix", shell=True) matrix_to_fasta("combined.matrix", "all.fasta") os.system("mv combined.matrix %s/nasp_matrix.with_unknowns.txt" % ap) """this fixes the SNP output to conform with RAxML""" os.system("sed 's/://g' all.fasta | sed 's/,//g' > out.fasta") if insertion_method == "ML": suffix = run_raxml("out.fasta", tree,"out.classification_results.txt", "V", parameters, model, "out")
pass else: fileSets = read_file_sets(dir_path) ref_coords = get_all_snps(matrix) run_loop(fileSets, dir_path, "%s/scratch/reference.fasta" % ap, processors, GATK_PATH, ref_coords, coverage, proportion, matrix, ap, doc, tmp_dir, ADD_GROUPS, TRIM_PATH, WGFAST_PATH, trim, gatk_method) """will subsample based on the number of SNPs reported by the following function""" used_snps = find_used_snps() #Outnames is required for the sub-sampling routine, even with -y T outnames = grab_names() for name in outnames: for k, v in used_snps.iteritems(): if name == k: log_isg.logPrint( "number of callable positions in genome %s = %s" % (k, v)) if only_subs == "T": try: os.system("rm RAxML*") except: pass pass else: create_merged_vcf() subprocess.check_call("paste temp.matrix merged.vcf > combined.matrix", shell=True) matrix_to_fasta("combined.matrix", "all.fasta") os.system("mv combined.matrix %s/nasp_matrix.with_unknowns.txt" % ap) """this fixes the SNP output to conform with RAxML""" os.system("sed 's/://g' all.fasta | sed 's/,//g' > out.fasta") if insertion_method == "ML":