def parallel_blast(self, blast_command, seqfile, database, outfile=None,
                   blast_options=None, split_dir="splited_fasta",
                   splited_output_dir="splited_output_dir", evalue=None,
                   output_format=None, threads=None, num_of_seqs_per_scan=None,
                   combine_output_to_single_file=True, async_run=False,
                   external_process_pool=None):
    """
    Split ``seqfile`` into chunks, run ``blast_command`` on every chunk
    through ``self.parallel_execute`` and optionally merge the hit files.

    :param blast_command: command passed to ``self.parallel_execute`` as ``cmd``.
    :param seqfile: fasta file handed to ``self.split_fasta``.
    :param database: value written after ``-db``.
    :param outfile: target of ``CGAS.cat`` when the chunk outputs are merged.
    :param blast_options: extra option string inserted verbatim (skipped if falsy).
    :param split_dir: directory receiving the fasta chunks.
    :param splited_output_dir: directory receiving the per-chunk ``.hits`` files.
    :param evalue: value written after ``-evalue`` (skipped if falsy).
    :param output_format: integer written after ``-outfmt`` (skipped if falsy).
    :param threads: forwarded to ``self.parallel_execute``; also used to derive
        the number of chunks.
    :param num_of_seqs_per_scan: despite its name, passed to ``self.split_fasta``
        as ``num_of_files``; when falsy, ``5 * threads`` (or ``5 * self.threads``)
        is used instead.
    :param combine_output_to_single_file: if true, concatenate all chunk
        outputs into ``outfile`` with ``CGAS.cat``.
    :param async_run: forwarded to ``self.parallel_execute``.
    :param external_process_pool: forwarded to ``self.parallel_execute``.
    """
    fasta_chunk_dir = check_path(split_dir)
    hits_chunk_dir = check_path(splited_output_dir)
    for directory in (fasta_chunk_dir, hits_chunk_dir):
        save_mkdir(directory)

    # Explicit caller value first, then five chunks per requested thread,
    # and only as a last resort five chunks per tool-level thread.
    if num_of_seqs_per_scan:
        chunk_count = num_of_seqs_per_scan
    elif threads:
        chunk_count = 5 * threads
    else:
        chunk_count = 5 * self.threads

    self.split_fasta(seqfile, fasta_chunk_dir, num_of_files=chunk_count)

    options_list = []
    out_files = []
    for chunk_name in sorted(os.listdir(fasta_chunk_dir)):
        query_path = "%s%s" % (fasta_chunk_dir, chunk_name)
        hits_path = "%s%s.hits" % (hits_chunk_dir, split_filename(chunk_name)[1])

        pieces = [
            " -out %s" % hits_path,
            " -db %s" % database,
            " -query %s" % query_path,
        ]
        if blast_options:
            pieces.append(" %s" % blast_options)
        if evalue:
            pieces.append(" -evalue %s" % evalue)
        if output_format:
            pieces.append(" -outfmt %i" % output_format)

        options_list.append("".join(pieces))
        out_files.append(hits_path)

    self.parallel_execute(options_list,
                          cmd=blast_command,
                          threads=threads,
                          async_run=async_run,
                          external_process_pool=external_process_pool)

    if combine_output_to_single_file:
        CGAS.cat(out_files, output=outfile)
def write_splited(self, out_dir="./", extension="t", value_separator=","):
    """
    Write every first-level key of this mapping to its own file.

    For each first-level key a file ``<out_dir><key>.<extension>`` is created
    (``out_dir`` is passed through ``check_path`` first). Each line of that
    file holds a second-level key, a tab, and the values stored under it
    joined with ``value_separator``.

    :param out_dir: directory for the output files.
    :param extension: extension appended to every file name, without the dot.
    :param value_separator: string used to join the values of one entry.
    """
    from Routines.File import check_path

    for fl_key in self:
        target_path = "%s%s.%s" % (check_path(out_dir), fl_key, extension)
        with open(target_path, "w") as out_fd:
            out_fd.writelines(
                "%s\t%s\n" % (sl_key, value_separator.join(self[fl_key][sl_key]))
                for sl_key in self[fl_key]
            )
def parallel_convert(self, list_of_files, output_directory, output_format=".tiff"):
    """
    Build one "<input> <output>" option string per file and run them all
    through ``self.parallel_execute``.

    The output name is ``output_directory`` (passed through ``check_path``),
    followed by element 1 of ``split_filename(input)`` and ``output_format``.

    :param list_of_files: iterable of input file names.
    :param output_directory: directory for converted files; created with
        ``save_mkdir`` before anything else happens.
    :param output_format: suffix appended to every output name.
    """
    save_mkdir(output_directory)

    options_list = [
        " %s" % source
        + " %s%s%s" % (check_path(output_directory),
                       split_filename(source)[1],
                       output_format)
        for source in list_of_files
    ]

    self.parallel_execute(options_list)
import os import argparse import numpy as np from Routines.File import check_path, save_mkdir parser = argparse.ArgumentParser() parser.add_argument("-d", "--sample_directory", action="store", dest="samples_dir", required=True, type=lambda s: check_path(os.path.abspath(s)), help="Directory with samples") parser.add_argument( "-s", "--samples", action="store", dest="samples", help="Comma-separated list of subdirectories(one per sample) to handle. " "If not set all subdirectories will be considered as containing samples." "In sample directory should one(in case SE reads) or two(in case PE reads) files." "Filenames should should contain '_1.fq' or '_1.fastq' for forward(left) reads, " " '_2.fq' or '_2.fastq' for reverse(right) reads and '.fq' or '.fastq' for SE reads" ) parser.add_argument( "-o", "--output_dir",
help="Input file with left reads") parser.add_argument("-r", "--input_right", action="store", dest="input_right", help="Input file with right reads") parser.add_argument("-o", "--out_dir", action="store", dest="out_dir", default="./", help="Directory to write output") parser.add_argument("-m", "--min_len", action="store", dest="min_len", type=int, default=1, help="Minimum length of read to output") args = parser.parse_args() n_regexp = re.compile("N+$") if args.input_se: se_directory, se_prefix, se_extension = split_filename(args.input_se) se_in_fd = open(args.input_se, "r") se_out_file = "%s%s.filtered%s" % (check_path(args.out_dir), se_prefix, se_extension) se_out_fd = open(se_out_file, "w") while True: name, sequence, separator, quality = read_entry(se_in_fd) if name is None: break match = n_regexp.search(sequence) if match is None: se_out_fd.write("%s\n%s\n%s\n%s\n" % (name, sequence, separator, quality)) elif match.start() >= args.min_len: se_out_fd.write("%s\n%s\n%s\n%s\n" % (name, sequence[:match.start()+1], separator, quality[:match.start()+1])) else: continue se_in_fd.close()
def parallel_hmmscan(self, hmmfile, seqfile, outfile, num_of_seqs_per_scan=None,
                     split_dir="splited_fasta", splited_output_dir="splited_output_dir",
                     tblout_outfile=None, domtblout_outfile=None, pfamtblout_outfile=None,
                     splited_tblout_dir=None, splited_domtblout_dir=None,
                     splited_pfamtblout_dir=None,
                     dont_output_alignments=False,
                     model_evalue_threshold=None, model_score_threshold=None,
                     domain_evalue_threshold=None, domain_score_threshold=None,
                     model_evalue_significant_threshold=None,
                     model_score_significant_threshold=None,
                     domain_evalue_significant_threshold=None,
                     domain_score_significant_threshold=None,
                     use_profile_GA_gathering_cutoffs_for_thresholds=False,
                     use_profile_NC_noise_cutoffs_for_thresholds=False,
                     use_profile_TC_trusted_cutoffs_for_thresholds=False,
                     turn_off_all_heruristics=False, turn_off_bias_filter=False,
                     MSV_threshold=None, Vit_threshold=None, Fwd_threshold=None,
                     turn_off_biased_composition_score_corrections=None,
                     input_format=None, threads=None,
                     combine_output_to_single_file=True,
                     biopython_165_compartibility=False,
                     remove_tmp_dirs=True,
                     async_run=False, external_process_pool=None):
    """
    Split ``seqfile`` into chunks, run ``hmmscan`` on every chunk through
    ``self.parallel_execute`` and optionally merge the per-chunk results.

    :param hmmfile: profile database, placed before the query on the command line.
    :param seqfile: fasta file handed to ``self.split_fasta``.
    :param outfile: merged main output (used when
        ``combine_output_to_single_file`` is true).
    :param num_of_seqs_per_scan: despite its name, passed to ``self.split_fasta``
        as ``num_of_files``; when falsy, ``5 * threads`` (or ``5 * self.threads``)
        is used instead.
    :param split_dir: directory receiving the fasta chunks.
    :param splited_output_dir: directory receiving the per-chunk ``-o`` outputs.
    :param tblout_outfile: if set, per-chunk ``--tblout`` files are concatenated
        into it.
    :param domtblout_outfile: same for ``--domtblout``.
    :param pfamtblout_outfile: same for ``--pfamtblout``.
    :param splited_tblout_dir: directory for per-chunk ``--tblout`` files; the
        option is emitted only when this is set.
    :param splited_domtblout_dir: same for ``--domtblout``.
    :param splited_pfamtblout_dir: same for ``--pfamtblout``.
    :param input_format: value written after ``--qformat`` (skipped if falsy).
    :param threads: forwarded to ``self.parallel_execute``; also used to derive
        the number of chunks.
    :param combine_output_to_single_file: merge the per-chunk main outputs
        into ``outfile``.
    :param biopython_165_compartibility: merge with ``CGAS.cgas`` and a sed
        expression that rewrites every ``Description:`` line to
        ``Description: <unknown description>`` instead of a plain ``CGAS.cat``.
    :param remove_tmp_dirs: delete all chunk directories with ``shutil.rmtree``
        once merging is done.
    :param async_run: forwarded to ``self.parallel_execute``.
    :param external_process_pool: forwarded to ``self.parallel_execute``.

    All remaining threshold / cutoff / heuristic parameters are forwarded
    unchanged to ``self.__parse_hmmsxxx_common_options``.
    """
    splited_dir = check_path(split_dir)
    splited_out_dir = check_path(splited_output_dir)

    # The table directories are used below as plain string prefixes, exactly
    # like the two directories above, so they get the same check_path
    # treatment. (Assumes check_path is idempotent, as its use on defaults
    # and user input alike elsewhere suggests.)
    if splited_tblout_dir:
        splited_tblout_dir = check_path(splited_tblout_dir)
    if splited_domtblout_dir:
        splited_domtblout_dir = check_path(splited_domtblout_dir)
    if splited_pfamtblout_dir:
        splited_pfamtblout_dir = check_path(splited_pfamtblout_dir)

    save_mkdir(splited_dir)
    save_mkdir(splited_out_dir)

    if splited_tblout_dir:
        save_mkdir(splited_tblout_dir)
    if splited_domtblout_dir:
        save_mkdir(splited_domtblout_dir)
    if splited_pfamtblout_dir:
        save_mkdir(splited_pfamtblout_dir)

    # Explicit caller value first, then five chunks per requested thread,
    # and only as a last resort five chunks per tool-level thread.
    if num_of_seqs_per_scan:
        number_of_files = num_of_seqs_per_scan
    elif threads:
        number_of_files = 5 * threads
    else:
        number_of_files = 5 * self.threads

    self.split_fasta(seqfile, splited_dir, num_of_files=number_of_files)

    list_of_files = []
    for filename in sorted(os.listdir(splited_dir)):
        filename_prefix = split_filename(filename)[1]

        input_file = "%s%s" % (splited_dir, filename)
        output_file = "%s%s.hits" % (splited_out_dir, filename_prefix)
        tblout_file = ("%s%s.hits" % (splited_tblout_dir, filename_prefix)
                       if splited_tblout_dir else None)
        domtblout_file = ("%s%s.hits" % (splited_domtblout_dir, filename_prefix)
                          if splited_domtblout_dir else None)
        pfamtblout_file = ("%s%s.hits" % (splited_pfamtblout_dir, filename_prefix)
                           if splited_pfamtblout_dir else None)

        list_of_files.append((input_file, output_file,
                              tblout_file, domtblout_file, pfamtblout_file))

    # Table outputs are per-chunk, so they are deliberately left out of the
    # common options and appended for each chunk in the loop below.
    common_options = self.__parse_hmmsxxx_common_options(
        tblout=None, domtblout=None, pfamtblout=None,
        dont_output_alignments=dont_output_alignments,
        model_evalue_threshold=model_evalue_threshold,
        model_score_threshold=model_score_threshold,
        domain_evalue_threshold=domain_evalue_threshold,
        domain_score_threshold=domain_score_threshold,
        model_evalue_significant_threshold=model_evalue_significant_threshold,
        model_score_significant_threshold=model_score_significant_threshold,
        domain_evalue_significant_threshold=domain_evalue_significant_threshold,
        domain_score_significant_threshold=domain_score_significant_threshold,
        use_profile_GA_gathering_cutoffs_for_thresholds=use_profile_GA_gathering_cutoffs_for_thresholds,
        use_profile_NC_noise_cutoffs_for_thresholds=use_profile_NC_noise_cutoffs_for_thresholds,
        use_profile_TC_trusted_cutoffs_for_thresholds=use_profile_TC_trusted_cutoffs_for_thresholds,
        turn_off_all_heruristics=turn_off_all_heruristics,
        turn_off_bias_filter=turn_off_bias_filter,
        MSV_threshold=MSV_threshold,
        Vit_threshold=Vit_threshold,
        Fwd_threshold=Fwd_threshold,
        turn_off_biased_composition_score_corrections=turn_off_biased_composition_score_corrections)

    # Fixed: the format string used to be appended without interpolation,
    # which put a literal "%s" on the hmmscan command line.
    if input_format:
        common_options += " --qformat %s" % input_format

    options_list = []
    out_files = []
    tblout_files = []
    domtblout_files = []
    pfamtblout_files = []

    for in_file, out_filename, tblout_file, domtblout_file, pfamtblout_file in list_of_files:
        options = common_options

        if tblout_file:
            options += " --tblout %s" % tblout_file
        if domtblout_file:
            options += " --domtblout %s" % domtblout_file
        if pfamtblout_file:
            options += " --pfamtblout %s" % pfamtblout_file

        options += " -o %s" % out_filename
        # Positional arguments: profile database first, then the query chunk.
        options += " %s" % hmmfile
        options += " %s" % in_file

        options_list.append(options)
        out_files.append(out_filename)
        tblout_files.append(tblout_file)
        domtblout_files.append(domtblout_file)
        pfamtblout_files.append(pfamtblout_file)

    self.parallel_execute(options_list,
                          cmd="hmmscan",
                          threads=threads,
                          async_run=async_run,
                          external_process_pool=external_process_pool)

    if combine_output_to_single_file:
        if biopython_165_compartibility:
            CGAS.cgas(out_files,
                      sed_string="s/^Description:.*/Description: <unknown description>/",
                      output=outfile)
        else:
            CGAS.cat(out_files, output=outfile)

    # NOTE(review): each *_outfile below is only meaningful together with the
    # matching splited_*_dir; without it the list holds only None entries.
    if tblout_outfile:
        CGAS.cat(tblout_files, output=tblout_outfile)
    if domtblout_outfile:
        CGAS.cat(domtblout_files, output=domtblout_outfile)
    if pfamtblout_outfile:
        CGAS.cat(pfamtblout_files, output=pfamtblout_outfile)

    if remove_tmp_dirs:
        if splited_tblout_dir:
            shutil.rmtree(splited_tblout_dir)
        if splited_domtblout_dir:
            shutil.rmtree(splited_domtblout_dir)
        if splited_pfamtblout_dir:
            shutil.rmtree(splited_pfamtblout_dir)
        for tmp_dir in splited_dir, splited_out_dir:
            shutil.rmtree(tmp_dir)
"occurrences of both. So this option is not suitable for generating sets of forward " "and reverse-complement kmers. For this case use -r/--add_reverse_complement option. " "Not compatible with -r/--add_reverse_complement option.") parser.add_argument( "-r", "--add_reverse_complement", action="store_true", dest="add_rev_com", help="Add reverse-complement sequences before counting kmers. " "Works only for fasta sequences. " "Not compatible with -b/--count_both_strands option") parser = argparse.ArgumentParser() args = parser.parse_args() args.path_to_mavr = check_path(args.path_to_mavr) MaSuRCA.threads = args.threads Jellyfish.threads = args.threads Jellyfish.path = args.jellyfish_path if args.jellyfish_path else "" iteration_reference_file = args.initial_sequences working_dir = os.getcwd() abs_path_left_source_reads = os.path.abspath(args.left_source_reads) abs_path_right_source_reads = os.path.abspath(args.right_source_reads) """ for filename in args.source_reads: ab if os.path.isabs(filename): abs_path_source_reads.append(filename) else:
help="File with families")
parser.add_argument("-o", "--output_dir", action="store", dest="output_dir",
                    help="Directory to write output")
parser.add_argument("-t", "--threads", action="store", dest="threads", type=int, default=1,
                    help="Number of threads to use")

args = parser.parse_args()

# NOTE(review): -o/--output_dir has neither a default nor required=True, so
# args.output_dir is None when the option is omitted - confirm that
# check_path copes with None.
args.output_dir = check_path(args.output_dir)


def check_edge_strict(nodes_list, id_list):
    """
    Return True when every element of nodes_list is contained in id_list.

    An empty nodes_list yields True, because no element fails the check.
    """
    for node in nodes_list:
        if node not in id_list:
            return False
    return True


def check_edge_soft(nodes_list, id_list):
    """
    Return True when at least one element of nodes_list is contained in id_list.

    An empty nodes_list yields False, because no element passes the check.
    """
    for node in nodes_list:
        if node in id_list:
            return True
    return False