def handle_input(filename):
    sys.stdout.write("Handling %s\n" % filename)
    not_significant_ids = IdList()
    not_found_ids = IdList()
    prefix = split_filename(filename)[1]
    index_file = "%s.tmp.idx" % prefix
    hmm_dict = SearchIO.index_db(index_file, filename, args.format)
    if args.output == "stdout":
        out_fd = sys.stdout
    else:
        out_fd = open("%s%s.top_hits" % (args.top_hits_dir, prefix), "w")
    out_fd.write("#query\thit\tevalue\tbitscore\n")
    for query in hmm_dict:
        if hmm_dict[query].hits:
            if hmm_dict[query][0].is_included:
                out_fd.write("%s\t%s\t%s\t%s\n" % (query, hmm_dict[query][0].id,
                                                   hmm_dict[query][0].evalue,
                                                   hmm_dict[query][0].bitscore))
            else:
                not_significant_ids.append(query)
        else:
            not_found_ids.append(query)
    if args.output != "stdout":
        out_fd.close()
    os.remove(index_file)
    return not_significant_ids, not_found_ids
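# Hedged usage sketch (assumptions: the script's argparse `args` provides
# `format`, `output`, and `top_hits_dir` as used above; the file names below
# are hypothetical). IdList.write() is used elsewhere in this codebase:
#
#   not_significant, not_found = handle_input("sample.hmmscan.hits")
#   not_significant.write("sample.not_significant.ids")
#   not_found.write("sample.not_found.ids")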
def parallel_align(self, list_of_files, output_directory, output_suffix="alignment",
                   gap_open_penalty=None, offset=None, maxiterate=None,
                   quiet=False, mode="globalpair", number_of_processes=1):
    # TODO: add rest of options
    options = " --thread %i" % self.threads
    options += " --op %f" % gap_open_penalty if gap_open_penalty is not None else ""
    options += " --ep %f" % offset if offset is not None else ""
    options += " --maxiterate %i" % maxiterate if maxiterate is not None else ""
    options += " --quiet" if quiet else ""
    options += " --%s" % mode

    options_list = []
    for filename in list_of_files:
        basename = split_filename(filename)[1]
        op = options
        op += " %s" % filename
        op += " > %s/%s_%s.fasta" % (output_directory, basename, output_suffix)
        options_list.append(op)
    self.parallel_execute(options_list, threads=number_of_processes)
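# Hedged usage sketch (assumption: `self` is a MAFFT wrapper whose
# parallel_execute() runs one mafft command per options string; the file names
# are hypothetical):
#
#   mafft.parallel_align(["fam0001.fasta", "fam0002.fasta"], "alignments",
#                        gap_open_penalty=1.53, maxiterate=1000,
#                        quiet=True, number_of_processes=4)
#
# Each input yields alignments/<basename>_alignment.fasta via shell redirection.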
def parallel_blast(self, blast_command, seqfile, database, outfile=None,
                   blast_options=None, split_dir="splited_fasta",
                   splited_output_dir="splited_output_dir", evalue=None,
                   output_format=None, threads=None, num_of_seqs_per_scan=None,
                   combine_output_to_single_file=True, async_run=False,
                   external_process_pool=None):
    splited_dir = check_path(split_dir)
    splited_out_dir = check_path(splited_output_dir)
    save_mkdir(splited_dir)
    save_mkdir(splited_out_dir)

    number_of_files = num_of_seqs_per_scan if num_of_seqs_per_scan \
        else 5 * threads if threads else 5 * self.threads
    self.split_fasta(seqfile, splited_dir, num_of_files=number_of_files)
    input_list_of_files = sorted(os.listdir(splited_dir))

    list_of_files = []
    for filename in input_list_of_files:
        filename_prefix = split_filename(filename)[1]
        input_file = "%s%s" % (splited_dir, filename)
        output_file = "%s%s.hits" % (splited_out_dir, filename_prefix)
        list_of_files.append((input_file, output_file))

    options_list = []
    out_files = []
    for in_file, out_filename in list_of_files:
        options = " -out %s" % out_filename
        options += " -db %s" % database
        options += " -query %s" % in_file
        options += " %s" % blast_options if blast_options else ""
        options += " -evalue %s" % evalue if evalue else ""
        options += " -outfmt %i" % output_format if output_format else ""
        options_list.append(options)
        out_files.append(out_filename)

    self.parallel_execute(options_list, cmd=blast_command, threads=threads,
                          async_run=async_run,
                          external_process_pool=external_process_pool)

    if combine_output_to_single_file:
        CGAS.cat(out_files, output=outfile)
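# Hedged usage sketch (assumption: a BLAST+ wrapper instance; the database and
# file names are hypothetical):
#
#   blast.parallel_blast("blastp", "proteins.fasta", "uniref90.db",
#                        outfile="proteins.vs.uniref90.hits",
#                        evalue=1e-5, output_format=6, threads=8)
#
# The query is split into chunks (by default five per thread), each chunk is
# searched independently, and the per-chunk outputs are concatenated into `outfile`.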
def parallel_convert(self, list_of_files, output_directory, output_format=".tiff"):
    save_mkdir(output_directory)
    options_list = []
    for filename in list_of_files:
        option = " %s" % filename
        option += " %s%s%s" % (check_path(output_directory),
                               split_filename(filename)[1], output_format)
        options_list.append(option)
    self.parallel_execute(options_list)
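# Hedged usage sketch (assumption: the wrapped command takes "<input> <output>"
# argument pairs, e.g. ImageMagick's convert; file names are hypothetical):
#
#   converter.parallel_convert(["fig1.png", "fig2.png"], "tiff_figures",
#                              output_format=".tiff")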
action="store", dest="suffix", default=".gaps_removed", help="Suffix to use in output files. Default: '.gaps_removed'") parser.add_argument("-f", "--format", action="store", dest="format", default="fasta", help="Format of alignment") parser.add_argument("-v", "--verbose", action="store_true", dest="verbose", help="Print not found ids. Default - no") args = parser.parse_args() save_mkdir(args.output) for alignment_file in args.input: splited_filename = split_filename(alignment_file) if args.verbose: print("Handling %s ..." % alignment_file) output_filename = "%s%s%s%s" % (args.output, splited_filename[1], args.suffix, splited_filename[2]) alignment = AlignIO.read(alignment_file, args.format) filtered_alignment = MultipleAlignmentRoutines.remove_columns_with_gaps( alignment, args.max_gap_number, gap_symbol=args.gap_symbol) AlignIO.write(filtered_alignment, output_filename, args.format)
action="store_true", dest="convert_to_single_letter", help="Convert aminoacids to single letters") args = parser.parse_args() args.input = make_list_of_path_to_files(args.input) gene_alias_dict = SynDict() if args.gene_alias_file: gene_alias_dict.read(args.gene_alias_file, split_values=False) out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w") summary_dict = TwoLvlDict() for filename in args.input: directory, prefix, extension = split_filename(filename) if args.write_dir_path and args.write_ext: name = filename elif args.write_dir_path: name = (directory + prefix) if directory else prefix elif args.write_ext: name = prefix + extension else: name = prefix if args.suffix_to_remove in name: name = name.replace(args.suffix_to_remove, "") summary_dict[name] = OrderedDict() with open(filename, "r") as file_fd: file_fd.readline() for line in file_fd:
help="Suffix of fam files. Default: .fam") parser.add_argument("-o", "--output", action="store", dest="output", default="stdout", help="Suffix of fam files") args = parser.parse_args() out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w") species_list = [] suffix_list = [] if args.use_basename: for filename in sorted(os.listdir(args.input)): dir, basename, ext = split_filename(filename) species_list.append(basename) suffix_list.append("%s" % ext) else: species_list = sorted(args.species_set) suffix_list = [args.suffix for i in range(0, len(species_list))] out_fd.write("#species\tnumber_of_families\tnumber_of_proteins\n") for species, suffix in zip(species_list, suffix_list): fam_dict = SynDict() fam_dict.read("%s%s%s" % (args.input, species, suffix), separator="\t", split_values=True, values_separator=",", key_index=0, value_index=1)
help="Input file with SE reads") parser.add_argument("-l", "--input_left", action="store", dest="input_left", help="Input file with left reads") parser.add_argument("-r", "--input_right", action="store", dest="input_right", help="Input file with right reads") parser.add_argument("-o", "--out_dir", action="store", dest="out_dir", default="./", help="Directory to write output") parser.add_argument("-m", "--min_len", action="store", dest="min_len", type=int, default=1, help="Minimum length of read to output") args = parser.parse_args() n_regexp = re.compile("N+$") if args.input_se: se_directory, se_prefix, se_extension = split_filename(args.input_se) se_in_fd = open(args.input_se, "r") se_out_file = "%s%s.filtered%s" % (check_path(args.out_dir), se_prefix, se_extension) se_out_fd = open(se_out_file, "w") while True: name, sequence, separator, quality = read_entry(se_in_fd) if name is None: break match = n_regexp.search(sequence) if match is None: se_out_fd.write("%s\n%s\n%s\n%s\n" % (name, sequence, separator, quality)) elif match.start() >= args.min_len: se_out_fd.write("%s\n%s\n%s\n%s\n" % (name, sequence[:match.start()+1], separator, quality[:match.start()+1])) else: continue
def parallel_hmmscan(self, hmmfile, seqfile, outfile, num_of_seqs_per_scan=None,
                     split_dir="splited_fasta", splited_output_dir="splited_output_dir",
                     tblout_outfile=None, domtblout_outfile=None, pfamtblout_outfile=None,
                     splited_tblout_dir=None, splited_domtblout_dir=None,
                     splited_pfamtblout_dir=None, dont_output_alignments=False,
                     model_evalue_threshold=None, model_score_threshold=None,
                     domain_evalue_threshold=None, domain_score_threshold=None,
                     model_evalue_significant_threshold=None,
                     model_score_significant_threshold=None,
                     domain_evalue_significant_threshold=None,
                     domain_score_significant_threshold=None,
                     use_profile_GA_gathering_cutoffs_for_thresholds=False,
                     use_profile_NC_noise_cutoffs_for_thresholds=False,
                     use_profile_TC_trusted_cutoffs_for_thresholds=False,
                     turn_off_all_heruristics=False, turn_off_bias_filter=False,
                     MSV_threshold=None, Vit_threshold=None, Fwd_threshold=None,
                     turn_off_biased_composition_score_corrections=None,
                     input_format=None, threads=None, combine_output_to_single_file=True,
                     biopython_165_compartibility=False, remove_tmp_dirs=True,
                     async_run=False, external_process_pool=None):
    splited_dir = check_path(split_dir)
    splited_out_dir = check_path(splited_output_dir)
    save_mkdir(splited_dir)
    save_mkdir(splited_out_dir)
    if splited_tblout_dir:
        save_mkdir(splited_tblout_dir)
    if splited_domtblout_dir:
        save_mkdir(splited_domtblout_dir)
    if splited_pfamtblout_dir:
        save_mkdir(splited_pfamtblout_dir)

    number_of_files = num_of_seqs_per_scan if num_of_seqs_per_scan \
        else 5 * threads if threads else 5 * self.threads
    self.split_fasta(seqfile, splited_dir, num_of_files=number_of_files)
    input_list_of_files = sorted(os.listdir(splited_dir))

    list_of_files = []
    for filename in input_list_of_files:
        filename_prefix = split_filename(filename)[1]
        input_file = "%s%s" % (splited_dir, filename)
        output_file = "%s%s.hits" % (splited_out_dir, filename_prefix)
        tblout_file = "%s%s.hits" % (splited_tblout_dir,
                                     filename_prefix) if splited_tblout_dir else None
        domtblout_file = "%s%s.hits" % (splited_domtblout_dir,
                                        filename_prefix) if splited_domtblout_dir else None
        pfamtblout_file = "%s%s.hits" % (splited_pfamtblout_dir,
                                         filename_prefix) if splited_pfamtblout_dir else None
        list_of_files.append((input_file, output_file, tblout_file,
                              domtblout_file, pfamtblout_file))

    common_options = self.__parse_hmmsxxx_common_options(
        tblout=None, domtblout=None, pfamtblout=None,
        dont_output_alignments=dont_output_alignments,
        model_evalue_threshold=model_evalue_threshold,
        model_score_threshold=model_score_threshold,
        domain_evalue_threshold=domain_evalue_threshold,
        domain_score_threshold=domain_score_threshold,
        model_evalue_significant_threshold=model_evalue_significant_threshold,
        model_score_significant_threshold=model_score_significant_threshold,
        domain_evalue_significant_threshold=domain_evalue_significant_threshold,
        domain_score_significant_threshold=domain_score_significant_threshold,
        use_profile_GA_gathering_cutoffs_for_thresholds=use_profile_GA_gathering_cutoffs_for_thresholds,
        use_profile_NC_noise_cutoffs_for_thresholds=use_profile_NC_noise_cutoffs_for_thresholds,
        use_profile_TC_trusted_cutoffs_for_thresholds=use_profile_TC_trusted_cutoffs_for_thresholds,
        turn_off_all_heruristics=turn_off_all_heruristics,
        turn_off_bias_filter=turn_off_bias_filter,
        MSV_threshold=MSV_threshold, Vit_threshold=Vit_threshold,
        Fwd_threshold=Fwd_threshold,
        turn_off_biased_composition_score_corrections=turn_off_biased_composition_score_corrections)

    common_options += " --qformat %s" % input_format if input_format else ""

    options_list = []
    out_files = []
    tblout_files = []
    domtblout_files = []
    pfamtblout_files = []
    for in_file, out_filename, tblout_file, domtblout_file, pfamtblout_file in list_of_files:
        options = common_options
        options += " --tblout %s" % tblout_file if tblout_file else ""
        options += " --domtblout %s" % domtblout_file if domtblout_file else ""
        options += " --pfamtblout %s" % pfamtblout_file if pfamtblout_file else ""
        options += " -o %s" % out_filename
        options += " %s" % hmmfile
        options += " %s" % in_file
        options_list.append(options)
        out_files.append(out_filename)
        tblout_files.append(tblout_file)
        domtblout_files.append(domtblout_file)
        pfamtblout_files.append(pfamtblout_file)

    self.parallel_execute(options_list, cmd="hmmscan", threads=threads,
                          async_run=async_run,
                          external_process_pool=external_process_pool)

    if combine_output_to_single_file:
        if biopython_165_compartibility:
            CGAS.cgas(out_files,
                      sed_string="s/^Description:.*/Description: <unknown description>/",
                      output=outfile)
        else:
            CGAS.cat(out_files, output=outfile)
    if tblout_outfile:
        CGAS.cat(tblout_files, output=tblout_outfile)
    if domtblout_outfile:
        CGAS.cat(domtblout_files, output=domtblout_outfile)
    if pfamtblout_outfile:
        CGAS.cat(pfamtblout_files, output=pfamtblout_outfile)

    if remove_tmp_dirs:
        if splited_tblout_dir:
            shutil.rmtree(splited_tblout_dir)
        if splited_domtblout_dir:
            shutil.rmtree(splited_domtblout_dir)
        if splited_pfamtblout_dir:
            shutil.rmtree(splited_pfamtblout_dir)
        for tmp_dir in splited_dir, splited_out_dir:
            shutil.rmtree(tmp_dir)
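# Hedged usage sketch (assumption: an HMMER3 wrapper instance; file names are
# hypothetical):
#
#   hmmer.parallel_hmmscan("Pfam-A.hmm", "proteins.fasta", "proteins.pfam.hits",
#                          domtblout_outfile="proteins.pfam.domtblout",
#                          splited_domtblout_dir="splited_domtblout_dir/",
#                          threads=8)
#
# Per-chunk --tblout/--domtblout/--pfamtblout files are written only when the
# corresponding splited_*_dir argument is set; they are then concatenated into
# the matching *_outfile and removed if remove_tmp_dirs is True.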
def split_hmm(self, hmmfile, output_dir, num_of_recs_per_file, num_of_files=None,
              output_prefix=None, threads=4):
    try:
        os.mkdir(output_dir)
    except OSError:
        pass

    # Extract profile names from the HMM flatfile (second field of NAME lines)
    id_fd = CGAS.cgas(hmmfile, grep_pattern="NAME", whole_word_match=True,
                      awk_code="{print $2}", capture_output=True)

    split_index = 1
    ids_written = 0
    ids_list = IdList()
    ids_list.read(id_fd, close_after_if_file_object=True)
    number_of_ids = len(ids_list)

    out_prefix = split_filename(hmmfile)[1] if output_prefix is None else output_prefix
    num_of_ids = int(number_of_ids / num_of_files) + 1 if num_of_files else num_of_recs_per_file

    common_options = " -f"
    common_options += " %s" % hmmfile

    options_list = []
    while (ids_written + num_of_ids) <= number_of_ids:
        tmp_id_list = IdList(ids_list[ids_written:ids_written + num_of_ids])
        tmp_id_list.write("%s/%s_%i.ids" % (output_dir, out_prefix, split_index))
        options = common_options
        options += " %s/%s_%i.ids" % (output_dir, out_prefix, split_index)
        options += " > %s/%s_%i.hmm" % (output_dir, out_prefix, split_index)
        options_list.append(options)
        split_index += 1
        ids_written += num_of_ids

    # Handle the remainder that did not fill a whole chunk
    if ids_written != number_of_ids:
        tmp_id_list = IdList(ids_list[ids_written:])
        tmp_id_list.write("%s/%s_%i.ids" % (output_dir, out_prefix, split_index))
        options = common_options
        options += " %s/%s_%i.ids" % (output_dir, out_prefix, split_index)
        options += " > %s/%s_%i.hmm" % (output_dir, out_prefix, split_index)
        options_list.append(options)
        split_index += 1

    self.parallel_execute(options_list, cmd="hmmfetch", threads=threads)
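# Hedged usage sketch (assumption: HMMER3's hmmfetch is on PATH, since each
# generated command is "hmmfetch -f <hmmfile> <ids file> > <chunk>.hmm"):
#
#   hmmer.split_hmm("Pfam-A.hmm", "pfam_chunks", 1000)
#
# writes pfam_chunks/Pfam-A_<i>.ids and, via hmmfetch, pfam_chunks/Pfam-A_<i>.hmm
# with at most 1000 profiles each.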
""" for iteration_index in range(1, args.number_of_iterations): os.chdir(working_dir) iteration = "iteration_%i" % iteration_index iteration_dir = "%s/%s" % (working_dir, iteration) iteration_ref = "%s/%s_reference.fasta" % (iteration_dir, iteration) iteration_ref_index = "%s/%s_reference.idx" % (iteration_dir, iteration) base_prefix = "%s/%s_reference_with_rev_com" % (iteration_dir, iteration) iteration_ref_with_rev_com = "%s/%s_reference_with_rev_com.fasta" % ( iteration_dir, iteration) kmer_file = "%s_%i_mer.kmer" % (base_prefix, args.kmer_length) masurca_config_file = "masurca_%s.config" % iteration left_reads_prefix = split_filename(abs_path_left_source_reads)[1] right_reads_prefix = split_filename(abs_path_right_source_reads)[1] left_reads_se = "%s.se.fastq" % left_reads_prefix right_reads_se = "%s.se.fastq" % right_reads_prefix left_reads_filtered = "%s.filtered.fastq" % left_reads_prefix right_reads_filtered = "%s.filtered.fastq" % right_reads_prefix try: os.mkdir(iteration_dir) except OSError: pass shutil.copyfile(iteration_reference_file, iteration_ref) os.chdir(iteration_dir) iteration_reference_dict = SeqIO.index_db(iteration_ref_index,
def parallel_alignment(self, query_file, target_file, model, num_of_recs_per_file=None,
                       show_alignment=None, show_sugar=True, show_cigar=None,
                       show_vulgar=None, show_query_gff=None, show_target_gff=None,
                       store_intermediate_files=False,
                       splited_fasta_dir="splited_fasta_dir",
                       splited_result_dir="splited_output",
                       number_of_results_to_report=None, other_options=None,
                       num_of_files=None, converted_output_dir="converted_output"):
    splited_filename = split_filename(query_file)
    self.split_fasta(query_file, splited_fasta_dir,
                     num_of_recs_per_file=num_of_recs_per_file,
                     num_of_files=num_of_files, output_prefix=splited_filename[1])

    common_options = self.parse_common_options(
        model, show_alignment=show_alignment, show_sugar=show_sugar,
        show_cigar=show_cigar, show_vulgar=show_vulgar,
        show_query_gff=show_query_gff, show_target_gff=show_target_gff,
        number_of_results_to_report=number_of_results_to_report,
        other_options=other_options)

    options_list = []
    splited_files = os.listdir(splited_fasta_dir)
    save_mkdir(splited_result_dir)
    for filename in splited_files:
        filename_list = split_filename(filename)
        options = common_options
        options += " -q %s/%s" % (splited_fasta_dir, filename)
        options += " -t %s" % target_file
        options += " > %s/%s.output" % (splited_result_dir, filename_list[1])
        options_list.append(options)

    self.parallel_execute(options_list)

    if not store_intermediate_files:
        shutil.rmtree(splited_fasta_dir)
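# Hedged usage sketch (assumption: an exonerate-style wrapper whose
# parse_common_options() builds the shared flag string; file names are
# hypothetical):
#
#   aligner.parallel_alignment("transcripts.fasta", "genome.fasta", "est2genome",
#                              num_of_files=32, show_target_gff=True)
#
# Each query chunk's results land in splited_output/<chunk>.output.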
def parallel_search_tandem_repeat(self, query_file, output_prefix, matching_weight=2,
                                  mismatching_penalty=7, indel_penalty=7,
                                  match_probability=80, indel_probability=10,
                                  min_alignment_score=50, max_period=500,
                                  report_flanking_sequences=False,
                                  splited_fasta_dir="splited_fasta_dir",
                                  splited_result_dir="splited_output",
                                  converted_output_dir="converted_output",
                                  max_len_per_file=100000, store_intermediate_files=False):
    work_dir = os.getcwd()
    splited_filename = split_filename(query_file)
    self.split_fasta_by_seq_len(query_file, splited_fasta_dir,
                                max_len_per_file=max_len_per_file,
                                output_prefix=splited_filename[1])

    common_options = self.parse_common_options(
        matching_weight=matching_weight, mismatching_penalty=mismatching_penalty,
        indel_penalty=indel_penalty, match_probability=match_probability,
        indel_probability=indel_probability, min_alignment_score=min_alignment_score,
        max_period=max_period, report_flanking_sequences=report_flanking_sequences,
        make_dat_file=True)
    common_options += " -h"  # suppress html output

    options_list = []
    splited_files = os.listdir(splited_fasta_dir)
    save_mkdir(splited_result_dir)
    save_mkdir(converted_output_dir)

    # TRF writes its .dat reports to the current directory, so run from the result dir
    os.chdir(splited_result_dir)
    input_dir = splited_fasta_dir if (splited_fasta_dir[0] == "/") or (splited_fasta_dir[0] == "~") \
        else "../%s" % splited_fasta_dir
    for filename in splited_files:
        file_options = "%s/%s" % (input_dir, filename)
        file_options += common_options
        options_list.append(file_options)

    self.parallel_execute(options_list)
    os.chdir(work_dir)

    for filename in splited_files:
        trf_output_file = "%s/%s.%i.%i.%i.%i.%i.%i.%i.dat" % (
            splited_result_dir, filename, matching_weight, mismatching_penalty,
            indel_penalty, match_probability, indel_probability,
            min_alignment_score, max_period)
        self.convert_trf_report(trf_output_file,
                                "%s/%s" % (converted_output_dir, filename))

    for suffix in (".rep", ".gff", ".simple.gff", ".short.tab", ".wide.tab"):
        file_str = ""
        merged_file = "%s%s" % (output_prefix, suffix)
        for filename in splited_files:
            file_str += " %s/%s%s" % (converted_output_dir, filename, suffix)
        CGAS.cat(file_str, merged_file)

    if not store_intermediate_files:
        shutil.rmtree(splited_fasta_dir)
        shutil.rmtree(splited_result_dir)
        shutil.rmtree(converted_output_dir)
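# Hedged usage sketch (assumption: a TRF wrapper instance; the numeric defaults
# above mirror TRF's commonly recommended parameters 2 7 7 80 10 50 500):
#
#   trf.parallel_search_tandem_repeat("genome.fasta", "genome.trf",
#                                     max_len_per_file=1000000)
#
# yields merged genome.trf.rep/.gff/.simple.gff/.short.tab/.wide.tab reports.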
"--number_of_top_orfs_for_training", action="store", type=int, dest="number_of_top_orfs_for_training", help= "If no --number_of_top_orfs_for_training, top longest ORFs to train Markov " "Model (hexamer stats) (default: 500)") parser.add_argument("-c", "--hmmer_dir", action="store", dest="hmmer_dir", default="", help="Directory with hmmer v3.1 binaries") args = parser.parse_args() input_filename_list = split_filename(args.input) input_filename = input_filename_list[1] + input_filename_list[2] workdir_dir = "%s.transdecoder_dir/" % input_filename pep_from_longest_orfs = "%s/longest_orfs.pep" % workdir_dir hmmscan_dir = "hmmscan_vs_pfam/" blastp_dir = "blastp_vs_uniref/" save_mkdir(hmmscan_dir) save_mkdir(blastp_dir) hmmscan_splited_fasta_dir = "%ssplited_fasta_dir/" % hmmscan_dir splited_domtblout_dir = "%ssplited_domtblout_dir/" % hmmscan_dir hmmscan_vs_pfam_output = "%s%s.pfam.hits" % (hmmscan_dir, input_filename) domtblout_outfile = "%s%s.pfam.domtblout" % (
                    type=int, help="Format of input trees")
parser.add_argument("-o", "--output_file", action="store", dest="output_file",
                    default="stdout",
                    help="Output file with leaves of trees. Default: stdout")

args = parser.parse_args()

out_fd = sys.stdout if args.output_file == "stdout" else open(args.output_file, "w")

tree_files_list = os.listdir(args.tree_dir)
names_dict = SynDict()
for tree_file in tree_files_list:
    tree_name = split_filename(tree_file)[1]
    with open("%s%s" % (args.tree_dir, tree_file), "r") as tree_fd:
        tree = Tree(tree_fd.readline().strip(), format=args.tree_format)
    leaves_list = []
    for node in tree.traverse():
        if node.is_leaf():
            leaves_list.append(node.name)
    names_dict[tree_name] = leaves_list

names_dict.write(out_fd, splited_values=True)

if args.output_file != "stdout":
    out_fd.close()
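# Note: ete's Tree API also offers tree.get_leaf_names(), which collects the
# same set of leaf names as the explicit traversal above.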
if args.threads == 1:
    TRF.search_tandem_repeats(args.input_file,
                              matching_weight=args.matching_weight,
                              mismatching_penalty=args.mismatching_penalty,
                              indel_penalty=args.indel_penalty,
                              match_probability=args.matching_probability,
                              indel_probability=args.indel_probability,
                              min_alignment_score=args.min_score,
                              max_period=args.max_period_size,
                              report_flanking_sequences=args.report_flanking_sequences,
                              make_dat_file=True,
                              disable_html_output=args.enable_html_output)
    # TRF names its .dat report after the input file plus all numeric parameters
    trf_report = "%s.%i.%i.%i.%i.%i.%i.%i.dat" % (
        split_filename(args.input_file)[1] + split_filename(args.input_file)[2],
        args.matching_weight, args.mismatching_penalty, args.indel_penalty,
        args.matching_probability, args.indel_probability,
        args.min_score, args.max_period_size)
    TRF.convert_trf_report(trf_report, args.output_prefix)
else:
    TRF.parallel_search_tandem_repeat(args.input_file, args.output_prefix,
                                      matching_weight=args.matching_weight,
                                      mismatching_penalty=args.mismatching_penalty,
                                      indel_penalty=args.indel_penalty,
                                      match_probability=args.matching_probability,