    def index(self, genome_dir, genome_fasta, annotation_gtf=None, junction_tab_file=None, sjdboverhang=None,
              genomeSAindexNbases=None, genomeChrBinNbits=None, genome_size=None):
        FileRoutines.safe_mkdir(genome_dir)

        options = "--runMode genomeGenerate"
        options += " --genomeDir %s" % os.path.abspath(genome_dir)
        options += " --runThreadN %i" % self.threads
        options += " --genomeFastaFiles %s" % (os.path.abspath(genome_fasta) if isinstance(genome_fasta, str)
                                               else " ".join(map(os.path.abspath, genome_fasta)))
        options += " --sjdbGTFfile %s" % annotation_gtf if annotation_gtf else ""
        options += " --sjdbFileChrStartEnd %s" % junction_tab_file if junction_tab_file else ""
        # number of bases taken from both sides of splice junction. 100 by default
        options += " --sjdbOverhang %i" % sjdboverhang if sjdboverhang else ""
        # size of k-mers used for preindexing of suffix array
        if genome_size:
            options += " --genomeSAindexNbases %i" % min([14, (floor(log(genome_size, 2) / 2)) - 1])
        else:
            options += " --genomeSAindexNbases %i" % genomeSAindexNbases if genomeSAindexNbases else ""
        # padding size (log2) of reference sequences. 18 by default
        # recommended value min(18, log2(GenomeLength/NumberOfScaffolds))
        options += " --genomeChrBinNbits %i" % genomeChrBinNbits if genomeChrBinNbits else ""

        self.execute(options)
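# Illustration (not part of the wrapper above): how the two index-sizing parameters used by index() are
# derived from genome size and scaffold count. The 1.2 Mbp genome and 300 scaffolds are hypothetical values.
from math import floor, log

genome_size = 1200000            # example genome length in bp
number_of_scaffolds = 300        # example scaffold count

# --genomeSAindexNbases: min(14, log2(genome length)/2 - 1), as computed in index() above
genome_sa_index_nbases = min(14, int(floor(log(genome_size, 2) / 2)) - 1)

# --genomeChrBinNbits: min(18, log2(genome length / number of scaffolds)), per the comment in index()
genome_chr_bin_nbits = min(18, int(log(float(genome_size) / number_of_scaffolds, 2)))

print(genome_sa_index_nbases, genome_chr_bin_nbits)   # 9 11 for this example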
    def parallel_blast(self, blast_command, seqfile, database, outfile=None, blast_options=None,
                       split_dir="splited_fasta", splited_output_dir="splited_output_dir", evalue=None,
                       output_format=None, threads=None, num_of_seqs_per_scan=None,
                       combine_output_to_single_file=True, async_run=False, external_process_pool=None):
        splited_dir = FileRoutines.check_path(split_dir)
        splited_out_dir = FileRoutines.check_path(splited_output_dir)
        self.safe_mkdir(splited_dir)
        self.safe_mkdir(splited_out_dir)

        number_of_files = num_of_seqs_per_scan if num_of_seqs_per_scan else 5 * threads if threads else 5 * self.threads
        self.split_fasta(seqfile, splited_dir, num_of_files=number_of_files)
        input_list_of_files = sorted(os.listdir(splited_dir))

        list_of_files = []
        for filename in input_list_of_files:
            filename_prefix = FileRoutines.split_filename(filename)[1]
            input_file = "%s%s" % (splited_dir, filename)
            output_file = "%s%s.hits" % (splited_out_dir, filename_prefix)
            list_of_files.append((input_file, output_file))

        options_list = []
        out_files = []
        for in_file, out_filename in list_of_files:
            options = " -out %s" % out_filename
            options += " -db %s" % database
            options += " -query %s" % in_file
            options += " %s" % blast_options if blast_options else ""
            options += " -evalue %s" % evalue if evalue else ""
            options += " -outfmt %i" % output_format if output_format else ""
            options_list.append(options)
            out_files.append(out_filename)

        self.parallel_execute(options_list, cmd=blast_command, threads=threads, async_run=async_run,
                              external_process_pool=external_process_pool)

        if combine_output_to_single_file:
            CGAS.cat(out_files, output=outfile)
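# Illustration only: with blast_command="blastn", database="nt_local", evalue=1e-5 and output_format=6
# (all hypothetical values), parallel_blast() above assembles one command per FASTA chunk roughly like
#
#   blastn -out splited_output_dir/chunk_1.hits -db nt_local -query splited_fasta/chunk_1 -evalue 1e-5 -outfmt 6
#
# and the per-chunk .hits files are concatenated into `outfile` when combine_output_to_single_file is True.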
    def parallel_align(self, list_of_files, output_dir, msa_tool='prank', seq_type=None, bootstrap_number=100,
                       genetic_code=1, threads=None, msa_tool_options=None, seq_cutoff=None, col_cutoff=None,
                       mafft_bin=None, prank_bin=None, muscle_bin=None, pagan_bin=None, ruby_bin=None,
                       program=None, cmd_log_file=None, cpus_per_task=1, handling_mode="local", job_name=None,
                       log_prefix=None, error_log_prefix=None, max_jobs=None, max_running_time=None,
                       max_memory_per_node=None, max_memmory_per_cpu=None, modules_list=None,
                       environment_variables_dict=None):
        common_options = self.parse_common_options(output_dir=output_dir, msa_tool=msa_tool, seq_type=seq_type,
                                                   bootstrap_number=bootstrap_number, genetic_code=genetic_code,
                                                   threads=threads, msa_tool_options=msa_tool_options,
                                                   seq_cutoff=seq_cutoff, col_cutoff=col_cutoff,
                                                   mafft_bin=mafft_bin, prank_bin=prank_bin, muscle_bin=muscle_bin,
                                                   pagan_bin=pagan_bin, ruby_bin=ruby_bin, program=program)
        FileRoutines.safe_mkdir(output_dir)

        options_list = []
        for filename in list_of_files:
            basename = FileRoutines.split_filename(filename)[1]
            op = common_options
            op += " --seqFile %s" % filename
            op += " --dataset %s" % basename
            options_list.append(op)

        if handling_mode == "local":
            self.parallel_execute(options_list)
        elif handling_mode == "slurm":
            cmd_list = ["%s%s %s" % ((self.path + "/") if self.path else "", self.cmd, options)
                        for options in options_list]
            self.slurm_run_multiple_jobs_in_wrap_mode(cmd_list, cmd_log_file, max_jobs=max_jobs, job_name=job_name,
                                                      log_prefix=log_prefix, error_log_prefix=error_log_prefix,
                                                      cpus_per_node=None, max_running_jobs=None,
                                                      max_running_time=max_running_time, cpus_per_task=cpus_per_task,
                                                      max_memory_per_node=max_memory_per_node,
                                                      max_memmory_per_cpu=max_memmory_per_cpu,
                                                      modules_list=modules_list,
                                                      environment_variables_dict=environment_variables_dict)
    def extract_single_copy_clusters_from_files(self, list_of_cluster_files, output_file, label_elements=False,
                                                separator="@", label_position="first",
                                                function_to_convert_filename_to_label=None):
        dict_of_cluster_dicts = OrderedDict()
        for filename in list_of_cluster_files:
            if function_to_convert_filename_to_label:
                label = function_to_convert_filename_to_label(filename)
            else:
                label = FileRoutines.split_filename(filename)[1]  # use basename as label

            dict_of_cluster_dicts[label] = SynDict()
            dict_of_cluster_dicts[label].read(filename, split_values=True, comments_prefix="#")

        sc_clusters_dict = self.extract_single_copy_clusters(dict_of_cluster_dicts, label_elements=label_elements,
                                                             separator=separator, label_position=label_position)
        sc_clusters_dict.write(output_file, splited_values=True)

        return sc_clusters_dict
    def parallel_align(self, list_of_files, output_directory, output_suffix="alignment", gap_open_penalty=None,
                       offset=None, maxiterate=None, quiet=False, mode="globalpair", number_of_processes=1,
                       anysymbol=False):
        # TODO: add rest of options
        options = " --thread %i" % self.threads
        options += " --op %f" % gap_open_penalty if gap_open_penalty is not None else ""
        options += " --ep %f" % offset if offset is not None else ""
        options += " --maxiterate %i" % maxiterate if maxiterate is not None else ""
        options += " --quiet" if quiet else ""
        options += " --%s" % mode
        options += " --anysymbol" if anysymbol else ""

        options_list = []
        for filename in list_of_files:
            basename = FileRoutines.split_filename(filename)[1]
            op = options
            op += " %s" % filename
            op += " > %s/%s.fasta" % (output_directory,
                                      ("%s_%s" % (basename, output_suffix)) if output_suffix else basename)
            options_list.append(op)

        self.parallel_execute(options_list, threads=number_of_processes)
    def extract_proteins_from_alignments(dir_with_alignments, output_dir):
        out_dir = FileRoutines.check_path(output_dir)

        input_files = make_list_of_path_to_files([dir_with_alignments] if isinstance(dir_with_alignments, str)
                                                 else dir_with_alignments)

        FileRoutines.safe_mkdir(out_dir)
        from Routines import MultipleAlignmentRoutines
        for filename in input_files:
            filename_list = FileRoutines.split_filename(filename)
            output_file = "%s%s%s" % (out_dir, filename_list[1], filename_list[2])
            MultipleAlignmentRoutines.extract_sequences_from_alignment(filename, output_file)
    def read_cluster_files_from_dir(dir_with_cluster_files):
        cluster_files_list = sorted(os.listdir(dir_with_cluster_files))
        clusters_dict = OrderedDict()
        for filename in cluster_files_list:
            filepath = "%s%s" % (FileRoutines.check_path(dir_with_cluster_files), filename)
            filename_list = FileRoutines.split_filename(filepath)
            clusters_dict[filename_list[1]] = SynDict()
            clusters_dict[filename_list[1]].read(filepath, header=False, separator="\t",
                                                 allow_repeats_of_key=False, split_values=True,
                                                 values_separator=",", key_index=0, value_index=1,
                                                 comments_prefix="#")
        return clusters_dict
    def parallel_align(self, list_of_files, output_directory, output_suffix=None, tree_file=None,
                       output_format=None, show_xml=None, show_tree=None, show_ancestral_sequences=None,
                       show_evolutionary_events=None, showall=None, compute_posterior_support=None, njtree=None,
                       skip_insertions=False, codon_alignment=None, translated_alignment=None):
        common_options = self.parse_common_options(tree_file=tree_file, output_format=output_format,
                                                   show_xml=show_xml, show_tree=show_tree,
                                                   show_ancestral_sequences=show_ancestral_sequences,
                                                   show_evolutionary_events=show_evolutionary_events,
                                                   showall=showall,
                                                   compute_posterior_support=compute_posterior_support,
                                                   njtree=njtree, skip_insertions=skip_insertions,
                                                   codon_alignment=codon_alignment,
                                                   translated_alignment=translated_alignment)
        FileRoutines.safe_mkdir(output_directory)

        options_list = []
        for filename in list_of_files:
            basename = FileRoutines.split_filename(filename)[1]
            op = common_options
            op += " -d=%s" % filename
            op += " -o=%s/%s.fasta" % (output_directory,
                                       ("%s_%s" % (basename, output_suffix)) if output_suffix else basename)
            options_list.append(op)

        self.parallel_execute(options_list)
    def parallel_predict(self, species, genome_file, output, strand="both", gene_model=None, output_gff3=True,
                         other_options="", split_dir="splited_input", splited_output_dir="splited_output_dir",
                         config_dir=None, combine_output_to_single_file=True, use_softmasking=None,
                         hints_file=None, extrinsicCfgFile=None, predict_UTR=None, external_process_pool=None,
                         async_run=False, min_intron_len=None, parsing_mode="parse"):
        common_options = self.parse_options(species, genome_file="", strand=strand, gene_model=gene_model,
                                            output_gff3=output_gff3, other_options=other_options,
                                            config_dir=config_dir, use_softmasking=use_softmasking,
                                            hints_file=hints_file, extrinsicCfgFile=extrinsicCfgFile,
                                            predict_UTR=predict_UTR, min_intron_len=min_intron_len)

        splited_dir = FileRoutines.check_path(split_dir)
        splited_out_dir = FileRoutines.check_path(splited_output_dir)
        FileRoutines.safe_mkdir(splited_dir)
        FileRoutines.safe_mkdir(splited_out_dir)

        self.split_fasta_by_seq_len(genome_file, splited_dir, parsing_mode=parsing_mode)

        input_list_of_files = sorted(os.listdir(splited_dir))
        list_of_output_files = []
        options_list = []

        for filename in input_list_of_files:
            input_file = "%s%s" % (splited_dir, filename)
            output_file = "%s%s.gff" % (splited_out_dir, filename)
            list_of_output_files.append(output_file)
            options = common_options
            options += " %s" % input_file
            options += " > %s" % output_file
            options_list.append(options)

        self.parallel_execute(options_list, external_process_pool=external_process_pool, async_run=async_run)

        if combine_output_to_single_file:
            CGAS.cat(list_of_output_files, output=output)
    def extract_proteins_from_selected_families(families_id_file, fam_file, pep_file, output_dir="./",
                                                pep_format="fasta", out_prefix=None,
                                                create_dir_for_each_family=False):
        from Routines import SequenceRoutines, FileRoutines

        fam_id_list = IdList()
        fam_dict = SynDict()
        # print(pep_file)
        FileRoutines.safe_mkdir(output_dir)
        out_dir = FileRoutines.check_path(output_dir)
        create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
        if families_id_file:
            fam_id_list.read(families_id_file)
        fam_dict.read(fam_file, split_values=True, values_separator=",")
        protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)

        for fam_id in fam_id_list if families_id_file else fam_dict:
            if fam_id in fam_dict:
                if create_directory_for_each_family:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    FileRoutines.safe_mkdir(fam_dir)
                    out_file = "%s%s.pep" % (fam_dir, out_prefix if out_prefix else fam_id)
                else:
                    out_file = "%s/%s.pep" % (out_dir, out_prefix if out_prefix else fam_id)

                SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict, fam_dict[fam_id], verbose=True),
                            out_file, format=pep_format)
            else:
                print("%s was not found" % fam_id)

        os.remove("tmp.idx")
    def extract_sequences_from_selected_clusters(self, clusters_id_file, cluster_file, seq_file, output_dir="./",
                                                 seq_format="fasta", out_prefix=None,
                                                 create_dir_for_each_cluster=False,
                                                 skip_cluster_if_no_sequence_for_element=True):
        from Routines import SequenceRoutines, FileRoutines

        cluster_id_list = IdList()
        cluster_dict = SynDict()
        # print(pep_file)
        FileRoutines.safe_mkdir(output_dir)
        out_dir = FileRoutines.check_path(output_dir)
        create_directory_for_each_cluster = True if out_prefix else create_dir_for_each_cluster
        if clusters_id_file:
            cluster_id_list.read(clusters_id_file)
        cluster_dict.read(cluster_file, split_values=True, values_separator=",")
        protein_dict = SeqIO.index_db("tmp.idx", FileRoutines.make_list_of_path_to_files(seq_file),
                                      format=seq_format)

        number_of_skipped_clusters = 0
        for fam_id in cluster_id_list if clusters_id_file else cluster_dict:
            if skip_cluster_if_no_sequence_for_element:
                absent_elements = self.check_absence_of_cluster_elements(cluster_dict[fam_id], protein_dict)
                if absent_elements:
                    print("Skipping cluster %s due to absent element(s): %s" % (fam_id, ",".join(absent_elements)))
                    number_of_skipped_clusters += 1
                    continue

            if fam_id in cluster_dict:
                if create_directory_for_each_cluster:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    FileRoutines.safe_mkdir(fam_dir)
                    out_file = "%s%s.fasta" % (fam_dir, out_prefix if out_prefix else fam_id)
                else:
                    out_file = "%s/%s.fasta" % (out_dir, out_prefix if out_prefix else fam_id)

                SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict, cluster_dict[fam_id],
                                                                    verbose=True),
                            out_file, format=seq_format)

        os.remove("tmp.idx")
        print("%i of %i clusters were skipped due to absent elements" % (number_of_skipped_clusters,
                                                                         len(cluster_dict)))
        return number_of_skipped_clusters
    def split_proteins_per_species(dir_with_proteins, output_dir, input_format="fasta", output_format="fasta"):
        input_files = FileRoutines.make_list_of_path_to_files([dir_with_proteins] if isinstance(dir_with_proteins, str)
                                                              else dir_with_proteins)

        out_dir = FileRoutines.check_path(output_dir)
        FileRoutines.safe_mkdir(out_dir)

        protein_dict = SeqIO.index_db("temp.idx", input_files, format=input_format)

        syn_dict = SynDict()
        for protein_id in protein_dict:
            taxa_id = protein_id.split(".")[0]
            # pep_id = ".".join(tmp_list[1:])
            if taxa_id not in syn_dict:
                syn_dict[taxa_id] = []
            syn_dict[taxa_id].append(protein_id)

        def renamed_records_generator(record_dict, taxa_id):
            for record_id in syn_dict[taxa_id]:
                record = deepcopy(record_dict[record_id])
                # print(record)
                record.id = ".".join(record_id.split(".")[1:])
                yield record

        for taxa_id in syn_dict:
            out_file = "%s%s.pep" % (out_dir, taxa_id)
            SeqIO.write(renamed_records_generator(protein_dict, taxa_id), out_file, format=output_format)
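# Illustration only: split_proteins_per_species() assumes record ids of the form "<taxon>.<protein_id>",
# e.g. a hypothetical id "canis_lupus.XP_0001". The grouping and renaming above reduce to:
protein_id = "canis_lupus.XP_0001"
taxa_id = protein_id.split(".")[0]                  # "canis_lupus" -> records collected into canis_lupus.pep
renamed_id = ".".join(protein_id.split(".")[1:])    # "XP_0001" -> id written inside that file
print(taxa_id, renamed_id)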
    def parallel_positive_selection_test(self, in_dir, tree_file, out_dir, results_file, seq_type="codons",
                                         codon_frequency="F3X4", noisy=3, verbose="concise", runmode=0, clock=0,
                                         aminoacid_distance=None, genetic_code=0, fix_kappa=False, kappa=5,
                                         getSE=0, RateAncestor=0, small_difference=0.000001, clean_data=True,
                                         method=0):
        """
        Implements the positive selection test (branch-site model) for branches labeled in the tree file,
        comparing Model A against Model A null (omega fixed to 1).
        """
        FileRoutines.safe_mkdir(out_dir)
        alignment_files_list = FileRoutines.make_list_of_path_to_files(in_dir)
        tree_file_abs_path = os.path.abspath(tree_file)
        options_list = []
        dir_list = []
        basename_dir_list = []
        model_list = ["Model_A", "Model_A_null"]
        fix_omega_dict = {"Model_A": False, "Model_A_null": True}

        for filename in alignment_files_list:
            directory, basename, extension = FileRoutines.split_filename(filename)
            filename_out_dir = os.path.abspath("%s/%s/" % (out_dir, basename))
            basename_dir_list.append(basename)
            FileRoutines.safe_mkdir(filename_out_dir)

            for model in model_list:
                model_dir = "%s/%s/" % (filename_out_dir, model)
                FileRoutines.safe_mkdir(model_dir)
                out_file = "%s/%s/%s.out" % (filename_out_dir, model, basename)
                ctl_file = "%s/%s/%s.ctl" % (filename_out_dir, model, basename)

                options_list.append("%s.ctl" % basename)
                dir_list.append(model_dir)

                self.generate_ctl_file(os.path.abspath(filename), tree_file_abs_path, out_file, ctl_file,
                                       seq_type=seq_type, codon_frequency=codon_frequency, noisy=noisy,
                                       verbose=verbose, runmode=runmode, clock=clock,
                                       aminoacid_distance=aminoacid_distance, model=2, nssites=2,
                                       genetic_code=genetic_code, fix_kappa=fix_kappa, kappa=kappa,
                                       fix_omega=fix_omega_dict[model], omega=1, getSE=getSE,
                                       RateAncestor=RateAncestor, Mgene=0, small_difference=small_difference,
                                       clean_data=clean_data, method=method)

        self.parallel_execute(options_list, dir_list=dir_list)

        results_dict = OrderedDict()
        double_delta_dict = OrderedDict()
        raw_pvalues_dict = OrderedDict()
        raw_pvalues_list = []

        for basename in basename_dir_list:
            results_dict[basename] = OrderedDict()
            for model in model_list:
                output_file = "%s/%s/%s/%s.out" % (out_dir, basename, model, basename)
                codeml_report = CodeMLReport(output_file)
                results_dict[basename][model] = codeml_report.LnL

        skipped_genes_set = set()
        for basename in basename_dir_list:
            for model in model_list:
                if results_dict[basename][model] is None:
                    print("LnL was not calculated for %s" % basename)
                    skipped_genes_set.add(basename)
                    break
            else:
                doubled_delta = 2 * (results_dict[basename]["Model_A"] - results_dict[basename]["Model_A_null"])
                p_value = chisqprob(doubled_delta, 1)  # degrees of freedom = 1

                double_delta_dict[basename] = doubled_delta
                raw_pvalues_dict[basename] = p_value
                raw_pvalues_list.append(p_value)

        adjusted_pvalues_list = fdrcorrection0(raw_pvalues_list)[1]
        # print adjusted_pvalues_list
        i = 0
        with open(results_file, "w") as out_fd:
            out_fd.write("id\tmodel_a_null,LnL\tmodel_a,LnL\t2*delta\traw p-value\tadjusted p-value\n")
            for basename in basename_dir_list:
                for model in model_list:
                    if results_dict[basename][model] is None:
                        print("LnL was not calculated for %s" % basename)
                        break
                else:
                    # doubled_delta = 2 * (results_dict[basename]["Model_A"] - results_dict[basename]["Model_A_null"])
                    # p_value = chisqprob(doubled_delta, 1)  # degrees of freedom = 1
                    # print basename, results_dict[basename]["Model_A_null"], results_dict[basename]["Model_A"],
                    #       double_delta_dict[basename], raw_pvalues_dict[basename], adjusted_pvalues_list[i]
                    out_fd.write("%s\t%f\t%f\t%f\t%f\t%f\n" % (basename,
                                                               results_dict[basename]["Model_A_null"],
                                                               results_dict[basename]["Model_A"],
                                                               double_delta_dict[basename],
                                                               raw_pvalues_dict[basename],
                                                               adjusted_pvalues_list[i]))
                    i += 1
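# Standalone sketch of the statistics used by parallel_positive_selection_test() above, with made-up LnL values.
# scipy.stats.chi2.sf is used here instead of the deprecated chisqprob, and statsmodels' fdrcorrection instead of
# the older fdrcorrection0; both should give equivalent results.
from scipy.stats import chi2
from statsmodels.stats.multitest import fdrcorrection

lnl_model_a, lnl_model_a_null = -4521.37, -4525.90       # hypothetical log-likelihoods for one gene
doubled_delta = 2 * (lnl_model_a - lnl_model_a_null)     # likelihood-ratio statistic, 2*delta LnL = 9.06
raw_p_value = chi2.sf(doubled_delta, 1)                  # one degree of freedom, p ~ 0.0026

# Benjamini-Hochberg FDR correction over all tested genes, as in the results table written above
adjusted_p_values = fdrcorrection([raw_p_value, 0.04, 0.30])[1]
print(doubled_delta, raw_p_value, adjusted_p_values)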
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from Routines import MultipleAlignmentRoutines, FileRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    type=lambda s: FileRoutines.make_list_of_path_to_files(s.split(",")),
                    help="Comma-separated list of files/directories with alignments")
parser.add_argument("-o", "--output", action="store", dest="output", required=True,
                    help="File to write merged alignment")
parser.add_argument("-c", "--coordinates_file", action="store", dest="coords_file", required=True,
                    help="File to write coordinates of alignments within the merged alignment")
                    action="store", dest="max_memory_per_thread", default="1G",
                    help="Maximum memory per thread. Default - 1G")

args = parser.parse_args()

if args.prepare_bam and ((not args.prepared_bam_prefix) or (not args.temp_dir)):
    raise ValueError("Options -e/--prepared_bam_prefix and -m/--temp_dir must be set if -p/--prepare_bam option is used")

SamtoolsV1.threads = args.threads

if args.prepare_bam or args.mix_ends:
    FileRoutines.safe_mkdir(FileRoutines.check_path(args.temp_dir))
    prepared_pe_bam_file = "%s.bam" % args.prepared_bam_prefix
    prepared_unpaired_bam_file = ("%s.unpaired.bam" % args.prepared_bam_prefix) if args.mix_ends else None
    """
    SamtoolsV1.prepare_bam_for_read_extraction(args.input, args.prepared_bam, temp_file_prefix=args.temp_dir,
                                               max_memory_per_thread=args.max_memory_per_thread)
    """
    SamtoolsV1.prepare_bam_for_read_extraction(args.input, prepared_pe_bam_file, temp_file_prefix=args.temp_dir,
                                               max_memory_per_thread=args.max_memory_per_thread,
                                               bam_file_to_write_unpaired_reads=prepared_unpaired_bam_file)
if args.paired:
parser.add_argument("-i", "--input_file_list", action="store", dest="input", required=True,
                    type=FileRoutines.make_list_of_path_to_files_from_string,
                    help="Comma-separated list of input files/directories with sequences")
parser.add_argument("-o", "--output_directory", action="store", dest="output", type=FileRoutines.check_path,
                    help="Directory to output groups of sequences")
parser.add_argument("-f", "--format", action="store", dest="format", default="fasta",
                    help="Format of input and output files. Allowed formats: genbank, fasta (default)")
parser.add_argument("-e", "--extension", action="store", dest="extension",
                    help="Extension of output files. Default: equal to -f")
parser.add_argument("-d", "--id_file", action="store", dest="id_file",
                    help="File with groups of sequences to extract (.fam file)")

args = parser.parse_args()

FileRoutines.safe_mkdir(args.output)
args.extension = args.extension if args.extension else args.format
tmp_index_file = "temp.idx"

# id_list = read_ids(args.id_file)
id_list = IdSet(filename=args.id_file)

sequence_groups_id = SynDict()
sequence_groups_id.read(args.id_file, split_values=True)
# print("Parsing %s..." % args.input_file)

sequence_dict = SeqIO.index_db(tmp_index_file, args.input, format=args.format)
for group in sequence_groups_id:
    SeqIO.write(SequenceRoutines.record_by_id_generator(sequence_dict, sequence_groups_id[group], verbose=True),
                "%s%s.%s" % (args.output, group, args.extension), format=args.format)
parser.add_argument("--indel_InbreedingCoeff", action="store", dest="indel_InbreedingCoeff", type=float,
                    default=-0.8, help="Indel InbreedingCoeff threshold. Default - -0.8")
parser.add_argument("--indel_FS", action="store", dest="indel_FS", type=float, default=200.0,
                    help="Indel FS threshold. Default - 200.0")

args = parser.parse_args()

VariantFiltration.jar_path = FileRoutines.check_path(args.gatk_dir)
VariantFiltration.filter_bad_variants(args.reference, args.input_vcf, args.output_prefix,
                                      snp_filter_name=args.snp_filter_name, snp_QD=args.snp_QD,
                                      snp_FS=args.snp_FS, snp_MQ=args.snp_MQ,
                                      snp_HaplotypeScore=args.snp_HaplotypeScore,
                                      snp_MappingQualityRankSum=args.snp_MappingQualityRankSum,
                                      snp_ReadPosRankSum=args.snp_ReadPosRankSum,
                                      indel_filter_name=args.indel_filter_name, indel_QD=args.indel_QD,
                                      indel_ReadPosRankSum=args.indel_ReadPosRankSum,
    def parallel_align(self, list_of_files, output_directory, output_suffix=None, tree_file=None,
                       output_format=None, show_xml=None, show_tree=None, show_ancestral_sequences=None,
                       show_evolutionary_events=None, showall=None, compute_posterior_support=None, njtree=None,
                       skip_insertions=False, codon_alignment=None, translated_alignment=None, cmd_log_file=None,
                       cpus_per_task=1, handling_mode="local", job_name=None, log_prefix=None,
                       error_log_prefix=None, max_jobs=None, max_running_time=None, max_memory_per_node=None,
                       max_memmory_per_cpu=None, modules_list=None, environment_variables_dict=None):
        common_options = self.parse_common_options(tree_file=tree_file, output_format=output_format,
                                                   show_xml=show_xml, show_tree=show_tree,
                                                   show_ancestral_sequences=show_ancestral_sequences,
                                                   show_evolutionary_events=show_evolutionary_events,
                                                   showall=showall,
                                                   compute_posterior_support=compute_posterior_support,
                                                   njtree=njtree, skip_insertions=skip_insertions,
                                                   codon_alignment=codon_alignment,
                                                   translated_alignment=translated_alignment)
        FileRoutines.safe_mkdir(output_directory)

        options_list = []
        for filename in list_of_files:
            basename = FileRoutines.split_filename(filename)[1]
            op = common_options
            op += " -d=%s" % filename
            op += " -o=%s/%s.fasta" % (output_directory,
                                       ("%s_%s" % (basename, output_suffix)) if output_suffix else basename)
            options_list.append(op)

        if handling_mode == "local":
            self.parallel_execute(options_list)
        elif handling_mode == "slurm":
            cmd_list = ["%s%s %s" % ((self.path + "/") if self.path else "", self.cmd, options)
                        for options in options_list]
            self.slurm_run_multiple_jobs_in_wrap_mode(cmd_list, cmd_log_file, max_jobs=max_jobs, job_name=job_name,
                                                      log_prefix=log_prefix, error_log_prefix=error_log_prefix,
                                                      cpus_per_node=None, max_running_jobs=None,
                                                      max_running_time=max_running_time, cpus_per_task=cpus_per_task,
                                                      max_memory_per_node=max_memory_per_node,
                                                      max_memmory_per_cpu=max_memmory_per_cpu,
                                                      modules_list=modules_list,
                                                      environment_variables_dict=environment_variables_dict)
    def parallel_search_tandem_repeat(self, query_file, output_prefix, matching_weight=2, mismatching_penalty=7,
                                      indel_penalty=7, match_probability=80, indel_probability=10,
                                      min_alignment_score=50, max_period=500, report_flanking_sequences=False,
                                      splited_fasta_dir="splited_fasta_dir", splited_result_dir="splited_output",
                                      converted_output_dir="converted_output", max_len_per_file=100000,
                                      store_intermediate_files=False):
        work_dir = os.getcwd()
        splited_filename = FileRoutines.split_filename(query_file)
        self.split_fasta_by_seq_len(query_file, splited_fasta_dir, max_len_per_file=max_len_per_file,
                                    output_prefix=splited_filename[1])

        common_options = self.parse_common_options(matching_weight=matching_weight,
                                                   mismatching_penalty=mismatching_penalty,
                                                   indel_penalty=indel_penalty,
                                                   match_probability=match_probability,
                                                   indel_probability=indel_probability,
                                                   min_alignment_score=min_alignment_score,
                                                   max_period=max_period,
                                                   report_flanking_sequences=report_flanking_sequences,
                                                   make_dat_file=True)
        common_options += " -h"  # suppress html output
        options_list = []
        splited_files = os.listdir(splited_fasta_dir)

        FileRoutines.safe_mkdir(splited_result_dir)
        FileRoutines.safe_mkdir(converted_output_dir)
        os.chdir(splited_result_dir)
        input_dir = splited_fasta_dir if (splited_fasta_dir[0] == "/") or (splited_fasta_dir[0] == "~") \
            else "../%s" % splited_fasta_dir

        for filename in splited_files:
            file_options = "%s/%s" % (input_dir, filename)
            file_options += common_options
            options_list.append(file_options)

        self.parallel_execute(options_list)

        os.chdir(work_dir)
        trf_output_file_list = []
        for filename in splited_files:
            trf_output_file = "%s/%s.%i.%i.%i.%i.%i.%i.%i.dat" % (splited_result_dir, filename, matching_weight,
                                                                  mismatching_penalty, indel_penalty,
                                                                  match_probability, indel_probability,
                                                                  min_alignment_score, max_period)
            trf_output_file_list.append(trf_output_file)

        trf_report = self.convert_trf_report(trf_output_file_list, output_prefix)
        """
        for suffix in (".rep", ".gff", ".simple.gff", ".short.tab", ".wide.tab", ".with_rep_seqs.gff", ".fasta"):
            file_str = ""
            merged_file = "%s%s" % (output_prefix, suffix)
            for filename in splited_files:
                file_str += " %s/%s%s" % (converted_output_dir, filename, suffix)
            CGAS.cat(file_str, merged_file)
        """
        if not store_intermediate_files:
            shutil.rmtree(splited_fasta_dir)
            shutil.rmtree(splited_result_dir)
            shutil.rmtree(converted_output_dir)

        return trf_report
STAR.path = args.star_dir

if args.genome_fasta:
    STAR.index(args.genome_dir, args.genome_fasta, annotation_gtf=args.annotation_gtf,
               junction_tab_file=args.junction_tab_file, sjdboverhang=None, genomeSAindexNbases=None,
               genomeChrBinNbits=None, genome_size=args.genome_size)

sample_list = args.samples if args.samples else Pipeline.get_sample_list(args.samples_dir)

FileRoutines.safe_mkdir(args.output_dir)

for sample in sample_list:
    print("Handling %s" % sample)
    sample_dir = "%s/%s/" % (args.samples_dir, sample)
    alignment_sample_dir = "%s/%s/" % (args.output_dir, sample)
    FileRoutines.safe_mkdir(alignment_sample_dir)
    filetypes, forward_files, reverse_files = FileRoutines.make_lists_forward_and_reverse_files(sample_dir)

    print("\tAligning reads...")

    STAR.align(args.genome_dir, forward_files, reverse_read_list=reverse_files,
AUGUSTUS.extract_CDS_annotations_from_output(final_gff, final_CDS_gff)

for stat_file in output_evidence_stats, output_supported_stats, \
                 output_swissprot_pfam_or_hints_supported_transcripts_longest_pep_evidence, \
                 output_swissprot_pfam_and_hints_supported_transcripts_longest_pep_evidence, \
                 output_swissprot_pfam_or_hints_supported_transcripts_evidence, \
                 output_swissprot_pfam_and_hints_supported_transcripts_evidence:

    MatplotlibRoutines.percent_histogram_from_file(stat_file, stat_file, data_type=None, column_list=(2,),
                                                   comments="#", n_bins=20,
                                                   title="Transcript support by hints",
                                                   extensions=("png", "svg"), legend_location="upper center",
                                                   stats_as_legend=True)

if args.pfam_db and args.swissprot_db:
    db_or_hints_dir = "supported_by_db_or_hints/"
    db_and_hints_dir = "supported_by_db_and_hints/"
    for directory in db_and_hints_dir, db_or_hints_dir:
        FileRoutines.safe_mkdir(directory)

    os.system("mv %s.supported.transcripts.swissprot_or_pfam_or_hints* %s" % (args.output, db_or_hints_dir))
    os.system("mv %s.supported.transcripts.swissprot_or_pfam_and_hints* %s" % (args.output, db_and_hints_dir))
    def __init__(self):
        FileRoutines.__init__(self)
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from Routines import MultipleAlignmentRoutines, FileRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    type=lambda s: FileRoutines.make_list_of_path_to_files(s.split(",")),
                    help="Comma-separated list of files/directories with alignments")
parser.add_argument("-o", "--output_directory", action="store", dest="output_dir", default="./",
                    help="Output directory to write resulting files. Default - current directory")
parser.add_argument("-f", "--format", action="store", dest="format", default="fasta",
                    help="Format of alignments")

                    "--output", action="store", dest="output", required=True,
                    help="File to write single-copy clusters")
parser.add_argument("-p", "--label_position", action="store", dest="label_position", default="first",
                    help="Position of label. Allowed - first, last. Default - first")
parser.add_argument("-s", "--separator", action="store", dest="separator", default="@",
                    help="Separator to use. Default - '@'")

args = parser.parse_args()

list_of_cluster_files = FileRoutines.make_list_of_path_to_files(args.input)

single_copy_clusters = SequenceClusterRoutines.extract_single_copy_clusters_from_files(list_of_cluster_files,
                                                                                       args.output,
                                                                                       label_elements=args.label,
                                                                                       separator=args.separator,
                                                                                       label_position=args.label_position)
print("Found %i single-copy clusters" % len(single_copy_clusters))

                    "--output", action="store", dest="output", required=True,
                    help="File to write clusters with labeled elements")
parser.add_argument("-p", "--label_position", action="store", dest="label_position", default="first",
                    help="Position of label. Allowed - first, last. Default - first")
parser.add_argument("-s", "--separator", action="store", dest="separator", default="@",
                    help="Separator to use. Default - '@'")

args = parser.parse_args()

label = args.label if args.label else FileRoutines.split_filename(args.cluster_file)[1]

SequenceClusterRoutines.label_cluster_elements_from_file(args.cluster_file, label, args.output,
                                                         separator=args.separator,
                                                         label_position=args.label_position)
    def align_samples(self, samples_dir, output_dir, genome_dir, genome_fasta=None, samples=None,
                      annotation_gtf=None, sjdboverhang=None, genomeSAindexNbases=None, genomeChrBinNbits=None,
                      genome_size=None, feature_from_gtf_to_use_as_exon=None, exon_tag_to_use_as_transcript_id=None,
                      exon_tag_to_use_as_gene_id=None, length_of_sequences_flanking_junction=None,
                      junction_tab_file_list=None, three_prime_trim=None, five_prime_trim=None,
                      adapter_seq_for_three_prime_clip=None, max_mismatch_percent_for_adapter_trimming=None,
                      three_prime_trim_after_adapter_clip=None, output_type="BAM", sort_bam=True,
                      max_memory_for_bam_sorting=8000000000, include_unmapped_reads_in_bam=True,
                      output_unmapped_reads=True, two_pass_mode=True, max_intron_length=None):
        # STAR.threads = threads
        # STAR.path = star_dir

        if genome_fasta:
            STAR.index(genome_dir, genome_fasta, annotation_gtf=annotation_gtf,
                       junction_tab_file=junction_tab_file_list, sjdboverhang=sjdboverhang,
                       genomeSAindexNbases=genomeSAindexNbases, genomeChrBinNbits=genomeChrBinNbits,
                       genome_size=genome_size)

        sample_list = samples if samples else self.get_sample_list(samples_dir)

        FileRoutines.safe_mkdir(output_dir)

        for sample in sample_list:
            print("Handling %s" % sample)
            sample_dir = "%s/%s/" % (samples_dir, sample)
            alignment_sample_dir = "%s/%s/" % (output_dir, sample)
            FileRoutines.safe_mkdir(alignment_sample_dir)
            filetypes, forward_files, reverse_files = FileRoutines.make_lists_forward_and_reverse_files(sample_dir)

            print("\tAligning reads...")

            STAR.align(genome_dir, forward_files, reverse_read_list=reverse_files,
                       annotation_gtf=annotation_gtf if not genome_fasta else None,
                       feature_from_gtf_to_use_as_exon=feature_from_gtf_to_use_as_exon,
                       exon_tag_to_use_as_transcript_id=exon_tag_to_use_as_transcript_id,
                       exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
                       length_of_sequences_flanking_junction=length_of_sequences_flanking_junction,
                       junction_tab_file_list=junction_tab_file_list,
                       three_prime_trim=three_prime_trim, five_prime_trim=five_prime_trim,
                       adapter_seq_for_three_prime_clip=adapter_seq_for_three_prime_clip,
                       max_mismatch_percent_for_adapter_trimming=max_mismatch_percent_for_adapter_trimming,
                       three_prime_trim_after_adapter_clip=three_prime_trim_after_adapter_clip,
                       output_type=output_type, sort_bam=sort_bam,
                       max_memory_for_bam_sorting=max_memory_for_bam_sorting,
                       include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
                       output_unmapped_reads=output_unmapped_reads,
                       output_dir=alignment_sample_dir,
                       two_pass_mode=two_pass_mode, max_intron_length=max_intron_length)

            print("\tIndexing bam file...")
            resulting_bam_file = "%s/Aligned.sortedByCoord.out.bam" % alignment_sample_dir
            SamtoolsV1.index(resulting_bam_file)
    def parallel_codeml(self, in_dir, tree_file, out_dir, seq_type="codons", codon_frequency="F3X4", noisy=0,
                        verbose="concise", runmode=0, clock=0, aminoacid_distance=None, model=1, nssites=0,
                        genetic_code=0, fix_kappa=False, kappa=5, fix_omega=False, omega=0.2, getSE=0,
                        RateAncestor=0, small_difference=0.000001, clean_data=True, method=0, Mgene=None):
        FileRoutines.safe_mkdir(out_dir)
        alignment_files_list = FileRoutines.make_list_of_path_to_files(in_dir)
        tree_file_abs_path = os.path.abspath(tree_file)
        options_list = []
        dir_list = []

        for filename in alignment_files_list:
            directory, basename, extension = FileRoutines.split_filename(filename)
            filename_out_dir = os.path.abspath("%s/%s/" % (out_dir, basename))
            out_file = "%s/%s.out" % (filename_out_dir, basename)
            ctl_file = "%s/%s.ctl" % (filename_out_dir, basename)

            options_list.append(ctl_file)
            dir_list.append(filename_out_dir)

            FileRoutines.safe_mkdir(filename_out_dir)
            self.generate_ctl_file(os.path.abspath(filename), tree_file_abs_path, out_file, ctl_file,
                                   seq_type=seq_type, codon_frequency=codon_frequency, noisy=noisy, verbose=verbose,
                                   runmode=runmode, clock=clock, aminoacid_distance=aminoacid_distance,
                                   model=model, nssites=nssites, genetic_code=genetic_code, fix_kappa=fix_kappa,
                                   kappa=kappa, fix_omega=fix_omega, omega=omega, getSE=getSE,
                                   RateAncestor=RateAncestor, Mgene=Mgene, small_difference=small_difference,
                                   clean_data=clean_data, method=method)

        self.parallel_execute(options_list, dir_list=dir_list)
                    action="store_true", dest="header", default=False,
                    help="Set if header is present in input file")
parser.add_argument("-u", "--use_column_value_as_prefix", action="store_true", dest="use_column_value_as_prefix",
                    default=False, help="Use column value as prefix for output files")
parser.add_argument("-r", "--sorted_input", action="store_true", dest="sorted_input", default=False,
                    help="Input file is sorted. Set to reduce the number of simultaneously opened files")

args = parser.parse_args()

FileRoutines.split_by_column(args.input_file, args.column_number, separator=args.separator, header=args.header,
                             outfile_prefix=args.output_prefix,
                             use_column_value_as_prefix=args.use_column_value_as_prefix,
                             sorted_input=args.sorted_input)
parser.add_argument("-i", "--input_vcf", action="store", dest="input_vcf", required=True,
                    help="Input vcf file")
parser.add_argument("-o", "--output_vcf", action="store", dest="output_vcf", required=True,
                    help="Output vcf file")
parser.add_argument("-r", "--reference", action="store", dest="reference", required=True,
                    help="Fasta with reference genome")
parser.add_argument("-g", "--gatk_directory", action="store", dest="gatk_dir", default="",
                    help="Directory with GATK jar")

args = parser.parse_args()

SelectVariants.jar_path = FileRoutines.check_path(args.gatk_dir)
SelectVariants.remove_entries_with_filters(args.reference, args.input_vcf, args.output_vcf)
    def extract_sequences_by_clusters(self, dir_with_cluster_files, dir_with_sequence_files, output_dir,
                                      file_with_white_list_cluster_ids=None, mode="families",
                                      sequence_file_extension="fasta", sequence_file_format="fasta",
                                      label_species=False, separator_for_labeling="@", species_label_first=True):
        """
        Basenames of cluster and sequence files must be the same.

        mode: 'families' - write the sequences of each cluster to a separate file,
              'species' - write the sequences of each species to a separate file
        """
        white_list_ids = None
        if file_with_white_list_cluster_ids:
            white_list_ids = IdSet()
            white_list_ids.read(file_with_white_list_cluster_ids)

        clusters_dict = self.read_cluster_files_from_dir(dir_with_cluster_files)
        cluster_names = self.get_cluster_names(clusters_dict, white_list_ids=white_list_ids)

        sequence_super_dict = OrderedDict()
        out_dir = FileRoutines.check_path(output_dir)

        for species in clusters_dict:
            idx_file = "%s_tmp.idx" % species
            sequence_file = "%s%s.%s" % (FileRoutines.check_path(dir_with_sequence_files), species,
                                         sequence_file_extension)
            sequence_super_dict[species] = SeqIO.index_db(idx_file, sequence_file, format=sequence_file_format)

        if mode == "species":
            sequence_names = self.get_sequence_names(clusters_dict, write_ids=False, out_prefix=None,
                                                     white_list_ids=white_list_ids)
            for species in sequence_names:
                out_file = "%s%s.%s" % (out_dir, species, sequence_file_extension)
                SeqIO.write(SequenceRoutines.record_by_id_generator(sequence_super_dict[species],
                                                                    sequence_names[species]),
                            out_file, format=sequence_file_format)
        elif mode == "families":
            def per_family_record_generator(seq_super_dict, clust_dict, cluster_id):
                if species_label_first:
                    label_sequence = lambda label, name: "%s%s%s" % (label, separator_for_labeling, name)
                else:
                    label_sequence = lambda label, name: "%s%s%s" % (name, separator_for_labeling, label)

                for species in seq_super_dict:
                    # print species, cluster_id
                    for record_id in clust_dict[species][cluster_id]:
                        if label_species:
                            record = deepcopy(seq_super_dict[species][record_id])
                            record.id = label_sequence(species, record_id)
                            yield record
                        else:
                            yield seq_super_dict[species][record_id]

            for cluster_name in cluster_names:
                out_file = "%s%s.%s" % (out_dir, cluster_name, sequence_file_extension)
                SeqIO.write(per_family_record_generator(sequence_super_dict, clusters_dict, cluster_name),
                            out_file, format=sequence_file_format)

        for species in clusters_dict:
            os.remove("%s_tmp.idx" % species)
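# Illustration only: how label_species/species_label_first/separator_for_labeling interact in
# per_family_record_generator() above, using a hypothetical species "felis_catus" and record id "PROT001".
species, record_id, separator = "felis_catus", "PROT001", "@"
print("%s%s%s" % (species, separator, record_id))   # species_label_first=True  -> "felis_catus@PROT001"
print("%s%s%s" % (record_id, separator, species))   # species_label_first=False -> "PROT001@felis_catus"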