def extract_sequences_from_selected_clusters(self, clusters_id_file, cluster_file, seq_file,
                                             output_dir="./", seq_format="fasta", out_prefix=None,
                                             create_dir_for_each_cluster=False,
                                             skip_cluster_if_no_sequence_for_element=True):
    from Routines import SequenceRoutines, FileRoutines

    cluster_id_list = IdList()
    cluster_dict = SynDict()

    FileRoutines.safe_mkdir(output_dir)
    out_dir = FileRoutines.check_path(output_dir)
    # a shared out_prefix forces per-cluster directories, otherwise output files would overwrite each other
    create_directory_for_each_cluster = True if out_prefix else create_dir_for_each_cluster
    if clusters_id_file:
        cluster_id_list.read(clusters_id_file)
    cluster_dict.read(cluster_file, split_values=True, values_separator=",")
    protein_dict = SeqIO.index_db("tmp.idx",
                                  FileRoutines.make_list_of_path_to_files(seq_file),
                                  format=seq_format)

    number_of_skipped_clusters = 0
    for fam_id in cluster_id_list if clusters_id_file else cluster_dict:
        if skip_cluster_if_no_sequence_for_element:
            absent_elements = self.check_absence_of_cluster_elements(cluster_dict[fam_id],
                                                                     protein_dict)
            if absent_elements:
                print("Skipping cluster %s due to absent element(s): %s" % (fam_id,
                                                                            ",".join(absent_elements)))
                number_of_skipped_clusters += 1
                continue

        if fam_id in cluster_dict:
            if create_directory_for_each_cluster:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                FileRoutines.safe_mkdir(fam_dir)
                out_file = "%s%s.fasta" % (fam_dir, out_prefix if out_prefix else fam_id)
            else:
                out_file = "%s/%s.fasta" % (out_dir, out_prefix if out_prefix else fam_id)

            SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict,
                                                                cluster_dict[fam_id],
                                                                verbose=True),
                        out_file, format=seq_format)

    os.remove("tmp.idx")
    print("%i of %i clusters were skipped due to absent elements" % (number_of_skipped_clusters,
                                                                     len(cluster_dict)))
    return number_of_skipped_clusters
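# Minimal usage sketch (hypothetical: the enclosing routines class and all file names below
# are assumptions for illustration, not taken from this code):
#
#   routines = SequenceClusterRoutines()
#   routines.extract_sequences_from_selected_clusters("families.ids", "clusters.fam", "proteins.fasta",
#                                                     output_dir="selected_families/",
#                                                     create_dir_for_each_cluster=True)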
def split_proteins_per_species(dir_with_proteins, output_dir, input_format="fasta",
                               output_format="fasta"):
    input_files = FileRoutines.make_list_of_path_to_files([dir_with_proteins]
                                                          if isinstance(dir_with_proteins, str)
                                                          else dir_with_proteins)
    out_dir = FileRoutines.check_path(output_dir)
    FileRoutines.safe_mkdir(out_dir)

    protein_dict = SeqIO.index_db("temp.idx", input_files, format=input_format)

    # group record ids by the taxon prefix (everything before the first ".")
    syn_dict = SynDict()
    for protein_id in protein_dict:
        taxa_id = protein_id.split(".")[0]
        if taxa_id not in syn_dict:
            syn_dict[taxa_id] = []
        syn_dict[taxa_id].append(protein_id)

    def renamed_records_generator(record_dict, taxa_id):
        for record_id in syn_dict[taxa_id]:
            record = deepcopy(record_dict[record_id])
            # drop the taxon prefix from the record id
            record.id = ".".join(record_id.split(".")[1:])
            yield record

    for taxa_id in syn_dict:
        out_file = "%s%s.pep" % (out_dir, taxa_id)
        SeqIO.write(renamed_records_generator(protein_dict, taxa_id), out_file,
                    format=output_format)
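# Minimal usage sketch (hypothetical: the directory names are assumptions; the function expects
# record ids of the form "<taxon>.<protein_id>", as implied by the split on "." above):
#
#   split_proteins_per_species("all_species_proteomes/", "per_species_proteomes/",
#                              input_format="fasta", output_format="fasta")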
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from Routines import MultipleAlignmentRoutines, FileRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    type=lambda s: FileRoutines.make_list_of_path_to_files(s.split(",")),
                    help="Comma-separated list of files/directories with alignments")
parser.add_argument("-o", "--output", action="store", dest="output", required=True,
                    help="File to write merged alignment")
parser.add_argument("-c", "--coordinates_file", action="store", dest="coords_file", required=True,
                    help="File to write coordinates of the alignments within the merged alignment")
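# Hypothetical invocation (the script name is an assumption; only the flags shown above are real):
#
#   merge_alignments.py -i alignments_dir/,extra_alignment.fasta -o merged.fasta -c merged.coords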
"--output", action="store", dest="output", required=True, help="File to write clusters with single-copy clusters") parser.add_argument( "-p", "--label position", action="store", dest="label_position", default="first", help="Position of label. Allowed - first, last. Default - first") parser.add_argument("-s", "--separator", action="store", dest="separator", default="@", help="Separator to use. default - '@'") args = parser.parse_args() list_of_cluster_files = FileRoutines.make_list_of_path_to_files(args.input) single_copy_clusters = SequenceClusterRoutines.extract_single_copy_clusters_from_files( list_of_cluster_files, args.output, label_elements=args.label, separator=args.separator, label_position=args.label_position) print "Was found %i single-copy clusters" % len(single_copy_clusters)
def parallel_positive_selection_test(self, in_dir, tree_file, out_dir, results_file,
                                     seq_type="codons", codon_frequency="F3X4", noisy=3,
                                     verbose="concise", runmode=0, clock=0,
                                     aminoacid_distance=None, genetic_code=0,
                                     fix_kappa=False, kappa=5, getSE=0, RateAncestor=0,
                                     small_difference=0.000001, clean_data=True, method=0):
    """
    Implements the positive selection test (branch-site model) for branches labeled in the
    tree file, using a Model_A vs Model_A_null (omega fixed to 1) comparison.
    """
    FileRoutines.safe_mkdir(out_dir)
    alignment_files_list = FileRoutines.make_list_of_path_to_files(in_dir)
    tree_file_abs_path = os.path.abspath(tree_file)
    options_list = []
    dir_list = []
    basename_dir_list = []
    model_list = ["Model_A", "Model_A_null"]
    fix_omega_dict = {"Model_A": False, "Model_A_null": True}

    # generate a codeml control file for each alignment under both models
    for filename in alignment_files_list:
        directory, basename, extension = FileRoutines.split_filename(filename)
        filename_out_dir = os.path.abspath("%s/%s/" % (out_dir, basename))
        basename_dir_list.append(basename)
        FileRoutines.safe_mkdir(filename_out_dir)

        for model in model_list:
            model_dir = "%s/%s/" % (filename_out_dir, model)
            FileRoutines.safe_mkdir(model_dir)
            out_file = "%s/%s/%s.out" % (filename_out_dir, model, basename)
            ctl_file = "%s/%s/%s.ctl" % (filename_out_dir, model, basename)

            options_list.append("%s.ctl" % basename)
            dir_list.append(model_dir)

            self.generate_ctl_file(os.path.abspath(filename), tree_file_abs_path, out_file,
                                   ctl_file, seq_type=seq_type, codon_frequency=codon_frequency,
                                   noisy=noisy, verbose=verbose, runmode=runmode, clock=clock,
                                   aminoacid_distance=aminoacid_distance, model=2, nssites=2,
                                   genetic_code=genetic_code, fix_kappa=fix_kappa, kappa=kappa,
                                   fix_omega=fix_omega_dict[model], omega=1, getSE=getSE,
                                   RateAncestor=RateAncestor, Mgene=0,
                                   small_difference=small_difference, clean_data=clean_data,
                                   method=method)
    self.parallel_execute(options_list, dir_list=dir_list)

    # collect LnL values from the codeml reports
    results_dict = OrderedDict()
    double_delta_dict = OrderedDict()
    raw_pvalues_dict = OrderedDict()
    raw_pvalues_list = []

    for basename in basename_dir_list:
        results_dict[basename] = OrderedDict()
        for model in model_list:
            output_file = "%s/%s/%s/%s.out" % (out_dir, basename, model, basename)
            codeml_report = CodeMLReport(output_file)
            results_dict[basename][model] = codeml_report.LnL

    # likelihood-ratio test per gene, then FDR correction over all raw p-values
    skipped_genes_set = set()
    for basename in basename_dir_list:
        for model in model_list:
            if results_dict[basename][model] is None:
                print("LnL was not calculated for %s" % basename)
                skipped_genes_set.add(basename)
                break
        else:
            doubled_delta = 2 * (results_dict[basename]["Model_A"] - results_dict[basename]["Model_A_null"])
            p_value = chisqprob(doubled_delta, 1)  # degrees of freedom = 1

            double_delta_dict[basename] = doubled_delta
            raw_pvalues_dict[basename] = p_value
            raw_pvalues_list.append(p_value)

    adjusted_pvalues_list = fdrcorrection0(raw_pvalues_list)[1]

    i = 0
    with open(results_file, "w") as out_fd:
        out_fd.write("id\tmodel_a_null,LnL\tmodel_a,LnL\t2*delta\traw p-value\tadjusted p-value\n")
        for basename in basename_dir_list:
            for model in model_list:
                if results_dict[basename][model] is None:
                    print("LnL was not calculated for %s" % basename)
                    break
            else:
                out_fd.write("%s\t%f\t%f\t%f\t%f\t%f\n" % (basename,
                                                           results_dict[basename]["Model_A_null"],
                                                           results_dict[basename]["Model_A"],
                                                           double_delta_dict[basename],
                                                           raw_pvalues_dict[basename],
                                                           adjusted_pvalues_list[i]))
                i += 1
def parallel_codeml(self, in_dir, tree_file, out_dir, seq_type="codons",
                    codon_frequency="F3X4", noisy=0, verbose="concise", runmode=0, clock=0,
                    aminoacid_distance=None, model=1, nssites=0, genetic_code=0,
                    fix_kappa=False, kappa=5, fix_omega=False, omega=0.2, getSE=0,
                    RateAncestor=0, small_difference=0.000001, clean_data=True, method=0,
                    Mgene=None):
    FileRoutines.safe_mkdir(out_dir)
    alignment_files_list = FileRoutines.make_list_of_path_to_files(in_dir)
    tree_file_abs_path = os.path.abspath(tree_file)
    options_list = []
    dir_list = []

    for filename in alignment_files_list:
        directory, basename, extension = FileRoutines.split_filename(filename)
        filename_out_dir = os.path.abspath("%s/%s/" % (out_dir, basename))
        out_file = "%s/%s.out" % (filename_out_dir, basename)
        ctl_file = "%s/%s.ctl" % (filename_out_dir, basename)

        options_list.append(ctl_file)
        dir_list.append(filename_out_dir)
        FileRoutines.safe_mkdir(filename_out_dir)

        self.generate_ctl_file(os.path.abspath(filename), tree_file_abs_path, out_file, ctl_file,
                               seq_type=seq_type, codon_frequency=codon_frequency, noisy=noisy,
                               verbose=verbose, runmode=runmode, clock=clock,
                               aminoacid_distance=aminoacid_distance, model=model,
                               nssites=nssites, genetic_code=genetic_code, fix_kappa=fix_kappa,
                               kappa=kappa, fix_omega=fix_omega, omega=omega, getSE=getSE,
                               RateAncestor=RateAncestor, Mgene=Mgene,
                               small_difference=small_difference, clean_data=clean_data,
                               method=method)
    self.parallel_execute(options_list, dir_list=dir_list)
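# Minimal usage sketch (hypothetical: the wrapper object, thread count and paths are assumptions;
# the method only requires an object exposing generate_ctl_file() and parallel_execute()):
#
#   codeml = CodeML()                 # assumed codeml wrapper instance
#   codeml.threads = 8
#   codeml.parallel_codeml("alignments/", "species_tree.nwk", "codeml_out/",
#                          model=2, nssites=2, omega=0.5)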
def mask(self, list_of_fasta_files, output_dir="./", soft_masking=True, engine="ncbi",
         slow_search=True, quick_search=False, rush_search=False,
         no_low_complexity=None, only_low_complexity=None,
         no_interspersed=None, only_interspersed=None, no_rna=None, only_alu=None,
         custom_library=None, species=None,
         html_output=False, ace_output=False, gff_output=False):

    if (slow_search and quick_search) or (rush_search and quick_search) or (slow_search and rush_search):
        raise ValueError("Only one of the search modes (-s, -q, -qq) may be set. Choose ONE!")

    if species and custom_library:
        # combine repeats extracted for the species with the custom library
        tmp_repeat_file = "%s.repeats.tmp.fa" % species
        tmp_repeats_all_file = "all.repeats.tmp.fasta"
        self.extract_repeats_from_database(tmp_repeat_file, species=species)
        cmd = "cat %s %s > %s" % (tmp_repeat_file, custom_library, tmp_repeats_all_file)
        self.execute(cmd=cmd)

    options = " -pa %i" % self.threads
    options += " -e %s" % engine
    options += " -s" if slow_search else ""
    options += " -q" if quick_search else ""
    options += " -qq" if rush_search else ""
    options += " -nolow" if no_low_complexity else ""
    options += " -low" if only_low_complexity else ""
    options += " -noint" if no_interspersed else ""
    options += " -int" if only_interspersed else ""
    options += " -norna" if no_rna else ""
    options += " -alu" if only_alu else ""
    if species and custom_library:
        options += " -lib %s" % tmp_repeats_all_file
    elif custom_library:
        options += " -lib %s" % custom_library
    elif species:
        options += " -species %s" % species
    options += " -dir %s" % output_dir
    options += " -html" if html_output else ""
    options += " -ace" if ace_output else ""
    options += " -gff" if gff_output else ""
    options += " -xsmall" if soft_masking else ""
    options += " " + (list_of_fasta_files if isinstance(list_of_fasta_files, str)
                      else " ".join(FileRoutines.make_list_of_path_to_files(list_of_fasta_files)))

    self.execute(options=options)
"""
def mask(self, list_of_fasta_files, output_dir="./", soft_masking=True, engine="ncbi",
         search_speed="normal",
         no_low_complexity=None, only_low_complexity=None,
         no_interspersed=None, only_interspersed=None, no_rna=None, only_alu=None,
         custom_library=None, species=None,
         html_output=False, ace_output=False, gff_output=False):

    if species and custom_library:
        # combine repeats extracted for the species with the custom library
        tmp_repeat_file = "%s/%s.repeats.tmp.fa" % (output_dir, species)
        tmp_repeats_all_file = "%s/all.repeats.tmp.fasta" % output_dir
        self.extract_repeats_from_database(tmp_repeat_file, species=species)
        cmd = "cat %s %s > %s" % (tmp_repeat_file, custom_library, tmp_repeats_all_file)
        self.execute(cmd=cmd)

    options = " -pa %i" % self.threads
    options += " -e %s" % engine
    if search_speed == "slow":
        options += " -s"
    elif search_speed == "quick":
        options += " -q"
    elif search_speed == "rush":
        options += " -qq"
    options += " -nolow" if no_low_complexity else ""
    options += " -low" if only_low_complexity else ""
    options += " -noint" if no_interspersed else ""
    options += " -int" if only_interspersed else ""
    options += " -norna" if no_rna else ""
    options += " -alu" if only_alu else ""
    if species and custom_library:
        options += " -lib %s" % tmp_repeats_all_file
    elif custom_library:
        options += " -lib %s" % custom_library
    elif species:
        options += " -species %s" % species
    options += " -dir %s" % output_dir
    options += " -html" if html_output else ""
    options += " -ace" if ace_output else ""
    options += " -gff" if gff_output else ""
    options += " -xsmall" if soft_masking else ""
    options += " " + (list_of_fasta_files if isinstance(list_of_fasta_files, str)
                      else " ".join(FileRoutines.make_list_of_path_to_files(list_of_fasta_files)))

    self.execute(options=options)
"""
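# Minimal usage sketch for the string-based search_speed variant of mask() above (hypothetical:
# the wrapper object, thread count, genome file and species value are assumptions):
#
#   masker = RepeatMasker()           # assumed RepeatMasker wrapper instance
#   masker.threads = 16
#   masker.mask(["genome.fasta"], output_dir="repeatmasking/",
#               search_speed="slow", species="mammalia", gff_output=True)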
#!/usr/bin/env python
import os

from Bio import SeqIO

from Routines import FileRoutines

workdir = "/home/mahajrod/Genetics/Projects/nxf/nxf_arthropoda/"
data_dir = "/home/mahajrod/Genetics/Projects/nxf/nxf_arthropoda/data/"

os.chdir(workdir)

data_files = FileRoutines.make_list_of_path_to_files([data_dir])

record_dict = SeqIO.index_db("tmp.idx", data_files, format="genbank")

print("#organism\ttaxonomy\tregion_id\ttranscript_id\tproduct\texon_len")
for record_id in record_dict:
    for feature in record_dict[record_id].features:
        if feature.type == "mRNA":
            mRNA_string = ""
            mRNA_string += "%s" % record_dict[record_id].annotations["organism"]
            mRNA_string += "\t%s" % (";".join(record_dict[record_id].annotations["taxonomy"]))
            mRNA_string += "\t%s" % record_id
            mRNA_string += "\t%s" % (feature.qualifiers["transcript_id"][0]
                                     if "transcript_id" in feature.qualifiers else ".")
            mRNA_string += "\t%s" % (feature.qualifiers["product"][0]
                                     if "product" in feature.qualifiers else ".")
            location_lenths = []