def extract_proteins_from_selected_families(families_id_file, fam_file, pep_file,
                                            output_dir="./", pep_format="fasta",
                                            out_prefix=None, create_dir_for_each_family=False):
    from Routines import SequenceRoutines, FileRoutines

    fam_id_list = IdList()
    fam_dict = SynDict()
    #print(pep_file)
    FileRoutines.safe_mkdir(output_dir)
    out_dir = FileRoutines.check_path(output_dir)
    create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
    if families_id_file:
        fam_id_list.read(families_id_file)
    fam_dict.read(fam_file, split_values=True, values_separator=",")
    protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)

    for fam_id in fam_id_list if families_id_file else fam_dict:
        if fam_id in fam_dict:
            if create_directory_for_each_family:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                FileRoutines.safe_mkdir(fam_dir)
                out_file = "%s%s.pep" % (fam_dir, out_prefix if out_prefix else fam_id)
            else:
                out_file = "%s/%s.pep" % (out_dir, out_prefix if out_prefix else fam_id)

            SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict,
                                                                fam_dict[fam_id],
                                                                verbose=True),
                        out_file, format=pep_format)
        else:
            print("%s was not found" % fam_id)

    os.remove("tmp.idx")
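# A minimal usage sketch for the function above, assuming a .fam file that maps a family id
# to a comma-separated list of protein ids and a combined peptide FASTA. All file names here
# are illustrative, not part of the repository.
#
#   extract_proteins_from_selected_families("families_to_extract.ids",
#                                           "clusters.fam",
#                                           "all_proteins.pep",
#                                           output_dir="selected_families/",
#                                           pep_format="fasta",
#                                           create_dir_for_each_family=True)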
def split_proteins_per_species(dir_with_proteins, output_dir, input_format="fasta", output_format="fasta"):
    input_files = FileRoutines.make_list_of_path_to_files([dir_with_proteins] if isinstance(dir_with_proteins, str)
                                                          else dir_with_proteins)

    out_dir = FileRoutines.check_path(output_dir)
    FileRoutines.safe_mkdir(out_dir)

    protein_dict = SeqIO.index_db("temp.idx", input_files, format=input_format)

    syn_dict = SynDict()

    for protein_id in protein_dict:
        taxa_id = protein_id.split(".")[0]
        # pep_id = ".".join(tmp_list[1:])
        if taxa_id not in syn_dict:
            syn_dict[taxa_id] = []
        syn_dict[taxa_id].append(protein_id)

    def renamed_records_generator(record_dict, taxa_id):
        for record_id in syn_dict[taxa_id]:
            record = deepcopy(record_dict[record_id])
            #print(record)
            record.id = ".".join(record_id.split(".")[1:])
            yield record

    for taxa_id in syn_dict:
        out_file = "%s%s.pep" % (out_dir, taxa_id)
        SeqIO.write(renamed_records_generator(protein_dict, taxa_id), out_file, format=output_format)
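# Hedged usage sketch: the function above expects record ids of the form
# "<taxon_id>.<protein_id>", writes one "<taxon_id>.pep" file per taxon and strips the
# taxon prefix from each record id. Paths below are illustrative only.
#
#   split_proteins_per_species("combined_proteomes/", "per_species/",
#                              input_format="fasta", output_format="fasta")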
def parallel_search_tandem_repeat(self, query_file, output_prefix, matching_weight=2, mismatching_penalty=7,
                                  indel_penalty=7, match_probability=80, indel_probability=10,
                                  min_alignment_score=50, max_period=500, report_flanking_sequences=False,
                                  splited_fasta_dir="splited_fasta_dir", splited_result_dir="splited_output",
                                  converted_output_dir="converted_output", max_len_per_file=100000,
                                  store_intermediate_files=False):
    work_dir = os.getcwd()
    splited_filename = FileRoutines.split_filename(query_file)
    self.split_fasta_by_seq_len(query_file, splited_fasta_dir, max_len_per_file=max_len_per_file,
                                output_prefix=splited_filename[1])

    common_options = self.parse_common_options(matching_weight=matching_weight,
                                               mismatching_penalty=mismatching_penalty,
                                               indel_penalty=indel_penalty,
                                               match_probability=match_probability,
                                               indel_probability=indel_probability,
                                               min_alignment_score=min_alignment_score,
                                               max_period=max_period,
                                               report_flanking_sequences=report_flanking_sequences,
                                               make_dat_file=True)
    common_options += " -h"  # suppress html output
    options_list = []
    splited_files = os.listdir(splited_fasta_dir)

    FileRoutines.safe_mkdir(splited_result_dir)
    FileRoutines.safe_mkdir(converted_output_dir)
    os.chdir(splited_result_dir)

    input_dir = splited_fasta_dir if (splited_fasta_dir[0] == "/") or (splited_fasta_dir[0] == "~") \
        else "../%s" % splited_fasta_dir

    for filename in splited_files:
        file_options = "%s/%s" % (input_dir, filename)
        file_options += common_options
        options_list.append(file_options)

    self.parallel_execute(options_list)

    os.chdir(work_dir)

    trf_output_file_list = []
    for filename in splited_files:
        trf_output_file = "%s/%s.%i.%i.%i.%i.%i.%i.%i.dat" % (splited_result_dir, filename, matching_weight,
                                                              mismatching_penalty, indel_penalty,
                                                              match_probability, indel_probability,
                                                              min_alignment_score, max_period)
        trf_output_file_list.append(trf_output_file)

    trf_report = self.convert_trf_report(trf_output_file_list, output_prefix)
    """
    for suffix in (".rep", ".gff", ".simple.gff", ".short.tab", ".wide.tab", ".with_rep_seqs.gff", ".fasta"):
        file_str = ""
        merged_file = "%s%s" % (output_prefix, suffix)
        for filename in splited_files:
            file_str += " %s/%s%s" % (converted_output_dir, filename, suffix)
        CGAS.cat(file_str, merged_file)
    """
    if not store_intermediate_files:
        shutil.rmtree(splited_fasta_dir)
        shutil.rmtree(splited_result_dir)
        shutil.rmtree(converted_output_dir)
    return trf_report
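# Hedged usage sketch for the method above: it splits the input FASTA into chunks, runs
# Tandem Repeats Finder on the chunks in parallel and merges the per-chunk .dat reports.
# "TRF" below stands for whatever wrapper instance exposes this method in this repository;
# the instance name, the class-level "threads" attribute and the paths are assumptions.
#
#   TRF.threads = 10
#   trf_report = TRF.parallel_search_tandem_repeat("genome.fasta", "genome.trf",
#                                                  max_len_per_file=1000000,
#                                                  store_intermediate_files=False)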
action="store", dest="max_memory_per_thread", default="1G", help="Maximum memory per thread. Default - 1G") args = parser.parse_args() if args.prepare_bam and ((not args.prepared_bam_prefix) or (not args.temp_dir)): raise ValueError( "Options -e/--prepared_bam_prefix and -m/--temp_dir must be set if -p/--prepare_bam option is used" ) SamtoolsV1.threads = args.threads if args.prepare_bam or args.mix_ends: FileRoutines.safe_mkdir(FileRoutines.check_path(args.temp_dir)) prepared_pe_bam_file = "%s.bam" % args.prepared_bam_prefix prepared_unpaired_bam_file = ( "%s.unpaired.bam" % args.prepared_bam_prefix) if args.mix_ends else None """ SamtoolsV1.prepare_bam_for_read_extraction(args.input, args.prepared_bam, temp_file_prefix=args.temp_dir, max_memory_per_thread=args.max_memory_per_thread) """ SamtoolsV1.prepare_bam_for_read_extraction( args.input, prepared_pe_bam_file, temp_file_prefix=args.temp_dir, max_memory_per_thread=args.max_memory_per_thread, bam_file_to_write_unpaired_reads=prepared_unpaired_bam_file) if args.paired:
parser.add_argument("-i", "--input_file_list", action="store", dest="input", required=True, type=FileRoutines.make_list_of_path_to_files_from_string, help="Comma-separated list of input files/directories with sequences") parser.add_argument("-o", "--output_directory", action="store", dest="output", type=FileRoutines.check_path, help="Directory to output groups_of sequences") parser.add_argument("-f", "--format", action="store", dest="format", default="fasta", help="Format of input and output files. Allowed formats genbank, fasta(default)") parser.add_argument("-e", "--extension", action="store", dest="extension", help="Extension of output files. Default: equal to -f") parser.add_argument("-d", "--id_file", action="store", dest="id_file", help="File with groups of sequences to extract(.fam file).") args = parser.parse_args() FileRoutines.safe_mkdir(args.output) args.extension = args.extension if args.extension else args.format tmp_index_file = "temp.idx" #id_list = read_ids(args.id_file) id_list = IdSet(filename=args.id_file) sequence_groups_id = SynDict() sequence_groups_id.read(args.id_file, split_values=True) #print("Parsing %s..." % args.input_file) sequence_dict = SeqIO.index_db(tmp_index_file, args.input, format=args.format) for group in sequence_groups_id: SeqIO.write(SequenceRoutines.record_by_id_generator(sequence_dict, sequence_groups_id[group], verbose=True), "%s%s.%s" % (args.output, group, args.extension), format=args.format)
def align_samples(self, samples_dir, output_dir, genome_dir, genome_fasta=None, samples=None,
                  annotation_gtf=None, sjdboverhang=None, genomeSAindexNbases=None, genomeChrBinNbits=None,
                  genome_size=None, feature_from_gtf_to_use_as_exon=None, exon_tag_to_use_as_transcript_id=None,
                  exon_tag_to_use_as_gene_id=None, length_of_sequences_flanking_junction=None,
                  junction_tab_file_list=None, three_prime_trim=None, five_prime_trim=None,
                  adapter_seq_for_three_prime_clip=None, max_mismatch_percent_for_adapter_trimming=None,
                  three_prime_trim_after_adapter_clip=None, output_type="BAM", sort_bam=True,
                  max_memory_for_bam_sorting=8000000000, include_unmapped_reads_in_bam=True,
                  output_unmapped_reads=True, two_pass_mode=True, max_intron_length=None):

    #STAR.threads = threads
    #STAR.path = star_dir

    if genome_fasta:
        STAR.index(genome_dir, genome_fasta, annotation_gtf=annotation_gtf,
                   junction_tab_file=junction_tab_file_list, sjdboverhang=sjdboverhang,
                   genomeSAindexNbases=genomeSAindexNbases, genomeChrBinNbits=genomeChrBinNbits,
                   genome_size=genome_size)

    sample_list = samples if samples else self.get_sample_list(samples_dir)

    FileRoutines.safe_mkdir(output_dir)

    for sample in sample_list:
        print("Handling %s" % sample)
        sample_dir = "%s/%s/" % (samples_dir, sample)
        alignment_sample_dir = "%s/%s/" % (output_dir, sample)
        FileRoutines.safe_mkdir(alignment_sample_dir)
        filetypes, forward_files, reverse_files = FileRoutines.make_lists_forward_and_reverse_files(sample_dir)

        print("\tAligning reads...")

        STAR.align(genome_dir, forward_files, reverse_read_list=reverse_files,
                   annotation_gtf=annotation_gtf if not genome_fasta else None,
                   feature_from_gtf_to_use_as_exon=feature_from_gtf_to_use_as_exon,
                   exon_tag_to_use_as_transcript_id=exon_tag_to_use_as_transcript_id,
                   exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
                   length_of_sequences_flanking_junction=length_of_sequences_flanking_junction,
                   junction_tab_file_list=junction_tab_file_list,
                   three_prime_trim=three_prime_trim, five_prime_trim=five_prime_trim,
                   adapter_seq_for_three_prime_clip=adapter_seq_for_three_prime_clip,
                   max_mismatch_percent_for_adapter_trimming=max_mismatch_percent_for_adapter_trimming,
                   three_prime_trim_after_adapter_clip=three_prime_trim_after_adapter_clip,
                   output_type=output_type, sort_bam=sort_bam,
                   max_memory_for_bam_sorting=max_memory_for_bam_sorting,
                   include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
                   output_unmapped_reads=output_unmapped_reads,
                   output_dir=alignment_sample_dir,
                   two_pass_mode=two_pass_mode, max_intron_length=max_intron_length)

        print("\tIndexing bam file...")
        resulting_bam_file = "%s/Aligned.sortedByCoord.out.bam" % alignment_sample_dir
        SamtoolsV1.index(resulting_bam_file)
STAR.path = args.star_dir

if args.genome_fasta:
    STAR.index(args.genome_dir, args.genome_fasta, annotation_gtf=args.annotation_gtf,
               junction_tab_file=args.junction_tab_file, sjdboverhang=None, genomeSAindexNbases=None,
               genomeChrBinNbits=None, genome_size=args.genome_size)

sample_list = args.samples if args.samples else Pipeline.get_sample_list(args.samples_dir)

FileRoutines.safe_mkdir(args.output_dir)

for sample in sample_list:
    print("Handling %s" % sample)
    sample_dir = "%s/%s/" % (args.samples_dir, sample)
    alignment_sample_dir = "%s/%s/" % (args.output_dir, sample)
    FileRoutines.safe_mkdir(alignment_sample_dir)
    filetypes, forward_files, reverse_files = FileRoutines.make_lists_forward_and_reverse_files(sample_dir)

    print("\tAligning reads...")

    STAR.align(args.genome_dir, forward_files, reverse_read_list=reverse_files,
def parallel_align(self, list_of_files, output_directory, output_suffix=None, tree_file=None,
                   output_format=None, show_xml=None, show_tree=None, show_ancestral_sequences=None,
                   show_evolutionary_events=None, showall=None, compute_posterior_support=None,
                   njtree=None, skip_insertions=False, codon_alignment=None, translated_alignment=None,
                   cmd_log_file=None, cpus_per_task=1, handling_mode="local", job_name=None,
                   log_prefix=None, error_log_prefix=None, max_jobs=None, max_running_time=None,
                   max_memory_per_node=None, max_memmory_per_cpu=None, modules_list=None,
                   environment_variables_dict=None):

    common_options = self.parse_common_options(tree_file=tree_file, output_format=output_format,
                                               show_xml=show_xml, show_tree=show_tree,
                                               show_ancestral_sequences=show_ancestral_sequences,
                                               show_evolutionary_events=show_evolutionary_events,
                                               showall=showall,
                                               compute_posterior_support=compute_posterior_support,
                                               njtree=njtree, skip_insertions=skip_insertions,
                                               codon_alignment=codon_alignment,
                                               translated_alignment=translated_alignment)

    FileRoutines.safe_mkdir(output_directory)
    options_list = []
    for filename in list_of_files:
        basename = FileRoutines.split_filename(filename)[1]
        op = common_options
        op += " -d=%s" % filename
        op += " -o=%s/%s.fasta" % (output_directory,
                                   ("%s_%s" % (basename, output_suffix)) if output_suffix else basename)
        options_list.append(op)

    if handling_mode == "local":
        self.parallel_execute(options_list)
    elif handling_mode == "slurm":
        cmd_list = ["%s%s %s" % ((self.path + "/") if self.path else "", self.cmd, options)
                    for options in options_list]
        self.slurm_run_multiple_jobs_in_wrap_mode(cmd_list, cmd_log_file, max_jobs=max_jobs,
                                                  job_name=job_name, log_prefix=log_prefix,
                                                  error_log_prefix=error_log_prefix,
                                                  cpus_per_node=None, max_running_jobs=None,
                                                  max_running_time=max_running_time,
                                                  cpus_per_task=cpus_per_task,
                                                  max_memory_per_node=max_memory_per_node,
                                                  max_memmory_per_cpu=max_memmory_per_cpu,
                                                  modules_list=modules_list,
                                                  environment_variables_dict=environment_variables_dict)
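# Hedged usage sketch: the method above builds one PRANK-style command line per input file
# ("-d=<input> -o=<output>") and runs them either locally or through SLURM. The wrapper
# instance name, its "threads" attribute and the paths are assumptions for illustration.
#
#   PRANK.threads = 16
#   PRANK.parallel_align(FileRoutines.make_list_of_path_to_files("families_cds/"),
#                        "alignments/", codon_alignment=True, handling_mode="local")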
AUGUSTUS.extract_CDS_annotations_from_output(final_gff, final_CDS_gff)

for stat_file in (output_evidence_stats, output_supported_stats,
                  output_swissprot_pfam_or_hints_supported_transcripts_longest_pep_evidence,
                  output_swissprot_pfam_and_hints_supported_transcripts_longest_pep_evidence,
                  output_swissprot_pfam_or_hints_supported_transcripts_evidence,
                  output_swissprot_pfam_and_hints_supported_transcripts_evidence):

    MatplotlibRoutines.percent_histogram_from_file(stat_file, stat_file, data_type=None, column_list=(2,),
                                                   comments="#", n_bins=20,
                                                   title="Transcript support by hints",
                                                   extensions=("png", "svg"),
                                                   legend_location="upper center",
                                                   stats_as_legend=True)

if args.pfam_db and args.swissprot_db:
    db_or_hints_dir = "supported_by_db_or_hints/"
    db_and_hints_dir = "supported_by_db_and_hints/"
    for directory in (db_and_hints_dir, db_or_hints_dir):
        FileRoutines.safe_mkdir(directory)

    os.system("mv %s.supported.transcripts.swissprot_or_pfam_or_hints* %s" % (args.output, db_or_hints_dir))
    os.system("mv %s.supported.transcripts.swissprot_or_pfam_and_hints* %s" % (args.output, db_and_hints_dir))
def parallel_positive_selection_test(self, in_dir, tree_file, out_dir, results_file, seq_type="codons",
                                     codon_frequency="F3X4", noisy=3, verbose="concise", runmode=0,
                                     clock=0, aminoacid_distance=None, genetic_code=0, fix_kappa=False,
                                     kappa=5, getSE=0, RateAncestor=0, small_difference=0.000001,
                                     clean_data=True, method=0):
    """
    Implements a positive selection test (branch-site model) for the branch labeled in the tree file,
    comparing model A against model A null (omega fixed to 1).
    """
    FileRoutines.safe_mkdir(out_dir)
    alignment_files_list = FileRoutines.make_list_of_path_to_files(in_dir)
    tree_file_abs_path = os.path.abspath(tree_file)
    options_list = []
    dir_list = []
    basename_dir_list = []
    model_list = ["Model_A", "Model_A_null"]
    fix_omega_dict = {"Model_A": False, "Model_A_null": True}

    for filename in alignment_files_list:
        directory, basename, extension = FileRoutines.split_filename(filename)
        filename_out_dir = os.path.abspath("%s/%s/" % (out_dir, basename))
        basename_dir_list.append(basename)
        FileRoutines.safe_mkdir(filename_out_dir)

        for model in model_list:
            model_dir = "%s/%s/" % (filename_out_dir, model)
            FileRoutines.safe_mkdir(model_dir)
            out_file = "%s/%s/%s.out" % (filename_out_dir, model, basename)
            ctl_file = "%s/%s/%s.ctl" % (filename_out_dir, model, basename)
            options_list.append("%s.ctl" % basename)
            dir_list.append(model_dir)
            self.generate_ctl_file(os.path.abspath(filename), tree_file_abs_path, out_file, ctl_file,
                                   seq_type=seq_type, codon_frequency=codon_frequency, noisy=noisy,
                                   verbose=verbose, runmode=runmode, clock=clock,
                                   aminoacid_distance=aminoacid_distance, model=2, nssites=2,
                                   genetic_code=genetic_code, fix_kappa=fix_kappa, kappa=kappa,
                                   fix_omega=fix_omega_dict[model], omega=1, getSE=getSE,
                                   RateAncestor=RateAncestor, Mgene=0,
                                   small_difference=small_difference, clean_data=clean_data, method=method)

    self.parallel_execute(options_list, dir_list=dir_list)

    results_dict = OrderedDict()
    double_delta_dict = OrderedDict()
    raw_pvalues_dict = OrderedDict()
    raw_pvalues_list = []

    for basename in basename_dir_list:
        results_dict[basename] = OrderedDict()
        for model in model_list:
            output_file = "%s/%s/%s/%s.out" % (out_dir, basename, model, basename)
            codeml_report = CodeMLReport(output_file)
            results_dict[basename][model] = codeml_report.LnL

    skipped_genes_set = set()
    for basename in basename_dir_list:
        for model in model_list:
            if results_dict[basename][model] is None:
                print("LnL was not calculated for %s" % basename)
                skipped_genes_set.add(basename)
                break
        else:
            doubled_delta = 2 * (results_dict[basename]["Model_A"] - results_dict[basename]["Model_A_null"])
            p_value = chisqprob(doubled_delta, 1)  # degrees of freedom = 1
            double_delta_dict[basename] = doubled_delta
            raw_pvalues_dict[basename] = p_value
            raw_pvalues_list.append(p_value)

    adjusted_pvalues_list = fdrcorrection0(raw_pvalues_list)[1]
    #print adjusted_pvalues_list
    i = 0
    with open(results_file, "w") as out_fd:
        out_fd.write("id\tmodel_a_null,LnL\tmodel_a,LnL\t2*delta\traw p-value\tadjusted p-value\n")
        for basename in basename_dir_list:
            for model in model_list:
                if results_dict[basename][model] is None:
                    print("LnL was not calculated for %s" % basename)
                    break
            else:
                #doubled_delta = 2 * (results_dict[basename]["Model_A"] - results_dict[basename]["Model_A_null"])
                #p_value = chisqprob(doubled_delta, 1)  # degrees of freedom = 1
                #print basename, results_dict[basename]["Model_A_null"], results_dict[basename]["Model_A"], double_delta_dict[basename], raw_pvalues_dict[basename], adjusted_pvalues_list[i]
                out_fd.write("%s\t%f\t%f\t%f\t%f\t%f\n" % (basename,
                                                           results_dict[basename]["Model_A_null"],
                                                           results_dict[basename]["Model_A"],
                                                           double_delta_dict[basename],
                                                           raw_pvalues_dict[basename],
                                                           adjusted_pvalues_list[i]))
                i += 1
def parallel_codeml(self, in_dir, tree_file, out_dir, seq_type="codons", codon_frequency="F3X4",
                    noisy=0, verbose="concise", runmode=0, clock=0, aminoacid_distance=None,
                    model=1, nssites=0, genetic_code=0, fix_kappa=False, kappa=5, fix_omega=False,
                    omega=0.2, getSE=0, RateAncestor=0, small_difference=0.000001, clean_data=True,
                    method=0, Mgene=None):
    FileRoutines.safe_mkdir(out_dir)
    alignment_files_list = FileRoutines.make_list_of_path_to_files(in_dir)
    tree_file_abs_path = os.path.abspath(tree_file)
    options_list = []
    dir_list = []

    for filename in alignment_files_list:
        directory, basename, extension = FileRoutines.split_filename(filename)
        filename_out_dir = os.path.abspath("%s/%s/" % (out_dir, basename))
        out_file = "%s/%s.out" % (filename_out_dir, basename)
        ctl_file = "%s/%s.ctl" % (filename_out_dir, basename)
        options_list.append(ctl_file)
        dir_list.append(filename_out_dir)
        FileRoutines.safe_mkdir(filename_out_dir)
        self.generate_ctl_file(os.path.abspath(filename), tree_file_abs_path, out_file, ctl_file,
                               seq_type=seq_type, codon_frequency=codon_frequency, noisy=noisy,
                               verbose=verbose, runmode=runmode, clock=clock,
                               aminoacid_distance=aminoacid_distance, model=model, nssites=nssites,
                               genetic_code=genetic_code, fix_kappa=fix_kappa, kappa=kappa,
                               fix_omega=fix_omega, omega=omega, getSE=getSE,
                               RateAncestor=RateAncestor, Mgene=Mgene,
                               small_difference=small_difference, clean_data=clean_data, method=method)

    self.parallel_execute(options_list, dir_list=dir_list)
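# Hedged usage sketch: the method above writes one codeml .ctl file per alignment found in
# in_dir and runs codeml on all of them in parallel. The wrapper instance name, its "threads"
# attribute and the paths are assumptions for illustration.
#
#   CodeML.threads = 8
#   CodeML.parallel_codeml("codon_alignments/", "species_tree.nwk", "codeml_out/",
#                          model=0, nssites=0, genetic_code=0)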
dest="output", type=FileRoutines.check_path, help="Output directory") #parser.add_argument("-p", "--convert_options", action="store", dest="convert_options", # help="Options for convert") parser.add_argument("-d", "--dont_make_negative", action="store_true", dest="dont_negative", help="Dont make negative") args = parser.parse_args() temp_dir = "temp/" FileRoutines.safe_mkdir(temp_dir) BioConvert.threads = args.threads Convert.threads = args.threads BioConvert.parallel_convert(args.input, temp_dir) if args.dont_negative: os.rename(temp_dir, args.output) else: converted_files = os.listdir(temp_dir) converted_files = list( map(lambda s: "%s%s" % (temp_dir, s), converted_files)) Convert.parallel_convert( converted_files, args.output,
                    type=FileRoutines.check_path,
                    help="Directory to write fam files named by species names")
parser.add_argument("-d", "--syn_file", action="store", dest="syn_file", required=True,
                    help="File with taxa ids and species names")
parser.add_argument("-k", "--key_index", action="store", dest="key_index", type=int, default=0,
                    help="Key column in file with synonyms (0-based)")
parser.add_argument("-v", "--value_index", action="store", dest="value_index", type=int, default=1,
                    help="Value column in file with synonyms (0-based)")
parser.add_argument("-c", "--comments_prefix", action="store", dest="comments_prefix", default="#",
                    help="Prefix of comments in synonyms file")
parser.add_argument("-m", "--columns_separator", action="store", dest="separator", default="\t",
                    help="Column separator in file with synonyms")
parser.add_argument("-e", "--header", action="store_true", dest="header", default=False,
                    help="Header is present in synonyms file. Default - False")

args = parser.parse_args()

syn_dict = SynDict()
syn_dict.read(args.syn_file, header=args.header, separator=args.separator, key_index=args.key_index,
              value_index=args.value_index, comments_prefix=args.comments_prefix)

FileRoutines.safe_mkdir(args.output_files_dir)

input_files = os.listdir(args.input_files_dir)

for filename in input_files:
    directory, taxon_id, extension = FileRoutines.split_filename(filename)
    if taxon_id not in syn_dict:
        print("Species name was not found for taxon %s" % taxon_id)
        continue
    shutil.copy("%s%s" % (args.input_files_dir, filename),
                "%s%s%s" % (args.output_files_dir, syn_dict[taxon_id], extension))
parser.add_argument("-n", "--check_file_contains_gene_counts", action="store_true", dest="check_with_counts", default=False, help="File to check contains gene counts") parser.add_argument("-o", "--out_dir", action="store", dest="out_dir", default="compare_dir", help="Output directory") args = parser.parse_args() FileRoutines.safe_mkdir(args.out_dir) ref_clusters_dict = read_cluster_file(args.ref_file, with_counts=args.ref_with_counts) check_clusters_dict = read_cluster_file(args.file_to_check, with_counts=args.check_with_counts) totally_in_ref = len(ref_clusters_dict) totally = len(check_clusters_dict) synonym_file = "synonym.t" contained_fully_in_file = "contained_fully_in.t" contained_in_file = "contained_in.t" include_file = "include.t" all_file = "all.t" synonym_dict = OrderedDict()