def index(self, genome_dir, genome_fasta, annotation_gtf=None, feature_from_gtf_to_use_as_exon=None,
          junction_tab_file=None, sjdboverhang=None, genomeSAindexNbases=None, genomeChrBinNbits=None,
          genome_size=None):

    FileRoutines.safe_mkdir(genome_dir)

    options = "--runMode genomeGenerate"
    options += " --genomeDir %s" % os.path.abspath(genome_dir)
    options += " --runThreadN %i" % self.threads
    options += " --genomeFastaFiles %s" % (os.path.abspath(genome_fasta) if isinstance(genome_fasta, str)
                                           else " ".join(map(os.path.abspath, genome_fasta)))
    options += " --sjdbGTFfile %s" % annotation_gtf if annotation_gtf else ""
    options += " --sjdbFileChrStartEnd %s" % junction_tab_file if junction_tab_file else ""
    # number of bases taken from both sides of splice junction. 100 by default
    options += " --sjdbOverhang %i" % sjdboverhang if sjdboverhang else ""

    if genome_size:
        # size of k-mers used for preindexing of suffix array
        options += " --genomeSAindexNbases %i" % min([14, (floor(log(genome_size, 2) / 2)) - 1])
    else:
        options += " --genomeSAindexNbases %i" % genomeSAindexNbases if genomeSAindexNbases else ""

    # padding size (log2) of reference sequences. 18 by default
    # recommended value min(18, log2(GenomeLength/NumberOfScaffolds))
    options += " --genomeChrBinNbits %i" % genomeChrBinNbits if genomeChrBinNbits else ""
    options += " --sjdbGTFfeatureExon %s" % feature_from_gtf_to_use_as_exon if feature_from_gtf_to_use_as_exon else ""

    self.execute(options)
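# Hypothetical usage sketch (not part of the original module): assumes a STAR wrapper
# instance named `star` exposing the index() method above; file names are illustrative.
# Passing genome_size lets the wrapper derive --genomeSAindexNbases as
# min(14, floor(log2(genome_size) / 2) - 1), which matters for small genomes.
#
#   star.index("star_index/", "genome.fasta",
#              annotation_gtf="annotation.gtf",  # optional splice-junction annotation
#              sjdboverhang=99,                  # read length - 1
#              genome_size=12000000)             # ~12 Mbp genome -> --genomeSAindexNbases 10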
def parallel_align(self, list_of_files, output_dir, msa_tool='prank', seq_type=None, bootstrap_number=100,
                   genetic_code=1, threads=None, msa_tool_options=None, seq_cutoff=None, col_cutoff=None,
                   mafft_bin=None, prank_bin=None, muscle_bin=None, pagan_bin=None, ruby_bin=None,
                   program=None, cmd_log_file=None, cpus_per_task=1, handling_mode="local", job_name=None,
                   log_prefix=None, error_log_prefix=None, max_jobs=None, max_running_time=None,
                   max_memory_per_node=None, max_memmory_per_cpu=None, modules_list=None,
                   environment_variables_dict=None):

    common_options = self.parse_common_options(output_dir=output_dir, msa_tool=msa_tool, seq_type=seq_type,
                                               bootstrap_number=bootstrap_number, genetic_code=genetic_code,
                                               threads=threads, msa_tool_options=msa_tool_options,
                                               seq_cutoff=seq_cutoff, col_cutoff=col_cutoff,
                                               mafft_bin=mafft_bin, prank_bin=prank_bin, muscle_bin=muscle_bin,
                                               pagan_bin=pagan_bin, ruby_bin=ruby_bin, program=program)
    FileRoutines.safe_mkdir(output_dir)

    options_list = []
    for filename in list_of_files:
        basename = FileRoutines.split_filename(filename)[1]
        op = common_options
        op += " --seqFile %s" % filename
        op += " --dataset %s" % basename
        options_list.append(op)

    if handling_mode == "local":
        self.parallel_execute(options_list)
    elif handling_mode == "slurm":
        cmd_list = ["%s%s %s" % ((self.path + "/") if self.path else "", self.cmd, options)
                    for options in options_list]
        self.slurm_run_multiple_jobs_in_wrap_mode(cmd_list, cmd_log_file, max_jobs=max_jobs, job_name=job_name,
                                                  log_prefix=log_prefix, error_log_prefix=error_log_prefix,
                                                  cpus_per_node=None, max_running_jobs=None,
                                                  max_running_time=max_running_time, cpus_per_task=cpus_per_task,
                                                  max_memory_per_node=max_memory_per_node,
                                                  max_memmory_per_cpu=max_memmory_per_cpu,
                                                  modules_list=modules_list,
                                                  environment_variables_dict=environment_variables_dict)
def parallel_align(self, list_of_files, output_directory, output_suffix=None, tree_file=None,
                   output_format=None, show_xml=None, show_tree=None, show_ancestral_sequences=None,
                   show_evolutionary_events=None, showall=None, compute_posterior_support=None, njtree=None,
                   skip_insertions=False, codon_alignment=None, translated_alignment=None, cmd_log_file=None,
                   cpus_per_task=1, handling_mode="local", job_name=None, log_prefix=None,
                   error_log_prefix=None, max_jobs=None, max_running_time=None, max_memory_per_node=None,
                   max_memmory_per_cpu=None, modules_list=None, environment_variables_dict=None):

    common_options = self.parse_common_options(tree_file=tree_file, output_format=output_format,
                                               show_xml=show_xml, show_tree=show_tree,
                                               show_ancestral_sequences=show_ancestral_sequences,
                                               show_evolutionary_events=show_evolutionary_events,
                                               showall=showall,
                                               compute_posterior_support=compute_posterior_support,
                                               njtree=njtree, skip_insertions=skip_insertions,
                                               codon_alignment=codon_alignment,
                                               translated_alignment=translated_alignment)
    FileRoutines.safe_mkdir(output_directory)

    options_list = []
    for filename in list_of_files:
        basename = FileRoutines.split_filename(filename)[1]
        op = common_options
        op += " -d=%s" % filename
        op += " -o=%s/%s.fasta" % (output_directory,
                                   ("%s_%s" % (basename, output_suffix)) if output_suffix else basename)
        options_list.append(op)

    if handling_mode == "local":
        self.parallel_execute(options_list)
    elif handling_mode == "slurm":
        cmd_list = ["%s%s %s" % ((self.path + "/") if self.path else "", self.cmd, options)
                    for options in options_list]
        self.slurm_run_multiple_jobs_in_wrap_mode(cmd_list, cmd_log_file, max_jobs=max_jobs, job_name=job_name,
                                                  log_prefix=log_prefix, error_log_prefix=error_log_prefix,
                                                  cpus_per_node=None, max_running_jobs=None,
                                                  max_running_time=max_running_time, cpus_per_task=cpus_per_task,
                                                  max_memory_per_node=max_memory_per_node,
                                                  max_memmory_per_cpu=max_memmory_per_cpu,
                                                  modules_list=modules_list,
                                                  environment_variables_dict=environment_variables_dict)
def read(self, input_file, filetype="fasta", verbose=False):
    list_of_files = FileRoutines.make_list_of_path_to_files(input_file)
    for filename in list_of_files:
        if verbose:
            print("Parsing %s ..." % filename)
        directory, basename, extension = FileRoutines.split_filename(filename)
        try:
            self.records[basename] = MultipleAlignmentStatRecord(basename,
                                                                 alignment=AlignIO.read(filename, filetype))
            self.record_id_list.append(basename)
        except:
            raise ValueError("ERROR: Issues while parsing or calculating stats for %s!!!" % filename)
    # collectiontype-dependent function
    pass
def RepeatModeler_search(query_file, db_name, output_file="run.out", num_of_threads=5, RepeatModeler_dir=""):
    print("\nRepeatModeler search...\n")
    repmod_dir = FileRoutines.check_path(RepeatModeler_dir)
    os.system(repmod_dir + "BuildDatabase -engine ncbi -name %s %s" % (db_name, query_file))
    os.system(repmod_dir + "RepeatModeler -engine ncbi -pa %i -database %s > %s" % (num_of_threads, db_name,
                                                                                    output_file))
def TRF_search(query_file, match=2, mismatch=7, delta=7, PM=80, PI=10, minscore=50, max_period=500,
               flanked=False, TRF_dir=""):
    print("\nTRF search...\n")
    # usage: trf File Match Mismatch Delta PM PI Minscore MaxPeriod [options]
    # Where: (all weights, penalties, and scores are positive)
    #   File      = sequences input file
    #   Match     = matching weight
    #   Mismatch  = mismatching penalty
    #   Delta     = indel penalty
    #   PM        = match probability (whole number)
    #   PI        = indel probability (whole number)
    #   Minscore  = minimum alignment score to report
    #   MaxPeriod = maximum period size to report
    #   [options] = one or more of the following:
    #       -m  masked sequence file
    #       -f  flanking sequence
    #       -d  data file
    #       -h  suppress HTML output
    # Recommended options: trf yoursequence.txt 2 7 7 80 10 50 500 -f -d -m
    flanking = ""
    if flanked:
        flanking = "-f"
    trf_path = FileRoutines.check_path(TRF_dir)
    os.system(trf_path + "trf %s %i %i %i %i %i %i %i %s -d -m" % (query_file, match, mismatch, delta, PM, PI,
                                                                   minscore, max_period, flanking))
def parallel_align(self, list_of_files, output_directory, output_suffix="alignment", gap_open_penalty=None, offset=None, maxiterate=None, quiet=True, mode="globalpair", number_of_processes=1, anysymbol=False): # TODO: add rest of options options = " --thread %i" % self.threads options += " --op %f" % gap_open_penalty if gap_open_penalty is not None else "" options += " --ep %f" % offset if offset is not None else "" options += " --maxiterate %i" % maxiterate if maxiterate is not None else "" options += " --quiet" if quiet else "" options += " --%s" % mode options += " --anysymbol" if anysymbol else "" options_list = [] for filename in list_of_files: basename = FileRoutines.split_filename(filename)[1] op = options op += " %s" % filename op += " > %s/%s.fasta" % (output_directory, ("%s_%s" % (basename, output_suffix)) if output_suffix else basename) options_list.append(op) self.parallel_execute(options_list, threads=number_of_processes)
def parallel_blast(self, blast_command, seqfile, database, outfile=None, blast_options=None,
                   split_dir="splited_fasta", splited_output_dir="splited_output_dir", evalue=None,
                   output_format=None, threads=None, num_of_seqs_per_scan=None,
                   combine_output_to_single_file=True, async_run=False, external_process_pool=None):

    splited_dir = FileRoutines.check_path(split_dir)
    splited_out_dir = FileRoutines.check_path(splited_output_dir)
    self.safe_mkdir(splited_dir)
    self.safe_mkdir(splited_out_dir)

    number_of_files = num_of_seqs_per_scan if num_of_seqs_per_scan else 5 * threads if threads else 5 * self.threads
    self.split_fasta(seqfile, splited_dir, num_of_files=number_of_files)
    input_list_of_files = sorted(os.listdir(splited_dir))

    list_of_files = []
    for filename in input_list_of_files:
        filename_prefix = FileRoutines.split_filename(filename)[1]
        input_file = "%s%s" % (splited_dir, filename)
        output_file = "%s%s.hits" % (splited_out_dir, filename_prefix)
        list_of_files.append((input_file, output_file))

    options_list = []
    out_files = []
    for in_file, out_filename in list_of_files:
        options = " -out %s" % out_filename
        options += " -db %s" % database
        options += " -query %s" % in_file
        options += " %s" % blast_options if blast_options else ""
        options += " -evalue %s" % evalue if evalue else ""
        options += " -outfmt %i" % output_format if output_format else ""
        options_list.append(options)
        out_files.append(out_filename)

    self.parallel_execute(options_list, cmd=blast_command, threads=threads, async_run=async_run,
                          external_process_pool=external_process_pool)

    if combine_output_to_single_file:
        CGAS.cat(out_files, output=outfile)
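# Hypothetical usage sketch (not part of the original module): assumes a BLAST wrapper
# instance named `blast`; file names are illustrative. The query FASTA is split into
# roughly 5 * threads chunks, each chunk is searched independently, and the per-chunk
# .hits files are concatenated into the final output.
#
#   blast.parallel_blast("blastp", "proteins.fasta", "swissprot_db",
#                        outfile="proteins.vs.swissprot.tab",
#                        evalue=1e-5, output_format=6, threads=8)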
def read(self, vep_tab_file):
    with FileRoutines.metaopen(vep_tab_file, "r") as in_fd:
        metadata = []
        while True:
            line = in_fd.readline()
            if not line:
                # guard added: avoid an infinite loop if the header line is absent
                raise ValueError("ERROR: header line was not found in %s" % vep_tab_file)
            if line[:19] == "#Uploaded_variation":
                header_list = line.strip()[1:].split("\t")
                break
            metadata.append(line)
        # the remainder of the file (after the header line) is the variant table itself
        return pd.read_csv(in_fd, sep='\t', names=header_list)
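# Hypothetical usage sketch (not part of the original module): assumes a collection object
# named `vep_table` exposing the read() method above; the file name is illustrative.
# Lines before the "#Uploaded_variation" header are collected as metadata, and the rest of
# the file is loaded into a pandas DataFrame with the header fields as column names.
#
#   df = vep_table.read("variants.vep.tab")
#   print(df.head())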
def extract_proteins_from_selected_families(families_id_file, fam_file, pep_file, output_dir="./",
                                            pep_format="fasta", out_prefix=None,
                                            create_dir_for_each_family=False):
    from RouToolPa.Routines import SequenceRoutines

    fam_id_list = IdList()
    fam_dict = SynDict()
    #print(pep_file)
    FileRoutines.safe_mkdir(output_dir)
    out_dir = FileRoutines.check_path(output_dir)
    create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
    if families_id_file:
        fam_id_list.read(families_id_file)
    fam_dict.read(fam_file, split_values=True, values_separator=",")
    protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)

    for fam_id in fam_id_list if families_id_file else fam_dict:
        if fam_id in fam_dict:
            if create_directory_for_each_family:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                FileRoutines.safe_mkdir(fam_dir)
                out_file = "%s%s.pep" % (fam_dir, out_prefix if out_prefix else fam_id)
            else:
                out_file = "%s/%s.pep" % (out_dir, out_prefix if out_prefix else fam_id)

            SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict, fam_dict[fam_id], verbose=True),
                        out_file, format=pep_format)
        else:
            print("%s was not found" % fam_id)

    os.remove("tmp.idx")
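# Hypothetical usage sketch (not part of the original module): file names are illustrative.
# Family ids listed in fam.ids are looked up in families.fam (expected to map family ids to
# comma-separated protein ids), and the matching sequences from proteins.pep are written to
# one .pep file per family under selected_families/.
#
#   extract_proteins_from_selected_families("fam.ids", "families.fam", "proteins.pep",
#                                           output_dir="selected_families/",
#                                           create_dir_for_each_family=False)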
def RepeatMasker_search(query_file, species, custom_lib_path=None, RepeatMasker_dir="", num_of_threads=5,
                        search_type="-s"):
    # species: see list of possible species in repeatmasker.help coming with RepeatMasker
    # search_type: "-s" (sensitive), "" (default), "-q" (fast), "-qq" (very fast)
    repmask_dir = FileRoutines.check_path(RepeatMasker_dir)
    custom_lib = ""
    if custom_lib_path:
        custom_lib = "-lib %s" % custom_lib_path  # bug fix: was assigned to a misspelled variable and never used
    # additional options:
    #   -xm     creates an additional output file in cross_match format (for parsing)
    #   -ace    creates an additional output file in ACeDB format
    #   -gff    creates an additional Gene Feature Finding format
    #   -excln  the percentages displayed in the .tbl file are calculated using a
    #           total sequence length excluding runs of 25 Ns or more
    print("\nRepeatMasker search...\n")
    os.system(repmask_dir + "RepeatMasker -excln -xm -ace -gff %s -pa %i -species %s %s %s" % (custom_lib,
                                                                                               num_of_threads,
                                                                                               species,
                                                                                               search_type,
                                                                                               query_file))
def windowmasker_search(windowmasker_dir):
    winmask_dir = FileRoutines.check_path(windowmasker_dir)
    # TODO: write this function
    pass
def make_fasta_dict(fasta_file, dict_name, PICARD_dir=""):
    picard_dir = FileRoutines.check_path(PICARD_dir)
    os.system("java -jar %sCreateSequenceDictionary.jar R= %s O= %s" % (picard_dir, fasta_file, dict_name))
def extract_repbase(species, output_file="RepBase.fasta", RepeatMaskerUtils_dir=""):
    print("\nExtracting RepBase for %s\n" % species)
    repmaskutils_dir = FileRoutines.check_path(RepeatMaskerUtils_dir)
    os.system(repmaskutils_dir + "queryRepeatDatabase.pl -species %s > %s" % (species, output_file))
def rmout2gff3(rmoutfile, outfile, RepeatMaskerUtils_dir=""):
    repmaskutils_dir = FileRoutines.check_path(RepeatMaskerUtils_dir)
    os.system(repmaskutils_dir + "rmOutToGFF3.pl %s > %s" % (rmoutfile, outfile))
def align_samples(self, samples_dir, output_dir, genome_dir, genome_fasta=None, samples=None,
                  annotation_gtf=None, sjdboverhang=None, genomeSAindexNbases=None, genomeChrBinNbits=None,
                  genome_size=None, feature_from_gtf_to_use_as_exon=None, exon_tag_to_use_as_transcript_id=None,
                  exon_tag_to_use_as_gene_id=None, length_of_sequences_flanking_junction=None,
                  junction_tab_file_list=None, three_prime_trim=None, five_prime_trim=None,
                  adapter_seq_for_three_prime_clip=None, max_mismatch_percent_for_adapter_trimming=None,
                  three_prime_trim_after_adapter_clip=None, output_type="BAM", sort_bam=True,
                  max_memory_per_thread_for_bam_sorting="4G", include_unmapped_reads_in_bam=True,
                  output_unmapped_reads=True, two_pass_mode=True, max_intron_length=None, input_is_se=None,
                  filename_fragment_to_mark_se_reads=".se."):
    #STAR.threads = threads
    #STAR.path = star_dir
    if genome_fasta:
        STAR.index(genome_dir, genome_fasta, annotation_gtf=annotation_gtf,
                   junction_tab_file=junction_tab_file_list, sjdboverhang=sjdboverhang,
                   genomeSAindexNbases=genomeSAindexNbases, genomeChrBinNbits=genomeChrBinNbits,
                   genome_size=genome_size)

    sample_list = samples if samples else self.get_sample_list(samples_dir)
    FileRoutines.safe_mkdir(output_dir)

    for sample in sample_list:
        print("Handling %s" % sample)
        sample_dir = "%s/%s/" % (samples_dir, sample)
        alignment_sample_dir = "%s/%s/" % (output_dir, sample)
        FileRoutines.safe_mkdir(alignment_sample_dir)
        filetypes, forward_files, reverse_files, se_files = FileRoutines.make_lists_forward_and_reverse_files(
            sample_dir,
            filename_fragment_to_mark_se_reads=filename_fragment_to_mark_se_reads,
            input_is_se=input_is_se)
        #print(se_files)
        print("\tAligning reads...")
        self.align(genome_dir, forward_files if forward_files else se_files, reverse_read_list=reverse_files,
                   annotation_gtf=annotation_gtf if not genome_fasta else None,
                   feature_from_gtf_to_use_as_exon=feature_from_gtf_to_use_as_exon,
                   exon_tag_to_use_as_transcript_id=exon_tag_to_use_as_transcript_id,
                   exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
                   length_of_sequences_flanking_junction=length_of_sequences_flanking_junction,
                   junction_tab_file_list=junction_tab_file_list,
                   three_prime_trim=three_prime_trim, five_prime_trim=five_prime_trim,
                   adapter_seq_for_three_prime_clip=adapter_seq_for_three_prime_clip,
                   max_mismatch_percent_for_adapter_trimming=max_mismatch_percent_for_adapter_trimming,
                   three_prime_trim_after_adapter_clip=three_prime_trim_after_adapter_clip,
                   output_type=output_type, sort_bam=sort_bam,
                   max_memory_per_thread_for_bam_sorting=max_memory_per_thread_for_bam_sorting,
                   include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
                   output_unmapped_reads=output_unmapped_reads,
                   output_dir=alignment_sample_dir,
                   two_pass_mode=two_pass_mode, max_intron_length=max_intron_length)
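# Hypothetical usage sketch (not part of the original module): assumes a STAR routines
# instance named `star_routines`; directory names are illustrative. Reads for each sample
# are expected under <samples_dir>/<sample>/, and per-sample alignments are written to
# <output_dir>/<sample>/. Passing genome_fasta triggers index construction first.
#
#   star_routines.align_samples("fastq/", "alignments/", "star_index/",
#                               genome_fasta="genome.fasta",
#                               annotation_gtf="annotation.gtf",
#                               sort_bam=True, two_pass_mode=True)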
def parallel_search_tandem_repeat(self, query_file, output_prefix, matching_weight=2, mismatching_penalty=7,
                                  indel_penalty=7, match_probability=80, indel_probability=10,
                                  min_alignment_score=50, max_period=500, report_flanking_sequences=False,
                                  splited_fasta_dir="splited_fasta_dir", splited_result_dir="splited_output",
                                  converted_output_dir="converted_output", max_len_per_file=100000,
                                  store_intermediate_files=False, max_repeat_length=None):
    work_dir = os.getcwd()
    splited_filename = FileRoutines.split_filename(query_file)
    self.split_fasta_by_seq_len(query_file, splited_fasta_dir, max_len_per_file=max_len_per_file,
                                output_prefix=splited_filename[1])

    common_options = self.parse_common_options(matching_weight=matching_weight,
                                               mismatching_penalty=mismatching_penalty,
                                               indel_penalty=indel_penalty,
                                               match_probability=match_probability,
                                               indel_probability=indel_probability,
                                               min_alignment_score=min_alignment_score,
                                               max_period=max_period,
                                               report_flanking_sequences=report_flanking_sequences,
                                               make_dat_file=True,
                                               max_repeat_length=max_repeat_length)
    common_options += " -h"  # suppress html output
    options_list = []
    splited_files = os.listdir(splited_fasta_dir)

    FileRoutines.safe_mkdir(splited_result_dir)
    FileRoutines.safe_mkdir(converted_output_dir)
    os.chdir(splited_result_dir)
    input_dir = splited_fasta_dir if (splited_fasta_dir[0] == "/") or (splited_fasta_dir[0] == "~") \
        else "../%s" % splited_fasta_dir

    for filename in splited_files:
        file_options = "%s/%s" % (input_dir, filename)
        file_options += common_options
        options_list.append(file_options)

    self.parallel_execute(options_list)

    os.chdir(work_dir)

    trf_output_file_list = []
    for filename in splited_files:
        trf_output_file = "%s/%s.%i.%i.%i.%i.%i.%i.%i.dat" % (splited_result_dir, filename, matching_weight,
                                                              mismatching_penalty, indel_penalty,
                                                              match_probability, indel_probability,
                                                              min_alignment_score, max_period)
        trf_output_file_list.append(trf_output_file)

    trf_report = self.convert_trf_report(trf_output_file_list, output_prefix)
    """
    for suffix in (".rep", ".gff", ".simple.gff", ".short.tab", ".wide.tab", ".with_rep_seqs.gff", ".fasta"):
        file_str = ""
        merged_file = "%s%s" % (output_prefix, suffix)
        for filename in splited_files:
            file_str += " %s/%s%s" % (converted_output_dir, filename, suffix)
        CGAS.cat(file_str, merged_file)
    """
    compress_splited_out_string = "tar czf %s.splited_output.tar.gz %s" % (output_prefix, splited_result_dir)
    os.system(compress_splited_out_string)

    if not store_intermediate_files:
        shutil.rmtree(splited_fasta_dir)
        shutil.rmtree(splited_result_dir)
        shutil.rmtree(converted_output_dir)
    return trf_report
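# Hypothetical usage sketch (not part of the original module): assumes a TRF wrapper
# instance named `trf`; file names are illustrative. The input FASTA is split into chunks
# of at most max_len_per_file bases, TRF runs on each chunk in parallel, the per-chunk .dat
# reports are converted and merged under output_prefix, and the raw per-chunk output is
# archived as <output_prefix>.splited_output.tar.gz.
#
#   trf_report = trf.parallel_search_tandem_repeat("genome.fasta", "genome.trf",
#                                                  max_len_per_file=1000000,
#                                                  store_intermediate_files=False)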
def draw_window_density_distribution(self, count_dict, window_size, output_prefix=None,
                                     suptitle="SNP density distribution", density_multiplicator=1000,
                                     number_of_bins=None, width_of_bins=None, max_threshold=None,
                                     min_threshold=None, scaffold_black_list=[], scaffold_white_list=[],
                                     sort_scaffolds=False, scaffold_ordered_list=None, subplot_size=4,
                                     per_scaffold_histo_dir="per_scaffold_histo_dir/", subplot_tuple=None,
                                     share_x_axis=True, share_y_axis=True, extensions=("png",),
                                     show_mean_and_median=True):
    """
    scaffold_threshold: if number of scaffolds is higher draw only separated_histograms
    """
    samples_list = count_dict.keys()
    final_scaffold_list = self.get_filtered_scaffold_list(count_dict,
                                                          scaffold_black_list=scaffold_black_list,
                                                          sort_scaffolds=sort_scaffolds,
                                                          scaffold_ordered_list=scaffold_ordered_list,
                                                          scaffold_white_list=scaffold_white_list)
    scaffold_number = len(final_scaffold_list)

    FileRoutines.safe_mkdir(per_scaffold_histo_dir)

    xlabel = "Number of SNPs"
    ylabel = "Number of windows"

    scaled_count_dict = OrderedDict()

    empty_windows_scaffold_dict = OrderedDict()
    for scaffold in final_scaffold_list:
        for sample in count_dict:
            if scaffold not in count_dict[sample]:
                continue
            empty_windows_scaffold_dict[scaffold] = np.zeros(len(count_dict[sample][scaffold]))
            break

    for sample in samples_list:
        scaled_count_dict[sample] = OrderedDict()
        for scaffold in final_scaffold_list:
            if scaffold not in count_dict[sample]:
                scaled_count_dict[sample][scaffold] = empty_windows_scaffold_dict[scaffold]
                continue  # bug fix: the placeholder was previously overwritten, raising KeyError
            # list() added so the scaling also works under Python 3, where map() is lazy
            scaled_count_dict[sample][scaffold] = np.array(list(map(float, count_dict[sample][scaffold]))) \
                                                  * density_multiplicator / window_size

    print("Drawing separated histograms for each scaffold...")
    extended_label_dict = OrderedDict()
    for scaffold in final_scaffold_list:
        print("Drawing histogram for scaffold %s" % scaffold)
        #scaffold_data = [scaled_count_dict[sample][scaffold] if scaffold in scaled_count_dict[sample] else empty_windows_scaffold_dict[scaffold] for sample in samples_list]
        scaffold_data = [scaled_count_dict[sample][scaffold] for sample in samples_list]
        out_prefix = "%s/%s.%s" % (per_scaffold_histo_dir, output_prefix, scaffold) if output_prefix \
            else "%s/%s" % (per_scaffold_histo_dir, scaffold)

        for sample in samples_list:
            median = np.median(scaled_count_dict[sample][scaffold])
            mean = np.mean(scaled_count_dict[sample][scaffold])
            extended_label = "%s: Med. %.2f, Avg: %.2f" % (sample, float(median), float(mean))
            print(extended_label)
            if scaffold in extended_label_dict:
                extended_label_dict[scaffold].append(extended_label)
            else:
                extended_label_dict[scaffold] = [extended_label]
        #print scaffold_data
        self.draw_histogram(scaffold_data, output_prefix=out_prefix, number_of_bins=number_of_bins,
                            width_of_bins=width_of_bins, max_threshold=max_threshold,
                            min_threshold=min_threshold, xlabel=xlabel, ylabel=ylabel,
                            title=scaffold, extensions=extensions, ylogbase=None, subplot=None,
                            suptitle=None, close_figure=True,
                            data_label_list=extended_label_dict[scaffold] if show_mean_and_median else samples_list)
    #print scaled_count_dict
    print("Drawing histograms for all scaffolds on same figure...")
    data = list(recursive_generator(scaled_count_dict))
    min_value = min(data) if data else 0
    max_value = max(data) if data else 0
    #print len(scaled_count_dict)
    #print data
    bin_array = self.generate_bin_array(data, y_list=None, bin_number=number_of_bins, bin_width=width_of_bins,
                                        bin_array=None, min_x_value=min_threshold, max_x_value=max_threshold,
                                        min_y_value=None, max_y_value=None, add_max_value=True)

    plt.suptitle(suptitle)
    if subplot_tuple is None:
        side = math.sqrt(scaffold_number)
        rounded_side = int(side)
        side = rounded_side + 1 if side % rounded_side else rounded_side
        subplot_tupleee = (side, side)
        #print subplot_tupleee
    else:
        subplot_tupleee = subplot_tuple
        if len(subplot_tupleee) != 2:
            raise ValueError("Subplot tuple should contain exactly two values, not %i!" % len(subplot_tuple))
        if not (isinstance(subplot_tuple[0], int) and isinstance(subplot_tuple[1], int)):
            raise ValueError("Subplot tuple should contain two values, not (%s, %s)!" % (str(type(subplot_tuple[0])),
                                                                                         str(type(subplot_tuple[1]))))

    figure = plt.figure(256, figsize=(subplot_size * subplot_tupleee[0], subplot_size * subplot_tupleee[1]),
                        dpi=200)
    print(subplot_size * subplot_tupleee[0], subplot_size * subplot_tupleee[1])
    number_of_subplots = subplot_tupleee[0] * subplot_tupleee[1]

    subplot_list = []
    for dataset_index in range(0, len(final_scaffold_list)):
        scaffold = final_scaffold_list[dataset_index]
        if dataset_index > 0:
            if share_x_axis and share_y_axis:
                subplot_list.append(figure.add_subplot(subplot_tupleee[0], subplot_tupleee[1], dataset_index + 1,
                                                       sharex=subplot_list[0], sharey=subplot_list[0]))
            elif share_x_axis:
                subplot_list.append(figure.add_subplot(subplot_tupleee[0], subplot_tupleee[1], dataset_index + 1,
                                                       sharex=subplot_list[0]))
            elif share_y_axis:
                # bug fix: share only the y axis here (the x axis was also shared in the original)
                subplot_list.append(figure.add_subplot(subplot_tupleee[0], subplot_tupleee[1], dataset_index + 1,
                                                       sharey=subplot_list[0]))
            else:
                subplot_list.append(figure.add_subplot(subplot_tupleee[0], subplot_tupleee[1], dataset_index + 1))
        else:
            subplot_list.append(figure.add_subplot(subplot_tupleee[0], subplot_tupleee[1], dataset_index + 1))
        #print dataset_index + 1
        #print subplot_tupleee[0] * (subplot_tupleee[1] - 1)
        #print ((dataset_index + 1) > (subplot_tupleee[0] * (subplot_tupleee[1] - 1)))
        histo = self.draw_histogram([scaled_count_dict[sample][scaffold] for sample in samples_list],
                                    number_of_bins=None, width_of_bins=None, max_threshold=None,
                                    min_threshold=None,
                                    xlabel=xlabel if ((dataset_index + 1) > (subplot_tupleee[0] * (subplot_tupleee[1] - 1))) else None,
                                    ylabel=ylabel if ((dataset_index + 1) % subplot_tupleee[0]) == 1 else None,
                                    title=scaffold, extensions=("png",), ylogbase=None,
                                    subplot=subplot_list[dataset_index], suptitle=None,
                                    data_label_list=extended_label_dict[scaffold] if show_mean_and_median else samples_list,
                                    bin_array=bin_array)
        plt.xlim(xmin=min_threshold if min_threshold and (min_threshold >= min_value) else min_value,
                 xmax=max_threshold if max_threshold and (max_threshold <= max_value) else max_value)
        #print histo
        """
        if output_prefix:
            output_histo_file = "%s.%s.%shisto" % (output_prefix,
                                                   dataset_index if parameters[8] is None else parameters[10],
                                                   ("log%i." % parameters[7]) if parameters[7] else "")
            np.savetxt(output_histo_file, histo, fmt="%i\t%i")
        """

    plt.tight_layout()
    if output_prefix:
        for ext in extensions:
            plt.savefig("%s.%s" % (output_prefix, ext))

    plt.close(figure)

    print("Drawing combined histogram for all scaffolds...")

    combined_count_dict = OrderedDict()
    extended_combined_label_list = []
    for sample in samples_list:
        combined_count_dict[sample] = []
        for scaffold in count_dict[sample]:
            combined_count_dict[sample] = combined_count_dict[sample] + count_dict[sample][scaffold]

        combined_count_dict[sample] = np.array(list(map(float, combined_count_dict[sample]))) \
                                      * density_multiplicator / window_size

        median = np.median(combined_count_dict[sample])
        mean = np.mean(combined_count_dict[sample])
        extended_label = "%s: Med. %.2f, Avg: %.2f" % (sample, float(median), float(mean))
        print(extended_label)
        extended_combined_label_list.append(extended_label)
    #print combined_count_dict
    figure = plt.figure(384, figsize=(8, 8))
    self.draw_histogram([combined_count_dict[sample] for sample in combined_count_dict],
                        output_prefix="%s.combined" % output_prefix if output_prefix else "combined",
                        number_of_bins=number_of_bins, width_of_bins=width_of_bins,
                        max_threshold=max_threshold, min_threshold=min_threshold, xlabel=xlabel, ylabel=ylabel,
                        title="SNP density distribution (all scaffolds)", extensions=extensions, ylogbase=None,
                        subplot=None, suptitle=None, close_figure=True,
                        data_label_list=extended_combined_label_list if show_mean_and_median else samples_list)
def parallel_codeml(self, in_dir, tree_file, out_dir, seq_type="codons", codon_frequency="F3X4", noisy=0,
                    verbose="concise", runmode=0, clock=0, aminoacid_distance=None, model=1, nssites=0,
                    genetic_code=0, fix_kappa=False, kappa=5, fix_omega=False, omega=0.2, getSE=0,
                    RateAncestor=0, small_difference=0.000001, clean_data=True, method=0, Mgene=None):
    FileRoutines.safe_mkdir(out_dir)
    alignment_files_list = FileRoutines.make_list_of_path_to_files(in_dir)
    tree_file_abs_path = os.path.abspath(tree_file)
    options_list = []
    dir_list = []
    for filename in alignment_files_list:
        directory, basename, extension = FileRoutines.split_filename(filename)
        filename_out_dir = os.path.abspath("%s/%s/" % (out_dir, basename))
        out_file = "%s/%s.out" % (filename_out_dir, basename)
        ctl_file = "%s/%s.ctl" % (filename_out_dir, basename)

        options_list.append(ctl_file)
        dir_list.append(filename_out_dir)

        FileRoutines.safe_mkdir(filename_out_dir)
        self.generate_ctl_file(os.path.abspath(filename), tree_file_abs_path, out_file, ctl_file,
                               seq_type=seq_type, codon_frequency=codon_frequency, noisy=noisy, verbose=verbose,
                               runmode=runmode, clock=clock, aminoacid_distance=aminoacid_distance, model=model,
                               nssites=nssites, genetic_code=genetic_code, fix_kappa=fix_kappa, kappa=kappa,
                               fix_omega=fix_omega, omega=omega, getSE=getSE, RateAncestor=RateAncestor,
                               Mgene=Mgene, small_difference=small_difference, clean_data=clean_data,
                               method=method)
    self.parallel_execute(options_list, dir_list=dir_list)
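# Hypothetical usage sketch (not part of the original module): assumes a codeml wrapper
# instance named `codeml`; paths are illustrative. One control file and output directory
# is generated per alignment found in the input directory, and the codeml runs are then
# executed in parallel.
#
#   codeml.parallel_codeml("codon_alignments/", "species_tree.nwk", "codeml_out/",
#                          model=1, nssites=0, genetic_code=0)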
def parallel_positive_selection_test(self, in_dir, tree_file, out_dir, results_file, seq_type="codons",
                                     codon_frequency="F3X4", noisy=3, verbose="concise", runmode=0, clock=0,
                                     aminoacid_distance=None, genetic_code=0, fix_kappa=False, kappa=5,
                                     getSE=0, RateAncestor=0, small_difference=0.000001, clean_data=True,
                                     method=0):
    """
    This function implements a positive selection test (branch-site model) for the branches labeled in the
    tree file, using the Model_A vs Model_A_null (omega fixed to 1) comparison
    """
    FileRoutines.safe_mkdir(out_dir)
    alignment_files_list = FileRoutines.make_list_of_path_to_files(in_dir)
    tree_file_abs_path = os.path.abspath(tree_file)
    options_list = []
    dir_list = []
    basename_dir_list = []
    model_list = ["Model_A", "Model_A_null"]
    fix_omega_dict = {"Model_A": False, "Model_A_null": True}

    for filename in alignment_files_list:
        directory, basename, extension = FileRoutines.split_filename(filename)
        filename_out_dir = os.path.abspath("%s/%s/" % (out_dir, basename))
        basename_dir_list.append(basename)
        FileRoutines.safe_mkdir(filename_out_dir)

        for model in model_list:
            model_dir = "%s/%s/" % (filename_out_dir, model)
            FileRoutines.safe_mkdir(model_dir)
            out_file = "%s/%s/%s.out" % (filename_out_dir, model, basename)
            ctl_file = "%s/%s/%s.ctl" % (filename_out_dir, model, basename)

            options_list.append("%s.ctl" % basename)
            dir_list.append(model_dir)

            self.generate_ctl_file(os.path.abspath(filename), tree_file_abs_path, out_file, ctl_file,
                                   seq_type=seq_type, codon_frequency=codon_frequency, noisy=noisy,
                                   verbose=verbose, runmode=runmode, clock=clock,
                                   aminoacid_distance=aminoacid_distance, model=2, nssites=2,
                                   genetic_code=genetic_code, fix_kappa=fix_kappa, kappa=kappa,
                                   fix_omega=fix_omega_dict[model], omega=1, getSE=getSE,
                                   RateAncestor=RateAncestor, Mgene=0, small_difference=small_difference,
                                   clean_data=clean_data, method=method)
    self.parallel_execute(options_list, dir_list=dir_list)

    results_dict = OrderedDict()
    double_delta_dict = OrderedDict()
    raw_pvalues_dict = OrderedDict()
    raw_pvalues_list = []

    for basename in basename_dir_list:
        results_dict[basename] = OrderedDict()
        for model in model_list:
            output_file = "%s/%s/%s/%s.out" % (out_dir, basename, model, basename)
            codeml_report = CodeMLReport(output_file)
            results_dict[basename][model] = codeml_report.LnL

    skipped_genes_set = set()
    for basename in basename_dir_list:
        for model in model_list:
            if results_dict[basename][model] is None:
                print("LnL was not calculated for %s" % basename)
                skipped_genes_set.add(basename)
                break
        else:
            doubled_delta = 2 * (results_dict[basename]["Model_A"] - results_dict[basename]["Model_A_null"])
            p_value = chisqprob(doubled_delta, 1)  # degrees of freedom = 1

            double_delta_dict[basename] = doubled_delta
            raw_pvalues_dict[basename] = p_value
            raw_pvalues_list.append(p_value)

    adjusted_pvalues_list = fdrcorrection0(raw_pvalues_list)[1]
    #print adjusted_pvalues_list
    i = 0
    with open(results_file, "w") as out_fd:
        out_fd.write("id\tmodel_a_null,LnL\tmodel_a,LnL\t2*delta\traw p-value\tadjusted p-value\n")
        for basename in basename_dir_list:
            for model in model_list:
                if results_dict[basename][model] is None:
                    print("LnL was not calculated for %s" % basename)
                    break
            else:
                #doubled_delta = 2 * (results_dict[basename]["Model_A"] - results_dict[basename]["Model_A_null"])
                #p_value = chisqprob(doubled_delta, 1)  # degrees of freedom = 1
                #print basename, results_dict[basename]["Model_A_null"], results_dict[basename]["Model_A"], double_delta_dict[basename], raw_pvalues_dict[basename], adjusted_pvalues_list[i]
                out_fd.write("%s\t%f\t%f\t%f\t%f\t%f\n" % (basename,
                                                           results_dict[basename]["Model_A_null"],
                                                           results_dict[basename]["Model_A"],
                                                           double_delta_dict[basename],
                                                           raw_pvalues_dict[basename],
                                                           adjusted_pvalues_list[i]))
                i += 1
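# Worked mini-example of the likelihood-ratio test performed above (values are invented for
# illustration): with lnL(Model_A) = -2310.4 and lnL(Model_A_null) = -2314.9,
# 2*delta = 2 * (-2310.4 - (-2314.9)) = 9.0, and the raw p-value is the chi-square survival
# function with 1 degree of freedom, chisqprob(9.0, 1) ~= 0.0027; the raw p-values of all
# genes are then FDR-adjusted with fdrcorrection0().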
def parallel_predict(self, species, genome_file, output, strand="both", gene_model=None, output_gff3=True,
                     other_options="", split_dir="splited_input", splited_output_dir="splited_output_dir",
                     config_dir=None, combine_output_to_single_file=True, use_softmasking=None, hints_file=None,
                     extrinsicCfgFile=None, predict_UTR=None, external_process_pool=None, async_run=False,
                     min_intron_len=None, parsing_mode="parse"):

    common_options = self.parse_options(species, genome_file="", strand=strand, gene_model=gene_model,
                                        output_gff3=output_gff3, other_options=other_options,
                                        config_dir=config_dir, use_softmasking=use_softmasking,
                                        hints_file=hints_file, extrinsicCfgFile=extrinsicCfgFile,
                                        predict_UTR=predict_UTR, min_intron_len=min_intron_len)

    splited_dir = FileRoutines.check_path(split_dir)
    splited_out_dir = FileRoutines.check_path(splited_output_dir)

    FileRoutines.safe_mkdir(splited_dir)
    FileRoutines.safe_mkdir(splited_out_dir)

    self.split_fasta_by_seq_len(genome_file, splited_dir, parsing_mode=parsing_mode)

    input_list_of_files = sorted(os.listdir(splited_dir))
    list_of_output_files = []
    options_list = []

    for filename in input_list_of_files:
        input_file = "%s%s" % (splited_dir, filename)
        output_file = "%s%s.gff" % (splited_out_dir, filename)
        list_of_output_files.append(output_file)
        options = common_options
        options += " %s" % input_file
        options += " > %s" % output_file
        options_list.append(options)

    self.parallel_execute(options_list, external_process_pool=external_process_pool, async_run=async_run)

    if combine_output_to_single_file:
        CGAS.cat(list_of_output_files, output=output)
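# Hypothetical usage sketch (not part of the original module): assumes an AUGUSTUS wrapper
# instance named `augustus`; file names are illustrative. The genome is split by sequence
# length, AUGUSTUS runs on each chunk in parallel, and the per-chunk GFFs are concatenated
# into the final output file.
#
#   augustus.parallel_predict("human", "genome.fasta", "augustus.predictions.gff",
#                             strand="both", output_gff3=True, predict_UTR=True)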