def combine_count_files(count_file_list, output_file, sample_name_list=None):
    if sample_name_list is not None:
        if len(count_file_list) != len(sample_name_list):
            raise ValueError("Number of count files does not match number of sample names")
    samples = zip(sample_name_list if sample_name_list else count_file_list,
                  count_file_list)
    count_table = TwoLvlDict()
    for sample, filename in samples:
        count_table[sample] = SynDict(filename=filename, header=False, separator="\t",
                                      allow_repeats_of_key=False, split_values=False,
                                      values_separator=",", key_index=0, value_index=1,
                                      close_after_if_file_object=False, expression=None,
                                      comments_prefix="__")
    count_table.write(output_file)
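# Hypothetical usage sketch for combine_count_files (the file and sample names below are
# illustrative placeholders, not taken from the original code):
#
# combine_count_files(["sampleA.htseq.count", "sampleB.htseq.count"],
#                     "combined_counts.tab",
#                     sample_name_list=["sampleA", "sampleB"])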
def count_locations(self, annotation_black_list=[],
                    allow_several_counts_of_record=False,
                    out_filename="location_counts.t",
                    write=True,
                    count_dir="location_counts"):
    os.system("mkdir -p %s" % count_dir)
    regions_dict = self._split_regions()
    region_counts_dict = TwoLvlDict({})
    for region in regions_dict:
        count_locations_dict = {"igc": 0, "unknown": 0}
        for record in regions_dict[region]:
            # check for presence of the "Loc" key before accessing it
            if ("Loc" not in record.description) or (not record.description["Loc"]):
                count_locations_dict["unknown"] += 1
                continue
            #print(record.description["Loc"])
            if allow_several_counts_of_record:
                for location in record.description["Loc"]:
                    if location in annotation_black_list:
                        continue
                    if location not in count_locations_dict:
                        count_locations_dict[location] = 1
                    else:
                        count_locations_dict[location] += 1
            else:
                full_location = []
                for location in record.description["Loc"]:
                    if location in annotation_black_list:
                        continue
                    full_location.append(location)
                if not full_location:
                    continue
                full_location.sort()
                full_location = "/".join(full_location)
                if full_location not in count_locations_dict:
                    count_locations_dict[full_location] = 1
                else:
                    count_locations_dict[full_location] += 1
        labels = []
        counts = []
        #colors = []
        for location in count_locations_dict:
            if count_locations_dict[location] == 0 or location in annotation_black_list:
                continue
            labels.append(location)
            counts.append(count_locations_dict[location])
        region_counts_dict[region] = OrderedDict([(label, count) for label, count in zip(labels, counts)])
    if write:
        region_counts_dict.write("%s/%s" % (count_dir, out_filename))
    return region_counts_dict
def results_extraction_listener(queue, output_file_prefix, selected_species_list=None):
    """listens for messages on the queue, writes to file."""
    positive_selection_dict = TwoLvlDict()
    selected_species_positive_selection_dict = TwoLvlDict()
    error_fd = open("errors.err", "w")
    error_fd.write("#sample\terror_code\n")
    while 1:
        result = queue.get()
        if isinstance(result[1], int):
            error_fd.write("%s\t%i\n" % (result[0], result[1]))
            continue
        if result == 'finish':
            positive_selection_dict.write("%s.all" % output_file_prefix, absent_symbol=".")
            if selected_species_list:
                selected_species_positive_selection_dict.write("%s.selected_species" % output_file_prefix,
                                                               absent_symbol=".")
            # print positive_selection_dict.table_form(absent_symbol=".")
            break
        if result[1]:
            positive_selection_dict[result[0]] = result[1]
            if selected_species_list:
                for species in selected_species_list:
                    if species in result[1]:
                        if result[0] not in selected_species_positive_selection_dict:
                            selected_species_positive_selection_dict[result[0]] = {}
                        selected_species_positive_selection_dict[result[0]][species] = result[1][species]
def get_general_stats(self):
    stat_dict = TwoLvlDict()
    for report_id in self:
        stat_dict[report_id] = OrderedDict()
        stat_dict[report_id]["machine_number"] = len(self[report_id].machine_id_list)
        stat_dict[report_id]["machine_ids"] = self[report_id].machine_id_list
        stat_dict[report_id]["flowcell_number"] = len(self[report_id].flowcell_id_list)
        stat_dict[report_id]["flowcell_ids"] = self[report_id].flowcell_id_list
        stat_dict[report_id]["lane_number"] = len(self[report_id].lane_table)
        stat_dict[report_id]["full_lane_ids"] = self[report_id].full_lane_id_list
        stat_dict[report_id]["short_lane_ids"] = self[report_id].short_lane_id_list
        stat_dict[report_id]["input_pairs"] = self[report_id].input_pairs
        stat_dict[report_id]["retained_pairs"] = self[report_id].retained_pairs
        stat_dict[report_id]["retained_pairs_fraction"] = self[report_id].retained_pairs_fraction
        stat_dict[report_id]["retained_forward_only"] = self[report_id].retained_forward_only
        stat_dict[report_id]["retained_reverse_only"] = self[report_id].retained_reverse_only
        stat_dict[report_id]["both_discarded"] = self[report_id].both_discarded
        stat_dict[report_id]["min_retained_pairs_in_tiles_fraction"] = self[report_id].minimum_retained_pairs_in_tiles_fraction
    return stat_dict
def write_stats(self, output_prefix):
    Ns_dict = TwoLvlDict()
    gaps_dict = TwoLvlDict()
    for record_id in self.records:
        Ns_dict[self.records[record_id].id] = self.records[record_id].N_counts
        gaps_dict[self.records[record_id].id] = self.records[record_id].gap_counts
    Ns_dict.write(out_filename="%s.N_counts" % output_prefix)
    gaps_dict.write(out_filename="%s.gaps_counts" % output_prefix)
def count_types(self, output_file=None, total_output_file=None, return_mode="chrom"): annotated_types = self.get_annotated_types() count_dict = TwoLvlDict() total_count_dict = OrderedDict() for type in annotated_types: total_count_dict[type] = OrderedDict() total_count_dict[type]["complete"] = 0 total_count_dict[type]["partial"] = 0 for chrom in self.records: count_dict[chrom] = OrderedDict() for type in annotated_types: count_dict[chrom][type] = 0 for chrom in self.records: for record in self.records[chrom]: count_dict[chrom][record.type] += 1 if record.partial: total_count_dict[record.type]["partial"] += 1 else: total_count_dict[record.type]["complete"] += 1 if output_file: count_dict.write(output_file) if total_output_file: with open(total_output_file, "w") as out_fd: out_fd.write("#rRNA\tComplete%s\tPartial%s\n" % ("(>%.2f of expected length)" % self.partial_threshold if self.partial_threshold else "", "(<%.2f of expected length)" % self.partial_threshold if self.partial_threshold else "")) for type in total_count_dict: out_fd.write("%s\t%i\t%i\n" % (type, total_count_dict[type]["complete"], total_count_dict[type]["partial"])) if return_mode == "chrom": return count_dict elif return_mode == "total": return total_count_dict elif return_mode == "both": return count_dict, total_count_dict else: raise ValueError("Unknown return type. Allowed variants: 'chrom', 'total', 'both'")
def count_reads_and_bases(self, fastq_file_list, stat_file=None):
    fastq_list = [fastq_file_list] if isinstance(fastq_file_list, str) else fastq_file_list
    counts = TwoLvlDict()
    for fastq_file in fastq_list:
        counts[fastq_file] = OrderedDict()
        counts[fastq_file]["Reads"] = 0
        counts[fastq_file]["Bases"] = 0
    for fastq_file in fastq_list:
        with self.metaopen(fastq_file, "r") as fastq_fd:
            for line in fastq_fd:                                        # header line of the 4-line FASTQ record
                counts[fastq_file]["Bases"] += len(fastq_fd.readline())  # sequence line
                counts[fastq_file]["Reads"] += 1
                fastq_fd.readline()                                      # "+" separator line
                fastq_fd.readline()                                      # quality line
        # subtract one base per read to take into account the "\n" at the end of each sequence line
        counts[fastq_file]["Bases"] = counts[fastq_file]["Bases"] - counts[fastq_file]["Reads"]
    counts.write()
    if stat_file:
        counts.write(stat_file)
    return counts
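# Dependency-free sketch of the same 4-lines-per-FASTQ-record counting logic used above
# (illustration only; the helper name is hypothetical and not part of the original code):
def count_fastq_reads_and_bases_sketch(path):
    reads, bases = 0, 0
    with open(path) as fd:
        for header in fd:            # @read_id line
            seq = fd.readline()      # sequence line
            fd.readline()            # "+" separator line
            fd.readline()            # quality line
            reads += 1
            bases += len(seq.rstrip("\n"))
    return reads, bases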
def get_general_stats(self):
    stat_dict = TwoLvlDict()
    for report_id in self:
        stat_dict[report_id] = OrderedDict()
        stat_dict[report_id]["input_pairs"] = self[report_id].input_pairs
        stat_dict[report_id]["pairs_without_adapters"] = self[report_id].retained_pairs
        stat_dict[report_id]["pairs_without_adapters_fraction"] = self[report_id].retained_pairs_fraction
    return stat_dict
def get_results(samples_list, data_type):
    results = TwoLvlDict()
    for sample in samples_list:
        results[sample] = OrderedDict()
        filename = "%s/all_reads/%s_all_%s_coverage.tab" % (sample, sample, data_type)
        data = read_data(filename)
        if not data:
            print(sample)
            continue
        #print sample
        for gene in data:
            results[sample][gene] = data[gene]
        for proportions, name in zip([[1, 2], [2, 1], [1, 1]], ["1:2", "2:1", "1:1"]):
            chi_results = calculate_chi_squared(data, proportions)
            #print name
            results[sample][name + " Chi"] = chi_results[0]
            results[sample][name + " p-value"] = chi_results[1]
            #print chi_results
    return results
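# Hedged sketch of what the calculate_chi_squared() helper used above could look like:
# it is defined elsewhere in the original codebase, so the signature and return value here
# are assumptions for illustration only (scipy performs the actual test).
import scipy.stats as ss_stats

def calculate_chi_squared_sketch(observed_counts, proportions):
    # observed_counts: e.g. [coverage_allele1, coverage_allele2]; proportions: expected ratio, e.g. [1, 2]
    total = float(sum(observed_counts))
    expected = [total * p / sum(proportions) for p in proportions]
    chi, p_value = ss_stats.chisquare(observed_counts, f_exp=expected)
    return chi, p_value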
def get_general_stats(self):
    stat_dict = TwoLvlDict()
    for report_id in self:
        stat_dict[report_id] = OrderedDict()
        stat_dict[report_id]["Number of distinct kmers"] = self[report_id]["Number of distinct kmers"]
        stat_dict[report_id]["Fraction of distinct kmers with errors"] = self[report_id]["Fraction of distinct kmers with errors"]
        stat_dict[report_id]["Total number of kmers"] = self[report_id]["Total number of kmers"]
        stat_dict[report_id]["Total number of kmers with errors"] = self[report_id]["Total number of kmers with errors"]
        stat_dict[report_id]["Fraction of kmers with errors"] = self[report_id]["Fraction of kmers with errors"]
        stat_dict[report_id]["Width of first peak"] = self[report_id]["Width of first peak"]
        stat_dict[report_id]["Mean kmer multiplicity in first peak"] = self[report_id]["Mean kmer multiplicity in first peak"]
        stat_dict[report_id]["Kmer multiplicity at first maximum"] = self[report_id]["Kmer multiplicity at first maximum"]
        stat_dict[report_id]["Standard deviation of kmer multiplicity in first peak"] = self[report_id]["Standard deviation of kmer multiplicity in first peak"]
        stat_dict[report_id]["Variance coefficient of kmer multiplicity in first peak"] = self[report_id]["Variance coefficient of kmer multiplicity in first peak"]
        if "Estimated genome size, bp" in self[report_id]:
            stat_dict[report_id]["Estimated genome size, bp"] = self[report_id]["Estimated genome size, bp"]
    return stat_dict
from Parsers.CCF import CollectionCCF def get_intersection_length(start1, end1, start2, end2): if start1 - end2 > 0 or start2 - end1 > 0: return 0 start_shift = start1 - start2 start_coef_shift = 0 if start_shift < 0 else 1 end_shift = end1 - end2 end_coef_shift = 0 if end_shift > 0 else 1 return (end2 - start2 + 1) - start_coef_shift * start_shift + end_coef_shift * end_shift overlap_clusters_percent = TwoLvlDict({}) #size = 8 #power = 0.05 print([float(f) / float(100) for f in range(1, 11)]) for size in range(3, 11): overlap_clusters_percent[size] = {} for power in [float(f) / float(100) for f in range(1, 11)]: PmCDA1_3d_clusters = CollectionCCF( from_file=True, input_file= "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_3d_size_%i+_power_%.2f+_good.ccf" % (size, power, size, power)) PmCDA1_3d_sub_clusters = CollectionCCF( from_file=True, input_file=
"Length of labels list is not equal to number of files with assemblies" ) assemblies_dict = OrderedDict() for i in range(0, len(args.input_file_list)): assembly_label = args.labels_list[i] if args.labels_list else "A%i" % (i + 1) tmp_index = "%s.tmp.idx" % assembly_label assemblies_dict[assembly_label] = SequenceRoutines.parse_seq_file( args.input_file_list[i], args.parsing_mode, format=args.format, index_file=tmp_index) #SeqIO.index_db(tmp_index, args.input_file_list[i],format=args.format) assembly_N50_dict = TwoLvlDict() assembly_L50 = TwoLvlDict() assembly_bins = [] assembly_contig_cumulative_length = OrderedDict() assembly_contig_number_values = OrderedDict() assembly_general_stats = TwoLvlDict() assembly_length_array = OrderedDict() assembly_lengths = TwoLvlDict() for assembly in assemblies_dict: lengths_array, N50_dict, L50_dict, length_dict, total_length, longest_contig, Ns_number, bins, contig_cumulative_length_values, \ contig_number_values = SequenceRoutines.calculate_assembly_stats(assemblies_dict[assembly], thresholds_list=args.thresholds, seq_len_file="%s.%s.len" % (args.output_prefix, assembly)) assembly_N50_dict[assembly] = N50_dict assembly_L50[assembly] = L50_dict assembly_contig_cumulative_length[
species_list = sorted(args.species_set) if args.white_list_file and args.black_list_file: raise ValueError("Black list and white list cant be set simultaneously") black_list = IdList() white_list = IdList() if args.black_list_file: black_list.read(args.black_list_file) if args.white_list_file: white_list.read(args.white_list_file) out_fd = open(args.cafe_file, "w") filtered_fd = open("%sfiltered_families.cafe" % args.filtered_family_dir, "w") out_fd.write("FAMILYDESC\tFAMILY\t%s\n" % ("\t".join(species_list))) filtered_fd.write("FAMILYDESC\tFAMILY\t%s\n" % ("\t".join(species_list))) species_filtered_fd_list = OrderedDict() fam_count_dict = TwoLvlDict() species_family_dict = TwoLvlDict() for species in args.species_set: species_family_dict[species] = SynDict() species_family_dict[species].read( "%s%s%s" % (FileRoutines.check_path(args.input), species, args.suffix), split_values=True, values_separator=",", separator="\t") #print species_family_dict[species] fam_count_dict[species] = species_family_dict[species].count_synonyms() #print fam_count_dict[species] species_filtered_fd_list[species] = open( "%s%s.fam" % (args.filtered_family_dir, species), "w") for family in fam_count_dict.sl_keys():
"snoRNA": "ncRNA", "snRNA": "ncRNA" } annotation_black_list = ["gene", "region", "ARS", "long_terminal_repeat", "noncoding_exon", "intron", "repeat_region", "telomere", "gene_cassette", "five_prime_UTR_intron"] with open(args.annotations) as gff_fd: for record in GFF.parse(gff_fd): annotations_dict[record.id] = record bad_region_dict = {} with open(args.masking) as gff_fd: for record in GFF.parse(gff_fd): bad_region_dict[record.id] = record statistics_dict = TwoLvlDict(OrderedDict({})) print("Handling %s" % sample) statistics_dict[sample] = OrderedDict({}) os.system("mkdir -p %s" % clustering_dir) mutations = CollectionVCF(in_file=args.vcf_file if args.vcf_file else "%s.vcf" % args.sample_name, from_file=True) mutations.get_location(annotations_dict, use_synonym=True, synonym_dict=annotation_synonym_dict) mutations.set_location_flag(bad_region_dict, check_location, "BR") mutations.check_by_ref_and_alt(ref_alt_variants["deaminases"], "DA", description="Deaminase-like variant") raw_mutations_counts = len(mutations) print("Totaly %i mutations" % raw_mutations_counts)
def handle_sanger_data(self, input_dir, output_prefix, outdir=None, read_subfolders=False, min_mean_qual=0, min_median_qual=0, min_len=50): if outdir: self.workdir = outdir self.init_dirs() sanger_filelist = self.make_list_of_path_to_files( input_dir, expression=self.is_sanger_file, recursive=read_subfolders, return_absolute_paths=True) stat_dict = TwoLvlDict() record_dict = OrderedDict() trimmed_record_dict = OrderedDict() excluded_list = IdList() excluded_counter = 0 low_quality_counter = 0 too_short_counter = 0 merged_raw_fastq = "%s/%s.raw.fastq" % (self.workdir, output_prefix) merged_raw_fasta = "%s/%s.raw.fasta" % (self.workdir, output_prefix) merged_trimmed_fastq = "%s/%s.trimmed.fastq" % (self.workdir, output_prefix) merged_trimmed_fasta = "%s/%s.trimmed.fasta" % (self.workdir, output_prefix) for filename in sanger_filelist: filename_list = self.split_filename(filename) record_raw_fastq = "%s/fastq/raw/%s.raw.fastq" % (self.workdir, filename_list[1]) record_raw_fasta = "%s/fasta/raw/%s.raw.fasta" % (self.workdir, filename_list[1]) record_raw_qual_plot_prefix = "%s/qual_plot/raw/%s.raw.qual" % ( self.workdir, filename_list[1]) record_trimmed_fastq = "%s/fastq/trimmed/%s.trimmed.fastq" % ( self.workdir, filename_list[1]) record_trimmed_fasta = "%s/fasta/trimmed/%s.trimmed.fasta" % ( self.workdir, filename_list[1]) record_trimmed_qual_plot_prefix = "%s/qual_plot/trimmed/%s.trimmed.qual" % ( self.workdir, filename_list[1]) record = SeqIO.read(self.metaopen(filename, "rb"), format="abi") record_dict[record.id] = record SeqIO.write(record, record_raw_fastq, format="fastq") SeqIO.write(record, record_raw_fasta, format="fasta") trimmed_record = SeqIO.AbiIO._abi_trim(record) stat_dict[record.id] = OrderedDict({ "raw_len": len(record), "raw_mean_qual": np.mean(record.letter_annotations["phred_quality"]), "raw_median_qual": np.median(record.letter_annotations["phred_quality"]), "trimmed_len": len(trimmed_record), "trimmed_mean_qual": np.mean(trimmed_record.letter_annotations["phred_quality"]), "trimmed_median_qual": np.median(trimmed_record.letter_annotations["phred_quality"]), "retained": "-", }) MatplotlibRoutines.draw_bar_plot( record.letter_annotations["phred_quality"], record_raw_qual_plot_prefix, extentions=["png"], xlabel="Position", ylabel="Phred quality", title="Per base quality", min_value=None, max_value=None, new_figure=True, figsize=(3 * (int(len(record) / 100) + 1), 3), close_figure=True) if stat_dict[record.id]["trimmed_len"] >= min_len: if min_median_qual: if (stat_dict[record.id]["trimmed_median_qual"] >= min_median_qual) and ( stat_dict[record.id]["trimmed_mean_qual"] >= min_mean_qual): stat_dict[record.id]["retained"] = "+" else: low_quality_counter += 1 else: stat_dict[record.id]["retained"] = "+" else: too_short_counter += 1 if stat_dict[record.id]["retained"] == "-": excluded_list.append(record.id) continue SeqIO.write(trimmed_record, record_trimmed_fastq, format="fastq") SeqIO.write(trimmed_record, record_trimmed_fasta, format="fasta") MatplotlibRoutines.draw_bar_plot( trimmed_record.letter_annotations["phred_quality"], record_trimmed_qual_plot_prefix, extentions=["png"], xlabel="Position", ylabel="Phred quality", title="Per base quality", min_value=None, max_value=None, new_figure=True, figsize=(3 * (int(len(record) / 100) + 1), 3), close_figure=True) trimmed_record_dict[record.id] = trimmed_record SeqIO.write(self.record_from_dict_generator(record_dict), merged_raw_fastq, format="fastq") SeqIO.write(self.record_from_dict_generator(record_dict), merged_raw_fasta, 
format="fasta") SeqIO.write(self.record_from_dict_generator(trimmed_record_dict), merged_trimmed_fastq, format="fastq") SeqIO.write(self.record_from_dict_generator(trimmed_record_dict), merged_trimmed_fasta, format="fasta") excluded_list.write("%s.excluded.ids" % output_prefix) stat_dict.write(out_filename="%s.stats" % output_prefix) print("Excluded: %i" % excluded_counter) print("\tToo short( < %i ): %i" % (min_len, too_short_counter)) print("\tLow quality( median < %i or mean < %i ): %i" % (min_median_qual, min_mean_qual, low_quality_counter))
dest="output", required=True, help="File to write statistics") parser.add_argument( "-l", "--log_file", action="store", dest="log_file", default="trimmomatic.log", help="Name of files with trimmomatic log. Default - trimmomatic.log") args = parser.parse_args() samples = sorted( args.samples.split(",") if args.samples else os.listdir(args.samples_dir)) present_samples = [] for sample in samples: if os.path.isdir(args.samples_dir + sample): present_samples.append(sample) reports_dict = TwoLvlDict() for sample in present_samples: print("Handling report from %s" % sample) sample_dir = "%s%s/" % (args.samples_dir, sample) trimmomatic_log = "%s/trimmomatic.log" % sample_dir reports_dict[sample] = Trimmomatic.parse_log(trimmomatic_log) reports_dict.write(args.output)
def filter(self, samples_directory, output_directory, adapter_fragment_file, trimmomatic_adapter_file, general_stat_file, samples_to_handle=None, threads=4, trimmomatic_dir="", coockiecutter_dir="", facut_dir="", mismatch_number=2, pe_reads_score=30, se_read_score=10, min_adapter_len=1, sliding_window_size=None, average_quality_threshold=15, base_quality="phred33", read_name_type="illumina", leading_base_quality_threshold=None, trailing_base_quality_threshold=None, crop_length=None, head_crop_length=None, min_len=50, remove_intermediate_files=False, skip_coockiecutter=False, retain_single_end_reads=True, input_is_se=False): Cookiecutter.path = coockiecutter_dir Trimmomatic.jar_path = trimmomatic_dir Trimmomatic.threads = threads FaCut.path = facut_dir self.safe_mkdir(output_directory) """ merged_raw_dir = "%s/merged/" % output_directory filtered_dir = "%s/filtered/" % output_directory coockie_filtered_dir = "%s/coockiecutter/" % filtered_dir coockie_trimmomatic_filtered_dir = "%s/coockiecutter_trimmomatic/" % filtered_dir coockie_trimmomatic_quality_filtered_dir = "%s/coockiecutter_trimmomatic_quality/" % filtered_dir final_filtered_dir = "%s/final/" % filtered_dir filtering_stat_dir = "%s/filtered_stat/" % output_directory """ sample_list = samples_to_handle if samples_to_handle else self.get_sample_list( samples_directory) merged_raw_dir, filtered_dir, coockie_filtered_dir, \ coockie_trimmomatic_filtered_dir, coockie_trimmomatic_quality_filtered_dir, \ final_filtered_dir, filtering_stat_dir = self.prepare_filtering_directories(output_directory, sample_list) filtering_statistics = TwoLvlDict() for sample in sample_list: print("Handling sample %s" % sample) filtering_statistics[sample] = OrderedDict() merged_raw_sample_dir = "%s/%s/" % (merged_raw_dir, sample) #merged_forward_reads = "%s/%s_1.fq" % (merged_raw_sample_dir, sample) #merged_reverse_reads = "%s/%s_2.fq" % (merged_raw_sample_dir, sample) coockie_filtered_sample_dir = "%s/%s/" % (coockie_filtered_dir, sample) coockie_stats = "%s/%s.coockiecutter.stats" % ( coockie_filtered_sample_dir, sample) coockie_trimmomatic_filtered_sample_dir = "%s/%s/" % ( coockie_trimmomatic_filtered_dir, sample) coockie_trimmomatic_quality_filtered_sample_dir = "%s/%s/" % ( coockie_trimmomatic_quality_filtered_dir, sample) final_filtered_sample_dir = "%s/%s/" % (final_filtered_dir, sample) filtering_stat_sample_dir = "%s/%s" % (filtering_stat_dir, sample) #""" print("\tMerging fastqs if necessary...") merged_forward_reads, merged_reverse_reads, merged_se_reads = self.combine_fastq_files( samples_directory, sample, merged_raw_sample_dir, use_links_if_merge_not_necessary=True, input_is_se=input_is_se) if not skip_coockiecutter: print("\tFiltering by Cookiecutter") #""" Cookiecutter.rm_reads( adapter_fragment_file, merged_forward_reads if merged_forward_reads else merged_se_reads, coockie_stats, right_reads=merged_reverse_reads, out_dir=coockie_filtered_sample_dir, use_dust_filter=False, dust_cutoff=None, dust_window_size=None, use_N_filter=False, read_length_cutoff=None, polyGC_length_cutoff=None) #""" print("\tParsing Cookiecutter report...") coockiecutter_report = CoockiecutterReport( coockie_stats, input_is_se=input_is_se) filtering_statistics[sample][ "raw_pairs"] = coockiecutter_report.input_pairs filtering_statistics[sample][ "pairs_after_coockiecutter"] = coockiecutter_report.retained_pairs filtering_statistics[sample][ "pairs_after_coockiecutter,%"] = float( "%.2f" % (float(coockiecutter_report.retained_pairs) / 
float(coockiecutter_report.input_pairs) * 100)) os.system("cp %s %s" % (coockie_stats, filtering_stat_sample_dir)) coockie_filtered_paired_forward_reads = "%s/%s_1.ok.fastq" % ( coockie_filtered_sample_dir, sample) coockie_filtered_paired_reverse_reads = "%s/%s_2.ok.fastq" % ( coockie_filtered_sample_dir, sample) coockie_filtered_paired_se_reads = "" coockie_filtered_se_reads = "%s/%s.se.ok.fastq" % ( coockie_filtered_sample_dir, sample) # se reads produced by Coockiecutter are ignored now!! #coockie_trimmomatic_filtered_sample_dir = "%s/%s/" % (coockie_trimmomatic_filtered_dir, sample) trimmomatic_output_prefix = "%s/%s" % ( coockie_trimmomatic_filtered_sample_dir, sample) trimmomatic_log = "%s.trimmomatic.log" % trimmomatic_output_prefix #""" if (merged_forward_reads is None) and (merged_reverse_reads is None): print("Filtering by Trimmomatic...") Trimmomatic.filter( merged_se_reads if skip_coockiecutter else coockie_filtered_se_reads, trimmomatic_output_prefix, output_extension="fq", right_reads=None, adapters_file=trimmomatic_adapter_file, mismatch_number=mismatch_number, pe_reads_score=pe_reads_score, se_read_score=se_read_score, min_adapter_len=min_adapter_len, sliding_window_size=sliding_window_size, average_quality_threshold=average_quality_threshold, leading_base_quality_threshold= leading_base_quality_threshold, trailing_base_quality_threshold= trailing_base_quality_threshold, crop_length=crop_length, head_crop_length=head_crop_length, min_length=min_len, logfile=trimmomatic_log, base_quality=base_quality) else: print("\tFiltering by Trimmomatic...") Trimmomatic.filter( merged_forward_reads if skip_coockiecutter else coockie_filtered_paired_forward_reads, trimmomatic_output_prefix, output_extension="fq", right_reads=merged_reverse_reads if skip_coockiecutter else coockie_filtered_paired_reverse_reads, adapters_file=trimmomatic_adapter_file, mismatch_number=mismatch_number, pe_reads_score=pe_reads_score, se_read_score=se_read_score, min_adapter_len=min_adapter_len, sliding_window_size=sliding_window_size, average_quality_threshold=average_quality_threshold, leading_base_quality_threshold= leading_base_quality_threshold, trailing_base_quality_threshold= trailing_base_quality_threshold, crop_length=crop_length, head_crop_length=head_crop_length, min_length=min_len, logfile=trimmomatic_log, base_quality=base_quality) #""" trimmomatic_report = TrimmomaticReport(trimmomatic_log, input_is_se=input_is_se) if skip_coockiecutter: filtering_statistics[sample][ "raw_pairs"] = trimmomatic_report.stats["input"] filtering_statistics[sample][ "pairs_after_trimmomatic"] = trimmomatic_report.stats[ "surviving"] if input_is_se else trimmomatic_report.stats[ "both_surviving"] filtering_statistics[sample][ "pairs_after_trimmomatic,%"] = trimmomatic_report.stats[ "surviving,%"] if input_is_se else trimmomatic_report.stats[ "both_surviving,%"] if retain_single_end_reads and not input_is_se: filtering_statistics[sample][ "forward_se_after_trimmomatic"] = trimmomatic_report.stats[ "forward_only_surviving"] filtering_statistics[sample][ "forward_se_after_trimmomatic,%"] = trimmomatic_report.stats[ "forward_only_surviving"] filtering_statistics[sample][ "reverse_se_after_trimmomatic"] = trimmomatic_report.stats[ "reverse_only_surviving,%"] filtering_statistics[sample][ "forward_se_after_trimmomatic,%"] = trimmomatic_report.stats[ "forward_only_surviving,%"] os.system("cp %s %s" % (trimmomatic_log, filtering_stat_sample_dir)) coockie_trimmomatic_filtered_paired_forward_reads = "%s/%s_1.pe.fq" % ( 
coockie_trimmomatic_filtered_sample_dir, sample) coockie_trimmomatic_filtered_paired_reverse_reads = "%s/%s_2.pe.fq" % ( coockie_trimmomatic_filtered_sample_dir, sample) coockie_trimmomatic_filtered_unpaired_forward_reads = "%s/%s_1.se.fq" % ( coockie_trimmomatic_filtered_sample_dir, sample) coockie_trimmomatic_filtered_unpaired_reverse_reads = "%s/%s_2.se.fq" % ( coockie_trimmomatic_filtered_sample_dir, sample) coockie_trimmomatic_filtered_se_reads = "%s/%s.se.fq" % ( coockie_trimmomatic_filtered_sample_dir, sample) final_forward_reads = "%s/%s.final_1.fastq" % ( final_filtered_sample_dir, sample) final_reverse_reads = "%s/%s.final_2.fastq" % ( final_filtered_sample_dir, sample) final_forward_se_reads = "%s/%s.final_1.se.fastq" % ( final_filtered_sample_dir, sample) final_reverse_se_reads = "%s/%s.final_2.se.fastq" % ( final_filtered_sample_dir, sample) final_se_reads = "%s/%s.final.se.fastq" % ( final_filtered_sample_dir, sample) if sliding_window_size is None: facut_pe_output_prefix = "%s/%s.pe" % ( coockie_trimmomatic_quality_filtered_sample_dir, sample) facut_forward_se_output_prefix = "%s/%s.forward.se" % ( coockie_trimmomatic_quality_filtered_sample_dir, sample) facut_reverse_se_output_prefix = "%s/%s.reverse.se" % ( coockie_trimmomatic_quality_filtered_sample_dir, sample) facut_pe_stat_file = "%s.facut.stat" % facut_pe_output_prefix facut_forward_se_stat_file = "%s.facut.stat" % facut_forward_se_output_prefix facut_reverse_se_stat_file = "%s.facut.stat" % facut_reverse_se_output_prefix #""" FaCut.filter_by_mean_quality( average_quality_threshold, facut_pe_output_prefix, coockie_trimmomatic_filtered_paired_forward_reads, reverse_reads= coockie_trimmomatic_filtered_paired_reverse_reads, quality_type=base_quality, stat_file=facut_pe_stat_file, name_type=read_name_type) FaCut.filter_by_mean_quality( average_quality_threshold, facut_forward_se_output_prefix, coockie_trimmomatic_filtered_unpaired_forward_reads, quality_type=base_quality, stat_file=facut_forward_se_stat_file, name_type=read_name_type) FaCut.filter_by_mean_quality( average_quality_threshold, facut_reverse_se_output_prefix, coockie_trimmomatic_filtered_unpaired_reverse_reads, quality_type=base_quality, stat_file=facut_reverse_se_stat_file, name_type=read_name_type) #""" #if input_is_se: #else: facut_report = FaCutReport(facut_pe_stat_file) filtering_statistics[sample][ "pairs_after_facut"] = facut_report.retained_pairs filtering_statistics[sample]["pairs_after_facut,%"] = float( "%.2f" % (float(facut_report.retained_pairs) / float(facut_report.input_pairs) * 100)) filtering_statistics[sample][ "retained_pairs_in_worst_tile,%"] = facut_report.minimum_retained_pairs_in_tiles_fraction * 100 filtering_statistics[sample][ "pairs_survived_after_filtration,%"] = float( "%.2f" % (float(facut_report.retained_pairs) / filtering_statistics[sample]["raw_pairs"] * 100)) facut_filtered_forward_reads = "%s_1.pe.fq" % facut_pe_output_prefix facut_filtered_reverse_reads = "%s_2.pe.fq" % facut_pe_output_prefix facut_filtered_forward_se_reads = "%s.se.fq" % facut_forward_se_output_prefix facut_filtered_reverse_se_reads = "%s.se.fq" % facut_reverse_se_output_prefix os.system("cp %s %s" % (facut_pe_stat_file, filtering_stat_sample_dir)) if retain_single_end_reads: os.system("cp %s %s" % (facut_forward_se_stat_file, filtering_stat_sample_dir)) os.system("cp %s %s" % (facut_reverse_se_stat_file, filtering_stat_sample_dir)) os.system("ln %s %s" % (facut_filtered_forward_reads, final_forward_reads)) os.system("ln %s %s" % 
(facut_filtered_reverse_reads, final_reverse_reads)) if retain_single_end_reads and not input_is_se: os.system("cat %s %s > %s" % (facut_filtered_forward_se_reads, facut_filtered_reverse_se_reads, final_forward_se_reads)) #os.system("ln %s %s" % (facut_filtered_forward_se_reads, final_forward_se_reads)) #os.system("ln %s %s" % (facut_filtered_reverse_se_reads, final_reverse_se_reads)) if input_is_se: pass #os.system("ln %s %s" % (coockie_trimmomatic_filtered_se_reads, final_se_reads)) else: os.system("ln %s %s" % (coockie_trimmomatic_filtered_paired_forward_reads, final_forward_reads)) os.system("ln %s %s" % (coockie_trimmomatic_filtered_paired_reverse_reads, final_reverse_reads)) if retain_single_end_reads and not input_is_se: os.system( "cat %s %s > %s" % (coockie_trimmomatic_filtered_unpaired_forward_reads, coockie_trimmomatic_filtered_unpaired_reverse_reads, final_forward_se_reads)) """ os.system("ln %s %s" % (coockie_trimmomatic_filtered_unpaired_forward_reads, final_forward_se_reads)) os.system("ln %s %s" % (coockie_trimmomatic_filtered_unpaired_reverse_reads, final_reverse_se_reads)) """ if input_is_se: os.system("ln %s %s" % (coockie_trimmomatic_filtered_se_reads, final_se_reads)) filtering_statistics[sample][ "pairs_survived_after_filtration,%"] = float( "%.2f" % (float(trimmomatic_report.stats[ "surviving" if input_is_se else "both_surviving"]) / filtering_statistics[sample]["raw_pairs"] * 100)) print(filtering_statistics.table_form()) if remove_intermediate_files: shutil.rmtree(merged_raw_sample_dir) shutil.rmtree(coockie_filtered_sample_dir) shutil.rmtree(coockie_trimmomatic_filtered_sample_dir) shutil.rmtree(coockie_trimmomatic_quality_filtered_sample_dir) if remove_intermediate_files: shutil.rmtree(coockie_filtered_dir) shutil.rmtree(coockie_trimmomatic_filtered_dir) shutil.rmtree(coockie_trimmomatic_quality_filtered_dir) shutil.rmtree(merged_raw_dir) filtering_statistics.write(general_stat_file, sort=False)
                    required=True,
                    help="Comma-separated list of species")
parser.add_argument("-d", "--species_dir", action="store", dest="species_dir",
                    default="./", type=FileRoutines.check_path,
                    help="Directory with families of species")
"""
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file. Default: stdout")
"""
args = parser.parse_args()

# run after scripts/expansion/compare_cluster.py
# out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

species_syn_dict = TwoLvlDict()
for species in args.species_list:
    species_syn_dict[species] = read_synonyms_dict("%s%s/all.t" % (args.species_dir, species))
species_syn_dict.write("families_all_species.t", absent_symbol=".")

not_assembled = species_syn_dict.filter_by_line(is_assembled)
species_syn_dict.write("correctly_assembled_families_species.t", absent_symbol=".")

assembled_ids = IdSet(species_syn_dict.sl_keys())
assembled_ids.write("assembled_families.ids")
not_assembled_ids = IdSet(not_assembled.sl_keys())
not_assembled_ids.write("non_assembled_families.ids")
"""
def draw_variant_window_densities(self, count_df, scaffold_length_dict, window_size, window_step, output_prefix, masking_dict=None, gap_fraction_threshold=0.4, record_style=None, ext_list=("svg", "png"), label_fontsize=13, left_offset=0.2, figure_width=12, figure_height_scale_factor=0.5, scaffold_synonym_dict=None, id_replacement_mode="partial", suptitle=None, density_multiplicator=1000, scaffold_black_list=[], sort_scaffolds=False, scaffold_ordered_list=None, scaffold_white_list=[], add_sample_name_to_labels=False, dist_between_scaffolds_scaling_factor=1, gap_color="grey", masked_color="grey", no_snp_color="white", colormap=None, colors=("#333a97", "#3d3795","#5d3393", "#813193", "#9d2d7f", "#b82861", "#d33845", "#ea2e2e", "#f5ae27"), thresholds=(0.0, 0.1, 0.5, 0.75, 1.0, 1.25, 1.5, 2.0, 2.5), colormap_tuple_list=((0.0, "#333a97"), (0.1, "#3d3795"), (0.5, "#5d3393"), (0.75, "#813193"), (1.0, "#9d2d7f"), (1.25, "#b82861"), (1.5, "#d33845"), (2.0, "#ea2e2e"), (2.5, "#f5ae27"))): """ cont_dict = {sample: {scaffold: }}""" if dist_between_scaffolds_scaling_factor < 1: raise ValueError("Scaling factor for distance between scaffolds have to be >=1.0") final_scaffold_list = self.get_filtered_scaffold_list(count_df.index.get_level_values('CHROM').unique().to_list(), scaffold_black_list=scaffold_black_list, sort_scaffolds=sort_scaffolds, scaffold_ordered_list=scaffold_ordered_list, scaffold_white_list=scaffold_white_list) scaffold_number = len(final_scaffold_list) max_scaffold_length = max([scaffold_length_dict[scaf] for scaf in final_scaffold_list]) #max_scaffold_length = max(scaffold_length_dict.values()) window_number, sample_number = np.shape(count_df) figure = plt.figure(figsize=(figure_width, int(figure_height_scale_factor * scaffold_number * sample_number))) subplot = plt.subplot(1, 1, 1) subplot.get_yaxis().set_visible(False) #subplot.get_xaxis().set_visible(False) #axes.xaxis.set_major_formatter(x_formatter) #subplot.spines['bottom'].set_color('none') subplot.spines['right'].set_color('none') subplot.spines['left'].set_color('none') subplot.spines['top'].set_color('none') scaffold_height = 10 dist_between_scaffolds = 5 start_x = 0 start_y = - dist_between_scaffolds label_line_y_shift = int(scaffold_height/2) label_line_y_jump = int(scaffold_height/2) #normalize_color_func = LinearSegmentedColormap.from_list("Densities_custom", colormap_tuple_list) #plt.register_cmap(cmap=colormap) #colormap = cm.get_cmap(name="plasma", lut=None) #normalize_colors = colors.BoundaryNorm(boundaries_for_colormap, len(boundaries_for_colormap) - 1) * int(256/(len(boundaries_for_colormap) - 1)) #normalize_colors = colors.Normalize(vmin=boundaries_for_colormap[0], vmax=boundaries_for_colormap[-1]) masked_windows_count_dict = TwoLvlDict() no_snps_windows_count_dict = TwoLvlDict() for sample in count_df: masked_windows_count_dict[sample] = OrderedDict() no_snps_windows_count_dict[sample] = OrderedDict() if colormap: cmap = plt.get_cmap(colormap, len(thresholds)) masked_regions_fd = open("%s.masked_regions" % output_prefix, "w") masked_regions_fd.write("#scaffold\twindow\tmasked_position\tmasked_position,fraction\n") for scaffold in final_scaffold_list: sample_index = 0 for sample in count_df: masked_windows_count_dict[sample][scaffold] = 0 no_snps_windows_count_dict[sample][scaffold] = 0 #if scaffold in scaffold_black_list: # continue #print gap_coords_list, gap_len_list start_y += scaffold_height + dist_between_scaffolds * (dist_between_scaffolds_scaling_factor if sample_index == 0 else 1) label_y_start = 
label_line_y_shift + start_y gap_y_jump = label_y_start + label_line_y_jump prev_x = 0 #figure.text(0, start_y, scaffold, rotation=0, fontweight="bold", transform=subplot.transAxes, fontsize=9, # horizontalalignment='center', # verticalalignment='center') if scaffold_synonym_dict: if id_replacement_mode == "exact": if scaffold in scaffold_synonym_dict: scaffold_label = scaffold_synonym_dict[scaffold] else: scaffold_label = scaffold print("WARNING!!! Synonym for %s was not found" % scaffold) elif id_replacement_mode == "partial": partial_syn_list = [] for partial_syn in scaffold_synonym_dict: if partial_syn in scaffold: partial_syn_list.append(partial_syn) if len(partial_syn_list) > 1: print("WARNING!!! More than one possible replacement for %s was found: %s. No replacement then." % (scaffold, ",".join(partial_syn_list))) scaffold_label = scaffold elif not partial_syn_list: scaffold_label = scaffold print("WARNING!!! Synonym for %s was not found" % scaffold) else: scaffold_label = scaffold_synonym_dict[partial_syn_list[0]] else: raise ValueError("Unknown id replacement mode") else: scaffold_label = scaffold subplot.annotate(("%s (%s)" % (scaffold, sample))if add_sample_name_to_labels else scaffold_label, xy=(0, label_y_start), xycoords='data', fontsize=16, xytext=(-15, 1.5 * label_line_y_shift), textcoords='offset points', ha='right', va='top') if scaffold in count_df[sample]: for window_index in count_df.loc[scaffold].index: window_start = window_index * window_step window_end = window_start + window_size - 1 # TODO: check end coordinate if masking_dict: if scaffold in masking_dict: unmasked_length = window_size - masking_dict[scaffold][window_index] if unmasked_length > 0: variant_density = float(count_df[sample].loc[scaffold, window_index] * density_multiplicator) / float(unmasked_length) else: variant_density = None else: variant_density = float(count_df[sample].loc[scaffold, window_index] * density_multiplicator) / float(window_size) if variant_density is None: window_color = masked_color else: if colormap: if variant_density <= thresholds[0]: window_color = no_snp_color else: for threshold_index in range(0, len(thresholds) - 1): if thresholds[threshold_index] < variant_density <= thresholds[threshold_index+1]: window_color = cmap(threshold_index) break else: window_color = cmap(threshold_index+1) else: if variant_density <= colormap_tuple_list[0][0]: window_color = no_snp_color else: for lower_boundary, color in colormap_tuple_list: if variant_density <= lower_boundary: break if variant_density > lower_boundary: prev_color = color else: prev_color = color window_color = prev_color if masking_dict: if scaffold in masking_dict: if float(masking_dict[scaffold][window_index]) / float(window_size) > gap_fraction_threshold: window_color = masked_color #print scaffold #print i, variant_density, window_color if window_color == masked_color: masked_windows_count_dict[sample][scaffold] += 1 masked_regions_fd.write("%s\t%i\t%i\t%f\n" % (scaffold, window_index, masking_dict[scaffold][window_index], float(masking_dict[scaffold][window_index]) / float(window_size))) elif window_color == no_snp_color: no_snps_windows_count_dict[sample][scaffold] += 1 window = Rectangle((window_start, start_y), window_size, scaffold_height, fill=True, edgecolor=None, facecolor=window_color, linewidth=0.0000000000001) #print prev_x #print gap_coords[0] - prev_x subplot.add_patch(window) # draw_chromosome fragment = Rectangle((0, start_y), scaffold_length_dict[scaffold], scaffold_height, fill=False, edgecolor="black", 
facecolor=None, linewidth=0.5) subplot.add_patch(fragment) sample_index += 1 legend_y_position = int(start_y/2) legend_x_position = int(max_scaffold_length * 1.05) legend_element_side = scaffold_height square_y_pos = legend_y_position - legend_element_side for color, legend_label in zip((masked_color, no_snp_color), ("masked", "no SNPs")): square_y_pos += legend_element_side fragment = Rectangle((legend_x_position, square_y_pos), max_scaffold_length/64, legend_element_side, fill=True, edgecolor="black", facecolor=color, linewidth=0.5) subplot.add_patch(fragment) subplot.annotate(legend_label, xy=(legend_x_position + 2 * max_scaffold_length/64, square_y_pos), xycoords='data', fontsize=13, xytext=(legend_x_position + 2 * max_scaffold_length/64, square_y_pos),) if colormap: for i in range(0, len(thresholds)): square_y_pos += legend_element_side #print (colormap_tuple_list[i][1]) fragment = Rectangle((legend_x_position, square_y_pos), max_scaffold_length/64, legend_element_side, fill=True, edgecolor="black", facecolor=cmap(i), linewidth=0.5) subplot.add_patch(fragment) if i == (len(thresholds) - 1): legend_element_label = "> %.2f" % thresholds[i] else: legend_element_label = "%.2f - %.2f" % (thresholds[i], thresholds[i + 1]) subplot.annotate(legend_element_label, xy=(legend_x_position + 2 * max_scaffold_length/64, square_y_pos), xycoords='data', fontsize=13, xytext=(legend_x_position + 2 * max_scaffold_length/64, square_y_pos),) else: for i in range(0, len(colormap_tuple_list)): square_y_pos += legend_element_side #print (colormap_tuple_list[i][1]) fragment = Rectangle((legend_x_position, square_y_pos), max_scaffold_length/64, legend_element_side, fill=True, edgecolor="black", facecolor=colormap_tuple_list[i][1], linewidth=0.5) subplot.add_patch(fragment) if i == (len(colormap_tuple_list) - 1): legend_element_label = "> %.2f" % colormap_tuple_list[i][0] else: legend_element_label = "%.2f - %.2f" % (colormap_tuple_list[i][0], colormap_tuple_list[i + 1][0]) subplot.annotate(legend_element_label, xy=(legend_x_position + 2 * max_scaffold_length/64, square_y_pos), xycoords='data', fontsize=13, xytext=(legend_x_position + 2 * max_scaffold_length/64, square_y_pos),) plt.xlim(xmin=0, xmax=int(1.2 * max_scaffold_length)) plt.ylim(ymin=0, ymax=start_y + 2 * scaffold_height) #plt.colorbar(subplot) #plt.tight_layout() plt.subplots_adjust(left=left_offset, right=0.95)#bottom=0.1, right=0.8, top=0.9) if suptitle: plt.suptitle(suptitle) for extension in ext_list: plt.savefig("%s.%s" % (output_prefix, extension)) plt.close() no_snps_windows_count_dict.write("%s.no_snps.windows.count" % output_prefix) masked_windows_count_dict.write("%s.masked.windows.count" % output_prefix) masked_regions_fd.close()
    -ss.hypergeom.sf(mmax, n, n1, n2) + ss.hypergeom.sf(mmin, n, n1, n2)
    """
    return -ss.hypergeom.sf(mmax, n, n1, n2) + ss.hypergeom.sf(mmin, n, n1, n2)


def get_intersection_length(start1, end1, start2, end2):
    if start1 - end2 > 0 or start2 - end1 > 0:
        return 0
    start_shift = start1 - start2
    start_coef_shift = 0 if start_shift < 0 else 1
    end_shift = end1 - end2
    end_coef_shift = 0 if end_shift > 0 else 1
    return (end2 - start2 + 1) - start_coef_shift * start_shift + end_coef_shift * end_shift


overlap_clusters_percent = TwoLvlDict({})
totaly_genes = 6074
test_fd = open("probability.t", "w")
test_fd.write("#size\tpower\ttotal\tPmCDA1_3d\tPmCDA1_sub_3d\tintersection\tp-value\n")
print([float(f) / float(100) for f in range(1, 11)])
for size in range(3, 11):
    overlap_clusters_percent[size] = {}
    for power in [float(f) / float(100) for f in range(1, 11)]:
        PmCDA1_3d_clusters = CollectionCCF(from_file=True,
                                           input_file="/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_3d_size_%i+_power_%.2f+_good.ccf" % (size, power, size, power))
        PmCDA1_3d_sub_clusters = CollectionCCF(from_file=True,
                                               input_file="/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_sub1_3d_size_%i+_power_%.2f+_good.ccf" % (size, power, size, power))
        #cluster_3d_dict = OrderedDict({})
        cluster_3d_set = set([])
        cluster_3d_sub_set = set([])
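# Worked example for get_intersection_length above, on 1-based inclusive coordinates
# (values are illustrative only, not from the original script):
#   get_intersection_length(5, 10, 3, 8)  -> 4   (overlap is [5, 8])
#   get_intersection_length(1, 4, 3, 10)  -> 2   (overlap is [3, 4])
#   get_intersection_length(1, 2, 5, 9)   -> 0   (disjoint intervals)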
dest="species_dir", default="./", type=check_path, help="Comma-separated list of species") parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout", help="Output file. Default: stdout") args = parser.parse_args() out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w") species_syn_dict = TwoLvlDict() for species in args.species_list: species_syn_dict[species] = read_synonyms_dict("%s%s/all.t" % (args.species_dir, species)) species_syn_dict.write("families_all_species.t", absent_symbol=".") nonassembled = species_syn_dict.filter_by_line(filter_nonassembled) species_syn_dict.write("correctly_assembled_families_species.t", absent_symbol=".") nonassembled.write("not_assembled_families_in_all_species.t", absent_symbol=".") complicated_families_dict = nonassembled.filter_by_line( filter_splited_to_several_fam)
    return True


parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    help="File with families assembly information")
parser.add_argument("-e", "--header", action="store_true", dest="header",
                    help="Header is present in input file")
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file. Default: stdout")
args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")
species_syn_dict = TwoLvlDict()
out_fd.write("#family\tspecies_with_family\tspecies_with_errors\tspecies_with_correct_fam\terror_ratio\n")

with open(args.input, "r") as in_fd:
    if args.header:
        in_fd.readline()
    for line in in_fd:
        species_with_errors = 0
        species_with_fam = 0
        tmp = line.strip().split("\t")
        family_name = tmp[0]
        for fam in tmp[1:]:
            if fam != ".":
                species_with_fam += 1
                if "_" in fam:
                    species_with_errors += 1
        species_with_correct_fam = species_with_fam - species_with_errors
def star_and_htseq(self, genome_dir, samples_directory, output_directory, gff_for_htseq, count_table_file_prefix, genome_fasta=None, samples_to_handle=None, genome_size=None, annotation_gtf=None, feature_from_gtf_to_use_as_exon=None, exon_tag_to_use_as_transcript_id=None, exon_tag_to_use_as_gene_id=None, length_of_sequences_flanking_junction=None, junction_tab_file_list=None, three_prime_trim=None, five_prime_trim=None, adapter_seq_for_three_prime_clip=None, max_mismatch_percent_for_adapter_trimming=None, three_prime_trim_after_adapter_clip=None, output_type="BAM", sort_bam=True, max_memory_per_thread_for_bam_sorting="4G", include_unmapped_reads_in_bam=True, output_unmapped_reads=True, two_pass_mode=False, star_dir=None, threads=1, max_intron_length=None, stranded_rnaseq="yes", min_alignment_quality=10, feature_type_for_htseq="exon", feature_id_attribute_for_htseq="gene_id", htseq_mode="union"): STAR.threads = threads STAR.path = star_dir if genome_fasta: STAR.index(genome_dir, genome_fasta, annotation_gtf=None, junction_tab_file=None, sjdboverhang=None, genomeSAindexNbases=None, genomeChrBinNbits=None, genome_size=genome_size) sample_list = samples_to_handle if samples_to_handle else self.get_sample_list( samples_directory) self.prepare_diff_expression_directories(output_directory, sample_list) alignment_dir = "%s/alignment/" % output_directory count_pe_table = TwoLvlDict() count_se_table = TwoLvlDict() count_all_table = TwoLvlDict() count_pe_table_file = "%s/%s.pe.tab" % (output_directory, count_table_file_prefix) count_se_table_file = "%%s/%s.se.tab" % (output_directory, count_table_file_prefix) count_all_table_file = "%s/%s.all.tab" % (output_directory, count_table_file_prefix) for sample in sample_list: print("Handling %s" % sample) sample_dir = "%s/%s/" % (samples_directory, sample) alignment_sample_dir = "%s/%s/" % (alignment_dir, sample) alignment_sample_se_dir = "%s/se/" % alignment_sample_dir filetypes, forward_files, reverse_files, se_files = self.make_lists_forward_and_reverse_files( sample_dir) if se_files: self.safe_mkdir(alignment_sample_se_dir) print("\tAligning paired reads...") count_file = "%s/%s.htseq.count" % (alignment_sample_dir, sample) #""" STAR.align( genome_dir, forward_files, reverse_read_list=reverse_files, annotation_gtf=annotation_gtf, feature_from_gtf_to_use_as_exon=feature_from_gtf_to_use_as_exon, exon_tag_to_use_as_transcript_id= exon_tag_to_use_as_transcript_id, exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id, length_of_sequences_flanking_junction= length_of_sequences_flanking_junction, junction_tab_file_list=junction_tab_file_list, three_prime_trim=three_prime_trim, five_prime_trim=five_prime_trim, adapter_seq_for_three_prime_clip= adapter_seq_for_three_prime_clip, max_mismatch_percent_for_adapter_trimming= max_mismatch_percent_for_adapter_trimming, three_prime_trim_after_adapter_clip= three_prime_trim_after_adapter_clip, output_type=output_type, sort_bam=sort_bam, max_memory_per_thread_for_bam_sorting= max_memory_per_thread_for_bam_sorting, include_unmapped_reads_in_bam=include_unmapped_reads_in_bam, output_unmapped_reads=output_unmapped_reads, output_dir=alignment_sample_dir, two_pass_mode=two_pass_mode, max_intron_length=max_intron_length) alignment_file = "%s/Aligned.sortedByCoord.out.bam" % alignment_sample_dir print("\tIndexing alignment file for paired reads...") os.system("samtools index %s" % alignment_file) print("\tCounting paired reads aligned to features...") HTSeq.count(alignment_file, gff_for_htseq, count_file, samtype="bam", 
order="pos", stranded_rnaseq=stranded_rnaseq, min_alignment_quality=min_alignment_quality, feature_type=feature_type_for_htseq, feature_id_attribute=feature_id_attribute_for_htseq, mode=htseq_mode, suppress_progres_report=False) #""" sample_counts = SynDict(filename=count_file, header=False, separator="\t", allow_repeats_of_key=False, split_values=False, values_separator=",", key_index=0, value_index=1, close_after_if_file_object=False, expression=int, comments_prefix="__") count_pe_table[sample] = sample_counts if se_files: print("\tAligning single reads...") count_se_file = "%s/%s.htseq.count" % (alignment_sample_se_dir, sample) #""" STAR.align( genome_dir, se_files, reverse_read_list=None, annotation_gtf=annotation_gtf, feature_from_gtf_to_use_as_exon= feature_from_gtf_to_use_as_exon, exon_tag_to_use_as_transcript_id= exon_tag_to_use_as_transcript_id, exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id, length_of_sequences_flanking_junction= length_of_sequences_flanking_junction, junction_tab_file_list=junction_tab_file_list, three_prime_trim=three_prime_trim, five_prime_trim=five_prime_trim, adapter_seq_for_three_prime_clip= adapter_seq_for_three_prime_clip, max_mismatch_percent_for_adapter_trimming= max_mismatch_percent_for_adapter_trimming, three_prime_trim_after_adapter_clip= three_prime_trim_after_adapter_clip, output_type=output_type, sort_bam=sort_bam, max_memory_per_thread_for_bam_sorting= max_memory_per_thread_for_bam_sorting, include_unmapped_reads_in_bam=include_unmapped_reads_in_bam, output_unmapped_reads=output_unmapped_reads, output_dir=alignment_sample_se_dir, two_pass_mode=two_pass_mode, max_intron_length=max_intron_length) alignment_se_file = "%s/Aligned.sortedByCoord.out.bam" % alignment_sample_se_dir print("\tIndexing alignment file for single reads...") os.system("samtools index %s" % alignment_se_file) print("\tCounting single reads aligned to features...") HTSeq.count( alignment_se_file, gff_for_htseq, count_se_file, samtype="bam", order="pos", stranded_rnaseq=stranded_rnaseq, min_alignment_quality=min_alignment_quality, feature_type=feature_type_for_htseq, feature_id_attribute=feature_id_attribute_for_htseq, mode=htseq_mode, suppress_progres_report=False) #""" sample_se_counts = SynDict(filename=count_se_file, header=False, separator="\t", allow_repeats_of_key=False, split_values=False, values_separator=",", key_index=0, value_index=1, close_after_if_file_object=False, expression=int, comments_prefix="__") count_se_table[sample] = sample_se_counts else: count_se_table[sample] = SynDict() count_all_table[sample] = SynDict() if se_files: for gene_id in set(sample_counts.keys()) | set( sample_se_counts.keys()): if (gene_id in sample_counts) and (gene_id in sample_se_counts): count_all_table[sample][gene_id] = sample_counts[ gene_id] + sample_se_counts[gene_id] elif gene_id in sample_counts: count_all_table[sample][gene_id] = sample_counts[ gene_id] elif gene_id in sample_se_counts: count_all_table[sample][gene_id] = sample_se_counts[ gene_id] else: count_all_table[sample] = count_pe_table[sample] count_pe_table.write(count_pe_table_file) count_se_table.write(count_se_table_file) count_all_table.write(count_all_table_file)
"--gap_symbol", action="store", dest="gap_symbol", default="-", help="Gap symbol. Default - '-'") parser.add_argument("-m", "--histogram_output", action="store", dest="histogram_output", required=True, help="File to write histogram") args = parser.parse_args() unique_position_dict = TwoLvlDict() FileRoutines.safe_mkdir(args.output_dir) for alignment_file in args.input: alignment_name_list = FileRoutines.split_filename(alignment_file) output_prefix = "%s/%s.unique_positions" % (args.output_dir, alignment_name_list[1]) unique_position_dict[alignment_name_list[ 1]] = MultipleAlignmentRoutines.count_unique_positions_per_sequence_from_file( alignment_file, output_prefix, format=args.format, gap_symbol="-", return_mode="relative",
import sys
import argparse
from collections import OrderedDict

from RouToolPa.Collections.General import TwoLvlDict
from RouToolPa.Routines.File import check_path

parser = argparse.ArgumentParser()

parser.add_argument("-s", "--species_list", action="store", dest="species_list",
                    type=lambda s: s.split(","), required=True,
                    help="Comma-separated list of species")
parser.add_argument("-d", "--species_dir", action="store", dest="species_dir",
                    default="./", type=check_path,
                    help="Directory with per species statistics")
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file. Default: stdout")
args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

species_stat_dict = TwoLvlDict()
for species in args.species_list:
    with open("%s%s/stat.t" % (args.species_dir, species), "r") as stat_fd:
        statistics = map(lambda s: s.strip().split("\t"), stat_fd.readlines())
        species_stat_dict[species] = OrderedDict(statistics)

species_stat_dict.write(out_fd)
if args.output != "stdout":
    out_fd.close()
help="write extensions of vcf files in output file. Default: false") parser.add_argument("-r", "--remove_nucleotide_substitutions", action="store_true", dest="rem_nuc_sub", help="Remove nucleotide substitutions from output(preserve only AA substitutions)") parser.add_argument("-c", "--convert_aa_to_single_letter", action="store_true", dest="convert_to_single_letter", help="Convert aminoacids to single letters") args = parser.parse_args() args.input = make_list_of_path_to_files(args.input) gene_alias_dict = SynDict() if args.gene_alias_file: gene_alias_dict.read(args.gene_alias_file, split_values=False) out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w") summary_dict = TwoLvlDict() for filename in args.input: directory, prefix, extension = split_filename(filename) if args.write_dir_path and args.write_ext: name = filename elif args.write_dir_path: name = (directory + prefix) if directory else prefix elif args.write_ext: name = prefix + extension else: name = prefix if args.suffix_to_remove in name: name = name.replace(args.suffix_to_remove, "") summary_dict[name] = OrderedDict() with open(filename, "r") as file_fd:
len(filtered_out_report.records)) if args.ref_species_gene_file: reference_genes_dict = {} with open(args.ref_species_gene_file, "r") as ref_fd: for line in ref_fd: gene_family_id, genes = line.strip().split("\t") genes = [] if genes == "." else genes.split(",") reference_genes_dict[gene_family_id] = [genes[:]] if genes: reference_genes_dict[gene_family_id].append(choice(genes)) # print gene_family_id #print reference_genes_dict[gene_family_id] node_header_list = features_list + ["reference_gene"] delta_index = features_list.index("delta") statistics_dict = TwoLvlDict({}) for node_id in node_values: statistics_dict[node_id] = OrderedDict({ "lost": 0, "new": 0, "lost_ref_ann": 0, "new_ref_ann": 0 }) for node_id in node_values: fd_list = [] for directory in node_info_dir, node_ref_dir: for mode in "all", "new", "lost": fd_list.append( open(
def compare_multiple_genome_results(self, busco_file_list, output_prefix, label_list=None, black_scaffold_list=(), white_scaffold_list=()): busco_table_dict = OrderedDict() gene_id_dict = OrderedDict() counts_dict = OrderedDict() output_path_list = self.split_filename(output_prefix) pairwise_overlaps_dir = "%s/pairwise_overlaps/" % (output_path_list[0] if output_path_list[0] else ".") pairwise_overlap_counts_dir = "%s/pairwise_overlap_counts/" % (output_path_list[0] if output_path_list[0] else ".") self.safe_mkdir(pairwise_overlaps_dir) self.safe_mkdir(pairwise_overlap_counts_dir) lllabels_list = label_list if label_list else ["A%i" % i for i in range(1, len(busco_file_list) + 1)] for busco_table, label in zip(busco_file_list, lllabels_list): busco_table_dict[label] = BUSCOtable(in_file=busco_table, black_list=black_scaffold_list, white_list=white_scaffold_list) gene_id_dict[label] = OrderedDict() counts_dict[label] = OrderedDict() gene_id_dict[label], counts_dict[label] = busco_table_dict[label].count_statuses() # TODO: draw piecharts # TODO: count overlaps pairwise_overlap_dict = OrderedDict() count_pairwise_overlap_dict = OrderedDict() for label1 in lllabels_list: for label2 in lllabels_list: if label1 == label2: continue overlap_id = "%s_vs_%s" % (label1, label2) pairwise_overlap_dict[overlap_id] = TwoLvlDict() count_pairwise_overlap_dict[overlap_id] = TwoLvlDict() for status1 in self.status_list: pairwise_overlap_dict[overlap_id]["%s@%s" % (label1, status1)] = OrderedDict() count_pairwise_overlap_dict[overlap_id]["%s@%s" % (label1, status1)] = OrderedDict() for status2 in self.status_list: pairwise_overlap_dict[overlap_id]["%s@%s" % (label1, status1)]["%s@%s" % (label2, status2)] = IdSet(gene_id_dict[label1][status1] & gene_id_dict[label2][status2]) count_pairwise_overlap_dict[overlap_id]["%s@%s" % (label1, status1)]["%s@%s" % (label2, status2)] = len(pairwise_overlap_dict[overlap_id]["%s@%s" % (label1, status1)]["%s@%s" % (label2, status2)]) pairwise_overlap_dict[overlap_id]["%s@%s" % (label1, status1)]["%s@%s" % (label2, status2)].write("%s/%s.%s_vs_%s.ids" % (pairwise_overlaps_dir, output_prefix, "%s@%s" % (label1, status1), "%s@%s" % (label2, status2))) count_pairwise_overlap_dict[overlap_id].write("%s/%s.overlap.%s.tsv" % (pairwise_overlap_counts_dir, output_prefix, overlap_id)) if 2 <= len(busco_file_list) <= 3: fig, subplot_list = plt.subplots(2, 2, figsize=(6, 6)) plt.suptitle("Overlaps for BUSCO categories between assemblies/genomes") #print(subplot_list) for status, index in zip(self.status_list, range(0, 4)): plt.sca(subplot_list[index // 2][index % 2]) plt.title(status) MatplotlibRoutines.venn_diagram_from_sets(gene_id_dict[lllabels_list[0]][status], gene_id_dict[lllabels_list[1]][status], set3=gene_id_dict[lllabels_list[2]][status] if len(lllabels_list) > 2 else None, set_labels=lllabels_list, set_colors=["red", "yellow", "green"], output_prefix=None, extensions=("png",), title=None) plt.savefig("%s.venn.png" % output_prefix) plt.close()
dest="split_values", help="Split values. Default: False") parser.add_argument("-s", "--value_separator", action="store", dest="value_separator", default=",'", help="Value separator. Default: ','") parser.add_argument( "-g", "--ignore_value_repeats", action="store_true", dest="ignore_value_repeats", help= "Ignore repeats of values(i.e values that corresponds to same fl_key and sl_key) " "and don't raise exception. If yes value from first entry is stored. Default: False" ) args = parser.parse_args() combined_table = TwoLvlDict(input_file=args.files, absent_symbol=args.absent_symbol, split_values=args.split_values, value_sep=args.value_separator, ignore_value_repeats=args.ignore_value_repeats) #print combined_table combined_table.write(args.output, absent_symbol=args.absent_symbol, close_after_if_file_object=False, sort=False)
"snoRNA": "ncRNA", "snRNA": "ncRNA" } annotation_black_list = ["gene", "region", "ARS", "long_terminal_repeat", "noncoding_exon", "intron", "repeat_region", "telomere", "gene_cassette", "five_prime_UTR_intron"] with open(gff_file) as gff_fd: for record in GFF.parse(gff_fd): annotations_dict[record.id] = record bad_region_dict = {} with open(bad_regions_file) as gff_fd: for record in GFF.parse(gff_fd): bad_region_dict[record.id] = record statistics_dict = TwoLvlDict(OrderedDict({})) for sample_set_name in sample_set_names_list: print("Handling %s" % sample_set_name) statistics_dict[sample_set_name] = OrderedDict({}) os.chdir(workdir) os.system("mkdir -p %s" % sample_set_name) os.chdir(sample_set_name) os.system("mkdir -p %s" % clustering_dir) #os.system("pwd") mutations = CollectionVCF(vcf_file="../SNP_annotated_raw_vcf/%s_SNP.vcf" % sample_set_name, from_file=True) mutations.get_location(annotations_dict, use_synonym=True, synonym_dict=annotation_synonym_dict) mutations.check_location(bad_regions) mutations.check_by_ref_and_alt(ref_alt_variants["desaminases"], "DA")