def variants_in_UTR(collection, feature_type="5'_UTR"):
    """Select variants whose "Ftype" annotation contains *feature_type*.

    Returns a new CollectionVCF built from the matching records, reusing
    the metadata and header of the input collection.
    """
    matching_records = [
        variant for variant in collection
        if feature_type in variant.info_dict["Ftype"]
    ]
    return CollectionVCF(from_file=False,
                         record_list=matching_records,
                         metadata=collection.metadata,
                         header=collection.header)
def extract_vcf(self): vcf = CollectionVCF(metadata=self.metadata.vcf_metadata, record_list=[], header=self.metadata.vcf_header, samples=self.metadata.samples, from_file=False) for record in self: """ print(record) print(type(record)) print(record.records) print(type(record.records)) """ vcf = vcf + record.records return vcf
def variants_start_end(collection,
                       left,
                       record_dict,
                       min_five_utr_len=10,
                       skip_nonintergenic_variants=False):
    """Collect variants falling in the region immediately upstream of 5' UTRs.

    For every gene in *record_dict* that has a five_prime_UTR sub-feature of
    at least *min_five_utr_len*, a window of *left* bases directly upstream of
    the UTR start (strand-aware) is scanned, and variants from *collection*
    inside that window are gathered into a new CollectionVCF.

    collection -- iterable of VCF records (CollectionVCF-like)
    left -- width (bases) of the upstream window
    record_dict -- mapping record_id -> GFF record with .features
    min_five_utr_len -- genes whose 5' UTR is shorter are skipped entirely
    skip_nonintergenic_variants -- if True, only variants whose "Ftype"
        annotation equals ["igc"] are considered
    """
    pre_UTR_record_list = []
    for record_id in record_dict:
        for feature in record_dict[record_id].features:
            if feature.type != "gene":
                continue
            # for/else: only proceed with this gene if at least one
            # sufficiently long five_prime_UTR sub-feature exists
            for sub_feature in feature.sub_features:
                if sub_feature.type == "five_prime_UTR" and len(
                        sub_feature) >= min_five_utr_len:
                    break
            else:
                continue
            for sub_feature in feature.sub_features:
                strand = sub_feature.strand
                if sub_feature.type == "five_prime_UTR":
                    # strand-aware UTR start: 1-based start on plus strand,
                    # feature end on minus strand
                    five_UTR_start = sub_feature.location.start + 1 if strand == +1 else sub_feature.location.end
                    # upstream window [pre_UTR_start, pre_UTR_end], inclusive
                    pre_UTR_start = five_UTR_start - left if strand == +1 else five_UTR_start + 1
                    pre_UTR_end = five_UTR_start - 1 if strand == +1 else five_UTR_start + left
                    for variant in collection:
                        if record_id != variant.chrom:
                            continue
                        if skip_nonintergenic_variants and variant.info_dict[
                                "Ftype"] != ["igc"]:
                            continue
                        if pre_UTR_start <= variant.pos <= pre_UTR_end:
                            pre_UTR_record_list.append(variant)
                            # NOTE(review): this mutates the shared variant
                            # record's info_dict in place (appended object is
                            # the same instance) — confirm that is intended
                            pre_UTR_record_list[-1].info_dict["Fstrand"] = [
                                "P"
                            ] if strand == +1 else ["M"]
                            relative_position = (variant.pos -
                                                 five_UTR_start) * strand
                            # debug print: a positive relative position would
                            # mean the variant is downstream of the UTR start,
                            # which should not happen inside this window
                            if relative_position > 0:
                                print(pre_UTR_start, pre_UTR_end,
                                      five_UTR_start)
                                print(variant)
                                print(sub_feature)
    return CollectionVCF(from_file=False,
                         record_list=pre_UTR_record_list,
                         metadata=collection.metadata,
                         header=collection.header)
def homogeneity_plot(sample_set_names_list, plot_file_prefix):
    """Draw a grid of strandness bar histograms, one row per sample set.

    Columns are feature types ("all", "pre_5'_UTR", "5'_UTR", "CDS",
    "3'_UTR"); each panel plots C->T vs G->A counts per strand category and
    annotates the panel with a chi-square p-value and phi coefficient.
    Figures are saved as <plot_file_prefix>.{pdf,svg,eps,png}.
    """
    rcParams.update({'font.size': 6})
    letter_list_part1 = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]
    feature_type_list = ["all", "pre_5'_UTR", "5'_UTR", "CDS", "3'_UTR"]
    figure = plt.figure(1,
                        dpi=600,
                        figsize=(5 * 1.47, 2 * len(sample_set_names_list)))
    index = 1
    for sample, letter in zip(sample_set_names_list, letter_list_part1):
        findex = 1
        for feature_type in feature_type_list:
            # pick the input VCF by feature type (filenames follow the
            # project's fixed naming scheme)
            vcf_file = "%s_good.vcf" % sample if feature_type == "all" \
                else "%s_pre_UTR_variants_only_intergenic_l_300.vcf" % sample \
                if feature_type == "pre_5'_UTR" \
                else "%s_%s_variants.vcf" % (sample, feature_type)
            collection = CollectionVCF(from_file=True, vcf_file=vcf_file)
            sample_data = collection.count_strandness("%s_%s_variants" %
                                                      (sample, feature_type))
            ax = plt.subplot(len(sample_set_names_list),
                             len(feature_type_list), index)
            # four strand categories: None, +, -, Both
            n_groups = 4
            points = np.arange(n_groups)
            bar_width = 0.35
            C_values = sample_data["all"][0]
            G_values = sample_data["all"][1]
            rects1 = plt.bar(points, C_values, bar_width, color='b',
                             label='C->T')
            rects2 = plt.bar(points + bar_width, G_values, bar_width,
                             color='g', label='G->A')
            # chi-square test on the '+'/'-' columns only (ends sliced off)
            table = [C_values[1:-1], G_values[1:-1]]
            g, p_value, dof, expctd = chi2_contingency(table)
            phi = phi_coefficient_correlation(table)
            # only the bottom row gets an x-axis label
            if index > len(feature_type_list) * (len(sample_set_names_list) - 1):
                plt.xlabel('Strand')
            # only the first column gets the y-axis label and the row label
            if findex == 1:
                plt.ylabel('N of SNV')
                plt.text(-0.50, 0.5, sample[:-3],
                         rotation=90,
                         fontweight="bold",
                         transform=ax.transAxes,
                         fontsize=10,
                         horizontalalignment='center',
                         verticalalignment='center')
            #plt.title("%s%i. %s (%i SNV)\np=%.3f, phi=%.3f" % (letter, findex, sample, len(collection), p_value, phi),
            #          fontweight='bold')
            #plt.title("%s%i. %s (%s)\np=%.3f, phi=%.3f" % (letter, findex, sample, feature_type, p_value, phi),
            #          fontweight='bold', fontsize=6)
            title = "%s%i" % (letter, findex)
            # scientific notation for very small p-values
            title_text = r"$p=%.2f, \varphi=%.2f$" % (p_value, phi) if p_value >= 0.01 \
                else r"$p=%.1e, \varphi=%.2f$" % (p_value, phi)
            plt.text(
                0.23, 1.1, title_text,
                rotation=0,
                transform=ax.transAxes,
                fontsize=8,
                #horizontalalignment='center',
                verticalalignment='center')
            plt.title(title, fontweight='bold', fontsize=11, loc="left")
            plt.xticks(points + bar_width, ('None', '+', '-', 'Both'))
            # legend only on the last column
            if findex == len(feature_type_list):
                plt.legend(prop={'size': 8})
            # column headers only on the top row
            if index <= len(feature_type_list):
                plt.text(0.5, 1.25, feature_type,
                         rotation=0,
                         fontweight="bold",
                         transform=ax.transAxes,
                         fontsize=10,
                         horizontalalignment='center',
                         verticalalignment='center')
            #plt.suptitle("Strandness histograms", fontweight="bold", fontsize=20)
            findex += 1
            index += 1
    #plt.tight_layout()
    plt.subplots_adjust(hspace=0.5, wspace=0.25, top=0.88, left=0.09,
                        right=0.99)
    for extension in [".pdf", ".svg", ".eps", ".png"]:
        plt.savefig("%s%s" % (plot_file_prefix, extension))
    plt.close()
continue os.chdir("alignment_LAN210_v0.9m") mutations = CollectionVCF(vcf_file=sample + suffix, from_file=True) print("Totaly %s mutations" % len(mutations)) mutations.test_thresholds(extracting_method='distance', threshold=(50, 5000, 100), testing_dir="testing_threshold") """ sample_set_names_list = [ "PmCDA1_3d", "HAP", "PmCDA1_sub1_3d", "PmCDA1_6d", "HAP_sub1", "PmCDA1_sub1_6d", "A1_3d", "A1_6d", "A3G_3d", "AID_3d", "AID_6d" ] suffix = "_SNP.vcf" workdir = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/test_thresholds/" sample_dir = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/SNP_annotated_raw_vcf/" for sample_set in sample_set_names_list: print("Handling %s" % sample_set) os.chdir(workdir) os.system("mkdir -p %s" % sample_set) os.chdir(sample_set) mutations = CollectionVCF(vcf_file=sample_dir + sample_set + suffix, from_file=True) print("Totaly %s mutations" % len(mutations)) mutations.test_thresholds(extracting_method='distance', threshold=(50, 5000, 100), testing_dir="testing_threshold")
help="End histogramm file prefix", default="end_histogramm") args = parser.parse_args() if args.both is not None: args.left = args.both args.right = args.both if (not args.right) and (not args.left): raise ValueError("Both left and right regions were not set") with open(args.gff, "r") as in_fd: record_dict = dict([(record.id, record) for record in GFF.parse(in_fd)]) variants = CollectionVCF(from_file=True, vcf_file=args.vcf) gene_variants_positions = [] all_variant_start_positions = [] all_variant_end_positions = [] print(args.left) print(args.right) for record_id in record_dict: for feature in record_dict[record_id].features: if feature.type != "gene": continue #print(feature.sub_features) for sub_feature in feature.sub_features: if sub_feature.type != "CDS": continue chrom = record_id strand = sub_feature.strand
# --- Script: annotate per-sample-set mutations with genomic feature types ---
annotations_dict = {}
# map raw GFF feature names to the short labels used in downstream plots
annotation_synonym_dict = {
    "three_prime_UTR": "3'_UTR",
    "five_prime_UTR": "5'_UTR",
    "snoRNA": "ncRNA",
    "snRNA": "ncRNA"
}
# feature types to ignore during location annotation
annotation_black_list = [
    "region", "ARS", "long_terminal_repeat", "noncoding_exon", "intron",
    "repeat_region"
]
# load annotations keyed by sequence record id
with open(gff_file) as gff_fd:
    for record in GFF.parse(gff_fd):
        annotations_dict[record.id] = record
# load masked/bad regions keyed by sequence record id
bad_region_dict = {}
with open(bad_regions_file) as gff_fd:
    for record in GFF.parse(gff_fd):
        bad_region_dict[record.id] = record
for sample_set_name in sample_set_names_list:
    print("Handling %s" % sample_set_name)
    os.chdir(workdir)
    mutations = CollectionVCF(vcf_file=sample_set_name + ".vcf",
                              from_file=True)
    # annotate each variant with its feature location, using the synonym
    # map and skipping blacklisted feature types
    mutations.find_location(annotations_dict,
                            use_synonym=True,
                            synonym_dict=annotation_synonym_dict,
                            feature_type_black_list=annotation_black_list)
    mutations.write("%s_annotated.vcf" % sample_set_name)
import matplotlib.pyplot as plt from matplotlib import rcParams if __name__ == "__main__": workdir = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/all/pre_UTR_strandness/" letter_list_part1 = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"] os.chdir(workdir) sample_set_names_list = [ "PmCDA1_3d", "PmCDA1_sub1_3d", "PmCDA1_6d", "PmCDA1_sub1_6d" ] rcParams.update({'font.size': 7}) plt.figure(1, dpi=300, figsize=(6, 6)) index = 1 for sample, letter in zip(sample_set_names_list, letter_list_part1): collection = CollectionVCF( from_file=True, vcf_file=sample + "_pre_UTR_variants_only_intergenic_l_300.vcf") sample_data = collection.count_strandness( sample + "_pre_UTR_variants_only_intergenic_l_300_strandness") plt.subplot(2, 2, index) n_groups = 4 points = np.arange(n_groups) bar_width = 0.35 C_values = sample_data["all"][0] G_values = sample_data["all"][1] rects1 = plt.bar(points, C_values, bar_width, color='b', label='C->T') rects2 = plt.bar(points + bar_width, G_values,
"three_prime_UTR": "3'_UTR", "five_prime_UTR": "5'_UTR", "snoRNA": "ncRNA", "snRNA": "ncRNA" } with open(gff_file) as gff_fd: for record in GFF.parse(gff_fd): annotations_dict[record.id] = record bad_region_dict = {} with open(bad_regions_file) as gff_fd: for record in GFF.parse(gff_fd): bad_region_dict[record.id] = record region = "chrX" for sample_set in sample_set_names_list: collection = CollectionVCF(from_file=True, vcf_file="%s_good.vcf" % sample_set) plt.figure(1, figsize=(15, 8)) ax0 = plt.subplot(212) ax0.set_yscale('log', basey=2) ax0.set_xlim(1, 745688) #ax0.set_ylim(ymin=0) region_reference_dict, maximum = rainfall_plot( collection, region, base_colors=[], facecolor="#D6D6D6", ref_genome=reference, draw_gaps=True, masked_regions=bad_region_dict) for reference in region_reference_dict[region]:
bad_region_dict = {} with open(bad_regions_file) as gff_fd: for record in GFF.parse(gff_fd): bad_region_dict[record.id] = record for sample_set_name in sample_set_names_list: print("Handling %s" % sample_set_name) os.chdir(workdir) os.system("mkdir -p %s" % sample_set_name) os.chdir(sample_set_name) os.system("mkdir -p %s %s" % (clustering_dir, rainfall_dir)) #os.system("pwd") mutations = CollectionVCF( vcf_file="../SNP_annotated_raw_vcf/%s_SNP.vcf" % sample_set_name, from_file=True) """ mutations.rainfall_plot("%s_mutations" % (sample_set_name), ref_genome=reference, draw_gaps=True, masked_regions=bad_region_dict) """ mutations.get_location(annotations_dict, use_synonym=True, synonym_dict=annotation_synonym_dict) mutations.check_location(bad_regions) mutations.check_by_ref_and_alt(ref_alt_variants["desaminases"], "DA") annotation_black_list = [ "gene", "region", "ARS", "long_terminal_repeat", "noncoding_exon", "intron", "repeat_region", "telomere", "gene_cassette",
    def read(self, input_file):
        """Parse a .ccf cluster file into this collection.

        Expected layout: an optional "#VCF_METADATA START/END" section, a
        "#VCF_HEADER" line followed by the VCF header, a "#CCF_HEADER" line
        followed by the CCF header, then cluster blocks. Each cluster block
        is a ">"-prefixed line (id, chrom, start, end, key=value/flag list)
        followed by tab-indented VCF record lines belonging to that cluster.
        """
        # TODO: write read from ccf file
        with open(input_file, "r") as in_fd:
            stripped_line = in_fd.readline().strip()
            if stripped_line == "#VCF_METADATA START":
                vcf_metadata = MetadataVCF()
                stripped_line = in_fd.readline().strip()
                # accumulate metadata lines until the END sentinel
                while (stripped_line != "#VCF_METADATA END"):
                    vcf_metadata.add_metadata(stripped_line)
                    stripped_line = in_fd.readline().strip()
            stripped_line = in_fd.readline().strip()
            if stripped_line == "#VCF_HEADER":
                header_line = in_fd.readline().strip()
                # header line starts with '#'; drop it before splitting
                vcf_header = HeaderVCF(header_line[1:].split("\t"))
                #print("a\na\na\na\na\n")
                #print(vcf_header)
                # sample names are the columns after the 9 fixed VCF columns
                self.metadata = MetadataCCF(vcf_header[9:],
                                            vcf_metadata=vcf_metadata,
                                            vcf_header=vcf_header)
            stripped_line = in_fd.readline().strip()
            if stripped_line == "#CCF_HEADER":
                header_line = in_fd.readline().strip()
                self.header = HeaderCCF(header_line[1:].split("\t"))
            # flag == 1 means a cluster block is currently open and its
            # records must be flushed when the next ">" line (or EOF) arrives
            flag = 0
            self.records = []
            while True:
                data_line = in_fd.readline()
                if data_line == "" or data_line == "\n":
                    break
                stripped_line = data_line.strip()
                if data_line[0] == "\t":
                    # tab-indented line: a VCF record belonging to the
                    # currently open cluster
                    #stripped_line = stripped_line[1:]
                    #print(collection_vcf)
                    collection_vcf.records.append(
                        collection_vcf.add_record(
                            stripped_line,
                            external_metadata=self.metadata.vcf_metadata))
                    flag = 1
                    #print("aaaa")
                    continue
                if flag != 0:
                    # flush the previously accumulated cluster before
                    # starting a new one
                    self.records.append(
                        RecordCCF(id=cluster_id,
                                  chrom=chrom,
                                  size=size,
                                  start=start,
                                  end=end,
                                  description=description,
                                  flags=flags,
                                  collection_vcf=collection_vcf,
                                  bad_vcf_records=bad_records,
                                  from_records=False,
                                  subclusters=subclusters))
                    #collection_vcf = None
                if stripped_line[0] == ">":
                    # cluster header line: ">id\tchrom\tstart\tend\tdescr"
                    flag = 0
                    cluster_id, chrom, start, end, description_and_flags = stripped_line[
                        1:].split("\t")
                    start = int(start)
                    end = int(end)
                    # descr is a ";"-separated mix of bare flags and
                    # key=value entries
                    description_and_flags = description_and_flags.split(";")
                    description = OrderedDict({})
                    flags = set([])
                    subclusters = None
                    for descr_entry in description_and_flags:
                        descr_entry_splited = descr_entry.split("=")
                        if len(descr_entry_splited) == 1:
                            # no "=": it is a bare flag
                            flags.add(descr_entry_splited[0])
                            continue
                        if descr_entry_splited[0] == "Size":
                            size = int(descr_entry_splited[1])
                        elif descr_entry_splited[0] == "Bad_records":
                            bad_records = int(descr_entry_splited[1])
                        elif descr_entry_splited[
                                0] == "Mean" or descr_entry_splited[
                                    0] == "Median" or descr_entry_splited[
                                        0] == "Power" or descr_entry_splited[
                                            0] == "Homogeneity":
                            # numeric statistics are stored as floats
                            description[descr_entry_splited[0]] = float(
                                descr_entry_splited[1])
                        elif descr_entry_splited[0] == "Loc":
                            description[descr_entry_splited[
                                0]] = descr_entry_splited[1].split(",")
                        elif descr_entry_splited[0] == "Subclusters":
                            subclusters = [
                                int(x)
                                for x in descr_entry_splited[1].split(",")
                            ]
                        else:
                            # generic entry: keep as list, collapsing
                            # single-element lists to a scalar
                            description[descr_entry_splited[
                                0]] = descr_entry_splited[1].split(",")
                            if len(description[descr_entry_splited[0]]) == 1:
                                description[
                                    descr_entry_splited[0]] = description[
                                        descr_entry_splited[0]][0]
                    # fresh empty per-cluster VCF collection
                    collection_vcf = CollectionVCF(metadata=None,
                                                   record_list=None,
                                                   header=None,
                                                   vcf_file=None,
                                                   samples=None,
                                                   from_file=False,
                                                   external_metadata=None)
                    continue
            # flush the final cluster after EOF
            # NOTE(review): this append is unconditional — on an empty file
            # (no cluster lines) the names below would be unbound; confirm
            # inputs always contain at least one cluster
            self.records.append(
                RecordCCF(id=cluster_id,
                          chrom=chrom,
                          size=size,
                          start=start,
                          end=end,
                          description=description,
                          flags=flags,
                          collection_vcf=collection_vcf,
                          bad_vcf_records=bad_records,
                          from_records=False,
                          subclusters=subclusters))
    def adjust(self,
               border_limit=None,
               min_size_to_adjust=2,
               remove_border_subclusters=False,
               remove_size_limit=1):
        """Adjust cluster borders by trimming small border subclusters.

        Returns a list of new cluster records: possibly a left cluster made
        of trimmed leading subclusters, the middle (core) cluster, and a
        right cluster of trimmed trailing subclusters. Clusters too small to
        adjust, or without subcluster information, are returned unchanged as
        a single-element list.

        border_limit -- max number of positions inspected on each border
            (defaults to the whole subcluster list)
        min_size_to_adjust -- clusters smaller than this are left alone
        remove_border_subclusters -- actually split off border subclusters
        remove_size_limit -- border runs longer than this are kept in place
        """
        # adjusts cluster borders, returns list of new cluster records
        # skip adjustment for clusters with 3 or less mutations
        if (self.size < min_size_to_adjust) or (self.subclusters is None):
            #return -1
            return [self]
        limit = border_limit if border_limit else len(self.subclusters)
        # length of the run of identical subcluster ids at the left border
        for i in range(0, limit):
            if self.subclusters[i] == self.subclusters[0]:
                left_subcluster_end = i
            else:
                break
        # exit if cluster doesnt have subclusters
        if left_subcluster_end == len(self.subclusters) - 1:
            #return 1
            return [self]
        # length of the run of identical subcluster ids at the right border
        # (negative index of its first element)
        for i in range(-1, -limit - 1, -1):
            if self.subclusters[i] == self.subclusters[-1]:
                right_subcluster_start = i
            else:
                break
        if remove_border_subclusters:
            # trim a border run only if it is shorter than remove_size_limit
            start = left_subcluster_end + 1 if left_subcluster_end < remove_size_limit else 0
            end = right_subcluster_start if right_subcluster_start >= -remove_size_limit else len(
                self.subclusters)
            new_left_cluster, new_right_cluster = None, None
            if start > 0:
                new_left_cluster = RecordCCF(
                    collection_vcf=CollectionVCF(
                        record_list=self.records.records[:start],
                        from_file=False),
                    subclusters=self.subclusters[:start],
                    from_records=True)
            if end < len(self.subclusters):
                new_right_cluster = RecordCCF(
                    collection_vcf=CollectionVCF(
                        record_list=self.records.records[end:],
                        from_file=False),
                    subclusters=self.subclusters[end:],
                    from_records=True)
            """
            self.__init__(collection_vcf=CollectionVCF(record_list=self.records.records[start:end], from_file=False),
                          subclusters=self.subclusters[start:end],
                          from_records=True)
            """
            new_middle_cluster = RecordCCF(
                collection_vcf=CollectionVCF(
                    record_list=self.records.records[start:end],
                    from_file=False),
                subclusters=self.subclusters[start:end],
                from_records=True)
            """
            if new_left_cluster or new_right_cluster:
                print("original")
                print(self)
                print("adjusted")
                print(new_left_cluster)
                print(new_middle_cluster)
                print(new_right_cluster)
            """
            # assemble the output in genomic order, skipping absent pieces
            cluster_list = [new_left_cluster] if new_left_cluster else []
            cluster_list += [new_middle_cluster]
            cluster_list += [new_right_cluster] if new_right_cluster else []
            return cluster_list
        # NOTE(review): when remove_border_subclusters is False the function
        # falls through and returns None — confirm callers always pass
        # remove_border_subclusters=True or handle a None result
# --- Script: build positional histograms for pre-UTR / UTR / CDS variants ---
# NOTE(review): `left / bin_width` and `right / bin_width + 1` look like
# Python-2-style integer division; under Python 3 these are floats and
# np.histogram/np.linspace expect integer bin counts — TODO confirm
pre_UTR_bins = left / bin_width
CDS_bins = np.linspace(0, right, right / bin_width + 1)
UTR_bins = 10
normed = True
max_start = 0
max_end = 0
skip_nonintergenic_variants = True
for sample_set in sample_set_names_list:
    print("Handling %s" % sample_set)
    vcf_file = "%s_good.vcf" % sample_set
    #start_hist_prefix = "%s_start_hist_r_%i_l_%i" % (sample_set, right, left)
    #end_hist_prefix = "%s_end_hist_r_%i_l_%i" % (sample_set, right, left)
    #gene_variants = "%s_gene_variants_r_%i_l_%i.t" % (sample_set, right, left)
    #variants, minus_variants = CollectionVCF(from_file=True, vcf_file=vcf_file).filter_by_expression("record.ref == 'C'")  # C -> T variants
    #variants, minus_variants = CollectionVCF(from_file=True, vcf_file=vcf_file).filter_by_expression("(record.ref == 'C' and record.info_dict['Fstrand'][0] == 'P') or (record.ref == 'G' and record.info_dict['Fstrand'][0] == 'M')")  # nontranscribed thread
    variants = CollectionVCF(from_file=True, vcf_file=vcf_file)
    # NOTE(review): this call passes (variants, left, right, record_dict, ...)
    # and unpacks three position lists — presumably a different
    # variants_start_end overload than the single-list variant elsewhere;
    # verify the signature
    pre_UTR_positions[sample_set], UTR_positions[sample_set], CDS_positions[sample_set] = \
        variants_start_end(variants, left, right, record_dict,
                           min_five_utr_len=10,
                           skip_nonintergenic_variants=skip_nonintergenic_variants)
    length_dict[sample_set] = len(variants)
    #print(start_dict[sample_set])
    pre_UTR_hist_dict[sample_set] = list(
        np.histogram(pre_UTR_positions[sample_set], bins=pre_UTR_bins))
    UTR_hist_dict[sample_set] = list(
        np.histogram(UTR_positions[sample_set], bins=UTR_bins))
    CDS_hist_dict[sample_set] = list(
        np.histogram(CDS_positions[sample_set], bins=CDS_bins))
    print("UTR")
    print(UTR_positions[sample_set])
    print("blablabla")
    print(pre_UTR_hist_dict[sample_set][0])
combined_vcf_suffix = "_adjusted_cluster_mutations.vcf" combined_3_vcf_suffix = "_adjusted_3+_cluster_mutations.vcf" samples_dir = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/all/" samples_subdir = "alignment_LAN210_v0.10m/" sample_suffix = "_GATK_best_SNP.vcf" workdir = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/" combined_subdir = "clustering/" for sample_set in sample_set_names_list: os.chdir(workdir) os.chdir(sample_set) os.system("mkdir -p per_sample_vcf") sample_set_mutations = CollectionVCF(from_file=True, vcf_file=combined_subdir + sample_set + combined_vcf_suffix) #sample_set_3_mutations = CollectionVCF(from_file=True, vcf_file=combined_subdir + sample_set + combined_3_vcf_suffix) per_sample_mutations = {} samples_list = sample_set_mutations.samples print("Handling %s" % sample_set) for sample in sample_set_mutations.samples: print("Handling %s" % sample) sample_mutations = CollectionVCF(from_file=True, vcf_file=samples_dir + sample + "/" + samples_subdir + sample + sample_suffix) per_sample_mutations[sample] = CollectionVCF( metadata=sample_mutations.metadata, record_list=None, header=sample_mutations.header,
# --- Script: per-sample variant location pie charts ---
gff_file = "/home/mahajrod/genetics/desaminases/data/LAN210_v0.10m/annotations/merged_annotations_Nagalakshmi_tranf_to_LAN210_v0.10m.gff3"
# load annotations keyed by sequence record id
annotations_dict = {}
with open(gff_file) as gff_fd:
    for record in GFF.parse(gff_fd):
        annotations_dict[record.id] = record
for sample in samples_list:
    print("Handling %s" % sample)
    os.chdir(workdir)
    os.chdir(sample)
    # skip samples that were never aligned
    if alignment_dir not in os.listdir("."):
        continue
    os.chdir(alignment_dir)
    os.system("mkdir -p %s" % clustering_dir)
    mutations = CollectionVCF(vcf_file=sample + suffix, from_file=True)
    mutations.get_location(annotations_dict)
    # tag variants matching the deaminase ref/alt signature with "DA"
    mutations.check_by_ref_and_alt(ref_alt_variants["desaminases"], "DA")
    """
    for record in mutations:
        print (record)
        print(record.flags)
    """
    #for record in mutations:
    #    print(record.description)
    mutations.location_pie(annotation_black_list=[
        "gene", "region", "ARS", "long_terminal_repeat"
    ],
                           figsize=(40, 40),
                           pie_filename="variant_location_pie.svg",
                           counts_filename="variant_location_counts.t")
annotations_dict = {} annotation_synonym_dict = {"three_prime_UTR": "3'_UTR", "five_prime_UTR": "5'_UTR", "snoRNA": "ncRNA", "snRNA": "ncRNA" } with open(gff_file) as gff_fd: for record in GFF.parse(gff_fd): annotations_dict[record.id] = record rcParams.update({'font.size': 7}) plt.figure(1, dpi=300, figsize=(8, 6)) index = 1 for sample, letter in zip(sample_set_names_list, letter_list_part1): collection = CollectionVCF(from_file=True, vcf_file=sample + "_good.vcf") plt.subplot(3, 2, index) collection.get_location(annotations_dict, use_synonym=True, synonym_dict=annotation_synonym_dict) location_pie(collection, annotation_colors=[], ref_genome=None, explode=True, annotation_black_list=annotation_black_list, allow_several_counts_of_record=False, counts_filename="location_counts.t", counts_dir="location_counts", legend_font=6, combine_mixed=True ) plt.title("%s. %s" % (letter, sample), fontweight='bold') index += 1 for format_ext in ["svg", "eps", "pdf", "png"]:
# --- Script: GATK variant calling, filtering and zygosity split ---
vcf_best_merged_homo = "%s_GATK_best_merged_homo.vcf" % sample_name
# call variants with UnifiedGenotyper on the prepared BAM
UnifiedGenotyper.variant_call(
    "%s_trimmed_sorted_rm_pcr_chrom_with_header.bam" % sample_name,
    reference,
    stand_emit_conf=40,
    stand_call_conf=100,
    GATK_dir=gatk_dir,
    num_of_threads=5,
    output_mode="EMIT_VARIANTS_ONLY",
    discovery_mode="BOTH",
    output_file=vcf_all)
# split raw calls into indels and SNPs
SelectVariants.get_indel(gatk_dir, reference, vcf_all, vcf_indel)
SelectVariants.get_SNP(gatk_dir, reference, vcf_all, vcf_SNP)
# mark low-quality calls, then drop them
VariantFiltration.filter_bad_SNP(gatk_dir, reference, vcf_SNP,
                                 vcf_filtered_SNP)
VariantFiltration.filter_bad_indel(gatk_dir, reference, vcf_indel,
                                   vcf_filtered_indel)
SelectVariants.remove_filtered(gatk_dir, reference, vcf_filtered_SNP,
                               vcf_best_SNP)
SelectVariants.remove_filtered(gatk_dir, reference, vcf_filtered_indel,
                               vcf_best_indel)
# recombine the surviving SNPs and indels into one VCF
CombineVariants.combine_from_same_source(gatk_dir, reference,
                                         [vcf_best_SNP, vcf_best_indel],
                                         vcf_best_merged)
best_merged = CollectionVCF(vcf_file=vcf_best_merged)
# split by zygosity and write each subset separately
best_merged_homo, best_merged_hetero = best_merged.split_by_zygoty()
best_merged_homo.write(vcf_best_merged_homo)
best_merged_hetero.write(vcf_best_merged_hetero)
# --- Script: prepare expression data for adjusted cluster mutations ---
annotations_file = "/home/mahajrod/genetics/desaminases/data/LAN210_v0.10m/annotations/merged_annotations_Nagalakshmi_tranf_to_LAN210_v0.10m.gff3"
sequence_file = "/home/mahajrod/genetics/desaminases/data/LAN210_v0.10m/LAN210_v0.10m.fasta"
workdir = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf"
sequence_dict = SeqIO.to_dict(SeqIO.parse(sequence_file, "fasta"))
#print(sequence_dict)
with open(annotations_file, "r") as in_fd:
    annotation_dict = dict([(record.id, record)
                            for record in GFF.parse(in_fd)])
# reference base -> strand code (C on plus strand, G on minus strand)
mutation_strand_dict = {"C": "P", "G": "M"}
#values_names = {"Len": 0, "P": 1, "P_dens": 2, "M": 3, "M_dens": 4, "Exp": 5}
# column indices for the per-gene value table
values_names = {"Len": 0, "P": 1, "M": 2, "Exp": 3}
for sample_set_name in sample_set_names_list:
    print("Handling %s" % sample_set_name)
    os.chdir(workdir)
    os.chdir(sample_set_name)
    mutations = CollectionVCF(
        vcf_file="./clustering/%s_adjusted_cluster_mutations.vcf" %
        sample_set_name,
        from_file=True)
    # clusters with 3+ mutations handled separately
    mutations_large_clusters = CollectionVCF(
        vcf_file="./clustering/%s_adjusted_3+_cluster_mutations.vcf" %
        sample_set_name,
        from_file=True)
    prepare_data(mutations, "all_adjusted_cluster_mutations",
                 expression_data_dir)
    prepare_data(mutations_large_clusters, "adjusted_3+_cluster_mutations",
                 expression_data_dir)
workdir = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/all/pre_UTR_strandness/" letter_list_part1 = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"] os.chdir(workdir) sample_set_names_list = [ "PmCDA1_3d", "PmCDA1_sub1_3d", "PmCDA1_6d", "PmCDA1_sub1_6d" ] rcParams.update({'font.size': 7}) feature_type_list = ["5'_UTR", "CDS", "3'_UTR"] for feature_type in feature_type_list: plt.figure(1, dpi=300, figsize=(6, 6)) index = 1 for sample, letter in zip(sample_set_names_list, letter_list_part1): collection = CollectionVCF(from_file=True, vcf_file="%s_%s_variants.vcf" % (sample, feature_type)) sample_data = collection.count_strandness("%s_%s_variants" % (sample, feature_type)) plt.subplot(2, 2, index) n_groups = 4 points = np.arange(n_groups) bar_width = 0.35 C_values = sample_data["all"][0] G_values = sample_data["all"][1] rects1 = plt.bar(points, C_values, bar_width, color='b',