plt.savefig("heatmap_3+_power_0.05+.svg") plt.savefig("heatmap_3+_power_0.05+.eps") plt.close() """
# NOTE(review): everything above this line is the tail of a triple-quoted,
# commented-out block opened before this chunk; live code starts here.
# Load each sample set's adjusted cluster collection and split its stat
# matrix into per-column arrays for later plotting.
for sample_set_name in sample_set_names_list:
    print("Handling %s" % sample_set_name)
    os.chdir(workdir)
    #os.system("mkdir -p %s" % sample_set_name)
    os.chdir(sample_set_name)
    #os.system("mkdir -p %s %s" % (clustering_dir, rainfall_dir))
    #os.system("pwd")
    clusters_dict[sample_set_name] = CollectionCCF(
        from_file=True,
        input_file="%s%s_adjusted_size_3+_power_0.1+.ccf" % (clustering_dir, sample_set_name))
    # HAP sample sets get one fewer additional column ("Homogeneity" is
    # requested only for non-HAP sets).
    data[sample_set_name] = clusters_dict[sample_set_name].get_data_for_stat(additional_data=("Median", "Power")) \
        if "HAP" in sample_set_name \
        else clusters_dict[sample_set_name].get_data_for_stat(additional_data=("Median", "Power", "Homogeneity"))
    # Split the 2D stat matrix into a list of 1D column arrays:
    # 4 columns for HAP sets, 5 otherwise.
    data[sample_set_name] = [
        data[sample_set_name][:, i] for i in range(0, 4 if "HAP" in sample_set_name else 5)
    ]
os.chdir(workdir)
os.system("mkdir -p %s" % heatmap_dir)
os.chdir(heatmap_dir)
    # Tail of the suffix list opened before this chunk -- one file suffix per
    # power cutoff, parallel to power_list below.
    "_adjusted_size_3+_power_0.05+.ccf", "_adjusted_size_3+_power_0.1+.ccf"]
homogeneity_dir = "homogeneity/"
y_name = "N of clusters"
os.system("mkdir -p %s" % homogeneity_dir)
data = {}
# Human-readable labels, in the same order as the suffix list entries.
power_list = ["All", "Power >= 0.05", "Power >= 0.1"]
# Load every sample set's cluster stats at each power cutoff.
for sample in sample_set_names_list:
    os.chdir(workdir + sample + "/" + clustering_dir)
    data[sample] = dict([(key, 0) for key in power_list])
    for suffix, name in zip(suffix_list, power_list):
        print(sample + suffix)
        clusters = CollectionCCF(from_file=True, input_file=sample + suffix)
        data[sample][name] = clusters.get_data_for_stat(additional_data=["Homogeneity", "Median", "Power"])
# Column index of each statistic in the matrix returned by get_data_for_stat.
parameters_dict = OrderedDict({"Length": 0, "Size": 1, "Homogeneity": 2, "Median": 3, "Power": 4})
index = 1
# One figure per statistic; inner loops iterate sample sets x power cutoffs.
for parameter in parameters_dict:
    os.chdir(workdir + homogeneity_dir)
    plt.figure(index, dpi=150, figsize=(24, 18))
    for j in range(0, len(sample_set_names_list)):
        sample = sample_set_names_list[j]
        for i in range(0, len(power_list)):
    # Tail of the sample set name list opened before this chunk.
    "HAP", "PmCDA1_sub1_3d", "PmCDA1_6d", "HAP_sub1", "PmCDA1_sub1_6d", ]
samples_dir = workdir + all_files_subdir
# Power cutoffs 0.01..0.10 and size cutoffs 3..10 (defined but the histogram
# loop below bins the raw data rather than filtering by these).
power_limits = [f / 100 for f in range(1, 11)]
size_limits = [i for i in range(3, 11)]
os.chdir(workdir)
data_dict = {}
for sample in sample_set_names_list:
    data_dict[sample] = CollectionCCF(from_file=True,
                                      input_file=samples_dir + sample + all_files_suffix).get_data_for_stat(additional_data=["Power"])
figure = plt.figure(1, dpi=150, figsize=(18, 12))
# One 3D size-vs-power 2D-histogram subplot per sample set (3x2 grid).
for sample in sample_set_names_list:
    size_data = data_dict[sample][:, 1]
    max_size = max(size_data)
    # NOTE(review): np.linspace's third argument must be an integer on
    # modern NumPy; this assumes max_size is integral -- confirm the dtype
    # returned by get_data_for_stat.
    bins_size_data = np.linspace(3, max_size, max_size + 1)
    power_data = data_dict[sample][:, 2]
    max_power = max(power_data)
    # 0.01-wide power bins covering [0, max_power].
    n_power_bins = int(max_power / 0.01) + 1
    bins_power_data = np.linspace(0, n_power_bins * 0.01, n_power_bins + 1)
    hist, xedges, yedges = np.histogram2d(size_data, power_data, bins=(bins_size_data, bins_power_data))
    ax = figure.add_subplot(3, 2, sample_set_names_list.index(sample) + 1, projection="3d")
#print(index) if index == 7 or index == 10: index += 1 subplot_list.append(None) continue file_name = "%s_size_%s+_power_%s+_good.ccf" % (sample_set_name, size, power) if power != "all" \ else "%s_size_%s+_good.ccf" % (sample_set_name, size) print("Handling %s" % file_name) data[sample_set_name] = {} if "HAP" in sample_set_name: data_names = ["Size", "Power"] else: data_names = ["Size", "Power", "Homogeneity"] tmp_data = CollectionCCF( from_file=True, input_file=file_name).get_data_for_stat( additional_data=data_names[1:]) for data_name in data_names: data[sample_set_name][ data_name] = tmp_data[:, value_names_dict[data_name]] if index == 1: subplot_list.append(plt.subplot(4, 3, index)) elif index == 8: subplot_list.append( plt.subplot(4, 3, index, sharex=subplot_list[0])) elif index >= 9: subplot_list.append( plt.subplot(4, 3, index, sharex=subplot_list[7],
def get_clusters(self, extracting_method="inconsistent", threshold=0.8, cluster_distance='average',
                 dendrogramm_max_y=2000, sample_name=None, save_clustering=False,
                 clustering_dir="clustering", split_by_regions=False, dendrogramm_color_threshold=1000,
                 draw_dendrogramm=True, return_collection=True, write_inconsistent=True,
                 write_correlation=True):
    """Extract flat mutation clusters from per-region hierarchical linkage.

    Runs (or reuses a cached) hierarchical clustering, then cuts each
    region's linkage with scipy's fcluster.

    :param extracting_method: fcluster `criterion` (e.g. "inconsistent").
    :param threshold: fcluster cut threshold.
    :param cluster_distance: linkage method passed to hierarchical_clustering.
    :param split_by_regions: if True, return a dict of per-region
        CollectionCCF objects instead of a single merged collection.
    :param return_collection: if False, return the raw per-region fcluster
        label arrays instead of CCF collections.
    :return: CollectionCCF, dict of CollectionCCF, or OrderedDict of label
        arrays, depending on the two flags above.
    """
    from Parsers.CCF import RecordCCF, CollectionCCF, MetadataCCF, HeaderCCF
    # Reuse cached linkage if available; otherwise compute it from scratch.
    if self.linkage_dict:
        linkage_dict = self.linkage_dict
    else:
        region_dict, linkage_dict = self.hierarchical_clustering(method=cluster_distance,
                                                                 dendrogramm_max_y=dendrogramm_max_y,
                                                                 sample_name=sample_name,
                                                                 save=save_clustering,
                                                                 clustering_dir=clustering_dir,
                                                                 dendrogramm_color_threshold=dendrogramm_color_threshold,
                                                                 draw_dendrogramm=draw_dendrogramm,
                                                                 write_correlation=write_correlation,
                                                                 write_inconsistent=write_inconsistent)
    if split_by_regions:
        mut_clusters_dict = OrderedDict({})
    else:
        mut_clusters_list = []
    clusters = OrderedDict()
    # Cut each region's dendrogram into flat cluster labels.
    for region in linkage_dict:
        clusters[region] = fcluster(linkage_dict[region], threshold, criterion=extracting_method)
    if return_collection:
        # NOTE(review): region_dict is only assigned in the else-branch above.
        # When self.linkage_dict is cached and return_collection is True this
        # loop raises NameError -- confirm whether a cached region mapping
        # (e.g. self.region_dict) should be used here.
        for region in region_dict:
            # http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fcluster.html#scipy.cluster.hierarchy.fcluster
            clusters_dict = OrderedDict({})
            # Group each region's records by their flat cluster label.
            for i in range(0, len(clusters[region])):
                if clusters[region][i] not in clusters_dict:
                    clusters_dict[clusters[region][i]] = [region_dict[region][i]]
                else:
                    clusters_dict[clusters[region][i]].append(region_dict[region][i])
            if split_by_regions:
                # One CollectionCCF per region.
                mut_clusters_dict[region] = \
                    CollectionCCF(record_list=[RecordCCF(collection_vcf=CollectionVCF(record_list=clusters_dict[cluster],
                                                                                      from_file=False),
                                                         from_records=True) for cluster in clusters_dict],
                                  metadata=MetadataCCF(self.samples, vcf_metadata=self.metadata, vcf_header=self.header),
                                  header=HeaderCCF("CLUSTER_ID\tCHROM\tSTART\tEND\tDESCRIPTION".split("\t")))
            else:
                # Accumulate records across regions for one merged collection.
                mut_clusters_list += [RecordCCF(collection_vcf=CollectionVCF(record_list=clusters_dict[cluster],
                                                                             from_file=False),
                                                from_records=True) for cluster in clusters_dict]
        if split_by_regions:
            return mut_clusters_dict
        return CollectionCCF(record_list=mut_clusters_list,
                             metadata=MetadataCCF(self.samples, vcf_metadata=self.metadata, vcf_header=self.header),
                             header=HeaderCCF("CLUSTER_ID\tCHROM\tSTART\tEND\tDESCRIPTION".split("\t")))
    else:
        # Raw per-region fcluster label arrays.
        return clusters
#"AID_6d" ] power = "0.03" size = "5" bin_size = 50 workdir = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%s/%s/" % ( size, power) suffix = "_size_%s+_power_%s+_good.ccf" % (size, power) hist_subfolder = "length_distribution/" os.chdir(workdir) os.system("mkdir -p %s" % hist_subfolder) for extension in ".svg", ".eps", ".png", ".pdf": os.system("mkdir -p %s/%s" % (hist_subfolder, extension[1:])) for sample_set in sample_set_names_list: collection = CollectionCCF(from_file=True, input_file=workdir + sample_set + suffix) length_data = collection.get_data_for_stat(additional_data=None)[:, 0] #print(stat_data) print(sample_set) #print(length_data) total = len(length_data) minimum = min(length_data) maximum = max(length_data) if len(length_data) == 0: continue plt.figure(1, figsize=(5, 5)) plt.subplot(1, 1, 1) bins = np.linspace(1, 2500, 2500 / bin_size + 1) plt.hist(length_data, bins=bins, label="Min L: %i bp\nMax L: %i bp" % (minimum, maximum))
#"PmCDA1_sub1_6d", #"A1_3d", #"A1_6d", #"A3G_3d", #"AID_3d", #"AID_6d" ] power_limits = [f / 100 for f in range(1, 11)] size_limits = [i for i in range(3, 11)] os.chdir(workdir) for sample_set in sample_set_names_list: stat_dict = TwoLvlDict(OrderedDict({})) print("Handling %s" % sample_set) all_clusters = CollectionCCF(from_file=True, input_file=workdir + all_files_subdir + sample_set + all_files_suffix) if "HAP" not in sample_set: all_clusters.check_strandness() for min_size in size_limits: stat_dict[min_size] = OrderedDict({}) os.system("mkdir -p %i %i/all " % (min_size, min_size)) above_size_clusters, below_size_clusters = all_clusters.filter_by_expression( "record.size >= %i" % min_size) above_size_clusters.write( "%i/all/%s_size_%i+%s" % (min_size, sample_set, min_size, all_files_suffix)) stat_dict[min_size][0.00] = len(above_size_clusters) for min_power in power_limits: os.system("mkdir -p %i/%.2f" % (min_size, min_power))
        return 0
    # Tail of an overlap-length helper whose def starts before this chunk:
    # start from interval 2's full length and trim the non-overlapping
    # flanks on each side (coef flags select which interval sticks out).
    start_shift = start1 - start2
    start_coef_shift = 0 if start_shift < 0 else 1
    end_shift = end1 - end2
    end_coef_shift = 0 if end_shift > 0 else 1
    return (end2 - start2 + 1) - start_coef_shift * start_shift + end_coef_shift * end_shift

overlap_clusters_percent = TwoLvlDict({})
#size = 8
#power = 0.05
print([float(f) / float(100) for f in range(1, 11)])
# For every (size, power) cutoff pair, load the full and subtracted
# PmCDA1_3d cluster sets, dump them as GFF and tabulate overlaps.
for size in range(3, 11):
    overlap_clusters_percent[size] = {}
    for power in [float(f) / float(100) for f in range(1, 11)]:
        PmCDA1_3d_clusters = CollectionCCF(from_file=True, input_file="/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_3d_size_%i+_power_%.2f+_good.ccf" % (size, power, size, power))
        PmCDA1_3d_sub_clusters = CollectionCCF(from_file=True, input_file="/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_sub1_3d_size_%i+_power_%.2f+_good.ccf" % (size, power, size, power))
        PmCDA1_3d_clusters.write_gff("/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_3d_size_%i+_power_%.2f+_good.gff" % (size, power, size, power))
        PmCDA1_3d_sub_clusters.write_gff("/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_sub1_3d_size_%i+_power_%.2f+_good.gff" % (size, power, size, power))
        #cluster_3d_dict = OrderedDict({})
        # Per-cluster overlap bookkeeping table.
        # NOTE(review): "interscection" below is a misspelled dict key kept
        # byte-identical -- downstream consumers may depend on it.
        cluster_3d_dict = TwoLvlDict({})
        for cluster_3d in PmCDA1_3d_clusters:
            cluster_3d_dict[cluster_3d.id] = OrderedDict({"length": cluster_3d.len,
                                                          "N of clusters": 0,
                                                          "length of clusters": [],
                                                          "intersection": [],
                                                          "intersection % of main cluster": [],
                                                          "interscection % of clusters": [],