Example no. 1
0
    plt.savefig("heatmap_3+_power_0.05+.svg")
    plt.savefig("heatmap_3+_power_0.05+.eps")
    plt.close()
    """

    for sample_set_name in sample_set_names_list:
        print("Handling %s" % sample_set_name)

        os.chdir(workdir)
        #os.system("mkdir -p %s" % sample_set_name)
        os.chdir(sample_set_name)
        #os.system("mkdir -p %s %s" % (clustering_dir, rainfall_dir))
        #os.system("pwd")
        clusters_dict[sample_set_name] = CollectionCCF(
            from_file=True,
            input_file="%s%s_adjusted_size_3+_power_0.1+.ccf" %
            (clustering_dir, sample_set_name))

        data[sample_set_name] = clusters_dict[sample_set_name].get_data_for_stat(additional_data=("Median", "Power")) \
            if "HAP" in sample_set_name \
            else clusters_dict[sample_set_name].get_data_for_stat(additional_data=("Median", "Power", "Homogeneity"))

        data[sample_set_name] = [
            data[sample_set_name][:, i]
            for i in range(0, 4 if "HAP" in sample_set_name else 5)
        ]

    os.chdir(workdir)
    os.system("mkdir -p %s" % heatmap_dir)
    os.chdir(heatmap_dir)
Example no. 2
0
                   "_adjusted_size_3+_power_0.05+.ccf",
                   "_adjusted_size_3+_power_0.1+.ccf"]

    homogeneity_dir = "homogeneity/"
    y_name = "N of clusters"

    os.system("mkdir -p %s" % homogeneity_dir)

    data = {}
    power_list = ["All", "Power >= 0.05", "Power >= 0.1"]
    for sample in sample_set_names_list:
        os.chdir(workdir + sample + "/" + clustering_dir)
        data[sample] = dict([(key, 0) for key in power_list])
        for suffix, name in zip(suffix_list, power_list):
            print(sample + suffix)
            clusters = CollectionCCF(from_file=True, input_file=sample + suffix)
            data[sample][name] = clusters.get_data_for_stat(additional_data=["Homogeneity", "Median", "Power"])

    parameters_dict = OrderedDict({"Length": 0,
                              "Size": 1,
                              "Homogeneity": 2,
                              "Median": 3,
                              "Power": 4})

    index = 1
    for parameter in parameters_dict:
        os.chdir(workdir + homogeneity_dir)
        plt.figure(index, dpi=150, figsize=(24, 18))
        for j in range(0, len(sample_set_names_list)):
            sample = sample_set_names_list[j]
            for i in range(0, len(power_list)):
Example no. 3
0
                             "HAP",
                             "PmCDA1_sub1_3d",
                             "PmCDA1_6d",
                             "HAP_sub1",
                             "PmCDA1_sub1_6d",

                             ]

samples_dir = workdir + all_files_subdir
power_limits = [f / 100 for f in range(1, 11)]
size_limits = [i for i in range(3, 11)]

os.chdir(workdir)
data_dict = {}
for sample in sample_set_names_list:
    data_dict[sample] = CollectionCCF(from_file=True, input_file=samples_dir + sample + all_files_suffix).get_data_for_stat(additional_data=["Power"])



figure = plt.figure(1, dpi=150, figsize=(18, 12))
for sample in sample_set_names_list:
    size_data = data_dict[sample][:, 1]
    max_size = max(size_data)
    bins_size_data = np.linspace(3, max_size, max_size + 1)
    power_data = data_dict[sample][:, 2]
    max_power = max(power_data)
    n_power_bins = int(max_power / 0.01) + 1
    bins_power_data = np.linspace(0, n_power_bins * 0.01, n_power_bins + 1)
    hist, xedges, yedges = np.histogram2d(size_data, power_data, bins=(bins_size_data, bins_power_data))
    ax = figure.add_subplot(3, 2, sample_set_names_list.index(sample) + 1, projection="3d")
Example no. 4
0
                #print(index)
                if index == 7 or index == 10:
                    index += 1
                    subplot_list.append(None)
                    continue
                file_name = "%s_size_%s+_power_%s+_good.ccf" % (sample_set_name, size, power) if power != "all" \
                            else "%s_size_%s+_good.ccf" % (sample_set_name, size)
                print("Handling %s" % file_name)
                data[sample_set_name] = {}
                if "HAP" in sample_set_name:
                    data_names = ["Size", "Power"]
                else:
                    data_names = ["Size", "Power", "Homogeneity"]

                tmp_data = CollectionCCF(
                    from_file=True, input_file=file_name).get_data_for_stat(
                        additional_data=data_names[1:])
                for data_name in data_names:
                    data[sample_set_name][
                        data_name] = tmp_data[:, value_names_dict[data_name]]
                if index == 1:
                    subplot_list.append(plt.subplot(4, 3, index))
                elif index == 8:
                    subplot_list.append(
                        plt.subplot(4, 3, index, sharex=subplot_list[0]))
                elif index >= 9:
                    subplot_list.append(
                        plt.subplot(4,
                                    3,
                                    index,
                                    sharex=subplot_list[7],
Example no. 5
0
File: VCF.py Project: melakbet/MAVR
    def get_clusters(self,
                     extracting_method="inconsistent",
                     threshold=0.8,
                     cluster_distance='average',
                     dendrogramm_max_y=2000,
                     sample_name=None,
                     save_clustering=False,
                     clustering_dir="clustering",
                     split_by_regions=False,
                     dendrogramm_color_threshold=1000,
                     draw_dendrogramm=True,
                     return_collection=True,
                     write_inconsistent=True,
                     write_correlation=True):
        """Extract flat mutation clusters from per-region hierarchical linkage.

        Reuses the cached ``self.linkage_dict`` when available; otherwise it
        recomputes it via ``self.hierarchical_clustering`` (forwarding the
        clustering/dendrogram parameters).  Flat clusters are then cut from
        each region's linkage matrix with ``fcluster`` (scipy.cluster.hierarchy;
        imported elsewhere in this file) using ``threshold`` and
        ``extracting_method`` as the criterion.

        Parameters
        ----------
        extracting_method : str
            ``criterion`` passed to ``fcluster`` (e.g. "inconsistent",
            "distance").
        threshold : float
            Cut-off value passed to ``fcluster``.
        cluster_distance : str
            Linkage method forwarded to ``self.hierarchical_clustering``
            (only used when the linkage has to be recomputed).
        dendrogramm_max_y, sample_name, save_clustering, clustering_dir,
        dendrogramm_color_threshold, draw_dendrogramm, write_inconsistent,
        write_correlation :
            Forwarded unchanged to ``self.hierarchical_clustering`` when the
            linkage is recomputed.
        split_by_regions : bool
            If True (and ``return_collection`` is True), return an
            OrderedDict mapping region -> CollectionCCF instead of a single
            merged CollectionCCF.
        return_collection : bool
            If False, skip building CCF collections and return the raw
            per-region ``fcluster`` label arrays.

        Returns
        -------
        CollectionCCF, OrderedDict of region -> CollectionCCF, or
        OrderedDict of region -> ndarray of cluster labels, depending on
        ``return_collection`` / ``split_by_regions`` (see above).
        """
        from Parsers.CCF import RecordCCF, CollectionCCF, MetadataCCF, HeaderCCF
        # Reuse the cached linkage if a previous call already computed it;
        # otherwise build both the per-region record dict and linkage dict.
        if self.linkage_dict:
            linkage_dict = self.linkage_dict
        else:
            region_dict, linkage_dict = self.hierarchical_clustering(method=cluster_distance,
                                                                     dendrogramm_max_y=dendrogramm_max_y,
                                                                     sample_name=sample_name,
                                                                     save=save_clustering,
                                                                     clustering_dir=clustering_dir,
                                                                     dendrogramm_color_threshold=dendrogramm_color_threshold,
                                                                     draw_dendrogramm=draw_dendrogramm,
                                                                     write_correlation=write_correlation,
                                                                     write_inconsistent=write_inconsistent)
        # NOTE(review): when the cached-linkage branch is taken, region_dict is
        # never assigned here, yet it is used below when return_collection is
        # True — presumably it is expected to exist from elsewhere; verify.
        if split_by_regions:
            mut_clusters_dict = OrderedDict({})
        else:
            mut_clusters_list = []

        # Cut each region's linkage into flat cluster labels
        # (one integer label per record, aligned with region_dict[region]).
        clusters = OrderedDict()
        for region in linkage_dict:
            clusters[region] = fcluster(linkage_dict[region], threshold, criterion=extracting_method)

        if return_collection:
            for region in region_dict:
                # http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fcluster.html#scipy.cluster.hierarchy.fcluster
                # Group the region's VCF records by their flat cluster label.
                clusters_dict = OrderedDict({})

                for i in range(0, len(clusters[region])):
                    if clusters[region][i] not in clusters_dict:
                        clusters_dict[clusters[region][i]] = [region_dict[region][i]]
                    else:
                        clusters_dict[clusters[region][i]].append(region_dict[region][i])
                if split_by_regions:
                    # One CollectionCCF per region: each cluster's records are
                    # wrapped in an in-memory CollectionVCF, then a RecordCCF.
                    mut_clusters_dict[region] = \
                        CollectionCCF(record_list=[RecordCCF(collection_vcf=CollectionVCF(record_list=clusters_dict[cluster], from_file=False),
                                                             from_records=True) for cluster in clusters_dict],
                                      metadata=MetadataCCF(self.samples, vcf_metadata=self.metadata, vcf_header=self.header),
                                      header=HeaderCCF("CLUSTER_ID\tCHROM\tSTART\tEND\tDESCRIPTION".split("\t")))
                else:
                    # Accumulate RecordCCFs from every region into one flat list.
                    mut_clusters_list += [RecordCCF(collection_vcf=CollectionVCF(record_list=clusters_dict[cluster], from_file=False), from_records=True)
                                          for cluster in clusters_dict]
            if split_by_regions:
                return mut_clusters_dict
            return CollectionCCF(record_list=mut_clusters_list, metadata=MetadataCCF(self.samples, vcf_metadata=self.metadata, vcf_header=self.header),
                                 header=HeaderCCF("CLUSTER_ID\tCHROM\tSTART\tEND\tDESCRIPTION".split("\t")))
        else:
            # Raw label arrays only: region -> fcluster output.
            return clusters
Esempio n. 6
0
    #"AID_6d"
]
power = "0.03"
size = "5"
bin_size = 50
workdir = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%s/%s/" % (
    size, power)
suffix = "_size_%s+_power_%s+_good.ccf" % (size, power)
hist_subfolder = "length_distribution/"
os.chdir(workdir)
os.system("mkdir -p %s" % hist_subfolder)

for extension in ".svg", ".eps", ".png", ".pdf":
    os.system("mkdir -p %s/%s" % (hist_subfolder, extension[1:]))
for sample_set in sample_set_names_list:
    collection = CollectionCCF(from_file=True,
                               input_file=workdir + sample_set + suffix)
    length_data = collection.get_data_for_stat(additional_data=None)[:, 0]
    #print(stat_data)
    print(sample_set)
    #print(length_data)
    total = len(length_data)
    minimum = min(length_data)
    maximum = max(length_data)
    if len(length_data) == 0:
        continue
    plt.figure(1, figsize=(5, 5))
    plt.subplot(1, 1, 1)
    bins = np.linspace(1, 2500, 2500 / bin_size + 1)
    plt.hist(length_data,
             bins=bins,
             label="Min L: %i bp\nMax L: %i bp" % (minimum, maximum))
    #"PmCDA1_sub1_6d",
    #"A1_3d",
    #"A1_6d",
    #"A3G_3d",
    #"AID_3d",
    #"AID_6d"
]
power_limits = [f / 100 for f in range(1, 11)]
size_limits = [i for i in range(3, 11)]

os.chdir(workdir)
for sample_set in sample_set_names_list:
    stat_dict = TwoLvlDict(OrderedDict({}))
    print("Handling %s" % sample_set)
    all_clusters = CollectionCCF(from_file=True,
                                 input_file=workdir + all_files_subdir +
                                 sample_set + all_files_suffix)
    if "HAP" not in sample_set:
        all_clusters.check_strandness()
    for min_size in size_limits:
        stat_dict[min_size] = OrderedDict({})
        os.system("mkdir -p %i %i/all " % (min_size, min_size))
        above_size_clusters, below_size_clusters = all_clusters.filter_by_expression(
            "record.size >= %i" % min_size)
        above_size_clusters.write(
            "%i/all/%s_size_%i+%s" %
            (min_size, sample_set, min_size, all_files_suffix))
        stat_dict[min_size][0.00] = len(above_size_clusters)
        for min_power in power_limits:

            os.system("mkdir -p %i/%.2f" % (min_size, min_power))
        return 0
    start_shift = start1 - start2
    start_coef_shift = 0 if start_shift < 0 else 1
    end_shift = end1 - end2
    end_coef_shift = 0 if end_shift > 0 else 1

    return (end2 - start2 + 1) - start_coef_shift * start_shift + end_coef_shift * end_shift

overlap_clusters_percent = TwoLvlDict({})
#size = 8
#power = 0.05
print([float(f) / float(100) for f in range(1, 11)])
for size in range(3, 11):
    overlap_clusters_percent[size] = {}
    for power in [float(f) / float(100) for f in range(1, 11)]:
        PmCDA1_3d_clusters = CollectionCCF(from_file=True, input_file="/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_3d_size_%i+_power_%.2f+_good.ccf" % (size, power, size, power))

        PmCDA1_3d_sub_clusters = CollectionCCF(from_file=True, input_file="/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_sub1_3d_size_%i+_power_%.2f+_good.ccf" % (size, power, size, power))
        PmCDA1_3d_clusters.write_gff("/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_3d_size_%i+_power_%.2f+_good.gff" % (size, power, size, power))
        PmCDA1_3d_sub_clusters.write_gff("/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_sub1_3d_size_%i+_power_%.2f+_good.gff" % (size, power, size, power))
        #cluster_3d_dict = OrderedDict({})

        cluster_3d_dict = TwoLvlDict({})

        for cluster_3d in PmCDA1_3d_clusters:
            cluster_3d_dict[cluster_3d.id] = OrderedDict({"length": cluster_3d.len,
                                                       "N of clusters": 0,
                                                       "length of clusters": [],
                                                       "intersection": [],
                                                       "intersection % of main cluster": [],
                                                       "interscection % of clusters": [],