Esempio n. 1
0
 def draw_fraction_of_retained_pairs_per_tile_histogram(
         self, output_prefix):
     """Draw a histogram of the per-tile fraction of retained read pairs.

     The values returned by ``self.get_fraction_of_retained_pairs_per_tile()``
     are forwarded to ``MatplotlibRoutines.percent_histogram`` together with
     ``output_prefix`` and a fixed set of plotting options (20 bins,
     ``"fraction"`` input mode, ``png`` and ``svg`` extensions).

     :param output_prefix: prefix handed to the histogram routine for its
         output files.
     """
     retained_fractions = self.get_fraction_of_retained_pairs_per_tile()

     # Fixed presentation settings for this particular plot.
     histogram_options = dict(
         n_bins=20,
         title="Distribution of retained pairs per tile",
         xlabel="Fraction of retained pairs",
         ylabel="Number of tiles",
         label=None,
         extensions=("png", "svg"),
         legend=None,
         legend_location="best",
         input_mode="fraction",
         xmax=None,
         xmin=None,
     )
     MatplotlibRoutines.percent_histogram(retained_fractions,
                                          output_prefix,
                                          **histogram_options)
Esempio n. 2
0
AUGUSTUS.extract_CDS_annotations_from_output(final_gff, final_CDS_gff)

print("Drawing histograms...")

# Plotting options shared by every statistics table below.
support_histogram_options = dict(
    data_type=None,
    column_list=(2, ),
    comments="#",
    n_bins=20,
    title="Transcript support by hints",
    extensions=("png", "svg"),
    legend_location="upper center",
    stats_as_legend=True,
)

# Each statistics file is passed twice: once as the input table and once as
# the output prefix, so the plots are named after the table they come from.
for stat_file in (
        output_evidence_stats,
        output_supported_stats,
        output_swissprot_pfam_or_hints_supported_transcripts_longest_pep_evidence,
        output_swissprot_pfam_and_hints_supported_transcripts_longest_pep_evidence,
        output_swissprot_pfam_or_hints_supported_transcripts_evidence,
        output_swissprot_pfam_and_hints_supported_transcripts_evidence,
):
    MatplotlibRoutines.percent_histogram_from_file(
        stat_file, stat_file, **support_histogram_options)
print("Creating final directories...")
if args.pfam_db and args.swissprot_db:
    db_or_hints_dir = "supported_by_db_or_hints/"
    db_and_hints_dir = "supported_by_db_and_hints/"
    for directory in db_and_hints_dir, db_or_hints_dir:
        FileRoutines.safe_mkdir(directory)

    os.system("mv %s.supported.transcripts.swissprot_or_pfam_or_hints* %s" %
              (args.output, db_or_hints_dir))
    os.system("mv %s.supported.transcripts.swissprot_or_pfam_and_hints* %s" %
Esempio n. 3
0
# Options that were not supplied on the command line default to one ``None``
# entry per input file.
if args.index is None:
    args.index = [None] * len(args.input)
if args.max_value is None:
    args.max_value = [None] * len(args.input)

tetra_histogram_options = {
    "figsize": (10, 10),
    "number_of_bins_list": args.number_of_bins,
    "width_of_bins_list": args.width_of_bins,
    "max_threshold_list": args.max_value,
    "min_threshold_list": args.min_value,
    "xlabel": args.xlabel,
    "ylabel": args.ylabel,
    "title_list": args.title_list,
    "logbase": args.logbase,
    "label_list": None,
    "extensions": args.extensions,
    "suptitle": None,
    "separator": args.separator,
    "share_y_axis": args.share_y_axis,
    "share_x_axis": args.share_x_axis,
}
MatplotlibRoutines.draw_tetra_histogram_with_two_logscaled_from_file(
    args.input, args.index, args.output_prefix, **tetra_histogram_options)
"""
Example:
~/Dropbox/MAVR/scripts/draw/draw_tetra_histogram_with_two_logscaled.py -i kirill.dn.ds.w.tab,solenodon.raw_alns.all.tab -d 3,3 -o dnds.ratio.log  -l 'dN/dS' -y "Number of genes" -w 20 -n 0 -x 999 -t "11 species,4 species"
"""
"""
if (args.number_of_bins is not None) and (args.width_of_bins is not None):
Esempio n. 4
0
    def handle_sanger_data(self,
                           input_dir,
                           output_prefix,
                           outdir=None,
                           read_subfolders=False,
                           min_mean_qual=0,
                           min_median_qual=0,
                           min_len=50):
        """Convert, trim, filter and summarise a directory of Sanger traces.

        Every file found under ``input_dir`` that ``self.is_sanger_file``
        accepts is parsed as ABI, written out as per-record raw FASTQ/FASTA,
        trimmed with ``SeqIO.AbiIO._abi_trim`` and, when it passes the
        filters, written out as per-record trimmed FASTQ/FASTA. A per-base
        quality bar plot is drawn for every raw record and for every retained
        trimmed record. Finally merged raw/trimmed FASTQ/FASTA files are
        written into ``self.workdir`` and the excluded ids and the statistics
        table are written to ``<output_prefix>.excluded.ids`` and
        ``<output_prefix>.stats``.

        A record is retained when its trimmed length is at least ``min_len``
        and its trimmed median/mean qualities reach the corresponding
        thresholds. A falsy threshold (``0`` or ``None``) disables that check.

        :param input_dir: directory searched for Sanger trace files.
        :param output_prefix: prefix of merged files, id list and stats table.
        :param outdir: if truthy, replaces ``self.workdir`` before directories
            are initialised.
        :param read_subfolders: passed as ``recursive`` to the file search.
        :param min_mean_qual: minimum mean phred quality of the trimmed record.
        :param min_median_qual: minimum median phred quality of the trimmed
            record.
        :param min_len: minimum length of the trimmed record.
        """
        if outdir:
            self.workdir = outdir

        self.init_dirs()

        sanger_filelist = self.make_list_of_path_to_files(
            input_dir,
            expression=self.is_sanger_file,
            recursive=read_subfolders,
            return_absolute_paths=True)
        stat_dict = TwoLvlDict()
        record_dict = OrderedDict()
        trimmed_record_dict = OrderedDict()
        excluded_list = IdList()
        excluded_counter = 0
        low_quality_counter = 0
        too_short_counter = 0

        merged_raw_fastq = "%s/%s.raw.fastq" % (self.workdir, output_prefix)
        merged_raw_fasta = "%s/%s.raw.fasta" % (self.workdir, output_prefix)
        merged_trimmed_fastq = "%s/%s.trimmed.fastq" % (self.workdir,
                                                        output_prefix)
        merged_trimmed_fasta = "%s/%s.trimmed.fasta" % (self.workdir,
                                                        output_prefix)

        for filename in sanger_filelist:
            # The second element of split_filename()'s result names all
            # per-record output files.
            name_part = self.split_filename(filename)[1]

            record_raw_fastq = "%s/fastq/raw/%s.raw.fastq" % (self.workdir,
                                                              name_part)
            record_raw_fasta = "%s/fasta/raw/%s.raw.fasta" % (self.workdir,
                                                              name_part)
            record_raw_qual_plot_prefix = "%s/qual_plot/raw/%s.raw.qual" % (
                self.workdir, name_part)

            record_trimmed_fastq = "%s/fastq/trimmed/%s.trimmed.fastq" % (
                self.workdir, name_part)
            record_trimmed_fasta = "%s/fasta/trimmed/%s.trimmed.fasta" % (
                self.workdir, name_part)
            record_trimmed_qual_plot_prefix = "%s/qual_plot/trimmed/%s.trimmed.qual" % (
                self.workdir, name_part)

            # Close the trace file explicitly: one handle per input file was
            # previously left open until garbage collection.
            abi_handle = self.metaopen(filename, "rb")
            try:
                record = SeqIO.read(abi_handle, format="abi")
            finally:
                abi_handle.close()

            record_dict[record.id] = record
            SeqIO.write(record, record_raw_fastq, format="fastq")
            SeqIO.write(record, record_raw_fasta, format="fasta")

            trimmed_record = SeqIO.AbiIO._abi_trim(record)

            raw_quality = record.letter_annotations["phred_quality"]
            trimmed_quality = trimmed_record.letter_annotations["phred_quality"]
            trimmed_len = len(trimmed_record)
            trimmed_mean_qual = np.mean(trimmed_quality)
            trimmed_median_qual = np.median(trimmed_quality)

            # Each quality threshold is applied independently, so a mean
            # threshold is honoured even when no median threshold is given.
            long_enough = trimmed_len >= min_len
            median_ok = (not min_median_qual) or (trimmed_median_qual >=
                                                  min_median_qual)
            mean_ok = (not min_mean_qual) or (trimmed_mean_qual >=
                                              min_mean_qual)
            retained = long_enough and median_ok and mean_ok

            stat_dict[record.id] = OrderedDict({
                "raw_len":
                len(record),
                "raw_mean_qual":
                np.mean(raw_quality),
                "raw_median_qual":
                np.median(raw_quality),
                "trimmed_len":
                trimmed_len,
                "trimmed_mean_qual":
                trimmed_mean_qual,
                "trimmed_median_qual":
                trimmed_median_qual,
                "retained":
                "+" if retained else "-",
            })

            # NOTE(review): the keyword is spelled "extentions" here; confirm
            # it matches the signature of MatplotlibRoutines.draw_bar_plot.
            MatplotlibRoutines.draw_bar_plot(
                raw_quality,
                record_raw_qual_plot_prefix,
                extentions=["png"],
                xlabel="Position",
                ylabel="Phred quality",
                title="Per base quality",
                min_value=None,
                max_value=None,
                new_figure=True,
                figsize=(3 * (int(len(record) / 100) + 1), 3),
                close_figure=True)

            if not retained:
                # Length is checked first, so a record that is both short and
                # of low quality is counted as too short only.
                if not long_enough:
                    too_short_counter += 1
                else:
                    low_quality_counter += 1
                excluded_counter += 1
                excluded_list.append(record.id)
                continue

            SeqIO.write(trimmed_record, record_trimmed_fastq, format="fastq")
            SeqIO.write(trimmed_record, record_trimmed_fasta, format="fasta")

            # The figure width is derived from the raw record length, as for
            # the raw plot, so both plots of a record share the same width.
            MatplotlibRoutines.draw_bar_plot(
                trimmed_quality,
                record_trimmed_qual_plot_prefix,
                extentions=["png"],
                xlabel="Position",
                ylabel="Phred quality",
                title="Per base quality",
                min_value=None,
                max_value=None,
                new_figure=True,
                figsize=(3 * (int(len(record) / 100) + 1), 3),
                close_figure=True)

            trimmed_record_dict[record.id] = trimmed_record

        SeqIO.write(self.record_from_dict_generator(record_dict),
                    merged_raw_fastq,
                    format="fastq")
        SeqIO.write(self.record_from_dict_generator(record_dict),
                    merged_raw_fasta,
                    format="fasta")

        SeqIO.write(self.record_from_dict_generator(trimmed_record_dict),
                    merged_trimmed_fastq,
                    format="fastq")
        SeqIO.write(self.record_from_dict_generator(trimmed_record_dict),
                    merged_trimmed_fasta,
                    format="fasta")

        excluded_list.write("%s.excluded.ids" % output_prefix)
        stat_dict.write(out_filename="%s.stats" % output_prefix)

        print("Excluded: %i" % excluded_counter)
        print("\tToo short( < %i ): %i" % (min_len, too_short_counter))
        print("\tLow quality( median < %i or mean < %i ): %i" %
              (min_median_qual, min_mean_qual, low_quality_counter))
Esempio n. 5
0
    "-e",
    "--extensions",
    action="store",
    dest="extensions",
    type=lambda x: x.split(","),
    default=["png", "svg"],
    help="Comma-separated list of extensions for histogram files")
"""
parser.add_argument("-l", "--xlabel", action="store", dest="xlabel",
                    help="X label")
parser.add_argument("-y", "--ylabel", action="store", dest="ylabel",
                    help="Y label")
"""
parser.add_argument("-t", "--title", action="store", dest="title",
                    help="Title of histogram")

args = parser.parse_args()

# The first two id files are mandatory positionals of the drawing routine;
# the third set and all presentation settings go in as keyword options.
venn_options = {
    "set3_file": args.id_file_c,
    "set_labels": args.set_labels,
    "set_colors": args.set_colors,
    "output_prefix": args.output_prefix,
    "extensions": args.extensions,
    "title": args.title,
}
MatplotlibRoutines.venn_diagram_from_sets_from_files(args.id_file_a,
                                                     args.id_file_b,
                                                     **venn_options)
Esempio n. 6
0
                    dest="ylabel",
                    help="Y label")
parser.add_argument("-t", "--title", action="store", dest="title",
                    help="Title of histogram")

args = parser.parse_args()

# Command-line options are forwarded unchanged to the histogram routine.
histogram_options = {
    "number_of_bins": args.number_of_bins,
    "width_of_bins": args.width_of_bins,
    "separator": args.separator,
    "max_length": args.max_length,
    "min_length": args.min_length,
    "xlabel": args.xlabel,
    "ylabel": args.ylabel,
    "title": args.title,
    "extensions": args.extensions,
    "logbase": args.logbase,
}
MatplotlibRoutines.draw_histogram_from_file(args.input_file,
                                            args.output_prefix,
                                            **histogram_options)
"""
if (args.number_of_bins is not None) and (args.width_of_bins is not None):
    raise AttributeError("Options -w/--width_of_bins and -b/--number_of_bins mustn't be set simultaneously")

lengths = np.fromfile(args.input_file, sep=args.separator)

max_len = max(lengths)

if args.max_length is None:
    args.max_length = max_len
Esempio n. 7
0
    def draw_general_stats_distributions(self, output_prefix, figsize=(12, 6), extensions=("png", "svg"), dpi=300,
                                         logscale_heatmaps=True, close_figure=False):
        """Draw a 2x4 summary figure of ``self.general_stats_table``.

        The panels are filled by ``MatplotlibRoutines`` helpers:

        * column 0 of the grid: histogram of table column 0 (linear on top,
          log10 y-scale below);
        * grid cell [0, 1]: percent heatmap of table column 3 vs column 2, both
          divided by column 1 (labelled max/min sequence length as % of
          alignment);
        * remaining cells: heatmaps of table column 0 against table columns
          13, 5, 7, 9 and 11 multiplied by 100.

        The finished figure is saved as ``<output_prefix>.<ext>`` for every
        entry of ``extensions``.

        :param output_prefix: prefix of the figure and of the per-panel files
            written by the helpers.
        :param figsize: size of the combined figure (also forwarded to the
            per-panel heatmap helper).
        :param extensions: file extensions of the combined figure.
        :param dpi: resolution of the combined figure.
        :param logscale_heatmaps: forwarded as ``logscaled`` to the heatmaps.
        :param close_figure: if true, close the figure after saving so that
            repeated calls do not accumulate open pyplot figures. Defaults to
            ``False``, which leaves the figure open as before.
        """
        nrows = 2
        ncols = 4

        percent_histogram_bin_number = 20

        seq_numbers = self.general_stats_table[:, 0]
        min_seq_number_in_alignment = min(seq_numbers)
        max_seq_number_in_alignment = max(seq_numbers)
        # One bin per sequence count.
        # NOTE(review): this is 0 when every alignment has the same number of
        # sequences; confirm draw_heatmap tolerates a zero bin number.
        seq_bin_number = max_seq_number_in_alignment - min_seq_number_in_alignment

        figure, ax_array = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize, squeeze=False, dpi=dpi)

        # Histogram: distribution of sequence number in alignment
        print("Histogram: distribution of sequence number in alignment")
        MatplotlibRoutines.draw_histogram(seq_numbers,
                                          output_prefix="%s.seq_number_distibution" % output_prefix,
                                          number_of_bins=None, width_of_bins=1,
                                          max_threshold=None, min_threshold=None, xlabel=None, ylabel="N of alignments",
                                          title="Distribution of\nsequence numbers",
                                          extensions=("png",), ylogbase=None, subplot=ax_array[0, 0], suptitle=None,
                                          close_figure=False, save_histovalues_only=True)

        # Histogram(logscaled): distribution of sequence number in alignment
        print("Histogram(logscaled): distribution of sequence number in alignment")
        MatplotlibRoutines.draw_histogram(seq_numbers, output_prefix=None,
                                          number_of_bins=None, width_of_bins=1,
                                          max_threshold=None, min_threshold=None,
                                          xlabel="N of sequences\nin alignment", ylabel="N of alignments",
                                          title=None,
                                          extensions=("png",), ylogbase=10, subplot=ax_array[1, 0], suptitle=None,
                                          close_figure=False)

        # Heatmap: x: max_seq_len/aln_len, y: min_seq_len/aln_len
        print("Heatmap: x: max_seq_len/aln_len, y: min_seq_len/aln_len")
        aln_len = self.general_stats_table[:, 1].astype(float)
        MatplotlibRoutines.draw_percent_heatmap(self.general_stats_table[:, 3].astype(float) / aln_len * 100,
                                                self.general_stats_table[:, 2].astype(float) / aln_len * 100,
                                                output_prefix="%s.min_max_seq_len" % output_prefix, xlabel="Max seq len, % of aln",
                                                ylabel="Min seq len, % of aln", title=None,
                                                figsize=(8, 8), minimum_counts_to_show=1,
                                                extensions=("png", "svg"), show_colorbar=True,
                                                bin_number=percent_histogram_bin_number, bin_width=None, bin_array=None,
                                                type="percent", add_max_value=True,
                                                subplot=ax_array[0, 1],
                                                header="#left_xedge\tleft_yedge\tvalue",
                                                save_histovalues_only=True,
                                                logscaled=logscale_heatmaps)

        # The five "sequence number vs. unique feature" heatmaps differ only in
        # the table column, file suffix, y label and grid cell:
        # (progress message, table column, output suffix, y label, grid row, grid column)
        heatmap_specs = (
            ("Heatmap: x: seq number, y: max_unique_pos_in_seq/aln_len",
             13, "max_unique_pos_seq_number", "Max uniq positions, % of seq", 1, 1),
            ("Heatmap: x: seq number, y: max_unique_insertions_in_seq/aln_len",
             5, "max_unique_insertions_seq_number", "Max uniq insertions, % of seq", 0, 2),
            ("Heatmap: x: seq number, y: max_unique_gaps_in_seq/aln_len",
             7, "max_unique_gaps_seq_number", "Max uniq gaps, % of seq", 1, 2),
            ("Heatmap: x: seq number, y: max_unique_leading_pos_in_seq/aln_len",
             9, "max_unique_leading_pos_seq_number", "Max uniq leading pos, % of seq", 0, 3),
            ("Heatmap: x: seq number, y: max_unique_trailing_pos_in_seq/aln_len",
             11, "max_unique_trailing_pos_seq_number", "Max uniq trailing pos, % of seq", 1, 3),
        )

        for message, column, suffix, ylabel, grid_row, grid_col in heatmap_specs:
            print(message)
            MatplotlibRoutines.draw_heatmap(seq_numbers, 100 * self.general_stats_table[:, column],
                                            output_prefix="%s.%s" % (output_prefix, suffix),
                                            # only the bottom row carries the x label
                                            xlabel="N of sequences\nin alignment" if grid_row == nrows - 1 else None,
                                            ylabel=ylabel, title=None,
                                            figsize=figsize, minimum_counts_to_show=1,
                                            extensions=extensions, show_colorbar=True,
                                            bin_number=(seq_bin_number, percent_histogram_bin_number),
                                            bin_width=None, bin_array=None, min_x_value=min_seq_number_in_alignment,
                                            max_x_value=max_seq_number_in_alignment, min_y_value=0, max_y_value=100,
                                            add_max_value=True, subplot=ax_array[grid_row, grid_col],
                                            save_histovalues_only=True, header="#left_xedge\tleft_yedge\tcounts",
                                            logscaled=logscale_heatmaps)

        # Work on the figure created above rather than on pyplot's notion of
        # the "current" figure, which a helper could have changed.
        figure.tight_layout()

        for ext in extensions:
            figure.savefig("%s.%s" % (output_prefix, ext))

        if close_figure:
            plt.close(figure)
Esempio n. 8
0
    "-a",
    "--legend_location",
    action="store",
    dest="legend_location",
    default="upper center",
    help="Location of legend on histogram. Default - 'upper center'")
parser.add_argument("-m", "--input_mode", action="store", dest="input_mode",
                    default="percent",
                    help="Type of input data. Allowed: fraction, percent. Default - percent")
args = parser.parse_args()

# All options come straight from the command line, except that summary
# statistics are always requested as the legend.
percent_histogram_options = {
    "data_type": args.data_type,
    "column_list": args.columns_list,
    "separator": args.separator,
    "comments": args.comments_prefix,
    "n_bins": args.number_of_bins,
    "title": args.title,
    "xlabel": args.xlabel,
    "ylabel": args.ylabel,
    "extensions": args.extensions,
    "legend_location": args.legend_location,
    "stats_as_legend": True,
    "input_mode": args.input_mode,
}
MatplotlibRoutines.percent_histogram_from_file(args.input_file,
                                               args.output_prefix,
                                               **percent_histogram_options)
Esempio n. 9
0
# Count unique positions per sequence for every alignment; results are keyed
# by the second element of split_filename()'s result.
for alignment_file in args.input:
    alignment_name_list = FileRoutines.split_filename(alignment_file)
    alignment_name = alignment_name_list[1]
    output_prefix = "%s/%s.unique_positions" % (args.output_dir, alignment_name)

    unique_position_dict[alignment_name] = MultipleAlignmentRoutines.count_unique_positions_per_sequence_from_file(
        alignment_file,
        output_prefix,
        format=args.format,
        gap_symbol="-",
        return_mode="relative",
        verbose=False)

species_list = unique_position_dict.sl_keys()

# Regroup the per-alignment values so that each species gets one list holding
# its value from every alignment, in alignment order.
data_dict = OrderedDict(
    (species, [unique_position_dict[alignment][species]
               for alignment in unique_position_dict])
    for species in species_list)

data_list = list(data_dict.values())

MatplotlibRoutines.extended_percent_histogram(data_list,
                                              args.histogram_output,
                                              input_mode="percent",
                                              label=species_list)
Esempio n. 10
0
    def test_roh_parameters(self,
                            output_dir,
                            output_prefix,
                            input_vcf_file,
                            allow_noncanonical_chromosome_names=True,
                            keep_autoconverted_files=None,
                            window_length_in_kb=None,
                            min_homozygous_snps_per_window=(2, 51, 1),
                            min_homozygous_snps_in_roh=(2, 101, 1),
                            max_heterozygous_snps_per_window=(1, 11, 1),
                            max_heterozygous_snps=(1, 21, 1),
                            max_inverse_density_of_homozygous_snps_in_kb_per_snp=(50, 1000, 50),
                            ):
        """Scan a grid of ROH-calling parameters and plot the ROH counts.

        Each of the five tuple parameters is expanded with ``range(*value)``,
        i.e. it is read as ``(start, stop, step)``. For every combination of
        values ``self.find_runs_of_homozygosity`` is run in its own
        subdirectory of ``output_dir`` (with a ``DESCRIPTION`` file listing the
        parameters), the resulting ``.hom`` report is loaded as a
        ``PLINKReport`` and its length is stored as the ROH count.

        For every value of the inverse-density parameter one figure is saved
        to ``<output_dir>/pic/<value>.png``. It holds a grid of annotated
        heatmaps with one row per ``min_homozygous_snps_in_roh`` value and one
        column per ``max_heterozygous_snps`` value; each heatmap shows the
        counts over ``min_homozygous_snps_per_window`` x
        ``max_heterozygous_snps_per_window``.

        :param output_dir: directory that receives all runs and the figures.
        :param output_prefix: prefix of the files produced by each run.
        :param input_vcf_file: forwarded to ``find_runs_of_homozygosity``.
        :param allow_noncanonical_chromosome_names: forwarded unchanged.
        :param keep_autoconverted_files: forwarded unchanged.
        :param window_length_in_kb: forwarded unchanged.
        :param min_homozygous_snps_per_window: ``(start, stop, step)`` grid.
        :param min_homozygous_snps_in_roh: ``(start, stop, step)`` grid.
        :param max_heterozygous_snps_per_window: ``(start, stop, step)`` grid.
        :param max_heterozygous_snps: ``(start, stop, step)`` grid.
        :param max_inverse_density_of_homozygous_snps_in_kb_per_snp:
            ``(start, stop, step)`` grid.
        """
        self.safe_mkdir(output_dir)

        i_ticks = range(*min_homozygous_snps_per_window)
        j_ticks = range(*max_heterozygous_snps_per_window)
        k_ticks = range(*min_homozygous_snps_in_roh)
        l_ticks = range(*max_heterozygous_snps)
        m_ticks = range(*max_inverse_density_of_homozygous_snps_in_kb_per_snp)

        plink_report_dict = OrderedDict()
        # Axis order of the count array: (i, j, k, l, m).
        roh_count_array = np.zeros((len(i_ticks),
                                    len(j_ticks),
                                    len(k_ticks),
                                    len(l_ticks),
                                    len(m_ticks),
                                    ), dtype=int)

        # enumerate() supplies the array index alongside each parameter value.
        for i_index, i in enumerate(i_ticks):
            reports_i = plink_report_dict[i] = OrderedDict()
            for j_index, j in enumerate(j_ticks):
                reports_ij = reports_i[j] = OrderedDict()
                for k_index, k in enumerate(k_ticks):
                    reports_ijk = reports_ij[k] = OrderedDict()
                    for l_index, l in enumerate(l_ticks):
                        reports_ijkl = reports_ijk[l] = OrderedDict()
                        for m_index, m in enumerate(m_ticks):
                            dir_name = "%s/%i_%i_%i_%i_%i/" % (output_dir, i, j, k, l, m)
                            description_text = "Minimum homozygous SNPs per window:\t%i\n" % i
                            description_text += "Minimum homozygous SNPs in ROh:\t%i\n" % k
                            description_text += "Maximum heterozygous SNPs per window:\t%i\n" % j
                            description_text += "Max heterozygous SNPs:\t%i\n" % l
                            description_text += "Max inverse density of homozygous SNPs(kb/SNP):\t%i\n" % m
                            self.safe_mkdir(dir_name, description_text=description_text, description_filename="DESCRIPTION")
                            self.find_runs_of_homozygosity("%s/%s" % (dir_name, output_prefix),
                                                           input_vcf_file=input_vcf_file,
                                                           allow_noncanonical_chromosome_names=allow_noncanonical_chromosome_names,
                                                           keep_autoconverted_files=keep_autoconverted_files,
                                                           roh_calling_method=None,
                                                           window_length_in_kb=window_length_in_kb,
                                                           min_homozygous_snps_per_window=i,
                                                           max_heterozygous_snps_per_window=j,
                                                           max_missing_snps_per_window=None,
                                                           max_inverse_density_of_homozygous_snps_in_kb_per_snp=m,
                                                           max_internal_gap_in_kb=None,
                                                           min_roh_length=None,
                                                           min_homozygous_snps_in_roh=k,
                                                           min_scanning_window_hit_rate=None,
                                                           generate_overlapping_segments=False,
                                                           max_heterozygous_snps=l,
                                                           min_concordance_across_jointly_homozygous_variants=None,
                                                           homozygous_verbose=False)
                            report = PLINKReport("%s/%s.hom" % (dir_name, output_prefix),
                                                 report_type="ROH")
                            reports_ijkl[m] = report
                            roh_count_array[i_index, j_index, k_index, l_index, m_index] = len(report)

        figure_dir = "%s/pic/" % output_dir
        self.safe_mkdir(figure_dir)

        num_k_ticks = len(k_ticks)
        num_l_ticks = len(l_ticks)

        for m_index, m in enumerate(m_ticks):
            # squeeze=False guarantees a 2-D Axes array even for a single row
            # or column, so [k_index, l_index] is always valid.
            figure, subplot_array = plt.subplots(num_k_ticks, num_l_ticks, sharex=True, sharey=True,
                                                 squeeze=False)
            plt.suptitle("Number of ROH depending on several parameters")
            for k_index in range(num_k_ticks):
                for l_index in range(num_l_ticks):
                    # Slice by array index; the tick value m may lie far
                    # outside the bounds of the last axis.
                    roh_counts = roh_count_array[:, :, k_index, l_index, m_index]

                    # Column titles on the top row only, x labels on the
                    # bottom row only.
                    title = "Max heterozygous SNPs: %i" % l_ticks[l_index] if k_index == 0 else None
                    xlabel = "Min homozygous SNPs per window" if k_index == num_k_ticks - 1 else None
                    # NOTE(review): the original computed a y label for the
                    # first column but never passed it; ylabel stays None.
                    MatplotlibRoutines.annotated_heatmap(roh_counts, i_ticks, j_ticks,
                                                         subplot=subplot_array[k_index, l_index],
                                                         title=title, xlabel=xlabel, ylabel=None)

            # The format arguments must be a tuple; previously m was passed to
            # savefig() as a second positional argument and "%" raised.
            plt.savefig("%s/%i.png" % (figure_dir, m))
            # One figure per m value: release it before creating the next one.
            plt.close(figure)
Esempio n. 11
0
                    dest="ylabel",
                    help="Y label")
parser.add_argument("-t", "--title", action="store", dest="title",
                    help="Title of histogram")

args = parser.parse_args()

# Column selection, labels, binning and axis limits are taken from the
# command line; the figure size and add_max_value are fixed here.
heatmap_options = {
    "x_column": args.x_col,
    "y_column": args.y_col,
    "xlabel": args.xlabel,
    "ylabel": args.ylabel,
    "title": args.title,
    "figsize": (8, 8),
    "minimum_counts_to_show": args.min_counts_to_show,
    "extensions": args.extensions,
    # the command-line flag is negative ("remove"), the routine's is positive
    "show_colorbar": not args.remove_colorbar,
    "bin_number": args.number_of_bins,
    "bin_width": args.width_of_bins,
    "bin_array": args.array_of_bins,
    "min_x_value": args.min_x,
    "max_x_value": args.max_x,
    "min_y_value": args.min_y,
    "max_y_value": args.max_y,
    "add_max_value": True,
}
MatplotlibRoutines.draw_heatmap_from_file(args.input_file,
                                          args.output_prefix,
                                          **heatmap_options)
Esempio n. 12
0
    def extract_proteins_from_output(self,
                                     augustus_output,
                                     protein_output,
                                     evidence_stats_file=None,
                                     supported_by_hints_file=None,
                                     complete_proteins_id_file=None,
                                     id_prefix="p."):
        """
        Extract protein sequences from an AUGUSTUS output file into a FASTA file.

        The input is scanned line by line. For every "# protein sequence" record a
        FASTA entry named "<id_prefix><transcript_id>" is written; its header also
        carries the gene id and whether start_codon/stop_codon features were seen
        for that transcript.

        :param augustus_output: path to the AUGUSTUS output to parse
        :param protein_output: path of the FASTA file to write
        :param evidence_stats_file: optional path of a tab-separated table with the
            hint support of every transcript; when given, the longest isoforms are
            extracted from it and support histograms are drawn afterwards
        :param supported_by_hints_file: optional path of the same table restricted
            to transcripts with supported fraction > 0
        :param complete_proteins_id_file: optional path of a file listing ids of
            proteins that have both a start and a stop codon
        :param id_prefix: prefix prepended to transcript ids to form protein ids
        :return: None
        """
        stats_header = "#gene_id\ttranscript_id\tsupported_fraction\tcds_support\tintron_support\t" \
                       "5'UTR_support\t3'UTR_support\tincompatible_hints_groups\tprotein_length\n"
        stats_row = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%i\n"

        ev_fd = None
        sup_fd = None
        complete_fd = None

        try:
            if evidence_stats_file:
                ev_fd = open(evidence_stats_file, "w")
                ev_fd.write(stats_header)

            # Each optional output is opened based on its own argument.
            if supported_by_hints_file:
                sup_fd = open(supported_by_hints_file, "w")
                sup_fd.write(stats_header)

            if complete_proteins_id_file:
                complete_fd = open(complete_proteins_id_file, "w")

            with open(protein_output, "w") as out_fd, open(augustus_output, "r") as in_fd:
                for line in in_fd:
                    if line[:12] == "# start gene":
                        gene = line.strip().split()[-1]
                    elif "\ttranscript\t" in line:
                        # the id is the value of the first attribute in column 9
                        transcript_id = line.split("\t")[8].split(";")[0].split("=")[1]
                        start_presence = False
                        stop_presence = False
                    elif "\tstart_codon\t" in line:
                        start_presence = True
                    elif "\tstop_codon\t" in line:
                        stop_presence = True
                    elif "# protein sequence" in line:
                        protein = line.strip().split("[")[-1]
                        if "]" in protein:
                            protein = protein.split("]")[0]
                        else:
                            # The sequence continues on following lines until "]".
                            while True:
                                next_line = in_fd.readline()
                                if not next_line:
                                    # truncated input: keep what was read so far
                                    break
                                fields = next_line.split()
                                if not fields:
                                    continue
                                part = fields[-1]
                                if "]" in part:
                                    protein += part.split("]")[0]
                                    break
                                protein += part

                        if complete_fd is not None and start_presence and stop_presence:
                            complete_fd.write("%s%s\n" % (id_prefix, transcript_id))

                        out_fd.write(">%s%s\t gene=%s start_presence=%s stop_presence=%s\n"
                                     % (id_prefix, transcript_id, gene,
                                        str(start_presence), str(stop_presence)))
                        out_fd.write(protein)
                        out_fd.write("\n")
                        protein_len = len(protein)

                    elif ev_fd is not None or sup_fd is not None:
                        if line[:17] != "# % of transcript":
                            continue
                        supported_fraction = line.strip().split()[-1]
                        # Consume the evidence block; the "incompatible hint groups"
                        # line is treated as its last line and triggers the output.
                        while True:
                            tmp_line = in_fd.readline()
                            if not tmp_line:
                                # end of file inside an evidence block: stop reading
                                break
                            if tmp_line.startswith("# CDS exons:"):
                                cds_support = tmp_line.strip().split()[-1]
                            elif tmp_line.startswith("# CDS introns:"):
                                introns_support = tmp_line.strip().split()[-1]
                            elif tmp_line.startswith("# 5'UTR exons"):
                                five_utr_support = tmp_line.strip().split()[-1]
                            elif tmp_line.startswith("# 3'UTR exons"):
                                three_utr_support = tmp_line.strip().split()[-1]
                            elif tmp_line.startswith("# incompatible hint groups:"):
                                incompatible_hint_groups = tmp_line.strip().split()[-1]
                                row = stats_row % (gene, transcript_id, supported_fraction,
                                                   cds_support, introns_support,
                                                   five_utr_support, three_utr_support,
                                                   incompatible_hint_groups, protein_len)
                                if ev_fd is not None:
                                    ev_fd.write(row)
                                if sup_fd is not None and float(supported_fraction) > 0:
                                    sup_fd.write(row)
                                break
        finally:
            # Close (and thereby flush) every optional output before it is reused below.
            for opened_fd in (ev_fd, sup_fd, complete_fd):
                if opened_fd is not None:
                    opened_fd.close()

        # All post-processing reads the evidence table, so it is skipped without one.
        if not evidence_stats_file:
            return

        self.extract_longest_isoforms(evidence_stats_file,
                                      "%s.longest_pep" % evidence_stats_file,
                                      minimum_supported_fraction=0)
        SequenceRoutines.extract_sequence_by_ids(
            protein_output, "%s.longest_pep.ids" % evidence_stats_file,
            "%s.longest_pep.pep" % evidence_stats_file)

        evidence_files = [evidence_stats_file]

        if supported_by_hints_file:
            supported_by_hints_longest_pep_evidence = "%s.longest_pep" % supported_by_hints_file
            supported_by_hints_longest_pep = "%s.longest_pep.pep" % supported_by_hints_file
            supported_by_hints_longest_pep_ids = "%s.longest_pep.ids" % supported_by_hints_file
            # Supported isoforms are selected from the full evidence table using a
            # small positive threshold on the supported fraction.
            self.extract_longest_isoforms(
                evidence_stats_file,
                supported_by_hints_longest_pep_evidence,
                minimum_supported_fraction=0.00001)
            SequenceRoutines.extract_sequence_by_ids(
                protein_output, supported_by_hints_longest_pep_ids,
                supported_by_hints_longest_pep)

            evidence_files.append("%s.longest_pep" % evidence_stats_file)
            evidence_files.append(supported_by_hints_longest_pep_evidence)

        for evidence_file in evidence_files:
            print("Drawing transcript support distribution for %s" %
                  evidence_file)
            MatplotlibRoutines.percent_histogram_from_file(
                evidence_file,
                evidence_file,
                column_list=(2, ),
                separator=None,
                comments="#",
                n_bins=20,
                title="Transcript support by hints",
                xlabel="%%",
                ylabel="Number",
                extensions=["svg", "png"],
                legend_location="upper center",
                stats_as_legend=True)
Esempio n. 13
0
    def compare_multiple_genome_results(self, busco_file_list, output_prefix, label_list=None,
                                        black_scaffold_list=(), white_scaffold_list=()):
        """
        Compare BUSCO results of several assemblies/genomes.

        For every ordered pair of different inputs and every pair of statuses from
        self.status_list the shared gene ids are written to
        "<dir>/pairwise_overlaps/" and the overlap counts to
        "<dir>/pairwise_overlap_counts/", where <dir> is the directory part of
        output_prefix ("." if there is none). When two or three inputs are given,
        a 2x2 grid of Venn diagrams (one per status) is saved to
        "<output_prefix>.venn.png".

        :param busco_file_list: paths to BUSCO tables, one per assembly/genome
        :param output_prefix: prefix (optionally with a directory) for output files
        :param label_list: labels for the inputs; defaults to A1, A2, ...
        :param black_scaffold_list: passed to BUSCOtable as black_list
        :param white_scaffold_list: passed to BUSCOtable as white_list
        :return: None
        """
        import os

        busco_table_dict = OrderedDict()
        gene_id_dict = OrderedDict()
        counts_dict = OrderedDict()

        output_dir = self.split_filename(output_prefix)[0] or "."
        # Files placed inside the subdirectories must be named after the prefix
        # without its directory part, otherwise the directory would be repeated
        # inside the subdirectory path.
        output_basename = os.path.basename(output_prefix)

        pairwise_overlaps_dir = "%s/pairwise_overlaps/" % output_dir
        pairwise_overlap_counts_dir = "%s/pairwise_overlap_counts/" % output_dir
        self.safe_mkdir(pairwise_overlaps_dir)
        self.safe_mkdir(pairwise_overlap_counts_dir)

        labels = label_list if label_list else ["A%i" % i for i in range(1, len(busco_file_list) + 1)]

        for busco_table, label in zip(busco_file_list, labels):
            busco_table_dict[label] = BUSCOtable(in_file=busco_table, black_list=black_scaffold_list,
                                                 white_list=white_scaffold_list)
            gene_id_dict[label], counts_dict[label] = busco_table_dict[label].count_statuses()

        # TODO: draw piecharts

        pairwise_overlap_dict = OrderedDict()
        count_pairwise_overlap_dict = OrderedDict()
        for label1 in labels:
            for label2 in labels:
                if label1 == label2:
                    continue
                overlap_id = "%s_vs_%s" % (label1, label2)
                pairwise_overlap_dict[overlap_id] = TwoLvlDict()
                count_pairwise_overlap_dict[overlap_id] = TwoLvlDict()
                for status1 in self.status_list:
                    key1 = "%s@%s" % (label1, status1)
                    pairwise_overlap_dict[overlap_id][key1] = OrderedDict()
                    count_pairwise_overlap_dict[overlap_id][key1] = OrderedDict()
                    for status2 in self.status_list:
                        key2 = "%s@%s" % (label2, status2)
                        shared_ids = IdSet(gene_id_dict[label1][status1] & gene_id_dict[label2][status2])
                        pairwise_overlap_dict[overlap_id][key1][key2] = shared_ids
                        count_pairwise_overlap_dict[overlap_id][key1][key2] = len(shared_ids)
                        shared_ids.write("%s/%s.%s_vs_%s.ids" % (pairwise_overlaps_dir, output_basename,
                                                                 key1, key2))

                count_pairwise_overlap_dict[overlap_id].write("%s/%s.overlap.%s.tsv" % (pairwise_overlap_counts_dir,
                                                                                        output_basename,
                                                                                        overlap_id))

        if 2 <= len(busco_file_list) <= 3:
            fig, subplot_list = plt.subplots(2, 2, figsize=(6, 6))
            plt.suptitle("Overlaps for BUSCO categories between assemblies/genomes")
            # The 2x2 grid holds at most four statuses; extra ones are not drawn.
            for status, index in zip(self.status_list, range(4)):
                plt.sca(subplot_list[index // 2][index % 2])
                plt.title(status)
                MatplotlibRoutines.venn_diagram_from_sets(gene_id_dict[labels[0]][status],
                                                          gene_id_dict[labels[1]][status],
                                                          set3=gene_id_dict[labels[2]][status] if len(labels) > 2 else None,
                                                          set_labels=labels, set_colors=["red", "yellow", "green"],
                                                          output_prefix=None, extensions=("png",), title=None)

            plt.savefig("%s.venn.png" % output_prefix)

            plt.close()
Esempio n. 14
0
    def get_feature_length_distribution_from_gff(self,
                                                 input_gff,
                                                 output_prefix,
                                                 feature_list=None):
        """
        Collect lengths of GFF features and draw their distribution.

        Writes "<output_prefix>.len" (one length per line) and
        "<output_prefix>.stat" (selected features, their number and total length),
        prints the statistics, and draws three histograms with the suffixes
        ".all", ".max_10000" and ".max_1000".

        :param input_gff: path to the GFF file; lines starting with "#" and blank
            lines are ignored
        :param output_prefix: prefix for all output files
        :param feature_list: feature type (str) or collection of feature types to
            take into account; None means all features
        :return: None
        """
        len_file = "%s.len" % output_prefix
        stat_file = "%s.stat" % output_prefix

        # Treat a single feature name as a one-element collection, so that it is
        # neither matched nor reported character by character.
        if isinstance(feature_list, str):
            selected_features = (feature_list, )
        else:
            selected_features = feature_list

        feature_length_list = []
        total_feature_length = 0
        feature_number = 0
        feature_type_set = set()

        with open(input_gff, "r") as in_fd:
            for line in in_fd:
                if line[0] == "#" or not line.strip():
                    continue
                tmp = line.split("\t")
                feature = tmp[self.GFF_FEATURETYPE_COLUMN]
                # every feature type present in the file is recorded, selected or not
                feature_type_set.add(feature)
                if selected_features is not None and feature not in selected_features:
                    continue

                start = int(tmp[self.GFF_START_COLUMN])
                end = int(tmp[self.GFF_END_COLUMN])

                # both ends are included in the feature
                feature_length = end - start + 1
                feature_number += 1
                total_feature_length += feature_length
                feature_length_list.append(feature_length)

        stat_string = "Features\t%s\n" % (",".join(selected_features)
                                          if feature_list else "all")
        stat_string += "Number of features\t%i\n" % feature_number
        stat_string += "Total length\t%i\n" % total_feature_length

        print(stat_string)
        with open(stat_file, "w") as stat_fd:
            stat_fd.write(stat_string)

        feature_length_list = np.array(feature_length_list)
        np.savetxt(len_file, feature_length_list, fmt='%i')

        # Name used in plot titles: the only type present in the file, else the
        # only requested type, else the generic word.
        if len(feature_type_set) == 1:
            feature_name = list(feature_type_set)[0]
        elif selected_features is not None and len(selected_features) == 1:
            feature_name = selected_features[0]
        else:
            feature_name = "feature"

        # Imported inside the method, as before, but only once plotting starts.
        from RouToolPa.Routines import MatplotlibRoutines

        MatplotlibRoutines.draw_histogram(
            feature_length_list,
            output_prefix=output_prefix + ".all",
            xlabel="Feature length",
            ylabel="N of features",
            title="Distribution of %s lengths" % feature_name,
            ylogbase=10,
            xlogbase=10,
            bins_list=[1, 10, 100, 1000, 10000, 100000, 1000000, 10000000],
            close_figure=True)

        MatplotlibRoutines.draw_histogram(
            feature_length_list,
            output_prefix=output_prefix + ".max_10000",
            width_of_bins=100,
            max_threshold=10000,
            min_threshold=1,
            xlabel="Feature length",
            ylabel="N of features",
            title="Distribution of %s lengths" % feature_name,
            close_figure=True)

        MatplotlibRoutines.draw_histogram(
            feature_length_list,
            output_prefix=output_prefix + ".max_1000",
            width_of_bins=10,
            max_threshold=1000,
            min_threshold=1,
            xlabel="Feature length",
            ylabel="N of features",
            title="Distribution of %s lengths" % feature_name,
            close_figure=True)