Esempio n. 1
0
    def combine_count_files(count_file_list,
                            output_file,
                            sample_name_list=None):
        """Merge several count files into one two-level table and write it.

        Each file is loaded with ``SynDict`` (tab separator, key taken from
        column 0 and value from column 1, no header) and stored in a
        ``TwoLvlDict`` under its sample label. The table is then written to
        ``output_file``.

        :param count_file_list: paths of the count files to combine.
        :param output_file: destination passed to ``TwoLvlDict.write``.
        :param sample_name_list: optional labels, one per count file. When it
            is ``None`` or empty the file paths themselves are used as labels.
        :raises ValueError: if ``sample_name_list`` is given and its length
            differs from the length of ``count_file_list``.
        """
        names_given = sample_name_list is not None
        if names_given and len(count_file_list) != len(sample_name_list):
            raise ValueError(
                "Several files doesn't have corresponding sample name")

        # An empty name list falls back to the file paths, like None does.
        if sample_name_list:
            labels = sample_name_list
        else:
            labels = count_file_list
        labelled_files = zip(labels, count_file_list)

        # Parsing options shared by every input file.
        # NOTE(review): comments_prefix="__" presumably makes SynDict skip
        # lines starting with "__" - confirm against SynDict.
        parsing_options = dict(header=False,
                               separator="\t",
                               allow_repeats_of_key=False,
                               split_values=False,
                               values_separator=",",
                               key_index=0,
                               value_index=1,
                               close_after_if_file_object=False,
                               expression=None,
                               comments_prefix="__")

        count_table = TwoLvlDict()
        for label, path in labelled_files:
            count_table[label] = SynDict(filename=path, **parsing_options)

        count_table.write(output_file)
Esempio n. 2
0
def results_extraction_listener(queue,
                                output_file_prefix,
                                selected_species_list=None):
    """Listen for messages on the queue and write the collected results.

    Messages are taken with ``queue.get()`` until the string ``'finish'``
    arrives. Every other message is read as ``result[0]`` (the row key,
    labelled "sample" in the error log) and ``result[1]``:

    * ``result[1]`` is an ``int``: it is logged as an error code to
      ``errors.err`` in the current working directory;
    * ``result[1]`` is truthy: it is stored in the full table under
      ``result[0]``. When ``selected_species_list`` is given, the entries of
      ``result[1]`` for those species are also copied to a second table;
    * ``result[1]`` is falsy: the message is ignored.

    On ``'finish'`` the full table is written to ``<prefix>.all`` and, if
    ``selected_species_list`` is non-empty, the second table is written to
    ``<prefix>.selected_species`` (both with ``absent_symbol="."``).

    :param queue: object with a blocking ``get()`` method.
    :param output_file_prefix: prefix of the output table files.
    :param selected_species_list: optional species whose values are also
        written to the separate table.
    """
    positive_selection_dict = TwoLvlDict()
    selected_species_positive_selection_dict = TwoLvlDict()

    # The context manager closes (and flushes) the error log when the
    # listener stops, including when it stops because of an exception.
    with open("errors.err", "w") as error_fd:
        error_fd.write("#sample\terror_code\n")
        while True:
            result = queue.get()

            # Recognise the sentinel before indexing into the message.
            if result == 'finish':
                positive_selection_dict.write("%s.all" % output_file_prefix,
                                              absent_symbol=".")
                if selected_species_list:
                    selected_species_positive_selection_dict.write(
                        "%s.selected_species" % output_file_prefix,
                        absent_symbol=".")
                break

            sample, payload = result[0], result[1]

            if isinstance(payload, int):
                error_fd.write("%s\t%i\n" % (sample, payload))
                continue

            if not payload:
                continue

            positive_selection_dict[sample] = payload
            if not selected_species_list:
                continue

            for species in selected_species_list:
                if species not in payload:
                    continue
                # Create the row lazily so samples without any selected
                # species do not appear in the second table.
                if sample not in selected_species_positive_selection_dict:
                    selected_species_positive_selection_dict[sample] = {}
                selected_species_positive_selection_dict[sample][species] = \
                    payload[species]
Esempio n. 3
0
    def count_reads_and_bases(self, fastq_file_list, stat_file=None):
        """Count reads and bases in one or more FASTQ files.

        Files are opened with ``self.metaopen(path, "r")`` and read as
        four-line records: the first line of each group counts as one read
        and the length of the second line, without trailing whitespace and
        line terminator, is added to the base count.

        :param fastq_file_list: a single path (``str``) or an iterable of
            paths.
        :param stat_file: optional destination; when truthy the table is
            also written there with ``counts.write(stat_file)``.
        :return: ``TwoLvlDict`` mapping each path to an ``OrderedDict`` with
            the keys ``"Reads"`` and ``"Bases"``.
        """
        fastq_list = [fastq_file_list] if isinstance(fastq_file_list, str) else fastq_file_list

        counts = TwoLvlDict()

        for fastq_file in fastq_list:
            counts[fastq_file] = OrderedDict()
            counts[fastq_file]["Reads"] = 0
            counts[fastq_file]["Bases"] = 0

        for fastq_file in fastq_list:
            reads = 0
            bases = 0
            with self.metaopen(fastq_file, "r") as fastq_fd:
                # Only the iterator is used to read the file: iteration is
                # no longer mixed with readline() calls on the same handle.
                for line_index, line in enumerate(fastq_fd):
                    position_in_record = line_index % 4
                    if position_in_record == 0:
                        reads += 1
                    elif position_in_record == 1:
                        # Stripping here replaces the former "subtract one
                        # per read" correction, which went wrong for
                        # truncated records and non-"\n" line endings.
                        bases += len(line.rstrip())

            counts[fastq_file]["Reads"] += reads
            counts[fastq_file]["Bases"] += bases

        # NOTE(review): write() without arguments presumably prints the table
        # to stdout - confirm against TwoLvlDict.write.
        counts.write()

        if stat_file:
            counts.write(stat_file)

        return counts
Esempio n. 4
0
    def count_locations(self,
                        annotation_black_list=None,
                        allow_several_counts_of_record=False,
                        out_filename="location_counts.t",
                        write=True,
                        count_dir="location_counts"):
        """Count records per annotated location for every region.

        Regions come from ``self._split_regions()``. For each record the
        ``"Loc"`` entry of ``record.description`` is inspected:

        * missing or empty ``"Loc"``: counted under ``"unknown"``;
        * ``allow_several_counts_of_record`` true: every non-blacklisted
          location of the record is counted separately;
        * otherwise: the non-blacklisted locations are sorted and joined
          with ``"/"`` and the record is counted once under that combined
          label (records whose locations are all blacklisted are skipped).

        Labels with a zero count or listed in the black list are left out of
        the result.

        :param annotation_black_list: locations to ignore; ``None`` means no
            location is ignored.
        :param allow_several_counts_of_record: see above.
        :param out_filename: file name used inside ``count_dir``.
        :param write: write the table to ``count_dir/out_filename`` if true.
        :param count_dir: output directory; created with ``mkdir -p`` even
            when ``write`` is false.
        :return: ``TwoLvlDict`` mapping region to an ``OrderedDict`` of
            ``label -> count``.
        """
        # None replaces the former mutable ``[]`` default argument.
        black_list = [] if annotation_black_list is None else annotation_black_list

        os.system("mkdir -p %s" % count_dir)
        regions_dict = self._split_regions()
        region_counts_dict = TwoLvlDict({})

        for region in regions_dict:
            # "unknown" has to be seeded: it is incremented with += below and
            # used to raise KeyError on the first record without a location.
            location_counts = {"igc": 0, "unknown": 0}

            for record in regions_dict[region]:
                description = record.description
                # Membership is tested before indexing, so a record without
                # a "Loc" key is counted as unknown instead of raising.
                if ("Loc" not in description) or (not description["Loc"]):
                    location_counts["unknown"] += 1
                    continue

                allowed_locations = [location
                                     for location in description["Loc"]
                                     if location not in black_list]

                if allow_several_counts_of_record:
                    labels = allowed_locations
                elif allowed_locations:
                    labels = ["/".join(sorted(allowed_locations))]
                else:
                    continue

                for label in labels:
                    location_counts[label] = location_counts.get(label, 0) + 1

            region_counts_dict[region] = OrderedDict(
                (label, count)
                for label, count in location_counts.items()
                if count != 0 and label not in black_list)

        if write:
            region_counts_dict.write("%s/%s" % (count_dir, out_filename))
        return region_counts_dict
Esempio n. 5
0
    def write_stats(self, output_prefix):
        """Write per-record N and gap counts to two table files.

        For every entry of ``self.records`` the record's ``N_counts`` and
        ``gap_counts`` are stored under the record's ``id``. The tables are
        written to ``<output_prefix>.N_counts`` and
        ``<output_prefix>.gaps_counts``.

        :param output_prefix: prefix of both output files.
        """
        n_count_table = TwoLvlDict()
        gap_count_table = TwoLvlDict()

        for key in self.records:
            record = self.records[key]
            # Rows are keyed by the record's own id, not by the container key.
            n_count_table[record.id] = record.N_counts
            gap_count_table[record.id] = record.gap_counts

        n_count_table.write(out_filename="%s.N_counts" % output_prefix)
        gap_count_table.write(out_filename="%s.gaps_counts" % output_prefix)
Esempio n. 6
0
    def count_types(self, output_file=None, total_output_file=None, return_mode="chrom"):
        """Count annotated records per type, per chromosome and in total.

        The types come from ``self.get_annotated_types()``. Two tables are
        built: per-chromosome counts of every type, and overall counts of
        complete/partial records per type (split on ``record.partial``).

        :param output_file: if truthy, the per-chromosome table is written
            there.
        :param total_output_file: if truthy, the totals are written there as
            tab-separated text with a header line; the header mentions
            ``self.partial_threshold`` when that attribute is truthy.
        :param return_mode: ``"chrom"``, ``"total"`` or ``"both"``.
        :return: the per-chromosome table, the totals, or a tuple of both,
            depending on ``return_mode``.
        :raises ValueError: for any other ``return_mode``; it is raised only
            after the counting and the optional file output are done.
        """
        annotated_types = self.get_annotated_types()
        count_dict = TwoLvlDict()
        total_count_dict = OrderedDict()

        for record_type in annotated_types:
            total_count_dict[record_type] = OrderedDict([("complete", 0),
                                                         ("partial", 0)])

        # Start every chromosome with a zero for each known type.
        for chrom in self.records:
            count_dict[chrom] = OrderedDict()
            for record_type in annotated_types:
                count_dict[chrom][record_type] = 0

        for chrom in self.records:
            for record in self.records[chrom]:
                count_dict[chrom][record.type] += 1
                completeness = "partial" if record.partial else "complete"
                total_count_dict[record.type][completeness] += 1

        if output_file:
            count_dict.write(output_file)

        if total_output_file:
            with open(total_output_file, "w") as out_fd:
                if self.partial_threshold:
                    complete_note = "(>%.2f of expected length)" % self.partial_threshold
                    partial_note = "(<%.2f of expected length)" % self.partial_threshold
                else:
                    complete_note = ""
                    partial_note = ""
                out_fd.write("#rRNA\tComplete%s\tPartial%s\n" % (complete_note, partial_note))

                for record_type in total_count_dict:
                    totals = total_count_dict[record_type]
                    out_fd.write("%s\t%i\t%i\n" % (record_type, totals["complete"], totals["partial"]))

        if return_mode == "chrom":
            return count_dict
        if return_mode == "total":
            return total_count_dict
        if return_mode == "both":
            return count_dict, total_count_dict
        raise ValueError("Unknown return type. Allowed variants: 'chrom', 'total', 'both'")
Esempio n. 7
0
from collections import OrderedDict
from RouToolPa.Collections.General import TwoLvlDict
from RouToolPa.Routines.File import check_path



# Command-line interface: which species to collect, where their statistics
# live, and where to write the combined table.
parser = argparse.ArgumentParser()

parser.add_argument("-s", "--species_list",
                    action="store",
                    dest="species_list",
                    type=lambda s: s.split(","),
                    required=True,
                    help="Comma-separated list of species")
parser.add_argument("-d", "--species_dir",
                    action="store",
                    dest="species_dir",
                    default="./",
                    type=check_path,
                    help="Directory with per species statistics")
parser.add_argument("-o", "--output_file",
                    action="store",
                    dest="output",
                    default="stdout",
                    help="Output file. Default: stdout")

args = parser.parse_args()

# The literal value "stdout" selects the standard output stream.
writing_to_stdout = args.output == "stdout"
if writing_to_stdout:
    out_fd = sys.stdout
else:
    out_fd = open(args.output, "w")

species_stat_dict = TwoLvlDict()

for species in args.species_list:
    # The directory and species name are concatenated without a separator,
    # so args.species_dir has to end with "/" already.
    stat_path = "%s%s/stat.t" % (args.species_dir, species)
    with open(stat_path, "r") as stat_fd:
        # One tab-separated row per line of the species' stat.t file.
        statistics = [line.strip().split("\t") for line in stat_fd]
    species_stat_dict[species] = OrderedDict(statistics)

species_stat_dict.write(out_fd)
if not writing_to_stdout:
    out_fd.close()
Esempio n. 8
0
                        cluster_3d_set.add(gene)

        # Collect every gene listed by the variants of the sub clusters;
        # variants without a "Genes" entry in info_dict are skipped.
        for cluster_3d_sub in PmCDA1_3d_sub_clusters:
            for variant in cluster_3d_sub:
                if "Genes" in variant.info_dict:
                    for gene in variant.info_dict["Genes"]:
                        cluster_3d_sub_set.add(gene)


        # Report the sizes of both gene sets and of their intersection.
        # The "% i" conversion prints a leading space for non-negative counts.
        print ("PmCDA1 3d : %i" % len(cluster_3d_set))
        print ("PmCDA1 3d sub: %i" % len(cluster_3d_sub_set))
        intersection = cluster_3d_set & cluster_3d_sub_set
        print("Intersection: % i" % len(intersection))

        n_intersection_genes = len(intersection)
        n_cluster_3d_genes = len(cluster_3d_set)
        n_cluster_3d_sub_genes = len(cluster_3d_sub_set)


        # Disabled debugging output (Python 2 print statements):
        #print intersection
        #print cluster_3d_set
        #print cluster_3d_sub_set

        # Percentage of the genes in cluster_3d_set that are also in
        # cluster_3d_sub_set, stored for the current size/power pair.
        # NOTE(review): raises ZeroDivisionError when cluster_3d_set is empty.
        overlap_clusters_percent[size][power] = 100 * float(len(intersection))/float(len(cluster_3d_set))

        # NOTE(review): hypergeom and totaly_genes are defined outside this
        # fragment; the expected argument order cannot be verified from here.
        p_value = hypergeom(n_intersection_genes, totaly_genes, n_cluster_3d_genes, n_cluster_3d_sub_genes)
        # One tab-separated row: size, power, totaly_genes, both set sizes,
        # the intersection size and the p-value.
        test_fd.write("%i\t%.2f\t%i\t%i\t%i\t%i\t%e\n" % (size, power, totaly_genes, n_cluster_3d_genes, n_cluster_3d_sub_genes, n_intersection_genes, p_value))

# Runs once, after the loops above: write the overlap table and close the
# file that received the per-combination rows.
overlap_clusters_percent.write("overlap_clusters_percent_genes.t")

test_fd.close()
Esempio n. 9
0
# bins holds the bin edges, so there is one bin fewer than there are edges.
number_of_bins = len(bins) - 1

# Pad the per-bin series of every assembly with zeroes up to the full bin
# count, so all assemblies end up with series of the same length.
for assembly in assembly_contig_cumulative_length:
    bin_number_difference = number_of_bins - len(assembly_contig_cumulative_length[assembly])
    if bin_number_difference <= 0:
        continue
    assembly_contig_cumulative_length[assembly] += [0] * bin_number_difference
    assembly_contig_number_values[assembly] += [0] * bin_number_difference

# Write one table per statistic, all sharing the output prefix.
assembly_N50_dict.write("%s.N50" % args.output_prefix)
assembly_L50.write("%s.L50" % args.output_prefix)
assembly_general_stats.write("%s.general" % args.output_prefix)
assembly_lengths.write("%s.lengths" % args.output_prefix)
# Outputs that are currently disabled:
#assembly_bins.write("%s.bins" % args.output_prefix)
#print(assembly_contig_cumulative_length)
#assembly_contig_cumulative_length.write("%s.cumulative_length" % args.output_prefix)
#assembly_contig_number_values.write("%s.contig_number_values" % args.output_prefix)

# Left panel of a 1x2 figure: length histogram with one series per assembly.
fig = plt.figure(figsize=(12, 6))
subplot_1 = plt.subplot(1, 2, 1)

plt.hist([assembly_length_array[assembly] for assembly in assembly_length_array],
         bins,
         label=assembly_length_array.keys())
Esempio n. 10
0
    def handle_sanger_data(self,
                           input_dir,
                           output_prefix,
                           outdir=None,
                           read_subfolders=False,
                           min_mean_qual=0,
                           min_median_qual=0,
                           min_len=50):
        """Convert, trim and filter Sanger (ABI) reads found in a directory.

        Every file selected by ``self.is_sanger_file`` is read as an ABI
        record, written as raw FASTQ/FASTA, trimmed with Biopython's
        ``_abi_trim`` and checked against the thresholds. Reads that pass
        are written as trimmed FASTQ/FASTA. Quality bar plots are drawn for
        every raw read and for every retained trimmed read.

        Per-read files go to ``fastq/``, ``fasta/`` and ``qual_plot/``
        subdirectories of ``self.workdir`` (``raw`` and ``trimmed``
        variants); merged files go to ``self.workdir`` directly. The
        excluded-id list and the statistics table are written to
        ``<output_prefix>.excluded.ids`` and ``<output_prefix>.stats``
        relative to the current directory.

        :param input_dir: directory searched for Sanger files.
        :param output_prefix: prefix of the merged and summary files.
        :param outdir: if truthy, replaces ``self.workdir``.
        :param read_subfolders: search ``input_dir`` recursively.
        :param min_mean_qual: minimal mean Phred quality of the trimmed
            read; 0 disables the check.
        :param min_median_qual: minimal median Phred quality of the trimmed
            read; 0 disables the check.
        :param min_len: minimal length of the trimmed read.
        """
        if outdir:
            self.workdir = outdir

        self.init_dirs()

        sanger_filelist = self.make_list_of_path_to_files(
            input_dir,
            expression=self.is_sanger_file,
            recursive=read_subfolders,
            return_absolute_paths=True)
        stat_dict = TwoLvlDict()
        record_dict = OrderedDict()
        trimmed_record_dict = OrderedDict()
        excluded_list = IdList()
        excluded_counter = 0
        low_quality_counter = 0
        too_short_counter = 0

        merged_raw_fastq = "%s/%s.raw.fastq" % (self.workdir, output_prefix)
        merged_raw_fasta = "%s/%s.raw.fasta" % (self.workdir, output_prefix)
        merged_trimmed_fastq = "%s/%s.trimmed.fastq" % (self.workdir,
                                                        output_prefix)
        merged_trimmed_fasta = "%s/%s.trimmed.fasta" % (self.workdir,
                                                        output_prefix)

        def draw_quality_plot(qualities, plot_prefix, read_length):
            # Same plot settings for raw and trimmed reads; the figure gets
            # 3 extra units of width per started 100 bases.
            MatplotlibRoutines.draw_bar_plot(
                qualities,
                plot_prefix,
                extentions=["png"],
                xlabel="Position",
                ylabel="Phred quality",
                title="Per base quality",
                min_value=None,
                max_value=None,
                new_figure=True,
                figsize=(3 * (int(read_length / 100) + 1), 3),
                close_figure=True)

        for filename in sanger_filelist:
            basename = self.split_filename(filename)[1]

            record_raw_fastq = "%s/fastq/raw/%s.raw.fastq" % (self.workdir,
                                                              basename)
            record_raw_fasta = "%s/fasta/raw/%s.raw.fasta" % (self.workdir,
                                                              basename)
            record_raw_qual_plot_prefix = "%s/qual_plot/raw/%s.raw.qual" % (
                self.workdir, basename)

            record_trimmed_fastq = "%s/fastq/trimmed/%s.trimmed.fastq" % (
                self.workdir, basename)
            record_trimmed_fasta = "%s/fasta/trimmed/%s.trimmed.fasta" % (
                self.workdir, basename)
            record_trimmed_qual_plot_prefix = "%s/qual_plot/trimmed/%s.trimmed.qual" % (
                self.workdir, basename)

            # Close the input handle as soon as the record has been parsed.
            with self.metaopen(filename, "rb") as abi_fd:
                record = SeqIO.read(abi_fd, format="abi")
            record_dict[record.id] = record
            SeqIO.write(record, record_raw_fastq, format="fastq")
            SeqIO.write(record, record_raw_fasta, format="fasta")

            # NOTE(review): _abi_trim is a private Biopython helper.
            trimmed_record = SeqIO.AbiIO._abi_trim(record)

            raw_qualities = record.letter_annotations["phred_quality"]
            trimmed_qualities = trimmed_record.letter_annotations["phred_quality"]
            trimmed_len = len(trimmed_record)
            trimmed_mean_qual = np.mean(trimmed_qualities)
            trimmed_median_qual = np.median(trimmed_qualities)

            # Built from pairs so the column order is the same on every
            # Python version (a dict literal does not guarantee it on
            # older interpreters).
            stat_dict[record.id] = OrderedDict([
                ("raw_len", len(record)),
                ("raw_mean_qual", np.mean(raw_qualities)),
                ("raw_median_qual", np.median(raw_qualities)),
                ("trimmed_len", trimmed_len),
                ("trimmed_mean_qual", trimmed_mean_qual),
                ("trimmed_median_qual", trimmed_median_qual),
                ("retained", "-"),
            ])
            draw_quality_plot(raw_qualities, record_raw_qual_plot_prefix,
                              len(record))

            if trimmed_len < min_len:
                too_short_counter += 1
                retained = False
            else:
                # Each threshold is applied independently; previously the
                # mean threshold was skipped whenever min_median_qual was 0.
                median_ok = (not min_median_qual) or (
                    trimmed_median_qual >= min_median_qual)
                mean_ok = (not min_mean_qual) or (
                    trimmed_mean_qual >= min_mean_qual)
                retained = bool(median_ok and mean_ok)
                if not retained:
                    low_quality_counter += 1

            if not retained:
                excluded_list.append(record.id)
                # Keep the summary in step with the excluded-id list.
                excluded_counter += 1
                continue

            stat_dict[record.id]["retained"] = "+"

            SeqIO.write(trimmed_record, record_trimmed_fastq, format="fastq")
            SeqIO.write(trimmed_record, record_trimmed_fasta, format="fasta")

            # The trimmed plot is sized from the raw read length, exactly
            # like the raw plot, so both figures have the same width.
            draw_quality_plot(trimmed_qualities,
                              record_trimmed_qual_plot_prefix,
                              len(record))

            trimmed_record_dict[record.id] = trimmed_record

        SeqIO.write(self.record_from_dict_generator(record_dict),
                    merged_raw_fastq,
                    format="fastq")
        SeqIO.write(self.record_from_dict_generator(record_dict),
                    merged_raw_fasta,
                    format="fasta")

        SeqIO.write(self.record_from_dict_generator(trimmed_record_dict),
                    merged_trimmed_fastq,
                    format="fastq")
        SeqIO.write(self.record_from_dict_generator(trimmed_record_dict),
                    merged_trimmed_fasta,
                    format="fasta")

        excluded_list.write("%s.excluded.ids" % output_prefix)
        stat_dict.write(out_filename="%s.stats" % output_prefix)

        print("Excluded: %i" % excluded_counter)
        print("\tToo short( < %i ): %i" % (min_len, too_short_counter))
        print("\tLow quality( median < %i or mean < %i ): %i" %
              (min_median_qual, min_mean_qual, low_quality_counter))
Esempio n. 11
0
# Write both halves of the preceding filter step (the filter call itself is
# above this fragment; the file names suggest a 0.05 power cutoff - confirm).
filtered.write("%s/%s_adjusted_size_3+_power_0.05+.ccf" % (clustering_dir, sample))
filtered_out.write("%s/%s_adjusted_size_3+_power_less_0.05.ccf" % (clustering_dir, sample))
# NOTE: the triple-quoted block below is a bare string literal, not a
# comment: it is evaluated and discarded, so the heatmap code inside it
# never runs.
"""
if "HAP" not in sample:
    filtered.heatmap_statistics(filename="%s/%s_3+_power_0.05+_heatmap_statistics.svg" % (clustering_dir,sample_adjusted),
                                         additional_data=("Median", "Mean", "Power", "Homogeneity"))
    #filtered_out.heatmap_statistics(filename="%s/%s_3+_power_less_0.05_heatmap_statistics.svg" % (clustering_dir, sample_adjusted),
    #                                         additional_data=("Median", "Mean", "Power", "Homogeneity"))
else:
    filtered.heatmap_statistics(filename="%s/%s_3+_power_0.05+_heatmap_statistics.svg" % (clustering_dir,sample_adjusted),
                                         additional_data=("Median", "Mean", "Power"))
    #filtered_out.heatmap_statistics(filename="%s/%s_3+_power_less_0.05_heatmap_statistics.svg" % (clustering_dir, sample_adjusted),
    #                                         additional_data=("Median", "Mean", "Power"))
"""
# Filter the already filtered set again with filter_by_power_10 (defined
# outside this fragment). filter() returns a pair that rebinds both names,
# so the previous `filtered` and `filtered_out` are no longer reachable here.
filtered, filtered_out = filtered.filter(filter_by_power_10)
filtered.write("%s/%s_adjusted_size_3+_power_0.1+.ccf" % (clustering_dir, sample))
filtered_out.write("%s/%s_adjusted_size_3+_power_0.05+_less_0.1.ccf" % (clustering_dir, sample))
# NOTE: another bare string literal holding disabled heatmap code.
"""
if "HAP" not in sample:
    filtered.heatmap_statistics(filename="%s/%s_3+_power_0.10+_heatmap_statistics.svg" % (clustering_dir,sample_adjusted),
                                         additional_data=("Median", "Mean", "Power", "Homogeneity"))
    #filtered_out.heatmap_statistics(filename="%s/%s_3+_power_0.05+_less_0.1_heatmap_statistics.svg" % (clustering_dir, sample_adjusted),
    #                                         additional_data=("Median", "Mean", "Power", "Homogeneity"))
else:
    filtered.heatmap_statistics(filename="%s/%s_3+_power_0.10+_heatmap_statistics.svg" % (clustering_dir,sample_adjusted),
                                         additional_data=("Median", "Mean", "Power"))
    #filtered_out.heatmap_statistics(filename="%s/%s_3+_power_0.05+_less_0.1_heatmap_statistics.svg" % (clustering_dir, sample_adjusted),
    #                                         additional_data=("Median", "Mean", "Power"))
"""
# Write the statistics table collected for this sample (it is filled above
# this fragment).
statistics_dict.write(out_filename="%s/%s_mutation_count_statistics.t" % (clustering_dir, sample))
Esempio n. 12
0
    def filter(self,
               samples_directory,
               output_directory,
               adapter_fragment_file,
               trimmomatic_adapter_file,
               general_stat_file,
               samples_to_handle=None,
               threads=4,
               trimmomatic_dir="",
               coockiecutter_dir="",
               facut_dir="",
               mismatch_number=2,
               pe_reads_score=30,
               se_read_score=10,
               min_adapter_len=1,
               sliding_window_size=None,
               average_quality_threshold=15,
               base_quality="phred33",
               read_name_type="illumina",
               leading_base_quality_threshold=None,
               trailing_base_quality_threshold=None,
               crop_length=None,
               head_crop_length=None,
               min_len=50,
               remove_intermediate_files=False,
               skip_coockiecutter=False,
               retain_single_end_reads=True,
               input_is_se=False):

        Cookiecutter.path = coockiecutter_dir
        Trimmomatic.jar_path = trimmomatic_dir
        Trimmomatic.threads = threads
        FaCut.path = facut_dir

        self.safe_mkdir(output_directory)
        """
        merged_raw_dir = "%s/merged/" % output_directory
        filtered_dir = "%s/filtered/" % output_directory
        coockie_filtered_dir = "%s/coockiecutter/" % filtered_dir
        coockie_trimmomatic_filtered_dir = "%s/coockiecutter_trimmomatic/" % filtered_dir
        coockie_trimmomatic_quality_filtered_dir = "%s/coockiecutter_trimmomatic_quality/" % filtered_dir
        final_filtered_dir = "%s/final/" % filtered_dir
        filtering_stat_dir = "%s/filtered_stat/" % output_directory
        """
        sample_list = samples_to_handle if samples_to_handle else self.get_sample_list(
            samples_directory)
        merged_raw_dir, filtered_dir, coockie_filtered_dir, \
        coockie_trimmomatic_filtered_dir, coockie_trimmomatic_quality_filtered_dir, \
        final_filtered_dir, filtering_stat_dir = self.prepare_filtering_directories(output_directory, sample_list)

        filtering_statistics = TwoLvlDict()

        for sample in sample_list:
            print("Handling sample %s" % sample)
            filtering_statistics[sample] = OrderedDict()
            merged_raw_sample_dir = "%s/%s/" % (merged_raw_dir, sample)
            #merged_forward_reads = "%s/%s_1.fq" % (merged_raw_sample_dir, sample)
            #merged_reverse_reads = "%s/%s_2.fq" % (merged_raw_sample_dir, sample)

            coockie_filtered_sample_dir = "%s/%s/" % (coockie_filtered_dir,
                                                      sample)
            coockie_stats = "%s/%s.coockiecutter.stats" % (
                coockie_filtered_sample_dir, sample)

            coockie_trimmomatic_filtered_sample_dir = "%s/%s/" % (
                coockie_trimmomatic_filtered_dir, sample)

            coockie_trimmomatic_quality_filtered_sample_dir = "%s/%s/" % (
                coockie_trimmomatic_quality_filtered_dir, sample)
            final_filtered_sample_dir = "%s/%s/" % (final_filtered_dir, sample)
            filtering_stat_sample_dir = "%s/%s" % (filtering_stat_dir, sample)

            #"""
            print("\tMerging fastqs if necessary...")
            merged_forward_reads, merged_reverse_reads, merged_se_reads = self.combine_fastq_files(
                samples_directory,
                sample,
                merged_raw_sample_dir,
                use_links_if_merge_not_necessary=True,
                input_is_se=input_is_se)
            if not skip_coockiecutter:
                print("\tFiltering by Cookiecutter")
                #"""
                Cookiecutter.rm_reads(
                    adapter_fragment_file,
                    merged_forward_reads
                    if merged_forward_reads else merged_se_reads,
                    coockie_stats,
                    right_reads=merged_reverse_reads,
                    out_dir=coockie_filtered_sample_dir,
                    use_dust_filter=False,
                    dust_cutoff=None,
                    dust_window_size=None,
                    use_N_filter=False,
                    read_length_cutoff=None,
                    polyGC_length_cutoff=None)

                #"""
                print("\tParsing Cookiecutter report...")
                coockiecutter_report = CoockiecutterReport(
                    coockie_stats, input_is_se=input_is_se)

                filtering_statistics[sample][
                    "raw_pairs"] = coockiecutter_report.input_pairs
                filtering_statistics[sample][
                    "pairs_after_coockiecutter"] = coockiecutter_report.retained_pairs
                filtering_statistics[sample][
                    "pairs_after_coockiecutter,%"] = float(
                        "%.2f" %
                        (float(coockiecutter_report.retained_pairs) /
                         float(coockiecutter_report.input_pairs) * 100))

                os.system("cp %s %s" %
                          (coockie_stats, filtering_stat_sample_dir))

                coockie_filtered_paired_forward_reads = "%s/%s_1.ok.fastq" % (
                    coockie_filtered_sample_dir, sample)
                coockie_filtered_paired_reverse_reads = "%s/%s_2.ok.fastq" % (
                    coockie_filtered_sample_dir, sample)
                coockie_filtered_paired_se_reads = ""

                coockie_filtered_se_reads = "%s/%s.se.ok.fastq" % (
                    coockie_filtered_sample_dir, sample)
            # se reads produced by Coockiecutter are ignored now!!

            #coockie_trimmomatic_filtered_sample_dir = "%s/%s/" % (coockie_trimmomatic_filtered_dir, sample)
            trimmomatic_output_prefix = "%s/%s" % (
                coockie_trimmomatic_filtered_sample_dir, sample)
            trimmomatic_log = "%s.trimmomatic.log" % trimmomatic_output_prefix
            #"""
            if (merged_forward_reads is None) and (merged_reverse_reads is
                                                   None):
                print("Filtering by Trimmomatic...")

                Trimmomatic.filter(
                    merged_se_reads
                    if skip_coockiecutter else coockie_filtered_se_reads,
                    trimmomatic_output_prefix,
                    output_extension="fq",
                    right_reads=None,
                    adapters_file=trimmomatic_adapter_file,
                    mismatch_number=mismatch_number,
                    pe_reads_score=pe_reads_score,
                    se_read_score=se_read_score,
                    min_adapter_len=min_adapter_len,
                    sliding_window_size=sliding_window_size,
                    average_quality_threshold=average_quality_threshold,
                    leading_base_quality_threshold=
                    leading_base_quality_threshold,
                    trailing_base_quality_threshold=
                    trailing_base_quality_threshold,
                    crop_length=crop_length,
                    head_crop_length=head_crop_length,
                    min_length=min_len,
                    logfile=trimmomatic_log,
                    base_quality=base_quality)

            else:
                print("\tFiltering by Trimmomatic...")
                Trimmomatic.filter(
                    merged_forward_reads if skip_coockiecutter else
                    coockie_filtered_paired_forward_reads,
                    trimmomatic_output_prefix,
                    output_extension="fq",
                    right_reads=merged_reverse_reads if skip_coockiecutter else
                    coockie_filtered_paired_reverse_reads,
                    adapters_file=trimmomatic_adapter_file,
                    mismatch_number=mismatch_number,
                    pe_reads_score=pe_reads_score,
                    se_read_score=se_read_score,
                    min_adapter_len=min_adapter_len,
                    sliding_window_size=sliding_window_size,
                    average_quality_threshold=average_quality_threshold,
                    leading_base_quality_threshold=
                    leading_base_quality_threshold,
                    trailing_base_quality_threshold=
                    trailing_base_quality_threshold,
                    crop_length=crop_length,
                    head_crop_length=head_crop_length,
                    min_length=min_len,
                    logfile=trimmomatic_log,
                    base_quality=base_quality)
            #"""
            trimmomatic_report = TrimmomaticReport(trimmomatic_log,
                                                   input_is_se=input_is_se)
            if skip_coockiecutter:
                filtering_statistics[sample][
                    "raw_pairs"] = trimmomatic_report.stats["input"]

            filtering_statistics[sample][
                "pairs_after_trimmomatic"] = trimmomatic_report.stats[
                    "surviving"] if input_is_se else trimmomatic_report.stats[
                        "both_surviving"]
            filtering_statistics[sample][
                "pairs_after_trimmomatic,%"] = trimmomatic_report.stats[
                    "surviving,%"] if input_is_se else trimmomatic_report.stats[
                        "both_surviving,%"]

            if retain_single_end_reads and not input_is_se:
                filtering_statistics[sample][
                    "forward_se_after_trimmomatic"] = trimmomatic_report.stats[
                        "forward_only_surviving"]
                filtering_statistics[sample][
                    "forward_se_after_trimmomatic,%"] = trimmomatic_report.stats[
                        "forward_only_surviving"]

                filtering_statistics[sample][
                    "reverse_se_after_trimmomatic"] = trimmomatic_report.stats[
                        "reverse_only_surviving,%"]
                filtering_statistics[sample][
                    "forward_se_after_trimmomatic,%"] = trimmomatic_report.stats[
                        "forward_only_surviving,%"]

            os.system("cp %s %s" %
                      (trimmomatic_log, filtering_stat_sample_dir))

            coockie_trimmomatic_filtered_paired_forward_reads = "%s/%s_1.pe.fq" % (
                coockie_trimmomatic_filtered_sample_dir, sample)
            coockie_trimmomatic_filtered_paired_reverse_reads = "%s/%s_2.pe.fq" % (
                coockie_trimmomatic_filtered_sample_dir, sample)

            coockie_trimmomatic_filtered_unpaired_forward_reads = "%s/%s_1.se.fq" % (
                coockie_trimmomatic_filtered_sample_dir, sample)
            coockie_trimmomatic_filtered_unpaired_reverse_reads = "%s/%s_2.se.fq" % (
                coockie_trimmomatic_filtered_sample_dir, sample)

            coockie_trimmomatic_filtered_se_reads = "%s/%s.se.fq" % (
                coockie_trimmomatic_filtered_sample_dir, sample)

            final_forward_reads = "%s/%s.final_1.fastq" % (
                final_filtered_sample_dir, sample)
            final_reverse_reads = "%s/%s.final_2.fastq" % (
                final_filtered_sample_dir, sample)

            final_forward_se_reads = "%s/%s.final_1.se.fastq" % (
                final_filtered_sample_dir, sample)
            final_reverse_se_reads = "%s/%s.final_2.se.fastq" % (
                final_filtered_sample_dir, sample)

            final_se_reads = "%s/%s.final.se.fastq" % (
                final_filtered_sample_dir, sample)

            if sliding_window_size is None:
                facut_pe_output_prefix = "%s/%s.pe" % (
                    coockie_trimmomatic_quality_filtered_sample_dir, sample)
                facut_forward_se_output_prefix = "%s/%s.forward.se" % (
                    coockie_trimmomatic_quality_filtered_sample_dir, sample)
                facut_reverse_se_output_prefix = "%s/%s.reverse.se" % (
                    coockie_trimmomatic_quality_filtered_sample_dir, sample)
                facut_pe_stat_file = "%s.facut.stat" % facut_pe_output_prefix

                facut_forward_se_stat_file = "%s.facut.stat" % facut_forward_se_output_prefix
                facut_reverse_se_stat_file = "%s.facut.stat" % facut_reverse_se_output_prefix
                #"""
                FaCut.filter_by_mean_quality(
                    average_quality_threshold,
                    facut_pe_output_prefix,
                    coockie_trimmomatic_filtered_paired_forward_reads,
                    reverse_reads=
                    coockie_trimmomatic_filtered_paired_reverse_reads,
                    quality_type=base_quality,
                    stat_file=facut_pe_stat_file,
                    name_type=read_name_type)

                FaCut.filter_by_mean_quality(
                    average_quality_threshold,
                    facut_forward_se_output_prefix,
                    coockie_trimmomatic_filtered_unpaired_forward_reads,
                    quality_type=base_quality,
                    stat_file=facut_forward_se_stat_file,
                    name_type=read_name_type)
                FaCut.filter_by_mean_quality(
                    average_quality_threshold,
                    facut_reverse_se_output_prefix,
                    coockie_trimmomatic_filtered_unpaired_reverse_reads,
                    quality_type=base_quality,
                    stat_file=facut_reverse_se_stat_file,
                    name_type=read_name_type)
                #"""
                #if input_is_se:

                #else:
                facut_report = FaCutReport(facut_pe_stat_file)

                filtering_statistics[sample][
                    "pairs_after_facut"] = facut_report.retained_pairs
                filtering_statistics[sample]["pairs_after_facut,%"] = float(
                    "%.2f" % (float(facut_report.retained_pairs) /
                              float(facut_report.input_pairs) * 100))
                filtering_statistics[sample][
                    "retained_pairs_in_worst_tile,%"] = facut_report.minimum_retained_pairs_in_tiles_fraction * 100

                filtering_statistics[sample][
                    "pairs_survived_after_filtration,%"] = float(
                        "%.2f" %
                        (float(facut_report.retained_pairs) /
                         filtering_statistics[sample]["raw_pairs"] * 100))

                facut_filtered_forward_reads = "%s_1.pe.fq" % facut_pe_output_prefix
                facut_filtered_reverse_reads = "%s_2.pe.fq" % facut_pe_output_prefix

                facut_filtered_forward_se_reads = "%s.se.fq" % facut_forward_se_output_prefix
                facut_filtered_reverse_se_reads = "%s.se.fq" % facut_reverse_se_output_prefix

                os.system("cp %s %s" %
                          (facut_pe_stat_file, filtering_stat_sample_dir))
                if retain_single_end_reads:
                    os.system("cp %s %s" % (facut_forward_se_stat_file,
                                            filtering_stat_sample_dir))
                    os.system("cp %s %s" % (facut_reverse_se_stat_file,
                                            filtering_stat_sample_dir))

                os.system("ln %s %s" %
                          (facut_filtered_forward_reads, final_forward_reads))
                os.system("ln %s %s" %
                          (facut_filtered_reverse_reads, final_reverse_reads))
                if retain_single_end_reads and not input_is_se:
                    os.system("cat %s %s > %s" %
                              (facut_filtered_forward_se_reads,
                               facut_filtered_reverse_se_reads,
                               final_forward_se_reads))

                    #os.system("ln %s %s" % (facut_filtered_forward_se_reads, final_forward_se_reads))
                    #os.system("ln %s %s" % (facut_filtered_reverse_se_reads, final_reverse_se_reads))

                if input_is_se:
                    pass
                    #os.system("ln %s %s" % (coockie_trimmomatic_filtered_se_reads, final_se_reads))

            else:
                os.system("ln %s %s" %
                          (coockie_trimmomatic_filtered_paired_forward_reads,
                           final_forward_reads))
                os.system("ln %s %s" %
                          (coockie_trimmomatic_filtered_paired_reverse_reads,
                           final_reverse_reads))
                if retain_single_end_reads and not input_is_se:
                    os.system(
                        "cat %s %s > %s" %
                        (coockie_trimmomatic_filtered_unpaired_forward_reads,
                         coockie_trimmomatic_filtered_unpaired_reverse_reads,
                         final_forward_se_reads))
                    """
                    os.system("ln %s %s" % (coockie_trimmomatic_filtered_unpaired_forward_reads, final_forward_se_reads))
                    os.system("ln %s %s" % (coockie_trimmomatic_filtered_unpaired_reverse_reads, final_reverse_se_reads))

                    """
                if input_is_se:
                    os.system("ln %s %s" %
                              (coockie_trimmomatic_filtered_se_reads,
                               final_se_reads))
                filtering_statistics[sample][
                    "pairs_survived_after_filtration,%"] = float(
                        "%.2f" %
                        (float(trimmomatic_report.stats[
                            "surviving" if input_is_se else "both_surviving"])
                         / filtering_statistics[sample]["raw_pairs"] * 100))

            print(filtering_statistics.table_form())

            if remove_intermediate_files:
                shutil.rmtree(merged_raw_sample_dir)
                shutil.rmtree(coockie_filtered_sample_dir)
                shutil.rmtree(coockie_trimmomatic_filtered_sample_dir)
                shutil.rmtree(coockie_trimmomatic_quality_filtered_sample_dir)

        if remove_intermediate_files:
            shutil.rmtree(coockie_filtered_dir)
            shutil.rmtree(coockie_trimmomatic_filtered_dir)
            shutil.rmtree(coockie_trimmomatic_quality_filtered_dir)
            shutil.rmtree(merged_raw_dir)

        filtering_statistics.write(general_stat_file, sort=False)
Esempio n. 13
0
    def draw_variant_window_densities(self, count_df, scaffold_length_dict, window_size, window_step, output_prefix,
                                      masking_dict=None,
                                      gap_fraction_threshold=0.4,
                                      record_style=None, ext_list=("svg", "png"),
                                      label_fontsize=13, left_offset=0.2, figure_width=12,
                                      figure_height_scale_factor=0.5, scaffold_synonym_dict=None,
                                      id_replacement_mode="partial", suptitle=None, density_multiplicator=1000,
                                      scaffold_black_list=[], sort_scaffolds=False, scaffold_ordered_list=None,
                                      scaffold_white_list=[], add_sample_name_to_labels=False,
                                      dist_between_scaffolds_scaling_factor=1,
                                      gap_color="grey",
                                      masked_color="grey", no_snp_color="white",
                                      colormap=None,
                                      colors=("#333a97", "#3d3795","#5d3393", "#813193", "#9d2d7f", "#b82861",
                                                        "#d33845", "#ea2e2e", "#f5ae27"),
                                      thresholds=(0.0, 0.1, 0.5, 0.75, 1.0, 1.25, 1.5, 2.0, 2.5),
                                      colormap_tuple_list=((0.0, "#333a97"), (0.1, "#3d3795"), (0.5, "#5d3393"),
                                                           (0.75, "#813193"), (1.0, "#9d2d7f"), (1.25, "#b82861"),
                                                           (1.5, "#d33845"), (2.0, "#ea2e2e"), (2.5, "#f5ae27"))):
        """
        Draw per-window variant densities of every scaffold for every sample and save the figure.

        One horizontal track is drawn per (scaffold, sample) pair. Each window of a track is a rectangle
        coloured by its variant density: count * density_multiplicator / number of unmasked positions.

        Files written:
            <output_prefix>.<ext> for every ext in ext_list      - the figure
            <output_prefix>.masked_regions                       - one line per window drawn as masked
            <output_prefix>.no_snps.windows.count                - per sample/scaffold count of no-SNP windows
            <output_prefix>.masked.windows.count                 - per sample/scaffold count of masked windows

        :param count_df: table of variant counts with one column per sample; its index must have a 'CHROM'
                         level and is addressed as (scaffold, window index)
                         (presumably a pandas DataFrame with a two-level index - confirm against callers)
        :param scaffold_length_dict: scaffold id -> scaffold length, used for the track outline and x limits
        :param window_size: width of a window
        :param window_step: distance between starts of consecutive windows
        :param output_prefix: prefix of all output files
        :param masking_dict: optional scaffold id -> container giving the number of masked positions
                             for each window index; scaffolds absent from it are treated as unmasked
        :param gap_fraction_threshold: a window is drawn as masked if its masked fraction exceeds this value
        :param ext_list: figure file extensions to save
        :param left_offset: left margin passed to plt.subplots_adjust
        :param figure_width: figure width
        :param figure_height_scale_factor: figure height per track
        :param scaffold_synonym_dict: optional mapping used to replace scaffold ids in track labels
        :param id_replacement_mode: "exact" (key equals scaffold id) or "partial" (key is a substring of it)
        :param suptitle: optional figure title
        :param density_multiplicator: factor applied to counts before dividing by the unmasked length
        :param scaffold_black_list, sort_scaffolds, scaffold_ordered_list, scaffold_white_list:
                             forwarded unchanged to self.get_filtered_scaffold_list (not modified here)
        :param add_sample_name_to_labels: if True track labels are "<scaffold label> (<sample>)"
        :param dist_between_scaffolds_scaling_factor: extra spacing (>= 1) before the first track of a scaffold
        :param masked_color: colour of masked windows
        :param no_snp_color: colour of windows whose density does not exceed the lowest boundary
        :param colormap: name of a matplotlib colormap; if set, it is sampled at len(thresholds) levels and
                         thresholds are used as bin boundaries, otherwise colormap_tuple_list is used
        :param thresholds: density boundaries used together with colormap
        :param colormap_tuple_list: (lower boundary, colour) pairs used when colormap is not set
        :param record_style, label_fontsize, gap_color, colors: accepted but not used by this method
        :raises ValueError: if dist_between_scaffolds_scaling_factor < 1, if no scaffold is left after
                            filtering, or if id_replacement_mode is unknown
        """

        if dist_between_scaffolds_scaling_factor < 1:
            raise ValueError("Scaling factor for distance between scaffolds have to be >=1.0")

        final_scaffold_list = self.get_filtered_scaffold_list(count_df.index.get_level_values('CHROM').unique().to_list(),
                                                              scaffold_black_list=scaffold_black_list,
                                                              sort_scaffolds=sort_scaffolds,
                                                              scaffold_ordered_list=scaffold_ordered_list,
                                                              scaffold_white_list=scaffold_white_list)
        scaffold_number = len(final_scaffold_list)
        if scaffold_number == 0:
            # max() below would fail with an uninformative message
            raise ValueError("No scaffolds left to draw after filtering")

        max_scaffold_length = max([scaffold_length_dict[scaf] for scaf in final_scaffold_list])
        _, sample_number = np.shape(count_df)

        # int() truncation could give a zero height for a small number of tracks, so clamp to 1
        figure_height = max(1, int(figure_height_scale_factor * scaffold_number * sample_number))
        plt.figure(figsize=(figure_width, figure_height))
        subplot = plt.subplot(1, 1, 1)

        subplot.get_yaxis().set_visible(False)
        subplot.spines['right'].set_color('none')
        subplot.spines['left'].set_color('none')
        subplot.spines['top'].set_color('none')

        scaffold_height = 10
        dist_between_scaffolds = 5
        start_y = - dist_between_scaffolds
        label_line_y_shift = int(scaffold_height/2)

        masked_windows_count_dict = TwoLvlDict()
        no_snps_windows_count_dict = TwoLvlDict()

        for sample in count_df:
            masked_windows_count_dict[sample] = OrderedDict()
            no_snps_windows_count_dict[sample] = OrderedDict()

        cmap = plt.get_cmap(colormap, len(thresholds)) if colormap else None

        def get_scaffold_label(scaffold):
            """Return the track label for scaffold, applying scaffold_synonym_dict if it was given."""
            if not scaffold_synonym_dict:
                return scaffold
            if id_replacement_mode == "exact":
                if scaffold in scaffold_synonym_dict:
                    return scaffold_synonym_dict[scaffold]
                print("WARNING!!! Synonym for %s was not found" % scaffold)
                return scaffold
            if id_replacement_mode == "partial":
                partial_syn_list = [partial_syn for partial_syn in scaffold_synonym_dict if partial_syn in scaffold]
                if len(partial_syn_list) > 1:
                    print("WARNING!!! More than one possible replacement for %s was found: %s. No replacement then." % (scaffold, ",".join(partial_syn_list)))
                    return scaffold
                if not partial_syn_list:
                    print("WARNING!!! Synonym for %s was not found" % scaffold)
                    return scaffold
                return scaffold_synonym_dict[partial_syn_list[0]]
            raise ValueError("Unknown id replacement mode")

        def get_density_color(variant_density):
            """Return (colour, is_no_snp_window) for an unmasked window with the given density."""
            if colormap:
                if variant_density <= thresholds[0]:
                    return no_snp_color, True
                for threshold_index in range(0, len(thresholds) - 1):
                    if thresholds[threshold_index] < variant_density <= thresholds[threshold_index + 1]:
                        return cmap(threshold_index), False
                # above the highest boundary; indexing by length also works for a single threshold
                return cmap(len(thresholds) - 1), False

            if variant_density <= colormap_tuple_list[0][0]:
                return no_snp_color, True
            # colour of the last boundary that lies strictly below the density
            density_color = None
            for lower_boundary, color in colormap_tuple_list:
                if variant_density <= lower_boundary:
                    break
                density_color = color
            return density_color, False

        # 'with' guarantees that the report is flushed and closed even if drawing fails
        with open("%s.masked_regions" % output_prefix, "w") as masked_regions_fd:
            masked_regions_fd.write("#scaffold\twindow\tmasked_position\tmasked_position,fraction\n")
            for scaffold in final_scaffold_list:
                # scaffolds without masking data are treated as completely unmasked
                if masking_dict and (scaffold in masking_dict):
                    scaffold_masking = masking_dict[scaffold]
                else:
                    scaffold_masking = None

                for sample_index, sample in enumerate(count_df):
                    masked_windows_count_dict[sample][scaffold] = 0
                    no_snps_windows_count_dict[sample][scaffold] = 0

                    # the first track of every scaffold gets the (possibly enlarged) inter-scaffold gap
                    start_y += scaffold_height + dist_between_scaffolds * (dist_between_scaffolds_scaling_factor if sample_index == 0 else 1)
                    label_y_start = label_line_y_shift + start_y

                    scaffold_label = get_scaffold_label(scaffold)
                    # the resolved label (not the raw id) is used in both label forms
                    subplot.annotate(("%s (%s)" % (scaffold_label, sample)) if add_sample_name_to_labels else scaffold_label,
                                     xy=(0, label_y_start), xycoords='data', fontsize=16,
                                     xytext=(-15, 1.5 * label_line_y_shift), textcoords='offset points',
                                     ha='right', va='top')

                    if scaffold in count_df[sample]:
                        for window_index in count_df.loc[scaffold].index:
                            window_start = window_index * window_step

                            if scaffold_masking is None:
                                masked_length = 0
                                is_masked = False
                            else:
                                masked_length = scaffold_masking[window_index]
                                # masked if nothing is left unmasked or the masked fraction is too high
                                is_masked = (window_size - masked_length <= 0) or \
                                            (float(masked_length) / float(window_size) > gap_fraction_threshold)

                            # window state is tracked explicitly instead of being guessed from the colour,
                            # so coinciding colours can neither distort the counts nor break the report
                            if is_masked:
                                window_color = masked_color
                                masked_windows_count_dict[sample][scaffold] += 1
                                masked_regions_fd.write("%s\t%i\t%i\t%f\n" % (scaffold, window_index, masked_length,
                                                                              float(masked_length) / float(window_size)))
                            else:
                                variant_density = float(count_df[sample].loc[scaffold, window_index] * density_multiplicator) / float(window_size - masked_length)
                                window_color, is_no_snp_window = get_density_color(variant_density)
                                if is_no_snp_window:
                                    no_snps_windows_count_dict[sample][scaffold] += 1

                            window = Rectangle((window_start, start_y), window_size, scaffold_height, fill=True,
                                               edgecolor=None, facecolor=window_color, linewidth=0.0000000000001)
                            subplot.add_patch(window)

                    # outline of the whole scaffold
                    fragment = Rectangle((0, start_y), scaffold_length_dict[scaffold], scaffold_height, fill=False,
                                         edgecolor="black", facecolor=None, linewidth=0.5)
                    subplot.add_patch(fragment)

        # legend: stacked coloured squares to the right of the longest scaffold
        legend_x_position = int(max_scaffold_length * 1.05)
        legend_element_side = scaffold_height
        legend_label_x_position = legend_x_position + 2 * max_scaffold_length/64
        square_y_pos = int(start_y/2) - legend_element_side

        if colormap:
            legend_boundaries = list(thresholds)
            legend_colors = [cmap(i) for i in range(0, len(thresholds))]
        else:
            legend_boundaries = [entry[0] for entry in colormap_tuple_list]
            legend_colors = [entry[1] for entry in colormap_tuple_list]

        legend_entries = [(masked_color, "masked"), (no_snp_color, "no SNPs")]
        for i, color in enumerate(legend_colors):
            if i == (len(legend_boundaries) - 1):
                legend_element_label = "> %.2f" % legend_boundaries[i]
            else:
                legend_element_label = "%.2f - %.2f" % (legend_boundaries[i], legend_boundaries[i + 1])
            legend_entries.append((color, legend_element_label))

        for color, legend_label in legend_entries:
            square_y_pos += legend_element_side
            fragment = Rectangle((legend_x_position, square_y_pos), max_scaffold_length/64, legend_element_side, fill=True,
                                 edgecolor="black", facecolor=color, linewidth=0.5)
            subplot.add_patch(fragment)
            subplot.annotate(legend_label,
                             xy=(legend_label_x_position, square_y_pos), xycoords='data', fontsize=13,
                             xytext=(legend_label_x_position, square_y_pos),)

        plt.xlim(xmin=0, xmax=int(1.2 * max_scaffold_length))
        plt.ylim(ymin=0, ymax=start_y + 2 * scaffold_height)

        plt.subplots_adjust(left=left_offset, right=0.95)
        if suptitle:
            plt.suptitle(suptitle)
        for extension in ext_list:
            plt.savefig("%s.%s" % (output_prefix, extension))
        plt.close()

        no_snps_windows_count_dict.write("%s.no_snps.windows.count" % output_prefix)
        masked_windows_count_dict.write("%s.masked.windows.count" % output_prefix)
Esempio n. 14
0
                    dest="split_values",
                    help="Split values. Default: False")
# The default is a plain comma, as stated in the help text
# (it used to be ",'" - a comma followed by a stray apostrophe).
parser.add_argument("-s",
                    "--value_separator",
                    action="store",
                    dest="value_separator",
                    default=",",
                    help="Value separator. Default: ','")
parser.add_argument(
    "-g",
    "--ignore_value_repeats",
    action="store_true",
    dest="ignore_value_repeats",
    help=
    "Ignore repeats of values(i.e values that corresponds to same fl_key and sl_key) "
    "and don't raise exception. If yes value from first entry is stored. Default: False"
)

args = parser.parse_args()

# Build one two-level table from all input files, then write it out
# with the same placeholder for absent values that was used while reading.
combined_table = TwoLvlDict(input_file=args.files,
                            absent_symbol=args.absent_symbol,
                            split_values=args.split_values,
                            value_sep=args.value_separator,
                            ignore_value_repeats=args.ignore_value_repeats)
combined_table.write(args.output,
                     absent_symbol=args.absent_symbol,
                     close_after_if_file_object=False,
                     sort=False)
Esempio n. 15
0
    def star_and_htseq(self,
                       genome_dir,
                       samples_directory,
                       output_directory,
                       gff_for_htseq,
                       count_table_file_prefix,
                       genome_fasta=None,
                       samples_to_handle=None,
                       genome_size=None,
                       annotation_gtf=None,
                       feature_from_gtf_to_use_as_exon=None,
                       exon_tag_to_use_as_transcript_id=None,
                       exon_tag_to_use_as_gene_id=None,
                       length_of_sequences_flanking_junction=None,
                       junction_tab_file_list=None,
                       three_prime_trim=None,
                       five_prime_trim=None,
                       adapter_seq_for_three_prime_clip=None,
                       max_mismatch_percent_for_adapter_trimming=None,
                       three_prime_trim_after_adapter_clip=None,
                       output_type="BAM",
                       sort_bam=True,
                       max_memory_per_thread_for_bam_sorting="4G",
                       include_unmapped_reads_in_bam=True,
                       output_unmapped_reads=True,
                       two_pass_mode=False,
                       star_dir=None,
                       threads=1,
                       max_intron_length=None,
                       stranded_rnaseq="yes",
                       min_alignment_quality=10,
                       feature_type_for_htseq="exon",
                       feature_id_attribute_for_htseq="gene_id",
                       htseq_mode="union"):

        STAR.threads = threads
        STAR.path = star_dir

        if genome_fasta:
            STAR.index(genome_dir,
                       genome_fasta,
                       annotation_gtf=None,
                       junction_tab_file=None,
                       sjdboverhang=None,
                       genomeSAindexNbases=None,
                       genomeChrBinNbits=None,
                       genome_size=genome_size)

        sample_list = samples_to_handle if samples_to_handle else self.get_sample_list(
            samples_directory)
        self.prepare_diff_expression_directories(output_directory, sample_list)

        alignment_dir = "%s/alignment/" % output_directory

        count_pe_table = TwoLvlDict()
        count_se_table = TwoLvlDict()
        count_all_table = TwoLvlDict()
        count_pe_table_file = "%s/%s.pe.tab" % (output_directory,
                                                count_table_file_prefix)
        count_se_table_file = "%%s/%s.se.tab" % (output_directory,
                                                 count_table_file_prefix)
        count_all_table_file = "%s/%s.all.tab" % (output_directory,
                                                  count_table_file_prefix)

        for sample in sample_list:
            print("Handling %s" % sample)
            sample_dir = "%s/%s/" % (samples_directory, sample)
            alignment_sample_dir = "%s/%s/" % (alignment_dir, sample)
            alignment_sample_se_dir = "%s/se/" % alignment_sample_dir
            filetypes, forward_files, reverse_files, se_files = self.make_lists_forward_and_reverse_files(
                sample_dir)

            if se_files:
                self.safe_mkdir(alignment_sample_se_dir)

            print("\tAligning paired reads...")
            count_file = "%s/%s.htseq.count" % (alignment_sample_dir, sample)
            #"""
            STAR.align(
                genome_dir,
                forward_files,
                reverse_read_list=reverse_files,
                annotation_gtf=annotation_gtf,
                feature_from_gtf_to_use_as_exon=feature_from_gtf_to_use_as_exon,
                exon_tag_to_use_as_transcript_id=
                exon_tag_to_use_as_transcript_id,
                exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
                length_of_sequences_flanking_junction=
                length_of_sequences_flanking_junction,
                junction_tab_file_list=junction_tab_file_list,
                three_prime_trim=three_prime_trim,
                five_prime_trim=five_prime_trim,
                adapter_seq_for_three_prime_clip=
                adapter_seq_for_three_prime_clip,
                max_mismatch_percent_for_adapter_trimming=
                max_mismatch_percent_for_adapter_trimming,
                three_prime_trim_after_adapter_clip=
                three_prime_trim_after_adapter_clip,
                output_type=output_type,
                sort_bam=sort_bam,
                max_memory_per_thread_for_bam_sorting=
                max_memory_per_thread_for_bam_sorting,
                include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
                output_unmapped_reads=output_unmapped_reads,
                output_dir=alignment_sample_dir,
                two_pass_mode=two_pass_mode,
                max_intron_length=max_intron_length)

            alignment_file = "%s/Aligned.sortedByCoord.out.bam" % alignment_sample_dir

            print("\tIndexing alignment file for paired reads...")
            os.system("samtools index %s" % alignment_file)

            print("\tCounting paired reads aligned to features...")

            HTSeq.count(alignment_file,
                        gff_for_htseq,
                        count_file,
                        samtype="bam",
                        order="pos",
                        stranded_rnaseq=stranded_rnaseq,
                        min_alignment_quality=min_alignment_quality,
                        feature_type=feature_type_for_htseq,
                        feature_id_attribute=feature_id_attribute_for_htseq,
                        mode=htseq_mode,
                        suppress_progres_report=False)
            #"""
            sample_counts = SynDict(filename=count_file,
                                    header=False,
                                    separator="\t",
                                    allow_repeats_of_key=False,
                                    split_values=False,
                                    values_separator=",",
                                    key_index=0,
                                    value_index=1,
                                    close_after_if_file_object=False,
                                    expression=int,
                                    comments_prefix="__")
            count_pe_table[sample] = sample_counts

            if se_files:
                print("\tAligning single reads...")
                count_se_file = "%s/%s.htseq.count" % (alignment_sample_se_dir,
                                                       sample)
                #"""
                STAR.align(
                    genome_dir,
                    se_files,
                    reverse_read_list=None,
                    annotation_gtf=annotation_gtf,
                    feature_from_gtf_to_use_as_exon=
                    feature_from_gtf_to_use_as_exon,
                    exon_tag_to_use_as_transcript_id=
                    exon_tag_to_use_as_transcript_id,
                    exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
                    length_of_sequences_flanking_junction=
                    length_of_sequences_flanking_junction,
                    junction_tab_file_list=junction_tab_file_list,
                    three_prime_trim=three_prime_trim,
                    five_prime_trim=five_prime_trim,
                    adapter_seq_for_three_prime_clip=
                    adapter_seq_for_three_prime_clip,
                    max_mismatch_percent_for_adapter_trimming=
                    max_mismatch_percent_for_adapter_trimming,
                    three_prime_trim_after_adapter_clip=
                    three_prime_trim_after_adapter_clip,
                    output_type=output_type,
                    sort_bam=sort_bam,
                    max_memory_per_thread_for_bam_sorting=
                    max_memory_per_thread_for_bam_sorting,
                    include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
                    output_unmapped_reads=output_unmapped_reads,
                    output_dir=alignment_sample_se_dir,
                    two_pass_mode=two_pass_mode,
                    max_intron_length=max_intron_length)

                alignment_se_file = "%s/Aligned.sortedByCoord.out.bam" % alignment_sample_se_dir

                print("\tIndexing alignment file for single reads...")
                os.system("samtools index %s" % alignment_se_file)

                print("\tCounting single reads aligned to features...")

                HTSeq.count(
                    alignment_se_file,
                    gff_for_htseq,
                    count_se_file,
                    samtype="bam",
                    order="pos",
                    stranded_rnaseq=stranded_rnaseq,
                    min_alignment_quality=min_alignment_quality,
                    feature_type=feature_type_for_htseq,
                    feature_id_attribute=feature_id_attribute_for_htseq,
                    mode=htseq_mode,
                    suppress_progres_report=False)
                #"""

                sample_se_counts = SynDict(filename=count_se_file,
                                           header=False,
                                           separator="\t",
                                           allow_repeats_of_key=False,
                                           split_values=False,
                                           values_separator=",",
                                           key_index=0,
                                           value_index=1,
                                           close_after_if_file_object=False,
                                           expression=int,
                                           comments_prefix="__")

                count_se_table[sample] = sample_se_counts
            else:
                count_se_table[sample] = SynDict()
            count_all_table[sample] = SynDict()
            if se_files:
                for gene_id in set(sample_counts.keys()) | set(
                        sample_se_counts.keys()):
                    if (gene_id in sample_counts) and (gene_id
                                                       in sample_se_counts):
                        count_all_table[sample][gene_id] = sample_counts[
                            gene_id] + sample_se_counts[gene_id]
                    elif gene_id in sample_counts:
                        count_all_table[sample][gene_id] = sample_counts[
                            gene_id]
                    elif gene_id in sample_se_counts:
                        count_all_table[sample][gene_id] = sample_se_counts[
                            gene_id]
            else:
                count_all_table[sample] = count_pe_table[sample]

        count_pe_table.write(count_pe_table_file)
        count_se_table.write(count_se_table_file)
        count_all_table.write(count_all_table_file)
Esempio n. 16
0
                    dest="output",
                    required=True,
                    help="File to write statistics")
parser.add_argument(
    "-l",
    "--log_file",
    action="store",
    dest="log_file",
    default="trimmomatic.log",
    help="Name of files with trimmomatic log. Default - trimmomatic.log")

args = parser.parse_args()

# Candidate samples: the explicit comma-separated list when given, otherwise
# every entry of the samples directory. Sorted for a stable processing order.
samples = sorted(
    args.samples.split(",") if args.samples else os.listdir(args.samples_dir))

# Keep only candidates that really are directories. os.path.join is used so
# that samples_dir works both with and without a trailing path separator.
present_samples = [
    sample for sample in samples
    if os.path.isdir(os.path.join(args.samples_dir, sample))
]

# Outer key: sample name; inner value: whatever Trimmomatic.parse_log returns.
reports_dict = TwoLvlDict()

for sample in present_samples:
    print("Handling report from %s" % sample)

    sample_dir = os.path.join(args.samples_dir, sample)
    # Honour the --log_file option; its default ("trimmomatic.log") matches
    # the file name that used to be hard-coded here.
    trimmomatic_log = os.path.join(sample_dir, args.log_file)
    reports_dict[sample] = Trimmomatic.parse_log(trimmomatic_log)

reports_dict.write(args.output)
Esempio n. 17
0
                    cluster_3d_dict[cluster_3d.id]["intersection"].append(
                        intersection)
                    cluster_3d_dict[cluster_3d.id][
                        "intersection % of main cluster"].append(
                            intersection * 100 / cluster_3d.len)
                    cluster_3d_dict[
                        cluster_3d.id]["interscection % of clusters"].append(
                            intersection * 100 / cluster_3d_sub.len)

            cluster_3d_dict[cluster_3d.id]["total_intersection"] = sum(
                cluster_3d_dict[cluster_3d.id]
                ["intersection % of main cluster"]) if cluster_3d_dict[
                    cluster_3d.id]["intersection % of main cluster"] else 0

        cluster_3d_dict.write(
            "intersection_PmCDA1_3d_sub_and_nonsub_%i+_%.2f+.t" %
            (size, power))

        total_intersection = [
            cluster_3d_dict[cluster_id]["total_intersection"]
            for cluster_id in cluster_3d_dict
        ]
        print("Total %i" % len(total_intersection))
        print("No intersection %i" % total_intersection.count(0))
        print("Intersection %i" %
              (len(total_intersection) - total_intersection.count(0)))
        figure = plt.figure(1, figsize=(5, 5), dpi=300)
        subplot = plt.subplot(1, 1, 1)
        plt.hist(total_intersection)
        plt.xlabel("% of intersection")
        plt.ylabel("N")
Esempio n. 18
0
                    help="Directory with families of species")
"""
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file. Default: stdout")
"""
args = parser.parse_args()

# run after scripts/expansion/compare_cluster.py

# out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

# One entry per species, each loaded from <species_dir><species>/all.t.
species_syn_dict = TwoLvlDict()

for species in args.species_list:
    species_table_path = "%s%s/all.t" % (args.species_dir, species)
    species_syn_dict[species] = read_synonyms_dict(species_table_path)

# Snapshot of the full table before any filtering.
species_syn_dict.write("families_all_species.t", absent_symbol=".")

# NOTE(review): presumably filter_by_line trims species_syn_dict in place and
# returns the rejected lines; the second write below relies on that - confirm.
not_assembled = species_syn_dict.filter_by_line(is_assembled)
species_syn_dict.write("correctly_assembled_families_species.t",
                       absent_symbol=".")

# Collect the second-level keys of both tables, then dump them as id lists.
assembled_ids = IdSet(species_syn_dict.sl_keys())
not_assembled_ids = IdSet(not_assembled.sl_keys())
assembled_ids.write("assembled_families.ids")
not_assembled_ids.write("non_assembled_families.ids")

"""
if args.output != "stdout":
    out_fd.close()
"""
Esempio n. 19
0
            gene_name = tmp[10]
            substitution = tmp[8]
            if substitution == ".":  # skip substitutions not in CDS
                continue
            if gene_alias_dict:
                if gene_name in gene_alias_dict:
                    gene_name = gene_alias_dict[gene_name]
            if args.rem_nuc_sub:
                substitution = substitution.split("/")[0][2:]
                if args.convert_to_single_letter:
                    ref_aa = seq1(substitution[:3])
                    try:
                        if substitution[-1] == "*":
                            alt_aa = "*"
                            pos = substitution[3:-1]
                        else:
                            alt_aa = seq1(substitution[-3:])
                            pos = substitution[3:-3]
                        substitution = "%s%s%s" % (ref_aa, pos, alt_aa)
                    except:
                        print(substitution, "aaa", filename, gene_name)
            if gene_name not in summary_dict[name]:
                summary_dict[name][gene_name] = [substitution]
            else:
                summary_dict[name][gene_name].append(substitution)

# Emit the collected table; cells with no value are printed as ".".
summary_dict.write(out_fd, absent_symbol=".")

# Only close the handle when output was not directed to "stdout".
writing_to_stdout = args.output == "stdout"
if not writing_to_stdout:
    out_fd.close()

Esempio n. 20
0
# Dump gene_dict as a flat three-column table: gene, sub-feature, value.
test_table_path = "%s_test.t" % args.prefix
with open(test_table_path, "w") as out_fd:
    for gene in gene_dict:
        sub_features = gene_dict[gene]
        for sub_feature in sub_features:
            row = (gene, sub_feature, sub_features[sub_feature])
            out_fd.write("%s\t%s\t%i\n" % row)

lengths_dict = get_feature_lengths(record_dict)

# For every record, count how many lengths were collected per feature type.
count_dict = TwoLvlDict({})
for record in lengths_dict:
    record_lengths = lengths_dict[record]
    count_dict[record] = {
        feature_type: len(record_lengths[feature_type])
        for feature_type in record_lengths
    }

count_dict.write("%s_counts.t" % args.prefix)

feature_lengths_path = "%s_feature_lengths.t" % args.prefix
total_lengths = get_total_feature_lengths(lengths_dict,
                                          out_filename=feature_lengths_path)

white_list = ["five_prime_UTR", "three_prime_UTR", "CDS", "ncRNA"]

# snoRNA and snRNA are both passed as synonyms of ncRNA when collapsing.
ncrna_synonyms = {
    "snoRNA": "ncRNA",
    "snRNA": "ncRNA"
}
collapsed_dict = feature_lengths_collapse_records(lengths_dict,
                                                  synonym_dict=ncrna_synonyms)

# Replace each value with a numpy array of the same data.
for feature in collapsed_dict:
    feature_lengths = collapsed_dict[feature]
    collapsed_dict[feature] = np.array(feature_lengths)

bin_dict = {
Esempio n. 21
0
                                            "general_tree.nwk")

# When a synonym file is supplied, rename matching tree nodes and write the
# renamed tree under a separate file name.
if args.species_synonym_file:
    synonyms_dict = read_synonyms_dict(
        args.species_synonym_file,
        header=False,
        separator="\t",
        split_values=False,
    )
    for node in cafe_report.general_data.tree.traverse():
        # Nodes without a synonym keep their current name.
        if node.name not in synonyms_dict:
            continue
        node.name = synonyms_dict[node.name]
    latin_tree_path = general_trees_dir + "general_tree_latin.nwk"
    cafe_report.general_data.write_general_tree(latin_tree_path)

# The drawing calls run whether or not synonyms were applied.
cafe_report.general_data.draw_expansion_contraction()
cafe_report.general_data.draw_significant_expansion_contraction()
"""
with open(background_genes_dir + "background_genes.t", "w") as back_fd:
    with open(background_genes_dir + "background_genes_list.txt", "w") as back_list_fd:
        back_fd.write("#id\tfamaliy_p_value\tref_gene\n")
        for record in filtered_out_report:
            #print(record)
            if reference_genes_dict[record.id][0]:
                random_reference_gene = choice(reference_genes_dict[record.id][0])
                back_list_fd.write(random_reference_gene + "\n")
            else:
                random_reference_gene = "."
            back_string = "%s\t%f\t%s\n" % (record.id, record.family_p_value, random_reference_gene)
            back_fd.write(back_string)
"""
statistics_dict.write(statistics_dir + "node_statistics.t", absent_symbol=".")