Example #1
0
    def count_reads_and_bases(self, fastq_file_list, stat_file=None):
        """Count reads and bases in one or more FASTQ files.

        :param fastq_file_list: single filename or iterable of filenames.
        :param stat_file: optional path; if set, the counts table is also
            written to this file (it is always printed via counts.write()).
        :return: None; results are emitted through TwoLvlDict.write().
        """
        fastq_list = [fastq_file_list] if isinstance(fastq_file_list,
                                                     str) else fastq_file_list

        counts = TwoLvlDict()

        for fastq_file in fastq_list:
            counts[fastq_file] = OrderedDict()
            counts[fastq_file]["Reads"] = 0
            counts[fastq_file]["Bases"] = 0

        for fastq_file in fastq_list:
            with self.metaopen(fastq_file, "r") as fastq_fd:
                # Each FASTQ record is 4 lines: header, sequence, '+', quality.
                # The for-loop consumes the header; the explicit next() calls
                # consume the remaining three lines of the record.
                for line in fastq_fd:
                    # BUGFIX: file.next() is Python-2-only; the builtin next()
                    # works on both Python 2 and 3 file iterators.
                    counts[fastq_file]["Bases"] += len(next(fastq_fd))
                    counts[fastq_file]["Reads"] += 1
                    next(fastq_fd)  # skip '+' separator line
                    next(fastq_fd)  # skip quality line

                # discount the trailing "\n" counted once per sequence line
                counts[fastq_file]["Bases"] -= counts[fastq_file]["Reads"]

        counts.write()

        if stat_file:
            counts.write(stat_file)
Example #2
0
def results_extraction_listener(queue,
                                output_file_prefix,
                                selected_species_list=None):
    """Listen for messages on the queue and write results to files.

    Messages are (sample, payload) pairs: an int payload is treated as an
    error code and logged to errors.err; a truthy mapping payload is merged
    into the positive-selection tables.  The literal message 'finish'
    flushes both tables to disk and stops the listener.
    """
    positive_selection_dict = TwoLvlDict()
    selected_species_positive_selection_dict = TwoLvlDict()
    error_fd = open("errors.err", "w")
    try:
        error_fd.write("#sample\terror_code\n")
        while 1:
            result = queue.get()
            # a worker reported a failure for this sample: log the error code
            if isinstance(result[1], int):
                error_fd.write("%s\t%i\n" % (result[0], result[1]))
                continue
            if result == 'finish':
                positive_selection_dict.write("%s.all" % output_file_prefix,
                                              absent_symbol=".")
                if selected_species_list:
                    selected_species_positive_selection_dict.write(
                        "%s.selected_species" % output_file_prefix,
                        absent_symbol=".")
                # print positive_selection_dict.table_form(absent_symbol=".")
                break
            if result[1]:
                positive_selection_dict[result[0]] = result[1]
                if selected_species_list:
                    for species in selected_species_list:
                        if species in result[1]:
                            if result[
                                    0] not in selected_species_positive_selection_dict:
                                selected_species_positive_selection_dict[
                                    result[0]] = {}
                            selected_species_positive_selection_dict[
                                result[0]][species] = result[1][species]
    finally:
        # BUGFIX: the original never closed error_fd (resource leak); close it
        # even if the loop exits via an exception
        error_fd.close()
Example #3
0
    def count_locations(self,
                        annotation_black_list=(),
                        allow_several_counts_of_record=False,
                        out_filename="location_counts.t",
                        write=True,
                        count_dir="location_counts"):
        """Count record locations per region.

        :param annotation_black_list: locations to exclude from counting
            (BUGFIX: was a mutable default ``[]``; an immutable tuple avoids
            accidental cross-call sharing and is only used for membership tests).
        :param allow_several_counts_of_record: count each location of a record
            separately instead of the sorted "/"-joined combination.
        :param out_filename: name of the output table inside ``count_dir``.
        :param write: if True, write the table to ``count_dir/out_filename``.
        :param count_dir: directory to create and write into.
        :return: TwoLvlDict mapping region -> location -> count.
        """
        os.system("mkdir -p %s" % count_dir)
        regions_dict = self._split_regions()
        region_counts_dict = TwoLvlDict({})
        for region in regions_dict:
            # BUGFIX: "unknown" was never initialized, so the += below raised
            # KeyError for records lacking a location
            count_locations_dict = {"igc": 0, "unknown": 0}

            for record in regions_dict[region]:
                # BUGFIX: test key presence BEFORE subscripting; the original
                # accessed record.description["Loc"] first and raised KeyError
                if ("Loc" not in record.description) or (
                        not record.description["Loc"]):
                    count_locations_dict["unknown"] += 1
                    continue
                if allow_several_counts_of_record:
                    # each non-blacklisted location counted independently
                    for location in record.description["Loc"]:
                        if location in annotation_black_list:
                            continue
                        count_locations_dict[location] = \
                            count_locations_dict.get(location, 0) + 1
                else:
                    # count the record once under its sorted combined location
                    full_location = sorted(
                        location for location in record.description["Loc"]
                        if location not in annotation_black_list)
                    if not full_location:
                        continue
                    full_location = "/".join(full_location)
                    count_locations_dict[full_location] = \
                        count_locations_dict.get(full_location, 0) + 1

            labels = []
            counts = []
            for location in count_locations_dict:
                # drop zero counts (e.g. unused "igc"/"unknown" buckets) and
                # blacklisted locations from the output
                if count_locations_dict[
                        location] == 0 or location in annotation_black_list:
                    continue
                labels.append(location)
                counts.append(count_locations_dict[location])
            region_counts_dict[region] = OrderedDict(zip(labels, counts))

        if write:
            region_counts_dict.write("%s/%s" % (count_dir, out_filename))
        return region_counts_dict
Example #4
0
    def count_types(self,
                    output_file=None,
                    total_output_file=None,
                    return_mode="chrom"):
        """Tally annotated record types per chromosome and in total.

        :param output_file: optional path for the per-chromosome table.
        :param total_output_file: optional path for the complete/partial totals.
        :param return_mode: 'chrom', 'total' or 'both' — selects which
            dictionaries are returned.
        :raises ValueError: on an unrecognized return_mode.
        """
        annotated_types = self.get_annotated_types()
        count_dict = TwoLvlDict()
        total_count_dict = OrderedDict()

        # pre-seed totals so every annotated type appears even with zero hits
        for feature_type in annotated_types:
            total_count_dict[feature_type] = OrderedDict([("complete", 0),
                                                          ("partial", 0)])

        for chrom in self.records:
            count_dict[chrom] = OrderedDict(
                (feature_type, 0) for feature_type in annotated_types)

        for chrom in self.records:
            for record in self.records[chrom]:
                count_dict[chrom][record.type] += 1
                completeness = "partial" if record.partial else "complete"
                total_count_dict[record.type][completeness] += 1

        if output_file:
            count_dict.write(output_file)

        if total_output_file:
            with open(total_output_file, "w") as out_fd:
                threshold_note_gt = ("(>%.2f of expected length)" %
                                     self.partial_threshold
                                     if self.partial_threshold else "")
                threshold_note_lt = ("(<%.2f of expected length)" %
                                     self.partial_threshold
                                     if self.partial_threshold else "")
                out_fd.write("#rRNA\tComplete%s\tPartial%s\n" %
                             (threshold_note_gt, threshold_note_lt))
                for feature_type in total_count_dict:
                    totals = total_count_dict[feature_type]
                    out_fd.write("%s\t%i\t%i\n" % (feature_type,
                                                   totals["complete"],
                                                   totals["partial"]))

        if return_mode == "chrom":
            return count_dict
        elif return_mode == "total":
            return total_count_dict
        elif return_mode == "both":
            return count_dict, total_count_dict
        else:
            raise ValueError(
                "Unknown return type. Allowed variants: 'chrom', 'total', 'both'"
            )
Example #5
0
    def get_leaf_values(self, write=True):
        """Collect per-leaf distances from the dN, dS and W trees.

        :param write: if True, also write the table to 'leaf_values.t'.
        :return: TwoLvlDict keyed by metric name ('dN', 'dS', 'W').
        """
        leaf_values_dict = TwoLvlDict()
        # extract the distance dictionary from each tree, in a fixed order
        for metric, tree in (("dN", self.dNtree),
                             ("dS", self.dStree),
                             ("W", self.Wtree)):
            leaf_values_dict[metric] = self._get_tree_dist_dict(tree)

        if write:
            leaf_values_dict.write("leaf_values.t")
        return leaf_values_dict
Example #6
0
    def combine_count_files(count_file_list, output_file, sample_name_list=None):
        """Merge per-sample count files into a single two-level count table.

        :param count_file_list: list of count-file paths to merge.
        :param output_file: path for the combined table.
        :param sample_name_list: optional sample labels; must match
            count_file_list in length, otherwise ValueError is raised.
        """
        if sample_name_list is not None and \
                len(count_file_list) != len(sample_name_list):
            raise ValueError("Several files doesn't have corresponding sample name")

        # label columns by sample name when provided, by filename otherwise
        labels = sample_name_list if sample_name_list else count_file_list

        count_table = TwoLvlDict()
        for sample, filename in zip(labels, count_file_list):
            count_table[sample] = SynDict(filename=filename, header=False,
                                          separator="\t",
                                          allow_repeats_of_key=False,
                                          split_values=False,
                                          values_separator=",",
                                          key_index=0, value_index=1,
                                          close_after_if_file_object=False,
                                          expression=None,
                                          comments_prefix="__")

        count_table.write(output_file)
Example #7
0
number_of_bins = len(bins) - 1

# pad the trailing bins with zeroes so every assembly has the same bin count
for assembly in assembly_contig_cumulative_length:
    missing_bins = number_of_bins - len(
        assembly_contig_cumulative_length[assembly])
    if missing_bins > 0:
        assembly_contig_cumulative_length[assembly] += [0] * missing_bins
        assembly_contig_number_values[assembly] += [0] * missing_bins

# write per-assembly statistics tables
assembly_N50_dict.write("%s.N50" % args.output_prefix)
assembly_L50.write("%s.L50" % args.output_prefix)
assembly_general_stats.write("%s.general" % args.output_prefix)
assembly_lengths.write("%s.lengths" % args.output_prefix)
#assembly_bins.write("%s.bins" % args.output_prefix)
#print(assembly_contig_cumulative_length)
#assembly_contig_cumulative_length.write("%s.cumulative_length" % args.output_prefix)
#assembly_contig_number_values.write("%s.contig_number_values" % args.output_prefix)

# histogram of contig lengths, one series per assembly
fig = plt.figure(figsize=(12, 6))
subplot_1 = plt.subplot(1, 2, 1)

plt.hist(
    [assembly_length_array[key] for key in assembly_length_array],
    bins,
    label=assembly_length_array.keys())
Example #8
0
"""
if "HAP" not in sample:
    filtered.heatmap_statistics(filename="%s/%s_3+_power_0.05+_heatmap_statistics.svg" % (clustering_dir,sample_adjusted),
                                         additional_data=("Median", "Mean", "Power", "Homogeneity"))
    #filtered_out.heatmap_statistics(filename="%s/%s_3+_power_less_0.05_heatmap_statistics.svg" % (clustering_dir, sample_adjusted),
    #                                         additional_data=("Median", "Mean", "Power", "Homogeneity"))
else:
    filtered.heatmap_statistics(filename="%s/%s_3+_power_0.05+_heatmap_statistics.svg" % (clustering_dir,sample_adjusted),
                                         additional_data=("Median", "Mean", "Power"))
    #filtered_out.heatmap_statistics(filename="%s/%s_3+_power_less_0.05_heatmap_statistics.svg" % (clustering_dir, sample_adjusted),
    #                                         additional_data=("Median", "Mean", "Power"))
"""
filtered, filtered_out = filtered.filter(filter_by_power_10)
filtered.write("%s/%s_adjusted_size_3+_power_0.1+.ccf" %
               (clustering_dir, sample))
filtered_out.write("%s/%s_adjusted_size_3+_power_0.05+_less_0.1.ccf" %
                   (clustering_dir, sample))
"""
if "HAP" not in sample:
    filtered.heatmap_statistics(filename="%s/%s_3+_power_0.10+_heatmap_statistics.svg" % (clustering_dir,sample_adjusted),
                                         additional_data=("Median", "Mean", "Power", "Homogeneity"))
    #filtered_out.heatmap_statistics(filename="%s/%s_3+_power_0.05+_less_0.1_heatmap_statistics.svg" % (clustering_dir, sample_adjusted),
    #                                         additional_data=("Median", "Mean", "Power", "Homogeneity"))
else:
    filtered.heatmap_statistics(filename="%s/%s_3+_power_0.10+_heatmap_statistics.svg" % (clustering_dir,sample_adjusted),
                                         additional_data=("Median", "Mean", "Power"))
    #filtered_out.heatmap_statistics(filename="%s/%s_3+_power_0.05+_less_0.1_heatmap_statistics.svg" % (clustering_dir, sample_adjusted),
    #                                         additional_data=("Median", "Mean", "Power"))
"""
statistics_dict.write(out_filename="%s/%s_mutation_count_statistics.t" %
                      (clustering_dir, sample))
Example #9
0
with open("%s_test.t" % args.prefix, "w") as out_fd:
    # dump gene -> sub-feature -> count triples as a tab-separated table
    for gene, sub_feature_dict in gene_dict.items():
        for sub_feature, feature_count in sub_feature_dict.items():
            out_fd.write("%s\t%s\t%i\n" % (gene, sub_feature, feature_count))

lengths_dict = get_feature_lengths(record_dict)
# number of features of each type per record
count_dict = TwoLvlDict({})
for record in lengths_dict:
    count_dict[record] = {
        feature_type: len(lengths_dict[record][feature_type])
        for feature_type in lengths_dict[record]
    }

count_dict.write("%s_counts.t" % args.prefix)
total_lengths = get_total_feature_lengths(lengths_dict,
                                          out_filename="%s_feature_lengths.t" %
                                          args.prefix)

white_list = ["five_prime_UTR", "three_prime_UTR", "CDS", "ncRNA"]
# merge snoRNA/snRNA into the generic ncRNA category
collapsed_dict = feature_lengths_collapse_records(
    lengths_dict,
    synonym_dict={"snoRNA": "ncRNA", "snRNA": "ncRNA"})

# convert length lists to numpy arrays for downstream statistics
for feature in collapsed_dict:
    collapsed_dict[feature] = np.array(collapsed_dict[feature])

bin_dict = {
Example #10
0
        # collect gene ids annotated on variants of the sub-clusters
        for cluster_3d_sub in PmCDA1_3d_sub_clusters:
            for variant in cluster_3d_sub:
                if "Genes" in variant.info_dict:
                    for gene in variant.info_dict["Genes"]:
                        cluster_3d_sub_set.add(gene)

        print("PmCDA1 3d : %i" % len(cluster_3d_set))
        print("PmCDA1 3d sub: %i" % len(cluster_3d_sub_set))
        # genes shared between the two cluster gene sets
        intersection = cluster_3d_set & cluster_3d_sub_set
        print("Intersection: % i" % len(intersection))

        n_intersection_genes = len(intersection)
        n_cluster_3d_genes = len(cluster_3d_set)
        n_cluster_3d_sub_genes = len(cluster_3d_sub_set)

        #print intersection
        #print cluster_3d_set
        #print cluster_3d_sub_set

        # percentage of 3d-cluster genes also present in the sub-cluster set
        overlap_clusters_percent[size][power] = 100 * float(
            len(intersection)) / float(len(cluster_3d_set))

        # NOTE(review): presumably a hypergeometric-test p-value for the gene
        # overlap — confirm hypergeom()'s argument order against its definition
        p_value = hypergeom(n_intersection_genes, totaly_genes,
                            n_cluster_3d_genes, n_cluster_3d_sub_genes)
        test_fd.write("%i\t%.2f\t%i\t%i\t%i\t%i\t%e\n" %
                      (size, power, totaly_genes, n_cluster_3d_genes,
                       n_cluster_3d_sub_genes, n_intersection_genes, p_value))

overlap_clusters_percent.write("overlap_clusters_percent_genes.t")

test_fd.close()
                    required=True,
                    help="Comma-separated list of species")
parser.add_argument("-d", "--species_dir",
                    action="store", dest="species_dir",
                    default="./", type=check_path,
                    help="Directory with per species statistics")
parser.add_argument("-o", "--output_file",
                    action="store", dest="output",
                    default="stdout",
                    help="Output file. Default: stdout")

args = parser.parse_args()

# "stdout" is a sentinel meaning "write to standard output"
out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

species_stat_dict = TwoLvlDict()

# read the per-species stat table (tab-separated key/value rows)
for species in args.species_list:
    with open("%s%s/stat.t" % (args.species_dir, species), "r") as stat_fd:
        table_rows = [row.strip().split("\t") for row in stat_fd.readlines()]
    species_stat_dict[species] = OrderedDict(table_rows)

species_stat_dict.write(out_fd)
if args.output != "stdout":
    out_fd.close()
                    dest="output",
                    required=True,
                    help="File to write statistics")
parser.add_argument(
    "-l",
    "--log_file",
    action="store",
    dest="log_file",
    default="trimmomatic.log",
    help="Name of files with trimmomatic log. Default - trimmomatic.log")

args = parser.parse_args()

# use the explicit sample list if given, otherwise every entry in samples_dir
samples = sorted(
    args.samples.split(",") if args.samples else os.listdir(args.samples_dir))
present_samples = []
for sample in samples:
    if os.path.isdir(args.samples_dir + sample):
        present_samples.append(sample)

reports_dict = TwoLvlDict()

for sample in present_samples:
    print("Handling report from %s" % sample)

    sample_dir = "%s%s/" % (args.samples_dir, sample)
    # BUGFIX: honor the --log_file option; the original hard-coded
    # "trimmomatic.log", silently ignoring the parsed args.log_file value
    # (default unchanged, so default behavior is identical)
    trimmomatic_log = "%s/%s" % (sample_dir, args.log_file)
    reports_dict[sample] = Trimmomatic.parse_log(trimmomatic_log)

reports_dict.write(args.output)
Example #13
0
"""
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file. Default: stdout")
"""
args = parser.parse_args()

# run after scripts/expansion/compare_cluster.py

# out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

species_syn_dict = TwoLvlDict()

for species in args.species_list:
    species_syn_dict[species] = read_synonyms_dict("%s%s/all.t" %
                                                   (args.species_dir, species))

species_syn_dict.write("families_all_species.t", absent_symbol=".")

not_assembled = species_syn_dict.filter_by_line(is_assembled)
species_syn_dict.write("correctly_assembled_families_species.t",
                       absent_symbol=".")

assembled_ids = IdSet(species_syn_dict.sl_keys())
assembled_ids.write("assembled_families.ids")
not_assembled_ids = IdSet(not_assembled.sl_keys())
not_assembled_ids.write("non_assembled_families.ids")
"""
if args.output != "stdout":
    out_fd.close()
"""
                                                       "interscection % of clusters": [],
                                                       "total_intersection": 0})
            # compare this main cluster against every sub-cluster on the same chromosome
            for cluster_3d_sub in PmCDA1_3d_sub_clusters:
                if cluster_3d.chrom != cluster_3d_sub.chrom:
                    continue
                intersection = get_intersection_length(cluster_3d.start, cluster_3d.end, cluster_3d_sub.start, cluster_3d_sub.end)
                if intersection > 0:
                    cluster_3d_dict[cluster_3d.id]["N of clusters"] += 1
                    cluster_3d_dict[cluster_3d.id]["length of clusters"].append(cluster_3d_sub.len)
                    cluster_3d_dict[cluster_3d.id]["intersection"].append(intersection)
                    # NOTE(review): under Python 2, if the lengths are ints these
                    # are integer divisions and the percentages are truncated —
                    # confirm whether float percentages were intended
                    cluster_3d_dict[cluster_3d.id]["intersection % of main cluster"].append(intersection * 100/cluster_3d.len)
                    cluster_3d_dict[cluster_3d.id]["interscection % of clusters"].append(intersection * 100/cluster_3d_sub.len)

            # total overlap of the main cluster, as a % of its own length
            cluster_3d_dict[cluster_3d.id]["total_intersection"] = sum(cluster_3d_dict[cluster_3d.id]["intersection % of main cluster"]) if cluster_3d_dict[cluster_3d.id]["intersection % of main cluster"] else 0

        cluster_3d_dict.write("intersection_PmCDA1_3d_sub_and_nonsub_%i+_%.2f+.t" % (size, power))

        total_intersection = [cluster_3d_dict[cluster_id]["total_intersection"] for cluster_id in cluster_3d_dict]
        print ("Total %i" % len(total_intersection))
        print("No intersection %i" % total_intersection.count(0))
        print("Intersection %i" % (len(total_intersection) - total_intersection.count(0)))
        # histogram of per-cluster total intersection percentages
        figure = plt.figure(1, figsize=(5, 5), dpi=300)
        subplot = plt.subplot(1, 1, 1)
        plt.hist(total_intersection)
        plt.xlabel("% of intersection")
        plt.ylabel("N")
        plt.xlim(xmin=0, xmax=100)


        plt.savefig("intersection_PmCDA1_3d_sub_and_nonsub_%i+_%.2f+.svg" % (size, power))
        plt.close()
Example #15
0
import argparse

from Routines import FileRoutines
from CustomCollections.GeneralCollections import TwoLvlDict

# Combine several two-level tables into one table written to the output file.

parser = argparse.ArgumentParser()

parser.add_argument("-f", "--files", action="store", dest="files", required=True,
                    type=FileRoutines.make_list_of_path_to_files_from_string,
                    help="Comma-separated list of files/directories with tables")
parser.add_argument("-o", "--output", action="store", dest="output", required=True,
                    help="Output file with combined table.")
parser.add_argument("-a", "--absent_symbol", action="store", dest="absent_symbol", default=".",
                    help="Symbol to be treated as absent value")
parser.add_argument("-v", "--split_values", action="store_true", dest="split_values",
                    help="Split values. Default: False")
# BUGFIX: default was ",'" (comma plus a stray apostrophe) while the help text
# documents the default as a plain comma — the apostrophe was a typo
parser.add_argument("-s", "--value_separator", action="store", dest="value_separator", default=",",
                    help="Value separator. Default: ','")
parser.add_argument("-g", "--ignore_value_repeats", action="store_true", dest="ignore_value_repeats",
                    help="Ignore repeats of values(i.e values that corresponds to same fl_key and sl_key) "
                         "and don't raise exception. If yes value from first entry is stored. Default: False")

args = parser.parse_args()

combined_table = TwoLvlDict(input_file=args.files, absent_symbol=args.absent_symbol,
                            split_values=args.split_values, value_sep=args.value_separator,
                            ignore_value_repeats=args.ignore_value_repeats)
#print combined_table
combined_table.write(args.output, absent_symbol=args.absent_symbol, close_after_if_file_object=False, sort=False)
Example #16
0
    def star_and_htseq(self,
                       genome_dir,
                       samples_directory,
                       output_directory,
                       gff_for_htseq,
                       count_table_file,
                       genome_fasta=None,
                       samples_to_handle=None,
                       genome_size=None,
                       annotation_gtf=None,
                       feature_from_gtf_to_use_as_exon=None,
                       exon_tag_to_use_as_transcript_id=None,
                       exon_tag_to_use_as_gene_id=None,
                       length_of_sequences_flanking_junction=None,
                       junction_tab_file_list=None,
                       three_prime_trim=None,
                       five_prime_trim=None,
                       adapter_seq_for_three_prime_clip=None,
                       max_mismatch_percent_for_adapter_trimming=None,
                       three_prime_trim_after_adapter_clip=None,
                       output_type="BAM",
                       sort_bam=True,
                       max_memory_for_bam_sorting=None,
                       include_unmapped_reads_in_bam=True,
                       output_unmapped_reads=True,
                       two_pass_mode=False,
                       star_dir=None,
                       threads=1,
                       max_intron_length=None,
                       stranded_rnaseq="yes",
                       min_alignment_quality=10,
                       feature_type_for_htseq="exon",
                       feature_id_attribute_for_htseq="gene_id",
                       htseq_mode="union"):
        """Align RNA-seq samples with STAR and count reads per feature with HTSeq.

        Pipeline per sample: STAR alignment into output_directory/alignment/<sample>/,
        samtools index of the sorted BAM, HTSeq-count against gff_for_htseq,
        then all per-sample counts are merged into a TwoLvlDict and written
        to count_table_file.  If genome_fasta is given, the STAR genome index
        in genome_dir is (re)built first.  Most keyword arguments are passed
        straight through to STAR.align / HTSeq.count.
        """
        STAR.threads = threads
        STAR.path = star_dir

        if genome_fasta:
            # NOTE(review): the index step passes annotation_gtf=None even when
            # an annotation_gtf argument was supplied to this method (it is only
            # used for the alignment step below) — confirm this is intentional
            STAR.index(genome_dir,
                       genome_fasta,
                       annotation_gtf=None,
                       junction_tab_file=None,
                       sjdboverhang=None,
                       genomeSAindexNbases=None,
                       genomeChrBinNbits=None,
                       genome_size=genome_size)

        # explicit sample list overrides auto-discovery in samples_directory
        sample_list = samples_to_handle if samples_to_handle else self.get_sample_list(
            samples_directory)
        self.prepare_diff_expression_directories(output_directory, sample_list)

        alignment_dir = "%s/alignment/" % output_directory

        count_table = TwoLvlDict()
        for sample in sample_list:
            print("Handling %s" % sample)
            sample_dir = "%s/%s/" % (samples_directory, sample)
            alignment_sample_dir = "%s/%s/" % (alignment_dir, sample)
            filetypes, forward_files, reverse_files = self.make_lists_forward_and_reverse_files(
                sample_dir)

            print "\tAligning reads..."

            STAR.align(
                genome_dir,
                forward_files,
                reverse_read_list=reverse_files,
                annotation_gtf=annotation_gtf,
                feature_from_gtf_to_use_as_exon=feature_from_gtf_to_use_as_exon,
                exon_tag_to_use_as_transcript_id=
                exon_tag_to_use_as_transcript_id,
                exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
                length_of_sequences_flanking_junction=
                length_of_sequences_flanking_junction,
                junction_tab_file_list=junction_tab_file_list,
                three_prime_trim=three_prime_trim,
                five_prime_trim=five_prime_trim,
                adapter_seq_for_three_prime_clip=
                adapter_seq_for_three_prime_clip,
                max_mismatch_percent_for_adapter_trimming=
                max_mismatch_percent_for_adapter_trimming,
                three_prime_trim_after_adapter_clip=
                three_prime_trim_after_adapter_clip,
                output_type=output_type,
                sort_bam=sort_bam,
                max_memory_for_bam_sorting=max_memory_for_bam_sorting,
                include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
                output_unmapped_reads=output_unmapped_reads,
                output_dir=alignment_sample_dir,
                two_pass_mode=two_pass_mode,
                max_intron_length=max_intron_length)

            # STAR's coordinate-sorted BAM output name
            alignment_file = "%s/Aligned.sortedByCoord.out.bam" % alignment_sample_dir

            print "\tIndexing alignment file..."
            os.system("samtools index %s" % alignment_file)

            print "\tCounting reads aligned to features..."
            count_file = "%s/%s.htseq.count" % (alignment_sample_dir, sample)

            HTSeq.count(alignment_file,
                        gff_for_htseq,
                        count_file,
                        samtype="bam",
                        order="pos",
                        stranded_rnaseq=stranded_rnaseq,
                        min_alignment_quality=min_alignment_quality,
                        feature_type=feature_type_for_htseq,
                        feature_id_attribute=feature_id_attribute_for_htseq,
                        mode=htseq_mode,
                        suppress_progres_report=False)

            # load the per-sample count file; HTSeq's summary rows start
            # with "__" and are skipped via comments_prefix
            sample_counts = SynDict()
            sample_counts.read(count_file,
                               header=False,
                               separator="\t",
                               allow_repeats_of_key=False,
                               split_values=False,
                               values_separator=",",
                               key_index=0,
                               value_index=1,
                               close_after_if_file_object=False,
                               expression=None,
                               comments_prefix="__")
            count_table[sample] = sample_counts

        count_table.write(count_table_file)
Example #17
0
    def get_taxa_genomes_summary(self, taxa, email, output_directory, output_prefix,
                                 max_ids_per_query=8000, max_download_attempts=500,
                                 min_scaffold_n50=None, min_contig_n50=None, max_scaffold_l50=None,
                                 max_contig_l50=None, max_contig_count=None, max_scaffold_count=None,
                                 max_chromosome_count=None, min_chromosome_count=None, max_unlocalized_scaffolds=None,
                                 max_unplaced_scaffolds=None, max_total_length=None, min_total_length=None,
                                 max_ungapped_length=None, min_ungapped_length=None,
                                 no_ambiguous_species=True):
        Entrez.email = email
        taxa_list = taxa if isinstance(taxa, Iterable) else [taxa]

        all_files_dir = "%s%s/" % (self.check_path(output_directory), "all")
        nonambiguous_species_all_dir = "%snonambiguous_species_all/" % self.check_path(output_directory)
        ambiguous_species_all_dir = "%s%s/" % (self.check_path(output_directory), "ambiguous_species_all")
        chromosome_lvl_dir = "%s%s/" % (self.check_path(output_directory), "chromosome_lvl")
        non_chromosome_lvl_dir = "%s%s/" % (self.check_path(output_directory), "nonchromosome_lvl")

        filtered_by_integrity_dir = "%s%s/" % (self.check_path(output_directory), "passed_integrity_filters")
        filtered_out_by_integrity_dir = "%s%s/" % (self.check_path(output_directory), "not_passed_integrity_filters")

        stat_dir = "%s%s/" % (self.check_path(output_directory), "stat")
        taxa_stat_dir = "%s%s/" % (self.check_path(output_directory), "taxa_stat")
        for subdir in (all_files_dir, chromosome_lvl_dir, non_chromosome_lvl_dir, stat_dir,
                       taxa_stat_dir, nonambiguous_species_all_dir, ambiguous_species_all_dir):
            self.save_mkdir(subdir)

        filter_by_integrity = min_scaffold_n50 or min_contig_n50 or max_scaffold_l50 or max_contig_l50 \
                              or max_contig_count or max_scaffold_count or max_chromosome_count \
                              or min_chromosome_count or max_unlocalized_scaffolds \
                              or max_unplaced_scaffolds or max_total_length or min_total_length \
                              or max_ungapped_length or min_ungapped_length

        if filter_by_integrity:
            for subdir in (filtered_by_integrity_dir, filtered_out_by_integrity_dir):
                self.save_mkdir(subdir)

        for taxon in taxa_list:
            search_term = "%s[Orgn]" % taxon

            attempt_counter = 1
            while True:
                # Retry loop around the Entrez "genome" search for the current
                # taxon; network failures (URLError) trigger another attempt.
                try:
                    summary = Entrez.read(Entrez.esearch(db="genome", term=search_term, retmax=10000, retmode="xml"))
                    break
                except URLError:
                    if attempt_counter > max_download_attempts:
                        # NOTE(review): this constructs a URLError but never
                        # raises it, so the limit check has no effect and the
                        # loop retries forever; should probably be
                        # `raise URLError(...)`.
                        URLError("Network problems. Maximum attempt number is exceeded")
                    print "URLError. Retrying... Attempt %i" % attempt_counter
                    attempt_counter += 1

            # summary["Count"] is the number of genome records matching the taxon.
            print "Were found %s species" % summary["Count"]
            #print summary

            # Per-taxon statistics file, accumulated over all species below.
            taxon_stat_file = "%s/%s.stat" % (taxa_stat_dir, taxon.replace(" ", "_"))
            taxon_stat_dict = TwoLvlDict()

            # One iteration per genome record (species) found for the taxon.
            for species_id in summary["IdList"]: #[167] :
                print "Handling species id %s " % species_id

                species_stat_file = "%s/%s.stat" % (stat_dir, species_id)
                species_stat_dict = TwoLvlDict()
                species_stat_dict[species_id] = OrderedDict()

                taxon_stat_dict[species_id] = OrderedDict()

                # Initialize assembly counters for this species in both the
                # per-species and per-taxon stat tables.
                for stat in "all", "chromosome_lvl", "non_chromosome_lvl":
                    species_stat_dict[species_id][stat] = 0
                    taxon_stat_dict[species_id][stat] = 0
                #species_summary = Entrez.read(Entrez.esummary(db="genome", id=species_id, retmax=10000, retmode="xml"))
                #print species_summary

                # get assemblies linked with genome of species

                # Same retry pattern as above, this time for the genome->assembly
                # elink query.
                attempt_counter = 1
                while True:
                    try:
                        assembly_links = Entrez.read(Entrez.elink(dbfrom="genome", id=species_id, retmode="xml",
                                                                  retmax=10000, linkname="genome_assembly"))
                        break
                    except URLError:
                        if attempt_counter > max_download_attempts:
                            # NOTE(review): same bug as the search retry above —
                            # the URLError is created but not raised.
                            URLError("Network problems. Maximum attempt number is exceeded")
                        print "URLError. Retrying... Attempt %i" % attempt_counter
                        attempt_counter += 1

                # NOTE(review): this is the number of link *sets* returned by
                # elink (usually 1), not the number of assemblies; the variable
                # is never used afterwards (number_of_ids is used instead).
                assembly_number = len(assembly_links)
                #print links
                #print links[0]["LinkSetDb"][0]["Link"]
                # Defensive unwrapping of the elink response; species with no
                # linked assemblies are skipped at whichever level the data
                # is missing.
                if assembly_links:
                    if "LinkSetDb" in assembly_links[0]:
                        if assembly_links[0]["LinkSetDb"]:
                            if "Link" in assembly_links[0]["LinkSetDb"][0]:
                                assembly_ids = [id_dict["Id"] for id_dict in assembly_links[0]["LinkSetDb"][0]["Link"]]
                            else:
                                continue
                        else:
                            continue
                    else:
                        continue
                else:
                    continue
                number_of_ids = len(assembly_ids)

                print "\tFound %i assemblies" % number_of_ids

                # Split the id list into batches of at most max_ids_per_query
                # for esummary; id_group_edges holds the batch boundaries and
                # is extended so the last (possibly shorter) batch is included.
                id_group_edges = np.arange(0, number_of_ids+1, max_ids_per_query)

                if id_group_edges[-1] != number_of_ids:
                    id_group_edges = np.append(id_group_edges, number_of_ids)

                number_of_id_groups = len(id_group_edges) - 1

                #print len(assembly_links[0]["LinkSetDb"][0]["Link"])
                #print assembly_ids
                #print len(assembly_ids)
                #assembly_dict = TwoLvlDict()
                #assemblies_with_ambiguous_taxonomies = SynDict()
                #summaries = Entrez.read(Entrez.esummary(db="assembly", id=",".join(assembly_ids), retmode="xml"))

                # Download assembly summaries batch by batch and concatenate
                # them into a single AssemblySummaryList.
                # validate=False: presumably works around NCBI assembly DTD
                # validation failures in Bio.Entrez — TODO confirm.
                summary_list = None
                for i in range(0, number_of_id_groups):
                    print "\tDownloading summary about assemblies %i - %i" % (id_group_edges[i]+1, id_group_edges[i+1])
                    #print len(assembly_ids[id_group_edges[i]:id_group_edges[i+1]])
                    summaries = Entrez.read(Entrez.esummary(db="assembly",
                                                            id=",".join(assembly_ids[id_group_edges[i]:id_group_edges[i+1]]),
                                                            retmode="xml"), validate=False)
                    tmp_summary_list = AssemblySummaryList(entrez_summary_biopython=summaries)
                    summary_list = (summary_list + tmp_summary_list) if summary_list else tmp_summary_list

                print "\tDownloaded %i" % len(summary_list)

                # Warn (but continue) if fewer summaries arrived than ids requested.
                if len(summary_list) != number_of_ids:
                    print "\tWARNING:Not all assemblies were downloaded"
                    """
                    print "\tFollowing assemblies were not downloaded(ids):%s" % ",".join(set())
                    """

                if summary_list:
                    # Record totals and build per-category output file paths.
                    species_stat_dict[species_id]["all"] = len(summary_list)
                    taxon_stat_dict[species_id]["all"] = len(summary_list)
                    output_file = "%s%s.genome.summary" % ((output_prefix + ".") if output_prefix else "", species_id)
                                                           #summary_list[0]['SpeciesName'].replace(" ", "_"))

                    all_output_file = "%s/%s" % (all_files_dir, output_file)
                    chromosome_lvl_output_file = "%s/%s" % (chromosome_lvl_dir, output_file)
                    non_chromosome_lvl_output_file = "%s/%s" % (non_chromosome_lvl_dir, output_file)
                    nonambiguous_species_output_file = "%s/%s" % (nonambiguous_species_all_dir, output_file)
                    ambiguous_species_output_file = "%s/%s" % (ambiguous_species_all_dir, output_file)
                    # Split assemblies into chromosome-level vs the rest.
                    chromosome_lvl_summary_list, non_chromosome_lvl_summary_list = summary_list.filter_non_chrom_level_genomes()
                    filtered_by_integrity_file = "%s/%s" % (filtered_by_integrity_dir, output_file)
                    filtered_out_by_integrity_file = "%s/%s" % (filtered_out_by_integrity_dir, output_file)

                    species_stat_dict[species_id]["chromosome_lvl"] = len(chromosome_lvl_summary_list)
                    taxon_stat_dict[species_id]["chromosome_lvl"] = len(chromosome_lvl_summary_list)
                    species_stat_dict[species_id]["non_chromosome_lvl"] = len(non_chromosome_lvl_summary_list)
                    taxon_stat_dict[species_id]["non_chromosome_lvl"] = len(non_chromosome_lvl_summary_list)

                    print("\tChromosome level assemblies %i" % species_stat_dict[species_id]["chromosome_lvl"])
                    print("\tNon chromosome level assemblies %i" % species_stat_dict[species_id]["non_chromosome_lvl"])

                    if chromosome_lvl_summary_list:
                        chromosome_lvl_summary_list.write(chromosome_lvl_output_file)

                    if non_chromosome_lvl_summary_list:
                        non_chromosome_lvl_summary_list.write(non_chromosome_lvl_output_file)

                    # Second split: assemblies with unambiguous vs ambiguous
                    # species assignment.
                    nonambiguous_species_summary_list, ambiguous_species_summary_list = summary_list.filter_ambiguous_species()
                    #print(len(nonambiguous_species_summary_list), len(ambiguous_species_summary_list))
                    species_stat_dict[species_id]["nonambiguous_species"] = len(nonambiguous_species_summary_list)
                    species_stat_dict[species_id]["ambiguous_species"] = len(ambiguous_species_summary_list)
                    print "\tAmbiguous species %i" % species_stat_dict[species_id]["ambiguous_species"]
                    if nonambiguous_species_summary_list:
                        nonambiguous_species_summary_list.write(nonambiguous_species_output_file)
                    if ambiguous_species_summary_list:
                        ambiguous_species_summary_list.write(ambiguous_species_output_file)

                    summary_list.write(all_output_file)

                    # Optional third split: keep only assemblies passing the
                    # configured N50/L50/length/contig-count thresholds.
                    if filter_by_integrity:
                        filtered_by_integrity, filtered_out_by_integrity = summary_list.filter_by_integrity(min_scaffold_n50=min_scaffold_n50,
                                                                                                            min_contig_n50=min_contig_n50,
                                                                                                            max_scaffold_l50=max_scaffold_l50,
                                                                                                            max_contig_l50=max_contig_l50,
                                                                                                            max_contig_count=max_contig_count,
                                                                                                            max_scaffold_count=max_scaffold_count,
                                                                                                            max_chromosome_count=max_chromosome_count,
                                                                                                            min_chromosome_count=min_chromosome_count,
                                                                                                            max_unlocalized_scaffolds=max_unlocalized_scaffolds,
                                                                                                            max_unplaced_scaffolds=max_unplaced_scaffolds,
                                                                                                            max_total_length=max_total_length,
                                                                                                            min_total_length=min_total_length,
                                                                                                            max_ungapped_length=max_ungapped_length,
                                                                                                            min_ungapped_length=min_ungapped_length,
                                                                                                            no_ambiguous_species=no_ambiguous_species)
                        species_stat_dict[species_id]["filtered_by_integrity"] = len(filtered_by_integrity)
                        species_stat_dict[species_id]["filtered_out_by_integrity"] = len(filtered_out_by_integrity)
                        if filtered_by_integrity:
                            filtered_by_integrity.write(filtered_by_integrity_file)
                        if filtered_out_by_integrity:
                            filtered_out_by_integrity.write(filtered_out_by_integrity_file)
                        print "\tPassed integrity filters %i" % species_stat_dict[species_id]["filtered_by_integrity"]
                # Per-species stats are flushed after every species so partial
                # results survive an interrupted run.
                species_stat_dict.write(species_stat_file)

                print "\n\n"

            # Aggregated per-taxon stats, written once per taxon.
            taxon_stat_dict.write(taxon_stat_file)

            """
os.chdir(workdir)
for sample_set in sample_set_names_list:
    stat_dict = TwoLvlDict(OrderedDict({}))
    print("Handling %s" % sample_set)
    all_clusters = CollectionCCF(from_file=True,
                                 input_file=workdir + all_files_subdir +
                                 sample_set + all_files_suffix)
    if "HAP" not in sample_set:
        all_clusters.check_strandness()
    for min_size in size_limits:
        stat_dict[min_size] = OrderedDict({})
        os.system("mkdir -p %i %i/all " % (min_size, min_size))
        above_size_clusters, below_size_clusters = all_clusters.filter_by_expression(
            "record.size >= %i" % min_size)
        above_size_clusters.write(
            "%i/all/%s_size_%i+%s" %
            (min_size, sample_set, min_size, all_files_suffix))
        stat_dict[min_size][0.00] = len(above_size_clusters)
        for min_power in power_limits:

            os.system("mkdir -p %i/%.2f" % (min_size, min_power))
            above_power_clusters, below_power_clusters = above_size_clusters.filter_by_expression(
                "record.description['Power'] >= %f" % min_power)
            above_power_clusters.write("%i/%.2f/%s_size_%i+_power_%.2f+%s" %
                                       (min_size, min_power, sample_set,
                                        min_size, min_power, all_files_suffix))
            stat_dict[min_size][min_power] = len(above_power_clusters)

    stat_dict.write("%s_statistics.t" % sample_set)
Example #19
0
            # Fragment of a loop over tab-separated annotation lines; `line`,
            # `name`, `filename`, `summary_dict`, `gene_alias_dict` and `args`
            # are defined upstream (outside this view).
            tmp = line.strip().split("\t")
            # Column 11: gene name; column 9: amino-acid substitution record.
            gene_name = tmp[10]
            substitution = tmp[8]
            if substitution == ".":  # skip substitutions not in CDS
                continue
            # Optionally map the gene name through an alias table.
            if gene_alias_dict:
                if gene_name in gene_alias_dict:
                    gene_name = gene_alias_dict[gene_name]
            if args.rem_nuc_sub:
                # Drop the nucleotide part ("X/Y" -> "X") and the leading
                # two characters, keeping only the protein-level change.
                substitution = substitution.split("/")[0][2:]
                if args.convert_to_single_letter:
                    # seq1 converts three-letter amino-acid codes to one-letter;
                    # "*" (stop) has no three-letter form and is handled apart.
                    ref_aa = seq1(substitution[:3])
                    try:
                        if substitution[-1] == "*":
                            alt_aa = "*"
                            pos = substitution[3:-1]
                        else:
                            alt_aa = seq1(substitution[-3:])
                            pos = substitution[3:-3]
                        substitution = "%s%s%s" % (ref_aa, pos, alt_aa)
                    # NOTE(review): bare except silently keeps the unconverted
                    # substitution after printing; should catch the specific
                    # exception (e.g. KeyError/IndexError) instead.
                    except:
                        print(substitution, "aaa", filename, gene_name)
            # Accumulate substitutions per gene for the current sample.
            if gene_name not in summary_dict[name]:
                summary_dict[name][gene_name] = [substitution]
            else:
                summary_dict[name][gene_name].append(substitution)

# Write the two-level summary table; "." marks empty cells.
summary_dict.write(out_fd, absent_symbol=".")
if args.output != "stdout":
    out_fd.close()
Example #20
0
                                            "general_tree.nwk")

if args.species_synonym_file:
    synonyms_dict = read_synonyms_dict(args.species_synonym_file,
                                       header=False,
                                       separator="\t",
                                       split_values=False)
    for node in cafe_report.general_data.tree.traverse():
        if node.name in synonyms_dict:
            node.name = synonyms_dict[node.name]
    cafe_report.general_data.write_general_tree(general_trees_dir +
                                                "general_tree_latin.nwk")

cafe_report.general_data.draw_expansion_contraction()
cafe_report.general_data.draw_significant_expansion_contraction()
"""
with open(background_genes_dir + "background_genes.t", "w") as back_fd:
    with open(background_genes_dir + "background_genes_list.txt", "w") as back_list_fd:
        back_fd.write("#id\tfamaliy_p_value\tref_gene\n")
        for record in filtered_out_report:
            #print(record)
            if reference_genes_dict[record.id][0]:
                random_reference_gene = choice(reference_genes_dict[record.id][0])
                back_list_fd.write(random_reference_gene + "\n")
            else:
                random_reference_gene = "."
            back_string = "%s\t%f\t%s\n" % (record.id, record.family_p_value, random_reference_gene)
            back_fd.write(back_string)
"""
statistics_dict.write(statistics_dir + "node_statistics.t", absent_symbol=".")