Exemple #1
0
 def extract_transcripts_by_ids(self, input_gff, transcript_id_file,
                                output_gff):
     transcript_ids = IdSet()
     transcript_ids.read(transcript_id_file, header=False)
     GFF.write(
         self.record_with_extracted_transcripts_generator(
             input_gff, transcript_ids), open(output_gff, "w"))
Exemple #2
0
    def intersect_ids(list_of_group_a, list_of_group_b, mode="common"):
        # possible modes: common, only_a, only_b, not_common,  combine, count
        a = IdSet()
        b = IdSet()

        if mode == "common":
            expression = lambda a, b: a & b
        elif mode == "only_a":
            expression = lambda a, b: a - b
        elif mode == "only_b":
            expression = lambda a, b: b - a
        elif mode == "not_common":
            expression = lambda a, b: a ^ b
        elif mode == "combine":
            expression = lambda a, b: a | b

        for id_list in list_of_group_a:
            a = a | IdSet(id_list)

        for id_list in list_of_group_b:
            b = b | IdSet(id_list)

        if mode != "count":
            return IdSet(expression(a, b))
        else:
            return len(a), len(b), len(a
                                       & b), len(a -
                                                 b), len(b -
                                                         a), len(a
                                                                 ^ b), len(a
                                                                           | b)
 def extract_monocluster_ids_from_file(self,
                                       dir_with_cluster_files,
                                       out_file,
                                       file_with_white_list_ids=None):
     # filenames are counted as species names
     white_list_ids = None
     if file_with_white_list_ids:
         white_list_ids = IdSet()
         white_list_ids.read(file_with_white_list_ids)
     clusters_dict = self.read_cluster_files_from_dir(
         dir_with_cluster_files)
     monoclusters = self.extract_monocluster_ids(
         clusters_dict, out_file=out_file, white_list_ids=white_list_ids)
     return monoclusters
 def get_sequence_names(clusters_dict,
                        write_ids=False,
                        out_prefix=None,
                        white_list_ids=None):
     sequence_names_dict = SynDict()
     for species in clusters_dict:
         sequence_names_dict[species] = IdSet()
     for species in clusters_dict:
         for cluster_id in clusters_dict[species]:
             if white_list_ids:
                 if cluster_id not in white_list_ids:
                     continue
             sequence_names_dict[species] = sequence_names_dict[
                 species] | IdSet(clusters_dict[species][cluster_id])
     if write_ids:
         for species in clusters_dict:
             out_file = "%s_%s.ids" % (
                 out_prefix, species) if out_prefix else "%s.ids" % species
             sequence_names_dict[species].write(out_file)
     return sequence_names_dict
Exemple #5
0
    def rename_scaffolds_in_gff(self,
                                input_gff,
                                syn_file,
                                output_prefix,
                                verbose=True):

        syn_dict = SynDict(filename=syn_file)
        skipped_id_list = IdSet()

        output_gff = "%s.renamed.gff" % output_prefix
        skipped_gff = "%s.skipped.gff" % output_prefix
        skipped_id_file = "%s.skipped_scaffolds.ids" % output_prefix

        with self.metaopen(input_gff, "r") as in_fd, \
             self.metaopen(output_gff, "w") as out_fd, \
             self.metaopen(skipped_gff, "w") as skipped_fd:

            for line in in_fd:
                if line[0] == "#":
                    out_fd.write(line)
                gff_list = line.split("\t")
                if gff_list[0] in syn_dict:
                    gff_list[0] = syn_dict[gff_list[0]]
                    out_fd.write("\t".join(gff_list))
                else:
                    skipped_fd.write(line)
                    skipped_id_list.add(gff_list[0])

        if verbose:
            print("Not renamed scaffolds: %i" % len(skipped_id_list))

        skipped_id_list.write(skipped_id_file)
    def extract_monocluster_ids(self,
                                clusters_dict,
                                white_list_ids=None,
                                out_file=None):
        """
        Extracts clusters with only one sequence in all species.
        """
        monocluster_ids = IdSet()
        cluster_names = self.get_cluster_names(clusters_dict)

        for cluster_name in cluster_names:
            for species in clusters_dict:
                if white_list_ids:
                    if cluster_name not in white_list_ids:
                        break
                if cluster_name not in clusters_dict[species]:
                    break
                if len(clusters_dict[species][cluster_name]) > 1:
                    break
            else:
                monocluster_ids.add(cluster_name)

        if out_file:
            monocluster_ids.write(out_file)

        return monocluster_ids
Exemple #7
0
    def get_column_value_set_from_file(self,
                                       input_file,
                                       column_number,
                                       output_file=None,
                                       separator="\t",
                                       comments_prefix="#",
                                       verbose=False):

        column_value_set = IdSet([
            line_list[column_number] for line_list in
            self.file_line_as_list_generator(input_file,
                                             separator=separator,
                                             comments_prefix=comments_prefix)
        ])
        if output_file:
            column_value_set.write(output_file)

        if verbose:
            print("#Column %i (0-based) contains %i different values" %
                  (column_number, len(column_value_set)))

        return column_value_set
    def get_cluster_names(clusters_dict, out_file=None, white_list_ids=None):
        cluster_names = IdSet()
        for species in clusters_dict:
            species_clusters = IdSet(clusters_dict[species].keys())
            cluster_names |= species_clusters
        if out_file:
            cluster_names.write(out_file)

        return cluster_names & IdSet(
            white_list_ids) if white_list_ids else cluster_names
Exemple #9
0
    def get_scaffold_ids_from_gff(gff_file, out_file=None):
        scaffold_id_set = IdSet()

        with open(gff_file, "r") as gff_fd:
            for line in gff_fd:
                if line[0] == "#":
                    continue
                scaffold_id = line.split("\t")[0]
                scaffold_id_set.add(scaffold_id)

        if out_file:
            scaffold_id_set.write(out_file)

        return scaffold_id_set
Exemple #10
0
    def intersect_ids_from_files(files_with_ids_from_group_a,
                                 files_with_ids_from_group_b,
                                 result_file=None,
                                 mode="common",
                                 case_insensitive=False):
        a = IdSet()
        b = IdSet()

        if mode == "common":
            expression = lambda a, b: a & b
        elif mode == "only_a":
            expression = lambda a, b: a - b
        elif mode == "only_b":
            expression = lambda a, b: b - a
        elif mode == "not_common":
            expression = lambda a, b: a ^ b
        elif mode == "combine":
            expression = lambda a, b: a | b

        #print(files_with_ids_from_group_a)
        for filename in [files_with_ids_from_group_a] if isinstance(
                files_with_ids_from_group_a,
                str) else files_with_ids_from_group_a:
            id_set = IdSet()
            id_set.read(
                filename,
                comments_prefix="#",
                expression=(lambda s: s.upper()) if case_insensitive else None)
            a = a | id_set

        for filename in [files_with_ids_from_group_b] if isinstance(
                files_with_ids_from_group_b,
                str) else files_with_ids_from_group_b:
            id_set = IdSet()
            id_set.read(
                filename,
                comments_prefix="#",
                expression=(lambda s: s.upper()) if case_insensitive else None)
            b = b | id_set
        result_fd = open(result_file, "w") if result_file else sys.stdout
        if mode != "count":
            final_set = IdSet(expression(a, b))
            final_set.write(result_fd)
        else:
            result_fd.write(
                "Group_A\t%i\nGroup_B\t%i\nCommon\t%i\nOnly_group_A\t%i\nOnly_group_B\t%i\nNot_common\t%i\nAll\t%i\n"
                % (len(a), len(b), len(a & b), len(a - b), len(b - a),
                   len(a ^ b), len(a | b)))
Exemple #11
0
    accordance_file = "%s/%s.accordance" % (args.accordance_dir, species)
    accordance_dict[species] = SynDict()
    accordance_dict[species].read(accordance_file, key_index=1, value_index=0)


if args.name_first:
    def split_name(pep_name):
        gene_list = pep_name.split(args.name_separator)
        return gene_list[0], args.name_separator.join(gene_list[1:])
else:
    def split_name(pep_name):
        gene_list = pep_name.split(args.name_separator)
        return gene_list[-1], args.name_separator.join(gene_list[:-1])

families_with_errors = IdSet()
for family in pep_fam_dict:
    cds_fam_dict[family] = []
    for pep in pep_fam_dict[family]:
        species, pep_name = split_name(pep)
        if pep_name in accordance_dict[species]:
            cds_name = "%s%s%s" % (species, args.name_separator, accordance_dict[species][pep_name]) if args.name_first else \
                "%s%s%s" % (accordance_dict[species][pep_name], args.name_separator, species)
            cds_fam_dict[family].append(cds_name)
        else:
            print("%s %s %s doesn't have associated cds in accordance file" % (family, species, pep_name))
            families_with_errors.add(family)

for family in families_with_errors:
    cds_fam_dict.pop(family, None)
Exemple #12
0
    def compare_multiple_genome_results(self, busco_file_list, output_prefix, label_list=None,
                                        black_scaffold_list=(), white_scaffold_list=()):

        busco_table_dict = OrderedDict()
        gene_id_dict = OrderedDict()
        counts_dict = OrderedDict()

        output_path_list = self.split_filename(output_prefix)

        pairwise_overlaps_dir = "%s/pairwise_overlaps/" % (output_path_list[0] if output_path_list[0] else ".")
        pairwise_overlap_counts_dir = "%s/pairwise_overlap_counts/" % (output_path_list[0] if output_path_list[0] else ".")
        self.safe_mkdir(pairwise_overlaps_dir)
        self.safe_mkdir(pairwise_overlap_counts_dir)

        lllabels_list = label_list if label_list else ["A%i" % i for i in range(1, len(busco_file_list) + 1)]

        for busco_table, label in zip(busco_file_list, lllabels_list):
            busco_table_dict[label] = BUSCOtable(in_file=busco_table, black_list=black_scaffold_list,
                                                 white_list=white_scaffold_list)

            gene_id_dict[label] = OrderedDict()
            counts_dict[label] = OrderedDict()

            gene_id_dict[label], counts_dict[label] = busco_table_dict[label].count_statuses()

        # TODO: draw piecharts


        # TODO: count overlaps

        pairwise_overlap_dict = OrderedDict()
        count_pairwise_overlap_dict = OrderedDict()
        for label1 in lllabels_list:
            for label2 in lllabels_list:
                if label1 == label2:
                    continue
                overlap_id = "%s_vs_%s" % (label1, label2)
                pairwise_overlap_dict[overlap_id] = TwoLvlDict()
                count_pairwise_overlap_dict[overlap_id] = TwoLvlDict()
                for status1 in self.status_list:
                    pairwise_overlap_dict[overlap_id]["%s@%s" % (label1, status1)] = OrderedDict()
                    count_pairwise_overlap_dict[overlap_id]["%s@%s" % (label1, status1)] = OrderedDict()
                    for status2 in self.status_list:
                        pairwise_overlap_dict[overlap_id]["%s@%s" % (label1, status1)]["%s@%s" % (label2, status2)] = IdSet(gene_id_dict[label1][status1] & gene_id_dict[label2][status2])
                        count_pairwise_overlap_dict[overlap_id]["%s@%s" % (label1, status1)]["%s@%s" % (label2, status2)] = len(pairwise_overlap_dict[overlap_id]["%s@%s" % (label1, status1)]["%s@%s" % (label2, status2)])
                        pairwise_overlap_dict[overlap_id]["%s@%s" % (label1, status1)]["%s@%s" % (label2, status2)].write("%s/%s.%s_vs_%s.ids" % (pairwise_overlaps_dir, output_prefix, "%s@%s" % (label1, status1), "%s@%s" % (label2, status2)))

                count_pairwise_overlap_dict[overlap_id].write("%s/%s.overlap.%s.tsv" % (pairwise_overlap_counts_dir, output_prefix, overlap_id))

        if 2 <= len(busco_file_list) <= 3:
            fig, subplot_list = plt.subplots(2, 2, figsize=(6, 6))
            plt.suptitle("Overlaps for BUSCO categories between assemblies/genomes")
            #print(subplot_list)
            for status, index in zip(self.status_list, range(0, 4)):

                plt.sca(subplot_list[index // 2][index % 2])
                plt.title(status)
                MatplotlibRoutines.venn_diagram_from_sets(gene_id_dict[lllabels_list[0]][status],
                                                          gene_id_dict[lllabels_list[1]][status],
                                                          set3=gene_id_dict[lllabels_list[2]][status] if len(lllabels_list) > 2 else None,
                                                          set_labels=lllabels_list, set_colors=["red", "yellow", "green"],
                                                          output_prefix=None, extensions=("png",), title=None)

            plt.savefig("%s.venn.png" % output_prefix)

            plt.close()
Exemple #13
0
    def prepare_template_for_popart(alignment_file,
                                    output_file,
                                    haplotype_fam_file=None,
                                    traits_file=None,
                                    whitelist_file=None):
        from RouToolPa.Parsers.Sequence import CollectionSequence
        sequence_collection = CollectionSequence(in_file=alignment_file,
                                                 parsing_mode="parse")
        sequence_collection.get_stats_and_features(count_gaps=False,
                                                   sort=False)
        whitelist = IdSet(filename=whitelist_file)
        alignment_len = sequence_collection.seq_lengths["length"].unique()
        if len(alignment_len) > 1:
            raise ValueError(
                "ERROR!!! Sequences in alignment have different lengths!")
        alignment_len = alignment_len[0]

        haplotype_selected_sequence_dict = SynDict()
        haplotypes_without_sequences_ids = IdList()

        traits_df = pd.read_csv(
            traits_file, sep="\t",
            index_col=0) if traits_file else pd.DataFrame()

        if haplotype_fam_file:
            haplotype_dict = SynDict(filename=haplotype_fam_file,
                                     split_values=True)
            for haplotype_id in haplotype_dict:
                for sequence_id in haplotype_dict[haplotype_id]:
                    if sequence_id in sequence_collection.records:
                        haplotype_selected_sequence_dict[
                            haplotype_id] = sequence_id
                        break
                else:
                    haplotypes_without_sequences_ids.append(haplotype_id)
        else:
            haplotype_dict = dict([(entry, [entry])
                                   for entry in sequence_collection.scaffolds])
            haplotype_selected_sequence_dict = dict([
                (entry, entry) for entry in sequence_collection.scaffolds
            ])

        final_haplotype_set = (set(haplotype_selected_sequence_dict.keys())
                               & whitelist) if whitelist else set(
                                   haplotype_selected_sequence_dict.keys())

        with open(output_file, "w") as out_fd:
            #out_fd.write("#NEXUS\nBEGIN TAXA;\nDIMENSIONS\nNTAX = %i;\nTAXLABELS\n%s\n;\nEND;\n\n" % (len(haplotype_selected_sequence_dict),
            #                                                                                          "\n".join(haplotype_selected_sequence_dict.keys())))
            out_fd.write("#NEXUS\n\n")
            out_fd.write(
                "BEGIN DATA;\n\tDIMENSIONS NTAX=%i NCHAR=%i;\n\tFORMAT DATATYPE=DNA MISSING=? GAP=- MATCHCHAR=. ;\n"
                % (len(final_haplotype_set), alignment_len))
            out_fd.write("\tMATRIX\n")

            for haplotype_id in final_haplotype_set:
                out_fd.write(
                    "\t\t%s %s\n" % (haplotype_id, sequence_collection.records[
                        haplotype_selected_sequence_dict[haplotype_id]]))
            out_fd.write("\t;\nEND;\n\n")

            if not traits_df.empty:
                traits_number = len(traits_df.columns)
                out_fd.write(
                    "BEGIN TRAITS;\n\tDimensions NTRAITS={0};\n\tFormat labels=yes missing=? separator=Comma;\n"
                    .format(traits_number))
                out_fd.write("\tTraitLabels {0};\n".format(" ".join(
                    traits_df.columns)))
                out_fd.write("\tMATRIX\n")
                for haplotype_id in final_haplotype_set:
                    out_fd.write(
                        "\t\t%s %s\n" %
                        (haplotype_id,
                         ",".join(map(str, traits_df.loc[haplotype_id]))
                         if haplotype_id in traits_df.index else
                         ("0," * traits_number)[:-1]))
            else:
                out_fd.write(
                    "BEGIN TRAITS;\n\tDimensions NTRAITS=1;\n\tFormat labels=yes missing=? separator=Comma;\n"
                )
                out_fd.write("\tTraitLabels Area;\n")
                out_fd.write("\tMATRIX\n")
                for haplotype_id in final_haplotype_set:
                    out_fd.write(
                        "\t\t%s %i\n" %
                        (haplotype_id, len(haplotype_dict[haplotype_id])))
            out_fd.write("\t;\nEND;\n\n")
Exemple #14
0
    def extract_sequences_by_clusters(self,
                                      dir_with_cluster_files,
                                      dir_with_sequence_files,
                                      output_dir,
                                      file_with_white_list_cluster_ids=None,
                                      mode="families",
                                      sequence_file_extension="fasta",
                                      sequence_file_format="fasta",
                                      label_species=False,
                                      separator_for_labeling="@",
                                      species_label_first=True):
        """
        basenames of cluster and sequence files must be same

        mode:
            clusters - extract sequences from clusters in separate files,
            species - extract sequences from species to separate files
        """
        # TODO: TEST IT BEFORE USAGE.WAS SIGNIFICANTLY CHANGED WITHOUT TESTING
        white_list_ids = None
        if file_with_white_list_cluster_ids:
            white_list_ids = IdSet()
            white_list_ids.read(file_with_white_list_cluster_ids)

        clusters_dict = self.read_cluster_files_from_dir(
            dir_with_cluster_files)
        cluster_names = self.get_cluster_names(clusters_dict,
                                               white_list_ids=white_list_ids)

        sequence_super_dict = OrderedDict()
        out_dir = self.check_path(output_dir)

        for species in clusters_dict:
            #idx_file = "%s_tmp.idx" % species
            sequence_file = "%s%s.%s" % (self.check_path(
                dir_with_sequence_files), species, sequence_file_extension)
            sequence_super_dict[species] = CollectionSequence(
                in_file=sequence_file,
                format=sequence_file_format,
                parsing_mode="parse")
            #sequence_super_dict[species] = SeqIO.index_db(idx_file, sequence_file, format=sequence_file_format)

        if mode == "species":
            seqeuence_names = self.get_sequence_names(
                clusters_dict,
                write_ids=False,
                out_prefix=None,
                white_list_ids=white_list_ids)
            for species in seqeuence_names:
                out_file = "%s%s.%s" % (out_dir, species,
                                        sequence_file_extension)
                sequence_super_dict[species].write(
                    out_file, whitelist=seqeuence_names[species])
                #SeqIO.write(SequenceRoutines.record_by_id_generator(sequence_super_dict[species],
                #                                                    seqeuence_names[species]),
                #            out_file, format=sequence_file_format)
        elif mode == "families":
            if species_label_first:
                label_sequence = lambda label, name: "%s%s%s" % (
                    label, separator_for_labeling, name)
            else:
                label_sequence = lambda label, name: "%s%s%s" % (
                    name, separator_for_labeling, label)
            """
            def per_family_record_generator(seq_super_dict, clust_dict, cluster_id):

                for species in seq_super_dict:
                    #print species, cluster_id
                    for record_id in clust_dict[species][cluster_id]:
                        if label_species:
                            record = deepcopy(seq_super_dict[species][record_id])
                            record.id = label_sequence(species, record_id)
                            yield record
                        else:
                            yield seq_super_dict[species][record_id]
            """
            for cluster_name in cluster_names:
                out_file = "%s%s.%s" % (out_dir, cluster_name,
                                        sequence_file_extension)
                with self.metaopen(out_file) as out_fd:
                    for species in sequence_super_dict:
                        sequence_super_dict[species].write(
                            out_fd,
                            keep_file_open=True,
                            whitelist=clusters_dict[species][cluster_name],
                            out_id_expression=label_sequence
                            if label_species else None)
Exemple #15
0
    def predict_genes(self,
                      output_prefix,
                      annotation_species_prefix,
                      genome_fasta,
                      augustus_species,
                      output_directory="./",
                      augustus_strand=None,
                      augustus_gene_model=None,
                      augustus_config_dir=None,
                      augustus_use_softmasking=None,
                      augustus_other_options="",
                      augustus_hintsfile=None,
                      augustus_extrinsicCfgFile=None,
                      augustus_predict_UTR=None,
                      augustus_min_intron_len=None,
                      threads=1,
                      augustus_dir="",
                      hmmer_dir="",
                      blast_dir="",
                      stop_codons_list=("TGA", "TAA", "TAG"),
                      genetic_code_table=1):

        draft_file_prefix = "%s/raw/%s" % (output_directory, output_prefix)

        augustus_splited_input_dir = "%s/splited_input/" % output_directory
        augustus_splited_output_dir = "%s/splited_output_dir" % output_directory

        output_raw_gff = "%s.raw.gff" % draft_file_prefix
        output_gff = "%s.renamed.gff" % draft_file_prefix
        augustus_pep = "%s.pep" % draft_file_prefix

        AUGUSTUS.path = augustus_dir
        AUGUSTUS.threads = threads
        HMMER3.path = hmmer_dir
        HMMER3.threads = threads
        BLASTp.path = blast_dir
        BLASTp.threads = threads

        print("Annotating genes...")
        AUGUSTUS.parallel_predict(
            augustus_species,
            genome_fasta,
            output_raw_gff,
            strand=augustus_strand,
            gene_model=augustus_gene_model,
            output_gff3=True,
            other_options=augustus_other_options,
            config_dir=augustus_config_dir,
            use_softmasking=augustus_use_softmasking,
            hints_file=augustus_hintsfile,
            split_dir=augustus_splited_input_dir,
            splited_output_dir=augustus_splited_output_dir,
            extrinsicCfgFile=augustus_extrinsicCfgFile,
            predict_UTR=augustus_predict_UTR,
            combine_output_to_single_file=True,
            min_intron_len=augustus_min_intron_len)

        #replace_augustus_ids(augustus_gff, output_prefix, species_prefix=None, number_of_digits_in_id=8):

        AUGUSTUS.replace_augustus_ids(output_raw_gff,
                                      draft_file_prefix,
                                      species_prefix=annotation_species_prefix,
                                      number_of_digits_in_id=8)
        #extract_transcript_sequences(self, input_gff_file, genomic_fasta_file, output_prefix, coding_only=False)
        gffread_file_prefix = "%s.gffread" % draft_file_prefix
        gffread_transcripts_file, gffread_cds_file, gffread_pep_file = Gffread.extract_transcript_sequences(
            output_gff, genome_fasta, gffread_file_prefix)
        gffread_trimmed_cds = ".".join(
            gffread_cds_file.split(".")[:-1]) + ".trimmed.cds"
        gffread_trimmed_pep = ".".join(
            gffread_pep_file.split(".")[:-1]) + ".trimmed.pep"
        self.trim_cds_and_remove_terminal_stop_codons(
            gffread_cds_file,
            gffread_trimmed_cds,
            stop_codons_list=stop_codons_list
        )  # using default stop_codons(from universal genetic_code)/ Note that this will affect mtDNA proteins
        inframe_stop_codons_file_prefix = "%s.inframe_stop_codon" % draft_file_prefix
        self.translate_sequences_from_file(
            gffread_trimmed_cds,
            gffread_trimmed_pep,
            format="fasta",
            id_expression=None,
            genetic_code_table=genetic_code_table,
            translate_to_stop=False,
            prefix_of_file_inframe_stop_codons_seqsin=
            inframe_stop_codons_file_prefix)  # Universal code !!!

        AUGUSTUS.extract_gene_ids_from_output(output_gff,
                                              all_annotated_genes_ids)
        AUGUSTUS.extract_CDS_annotations_from_output(output_gff, CDS_gff)

        print("Extracting peptides...")

        AUGUSTUS.extract_proteins_from_output(
            output_gff,
            output_pep,
            id_prefix="",
            evidence_stats_file=output_evidence_stats,
            supported_by_hints_file=output_supported_stats)

        self.compare_sequences_from_files(output_pep,
                                          "%s.trimmed.pep" % args.output,
                                          "comparison_of_peptides",
                                          format="fasta",
                                          verbose=True)

        os.system("awk -F'\\t' 'NR==1 {}; NR > 1 {print $2}' %s > %s" %
                  (output_supported_stats, output_supported_stats_ids))

        print("Annotating domains(Pfam database)...")

        HMMER3.parallel_hmmscan(
            args.pfam_db,
            output_pep,
            output_hmmscan,
            num_of_seqs_per_scan=None,
            split_dir="splited_hmmscan_fasta/",
            splited_output_dir="splited_hmmscan_output_dir",
            tblout_outfile=None,
            domtblout_outfile=output_domtblout,
            pfamtblout_outfile=None,
            splited_tblout_dir=None,
            splited_domtblout_dir="hmmscan_domtblout/")
        HMMER3.extract_dom_ids_hits_from_domtblout(
            output_domtblout, output_pfam_annotated_dom_ids)
        hits_dict = HMMER3.extract_dom_names_hits_from_domtblout(
            output_domtblout, output_pfam_annotated_dom_names)
        supported_ids = IdSet(hits_dict.keys())
        supported_ids.write(output_pfam_supported_transcripts_ids)
        remove_transcript_ids_str = "sed -re 's/\.t[0123456789]+//' %s | sort -k 1 | uniq > %s" % (
            output_pfam_supported_transcripts_ids,
            output_pfam_supported_genes_ids)
        os.system(remove_transcript_ids_str)

        print("Annotating peptides(Swissprot database)...")

        BLASTp.parallel_blastp(output_pep,
                               args.swissprot_db,
                               evalue=0.0000001,
                               output_format=6,
                               outfile=output_swissprot_blastp_hits,
                               split_dir="splited_blastp_fasta",
                               splited_output_dir="splited_blastp_output_dir")
        hits_dict = BLASTp.extract_hits_from_tbl_output(
            output_swissprot_blastp_hits, output_swissprot_blastp_hits_names)
        supported_ids = IdSet(hits_dict.keys())
        supported_ids.write(output_swissprot_supported_transcripts_ids)

        remove_transcript_ids_str = "sed -re 's/\.t[0123456789]+//' %s | sort -k 1 | uniq > %s" % (
            output_swissprot_supported_transcripts_ids,
            output_swissprot_supported_genes_ids)
        os.system(remove_transcript_ids_str)
        """
Exemple #16
0
                                                   (args.species_dir, species))

species_syn_dict.write("families_all_species.t", absent_symbol=".")

nonassembled = species_syn_dict.filter_by_line(filter_nonassembled)
species_syn_dict.write("correctly_assembled_families_species.t",
                       absent_symbol=".")

nonassembled.write("not_assembled_families_in_all_species.t",
                   absent_symbol=".")
complicated_families_dict = nonassembled.filter_by_line(
    filter_splited_to_several_fam)
complicated_families_dict.write("complicated_families.t", absent_symbol=".")

complicated_families_syn_dict = SynDict()
complicated_families_syn_ids = IdSet()
sl_keys = list(complicated_families_dict.sl_keys())
for sl_key in sl_keys:
    sp_set = set()
    for species in complicated_families_dict:
        if sl_key not in complicated_families_dict[species]:
            continue
        tmp = complicated_families_dict[species][sl_key].split(";")
        for i in range(0, len(tmp)):
            if "_" in tmp[i]:
                tmp[i] = tmp[i][2:]
            tmp[i] = tmp[i].split(",")
            for syn_id in tmp[i]:
                complicated_families_syn_ids.add(syn_id)
                sp_set.add(syn_id)
    complicated_families_syn_dict[sl_key] = sp_set
Exemple #17
0
    def filter_reference(self,
                         reference,
                         repeatmasking_gff_list,
                         output_prefix,
                         reference_len_file=None,
                         annotation_gff=None,
                         max_masked_fraction=0.8,
                         white_scaffold_list=(),
                         black_scaffold_list=(),
                         max_length=None):
        scaffold_with_annotation_file = "%s.scaffolds_with_annotations.ids" % output_prefix
        sorted_combined_repeatmasking_gff = "%s.sorted_combined_repeatmasking.gff" % output_prefix
        reference_len_filename = "%s.reference.len" % output_prefix if reference_len_file is None else reference_len_file
        repeatmasking_coverage_file = "%s.repeatmasking.coverage" % output_prefix
        filtering_log_file = "%s.filtering.log" % output_prefix

        filtered_scaffolds_file = "%s.filtered.fasta" % output_prefix
        filtered_out_scaffolds_file = "%s.filtered_out.fasta" % output_prefix
        filtered_out_scaffolds_id_file = "%s.filtered_out.ids" % output_prefix

        scaffold_with_annotation_set = self.get_scaffold_ids_from_gff(
            annotation_gff, out_file=scaffold_with_annotation_file)

        print("Sorting GFFs with masking...")
        sorting_string = "cat %s | sort -k1,1 -k4,4n -k5,5n > %s" % (
            repeatmasking_gff_list if isinstance(repeatmasking_gff_list, str)
            else " ".join(repeatmasking_gff_list),
            sorted_combined_repeatmasking_gff)

        self.execute(options=sorting_string, cmd="")

        print("Parsing reference...")

        reference_dict = self.parse_seq_file(reference, mode="parse")

        if reference_len_file is None:
            length_dict = self.get_lengths(reference_dict,
                                           out_file=reference_len_filename)
        else:
            length_dict = SynDict(filename=reference_len_filename)

        print("Calculating fraction of masked regions...")

        GenomeCov.get_coverage_for_gff(sorted_combined_repeatmasking_gff,
                                       reference_len_filename,
                                       output=repeatmasking_coverage_file)

        print("Filtering...")
        low_zero_coverage_fraction_dict = SynDict(
            filename=repeatmasking_coverage_file,
            key_index=0,
            value_index=4,
            include_line_expression=lambda l: l.split("\t")[1] == "0",
            expression=float,
            include_value_expression=lambda v: v < (1.0 - max_masked_fraction))
        #print low_zero_coverage_fraction_dict
        scaffold_to_remove = IdSet()

        with open(filtering_log_file, "w") as log_fd:
            log_fd.write("#Scaffold\tStatus\tLength\tDescription\n")
            for scaffold in reference_dict:
                if scaffold in black_scaffold_list:
                    scaffold_to_remove.add(scaffold)
                    log_fd.write("%s\tRemoved\t%i\tBlackList\n" %
                                 (scaffold, length_dict[scaffold]))
                    continue

                if scaffold in low_zero_coverage_fraction_dict:
                    if scaffold in white_scaffold_list:
                        log_fd.write(
                            "%s\tRetained\t%i\tWhiteList,LowNonMaskedPercentage:%f\n"
                            % (scaffold, length_dict[scaffold],
                               low_zero_coverage_fraction_dict[scaffold]))
                        continue
                    if scaffold in scaffold_with_annotation_set:
                        log_fd.write(
                            "%s\tRetained\t%i\tWithAnnotations,LonNonMaskedPercentage:%f\n"
                            % (scaffold, length_dict[scaffold],
                               low_zero_coverage_fraction_dict[scaffold]))
                        continue
                    if not (max_length is None):
                        if length_dict[scaffold] > max_length:
                            log_fd.write(
                                "%s\tRetained\t%i\tLong,LowNonMaskedPercentage:%f\n"
                                % (scaffold, length_dict[scaffold],
                                   low_zero_coverage_fraction_dict[scaffold]))
                        continue

                    scaffold_to_remove.add(scaffold)
                    log_fd.write(
                        "%s\tRemoved\t%i\tLowNonMaskedPercentage:%f\n" %
                        (scaffold, length_dict[scaffold],
                         low_zero_coverage_fraction_dict[scaffold]))
                    continue
                log_fd.write("%s\tRetained\t%i\tOK\n" %
                             (scaffold, length_dict[scaffold]))

        scaffold_to_remove.write(filename=filtered_out_scaffolds_id_file)

        SeqIO.write(self.record_by_id_generator(reference_dict,
                                                scaffold_to_remove),
                    filtered_out_scaffolds_file,
                    format="fasta")
        SeqIO.write(self.record_by_id_generator(
            reference_dict,
            IdSet(reference_dict.keys()) - scaffold_to_remove),
                    filtered_scaffolds_file,
                    format="fasta")
        print("Total scaffolds\t%i\nRemoved\t%i\n" %
              (len(reference_dict), len(scaffold_to_remove)))
Exemple #18
0
          (output_supported_stats, output_supported_stats_ids))

if args.pfam_db:
    print("Annotating domains(Pfam database)...")
    HMMER3.threads = args.threads
    HMMER3.parallel_hmmscan(
        args.pfam_db,
        output_pep,
        output_prefix_hmmscan,
        "./",  #output_hmmscan,
        num_of_seqs_per_scan=None)  # TODO CHECK!!!!!!!!!!!!!!!!!!!!!!!!!!
    HMMER3.extract_dom_ids_hits_from_domtblout(output_domtblout,
                                               output_pfam_annotated_dom_ids)
    hits_dict = HMMER3.extract_dom_names_hits_from_domtblout(
        output_domtblout, output_pfam_annotated_dom_names)
    supported_ids = IdSet(hits_dict.keys())
    supported_ids.write(output_pfam_supported_transcripts_ids)
    remove_transcript_ids_str = "sed -re 's/\.t[0123456789]+//' %s | sort -k 1 | uniq > %s" % (
        output_pfam_supported_transcripts_ids, output_pfam_supported_genes_ids)
    os.system(remove_transcript_ids_str)

if args.swissprot_db:
    print("Annotating peptides(Swissprot database)...")
    BLASTp.threads = args.threads
    BLASTp.parallel_blastp(output_pep,
                           args.swissprot_db,
                           evalue=0.0000001,
                           output_format=6,
                           outfile=output_swissprot_blastp_hits,
                           split_dir="splited_blastp_fasta",
                           splited_output_dir="splited_blastp_output_dir")
Exemple #19
0
                    action="store",
                    dest="extension",
                    help="Extension of output files. Default: equal to -f")
parser.add_argument(
    "-d",
    "--id_file",
    action="store",
    dest="id_file",
    help="File with groups of sequences to extract(.fam file).")

args = parser.parse_args()

FileRoutines.safe_mkdir(args.output)
args.extension = args.extension if args.extension else args.format
tmp_index_file = "temp.idx"

#id_list = read_ids(args.id_file)
id_list = IdSet(filename=args.id_file)

sequence_groups_id = SynDict()
sequence_groups_id.read(args.id_file, split_values=True)
#print("Parsing %s..." % args.input_file)
sequence_dict = SeqIO.index_db(tmp_index_file, args.input, format=args.format)
for group in sequence_groups_id:
    SeqIO.write(SequenceRoutines.record_by_id_generator(
        sequence_dict, sequence_groups_id[group], verbose=True),
                "%s%s.%s" % (args.output, group, args.extension),
                format=args.format)

os.remove(tmp_index_file)