Esempio n. 1
0
 def extract_transcripts_by_ids(self, input_gff, transcript_id_file,
                                output_gff):
     """
     Write records restricted to the listed transcripts to a new GFF file.

     input_gff: passed unchanged to
         self.record_with_extracted_transcripts_generator
     transcript_id_file: file with transcript ids, read into an IdSet
         with header=False
     output_gff: path of the output file; opened in "w" mode, so an
         existing file is overwritten
     """
     transcript_ids = IdSet()
     transcript_ids.read(transcript_id_file, header=False)
     # Use a context manager so the output handle is flushed and closed
     # even when record generation or writing raises.
     with open(output_gff, "w") as out_fd:
         GFF.write(
             self.record_with_extracted_transcripts_generator(
                 input_gff, transcript_ids), out_fd)
Esempio n. 2
0
 def extract_monocluster_ids_from_file(self,
                                       dir_with_cluster_files,
                                       out_file,
                                       file_with_white_list_ids=None):
     """
     Read cluster files from a directory and extract monocluster ids.

     dir_with_cluster_files: directory handed to
         self.read_cluster_files_from_dir
     out_file: forwarded to self.extract_monocluster_ids as out_file
     file_with_white_list_ids: optional file with ids; when given it is
         read into an IdSet and forwarded as white_list_ids, otherwise
         None is forwarded

     Returns whatever self.extract_monocluster_ids returns.
     """
     # filenames are counted as species names
     if file_with_white_list_ids:
         allowed_ids = IdSet()
         allowed_ids.read(file_with_white_list_ids)
     else:
         allowed_ids = None

     return self.extract_monocluster_ids(
         self.read_cluster_files_from_dir(dir_with_cluster_files),
         out_file=out_file,
         white_list_ids=allowed_ids)
Esempio n. 3
0
    def intersect_ids_from_files(files_with_ids_from_group_a,
                                 files_with_ids_from_group_b,
                                 result_file=None,
                                 mode="common"):
        """
        Combine two groups of id files with a set operation and write the result.

        files_with_ids_from_group_a, files_with_ids_from_group_b:
            a single filename (str) or an iterable of filenames; ids from
            all files of a group are united. Lines are read with
            comments_prefix="#".
        result_file: path of the output file; when not set the result is
            written to sys.stdout
        mode:
            common     - ids present in both groups (a & b)
            only_a     - ids present only in group A (a - b)
            only_b     - ids present only in group B (b - a)
            not_common - ids present in exactly one group (a ^ b)
            combine    - ids present in any group (a | b)
            count      - write a table with sizes of all sets above
                         instead of the ids themselves

        Raises ValueError for an unknown mode, before any file is read or
        the result file is created.
        """
        set_operations = {
            "common": lambda a, b: a & b,
            "only_a": lambda a, b: a - b,
            "only_b": lambda a, b: b - a,
            "not_common": lambda a, b: a ^ b,
            "combine": lambda a, b: a | b,
        }
        # Validate first: otherwise an unknown mode would only fail after
        # all inputs were read and the result file was already truncated.
        if mode != "count" and mode not in set_operations:
            raise ValueError(
                "Unknown mode '%s'. Allowed modes: %s" %
                (mode, ", ".join(sorted(set_operations) + ["count"])))

        def read_group(files_with_ids):
            # Accept both a single filename and an iterable of filenames.
            if isinstance(files_with_ids, str):
                filenames = [files_with_ids]
            else:
                filenames = files_with_ids
            merged = IdSet()
            for filename in filenames:
                id_set = IdSet()
                id_set.read(filename, comments_prefix="#")
                merged = merged | id_set
            return merged

        a = read_group(files_with_ids_from_group_a)
        b = read_group(files_with_ids_from_group_b)

        result_fd = open(result_file, "w") if result_file else sys.stdout
        try:
            if mode != "count":
                final_set = IdSet(set_operations[mode](a, b))
                final_set.write(result_fd)
            else:
                result_fd.write(
                    "Group_A\t%i\nGroup_B\t%i\nCommon\t%i\nOnly_group_A\t%i\nOnly_group_B\t%i\nNot_common\t%i\nAll\t%i\n"
                    % (len(a), len(b), len(a & b), len(a - b), len(b - a),
                       len(a ^ b), len(a | b)))
        finally:
            # Close only the handle opened here; sys.stdout belongs to the
            # caller.
            if result_fd is not sys.stdout:
                result_fd.close()
Esempio n. 4
0
    def extract_sequences_by_clusters(self,
                                      dir_with_cluster_files,
                                      dir_with_sequence_files,
                                      output_dir,
                                      file_with_white_list_cluster_ids=None,
                                      mode="families",
                                      sequence_file_extension="fasta",
                                      sequence_file_format="fasta",
                                      label_species=False,
                                      separator_for_labeling="@",
                                      species_label_first=True):
        """
        Extract sequences of clustered genes into per-family or per-species files.

        basenames of cluster and sequence files must be same

        dir_with_cluster_files: directory read by
            self.read_cluster_files_from_dir
        dir_with_sequence_files: directory with one sequence file per
            species, named <species>.<sequence_file_extension>
        output_dir: directory for output files
        file_with_white_list_cluster_ids: optional file with ids used to
            restrict extraction (forwarded as white_list_ids)
        mode:
            families - extract sequences from clusters in separate files
                       (one output file per cluster),
            species - extract sequences from species to separate files
            any other value writes nothing
        sequence_file_extension: extension of input and output files
        sequence_file_format: format name passed to SeqIO
        label_species: in "families" mode, add species label to record ids
        separator_for_labeling: separator between label and record id
        species_label_first: put the label before (True) or after (False)
            the record id

        A temporary index file "<species>_tmp.idx" is created in the
        current directory for every species and removed at the end, also
        when extraction fails.
        """
        white_list_ids = None
        if file_with_white_list_cluster_ids:
            white_list_ids = IdSet()
            white_list_ids.read(file_with_white_list_cluster_ids)

        clusters_dict = self.read_cluster_files_from_dir(
            dir_with_cluster_files)
        cluster_names = self.get_cluster_names(clusters_dict,
                                               white_list_ids=white_list_ids)

        sequence_super_dict = OrderedDict()
        out_dir = self.check_path(output_dir)
        sequence_dir = self.check_path(dir_with_sequence_files)

        if species_label_first:
            label_sequence = lambda label, name: "%s%s%s" % (
                label, separator_for_labeling, name)
        else:
            label_sequence = lambda label, name: "%s%s%s" % (
                name, separator_for_labeling, label)

        def per_family_record_generator(seq_super_dict, clust_dict,
                                        cluster_id):
            for species in seq_super_dict:
                for record_id in clust_dict[species][cluster_id]:
                    if label_species:
                        # Copy before relabeling so the indexed record is
                        # not modified.
                        record = deepcopy(seq_super_dict[species][record_id])
                        record.id = label_sequence(species, record_id)
                        yield record
                    else:
                        yield seq_super_dict[species][record_id]

        index_files = []
        try:
            for species in clusters_dict:
                idx_file = "%s_tmp.idx" % species
                sequence_file = "%s%s.%s" % (sequence_dir, species,
                                             sequence_file_extension)
                # Register before indexing: the index file may already be
                # on disk when index_db fails midway.
                index_files.append(idx_file)
                sequence_super_dict[species] = SeqIO.index_db(
                    idx_file, sequence_file, format=sequence_file_format)

            if mode == "species":
                sequence_names = self.get_sequence_names(
                    clusters_dict,
                    write_ids=False,
                    out_prefix=None,
                    white_list_ids=white_list_ids)
                for species in sequence_names:
                    out_file = "%s%s.%s" % (out_dir, species,
                                            sequence_file_extension)
                    SeqIO.write(SequenceRoutines.record_by_id_generator(
                        sequence_super_dict[species],
                        sequence_names[species]),
                                out_file,
                                format=sequence_file_format)
            elif mode == "families":
                for cluster_name in cluster_names:
                    out_file = "%s%s.%s" % (out_dir, cluster_name,
                                            sequence_file_extension)
                    SeqIO.write(per_family_record_generator(
                        sequence_super_dict, clusters_dict, cluster_name),
                                out_file,
                                format=sequence_file_format)
        finally:
            # Always clean up temporary indexes, even after an error.
            for idx_file in index_files:
                if os.path.exists(idx_file):
                    os.remove(idx_file)
0
elif args.swissprot_db:
    gene_ids_white_list = [output_swissprot_supported_genes_ids]
else:
    gene_ids_white_list = [all_annotated_genes_ids]

# Build two id lists by subtracting the black list.
# NOTE(review): mode="only_a" is expected to keep ids present in the first
# group and absent from the second — confirm against
# HMMER3.intersect_ids_from_files.
# First call: all annotated genes minus black list -> genes_not_masked_ids.
HMMER3.intersect_ids_from_files([all_annotated_genes_ids],
                                gene_ids_black_list,
                                genes_not_masked_ids,
                                mode="only_a")
# Second call: white list (selected above) minus black list -> final_genes_ids.
HMMER3.intersect_ids_from_files(gene_ids_white_list,
                                gene_ids_black_list,
                                final_genes_ids,
                                mode="only_a")
#"""
# Load the final gene ids written by the call above.
final_ids = IdSet()
final_ids.read(final_genes_ids)

# Extract annotations for the final ids from output_gff into final_gff;
# ["gene"] is passed as the third argument (presumably the feature types to
# match ids against — TODO confirm).
AnnotationsRoutines.extract_annotation_from_gff(output_gff, final_ids,
                                                ["gene"], final_gff)
print("Extracting CDS...")
# Derive the CDS-only GFF from the filtered annotation.
AUGUSTUS.extract_CDS_annotations_from_output(final_gff, final_CDS_gff)

print("Drawing histograms...")

for stat_file in output_evidence_stats, output_supported_stats, \
                 output_swissprot_pfam_or_hints_supported_transcripts_longest_pep_evidence, \
                 output_swissprot_pfam_and_hints_supported_transcripts_longest_pep_evidence, \
                 output_swissprot_pfam_or_hints_supported_transcripts_evidence, \
                 output_swissprot_pfam_and_hints_supported_transcripts_evidence:

    MatplotlibRoutines.percent_histogram_from_file(