Exemple #1
0
    def intersect_ids_from_files(files_with_ids_from_group_a, files_with_ids_from_group_b,
                                 result_file=None, mode="common"):
        """
        Read two groups of id files and write the result of a set operation on them.

        Ids read from all files of one group are united before the two groups
        are compared. Every file is read with comments_prefix="#".

        :param files_with_ids_from_group_a: filename or iterable of filenames of group A
        :param files_with_ids_from_group_b: filename or iterable of filenames of group B
        :param result_file: name of the output file; sys.stdout is used if it is not set
        :param mode: "common" (A & B), "only_a" (A - B), "only_b" (B - A),
                     "not_common" (A ^ B), "combine" (A | B) or "count"
                     (write sizes of both groups and of all the sets above instead of ids)
        :raises ValueError: if mode is not one of the values listed above
        """
        set_operations = {
            "common": lambda a, b: a & b,
            "only_a": lambda a, b: a - b,
            "only_b": lambda a, b: b - a,
            "not_common": lambda a, b: a ^ b,
            "combine": lambda a, b: a | b,
        }
        # Reject unknown modes before any file is read or the result file is truncated
        if mode != "count" and mode not in set_operations:
            raise ValueError("Unknown mode %r. Allowed modes: %s" %
                             (mode, ", ".join(sorted(set_operations) + ["count"])))

        def read_group(files_with_ids):
            # A single filename is accepted as well as an iterable of filenames
            filenames = [files_with_ids] if isinstance(files_with_ids, str) else files_with_ids
            group_ids = IdSet()
            for filename in filenames:
                id_set = IdSet()
                id_set.read(filename, comments_prefix="#")
                group_ids = group_ids | id_set
            return group_ids

        a = read_group(files_with_ids_from_group_a)
        b = read_group(files_with_ids_from_group_b)

        result_fd = open(result_file, "w") if result_file else sys.stdout
        try:
            if mode != "count":
                final_set = IdSet(set_operations[mode](a, b))
                final_set.write(result_fd)
            else:
                result_fd.write("Group_A\t%i\nGroup_B\t%i\nCommon\t%i\nOnly_group_A\t%i\nOnly_group_B\t%i\nNot_common\t%i\nAll\t%i\n" %
                                (len(a), len(b), len(a & b), len(a - b), len(b - a), len(a ^ b), len(a | b)))
        finally:
            # Close only the handle opened here; sys.stdout belongs to the caller.
            # Closing an already closed file object is a no-op.
            if result_fd is not sys.stdout:
                result_fd.close()
Exemple #2
0
 def extract_monocluster_ids_from_file(self,
                                       dir_with_cluster_files,
                                       out_file,
                                       file_with_white_list_ids=None):
     """
     Read cluster files from a directory and extract monocluster ids from them.

     Filenames of the cluster files are counted as species names.

     :param dir_with_cluster_files: directory passed to read_cluster_files_from_dir
     :param out_file: forwarded to extract_monocluster_ids as out_file
     :param file_with_white_list_ids: optional file with ids; if set, it is read
                                      into an IdSet and forwarded as white_list_ids
     :return: value returned by extract_monocluster_ids
     """
     if file_with_white_list_ids:
         white_list = IdSet()
         white_list.read(file_with_white_list_ids)
     else:
         white_list = None

     return self.extract_monocluster_ids(
         self.read_cluster_files_from_dir(dir_with_cluster_files),
         out_file=out_file,
         white_list_ids=white_list)
Exemple #3
0
 def extract_transcripts_by_ids(self, input_gff, transcript_id_file, output_gff):
     """
     Write records produced for the requested transcript ids to a new GFF file.

     :param input_gff: passed to record_with_extracted_transcripts_generator
     :param transcript_id_file: file with transcript ids, read with header=False
     :param output_gff: name of the output file (opened for writing, overwritten)
     """
     transcript_ids = IdSet()
     transcript_ids.read(transcript_id_file, header=False)
     records = self.record_with_extracted_transcripts_generator(input_gff, transcript_ids)
     # Context manager guarantees that the output handle is flushed and closed,
     # even if writing fails midway
     with open(output_gff, "w") as out_fd:
         GFF.write(records, out_fd)
Exemple #4
0
    def extract_sequences_by_clusters(self,
                                      dir_with_cluster_files,
                                      dir_with_sequence_files,
                                      output_dir,
                                      file_with_white_list_cluster_ids=None,
                                      mode="families",
                                      sequence_file_extension="fasta",
                                      sequence_file_format="fasta",
                                      label_species=False,
                                      separator_for_labeling="@",
                                      species_label_first=True):
        """
        Extract sequences of clusters into per-family or per-species files.

        Basenames of cluster and sequence files must be the same: for every
        species key of the cluster dictionary the sequence file named
        <species>.<sequence_file_extension> from dir_with_sequence_files is indexed.

        mode:
            families - extract sequences from clusters to separate files
                       (one file per cluster name),
            species - extract sequences from species to separate files
        Any other mode value writes no sequence files.

        label_species, separator_for_labeling and species_label_first are used
        only in "families" mode: if label_species is set, the id of every record
        is joined with the species name via separator_for_labeling, with the
        species name first if species_label_first is set.

        Temporary index files "<species>_tmp.idx" are created (relative paths)
        and removed before returning, including when an error is raised.
        """
        white_list_ids = None
        if file_with_white_list_cluster_ids:
            white_list_ids = IdSet()
            white_list_ids.read(file_with_white_list_cluster_ids)

        clusters_dict = self.read_cluster_files_from_dir(
            dir_with_cluster_files)
        cluster_names = self.get_cluster_names(clusters_dict,
                                               white_list_ids=white_list_ids)

        sequence_super_dict = OrderedDict()
        out_dir = FileRoutines.check_path(output_dir)

        # Registered before indexing starts so that a partially written index
        # is cleaned up too
        idx_files = []

        try:
            for species in clusters_dict:
                idx_file = "%s_tmp.idx" % species
                idx_files.append(idx_file)
                sequence_file = "%s%s.%s" % (FileRoutines.check_path(
                    dir_with_sequence_files), species, sequence_file_extension)
                sequence_super_dict[species] = SeqIO.index_db(
                    idx_file, sequence_file, format=sequence_file_format)

            if mode == "species":
                sequence_names = self.get_sequence_names(
                    clusters_dict,
                    write_ids=False,
                    out_prefix=None,
                    white_list_ids=white_list_ids)
                for species in sequence_names:
                    out_file = "%s%s.%s" % (out_dir, species,
                                            sequence_file_extension)
                    SeqIO.write(SequenceRoutines.record_by_id_generator(
                        sequence_super_dict[species], sequence_names[species]),
                                out_file,
                                format=sequence_file_format)
            elif mode == "families":

                def label_sequence(label, name):
                    # Order of species label and record id is set by species_label_first
                    if species_label_first:
                        return "%s%s%s" % (label, separator_for_labeling, name)
                    return "%s%s%s" % (name, separator_for_labeling, label)

                def per_family_record_generator(seq_super_dict, clust_dict,
                                                cluster_id):
                    for species in seq_super_dict:
                        for record_id in clust_dict[species][cluster_id]:
                            if label_species:
                                # Copy the record so that the renamed id does not
                                # leak into the object obtained from the index
                                record = deepcopy(
                                    seq_super_dict[species][record_id])
                                record.id = label_sequence(species, record_id)
                                yield record
                            else:
                                yield seq_super_dict[species][record_id]

                for cluster_name in cluster_names:
                    out_file = "%s%s.%s" % (out_dir, cluster_name,
                                            sequence_file_extension)
                    SeqIO.write(per_family_record_generator(
                        sequence_super_dict, clusters_dict, cluster_name),
                                out_file,
                                format=sequence_file_format)
        finally:
            # NOTE(review): the index objects are not closed explicitly before
            # removal (same as before) - confirm this is fine on the target platform
            for idx_file in idx_files:
                if os.path.exists(idx_file):
                    os.remove(idx_file)
parser.add_argument(
    "-d", "--id_file",
    action="store",
    dest="id_file",
    required=True,
    help="File with ids of families to extract")
parser.add_argument(
    "-o", "--output",
    action="store",
    dest="output",
    default="stdout",
    help="File to write extracted families. Default - stdout")
parser.add_argument(
    "-v", "--verbose",
    action="store_true",
    dest="verbose",
    help="Print not found ids. Default - no")

args = parser.parse_args()

# The literal value "stdout" selects standard output instead of a file
if args.output == "stdout":
    out_file = sys.stdout
else:
    out_file = open(args.output, "w")

# All families from the input file
fam_dict = SynDict()
fam_dict.read(args.input)

# Ids of the families requested by the user
id_set = IdSet()
id_set.read(args.id_file)

# Keep only requested families that are present in the input
extracted_dict = SynDict()
for id_entry in id_set:
    if id_entry in fam_dict:
        extracted_dict[id_entry] = fam_dict[id_entry]
    elif args.verbose:
        print("%s was not found" % id_entry)

extracted_dict.write(out_file, close_after_if_file_object=True)




Exemple #6
0
elif args.swissprot_db:
    gene_ids_white_list = [output_swissprot_supported_genes_ids]
else:
    gene_ids_white_list = [all_annotated_genes_ids]

# Both calls use the same black list and mode "only_a"; only the files with
# source ids and the result file differ:
#   all annotated genes -> genes_not_masked_ids
#   white-listed genes  -> final_genes_ids
for _source_id_files, _result_ids_file in (
        ([all_annotated_genes_ids], genes_not_masked_ids),
        (gene_ids_white_list, final_genes_ids)):
    HMMER3.intersect_ids_from_files(_source_id_files,
                                    gene_ids_black_list,
                                    _result_ids_file,
                                    mode="only_a")

# Load the final gene ids written by the second call above
final_ids = IdSet()
final_ids.read(final_genes_ids)

# Extract annotations for the final ids (feature list ["gene"]) into final_gff,
# then CDS annotations from final_gff into final_CDS_gff
AnnotationsRoutines.extract_annotation_from_gff(
    output_gff, final_ids, ["gene"], final_gff)
AUGUSTUS.extract_CDS_annotations_from_output(
    final_gff, final_CDS_gff)

for stat_file in output_evidence_stats, output_supported_stats, \
                 output_swissprot_pfam_or_hints_supported_transcripts_longest_pep_evidence, \
                 output_swissprot_pfam_and_hints_supported_transcripts_longest_pep_evidence, \
                 output_swissprot_pfam_or_hints_supported_transcripts_evidence, \
                 output_swissprot_pfam_and_hints_supported_transcripts_evidence:

    MatplotlibRoutines.percent_histogram_from_file(
        stat_file,
        stat_file,
        data_type=None,