def intersect_ids_from_files(files_with_ids_from_group_a, files_with_ids_from_group_b,
                             result_file=None, mode="common"):
    """
    Combine two groups of id files with a set operation and write the result.

    Each group may be given either as a single filename (str) or as an
    iterable of filenames; ids from all files of a group are merged into
    one set before the groups are compared. Files are read with
    IdSet.read(filename, comments_prefix="#").

    mode:
        common     - ids present in both groups (a & b)
        only_a     - ids present only in group A (a - b)
        only_b     - ids present only in group B (b - a)
        not_common - ids present in exactly one group (a ^ b)
        combine    - ids present in at least one group (a | b)
        count      - write a tab-separated table with the sizes of both
                     groups and of all the sets listed above instead of ids

    result_file: path of the output file; if empty/None output goes to
        sys.stdout.

    Raises ValueError if mode is not one of the values listed above. The
    check is done before any file is read or opened, so an invalid mode
    never truncates an existing result file.
    """
    set_operations = {
        "common": lambda a, b: a & b,
        "only_a": lambda a, b: a - b,
        "only_b": lambda a, b: b - a,
        "not_common": lambda a, b: a ^ b,
        "combine": lambda a, b: a | b,
    }
    if mode != "count" and mode not in set_operations:
        raise ValueError("Unknown mode '%s'. Allowed modes: %s"
                         % (mode, ", ".join(sorted(list(set_operations) + ["count"]))))

    def read_group(files_with_ids):
        # A bare string is treated as a single filename, not as an iterable of characters.
        filenames = [files_with_ids] if isinstance(files_with_ids, str) else files_with_ids
        merged = IdSet()
        for filename in filenames:
            id_set = IdSet()
            id_set.read(filename, comments_prefix="#")
            merged = merged | id_set
        return merged

    a = read_group(files_with_ids_from_group_a)
    b = read_group(files_with_ids_from_group_b)

    result_fd = open(result_file, "w") if result_file else sys.stdout
    try:
        if mode != "count":
            # Re-wrap in IdSet: the set operators may return a plain set without write().
            final_set = IdSet(set_operations[mode](a, b))
            final_set.write(result_fd)
        else:
            result_fd.write("Group_A\t%i\nGroup_B\t%i\nCommon\t%i\nOnly_group_A\t%i\nOnly_group_B\t%i\nNot_common\t%i\nAll\t%i\n" %
                            (len(a), len(b), len(a & b), len(a - b), len(b - a), len(a ^ b), len(a | b)))
    finally:
        # Close only handles opened here; sys.stdout belongs to the caller.
        if result_fd is not sys.stdout:
            result_fd.close()
def extract_monocluster_ids_from_file(self, dir_with_cluster_files, out_file, file_with_white_list_ids=None):
    """
    Read cluster files from a directory and extract monocluster ids from them.

    dir_with_cluster_files: directory handed to read_cluster_files_from_dir;
        filenames are counted as species names.
    out_file: forwarded to extract_monocluster_ids as out_file.
    file_with_white_list_ids: optional file with ids; when given it is loaded
        into an IdSet and forwarded as white_list_ids, otherwise None is passed.

    Returns whatever extract_monocluster_ids returns.
    """
    if file_with_white_list_ids:
        white_list_ids = IdSet()
        white_list_ids.read(file_with_white_list_ids)
    else:
        white_list_ids = None

    clusters = self.read_cluster_files_from_dir(dir_with_cluster_files)
    return self.extract_monocluster_ids(clusters,
                                        out_file=out_file,
                                        white_list_ids=white_list_ids)
def extract_transcripts_by_ids(self, input_gff, transcript_id_file, output_gff):
    """
    Write to output_gff the records produced for the requested transcripts.

    input_gff: GFF file passed to record_with_extracted_transcripts_generator.
    transcript_id_file: file with transcript ids, read into an IdSet
        with header=False.
    output_gff: path of the GFF file to create (opened in "w" mode, so an
        existing file is overwritten).
    """
    transcript_ids = IdSet()
    transcript_ids.read(transcript_id_file, header=False)

    # Build the record source before opening the output, as the original did,
    # so a failure here does not truncate an existing output file.
    records = self.record_with_extracted_transcripts_generator(input_gff, transcript_ids)

    # The context manager guarantees the output handle is flushed and closed,
    # even if writing fails part-way through.
    with open(output_gff, "w") as out_fd:
        GFF.write(records, out_fd)
def extract_sequences_by_clusters(self, dir_with_cluster_files, dir_with_sequence_files, output_dir,
                                  file_with_white_list_cluster_ids=None, mode="families",
                                  sequence_file_extension="fasta", sequence_file_format="fasta",
                                  label_species=False, separator_for_labeling="@",
                                  species_label_first=True):
    """
    Extract sequences listed in cluster files into per-family or per-species files.

    Basenames of cluster and sequence files must be the same: for every
    species key in the cluster dictionary the sequence file
    <dir_with_sequence_files><species>.<sequence_file_extension> is indexed.

    mode:
        families - (default) write sequences of each cluster to a separate
                   file <output_dir><cluster_name>.<sequence_file_extension>
        species  - write sequences of each species to a separate file
                   <output_dir><species>.<sequence_file_extension>
        Any other value writes no sequence files.

    file_with_white_list_cluster_ids: optional file with cluster ids, loaded
        into an IdSet and passed as white_list_ids to get_cluster_names and
        get_sequence_names.
    label_species: only used in "families" mode. If True, the id of each
        written record is combined with the species name using
        separator_for_labeling; species_label_first selects whether the
        species name goes before (True) or after (False) the record id.

    A temporary index file "<species>_tmp.idx" is created per species
    (relative path, i.e. in the current working directory) and removed
    before returning, including when an error interrupts the extraction.
    """
    white_list_ids = None
    if file_with_white_list_cluster_ids:
        white_list_ids = IdSet()
        white_list_ids.read(file_with_white_list_cluster_ids)

    clusters_dict = self.read_cluster_files_from_dir(dir_with_cluster_files)
    cluster_names = self.get_cluster_names(clusters_dict, white_list_ids=white_list_ids)

    sequence_super_dict = OrderedDict()
    out_dir = FileRoutines.check_path(output_dir)

    # Index files requested so far; registered before indexing starts so that
    # a partially written index is cleaned up too.
    idx_files = []
    try:
        for species in clusters_dict:
            idx_file = "%s_tmp.idx" % species
            sequence_file = "%s%s.%s" % (FileRoutines.check_path(dir_with_sequence_files),
                                         species, sequence_file_extension)
            idx_files.append(idx_file)
            sequence_super_dict[species] = SeqIO.index_db(idx_file, sequence_file,
                                                          format=sequence_file_format)

        if mode == "species":
            sequence_names = self.get_sequence_names(clusters_dict, write_ids=False, out_prefix=None,
                                                     white_list_ids=white_list_ids)
            for species in sequence_names:
                out_file = "%s%s.%s" % (out_dir, species, sequence_file_extension)
                SeqIO.write(SequenceRoutines.record_by_id_generator(sequence_super_dict[species],
                                                                    sequence_names[species]),
                            out_file, format=sequence_file_format)

        elif mode == "families":
            def per_family_record_generator(seq_super_dict, clust_dict, cluster_id):
                # Yield records of one cluster, species by species.
                if species_label_first:
                    label_sequence = lambda label, name: "%s%s%s" % (label, separator_for_labeling, name)
                else:
                    label_sequence = lambda label, name: "%s%s%s" % (name, separator_for_labeling, label)

                for species in seq_super_dict:
                    for record_id in clust_dict[species][cluster_id]:
                        if label_species:
                            # Rename a copy, not the object returned by the index.
                            record = deepcopy(seq_super_dict[species][record_id])
                            record.id = label_sequence(species, record_id)
                            yield record
                        else:
                            yield seq_super_dict[species][record_id]

            for cluster_name in cluster_names:
                out_file = "%s%s.%s" % (out_dir, cluster_name, sequence_file_extension)
                SeqIO.write(per_family_record_generator(sequence_super_dict, clusters_dict, cluster_name),
                            out_file, format=sequence_file_format)
    finally:
        # Always drop the temporary indexes; the existence check keeps a
        # failed indexing step from being masked by a FileNotFoundError here.
        for idx_file in idx_files:
            if os.path.exists(idx_file):
                os.remove(idx_file)
# Remaining command line options.
parser.add_argument(
    "-d", "--id_file",
    action="store",
    dest="id_file",
    required=True,
    help="File with ids of families to extract",
)
parser.add_argument(
    "-o", "--output",
    action="store",
    dest="output",
    default="stdout",
    help="File to write extracted families. Default - stdout",
)
parser.add_argument(
    "-v", "--verbose",
    action="store_true",
    dest="verbose",
    help="Print not found ids. Default - no",
)

args = parser.parse_args()

# The literal value "stdout" selects standard output; anything else is a path.
if args.output == "stdout":
    out_file = sys.stdout
else:
    out_file = open(args.output, "w")

# All known families.
fam_dict = SynDict()
fam_dict.read(args.input)

# Ids of the families requested by the user.
id_set = IdSet()
id_set.read(args.id_file)

# Keep only requested families that are present; optionally report the missing ones.
extracted_dict = SynDict()
for id_entry in id_set:
    if id_entry in fam_dict:
        extracted_dict[id_entry] = fam_dict[id_entry]
    elif args.verbose:
        print("%s was not found" % id_entry)

extracted_dict.write(out_file, close_after_if_file_object=True)
elif args.swissprot_db: gene_ids_white_list = [output_swissprot_supported_genes_ids] else: gene_ids_white_list = [all_annotated_genes_ids] HMMER3.intersect_ids_from_files([all_annotated_genes_ids], gene_ids_black_list, genes_not_masked_ids, mode="only_a") HMMER3.intersect_ids_from_files(gene_ids_white_list, gene_ids_black_list, final_genes_ids, mode="only_a") final_ids = IdSet() final_ids.read(final_genes_ids) AnnotationsRoutines.extract_annotation_from_gff(output_gff, final_ids, ["gene"], final_gff) AUGUSTUS.extract_CDS_annotations_from_output(final_gff, final_CDS_gff) for stat_file in output_evidence_stats, output_supported_stats, \ output_swissprot_pfam_or_hints_supported_transcripts_longest_pep_evidence, \ output_swissprot_pfam_and_hints_supported_transcripts_longest_pep_evidence, \ output_swissprot_pfam_or_hints_supported_transcripts_evidence, \ output_swissprot_pfam_and_hints_supported_transcripts_evidence: MatplotlibRoutines.percent_histogram_from_file( stat_file, stat_file, data_type=None,