def intersect_ids(list_of_group_a, list_of_group_b, mode="common"):
    # possible modes: common, only_a, only_b, not_common, combine, count
    a = IdSet()
    b = IdSet()

    if mode == "common":
        expression = lambda a, b: a & b
    elif mode == "only_a":
        expression = lambda a, b: a - b
    elif mode == "only_b":
        expression = lambda a, b: b - a
    elif mode == "not_common":
        expression = lambda a, b: a ^ b
    elif mode == "combine":
        expression = lambda a, b: a | b

    for id_list in list_of_group_a:
        a = a | IdSet(id_list)

    for id_list in list_of_group_b:
        b = b | IdSet(id_list)

    if mode != "count":
        return IdSet(expression(a, b))
    else:
        return len(a), len(b), len(a & b), len(a - b), len(b - a), len(a ^ b), len(a | b)
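# Hedged usage sketch (not part of the original module): each mode above maps
# to a standard set operation, shown here with built-in sets and hypothetical
# gene IDs standing in for IdSet.
a = {"gene1", "gene2", "gene3"}
b = {"gene2", "gene3", "gene4"}
print(sorted(a & b))  # common:      ['gene2', 'gene3']
print(sorted(a - b))  # only_a:      ['gene1']
print(sorted(b - a))  # only_b:      ['gene4']
print(sorted(a ^ b))  # not_common:  ['gene1', 'gene4']
print(sorted(a | b))  # combine:     ['gene1', 'gene2', 'gene3', 'gene4']
# "count" returns the tuple (len(a), len(b), len(a & b), len(a - b), len(b - a), len(a ^ b), len(a | b))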
def extract_transcripts_by_ids(self, input_gff, transcript_id_file, output_gff):
    transcript_ids = IdSet()
    transcript_ids.read(transcript_id_file, header=False)
    GFF.write(self.record_with_extracted_transcripts_generator(input_gff, transcript_ids),
              open(output_gff, "w"))
def extract_monocluster_ids_from_file(self, dir_with_cluster_files, out_file, file_with_white_list_ids=None):
    # filenames are counted as species names
    white_list_ids = None
    if file_with_white_list_ids:
        white_list_ids = IdSet()
        white_list_ids.read(file_with_white_list_ids)

    clusters_dict = self.read_cluster_files_from_dir(dir_with_cluster_files)
    monoclusters = self.extract_monocluster_ids(clusters_dict, out_file=out_file, white_list_ids=white_list_ids)
    return monoclusters
def get_sequence_names(clusters_dict, write_ids=False, out_prefix=None, white_list_ids=None):
    sequence_names_dict = SynDict()
    for species in clusters_dict:
        sequence_names_dict[species] = IdSet()

    for species in clusters_dict:
        for cluster_id in clusters_dict[species]:
            if white_list_ids:
                if cluster_id not in white_list_ids:
                    continue
            sequence_names_dict[species] = sequence_names_dict[species] | IdSet(clusters_dict[species][cluster_id])

    if write_ids:
        for species in clusters_dict:
            out_file = "%s_%s.ids" % (out_prefix, species) if out_prefix else "%s.ids" % species
            sequence_names_dict[species].write(out_file)

    return sequence_names_dict
def extract_monocluster_ids(self, clusters_dict, white_list_ids=None, out_file=None):
    """
    Extracts clusters with only one sequence in all species.
    """
    monocluster_ids = IdSet()
    cluster_names = self.get_cluster_names(clusters_dict)

    for cluster_name in cluster_names:
        for species in clusters_dict:
            if white_list_ids:
                if cluster_name not in white_list_ids:
                    break
            if cluster_name not in clusters_dict[species]:
                break
            if len(clusters_dict[species][cluster_name]) > 1:
                break
        else:
            monocluster_ids.add(cluster_name)

    if out_file:
        monocluster_ids.write(out_file)
    return monocluster_ids
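# Hedged sketch (hypothetical data, plain dicts/sets instead of the project's
# classes) of the for/else idiom used above: the else branch runs only when the
# inner loop over species finishes without a break, i.e. every species carries
# the cluster and holds exactly one sequence in it.
clusters = {
    "speciesA": {"cl1": ["a1"], "cl2": ["a2", "a3"]},
    "speciesB": {"cl1": ["b1"], "cl2": ["b2"]},
}
monoclusters = set()
for cluster_name in ("cl1", "cl2"):
    for species in clusters:
        if cluster_name not in clusters[species] or len(clusters[species][cluster_name]) > 1:
            break
    else:
        monoclusters.add(cluster_name)
print(monoclusters)  # {'cl1'}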
def get_cluster_names(clusters_dict, out_file=None, white_list_ids=None):
    cluster_names = IdSet()
    for species in clusters_dict:
        species_clusters = IdSet(clusters_dict[species].keys())
        cluster_names |= species_clusters
    if out_file:
        cluster_names.write(out_file)
    return cluster_names & IdSet(white_list_ids) if white_list_ids else cluster_names
def extract_sequences_by_clusters(self, dir_with_cluster_files, dir_with_sequence_files, output_dir,
                                  file_with_white_list_cluster_ids=None, mode="families",
                                  sequence_file_extension="fasta", sequence_file_format="fasta",
                                  label_species=False, separator_for_labeling="@", species_label_first=True):
    """
    Basenames of cluster and sequence files must be the same.

    mode: families - extract sequences of each cluster into a separate file,
          species  - extract sequences of each species into a separate file
    """
    white_list_ids = None
    if file_with_white_list_cluster_ids:
        white_list_ids = IdSet()
        white_list_ids.read(file_with_white_list_cluster_ids)

    clusters_dict = self.read_cluster_files_from_dir(dir_with_cluster_files)
    cluster_names = self.get_cluster_names(clusters_dict, white_list_ids=white_list_ids)

    sequence_super_dict = OrderedDict()
    out_dir = self.check_path(output_dir)

    for species in clusters_dict:
        idx_file = "%s_tmp.idx" % species
        sequence_file = "%s%s.%s" % (self.check_path(dir_with_sequence_files), species, sequence_file_extension)
        sequence_super_dict[species] = SeqIO.index_db(idx_file, sequence_file, format=sequence_file_format)

    if mode == "species":
        sequence_names = self.get_sequence_names(clusters_dict, write_ids=False, out_prefix=None,
                                                 white_list_ids=white_list_ids)
        for species in sequence_names:
            out_file = "%s%s.%s" % (out_dir, species, sequence_file_extension)
            SeqIO.write(SequenceRoutines.record_by_id_generator(sequence_super_dict[species],
                                                                sequence_names[species]),
                        out_file, format=sequence_file_format)
    elif mode == "families":

        def per_family_record_generator(seq_super_dict, clust_dict, cluster_id):
            if species_label_first:
                label_sequence = lambda label, name: "%s%s%s" % (label, separator_for_labeling, name)
            else:
                label_sequence = lambda label, name: "%s%s%s" % (name, separator_for_labeling, label)

            for species in seq_super_dict:
                for record_id in clust_dict[species][cluster_id]:
                    if label_species:
                        record = deepcopy(seq_super_dict[species][record_id])
                        record.id = label_sequence(species, record_id)
                        yield record
                    else:
                        yield seq_super_dict[species][record_id]

        for cluster_name in cluster_names:
            out_file = "%s%s.%s" % (out_dir, cluster_name, sequence_file_extension)
            SeqIO.write(per_family_record_generator(sequence_super_dict, clusters_dict, cluster_name),
                        out_file, format=sequence_file_format)

    for species in clusters_dict:
        os.remove("%s_tmp.idx" % species)
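# Hedged sketch of the record labeling convention assumed above when
# label_species=True (species and record names are hypothetical).
separator_for_labeling = "@"
species, record_id = "speciesA", "gene1.t1"
print("%s%s%s" % (species, separator_for_labeling, record_id))  # species_label_first=True  -> speciesA@gene1.t1
print("%s%s%s" % (record_id, separator_for_labeling, species))  # species_label_first=False -> gene1.t1@speciesA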
def intersect_ids_from_files(files_with_ids_from_group_a, files_with_ids_from_group_b, result_file=None, mode="common"):
    a = IdSet()
    b = IdSet()

    if mode == "common":
        expression = lambda a, b: a & b
    elif mode == "only_a":
        expression = lambda a, b: a - b
    elif mode == "only_b":
        expression = lambda a, b: b - a
    elif mode == "not_common":
        expression = lambda a, b: a ^ b
    elif mode == "combine":
        expression = lambda a, b: a | b

    for filename in [files_with_ids_from_group_a] if isinstance(files_with_ids_from_group_a, str) \
            else files_with_ids_from_group_a:
        id_set = IdSet()
        id_set.read(filename, comments_prefix="#")
        a = a | id_set

    for filename in [files_with_ids_from_group_b] if isinstance(files_with_ids_from_group_b, str) \
            else files_with_ids_from_group_b:
        id_set = IdSet()
        id_set.read(filename, comments_prefix="#")
        b = b | id_set

    result_fd = open(result_file, "w") if result_file else sys.stdout

    if mode != "count":
        final_set = IdSet(expression(a, b))
        final_set.write(result_fd)
    else:
        result_fd.write("Group_A\t%i\nGroup_B\t%i\nCommon\t%i\nOnly_group_A\t%i\nOnly_group_B\t%i\nNot_common\t%i\nAll\t%i\n"
                        % (len(a), len(b), len(a & b), len(a - b), len(b - a), len(a ^ b), len(a | b)))
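# Hedged sketch of the expected input format (one ID per line, lines starting
# with "#" treated as comments) and of the tab-separated report produced in
# "count" mode; file names and IDs are hypothetical, and built-in sets stand
# in for IdSet.
with open("group_a.ids", "w") as fd:
    fd.write("# group A\ngene1\ngene2\ngene3\n")
with open("group_b.ids", "w") as fd:
    fd.write("gene2\ngene3\ngene4\n")

a = {l.strip() for l in open("group_a.ids") if l.strip() and not l.startswith("#")}
b = {l.strip() for l in open("group_b.ids") if l.strip() and not l.startswith("#")}
print("Group_A\t%i\nGroup_B\t%i\nCommon\t%i\nOnly_group_A\t%i\nOnly_group_B\t%i\nNot_common\t%i\nAll\t%i"
      % (len(a), len(b), len(a & b), len(a - b), len(b - a), len(a ^ b), len(a | b)))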
    gene_ids_white_list = [output_pfam_supported_genes_ids]
elif args.swissprot_db:
    gene_ids_white_list = [output_swissprot_supported_genes_ids]
else:
    gene_ids_white_list = [all_annotated_genes_ids]

HMMER3.intersect_ids_from_files([all_annotated_genes_ids], gene_ids_black_list, genes_not_masked_ids, mode="only_a")
HMMER3.intersect_ids_from_files(gene_ids_white_list, gene_ids_black_list, final_genes_ids, mode="only_a")
#"""
final_ids = IdSet()
final_ids.read(final_genes_ids)
AnnotationsRoutines.extract_annotation_from_gff(output_gff, final_ids, ["gene"], final_gff)

print("Extracting CDS...")
AUGUSTUS.extract_CDS_annotations_from_output(final_gff, final_CDS_gff)

print("Drawing histograms...")

for stat_file in output_evidence_stats, output_supported_stats, \
                 output_swissprot_pfam_or_hints_supported_transcripts_longest_pep_evidence, \
                 output_swissprot_pfam_and_hints_supported_transcripts_longest_pep_evidence, \
                 output_swissprot_pfam_or_hints_supported_transcripts_evidence, \
                 output_swissprot_pfam_and_hints_supported_transcripts_evidence:
def convert_rm_out_to_gff(input_file, output_file, annotated_repeat_classes_file, annotated_repeat_families_file):
    repeat_classes_set = IdSet()
    repeat_families_set = IdSet()
    with open(input_file, "r") as in_fd:
        # skip the three header lines of the RepeatMasker .out file
        for i in range(0, 3):
            in_fd.readline()
        with open(output_file, "w") as out_fd:
            for line in in_fd:
                tmp = line.strip().split()
                strand = "+" if tmp[8] == "+" else "-"
                repeat_class_family = tmp[10].split("/")
                if len(repeat_class_family) == 1:
                    repeat_class_family.append(".")
                repeat_classes_set.add(repeat_class_family[0])
                repeat_families_set.add("/".join(repeat_class_family))
                parameters = "Class=%s;Family=%s;Matching_repeat=%s;SW_score=%s;Perc_div=%s;Perc_del=%s;Pers_ins=%s" \
                             % (repeat_class_family[0], repeat_class_family[1],
                                tmp[9], tmp[0], tmp[1], tmp[2], tmp[3])
                out_fd.write("%s\tRepeatMasker\trepeat\t%s\t%s\t.\t%s\t.\t%s\n"
                             % (tmp[4], tmp[5], tmp[6], strand, parameters))
    repeat_classes_set.write(annotated_repeat_classes_file)
    repeat_families_set.write(annotated_repeat_families_file)
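# Hedged illustration of the RepeatMasker .out column layout assumed by the
# indices above (the sample row is hypothetical; real files begin with the
# three-line header that the converter skips).
line = "  463  1.3  0.6  1.7  chr1  1001  1362  (248955422)  +  AluSp  SINE/Alu  1  312  (1)  1"
tmp = line.strip().split()
# tmp[0]=SW score, tmp[1]=%div, tmp[2]=%del, tmp[3]=%ins, tmp[4]=query name,
# tmp[5]=start, tmp[6]=end, tmp[8]=strand ("+" or "C"), tmp[9]=matching repeat,
# tmp[10]=repeat class/family
print(tmp[4], tmp[5], tmp[6], tmp[8], tmp[10])  # chr1 1001 1362 + SINE/Alu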