def extract_monocluster_ids(self, clusters_dict, white_list_ids=None, out_file=None):
    """
    Collect names of clusters that hold no more than one sequence in every species.

    A cluster is accepted only if, for each species key of clusters_dict,
    the cluster is present and its entry has length <= 1.
    If white_list_ids is given (and non-empty), clusters absent from it
    are rejected.

    :param clusters_dict: mapping species -> {cluster name -> sequences}
    :param white_list_ids: optional container of allowed cluster names
    :param out_file: optional path; if set, accepted ids are written to it
    :return: IdSet with names of accepted clusters
    """
    accepted_ids = IdSet()

    for name in self.get_cluster_names(clusters_dict):
        # all() stops at the first species that violates any condition,
        # mirroring an early exit from a per-species loop
        is_monocluster = all(
            not (white_list_ids and name not in white_list_ids)
            and name in clusters_dict[sp]
            and len(clusters_dict[sp][name]) <= 1
            for sp in clusters_dict
        )
        if is_monocluster:
            accepted_ids.add(name)

    if out_file:
        accepted_ids.write(out_file)

    return accepted_ids
def rename_scaffolds_in_gff(self, input_gff, syn_file, output_prefix, verbose=True):
    """
    Rename scaffolds (first column) of a GFF file using a synonym table.

    Three files are produced from output_prefix:
        <prefix>.renamed.gff            comment lines and renamed records
        <prefix>.skipped.gff            records whose scaffold has no synonym
        <prefix>.skipped_scaffolds.ids  ids of scaffolds that were not renamed

    :param input_gff: path to the input GFF file
    :param syn_file: path to the synonym file loaded into a SynDict
    :param output_prefix: prefix for the three output files
    :param verbose: if true, print the number of scaffolds left unrenamed
    """
    syn_dict = SynDict(filename=syn_file)
    skipped_id_list = IdSet()

    output_gff = "%s.renamed.gff" % output_prefix
    skipped_gff = "%s.skipped.gff" % output_prefix
    skipped_id_file = "%s.skipped_scaffolds.ids" % output_prefix

    with self.metaopen(input_gff, "r") as in_fd, \
            self.metaopen(output_gff, "w") as out_fd, \
            self.metaopen(skipped_gff, "w") as skipped_fd:

        for line in in_fd:
            if line.startswith("#"):
                # Comment/directive lines are copied verbatim and must not
                # reach the renaming logic, otherwise they would also land in
                # the skipped file and be counted as unrenamed scaffolds.
                out_fd.write(line)
                continue

            gff_list = line.split("\t")
            scaffold_id = gff_list[0]
            if scaffold_id in syn_dict:
                gff_list[0] = syn_dict[scaffold_id]
                out_fd.write("\t".join(gff_list))
            else:
                skipped_fd.write(line)
                skipped_id_list.add(scaffold_id)

    if verbose:
        print("Not renamed scaffolds: %i" % len(skipped_id_list))

    skipped_id_list.write(skipped_id_file)
def convert_rm_out_to_gff(input_file, output_file, annotated_repeat_classes_file, annotated_repeat_families_file):
    """
    Convert a whitespace-separated RepeatMasker .out table into GFF records.

    The first three lines of the input are discarded. Every remaining line
    is split on whitespace and written as one GFF line with source
    "RepeatMasker" and type "repeat". The class/family column (field 10) is
    split on "/"; a missing family is replaced by ".".

    :param input_file: path to the RepeatMasker output table
    :param output_file: path of the GFF file to write
    :param annotated_repeat_classes_file: path for the set of seen repeat classes
    :param annotated_repeat_families_file: path for the set of seen class/family pairs
    """
    repeat_classes_set = IdSet()
    repeat_families_set = IdSet()

    attributes_template = "Class=%s;Family=%s;Matching_repeat=%s;SW_score=%s;Perc_div=%s;Perc_del=%s;Pers_ins=%s"
    record_template = "%s\tRepeatMasker\trepeat\t%s\t%s\t.\t%s\t.\t%s\n"

    with open(input_file, "r") as in_fd:
        # drop the three leading lines before the data rows
        for _ in range(3):
            in_fd.readline()

        with open(output_file, "w") as out_fd:
            for record in in_fd:
                fields = record.strip().split()

                # anything other than "+" in the strand column is reported as "-"
                strand = "-" if fields[8] != "+" else "+"

                class_and_family = fields[10].split("/")
                if len(class_and_family) == 1:
                    class_and_family.append(".")
                repeat_class = class_and_family[0]
                repeat_family = class_and_family[1]

                repeat_classes_set.add(repeat_class)
                repeat_families_set.add("/".join(class_and_family))

                attributes = attributes_template % (
                    repeat_class, repeat_family,
                    fields[9], fields[0], fields[1], fields[2], fields[3])
                out_fd.write(record_template % (fields[4], fields[5], fields[6], strand, attributes))

    repeat_classes_set.write(annotated_repeat_classes_file)
    repeat_families_set.write(annotated_repeat_families_file)
def get_scaffold_ids_from_gff(gff_file, out_file=None):
    """
    Gather the distinct values of the first tab-separated column of a GFF file.

    Lines starting with "#" are ignored.

    :param gff_file: path to the GFF file to scan
    :param out_file: optional path; if set, the collected ids are written to it
    :return: IdSet with the collected scaffold ids
    """
    collected_ids = IdSet()

    with open(gff_file, "r") as handle:
        for record in handle:
            if not record.startswith("#"):
                collected_ids.add(record.split("\t")[0])

    if out_file:
        collected_ids.write(out_file)

    return collected_ids
# Build, for every second-level key, the set of family ids it is connected to
# across all species, and collect all such ids for a consistency check.
complicated_families_syn_dict = SynDict()
complicated_families_syn_ids = IdSet()
sl_keys = list(complicated_families_dict.sl_keys())

for sl_key in sl_keys:
    sp_set = set()
    for species in complicated_families_dict:
        if sl_key not in complicated_families_dict[species]:
            continue
        # value layout: ";"-separated groups, each a ","-separated id list
        for group in complicated_families_dict[species][sl_key].split(";"):
            if "_" in group:
                # drop the two leading characters of the group
                # (presumably a one-letter tag plus "_" - TODO confirm)
                group = group[2:]
            for syn_id in group.split(","):
                complicated_families_syn_ids.add(syn_id)
                sp_set.add(syn_id)
    complicated_families_syn_dict[sl_key] = sp_set

complicated_families_syn_dict.write("complicated_families_connections.t", splited_values=True)

for entry in complicated_families_dict.all_values():
    for group in entry.split(";"):
        if "_" in group:
            # Strip the same two-character prefix as above. Taking a single
            # character here (index 2 instead of slice 2:) would add bogus
            # one-letter ids to the check file.
            group = group[2:]
        for syn_id in group.split(","):
            complicated_families_syn_ids.add(syn_id)

complicated_families_syn_ids.write("complicated_families_check.ids")

nonassembled.write("splited_to_several_families.t", absent_symbol=".")
# Load the accordance table of the current species
# (column 1 is passed as key index, column 0 as value index).
accordance_dict[species].read(accordance_file, key_index=1, value_index=0)

# split_name() returns (species, name); which end of the id carries the
# species label depends on args.name_first.
if args.name_first:
    def split_name(pep_name):
        """Split 'species<sep>name' into (species, name)."""
        parts = pep_name.split(args.name_separator)
        species_label = parts.pop(0)
        return species_label, args.name_separator.join(parts)
else:
    def split_name(pep_name):
        """Split 'name<sep>species' into (species, name)."""
        parts = pep_name.split(args.name_separator)
        species_label = parts.pop()
        return species_label, args.name_separator.join(parts)

families_with_errors = IdSet()

for family in pep_fam_dict:
    cds_fam_dict[family] = []
    for pep in pep_fam_dict[family]:
        species, pep_name = split_name(pep)

        if pep_name not in accordance_dict[species]:
            print("%s %s %s doesn't have associated cds in accordance file" % (family, species, pep_name))
            families_with_errors.add(family)
            continue

        # rebuild the id with the species label on the same side as in the input
        if args.name_first:
            cds_name = "%s%s%s" % (species, args.name_separator, accordance_dict[species][pep_name])
        else:
            cds_name = "%s%s%s" % (accordance_dict[species][pep_name], args.name_separator, species)
        cds_fam_dict[family].append(cds_name)

# families with at least one unresolved peptide are dropped from the output
for family in families_with_errors:
    cds_fam_dict.pop(family, None)

families_with_errors.write(args.fam_error)
cds_fam_dict.write(args.output, splited_values=True)