コード例 #1
0
ファイル: SequenceCluster.py プロジェクト: melakbet/MAVR
    def extract_monocluster_ids(self,
                                clusters_dict,
                                white_list_ids=None,
                                out_file=None):
        """
        Extracts clusters with only one sequence in all species.
        """
        monocluster_ids = IdSet()
        cluster_names = self.get_cluster_names(clusters_dict)

        for cluster_name in cluster_names:
            for species in clusters_dict:
                if white_list_ids:
                    if cluster_name not in white_list_ids:
                        break
                if cluster_name not in clusters_dict[species]:
                    break
                if len(clusters_dict[species][cluster_name]) > 1:
                    break
            else:
                monocluster_ids.add(cluster_name)

        if out_file:
            monocluster_ids.write(out_file)

        return monocluster_ids
コード例 #2
0
    def rename_scaffolds_in_gff(self, input_gff, syn_file, output_prefix, verbose=True):

        syn_dict = SynDict(filename=syn_file)
        skipped_id_list = IdSet()

        output_gff = "%s.renamed.gff" % output_prefix
        skipped_gff = "%s.skipped.gff" % output_prefix
        skipped_id_file = "%s.skipped_scaffolds.ids" % output_prefix

        with self.metaopen(input_gff, "r") as in_fd, \
             self.metaopen(output_gff, "w") as out_fd, \
             self.metaopen(skipped_gff, "w") as skipped_fd:

            for line in in_fd:
                if line[0] == "#":
                    out_fd.write(line)
                gff_list = line.split("\t")
                if gff_list[0] in syn_dict:
                    gff_list[0] = syn_dict[gff_list[0]]
                    out_fd.write("\t".join(gff_list))
                else:
                    skipped_fd.write(line)
                    skipped_id_list.add(gff_list[0])

        if verbose:
            print("Not renamed scaffolds: %i" % len(skipped_id_list))

        skipped_id_list.write(skipped_id_file)
コード例 #3
0
    def convert_rm_out_to_gff(input_file, output_file,
                              annotated_repeat_classes_file,
                              annotated_repeat_families_file):
        repeat_classes_set = IdSet()
        repeat_families_set = IdSet()
        with open(input_file, "r") as in_fd:
            for i in range(0, 3):
                in_fd.readline()

            with open(output_file, "w") as out_fd:
                for line in in_fd:
                    tmp = line.strip().split()
                    strand = "+" if tmp[8] == "+" else "-"
                    repeat_class_family = tmp[10].split("/")
                    if len(repeat_class_family) == 1:
                        repeat_class_family.append(".")
                    repeat_classes_set.add(repeat_class_family[0])
                    repeat_families_set.add("/".join(repeat_class_family))
                    parameters = "Class=%s;Family=%s;Matching_repeat=%s;SW_score=%s;Perc_div=%s;Perc_del=%s;Pers_ins=%s" \
                                 % (repeat_class_family[0], repeat_class_family[1],
                                    tmp[9], tmp[0], tmp[1], tmp[2], tmp[3])
                    out_fd.write(
                        "%s\tRepeatMasker\trepeat\t%s\t%s\t.\t%s\t.\t%s\n" %
                        (tmp[4], tmp[5], tmp[6], strand, parameters))
        repeat_classes_set.write(annotated_repeat_classes_file)
        repeat_families_set.write(annotated_repeat_families_file)
コード例 #4
0
    def get_scaffold_ids_from_gff(gff_file, out_file=None):
        scaffold_id_set = IdSet()

        with open(gff_file, "r") as gff_fd:
            for line in gff_fd:
                if line[0] == "#":
                    continue
                scaffold_id = line.split("\t")[0]
                scaffold_id_set.add(scaffold_id)

        if out_file:
            scaffold_id_set.write(out_file)

        return scaffold_id_set
コード例 #5
0
complicated_families_syn_dict = SynDict()
complicated_families_syn_ids = IdSet()
sl_keys = list(complicated_families_dict.sl_keys())
for sl_key in sl_keys:
    sp_set = set()
    for species in complicated_families_dict:
        if sl_key not in complicated_families_dict[species]:
            continue
        tmp = complicated_families_dict[species][sl_key].split(";")
        for i in range(0, len(tmp)):
            if "_" in tmp[i]:
                tmp[i] = tmp[i][2:]
            tmp[i] = tmp[i].split(",")
            for syn_id in tmp[i]:
                complicated_families_syn_ids.add(syn_id)
                sp_set.add(syn_id)
    complicated_families_syn_dict[sl_key] = sp_set
complicated_families_syn_dict.write("complicated_families_connections.t", splited_values=True)

for entry in complicated_families_dict.all_values():
    tmp = entry.split(";")
    for i in range(0, len(tmp)):
        if "_" in tmp[i]:
            tmp[i] = tmp[i][2]
        tmp[i] = tmp[i].split(",")
        for syn_id in tmp[i]:
            complicated_families_syn_ids.add(syn_id)
complicated_families_syn_ids.write("complicated_families_check.ids")

nonassembled.write("splited_to_several_families.t", absent_symbol=".")
コード例 #6
0
ファイル: get_cds_families.py プロジェクト: melakbet/MAVR
    accordance_dict[species].read(accordance_file, key_index=1, value_index=0)


if args.name_first:
    def split_name(pep_name):
        gene_list = pep_name.split(args.name_separator)
        return gene_list[0], args.name_separator.join(gene_list[1:])
else:
    def split_name(pep_name):
        gene_list = pep_name.split(args.name_separator)
        return gene_list[-1], args.name_separator.join(gene_list[:-1])

families_with_errors = IdSet()
for family in pep_fam_dict:
    cds_fam_dict[family] = []
    for pep in pep_fam_dict[family]:
        species, pep_name = split_name(pep)
        if pep_name in accordance_dict[species]:
            cds_name = "%s%s%s" % (species, args.name_separator, accordance_dict[species][pep_name]) if args.name_first else \
                "%s%s%s" % (accordance_dict[species][pep_name], args.name_separator, species)
            cds_fam_dict[family].append(cds_name)
        else:
            print("%s %s %s doesn't have associated cds in accordance file" % (family, species, pep_name))
            families_with_errors.add(family)

for family in families_with_errors:
    cds_fam_dict.pop(family, None)

families_with_errors.write(args.fam_error)
cds_fam_dict.write(args.output, splited_values=True)