Esempio n. 1
0
    def extract_monocluster_ids(self,
                                clusters_dict,
                                white_list_ids=None,
                                out_file=None):
        """
        Collect the names of clusters that are single-copy in every species.

        A cluster name is kept when it is present in every species of
        ``clusters_dict`` and its entry there holds at most one sequence.

        :param clusters_dict: mapping of species to a mapping of cluster
            name to a sized collection of sequences.
        :param white_list_ids: optional container of cluster names; when it
            is truthy, clusters that are not listed in it are ignored.
        :param out_file: optional destination handed to ``IdSet.write``;
            nothing is written when it is falsy.
        :return: ``IdSet`` with the names of the selected clusters.
        """
        monocluster_ids = IdSet()
        cluster_names = self.get_cluster_names(clusters_dict)

        for cluster_name in cluster_names:
            # The white list does not depend on the species, so it is tested
            # once per cluster instead of once per (cluster, species) pair.
            if white_list_ids and cluster_name not in white_list_ids:
                continue

            # Keep the cluster only if every species has it with <= 1 member.
            is_monocluster = all(
                cluster_name in clusters_dict[species]
                and len(clusters_dict[species][cluster_name]) <= 1
                for species in clusters_dict)
            if is_monocluster:
                monocluster_ids.add(cluster_name)

        if out_file:
            monocluster_ids.write(out_file)

        return monocluster_ids
Esempio n. 2
0
    def get_cluster_names(clusters_dict, out_file=None, white_list_ids=None):
        """
        Gather the names of all clusters that occur in at least one species.

        :param clusters_dict: mapping of species to a mapping keyed by
            cluster name.
        :param out_file: optional destination handed to ``IdSet.write``.
            The complete set of names is written, i.e. before the white
            list is applied.
        :param white_list_ids: optional iterable of cluster names; when it
            is truthy, the returned set is restricted to these names.
        :return: ``IdSet`` of cluster names.
        """
        all_names = IdSet()
        for species in clusters_dict:
            # Merge the cluster names of this species into the running union.
            all_names |= IdSet(clusters_dict[species].keys())

        if out_file:
            all_names.write(out_file)

        if white_list_ids:
            return all_names & IdSet(white_list_ids)
        return all_names
Esempio n. 3
0
    def intersect_ids_from_files(files_with_ids_from_group_a,
                                 files_with_ids_from_group_b,
                                 result_file=None,
                                 mode="common"):
        """
        Combine two groups of id files with a set operation and output it.

        Every file of a group is read into an ``IdSet`` (with
        ``comments_prefix="#"``) and the sets of one group are united. The
        two group sets, ``a`` and ``b``, are then combined according to
        ``mode``:

        * ``"common"``     - ids present in both groups (``a & b``)
        * ``"only_a"``     - ids present only in group A (``a - b``)
        * ``"only_b"``     - ids present only in group B (``b - a``)
        * ``"not_common"`` - ids present in exactly one group (``a ^ b``)
        * ``"combine"``    - ids present in any group (``a | b``)
        * ``"count"``      - no ids are written; a tab-separated table with
          the sizes of all the sets above is written instead

        :param files_with_ids_from_group_a: a single filename (``str``) or
            an iterable of filenames for group A.
        :param files_with_ids_from_group_b: same, for group B.
        :param result_file: path of the output file; when falsy the result
            goes to ``sys.stdout``.
        :param mode: one of the modes listed above.
        :raises ValueError: if ``mode`` is not a supported mode. The check
            happens before any file is read or opened for writing.
        """
        set_operations = {
            "common": lambda a, b: a & b,
            "only_a": lambda a, b: a - b,
            "only_b": lambda a, b: b - a,
            "not_common": lambda a, b: a ^ b,
            "combine": lambda a, b: a | b,
        }
        # Fail early: otherwise an unknown mode would only surface after all
        # inputs were read and the result file was already truncated.
        if mode != "count" and mode not in set_operations:
            raise ValueError(
                "Unknown mode %r; expected one of: %s"
                % (mode, ", ".join(sorted(set_operations) + ["count"])))

        def _read_group(files_with_ids):
            # Accept both a single filename and an iterable of filenames.
            filenames = [files_with_ids] if isinstance(
                files_with_ids, str) else files_with_ids
            merged = IdSet()
            for filename in filenames:
                id_set = IdSet()
                id_set.read(filename, comments_prefix="#")
                merged = merged | id_set
            return merged

        a = _read_group(files_with_ids_from_group_a)
        b = _read_group(files_with_ids_from_group_b)

        result_fd = open(result_file, "w") if result_file else sys.stdout
        try:
            if mode != "count":
                final_set = IdSet(set_operations[mode](a, b))
                final_set.write(result_fd)
            else:
                result_fd.write(
                    "Group_A\t%i\nGroup_B\t%i\nCommon\t%i\nOnly_group_A\t%i\nOnly_group_B\t%i\nNot_common\t%i\nAll\t%i\n"
                    % (len(a), len(b), len(a & b), len(a - b), len(b - a),
                       len(a ^ b), len(a | b)))
        finally:
            # Close only the file opened here; sys.stdout is left alone.
            if result_file:
                result_fd.close()
Esempio n. 4
0
    def convert_rm_out_to_gff(input_file, output_file, annotated_repeat_classes_file, annotated_repeat_families_file):
        """
        Convert a whitespace-separated repeat annotation table to GFF lines.

        The first three lines of ``input_file`` are skipped as a header.
        Every following non-blank line is split on whitespace and written to
        ``output_file`` as one tab-separated record with source
        ``RepeatMasker`` and feature type ``repeat``. The fields are used
        as follows (0-based):

        * 0-3: SW score, percent divergence, percent deletion, percent
          insertion (stored in the attribute column)
        * 4: sequence id, 5: start, 6: end
        * 8: strand; ``"+"`` is kept, anything else is written as ``"-"``
        * 9: matching repeat name
        * 10: ``class/family``; a value without ``/`` gets family ``"."``

        :param input_file: path of the table to convert (presumably a
            RepeatMasker ``.out`` file - confirm against callers).
        :param output_file: path of the GFF file to create.
        :param annotated_repeat_classes_file: destination handed to
            ``IdSet.write`` for the set of repeat classes seen.
        :param annotated_repeat_families_file: destination handed to
            ``IdSet.write`` for the set of ``class/family`` names seen.
        """
        repeat_classes_set = IdSet()
        repeat_families_set = IdSet()
        with open(input_file, "r") as in_fd, open(output_file, "w") as out_fd:
            # Skip the three header lines.
            for _ in range(3):
                in_fd.readline()

            for line in in_fd:
                tmp = line.split()
                if not tmp:
                    # A blank line (e.g. at the end of the file) carries no
                    # record; skipping it avoids an IndexError below.
                    continue
                strand = "+" if tmp[8] == "+" else "-"
                repeat_class_family = tmp[10].split("/")
                if len(repeat_class_family) == 1:
                    repeat_class_family.append(".")
                repeat_classes_set.add(repeat_class_family[0])
                repeat_families_set.add("/".join(repeat_class_family))
                parameters = "Class=%s;Family=%s;Matching_repeat=%s;SW_score=%s;Perc_div=%s;Perc_del=%s;Pers_ins=%s" \
                             % (repeat_class_family[0], repeat_class_family[1],
                                tmp[9], tmp[0], tmp[1], tmp[2], tmp[3])
                out_fd.write("%s\tRepeatMasker\trepeat\t%s\t%s\t.\t%s\t.\t%s\n" % (tmp[4], tmp[5], tmp[6], strand, parameters))
        repeat_classes_set.write(annotated_repeat_classes_file)
        repeat_families_set.write(annotated_repeat_families_file)