Ejemplo n.º 1
0
    def _output_agp(self, out_agp, assembly_name):
        """
        Writes all scaffolds into the file ``out_agp`` in NCBI AGP format.

        Scaffolds are written in the order of their names. Every contig
        produces a "W" line; when the link that follows the contig has
        a positive gap, an "N" line is written right after it.
        ``assembly_name`` goes into the file header.
        """
        SHIFT = 1   #AGP coordinates start from one

        def as_row(fields):
            return "\t".join(map(str, fields)) + "\n"

        with open(out_agp, "w") as f:
            f.write("##agp-version  2.0\n")
            f.write("#ASSEMBLY NAME: {0}\n".format(assembly_name))
            f.write("#DESCRIPTION: Pseudochromosome assembly\n")
            f.write("#PROGRAM: Ragout v{0}\n".format(__version__))

            for scaffold in sorted(self.scaffolds, key=lambda s: s.name):
                offset = 0
                for index, contig in enumerate(scaffold.contigs):
                    #the contig occupies [span_start, span_end) on the
                    #scaffold, the next one begins after the gap
                    span_start = offset
                    span_end = span_start + contig.length()
                    offset = span_end + contig.link.gap

                    name, local_start, local_end = contig.name_with_coords()
                    orientation = "+" if contig.sign > 0 else "-"
                    evidence = _support_to_string(contig.link)

                    #odd part numbers for contigs, even ones for gaps
                    f.write(as_row([scaffold.name, span_start + SHIFT,
                                    span_end, 2 * index + 1, "W", name,
                                    local_start + SHIFT, local_end,
                                    orientation]))
                    if contig.link.gap <= 0:
                        continue
                    f.write(as_row([scaffold.name, span_end + SHIFT, offset,
                                    2 * index + 2, "N", contig.link.gap,
                                    "scaffold", "yes", evidence]))
Ejemplo n.º 2
0
    def _print_statistics(self):
        """
        Computes and prints some useful statistics

        Reports (through the logger) the number of scaffolds and used
        fragments, the total output length, the amount of unplaced
        sequence, the amount of introduced gaps and the N50 of the
        input fragments and of the scaffolds. Returns nothing.
        """
        def percentage(part, whole):
            #an empty input / output should give a zero share
            #instead of ZeroDivisionError
            return 100 * float(part) / whole if whole else 0.0

        contigs_len = [len(c) for c in self.fragments_fasta.values()]
        scaffolds_len = [len(c) for c in self.scaffolds_fasta.values()]

        unplaced_len = sum(len(c) for c in self.unplaced_fasta.values())
        fragments_len = sum(contigs_len)
        output_len = self.used_fragments_len + self.introduced_gap_len

        unplaced_perc = percentage(unplaced_len, fragments_len)
        gap_perc = percentage(self.introduced_gap_len, output_len)

        unplaced_count = len(self.unplaced_fasta)
        used_fragments_num = sum(len(scf.contigs) for scf in self.scaffolds)

        contigs_n50 = _calc_n50(contigs_len, fragments_len)
        scaffolds_n50 = _calc_n50(scaffolds_len, output_len)

        logger.info("Assembly statistics:\n\n"
                    "\tScaffolds:\t\t{0}\n"
                    "\tUsed fragments:\t\t{1}\n"
                    "\tScaffolds length:\t{2}\n\n"
                    "\tUnplaced fragments:\t{3}\n"
                    "\tUnplaced length:\t{4} ({5:2.2f}%)\n"
                    "\tIntroduced Ns length:\t{6} ({7:2.2f}%)\n\n"
                    "\tFragments N50:\t\t{8}\n"
                    "\tAssembly N50:\t\t{9}\n"
                    .format(len(self.scaffolds), used_fragments_num,
                            output_len, unplaced_count, unplaced_len,
                            unplaced_perc, self.introduced_gap_len, gap_perc,
                            contigs_n50, scaffolds_n50))
Ejemplo n.º 3
0
    def alternating_cycle(self, node_1, node_2):
        """
        Determines if there is a cycle of alternating colors
        that goes through the given red-supported (!) edge

        Walks over the paths given by self._alternating_paths(node_1, node_2)
        and stops at the first one in which every second edge is supported
        by the target genome only, while the remaining edges share at least
        one supporting genome. Returns half of the length of that path,
        or None if there is no such path.
        """
        for path in self._alternating_paths(node_1, node_2):
            assert len(path) % 2 == 0
            if len(path) == 2:
                #just the edge itself, not a cycle
                continue

            edges = list(zip(path[:-1], path[1:]))

            #edges at odd positions should belong to the target only
            even_colors = [self.genomes_support(u, v) for u, v in edges[1::2]]
            if not all(set(colors) == {self.target}
                       for colors in even_colors):
                continue

            #the other edges should have a genome in common
            odd_colors = [self.genomes_support(u, v) for u, v in edges[0::2]]
            shared_genomes = set(odd_colors[0]).intersection(*odd_colors)
            if shared_genomes:
                return len(path) // 2

        return None
Ejemplo n.º 4
0
def make_synteny(maf_file, out_dir, min_blocks_list):
    """
    Builds synteny blocks from MAF file

    Writes a temporary parameters file into ``out_dir`` and runs the
    maf2synteny executable on ``maf_file`` with the block sizes from
    ``min_blocks_list``. The parameters file is removed afterwards,
    no matter how the run has ended.

    Returns True on success and False if the binary check fails
    or the process exits with a non-zero code.
    """
    if not check_binary():
        return False

    params_file = os.path.join(out_dir, "simpl_params.txt")
    _make_params_file(config.vals["maf2synteny"], params_file)
    cmdline = [
        M2S_EXEC, maf_file, "-o", out_dir, "-s", params_file, "-b",
        ",".join(map(str, min_blocks_list))
    ]

    logger.info("Running maf2synteny module")
    try:
        ret_code = subprocess.call(cmdline)
    finally:
        #do not leave the temporary file behind if the run
        #has failed or could not be started
        if os.path.exists(params_file):
            os.remove(params_file)

    if ret_code:
        logger.error("Non-zero return code: %d", ret_code)
        return False

    return True
Ejemplo n.º 5
0
def _support_to_string(link):
    """
    Converts information about supporting adjacencies to string.
    Could be used separately form OutputGenerator for debugging purposes
    """
    supp_genomes = sorted(link.supporting_genomes)
    support_to_str = lambda gc: "{0}:{1}".format(gc.genome, gc.chr)
    support = ",".join(map(support_to_str, supp_genomes))
    if link.supporting_assembly:
        support += ",~>"
    return support
Ejemplo n.º 6
0
def parse_ragout_recipe(filename):
    """
    Parses Ragout recipe file

    Every meaningful line looks like ``<object>.<param> = <value>``.
    An empty object sets a global parameter, "*" sets a default value
    for all genomes and anything else is treated as a genome name.
    Empty lines and lines starting with "#" are skipped.

    Returns a dictionary with global parameters on the top level and
    per-genome parameters (with the defaults filled in) under "genomes".
    Paths of "fasta", "maf" and "hal" are taken relative to the
    directory of the recipe.

    Raises RecipeException if the file does not exist, can not be parsed,
    lacks "references" or "target", or if the genomes mentioned in it
    do not agree with each other or with the tree.
    """
    if not os.path.exists(filename):
        raise RecipeException("Can't open recipe file")

    prefix = os.path.dirname(filename)

    recipe_dict = {"genomes": {}}
    known_params = [
        "tree", "target", "blocks", "maf", "hal", "fasta", "draft",
        "references", "naming_ref"
    ]
    deprecated = ["circular"]
    required_params = ["references", "target"]

    cast_bool = ["circular", "draft"]
    fix_path = ["fasta", "maf", "hal"]

    defaults = {"circular": False, "draft": False}

    param_matcher = re.compile(r"([^\s]+)\s*=\s*([^\s].*)$")
    with open(filename, "r") as f:
        #lines are counted from one, so that all the messages below
        #point to the same line a text editor would show
        for lineno, line in enumerate(f, 1):
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            m = param_matcher.match(line)
            if not m or "." not in m.group(1):
                raise RecipeException(
                    "Error parsing recipe on line {0}".format(lineno))

            #the last dot separates the object from the parameter name
            (obj, param_name), value = m.group(1).rsplit(".", 1), m.group(2)
            if param_name in deprecated:
                logger.warning("Recipe parameter '%s' is deprecated",
                               param_name)
                continue
            if param_name not in known_params:
                raise RecipeException(
                    "Unknown recipe parameter '{0}' on line {1}".format(
                        param_name, lineno))

            #checking values, casting
            if param_name in cast_bool:
                if value.lower() in ["true", "1"]:
                    value = True
                elif value.lower() in ["false", "0"]:
                    value = False
                else:
                    raise RecipeException(
                        "Error parsing recipe on line "
                        "{0}: wrong value '{1}' for bool param".format(
                            lineno, value))
            if param_name == "blocks":
                #either a name of a predefined set or a list of sizes
                if value not in config.vals["blocks"]:
                    try:
                        value = list(map(int, value.split(",")))
                    except ValueError:
                        raise RecipeException(
                            "Can't parse block size set: {0}".format(value))
            if param_name == "references":
                value = [s.strip() for s in value.split(",")]
            if param_name in fix_path:
                #absolute paths survive the join unchanged
                value = os.path.expanduser(value)
                value = os.path.join(prefix, value)
            ###

            if obj == "":
                recipe_dict[param_name] = value
            elif obj == "*":
                defaults[param_name] = value
            else:
                recipe_dict["genomes"].setdefault(obj, {})[param_name] = value

    for param in required_params:
        if param not in recipe_dict:
            raise RecipeException(
                "Required parameter '{0}' not found in recipe".format(param))

    genomes = recipe_dict["references"] + [recipe_dict["target"]]
    if "tree" in recipe_dict:
        try:
            leaves = get_leaves_names(recipe_dict["tree"])
            if set(leaves) != set(genomes):
                raise RecipeException("The tree does not agree with "
                                      "the specified set of genomes")
        except PhyloException as e:
            raise RecipeException(e)

    for g in recipe_dict["genomes"]:
        if g not in genomes:
            raise RecipeException("Recipe error: genome '{0}' is not in "
                                  "specified as reference or target".format(g))

    #every genome gets an entry, even if it has no own parameters
    for g in genomes:
        recipe_dict["genomes"].setdefault(g, {})

    for g_params in recipe_dict["genomes"].values():
        for def_key, def_val in defaults.items():
            g_params.setdefault(def_key, def_val)

    return recipe_dict
Ejemplo n.º 7
0
    def build(self):
        """
        Implementation of neighbor-joining algorithm

        Starts with one leaf for every genome of self.perms_by_genome
        and keeps joining the pair of nodes with the lowest Q value
        under a new internal node, until a single node is left.
        That node is returned.
        """
        MIN_LEN = 0.000001  #lower bound for the branch lengths

        nodes = [Leaf(name) for name in sorted(self.perms_by_genome.keys())]
        for leaf in nodes:
            leaf.terminal = True

        #symmetric table of pairwise distances, diagonal included
        dist = defaultdict(dict)
        for first, second in combinations_with_replacement(nodes, 2):
            pair_dist = self._genome_distance(first.identifier,
                                              second.identifier)
            dist[first][second] = pair_dist
            dist[second][first] = pair_dist

        def q_value(left, right):
            #Q value of the pair with respect to the current set of nodes
            total = 0
            for other in nodes:
                total += dist[left][other]
                total += dist[right][other]
            return (len(nodes) - 2) * dist[left][right] - total

        while len(nodes) > 1:
            #determine two closest ones
            best_q = float("inf")
            best_pair = None
            for candidate in sorted(combinations(nodes, 2)):
                candidate_q = q_value(*candidate)
                if candidate_q < best_q:
                    best_q = candidate_q
                    best_pair = candidate

            #calculate distances to new internal node from joined nodes
            joined = Tree()
            joined.terminal = False

            child_1, child_2 = sorted(best_pair)
            diff = 0
            for other in nodes:
                diff += dist[child_1][other]
                diff -= dist[child_2][other]
            if len(nodes) > 2:
                correction = 0.5 / (len(nodes) - 2) * diff
            else:
                correction = 0
            len_1 = 0.5 * dist[child_1][child_2] + correction
            len_2 = dist[child_1][child_2] - len_1
            len_1 = max(MIN_LEN, len_1)
            len_2 = max(MIN_LEN, len_2)

            joined.add_edge((child_1, None, len_1))
            joined.add_edge((child_2, None, len_2))
            nodes.remove(child_1)
            nodes.remove(child_2)

            #distances from the new node to the remaining ones
            for other in nodes:
                to_other = 0.5 * (dist[child_1][other] +
                                  dist[child_2][other] -
                                  dist[child_1][child_2])
                dist[joined][other] = to_other
                dist[other][joined] = to_other
            dist[joined][joined] = 0
            nodes.append(joined)

        return nodes[0]