def _output_agp(self, out_agp, assembly_name):
    """
    Writes all scaffolds into the given file in NCBI AGP format.
    Scaffolds are processed in the order of their names. Every contig
    produces a "W" line, which is followed by an "N" line whenever
    the gap after the contig is positive
    """
    # added to every start coordinate right before it is written out
    SHIFT = 1

    def to_row(fields):
        return "\t".join(map(str, fields)) + "\n"

    with open(out_agp, "w") as f:
        f.write("##agp-version 2.0\n")
        f.write("#ASSEMBLY NAME: {0}\n".format(assembly_name))
        f.write("#DESCRIPTION: Pseudochromosome assembly\n")
        f.write("#PROGRAM: Ragout v{0}\n".format(__version__))

        by_name = sorted(self.scaffolds, key=lambda s: s.name)
        for scf in by_name:
            next_start = 0
            for index, contig in enumerate(scf.contigs):
                comp_start = next_start
                comp_end = comp_start + contig.length()
                gap_len = contig.link.gap
                next_start = comp_end + gap_len

                name, local_start, local_end = contig.name_with_coords()
                if contig.sign > 0:
                    orientation = "+"
                else:
                    orientation = "-"
                support = _support_to_string(contig.link)

                #components are numbered 1, 3, 5..., gaps get 2, 4, 6...
                f.write(to_row([scf.name, comp_start + SHIFT, comp_end,
                                2 * index + 1, "W", name,
                                local_start + SHIFT, local_end,
                                orientation]))
                if gap_len <= 0:
                    continue
                f.write(to_row([scf.name, comp_end + SHIFT, next_start,
                                2 * index + 2, "N", gap_len,
                                "scaffold", "yes", support]))
def _print_statistics(self):
    """
    Computes and logs summary statistics of the assembly: the number
    and the total length of scaffolds, used and unplaced fragments,
    the amount of introduced Ns and N50 of fragments and scaffolds.

    Percentages are reported as zero when the corresponding total is
    zero, so an empty assembly (or an empty set of fragments) does not
    break the report with a division by zero
    """
    def percentage(part, total):
        #a zero total would raise ZeroDivisionError otherwise
        return 100 * float(part) / total if total else 0.0

    contigs_len = [len(c) for c in self.fragments_fasta.values()]
    scaffolds_len = [len(c) for c in self.scaffolds_fasta.values()]

    unplaced_len = sum(len(c) for c in self.unplaced_fasta.values())
    fragments_len = sum(contigs_len)
    output_len = self.used_fragments_len + self.introduced_gap_len

    unplaced_perc = percentage(unplaced_len, fragments_len)
    gap_perc = percentage(self.introduced_gap_len, output_len)

    unplaced_count = len(self.unplaced_fasta)
    used_fragments_num = sum(len(scf.contigs) for scf in self.scaffolds)

    contigs_n50 = _calc_n50(contigs_len, fragments_len)
    scaffolds_n50 = _calc_n50(scaffolds_len, output_len)

    logger.info("Assembly statistics:\n\n"
                "\tScaffolds:\t\t{0}\n"
                "\tUsed fragments:\t\t{1}\n"
                "\tScaffolds length:\t{2}\n\n"
                "\tUnplaced fragments:\t{3}\n"
                "\tUnplaced length:\t{4} ({5:2.2f}%)\n"
                "\tIntroduced Ns length:\t{6} ({7:2.2f}%)\n\n"
                "\tFragments N50:\t\t{8}\n"
                "\tAssembly N50:\t\t{9}\n"
                .format(len(self.scaffolds), used_fragments_num,
                        output_len, unplaced_count, unplaced_len,
                        unplaced_perc, self.introduced_gap_len, gap_perc,
                        contigs_n50, scaffolds_n50))
def alternating_cycle(self, node_1, node_2):
    """
    Checks whether a cycle of alternating colors goes through
    the given red-supported (!) edge.

    Paths come from self._alternating_paths(). A path is accepted if
    every second edge of it (starting from the second one) is supported
    by the target genome only, while the remaining edges share at least
    one supporting genome. Returns half of the length of the first
    accepted path or None if no path was accepted
    """
    def edge_genomes(edge):
        node_u, node_v = edge
        return self.genomes_support(node_u, node_v)

    found = False
    path = None
    for path in self._alternating_paths(node_1, node_2):
        assert len(path) % 2 == 0
        #a path of two nodes is just the edge itself
        if len(path) == 2:
            continue

        edges = list(zip(path[:-1], path[1:]))

        even_colors = [edge_genomes(e) for e in edges[1::2]]
        if any(set(colors) != set([self.target])
               for colors in even_colors):
            continue

        odd_colors = [edge_genomes(e) for e in edges[0::2]]
        shared_genomes = set(odd_colors[0]).intersection(*odd_colors)
        if shared_genomes:
            found = True
            break

    if not found:
        return None
    return len(path) // 2
def make_synteny(maf_file, out_dir, min_blocks_list):
    """
    Builds synteny blocks from MAF file by running the external
    maf2synteny binary.

    maf_file is the input alignment, out_dir is the directory for
    the results and min_blocks_list contains the block sizes that are
    passed to the binary as a comma-separated list.

    Returns True on success and False if the binary is not available
    or has finished with a non-zero return code
    """
    if not check_binary():
        return False

    params_file = os.path.join(out_dir, "simpl_params.txt")
    _make_params_file(config.vals["maf2synteny"], params_file)

    cmdline = [M2S_EXEC, maf_file, "-o", out_dir, "-s", params_file,
               "-b", ",".join(map(str, min_blocks_list))]

    logger.info("Running maf2synteny module")
    try:
        ret_code = subprocess.call(cmdline)
    finally:
        #the parameters file is temporary, so it should not be left
        #in the output directory when the run fails
        if os.path.exists(params_file):
            os.remove(params_file)

    if ret_code:
        logger.error("Non-zero return code: %d", ret_code)
        return False
    return True
def _support_to_string(link):
    """
    Builds a string that describes the adjacencies supporting the link:
    comma-separated "genome:chr" pairs taken from link.supporting_genomes
    in sorted order, with ",~>" appended if link.supporting_assembly
    is set. Does not depend on OutputGenerator, so it could be used
    separately for debugging purposes
    """
    pairs = ["{0}:{1}".format(gc.genome, gc.chr)
             for gc in sorted(link.supporting_genomes)]
    support = ",".join(pairs)
    if not link.supporting_assembly:
        return support
    return support + ",~>"
def parse_ragout_recipe(filename):
    """
    Parses a recipe file and returns its contents as a dictionary.

    Every meaningful line has the form "<object>.<param> = <value>",
    empty lines and lines starting with "#" are skipped. An empty
    object (".param") sets a global parameter, "*" sets a default value
    for all genomes, anything else is a genome-specific parameter.

    The result holds global parameters as top-level keys and per-genome
    parameters under the "genomes" key, which gets an entry (filled
    with defaults) for every reference and for the target.

    Raises RecipeException if the file is missing or malformed. Line
    numbers in the error messages are counted from one
    """
    if not os.path.exists(filename):
        raise RecipeException("Can't open recipe file")

    prefix = os.path.dirname(filename)
    recipe_dict = {"genomes": {}}

    known_params = ["tree", "target", "blocks", "maf", "hal", "fasta",
                    "draft", "references", "naming_ref"]
    deprecated = ["circular"]
    required_params = ["references", "target"]

    cast_bool = ["circular", "draft"]
    fix_path = ["fasta", "maf", "hal"]
    bool_values = {"true": True, "1": True, "false": False, "0": False}

    defaults = {"circular": False, "draft": False}

    param_matcher = re.compile(r"([^\s]+)\s*=\s*([^\s].*)$")
    with open(filename, "r") as f:
        #lines are counted from one, so that all the messages below
        #report the same (human-readable) line number
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            m = param_matcher.match(line)
            if not m or "." not in m.group(1):
                raise RecipeException(
                    "Error parsing recipe on line {0}".format(lineno))

            obj, param_name = m.group(1).rsplit(".", 1)
            value = m.group(2)
            if param_name in deprecated:
                logger.warning("Recipe parameter '%s' is deprecated",
                               param_name)
                continue
            if param_name not in known_params:
                raise RecipeException(
                    "Unknown recipe parameter '{0}' on line {1}".format(
                        param_name, lineno))

            #checking values, casting
            if param_name in cast_bool:
                bool_value = bool_values.get(value.lower())
                if bool_value is None:
                    raise RecipeException(
                        "Error parsing recipe on line "
                        "{0}: wrong value '{1}' for bool param".format(
                            lineno, value))
                value = bool_value
            if param_name == "blocks":
                #either a name of a predefined set or a list of sizes
                if value not in config.vals["blocks"]:
                    try:
                        value = list(map(int, value.split(",")))
                    except ValueError:
                        raise RecipeException(
                            "Can't parse block size set: {0}".format(value))
            if param_name == "references":
                value = [s.strip() for s in value.split(",")]
            if param_name in fix_path:
                #relative paths are resolved against the recipe location
                value = os.path.expanduser(value)
                value = os.path.join(prefix, value)

            if obj == "":
                recipe_dict[param_name] = value
            elif obj == "*":
                defaults[param_name] = value
            else:
                recipe_dict["genomes"].setdefault(obj, {})[param_name] = value

    for param in required_params:
        if param not in recipe_dict:
            raise RecipeException(
                "Required parameter '{0}' not found in recipe".format(param))

    genomes = recipe_dict["references"] + [recipe_dict["target"]]
    if "tree" in recipe_dict:
        try:
            leaves = get_leaves_names(recipe_dict["tree"])
        except PhyloException as e:
            raise RecipeException(e)
        if set(leaves) != set(genomes):
            raise RecipeException("The tree does not agree with "
                                  "the specified set of genomes")

    for g in recipe_dict["genomes"]:
        if g not in genomes:
            raise RecipeException("Recipe error: genome '{0}' is not in "
                                  "specified as reference or target".format(g))

    for g in genomes:
        recipe_dict["genomes"].setdefault(g, {})

    #parameters that were not set explicitly fall back to defaults
    for g_params in recipe_dict["genomes"].values():
        for def_key, def_val in defaults.items():
            g_params.setdefault(def_key, def_val)

    return recipe_dict
def build(self):
    """
    Implementation of neighbor-joining algorithm.

    Creates a leaf for every genome in self.perms_by_genome, fills
    pairwise distances using self._genome_distance() and then repeatedly
    joins the pair of taxa with the lowest Q-value under a new internal
    node, until a single taxon is left. That taxon is returned
    """
    #lower bound for the length of a newly created edge
    MIN_LEN = 0.000001
    genomes = list(self.perms_by_genome.keys())
    #leaves are created in the sorted order of genome names
    taxas = list(map(Leaf, sorted(genomes)))
    for t in taxas:
        t.terminal = True

    #symmetric distance table, it also holds the distance
    #from each taxon to itself
    distances = defaultdict(lambda : {})
    for t_1, t_2 in combinations_with_replacement(taxas, 2):
        distances[t_1][t_2] = self._genome_distance(t_1.identifier,
                                                    t_2.identifier)
        distances[t_2][t_1] = distances[t_1][t_2]

    def calc_q(taxas):
        #Q-value of a pair: (n - 2) * d(t_1, t_2) minus the sums of
        #distances from t_1 and from t_2 to all current taxa
        #(the sums run over the pair itself as well)
        q_matrix = defaultdict(lambda : {})
        for t_1, t_2 in combinations(taxas, 2):
            other_dist = 0
            for other_t in taxas:
                other_dist += distances[t_1][other_t]
                other_dist += distances[t_2][other_t]
            q_matrix[t_1][t_2] = ((len(taxas) - 2) * distances[t_1][t_2]
                                  - other_dist)
            q_matrix[t_2][t_1] = q_matrix[t_1][t_2]
        return q_matrix

    while len(taxas) > 1:
        #determine the pair with the lowest Q-value; pairs are scanned
        #in sorted order and only a strictly lower value replaces
        #the current best, so ties go to the first pair in that order
        q_matrix = calc_q(taxas)
        lowest_dst = float("inf")
        lowest_pair = None
        for t_1, t_2 in sorted(combinations(taxas, 2)):
            if q_matrix[t_1][t_2] < lowest_dst:
                lowest_dst = q_matrix[t_1][t_2]
                lowest_pair = (t_1, t_2)

        #calculate distances to new internal node from joined taxas
        new_taxa = Tree()
        new_taxa.terminal = False

        old_1, old_2 = sorted(lowest_pair)
        #difference between the total distances of the two joined taxa
        #to all current taxa
        other_dist = 0
        for other_taxa in taxas:
            other_dist += distances[old_1][other_taxa]
            other_dist -= distances[old_2][other_taxa]
        #with only two taxa left there is nothing to correct by
        #(this also avoids dividing by zero)
        div_dist = (0.5 / (len(taxas) - 2) * other_dist
                    if len(taxas) > 2 else 0)
        dist_1 = 0.5 * distances[old_1][old_2] + div_dist
        dist_2 = distances[old_1][old_2] - dist_1
        #edge lengths are kept at MIN_LEN or above
        dist_1, dist_2 = max(MIN_LEN, dist_1), max(MIN_LEN, dist_2)

        new_taxa.add_edge((old_1, None, dist_1))
        new_taxa.add_edge((old_2, None, dist_2))

        #replace the joined pair with the new node and compute
        #distances from the new node to all the remaining taxa
        taxas.remove(old_1)
        taxas.remove(old_2)
        for other_taxa in taxas:
            distances[new_taxa][other_taxa] = \
                0.5 * (distances[old_1][other_taxa] +
                       distances[old_2][other_taxa] -
                       distances[old_1][old_2])
            distances[other_taxa][new_taxa] = distances[new_taxa][other_taxa]
        distances[new_taxa][new_taxa] = 0

        taxas.append(new_taxa)

    #NOTE(review): with no genomes at all the list is empty here
    #and indexing it raises IndexError
    tree = list(taxas)[0]
    return tree