def create_hmms(aln_path_map, name=False, outdir=None, M=50, seq_id=90,
                add_cons=False, seq_lim=None, threads=1, verbose=False):
    """Uses multiple processes to create HMMs from pham alignments with
    hhmake, returning a dictionary that maps phams to their HMM file path.
    """
    verbose_num = 0

    work_items = []
    hmm_path_map = {}
    for pham, aln_path in aln_path_map.items():
        if outdir is not None:
            hmm_path_name = aln_path.with_suffix(".hmm").name
            hmm_path = outdir.joinpath(hmm_path_name)
        else:
            hmm_path = aln_path.with_suffix(".hmm")

        hmm_path_map[pham] = hmm_path

        hmm_name = None
        if name:
            hmm_name = str(pham)

        work_items.append((aln_path, hmm_path, hmm_name, add_cons, seq_lim,
                           M, seq_id, verbose_num))

    parallelize.parallelize(work_items, threads, hhmake, verbose=verbose)

    return hmm_path_map
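# Hypothetical usage sketch for create_hmms (pham IDs and paths are
# illustrative only):
#
#   aln_path_map = {1: Path("phams/1.aln"), 2: Path("phams/2.aln")}
#   hmm_path_map = create_hmms(aln_path_map, name=True, threads=4)
#   # hmm_path_map -> {1: Path("phams/1.hmm"), 2: Path("phams/2.hmm")}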
def align_pham_out_fastas(working_dir, pham_fasta_map, threads=1,
                          verbose=False):
    """Uses multiple processes to align fasta-formatted multiple sequence
    files for all of the phams listed.

    :param working_dir: Path to the directory where the files will be written
    :type working_dir: pathlib.Path
    :param pham_fasta_map: Dictionary that maps phams to their fasta file path
    :type pham_fasta_map: dict{Path}
    :param threads: Number of processes/threads to spawn during alignment
    :type threads: int
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :return pham_aln_map: Dictionary that maps phams to their aln file path
    :rtype pham_aln_map: dict
    """
    pham_aln_map = dict()

    if verbose:
        print("...Aligning pham gene fasta files...")

    work_items = []
    for pham, filepath in pham_fasta_map.items():
        aln_name = filepath.with_suffix(".aln").name
        aln_path = working_dir.joinpath(aln_name)

        pham_aln_map[pham] = aln_path
        work_items.append((filepath, aln_path))

    parallelize.parallelize(work_items, threads, run_clustalo,
                            verbose=verbose)

    return pham_aln_map
def write_phams(fasta_dir, aln_dir, phams_translations_dict, cores=1,
                verbose=False):
    """Uses multiple processes to write and align fasta files for the
    translations of each pham.
    """
    work_items = []
    for pham, pham_translations in phams_translations_dict.items():
        work_items.append((fasta_dir, aln_dir, pham, pham_translations))

    parallelize.parallelize(work_items, cores, write_phams_process,
                            verbose=verbose)
def align_fastas(fasta_path_map, mat_out=False, tree_out=False,
                 file_type="fasta", mode="clustalo", override=False,
                 outdir=None, threads=1, verbose=False):
    """Uses multiple processes to align the given fasta files, returning a
    dictionary that maps phams to their alignment file path.
    """
    verbose_num = 0

    work_items = []
    aln_path_map = {}
    for pham, fasta_path in fasta_path_map.items():
        if outdir is not None:
            working_dir = outdir
        else:
            working_dir = fasta_path.parent

        fasta_path_name = fasta_path.with_suffix("").name

        if override:
            aln_path = working_dir.joinpath(".".join([fasta_path_name,
                                                      "fasta"]))
        else:
            aln_path = working_dir.joinpath(".".join([fasta_path_name,
                                                      "aln"]))

        aln_path_map[pham] = aln_path

        mat_path = None
        if mat_out:
            mat_path = working_dir.joinpath(".".join([fasta_path_name,
                                                      "mat"]))

        tree_path = None
        if tree_out:
            tree_path = working_dir.joinpath(".".join([fasta_path_name,
                                                       "tree"]))

        work_items.append((fasta_path, aln_path, mat_path, tree_path,
                           file_type, "fasta", 1, verbose_num))

    if mode == "clustalo":
        aln_driver = clustalo
    else:
        raise NotImplementedError("Alignment program not supported.")

    parallelize.parallelize(work_items, threads, aln_driver, verbose=verbose)

    return aln_path_map
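# Hypothetical usage sketch for align_fastas. With override=True the
# alignment is written over the ".fasta" name instead of to a sibling
# ".aln" file; only the "clustalo" mode is supported above.
#
#   fasta_path_map = {7: Path("phams/7.fasta")}
#   aln_path_map = align_fastas(fasta_path_map, mat_out=True, threads=2)
#   # aln_path_map -> {7: Path("phams/7.aln")}; "phams/7.mat" is also
#   # written because mat_out was requested.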
def create_centroid_graph(pan_alchemist, clusters, aln_dir, threads=1,
                          verbose=False):
    """Uses multiple processes to compute pairwise Levenshtein distances
    between cluster centroid sequences, returning the resulting matrix
    chunks.
    """
    thread_manager = multiprocessing.Manager()
    cluster_data = thread_manager.list()

    cluster_data_dicts = pan_handling.retrieve_cluster_data(
        pan_alchemist, clusters)
    for cluster_data_dict in cluster_data_dicts:
        cluster_data.append((cluster_data_dict["ClusterID"],
                             cluster_data_dict["CentroidSeq"].decode("utf-8")))

    work_items = []
    for i in range(len(cluster_data)):
        work_items.append((i, cluster_data))

    random.shuffle(work_items)

    temp_dir_path = Path(TEMP_DIR)
    if temp_dir_path.is_dir():
        shutil.rmtree(temp_dir_path)
    temp_dir_path.mkdir()

    if verbose:
        print("...Calculating centroid Levenshtein distances...")
    matrix_chunks = parallelize.parallelize(
        work_items, threads, create_centroid_graph_process, verbose=verbose)

    return matrix_chunks
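# Note on the shuffle above: work item i presumably compares centroid i
# against the centroids after it, so items early in the list carry more
# comparisons than later ones. Shuffling spreads cheap and expensive items
# across worker processes; this reading is inferred from the code, not from
# upstream documentation.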
def create_pham_hmms(alchemist, working_dir, pham_ts_to_id, cores=1,
                     name_map=None, M=50, seq_id=90, add_cons=False,
                     seq_lim=None, verbose=False):
    """Uses multiple processes to create HMMs for each pham, returning a
    dictionary that maps phams to their HMM file path.
    """
    if name_map is None:
        name_map = dict()

    work_items = []
    for pham, pham_ts in pham_ts_to_id.items():
        name = name_map.get(pham)
        work_items.append((working_dir, pham, pham_ts, name, M, seq_id,
                           add_cons, seq_lim))

    hmm_paths = parallelize.parallelize(work_items, cores,
                                        create_pham_hmms_process,
                                        verbose=verbose)

    return {pham: path for pham, path in hmm_paths}
def retrieve_intracluster_edges(db_filter, working_dir, node_names, matrix,
                                lookup_dict, gcs=DEFAULT_SETTINGS["gcs"],
                                kmer=DEFAULT_SETTINGS["kmer"],
                                sketch=DEFAULT_SETTINGS["sketch"],
                                threads=1, verbose=False):
    """Finds pairs of phages from different clusters whose gene content
    similarity meets the given threshold, then computes ANI for each pair
    and returns the edges sorted by the mean of GCS and ANI.
    """
    intracluster_edges = []

    node_name_set = set()
    for i in range(matrix.size):
        target_name = matrix.labels[i]
        target_cluster = lookup_dict[target_name]

        for j in range(i, matrix.size):
            query_name = matrix.labels[j]
            query_cluster = lookup_dict[query_name]

            if (target_cluster is not None) and (query_cluster is not None):
                if query_cluster == target_cluster:
                    continue

            pairwise_gcs = matrix.get_cell(i, j)
            if pairwise_gcs >= gcs:
                node_name_set.add(target_name)
                node_name_set.add(query_name)

                intracluster_edges.append((
                    target_name, str(target_cluster),
                    query_name, str(query_cluster),
                    str(round(pairwise_gcs, 3))))

    db_filter.values = list(node_name_set)
    gs_and_ts = db_filter.select(["phage.PhageID", "phage.Sequence"],
                                 return_dict=False)

    fasta_dir = create_temp_path(str(working_dir.joinpath("fasta")))
    fasta_path_map = write_genome_fastas(
        gs_and_ts, fasta_dir, verbose=verbose, threads=threads)

    sketch_dir = create_temp_path(str(working_dir.joinpath("sketches")))
    sketch_path_map = sketch_genome_fastas(
        fasta_path_map, sketch_dir, verbose=verbose, threads=threads,
        kmer=kmer, sketch=sketch)

    work_items = []
    for edge in intracluster_edges:
        work_items.append((sketch_path_map[edge[0]], sketch_path_map[edge[2]],
                           edge))

    if verbose:
        print("Calculating phage genome ANI...")
    intracluster_edges = parallelize.parallelize(
        work_items, threads, calculate_ani_process, verbose=verbose)

    intracluster_edges.sort(reverse=True,
                            key=lambda x: (float(x[4]) + float(x[5])) / 2)

    return intracluster_edges
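# Edge tuple layout, for reference. Before ANI calculation each edge is
# (target, target_cluster, query, query_cluster, gcs); the final sort
# assumes calculate_ani_process extends each edge with an ANI value at
# index 5, so edges are ranked by the mean of GCS (x[4]) and ANI (x[5]).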
def sketch_genome_fastas(fasta_path_map, sketch_dir, verbose=False,
                         threads=1, kmer=DEFAULT_SETTINGS["kmer"],
                         sketch=DEFAULT_SETTINGS["sketch"]):
    """Uses multiple processes to create MASH sketches of genome fasta
    files, returning a dictionary that maps sequence IDs to their sketch
    file path.
    """
    if verbose:
        print("Sketching genome fasta files...")

    work_items = []
    sketch_path_map = {}
    for seq_id, fasta_path in fasta_path_map.items():
        sketch_path = sketch_dir.joinpath(f"{seq_id}.msh")
        sketch_path_map[seq_id] = sketch_path

        work_items.append((fasta_path, sketch_path, kmer, sketch))

    parallelize.parallelize(work_items, threads, alignment.mash_sketch,
                            verbose=verbose)

    return sketch_path_map
def build_pan_towns(alchemist, pan_alchemist, hhdb_path, pan_dict,
                    hmm_data_dir, data_maps_tuple, threads=1, verbose=False):
    """Uses multiple processes to search pham HMMs against an HHsuite
    database with hhblits, writing an hhr result file for each pham.
    """
    work_items = []
    hhr_path_map = {}
    for pham, hmm_path in data_maps_tuple[3].items():
        hhr_path = hmm_data_dir.joinpath(".".join([str(pham), "hhr"]))
        hhr_path_map[pham] = hhr_path

        work_items.append((hmm_path, hhdb_path, hhr_path, None, False,
                           1, 0, 0, 1))

    if verbose:
        print("...Performing iterations of hhblits to find HMM-HMM "
              "relationships...")
    parallelize.parallelize(work_items, threads, search.hhblits,
                            verbose=verbose)
def build_symmetric_matrix(nodes, distance_function, is_distance=True,
                           names=None, cores=1, verbose=False):
    """Uses multiple processes to compute pairwise distances between the
    given nodes, returning a filled SymmetricMatrix.
    """
    work_items = []

    row_indices = [i for i in range(len(nodes))]
    chunk_size = int(floor(sqrt(len(nodes))))
    for i in row_indices:
        subject = nodes[i]
        if len(nodes) - 1 == i:
            work_items.append((distance_function, subject, [], i, 0))
        else:
            query_node_chunks = basic.partition_list(nodes[i + 1:],
                                                     chunk_size)
            for j in range(len(query_node_chunks)):
                work_items.append((distance_function, subject,
                                   query_node_chunks[j], i, j))

    matrix_data = parallelize.parallelize(work_items, cores,
                                          build_matrix_process,
                                          verbose=verbose)

    matrix_data.sort(key=lambda x: (x[1], x[2]))

    if names is None:
        names = row_indices
    elif len(names) != len(row_indices):
        names = row_indices

    matrix = SymmetricMatrix(names, is_distance=is_distance)
    for data in matrix_data:
        # Each result holds a chunk of distances for row data[1]; the
        # column offset is recovered from the chunk index data[2].
        for i in range(len(data[0])):
            col = (data[2] * chunk_size) + (data[1] + i + 1)
            matrix.fill_cell(data[1], col, data[0][i])

    diagonal_value = 1
    if is_distance:
        diagonal_value = 0
    matrix.fill_diagonal(diagonal_value)

    return matrix
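# Illustrative sketch of build_symmetric_matrix on toy data, assuming
# build_matrix_process applies the given function pairwise as
# distance_function(subject, query). The node values and metric here are
# hypothetical stand-ins, not part of the pipeline.
def _example_build_symmetric_matrix():
    def mismatch_fraction(a, b):
        # Toy distance: fraction of positions at which two strings differ.
        return sum(x != y for x, y in zip(a, b)) / len(a)

    nodes = ["AAAA", "AAAT", "ATTT", "TTTT"]
    matrix = build_symmetric_matrix(nodes, mismatch_fraction,
                                    is_distance=True, names=nodes, cores=1)
    # Symmetric by construction: the cell for ("AAAA", "TTTT") reads 1.0
    # from either side of the diagonal.
    return matrix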
def create_pham_alns(alchemist, working_dir, pham_ts_to_id, cores=1,
                     mat_out=False, tree_out=False, infile_type="fasta",
                     outfile_type="fasta", verbose=False):
    """Uses multiple processes to write and align fasta files for each
    pham, returning a dictionary that maps phams to their file path.
    """
    work_items = []
    for pham, pham_ts in pham_ts_to_id.items():
        work_items.append((working_dir, pham, pham_ts, mat_out, tree_out,
                           infile_type, outfile_type))

    fasta_paths = parallelize.parallelize(work_items, cores,
                                          create_pham_alns_process,
                                          verbose=verbose)

    return {pham: path for pham, path in fasta_paths}
def evaluate_clustering_scheme(matrix, cluster_scheme, cores=1, verbose=False,
                               matrix_cache=None):
    """Uses multiple processes to evaluate each cluster in a clustering
    scheme, returning a dictionary that maps clusters to their evaluation.
    """
    if matrix_cache is None:
        matrix_cache = dict()

    work_items = []
    for cluster, cluster_members in cluster_scheme.items():
        if cluster is None:
            continue

        cluster_matrix = matrix_cache.get(cluster)
        if cluster_matrix is None:
            cluster_matrix = matrix.get_submatrix_from_labels(
                cluster_scheme[cluster])

        work_items.append((cluster, cluster_matrix))

    evaluations = parallelize.parallelize(work_items, cores,
                                          cluster_evaluation_subprocess,
                                          verbose=verbose)

    return {data[0]: data[1] for data in evaluations}
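# Note: matrix_cache is read but never written here, so it only saves work
# when the caller prefills it (e.g. with submatrices retained from an
# earlier evaluation round); with a cold cache every submatrix is
# recomputed from the full matrix.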
def build_pan_neighborhoods(alchemist, pan_alchemist, values, data_dir,
                            data_maps_tuple, aD=75, mD=65, B=0.2,
                            threads=1, verbose=False):
    """Builds pham neighborhood edges from centroid distances and writes
    them to the PAN database.
    """
    matrix_chunks = create_centroid_graph(pan_alchemist, values, data_dir,
                                          threads=threads, verbose=verbose)

    thread_manager = multiprocessing.Manager()

    mD_cache = thread_manager.dict()
    data_cache = thread_manager.dict()
    path_cache = thread_manager.dict()

    temp_dir = Path(TEMP_DIR).joinpath("linker_files")
    temp_dir.mkdir()

    read_work_set = set()
    aln_work_items = []

    if verbose:
        print("...Constructing base for pham neighborhoods...")
    construct_neighborhood_base(pan_alchemist, matrix_chunks,
                                read_work_set, aln_work_items,
                                data_dir, temp_dir,
                                mD_cache, data_cache, path_cache,
                                aD, mD, B)

    if verbose:
        print("...Reloading pham neighborhood cluster data...")
    for cluster in read_work_set:
        path_cache[cluster] = (data_maps_tuple[0].get(int(cluster)),
                               data_maps_tuple[1].get(int(cluster)),
                               data_maps_tuple[2].get(int(cluster)))

    chunk_size = int(math.sqrt(len(aln_work_items)))
    if chunk_size > 0:
        aln_work_chunks = basic.partition_list(aln_work_items, chunk_size)
    else:
        aln_work_chunks = [aln_work_items]

    if verbose:
        print("...Computing pham cluster minimum distances...")
    identity_edge_chunks = parallelize.parallelize(
        aln_work_chunks, threads, build_neighborhood_edge_process,
        verbose=verbose)

    identity_edges = []
    for chunk in identity_edge_chunks:
        identity_edges.extend(chunk)

    if verbose:
        print("...Writing neighborhood data to PAN...")
    pan_alchemist.session.add_all(identity_edges)
    pan_alchemist.session.commit()
    pan_alchemist.session.close()
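# Note on the sqrt chunking above: splitting n alignment work items into
# chunks of sqrt(n) yields roughly sqrt(n) chunks, keeping per-dispatch
# overhead low while still leaving enough chunks for the worker pool to
# balance load across threads.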
def cluster_db(matrix, eps, cores=1, verbose=False, is_distance=False,
               emax=0.9, S=1.6, M=2):
    """Iteratively clusters a matrix with DBSCAN until no new members can
    be clustered, then refines the scheme with Lloyd's algorithm seeded by
    the iteration centroids.
    """
    if verbose:
        print("...Performing clustering iterations...")

    iteration_counter = 1
    cluster_counter = 0
    iter_scheme = dict()
    unclustered_matrix = matrix
    while True:
        iter_scheme[None] = list()

        if verbose:
            print(f"...Starting clustering iteration {iteration_counter}")
        iteration_counter += 1

        greedy_scheme = clustering.dbscan(unclustered_matrix, eps, 1,
                                          is_distance=is_distance,
                                          return_matrix=True)

        work_items = []
        for greedy_cluster, submatrix in greedy_scheme.items():
            if greedy_cluster is None or submatrix.size <= 1:
                iter_scheme[None] = iter_scheme[None] + submatrix.labels
                continue

            work_items.append((submatrix, is_distance, eps, emax, S, M))

        layered_schemes = parallelize.parallelize(
            work_items, cores, iter_cluster_process, verbose=False)

        for scheme in layered_schemes:
            for cluster, cluster_members in scheme.items():
                if cluster is None:
                    iter_scheme[None] = iter_scheme[None] + cluster_members
                    continue

                cluster_counter += 1
                iter_scheme[cluster_counter] = cluster_members

        new_unclustered_members = set(iter_scheme[None])
        diff_unclustered = set(unclustered_matrix.labels).difference(
            new_unclustered_members)

        # Stop once an iteration fails to cluster any new members.
        if not diff_unclustered:
            break

        unclustered_matrix = matrix.get_submatrix_from_labels(
            list(new_unclustered_members))

    if verbose:
        print("...Finalizing clustering scheme...")

    iter_scheme_centroids = list()
    for cluster, cluster_members in iter_scheme.items():
        if cluster is None:
            continue

        submatrix = matrix.get_submatrix_from_labels(cluster_members)
        iter_scheme_centroids.append(submatrix.get_centroid())

    final_scheme = clustering.lloyds(matrix, iter_scheme_centroids,
                                     eps=eps, is_distance=is_distance,
                                     return_matrix=False)

    return final_scheme
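# Hypothetical end-to-end sketch: build a distance matrix with
# build_symmetric_matrix above, then cluster it. The eps value is
# illustrative, not a recommended setting.
#
#   matrix = build_symmetric_matrix(nodes, some_distance, is_distance=True,
#                                   names=node_names, cores=4)
#   scheme = cluster_db(matrix, eps=0.3, cores=4, is_distance=True)
#   # scheme maps cluster identifiers to lists of matrix labels.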