def match_oligomers(F_oligomer_results, R_oligomer_results, minD=900,
                    maxD=1100, dev_net=0, threads=4):
    thread_pool = multiprocessing.Pool(processes=threads)
    thread_manager = multiprocessing.Manager()

    managed_R_pos_oligomer_map = thread_manager.dict()
    managed_R_pos_oligomer_map.update(dict(R_oligomer_results))

    chunk_size = math.ceil(math.sqrt(len(F_oligomer_results)))
    work_chunks = basic.partition_list(F_oligomer_results, chunk_size)

    results = []
    for work_items in work_chunks:
        results.append(thread_pool.apply_async(
                            process_match_oligomers,
                            args=(work_items, managed_R_pos_oligomer_map,
                                  minD, maxD, dev_net)))

    primer_pairs = []
    for result in results:
        primer_pairs = primer_pairs + result.get()

    thread_pool.close()
    thread_pool.join()

    return primer_pairs
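
# --- Usage sketch (illustrative, not part of the pipeline) -------------------
# A minimal, self-contained example of the sqrt-chunking + Pool.apply_async
# pattern that match_oligomers() uses. The worker, the item shapes, and the
# inline chunking are hypothetical stand-ins for process_match_oligomers()
# and basic.partition_list().
def _example_chunked_match(items, mapping, threads=4):
    import math
    import multiprocessing

    pool = multiprocessing.Pool(processes=threads)
    manager = multiprocessing.Manager()

    # Share the lookup map with worker processes via a managed dict.
    shared_map = manager.dict()
    shared_map.update(mapping)

    # sqrt-sized chunks keep the number of tasks and the size of each task
    # roughly balanced.
    chunk_size = math.ceil(math.sqrt(len(items))) or 1
    chunks = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]

    async_results = [pool.apply_async(_example_match_worker,
                                      args=(chunk, shared_map))
                     for chunk in chunks]

    matched = []
    for result in async_results:
        matched.extend(result.get())

    pool.close()
    pool.join()
    return matched


def _example_match_worker(chunk, shared_map):
    # Pair each item with its value from the shared map, if present.
    return [(item, shared_map[item]) for item in chunk if item in shared_map]
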
def get_verified_data_handle(acc_id_dict, ncbi_cred_dict={}, batch_size=200,
                             file_type="gb"):
    """Retrieve genomes from GenBank.

    acc_id_dict = Dictionary where key = Accession and value = List[PhageIDs]
    """
    # More setup variables if NCBI updates are desired. NCBI Bookshelf resource
    # "The E-utilities In-Depth: Parameters, Syntax and More", by Dr. Eric
    # Sayers, recommends that a single request not contain more than about 200
    # UIDS so we will use that as our batch size, and all Entrez requests must
    # include the user's email address and tool name.
    set_entrez_credentials(tool=ncbi_cred_dict.get("tool"),
                           email=ncbi_cred_dict.get("email"),
                           api_key=ncbi_cred_dict.get("api_key"))

    # Use esearch to verify the accessions are valid and efetch to retrieve
    # the record.

    # Create batches of accessions.
    unique_accession_list = list(acc_id_dict.keys())

    # Add [ACCN] field to each accession number.
    appended_accessions = \
        [accession + "[ACCN]" for accession in unique_accession_list]

    # When retrieving in batch sizes, first create the list of values
    # indicating which indices of the unique_accession_list should be used
    # to create each batch.
    # For instance, if there are five accessions, a batch size of two produces
    # indices = 0, 2, 4.
    chunked_accessions = basic.partition_list(appended_accessions, batch_size)
    for chunk in chunked_accessions:
        delimiter = " | "
        esearch_term = delimiter.join(chunk)

        # Use esearch for each accession.
        search_record = run_esearch(db="nucleotide", term=esearch_term,
                                    usehistory="y")
        search_count = int(search_record["Count"])
        search_webenv = search_record["WebEnv"]
        search_query_key = search_record["QueryKey"]
        summary_records = get_summaries(db="nucleotide",
                                        query_key=search_query_key,
                                        webenv=search_webenv)

        accessions_to_retrieve = get_accessions_to_retrieve(summary_records)
        if len(accessions_to_retrieve) > 0:
            fetch_handle = get_data_handle(
                                accessions_to_retrieve,
                                rettype=RETTYPE_MAPPINGS[file_type])
            return fetch_handle
        else:
            return None
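
# --- Usage sketch (illustrative) ---------------------------------------------
# A hedged sketch of the esearch -> efetch round trip that
# get_verified_data_handle() performs through the module's run_esearch(),
# get_summaries(), and get_data_handle() helpers. This version calls
# Bio.Entrez directly; the accession list and tool name are hypothetical.
def _example_fetch_genbank_records(accessions, email, tool="example-tool"):
    from Bio import Entrez

    Entrez.email = email
    Entrez.tool = tool

    # Search all accessions at once and park the results on the history server.
    term = " | ".join(accession + "[ACCN]" for accession in accessions)
    search_handle = Entrez.esearch(db="nucleotide", term=term, usehistory="y")
    search_record = Entrez.read(search_handle)
    search_handle.close()

    # Fetch the matching records as GenBank flat files.
    fetch_handle = Entrez.efetch(db="nucleotide", rettype="gb", retmode="text",
                                 webenv=search_record["WebEnv"],
                                 query_key=search_record["QueryKey"])
    return fetch_handle
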
def test_primer_pairs(primer_pairs, genome_map, verbose=False, threads=4,
                      het_min=-5000, tm_gap_max=5.0, ta_min=48.0, ta_max=68.0,
                      minD=900, maxD=1100):
    thread_pool = multiprocessing.Pool(processes=threads)
    thread_manager = multiprocessing.Manager()

    managed_genome_map = thread_manager.dict()
    managed_genome_map.update(genome_map)

    chunk_size = math.ceil(math.sqrt(len(primer_pairs)))
    work_chunks = basic.partition_list(primer_pairs, chunk_size)

    results = []
    for work_items in work_chunks:
        results.append(thread_pool.apply_async(
                            process_test_primer_pairs,
                            args=(work_items, managed_genome_map, minD, maxD,
                                  tm_gap_max, het_min, ta_min, ta_max)))

    total_run_info = [0] * 4
    tested_primer_pairs = []
    for result in results:
        pair_results, run_info = result.get()
        tested_primer_pairs = tested_primer_pairs + pair_results

        for i in range(len(run_info)):
            total_run_info[i] += run_info[i]

    thread_pool.close()
    thread_pool.join()

    if verbose:
        print(f"......{total_run_info[0]} primer "
              "pairs formed no products on at least one genome")
        print(f"......{total_run_info[1]} primer "
              "pairs formed multiple products on at least one genome")
        print(f"......{total_run_info[2]} primer pairs "
              "formed products with incorrect lengths on at least one genome")
        print(f"......{total_run_info[3]} primer pairs "
              "failed thermodynamic checks")

    # Rank the surviving pairs by rating, best first.  (heapq.merge on a
    # single unsorted list would leave the order unchanged, so sort directly.)
    tested_primer_pairs.sort(key=lambda pair: pair.rating, reverse=True)

    return tested_primer_pairs
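
# --- Usage sketch (illustrative) ---------------------------------------------
# A small, self-contained example of the aggregation step in
# test_primer_pairs(): per-chunk failure counters are summed and the surviving
# pairs are ordered by rating, best first. The data shapes are hypothetical;
# only the pattern is taken from the function above.
def _example_aggregate_chunk_results(chunk_results):
    """Sum per-chunk counters and rank pairs by rating, best first.

    chunk_results is an iterable of (pairs, run_info) tuples, where each pair
    has a numeric ``rating`` attribute and run_info is a list of four counters.
    """
    total_run_info = [0] * 4
    kept_pairs = []
    for pair_results, run_info in chunk_results:
        kept_pairs.extend(pair_results)
        for i, count in enumerate(run_info):
            total_run_info[i] += count

    kept_pairs.sort(key=lambda pair: pair.rating, reverse=True)
    return kept_pairs, total_run_info
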
def format_summary_data(summary_data):
    recent_phages = summary_data["recent_phages"]
    recent_phages.reverse()
    recent_phages = basic.partition_list(recent_phages, 5)[0]
    summary_data["recent_phages"] = recent_phages

    phages_data = summary_data["recurring_phages"]
    phages_histogram = {}
    for pham in phages_data.keys():
        basic.increment_histogram(phages_data[pham]["PhageID"],
                                  phages_histogram)

    recurring_phages = basic.sort_histogram_keys(phages_histogram)
    recurring_phages = basic.partition_list(recurring_phages, 5)[0]
    for i in range(len(recurring_phages)):
        recurring_phages[i] = "".join([
            recurring_phages[i],
            f"({str(phages_histogram[recurring_phages[i]])})"])

    summary_data["recurring_phages"] = recurring_phages
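
# --- Usage sketch (illustrative) ---------------------------------------------
# The recurring-phage block in format_summary_data() is a histogram ranked by
# count, truncated to five entries, and rendered as "PhageID(count)". The same
# idea expressed with collections.Counter, using made-up PhageIDs:
def _example_recurring_phages(phage_ids_per_pham, top_n=5):
    from collections import Counter

    histogram = Counter()
    for phage_ids in phage_ids_per_pham.values():
        histogram.update(phage_ids)

    # most_common() already sorts by descending count.
    return [f"{phage_id}({count})"
            for phage_id, count in histogram.most_common(top_n)]

# _example_recurring_phages({1: ["Trixie", "D29"], 2: ["Trixie"]})
# -> ["Trixie(2)", "D29(1)"]
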
def execute_value_subqueries(engine, executable, in_column, source_values,
                             return_dict=True, limit=8000):
    """Query with a conditional on a set of values using subqueries.

    :param engine: SQLAlchemy Engine object used for executing queries.
    :type engine: Engine
    :param executable: Executable MySQL query.
    :type executable: Select
    :param in_column: SQLAlchemy Column object.
    :type in_column: Column
    :param source_values: Values from the specified MySQL column.
    :type source_values: list[str]
    :param return_dict: Toggle whether to return data as a dictionary.
    :type return_dict: Boolean
    :param limit: SQLAlchemy IN clause query length limiter.
    :type limit: int
    :returns: List of grouped data for each value constraint.
    :rtype: list
    """
    if not isinstance(in_column, Column):
        raise ValueError("Column to condition values against is not a "
                         "SQLAlchemy Column. "
                         f"Object is instead of type {type(in_column)}.")

    if not executable.is_derived_from(in_column.table):
        raise ValueError("Column to condition values against must be a "
                         "column from the table(s) joined in the "
                         "SQLAlchemy select.")

    values = []
    if in_column.type.python_type == bytes:
        source_values = basic.convert_to_encoded(source_values)

    chunked_values = basic.partition_list(source_values, limit)
    for value_chunk in chunked_values:
        subquery = executable.where(in_column.in_(value_chunk))
        proxy = engine.execute(subquery)

        results = proxy.fetchall()
        for result in results:
            if return_dict:
                result = dict(result)

            values.append(result)

    return values
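
# --- Usage sketch (illustrative) ---------------------------------------------
# A minimal version of the chunked IN-clause pattern that
# execute_value_subqueries() wraps, written against the SQLAlchemy 1.x-style
# engine.execute() used above. The table and column in the usage comment are
# hypothetical.
def _example_chunked_in_query(engine, executable, in_column, source_values,
                              limit=8000):
    results = []
    for start in range(0, len(source_values), limit):
        chunk = source_values[start:start + limit]
        proxy = engine.execute(executable.where(in_column.in_(chunk)))
        results.extend(dict(row) for row in proxy.fetchall())
    return results

# e.g. (hypothetical table):
# _example_chunked_in_query(engine, select([phage_table]),
#                           phage_table.c.PhageID, phage_ids)
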
def build_symmetric_matrix(nodes, distance_function, is_distance=True,
                           names=None, cores=1, verbose=False):
    work_items = []

    row_indices = [i for i in range(len(nodes))]

    chunk_size = int(floor(sqrt(len(nodes))))
    for i in row_indices:
        subject = nodes[i]
        if len(nodes) - 1 == i:
            work_items.append((distance_function, subject, [], i, 0))
        else:
            query_node_chunks = basic.partition_list(nodes[i + 1:], chunk_size)
            for j in range(len(query_node_chunks)):
                work_items.append((distance_function, subject,
                                   query_node_chunks[j], i, j))

    matrix_data = parallelize.parallelize(work_items, cores,
                                          build_matrix_process,
                                          verbose=verbose)

    matrix_data.sort(key=lambda x: (x[1], x[2]))

    if names is None:
        names = row_indices
    else:
        if len(names) != len(row_indices):
            names = row_indices

    matrix = SymmetricMatrix(names, is_distance=is_distance)

    for data in matrix_data:
        for i in range(len(data[0])):
            col = (data[2] * chunk_size) + (data[1] + i + 1)
            matrix.fill_cell(data[1], col, data[0][i])

    diagonal_value = 1
    if is_distance:
        diagonal_value = 0

    matrix.fill_diagonal(diagonal_value)

    return matrix
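
# --- Worked example (illustrative) -------------------------------------------
# build_symmetric_matrix() only computes the upper triangle: for row i it
# chunks nodes[i + 1:], and a value at offset k inside chunk j maps back to
# absolute column  i + 1 + j * chunk_size + k.  A tiny, self-contained check
# of that index arithmetic with toy node counts:
def _example_upper_triangle_indices(num_nodes, chunk_size):
    cells = []
    for i in range(num_nodes):
        remainder = list(range(i + 1, num_nodes))
        chunks = [remainder[start:start + chunk_size]
                  for start in range(0, len(remainder), chunk_size)]
        for j, chunk in enumerate(chunks):
            for k, _ in enumerate(chunk):
                col = (j * chunk_size) + (i + k + 1)
                cells.append((i, col))
    return cells

# _example_upper_triangle_indices(4, 2) yields every (row, col) with col > row:
# [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
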
def contacts_index(def_size=20):
    size = request.args.get("size")
    if not size:
        size = def_size
    else:
        size = int(size)

    page = request.args.get("page")
    if not page:
        page = 1
    else:
        page = int(page)

    filters = request.args.get("filters")
    if not filters:
        filters = ""

    db_filter = alchemy.build_filter()
    db_filter.key = "contact.ContactID"
    db_filter.add(filters)
    db_filter.values = db_filter.build_values(
                                    where=db_filter.build_where_clauses())

    contacts = db_filter.query("contact")
    contacts = sorted(contacts, key=lambda contact: contact.Name)

    chunked_contacts = basic.partition_list(contacts, size)

    page_max = len(chunked_contacts)
    if page >= page_max:
        page = page_max

    return render_template("contact/index.html",
                           chunked_contacts=chunked_contacts,
                           page=page, page_max=page_max)
def isolates_index(def_size=20):
    size = request.args.get("size")
    if not size:
        size = def_size
    else:
        size = int(size)

    page = request.args.get("page")
    if not page:
        page = 1
    else:
        page = int(page)

    filters = request.args.get("filters")
    if not filters:
        filters = ""

    db_filter = alchemy.build_filter()
    db_filter.key = "clinical_isolate.IsolateID"
    db_filter.add(filters)
    db_filter.values = db_filter.build_values(
                                    where=db_filter.build_where_clauses())

    isolates = db_filter.query("clinical_isolate")
    isolates = sorted(isolates, key=alchemy.isolate_sorting)

    chunked_isolates = basic.partition_list(isolates, size)

    page_max = len(chunked_isolates)
    if page >= page_max:
        page = page_max

    return render_template("isolate/index.html",
                           chunked_isolates=chunked_isolates,
                           page=page, page_max=page_max)
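
# --- Usage sketch (illustrative) ---------------------------------------------
# contacts_index() and isolates_index() above share the same pagination
# pattern: read ``size``/``page`` from the query string, chunk the sorted
# records, and clamp the requested page. That shared piece, extracted as a
# hypothetical helper (not part of the existing views, and it additionally
# clamps the lower bound and handles an empty record list):
def _example_paginate(records, size, page):
    chunks = [records[start:start + size]
              for start in range(0, len(records), size)] or [[]]
    page_max = len(chunks)
    page = max(1, min(page, page_max))
    return chunks, page, page_max
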
def build_pan_neighborhoods(alchemist, pan_alchemist, values, data_dir,
                            data_maps_tuple, aD=75, mD=65, B=0.2,
                            threads=1, verbose=False):
    matrix_chunks = create_centroid_graph(pan_alchemist, values, data_dir,
                                          threads=threads, verbose=verbose)

    thread_manager = multiprocessing.Manager()
    mD_cache = thread_manager.dict()
    data_cache = thread_manager.dict()
    path_cache = thread_manager.dict()

    temp_dir = Path(TEMP_DIR).joinpath("linker_files")
    temp_dir.mkdir()

    read_work_set = set()
    aln_work_items = []

    if verbose:
        print("...Constructing base for pham neighborhoods...")
    construct_neighborhood_base(pan_alchemist, matrix_chunks,
                                read_work_set, aln_work_items,
                                data_dir, temp_dir,
                                mD_cache, data_cache, path_cache, aD, mD, B)

    if verbose:
        print("...Reloading pham neighborhood cluster data...")
    for cluster in read_work_set:
        path_cache[cluster] = (data_maps_tuple[0].get(int(cluster)),
                               data_maps_tuple[1].get(int(cluster)),
                               data_maps_tuple[2].get(int(cluster)))

    chunk_size = int(math.sqrt(len(aln_work_items)))
    if chunk_size > 0:
        aln_work_chunks = basic.partition_list(aln_work_items, chunk_size)
    else:
        aln_work_chunks = [aln_work_items]

    if verbose:
        print("...Computing pham cluster minimum distances...")
    identity_edge_chunks = parallelize.parallelize(
                                aln_work_chunks, threads,
                                build_neighborhood_edge_process,
                                verbose=verbose)

    identity_edges = []
    for chunk in identity_edge_chunks:
        identity_edges = identity_edges + chunk

    if verbose:
        print("...Writing neighborhood data to PAN...")
    pan_alchemist.session.add_all(identity_edges)
    pan_alchemist.session.commit()
    pan_alchemist.session.close()
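
# --- Usage sketch (illustrative) ---------------------------------------------
# build_pan_neighborhoods() hands several multiprocessing.Manager dicts
# (mD_cache, data_cache, path_cache) to its workers so expensive lookups are
# computed once and shared across processes. A stripped-down worker using that
# shared-cache pattern (driven by the same chunk/apply_async loop sketched
# after match_oligomers above); the computation is a hypothetical stand-in:
def _example_shared_cache_worker(keys, cache):
    results = []
    for key in keys:
        value = cache.get(key)
        if value is None:
            value = key * key          # stand-in for an expensive computation
            cache[key] = value         # visible to every other worker process
        results.append(value)
    return results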