Example #1
def match_oligomers(F_oligomer_results,
                    R_oligomer_results,
                    minD=900,
                    maxD=1100,
                    dev_net=0,
                    threads=4):
    thread_pool = multiprocessing.Pool(processes=threads)
    thread_manager = multiprocessing.Manager()

    managed_R_pos_oligomer_map = thread_manager.dict()
    managed_R_pos_oligomer_map.update(dict(R_oligomer_results))

    chunk_size = math.ceil(math.sqrt(len(F_oligomer_results)))
    work_chunks = basic.partition_list(F_oligomer_results, chunk_size)

    results = []
    for work_items in work_chunks:
        results.append(
            thread_pool.apply_async(process_match_oligomers,
                                    args=(work_items,
                                          managed_R_pos_oligomer_map, minD,
                                          maxD, dev_net)))

    primer_pairs = []
    for result in results:
        primer_pairs = primer_pairs + result.get()

    thread_pool.close()
    thread_pool.join()

    return primer_pairs
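
All of these examples rely on `basic.partition_list` to split work into chunks. Its implementation is not shown on this page, but from the call sites (a sequence plus a maximum chunk size, returning a list of chunks) it presumably behaves like the following sketch; this is an assumption, not the project's actual code.

def partition_list(sequence, chunk_size):
    # Hypothetical stand-in for basic.partition_list: split `sequence` into
    # consecutive chunks holding at most `chunk_size` items each.
    return [sequence[i:i + chunk_size]
            for i in range(0, len(sequence), chunk_size)]

Under that assumption, partition_list([1, 2, 3, 4, 5], 2) yields [[1, 2], [3, 4], [5]].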
Example #2
def get_verified_data_handle(acc_id_dict,
                             ncbi_cred_dict={},
                             batch_size=200,
                             file_type="gb"):
    """Retrieve genomes from GenBank.

    output_folder = Path to where files will be saved.
    acc_id_dict = Dictionary where key = Accession and value = List[PhageIDs]
    """

    # The NCBI Bookshelf resource "The E-utilities In-Depth: Parameters,
    # Syntax and More", by Dr. Eric Sayers, recommends that a single request
    # contain no more than about 200 UIDs, hence the default batch size.
    # All Entrez requests must include the user's email address and tool name.
    set_entrez_credentials(tool=ncbi_cred_dict.get("tool"),
                           email=ncbi_cred_dict.get("email"),
                           api_key=ncbi_cred_dict.get("api_key"))

    # Use esearch to verify that the accessions are valid, and efetch to
    # retrieve the records. The accessions are processed in batches.
    unique_accession_list = list(acc_id_dict.keys())

    # Add [ACCN] field to each accession number
    appended_accessions = \
        [accession + "[ACCN]" for accession in unique_accession_list]

    # Retrieve the records in batches. For instance, with five accessions and
    # a batch size of two, partitioning produces chunks of sizes 2, 2 and 1.

    chunked_accessions = basic.partition_list(appended_accessions, batch_size)
    for chunk in chunked_accessions:
        delimiter = " | "
        esearch_term = delimiter.join(chunk)

        # Use esearch for each accession
        search_record = run_esearch(db="nucleotide",
                                    term=esearch_term,
                                    usehistory="y")
        search_count = int(search_record["Count"])
        search_webenv = search_record["WebEnv"]
        search_query_key = search_record["QueryKey"]
        summary_records = get_summaries(db="nucleotide",
                                        query_key=search_query_key,
                                        webenv=search_webenv)

        accessions_to_retrieve = get_accessions_to_retrieve(summary_records)
        if len(accessions_to_retrieve) > 0:
            fetch_handle = get_data_handle(accessions_to_retrieve,
                                           rettype=RETTYPE_MAPPINGS[file_type])
            return fetch_handle
        else:
            return None
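
A hedged usage sketch for the function above: the accession, phage name, and credentials are placeholders, and the import path is assumed. Because the records are fetched in GenBank format, the returned handle can be read with Biopython's SeqIO.

from Bio import SeqIO  # Biopython

# Placeholder inputs; substitute real accessions and credentials.
acc_id_dict = {"AB123456": ["ExamplePhage"]}
ncbi_creds = {"tool": "example_tool", "email": "user@example.com"}

fetch_handle = get_verified_data_handle(acc_id_dict,
                                        ncbi_cred_dict=ncbi_creds,
                                        file_type="gb")
if fetch_handle is not None:
    for record in SeqIO.parse(fetch_handle, "gb"):
        print(record.id, len(record.seq))
    fetch_handle.close()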
Example #3
def test_primer_pairs(primer_pairs,
                      genome_map,
                      verbose=False,
                      threads=4,
                      het_min=-5000,
                      tm_gap_max=5.0,
                      ta_min=48.0,
                      ta_max=68.0,
                      minD=900,
                      maxD=1100):
    thread_pool = multiprocessing.Pool(processes=threads)
    thread_manager = multiprocessing.Manager()

    managed_genome_map = thread_manager.dict()
    managed_genome_map.update(genome_map)

    chunk_size = math.ceil(math.sqrt(len(primer_pairs)))
    work_chunks = basic.partition_list(primer_pairs, chunk_size)

    results = []
    for work_items in work_chunks:
        results.append(
            thread_pool.apply_async(process_test_primer_pairs,
                                    args=(work_items, managed_genome_map, minD,
                                          maxD, tm_gap_max, het_min, ta_min,
                                          ta_max)))
    total_run_info = [0] * 4
    tested_primer_pairs = []
    for result in results:
        pair_results, run_info = result.get()

        tested_primer_pairs = tested_primer_pairs + pair_results
        for i in range(len(run_info)):
            total_run_info[i] += run_info[i]

    thread_pool.close()
    thread_pool.join()

    if verbose:
        print(f"......{total_run_info[0]} primer "
              "pairs formed no products on at least one genome")
        print(f"......{total_run_info[1]} primer "
              "pairs formed multiple products on at least one genome")
        print(f"......{total_run_info[2]} primer pairs "
              "formed product with incorrect lengths on at least one genome")
        print(f"......{total_run_info[3]} primer pairs "
              "failed thermodynamic checks")

    # Order the pairs from highest to lowest rating.
    tested_primer_pairs.sort(key=lambda pair: pair.rating, reverse=True)

    return tested_primer_pairs
Example #4
def format_summary_data(summary_data):
    recent_phages = summary_data["recent_phages"]
    recent_phages.reverse()
    recent_phages = basic.partition_list(recent_phages, 5)[0]
    summary_data["recent_phages"] = recent_phages

    phages_data = summary_data["recurring_phages"]
    phages_histogram = {}
    for pham in phages_data.keys():
        basic.increment_histogram(phages_data[pham]["PhageID"],
                                  phages_histogram)

    recurring_phages = basic.sort_histogram_keys(phages_histogram)
    recurring_phages = basic.partition_list(recurring_phages, 5)[0]
    for i in range(len(recurring_phages)):
        phage = recurring_phages[i]
        recurring_phages[i] = f"{phage}({phages_histogram[phage]})"
    summary_data["recurring_phages"] = recurring_phages
Example #5
def execute_value_subqueries(engine, executable, in_column, source_values,
                             return_dict=True, limit=8000):
    """Query with a conditional on a set of values using subqueries.

    :param engine: SQLAlchemy Engine object used for executing queries.
    :type engine: Engine
    :param executable: An executable MySQL query.
    :type executable: Select or str
    :param in_column: SQLAlchemy Column object.
    :type in_column: Column
    :param source_values: Values from specified MySQL column.
    :type source_values: list[str]
    :param return_dict: Toggle whether to return data as a dictionary.
    :type return_dict: Boolean
    :param limit: SQLAlchemy IN clause query length limiter.
    :type limit: int
    :returns: List of grouped data for each value constraint.
    :rtype: list
    """
    if not isinstance(in_column, Column):
        raise ValueError("Column to filter values against is not a "
                         "SQLAlchemy Column. "
                         f"Object is instead of type {type(in_column)}.")

    if not executable.is_derived_from(in_column.table):
        raise ValueError("Inputted column to conditional values against "
                         "must be a column from the table(s) joined in the "
                         "SQLAlchemy select.")

    values = []
    if in_column.type.python_type == bytes:
        source_values = basic.convert_to_encoded(source_values)

    chunked_values = basic.partition_list(source_values, limit)

    for value_chunk in chunked_values:
        subquery = executable.where(in_column.in_(value_chunk))

        proxy = engine.execute(subquery)
        results = proxy.fetchall()

        for result in results:
            if return_dict:
                result = dict(result)

            values.append(result)

    return values
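
A usage sketch under assumptions: the connection string, table, and column names below are placeholders, and the calls use the same pre-2.0 SQLAlchemy idioms as the function itself (`select([...])`, `engine.execute`).

from sqlalchemy import MetaData, Table, create_engine, select

engine = create_engine("mysql+pymysql://user:password@localhost/example_db")
metadata = MetaData()
# Reflect a hypothetical "phage" table with PhageID and HostGenus columns.
phage = Table("phage", metadata, autoload_with=engine)

executable = select([phage.c.PhageID, phage.c.HostGenus])
source_values = ["Trixie", "D29", "L5"]  # placeholder PhageIDs

results = execute_value_subqueries(engine, executable, phage.c.PhageID,
                                   source_values)
for row in results:
    print(row["PhageID"], row["HostGenus"])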
Example #6
def build_symmetric_matrix(nodes,
                           distance_function,
                           is_distance=True,
                           names=None,
                           cores=1,
                           verbose=False):
    work_items = []

    row_indicies = [i for i in range(len(nodes))]
    chunk_size = int(floor(sqrt(len(nodes))))
    for i in row_indicies:
        subject = nodes[i]
        if len(nodes) - 1 == i:
            work_items.append((distance_function, subject, [], i, 0))
        else:
            query_node_chunks = basic.partition_list(nodes[i + 1:], chunk_size)
            for j in range(len(query_node_chunks)):
                work_items.append(
                    (distance_function, subject, query_node_chunks[j], i, j))

    matrix_data = parallelize.parallelize(work_items,
                                          cores,
                                          build_matrix_process,
                                          verbose=verbose)

    matrix_data.sort(key=lambda x: (x[1], x[2]))

    if names is None or len(names) != len(row_indicies):
        names = row_indicies
    matrix = SymmetricMatrix(names, is_distance=is_distance)

    for data in matrix_data:
        for i in range(len(data[0])):
            col = (data[2] * chunk_size) + (data[1] + i + 1)
            matrix.fill_cell(data[1], col, data[0][i])

    diagonal_value = 1
    if is_distance:
        diagonal_value = 0

    matrix.fill_diagonal(diagonal_value)
    return matrix
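
An illustrative call with a toy distance function; `build_matrix_process`, `parallelize`, and `SymmetricMatrix` belong to the surrounding package and are not redefined here, so this only sketches the expected inputs.

# Toy equal-length sequences and a simple mismatch-count distance;
# real callers would typically pass an alignment-based distance function.
nodes = ["ACTGACTG", "ACTGACTT", "ACTTACTT"]
names = ["phage_A", "phage_B", "phage_C"]


def mismatch_distance(seq_a, seq_b):
    return sum(1 for a, b in zip(seq_a, seq_b) if a != b)


matrix = build_symmetric_matrix(nodes, mismatch_distance, names=names,
                                cores=2, verbose=True)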
Example #7
def contacts_index(def_size=20):
    size = request.args.get("size")
    if not size:
        size = def_size
    else:
        size = int(size)

    page = request.args.get("page")
    if not page:
        page = 1
    else:
        page = int(page)

    filters = request.args.get("filters")
    if not filters:
        filters = ""

    db_filter = alchemy.build_filter()

    db_filter.key = "contact.ContactID"
    db_filter.add(filters)
    db_filter.values = db_filter.build_values(
        where=db_filter.build_where_clauses())

    contacts = db_filter.query("contact")
    contacts = sorted(contacts, key=lambda contact: contact.Name)

    chunked_contacts = basic.partition_list(contacts, size)
    page_max = len(chunked_contacts)

    if page >= page_max:
        page = page_max

    return render_template("contact/index.html",
                           chunked_contacts=chunked_contacts,
                           page=page,
                           page_max=page_max)
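
As an aside, Flask's `request.args.get` accepts `default` and `type` keyword arguments, so the size/page parsing in this view (and in the nearly identical one below) could be written more compactly; a sketch, not the application's actual code:

size = request.args.get("size", default=def_size, type=int)
page = request.args.get("page", default=1, type=int)
filters = request.args.get("filters", default="")

With `type=int`, missing or non-numeric values fall back to the default instead of raising.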
Example #8
def isolates_index(def_size=20):
    size = request.args.get("size")
    if not size:
        size = def_size
    else:
        size = int(size)

    page = request.args.get("page")
    if not page:
        page = 1
    else:
        page = int(page)

    filters = request.args.get("filters")
    if not filters:
        filters = ""

    db_filter = alchemy.build_filter()

    db_filter.key = "clinical_isolate.IsolateID"
    db_filter.add(filters)
    db_filter.values = db_filter.build_values(
        where=db_filter.build_where_clauses())

    isolates = db_filter.query("clinical_isolate")
    isolates = sorted(isolates, key=alchemy.isolate_sorting)

    chunked_isolates = basic.partition_list(isolates, size)
    page_max = len(chunked_isolates)

    if page >= page_max:
        page = page_max

    return render_template("isolate/index.html",
                           chunked_isolates=chunked_isolates,
                           page=page,
                           page_max=page_max)
Example #9
def build_pan_neighborhoods(alchemist,
                            pan_alchemist,
                            values,
                            data_dir,
                            data_maps_tuple,
                            aD=75,
                            mD=65,
                            B=0.2,
                            threads=1,
                            verbose=False):
    matrix_chunks = create_centroid_graph(pan_alchemist,
                                          values,
                                          data_dir,
                                          threads=threads,
                                          verbose=verbose)

    thread_manager = multiprocessing.Manager()
    mD_cache = thread_manager.dict()
    data_cache = thread_manager.dict()
    path_cache = thread_manager.dict()

    temp_dir = Path(TEMP_DIR).joinpath("linker_files")
    temp_dir.mkdir()

    read_work_set = set()
    aln_work_items = []

    if verbose:
        print("...Constructing base for pham neighborhoods...")
    construct_neighborhood_base(pan_alchemist, matrix_chunks, read_work_set,
                                aln_work_items, data_dir, temp_dir, mD_cache,
                                data_cache, path_cache, aD, mD, B)

    if verbose:
        print("...Reloading pham neighborhood cluster data...")
    for cluster in read_work_set:
        path_cache[cluster] = (data_maps_tuple[0].get(int(cluster)),
                               data_maps_tuple[1].get(int(cluster)),
                               data_maps_tuple[2].get(int(cluster)))

    chunk_size = int(math.sqrt(len(aln_work_items)))
    if chunk_size > 0:
        aln_work_chunks = basic.partition_list(aln_work_items, chunk_size)
    else:
        aln_work_chunks = [aln_work_items]

    if verbose:
        print("...Computing pham cluster minimum distances...")
    identity_edge_chunks = parallelize.parallelize(
        aln_work_chunks,
        threads,
        build_neighborhood_edge_process,
        verbose=verbose)
    identity_edges = []
    for chunk in identity_edge_chunks:
        identity_edges = identity_edges + chunk

    if verbose:
        print("...Writing neighborhood data to PAN...")
    pan_alchemist.session.add_all(identity_edges)
    pan_alchemist.session.commit()
    pan_alchemist.session.close()