Example #1
0
def write_targets_to_mine(targets: List[dict],
                          db: MINE,
                          chunk_size: int = 10000) -> None:
    """Write target compounds to target collection of MINE.

    Parameters
    ----------
    targets : List[dict]
        List of target dictionaries to write.
    db : MINE
        MINE object to write targets with.
    chunk_size : int, optional
        Size of chunks to break compounds into when writing, by default 10000.
    """
    def _get_cpd_insert(cpd_dict: dict):
        # Persist only the identifying fields; skip keys that are missing
        # or explicitly None so sparse documents stay small.
        output_keys = ["_id", "ID", "SMILES", "InChI_key"]
        return pymongo.InsertOne({
            key: cpd_dict.get(key)
            for key in output_keys if cpd_dict.get(key) is not None
        })

    n_cpds = len(targets)
    for i, target_chunk in enumerate(utils.Chunks(targets, chunk_size)):
        # Progress is printed only every 20th chunk to avoid log spam.
        if i % 20 == 0:
            print(
                f"Writing Targets: Chunk {i} of {int(n_cpds/chunk_size) + 1}")
        cpd_requests = [_get_cpd_insert(cpd_dict) for cpd_dict in target_chunk]
        db.target_compounds.bulk_write(cpd_requests, ordered=False)
Example #2
0
def write_reactions_to_mine(reactions: List[dict],
                            db: MINE,
                            chunk_size: int = 10000) -> None:
    """Write reactions to reaction collection of MINE.

    Parameters
    ----------
    reactions : List[dict]
        Dictionary of reactions to write.
    db : MINE
        MINE object to write reactions with.
    chunk_size : int, optional
        Size of chunks to break reactions into when writing, by default 10000.
    """
    n_rxns = len(reactions)
    # Total chunk count is loop-invariant; compute it once for the messages.
    total_chunks = int(n_rxns / chunk_size) + 1
    for chunk_idx, rxn_chunk in enumerate(utils.Chunks(reactions, chunk_size)):
        # Report progress once every 20 chunks.
        if chunk_idx % 20 == 0:
            print(
                f"Writing Reactions: Chunk {chunk_idx} of {total_chunks}"
            )

        rxn_requests = []
        for rxn_dict in rxn_chunk:
            # Sets are not BSON-serializable; convert them before insert.
            serializable = utils.convert_sets_to_lists(rxn_dict)
            rxn_requests.append(pymongo.InsertOne(serializable))

        db.reactions.bulk_write(rxn_requests, ordered=False)
Example #3
0
def write_compounds_to_mine(compounds: List[dict],
                            db: MINE,
                            chunk_size: int = 10000,
                            processes: int = 1) -> None:
    """Write compounds to compound collection of MINE.

    Also writes the associated reactant_in / product_of link documents.

    Parameters
    ----------
    compounds : List[dict]
        Dictionary of compounds to write.
    db : MINE
        MINE object to write compounds with.
    chunk_size : int, optional
        Size of chunks to break compounds into when writing, by default 10000.
    processes : int, optional
        Number of processors to use, by default 1.
    """
    n_cpds = len(compounds)
    pool = multiprocessing.Pool(processes) if processes != 1 else None

    for i, cpd_chunk in enumerate(utils.Chunks(compounds, chunk_size)):
        # Progress is printed only every 20th chunk to avoid log spam.
        if i % 20 == 0:
            print(
                f"Writing Compounds: Chunk {i} of {int(n_cpds/chunk_size) + 1}"
            )

        cpd_requests = []
        reactant_in_requests = []
        product_of_requests = []

        # The per-compound processing is identical with or without a pool;
        # only the mapping function differs.
        mapper = pool.imap_unordered if pool else map
        for res in mapper(_get_cpd_insert, cpd_chunk):
            cpd_request, reactant_in_request, product_of_request = res
            cpd_requests.append(cpd_request)
            reactant_in_requests.extend(reactant_in_request)
            product_of_requests.extend(product_of_request)

        # Guard every write: pymongo's bulk_write raises InvalidOperation
        # when given an empty request list.
        if cpd_requests:
            db.compounds.bulk_write(cpd_requests, ordered=False)
        if reactant_in_requests:
            db.reactant_in.bulk_write(reactant_in_requests, ordered=False)
        if product_of_requests:
            db.product_of.bulk_write(product_of_requests, ordered=False)

    if pool:
        pool.close()
        pool.join()
Example #4
0
def _get_reactant_in_insert(compound: dict) -> List[dict]:
    """Build reactant_in documents, ensuring memory size isn't too big.

    MongoDB only allows < 16 MB entries. This function breaks a large
    "Reactant_in" list into multiple documents to ensure this doesn't happen.

    Parameters
    ----------
    compound : dict
        Compound dictionary; must contain "_id" and "Reactant_in" keys.

    Returns
    -------
    List[dict]
        dicts of reactant_in to insert
    """

    # Get number of chunks reactant_in must be broken up into.
    # 16 MB is the max for BSON; stay far below it to be safe.
    # NOTE(review): sys.getsizeof is shallow — it does not count the
    # referenced reaction ids themselves — which presumably motivates the
    # extra order-of-magnitude safety margin in max_size.
    max_size = 1.4 * 10**6
    r_in_size = sys.getsizeof(compound["Reactant_in"])
    n_chunks, rem = divmod(r_in_size, max_size)

    # divmod with a float divisor yields floats; normalize to an int and
    # round up for any remainder so every element lands in some chunk.
    n_chunks = int(n_chunks)
    if rem:
        n_chunks += 1
    chunk_size = ceil(len(compound["Reactant_in"]) / n_chunks)

    # Generate the documents to insert, one per chunk.
    r_in_chunks = utils.Chunks(compound["Reactant_in"],
                               chunk_size,
                               return_list=True)

    requests = []
    for i, r_in_chunk in enumerate(r_in_chunks):
        requests.append({
            # Suffix keeps chunk ids unique; c_id links back to the compound.
            "_id": f"{compound['_id']}_{i}",
            "c_id": compound["_id"],
            "Reactant_in": r_in_chunk,
        })

    return requests
Example #5
0
def write_core_compounds(compounds: List[dict],
                         db: MINE,
                         mine: str,
                         chunk_size: int = 10000,
                         processes: int = 1) -> None:
    """Write core compounds to the core compound database.

    Calculates and formats compounds into appropriate form to insert into the
    core compound database in the mongo instance. Core compounds are attempted
    to be inserted and collisions are detected on the database. The list of
    MINEs a given compound is found in is updated as well.

    Parameters
    ----------
    compounds : List[dict]
        List of compound dictionaries to write.
    db : MINE
        MINE object to write core compounds with.
    mine : str
        Name of the MINE.
    chunk_size : int, optional
        Size of chunks to break compounds into when writing, by default 10000.
    processes : int, optional
        The number of processors to use, by default 1.
    """
    n_cpds = len(compounds)
    pool = multiprocessing.Pool(processes) if processes != 1 else None

    for i, cpd_chunk in enumerate(utils.Chunks(compounds, chunk_size)):
        # Progress is printed only every 20th chunk to avoid log spam.
        if i % 20 == 0:
            print(
                f"Writing Compounds: Chunk {i} of {int(n_cpds/chunk_size) + 1}"
            )

        # Only core ("C"-prefixed) compounds belong in the core database.
        # Deep-copy so the insert helper may mutate without side effects
        # on the caller's dictionaries.
        cpd_chunk = [
            deepcopy(cpd) for cpd in cpd_chunk if cpd["_id"].startswith("C")
        ]
        if not cpd_chunk:
            # bulk_write raises InvalidOperation on an empty request list.
            continue

        if pool:
            core_requests = list(pool.map(_get_core_cpd_insert, cpd_chunk))
        else:
            core_requests = [_get_core_cpd_insert(cpd) for cpd in cpd_chunk]

        core_update_requests = [
            _get_core_cpd_update(cpd_dict, mine) for cpd_dict in cpd_chunk
        ]

        # Inserts go first so each core compound document exists before the
        # update that records which MINEs the compound has been found in.
        db.core_compounds.bulk_write(core_requests)
        db.core_compounds.bulk_write(core_update_requests)
    if pool:
        pool.close()
        pool.join()