# Example 1
def merge_vocab(*args):
    """
    Merge several serialized vocabularies into a single CharVocab.

    Helpful when more data must be processed than fits in memory at once,
    or when a common vocab is needed to merge multiple disjoint datasets.

    *args: paths to saved vocab objects (loadable with ``torch.load``);
        each loaded object must expose a ``c2i`` dict keyed by characters.

    Returns:
        CharVocab built from the union of all character sets, excluding
        the special tokens listed below.
    """
    # Special tokens are filtered out of the merged character set; the new
    # CharVocab is expected to handle its own special tokens.
    ignore_chars = {"<bos>", "<eos>", "<pad>", "<unk>"}
    merged_char_set = set()

    for vocab_path in args:
        vocab = torch.load(vocab_path)
        # set membership test (O(1)) and generator avoid the intermediate list
        merged_char_set.update(
            ch for ch in vocab.c2i.keys() if ch not in ignore_chars
        )

    return CharVocab(merged_char_set)
# Example 2
def compute_vocab(smiles_list, n_jobs=None):
    """
    Create a vocabulary for an arbitrary set of SMILES strings.

    smiles_list: list of SMILES strings
    n_jobs: number of worker processes; defaults to all available CPUs
        (resolved at call time rather than at import time)

    Returns:
        CharVocab over every unique character found in ``smiles_list``.
    """
    # Resolve the default here instead of in the signature: a default of
    # mp.cpu_count() would be evaluated once at import time.
    if n_jobs is None:
        n_jobs = mp.cpu_count()

    # Extract all unique characters in smiles_list, fanning the per-string
    # work out across a process pool (order of results does not matter).
    with mp.Pool(n_jobs) as pool:
        result = list(
            tqdm(
                pool.imap_unordered(compute_vocab_job, smiles_list),
                total=len(smiles_list),
            )
        )
        # set().union(...) also works for an empty smiles_list, where the
        # unbound set.union(*[]) would raise TypeError
        char_set = set().union(*result)

    # create the vocab
    vocab = CharVocab(char_set)

    return vocab