def merge_vocab(*args):
    """Merge the character sets of several serialized vocab files into one CharVocab.

    Helpful when there is more data than can be held in memory at once, or
    when a common vocabulary is needed to merge multiple disjoint datasets.

    Args:
        *args: file paths to serialized vocab objects (loadable with
            ``torch.load``) — note: paths, NOT in-memory vocab objects.

    Returns:
        A ``CharVocab`` built from the union of all characters found,
        excluding the special tokens below (presumably ``CharVocab``
        re-adds its own special tokens — TODO confirm).
    """
    # Special tokens filtered out of the merged character set; a set gives
    # O(1) membership tests versus the original list.
    ignore_chars = {"<bos>", "<eos>", "<pad>", "<unk>"}
    merged_char_set = set()
    for vocab_path in args:
        # NOTE(security): torch.load unpickles arbitrary objects — only
        # load vocab files from trusted sources.
        vocab = torch.load(vocab_path)
        # vocab.c2i maps characters to indices; iterate its keys directly.
        merged_char_set.update(c for c in vocab.c2i if c not in ignore_chars)
    return CharVocab(merged_char_set)
def compute_vocab(smiles_list, n_jobs=None):
    """Create a vocabulary for an arbitrary set of SMILES strings.

    Character extraction runs in parallel via a multiprocessing pool, with
    a tqdm progress bar over the (unordered) per-string results.

    Args:
        smiles_list: list of SMILES strings.
        n_jobs: number of worker processes; defaults to ``mp.cpu_count()``,
            resolved at call time rather than at import time (the original
            ``n_jobs=mp.cpu_count()`` default froze the value when the
            module was first imported).

    Returns:
        A ``CharVocab`` over every unique character in ``smiles_list``.
    """
    if n_jobs is None:
        n_jobs = mp.cpu_count()
    # Guard: set.union(*[]) raises TypeError, so handle empty input explicitly.
    if not smiles_list:
        return CharVocab(set())
    with mp.Pool(n_jobs) as pool:
        # Each compute_vocab_job result is folded into one union below;
        # ordering does not matter, so imap_unordered is fine.
        result = list(
            tqdm(
                pool.imap_unordered(compute_vocab_job, smiles_list),
                total=len(smiles_list),
            )
        )
    char_set = set.union(*result)
    return CharVocab(char_set)