def test_nested(self):
    """Checks lookup of a singular nested submessage, returned by reference."""
    child_type = test_pb2.Nested.Child
    message = test_pb2.Nested()
    # Nothing has been set on the message yet, so the search comes back empty.
    self.assertEmpty(message_helpers.find_submessages(message, child_type))
    message.child.value = 5.6
    found = message_helpers.find_submessages(message, child_type)
    self.assertLen(found, 1)
    # Writing through the returned submessage must be visible on the parent,
    # i.e. the results are references rather than copies.
    found[0].value = 7.8
    self.assertAlmostEqual(message.child.value, 7.8, places=4)
def resolve_names(message):
    """Attempts to resolve compound NAME identifiers to SMILES.

    Compounds that already carry one of the identifier types listed in
    `_COMPOUND_STRUCTURAL_IDENTIFIERS` are left untouched. For every other
    compound, NAME identifiers are tried in order; the first one that PubChem
    resolves yields a new SMILES identifier on that compound, after which the
    function moves on to the next compound. Names that PubChem rejects with
    an HTTP error are logged and skipped.

    Args:
        message: Reaction proto.

    Returns:
        Boolean whether `message` was modified.
    """
    modified = False
    for compound in message_helpers.find_submessages(message,
                                                     reaction_pb2.Compound):
        has_structure = any(
            identifier.type in _COMPOUND_STRUCTURAL_IDENTIFIERS
            for identifier in compound.identifiers)
        if has_structure:
            continue  # Compound already has a structural identifier.
        for identifier in compound.identifiers:
            if identifier.type != identifier.NAME:
                continue
            try:
                smiles = _pubchem_resolve('name', identifier.value)
            except urllib.error.HTTPError as error:
                logging.info('PubChem could not resolve NAME %s: %s',
                             identifier.value, error)
                continue  # Try the next NAME identifier, if any.
            resolved = compound.identifiers.add()
            resolved.type = resolved.SMILES
            resolved.value = smiles
            resolved.details = 'NAME resolved by PubChem'
            modified = True
            break  # One successful resolution per compound is enough.
    return modified
def test_map_nested(self):
    """Checks that submessages reached through keyed entries are all found."""
    message = test_pb2.MapNested()
    for key, value in (('one', 1.2), ('two', 3.4)):
        message.children[key].value = value
    found = message_helpers.find_submessages(message,
                                             test_pb2.MapNested.Child)
    self.assertLen(found, 2)
def test_repeated_nested(self):
    """Checks that every submessage added to `children` is found."""
    message = test_pb2.RepeatedNested()
    for value in (1.2, 3.4):
        message.children.add().value = value
    found = message_helpers.find_submessages(message,
                                             test_pb2.RepeatedNested.Child)
    self.assertLen(found, 2)
def test_compounds(self):
    """Checks that one Compound is found in a Reaction with one component."""
    message = reaction_pb2.Reaction()
    # Build a single input component carrying a NAME identifier.
    component = message.inputs['test'].components.add()
    component.identifiers.add(type='NAME', value='aspirin')
    found = message_helpers.find_submessages(message, reaction_pb2.Compound)
    self.assertLen(found, 1)
def test_find_data_messages(self):
    """Checks Data submessage discovery across several message types."""
    data_type = reaction_pb2.Data

    # A freshly constructed Reaction contains no Data messages.
    empty_reaction = reaction_pb2.Reaction()
    self.assertEmpty(
        message_helpers.find_submessages(empty_reaction, data_type))

    # One Data message set via the observation's `image` field.
    observation = reaction_pb2.ReactionObservation()
    observation.image.value = 'not an image'
    self.assertLen(message_helpers.find_submessages(observation, data_type),
                   1)

    # Two Data messages stored under string keys in `automation_code`.
    setup = reaction_pb2.ReactionSetup()
    setup.automation_code['test1'].value = 'test data 1'
    setup.automation_code['test2'].bytes_value = b'test data 2'
    self.assertLen(message_helpers.find_submessages(setup, data_type), 2)

    # Both of the above combined inside a single Reaction.
    reaction = reaction_pb2.Reaction()
    reaction.observations.add().image.value = 'not an image'
    reaction.setup.automation_code['test1'].value = 'test data 1'
    reaction.setup.automation_code['test2'].bytes_value = b'test data 2'
    self.assertLen(message_helpers.find_submessages(reaction, data_type), 3)
def extract_data(message, root, min_size=0.0, max_size=1.0):
    """Replaces large Data values with pointers to offloaded data.

    Git LFS (https://git-lfs.github.com/) is convenient because it lives in
    the same repo as the associated Reaction records. However, it is not
    possible to get a permanent URL for the uploaded data because it is only
    committed to the PR branch. We have (at least) these options:

      1. Modify the URL just before or after the PR is merged to point to
         the correct branch.
      2. Modify the URL to point to its eventual destination (in the `main`
         branch) and deal with broken links during submission review.
      3. Use relative paths (relative to the repository root). This means
         that users will have to traverse the repo manually to access
         referenced data instead of simply following a URL.
      4. Merge the data immediately in another repo so the URL is permanent.

    I think (2) is the best option because it yields URLs that will
    eventually work and it is simpler than (1). I don't like option (4)
    because it requires data to be committed and merged before review.

    Data blobs are first written to a temporary directory, which is removed
    when this function exits, whether it returns normally or raises.

    Args:
        message: Protocol buffer message.
        root: Text root of the repository.
        min_size: Float minimum size of data before it will be written
            (in MB).
        max_size: Float maximum size of data to write (in MB).

    Returns:
        Set of text filenames; the generated Data files.
    """
    filenames = set()
    # The context manager guarantees the staging directory is cleaned up even
    # if writing or copying a blob fails part-way through.
    with tempfile.TemporaryDirectory() as dirname:
        data_messages = message_helpers.find_submessages(
            message, reaction_pb2.Data)
        for data_message in data_messages:
            data_filename, data_size = write_data(data_message,
                                                  dirname,
                                                  min_size=min_size,
                                                  max_size=max_size)
            if not data_filename:
                continue  # write_data returned no filename; nothing to link.
            basename = os.path.basename(data_filename)
            output_filename = message_helpers.id_filename(basename)
            with_root = flask.safe_join(root, output_filename)
            if os.path.exists(with_root):
                # Keep the existing blob; it is not overwritten or reported.
                warnings.warn(f'Target Data blob already exists: {with_root}')
            else:
                os.makedirs(os.path.dirname(with_root), exist_ok=True)
                shutil.copy2(data_filename, with_root)
                filenames.add(with_root)
            # Point the Data message at the offloaded file in either case.
            data_message.url = urllib.parse.urljoin(DATA_URL_PREFIX,
                                                    output_filename)
            logging.info('Created Data link (%g MB): %s', data_size,
                         with_root)
    return filenames
def add_binary_identifiers(message):
    """Adds RDKIT_BINARY identifiers for compounds with valid structures.

    Note that the RDKIT_BINARY representations are mostly useful in the
    context of searching the database. Accordingly, this function is not
    included in the standard set of Reaction updates in update_reaction().

    For each compound that does not already have an RDKIT_BINARY identifier,
    the SMILES, INCHI, and MOLBLOCK identifiers are tried in order and the
    first one that parses into a molecule is used to generate the binary.

    Args:
        message: Reaction proto.

    Returns:
        Boolean whether `message` was modified. Always False when `Chem` is
        falsy, since no structure can be parsed in that case.
    """
    if not Chem:
        # NOTE(review): `Chem` is presumably None when RDKit could not be
        # imported -- confirm against the module-level import. Bail out once
        # here rather than guarding each parser call separately.
        return False
    modified = False
    compounds = message_helpers.find_submessages(message,
                                                 reaction_pb2.Compound)
    for compound in compounds:
        # Inspect this compound's own identifiers (not the reaction's) so an
        # existing binary is detected and never duplicated.
        if any(identifier.type == identifier.RDKIT_BINARY
               for identifier in compound.identifiers):
            continue
        for identifier in compound.identifiers:
            mol = None
            if identifier.type == identifier.SMILES:
                mol = Chem.MolFromSmiles(identifier.value)
            elif identifier.type == identifier.INCHI:
                mol = Chem.MolFromInchi(identifier.value)
            elif identifier.type == identifier.MOLBLOCK:
                mol = Chem.MolFromMolBlock(identifier.value)
            if mol is not None:
                source = reaction_pb2.CompoundIdentifier.IdentifierType.Name(
                    identifier.type)
                compound.identifiers.add(bytes_value=mol.ToBinary(),
                                         type='RDKIT_BINARY',
                                         details=f'Generated from {source}')
                modified = True
                break  # Only add one RDKIT_BINARY per Compound.
    return modified
def test_scalar(self):
    """Checks scalar-only messages and rejection of non-message types."""
    message = test_pb2.Scalar(int32_value=5, float_value=6.7)
    # Searching a Scalar message for Scalar submessages finds nothing.
    found = message_helpers.find_submessages(message, test_pb2.Scalar)
    self.assertEmpty(found)
    # A target type that is not a protocol buffer message raises TypeError.
    with self.assertRaisesRegex(TypeError, 'must be a Protocol Buffer'):
        message_helpers.find_submessages(message, float)