def test_add_sharded_dataset(self):
    reaction = reaction_pb2.Reaction()
    ethylamine = reaction.inputs['ethylamine']
    component = ethylamine.components.add()
    component.identifiers.add(type='SMILES', value='CCN')
    component.is_limiting = True
    component.amount.moles.value = 2
    component.amount.moles.units = reaction_pb2.Moles.MILLIMOLE
    reaction.outcomes.add().conversion.value = 25
    reaction.provenance.record_created.time.value = '2020-01-02'
    reaction.provenance.record_created.person.username = '******'
    reaction.provenance.record_created.person.email = '*****@*****.**'
    reaction.reaction_id = 'test1'
    dataset1 = dataset_pb2.Dataset(reactions=[reaction])
    dataset1_filename = os.path.join(self.test_subdirectory, 'test1.pbtxt')
    message_helpers.write_message(dataset1, dataset1_filename)
    reaction.provenance.record_created.time.value = '2020-01-03'
    reaction.provenance.record_created.person.username = '******'
    reaction.provenance.record_created.person.email = '*****@*****.**'
    reaction.reaction_id = 'test2'
    dataset2 = dataset_pb2.Dataset(reactions=[reaction])
    dataset2_filename = os.path.join(self.test_subdirectory, 'test2.pbtxt')
    message_helpers.write_message(dataset2, dataset2_filename)
    added, removed, changed, filenames = self._run()
    self.assertEqual(added, {'test1', 'test2'})
    self.assertEmpty(removed)
    self.assertEmpty(changed)
    self.assertLen(filenames, 2)
    filenames.pop(filenames.index(self.dataset_filename))
    self.assertLen(filenames, 1)
    dataset = message_helpers.load_message(filenames[0], dataset_pb2.Dataset)
    self.assertLen(dataset.reactions, 2)
    self.assertFalse(os.path.exists(dataset1_filename))
    self.assertFalse(os.path.exists(dataset2_filename))
def setUp(self):
    super().setUp()
    # Suppress RDKit warnings to clean up the test output.
    RDLogger.logger().setLevel(RDLogger.CRITICAL)
    self.test_subdirectory = tempfile.mkdtemp(dir=flags.FLAGS.test_tmpdir)
    reaction1 = reaction_pb2.Reaction()
    dummy_input = reaction1.inputs['dummy_input']
    dummy_component = dummy_input.components.add()
    dummy_component.identifiers.add(type='CUSTOM')
    dummy_component.identifiers[0].details = 'custom_identifier'
    dummy_component.identifiers[0].value = 'custom_value'
    dummy_component.is_limiting = reaction_pb2.Boolean.TRUE
    dummy_component.mass.value = 1
    dummy_component.mass.units = reaction_pb2.Mass.GRAM
    reaction1.outcomes.add().conversion.value = 75
    dataset1 = dataset_pb2.Dataset(reactions=[reaction1])
    self.dataset1_filename = os.path.join(self.test_subdirectory,
                                          'dataset1.pbtxt')
    message_helpers.write_message(dataset1, self.dataset1_filename)
    # reaction2 is empty.
    reaction2 = reaction_pb2.Reaction()
    dataset2 = dataset_pb2.Dataset(reactions=[reaction1, reaction2])
    self.dataset2_filename = os.path.join(self.test_subdirectory,
                                          'dataset2.pbtxt')
    message_helpers.write_message(dataset2, self.dataset2_filename)
def setUp(self):
    super().setUp()
    self.test_subdirectory = tempfile.mkdtemp(dir=flags.FLAGS.test_tmpdir)
    reaction1 = reaction_pb2.Reaction()
    dummy_input = reaction1.inputs['dummy_input']
    dummy_component = dummy_input.components.add()
    dummy_component.identifiers.add(type='CUSTOM')
    dummy_component.identifiers[0].details = 'custom_identifier'
    dummy_component.identifiers[0].value = 'custom_value'
    dummy_component.is_limiting = True
    dummy_component.mass.value = 1
    dummy_component.mass.units = reaction_pb2.Mass.GRAM
    reaction1.outcomes.add().conversion.value = 75
    dataset1 = dataset_pb2.Dataset(reactions=[reaction1])
    self.dataset1_filename = os.path.join(self.test_subdirectory,
                                          'dataset1.pb')
    with open(self.dataset1_filename, 'wb') as f:
        f.write(dataset1.SerializeToString())
    # reaction2 is empty.
    reaction2 = reaction_pb2.Reaction()
    dataset2 = dataset_pb2.Dataset(reactions=[reaction1, reaction2])
    self.dataset2_filename = os.path.join(self.test_subdirectory,
                                          'dataset2.pb')
    with open(self.dataset2_filename, 'wb') as f:
        f.write(dataset2.SerializeToString())
def write_dataset(file_name):
    """Receives a serialized Dataset protobuf and writes it to a file."""
    dataset = dataset_pb2.Dataset()
    dataset.ParseFromString(flask.request.get_data())
    resolve_tokens(dataset)
    put_dataset(file_name, dataset)
    return 'ok'
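# A minimal usage sketch for the endpoint above, assuming `app` is the
# Flask application and that write_dataset is routed at
# /dataset/proto/write/<file_name> (the route path and app name are
# assumptions, not confirmed by this code):
def example_write_dataset_request():
    dataset = dataset_pb2.Dataset(dataset_id='example')
    with app.test_client() as client:
        response = client.post('/dataset/proto/write/example',
                               data=dataset.SerializeToString())
    assert response.data == b'ok'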
def test_add_dataset_with_large_data(self):
    reaction = reaction_pb2.Reaction()
    ethylamine = reaction.inputs['ethylamine']
    component = ethylamine.components.add()
    component.identifiers.add(type='SMILES', value='CCN')
    component.is_limiting = reaction_pb2.Boolean.TRUE
    component.moles.value = 2
    component.moles.units = reaction_pb2.Moles.MILLIMOLE
    reaction.outcomes.add().conversion.value = 25
    image = reaction.observations.add().image
    image.bytes_value = b'test data value'
    image.format = 'png'
    dataset = dataset_pb2.Dataset(reactions=[reaction])
    dataset_filename = os.path.join(self.test_subdirectory, 'test.pbtxt')
    message_helpers.write_message(dataset, dataset_filename)
    filenames = self._run_main(min_size=0.0)
    self.assertLen(filenames, 2)
    filenames.pop(filenames.index(self.dataset_filename))
    dataset = message_helpers.load_message(filenames[0], dataset_pb2.Dataset)
    relative_path = (
        'data/36/ord_data-'
        '36443a1839bf1160087422b7468a93c7b97dac7eea423bfac189208a15823139'
        '.png')
    expected = ('https://github.com/Open-Reaction-Database/'
                'ord-submissions-test/tree/' + relative_path)
    self.assertEqual(dataset.reactions[0].observations[0].image.url,
                     expected)
    with open(os.path.join(self.test_subdirectory, relative_path),
              'rb') as f:
        self.assertEqual(b'test data value', f.read())
def test_add_dataset(self):
    reaction = reaction_pb2.Reaction()
    ethylamine = reaction.inputs['ethylamine']
    component = ethylamine.components.add()
    component.identifiers.add(type='SMILES', value='CCN')
    component.is_limiting = True
    component.amount.moles.value = 2
    component.amount.moles.units = reaction_pb2.Moles.MILLIMOLE
    reaction.outcomes.add().conversion.value = 25
    reaction.provenance.record_created.time.value = '2020-01-01'
    reaction.provenance.record_created.person.username = '******'
    reaction.provenance.record_created.person.email = '*****@*****.**'
    reaction.reaction_id = 'test'
    dataset = dataset_pb2.Dataset(reactions=[reaction])
    dataset_filename = os.path.join(self.test_subdirectory, 'test.pbtxt')
    message_helpers.write_message(dataset, dataset_filename)
    added, removed, changed, filenames = self._run()
    self.assertEqual(added, {'test'})
    self.assertEmpty(removed)
    self.assertEmpty(changed)
    self.assertLen(filenames, 2)
    self.assertFalse(os.path.exists(dataset_filename))
    # Check for assignment of dataset and reaction IDs.
    filenames.pop(filenames.index(self.dataset_filename))
    self.assertLen(filenames, 1)
    dataset = message_helpers.load_message(filenames[0], dataset_pb2.Dataset)
    self.assertNotEmpty(dataset.dataset_id)
    self.assertLen(dataset.reactions, 1)
    self.assertNotEmpty(dataset.reactions[0].reaction_id)
    # Check for binary output.
    root, ext = os.path.splitext(filenames[0])
    self.assertEqual(ext, '.pbtxt')
    self.assertTrue(os.path.exists(root + '.pb'))
def test_valid_templating(self):
    template_string = self.template_string.replace('value: "CCO"',
                                                   'value: "$my_smiles$"')
    template_string = template_string.replace('value: 75',
                                              'value: $conversion$')
    df = pd.DataFrame.from_dict({
        '$my_smiles$': ['CCO', 'CCCO', 'CCCCO'],
        '$conversion$': [75, 50, 30],
    })
    dataset = templating.generate_dataset(template_string, df)
    expected_reactions = []
    for smiles, conversion in zip(['CCO', 'CCCO', 'CCCCO'], [75, 50, 30]):
        reaction = reaction_pb2.Reaction()
        reaction.CopyFrom(self.valid_reaction)
        reaction.inputs['in'].components[0].identifiers[0].value = smiles
        reaction.outcomes[0].conversion.value = conversion
        expected_reactions.append(reaction)
    expected_dataset = dataset_pb2.Dataset(reactions=expected_reactions)
    self.assertEqual(dataset, expected_dataset)
    # Test without "$" in column names.
    df = pd.DataFrame.from_dict({
        'my_smiles': ['CCO', 'CCCO', 'CCCCO'],
        'conversion': [75, 50, 30],
    })
    dataset = templating.generate_dataset(template_string, df)
    self.assertEqual(dataset, expected_dataset)
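# For context, a minimal sketch of the row-wise substitution that
# templating.generate_dataset appears to perform, inferred from the test
# above; the helper name and details are illustrative, not the library's
# actual implementation:
def generate_dataset_sketch(template_string, df):
    reactions = []
    for _, row in df.iterrows():
        reaction_text = template_string
        for column, value in row.items():
            # Column names may be given with or without the "$" sigils.
            placeholder = column if column.startswith('$') else f'${column}$'
            reaction_text = reaction_text.replace(placeholder, str(value))
        reaction = reaction_pb2.Reaction()
        text_format.Parse(reaction_text, reaction)
        reactions.append(reaction)
    return dataset_pb2.Dataset(reactions=reactions)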
def test_cross_references(self):
    message = dataset_pb2.Dataset()
    reaction1 = message.reactions.add()
    reaction2 = message.reactions.add()
    reaction3 = message.reactions.add()
    # Minimal reaction 1.
    dummy_input = reaction1.inputs['dummy_input']
    reaction1.outcomes.add()
    dummy_component = dummy_input.components.add()
    dummy_component.identifiers.add(type='CUSTOM')
    dummy_component.identifiers[0].details = 'custom_identifier'
    dummy_component.identifiers[0].value = 'custom_value'
    dummy_component.amount.mass.value = 1
    dummy_component.amount.mass.units = reaction_pb2.Mass.GRAM
    reaction2.CopyFrom(reaction1)
    reaction3.CopyFrom(reaction1)
    dummy_component.preparations.add(type='SYNTHESIZED')
    dummy_component.preparations[0].reaction_id = 'placeholder_id'
    reaction2.reaction_id = 'placeholder_id'
    dummy_input.crude_components.add(reaction_id='crude-making step',
                                     has_derived_amount=True)
    reaction3.reaction_id = 'crude-making step'
    updates.update_dataset(message)
    self.assertEqual(dummy_component.preparations[0].reaction_id,
                     reaction2.reaction_id)
    self.assertEqual(dummy_input.crude_components[0].reaction_id,
                     reaction3.reaction_id)
    self.assertNotEqual(dummy_component.preparations[0].reaction_id,
                        'placeholder_id')
    self.assertNotEqual(dummy_input.crude_components[0].reaction_id,
                        'crude-making step')
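# A rough sketch of the ID bookkeeping that updates.update_dataset is
# expected to perform in the test above (assumed logic, for illustration
# only): mint new reaction IDs first, then rewrite internal
# cross-references that still point at the old placeholder IDs.
import uuid

def update_reaction_ids_sketch(dataset):
    id_map = {}
    for reaction in dataset.reactions:
        new_id = f'ord-{uuid.uuid4().hex}'
        if reaction.reaction_id:
            id_map[reaction.reaction_id] = new_id
        reaction.reaction_id = new_id
    for reaction in dataset.reactions:
        for reaction_input in reaction.inputs.values():
            for component in reaction_input.components:
                for preparation in component.preparations:
                    if preparation.reaction_id in id_map:
                        preparation.reaction_id = id_map[
                            preparation.reaction_id]
            for crude in reaction_input.crude_components:
                if crude.reaction_id in id_map:
                    crude.reaction_id = id_map[crude.reaction_id]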
def test_resolver(self):
    reaction = reaction_pb2.Reaction()
    ethylamine = reaction.inputs['ethylamine']
    component = ethylamine.components.add()
    component.identifiers.add(type='NAME', value='ethylamine')
    component.is_limiting = True
    component.moles.value = 2
    component.moles.units = reaction_pb2.Moles.MILLIMOLE
    reaction.outcomes.add().conversion.value = 25
    dataset = dataset_pb2.Dataset(reactions=[reaction])
    dataset_filename = os.path.join(self.test_subdirectory, 'test.pbtxt')
    message_helpers.write_message(dataset, dataset_filename)
    filenames = self._run_main()
    self.assertLen(filenames, 2)
    self.assertFalse(os.path.exists(dataset_filename))
    filenames.pop(filenames.index(self.dataset_filename))
    self.assertLen(filenames, 1)
    dataset = message_helpers.load_message(filenames[0], dataset_pb2.Dataset)
    self.assertLen(dataset.reactions, 1)
    identifiers = (
        dataset.reactions[0].inputs['ethylamine'].components[0].identifiers)
    self.assertLen(identifiers, 3)
    self.assertEqual(
        identifiers[1],
        reaction_pb2.CompoundIdentifier(
            type='SMILES', value='CCN',
            details='NAME resolved by PubChem'))
    self.assertEqual(identifiers[2].type,
                     reaction_pb2.CompoundIdentifier.RDKIT_BINARY)
def dataset_filename(tmp_path) -> str:
    # Create a test database.
    connection = connect(ord_interface.client.POSTGRES_DB)
    connection.set_session(autocommit=True)
    with connection.cursor() as cursor:
        cursor.execute("CREATE DATABASE test;")
    connection.close()
    # Create a test dataset.
    reaction = reaction_pb2.Reaction()
    reaction.reaction_id = "test"
    reaction.identifiers.add(value="reaction", type="REACTION_SMILES")
    input1 = reaction.inputs["input1"]
    input1.components.add().identifiers.add(value="input1", type="SMILES")
    input2 = reaction.inputs["input2"]
    input2.components.add().identifiers.add(value="input2a", type="SMILES")
    input2.components.add().identifiers.add(value="input2b", type="SMILES")
    outcome = reaction.outcomes.add()
    product = outcome.products.add()
    product.measurements.add(type="YIELD", percentage={"value": 2.5})
    product.identifiers.add(value="product", type="SMILES")
    reaction.provenance.doi = "10.0000/test.foo"
    dataset = dataset_pb2.Dataset(dataset_id="test_dataset",
                                  reactions=[reaction])
    dataset_filename = (tmp_path / "test.pb").as_posix()
    message_helpers.write_message(dataset, dataset_filename)
    yield dataset_filename
    # Remove the test database.
    connection = connect(ord_interface.client.POSTGRES_DB)
    connection.set_session(autocommit=True)
    with connection.cursor() as cursor:
        cursor.execute("DROP DATABASE test;")
    connection.close()
def sync_reviews():
    """Imports all current pull requests into the datasets table.

    These datasets have two extra pieces of metadata: a GitHub PR number
    and the PR title text. These are encoded into the dataset name in
    Postgres using delimiters.
    """
    if flask.g.user_id != REVIEWER:
        return flask.redirect('/')
    client = github.Github()
    repo = client.get_repo('Open-Reaction-Database/ord-data')
    user_id = flask.g.user_id
    with flask.g.db.cursor() as cursor:
        # First reset all datasets under review.
        query = psycopg2.sql.SQL('DELETE FROM datasets WHERE user_id=%s')
        cursor.execute(query, [REVIEWER])
        # Then import all datasets from open PRs.
        for pr in repo.get_pulls():
            for remote in pr.get_files():
                response = requests.get(remote.raw_url)
                if remote.filename.endswith('.pbtxt'):
                    dataset = dataset_pb2.Dataset()
                    text_format.Parse(response.text, dataset)
                elif remote.filename.endswith('.pb'):
                    dataset = dataset_pb2.Dataset.FromString(response.content)
                else:
                    continue
                # Strip the extension; slicing with [:-6] would only be
                # correct for '.pbtxt', not '.pb'.
                prefix = os.path.splitext(remote.filename)[0]
                name = 'PR_%d ___%s___ %s' % (pr.number, pr.title, prefix)
                query = psycopg2.sql.SQL(
                    'INSERT INTO datasets VALUES (%s, %s, %s)')
                cursor.execute(query,
                               [user_id, name, serialize_for_db(dataset)])
    flask.g.db.commit()
    return flask.redirect('/review')
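# serialize_for_db is referenced above but not defined here; a minimal
# sketch, assuming the datasets table stores binary-serialized protos in
# a bytea column (an assumption; it could equally store text format):
def serialize_for_db_sketch(dataset):
    return psycopg2.Binary(dataset.SerializeToString(deterministic=True))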
def write_dataset(name):
    """Inserts a protobuf including upload tokens into the datasets table."""
    dataset = dataset_pb2.Dataset()
    dataset.ParseFromString(flask.request.get_data())
    resolve_tokens(dataset)
    put_dataset(name, dataset)
    return 'ok'
def test_add_dataset_with_existing_reaction_ids(self):
    reaction = reaction_pb2.Reaction()
    ethylamine = reaction.inputs['ethylamine']
    component = ethylamine.components.add()
    component.identifiers.add(type='SMILES', value='CCN')
    component.is_limiting = reaction_pb2.Boolean.TRUE
    component.moles.value = 2
    component.moles.units = reaction_pb2.Moles.MILLIMOLE
    reaction.outcomes.add().conversion.value = 25
    reaction_id = 'ord-10aed8b5dffe41fab09f5b2cc9c58ad9'
    reaction.reaction_id = reaction_id
    reaction.provenance.record_created.time.value = '2020-01-01 11 am'
    dataset = dataset_pb2.Dataset(reactions=[reaction])
    dataset_filename = os.path.join(self.test_subdirectory, 'test.pbtxt')
    message_helpers.write_message(dataset, dataset_filename)
    filenames = self._run_main()
    self.assertLen(filenames, 2)
    self.assertFalse(os.path.exists(dataset_filename))
    filenames.pop(filenames.index(self.dataset_filename))
    self.assertLen(filenames, 1)
    dataset = message_helpers.load_message(filenames[0], dataset_pb2.Dataset)
    # Check that existing record IDs for added datasets are not overridden.
    self.assertEqual(dataset.reactions[0].reaction_id, reaction_id)
    self.assertLen(dataset.reactions[0].provenance.record_modified, 0)
def setUp(self):
    super().setUp()
    self.test_subdirectory = tempfile.mkdtemp(dir=flags.FLAGS.test_tmpdir)
    os.chdir(self.test_subdirectory)
    subprocess.run(['git', 'init'], check=True)
    subprocess.run(
        ['git', 'config', '--local', 'user.email', 'test@ord-schema'],
        check=True)
    subprocess.run(
        ['git', 'config', '--local', 'user.name', 'Test Runner'],
        check=True)
    # Add some initial data.
    reaction = reaction_pb2.Reaction()
    methylamine = reaction.inputs['methylamine']
    component = methylamine.components.add()
    component.identifiers.add(type='SMILES', value='CN')
    component.is_limiting = reaction_pb2.Boolean.TRUE
    component.moles.value = 1
    component.moles.units = reaction_pb2.Moles.MILLIMOLE
    reaction.outcomes.add().conversion.value = 75
    reaction.provenance.record_created.time.value = '2020-01-01'
    reaction.reaction_id = 'ord-10aed8b5dffe41fab09f5b2cc9c58ad9'
    dataset_id = 'ord_dataset-64b14868c5cd46dd8e75560fd3589a6b'
    dataset = dataset_pb2.Dataset(reactions=[reaction],
                                  dataset_id=dataset_id)
    # Make sure the initial dataset is valid.
    validations.validate_message(dataset)
    os.makedirs(os.path.join('data', '64'))
    self.dataset_filename = os.path.join(self.test_subdirectory, 'data',
                                         '64', f'{dataset_id}.pbtxt')
    message_helpers.write_message(dataset, self.dataset_filename)
    subprocess.run(['git', 'add', 'data'], check=True)
    subprocess.run(['git', 'commit', '-m', 'Initial commit'], check=True)
def init_db():
    """Ensures the db/ directory exists and contains at least one Dataset."""
    if not os.path.isdir('db'):
        os.mkdir('db')
    if os.listdir('db'):
        return
    dataset = dataset_pb2.Dataset()
    put_dataset('dataset', dataset)
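# put_dataset is referenced above but not defined here; a minimal sketch,
# assuming it writes text-format protos under db/ to mirror how
# get_dataset and read_dataset below read 'db/%s.pbtxt':
def put_dataset_sketch(file_name, dataset):
    with open('db/%s.pbtxt' % file_name, 'wt') as pbtxt:
        pbtxt.write(text_format.MessageToString(dataset))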
def test_main_pass(self):
    dataset = dataset_pb2.Dataset()
    reaction = dataset.reactions.add()
    component = reaction.inputs['test'].components.add()
    component.identifiers.add(value='c1ccccc1', type='SMILES')
    message_helpers.write_message(dataset, self.pb_filename)
    message_helpers.write_message(dataset, self.pbtxt_filename)
    self._run()
def _download_dataset(self, name):
    """Downloads an existing dataset."""
    response = self.client.get(f'/dataset/{name}/download',
                               follow_redirects=True)
    self.assertEqual(response.status_code, 200)
    dataset = dataset_pb2.Dataset()
    text_format.Parse(response.data, dataset)
    return dataset
def test_delete_reaction_id_blank(self):
    name = 'test'
    dataset = dataset_pb2.Dataset(reaction_ids=['', 'test', ''])
    self._upload_dataset(dataset, name)
    response = self.client.get(f'/dataset/{name}/delete/reaction_id',
                               follow_redirects=True)
    self.assertEqual(response.status_code, 200)
    downloaded_dataset = self._download_dataset(name)
    self.assertLen(downloaded_dataset.reaction_ids, 2)
def _get_dataset() -> dataset_pb2.Dataset:
    """Returns a Dataset for testing."""
    dataset = dataset_pb2.Dataset()
    with open(os.path.join(TESTDATA, "nielsen_fig1_dataset.pbtxt"),
              "rt") as f:
        text_format.Parse(f.read(), dataset)
    # Add some unicode to check for encoding/decoding robustness.
    # From https://en.wikipedia.org/wiki/Atlantis.
    dataset.reactions[0].provenance.city = "Ἀτλαντὶς νῆσος"
    return dataset
def test_read_dataset(client):
    name = "test"
    dataset = _get_dataset()
    _upload_dataset(client, dataset, name)
    response = client.get(f"/dataset/proto/read/{name}",
                          follow_redirects=True)
    assert response.status_code == 200
    downloaded_dataset = dataset_pb2.Dataset()
    downloaded_dataset.ParseFromString(response.data)
    assert downloaded_dataset == dataset
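# _upload_dataset is used by several of these tests but not shown; a
# minimal sketch, assuming a proto-write endpoint symmetric to the
# /dataset/proto/read/<name> route exercised above (the write route is
# an assumption):
def _upload_dataset(client, dataset, name):
    response = client.post(f"/dataset/proto/write/{name}",
                           data=dataset.SerializeToString(),
                           follow_redirects=True)
    assert response.status_code == 200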
def test_delete_reaction_id_blank(client):
    name = "test"
    dataset = dataset_pb2.Dataset(reaction_ids=["", "test", ""])
    _upload_dataset(client, dataset, name)
    response = client.get(f"/dataset/{name}/delete/reaction_id",
                          follow_redirects=True)
    assert response.status_code == 200
    downloaded_dataset = _download_dataset(client, name)
    assert len(downloaded_dataset.reaction_ids) == 2
def read_dataset(file_name):
    """Returns a Dataset as a serialized protobuf."""
    dataset = dataset_pb2.Dataset()
    with open('db/%s.pbtxt' % file_name, 'rb') as pbtxt:
        text_format.Parse(pbtxt.read(), dataset)
    # 'bites' avoids shadowing the built-in 'bytes'.
    bites = dataset.SerializeToString(deterministic=True)
    response = flask.make_response(bites)
    response.headers.set('Content-Type', 'application/protobuf')
    return response
def get_dataset(file_name):
    """Reads a .pbtxt file from the db/ directory and parses it."""
    with lock(file_name):
        if ('%s.pbtxt' % file_name) not in os.listdir('db'):
            return None
        dataset = dataset_pb2.Dataset()
        with open('db/%s.pbtxt' % file_name, 'rb') as pbtxt:
            text_format.Parse(pbtxt.read(), dataset)
        return dataset
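# The lock() helper used above is not shown; a minimal sketch using a
# POSIX advisory file lock (an assumed implementation; fcntl is
# Unix-only):
import fcntl
from contextlib import contextmanager

@contextmanager
def lock_sketch(file_name):
    with open('db/%s.lock' % file_name, 'w') as lock_file:
        fcntl.flock(lock_file, fcntl.LOCK_EX)
        try:
            yield
        finally:
            fcntl.flock(lock_file, fcntl.LOCK_UN)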
def test_add_sharded_dataset_with_validation_errors(self):
    reaction = reaction_pb2.Reaction()
    ethylamine = reaction.inputs['ethylamine']
    component = ethylamine.components.add()
    component.identifiers.add(type='SMILES', value='CCN')
    component.is_limiting = True
    component.moles.value = 2
    component.moles.units = reaction_pb2.Moles.MILLIMOLE
    reaction.outcomes.add().conversion.value = 25
    dataset1 = dataset_pb2.Dataset(reactions=[reaction])
    dataset1_filename = os.path.join(self.test_subdirectory, 'test1.pbtxt')
    message_helpers.write_message(dataset1, dataset1_filename)
    reaction.inputs['ethylamine'].components[0].identifiers[0].value = 'C#O'
    dataset2 = dataset_pb2.Dataset(reactions=[reaction])
    dataset2_filename = os.path.join(self.test_subdirectory, 'test2.pbtxt')
    message_helpers.write_message(dataset2, dataset2_filename)
    with self.assertRaisesRegex(ValueError, 'could not validate SMILES'):
        self._run_main()
def test_multiple_dois(self):
    dataset = dataset_pb2.Dataset()
    dataset.dataset_id = 'ord_dataset-1'
    dataset.reactions.add().provenance.doi = 'foo/bar'
    dataset.reactions.add().provenance.doi = 'not/bar'
    tempdir = self.create_tempdir()
    message_helpers.write_message(
        dataset, os.path.join(tempdir, f'{dataset.dataset_id}.pb'))
    with flagsaver.flagsaver(input=os.path.join(tempdir, '*.pb')):
        list_dois.main(())
def test_read_dataset(self):
    name = 'test'
    dataset = self._get_dataset()
    self._upload_dataset(dataset, name)
    response = self.client.get(f'/dataset/proto/read/{name}',
                               follow_redirects=True)
    self.assertEqual(response.status_code, 200)
    downloaded_dataset = dataset_pb2.Dataset()
    downloaded_dataset.ParseFromString(response.data)
    self.assertEqual(downloaded_dataset, dataset)
def test_main_fail(self):
    dataset = dataset_pb2.Dataset()
    reaction = dataset.reactions.add()
    component = reaction.inputs['test'].components.add()
    component.identifiers.add(value='c1ccccc1', type='SMILES')
    message_helpers.write_message(dataset, self.pb_filename)
    component.identifiers.add(value='benzene', type='NAME')
    message_helpers.write_message(dataset, self.pbtxt_filename)
    with self.assertRaisesRegex(ValueError, 'Datasets differ'):
        self._run()
def test_bad_dataset_id(self):
    dataset = dataset_pb2.Dataset(reactions=[reaction_pb2.Reaction()],
                                  dataset_id='not-a-real-dataset-id')
    filename = os.path.join(self.test_subdirectory, 'test.pbtxt')
    message_helpers.write_message(dataset, filename)
    with flagsaver.flagsaver(root=self.test_subdirectory,
                             input_pattern=filename,
                             validate=False,
                             update=True):
        with self.assertRaisesRegex(ValueError, 'malformed dataset ID'):
            process_dataset.main(())
def test_delete_reaction_id(client):
    name = "test"
    dataset = dataset_pb2.Dataset()
    reaction_id = "test_reaction_id"
    dataset.reaction_ids.append(reaction_id)
    _upload_dataset(client, dataset, name)
    response = client.get(f"/dataset/{name}/delete/reaction_id/{reaction_id}",
                          follow_redirects=True)
    assert response.status_code == 200
    downloaded_dataset = _download_dataset(client, name)
    assert len(downloaded_dataset.reaction_ids) == 0
def test_delete_reaction_id(self):
    name = 'test'
    dataset = dataset_pb2.Dataset()
    reaction_id = 'test_reaction_id'
    dataset.reaction_ids.append(reaction_id)
    self._upload_dataset(dataset, name)
    response = self.client.get(
        f'/dataset/{name}/delete/reaction_id/{reaction_id}',
        follow_redirects=True)
    self.assertEqual(response.status_code, 200)
    downloaded_dataset = self._download_dataset(name)
    self.assertEmpty(downloaded_dataset.reaction_ids)