def test_forge_match_resource_types():
    """match_resource_types filters search hits by mdf.resource_type."""
    forge = Forge(index="mdf")
    # A single type: every hit must carry that resource_type.
    forge.match_resource_types("record")
    records = forge.search(limit=10)
    assert check_field(records, "mdf.resource_type", "record") == 0
    # Two types: no hit may be a plain record any more.
    forge.match_resource_types(["collection", "dataset"])
    mixed = forge.search()
    assert check_field(mixed, "mdf.resource_type", "record") == -1
    # An empty argument is a no-op that returns the Forge for chaining.
    assert forge.match_resource_types("") == forge
def test_forge_anonymous(capsys):
    """Anonymous Forge clients can search/aggregate but not download."""
    forge = Forge(anonymous=True)
    # Search works without authentication.
    hits = forge.search("mdf.source_name:ab_initio_solute_database",
                        advanced=True, limit=300)
    assert len(hits) == 300
    # Aggregation works without authentication.
    assert len(forge.aggregate("mdf.source_name:nist_xps_db")) > 10000
    # Auth-only helpers must fail gracefully and explain why.
    # http_download
    assert forge.http_download({})["success"] is False
    out, err = capsys.readouterr()
    assert "Error: Anonymous HTTP download not yet supported." in out
    # globus_download
    assert forge.globus_download({})["success"] is False
    out, err = capsys.readouterr()
    assert "Error: Anonymous Globus Transfer not supported." in out
    # http_stream yields a single failure record, then stops.
    stream = forge.http_stream({})
    assert next(stream)["success"] is False
    out, err = capsys.readouterr()
    assert "Error: Anonymous HTTP download not yet supported." in out
    with pytest.raises(StopIteration):
        next(stream)
def test_forge_match_organizations():
    """match_organizations filters search hits by mdf.organizations."""
    forge = Forge(index="mdf")
    # A single organization.
    forge.match_organizations("NIST")
    nist_hits = forge.search()
    assert nist_hits != []
    assert check_field(nist_hits, "mdf.organizations", "NIST") == 1
    # Several organizations, matching any rather than all.
    forge.match_organizations(["NIST", "PRISMS"], match_all=False)
    multi_hits = forge.search()
    assert check_field(multi_hits, "mdf.organizations", "PRISMS") == 2
    assert check_field(multi_hits, "mdf.organizations", "NIST") == 2
    # An empty argument is a no-op that returns the Forge for chaining.
    assert forge.match_organizations("") == forge
def test_forge_match_elements():
    """match_elements filters search hits by material.elements."""
    forge = Forge(index="mdf")
    # A single element.
    forge.match_elements("Al")
    al_hits = forge.search()
    assert al_hits != []
    assert check_field(al_hits, "material.elements", "Al") in (0, 1)
    # Multiple elements must all be present in each hit.
    forge.match_elements(["Al", "Cu"])
    alloy_hits = forge.search()
    assert check_field(alloy_hits, "material.elements", "Al") == 1
    assert check_field(alloy_hits, "material.elements", "Cu") == 1
    # An empty argument is a no-op that returns the Forge for chaining.
    assert forge.match_elements("") == forge
def test_forge_chaining():
    """Chained match_field calls build the same query as separate calls."""
    forge = Forge(index="mdf")
    forge.match_field("source_name", "cip")
    forge.match_field("material.elements", "Al")
    stepwise = forge.search()
    chained = (forge.match_field("source_name", "cip")
                    .match_field("material.elements", "Al")
                    .search())
    # Both query-building styles must return exactly the same hits.
    assert all(r in chained for r in stepwise)
    assert all(r in stepwise for r in chained)
def test_forge_test_match_records():
    """match_records pins search hits to specific source/scroll_id pairs."""
    forge = Forge(index="mdf")
    # A single record.
    forge.match_records("cip", 1006)
    hits = forge.search()
    assert len(hits) == 1
    assert check_field(hits, "mdf.source_name", "cip") == 0
    assert check_field(hits, "mdf.scroll_id", 1006) == 0
    # Several records; version suffixes on source names are stripped.
    forge.match_records("cip_v3.4", [1006, 1002])
    hits = forge.search()
    assert len(hits) == 2
    assert check_field(hits, "mdf.source_name", "cip") == 0
    assert check_field(hits, "mdf.scroll_id", 1006) == 2
    # Empty arguments are a no-op that returns the Forge for chaining.
    assert forge.match_records("", "") == forge
def test_forge_match_source_names():
    """match_source_names filters search hits by mdf.source_name."""
    forge = Forge(index="mdf")
    # A single source.
    forge.match_source_names("khazana_vasp")
    single = forge.search()
    assert single != []
    assert check_field(single, "mdf.source_name", "khazana_vasp") == 0
    # Several sources; version suffixes are stripped from the names.
    forge.match_source_names(["khazana_vasp", "ta_melting_v3.4"])
    combined = forge.search()
    # The single-source hits must be a strict subset of the combined hits.
    assert len(combined) > len(single)
    assert all(hit in combined for hit in single)
    assert check_field(combined, "mdf.source_name", "ta_melting") == 2
    # An empty argument is a no-op that returns the Forge for chaining.
    assert forge.match_source_names("") == forge
def test_get_dataset_version():
    """get_dataset_version reports the current version of a dataset."""
    forge = Forge()
    # The reported version must match the dataset entry's own version field.
    hits = forge.search('mdf.source_name:oqmd AND mdf.resource_type:dataset',
                        advanced=True, limit=1)
    assert hits[0]['mdf']['version'] == forge.get_dataset_version('oqmd')
    # An unknown source_name raises ValueError.
    with pytest.raises(ValueError):
        forge.get_dataset_version('notreal')
def test_forge_match_dois():
    """match_dois filters search hits by dc.identifier.identifier."""
    forge = Forge(index="mdf")
    # A single DOI.
    forge.match_dois("https://dx.doi.org/10.13011/M3B36G")
    single = forge.search()
    assert single != []
    assert check_field(single, "dc.identifier.identifier",
                       "https://dx.doi.org/10.13011/M3B36G") == 0
    # Several DOIs.
    forge.match_dois(["https://dx.doi.org/10.13011/M3B36G",
                      "10.18126/M23P9G"])
    combined = forge.search()
    # The single-DOI hits must be a strict subset of the combined hits.
    assert len(combined) > len(single)
    assert all(hit in combined for hit in single)
    assert check_field(combined, "dc.identifier.identifier",
                       "10.18126/M23P9G") == 2
    # An empty argument is a no-op that returns the Forge for chaining.
    assert forge.match_dois("") == forge
def test_forge_match_source_names():
    """match_source_names works with no_local_server/no_browser auth flow.

    NOTE(review): this duplicates the name of another test in this file, so
    pytest only collects the later definition — consider renaming one of them.
    Fixed here: removed leftover debug shell calls (`os.system('echo ...')`)
    and a vacuous `assert True`.
    """
    f = Forge(index="mdf", no_local_server=True, no_browser=True)
    # One source
    f.match_source_names("khazana_vasp")
    res1 = f.search()
    assert res1 != []
    assert check_field(res1, "mdf.source_name", "khazana_vasp") == 0
    # Multi-source, strip version info
    f.match_source_names(["khazana_vasp", "ta_melting_v3.4"])
    res2 = f.search()
    # res1 is a subset of res2
    assert len(res2) > len(res1)
    assert all(r1 in res2 for r1 in res1)
    assert check_field(res2, "mdf.source_name", "ta_melting") == 2
    # No source
    assert f.match_source_names("") == f
class FoundryDatasets():
    """
    Class to download datasets hosted on Materials Data Facility

    Args:
        no_local_server: (bool), whether or not the server is local. Set to
            True if running on e.g. Google Colab
        anonymous: (bool), whether to use your MDF user or be anonymous.
            Some functionality may be disabled if True
        test: (bool), whether to be in test mode. Some functionality may be
            disabled if True

    Methods:
        download_data: downloads specified data from MDF and saves to
            current directory
    """

    def __init__(self, no_local_server, anonymous, test):
        self.no_local_server = no_local_server
        self.anonymous = anonymous
        self.test = test
        self.mdf = Forge(no_local_server=self.no_local_server,
                         anonymous=self.anonymous,
                         test=self.test)

    def download_data(self, name=None, doi=None, download=False):
        """Look up a dataset on MDF by name or DOI and optionally download it.

        Args:
            name: (str), name of the dataset to download
            doi: (str), digital object identifier of the dataset to download
            download: (bool), whether or not to download the full dataset

        Returns:
            None
        """
        if name is not None:
            self.mdf.match_source_names(name)
        elif doi is not None:
            self.mdf.match_dois(doi)
        else:
            # Bug fix: the original printed this error but then ran an
            # unconstrained search anyway; bail out early instead.
            print('ERROR: please specify either the dataset name or DOI for lookup MDF')
            return
        result = self.mdf.search()
        if len(result) == 1:
            print('Successfully found the desired dataset on MDF')
            print('MDF entry:')
            pprint(result)
            if download:
                print('Downloading dataset from MDF')
                self.mdf.globus_download(results=result)
        return
def test_forge_fetch_datasets_from_results():
    """fetch_datasets_from_results maps record entries to their datasets."""
    forge = Forge(index="mdf")
    # Record from OQMD
    oqmd_record = forge.search(
        "mdf.source_name:oqmd AND mdf.resource_type:record",
        advanced=True, limit=1)
    # Record from OQMD with info
    oqmd_record_info = forge.search(
        "mdf.source_name:oqmd AND mdf.resource_type:record",
        advanced=True, limit=1, info=True)
    # Records from JANAF
    khazana_records = forge.search(
        "mdf.source_name:khazana_vasp AND mdf.resource_type:record",
        advanced=True, limit=2)
    # Dataset for NIST XPS DB
    xps_dataset = forge.search(
        "mdf.source_name:nist_xps_db AND mdf.resource_type:dataset",
        advanced=True)
    # The dataset entries the records should map back to.
    oqmd = forge.search("mdf.source_name:oqmd AND mdf.resource_type:dataset",
                        advanced=True)[0]
    khazana_vasp = forge.search(
        "mdf.source_name:khazana_vasp AND mdf.resource_type:dataset",
        advanced=True)[0]
    # Fetch a single dataset from one record.
    fetched = forge.fetch_datasets_from_results(oqmd_record[0])
    assert mdf_toolbox.insensitive_comparison(fetched[0], oqmd)
    # Fetch a dataset from results that include info wrappers.
    fetched = forge.fetch_datasets_from_results(oqmd_record_info)
    assert mdf_toolbox.insensitive_comparison(fetched[0], oqmd)
    # Fetch multiple datasets from mixed records.
    fetched = forge.fetch_datasets_from_results(oqmd_record + khazana_records)
    assert len(fetched) == 2
    assert oqmd in fetched
    assert khazana_vasp in fetched
    # A dataset entry maps to itself.
    fetched = forge.fetch_datasets_from_results(xps_dataset)
    assert mdf_toolbox.insensitive_comparison(fetched, xps_dataset)
    # With no argument, the current query's results are used.
    forge.match_source_names("nist_xps_db")
    assert forge.fetch_datasets_from_results() == xps_dataset
    # Unknown resource types yield nothing.
    unknown_entry = {"mdf": {"resource_type": "unknown"}}
    assert forge.fetch_datasets_from_results(unknown_entry) == []
class Foundry(FoundryMetadata):
    """Foundry Client Base Class

    Wraps an MDF Forge client and a DLHub client to load, list, download,
    publish, and run models against Foundry data packages.

    Fixes applied in review:
    - ``download`` previously sent the *transfer* token as the FuncX token
      (``xtract_tokens['transfer_token']``) while the stored
      ``'funx_token'`` was never used; it now uses ``'funx_token'``.
    - ``to_file`` opened the target file in read mode and then called
      ``json.dump``; it now opens with ``"w"``.
    - A bare ``except:`` in ``load_data`` was narrowed to ``ValueError``.
    """

    # transfer_client: Any
    dlhub_client: Any
    forge_client: Any
    # connect_client: #Add this back in later, not necessary for current functionality
    xtract_tokens: Any

    def __init__(self,
                 no_browser=False,
                 no_local_server=False,
                 search_index="mdf-test",
                 **data):
        super().__init__(**data)
        auths = mdf_toolbox.login(
            services=[
                "data_mdf",
                "search",
                "petrel",
                "transfer",
                "dlhub",
                "https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all",
            ],
            app_name="Foundry",
            make_clients=True,
            no_browser=no_browser,
            no_local_server=no_local_server,
        )
        self.forge_client = Forge(
            index=search_index,
            services=None,
            search_client=auths["search"],
            transfer_client=auths["transfer"],
            data_mdf_authorizer=auths["data_mdf"],
            petrel_authorizer=auths["petrel"],
        )
        self.dlhub_client = DLHubClient(
            dlh_authorizer=auths["dlhub"],
            search_client=auths["search"],
            fx_authorizer=auths[
                "https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all"],
            force_login=False,
        )
        self.xtract_tokens = {
            'auth_token': auths['petrel'].access_token,
            'transfer_token': auths['transfer'].authorizer.access_token,
            'funx_token': auths[
                'https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all']
            .access_token
        }

    def load(self, name, download=True, globus=True, verbose=False, **kwargs):
        """Load the metadata for a Foundry dataset into the client

        Args:
            name (str): Name of the foundry dataset
            download (bool): If True, download the data associated with the
                package (default is True)

        Keyword Args:
            interval (int): How often to poll Globus to check if transfers
                are complete

        Returns
        -------
            self
        """
        # MDF specific logic: Foundry datasets are MDF dataset entries
        # tagged with the "foundry" organization.
        res = self.forge_client.match_field(
            "mdf.organizations", "foundry").match_resource_types("dataset")
        res = res.match_field("mdf.source_id", name).search()

        res = res[0]
        # Promote the foundry project block to the top-level "dataset" key.
        res["dataset"] = res["projects"]["foundry"]
        res["dataset"]["type"] = res["dataset"]["package_type"]
        del res["projects"]["foundry"]

        # Rebuild the client from the fetched metadata.
        self = Foundry(**res)

        if download is True:  # Add check for package existence
            self.download(interval=kwargs.get("interval", 10),
                          globus=globus,
                          verbose=verbose)

        return self

    def list(self):
        """List available Foundry data packages

        Returns
        -------
            (pandas.DataFrame): DataFrame with summary list of Foundry data
                packages including name, title, and publication year
        """
        res = (self.forge_client.match_field(
            "mdf.organizations",
            "foundry").match_resource_types("dataset").search())
        return pd.DataFrame([{
            "source_id": r["mdf"]["source_id"],
            "name": r["dc"]["titles"][0]["title"],
            "year": r["dc"].get("publicationYear", None),
        } for r in res])

    def get_packages(self, paths=False):
        """Get available local data packages

        Args:
            paths (bool): If True return paths in addition to package, if
                False return package name only

        Returns
        -------
            (list): List describing local Foundry packages
        """
        pkg_paths = glob.glob(self.config.local_cache_dir + "/*/")
        if paths:
            return [{
                "path": path,
                "package": path.split("/")[-2]
            } for path in pkg_paths]
        else:
            return [path.split("/")[-2] for path in pkg_paths]

    def collect_dataframes(self, inputs=None, outputs=None, packages=None):
        """Collect dataframes of local data packages

        Args:
            inputs (list): List of strings for input columns
            outputs (list): List of strings for output columns

        Returns
        -------
            (pandas.DataFrame): Collected dataframe with specified inputs
                and outputs
        """
        # Mutable-default fix: treat None as "no selection" (previously
        # `inputs=[], outputs=[]`; the lists were never mutated, so this is
        # behavior-identical).
        frame_files = glob.glob(self.config.local_cache_dir + "/*/*dataframe*",
                                recursive=True)
        frames = []
        for frame in frame_files:
            df_tmp = pd.read_json(frame)
            df_tmp["source"] = frame
            frames.append(df_tmp)
        df = pd.concat(frames)

        if inputs and outputs:
            return df[inputs], df[outputs]
        else:
            return df

    def run(self, name, inputs, **kwargs):
        """Run a model on data

        Args:
            name (str): DLHub model name
            inputs: Data to send to DLHub as inputs (should be JSON
                serializable)

        Returns
        -------
             Returns results after invocation via the DLHub service

        TODO:
        -------
        - Pass **kwargs through to DLHub client and document kwargs
        """
        return self.dlhub_client.run(name, inputs=inputs)

    def load_data(self, source_id=None, globus=True):
        """Load in the data associated with the prescribed dataset

        Tabular Data Type: Data are arranged in a standard data frame
        stored in self.dataframe_file. The contents are read, and

        File Data Type: <<Add desc>>

        For more complicated data structures, users should
        subclass Foundry and override the load_data function

        Args:
            source_id (str): Optional source id; defaults to this package's
                mdf source_id

        Returns
        -------
            (tuple): Tuple of X, y values
        """
        if source_id:
            path = os.path.join(self.config.local_cache_dir, source_id)
        else:
            path = os.path.join(self.config.local_cache_dir,
                                self.mdf["source_id"])
        # Handle Foundry-defined types.
        if self.dataset.type.value == "tabular":
            # If the file is not local, fetch the contents with Globus
            # Check if the contents are local
            # TODO: Add hashes and versioning to metadata and checking to the file
            try:
                self.dataset.dataframe = pd.read_json(
                    os.path.join(path, self.config.dataframe_file))
            except ValueError:
                # Bug fix: was a bare `except:`. pd.read_json raises
                # ValueError on line-delimited JSON, so retry with lines=True
                # for only that case instead of swallowing every error.
                self.dataset.dataframe = pd.read_json(os.path.join(
                    path, self.config.dataframe_file), lines=True)

            return (
                self.dataset.dataframe[self.dataset.inputs],
                self.dataset.dataframe[self.dataset.outputs],
            )
        elif self.dataset.type.value == "hdf5":
            f = h5py.File(os.path.join(path, self.config.data_file), "r")
            inputs = [f[i[0:]] for i in self.dataset.inputs]
            outputs = [f[i[0:]] for i in self.dataset.outputs]
            return (inputs, outputs)
        else:
            raise NotImplementedError

    def describe(self):
        """Print the DataCite block and dataset metadata for this package."""
        print("DC:{}".format(self.dc))
        print("Dataset:{}".format(self.dataset.json(exclude={"dataframe"})))

    def publish(self, foundry_metadata, update=False, **kwargs):
        """Submit a data package for publication

        Args:
            foundry_metadata (dict): Path to the file containing
            update (bool): True if this is an update to a prior data package
                (default: self.config.metadata_file)

        Keyword Args:
            title (str): Title of the data package
            authors (list): List of data package author names e.g.,
                Jack Black or Nunez, Victoria
            affiliations (list): List of author affiliations
            tags (list): List of tags to apply to the data package

        Returns
        -------
        (dict) MDF Connect Response: Response from MDF Connect to allow
                tracking of dataset
        """
        # NOTE(review): connect_client is commented out in the class
        # attributes above — confirm it is set before publish() is called.
        self.connect_client.create_dc_block(
            title=kwargs["title"],
            authors=kwargs["authors"],
            affiliations=kwargs.get("affiliations", []),
            subjects=kwargs.get("tags", ["machine learning", "foundry"]),
        )
        self.connect_client.add_organization("Foundry")
        self.connect_client.set_project_block("foundry", foundry_metadata)
        self.connect_client.add_data_source(kwargs.get("data_sources", []))

        res = self.connect_client.submit_dataset(update=update)
        return res

    def from_file(self, file=None):
        """Create a Foundry client from a file

        Args:
            file (str): Path to the file containing
            (default: self.config.metadata_file)

        Returns
        -------
        (Foundry): an newly instantiated Foundry client
        """
        if file is None:
            file = self.config.metadata_file
        with open("./{}".format(file)) as fp:
            obj = json.load(fp)
        return Foundry(**obj)

    def to_file(self, file=None):
        """Save this client's metadata to a file

        Args:
            file (str): Path to the file to save metadata to
            (default: self.config.metadata_file)

        Returns
        -------
        (Foundry) self: for chaining
        """
        if file is None:
            file = self.config.metadata_file
        # Bug fix: the original opened the file in the default read mode,
        # so json.dump would fail; open for writing.
        with open("./{}".format(file), "w") as fp:
            # NOTE(review): self.json() already returns a JSON string, so
            # this writes a JSON-encoded string — confirm intended format.
            json.dump(self.json(exclude={"dlhub_client", "forge_client"}), fp)
        return self

    def configure(self, **kwargs):
        """Replace this client's configuration with a new FoundryConfig."""
        self.config = FoundryConfig(**kwargs)
        return self

    def download(self, globus=True, verbose=False, **kwargs):
        """Download this package's data, via Globus or the Xtract HTTPS flow.

        Keyword Args:
            interval (int): Globus polling interval (default 20)

        Returns
        -------
        (Foundry) self: for chaining
        """
        # Check if the dir already exists; if so, assume data is present.
        path = os.path.join(self.config.local_cache_dir,
                            self.mdf["source_id"])
        if os.path.isdir(path):
            return self

        res = self.forge_client.search(
            "mdf.source_id:{name}".format(name=self.mdf["source_id"]),
            advanced=True)
        if globus:
            self.forge_client.globus_download(
                res,
                dest=self.config.local_cache_dir,
                dest_ep=self.config.destination_endpoint,
                interval=kwargs.get("interval", 20),
                download_datasets=True,
            )
        else:
            source_id = self.mdf['source_id']
            xtract_base_url = "http://xtract-crawler-4.eba-ghixpmdf.us-east-1.elasticbeanstalk.com"

            # MDF Materials Data at NCSA
            source_ep_id = "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec"
            base_url = "https://data.materialsdatafacility.org"
            folder_to_crawl = f"/foundry/{source_id}/"

            # This only matters if you want files grouped together.
            grouper = "matio"

            auth_token = self.xtract_tokens['auth_token']
            transfer_token = self.xtract_tokens['transfer_token']
            # Bug fix: previously read 'transfer_token' here, sending the
            # transfer token as the FuncX token and leaving the stored
            # 'funx_token' unused.
            funcx_token = self.xtract_tokens['funx_token']

            headers = {
                'Authorization': f"Bearer {auth_token}",
                'Transfer': transfer_token,
                'FuncX': funcx_token,
                'Petrel': auth_token
            }
            if verbose:
                print(f"Headers: {headers}")

            # Initialize the crawl. This kicks off the Globus EP crawling service on the backend.
            crawl_url = f'{xtract_base_url}/crawl'
            if verbose:
                print(f"Crawl URL is : {crawl_url}")
            crawl_req = requests.post(crawl_url,
                                      json={
                                          'repo_type': "GLOBUS",
                                          'eid': source_ep_id,
                                          'dir_path': folder_to_crawl,
                                          'Transfer': transfer_token,
                                          'Authorization': funcx_token,
                                          'grouper': grouper,
                                          'https_info': {
                                              'base_url': base_url
                                          }
                                      })
            if verbose:
                print('Crawl response:', crawl_req)
            crawl_id = json.loads(crawl_req.content)['crawl_id']
            if verbose:
                print(f"Crawl ID: {crawl_id}")

            # Wait for the crawl to finish before we can start fetching our metadata.
            while True:
                crawl_status = requests.get(
                    f'{xtract_base_url}/get_crawl_status',
                    json={'crawl_id': crawl_id})
                if verbose:
                    print(crawl_status)
                crawl_content = json.loads(crawl_status.content)
                if verbose:
                    print(f"Crawl Status: {crawl_content}")
                if crawl_content['crawl_status'] == 'SUCCEEDED':
                    files_crawled = crawl_content['files_crawled']
                    if verbose:
                        print("Our crawl has succeeded!")
                    break
                else:
                    if verbose:
                        print("Sleeping before re-polling...")
                    time.sleep(2)

            # Now we fetch our metadata. Here you can configure n to be
            # maximum number of messages you want at once.
            file_ls = []
            fetched_files = 0
            while fetched_files < files_crawled:
                fetch_mdata = requests.get(
                    f'{xtract_base_url}/fetch_crawl_mdata',
                    json={
                        'crawl_id': crawl_id,
                        'n': 2
                    })
                fetch_content = json.loads(fetch_mdata.content)

                for file_path in fetch_content['file_ls']:
                    file_ls.append(file_path)
                    fetched_files += 1

                if fetch_content['queue_empty']:
                    if verbose:
                        print("Queue is empty! Continuing...")
                    time.sleep(2)

            source_path = os.path.join(self.config.local_cache_dir,
                                       self.mdf['source_id'])

            if not os.path.exists(self.config.local_cache_dir):
                os.mkdir(self.config.local_cache_dir)
                os.mkdir(source_path)
            elif not os.path.exists(source_path):
                os.mkdir(source_path)

            num_cores = multiprocessing.cpu_count()

            def download_file(file):
                # Best-effort HTTPS fetch of one crawled file.
                requests.packages.urllib3.disable_warnings(
                    InsecureRequestWarning)

                url = 'https://data.materialsdatafacility.org' + file['path']
                # NOTE(review): writes under a hard-coded 'data/' prefix
                # rather than the source_path created above — confirm
                # intended destination.
                destination = 'data/' + source_id + '/' + file['path'][
                    file['path'].rindex("/") + 1:]
                response = requests.get(url, verify=False)

                with open(destination, 'wb') as f:
                    f.write(response.content)

                return {file['path'] + ' status': True}

            results = Parallel(n_jobs=num_cores)(delayed(download_file)(file)
                                                 for file in file_ls)

            print('Done curling.')
            print(results)

        return self