def test_forge_aggregate_sources():
    """Aggregating a whole source must page past the normal query limit."""
    forge = Forge(index="mdf")
    records = forge.aggregate_sources("nist_xps_db")
    assert isinstance(records, list)
    # nist_xps_db has well over 10k records, so aggregation must exceed it
    assert len(records) > 10000
    assert isinstance(records[0], dict)
def test_forge_globus_download():
    """globus_download transfers single and multi-file results via Globus."""
    forge = Forge(index="mdf")

    # Basic single-file transfer into the working directory
    forge.globus_download(example_result1)
    assert os.path.exists("./test_fetch.txt")
    os.remove("./test_fetch.txt")

    # dest + preserve_dir should recreate the source directory layout
    dest_path = os.path.expanduser("~/mdf")
    forge.globus_download(example_result1, dest=dest_path, preserve_dir=True)
    nested = os.path.join(dest_path, "test", "test_fetch.txt")
    assert os.path.exists(nested)
    os.remove(nested)
    os.rmdir(os.path.join(dest_path, "test"))

    # Entries with multiple files should transfer every file
    multi_names = ("test_fetch.txt", "test_multifetch.txt")
    for multi_result in (example_result2, example_result3):
        forge.globus_download(multi_result, dest=dest_path)
        for fname in multi_names:
            assert os.path.exists(os.path.join(dest_path, fname))
        for fname in multi_names:
            os.remove(os.path.join(dest_path, fname))
def __init__(self, no_local_server, anonymous, test):
    """Store the connection flags and build the underlying MDF Forge client."""
    self.no_local_server = no_local_server
    self.anonymous = anonymous
    self.test = test
    # The Forge client does the actual talking to MDF Search/Transfer
    self.mdf = Forge(no_local_server=no_local_server,
                     anonymous=anonymous,
                     test=test)
def test_forge_search_by_titles():
    """search_by_titles matches on exact (quoted) and partial dataset titles."""
    forge = Forge(index="mdf")

    # An exact quoted title should be the top hit
    exact = ['"High-throughput Ab-initio Dilute Solute Diffusion Database"']
    exact_hits = forge.search_by_titles(exact)
    assert check_field(
        exact_hits, "dc.titles.[].title",
        "High-throughput Ab-initio Dilute Solute Diffusion Database") == 0

    # A generic keyword should match many titles, including NIST XPS
    keyword_hits = forge.search_by_titles(["Database"])
    assert check_field(keyword_hits, "dc.titles.[].title",
                       "NIST X-ray Photoelectron Spectroscopy Database") == 2
def test_forge_search_by_elements():
    """search_by_elements must equal chaining the individual match helpers."""
    forge = Forge(index="mdf")
    elements = ["Cu", "Al"]
    sources = ["oqmd", "nist_xps_db"]

    chained, chained_info = (forge.match_source_names(sources)
                                  .match_elements(elements)
                                  .search(limit=10000, info=True))
    helper, helper_info = forge.search_by_elements(elements, sources,
                                                   limit=10000, info=True)

    # Same result sets regardless of ordering
    assert all(r in helper for r in chained)
    assert all(r in chained for r in helper)
    assert check_field(chained, "material.elements", "Al") == 1
    assert check_field(chained, "mdf.source_name", "oqmd") == 2
def __init__(self, no_browser=False, no_local_server=False, search_index="mdf-test", **data):
    """Authenticate against Globus services and construct the MDF/DLHub clients.

    Args:
        no_browser: Do not open a browser for the Globus login flow.
        no_local_server: Do not start a local server for the login flow.
        search_index: MDF Search index the Forge client should target.
        **data: Passed through to the parent (Pydantic model) constructor.
    """
    super().__init__(**data)
    # The funcX/DLHub scope URL is both a requested service and an auth key,
    # so bind it once rather than repeating the literal four times.
    funcx_scope = ("https://auth.globus.org/scopes/"
                   "facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all")
    auths = mdf_toolbox.login(
        services=["data_mdf", "search", "petrel", "transfer", "dlhub",
                  funcx_scope],
        app_name="Foundry",
        make_clients=True,
        no_browser=no_browser,
        no_local_server=no_local_server,
    )
    self.forge_client = Forge(
        index=search_index,
        services=None,
        search_client=auths["search"],
        transfer_client=auths["transfer"],
        data_mdf_authorizer=auths["data_mdf"],
        petrel_authorizer=auths["petrel"],
    )
    self.dlhub_client = DLHubClient(
        dlh_authorizer=auths["dlhub"],
        search_client=auths["search"],
        fx_authorizer=auths[funcx_scope],
        force_login=False,
    )
    self.xtract_tokens = {
        "auth_token": auths["petrel"].access_token,
        "transfer_token": auths["transfer"].authorizer.access_token,
        "funx_token": auths[funcx_scope].access_token,
    }
def test_forge_chaining():
    """Building a query across separate calls must equal a single fluent chain."""
    forge = Forge(index="mdf")

    forge.match_field("source_name", "cip")
    forge.match_field("material.elements", "Al")
    stepwise = forge.search()

    fluent = (forge.match_field("source_name", "cip")
                   .match_field("material.elements", "Al")
                   .search())

    # Same result sets regardless of ordering
    assert all(r in fluent for r in stepwise)
    assert all(r in stepwise for r in fluent)
def test_forge_match_resource_types():
    """match_resource_types filters hits by mdf.resource_type."""
    forge = Forge(index="mdf")

    # A single type: every hit is a record
    forge.match_resource_types("record")
    records = forge.search(limit=10)
    assert check_field(records, "mdf.resource_type", "record") == 0

    # Two other types: records are excluded entirely
    forge.match_resource_types(["collection", "dataset"])
    non_records = forge.search()
    assert check_field(non_records, "mdf.resource_type", "record") == -1

    # Empty input is a chainable no-op
    assert forge.match_resource_types("") == forge
def test_forge_anonymous(capsys):
    """Anonymous Forge can search/aggregate but refuses authenticated downloads."""
    forge = Forge(anonymous=True)

    # Search works without credentials
    hits = forge.search("mdf.source_name:ab_initio_solute_database",
                        advanced=True, limit=300)
    assert len(hits) == 300

    # Aggregation works without credentials
    assert len(forge.aggregate("mdf.source_name:nist_xps_db")) > 10000

    # Auth-only helpers must fail loudly rather than attempt the download
    assert forge.http_download({})["success"] is False
    out, err = capsys.readouterr()
    assert "Error: Anonymous HTTP download not yet supported." in out

    assert forge.globus_download({})["success"] is False
    out, err = capsys.readouterr()
    assert "Error: Anonymous Globus Transfer not supported." in out

    # http_stream yields one failure entry, then terminates
    stream = forge.http_stream({})
    assert next(stream)["success"] is False
    out, err = capsys.readouterr()
    assert "Error: Anonymous HTTP download not yet supported." in out
    with pytest.raises(StopIteration):
        next(stream)
def test_forge_http_stream(capsys):
    """http_stream yields file contents lazily and surfaces errors in-band."""
    forge = Forge(index="mdf")
    doc_one = "This is a test document for Forge testing. Please do not remove.\n"
    doc_two = "This is a second test document for Forge testing. Please do not remove.\n"

    # A single result yields a generator with one payload
    stream = forge.http_stream(example_result1)
    assert isinstance(stream, types.GeneratorType)
    assert next(stream) == doc_one

    # Multi-file results yield each file, entry by entry, in order
    for multi_result in (example_result2, example_result3):
        stream = forge.http_stream((multi_result, {"info": {}}))
        assert isinstance(stream, types.GeneratorType)
        for expected in (doc_one, doc_two, doc_one, doc_two):
            assert next(stream) == expected

    # More than 10,000 results is refused up front
    stream = forge.http_stream(list(range(10001)))
    assert next(stream)["success"] is False
    out, err = capsys.readouterr()
    assert "Too many results supplied. Use globus_download()" in out
    with pytest.raises(StopIteration):
        next(stream)

    # A 404 yields None, prints an error, and writes nothing to disk
    assert next(forge.http_stream(example_result_missing)) is None
    out, err = capsys.readouterr()
    assert not os.path.exists("./should_not_exist.txt")
    assert ("Error 404 when attempting to access "
            "'https://data.materialsdatafacility.org/test/should_not_exist.txt'"
            ) in out
def test_forge_match_organizations():
    """match_organizations filters by the mdf.organizations field."""
    forge = Forge(index="mdf")

    # A single organization
    forge.match_organizations("NIST")
    nist_hits = forge.search()
    assert nist_hits != []
    assert check_field(nist_hits, "mdf.organizations", "NIST") == 1

    # Several organizations, any-match semantics
    forge.match_organizations(["NIST", "PRISMS"], match_all=False)
    multi_hits = forge.search()
    assert check_field(multi_hits, "mdf.organizations", "PRISMS") == 2
    assert check_field(multi_hits, "mdf.organizations", "NIST") == 2

    # Empty input is a chainable no-op
    assert forge.match_organizations("") == forge
def test_forge_match_elements():
    """match_elements filters by material.elements."""
    forge = Forge(index="mdf")

    # A single element
    forge.match_elements("Al")
    al_hits = forge.search()
    assert al_hits != []
    assert check_field(al_hits, "material.elements", "Al") in (0, 1)

    # Several elements: all must be present in each hit
    forge.match_elements(["Al", "Cu"])
    alloy_hits = forge.search()
    assert check_field(alloy_hits, "material.elements", "Al") == 1
    assert check_field(alloy_hits, "material.elements", "Cu") == 1

    # Empty input is a chainable no-op
    assert forge.match_elements("") == forge
def test_forge_match_titles():
    """match_titles filters by dataset title, singly or in groups."""
    forge = Forge(index="mdf")

    # One quoted title
    single = '"High-throughput Ab-initio Dilute Solute Diffusion Database"'
    single_hits = forge.match_titles(single).search()
    assert single_hits != []
    assert check_field(
        single_hits, "dc.titles.[].title",
        "High-throughput Ab-initio Dilute Solute Diffusion Database") == 0

    # Several titles at once
    several = ['"High-throughput Ab-initio Dilute Solute Diffusion Database"',
               '"Khazana (VASP)"']
    several_hits = forge.match_titles(several).search()
    assert several_hits != []
    assert check_field(several_hits, "dc.titles.[].title",
                       "Khazana (VASP)") == 2

    # Empty input is a chainable no-op
    assert forge.match_titles("") == forge
def test_forge_test_match_records():
    """match_records selects specific records; source versions are stripped."""
    forge = Forge(index="mdf")

    # A single record
    forge.match_records("cip", 1006)
    hits = forge.search()
    assert len(hits) == 1
    assert check_field(hits, "mdf.source_name", "cip") == 0
    assert check_field(hits, "mdf.scroll_id", 1006) == 0

    # Several records; the version suffix on the source name is ignored
    forge.match_records("cip_v3.4", [1006, 1002])
    hits = forge.search()
    assert len(hits) == 2
    assert check_field(hits, "mdf.source_name", "cip") == 0
    assert check_field(hits, "mdf.scroll_id", 1006) == 2

    # Empty input is a chainable no-op
    assert forge.match_records("", "") == forge
def test_forge_match_source_names():
    """match_source_names filters by mdf.source_name; versions are stripped."""
    forge = Forge(index="mdf")

    # A single source
    forge.match_source_names("khazana_vasp")
    khazana_hits = forge.search()
    assert khazana_hits != []
    assert check_field(khazana_hits, "mdf.source_name", "khazana_vasp") == 0

    # Several sources; version suffixes are stripped before matching
    forge.match_source_names(["khazana_vasp", "ta_melting_v3.4"])
    combined_hits = forge.search()
    # The single-source hits must be a strict subset of the combined hits
    assert len(combined_hits) > len(khazana_hits)
    assert all(hit in combined_hits for hit in khazana_hits)
    assert check_field(combined_hits, "mdf.source_name", "ta_melting") == 2

    # Empty input is a chainable no-op
    assert forge.match_source_names("") == forge
def test_forge_match_dois():
    """match_dois filters by dc.identifier.identifier."""
    forge = Forge(index="mdf")

    # A single DOI
    forge.match_dois("https://dx.doi.org/10.13011/M3B36G")
    single_hits = forge.search()
    assert single_hits != []
    assert check_field(single_hits, "dc.identifier.identifier",
                       "https://dx.doi.org/10.13011/M3B36G") == 0

    # Several DOIs; the single-DOI hits are a strict subset of the result
    forge.match_dois(["https://dx.doi.org/10.13011/M3B36G", "10.18126/M23P9G"])
    multi_hits = forge.search()
    assert len(multi_hits) > len(single_hits)
    assert all(hit in multi_hits for hit in single_hits)
    assert check_field(multi_hits, "dc.identifier.identifier",
                       "10.18126/M23P9G") == 2

    # Empty input is a chainable no-op
    assert forge.match_dois("") == forge
def test_forge_match_source_names():
    """Variant of the source-name test using a non-interactive Forge client.

    NOTE(review): removed leftover debug statements — two
    ``os.system('echo ...')`` calls and a bare ``assert True`` — which
    contributed nothing to the test. Also note this function shadows an
    earlier test of the same name in this file, so only this definition
    is collected by pytest; consider renaming one of them.
    """
    f = Forge(index="mdf", no_local_server=True, no_browser=True)
    # One source
    f.match_source_names("khazana_vasp")
    res1 = f.search()
    assert res1 != []
    assert check_field(res1, "mdf.source_name", "khazana_vasp") == 0
    # Multi-source, strip version info
    f.match_source_names(["khazana_vasp", "ta_melting_v3.4"])
    res2 = f.search()
    # res1 is a subset of res2
    assert len(res2) > len(res1)
    assert all(r1 in res2 for r1 in res1)
    assert check_field(res2, "mdf.source_name", "ta_melting") == 2
    # No source
    assert f.match_source_names("") == f
def test_get_dataset_version():
    """get_dataset_version agrees with the dataset entry and rejects unknowns."""
    forge = Forge()

    # The helper must report the same version as the OQMD dataset entry
    hits = forge.search('mdf.source_name:oqmd AND mdf.resource_type:dataset',
                        advanced=True, limit=1)
    assert hits[0]['mdf']['version'] == forge.get_dataset_version('oqmd')

    # Unknown source names raise a ValueError
    with pytest.raises(ValueError):
        forge.get_dataset_version('notreal')
class FoundryDatasets():
    """
    Class to download datasets hosted on Materials Data Facility

    Args:
        no_local_server: (bool), whether or not the server is local. Set to True if running on e.g. Google Colab
        anonymous: (bool), whether to use your MDF user or be anonymous. Some functionality may be disabled if True
        test: (bool), whether to be in test mode. Some functionality may be disabled if True

    Methods:
        download_data: downloads specified data from MDF and saves to current directory
            Args:
                name: (str), name of the dataset to download
                doi: (str), digital object identifier of the dataset to download
                download: (bool), whether or not to download the full dataset
            Returns:
                None
    """
    def __init__(self, no_local_server, anonymous, test):
        self.no_local_server = no_local_server
        self.anonymous = anonymous
        self.test = test
        # Forge client that performs the actual MDF lookups/downloads
        self.mdf = Forge(no_local_server=self.no_local_server,
                         anonymous=self.anonymous,
                         test=self.test)

    def download_data(self, name=None, doi=None, download=False):
        if name is not None:
            self.mdf.match_source_names(name)
        elif doi is not None:
            self.mdf.match_dois(doi)
        else:
            print('ERROR: please specify either the dataset name or DOI for lookup MDF')
            # Fixed: previously fell through and ran a search with a
            # stale/empty query; bail out instead.
            return
        result = self.mdf.search()
        if len(result) == 1:
            print('Successfully found the desired dataset on MDF')
            print('MDF entry:')
            pprint(result)
            if download:  # fixed: was `download == True`
                print('Downloading dataset from MDF')
                self.mdf.globus_download(results=result)
        else:
            # Fixed: previously failed silently when the query matched
            # zero or multiple datasets.
            print('WARNING: query matched {} datasets; expected exactly one, '
                  'so nothing was downloaded'.format(len(result)))
        return
def test_forge_search_by_dois():
    """search_by_dois returns the entry whose DOI matches exactly."""
    forge = Forge(index="mdf")
    hits = forge.search_by_dois("https://dx.doi.org/10.13011/M3B36G")
    assert check_field(hits, "dc.identifier.identifier",
                       "https://dx.doi.org/10.13011/M3B36G") == 0
def test_forge_match_years(capsys):
    """match_years handles single years, year lists, ranges, and bad input."""
    forge = Forge(index="mdf")

    # A single year (string input matches the integer field)
    res_single = forge.match_years("2015").search()
    assert res_single != []
    assert check_field(res_single, "dc.publicationYear", 2015) == 0

    # Mixed string/int year list
    res_multi = forge.match_years(years=["2015", 2016]).search()
    assert check_field(res_multi, "dc.publicationYear", 2016) == 2

    # Non-numeric input raises with a descriptive message
    with pytest.raises(AttributeError) as excinfo:
        forge.match_years(["20x5"]).search()
    assert "Invalid year: '20x5'" in str(excinfo.value)
    with pytest.raises(AttributeError) as excinfo:
        forge.match_years(start="20x5").search()
    assert "Invalid start year: '20x5'" in str(excinfo.value)
    with pytest.raises(AttributeError) as excinfo:
        forge.match_years(stop="20x5").search()
    assert "Invalid stop year: '20x5'" in str(excinfo.value)

    # Calling with no arguments adds no filter
    forge.match_years()
    assert forge.current_query() == ""

    # Inclusive single-year range behaves like a direct match
    res_incl = forge.match_years(start=2015, stop=2015, inclusive=True).search()
    assert check_field(res_incl, "dc.publicationYear", 2015) == 0

    # Exclusive range keeps only the interior years
    res_excl = forge.match_years(start=2014, stop=2017, inclusive=False).search()
    for year, expected in ((2013, -1), (2014, -1), (2015, 2),
                           (2016, 2), (2017, -1)):
        assert check_field(res_excl, "dc.publicationYear", year) == expected

    # An empty exclusive range yields nothing
    assert forge.match_years(start=2015, stop=2015, inclusive=False).search() == []
def test_describe_field(capsys):
    """describe_field returns schemas raw or pretty-prints them to stdout."""
    forge = Forge()

    # Whole-resource schema (raw=True for ease of checking)
    res = forge.describe_field("dataset", raw=True)
    assert res["success"]
    assert "dc" in res["schema"]["properties"].keys()
    assert res["schema"]["properties"]["mdf"]["properties"]["source_id"]

    # Narrowed to a single field
    res = forge.describe_field("dataset", field="dc", raw=True)
    assert "mdf" not in res["schema"]["properties"].keys()
    assert "titles" in res["schema"]["properties"].keys()

    # "list" is special-cased to return the available field names
    res = forge.describe_field("list", raw=True)
    assert isinstance(res["schema"], list)
    assert "mdf" in res["schema"]

    # Default mode prints to stdout
    forge.describe_field("record")
    out, err = capsys.readouterr()
    assert "- custom" in out
    forge.describe_field("record", field="mdf")
    out, err = capsys.readouterr()
    assert "- custom" not in out
    assert "- source_id" in out

    # Unknown resource_type: error in the raw result...
    res = forge.describe_field("notexists", raw=True)
    assert res["success"] is False
    assert res["schema"] is None
    assert res["error"].startswith("Error 404")
    # ...and printed in default mode
    forge.describe_field("notexists")
    out, err = capsys.readouterr()
    assert "Error 404" in out

    # Unknown field path: error in the raw result...
    res = forge.describe_field("dataset", field="foo.bar", raw=True)
    assert res["success"] is False
    assert res["schema"] is None
    assert res["error"].startswith("Error: Field 'foo' (from 'foo.bar')")
    # ...and printed in default mode
    forge.describe_field("dataset", field="foo.bar")
    out, err = capsys.readouterr()
    assert "Error: Field 'foo' (from 'foo.bar')" in out
def generate_stats(raw=False, return_all=False, many_cutoff=100):
    """Generates statistics on datasets in MDF Search.

    Arguments:
        raw (bool): When False, will print stats to stdout and display
                a progress bar.
                When True, will return a dict of stats and will not display
                a progress bar.
                Default False.
        return_all (bool): When False or when raw is False, generate summary
                statistics. When True and raw is True, return the dataset
                source_ids for each category. Extremely verbose.
                Default False.
        many_cutoff (int): The number of records required to be considered
                "many" records. This value is inclusive.
                Default 100.

    Returns:
        dict: Stats, when raw is True (else these are printed)
    """
    mdf = Forge()
    dataset_list = mdf.match_resource_types("dataset").search()
    # Fixed: convert the cutoff once instead of on every loop iteration
    many_cutoff = int(many_cutoff)

    all_datasets = []
    num_records = 0
    zero_records = []
    one_record = []
    multiple_records = []
    many_records = []
    # One Search query per dataset; the progress bar is suppressed in raw mode
    for ds in tqdm(dataset_list, disable=raw):
        source_id = ds["mdf"]["source_id"]
        record_count = mdf.match_resource_types("record") \
                          .match_source_names(source_id) \
                          .search(limit=0, info=True)[1]["total_query_matches"]
        all_datasets.append((source_id, record_count))
        num_records += record_count
        if record_count == 0:
            zero_records.append(source_id)
        elif record_count == 1:
            one_record.append(source_id)
        elif record_count > 1:
            multiple_records.append(source_id)
        if record_count >= many_cutoff:
            many_records.append(source_id)

    if raw:
        returnable = {
            "all_datasets_count": len(all_datasets),
            "all_records_count": num_records,
            "zero_records_count": len(zero_records),
            "one_record_count": len(one_record),
            "multiple_records_count": len(multiple_records),
            "many_records_count": len(many_records),
            "one_or_more_count": len(one_record) + len(multiple_records),
        }
        if return_all:
            returnable["all_datasets"] = all_datasets
            returnable["zero_records"] = zero_records
            returnable["one_record"] = one_record
            returnable["multiple_records"] = multiple_records
            returnable["many_records"] = many_records
            returnable["one_or_more"] = one_record + multiple_records
        return returnable
    else:
        print("MDF Search Statistics")
        print("---------------------")
        print("Total datasets:", len(all_datasets))
        print("Total records:", num_records)
        print("Datasets with zero records:", len(zero_records))
        print("Datasets with any records: ",
              len(one_record) + len(multiple_records))
        print("{}% of datasets have records".format(
            int((len(one_record) + len(multiple_records))
                / len(all_datasets) * 100)))
        print()
        print("Datasets with exactly one record: ", len(one_record))
        print("Datasets with more than one record: ", len(multiple_records))
        print("Datasets with more than", many_cutoff, "records:",
              len(many_records))
        print()
        return
def test_forge_http_download(capsys):
    """http_download fetches files over HTTPS and handles error cases."""
    forge = Forge(index="mdf")

    # A single file into the working directory
    forge.http_download(example_result1)
    assert os.path.exists("./test_fetch.txt")
    # Name collisions get a "(n)" suffix instead of overwriting
    forge.http_download(example_result1)
    assert os.path.exists("./test_fetch(1).txt")
    forge.http_download(example_result1)
    assert os.path.exists("./test_fetch(2).txt")
    for local in ("./test_fetch.txt", "./test_fetch(1).txt",
                  "./test_fetch(2).txt"):
        os.remove(local)

    # dest + preserve_dir (and a (results, info) tuple) recreates the layout
    dest_path = os.path.expanduser("~/mdf")
    forge.http_download(([example_result1], {"info": None}),
                        dest=dest_path, preserve_dir=True)
    nested = os.path.join(dest_path, "test", "test_fetch.txt")
    assert os.path.exists(nested)
    os.remove(nested)
    os.rmdir(os.path.join(dest_path, "test"))

    # Entries with multiple files download every file
    multi_names = ("test_fetch.txt", "test_multifetch.txt",
                   "petrel_fetch.txt", "petrel_multifetch.txt")
    for multi_result in (example_result2, example_result3):
        forge.http_download(multi_result, dest=dest_path)
        for fname in multi_names:
            assert os.path.exists(os.path.join(dest_path, fname))
        for fname in multi_names:
            os.remove(os.path.join(dest_path, fname))

    # More than 10,000 results is refused up front
    assert forge.http_download(list(range(10001)))["success"] is False
    out, err = capsys.readouterr()
    assert "Too many results supplied. Use globus_download()" in out

    # A 404 prints an error and writes nothing to disk
    forge.http_download(example_result_missing)
    out, err = capsys.readouterr()
    assert not os.path.exists("./should_not_exist.txt")
    assert ("Error 404 when attempting to access "
            "'https://data.materialsdatafacility.org/test/should_not_exist.txt'"
            ) in out

    # Dataset entries cannot be fetched over HTTPS
    forge.http_download(example_dataset)
    out, err = capsys.readouterr()
    assert not os.path.exists(os.path.join(dest_path, "petrel_fetch.txt"))
    assert ("Skipping datset entry for 'foobar_v1': Cannot download dataset over HTTPS. "
            "Use globus_download() for datasets.") in out

    # Unknown resource types are skipped with a message
    forge.http_download(example_bad_resource)
    out, err = capsys.readouterr()
    assert "Error: Found unknown resource_type 'foobar'. Skipping entry." in out
def test_forge_fetch_datasets_from_results():
    """fetch_datasets_from_results maps record hits back to their datasets."""
    forge = Forge(index="mdf")

    # Assorted record/dataset results to map from
    oqmd_record = forge.search(
        "mdf.source_name:oqmd AND mdf.resource_type:record",
        advanced=True, limit=1)
    oqmd_record_info = forge.search(
        "mdf.source_name:oqmd AND mdf.resource_type:record",
        advanced=True, limit=1, info=True)
    khazana_records = forge.search(
        "mdf.source_name:khazana_vasp AND mdf.resource_type:record",
        advanced=True, limit=2)
    xps_dataset_res = forge.search(
        "mdf.source_name:nist_xps_db AND mdf.resource_type:dataset",
        advanced=True)

    # The dataset entries those records should resolve to
    oqmd = forge.search("mdf.source_name:oqmd AND mdf.resource_type:dataset",
                        advanced=True)[0]
    khazana_vasp = forge.search(
        "mdf.source_name:khazana_vasp AND mdf.resource_type:dataset",
        advanced=True)[0]

    # A bare record entry resolves to its dataset
    res = forge.fetch_datasets_from_results(oqmd_record[0])
    assert mdf_toolbox.insensitive_comparison(res[0], oqmd)

    # A (results, info) tuple works the same way
    res = forge.fetch_datasets_from_results(oqmd_record_info)
    assert mdf_toolbox.insensitive_comparison(res[0], oqmd)

    # Records from two sources resolve to exactly two distinct datasets
    res = forge.fetch_datasets_from_results(oqmd_record + khazana_records)
    assert len(res) == 2
    assert oqmd in res
    assert khazana_vasp in res

    # Dataset entries pass through unchanged
    res = forge.fetch_datasets_from_results(xps_dataset_res)
    assert mdf_toolbox.insensitive_comparison(res, xps_dataset_res)

    # With no argument, the current query's results are used
    forge.match_source_names("nist_xps_db")
    assert forge.fetch_datasets_from_results() == xps_dataset_res

    # Unknown resource types resolve to nothing
    unknown_entry = {"mdf": {"resource_type": "unknown"}}
    assert forge.fetch_datasets_from_results(unknown_entry) == []
def test_describe_organization(capsys):
    """describe_organization returns or pretty-prints organization metadata."""
    forge = Forge()

    # Raw single-organization lookup
    res = forge.describe_organization("Argonne National Laboratory", raw=True)
    assert res["success"]
    assert isinstance(res["organization"], dict)
    assert res["organization"]["canonical_name"] == "Argonne National Laboratory"
    assert "ANL" in res["organization"]["aliases"]

    # "list" returns just the canonical names
    res = forge.describe_organization("list", raw=True)
    assert isinstance(res["organization"], list)
    assert "Center for Hierarchical Materials Design" in res["organization"]

    # "all" returns the full record for every organization
    res = forge.describe_organization("all", raw=True)
    assert isinstance(res["organization"], list)
    assert isinstance(res["organization"][0], dict)

    # Default mode pretty-prints; alias lookups resolve to the full record
    forge.describe_organization("CHiMaD")
    out, err = capsys.readouterr()
    assert "canonical_name: Center for Hierarchical Materials Design" in out
    assert "CHiMaD" in out
    assert "public" in out

    # Printed list contains canonical names only
    forge.describe_organization("list")
    out, err = capsys.readouterr()
    assert "Center for Hierarchical Materials Design" in out
    assert "CHiMaD" not in out
    assert "Argonne National Laboratory" in out
    assert "ANL" not in out

    # summary=True trims the output but keeps names and aliases
    forge.describe_organization("chimad", summary=True)
    out, err = capsys.readouterr()
    assert "canonical_name: Center for Hierarchical Materials Design" not in out
    assert "Center for Hierarchical Materials Design" in out
    assert "CHiMaD" in out
    assert "public" not in out

    # Unknown organizations produce a 404 in both modes
    res = forge.describe_organization("foobar", raw=True)
    assert res["success"] is False
    assert "Error 404" in res["error"]
    assert res["status_code"] == 404
    res = forge.describe_organization("foobar")
    out, err = capsys.readouterr()
    assert "Error 404" in out
class Foundry(FoundryMetadata):
    """Foundry Client Base Class

    Client for discovering, downloading, and publishing Foundry datasets
    indexed in the Materials Data Facility (MDF), and for invoking published
    models via DLHub.

    TODO:
    -------
    Add Docstring
    """

    # transfer_client: Any
    dlhub_client: Any
    forge_client: Any
    # connect_client: #Add this back in later, not necessary for current functionality
    xtract_tokens: Any

    def __init__(self,
                 no_browser=False,
                 no_local_server=False,
                 search_index="mdf-test",
                 **data):
        """Authenticate against Globus Auth and build the service clients.

        Args:
            no_browser (bool): Do not open a browser for the auth flow
            no_local_server (bool): Do not run a local server to receive
                the auth code (use copy/paste flow instead)
            search_index (str): MDF search index to query
            **data: Forwarded to the FoundryMetadata constructor
        """
        super().__init__(**data)
        auths = mdf_toolbox.login(
            services=[
                "data_mdf",
                "search",
                "petrel",
                "transfer",
                "dlhub",
                "https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all",
            ],
            app_name="Foundry",
            make_clients=True,
            no_browser=no_browser,
            no_local_server=no_local_server,
        )
        self.forge_client = Forge(
            index=search_index,
            services=None,
            search_client=auths["search"],
            transfer_client=auths["transfer"],
            data_mdf_authorizer=auths["data_mdf"],
            petrel_authorizer=auths["petrel"],
        )

        self.dlhub_client = DLHubClient(
            dlh_authorizer=auths["dlhub"],
            search_client=auths["search"],
            fx_authorizer=auths[
                "https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all"],
            force_login=False,
        )

        # Raw tokens used by the Xtract HTTPS download path in download().
        self.xtract_tokens = {
            'auth_token': auths['petrel'].access_token,
            'transfer_token': auths['transfer'].authorizer.access_token,
            'funx_token': auths[
                'https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all']
            .access_token
        }

    def load(self, name, download=True, globus=True, verbose=False, **kwargs):
        """Load the metadata for a Foundry dataset into the client

        Args:
            name (str): Name of the foundry dataset
            download (bool): If True, download the data associated with the
                package (default is True)
            globus (bool): If True, download via Globus Transfer; otherwise
                use the Xtract HTTPS path
            verbose (bool): If True, print progress information

        Keyword Args:
            interval (int): How often to poll Globus to check if transfers
                are complete

        Returns
        -------
            (Foundry) a new Foundry client populated with the dataset metadata
        """
        # MDF specific logic: find the one Foundry dataset with this source_id.
        res = self.forge_client.match_field(
            "mdf.organizations", "foundry").match_resource_types("dataset")
        res = res.match_field("mdf.source_id", name).search()

        res = res[0]
        # Promote the Foundry project block to the top-level "dataset" key
        # expected by FoundryMetadata, then drop the original location.
        res["dataset"] = res["projects"]["foundry"]
        res["dataset"]["type"] = res["dataset"]["package_type"]
        del res["projects"]["foundry"]

        # NOTE(review): rebinds the local name only; the calling instance is
        # not mutated — callers must use the returned client.
        self = Foundry(**res)

        if download is True:  # Add check for package existence
            self.download(interval=kwargs.get("interval", 10),
                          globus=globus,
                          verbose=verbose)

        return self

    def list(self):
        """List available Foundry data packages

        Returns
        -------
            (pandas.DataFrame): DataFrame with summary list of Foundry
                data packages including name, title, and publication year
        """
        res = (self.forge_client.match_field(
            "mdf.organizations",
            "foundry").match_resource_types("dataset").search())

        return pd.DataFrame([{
            "source_id": r["mdf"]["source_id"],
            "name": r["dc"]["titles"][0]["title"],
            "year": r["dc"].get("publicationYear", None),
        } for r in res])

    def get_packages(self, paths=False):
        """Get available local data packages

        Args:
            paths (bool): If True return paths in addition to package,
                if False return package name only

        Returns
        -------
            (list): List describing local Foundry packages
        """
        pkg_paths = glob.glob(self.config.local_cache_dir + "/*/")
        if paths:
            return [{
                "path": path,
                "package": path.split("/")[-2]
            } for path in pkg_paths]
        else:
            return [path.split("/")[-2] for path in pkg_paths]

    def collect_dataframes(self, inputs=None, outputs=None, packages=None):
        """Collect dataframes of local data packages

        Args:
            inputs (list): List of strings for input columns
            outputs (list): List of strings for output columns
            packages: Unused; retained for interface compatibility

        Returns
        -------
            (pandas.DataFrame): Collected dataframe with specified
                inputs and outputs, or the full dataframe if either
                selection is empty
        """
        # Avoid mutable default arguments; None behaves identically to []
        # in the truthiness check below.
        frame_files = glob.glob(self.config.local_cache_dir + "/*/*dataframe*",
                                recursive=True)
        frames = []
        for frame in frame_files:
            df_tmp = pd.read_json(frame)
            df_tmp["source"] = frame
            frames.append(df_tmp)
        df = pd.concat(frames)

        if inputs and outputs:
            return df[inputs], df[outputs]
        else:
            return df

    def run(self, name, inputs, **kwargs):
        """Run a model on data

        Args:
            name (str): DLHub model name
            inputs: Data to send to DLHub as inputs (should be JSON
                serializable)

        Returns
        -------
             Returns results after invocation via the DLHub service

        TODO:
        -------
        - Pass **kwargs through to DLHub client and document kwargs
        """
        return self.dlhub_client.run(name, inputs=inputs)

    def load_data(self, source_id=None, globus=True):
        """Load in the data associatedated with the prescribed dataset

        Tabular Data Type: the data are read from the standard data frame
        stored in self.config.dataframe_file and split into input and
        output columns.

        File (hdf5) Data Type: the named input/output datasets are read
        from self.config.data_file.

        For more complicated data structures, users should subclass
        Foundry and override the load_data function.

        Args:
            source_id (str): Optional source_id identifying which local
                package to read; defaults to the loaded dataset
            globus (bool): Unused here; retained for interface compatibility

        Returns
        -------
            (tuple): Tuple of X, y values
        """
        if source_id:
            path = os.path.join(self.config.local_cache_dir, source_id)
        else:
            path = os.path.join(self.config.local_cache_dir,
                                self.mdf["source_id"])

        # Handle Foundry-defined types.
        if self.dataset.type.value == "tabular":
            # If the file is not local, fetch the contents with Globus
            # Check if the contents are local
            # TODO: Add hashes and versioning to metadata and checking to the file
            try:
                self.dataset.dataframe = pd.read_json(
                    os.path.join(path, self.config.dataframe_file))
            except ValueError:
                # Fall back to JSON-lines format; pandas raises ValueError
                # when the file is not a single JSON document.
                self.dataset.dataframe = pd.read_json(os.path.join(
                    path, self.config.dataframe_file),
                                                      lines=True)

            return (
                self.dataset.dataframe[self.dataset.inputs],
                self.dataset.dataframe[self.dataset.outputs],
            )
        elif self.dataset.type.value == "hdf5":
            f = h5py.File(os.path.join(path, self.config.data_file), "r")
            inputs = [f[i] for i in self.dataset.inputs]
            outputs = [f[i] for i in self.dataset.outputs]
            return (inputs, outputs)
        else:
            raise NotImplementedError

    def describe(self):
        """Print the DataCite block and dataset metadata to stdout."""
        print("DC:{}".format(self.dc))
        print("Dataset:{}".format(self.dataset.json(exclude={"dataframe"})))

    def publish(self, foundry_metadata, update=False, **kwargs):
        """Submit a data package for publication

        Args:
            foundry_metadata (dict): Foundry project block for the dataset
            update (bool): True if this is an update to a prior data package
                (default: False)

        Keyword Args:
            title (str): Title of the data package
            authors (list): List of data package author names
                e.g., Jack Black or Nunez, Victoria
            affiliations (list): List of author affiliations
            tags (list): List of tags to apply to the data package
            data_sources (list): List of data source locations

        Returns
        -------
        (dict) MDF Connect Response: Response from MDF Connect to allow
                tracking of dataset
        """
        self.connect_client.create_dc_block(
            title=kwargs["title"],
            authors=kwargs["authors"],
            affiliations=kwargs.get("affiliations", []),
            subjects=kwargs.get("tags", ["machine learning", "foundry"]),
        )
        self.connect_client.add_organization("Foundry")
        self.connect_client.set_project_block("foundry", foundry_metadata)
        self.connect_client.add_data_source(kwargs.get("data_sources", []))

        res = self.connect_client.submit_dataset(update=update)
        return res

    def from_file(self, file=None):
        """Create a Foundry client from a file

        Args:
            file (str): Path to the file containing metadata
                (default: self.config.metadata_file)

        Returns
        -------
        (Foundry): a newly instantiated Foundry client
        """
        if file is None:
            file = self.config.metadata_file
        with open("./{}".format(file)) as fp:
            obj = json.load(fp)
            return Foundry(**obj)

    def to_file(self, file=None):
        """Save Foundry client metadata to a file

        Args:
            file (str): Path to the file to save metadata to
                (default: self.config.metadata_file)

        Returns
        -------
        (Foundry) self: for chaining
        """
        if file is None:
            file = self.config.metadata_file
        # Bug fix: the file must be opened for writing; the previous
        # default read mode made json.dump raise io.UnsupportedOperation.
        with open("./{}".format(file), "w") as fp:
            json.dump(self.json(exclude={"dlhub_client", "forge_client"}), fp)
        return self

    def configure(self, **kwargs):
        """Rebuild self.config from keyword arguments; returns self for chaining."""
        self.config = FoundryConfig(**kwargs)
        return self

    def download(self, globus=True, verbose=False, **kwargs):
        """Download the loaded dataset into the local cache directory.

        Args:
            globus (bool): If True, transfer via Globus; otherwise crawl the
                dataset with the Xtract service and fetch files over HTTPS
            verbose (bool): If True, print progress information

        Keyword Args:
            interval (int): Globus transfer polling interval in seconds

        Returns
        -------
        (Foundry) self: for chaining
        """
        # Check if the dir already exists; skip the download if so.
        path = os.path.join(self.config.local_cache_dir,
                            self.mdf["source_id"])
        if os.path.isdir(path):
            return self

        res = self.forge_client.search(
            "mdf.source_id:{name}".format(name=self.mdf["source_id"]),
            advanced=True)
        if globus:
            self.forge_client.globus_download(
                res,
                dest=self.config.local_cache_dir,
                dest_ep=self.config.destination_endpoint,
                interval=kwargs.get("interval", 20),
                download_datasets=True,
            )
        else:
            source_id = self.mdf['source_id']
            xtract_base_url = "http://xtract-crawler-4.eba-ghixpmdf.us-east-1.elasticbeanstalk.com"

            # MDF Materials Data at NCSA
            source_ep_id = "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec"
            base_url = "https://data.materialsdatafacility.org"
            folder_to_crawl = f"/foundry/{source_id}/"

            # This only matters if you want files grouped together.
            grouper = "matio"

            auth_token = self.xtract_tokens['auth_token']
            transfer_token = self.xtract_tokens['transfer_token']
            # Bug fix: the FuncX token is stored under 'funx_token';
            # previously the transfer token was sent in the FuncX header.
            funcx_token = self.xtract_tokens['funx_token']

            headers = {
                'Authorization': f"Bearer {auth_token}",
                'Transfer': transfer_token,
                'FuncX': funcx_token,
                'Petrel': auth_token
            }
            if verbose:
                print(f"Headers: {headers}")

            # Initialize the crawl. This kicks off the Globus EP crawling service on the backend.
            crawl_url = f'{xtract_base_url}/crawl'
            if verbose:
                print(f"Crawl URL is : {crawl_url}")
            crawl_req = requests.post(crawl_url,
                                      json={
                                          'repo_type': "GLOBUS",
                                          'eid': source_ep_id,
                                          'dir_path': folder_to_crawl,
                                          'Transfer': transfer_token,
                                          'Authorization': funcx_token,
                                          'grouper': grouper,
                                          'https_info': {
                                              'base_url': base_url
                                          }
                                      })
            if verbose:
                print('Crawl response:', crawl_req)
            crawl_id = json.loads(crawl_req.content)['crawl_id']
            if verbose:
                print(f"Crawl ID: {crawl_id}")

            # Wait for the crawl to finish before we can start fetching our metadata.
            while True:
                crawl_status = requests.get(
                    f'{xtract_base_url}/get_crawl_status',
                    json={'crawl_id': crawl_id})
                if verbose:
                    print(crawl_status)
                crawl_content = json.loads(crawl_status.content)
                if verbose:
                    print(f"Crawl Status: {crawl_content}")
                if crawl_content['crawl_status'] == 'SUCCEEDED':
                    files_crawled = crawl_content['files_crawled']
                    if verbose:
                        print("Our crawl has succeeded!")
                    break
                else:
                    if verbose:
                        print("Sleeping before re-polling...")
                    time.sleep(2)

            # Now we fetch our metadata. Here you can configure n to be maximum number of
            # messages you want at once.
            file_ls = []
            fetched_files = 0
            while fetched_files < files_crawled:
                fetch_mdata = requests.get(f'{xtract_base_url}/fetch_crawl_mdata',
                                           json={
                                               'crawl_id': crawl_id,
                                               'n': 2
                                           })
                fetch_content = json.loads(fetch_mdata.content)

                for file_path in fetch_content['file_ls']:
                    file_ls.append(file_path)
                    fetched_files += 1

                if fetch_content['queue_empty']:
                    if verbose:
                        print("Queue is empty! Continuing...")
                    time.sleep(2)

            source_path = os.path.join(self.config.local_cache_dir,
                                       self.mdf['source_id'])

            if not os.path.exists(self.config.local_cache_dir):
                os.mkdir(self.config.local_cache_dir)
                os.mkdir(source_path)
            elif not os.path.exists(source_path):
                os.mkdir(source_path)

            num_cores = multiprocessing.cpu_count()

            def download_file(file):
                # The MDF data server's certificate is not always verifiable;
                # suppress the warning triggered by verify=False below.
                requests.packages.urllib3.disable_warnings(
                    InsecureRequestWarning)

                url = 'https://data.materialsdatafacility.org' + file['path']
                # Bug fix: write into the source_path created above instead of
                # a hard-coded 'data/' prefix, which only worked when
                # local_cache_dir happened to be 'data'.
                destination = os.path.join(
                    source_path,
                    file['path'][file['path'].rindex("/") + 1:])

                response = requests.get(url, verify=False)

                with open(destination, 'wb') as f:
                    f.write(response.content)

                return {file['path'] + ' status': True}

            results = Parallel(n_jobs=num_cores)(delayed(download_file)(file)
                                                 for file in file_ls)

            print('Done curling.')
            print(results)

        return self
# -*- coding: utf-8 -*-
"""
@Project : formationEPres
@Author  : Xu-Shan Zhao
@Filename: mdfOqmdRetrieval202005280940.py
@IDE     : PyCharm
@Time1   : 2020-05-28 09:40:27
@Time2   : 2020/5/28 9:40
@Month1  : 5月
@Month2  : 五月
"""
import pymongo
from mdf_forge import Forge

# Retrieve every record of the OQMD dataset from the Materials Data
# Facility. aggregate_sources is used (rather than search) because it
# is not subject to the search result size limit.
mdf = Forge()
dataset_name = 'oqmd'
ro = mdf.aggregate_sources(dataset_name)

# Load the records into a local MongoDB collection one at a time so a
# single bad document does not abort the whole load; report the index
# and reason of any failed insert for later retry.
client = pymongo.MongoClient(host='localhost', port=27017)
collection = client['MDF_datasets']['oqmd']
for i, record in enumerate(ro):
    try:
        collection.insert_one(record)
    except pymongo.errors.PyMongoError as exc:
        print(i, exc)
client.add_service("mrr") # Make the source name "nist_MDR_[item_number]" to make retrieval easy client.set_source_name("mdr_item_{}".format(item_id)) return client def _make_failure(req): return RuntimeError('Problem connecting with {}. HTTP Status Code: {}'.format(req.url, req.status_code)) if __name__ == "__main__": # Make the client client = MDFConnectClient() forge = Forge() # Create an array to store the source_id source_ids = [] # Loop through all items for item in tqdm(get_all_publications()): # Check if we have done it already if has_been_submitted(item, forge): continue # If not, ready the client to submit prepare_client_submission(item, client) # Skip if no data if len(client.data) == 0: