Example 1
def test_forge_aggregate_sources():
    # Aggregation should return more than the 10,000-result search limit
    f = Forge(index="mdf")
    res1 = f.aggregate_sources("nist_xps_db")
    assert isinstance(res1, list)
    assert len(res1) > 10000
    assert isinstance(res1[0], dict)
Example 2
def test_forge_globus_download():
    f = Forge(index="mdf")
    # Simple case
    f.globus_download(example_result1)
    assert os.path.exists("./test_fetch.txt")
    os.remove("./test_fetch.txt")

    # With dest and preserve_dir
    dest_path = os.path.expanduser("~/mdf")
    f.globus_download(example_result1, dest=dest_path, preserve_dir=True)
    assert os.path.exists(os.path.join(dest_path, "test", "test_fetch.txt"))
    os.remove(os.path.join(dest_path, "test", "test_fetch.txt"))
    os.rmdir(os.path.join(dest_path, "test"))

    # With multiple files
    f.globus_download(example_result2, dest=dest_path)
    assert os.path.exists(os.path.join(dest_path, "test_fetch.txt"))
    assert os.path.exists(os.path.join(dest_path, "test_multifetch.txt"))
    os.remove(os.path.join(dest_path, "test_fetch.txt"))
    os.remove(os.path.join(dest_path, "test_multifetch.txt"))

    f.globus_download(example_result3, dest=dest_path)
    assert os.path.exists(os.path.join(dest_path, "test_fetch.txt"))
    assert os.path.exists(os.path.join(dest_path, "test_multifetch.txt"))
    os.remove(os.path.join(dest_path, "test_fetch.txt"))
    os.remove(os.path.join(dest_path, "test_multifetch.txt"))
Example 3
    def __init__(self, no_local_server, anonymous, test):
        self.no_local_server = no_local_server
        self.anonymous = anonymous
        self.test = test
        self.mdf = Forge(no_local_server=self.no_local_server,
                         anonymous=self.anonymous,
                         test=self.test)
Example 4
def test_forge_search_by_titles():
    f = Forge(index="mdf")
    titles1 = ['"High-throughput Ab-initio Dilute Solute Diffusion Database"']
    res1 = f.search_by_titles(titles1)
    assert check_field(
        res1, "dc.titles.[].title",
        "High-throughput Ab-initio Dilute Solute Diffusion Database") == 0

    titles2 = ["Database"]
    res2 = f.search_by_titles(titles2)
    assert check_field(res2, "dc.titles.[].title",
                       "NIST X-ray Photoelectron Spectroscopy Database") == 2
Example 5
def test_forge_search_by_elements():
    f = Forge(index="mdf")
    elements = ["Cu", "Al"]
    sources = ["oqmd", "nist_xps_db"]
    res1, info1 = f.match_source_names(sources).match_elements(
        elements).search(limit=10000, info=True)
    res2, info2 = f.search_by_elements(elements,
                                       sources,
                                       limit=10000,
                                       info=True)
    assert all([r in res2 for r in res1]) and all([r in res1 for r in res2])
    assert check_field(res1, "material.elements", "Al") == 1
    assert check_field(res1, "mdf.source_name", "oqmd") == 2
Example 6
    def __init__(self,
                 no_browser=False,
                 no_local_server=False,
                 search_index="mdf-test",
                 **data):
        super().__init__(**data)
        auths = mdf_toolbox.login(
            services=[
                "data_mdf",
                "search",
                "petrel",
                "transfer",
                "dlhub",
                "https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all",
            ],
            app_name="Foundry",
            make_clients=True,
            no_browser=no_browser,
            no_local_server=no_local_server,
        )

        self.forge_client = Forge(
            index=search_index,
            services=None,
            search_client=auths["search"],
            transfer_client=auths["transfer"],
            data_mdf_authorizer=auths["data_mdf"],
            petrel_authorizer=auths["petrel"],
        )

        self.dlhub_client = DLHubClient(
            dlh_authorizer=auths["dlhub"],
            search_client=auths["search"],
            fx_authorizer=auths[
                "https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all"],
            force_login=False,
        )

        self.xtract_tokens = {
            'auth_token':
            auths['petrel'].access_token,
            'transfer_token':
            auths['transfer'].authorizer.access_token,
            'funx_token':
            auths[
                'https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all']
            .access_token
        }
Example 7
def test_forge_chaining():
    f = Forge(index="mdf")
    f.match_field("source_name", "cip")
    f.match_field("material.elements", "Al")
    res1 = f.search()
    res2 = f.match_field("source_name",
                         "cip").match_field("material.elements",
                                            "Al").search()
    assert all([r in res2 for r in res1]) and all([r in res1 for r in res2])
Example 8
def test_forge_match_resource_types():
    f = Forge(index="mdf")
    # Test one type
    f.match_resource_types("record")
    res1 = f.search(limit=10)
    assert check_field(res1, "mdf.resource_type", "record") == 0

    # Test two types
    f.match_resource_types(["collection", "dataset"])
    res2 = f.search()
    assert check_field(res2, "mdf.resource_type", "record") == -1

    # Test zero types
    assert f.match_resource_types("") == f
Example 9
def test_forge_anonymous(capsys):
    f = Forge(anonymous=True)
    # Test search
    assert len(
        f.search("mdf.source_name:ab_initio_solute_database",
                 advanced=True,
                 limit=300)) == 300

    # Test aggregation
    assert len(f.aggregate("mdf.source_name:nist_xps_db")) > 10000

    # Error on auth-only functions
    # http_download
    assert f.http_download({})["success"] is False
    out, err = capsys.readouterr()
    assert "Error: Anonymous HTTP download not yet supported." in out
    # globus_download
    assert f.globus_download({})["success"] is False
    out, err = capsys.readouterr()
    assert "Error: Anonymous Globus Transfer not supported." in out
    # http_stream
    res = f.http_stream({})
    assert next(res)["success"] is False
    out, err = capsys.readouterr()
    assert "Error: Anonymous HTTP download not yet supported." in out
    with pytest.raises(StopIteration):
        next(res)
Example 10
def test_forge_http_stream(capsys):
    f = Forge(index="mdf")
    # Simple case
    res1 = f.http_stream(example_result1)
    assert isinstance(res1, types.GeneratorType)
    assert next(
        res1
    ) == "This is a test document for Forge testing. Please do not remove.\n"

    # With multiple files
    res2 = f.http_stream((example_result2, {"info": {}}))
    assert isinstance(res2, types.GeneratorType)
    assert next(
        res2
    ) == "This is a test document for Forge testing. Please do not remove.\n"
    assert next(
        res2
    ) == "This is a second test document for Forge testing. Please do not remove.\n"
    assert next(
        res2
    ) == "This is a test document for Forge testing. Please do not remove.\n"
    assert next(
        res2
    ) == "This is a second test document for Forge testing. Please do not remove.\n"

    res3 = f.http_stream((example_result3, {"info": {}}))
    assert isinstance(res3, types.GeneratorType)
    assert next(
        res3
    ) == "This is a test document for Forge testing. Please do not remove.\n"
    assert next(
        res3
    ) == "This is a second test document for Forge testing. Please do not remove.\n"
    assert next(
        res3
    ) == "This is a test document for Forge testing. Please do not remove.\n"
    assert next(
        res3
    ) == "This is a second test document for Forge testing. Please do not remove.\n"

    # Too many results
    res4 = f.http_stream(list(range(10001)))
    assert next(res4)["success"] is False
    out, err = capsys.readouterr()
    assert "Too many results supplied. Use globus_download()" in out
    with pytest.raises(StopIteration):
        next(res4)

    # "Missing" files
    assert next(f.http_stream(example_result_missing)) is None
    out, err = capsys.readouterr()
    assert not os.path.exists("./should_not_exist.txt")
    assert (
        "Error 404 when attempting to access "
        "'https://data.materialsdatafacility.org/test/should_not_exist.txt'"
    ) in out
Example 11
def test_forge_match_organizations():
    f = Forge(index="mdf")
    # One repo
    f.match_organizations("NIST")
    res1 = f.search()
    assert res1 != []
    check_val1 = check_field(res1, "mdf.organizations", "NIST")
    assert check_val1 == 1

    # Multi-repo
    f.match_organizations(["NIST", "PRISMS"], match_all=False)
    res2 = f.search()
    assert check_field(res2, "mdf.organizations", "PRISMS") == 2
    assert check_field(res2, "mdf.organizations", "NIST") == 2

    # No repos
    assert f.match_organizations("") == f
Example 12
def test_forge_match_elements():
    f = Forge(index="mdf")
    # One element
    f.match_elements("Al")
    res1 = f.search()
    assert res1 != []
    check_val1 = check_field(res1, "material.elements", "Al")
    assert check_val1 == 0 or check_val1 == 1

    # Multi-element
    f.match_elements(["Al", "Cu"])
    res2 = f.search()
    assert check_field(res2, "material.elements", "Al") == 1
    assert check_field(res2, "material.elements", "Cu") == 1

    # No elements
    assert f.match_elements("") == f
Example 13
def test_forge_match_titles():
    # One title
    f = Forge(index="mdf")
    titles1 = '"High-throughput Ab-initio Dilute Solute Diffusion Database"'
    res1 = f.match_titles(titles1).search()
    assert res1 != []
    assert check_field(
        res1, "dc.titles.[].title",
        "High-throughput Ab-initio Dilute Solute Diffusion Database") == 0

    # Multiple titles
    titles2 = [
        '"High-throughput Ab-initio Dilute Solute Diffusion Database"',
        '"Khazana (VASP)"'
    ]
    res2 = f.match_titles(titles2).search()
    assert res2 != []
    assert check_field(res2, "dc.titles.[].title", "Khazana (VASP)") == 2

    # No titles
    assert f.match_titles("") == f
Example 14
def test_forge_test_match_records():
    f = Forge(index="mdf")
    # One record
    f.match_records("cip", 1006)
    res = f.search()
    assert len(res) == 1
    assert check_field(res, "mdf.source_name", "cip") == 0
    assert check_field(res, "mdf.scroll_id", 1006) == 0

    # Multi-record, strip version info
    f.match_records("cip_v3.4", [1006, 1002])
    res = f.search()
    assert len(res) == 2
    assert check_field(res, "mdf.source_name", "cip") == 0
    assert check_field(res, "mdf.scroll_id", 1006) == 2

    # No args
    assert f.match_records("", "") == f
Example 15
def test_forge_match_source_names():
    f = Forge(index="mdf")
    # One source
    f.match_source_names("khazana_vasp")
    res1 = f.search()
    assert res1 != []
    assert check_field(res1, "mdf.source_name", "khazana_vasp") == 0

    # Multi-source, strip version info
    f.match_source_names(["khazana_vasp", "ta_melting_v3.4"])
    res2 = f.search()

    # res1 is a subset of res2
    assert len(res2) > len(res1)
    assert all([r1 in res2 for r1 in res1])
    assert check_field(res2, "mdf.source_name", "ta_melting") == 2

    # No source
    assert f.match_source_names("") == f
Example 16
def test_forge_match_dois():
    f = Forge(index="mdf")
    # One doi
    f.match_dois("https://dx.doi.org/10.13011/M3B36G")
    res1 = f.search()
    assert res1 != []
    assert check_field(res1, "dc.identifier.identifier",
                       "https://dx.doi.org/10.13011/M3B36G") == 0

    # Multiple dois
    f.match_dois(["https://dx.doi.org/10.13011/M3B36G", "10.18126/M23P9G"])
    res2 = f.search()

    # res1 is a subset of res2
    assert len(res2) > len(res1)
    assert all([r1 in res2 for r1 in res1])
    assert check_field(res2, "dc.identifier.identifier",
                       "10.18126/M23P9G") == 2

    # No doi
    assert f.match_dois("") == f
Example 17
def test_forge_match_source_names():
    os.system('echo hello')
    f = Forge(index="mdf", no_local_server=True, no_browser=True)
    os.system('echo there')
    assert True
    # One source
    f.match_source_names("khazana_vasp")
    res1 = f.search()
    assert res1 != []
    assert check_field(res1, "mdf.source_name", "khazana_vasp") == 0

    # Multi-source, strip version info
    f.match_source_names(["khazana_vasp", "ta_melting_v3.4"])
    res2 = f.search()

    # res1 is a subset of res2
    assert len(res2) > len(res1)
    assert all([r1 in res2 for r1 in res1])
    assert check_field(res2, "mdf.source_name", "ta_melting") == 2

    # No source
    assert f.match_source_names("") == f
Example 18
def test_get_dataset_version():
    # Get the version number of the OQMD
    f = Forge()
    hits = f.search('mdf.source_name:oqmd AND mdf.resource_type:dataset',
                    advanced=True,
                    limit=1)
    assert hits[0]['mdf']['version'] == f.get_dataset_version('oqmd')

    # Test invalid source_name
    with pytest.raises(ValueError):
        f.get_dataset_version('notreal')
Example 19
class FoundryDatasets():
    """
    Class to download datasets hosted on Materials Data Facility

    Args:
        no_local_server: (bool), whether to forgo using a local server for Globus authentication. Set to True if running remotely, e.g. on Google Colab

        anonymous: (bool), whether to use your MDF user or be anonymous. Some functionality may be disabled if True

        test: (bool), whether to be in test mode. Some functionality may be disabled if True

    Methods:
        download_data: downloads specified data from MDF and saves to current directory
            Args:
                name: (str), name of the dataset to download

                doi: (str), digital object identifier of the dataset to download

                download: (bool), whether or not to download the full dataset

            Returns:
                None
    """

    def __init__(self, no_local_server, anonymous, test):
        self.no_local_server = no_local_server
        self.anonymous = anonymous
        self.test = test
        self.mdf = Forge(no_local_server=self.no_local_server,
                         anonymous=self.anonymous,
                         test=self.test)

    def download_data(self, name=None, doi=None, download=False):
        if name is not None:
            self.mdf.match_source_names(name)
        elif doi is not None:
            self.mdf.match_dois(doi)
        else:
            print('ERROR: please specify either the dataset name or DOI to look up on MDF')
            return
        result = self.mdf.search()
        if len(result) == 1:
            print('Successfully found the desired dataset on MDF')
            print('MDF entry:')
            pprint(result)
            if download:
                print('Downloading dataset from MDF')
                self.mdf.globus_download(results=result)
        return
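
A minimal usage sketch for the FoundryDatasets wrapper above. The dataset name ('oqmd') and the constructor flag values are illustrative assumptions, not part of the original example.

# Illustrative usage only; 'oqmd' is an assumed dataset name
datasets = FoundryDatasets(no_local_server=True, anonymous=True, test=False)
datasets.download_data(name='oqmd', download=False)  # set download=True to fetch the files via Globus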
Example 20
def test_forge_search_by_dois():
    f = Forge(index="mdf")
    res1 = f.search_by_dois("https://dx.doi.org/10.13011/M3B36G")
    assert check_field(res1, "dc.identifier.identifier",
                       "https://dx.doi.org/10.13011/M3B36G") == 0
Example 21
def test_forge_match_years(capsys):
    # One year of data/results
    f = Forge(index="mdf")
    res1 = f.match_years("2015").search()
    assert res1 != []
    assert check_field(res1, "dc.publicationYear", 2015) == 0

    # Multiple years
    res2 = f.match_years(years=["2015", 2016]).search()
    assert check_field(res2, "dc.publicationYear", 2016) == 2

    # Wrong input
    with pytest.raises(AttributeError) as excinfo:
        f.match_years(["20x5"]).search()
    assert "Invalid year: '20x5'" in str(excinfo.value)

    with pytest.raises(AttributeError) as excinfo:
        f.match_years(start="20x5").search()
    assert "Invalid start year: '20x5'" in str(excinfo.value)

    with pytest.raises(AttributeError) as excinfo:
        f.match_years(stop="20x5").search()
    assert "Invalid stop year: '20x5'" in str(excinfo.value)

    # No filters with no input
    f.match_years()
    assert f.current_query() == ""

    # Test range
    res4 = f.match_years(start=2015, stop=2015, inclusive=True).search()
    assert check_field(res4, "dc.publicationYear", 2015) == 0

    res5 = f.match_years(start=2014, stop=2017, inclusive=False).search()
    assert check_field(res5, "dc.publicationYear", 2013) == -1
    assert check_field(res5, "dc.publicationYear", 2014) == -1
    assert check_field(res5, "dc.publicationYear", 2015) == 2
    assert check_field(res5, "dc.publicationYear", 2016) == 2
    assert check_field(res5, "dc.publicationYear", 2017) == -1

    assert f.match_years(start=2015, stop=2015, inclusive=False).search() == []
Example 22
def test_describe_field(capsys):
    f = Forge()
    # Basic usage (raw=True for ease of testing)
    res = f.describe_field("dataset", raw=True)
    assert res["success"]
    assert "dc" in res["schema"]["properties"].keys()
    assert res["schema"]["properties"]["mdf"]["properties"]["source_id"]
    # Specific field
    res = f.describe_field("dataset", field="dc", raw=True)
    assert "mdf" not in res["schema"]["properties"].keys()
    assert "titles" in res["schema"]["properties"].keys()
    # Special case
    res = f.describe_field("list", raw=True)
    assert isinstance(res["schema"], list)
    assert "mdf" in res["schema"]
    # Printing to stdout
    f.describe_field("record")
    out, err = capsys.readouterr()
    assert "- custom" in out
    # Specific field
    f.describe_field("record", field="mdf")
    out, err = capsys.readouterr()
    assert "- custom" not in out
    assert "- source_id" in out

    # Errors
    # Invalid resource_type
    res = f.describe_field("notexists", raw=True)
    assert res["success"] is False
    assert res["schema"] is None
    assert res["error"].startswith("Error 404")
    # stdout
    f.describe_field("notexists")
    out, err = capsys.readouterr()
    assert "Error 404" in out
    # Invalid field
    res = f.describe_field("dataset", field="foo.bar", raw=True)
    assert res["success"] is False
    assert res["schema"] is None
    assert res["error"].startswith("Error: Field 'foo' (from 'foo.bar')")
    # stdout
    f.describe_field("dataset", field="foo.bar")
    out, err = capsys.readouterr()
    assert "Error: Field 'foo' (from 'foo.bar')" in out
Example 23
def generate_stats(raw=False, return_all=False, many_cutoff=100):
    """Generates statistics on datasets in MDF Search.

    Arguments:
        raw (bool): When False, will print stats to stdout and display a progress bar.
                When True, will return a dict of stats and will not display a progress bar.
                Default False.
        return_all (bool): When False or when raw is False, generate summary statistics.
                When True and raw is True, return the dataset source_ids for each category.
                Extremely verbose.
                Default False.
        many_cutoff (int): The number of records required to be considered "many" records.
                This value is inclusive.
                Default 100.

    Returns:
        dict: Stats, when raw is True (else these are printed)
            
    """
    mdf = Forge()
    dataset_list = mdf.match_resource_types("dataset").search()

    all_datasets = []
    num_records = 0
    zero_records = []
    one_record = []
    multiple_records = []
    many_records = []

    for ds in tqdm(dataset_list, disable=raw):
        source_id = ds["mdf"]["source_id"]
        record_count = mdf.match_resource_types("record") \
                          .match_source_names(source_id) \
                          .search(limit=0, info=True)[1]["total_query_matches"]

        all_datasets.append((source_id, record_count))
        num_records += record_count
        if record_count == 0:
            zero_records.append(source_id)
        elif record_count == 1:
            one_record.append(source_id)
        elif record_count > 1:
            multiple_records.append(source_id)
            if record_count >= int(many_cutoff):
                many_records.append(source_id)

    if raw:
        returnable = {}
        returnable["all_datasets_count"] = len(all_datasets)
        returnable["all_records_count"] = num_records
        returnable["zero_records_count"] = len(zero_records)
        returnable["one_record_count"] = len(one_record)
        returnable["multiple_records_count"] = len(multiple_records)
        returnable["many_records_count"] = len(many_records)
        returnable["one_or_more_count"] = len(one_record) + len(
            multiple_records)

        if return_all:
            returnable["all_datasets"] = all_datasets
            returnable["zero_records"] = zero_records
            returnable["one_record"] = one_record
            returnable["multiple_records"] = multiple_records
            returnable["many_records"] = many_records
            returnable["one_or_more"] = one_record + multiple_records

        return returnable
    else:
        print("MDF Search Statistics")
        print("---------------------")
        print("Total datasets:", len(all_datasets))
        print("Total records:", num_records)
        print("Datasets with zero records:", len(zero_records))
        print("Datasets with any records: ",
              len(one_record) + len(multiple_records))
        print("{}% of datasets have records".format(
            int((len(one_record) + len(multiple_records)) / len(all_datasets) *
                100)))
        print()
        print("Datasets with exactly one record:   ", len(one_record))
        print("Datasets with more than one record: ", len(multiple_records))
        print("Datasets with more than", many_cutoff, "records:",
              len(many_records))
        print()
        return
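
A short usage sketch for generate_stats; both call patterns below follow directly from the signature and docstring above.

# Print summary statistics to stdout, with a progress bar
generate_stats()

# Collect raw counts programmatically (no progress bar), including per-category source_ids
stats = generate_stats(raw=True, return_all=True)
print(stats["all_datasets_count"], stats["zero_records_count"])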
Example 24
def test_forge_http_download(capsys):
    f = Forge(index="mdf")
    # Simple case
    f.http_download(example_result1)
    assert os.path.exists("./test_fetch.txt")

    # Test conflicting filenames
    f.http_download(example_result1)
    assert os.path.exists("./test_fetch(1).txt")
    f.http_download(example_result1)
    assert os.path.exists("./test_fetch(2).txt")
    os.remove("./test_fetch.txt")
    os.remove("./test_fetch(1).txt")
    os.remove("./test_fetch(2).txt")

    # With dest and preserve_dir, and tuple of results
    dest_path = os.path.expanduser("~/mdf")
    f.http_download(([example_result1], {
        "info": None
    }),
                    dest=dest_path,
                    preserve_dir=True)
    assert os.path.exists(os.path.join(dest_path, "test", "test_fetch.txt"))
    os.remove(os.path.join(dest_path, "test", "test_fetch.txt"))
    os.rmdir(os.path.join(dest_path, "test"))

    # With multiple files
    f.http_download(example_result2, dest=dest_path)
    assert os.path.exists(os.path.join(dest_path, "test_fetch.txt"))
    assert os.path.exists(os.path.join(dest_path, "test_multifetch.txt"))
    assert os.path.exists(os.path.join(dest_path, "petrel_fetch.txt"))
    assert os.path.exists(os.path.join(dest_path, "petrel_multifetch.txt"))
    os.remove(os.path.join(dest_path, "test_fetch.txt"))
    os.remove(os.path.join(dest_path, "test_multifetch.txt"))
    os.remove(os.path.join(dest_path, "petrel_fetch.txt"))
    os.remove(os.path.join(dest_path, "petrel_multifetch.txt"))

    f.http_download(example_result3, dest=dest_path)
    assert os.path.exists(os.path.join(dest_path, "test_fetch.txt"))
    assert os.path.exists(os.path.join(dest_path, "test_multifetch.txt"))
    assert os.path.exists(os.path.join(dest_path, "petrel_fetch.txt"))
    assert os.path.exists(os.path.join(dest_path, "petrel_multifetch.txt"))
    os.remove(os.path.join(dest_path, "test_fetch.txt"))
    os.remove(os.path.join(dest_path, "test_multifetch.txt"))
    os.remove(os.path.join(dest_path, "petrel_fetch.txt"))
    os.remove(os.path.join(dest_path, "petrel_multifetch.txt"))

    # Too many files
    assert f.http_download(list(range(10001)))["success"] is False
    out, err = capsys.readouterr()
    assert "Too many results supplied. Use globus_download()" in out

    # "Missing" files
    f.http_download(example_result_missing)
    out, err = capsys.readouterr()
    assert not os.path.exists("./should_not_exist.txt")
    assert (
        "Error 404 when attempting to access "
        "'https://data.materialsdatafacility.org/test/should_not_exist.txt'"
    ) in out

    # No datasets
    f.http_download(example_dataset)
    out, err = capsys.readouterr()
    assert not os.path.exists(os.path.join(dest_path, "petrel_fetch.txt"))
    assert (
        "Skipping datset entry for 'foobar_v1': Cannot download dataset over HTTPS. "
        "Use globus_download() for datasets.") in out

    # Bad resource_type
    f.http_download(example_bad_resource)
    out, err = capsys.readouterr()
    assert "Error: Found unknown resource_type 'foobar'. Skipping entry." in out
Example 25
def test_forge_fetch_datasets_from_results():
    # Get some results
    f = Forge(index="mdf")
    # Record from OQMD
    res01 = f.search("mdf.source_name:oqmd AND mdf.resource_type:record",
                     advanced=True,
                     limit=1)
    # Record from OQMD with info
    res02 = f.search("mdf.source_name:oqmd AND mdf.resource_type:record",
                     advanced=True,
                     limit=1,
                     info=True)
    # Records from Khazana VASP
    res03 = f.search(
        "mdf.source_name:khazana_vasp AND mdf.resource_type:record",
        advanced=True,
        limit=2)
    # Dataset for NIST XPS DB
    res04 = f.search(
        "mdf.source_name:nist_xps_db AND mdf.resource_type:dataset",
        advanced=True)

    # Get the correct dataset entries
    oqmd = f.search("mdf.source_name:oqmd AND mdf.resource_type:dataset",
                    advanced=True)[0]
    khazana_vasp = f.search(
        "mdf.source_name:khazana_vasp AND mdf.resource_type:dataset",
        advanced=True)[0]

    # Fetch single dataset
    res1 = f.fetch_datasets_from_results(res01[0])
    assert mdf_toolbox.insensitive_comparison(res1[0], oqmd)

    # Fetch dataset with results + info
    res2 = f.fetch_datasets_from_results(res02)
    assert mdf_toolbox.insensitive_comparison(res2[0], oqmd)

    # Fetch multiple datasets
    rtemp = res01 + res03
    res3 = f.fetch_datasets_from_results(rtemp)
    assert len(res3) == 2
    assert oqmd in res3
    assert khazana_vasp in res3

    # Fetch dataset from dataset
    res4 = f.fetch_datasets_from_results(res04)
    assert mdf_toolbox.insensitive_comparison(res4, res04)

    # Fetch entries from current query
    f.match_source_names("nist_xps_db")
    assert f.fetch_datasets_from_results() == res04

    # Fetch nothing
    unknown_entry = {"mdf": {"resource_type": "unknown"}}
    assert f.fetch_datasets_from_results(unknown_entry) == []
Example 26
def test_describe_organization(capsys):
    f = Forge()
    # Basic usage (with raw=True)
    res = f.describe_organization("Argonne National Laboratory", raw=True)
    assert res["success"]
    assert isinstance(res["organization"], dict)
    assert res["organization"][
        "canonical_name"] == "Argonne National Laboratory"
    assert "ANL" in res["organization"]["aliases"]
    # List
    res = f.describe_organization("list", raw=True)
    assert isinstance(res["organization"], list)
    assert "Center for Hierarchical Materials Design" in res["organization"]
    # All
    res = f.describe_organization("all", raw=True)
    assert isinstance(res["organization"], list)
    assert isinstance(res["organization"][0], dict)
    # Print to stdout
    f.describe_organization("CHiMaD")
    out, err = capsys.readouterr()
    assert "canonical_name: Center for Hierarchical Materials Design" in out
    assert "CHiMaD" in out
    assert "public" in out
    # List
    f.describe_organization("list")
    out, err = capsys.readouterr()
    assert "Center for Hierarchical Materials Design" in out
    assert "CHiMaD" not in out
    assert "Argonne National Laboratory" in out
    assert "ANL" not in out
    # Summary flag
    f.describe_organization("chimad", summary=True)
    out, err = capsys.readouterr()
    assert "canonical_name: Center for Hierarchical Materials Design" not in out
    assert "Center for Hierarchical Materials Design" in out
    assert "CHiMaD" in out
    assert "public" not in out

    # Errors
    # Invalid org
    res = f.describe_organization("foobar", raw=True)
    assert res["success"] is False
    assert "Error 404" in res["error"]
    assert res["status_code"] == 404
    # stdout
    res = f.describe_organization("foobar")
    out, err = capsys.readouterr()
    assert "Error 404" in out
Example 27
class Foundry(FoundryMetadata):
    """Foundry Client Base Class
    TODO:
    -------
    Add Docstring

    """

    # transfer_client: Any
    dlhub_client: Any
    forge_client: Any
    # connect_client: #Add this back in later, not necessary for current functionality

    xtract_tokens: Any

    def __init__(self,
                 no_browser=False,
                 no_local_server=False,
                 search_index="mdf-test",
                 **data):
        super().__init__(**data)
        auths = mdf_toolbox.login(
            services=[
                "data_mdf",
                "search",
                "petrel",
                "transfer",
                "dlhub",
                "https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all",
            ],
            app_name="Foundry",
            make_clients=True,
            no_browser=no_browser,
            no_local_server=no_local_server,
        )

        self.forge_client = Forge(
            index=search_index,
            services=None,
            search_client=auths["search"],
            transfer_client=auths["transfer"],
            data_mdf_authorizer=auths["data_mdf"],
            petrel_authorizer=auths["petrel"],
        )

        self.dlhub_client = DLHubClient(
            dlh_authorizer=auths["dlhub"],
            search_client=auths["search"],
            fx_authorizer=auths[
                "https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all"],
            force_login=False,
        )

        self.xtract_tokens = {
            'auth_token':
            auths['petrel'].access_token,
            'transfer_token':
            auths['transfer'].authorizer.access_token,
            'funx_token':
            auths[
                'https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all']
            .access_token
        }

    def load(self, name, download=True, globus=True, verbose=False, **kwargs):
        """Load the metadata for a Foundry dataset into the client

        Args:
            name (str): Name of the foundry dataset
            download (bool): If True, download the data associated with the package (default is True)
    
        Keyword Args:
            interval (int): How often to poll Globus to check if transfers are complete

        Returns
        -------
            self
        """
        # MDF specific logic
        res = self.forge_client.match_field(
            "mdf.organizations", "foundry").match_resource_types("dataset")
        res = res.match_field("mdf.source_id", name).search()

        res = res[0]
        res["dataset"] = res["projects"]["foundry"]
        res["dataset"]["type"] = res["dataset"]["package_type"]
        del res["projects"]["foundry"]

        self = Foundry(**res)

        if download is True:  # Add check for package existence
            self.download(interval=kwargs.get("interval", 10),
                          globus=globus,
                          verbose=verbose)

        return self

    def list(self):
        """List available Foundry data packages

        Returns
        -------
            (pandas.DataFrame): DataFrame with summary list of Foundry data packages including name, title, and publication year
        """
        res = (self.forge_client.match_field(
            "mdf.organizations",
            "foundry").match_resource_types("dataset").search())

        return pd.DataFrame([{
            "source_id": r["mdf"]["source_id"],
            "name": r["dc"]["titles"][0]["title"],
            "year": r["dc"].get("publicationYear", None),
        } for r in res])

    def get_packages(self, paths=False):
        """Get available local data packages

        Args:
           paths (bool): If True return paths in addition to package, if False return package name only

        Returns
        -------
            (list): List describing local Foundry packages
        """
        pkg_paths = glob.glob(self.config.local_cache_dir + "/*/")
        if paths:
            return [{
                "path": path,
                "package": path.split("/")[-2]
            } for path in pkg_paths]
        else:
            return [path.split("/")[-2] for path in pkg_paths]

    def collect_dataframes(self, inputs=[], outputs=[], packages=None):
        """Collect dataframes of local data packages
        Args:
           inputs (list): List of strings for input columns
           outputs (list): List of strings for output columns

        Returns
        -------
            (pandas.DataFrame): Collected dataframe with specified inputs and outputs
        """
        frame_files = glob.glob(self.config.local_cache_dir + "/*/*dataframe*",
                                recursive=True)

        frames = []
        for frame in frame_files:
            df_tmp = pd.read_json(frame)
            df_tmp["source"] = frame
            frames.append(df_tmp)
        df = pd.concat(frames)

        if inputs and outputs:
            return df[inputs], df[outputs]
        else:
            return df

    def run(self, name, inputs, **kwargs):
        """Run a model on data

        Args:
           name (str): DLHub model name
           inputs: Data to send to DLHub as inputs (should be JSON serializable)

        Returns
        -------
             Returns results after invocation via the DLHub service

        TODO:
        -------
        - Pass **kwargs through to DLHub client and document kwargs
        """
        return self.dlhub_client.run(name, inputs=inputs)

    def load_data(self, source_id=None, globus=True):
        """Load in the data associated with the prescribed dataset

        Tabular Data Type: Data are arranged in a standard data frame
        stored in self.dataframe_file. The contents are read, and the
        input and output columns are returned as a tuple of dataframes.

        File Data Type: <<Add desc>>

        For more complicated data structures, users should
        subclass Foundry and override the load_data function 

        Args:
           source_id (str): Source ID of the dataset to load (defaults to the
               currently loaded dataset's source_id)

        Returns
        -------
             (tuple): Tuple of X, y values
        """

        if source_id:
            path = os.path.join(self.config.local_cache_dir, source_id)
        else:
            path = os.path.join(self.config.local_cache_dir,
                                self.mdf["source_id"])
        # Handle Foundry-defined types.
        if self.dataset.type.value == "tabular":
            # If the file is not local, fetch the contents with Globus
            # Check if the contents are local
            # TODO: Add hashes and versioning to metadata and checking to the file
            try:
                self.dataset.dataframe = pd.read_json(
                    os.path.join(path, self.config.dataframe_file))
            except ValueError:
                # Try to read individual lines instead
                self.dataset.dataframe = pd.read_json(os.path.join(
                    path, self.config.dataframe_file),
                                                      lines=True)

            return (
                self.dataset.dataframe[self.dataset.inputs],
                self.dataset.dataframe[self.dataset.outputs],
            )
        elif self.dataset.type.value == "hdf5":
            f = h5py.File(os.path.join(path, self.config.data_file), "r")
            inputs = [f[i] for i in self.dataset.inputs]
            outputs = [f[i] for i in self.dataset.outputs]
            return (inputs, outputs)
        else:
            raise NotImplementedError

    def describe(self):
        print("DC:{}".format(self.dc))
        print("Dataset:{}".format(self.dataset.json(exclude={"dataframe"})))

    def publish(self, foundry_metadata, update=False, **kwargs):
        """Submit a data package for publication
        Args:
            foundry_metadata (dict): Foundry metadata describing the data package
            update (bool): True if this is an update to a prior data package
                (default: False)
        Keyword Args:
            title (str): Title of the data package
            authors (list): List of data package author names e.g., Jack Black or Nunez, Victoria
            affiliations (list): List of author affiliations
            tags (list): List of tags to apply to the data package

        Returns
        -------
        (dict) MDF Connect Response: Response from MDF Connect, used to track the dataset submission
        """

        self.connect_client.create_dc_block(
            title=kwargs["title"],
            authors=kwargs["authors"],
            affiliations=kwargs.get("affiliations", []),
            subjects=kwargs.get("tags", ["machine learning", "foundry"]),
        )
        self.connect_client.add_organization("Foundry")
        self.connect_client.set_project_block("foundry", foundry_metadata)
        self.connect_client.add_data_source(kwargs.get("data_sources", []))

        res = self.connect_client.submit_dataset(update=update)
        return res

    def from_file(self, file=None):
        """Create a Foundry client from a file

        Args:
            file (str): Path to the file containing the metadata
                (default: self.config.metadata_file)

        Returns
        -------
        (Foundry): a newly instantiated Foundry client
        """

        if file is None:
            file = self.config.metadata_file
        with open("./{}".format(file)) as fp:
            obj = json.load(fp)
            return Foundry(**obj)

    def to_file(self, file=None):
        """Create a Foundry client from a file

        Args:
            file (str): Path to the file to save metadata to
            (default: self.config.metadata_file)

        Returns
        -------
        (Foundry) self: for chaining
        """

        if file is None:
            file = self.config.metadata_file
        with open("./{}".format(file)) as fp:
            obj = json.dump(
                self.json(exclude={"dlhub_client", "forge_client"}), fp)
        return self

    def configure(self, **kwargs):
        self.config = FoundryConfig(**kwargs)
        return self

    def download(self, globus=True, verbose=False, **kwargs):
        # Check if the dir already exists
        path = os.path.join(self.config.local_cache_dir, self.mdf["source_id"])
        if os.path.isdir(path):
            return self

        res = self.forge_client.search(
            "mdf.source_id:{name}".format(name=self.mdf["source_id"]),
            advanced=True)
        if globus:
            self.forge_client.globus_download(
                res,
                dest=self.config.local_cache_dir,
                dest_ep=self.config.destination_endpoint,
                interval=kwargs.get("interval", 20),
                download_datasets=True,
            )
        else:
            source_id = self.mdf['source_id']
            xtract_base_url = "http://xtract-crawler-4.eba-ghixpmdf.us-east-1.elasticbeanstalk.com"

            # MDF Materials Data at NCSA
            source_ep_id = "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec"
            base_url = "https://data.materialsdatafacility.org"
            folder_to_crawl = f"/foundry/{source_id}/"

            # This only matters if you want files grouped together.
            grouper = "matio"

            auth_token = self.xtract_tokens['auth_token']
            transfer_token = self.xtract_tokens['transfer_token']
            funcx_token = self.xtract_tokens['funx_token']

            headers = {
                'Authorization': f"Bearer {auth_token}",
                'Transfer': transfer_token,
                'FuncX': funcx_token,
                'Petrel': auth_token
            }
            if verbose: print(f"Headers: {headers}")

            # Initialize the crawl. This kicks off the Globus EP crawling service on the backend.
            crawl_url = f'{xtract_base_url}/crawl'
            if verbose: print(f"Crawl URL is : {crawl_url}")
            crawl_req = requests.post(crawl_url,
                                      json={
                                          'repo_type': "GLOBUS",
                                          'eid': source_ep_id,
                                          'dir_path': folder_to_crawl,
                                          'Transfer': transfer_token,
                                          'Authorization': funcx_token,
                                          'grouper': grouper,
                                          'https_info': {
                                              'base_url': base_url
                                          }
                                      })
            if verbose: print('Crawl response:', crawl_req)
            crawl_id = json.loads(crawl_req.content)['crawl_id']
            if verbose: print(f"Crawl ID: {crawl_id}")

            # Wait for the crawl to finish before we can start fetching our metadata.
            while True:
                crawl_status = requests.get(
                    f'{xtract_base_url}/get_crawl_status',
                    json={'crawl_id': crawl_id})
                if verbose: print(crawl_status)
                crawl_content = json.loads(crawl_status.content)
                if verbose: print(f"Crawl Status: {crawl_content}")

                if crawl_content['crawl_status'] == 'SUCCEEDED':
                    files_crawled = crawl_content['files_crawled']
                    if verbose: print("Our crawl has succeeded!")
                    break
                else:
                    if verbose: print("Sleeping before re-polling...")
                    time.sleep(2)

            # Now we fetch our metadata. Here you can configure n to be maximum number of
            # messages you want at once.

            file_ls = []
            fetched_files = 0
            while fetched_files < files_crawled:
                fetch_mdata = requests.get(
                    f'{xtract_base_url}/fetch_crawl_mdata',
                    json={
                        'crawl_id': crawl_id,
                        'n': 2
                    })
                fetch_content = json.loads(fetch_mdata.content)

                for file_path in fetch_content['file_ls']:
                    file_ls.append(file_path)
                    fetched_files += 1

                if fetch_content['queue_empty']:
                    if verbose: print("Queue is empty! Continuing...")
                    time.sleep(2)

            source_path = os.path.join(self.config.local_cache_dir,
                                       self.mdf['source_id'])

            if not os.path.exists(self.config.local_cache_dir):
                os.mkdir(self.config.local_cache_dir)
                os.mkdir(source_path)

            elif not os.path.exists(source_path):
                os.mkdir(source_path)

            num_cores = multiprocessing.cpu_count()

            def download_file(file):
                requests.packages.urllib3.disable_warnings(
                    InsecureRequestWarning)

                url = 'https://data.materialsdatafacility.org' + file['path']
                destination = 'data/' + source_id + '/' + file['path'][
                    file['path'].rindex("/") + 1:]
                response = requests.get(url, verify=False)

                with open(destination, 'wb') as f:
                    f.write(response.content)

                return {file['path'] + ' status': True}

            results = Parallel(n_jobs=num_cores)(delayed(download_file)(file)
                                                 for file in file_ls)

            print('Done curling.')
            print(results)

        return self
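
A hedged, end-to-end sketch of driving the Foundry client above; the source_id passed to load() is a hypothetical placeholder and the keyword values are assumptions.

# Hypothetical walkthrough: list packages, load one, and read its tabular data
foundry = Foundry(no_browser=True, no_local_server=True)
print(foundry.list().head())  # summary DataFrame of available Foundry packages

# "example_foundry_dataset_v1.1" is a placeholder source_id, not a real dataset
foundry = foundry.load("example_foundry_dataset_v1.1", download=True, globus=True)
X, y = foundry.load_data()  # inputs/outputs for a "tabular" package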
Example 28
# -*- coding: utf-8 -*-
"""
@Project : formationEPres
@Author  : Xu-Shan Zhao
@Filename: mdfOqmdRetrieval202005280940.py
@IDE     : PyCharm
@Time1   : 2020-05-28 09:40:27
@Time2   : 2020/5/28 9:40
@Month1  : May
@Month2  : May
"""

from mdf_forge import Forge

mdf = Forge()

dataset_name = 'oqmd'
# ro = mdf.match_source_names(dataset_name)
# ro = ro.search(limit=-1)
ro = mdf.aggregate_sources(dataset_name)

import pymongo

client = pymongo.MongoClient(host='localhost', port=27017)
collection = client['MDF_datasets']['oqmd']
# collection.insert_many(ro)
for i in range(len(ro)):
    try:
        collection.insert_one(ro[i])
    except Exception:
        # Log the index of any record that fails to insert
        print(i)
Example 29
    client.add_service("mrr")

    # Make the source name "nist_MDR_[item_number]" to make retrieval easy
    client.set_source_name("mdr_item_{}".format(item_id))
    return client


def _make_failure(req):
    return RuntimeError('Problem connecting with {}. HTTP Status Code: {}'.format(req.url,
                                                                                  req.status_code))


if __name__ == "__main__":
    # Make the client
    client = MDFConnectClient()
    forge = Forge()

    # Create an array to store the source_id
    source_ids = []

    # Loop through all items
    for item in tqdm(get_all_publications()):
        # Check if we have done it already
        if has_been_submitted(item, forge):
            continue

        # If not, ready the client to submit
        prepare_client_submission(item, client)

        # Skip if no data
        if len(client.data) == 0: