def test_forge_match_resource_types():
    """match_resource_types filters search hits by mdf.resource_type."""
    forge = Forge(index="mdf")
    # A single type: every hit must carry that resource_type.
    forge.match_resource_types("record")
    records = forge.search(limit=10)
    assert check_field(records, "mdf.resource_type", "record") == 0
    # Two types: no hit may be a plain record any more.
    forge.match_resource_types(["collection", "dataset"])
    mixed = forge.search()
    assert check_field(mixed, "mdf.resource_type", "record") == -1
    # An empty argument is a no-op that returns the Forge for chaining.
    assert forge.match_resource_types("") == forge
def test_forge_anonymous(capsys):
    """Anonymous Forge clients can search/aggregate but not download."""
    forge = Forge(anonymous=True)
    # Search works without authentication.
    hits = forge.search("mdf.source_name:ab_initio_solute_database",
                        advanced=True, limit=300)
    assert len(hits) == 300
    # Aggregation works without authentication.
    assert len(forge.aggregate("mdf.source_name:nist_xps_db")) > 10000
    # Auth-only helpers must fail gracefully and explain why.
    # http_download
    assert forge.http_download({})["success"] is False
    out, err = capsys.readouterr()
    assert "Error: Anonymous HTTP download not yet supported." in out
    # globus_download
    assert forge.globus_download({})["success"] is False
    out, err = capsys.readouterr()
    assert "Error: Anonymous Globus Transfer not supported." in out
    # http_stream yields a single failure record, then stops.
    stream = forge.http_stream({})
    assert next(stream)["success"] is False
    out, err = capsys.readouterr()
    assert "Error: Anonymous HTTP download not yet supported." in out
    with pytest.raises(StopIteration):
        next(stream)
def test_forge_match_organizations():
    """match_organizations filters search hits by mdf.organizations."""
    forge = Forge(index="mdf")
    # A single organization.
    forge.match_organizations("NIST")
    nist_hits = forge.search()
    assert nist_hits != []
    assert check_field(nist_hits, "mdf.organizations", "NIST") == 1
    # Several organizations, matching any rather than all.
    forge.match_organizations(["NIST", "PRISMS"], match_all=False)
    multi_hits = forge.search()
    assert check_field(multi_hits, "mdf.organizations", "PRISMS") == 2
    assert check_field(multi_hits, "mdf.organizations", "NIST") == 2
    # An empty argument is a no-op that returns the Forge for chaining.
    assert forge.match_organizations("") == forge
def test_forge_match_elements():
    """match_elements filters search hits by material.elements."""
    forge = Forge(index="mdf")
    # A single element.
    forge.match_elements("Al")
    al_hits = forge.search()
    assert al_hits != []
    assert check_field(al_hits, "material.elements", "Al") in (0, 1)
    # Multiple elements must all be present in each hit.
    forge.match_elements(["Al", "Cu"])
    alloy_hits = forge.search()
    assert check_field(alloy_hits, "material.elements", "Al") == 1
    assert check_field(alloy_hits, "material.elements", "Cu") == 1
    # An empty argument is a no-op that returns the Forge for chaining.
    assert forge.match_elements("") == forge
def test_forge_chaining():
    """Chained match_field calls build the same query as separate calls."""
    forge = Forge(index="mdf")
    forge.match_field("source_name", "cip")
    forge.match_field("material.elements", "Al")
    stepwise = forge.search()
    chained = (forge.match_field("source_name", "cip")
                    .match_field("material.elements", "Al")
                    .search())
    # Both query-building styles must return exactly the same hits.
    assert all(r in chained for r in stepwise)
    assert all(r in stepwise for r in chained)
def test_forge_test_match_records():
    """match_records pins search hits to specific source/scroll_id pairs."""
    forge = Forge(index="mdf")
    # A single record.
    forge.match_records("cip", 1006)
    hits = forge.search()
    assert len(hits) == 1
    assert check_field(hits, "mdf.source_name", "cip") == 0
    assert check_field(hits, "mdf.scroll_id", 1006) == 0
    # Several records; version suffixes on source names are stripped.
    forge.match_records("cip_v3.4", [1006, 1002])
    hits = forge.search()
    assert len(hits) == 2
    assert check_field(hits, "mdf.source_name", "cip") == 0
    assert check_field(hits, "mdf.scroll_id", 1006) == 2
    # Empty arguments are a no-op that returns the Forge for chaining.
    assert forge.match_records("", "") == forge
def test_forge_match_source_names():
    """match_source_names filters search hits by mdf.source_name."""
    forge = Forge(index="mdf")
    # A single source.
    forge.match_source_names("khazana_vasp")
    single = forge.search()
    assert single != []
    assert check_field(single, "mdf.source_name", "khazana_vasp") == 0
    # Several sources; version suffixes are stripped from the names.
    forge.match_source_names(["khazana_vasp", "ta_melting_v3.4"])
    combined = forge.search()
    # The single-source hits must be a strict subset of the combined hits.
    assert len(combined) > len(single)
    assert all(hit in combined for hit in single)
    assert check_field(combined, "mdf.source_name", "ta_melting") == 2
    # An empty argument is a no-op that returns the Forge for chaining.
    assert forge.match_source_names("") == forge
def test_get_dataset_version():
    """get_dataset_version reports the current version of a dataset."""
    forge = Forge()
    # The reported version must match the dataset entry's own version field.
    hits = forge.search('mdf.source_name:oqmd AND mdf.resource_type:dataset',
                        advanced=True, limit=1)
    assert hits[0]['mdf']['version'] == forge.get_dataset_version('oqmd')
    # An unknown source_name raises ValueError.
    with pytest.raises(ValueError):
        forge.get_dataset_version('notreal')
def test_forge_match_dois():
    """match_dois filters search hits by dc.identifier.identifier."""
    forge = Forge(index="mdf")
    # A single DOI.
    forge.match_dois("https://dx.doi.org/10.13011/M3B36G")
    single = forge.search()
    assert single != []
    assert check_field(single, "dc.identifier.identifier",
                       "https://dx.doi.org/10.13011/M3B36G") == 0
    # Several DOIs.
    forge.match_dois(["https://dx.doi.org/10.13011/M3B36G",
                      "10.18126/M23P9G"])
    combined = forge.search()
    # The single-DOI hits must be a strict subset of the combined hits.
    assert len(combined) > len(single)
    assert all(hit in combined for hit in single)
    assert check_field(combined, "dc.identifier.identifier",
                       "10.18126/M23P9G") == 2
    # An empty argument is a no-op that returns the Forge for chaining.
    assert forge.match_dois("") == forge
def test_forge_match_source_names():
    """match_source_names works with no_local_server/no_browser auth flow.

    NOTE(review): this duplicates the name of another test in this file, so
    pytest only collects the later definition — consider renaming one of them.
    Fixed here: removed leftover debug shell calls (`os.system('echo ...')`)
    and a vacuous `assert True`.
    """
    f = Forge(index="mdf", no_local_server=True, no_browser=True)
    # One source
    f.match_source_names("khazana_vasp")
    res1 = f.search()
    assert res1 != []
    assert check_field(res1, "mdf.source_name", "khazana_vasp") == 0
    # Multi-source, strip version info
    f.match_source_names(["khazana_vasp", "ta_melting_v3.4"])
    res2 = f.search()
    # res1 is a subset of res2
    assert len(res2) > len(res1)
    assert all(r1 in res2 for r1 in res1)
    assert check_field(res2, "mdf.source_name", "ta_melting") == 2
    # No source
    assert f.match_source_names("") == f
class FoundryDatasets():
    """
    Class to download datasets hosted on Materials Data Facility

    Args:
        no_local_server: (bool), whether or not the server is local. Set to
            True if running on e.g. Google Colab
        anonymous: (bool), whether to use your MDF user or be anonymous.
            Some functionality may be disabled if True
        test: (bool), whether to be in test mode. Some functionality may be
            disabled if True

    Methods:
        download_data: downloads specified data from MDF and saves to
            current directory
    """

    def __init__(self, no_local_server, anonymous, test):
        self.no_local_server = no_local_server
        self.anonymous = anonymous
        self.test = test
        self.mdf = Forge(no_local_server=self.no_local_server,
                         anonymous=self.anonymous,
                         test=self.test)

    def download_data(self, name=None, doi=None, download=False):
        """Look up a dataset on MDF by name or DOI and optionally download it.

        Args:
            name: (str), name of the dataset to download
            doi: (str), digital object identifier of the dataset to download
            download: (bool), whether or not to download the full dataset

        Returns:
            None
        """
        if name is not None:
            self.mdf.match_source_names(name)
        elif doi is not None:
            self.mdf.match_dois(doi)
        else:
            # Bug fix: the original printed this error but then ran an
            # unconstrained search anyway; bail out early instead.
            print('ERROR: please specify either the dataset name or DOI for lookup MDF')
            return
        result = self.mdf.search()
        if len(result) == 1:
            print('Successfully found the desired dataset on MDF')
            print('MDF entry:')
            pprint(result)
            if download:
                print('Downloading dataset from MDF')
                self.mdf.globus_download(results=result)
        return
def test_forge_fetch_datasets_from_results():
    """fetch_datasets_from_results maps record entries to their datasets."""
    forge = Forge(index="mdf")
    # Record from OQMD
    oqmd_record = forge.search(
        "mdf.source_name:oqmd AND mdf.resource_type:record",
        advanced=True, limit=1)
    # Record from OQMD with info
    oqmd_record_info = forge.search(
        "mdf.source_name:oqmd AND mdf.resource_type:record",
        advanced=True, limit=1, info=True)
    # Records from JANAF
    khazana_records = forge.search(
        "mdf.source_name:khazana_vasp AND mdf.resource_type:record",
        advanced=True, limit=2)
    # Dataset for NIST XPS DB
    xps_dataset = forge.search(
        "mdf.source_name:nist_xps_db AND mdf.resource_type:dataset",
        advanced=True)
    # The dataset entries the records should map back to.
    oqmd = forge.search("mdf.source_name:oqmd AND mdf.resource_type:dataset",
                        advanced=True)[0]
    khazana_vasp = forge.search(
        "mdf.source_name:khazana_vasp AND mdf.resource_type:dataset",
        advanced=True)[0]
    # Fetch a single dataset from one record.
    fetched = forge.fetch_datasets_from_results(oqmd_record[0])
    assert mdf_toolbox.insensitive_comparison(fetched[0], oqmd)
    # Fetch a dataset from results that include info wrappers.
    fetched = forge.fetch_datasets_from_results(oqmd_record_info)
    assert mdf_toolbox.insensitive_comparison(fetched[0], oqmd)
    # Fetch multiple datasets from mixed records.
    fetched = forge.fetch_datasets_from_results(oqmd_record + khazana_records)
    assert len(fetched) == 2
    assert oqmd in fetched
    assert khazana_vasp in fetched
    # A dataset entry maps to itself.
    fetched = forge.fetch_datasets_from_results(xps_dataset)
    assert mdf_toolbox.insensitive_comparison(fetched, xps_dataset)
    # With no argument, the current query's results are used.
    forge.match_source_names("nist_xps_db")
    assert forge.fetch_datasets_from_results() == xps_dataset
    # Unknown resource types yield nothing.
    unknown_entry = {"mdf": {"resource_type": "unknown"}}
    assert forge.fetch_datasets_from_results(unknown_entry) == []
class Foundry(FoundryMetadata):
    """Foundry Client Base Class

    Wraps an MDF Forge client and a DLHub client to load, list, download,
    publish, and run models against Foundry data packages.

    Fixes applied in review:
    - ``download`` previously sent the *transfer* token as the FuncX token
      (``xtract_tokens['transfer_token']``) while the stored
      ``'funx_token'`` was never used; it now uses ``'funx_token'``.
    - ``to_file`` opened the target file in read mode and then called
      ``json.dump``; it now opens with ``"w"``.
    - A bare ``except:`` in ``load_data`` was narrowed to ``ValueError``.
    """

    # transfer_client: Any
    dlhub_client: Any
    forge_client: Any
    # connect_client: #Add this back in later, not necessary for current functionality
    xtract_tokens: Any

    def __init__(self,
                 no_browser=False,
                 no_local_server=False,
                 search_index="mdf-test",
                 **data):
        super().__init__(**data)
        auths = mdf_toolbox.login(
            services=[
                "data_mdf",
                "search",
                "petrel",
                "transfer",
                "dlhub",
                "https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all",
            ],
            app_name="Foundry",
            make_clients=True,
            no_browser=no_browser,
            no_local_server=no_local_server,
        )
        self.forge_client = Forge(
            index=search_index,
            services=None,
            search_client=auths["search"],
            transfer_client=auths["transfer"],
            data_mdf_authorizer=auths["data_mdf"],
            petrel_authorizer=auths["petrel"],
        )
        self.dlhub_client = DLHubClient(
            dlh_authorizer=auths["dlhub"],
            search_client=auths["search"],
            fx_authorizer=auths[
                "https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all"],
            force_login=False,
        )
        self.xtract_tokens = {
            'auth_token': auths['petrel'].access_token,
            'transfer_token': auths['transfer'].authorizer.access_token,
            'funx_token': auths[
                'https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all']
            .access_token
        }

    def load(self, name, download=True, globus=True, verbose=False, **kwargs):
        """Load the metadata for a Foundry dataset into the client

        Args:
            name (str): Name of the foundry dataset
            download (bool): If True, download the data associated with the
                package (default is True)

        Keyword Args:
            interval (int): How often to poll Globus to check if transfers
                are complete

        Returns
        -------
            self
        """
        # MDF specific logic: Foundry datasets are MDF dataset entries
        # tagged with the "foundry" organization.
        res = self.forge_client.match_field(
            "mdf.organizations", "foundry").match_resource_types("dataset")
        res = res.match_field("mdf.source_id", name).search()

        res = res[0]
        # Promote the foundry project block to the top-level "dataset" key.
        res["dataset"] = res["projects"]["foundry"]
        res["dataset"]["type"] = res["dataset"]["package_type"]
        del res["projects"]["foundry"]

        # Rebuild the client from the fetched metadata.
        self = Foundry(**res)

        if download is True:  # Add check for package existence
            self.download(interval=kwargs.get("interval", 10),
                          globus=globus,
                          verbose=verbose)

        return self

    def list(self):
        """List available Foundry data packages

        Returns
        -------
            (pandas.DataFrame): DataFrame with summary list of Foundry data
                packages including name, title, and publication year
        """
        res = (self.forge_client.match_field(
            "mdf.organizations",
            "foundry").match_resource_types("dataset").search())
        return pd.DataFrame([{
            "source_id": r["mdf"]["source_id"],
            "name": r["dc"]["titles"][0]["title"],
            "year": r["dc"].get("publicationYear", None),
        } for r in res])

    def get_packages(self, paths=False):
        """Get available local data packages

        Args:
            paths (bool): If True return paths in addition to package, if
                False return package name only

        Returns
        -------
            (list): List describing local Foundry packages
        """
        pkg_paths = glob.glob(self.config.local_cache_dir + "/*/")
        if paths:
            return [{
                "path": path,
                "package": path.split("/")[-2]
            } for path in pkg_paths]
        else:
            return [path.split("/")[-2] for path in pkg_paths]

    def collect_dataframes(self, inputs=None, outputs=None, packages=None):
        """Collect dataframes of local data packages

        Args:
            inputs (list): List of strings for input columns
            outputs (list): List of strings for output columns

        Returns
        -------
            (pandas.DataFrame): Collected dataframe with specified inputs
                and outputs
        """
        # Mutable-default fix: treat None as "no selection" (previously
        # `inputs=[], outputs=[]`; the lists were never mutated, so this is
        # behavior-identical).
        frame_files = glob.glob(self.config.local_cache_dir + "/*/*dataframe*",
                                recursive=True)
        frames = []
        for frame in frame_files:
            df_tmp = pd.read_json(frame)
            df_tmp["source"] = frame
            frames.append(df_tmp)
        df = pd.concat(frames)

        if inputs and outputs:
            return df[inputs], df[outputs]
        else:
            return df

    def run(self, name, inputs, **kwargs):
        """Run a model on data

        Args:
            name (str): DLHub model name
            inputs: Data to send to DLHub as inputs (should be JSON
                serializable)

        Returns
        -------
             Returns results after invocation via the DLHub service

        TODO:
        -------
        - Pass **kwargs through to DLHub client and document kwargs
        """
        return self.dlhub_client.run(name, inputs=inputs)

    def load_data(self, source_id=None, globus=True):
        """Load in the data associated with the prescribed dataset

        Tabular Data Type: Data are arranged in a standard data frame
        stored in self.dataframe_file. The contents are read, and

        File Data Type: <<Add desc>>

        For more complicated data structures, users should
        subclass Foundry and override the load_data function

        Args:
            source_id (str): Optional source id; defaults to this package's
                mdf source_id

        Returns
        -------
            (tuple): Tuple of X, y values
        """
        if source_id:
            path = os.path.join(self.config.local_cache_dir, source_id)
        else:
            path = os.path.join(self.config.local_cache_dir,
                                self.mdf["source_id"])
        # Handle Foundry-defined types.
        if self.dataset.type.value == "tabular":
            # If the file is not local, fetch the contents with Globus
            # Check if the contents are local
            # TODO: Add hashes and versioning to metadata and checking to the file
            try:
                self.dataset.dataframe = pd.read_json(
                    os.path.join(path, self.config.dataframe_file))
            except ValueError:
                # Bug fix: was a bare `except:`. pd.read_json raises
                # ValueError on line-delimited JSON, so retry with lines=True
                # for only that case instead of swallowing every error.
                self.dataset.dataframe = pd.read_json(os.path.join(
                    path, self.config.dataframe_file), lines=True)

            return (
                self.dataset.dataframe[self.dataset.inputs],
                self.dataset.dataframe[self.dataset.outputs],
            )
        elif self.dataset.type.value == "hdf5":
            f = h5py.File(os.path.join(path, self.config.data_file), "r")
            inputs = [f[i[0:]] for i in self.dataset.inputs]
            outputs = [f[i[0:]] for i in self.dataset.outputs]
            return (inputs, outputs)
        else:
            raise NotImplementedError

    def describe(self):
        """Print the DataCite block and dataset metadata for this package."""
        print("DC:{}".format(self.dc))
        print("Dataset:{}".format(self.dataset.json(exclude={"dataframe"})))

    def publish(self, foundry_metadata, update=False, **kwargs):
        """Submit a data package for publication

        Args:
            foundry_metadata (dict): Path to the file containing
            update (bool): True if this is an update to a prior data package
                (default: self.config.metadata_file)

        Keyword Args:
            title (str): Title of the data package
            authors (list): List of data package author names e.g.,
                Jack Black or Nunez, Victoria
            affiliations (list): List of author affiliations
            tags (list): List of tags to apply to the data package

        Returns
        -------
        (dict) MDF Connect Response: Response from MDF Connect to allow
                tracking of dataset
        """
        # NOTE(review): connect_client is commented out in the class
        # attributes above — confirm it is set before publish() is called.
        self.connect_client.create_dc_block(
            title=kwargs["title"],
            authors=kwargs["authors"],
            affiliations=kwargs.get("affiliations", []),
            subjects=kwargs.get("tags", ["machine learning", "foundry"]),
        )
        self.connect_client.add_organization("Foundry")
        self.connect_client.set_project_block("foundry", foundry_metadata)
        self.connect_client.add_data_source(kwargs.get("data_sources", []))

        res = self.connect_client.submit_dataset(update=update)
        return res

    def from_file(self, file=None):
        """Create a Foundry client from a file

        Args:
            file (str): Path to the file containing
            (default: self.config.metadata_file)

        Returns
        -------
        (Foundry): an newly instantiated Foundry client
        """
        if file is None:
            file = self.config.metadata_file
        with open("./{}".format(file)) as fp:
            obj = json.load(fp)
        return Foundry(**obj)

    def to_file(self, file=None):
        """Save this client's metadata to a file

        Args:
            file (str): Path to the file to save metadata to
            (default: self.config.metadata_file)

        Returns
        -------
        (Foundry) self: for chaining
        """
        if file is None:
            file = self.config.metadata_file
        # Bug fix: the original opened the file in the default read mode,
        # so json.dump would fail; open for writing.
        with open("./{}".format(file), "w") as fp:
            # NOTE(review): self.json() already returns a JSON string, so
            # this writes a JSON-encoded string — confirm intended format.
            json.dump(self.json(exclude={"dlhub_client", "forge_client"}), fp)
        return self

    def configure(self, **kwargs):
        """Replace this client's configuration with a new FoundryConfig."""
        self.config = FoundryConfig(**kwargs)
        return self

    def download(self, globus=True, verbose=False, **kwargs):
        """Download this package's data, via Globus or the Xtract HTTPS flow.

        Keyword Args:
            interval (int): Globus polling interval (default 20)

        Returns
        -------
        (Foundry) self: for chaining
        """
        # Check if the dir already exists; if so, assume data is present.
        path = os.path.join(self.config.local_cache_dir,
                            self.mdf["source_id"])
        if os.path.isdir(path):
            return self

        res = self.forge_client.search(
            "mdf.source_id:{name}".format(name=self.mdf["source_id"]),
            advanced=True)
        if globus:
            self.forge_client.globus_download(
                res,
                dest=self.config.local_cache_dir,
                dest_ep=self.config.destination_endpoint,
                interval=kwargs.get("interval", 20),
                download_datasets=True,
            )
        else:
            source_id = self.mdf['source_id']
            xtract_base_url = "http://xtract-crawler-4.eba-ghixpmdf.us-east-1.elasticbeanstalk.com"

            # MDF Materials Data at NCSA
            source_ep_id = "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec"
            base_url = "https://data.materialsdatafacility.org"
            folder_to_crawl = f"/foundry/{source_id}/"

            # This only matters if you want files grouped together.
            grouper = "matio"

            auth_token = self.xtract_tokens['auth_token']
            transfer_token = self.xtract_tokens['transfer_token']
            # Bug fix: previously read 'transfer_token' here, sending the
            # transfer token as the FuncX token and leaving the stored
            # 'funx_token' unused.
            funcx_token = self.xtract_tokens['funx_token']

            headers = {
                'Authorization': f"Bearer {auth_token}",
                'Transfer': transfer_token,
                'FuncX': funcx_token,
                'Petrel': auth_token
            }
            if verbose:
                print(f"Headers: {headers}")

            # Initialize the crawl. This kicks off the Globus EP crawling service on the backend.
            crawl_url = f'{xtract_base_url}/crawl'
            if verbose:
                print(f"Crawl URL is : {crawl_url}")
            crawl_req = requests.post(crawl_url,
                                      json={
                                          'repo_type': "GLOBUS",
                                          'eid': source_ep_id,
                                          'dir_path': folder_to_crawl,
                                          'Transfer': transfer_token,
                                          'Authorization': funcx_token,
                                          'grouper': grouper,
                                          'https_info': {
                                              'base_url': base_url
                                          }
                                      })
            if verbose:
                print('Crawl response:', crawl_req)
            crawl_id = json.loads(crawl_req.content)['crawl_id']
            if verbose:
                print(f"Crawl ID: {crawl_id}")

            # Wait for the crawl to finish before we can start fetching our metadata.
            while True:
                crawl_status = requests.get(
                    f'{xtract_base_url}/get_crawl_status',
                    json={'crawl_id': crawl_id})
                if verbose:
                    print(crawl_status)
                crawl_content = json.loads(crawl_status.content)
                if verbose:
                    print(f"Crawl Status: {crawl_content}")
                if crawl_content['crawl_status'] == 'SUCCEEDED':
                    files_crawled = crawl_content['files_crawled']
                    if verbose:
                        print("Our crawl has succeeded!")
                    break
                else:
                    if verbose:
                        print("Sleeping before re-polling...")
                    time.sleep(2)

            # Now we fetch our metadata. Here you can configure n to be
            # maximum number of messages you want at once.
            file_ls = []
            fetched_files = 0
            while fetched_files < files_crawled:
                fetch_mdata = requests.get(
                    f'{xtract_base_url}/fetch_crawl_mdata',
                    json={
                        'crawl_id': crawl_id,
                        'n': 2
                    })
                fetch_content = json.loads(fetch_mdata.content)

                for file_path in fetch_content['file_ls']:
                    file_ls.append(file_path)
                    fetched_files += 1

                if fetch_content['queue_empty']:
                    if verbose:
                        print("Queue is empty! Continuing...")
                    time.sleep(2)

            source_path = os.path.join(self.config.local_cache_dir,
                                       self.mdf['source_id'])

            if not os.path.exists(self.config.local_cache_dir):
                os.mkdir(self.config.local_cache_dir)
                os.mkdir(source_path)
            elif not os.path.exists(source_path):
                os.mkdir(source_path)

            num_cores = multiprocessing.cpu_count()

            def download_file(file):
                # Best-effort HTTPS fetch of one crawled file.
                requests.packages.urllib3.disable_warnings(
                    InsecureRequestWarning)

                url = 'https://data.materialsdatafacility.org' + file['path']
                # NOTE(review): writes under a hard-coded 'data/' prefix
                # rather than the source_path created above — confirm
                # intended destination.
                destination = 'data/' + source_id + '/' + file['path'][
                    file['path'].rindex("/") + 1:]
                response = requests.get(url, verify=False)

                with open(destination, 'wb') as f:
                    f.write(response.content)

                return {file['path'] + ' status': True}

            results = Parallel(n_jobs=num_cores)(delayed(download_file)(file)
                                                 for file in file_ls)

            print('Done curling.')
            print(results)

        return self