Example #1
def get_pifs_from_Citrination(client, dataset_id_list):
    """Collect every PIF system contained in the given Citrination datasets."""
    # Assumes: from citrination_client import (PifSystemReturningQuery,
    #     DataQuery, DatasetQuery, Filter)
    all_hits = []
    for dataset in dataset_id_list:
        query = PifSystemReturningQuery(
            from_index=0,
            size=100,
            query=DataQuery(dataset=DatasetQuery(id=Filter(equal=dataset))))

        # Page through the results 100 hits at a time until a page comes back
        # empty; checking truthiness (rather than `is not None`) also avoids
        # an infinite loop when hits is an empty list.
        current_result = client.search(query)
        while current_result.hits:
            all_hits.extend(current_result.hits)
            query.from_index += len(current_result.hits)
            current_result = client.search(query)

    pifs = [x.system for x in all_hits]
    return pifs
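
A minimal driver for this helper might look like the sketch below; it assumes a citrination_client version in which the object passed in exposes the callable search(query) used above, and the API key and dataset ID are placeholders.

import os
from citrination_client import CitrinationClient

# Hypothetical driver: the key comes from the environment and the dataset ID
# is a placeholder.
client = CitrinationClient(api_key=os.environ["CITRINATION_API_KEY"],
                           site="https://citrination.com")
pifs = get_pifs_from_Citrination(client, dataset_id_list=["178480"])
print("Retrieved {} PIFs".format(len(pifs)))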
Example #2
def query_to_mdf_records(query=None, dataset_id=None, mdf_acl=None):
    """Evaluate a query and return a list of MDF records.

    If a dataset_id is specified but there is no query, a simple
    whole-dataset query is formed for the user.
    """
    if not query and not dataset_id:
        raise ValueError("Either query or dataset_id must be specified")
    if query and dataset_id:
        raise ValueError(
            "Both query and dataset_id were specified; pick one or the other.")
    if not query:
        query = PifSystemReturningQuery(
            query=DataQuery(dataset=DatasetQuery(id=Filter(equal=dataset_id))),
            size=10000  # cap how many results are pulled down by default
        )

    client = get_client()

    if not mdf_acl:
        raise ValueError(
            'Access controls (mdf_acl) must be specified. Use ["public"] for public access'
        )

    pif_result = client.pif_search(query)
    if len(pif_result.hits) == 0:
        return []

    example_uid = pif_result.hits[0].system.uid
    dataset_query = DatasetReturningQuery(
        query=DataQuery(system=PifSystemQuery(uid=Filter(equal=example_uid))),
        size=1  # we only expect one dataset to hit
    )

    dataset_result = client.dataset_search(dataset_query)

    records = []
    for hit in pif_result.hits:
        records.append(
            pif_to_mdf_record(hit.system, dataset_result.hits[0], mdf_acl))

    return records
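
A hedged usage sketch, assuming the module-level get_client() and pif_to_mdf_record() helpers referenced above are importable; the dataset ID is a placeholder.

# Hypothetical call: the dataset ID is a placeholder, and get_client() is
# assumed to authenticate from the environment.
records = query_to_mdf_records(dataset_id="12345", mdf_acl=["public"])
print("Converted {} PIFs to MDF records".format(len(records)))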
Example #3
def run_sequential_learning(client: CitrinationClient, view_id: int, dataset_id: int,
                            num_candidates_per_iter: int,
                            design_effort: int, wait_time: int,
                            num_sl_iterations: int, input_properties: List[str],
                            target: List[str], print_output: bool,
                            true_function: Callable[[np.ndarray], float],
                            score_type: str,
                            ) -> Tuple[List[Tuple[float, float]], List[float]]:
    '''Runs SL design

    :param client: Client object
    :type client: CitrinationClient
    :param view_id: View ID
    :type view_id: int
    :param dataset_id: Dataset ID
    :type dataset_id: int
    :param num_candidates_per_iter: Candidates in a batch
    :type num_candidates_per_iter: int
    :param design_effort: Effort from 1-30
    :type design_effort: int
    :param wait_time: Wait time in seconds before polling API
    :type wait_time: int
    :param num_sl_iterations: SL iterations to run
    :type num_sl_iterations: int
    :param input_properties: Inputs
    :type input_properties: List[str]
    :param target: two-element list: the output property name, then "Min" or "Max"
    :type target: List[str]
    :param print_output: Whether or not to print outputs
    :type print_output: bool
    :param true_function: Actual function for evaluating measured/true values
    :type true_function: Callable[[np.ndarray], float]
    :param score_type: MLI or MEI
    :type score_type: str
    :return: 2-tuple: list of the best predicted (value, uncertainty) pairs per
        iteration; list of the best measured values per iteration
    :rtype: Tuple[List[Tuple[float, float]], List[float]]
    '''

    best_sl_pred_vals = []
    best_sl_measured_vals = []

    _wait_on_ingest(client, dataset_id, wait_time, print_output)

    for i in range(num_sl_iterations):
        if print_output:
            print(f"\n---STARTING SL ITERATION #{i+1}---")

        _wait_on_ingest(client, dataset_id, wait_time, print_output)
        _wait_on_data_view(client, dataset_id, view_id, wait_time, print_output)

        # Submit a design run
        design_id = client.submit_design_run(
                data_view_id=view_id,
                num_candidates=num_candidates_per_iter,
                effort=design_effort,
                target=Target(*target),
                constraints=[],
                sampler="Default"
            ).uuid

        if print_output:
            print(f"Created design run with ID {design_id}")

        _wait_on_design_run(client, design_id, view_id, wait_time, print_output)

        # Collect candidates and their (value, uncertainty) pairs. MEI (maximum
        # expected improvement) reads best_materials; otherwise MLI (maximum
        # likelihood of improvement) reads next_experiments.
        if score_type == "MEI":
            candidates = client.get_design_run_results(view_id, design_id).best_materials
        else:
            candidates = client.get_design_run_results(view_id, design_id).next_experiments
        values_w_uncertainties = [
            (
                m["descriptor_values"][target[0]],
                m["descriptor_values"][f"Uncertainty in {target[0]}"]
            ) for m in candidates
        ]

        # Find and save the best predicted value
        if target[1] == "Min":
            best_value_w_uncertainty = min(values_w_uncertainties, key=lambda x: x[0])
        else:
            best_value_w_uncertainty = max(values_w_uncertainties, key=lambda x: x[0])

        best_sl_pred_vals.append(best_value_w_uncertainty)
        if print_output:
            print(f"SL iter #{i+1}, best predicted (value, uncertainty) = {best_value_w_uncertainty}")

        # Update dataset w/ new candidates
        new_x_vals = []
        for material in candidates:
            new_x_vals.append(np.array(
                [float(material["descriptor_values"][x]) for x in input_properties]
            ))

        temp_dataset_fpath = f"design-{design_id}.json"
        write_dataset_from_func(true_function, temp_dataset_fpath, new_x_vals)
        upload_data_and_get_id(
            client,
            "", # No name needed for updating a dataset
            temp_dataset_fpath,
            given_dataset_id=dataset_id
        )

        _wait_on_ingest(client, dataset_id, wait_time, print_output)

        if print_output:
            print(f"Dataset updated: {len(new_x_vals)} candidates added")

        query_dataset = PifSystemReturningQuery(
            size=9999,
            query=DataQuery(
                dataset=DatasetQuery(id=Filter(equal=str(dataset_id)))))
        query_result = client.search.pif_search(query_dataset)

        if print_output:
            print(f"New dataset contains {query_result.total_num_hits} PIFs")

        # Update measured values in new dataset
        dataset_y_values = []
        for hit in query_result.hits:
            # Assumes the last property on each PIF is the output (this script's convention)
            dataset_y_values.append(
                float(hit.system.properties[-1].scalars[0].value)
            )

        if target[1] == "Min":
            best_sl_measured_vals.append(min(dataset_y_values))
        else:
            best_sl_measured_vals.append(max(dataset_y_values))

        # Retrain model w/ wait times
        client.data_views.retrain(view_id)
        _wait_on_data_view(client, dataset_id, view_id, wait_time, print_output)

    if print_output:
        print("SL finished!\n")

    return (best_sl_pred_vals, best_sl_measured_vals)
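
A sketch of how this driver might be invoked, assuming the data view and dataset already exist on Citrination; every ID, property name, and the toy objective below are placeholders.

import os
import numpy as np
from citrination_client import CitrinationClient

# Hypothetical invocation: IDs, property names, and the toy ground-truth
# function are placeholders.
client = CitrinationClient(api_key=os.environ["CITRINATION_API_KEY"])
pred_vals, measured_vals = run_sequential_learning(
    client=client, view_id=1234, dataset_id=5678,
    num_candidates_per_iter=10, design_effort=5, wait_time=60,
    num_sl_iterations=3,
    input_properties=["Property x0", "Property x1"],
    target=["Property y", "Max"],
    print_output=True,
    true_function=lambda x: float(np.sum(x ** 2)),  # toy ground truth
    score_type="MEI")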
Example #4
def load_data_zT():
    results_dir = setResDir()

    ## Metadata
    keys_response = [
        'Seebeck coefficient; squared', 'Electrical resistivity',
        'Thermal conductivity'
    ]
    sign = np.array([
        +1,  # Seebeck coefficient (squared)
        -1,  # Electrical resistivity
        -1   # Thermal conductivity
    ])

    ## Load data, if possible
    # --------------------------------------------------
    try:
        df_X_all = pd.read_csv(results_dir + file_features)
        X_all = df_X_all.drop(df_X_all.columns[0], axis=1).values

        df_Y_all = pd.read_csv(results_dir + file_responses)
        Y_all = df_Y_all.drop(df_Y_all.columns[0], axis=1).values
        print("Cached data loaded.")

    except FileNotFoundError:
        ## Data Import
        # --------------------------------------------------
        # Initialize client
        print("Accessing data from Citrination...")
        site = 'https://citrination.com'  # Citrination
        client = CitrinationClient(api_key=os.environ['CITRINATION_API_KEY'],
                                   site=site)
        search_client = client.search
        # UCSB room-temperature thermoelectrics dataset
        dataset_id = 178480  # ucsb_te_roomtemp_seebeck
        system_query = PifSystemReturningQuery(
            size=1000,
            query=DataQuery(dataset=DatasetQuery(id=Filter(
                equal=str(dataset_id)))))

        query_result = search_client.pif_search(system_query)
        print("    Found {} PIFs in dataset {}.".format(
            query_result.total_num_hits, dataset_id))

        ## Wrangle
        # --------------------------------------------------
        pifs = [x.system for x in query_result.hits]
        # Utility function will tabularize PIFs
        df_response = pifs2df(pifs)
        # Down-select columns to play well with to_numeric
        df_response = df_response[[
            'Seebeck coefficient', 'Electrical resistivity',
            'Thermal conductivity'
        ]]
        df_response = df_response.apply(pd.to_numeric)

        # Parse chemical compositions
        formulas = [pif.chemical_formula for pif in pifs]

        df_comp = pd.DataFrame(columns=['chemical_formula'], data=formulas)

        # Join
        df_data = pd.concat([df_comp, df_response], axis=1)
        print("    Accessed data.")

        # Featurize
        print("Featurizing data...")
        df_data['composition'] = df_data['chemical_formula'].apply(
            get_compostion)

        f = MultipleFeaturizer([
            cf.Stoichiometry(),
            cf.ElementProperty.from_preset("magpie"),
            cf.ValenceOrbital(props=['avg']),
            cf.IonProperty(fast=True)
        ])

        X = np.array(f.featurize_many(df_data['composition']))

        # Find valid response values
        keys_original = [
            'Seebeck coefficient', 'Electrical resistivity',
            'Thermal conductivity'
        ]

        index_valid_response = {
            key: df_data[key].dropna().index.values
            for key in keys_original
        }

        index_valid_all = df_data[keys_original].dropna().index.values
        X_all = X[index_valid_all, :]
        Y_all = df_data[keys_original].iloc[index_valid_all].values

        # Manipulate columns for proper objective values
        Y_all[:, 0] = Y_all[:, 0]**2  # squared Seebeck coefficient
        print("    Data prepared; {0:} valid observations.".format(
            X_all.shape[0]))

        # Cache data
        pd.DataFrame(data=X_all).to_csv(results_dir + file_features)
        pd.DataFrame(data=Y_all, columns=keys_response).to_csv(results_dir +
                                                               file_responses)
        print("Data cached in results directory.")

    return X_all, Y_all, sign, keys_response, prefix
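
load_data_zT leans on several module-level names (setResDir, file_features, file_responses, prefix) plus helpers such as pifs2df. A minimal sketch of that configuration, with placeholder values:

# Hypothetical module-level configuration assumed by load_data_zT; all values
# are placeholders, and helpers like pifs2df must be defined elsewhere.
file_features = "features.csv"
file_responses = "responses.csv"
prefix = "zT"

def setResDir():
    return "./results/"  # directory where cached CSVs are written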
Example #5
    def get_data(self,
                 formula=None,
                 prop=None,
                 data_type=None,
                 reference=None,
                 min_measurement=None,
                 max_measurement=None,
                 from_record=None,
                 data_set_id=None,
                 max_results=None):
        """
        Gets raw api data from Citrine in json format. See api_link for more
        information on input parameters

        Args:
            formula: (str) filter for the chemical formula field; only those
                results that have chemical formulas that contain this string
                will be returned
            prop: (str) name of the property to search for
            data_type: (str) 'EXPERIMENTAL'/'COMPUTATIONAL'/'MACHINE_LEARNING';
                filter for properties obtained from experimental work,
                computational methods, or machine learning.
            reference: (str) filter for the reference field; only those
                results that have contributors that contain this string
                will be returned
            min_measurement: (str/num) minimum of the property value range
            max_measurement: (str/num) maximum of the property value range
            from_record: (int) index of first record to return (indexed from 0)
            data_set_id: (int) id of the particular data set to search on
            max_results: (int) number of records to limit the results to

        Returns: (list) of JSON records/PIFs returned by Citrine's API
        """

        json_data = []
        start = from_record if from_record else 0
        per_page = 100
        refresh_time = 3  # seconds to wait between search calls

        # Construct all of the relevant queries from input args
        formula_query = ChemicalFieldQuery(filter=ChemicalFilter(
            equal=formula))
        prop_query = PropertyQuery(
            name=FieldQuery(filter=Filter(equal=prop)),
            value=FieldQuery(
                filter=Filter(min=min_measurement, max=max_measurement)),
            data_type=FieldQuery(filter=Filter(equal=data_type)))
        ref_query = ReferenceQuery(doi=FieldQuery(filter=Filter(
            equal=reference)))

        system_query = PifSystemQuery(chemical_formula=formula_query,
                                      properties=prop_query,
                                      references=ref_query)
        dataset_query = DatasetQuery(id=Filter(equal=data_set_id))
        data_query = DataQuery(system=system_query, dataset=dataset_query)

        while True:
            # Request a smaller page when the overall limit is below the page
            # size (e.g. max_results=68 < 100)
            if max_results and max_results < per_page:
                pif_query = PifSystemReturningQuery(query=data_query,
                                                    from_index=start,
                                                    size=max_results)
            else:
                pif_query = PifSystemReturningQuery(query=data_query,
                                                    from_index=start,
                                                    size=per_page)

            # Run the search once and reuse the result rather than issuing
            # the same API call twice
            result = self.client.search.pif_search(pif_query).as_dictionary()
            if "hits" not in result:
                raise KeyError("No results found!")

            data = result["hits"]
            size = len(data)
            start += size
            json_data.extend(data)

            # Truncate to the requested number of records once the limit is hit
            if max_results and len(json_data) > max_results:
                json_data = json_data[:max_results]
                break
            if size < per_page:  # last page of results reached
                break
            time.sleep(refresh_time)
        return json_data
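
This method matches the shape of matminer's CitrineDataRetrieval.get_data; a hedged usage sketch, where the class name and search values are assumptions:

# Hypothetical usage: the wrapper class and search values are placeholders.
cdr = CitrineDataRetrieval(api_key="YOUR_API_KEY")
hits = cdr.get_data(formula="Si", prop="Band gap",
                    data_type="EXPERIMENTAL", max_results=50)
print("Retrieved {} records".format(len(hits)))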