def get_data(self,
             formula=None,
             prop=None,
             data_type=None,
             reference=None,
             min_measurement=None,
             max_measurement=None,
             from_record=None,
             data_set_id=None,
             max_results=None):
        """
        Gets raw api data from Citrine in json format. See api_link for more
        information on input parameters

        Args:
            formula: (str) filter for the chemical formula field; only those
                results that have chemical formulas that contain this string
                will be returned
            prop: (str) name of the property to search for
            data_type: (str) 'EXPERIMENTAL'/'COMPUTATIONAL'/'MACHINE_LEARNING';
                filter for properties obtained from experimental work,
                computational methods, or machine learning.
            reference: (str) filter for the reference field; only those
                results that have contributors that contain this string
                will be returned
            min_measurement: (str/num) minimum of the property value range
            max_measurement: (str/num) maximum of the property value range
            from_record: (int) index of first record to return (indexed from 0)
            data_set_id: (int) id of the particular data set to search on
            max_results: (int) number of records to limit the results to

        Returns: (list) of jsons/pifs returned by Citrine's API

        Raises:
            KeyError: if the API response contains no "hits" key
                (no results found).
        """

        json_data = []
        start = from_record if from_record else 0
        per_page = 100
        refresh_time = 3  # seconds to wait between search calls

        # Construct all of the relevant queries from input args
        formula_query = ChemicalFieldQuery(filter=ChemicalFilter(
            equal=formula))
        prop_query = PropertyQuery(
            name=FieldQuery(filter=Filter(equal=prop)),
            value=FieldQuery(
                filter=Filter(min=min_measurement, max=max_measurement)),
            data_type=FieldQuery(filter=Filter(equal=data_type)))
        ref_query = ReferenceQuery(doi=FieldQuery(filter=Filter(
            equal=reference)))

        system_query = PifSystemQuery(chemical_formula=formula_query,
                                      properties=prop_query,
                                      references=ref_query)
        dataset_query = DatasetQuery(id=Filter(equal=data_set_id))
        data_query = DataQuery(system=system_query, dataset=dataset_query)

        while True:
            # Request only as many records as are still needed, capped at
            # per_page (e.g. max_results=68 -> one page of 68).
            if max_results:
                page_size = min(per_page, max_results - len(json_data))
            else:
                page_size = per_page
            pif_query = PifSystemReturningQuery(query=data_query,
                                                from_index=start,
                                                size=page_size)

            # Issue the search exactly once per page; the previous version
            # ran the identical query twice (once just to probe for "hits").
            result = self.client.search.pif_search(pif_query).as_dictionary()
            if "hits" not in result:
                raise KeyError("No results found!")

            data = result["hits"]
            size = len(data)
            start += size
            json_data.extend(data)

            # check if limit is reached (>= avoids one extra API round trip
            # when the limit lands exactly on a page boundary)
            if max_results and len(json_data) >= max_results:
                json_data = json_data[:max_results]
                break
            if size < page_size:  # short page -> no more results
                break
            time.sleep(refresh_time)
        return json_data
# Example 2
def run_sequential_learning(client, view_id, dataset_id,
                            num_candidates_per_iter, design_effort, wait_time,
                            num_sl_iterations, input_properties, target,
                            print_output, true_function, score_type):
    '''Runs SL design

    Each iteration: submit a design run, pick the best predicted candidate,
    evaluate candidates with ``true_function``, upload them to the dataset,
    record the best measured value, then retrain the data view.

    :param client: Client object
    :type client: CitrinationClient
    :param view_id: View ID
    :type view_id: int
    :param dataset_id: Dataset ID
    :type dataset_id: int
    :param num_candidates_per_iter: Candidates in a batch
    :type num_candidates_per_iter: int
    :param design_effort: Effort from 1-30
    :type design_effort: int
    :param wait_time: Wait time in seconds before polling API
    :type wait_time: int
    :param num_sl_iterations: SL iterations to run
    :type num_sl_iterations: int
    :param input_properties: Inputs
    :type input_properties: List[str]
    :param target: ("Output property", {"Min", "Max"})
    :type target: List[str]
    :param print_output: Whether or not to print outputs
    :type print_output: bool
    :param true_function: Actual function for evaluating measured/true values
    :type true_function: Callable[[np.ndarray], float]
    :param score_type: MLI or MEI (anything other than "MEI" is treated
        as MLI)
    :type score_type: str
    :return: 2-tuple: list of predicted scores/uncertainties; list of measured scores/uncertainties
    :rtype: Tuple[List[float], List[float]]
    '''
    import os  # local import: only needed for temp-file cleanup below

    best_sl_pred_vals = []
    best_sl_measured_vals = []

    _wait_on_ingest(client, dataset_id, wait_time, print_output)

    for i in range(num_sl_iterations):
        if print_output:
            print("\n---STARTING SL ITERATION #{}---".format(i + 1))

        # Ensure both the dataset ingest and the data view are ready
        # before submitting a design run.
        _wait_on_ingest(client, dataset_id, wait_time, print_output)
        _wait_on_data_view(client, dataset_id, view_id, wait_time,
                           print_output)

        # Submit a design run
        design_id = client.models.submit_design_run(
            data_view_id=view_id,
            num_candidates=num_candidates_per_iter,
            effort=design_effort,
            target=Target(*target),
            constraints=[],
            sampler="Default").uuid

        if print_output:
            print("Created design run with ID {}".format(design_id))

        _wait_on_design_run(client, design_id, view_id, wait_time,
                            print_output)

        # Compute the best values with uncertainties as a list of (value, uncertainty)
        # MEI -> exploit (best materials); otherwise MLI -> explore
        # (next experiments).
        if score_type == "MEI":
            candidates = client.models.get_design_run_results(
                view_id, design_id).best_materials
        else:
            candidates = client.models.get_design_run_results(
                view_id, design_id).next_experiments
        values_w_uncertainties = [
            (m["descriptor_values"][target[0]],
             m["descriptor_values"]["Uncertainty in {}".format(target[0])])
            for m in candidates
        ]

        # Find and save the best predicted value (compare on value only,
        # not uncertainty)
        if target[1] == "Min":
            best_value_w_uncertainty = min(values_w_uncertainties,
                                           key=lambda x: x[0])
        else:
            best_value_w_uncertainty = max(values_w_uncertainties,
                                           key=lambda x: x[0])

        best_sl_pred_vals.append(best_value_w_uncertainty)
        if print_output:
            print(
                "SL iter #{}, best predicted (value, uncertainty) = {}".format(
                    i + 1, best_value_w_uncertainty))

        # Update dataset w/ new candidates
        new_x_vals = []
        for material in candidates:
            new_x_vals.append(
                np.array([
                    float(material["descriptor_values"][x])
                    for x in input_properties
                ]))

        temp_dataset_fpath = "design-{}.json".format(design_id)
        write_dataset_from_func(true_function, temp_dataset_fpath, new_x_vals)
        upload_data_and_get_id(
            client,
            "",  # No name needed for updating a dataset
            temp_dataset_fpath,
            given_dataset_id=dataset_id)

        # Fix: the temp file was previously leaked (one per iteration);
        # remove it once the upload call has returned. Best-effort — a
        # failed removal should not abort the SL loop.
        try:
            os.remove(temp_dataset_fpath)
        except OSError:
            pass

        _wait_on_ingest(client, dataset_id, wait_time, print_output)

        if print_output:
            print("Dataset updated: {} candidates added.".format(
                len(new_x_vals)))

        query_dataset = PifSystemReturningQuery(
            size=9999,
            query=DataQuery(dataset=DatasetQuery(id=Filter(
                equal=str(dataset_id)))))
        query_result = client.search.pif_search(query_dataset)

        if print_output:
            print("New dataset contains {} PIFs.".format(
                query_result.total_num_hits))

        # Update measured values in new dataset
        dataset_y_values = []
        for hit in query_result.hits:
            # Assume last prop is output if following this script
            dataset_y_values.append(
                float(hit.system.properties[-1].scalars[0].value))

        if target[1] == "Min":
            best_sl_measured_vals.append(min(dataset_y_values))
        else:
            best_sl_measured_vals.append(max(dataset_y_values))

        # Retrain model w/ wait times
        client.models.retrain(view_id)
        _wait_on_data_view(client, dataset_id, view_id, wait_time,
                           print_output)

    if print_output:
        print("SL finished!\n")

    return (best_sl_pred_vals, best_sl_measured_vals)