def get_data(self, formula=None, prop=None, data_type=None, reference=None,
             min_measurement=None, max_measurement=None, from_record=None,
             data_set_id=None, max_results=None):
    """
    Gets raw api data from Citrine in json format. See api_link for more
    information on input parameters.

    Args:
        formula: (str) filter for the chemical formula field; only those
            results that have chemical formulas that contain this string
            will be returned
        prop: (str) name of the property to search for
        data_type: (str) 'EXPERIMENTAL'/'COMPUTATIONAL'/'MACHINE_LEARNING';
            filter for properties obtained from experimental work,
            computational methods, or machine learning.
        reference: (str) filter for the reference field; only those results
            that have contributors that contain this string will be returned
        min_measurement: (str/num) minimum of the property value range
        max_measurement: (str/num) maximum of the property value range
        from_record: (int) index of first record to return (indexed from 0)
        data_set_id: (int) id of the particular data set to search on
        max_results: (int) number of records to limit the results to

    Returns:
        (list) of jsons/pifs returned by Citrine's API

    Raises:
        KeyError: if the API response contains no "hits" key (no results).
    """
    json_data = []
    start = from_record if from_record else 0
    per_page = 100
    refresh_time = 3  # seconds to wait between search calls (be polite to the API)

    # Construct all of the relevant queries from input args
    formula_query = ChemicalFieldQuery(filter=ChemicalFilter(
        equal=formula))
    prop_query = PropertyQuery(
        name=FieldQuery(filter=Filter(equal=prop)),
        value=FieldQuery(
            filter=Filter(min=min_measurement, max=max_measurement)),
        data_type=FieldQuery(filter=Filter(equal=data_type)))
    ref_query = ReferenceQuery(doi=FieldQuery(filter=Filter(
        equal=reference)))
    system_query = PifSystemQuery(chemical_formula=formula_query,
                                  properties=prop_query,
                                  references=ref_query)
    dataset_query = DatasetQuery(id=Filter(equal=data_set_id))
    data_query = DataQuery(system=system_query, dataset=dataset_query)

    while True:
        # Shrink the page size to max_results when the caller's limit is
        # below the standard page, e.g. max_results=68 < 100.
        if max_results and max_results < per_page:
            page_size = max_results
        else:
            page_size = per_page
        pif_query = PifSystemReturningQuery(query=data_query,
                                            from_index=start,
                                            size=page_size)

        # Issue the search exactly once per page and reuse the parsed
        # response. (Previously the identical API call was made twice:
        # once to check for "hits" and again to read them.)
        response = self.client.search.pif_search(pif_query).as_dictionary()
        if "hits" not in response:
            raise KeyError("No results found!")
        data = response["hits"]
        size = len(data)
        start += size
        json_data.extend(data)

        # Stop as soon as the caller-imposed limit is reached. Using >=
        # (rather than >) avoids fetching one redundant extra page when the
        # accumulated count lands exactly on max_results.
        if max_results and len(json_data) >= max_results:
            json_data = json_data[:max_results]
            break
        if size < per_page:  # short (or empty) page means no more results
            break
        time.sleep(refresh_time)

    return json_data
def run_sequential_learning(client, view_id, dataset_id,
                            num_candidates_per_iter, design_effort, wait_time,
                            num_sl_iterations, input_properties, target,
                            print_output, true_function, score_type):
    '''Runs SL design

    Each iteration: submit a design run, pick the best predicted candidate,
    evaluate all candidates with ``true_function``, append them to the
    dataset, record the best measured value so far, and retrain the model.

    :param client: Client object
    :type client: CitrinationClient
    :param view_id: View ID
    :type view_id: int
    :param dataset_id: Dataset ID
    :type dataset_id: int
    :param num_candidates_per_iter: Candidates in a batch
    :type num_candidates_per_iter: int
    :param design_effort: Effort from 1-30
    :type design_effort: int
    :param wait_time: Wait time in seconds before polling API
    :type wait_time: int
    :param num_sl_iterations: SL iterations to run
    :type num_sl_iterations: int
    :param input_properties: Inputs
    :type input_properties: List[str]
    :param target: ("Output property", {"Min", "Max"})
    :type target: List[str]
    :param print_output: Whether or not to print outputs
    :type print_output: bool
    :param true_function: Actual function for evaluating measured/true values
    :type true_function: Callable[[np.ndarray], float]
    :param score_type: MLI or MEI
    :type score_type: str
    :return: 2-tuple: list of predicted scores/uncertainties; list of
        measured scores/uncertainties
    :rtype: Tuple[List[float], List[float]]
    '''
    best_sl_pred_vals = []      # per-iteration best (value, uncertainty) predicted by design
    best_sl_measured_vals = []  # per-iteration best measured value over the whole dataset
    # Block until any pending ingest on the dataset settles before starting.
    _wait_on_ingest(client, dataset_id, wait_time, print_output)
    for i in range(num_sl_iterations):
        if print_output:
            print("\n---STARTING SL ITERATION #{}---".format(i + 1))
        # Ensure both the dataset ingest and the data view are ready
        # before submitting a new design run.
        _wait_on_ingest(client, dataset_id, wait_time, print_output)
        _wait_on_data_view(client, dataset_id, view_id, wait_time,
                           print_output)

        # Submit a design run
        design_id = client.models.submit_design_run(
            data_view_id=view_id,
            num_candidates=num_candidates_per_iter,
            effort=design_effort,
            target=Target(*target),
            constraints=[],
            sampler="Default").uuid
        if print_output:
            print("Created design run with ID {}".format(design_id))
        # Poll until the design run completes.
        _wait_on_design_run(client, design_id, view_id, wait_time,
                            print_output)

        # Compute the best values with uncertainties as a list of (value,
        # uncertainty).
        # NOTE(review): MEI reads best_materials, anything else (e.g. MLI)
        # reads next_experiments — confirm this mapping matches the intended
        # score-type semantics of the Citrine design API.
        if score_type == "MEI":
            candidates = client.models.get_design_run_results(
                view_id, design_id).best_materials
        else:
            candidates = client.models.get_design_run_results(
                view_id, design_id).next_experiments
        values_w_uncertainties = [
            (m["descriptor_values"][target[0]],
             m["descriptor_values"]["Uncertainty in {}".format(target[0])])
            for m in candidates
        ]

        # Find and save the best predicted value (min or max of the
        # predicted values, depending on the optimization direction).
        if target[1] == "Min":
            best_value_w_uncertainty = min(values_w_uncertainties,
                                           key=lambda x: x[0])
        else:
            best_value_w_uncertainty = max(values_w_uncertainties,
                                           key=lambda x: x[0])
        best_sl_pred_vals.append(best_value_w_uncertainty)
        if print_output:
            print(
                "SL iter #{}, best predicted (value, uncertainty) = {}".format(
                    i + 1, best_value_w_uncertainty))

        # Update dataset w/ new candidates: pull each candidate's input
        # descriptor values into a numeric vector for true_function.
        new_x_vals = []
        for material in candidates:
            new_x_vals.append(
                np.array([
                    float(material["descriptor_values"][x])
                    for x in input_properties
                ]))
        # NOTE(review): this temp file is written every iteration and never
        # removed afterwards — confirm whether leaving design-<id>.json on
        # disk is intentional.
        temp_dataset_fpath = "design-{}.json".format(design_id)
        write_dataset_from_func(true_function, temp_dataset_fpath, new_x_vals)
        upload_data_and_get_id(
            client,
            "",  # No name needed for updating a dataset
            temp_dataset_fpath,
            given_dataset_id=dataset_id)
        _wait_on_ingest(client, dataset_id, wait_time, print_output)
        if print_output:
            print("Dataset updated: {} candidates added.".format(
                len(new_x_vals)))

        # Re-query the full dataset (up to 9999 PIFs) to read back all
        # measured values, including the freshly uploaded candidates.
        query_dataset = PifSystemReturningQuery(
            size=9999,
            query=DataQuery(dataset=DatasetQuery(id=Filter(
                equal=str(dataset_id)))))
        query_result = client.search.pif_search(query_dataset)
        if print_output:
            print("New dataset contains {} PIFs.".format(
                query_result.total_num_hits))

        # Update measured values in new dataset
        dataset_y_values = []
        for hit in query_result.hits:
            # Assume last prop is output if following this script
            dataset_y_values.append(
                float(hit.system.properties[-1].scalars[0].value))
        # Record the running best measured value (direction per target[1]).
        if target[1] == "Min":
            best_sl_measured_vals.append(min(dataset_y_values))
        else:
            best_sl_measured_vals.append(max(dataset_y_values))

        # Retrain model w/ wait times
        client.models.retrain(view_id)
        _wait_on_data_view(client, dataset_id, view_id, wait_time,
                           print_output)
    if print_output:
        print("SL finished!\n")
    return (best_sl_pred_vals, best_sl_measured_vals)