class CitrineFeatureGeneration(object):
    """
    Class to generate new features using Citrine data and a dataframe containing
    material compositions.

    The input dataframe must contain a column named "Material compositions".

    Attributes:
        configdict <dict> : MASTML configfile object as dict
        dataframe <pandas dataframe> : dataframe containing x and y data and feature names
        api_key <str> : your Citrination API key

    Methods:
        generate_citrine_features : generates Citrine feature set based on compositions in dataframe
            args:
                save_to_csv <bool> : whether to save the magpie feature set to a csv file
            returns:
                dataframe <pandas dataframe> : dataframe containing magpie feature set
    """

    def __init__(self, configdict, dataframe, api_key):
        self.configdict = configdict
        self.dataframe = dataframe
        self.api_key = api_key
        # Single client reused for every composition query issued by this instance
        self.client = CitrinationClient(api_key, 'https://citrination.com')

    @timeit
    def generate_citrine_features(self, save_to_csv=True):
        """Build per-composition min/max/avg Citrine property features and merge
        them onto the input dataframe.

        Args:
            save_to_csv (bool): whether to also write the merged dataframe to csv

        Returns:
            pandas dataframe: input dataframe plus the new Citrine feature columns
        """
        logging.info(
            'WARNING: You have specified generation of features from Citrine. Based on which materials you are'
            'interested in, there may be many records to parse through, thus this routine may take a long time to complete!'
        )
        compositions = self.dataframe['Material compositions'].tolist()
        citrine_dict_property_min = dict()
        citrine_dict_property_max = dict()
        citrine_dict_property_avg = dict()
        for composition in compositions:
            pifquery = self._get_pifquery(composition=composition)
            property_name_list, property_value_list = self._get_pifquery_property_list(
                pifquery=pifquery)
            property_names_unique, parsed_property_min, parsed_property_max, parsed_property_avg = \
                self._parse_pifquery_property_list(
                    property_name_list=property_name_list,
                    property_value_list=property_value_list)
            citrine_dict_property_min[composition] = parsed_property_min
            citrine_dict_property_max[composition] = parsed_property_max
            citrine_dict_property_avg[composition] = parsed_property_avg

        dataframe = self.dataframe
        citrine_dict_list = [
            citrine_dict_property_min, citrine_dict_property_max,
            citrine_dict_property_avg
        ]
        for citrine_dict in citrine_dict_list:
            dataframe_citrine = pd.DataFrame.from_dict(data=citrine_dict, orient='index')
            # Need to reorder compositions in new dataframe to match input dataframe
            dataframe_citrine = dataframe_citrine.reindex(
                self.dataframe['Material compositions'].tolist())
            # Need to make compositions the first column, instead of the row names
            dataframe_citrine.index.name = 'Material compositions'
            dataframe_citrine.reset_index(inplace=True)
            # Need to delete duplicate column before merging dataframes
            del dataframe_citrine['Material compositions']
            # Merge magpie feature dataframe with originally supplied dataframe
            dataframe = DataframeUtilities().merge_dataframe_columns(
                dataframe1=dataframe, dataframe2=dataframe_citrine)

        if save_to_csv:  # was "save_to_csv == bool(True)"; plain truthiness is equivalent
            # Get y_feature in this dataframe, attach it to save path.
            # filetag defaults to '' so the csv is still written even when no
            # column matches the configured target_feature (previously this
            # raised UnboundLocalError on str(filetag)).
            filetag = ''
            for column in dataframe.columns.values:
                if column in self.configdict['General Setup']['target_feature']:
                    filetag = column
            dataframe.to_csv(self.configdict['General Setup']['save_path'] +
                             "/" + 'input_with_citrine_features' + '_' +
                             str(filetag) + '.csv',
                             index=False)
        return dataframe

    def _get_pifquery(self, composition):
        """Run a PIF search for one chemical composition and return the hits.

        Raises:
            KeyError: if the search response contains no 'hits' key.
        """
        pif_query = PifQuery(system=SystemQuery(
            chemical_formula=ChemicalFieldQuery(filter=ChemicalFilter(
                equal=composition))))
        # Issue the search once and reuse the response; previously the same
        # network query was sent twice (once to check for hits, once to fetch).
        response = self.client.search(pif_query).as_dictionary()
        # Check if any results found
        if 'hits' not in response:
            raise KeyError('No results found!')
        return response['hits']

    def _get_pifquery_property_list(self, pifquery):
        """Extract accepted property names and their first scalar values from hits.

        Returns:
            (list, list): parallel lists of property names and float values.
        """
        property_name_list = list()
        property_value_list = list()
        # Substrings that mark a property name as one we want to keep
        accepted_properties_list = [
            'mass', 'space group', 'band', 'Band', 'energy', 'volume',
            'density', 'dielectric', 'Dielectric', 'Enthalpy', 'Convex',
            'Magnetization', 'Elements', 'Modulus', 'Shear', "Poisson's",
            'Elastic', 'Energy'
        ]
        for result_number, results in enumerate(pifquery):
            for system_heading, system_value in results.items():
                if system_heading != 'system':
                    continue
                for property_name, property_value in system_value.items():
                    if property_name != 'properties':
                        continue
                    for list_index, list_element in enumerate(property_value):
                        for name, value in property_value[list_index].items():
                            # Only the 'name' field matters; skip CIF file entries
                            if name != 'name' or value == "CIF File":
                                continue
                            # Check that the property name is in the acceptable property list
                            for entry in accepted_properties_list:
                                if entry not in value:
                                    continue
                                property_name_list.append(value)
                                try:
                                    property_value_list.append(float(
                                        property_value[list_index]['scalars'][0]['value']))
                                except (ValueError, KeyError):
                                    # Value missing or non-numeric: drop the name just added
                                    property_name_list.pop(-1)
                                    continue
        return property_name_list, property_value_list

    def _parse_pifquery_property_list(self, property_name_list, property_value_list):
        """Aggregate the parallel name/value lists into per-property min/max/avg dicts.

        Raises:
            IndexError: if the two lists have different lengths.
        """
        parsed_property_max = dict()
        parsed_property_min = dict()
        parsed_property_avg = dict()
        property_names_unique = list()
        if len(property_name_list) != len(property_value_list):
            print(
                'Error! Length of property name and property value lists are not the same. There must be a bug in the _get_pifquerey_property_list method'
            )
            # Raise instead of sys.exit() so library callers can handle the error;
            # matches the behavior of the newer CitrineFeatureGeneration class.
            raise IndexError(
                "property_name_list and property_value_list are not the same size."
            )
        # Get unique property names, preserving first-seen order
        for name in property_name_list:
            if name not in property_names_unique:
                property_names_unique.append(name)
        for unique_name in property_names_unique:
            unique_property = list()
            unique_property_avg = 0
            count = 0
            for i, name in enumerate(property_name_list):
                # Only include property values whose name matches unique_name
                if name == unique_name:
                    count += 1  # count how many instances of the same property occur
                    unique_property_avg += property_value_list[i]
                    unique_property.append(property_value_list[i])
            parsed_property_min[str(unique_name) + "_min"] = min(unique_property)
            parsed_property_max[str(unique_name) + "_max"] = max(unique_property)
            parsed_property_avg[str(unique_name) + "_avg"] = unique_property_avg / count
        return property_names_unique, parsed_property_min, parsed_property_max, parsed_property_avg
class CitrineDataRetrieval:
    """
    CitrineDataRetrieval is used to retrieve data from the Citrination database.
    See API client docs at http://citrineinformatics.github.io/api-documentation/
    """

    def __init__(self, api_key=None):
        """
        Args:
            api_key: (str) Your Citrine API key, or None if you've set the
                CITRINE_KEY environment variable

        Returns: None
        """
        api_key = api_key if api_key else os.environ["CITRINE_KEY"]
        self.client = CitrinationClient(api_key, "https://citrination.com")

    def get_dataframe(self, formula=None, prop=None, data_type=None,
                      reference=None, min_measurement=None, max_measurement=None,
                      from_record=None, data_set_id=None, max_results=None,
                      show_columns=None):
        """
        Gets a Pandas dataframe object from data retrieved from the Citrine API.
        See client docs at http://citrineinformatics.github.io/api-documentation/
        for more details on input parameters.

        Args:
            formula: (str) filter for the chemical formula field; only those
                results that have chemical formulas that contain this string
                will be returned
            prop: (str) name of the property to search for
            data_type: (str) 'EXPERIMENTAL'/'COMPUTATIONAL'/'MACHINE_LEARNING';
                filter for properties obtained from experimental work,
                computational methods, or machine learning.
            reference: (str) filter for the reference field; only those results
                that have contributors that contain this string will be returned
            min_measurement: (str/num) minimum of the property value range
            max_measurement: (str/num) maximum of the property value range
            from_record: (int) index of first record to return (indexed from 0)
            data_set_id: (int) id of the particular data set to search on
            max_results: (int) number of records to limit the results to
            show_columns: (list) list of columns to show from the resulting dataframe

        Returns:
            (object) Pandas dataframe object containing the results
        """
        # Get all of the jsons from client
        jsons = self.get_api_data(formula=formula, prop=prop,
                                  data_type=data_type, reference=reference,
                                  min_measurement=min_measurement,
                                  max_measurement=max_measurement,
                                  from_record=from_record,
                                  data_set_id=data_set_id,
                                  max_results=max_results)

        non_prop_df = pd.DataFrame()  # df w/o measurement column
        prop_df = pd.DataFrame()  # df containing only measurement column

        counter = 0  # variable to keep count of sample hit and set indexes
        for hit in tqdm(jsons):
            counter += 1  # Keep a count to appropriately index the rows
            if "system" not in hit:  # Check if 'system' key exists, else skip
                continue
            system_value = hit["system"]
            system_normdf = pd.json_normalize(system_value)
            # Make a DF of all non-'properties' fields
            non_prop_cols = [cols for cols in system_normdf.columns
                             if "properties" not in cols]
            non_prop_row = pd.DataFrame()
            for col in non_prop_cols:
                non_prop_row[col] = system_normdf[col]
            non_prop_row.index = [counter] * len(system_normdf)
            # DataFrame.append was removed from pandas; concat is the replacement
            non_prop_df = pd.concat([non_prop_df, non_prop_row])
            # Make a DF of the 'properties' array
            if "properties" in system_value:
                p_df = pd.DataFrame()
                # Rename duplicate property names in a record with progressive numbering
                all_prop_names = [x["name"] for x in system_value["properties"]]
                counts = {k: v for k, v in Counter(all_prop_names).items() if v > 1}
                # Walk backwards so the last duplicate gets the highest suffix
                for i in reversed(range(len(all_prop_names))):
                    item = all_prop_names[i]
                    if item in counts and counts[item]:
                        all_prop_names[i] += "_" + str(counts[item])
                        counts[item] -= 1
                # add each property, and its associated fields, as a new column
                # (loop variable renamed from 'prop', which shadowed the parameter)
                for p_idx, prop_entry in enumerate(system_value["properties"]):
                    # Rename property name according to above duplicate numbering
                    prop_entry["name"] = all_prop_names[p_idx]
                    if "scalars" in prop_entry:
                        # .at supports setting-with-enlargement and replaces the
                        # removed DataFrame.set_value
                        p_df.at[counter, prop_entry["name"]] = parse_scalars(prop_entry["scalars"])
                    elif "vectors" in prop_entry:
                        p_df[prop_entry["name"]] = prop_entry["vectors"]
                    elif "matrices" in prop_entry:
                        p_df[prop_entry["name"]] = prop_entry["matrices"]
                    # parse all keys in the Property object except 'name',
                    # 'scalars', 'vectors', and 'matrices'
                    for prop_key in prop_entry:
                        if prop_key in ["name", "scalars", "vectors", "matrices"]:
                            continue
                        # If value is a list of multiple items, set the cell to the
                        # entire list by first converting to object type, else
                        # results in a ValueError/IndexError
                        col_name = prop_entry["name"] + "-" + prop_key
                        if type(prop_entry[prop_key]) == list and len(prop_entry[prop_key]) > 1:
                            p_df[col_name] = np.nan
                            p_df[col_name] = p_df[col_name].astype(object)
                        p_df.at[counter, col_name] = prop_entry[prop_key]
                p_df.index = [counter]
                prop_df = pd.concat([prop_df, p_df])

        # Concatenate 'properties' and 'non-properties' dataframes
        df = pd.concat([non_prop_df, prop_df], axis=1)
        df.index.name = "system"
        # Remove uninformative columns, such as 'category' and 'uid';
        # errors="ignore" keeps this from raising when a column is absent
        df.drop(["category", "uid"], axis=1, inplace=True, errors="ignore")
        # Filter out columns not selected
        if show_columns:
            df = df[show_columns]
        return df

    def get_api_data(self, formula=None, prop=None, data_type=None,
                     reference=None, min_measurement=None, max_measurement=None,
                     from_record=None, data_set_id=None, max_results=None):
        """
        Gets raw api data from Citrine in json format. See client docs at
        http://citrineinformatics.github.io/api-documentation/ for more details
        on these parameters.

        Args:
            formula: (str) filter for the chemical formula field; only those
                results that have chemical formulas that contain this string
                will be returned
            prop: (str) name of the property to search for
            data_type: (str) 'EXPERIMENTAL'/'COMPUTATIONAL'/'MACHINE_LEARNING';
                filter for properties obtained from experimental work,
                computational methods, or machine learning.
            reference: (str) filter for the reference field; only those results
                that have contributors that contain this string will be returned
            min_measurement: (str/num) minimum of the property value range
            max_measurement: (str/num) maximum of the property value range
            from_record: (int) index of first record to return (indexed from 0)
            data_set_id: (int) id of the particular data set to search on
            max_results: (int) number of records to limit the results to

        Returns:
            (list) of jsons/pifs returned by Citrine's API

        Raises:
            KeyError: if a search response contains no 'hits' key.
        """
        json_data = []
        start = from_record if from_record else 0
        per_page = 100
        refresh_time = 3  # seconds to wait between search calls

        # Construct all of the relevant queries from input args
        formula_query = ChemicalFieldQuery(filter=ChemicalFilter(equal=formula))
        prop_query = PropertyQuery(
            name=FieldQuery(filter=Filter(equal=prop)),
            value=FieldQuery(filter=Filter(min=min_measurement, max=max_measurement)),
            data_type=FieldQuery(filter=Filter(equal=data_type)))
        ref_query = ReferenceQuery(doi=FieldQuery(filter=Filter(equal=reference)))
        system_query = PifSystemQuery(chemical_formula=formula_query,
                                      properties=prop_query,
                                      references=ref_query)
        dataset_query = DatasetQuery(id=Filter(equal=data_set_id))
        data_query = DataQuery(system=system_query, dataset=dataset_query)

        while True:
            # use per_page=max_results, eg: in case of max_results=68 < 100
            if max_results and max_results < per_page:
                size_requested = max_results
            else:
                size_requested = per_page
            pif_query = PifSystemReturningQuery(query=data_query,
                                                from_index=start,
                                                size=size_requested)
            # Issue the search once and reuse the response; previously the same
            # query was sent to the server twice per page.
            response = self.client.search(pif_query).as_dictionary()
            # Check if any results found
            if "hits" not in response:
                raise KeyError("No results found!")
            data = response["hits"]
            size = len(data)
            start += size
            json_data.extend(data)
            # check if limit is reached
            if max_results and len(json_data) > max_results:
                json_data = json_data[:max_results]
                break
            if size < per_page:  # break out of last loop of results
                break
            time.sleep(refresh_time)
        return json_data
class CitrineFeatureGeneration(object):
    """
    Class to generate new features using Citrine data and dataframe containing material compositions

    Dataframe must have a column named "Material compositions".

    Args:
        dataframe (pandas dataframe) : dataframe containing x and y data and feature names
        api_key (str) : your Citrination API key
        composition_feature (str) : name of the dataframe column holding the compositions

    Methods:
        generate_citrine_features : generates Citrine feature set based on compositions in dataframe
            Returns:
                pandas dataframe : dataframe containing magpie feature set
    """

    def __init__(self, dataframe, api_key, composition_feature):
        self.dataframe = dataframe
        self.api_key = api_key
        # Single client reused for every composition query issued by this instance
        self.client = CitrinationClient(api_key, 'https://citrination.com')
        self.composition_feature = composition_feature

    def generate_citrine_features(self):
        """Build per-composition min/max/avg Citrine property features and merge
        them onto the input dataframe.

        Returns:
            pandas dataframe: input dataframe plus the new Citrine feature columns

        Raises:
            utils.MissingColumnError: if the composition column is missing.
        """
        log.warning(
            'WARNING: You have specified generation of features from Citrine. Based on which'
            ' materials you are interested in, there may be many records to parse through, thus'
            ' this routine may take a long time to complete!')
        try:
            compositions = self.dataframe[self.composition_feature].tolist()
        except KeyError as e:
            log.error(f'original python error: {str(e)}')
            # f-prefix was missing here, so the literal text
            # "{self.composition_feature}" was emitted instead of the column name
            raise utils.MissingColumnError(
                f'Error! No column named {self.composition_feature} found in your input data file. '
                'To use this feature generation routine, you must supply a material composition for each data point'
            )
        citrine_dict_property_min = dict()
        citrine_dict_property_max = dict()
        citrine_dict_property_avg = dict()
        # _load_composition keeps each request self-contained so the lookups can
        # be parallelized later; the previous unused multiprocessing.Pool(20)
        # leaked 20 worker processes per call and has been removed.
        result_tuples = map(self._load_composition, compositions)
        for comp, (prop_min, prop_max, prop_avg) in zip(compositions, result_tuples):
            citrine_dict_property_min[comp] = prop_min
            citrine_dict_property_max[comp] = prop_max
            citrine_dict_property_avg[comp] = prop_avg

        dataframe = self.dataframe
        citrine_dict_list = [
            citrine_dict_property_min, citrine_dict_property_max,
            citrine_dict_property_avg
        ]
        for citrine_dict in citrine_dict_list:
            dataframe_citrine = pd.DataFrame.from_dict(data=citrine_dict, orient='index')
            # Need to reorder compositions in new dataframe to match input dataframe
            dataframe_citrine = dataframe_citrine.reindex(
                self.dataframe[self.composition_feature].tolist())
            # Need to make compositions the first column, instead of the row names
            dataframe_citrine.index.name = self.composition_feature
            dataframe_citrine.reset_index(inplace=True)
            # Need to delete duplicate column before merging dataframes
            del dataframe_citrine[self.composition_feature]
            # Merge magpie feature dataframe with originally supplied dataframe
            dataframe = DataframeUtilities().merge_dataframe_columns(
                dataframe1=dataframe, dataframe2=dataframe_citrine)
        return dataframe

    def _load_composition(self, composition):
        """Fetch and parse Citrine properties for a single composition.

        Returns:
            (dict, dict, dict): min, max, and avg feature dicts for the composition.
        """
        pifquery = self._get_pifquery(composition=composition)
        property_name_list, property_value_list = self._get_pifquery_property_list(
            pifquery=pifquery)
        property_names_unique, parsed_property_min, parsed_property_max, parsed_property_avg = \
            self._parse_pifquery_property_list(
                property_name_list=property_name_list,
                property_value_list=property_value_list)
        return parsed_property_min, parsed_property_max, parsed_property_avg

    def _get_pifquery(self, composition):
        """Run a PIF search for one chemical composition and return the hits.

        Raises:
            KeyError: if the search response contains no 'hits' key.
        """
        # TODO: does this stop csv generation on first invalid composition?
        # TODO: Is there a way to send many compositions in one call to citrine?
        pif_query = PifQuery(system=SystemQuery(
            chemical_formula=ChemicalFieldQuery(filter=ChemicalFilter(
                equal=composition))))
        # Issue the search once and reuse the response; previously the same
        # network query was sent twice (once to check for hits, once to fetch).
        response = self.client.search(pif_query).as_dictionary()
        # Check if any results found
        if 'hits' not in response:
            raise KeyError('No results found!')
        return response['hits']

    def _get_pifquery_property_list(self, pifquery):
        """Extract accepted property names and their first scalar values from hits.

        Returns:
            (list, list): parallel lists of property names and float values.
        """
        property_name_list = list()
        property_value_list = list()
        # Substrings that mark a property name as one we want to keep
        accepted_properties_list = [
            'mass', 'space group', 'band', 'Band', 'energy', 'volume',
            'density', 'dielectric', 'Dielectric', 'Enthalpy', 'Convex',
            'Magnetization', 'Elements', 'Modulus', 'Shear', "Poisson's",
            'Elastic', 'Energy'
        ]
        for result_number, results in enumerate(pifquery):
            for i, dictionary in enumerate(results['system']['properties']):
                if 'name' not in dictionary or dictionary['name'] == "CIF File":
                    continue
                value = dictionary['name']
                for entry in accepted_properties_list:
                    if entry not in value:
                        continue
                    property_name_list.append(value)
                    try:
                        property_value_list.append(
                            float(dictionary['scalars'][0]['value']))
                    except (ValueError, KeyError):
                        # Value missing or non-numeric: drop the name just added
                        property_name_list.pop(-1)
                        continue
        return property_name_list, property_value_list

    def _parse_pifquery_property_list(self, property_name_list, property_value_list):
        """Aggregate the parallel name/value lists into per-property min/max/avg dicts.

        Raises:
            IndexError: if the two lists have different lengths.
        """
        parsed_property_max = dict()
        parsed_property_min = dict()
        parsed_property_avg = dict()
        property_names_unique = list()
        if len(property_name_list) != len(property_value_list):
            print(
                'Error! Length of property name and property value lists are not the same. There must be a bug in the _get_pifquerey_property_list method'
            )
            raise IndexError(
                "property_name_list and property_value_list are not the same size."
            )
        # Get unique property names, preserving first-seen order
        for name in property_name_list:
            if name not in property_names_unique:
                property_names_unique.append(name)
        for unique_name in property_names_unique:
            unique_property = list()
            unique_property_avg = 0
            count = 0
            for i, name in enumerate(property_name_list):
                # Only include property values whose name matches unique_name
                if name == unique_name:
                    count += 1  # count how many instances of the same property occur
                    unique_property_avg += property_value_list[i]
                    unique_property.append(property_value_list[i])
            parsed_property_min[str(unique_name) + "_min"] = min(unique_property)
            parsed_property_max[str(unique_name) + "_max"] = max(unique_property)
            parsed_property_avg[str(unique_name) + "_avg"] = unique_property_avg / count
        return property_names_unique, parsed_property_min, parsed_property_max, parsed_property_avg
class CitrineDataRetrieval:
    """Retrieve data from the Citrination database via the legacy search API.

    See client docs at http://citrineinformatics.github.io/api-documentation/
    """

    def __init__(self, api_key=None):
        """
        Args:
            api_key: (str) Your Citrine API key, or None if you've set the
                CITRINE_KEY environment variable

        Returns: None
        """
        api_key = api_key if api_key else os.environ['CITRINE_KEY']
        self.client = CitrinationClient(api_key, 'http://citrination.com')

    @staticmethod
    def _to_numeric_if_possible(column):
        """Coerce one dataframe column to numeric dtype, leaving it unchanged
        when it cannot be parsed (replacement for the removed
        DataFrame.convert_objects(convert_numeric=True))."""
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    def get_dataframe(self, term=None, formula=None, property=None,
                      contributor=None, reference=None, min_measurement=None,
                      max_measurement=None, from_record=None, data_set_id=None,
                      max_results=None, show_columns=None):
        """
        Gets data from MP in a dataframe format. See client docs at
        http://citrineinformatics.github.io/api-documentation/ for more details
        on these parameters.

        Args:
            term: (str) general search string; this is searched against all fields
            formula: (str) filter for the chemical formula field; only those
                results that have chemical formulas that contain this string
                will be returned
            property: (str) name of the property to search for
            contributor: (str) filter for the contributor field; only those
                results that have contributors that contain this string will
                be returned
            reference: (str) filter for the reference field; only those results
                that have contributors that contain this string will be returned
            min_measurement: (str/num) minimum of the property value range
            max_measurement: (str/num) maximum of the property value range
            from_record: (int) index of the first record to return (indexed from 0)
            data_set_id: (int) id of the particular data set to search on
            max_results: (int) number of records to limit the results to
            show_columns: (list) list of columns to show from the resulting dataframe

        Returns:
            (object) Pandas dataframe object containing the results
        """
        json_data = []
        start = from_record if from_record else 0
        per_page = 100
        refresh_time = 3  # seconds to wait between search calls

        while True:
            if max_results and max_results < per_page:
                # use per_page=max_results, eg: in case of max_results=68 < 100
                data = self.client.search(term=term, formula=formula,
                                          property=property,
                                          contributor=contributor,
                                          reference=reference,
                                          min_measurement=min_measurement,
                                          max_measurement=max_measurement,
                                          from_record=start,
                                          per_page=max_results,
                                          data_set_id=data_set_id)
            else:
                data = self.client.search(term=term, formula=formula,
                                          property=property,
                                          contributor=contributor,
                                          reference=reference,
                                          min_measurement=min_measurement,
                                          max_measurement=max_measurement,
                                          from_record=start,
                                          per_page=per_page,
                                          data_set_id=data_set_id)
            size = len(data.json()['results'])
            start += size
            json_data.append(data.json()['results'])
            if max_results and len(json_data) * per_page > max_results:  # check if limit is reached
                # Integer division: a float slice index raises TypeError on Python 3
                json_data = json_data[:(max_results // per_page)]  # get first multiple of 100 records
                json_data.append(data.json()['results'][:max_results % per_page])  # get remaining records
                break
            if size < per_page:  # break out of last loop of results
                break
            time.sleep(refresh_time)

        non_meas_df = pd.DataFrame()  # df w/o measurement column
        meas_df = pd.DataFrame()  # df containing only measurement column

        counter = 0  # variable to keep count of sample hit and set indexes
        for page in json_data:
            for hit in tqdm(page):
                counter += 1
                if 'sample' not in hit:
                    continue
                sample_value = hit['sample']
                sample_normdf = pd.json_normalize(sample_value)
                # Make a DF of all non-'measurement' fields
                non_meas_cols = [cols for cols in sample_normdf.columns
                                 if "measurement" not in cols]
                non_meas_row = pd.DataFrame()
                for col in non_meas_cols:
                    non_meas_row[col] = sample_normdf[col]
                non_meas_row.index = [counter] * len(sample_normdf)
                # DataFrame.append was removed from pandas; concat is the replacement
                non_meas_df = pd.concat([non_meas_df, non_meas_row])
                # Make a DF of the 'measurement' array
                if 'measurement' in sample_value:
                    meas_normdf = pd.json_normalize(sample_value['measurement'])
                    # Extract numbers of properties
                    if 'property.scalar' in meas_normdf.columns:
                        for row, scalars in enumerate(meas_normdf['property.scalar']):
                            for item in scalars:
                                # .at writes in place; the old .xs(row)[...] chained
                                # assignment could silently modify a copy instead
                                if 'value' in item:
                                    meas_normdf.at[row, 'property.scalar'] = item['value']
                                # TODO: handle the remaining scalar formats
                                elif 'minimum' in item and 'maximum' in item:
                                    meas_normdf.at[row, 'property.scalar'] = \
                                        f"Minimum = {item['minimum']}, Maximum = {item['maximum']}"
                    # Take all property rows and convert them into columns
                    prop_df = pd.DataFrame()
                    prop_cols = [cols for cols in meas_normdf.columns
                                 if "property" in cols]
                    for col in prop_cols:
                        prop_df[col] = meas_normdf[col]
                    prop_df.index = [counter] * len(meas_normdf)
                    prop_df = prop_df.drop_duplicates(['property.name'])
                    if 'property.scalar' in meas_normdf.columns:
                        prop_df = prop_df.pivot(columns='property.name',
                                                values='property.scalar')
                    elif 'property.matrix' in meas_normdf.columns:
                        prop_df = prop_df.pivot(columns='property.name',
                                                values='property.matrix')
                    # Convert columns from object to num where possible
                    prop_df = prop_df.apply(self._to_numeric_if_possible)
                    # Making a single row DF of non-'measurement.property' columns
                    non_prop_df = pd.DataFrame()
                    non_prop_cols = [cols for cols in meas_normdf.columns
                                     if "property" not in cols]
                    for col in non_prop_cols:
                        non_prop_df['measurement.' + col] = meas_normdf[col]
                    if len(non_prop_df) > 0:  # Do not index empty DF (non-'measurement.property' columns absent)
                        non_prop_df.index = [counter] * len(meas_normdf)
                    non_prop_df = non_prop_df[:1]  # Take only first row - does not collect non-unique rows
                    units_df = pd.DataFrame()  # Get property unit and insert it as a dict
                    if 'property.units' in meas_normdf.columns:
                        curr_units = dict(zip(meas_normdf['property.name'],
                                              meas_normdf['property.units']))
                        units_df['property.units'] = [curr_units]
                        # units_df has exactly one row; labelling it with
                        # len(meas_normdf) indexes raised ValueError for
                        # multi-row measurement records
                        units_df.index = [counter]
                    meas_df = pd.concat(
                        [meas_df, pd.concat([prop_df, non_prop_df, units_df], axis=1)])

        df = pd.concat([non_meas_df, meas_df], axis=1)
        df.index.name = 'sample'
        if show_columns:
            # Collect the columns first: dropping while iterating df.columns
            # mutates the sequence being iterated and skips entries
            df = df.drop([column for column in df.columns
                          if column not in show_columns], axis=1)
        # NOTE: a dangling dead "df.append" expression after this return has
        # been removed.
        return df