class CitrineFeatureGeneration(object):
    """
    Class to generate new features using Citrine data and dataframe containing material compositions.

    The dataframe must contain a column named "Material compositions". For each
    composition, matching Citrine records are queried and their numeric
    properties are reduced to per-property min / max / avg columns, which are
    then merged onto the supplied dataframe.

    Attributes:
        configdict <dict> : MASTML configfile object as dict
        dataframe <pandas dataframe> : dataframe containing x and y data and feature names
        api_key <str> : your Citrination API key

    Methods:
        generate_citrine_features : generates Citrine feature set based on compositions in dataframe
            args:
                save_to_csv <bool> : whether to save the generated feature set to a csv file
            returns:
                dataframe <pandas dataframe> : dataframe containing the generated feature set
    """
    def __init__(self, configdict, dataframe, api_key):
        self.configdict = configdict
        self.dataframe = dataframe
        self.api_key = api_key
        self.client = CitrinationClient(api_key, 'https://citrination.com')

    @timeit
    def generate_citrine_features(self, save_to_csv=True):
        """Query Citrine for every composition and merge min/max/avg property columns.

        Args:
            save_to_csv (bool): whether to write the merged dataframe to a csv
                file under the configured save path.

        Returns:
            pandas dataframe: input dataframe with the Citrine feature columns appended.
        """
        logging.info(
            'WARNING: You have specified generation of features from Citrine. Based on which materials you are'
            'interested in, there may be many records to parse through, thus this routine may take a long time to complete!'
        )
        compositions = self.dataframe['Material compositions'].tolist()
        citrine_dict_property_min = dict()
        citrine_dict_property_max = dict()
        citrine_dict_property_avg = dict()
        for composition in compositions:
            pifquery = self._get_pifquery(composition=composition)
            property_name_list, property_value_list = self._get_pifquery_property_list(
                pifquery=pifquery)
            _, parsed_property_min, parsed_property_max, parsed_property_avg = \
                self._parse_pifquery_property_list(
                    property_name_list=property_name_list,
                    property_value_list=property_value_list)
            citrine_dict_property_min[composition] = parsed_property_min
            citrine_dict_property_max[composition] = parsed_property_max
            citrine_dict_property_avg[composition] = parsed_property_avg

        dataframe = self.dataframe
        for citrine_dict in (citrine_dict_property_min,
                             citrine_dict_property_max,
                             citrine_dict_property_avg):
            dataframe_citrine = pd.DataFrame.from_dict(data=citrine_dict,
                                                       orient='index')
            # Reorder compositions in the new dataframe to match input dataframe
            dataframe_citrine = dataframe_citrine.reindex(
                self.dataframe['Material compositions'].tolist())
            # Turn the composition index into a regular first column ...
            dataframe_citrine.index.name = 'Material compositions'
            dataframe_citrine.reset_index(inplace=True)
            # ... then drop it again so the merge does not duplicate the column
            del dataframe_citrine['Material compositions']
            # Merge Citrine feature dataframe with originally supplied dataframe
            dataframe = DataframeUtilities().merge_dataframe_columns(
                dataframe1=dataframe, dataframe2=dataframe_citrine)

        if save_to_csv:
            # Tag the file name with the target (y) feature column if present.
            # filetag defaults to '' so a missing target column no longer
            # raises NameError (bug in the original).
            filetag = ''
            for column in dataframe.columns.values:
                if column in self.configdict['General Setup']['target_feature']:
                    filetag = column
            dataframe.to_csv(self.configdict['General Setup']['save_path'] +
                             "/" + 'input_with_citrine_features' + '_' +
                             str(filetag) + '.csv',
                             index=False)

        return dataframe

    def _get_pifquery(self, composition):
        """Run a PIF query for an exact chemical-formula match.

        Raises:
            KeyError: if the search response contains no 'hits' entry.
        """
        pif_query = PifQuery(system=SystemQuery(
            chemical_formula=ChemicalFieldQuery(filter=ChemicalFilter(
                equal=composition))))
        # Search once and reuse the response; the original issued the same
        # remote query twice.
        response = self.client.search(pif_query).as_dictionary()
        if 'hits' not in response:
            raise KeyError('No results found!')
        return response['hits']

    def _get_pifquery_property_list(self, pifquery):
        """Extract accepted (name, float value) property pairs from query hits.

        Returns:
            tuple(list, list): parallel lists of property names and float values.
        """
        accepted_properties_list = [
            'mass', 'space group', 'band', 'Band', 'energy', 'volume',
            'density', 'dielectric', 'Dielectric', 'Enthalpy', 'Convex',
            'Magnetization', 'Elements', 'Modulus', 'Shear', "Poisson's",
            'Elastic', 'Energy'
        ]
        property_name_list = list()
        property_value_list = list()
        for results in pifquery:
            for prop in results.get('system', {}).get('properties', []):
                name = prop.get('name')
                if name is None or name == "CIF File":
                    continue
                # Record each property at most once, even when its name
                # contains several accepted substrings (the original appended
                # one copy per matching substring).
                if not any(entry in name for entry in accepted_properties_list):
                    continue
                try:
                    value = float(prop['scalars'][0]['value'])
                except (ValueError, KeyError, IndexError, TypeError):
                    # Skip malformed or non-numeric scalar entries
                    continue
                property_name_list.append(name)
                property_value_list.append(value)
        return property_name_list, property_value_list

    def _parse_pifquery_property_list(self, property_name_list,
                                      property_value_list):
        """Reduce parallel name/value lists to per-name min, max and average dicts.

        Raises:
            IndexError: if the two input lists differ in length (replaces the
                original's sys.exit(), which killed the whole interpreter).
        """
        if len(property_name_list) != len(property_value_list):
            raise IndexError(
                'Length of property name and property value lists are not the same. '
                'There must be a bug in the _get_pifquery_property_list method')
        parsed_property_min = dict()
        parsed_property_max = dict()
        parsed_property_avg = dict()
        property_names_unique = list()
        # Group values by name in a single pass; the original re-scanned the
        # whole list once per unique name (O(n^2)).
        values_by_name = dict()
        for name, value in zip(property_name_list, property_value_list):
            if name not in values_by_name:
                property_names_unique.append(name)
                values_by_name[name] = list()
            values_by_name[name].append(value)
        for unique_name in property_names_unique:
            values = values_by_name[unique_name]
            parsed_property_min[str(unique_name) + "_min"] = min(values)
            parsed_property_max[str(unique_name) + "_max"] = max(values)
            parsed_property_avg[str(unique_name) + "_avg"] = sum(values) / len(values)

        return property_names_unique, parsed_property_min, parsed_property_max, parsed_property_avg
# --- Example 2 ---
class CitrineDataRetrieval:
    """
    CitrineDataRetrieval is used to retrieve data from
    the Citrination database.  See API client docs at
    http://citrineinformatics.github.io/api-documentation/
    """
    def __init__(self, api_key=None):
        """
        Args:
            api_key: (str) Your Citrine API key, or None if
                you've set the CITRINE_KEY environment variable

        Returns: None
        """
        api_key = api_key if api_key else os.environ["CITRINE_KEY"]
        self.client = CitrinationClient(api_key, "https://citrination.com")

    def get_dataframe(self,
                      formula=None,
                      prop=None,
                      data_type=None,
                      reference=None,
                      min_measurement=None,
                      max_measurement=None,
                      from_record=None,
                      data_set_id=None,
                      max_results=None,
                      show_columns=None):
        """
        Gets a Pandas dataframe object from data retrieved from
        the Citrine API.  See client docs at
        http://citrineinformatics.github.io/api-documentation/
        for more details on input parameters.

        Args:
            formula: (str) filter for the chemical formula field; only those
                results that have chemical formulas that contain this string
                will be returned
            prop: (str) name of the property to search for
            data_type: (str) 'EXPERIMENTAL'/'COMPUTATIONAL'/'MACHINE_LEARNING';
                filter for properties obtained from experimental work,
                computational methods, or machine learning.
            reference: (str) filter for the reference field; only those
                results that have contributors that contain this string
                will be returned
            min_measurement: (str/num) minimum of the property value range
            max_measurement: (str/num) maximum of the property value range
            from_record: (int) index of first record to return (indexed from 0)
            data_set_id: (int) id of the particular data set to search on
            max_results: (int) number of records to limit the results to
            show_columns: (list) list of columns to show from the
                resulting dataframe

        Returns: (object) Pandas dataframe object containing the results

        """
        # Get all of the jsons from client
        jsons = self.get_api_data(formula=formula,
                                  prop=prop,
                                  data_type=data_type,
                                  reference=reference,
                                  min_measurement=min_measurement,
                                  max_measurement=max_measurement,
                                  from_record=from_record,
                                  data_set_id=data_set_id,
                                  max_results=max_results)

        non_prop_df = pd.DataFrame()  # df w/o measurement column
        prop_df = pd.DataFrame()  # df containing only measurement column

        counter = 0  # shared row index for the property / non-property halves of a hit

        for hit in tqdm(jsons):

            counter += 1  # Keep a count to appropriately index the rows

            if "system" not in hit:  # Check if 'system' key exists, else skip
                continue

            system_value = hit["system"]
            system_normdf = json_normalize(system_value)

            # Make a DF of all non-'properties' fields
            non_prop_cols = [
                cols for cols in system_normdf.columns
                if "properties" not in cols
            ]
            non_prop_row = system_normdf[non_prop_cols].copy()
            non_prop_row.index = [counter] * len(system_normdf)
            # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
            non_prop_df = pd.concat([non_prop_df, non_prop_row])

            # Make a DF of the 'properties' array
            if "properties" not in system_value:
                continue

            p_df = pd.DataFrame()

            # Rename duplicate property names in a record with progressive numbering
            all_prop_names = [
                x["name"] for x in system_value["properties"]
            ]

            counts = {
                k: v
                for k, v in Counter(all_prop_names).items() if v > 1
            }

            for i in reversed(range(len(all_prop_names))):
                item = all_prop_names[i]
                if item in counts and counts[item]:
                    all_prop_names[i] += "_" + str(counts[item])
                    counts[item] -= 1

            # add each property, and its associated fields, as a new column.
            # NOTE: loop variable renamed from 'prop' to 'prop_obj' -- the
            # original shadowed the 'prop' method parameter.
            for p_idx, prop_obj in enumerate(system_value["properties"]):

                # Rename property name according to above duplicate numbering
                prop_obj["name"] = all_prop_names[p_idx]

                # .at uses the same _set_value path as the removed
                # DataFrame.set_value API
                if "scalars" in prop_obj:
                    p_df.at[counter, prop_obj["name"]] = \
                        parse_scalars(prop_obj["scalars"])
                elif "vectors" in prop_obj:
                    p_df[prop_obj["name"]] = prop_obj["vectors"]
                elif "matrices" in prop_obj:
                    p_df[prop_obj["name"]] = prop_obj["matrices"]

                # parse all keys in the Property object except 'name', 'scalars', 'vectors', and 'matrices'
                for prop_key in prop_obj:

                    if prop_key in ("name", "scalars", "vectors", "matrices"):
                        continue

                    col = prop_obj["name"] + "-" + prop_key
                    # If value is a list of multiple items, set the cell to the
                    # entire list by first converting to object type, else
                    # results in a ValueError/IndexError
                    if isinstance(prop_obj[prop_key], list) and len(prop_obj[prop_key]) > 1:
                        p_df[col] = np.nan
                        p_df[col] = p_df[col].astype(object)

                    p_df.at[counter, col] = prop_obj[prop_key]

            p_df.index = [counter]
            prop_df = pd.concat([prop_df, p_df])

        # Concatenate 'properties' and 'non-properties' dataframes
        df = pd.concat([non_prop_df, prop_df], axis=1)
        df.index.name = "system"

        # Remove uninformative columns, such as 'category' and 'uid';
        # ignore them if absent instead of raising KeyError
        df.drop(["category", "uid"], axis=1, inplace=True, errors="ignore")

        # Filter out columns not selected
        if show_columns:
            df = df[show_columns]

        return df

    def get_api_data(self,
                     formula=None,
                     prop=None,
                     data_type=None,
                     reference=None,
                     min_measurement=None,
                     max_measurement=None,
                     from_record=None,
                     data_set_id=None,
                     max_results=None):
        """
        Gets raw api data from Citrine in json format. See client docs
        at http://citrineinformatics.github.io/api-documentation/
        for more details on these parameters.

        Args:
            formula: (str) filter for the chemical formula field; only those
                results that have chemical formulas that contain this string
                will be returned
            prop: (str) name of the property to search for
            data_type: (str) 'EXPERIMENTAL'/'COMPUTATIONAL'/'MACHINE_LEARNING';
                filter for properties obtained from experimental work,
                computational methods, or machine learning.
            reference: (str) filter for the reference field; only those
                results that have contributors that contain this string
                will be returned
            min_measurement: (str/num) minimum of the property value range
            max_measurement: (str/num) maximum of the property value range
            from_record: (int) index of first record to return (indexed from 0)
            data_set_id: (int) id of the particular data set to search on
            max_results: (int) number of records to limit the results to

        Returns: (list) of jsons/pifs returned by Citrine's API

        Raises:
            KeyError: if a search response contains no 'hits' entry.
        """

        json_data = []
        start = from_record if from_record else 0
        per_page = 100
        refresh_time = 3  # seconds to wait between search calls

        # Construct all of the relevant queries from input args
        formula_query = ChemicalFieldQuery(filter=ChemicalFilter(
            equal=formula))
        prop_query = PropertyQuery(
            name=FieldQuery(filter=Filter(equal=prop)),
            value=FieldQuery(
                filter=Filter(min=min_measurement, max=max_measurement)),
            data_type=FieldQuery(filter=Filter(equal=data_type)))
        ref_query = ReferenceQuery(doi=FieldQuery(filter=Filter(
            equal=reference)))

        system_query = PifSystemQuery(chemical_formula=formula_query,
                                      properties=prop_query,
                                      references=ref_query)
        dataset_query = DatasetQuery(id=Filter(equal=data_set_id))
        data_query = DataQuery(system=system_query, dataset=dataset_query)

        while True:
            # use per_page=max_results, eg: in case of max_results=68 < 100
            if max_results and max_results < per_page:
                page_size = max_results
            else:
                page_size = per_page
            pif_query = PifSystemReturningQuery(query=data_query,
                                                from_index=start,
                                                size=page_size)

            # Search once and reuse the response; the original issued the
            # same remote query twice per page
            response = self.client.search(pif_query).as_dictionary()
            if "hits" not in response:
                raise KeyError("No results found!")

            data = response["hits"]
            size = len(data)
            start += size
            json_data.extend(data)

            # check if limit is reached
            if max_results and len(json_data) > max_results:
                # truncate to exactly max_results records
                json_data = json_data[:max_results]
                break
            if size < per_page:  # break out of last loop of results
                break

            time.sleep(refresh_time)

        return json_data
# --- Example 3 ---
class CitrineFeatureGeneration(object):
    """
    Class to generate new features using Citrine data and dataframe containing material compositions.
    Dataframe must have a column whose name is given by composition_feature.

    Args:
        dataframe (pandas dataframe) : dataframe containing x and y data and feature names
        api_key (str) : your Citrination API key
        composition_feature (str) : name of the column holding the material compositions

    Methods:
        generate_citrine_features : generates Citrine feature set based on compositions in dataframe

            Returns:
                pandas dataframe : dataframe containing the Citrine feature set
    """
    def __init__(self, dataframe, api_key, composition_feature):
        self.dataframe = dataframe
        self.api_key = api_key
        self.client = CitrinationClient(api_key, 'https://citrination.com')
        self.composition_feature = composition_feature

    def generate_citrine_features(self):
        """Query Citrine for each composition and merge min/max/avg property columns onto the dataframe.

        Raises:
            utils.MissingColumnError: if the composition column is absent from
                the input dataframe.
        """
        log.warning(
            'WARNING: You have specified generation of features from Citrine. Based on which'
            ' materials you are interested in, there may be many records to parse through, thus'
            ' this routine may take a long time to complete!')
        try:
            compositions = self.dataframe[self.composition_feature].tolist()
        except KeyError as e:
            log.error(f'original python error: {str(e)}')
            # f-prefix was missing in the original, so the column name was
            # never interpolated into the error message
            raise utils.MissingColumnError(
                f'Error! No column named {self.composition_feature} found in your input data file. '
                'To use this feature generation routine, you must supply a material composition for each data point'
            )
        citrine_dict_property_min = dict()
        citrine_dict_property_max = dict()
        citrine_dict_property_avg = dict()

        # Requests are made serially; _load_composition is kept as a separate
        # method so this map could later be handed to a worker pool. (The
        # original created a 20-process multiprocessing.Pool it never used or
        # closed, leaking worker processes.)
        result_tuples = map(self._load_composition, compositions)

        for comp, (prop_min, prop_max,
                   prop_avg) in zip(compositions, result_tuples):
            citrine_dict_property_min[comp] = prop_min
            citrine_dict_property_max[comp] = prop_max
            citrine_dict_property_avg[comp] = prop_avg

        dataframe = self.dataframe
        for citrine_dict in (citrine_dict_property_min,
                             citrine_dict_property_max,
                             citrine_dict_property_avg):
            dataframe_citrine = pd.DataFrame.from_dict(data=citrine_dict,
                                                       orient='index')
            # Reorder compositions in the new dataframe to match input dataframe
            dataframe_citrine = dataframe_citrine.reindex(
                self.dataframe[self.composition_feature].tolist())
            # Turn the composition index into a regular first column ...
            dataframe_citrine.index.name = self.composition_feature
            dataframe_citrine.reset_index(inplace=True)
            # ... then drop it again so the merge does not duplicate the column
            del dataframe_citrine[self.composition_feature]
            # Merge Citrine feature dataframe with originally supplied dataframe
            dataframe = DataframeUtilities().merge_dataframe_columns(
                dataframe1=dataframe, dataframe2=dataframe_citrine)

        return dataframe

    def _load_composition(self, composition):
        """Fetch and reduce all accepted Citrine properties for one composition.

        Returns:
            tuple(dict, dict, dict): per-property min, max and avg dicts.
        """
        pifquery = self._get_pifquery(composition=composition)
        property_name_list, property_value_list = self._get_pifquery_property_list(
            pifquery=pifquery)
        _, parsed_property_min, parsed_property_max, parsed_property_avg = \
            self._parse_pifquery_property_list(
                property_name_list=property_name_list,
                property_value_list=property_value_list)
        return parsed_property_min, parsed_property_max, parsed_property_avg

    def _get_pifquery(self, composition):
        """Run a PIF query for an exact chemical-formula match.

        Raises:
            KeyError: if the search response contains no 'hits' entry.
        """
        # TODO: does this stop csv generation on first invalid composition?
        # TODO: Is there a way to send many compositions in one call to citrine?
        pif_query = PifQuery(system=SystemQuery(
            chemical_formula=ChemicalFieldQuery(filter=ChemicalFilter(
                equal=composition))))
        # Search once and reuse the response; the original issued the same
        # remote query twice.
        response = self.client.search(pif_query).as_dictionary()
        if 'hits' not in response:
            raise KeyError('No results found!')
        return response['hits']

    def _get_pifquery_property_list(self, pifquery):
        """Extract accepted (name, float value) property pairs from query hits.

        Returns:
            tuple(list, list): parallel lists of property names and float values.
        """
        accepted_properties_list = [
            'mass', 'space group', 'band', 'Band', 'energy', 'volume',
            'density', 'dielectric', 'Dielectric', 'Enthalpy', 'Convex',
            'Magnetization', 'Elements', 'Modulus', 'Shear', "Poisson's",
            'Elastic', 'Energy'
        ]
        property_name_list = list()
        property_value_list = list()
        for results in pifquery:
            for dictionary in results.get('system', {}).get('properties', []):
                value = dictionary.get('name')
                if value is None or value == "CIF File":
                    continue
                # Record each property at most once, even when its name
                # contains several accepted substrings (the original appended
                # one copy per matching substring).
                if not any(entry in value for entry in accepted_properties_list):
                    continue
                try:
                    scalar = float(dictionary['scalars'][0]['value'])
                except (ValueError, KeyError, IndexError, TypeError):
                    # Skip malformed or non-numeric scalar entries
                    continue
                property_name_list.append(value)
                property_value_list.append(scalar)

        return property_name_list, property_value_list

    def _parse_pifquery_property_list(self, property_name_list,
                                      property_value_list):
        """Reduce parallel name/value lists to per-name min, max and average dicts.

        Raises:
            IndexError: if the two input lists differ in length, indicating a
                bug in _get_pifquery_property_list.
        """
        if len(property_name_list) != len(property_value_list):
            raise IndexError(
                "property_name_list and property_value_list are not the same size."
            )
        parsed_property_min = dict()
        parsed_property_max = dict()
        parsed_property_avg = dict()
        property_names_unique = list()
        # Group values by name in a single pass; the original re-scanned the
        # whole list once per unique name (O(n^2)).
        values_by_name = dict()
        for name, value in zip(property_name_list, property_value_list):
            if name not in values_by_name:
                property_names_unique.append(name)
                values_by_name[name] = list()
            values_by_name[name].append(value)
        for unique_name in property_names_unique:
            values = values_by_name[unique_name]
            parsed_property_min[str(unique_name) + "_min"] = min(values)
            parsed_property_max[str(unique_name) + "_max"] = max(values)
            parsed_property_avg[str(unique_name) + "_avg"] = sum(values) / len(values)

        return property_names_unique, parsed_property_min, parsed_property_max, parsed_property_avg
# --- Example 4 ---
class CitrineDataRetrieval:
    """Query the Citrination API and flatten the returned records into a pandas DataFrame."""

    def __init__(self, api_key=None):
        """
        Args:
            api_key: (str) Your Citrine API key, or None if you've set the CITRINE_KEY environment variable

        Returns: None
        """
        api_key = api_key if api_key else os.environ['CITRINE_KEY']
        # Use HTTPS, consistent with the endpoint used by CitrineFeatureGeneration above.
        self.client = CitrinationClient(api_key, 'https://citrination.com')

    def get_dataframe(self,
                      term=None,
                      formula=None,
                      property=None,
                      contributor=None,
                      reference=None,
                      min_measurement=None,
                      max_measurement=None,
                      from_record=None,
                      data_set_id=None,
                      max_results=None,
                      show_columns=None):
        """
        Gets data from MP in a dataframe format.
        See client docs at http://citrineinformatics.github.io/api-documentation/ for more details on these parameters.

        Args:
            term: (str) general search string; this is searched against all fields
            formula: (str) filter for the chemical formula field; only those results that have chemical formulas that
                contain this string will be returned
            property: (str) name of the property to search for
            contributor: (str) filter for the contributor field; only those results that have contributors that
                contain this string will be returned
            reference: (str) filter for the reference field; only those results that have contributors that
                contain this string will be returned
            min_measurement: (str/num) minimum of the property value range
            max_measurement: (str/num) maximum of the property value range
            from_record: (int) index of the first record to return (indexed from 0)
            data_set_id: (int) id of the particular data set to search on
            max_results: (int) number of records to limit the results to
            show_columns: (list) if given, every column of the final dataframe not in this list is dropped

        Returns: (object) Pandas dataframe object containing the results
        """
        json_data = []
        start = from_record if from_record else 0
        per_page = 100
        refresh_time = 3  # seconds to wait between search calls (avoid hammering the API)

        while True:
            # If fewer results than one full page were requested, ask the API for
            # exactly that many; otherwise page through per_page records at a time.
            # (The original issued two identical search calls differing only in
            # per_page; merged into one.)
            if max_results and max_results < per_page:
                page_size = max_results
            else:
                page_size = per_page
            data = self.client.search(term=term,
                                      formula=formula,
                                      property=property,
                                      contributor=contributor,
                                      reference=reference,
                                      min_measurement=min_measurement,
                                      max_measurement=max_measurement,
                                      from_record=start,
                                      per_page=page_size,
                                      data_set_id=data_set_id)
            results = data.json()['results']
            size = len(results)
            start += size
            json_data.append(results)
            if max_results and len(json_data) * per_page > max_results:
                # Limit reached: keep only the whole pages that fit under
                # max_results, then the remainder of the current page.
                # BUG FIX: use floor division -- '/' yields a float in Python 3,
                # and a float is not a valid slice index (TypeError).
                json_data = json_data[:max_results // per_page]
                json_data.append(results[:max_results % per_page])
                break
            if size < per_page:  # last (partial or empty) page of results
                break
            time.sleep(refresh_time)

        non_meas_df = pd.DataFrame()  # df w/o measurement column
        meas_df = pd.DataFrame()  # df containing only measurement column

        counter = 0  # counts sample hits; used as the dataframe index

        for page in json_data:
            # df = pd.concat((json_normalize(hit) for hit in set))   # Useful tool for the future
            for hit in tqdm(page):
                counter += 1
                if 'sample' not in hit.keys():
                    continue
                sample_value = hit['sample']
                sample_normdf = json_normalize(sample_value)
                # Make a DF of all non-'measurement' fields
                non_meas_cols = [
                    cols for cols in sample_normdf.columns
                    if "measurement" not in cols
                ]
                non_meas_row = pd.DataFrame()
                for col in non_meas_cols:
                    non_meas_row[col] = sample_normdf[col]
                non_meas_row.index = [counter] * len(sample_normdf)
                # NOTE: DataFrame.append was removed in pandas 2.x; kept for the
                # older pandas API this file targets.
                non_meas_df = non_meas_df.append(non_meas_row)
                # Make a DF of the 'measurement' array
                if 'measurement' in sample_value:
                    meas_normdf = json_normalize(sample_value['measurement'])
                    # Extract scalar values out of the nested property dicts
                    if 'property.scalar' in meas_normdf.columns:
                        for row, col in enumerate(
                                meas_normdf['property.scalar']):
                            for item in col:
                                if 'value' in item:
                                    # NOTE(review): chained assignment through
                                    # .xs() may write to a copy in modern
                                    # pandas; preserved from the original.
                                    meas_normdf.xs(row)[
                                        'property.scalar'] = item['value']
                                # TODO: ask Anubhav how to deal with these and rest of formats
                                elif 'minimum' in item and 'maximum' in item:
                                    meas_normdf.xs(row)['property.scalar'] = (
                                        'Minimum = ' + item['minimum'] + ', ' +
                                        'Maximum = ' + item['maximum'])
                    # Take all property rows and convert them into columns
                    prop_df = pd.DataFrame()
                    prop_cols = [
                        cols for cols in meas_normdf.columns
                        if "property" in cols
                    ]
                    for col in prop_cols:
                        prop_df[col] = meas_normdf[col]
                    prop_df.index = [counter] * len(meas_normdf)
                    prop_df = prop_df.drop_duplicates(['property.name'])
                    if 'property.scalar' in meas_normdf.columns:
                        prop_df = prop_df.pivot(columns='property.name',
                                                values='property.scalar')
                    elif 'property.matrix' in meas_normdf.columns:
                        prop_df = prop_df.pivot(columns='property.name',
                                                values='property.matrix')
                    # Convert columns from object to num.
                    # NOTE: convert_objects was removed in newer pandas; kept
                    # for the pandas version this file targets.
                    prop_df = prop_df.convert_objects(convert_numeric=True)
                    # Making a single row DF of non-'measurement.property' columns
                    non_prop_df = pd.DataFrame()
                    non_prop_cols = [
                        cols for cols in meas_normdf.columns
                        if "property" not in cols
                    ]
                    for col in non_prop_cols:
                        non_prop_df['measurement.' + col] = meas_normdf[col]
                    # Do not index an empty DF (non-'measurement.property' columns absent)
                    if len(non_prop_df) > 0:
                        non_prop_df.index = [counter] * len(meas_normdf)
                    # Take only first row - does not collect non-unique rows
                    non_prop_df = non_prop_df[:1]
                    # Get property unit and insert it as a dict
                    units_df = pd.DataFrame()
                    if 'property.units' in meas_normdf.columns:
                        curr_units = dict(
                            zip(meas_normdf['property.name'],
                                meas_normdf['property.units']))
                        units_df['property.units'] = [curr_units]
                        units_df.index = [counter] * len(meas_normdf)
                    meas_df = meas_df.append(
                        pd.concat([prop_df, non_prop_df, units_df],
                                  axis=1))

        df = pd.concat([non_meas_df, meas_df], axis=1)
        df.index.name = 'sample'
        if show_columns:
            # Keep only the requested columns. Iterate over a snapshot of the
            # column index since we drop in place.
            for column in list(df.columns):
                if column not in show_columns:
                    df.drop(column, axis=1, inplace=True)
        # BUG FIX: removed unreachable dead statement 'df.append' that followed
        # this return in the original.
        return df