Esempio n. 1
0
    def populate_point_information(prop_x, prop_y, current_func):
        if not (prop_x and prop_y):
            raise PreventUpdate

        prop_x_name = Registry("symbols")[prop_x].display_names[0]
        prop_y_name = Registry("symbols")[prop_y].display_names[0]

        data = list(
            store.query(criteria={
                'property_x': prop_x,
                'property_y': prop_y
            }))

        path_length = data[0]['shortest_path_length']
        if path_length is None:
            path_text = "not connected"
        elif path_length == 0:
            path_text = "properties are the same"
        else:
            path_text = f"separated by {path_length} model"
            if path_length > 1:
                path_text += "s"
        point_text = dcc.Markdown(f"""
##### Point information
**x-axis property:** {prop_x_name}

**y-axis property:** {prop_y_name}

**distance apart on graph:** {path_text}

**number of data points:** {data[0]['n_points']}
""")

        # This ensures we know the ordering of the rows
        correlation_data = {
            d['correlation_func']: {
                'Correlation Function':
                correlation_func_info[d['correlation_func']]["name"],
                'Correlation Value':
                f"{d['correlation']:0.5f}"
            }
            for d in data
        }
        correlation_data = [
            correlation_data[func] for func in correlation_funcs
        ]

        correlation_table = dt.DataTable(
            id='corr-table',
            data=correlation_data,
            columns=[{
                'id': val,
                'name': val
            } for val in ('Correlation Function', 'Correlation Value')],
            editable=False,
            style_data_conditional=[{
                'if': {
                    'row_index': correlation_funcs.index(current_func)
                },
                "backgroundColor": "#3D9970",
                'color': 'white'
            }],
            style_cell={
                'font-family': 'HelveticaNeue',
                'text-align': 'left'
            },
            style_header={
                'fontWeight': 'bold',
                'font-family': 'HelveticaNeue',
                'text-align': 'left'
            })
        link_to_plot = dcc.Link("View the data plot",
                                href=f'/plot?x={prop_x}&y={prop_y}')
        return [point_text, correlation_table, link_to_plot], True
Esempio n. 2
0
 def tearDownClass(cls):
     """Remove every symbol that is not built in from the symbol registry."""
     # Collect the names first so the registry is not modified while it is
     # being iterated.
     custom_names = []
     for name, symbol in Registry("symbols").items():
         if symbol.is_builtin:
             continue
         custom_names.append(name)
     for name in custom_names:
         Registry("symbols").pop(name)
Esempio n. 3
0
    def setUpClass(cls):
        """
        Build the quantities and expected dictionaries shared by the tests.

        Registers the built-in models, creates quantities for the custom
        symbols 'A', 'B' and 'C' obtained from StorageTest.generate_symbols()
        and for 'band_gap' / 'refractive_index', and stores on ``cls`` the
        dictionaries describing them in two flavours:

        * ``*_as_dicts``: nested symbols and provenance are kept as objects
        * ``*_json``: nested symbols and provenance are plain dictionaries

        NOTE(review): presumably these are compared against the as_dict()
        and JSON output of the storage classes -- confirm against the tests.
        """
        add_builtin_models_to_registry()
        # Inspiration was taken from the GraphTest class
        # I tried to construct the dictionaries for comparison
        # without writing out every one explicitly by reusing
        # information where it was applicable.
        # If this is too unreadable, can change to writing it
        # out explicitly in a JSON file and importing it. Would
        # still need to replace some fields dynamically.
        symbols = StorageTest.generate_symbols()

        # Expected dictionary form of the custom symbols; all three start
        # from the same template and 'C' is then overridden below.
        cls.custom_syms_as_dicts = {
            k: {'@module': 'propnet.core.symbols',
                '@class': 'Symbol',
                'name': k,
                'display_names': [k],
                'display_symbols': [k],
                'units': (1, ()),
                'shape': 1,
                'object_type': None,
                'comment': None,
                'category': 'property',
                'constraint': None,
                'default_value': None,
                'is_builtin': False} for k in ['A', 'B', 'C']
        }
        # 'C' is the object-category symbol: no units, no shape
        cls.custom_syms_as_dicts['C'].update(
            {"units": None,
             "shape": None,
             "object_type": "str",
             "category": "object"})

        # JSON flavour: same content, but the units tuple of 'A' and 'B'
        # is written with lists instead of tuples
        cls.custom_symbols_json = copy.deepcopy(cls.custom_syms_as_dicts)
        for k in ['A', 'B']:
            cls.custom_symbols_json[k]['units'] = [1, []]

        # Two 'A' quantities, and two 'B' quantities whose provenance names
        # 'model1' with the corresponding 'A' quantity as input
        a = [QuantityFactory.create_quantity(symbols['A'], 19),
             QuantityFactory.create_quantity(symbols['A'], 23)]
        b = [QuantityFactory.create_quantity(symbols['B'], 38,
                                             provenance=ProvenanceElement(model='model1',
                                                                          inputs=[a[0]])),
             QuantityFactory.create_quantity(symbols['B'], 46,
                                             provenance=ProvenanceElement(model='model1',
                                                                          inputs=[a[1]]))]
        cls.quantities_custom_symbol = {"A": a,
                                         "B": b}

        # Expected StorageQuantity dictionaries for the quantities above,
        # with the symbol and provenance kept as objects
        cls.sq_custom_sym_as_dicts = {
            k: [{'@module': 'propnet.dbtools.storage',
                 '@class': 'StorageQuantity',
                 'internal_id': vv._internal_id,
                 'data_type': 'NumQuantity',
                 'symbol_type': symbols[k],
                 'value': vv.magnitude,
                 'units': 'dimensionless',
                 'provenance': ProvenanceStore.from_provenance_element(vv.provenance),
                 'tags': [],
                 'uncertainty': None} for vv in v] for k, v in cls.quantities_custom_symbol.items()
        }

        # Provenance of the 'A' quantities in plain-dictionary form:
        # no model and no inputs
        provenances_json = {
            "A": [{'@module': 'propnet.dbtools.storage',
                   '@class': 'ProvenanceStore',
                   'model': None,
                   'inputs': None,
                   'source': aa.provenance.source} for aa in a]}
        # Provenance of each 'B' quantity embeds the matching 'A' provenance
        # built just above as the provenance of its single input
        provenances_json['B'] = [
            {'@module': 'propnet.dbtools.storage',
             '@class': 'ProvenanceStore',
             'model': 'model1',
             'inputs': [{'@module': 'propnet.dbtools.storage',
                         '@class': 'ProvenanceStoreQuantity',
                         'data_type': 'NumQuantity',
                         'symbol_type': cls.custom_symbols_json['A'],
                         'internal_id': q.provenance.inputs[0]._internal_id,
                         'tags': [],
                         'provenance': p}],
             'source': q.provenance.source} for q, p in zip(b, provenances_json['A'])]

        # JSON flavour of the StorageQuantity dictionaries: swap the symbol
        # and provenance objects for their plain-dictionary counterparts
        cls.sq_custom_sym_json = copy.deepcopy(cls.sq_custom_sym_as_dicts)
        for sym in ['A', 'B']:
            for q, p in zip(cls.sq_custom_sym_json[sym], provenances_json[sym]):
                q['symbol_type'] = cls.custom_symbols_json[sym]
                q['provenance'] = p

        # Quantities of symbols referred to by name: two band gaps and the
        # refractive indices the registered model derives from them
        band_gaps = [QuantityFactory.create_quantity('band_gap', 3.3, 'eV'),
                     QuantityFactory.create_quantity('band_gap', 2.1, 'eV')]

        bg_ri_model = Registry("models")['band_gap_refractive_index_moss']
        refractive_indices = [bg_ri_model.evaluate({"Eg": bg}).pop('refractive_index') for bg in band_gaps]

        cls.quantities_canonical_symbol = {"band_gaps": band_gaps,
                                            "refractive_indices": refractive_indices}

        # Reuse the custom-symbol dictionaries as templates: rename the keys,
        # then overwrite the fields that differ and drop 'value', which is
        # kept separately in sq_canonical_sym_values
        cls.sq_canonical_sym_as_dicts_no_value = copy.deepcopy(cls.sq_custom_sym_as_dicts)
        cls.sq_canonical_sym_as_dicts_no_value['band_gaps'] = cls.sq_canonical_sym_as_dicts_no_value.pop('A')
        cls.sq_canonical_sym_as_dicts_no_value['refractive_indices'] = cls.sq_canonical_sym_as_dicts_no_value.pop('B')

        for d, sq in zip(cls.sq_canonical_sym_as_dicts_no_value['band_gaps'], band_gaps):
            d.update({
                "internal_id": sq._internal_id,
                "symbol_type": "band_gap",
                "units": "electron_volt",
                "provenance": ProvenanceStore.from_provenance_element(sq.provenance)
            })
            d.pop('value')

        for d, sq in zip(cls.sq_canonical_sym_as_dicts_no_value['refractive_indices'], refractive_indices):
            d.update({
                "internal_id": sq._internal_id,
                "symbol_type": "refractive_index",
                "units": "dimensionless",
                "provenance": ProvenanceStore.from_provenance_element(sq.provenance)
            })
            d.pop('value')

        # Numeric values matching the dictionaries above, in the same order
        # NOTE(review): presumably stored apart so the tests can compare the
        # floats with a tolerance -- confirm against the tests
        cls.sq_canonical_sym_values = {"band_gaps": [3.3, 2.1],
                                        "refractive_indices": [2.316340583741216, 2.593439239956374]}

        # Plain-dictionary provenance for the band gaps (no model, no inputs)
        provenances_json['band_gaps'] = [
            {'@module': 'propnet.dbtools.storage',
             '@class': 'ProvenanceStore',
             'model': None,
             'inputs': None,
             'source': bg.provenance.source}
            for bg in band_gaps
        ]

        # ...and for the refractive indices, each embedding the provenance
        # of the band gap it was derived from
        provenances_json['refractive_indices'] = [{
            '@module': 'propnet.dbtools.storage',
            '@class': 'ProvenanceStore',
            'model': 'band_gap_refractive_index_moss',
            'inputs': [{'@module': 'propnet.dbtools.storage',
                        '@class': 'ProvenanceStoreQuantity',
                        'data_type': 'NumQuantity',
                        'symbol_type': 'band_gap',
                        'internal_id': bg._internal_id,
                        'tags': [],
                        'provenance': pj}],
            'source': ri.provenance.source}
            for bg, pj, ri in zip(band_gaps,
                                  provenances_json['band_gaps'],
                                  refractive_indices)
        ]

        # JSON flavour: only the provenance changes, because the symbol is
        # already given by name in the dictionaries above
        cls.sq_canonical_sym_json_no_value = copy.deepcopy(cls.sq_canonical_sym_as_dicts_no_value)

        for sym in ["band_gaps", "refractive_indices"]:
            for q, p in zip(cls.sq_canonical_sym_json_no_value[sym], provenances_json[sym]):
                q['provenance'] = p

        # Quantity carrying an uncertainty: weighted mean of the 'B'
        # quantities. 'value' and 'uncertainty' are left out of the
        # dictionary and kept in sq_with_uncertainty_numbers below.
        cls.quantity_with_uncertainty = NumQuantity.from_weighted_mean(b)
        cls.sq_with_uncertainty_as_dict_no_numbers = {
            '@module': 'propnet.dbtools.storage',
            '@class': 'StorageQuantity',
            'internal_id': cls.quantity_with_uncertainty._internal_id,
            'data_type': 'NumQuantity',
            'symbol_type': symbols['B'],
            'units': 'dimensionless',
            'provenance': ProvenanceStore.from_provenance_element(
                cls.quantity_with_uncertainty.provenance),
            'tags': []}

        # From here on provenances_json is rebound to a single provenance
        # dictionary (model 'aggregation') whose inputs are the 'B'
        # quantities in JSON form
        provenances_json = {
            '@module': 'propnet.dbtools.storage',
            '@class': 'ProvenanceStore',
            'model': 'aggregation',
            'inputs': [
                {'@module': 'propnet.dbtools.storage',
                 '@class': 'ProvenanceStoreQuantity',
                 'data_type': 'NumQuantity',
                 'symbol_type': cls.custom_symbols_json['B'],
                 'internal_id': b['internal_id'],
                 'tags': [],
                 'provenance': b['provenance']}
                for b in cls.sq_custom_sym_json['B']],
            'source': cls.quantity_with_uncertainty.provenance.source
        }

        cls.sq_with_uncertainty_json_no_numbers = copy.deepcopy(cls.sq_with_uncertainty_as_dict_no_numbers)
        cls.sq_with_uncertainty_json_no_numbers.update({"symbol_type": cls.custom_symbols_json['B'],
                                                         "provenance": provenances_json})
        cls.sq_with_uncertainty_numbers = {"value": 42.0,
                                            "uncertainty": 4.0}

        # Object (non-numerical) quantity of symbol 'C'; its expected
        # dictionary starts as a copy of the first 'A' dictionary with the
        # differing fields overwritten
        obj_symbol = symbols['C']
        cls.object_quantity = QuantityFactory.create_quantity(obj_symbol, "Test string")
        cls.sq_object_as_dict = copy.deepcopy(cls.sq_custom_sym_as_dicts['A'][0])
        cls.sq_object_as_dict.update({
            "data_type": "ObjQuantity",
            "symbol_type": symbols['C'],
            "internal_id": cls.object_quantity._internal_id,
            "value": "Test string",
            "units": None,
            "provenance": ProvenanceStore.from_provenance_element(cls.object_quantity.provenance)
        })
        cls.sq_object_json = copy.deepcopy(cls.sq_object_as_dict)
        cls.sq_object_json.update(
            {"symbol_type": cls.custom_syms_as_dicts['C'],
             "provenance": {'@module': 'propnet.dbtools.storage',
                            '@class': 'ProvenanceStore',
                            'model': None,
                            'inputs': None,
                            'source': cls.object_quantity.provenance.source}}
        )

        # This setting allows dict differences to be shown in full
        cls.maxDiff = None
Esempio n. 4
0
import logging

logger = logging.getLogger(__name__)

mpr = MPRester()

try:
    store = loadfn(environ["PROPNET_STORE_FILE"])
    store.connect()
except (ServerSelectionTimeoutError, KeyError):
    from maggma.stores import MemoryStore
    store = MemoryStore()
    store.connect()
    # layout won't work if database is down, but at least web app will stay up
    scalar_symbols = {k: v for k, v in Registry("symbols").items()
                      if (v.category == 'property' and v.shape == 1)}
    warning_layout = html.Div('No database connection could be established.',
                              style={'font-family': 'monospace',
                                     'color': 'rgb(211, 84, 0)',
                                     'text-align': 'left',
                                     'font-size': '1.2em'})
else:
    cut_off = 100  # need at least this many available quantities for plot
    """
    scalar_symbols = {k: v for k, v in Registry("symbols").items()
                      if (v.category == 'property' and v.shape == 1
                          and store.query(
                              criteria={f'{k}.mean': {'$exists': True}}).count() > cut_off)}
    """
    scalar_symbols = {
Esempio n. 5
0
def _update_globals():
    """Expose every built-in registered model as a module-level name."""
    module_namespace = globals()
    for model_name, model in Registry("models").items():
        if not model.is_builtin:
            continue
        module_namespace[model_name] = model
Esempio n. 6
0
 def tearDownClass(cls):
     """Clear all registries once the tests of this class have run."""
     Registry.clear_all_registries()
Esempio n. 7
0
 def setUpClass(cls):
     """Clear all registries before the tests of this class run."""
     Registry.clear_all_registries()
Esempio n. 8
0
from propnet.ext.matproj import MPRester
from propnet.ext.aflow import AflowAdapter

# Module-level MPRester and AflowAdapter instances, and the Graph evaluator
# (parallel, 4 workers).
MPR = MPRester()
AFA = AflowAdapter()
graph_evaluator = Graph(parallel=True, max_workers=4)

# Scalar property symbols, ordered by their first display name. This is
# explicitly an OrderedDict so we can go back from the display name to the
# symbol name.
# Condition symbols are left out of the table until we can handle the
# combinatorial blow-up that results from adding a temperature -cml
# TODO: Add condition symbols back when combinatorics problem solved
SCALAR_SYMBOLS = OrderedDict(
    (name, symbol)
    for name, symbol in sorted(Registry("symbols").items(),
                               key=lambda item: item[1].display_names[0])
    if symbol.category == 'property' and symbol.shape == 1
)
# Position in this list == row index in the table
ROW_IDX_TO_SYMBOL_NAME = list(SCALAR_SYMBOLS)

# One empty, editable row per scalar symbol, in the same order
DEFAULT_ROWS = [
    {'Property': symbol.display_names[0], 'Editable Value': ""}
    for symbol in SCALAR_SYMBOLS.values()
]

REMAINING_SYMBOLS = OrderedDict({
    k: v
    for k, v in sorted(Registry("symbols").items(),
                       key=lambda x: x[1].display_names[0])
    if not ((v.category == 'property' or v.category == 'condition')
Esempio n. 9
0
    def evaluate(input_rows, data, aggregate):
        """
        Evaluate the graph on the supplied quantities and render the result.

        Args:
            input_rows (list of dict): rows of the input table; the
                'Editable Value' entry of each row holds the text entered
                for the symbol at the same index of ROW_IDX_TO_SYMBOL_NAME
            data (str): JSON text decoded with MontyDecoder into a mapping
                whose values are added to the input quantities; may be empty
            aggregate: if truthy, aggregated quantities are reported in
                place of the individual ones wherever they exist

        Returns:
            list: the graph container, a line break and the output table

        Raises:
            PreventUpdate: if an entered value cannot be parsed or
                converted, or if there are no quantities to evaluate
        """
        quantities = []

        for row_number, row in enumerate(input_rows):
            if not row['Editable Value']:
                continue
            try:
                value = ureg.parse_expression(row['Editable Value'])
                target_units = Registry("units").get(
                    ROW_IDX_TO_SYMBOL_NAME[row_number])
                value.ito(target_units)
            except Exception:
                # Someone put an invalid value in the table
                # TODO: Make error known to the user
                raise PreventUpdate
            quantities.append(
                QuantityFactory.create_quantity(
                    symbol_type=ROW_IDX_TO_SYMBOL_NAME[row_number],
                    value=value))

        if data and len(data) > 0:
            quantities.extend(json.loads(data, cls=MontyDecoder).values())

        if not quantities:
            raise PreventUpdate

        material = Material()
        for input_quantity in quantities:
            material.add_quantity(input_quantity)

        output_material = graph_evaluator.evaluate(material, timeout=5)

        if not aggregate:
            output_quantities = output_material.get_quantities()
        else:
            aggregated = output_material.get_aggregated_quantities()
            # Quantities whose symbol has no aggregated counterpart are
            # reported individually after the aggregated ones
            leftovers = [
                q for q in output_material.get_quantities()
                if q.symbol not in aggregated
            ]
            output_quantities = list(aggregated.values()) + leftovers

        output_rows = []
        for result in output_quantities:
            output_rows.append({
                'Property': result.symbol.display_names[0],
                'Value': result.pretty_string(sigfigs=3)
            })

        table_columns = [{'id': title, 'name': title}
                         for title in ('Property', 'Value')]
        output_table = dt.DataTable(id='output-table',
                                    data=output_rows,
                                    columns=table_columns,
                                    editable=False,
                                    **DATA_TABLE_STYLE)

        # TODO: clean up

        # Symbols present in the output but not in the input are the
        # derived ones
        input_quantity_names = [q.symbol for q in quantities]
        derived_quantity_names = \
            {q.symbol for q in output_quantities} - set(input_quantity_names)

        # Keep only provenance model names known to the model registry
        model_names = {
            q.provenance.model for q in output_material.get_quantities()
        }
        models_evaluated = []
        for model_name in model_names:
            model = Registry("models").get(model_name)
            if model is not None:
                models_evaluated.append(model)

        derivation_pathway = {
            'inputs': input_quantity_names,
            'outputs': list(derived_quantity_names),
            'models': models_evaluated
        }
        material_graph_data = graph_conversion(
            propnet_nx_graph, derivation_pathway=derivation_pathway)

        graph_options = dcc.Checklist(
            id='material-graph-options',
            options=[
                {'label': 'Show models', 'value': 'show_models'},
                {'label': 'Show properties', 'value': 'show_properties'},
            ],
            value=['show_properties'],
            labelStyle={'display': 'inline-block'})
        graph_view = Cytoscape(id='material-graph',
                               elements=material_graph_data,
                               stylesheet=GRAPH_STYLESHEET,
                               layout=GRAPH_LAYOUT_CONFIG,
                               **GRAPH_SETTINGS['full_view'])
        output_graph = html.Div(children=[graph_options, graph_view])

        return [output_graph, html.Br(), output_table]
Esempio n. 10
0
    def get_data_from_full_db(self, prop_x, prop_y):
        """
        Gather per-material mean values of two properties from the full
        (material-indexed) propnet database, optionally on a random sample.

        Args:
            prop_x (str): name of property x
            prop_y (str): name of property y

        Returns:
            dict: lists of mean magnitudes, one entry per material, keyed by
                property name

        Raises:
            ValueError: if a returned material holds no value for one of
                the properties

        """

        def _has_property(prop):
            # Matches materials having the property among the inputs or as
            # a top-level field
            return {
                '$or': [{
                    'inputs.symbol_type': prop
                }, {
                    prop: {
                        '$exists': True
                    }
                }]
            }

        # Only materials having both properties are of interest
        criteria = {'$and': [_has_property(prop_x), _has_property(prop_y)]}
        properties = [prop_x + '.quantities', prop_y + '.quantities', 'inputs']

        if self.sample_size is not None:
            pipeline = [
                {'$match': criteria},
                {'$sample': {'size': self.sample_size}},
                {'$project': {field: True for field in properties}},
            ]
            pn_data = self.propnet_store.collection.aggregate(
                pipeline, allowDiskUse=True)
        else:
            pn_data = self.propnet_store.query(criteria=criteria,
                                               properties=properties)

        x_unit = Registry("units")[prop_x]
        y_unit = Registry("units")[prop_y]

        # When both names are equal, process the property only once to
        # avoid duplicating the work and the data
        if prop_x == prop_y:
            targets = ((prop_x, x_unit), )
        else:
            targets = ((prop_x, x_unit), (prop_y, y_unit))

        data = defaultdict(list)
        for material in pn_data:
            for prop, unit in targets:
                # All values of this property for this material, with units
                values = [
                    ureg.Quantity(entry['value'], entry['units'])
                    for entry in material['inputs']
                    if entry['symbol_type'] == prop
                ]
                if prop in material:
                    values.extend([
                        ureg.Quantity(entry['value'], entry['units'])
                        for entry in material[prop]['quantities']
                    ])

                if not values:
                    raise ValueError(
                        "Query for property {} gave no results".format(prop))

                # Store the magnitude of the mean in the property's unit
                mean_value = sum(values) / len(values)
                data[prop].append(mean_value.to(unit).magnitude)

        return data
Esempio n. 11
0
 def tearDownClass(cls):
     """
     Set the UnitStrippedWarning filter to "default" and remove every
     symbol that is not built in from the symbol registry.
     """
     warnings.filterwarnings("default", category=UnitStrippedWarning)
     # Collect the names first so the registry is not modified while it is
     # being iterated.
     custom_names = []
     for name, symbol in Registry("symbols").items():
         if symbol.is_builtin:
             continue
         custom_names.append(name)
     for name in custom_names:
         Registry("symbols").pop(name)
Esempio n. 12
0
class CorrelationBuilder(Builder):
    """
    A class to calculate the correlation between properties derived by or used in propnet
    using a suite of regression tools. Uses the Builder architecture for optional parallel
    processing of data.

    Note: serialization of builder does not work with custom correlation functions, although
    interactive use does support them.

    """
    # Names of all registered symbols whose category is 'property' and whose
    # shape is 1. Evaluated once, when the class body is executed, and used
    # as the property list when the constructor is given no ``props``.
    PROPNET_PROPS = [
        v.name for v in Registry("symbols").values()
        if (v.category == 'property' and v.shape == 1)
    ]

    def __init__(self,
                 propnet_store,
                 correlation_store,
                 out_file=None,
                 funcs='linlsq',
                 props=None,
                 sample_size=None,
                 from_quantity_db=True,
                 **kwargs):
        """
        Set up the correlation builder.

        Args:
            propnet_store (Mongolike Store): store pointing to the propnet
                collection, with read access
            correlation_store (Mongolike Store): store pointing to the
                collection receiving the results, with write access
            out_file (str): optional, name of a file to output the data to in
                JSON format (useful if correlation_store is a MemoryStore)
            funcs (`str`, `callable`, list of `str` or `callable`):
                correlation function(s) to use. Built-in functions are
                selected with the following strings:

                linlsq (default): linear least-squares, reports R^2
                pearson: Pearson r-correlation, reports r
                spearman: Spearman rank correlation, reports r
                mic: maximal-information non-parametric exploration,
                    reports maximal information coefficient
                ransac: random sample consensus (RANSAC) regression,
                    reports score
                theilsen: Theil-Sen regression, reports score
                all: runs all correlation functions above

                A callable is registered under "<module>.<name>".
            props (`list` of `str`): optional, properties for which to
                calculate the correlation. Default: None (all possible
                pairs of PROPNET_PROPS)
            sample_size (int): optional, limits the data used for each
                correlation to a random sample of this size.
                Default: None (no limit)
            from_quantity_db (bool): True means propnet_store follows the
                quantity-indexed database schema, False means the full,
                material-indexed schema. Note: querying quantity-indexed
                databases is considerably faster than material-indexed.
                Default: True (quantity schema)
            **kwargs: arguments to the Builder superclass

        Raises:
            ValueError: if an entry of funcs is neither a known name nor a
                callable, if no function ends up selected, or if
                sample_size is smaller than 2
        """

        self.propnet_store = propnet_store
        self.from_quantity_db = from_quantity_db
        self.correlation_store = correlation_store
        self.out_file = out_file

        self._correlation_funcs = self.get_correlation_funcs()

        # A single name or callable is treated as a one-element list
        requested = funcs if isinstance(funcs, list) else [funcs]

        self._funcs = {}
        for func in requested:
            if isinstance(func, str):
                if func == 'all':
                    self._funcs.update(self._correlation_funcs)
                    continue
                if func in self._correlation_funcs:
                    self._funcs[func] = self._correlation_funcs[func]
                    continue
            if callable(func):
                # Custom functions are keyed by their qualified name
                self._funcs[func.__module__ + "." + func.__name__] = func
                continue
            raise ValueError("Invalid correlation function: {}".format(func))

        if not self._funcs:
            raise ValueError("No valid correlation functions selected")

        self._props = props or self.PROPNET_PROPS

        if sample_size is not None and sample_size < 2:
            raise ValueError("Sample size must be greater than 1")
        self.sample_size = sample_size
        self.total = None

        super(CorrelationBuilder, self).__init__(sources=[propnet_store],
                                                 targets=[correlation_store],
                                                 **kwargs)

    @classmethod
    def get_correlation_funcs(cls):
        """
        Collect the built-in correlation functions defined on the class.

        Every callable attribute whose name starts with ``_cfunc_`` counts
        as a built-in correlation function; it is keyed by the attribute
        name with ``_cfunc_`` removed.

        Returns:
            dict: dict of function handles keyed by name

        """
        builtin_funcs = {}
        for attr_name in dir(cls):
            if not re.match(r'^_cfunc_.+$', attr_name):
                continue
            handle = getattr(cls, attr_name)
            if callable(handle):
                builtin_funcs[attr_name.replace('_cfunc_', '')] = handle
        return builtin_funcs

    def get_items(self):
        """
        Collect the data for every pair of properties and generate the data
        sets pairing it with the selected correlation functions.

        Returns:
            (generator): yields dicts of data (see _make_data_combinations())
        """
        n_props = len(self._props)
        self.total = n_props**2 * len(self._funcs)

        # combinations_with_replacement() gives each unordered pair once,
        # i.e. AB but not BA. The reversed pair is left to
        # _make_data_combinations() so the database is queried only once
        # per pair.
        for prop_x, prop_y in combinations_with_replacement(self._props, 2):
            if not self.from_quantity_db:
                data = self.get_data_from_full_db(prop_x, prop_y)
            else:
                data = self.get_data_from_quantity_db(
                    self.propnet_store,
                    prop_x,
                    prop_y,
                    sample_size=self.sample_size)

            yield from self._make_data_combinations(prop_x, prop_y, data)

    @staticmethod
    def get_data_from_quantity_db(store,
                                  *props,
                                  sample_size=None,
                                  include_id=False):
        """
        Collects scalar data from the quantity-only propnet database,
        averages it per material and property, and samples it if desired.

        Args:
            store (maggma.stores.Store): MongoDB store instance for the
                quantity database
            *props (str): property names as strings
            sample_size (int): If specified, limits the number of returned
                records to sample_size, randomly selected. If there are
                fewer records than sample_size, only those records are
                returned. Default: None (all records)
            include_id (bool): True includes the '_id' field, which holds
                the material key of the record. Default: False (do not
                include the field)

        Returns:
            dict: dictionary of data keyed by property name

        """

        # Stage 1: keep only the quantities of the requested properties
        match_stage = {
            '$match': {
                '$or': [{'symbol_type': prop} for prop in props]
            }
        }

        # Stage 2: group by material and average each property separately;
        # quantities of the other properties contribute None to the average
        group_spec = {'_id': '$material_key'}
        for prop in props:
            group_spec[prop] = {
                '$avg': {
                    '$cond': [{
                        "$eq": ['$symbol_type', prop]
                    }, '$value', None]
                }
            }

        pipeline = [match_stage, {'$group': group_spec}]

        # Optional stage 3: random sample of the grouped records
        if sample_size is not None:
            pipeline.append({'$sample': {'size': sample_size}})

        cursor = store.collection.aggregate(pipeline=pipeline,
                                            allowDiskUse=True)

        data = defaultdict(list)
        for record in cursor:
            # Keep a record only if every property has a finite value
            for prop in props:
                value = record[prop]
                if value is None or not np.isfinite(value):
                    break
            else:
                for prop in props:
                    data[prop].append(record[prop])
                if include_id:
                    data['_id'].append(record['_id'])

        return dict(data)

    def get_data_from_full_db(self, prop_x, prop_y):
        """
        Gather per-material mean values of two properties from the full
        (material-indexed) propnet database, optionally on a random sample.

        Args:
            prop_x (str): name of property x
            prop_y (str): name of property y

        Returns:
            dict: lists of mean magnitudes, one entry per material, keyed by
                property name

        Raises:
            ValueError: if a returned material holds no value for one of
                the properties

        """

        def _has_property(prop):
            # Matches materials having the property among the inputs or as
            # a top-level field
            return {
                '$or': [{
                    'inputs.symbol_type': prop
                }, {
                    prop: {
                        '$exists': True
                    }
                }]
            }

        # Only materials having both properties are of interest
        criteria = {'$and': [_has_property(prop_x), _has_property(prop_y)]}
        properties = [prop_x + '.quantities', prop_y + '.quantities', 'inputs']

        if self.sample_size is not None:
            pipeline = [
                {'$match': criteria},
                {'$sample': {'size': self.sample_size}},
                {'$project': {field: True for field in properties}},
            ]
            pn_data = self.propnet_store.collection.aggregate(
                pipeline, allowDiskUse=True)
        else:
            pn_data = self.propnet_store.query(criteria=criteria,
                                               properties=properties)

        x_unit = Registry("units")[prop_x]
        y_unit = Registry("units")[prop_y]

        # When both names are equal, process the property only once to
        # avoid duplicating the work and the data
        if prop_x == prop_y:
            targets = ((prop_x, x_unit), )
        else:
            targets = ((prop_x, x_unit), (prop_y, y_unit))

        data = defaultdict(list)
        for material in pn_data:
            for prop, unit in targets:
                # All values of this property for this material, with units
                values = [
                    ureg.Quantity(entry['value'], entry['units'])
                    for entry in material['inputs']
                    if entry['symbol_type'] == prop
                ]
                if prop in material:
                    values.extend([
                        ureg.Quantity(entry['value'], entry['units'])
                        for entry in material[prop]['quantities']
                    ])

                if not values:
                    raise ValueError(
                        "Query for property {} gave no results".format(prop))

                # Store the magnitude of the mean in the property's unit
                mean_value = sum(values) / len(values)
                data[prop].append(mean_value.to(unit).magnitude)

        return data

    def _make_data_combinations(self, prop_x, prop_y, data):
        """
        Generates combinations of properties and desired correlation functions for evaluation.

        Args:
            prop_x (str): name of property x
            prop_y (str): name of property y
            data (dict): dictionary of data keyed by property name

        Returns: (generator) a generator providing a dictionary with the data for correlation:
            {'x_data': (list<float>) data for independent property (x-axis),
             'x_name': (str) name of independent property,
             'y_data': (list<float>) data for dependent property (y-axis),
             'y_name': (str) name of dependent property,
             'func': (tuple<str, function>) name and function handle for correlation function
             }

        """
        # So we get AB and BA without re-querying, but not two AA
        if prop_x == prop_y:
            prop_combos = ((prop_x, prop_x), )
        else:
            prop_combos = ((prop_x, prop_y), (prop_y, prop_x))
        for x, y in prop_combos:
            for name, func in self._funcs.items():
                data_dict = {
                    'x_data': data.get(x, []),
                    'x_name': x,
                    'y_data': data.get(y, []),
                    'y_name': y,
                    'func': (name, func)
                }
                yield data_dict

    def process_item(self, item):
        """
        Run correlation calculation on a pair of properties using the specified function.

        Args:
            item: (dict) input with keys 'x_name', 'y_name', 'x_data', 'y_data' and
                'func' (a (name, function) tuple)

        Returns: (tuple<str, str, float or Exception, str, int, int or None>) output of
            calculation with necessary information about calculation included.
            Format in tuple:
                independent property (x-axis) name,
                dependent property (y-axis) name,
                correlation value (0.0 if fewer than 2 data points; the caught exception
                    object if the correlation function raised),
                correlation function name,
                number of data points used for correlation,
                length of the shortest path between the properties on the propnet graph,
                    taken as the smaller of the x->y and y->x path lengths.
                    None if neither direction yields a path length.

        """
        prop_x, prop_y = item['x_name'], item['y_name']
        data_x, data_y = item['x_data'], item['y_data']
        func_name, func = item['func']
        n_points = len(data_x)

        g = Graph()
        try:
            path_lengths = (g.get_degree_of_separation(prop_x, prop_y),
                            g.get_degree_of_separation(prop_y, prop_x))
        except ValueError:
            # This shouldn't happen...but just in case
            path_lengths = (None, None)

        # Keep only the directions that produced a length. Filtering on
        # "is not None" (rather than truthiness) keeps a legitimate length of 0.
        known_lengths = [length for length in path_lengths if length is not None]
        path_length = min(known_lengths) if known_lengths else None

        if n_points < 2:
            result = 0.0
        else:
            try:
                result = func(data_x, data_y)
            except Exception as ex:
                # If correlation fails, catch the error, save it, and move on
                result = ex
        return prop_x, prop_y, result, func_name, n_points, path_length

    @staticmethod
    def _cfunc_mic(x, y):
        """
        Compute the maximal information coefficient (MIC) of a data set with minepy.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) maximal information coefficient

        """
        # Imported here so minepy is only needed when this function is used
        from minepy import MINE
        estimator = MINE()
        estimator.compute_score(x, y)
        return estimator.mic()

    @staticmethod
    def _cfunc_linlsq(x, y):
        """
        Get R^2 value for linear least-squares fit of a data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) R^2 value

        """
        from scipy import stats
        fit = stats.linregress(x, y)
        return fit.rvalue**2

    @staticmethod
    def _cfunc_pearson(x, y):
        """
        Get R value for Pearson fit of a data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) Pearson R value

        """
        from scipy import stats
        fit = stats.pearsonr(x, y)
        return fit[0]

    @staticmethod
    def _cfunc_spearman(x, y):
        """
        Get R value for Spearman fit of a data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) Spearman R value

        """
        from scipy import stats
        fit = stats.spearmanr(x, y)
        return fit[0]

    @staticmethod
    def _cfunc_ransac(x, y):
        """
        Fit a random sample consensus (RANSAC) regressor to the data and score it
        on the same data.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) RANSAC score

        """
        from sklearn.linear_model import RANSACRegressor
        # Add a trailing axis so each x value becomes a one-feature sample
        features = np.array(x)[:, np.newaxis]
        # Fixed random_state so repeated runs use the same seed
        model = RANSACRegressor(random_state=21)
        model.fit(features, y)
        return model.score(features, y)

    @staticmethod
    def _cfunc_theilsen(x, y):
        """
        Fit a Theil-Sen regressor to the data and score it on the same data.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) Theil-Sen score

        """
        from sklearn.linear_model import TheilSenRegressor
        # Add a trailing axis so each x value becomes a one-feature sample
        features = np.array(x)[:, np.newaxis]
        # Fixed random_state so repeated runs use the same seed
        model = TheilSenRegressor(random_state=21)
        model.fit(features, y)
        return model.score(features, y)

    def update_targets(self, items):
        """
        Write correlation data to Mongo store.

        Each result becomes one document with the property names, correlation
        function name, number of points, shortest path length and correlation value.
        If the result is an exception, 'correlation' is stored as None and the
        exception class name and args are stored under 'error'.

        The document 'id' is a signed 64-bit integer derived from a SHA-256 digest
        of (property_x, property_y, correlation_func). The builtin hash() is not
        used because string hashes are salted per interpreter process, which would
        give the same result a different id on every run.

        Args:
            items: (list<tuple>) list of results output by process_item()

        """
        import hashlib

        data = []
        for item in items:
            prop_x, prop_y, result, func_name, n_points, path_length = item
            # repr() of the tuple is an unambiguous, run-independent key
            id_source = repr((prop_x, prop_y, func_name)).encode('utf-8')
            d = {
                'property_x': prop_x,
                'property_y': prop_y,
                'correlation_func': func_name,
                'n_points': n_points,
                'shortest_path_length': path_length,
                'id': int.from_bytes(hashlib.sha256(id_source).digest()[:8],
                                     'big', signed=True)
            }
            if not isinstance(result, Exception):
                d['correlation'] = result
            else:
                d['correlation'] = None
                d['error'] = (result.__class__.__name__, result.args)
            data.append(d)
        self.correlation_store.update(data, key='id')

    def finalize(self, cursor=None):
        """
        Requests indexes on the correlation store, writes the correlation data to a
        JSON file if one was specified at instantiation, and runs the clean-up
        function of the Builder superclass.

        A failed index request or a failure to open the output file is logged as a
        warning and does not stop finalization.

        Args:
            cursor: (Mongo Store cursor) optional, cursor to close if not automatically closed.

        """
        indexed_fields = ('property_x', 'property_y', 'correlation_func',
                          'correlation', 'shortest_path_length')
        for field in indexed_fields:
            index_ok = self.correlation_store.ensure_index(field)
            if not index_ok:
                logger.warning(
                    "Could not add index for property {}".format(field))

        if self.out_file:
            try:
                self.write_correlation_data_file(self.out_file)
            except OSError:
                logger.warning(
                    "Cannot open file for writing! Skipping file writing.")

        super(CorrelationBuilder, self).finalize(cursor)

    def write_correlation_data_file(self, out_file):
        """
        Builds the correlation matrix document and dumps it to a file as JSON.

        Args:
            out_file: (str) file path and name for output to JSON file
        """
        # Build the document before opening the file so a failure while
        # collecting the data does not leave an empty file behind
        document = self.get_correlation_matrices()
        with open(out_file, 'w') as handle:
            json.dump(document, handle)

    def get_correlation_matrices(self, func_name=None):
        """
        Builds document containing the correlation matrix with relevant data regarding
        correlation algorithm and properties of the data set.

        The property list is sorted so the row/column layout of the matrices is the
        same on every run.

        Args:
            func_name: (str or list<str>) optional, name(s) of the correlation functions to
                include in the document
                default: None, which is to include all that were run by this builder.

        Returns: (dict) document containing correlation data. Format:
            {'properties': (list<str>) sorted names of properties, in the order in which they
                    are indexed in the matrices
             'n_points': (list<list<int>>) list of lists (i.e. matrix) containing the number of data
                    points evaluated during the fitting procedure (filled symmetrically)
             'shortest_path_length': (list<list<int>>) matrix of the stored shortest path lengths
                    between properties
             'correlation': (dict<str: list<list<float>>>) dictionary of matrices containing correlation
                    results, keyed by correlation function name
            }
            Matrix cells with no stored document keep their initial value of 0.0.
            'n_points' and 'shortest_path_length' stay None if no function is processed.

        """

        prop_data = self.correlation_store.query(
            criteria={'property_x': {
                '$exists': True
            }},
            properties=['property_x'])
        # Sorted: iterating a set of strings gives an order that changes
        # between interpreter runs, which would shuffle the matrices
        props = sorted(set(item['property_x'] for item in prop_data))
        # Row/column position of each property, computed once instead of
        # scanning the list with props.index() for every stored document
        prop_index = {prop: position for position, prop in enumerate(props)}
        n_props = len(props)

        out = {
            'properties': props,
            'n_points': None,
            'shortest_path_length': None,
            'correlation': {}
        }

        if not func_name:
            func_name = list(self._funcs.keys())

        if isinstance(func_name, str):
            func_name = [func_name]

        for f in func_name:
            data = self.correlation_store.query(
                criteria={'correlation_func': f})
            corr_matrix = np.zeros(shape=(n_props, n_props)).tolist()

            # n_points and path lengths do not depend on the correlation
            # function, so they are filled from the first function only
            fill_info_matrices = False
            if not out['n_points'] and not out['shortest_path_length']:
                fill_info_matrices = True
                out['n_points'] = np.zeros(shape=(n_props, n_props)).tolist()
                out['shortest_path_length'] = np.zeros(
                    shape=(n_props, n_props)).tolist()

            for d in data:
                prop_x, prop_y = d['property_x'], d['property_y']
                correlation = d['correlation']
                n_points = d['n_points']
                path_length = d['shortest_path_length']
                ia, ib = prop_index[prop_x], prop_index[prop_y]
                corr_matrix[ia][ib] = correlation

                if fill_info_matrices:
                    out['n_points'][ia][ib] = n_points
                    out['n_points'][ib][ia] = n_points
                    out['shortest_path_length'][ia][ib] = path_length

            out['correlation'][f] = corr_matrix

        return out

    def as_dict(self):
        """
        Returns the representation of the builder as a dictionary in JSON serializable format.
        Note: because functions are not JSON serializable, only the names of built-in
            correlation functions are kept; custom functions are omitted (with a
            warning) when serializing the object.

        Returns: (dict) representation of this builder as a JSON-serializable dictionary,
            with 'funcs' replaced by a list of built-in function names

        """
        doc = super(CorrelationBuilder, self).as_dict()

        builtin_names = []
        for func_name in doc['funcs'].keys():
            if func_name not in self._correlation_funcs:
                logger.warning(
                    "Cannot serialize custom function '{}'. Omitting.".format(
                        func_name))
                continue
            builtin_names.append(func_name)

        if len(builtin_names) == 0:
            logger.warning(
                "No functions were able to be serialized from this builder.")

        doc['funcs'] = builtin_names
        return doc
Esempio n. 13
0
 def setUpClass(cls) -> None:
     """
     Clear all registries, then add the built-in symbols back to the registry.

     NOTE(review): presumably this gives the tests a known registry state that is
     not affected by what ran earlier -- confirm against the test class.
     """
     # Order matters: clear first, then re-register the built-ins
     Registry.clear_all_registries()
     add_builtin_symbols_to_registry()
Esempio n. 14
0
    def process(self, item):
        """
        Build a material from one source document, evaluate it with the graph
        evaluator and return the resulting propnet document.

        Args:
            item: (dict) source document; must contain 'task_id'. Fields listed in
                self.materials_symbol_map are read from it, along with the optional
                'created_at', 'pretty_formula', 'deprecated' and 'sbxn' fields.

        Returns: (dict) sanitized document containing:
            'inputs': the quantities the material held before graph evaluation,
            one entry per symbol that gained quantities during evaluation, holding
                'quantities' (and 'error' if they could not be serialized) plus
                'mean', 'std_dev', 'units' and 'title' when an aggregated quantity exists,
            'task_id', 'pretty_formula', 'deprecated',
            'sbxn' (only if self.include_sandboxed is set)

        """
        # Only warn (do not abort) when graph parallelism is requested from
        # what appears to be a child process
        if self.graph_parallel and not self.allow_child_process and \
                current_process().name != "MainProcess":
            logger.warning(
                "It appears derive_quantities() is running "
                "in a child process, possibly in a parallelized "
                "Runner.\nThis is not recommended and will deteriorate "
                "performance.")
        # Define quantities corresponding to materials doc fields
        # Attach quantities to materials
        item = MontyDecoder().process_decoded(item)
        logger.info("Populating material for %s", item['task_id'])
        material = Material()

        # 'created_at' is optional in the source document
        if 'created_at' in item.keys():
            date_created = item['created_at']
        else:
            date_created = None

        # The same provenance record is attached to every input quantity below
        provenance = ProvenanceElement(
            source={
                "source": self.source_name,
                "source_key": item['task_id'],
                "date_created": date_created
            })

        for mkey, property_name in self.materials_symbol_map.items():
            value = pydash.get(item, mkey)
            # NOTE(review): this truthiness test also skips falsy values such as
            # 0 or an empty list, not just missing fields -- confirm that is intended
            if value:
                material.add_quantity(
                    QuantityFactory.create_quantity(
                        property_name,
                        value,
                        units=Registry("units").get(property_name, None),
                        provenance=provenance))

        # Add custom things, e. g. computed entry
        computed_entry = get_entry(item)
        if computed_entry:
            material.add_quantity(
                QuantityFactory.create_quantity("computed_entry",
                                                computed_entry,
                                                provenance=provenance))
        else:
            logger.info("Unable to create computed entry for {}".format(
                item['task_id']))
        material.add_quantity(
            QuantityFactory.create_quantity("external_identifier_mp",
                                            item['task_id'],
                                            provenance=provenance))

        # Taken before graph evaluation; used below for the 'inputs' list and
        # for the per-symbol comparison against the evaluated material
        input_quantities = material.symbol_quantities_dict

        # Use graph to generate expanded quantity pool
        logger.info("Evaluating graph for %s", item['task_id'])

        new_material = self._graph_evaluator.evaluate(
            material, timeout=self.graph_timeout)

        # Format document and return
        logger.info("Creating doc for %s", item['task_id'])
        # Gives the initial inputs that were used to derive properties of a
        # certain material.

        doc = {
            "inputs": [
                StorageQuantity.from_quantity(q)
                for q in chain.from_iterable(input_quantities.values())
            ]
        }

        for symbol, quantities in new_material.symbol_quantities_dict.items():
            # If no new quantities of a given symbol were derived (i.e. if the initial
            # input quantity/ies is/are the only one/s listed in the new material) then don't add
            # that quantity to the propnet entry document as a derived quantity.
            if len(quantities) == len(input_quantities[symbol]):
                continue
            sub_doc = {}
            try:
                # Write out all quantities as dicts including the
                # internal ID for provenance tracing
                qs = [
                    jsanitize(StorageQuantity.from_quantity(q), strict=True)
                    for q in quantities
                ]
            except AttributeError as ex:
                # Check to see if this is an error caused by an object
                # that is not JSON serializable
                msg = ex.args[0]
                if "object has no attribute 'as_dict'" in msg:
                    # Write error to db and logger
                    errmsg = "Quantity of Symbol '{}' is not ".format(symbol.name) + \
                        "JSON serializable. Cannot write quantities to database!"
                    logger.error(errmsg)
                    sub_doc['error'] = errmsg
                    qs = []
                else:
                    # If not, re-raise the error
                    raise ex
            sub_doc['quantities'] = qs
            doc[symbol.name] = sub_doc

        aggregated_quantities = new_material.get_aggregated_quantities()

        for symbol, quantity in aggregated_quantities.items():
            if symbol.name not in doc:
                # No new quantities were derived
                continue
            # Store mean and std dev for aggregated quantities
            sub_doc = {
                "mean": unumpy.nominal_values(quantity.magnitude).tolist(),
                "std_dev": unumpy.std_devs(quantity.magnitude).tolist(),
                "units":
                quantity.units.format_babel() if quantity.units else None,
                "title": quantity.symbol.display_names[0]
            }
            # Symbol Name -> Sub_Document, listing all Quantities of that type.
            doc[symbol.name].update(sub_doc)

        # Copy identifying fields over from the source document
        doc.update({
            "task_id": item["task_id"],
            "pretty_formula": item.get("pretty_formula"),
            "deprecated": item.get("deprecated", False)
        })

        if self.include_sandboxed:
            doc.update({'sbxn': item.get("sbxn", [])})

        return jsanitize(doc, strict=True)
Esempio n. 15
0
class CorrelationBuilder(Builder):
    """
    A class to calculate the correlation between properties derived by or used in propnet
    using a suite of regression tools. Uses the Builder architecture for optional parallel
    processing of data.

    Note: serialization of builder does not work with custom correlation functions, although
    interactive use does support them.

    """
    # TODO: Add these symbols to propnet so we don't have to bring them in explicitly?
    MP_QUERY_PROPS = [
        "piezo.eij_max", "elasticity.universal_anisotropy",
        "diel.poly_electronic", "total_magnetization", "efermi",
        "magnetism.total_magnetization_normalized_vol"
    ]
    PROPNET_PROPS = [
        v.name for v in Registry("symbols").values()
        if (v.category == 'property' and v.shape == 1)
    ]

    def __init__(self,
                 propnet_store,
                 mp_store,
                 correlation_store,
                 out_file=None,
                 funcs='linlsq',
                 props=None,
                 **kwargs):
        """
        Constructor for the correlation builder.

        Args:
            propnet_store: (Mongolike Store) store instance pointing to propnet collection
                with read access
            mp_store: (Mongolike Store) store instance pointing to Materials Project collection with read access
            correlation_store: (Mongolike Store) store instance pointing to collection with write access
            out_file: (str) optional, filename to output data in JSON format (useful if using a MemoryStore
                for correlation_store)
            funcs: (str, function, list<str, function>) functions to use for correlation. Custom functions
                are registered under the name "<module>.<function name>". Built-in functions can
                be specified by the following strings:

                linlsq (default): linear least-squares, reports R^2
                pearson: Pearson r-correlation, reports r
                spearman: Spearman rank correlation, reports r
                mic: maximal-information non-parametric exploration, reports maximal information coefficient
                ransac: random sample consensus (RANSAC) regression, reports score
                theilsen: Theil-Sen regression, reports score
                all: runs all correlation functions above
            props: (str, list<str>) optional, names of the properties to correlate. Names found in
                PROPNET_PROPS are read from the propnet store, names matching the short form of an
                MP_QUERY_PROPS entry are read from the MP store, and any other name is ignored.
                default: None, which is to use all of PROPNET_PROPS and MP_QUERY_PROPS.
            **kwargs: arguments to the Builder superclass

        Raises:
            ValueError: if an entry of funcs is neither a known built-in name nor a callable,
                or if no correlation function ends up selected
        """

        self.propnet_store = propnet_store
        self.mp_store = mp_store
        self.correlation_store = correlation_store
        self.out_file = out_file

        # Discover the built-in correlation functions: every callable attribute
        # named _cfunc_<name> is registered under <name>
        builtin_funcs = {}
        for attr_name in dir(self):
            if not re.match(r'^_cfunc_.+$', attr_name):
                continue
            if callable(getattr(self, attr_name)):
                builtin_funcs[attr_name.replace('_cfunc_', '')] = getattr(
                    self, attr_name)
        self._correlation_funcs = builtin_funcs

        self._funcs = {}
        requested = funcs if isinstance(funcs, list) else [funcs]
        for f in requested:
            is_name = isinstance(f, str)
            if is_name and f == 'all':
                self._funcs.update(self._correlation_funcs)
            elif is_name and f in self._correlation_funcs:
                self._funcs[f] = self._correlation_funcs[f]
            elif callable(f):
                self._funcs[f.__module__ + "." + f.__name__] = f
            else:
                raise ValueError("Invalid correlation function: {}".format(f))

        if not self._funcs:
            raise ValueError("No valid correlation functions selected")

        # Map the short name of each MP property (the part after the dot for
        # two-part names) to the full name used when querying
        mp_prop_map = {}
        for query_prop in self.MP_QUERY_PROPS:
            parts = query_prop.split(".")
            short_name = parts[1] if len(parts) == 2 else query_prop
            mp_prop_map[short_name] = query_prop

        self._props = props
        if props:
            if isinstance(props, str):
                props = [props]
            self.propnet_props = []
            self.mp_props = []
            self.mp_query_props = []
            for p in props:
                if p in self.PROPNET_PROPS:
                    self.propnet_props.append(p)
                elif p in mp_prop_map:
                    self.mp_props.append(p)
                    self.mp_query_props.append(mp_prop_map[p])
        else:
            self.mp_query_props = self.MP_QUERY_PROPS
            self.mp_props = list(mp_prop_map.keys())
            self.propnet_props = self.PROPNET_PROPS

        super(CorrelationBuilder,
              self).__init__(sources=[propnet_store, mp_store],
                             targets=[correlation_store],
                             **kwargs)

    def get_items(self):
        """
        Collects scalar data from propnet and MP databases, reduces it to one value per
        material and property, and yields one work item for every ordered pair of
        properties (including a property paired with itself, as a sanity check) and
        every selected correlation function.

        Only materials that have a value for both properties of a pair contribute to
        that pair's data.

        Returns: (generator) a generator providing a dictionary with the data for correlation:
            {'x_data': (list<float>) data for independent property (x-axis),
             'x_name': (str) name of independent property,
             'y_data': (list<float>) data for dependent property (y-axis),
             'y_name': (str) name of dependent property,
             'func': (tuple<str, function>) name and function handle for correlation function
             }

        """

        def to_floats(values, prop_name):
            # MP data does not have units listed in database, so will be floats. propnet
            # data may not have the same units as the MP data, so is stored as pint
            # quantities. Here, the quantities are coerced into the units stored on the
            # symbol and converted to floats; plain numbers pass through untouched.
            if not any(isinstance(v, ureg.Quantity) for v in values):
                return values
            target_units = Registry("symbols")[prop_name].units
            return [
                v.to(target_units).magnitude
                if isinstance(v, ureg.Quantity) else v for v in values
            ]

        # Request the mean, units and quantities of every propnet property
        propnet_fields = [
            p + suffix for suffix in ('.mean', '.units', '.quantities')
            for p in self.propnet_props
        ]
        propnet_docs = self.propnet_store.query(
            criteria={}, properties=propnet_fields + ['task_id', 'inputs'])

        data = defaultdict(dict)
        for material in propnet_docs:
            mpid = material['task_id']

            # Quantities supplied as inputs for the properties of interest
            quantities = defaultdict(list)
            for q in material['inputs']:
                symbol = q['symbol_type']
                if symbol in self.propnet_props:
                    quantities[symbol].append(
                        ureg.Quantity(q['value'], q['units']))

            for prop, values in material.items():
                if prop not in self.propnet_props:
                    continue
                if prop in quantities:
                    # Property also appears in the inputs: pool the individual
                    # stored quantities with the input quantities
                    quantities[prop].extend(
                        ureg.Quantity(q['value'], q['units'])
                        for q in values['quantities'])
                else:
                    # Otherwise the stored mean is used directly
                    quantities[prop] = [
                        ureg.Quantity(values['mean'], values['units'])
                    ]

            data[mpid].update({
                prop: sum(qs) / len(qs)
                for prop, qs in quantities.items()
            })

        # TODO: Add these symbols to propnet so we don't have to bring them in explicitly?
        mp_docs = self.mp_store.query(criteria={},
                                      properties=self.mp_query_props +
                                      ['task_id'])

        for material in mp_docs:
            mpid = material['task_id']
            for prop, value in material.items():
                if isinstance(value, dict):
                    # Nested fields are matched on their dotted name and stored
                    # under the sub-field name only
                    for sub_prop, sub_value in value.items():
                        if prop + '.' + sub_prop not in self.mp_query_props:
                            continue
                        if sub_value is not None:
                            data[mpid][sub_prop] = sub_value
                elif prop in self.mp_query_props and value is not None:
                    data[mpid][prop] = value

        # product() produces all possible combinations of properties
        all_props = self.propnet_props + self.mp_props
        for prop_x, prop_y in product(all_props, repeat=2):
            paired = [(props_data[prop_x], props_data[prop_y])
                      for props_data in data.values()
                      if prop_x in props_data and prop_y in props_data]
            x_float = to_floats([pair[0] for pair in paired], prop_x)
            y_float = to_floats([pair[1] for pair in paired], prop_y)

            for func_entry in self._funcs.items():
                yield {
                    'x_data': x_float,
                    'x_name': prop_x,
                    'y_data': y_float,
                    'y_name': prop_y,
                    'func': func_entry
                }

    def process_item(self, item):
        """
        Run correlation calculation on a pair of properties using the specified function.

        Args:
            item: (dict) input provided by get_items() (see get_items() for structure)

        Returns: (tuple<str, str, float, str, int, int or None>) output of calculation with
            necessary information about calculation included. Format in tuple:
                independent property (x-axis) name,
                dependent property (y-axis) name,
                correlation value (0.0 if fewer than 2 data points are available),
                correlation function name,
                number of data points used for correlation,
                length of shortest path between properties on propnet graph where x-axis property
                    is starting property and y-axis property is ending property.
                    Note: the path length is None if the graph lookup raises ValueError. This
                    does not preclude y->x having a forward path.

        """
        func_name, func = item['func']
        prop_x = item['x_name']
        prop_y = item['y_name']
        data_x = item['x_data']
        data_y = item['y_data']
        n_points = len(data_x)

        graph = Graph()
        try:
            path_length = graph.get_degree_of_separation(prop_x, prop_y)
        except ValueError:
            path_length = None

        # A correlation needs at least two points; report 0.0 otherwise
        correlation = func(data_x, data_y) if n_points >= 2 else 0.0
        return prop_x, prop_y, correlation, func_name, n_points, path_length

    @staticmethod
    def _cfunc_mic(x, y):
        """
        Compute the maximal information coefficient (MIC) of a data set with minepy.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) maximal information coefficient

        """
        # Imported here so minepy is only needed when this function is used
        from minepy import MINE
        estimator = MINE()
        estimator.compute_score(x, y)
        return estimator.mic()

    @staticmethod
    def _cfunc_linlsq(x, y):
        """
        Get R^2 value for linear least-squares fit of a data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) R^2 value

        """
        from scipy import stats
        fit = stats.linregress(x, y)
        return fit.rvalue**2

    @staticmethod
    def _cfunc_pearson(x, y):
        """
        Get R value for Pearson fit of a data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) Pearson R value

        """
        from scipy import stats
        fit = stats.pearsonr(x, y)
        return fit[0]

    @staticmethod
    def _cfunc_spearman(x, y):
        """
        Get R value for Spearman fit of a data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) Spearman R value

        """
        from scipy import stats
        fit = stats.spearmanr(x, y)
        return fit[0]

    @staticmethod
    def _cfunc_ransac(x, y):
        """
        Fit a random sample consensus (RANSAC) regressor to the data and score it
        on the same data.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) RANSAC score

        """
        from sklearn.linear_model import RANSACRegressor
        # Add a trailing axis so each x value becomes a one-feature sample
        features = np.array(x)[:, np.newaxis]
        # Fixed random_state so repeated runs use the same seed
        model = RANSACRegressor(random_state=21)
        model.fit(features, y)
        return model.score(features, y)

    @staticmethod
    def _cfunc_theilsen(x, y):
        """
        Fit a Theil-Sen regressor to the data and score it on the same data.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) Theil-Sen score

        """
        from sklearn.linear_model import TheilSenRegressor
        # Add a trailing axis so each x value becomes a one-feature sample
        features = np.array(x)[:, np.newaxis]
        # Fixed random_state so repeated runs use the same seed
        model = TheilSenRegressor(random_state=21)
        model.fit(features, y)
        return model.score(features, y)

    def update_targets(self, items):
        """
        Write correlation data to Mongo store.

        Args:
            items: (list<dict>) list of results output by process_item()

        """
        data = []
        for item in items:
            prop_x, prop_y, correlation, func_name, n_points, path_length = item
            data.append({
                'property_x': prop_x,
                'property_y': prop_y,
                'correlation': correlation,
                'correlation_func': func_name,
                'n_points': n_points,
                'shortest_path_length': path_length,
                'id': hash((prop_x, prop_y)) ^ hash(func_name)
            })
        self.correlation_store.update(data, key='id')

    def finalize(self, cursor=None):
        """
        Optionally dump the correlation data to a JSON file, then delegate to
        the parent class's finalize() for clean-up.

        The file is only written when ``self.out_file`` is truthy. An OSError
        raised while writing is logged as a warning and does not prevent the
        parent clean-up from running.

        Args:
            cursor: (Mongo Store cursor) optional, cursor to close if not automatically closed.

        """
        destination = self.out_file
        if destination:
            try:
                self.write_correlation_data_file(destination)
            except OSError:
                logger.warning(
                    "Cannot open file for writing! Skipping file writing.")

        parent = super(CorrelationBuilder, self)
        parent.finalize(cursor)

    def write_correlation_data_file(self, out_file):
        """
        Gets data dictionary containing correlation matrices and outputs to a file.

        Args:
            out_file: (str) file path and name for output to JSON file
        """
        matrix = self.get_correlation_matrices()
        with open(out_file, 'w') as f:
            json.dump(matrix, f)

    def get_correlation_matrices(self, func_name=None):
        """
        Builds document containing the correlation matrix with relevant data regarding
        correlation algorithm and properties of the data set.

        Args:
            func_name: (str) optional, name of the correlation functions to include in the document
                default: None, which is to include all that were run by this builder.

        Returns: (dict) document containing correlation data. Format:
            {'properties': (list<str>) names of properties calculated in order of how they are indexed
                    in the matrices
             'n_points': (list<list<int>>) list of lists (i.e. matrix) containing the number of data
                    points evaluated during the fitting procedure
             'correlation': (dict<str: list<list<float>>>) dictionary of matrices containing correlation
                    results, keyed by correlation function name
            }

        """

        prop_data = self.correlation_store.query(
            criteria={'property_x': {
                '$exists': True
            }},
            properties=['property_x'])
        props = list(set(item['property_x'] for item in prop_data))

        out = {
            'properties': props,
            'n_points': None,
            'shortest_path_length': None,
            'correlation': {}
        }

        if not func_name:
            func_name = list(self._funcs.keys())

        if isinstance(func_name, str):
            func_name = [func_name]

        for f in func_name:
            data = self.correlation_store.query(
                criteria={'correlation_func': f})
            corr_matrix: list = np.zeros(shape=(len(props),
                                                len(props))).tolist()

            fill_info_matrices = False
            if not out['n_points'] and not out['shortest_path_length']:
                fill_info_matrices = True
                out['n_points'] = np.zeros(shape=(len(props),
                                                  len(props))).tolist()
                out['shortest_path_length'] = np.zeros(
                    shape=(len(props), len(props))).tolist()

            for d in data:
                prop_x, prop_y, correlation, n_points, path_length = d['property_x'], \
                                                                     d['property_y'], \
                                                                     d['correlation'], \
                                                                     d['n_points'], \
                                                                     d['shortest_path_length']
                ia, ib = props.index(prop_x), props.index(prop_y)
                corr_matrix[ia][ib] = correlation

                if fill_info_matrices:
                    out['n_points'][ia][ib] = n_points
                    out['n_points'][ib][ia] = n_points
                    out['shortest_path_length'][ia][ib] = path_length

            out['correlation'][f] = corr_matrix

        return out

    def as_dict(self):
        """
        Build the dictionary representation of this builder, replacing the
        'funcs' entry of the parent's representation with a list of names.

        Only names that are also keys of ``self._correlation_funcs`` are kept;
        every other (custom) function is dropped with a warning because
        functions are not JSON serializable.

        Returns: (dict) representation of this builder as a JSON-serializable dictionary

        """
        d = super(CorrelationBuilder, self).as_dict()
        known_funcs = self._correlation_funcs

        serialized_funcs = []
        for name in d['funcs']:
            if name not in known_funcs:
                logger.warning(
                    "Cannot serialize custom function '{}'. Omitting.".format(
                        name))
                continue
            serialized_funcs.append(name)

        if len(serialized_funcs) == 0:
            logger.warning(
                "No functions were able to be serialized from this builder.")

        d['funcs'] = serialized_funcs
        return d
Esempio n. 16
0
    def get_items(self):
        """
        Collects scalar data from propnet and MP databases, aggregates it by property, and creates
        a generator to iterate over all pairs of properties, including pairing of the same property
        with itself for sanity check, and correlation functions.

        Steps, in order:
            1. Query ``self.propnet_store`` and reduce every property of every material to a
               single averaged pint quantity.
            2. Query ``self.mp_store`` and add its non-None values to the same per-material
               dictionary.
            3. For every ordered pair of properties, gather the values of all materials that
               have both, convert pint quantities to floats, and yield one item per
               correlation function in ``self._funcs``.

        No check is made that a pair has any data: the yielded lists are empty when no
        material has both properties.

        Returns: (generator) a generator providing a dictionary with the data for correlation:
            {'x_data': (list<float>) data for independent property (x-axis),
             'x_name': (str) name of independent property,
             'y_data': (list<float>) data for dependent property (y-axis),
             'y_name': (str) name of dependent property,
             'func': (tuple<str, function>) name and function handle for correlation function
             }

        """
        # Maps material id ('task_id') -> {property name: value}.
        data = defaultdict(dict)

        # For each propnet property request its 'mean', 'units' and 'quantities'
        # sub-fields, plus the material id and the list of input quantities.
        propnet_data = self.propnet_store.query(
            criteria={},
            properties=[p + '.mean' for p in self.propnet_props] +
            [p + '.units' for p in self.propnet_props] +
            [p + '.quantities'
             for p in self.propnet_props] + ['task_id', 'inputs'])

        for material in propnet_data:
            mpid = material['task_id']

            # Quantities found among the material's inputs, grouped by symbol.
            input_d = defaultdict(list)
            for q in material['inputs']:
                if q['symbol_type'] in self.propnet_props:
                    this_q = ureg.Quantity(q['value'], q['units'])
                    input_d[q['symbol_type']].append(this_q)

            for prop, values in material.items():
                if prop in self.propnet_props:
                    if prop in input_d.keys():
                        # The property also occurs among the inputs: add every
                        # individual stored quantity so that the average below
                        # is taken over inputs and stored quantities together.
                        for q in values['quantities']:
                            input_d[prop].append(
                                ureg.Quantity(q['value'], q['units']))
                    else:
                        # Not an input: the stored mean is used as the only value.
                        this_q = ureg.Quantity(values['mean'], values['units'])
                        input_d[prop] = [this_q]

            # Collapse each property's quantities to their arithmetic mean.
            data[mpid].update({k: sum(v) / len(v) for k, v in input_d.items()})

        # TODO: Add these symbols to propnet so we don't have to bring them in explicitly?

        mp_data = self.mp_store.query(criteria={},
                                      properties=self.mp_query_props +
                                      ['task_id'])

        for material in mp_data:
            mpid = material['task_id']
            for prop, value in material.items():
                if isinstance(value, dict):
                    # Nested field: it is matched against mp_query_props by its
                    # dotted path ('prop.sub_prop') but stored under the
                    # sub-field name alone.
                    for sub_prop, sub_value in value.items():
                        if prop + '.' + sub_prop in self.mp_query_props and sub_value is not None:
                            data[mpid][sub_prop] = sub_value
                elif prop in self.mp_query_props and value is not None:
                    data[mpid][prop] = value

        # product() produces all possible combinations of properties
        for prop_x, prop_y in product(self.propnet_props + self.mp_props,
                                      repeat=2):
            x = []
            y = []
            # Only materials that have a value for both properties contribute,
            # which keeps x and y the same length and aligned by material.
            for props_data in data.values():
                if prop_x in props_data.keys() and prop_y in props_data.keys():
                    x.append(props_data[prop_x])
                    y.append(props_data[prop_y])

            # MP data does not have units listed in database, so will be floats. propnet
            # data may not have the same units as the MP data, so is stored as pint
            # quantities. Here, the quantities are coerced into the units registered for
            # the symbol and converted to floats; values that are not pint quantities
            # are passed through unchanged.
            if x and any(isinstance(v, ureg.Quantity) for v in x):
                x_float = [
                    xx.to(Registry("symbols")[prop_x].units).magnitude
                    if isinstance(xx, ureg.Quantity) else xx for xx in x
                ]
            else:
                x_float = x
            if y and any(isinstance(v, ureg.Quantity) for v in y):
                y_float = [
                    yy.to(Registry("symbols")[prop_y].units).magnitude
                    if isinstance(yy, ureg.Quantity) else yy for yy in y
                ]
            else:
                y_float = y

            # One work item per correlation function for this property pair.
            for name, func in self._funcs.items():
                data_dict = {
                    'x_data': x_float,
                    'x_name': prop_x,
                    'y_data': y_float,
                    'y_name': prop_y,
                    'func': (name, func)
                }
                yield data_dict