コード例 #1
0
    def run_implementation(self):
        """Run the configured module over the data fusion output.

        The fused records of the project are always included. When
        ``self.only_matches`` is falsy, the non-matching records are appended
        as well. The records and the project's global schema are handed to
        the module returned by ``self._load_module``.

        Returns:
            Whatever the loaded module's ``run()`` returns.
        """
        store = DALMongo(self.project_id)

        # Start from the results of the data fusion step.
        exported = store.get_fused_records()
        if not self.only_matches:
            exported += store.get_non_matches()

        module = self._load_module(
            records=exported,
            schema=store.get_global_schema(),
        )
        return module.run()
コード例 #2
0
    def config_json(project_id):
        """Build the configuration description for the ``keys`` rows widget.

        Each row offers two dropdowns: one listing the global-schema columns
        whose name starts with ``__new__`` and one listing the modules
        returned by ``dynamic_loading.list_modules('encoding')``.

        Args:
            project_id: Project identifier passed to ``DALMongo``.

        Returns:
            A dict with a single ``'keys'`` entry of type ``'rows'`` that
            starts with no rows and carries the row model described above.
        """
        dal = DALMongo(project_id)

        column_options = []
        for column in dal.get_global_schema():
            # NOTE(review): the original author suspected that this check of
            # the ``__new__`` prefix is unnecessary; it is kept as-is.
            if not column['name'].startswith('__new__'):
                continue
            column_options.append({
                'label': column['custom_name'],
                'value': column['name'],
                'id': column['name'],
                'config': {
                    'key': {
                        'type': 'hidden',
                        'value': column['name'],
                    },
                },
            })

        encoding_options = dynamic_loading.list_modules('encoding')

        key_dropdown = {
            'type': 'dropdown',
            'label': 'Select a column',
            'selectedoption': {},
            'options': column_options,
        }
        encoding_dropdown = {
            'type': 'dropdown',
            'label': 'Select encoding',
            'selectedoption': {},
            'options': encoding_options,
        }
        row_template = {
            'type': 'row',
            'cols': {
                '1_key': key_dropdown,
                'encoding': encoding_dropdown,
            },
        }

        return {
            'keys': {
                'type': 'rows',
                'rows': [],
                'label': 'Keys',
                'rowmodel': row_template,
            },
        }
コード例 #3
0
    def run_implementation(self):
        """Compare every candidate record pair and store the similarity vectors.

        For each indexing group, every record in ``group.records1`` is paired
        with every record in ``group.records2``. One ``SimilarityVector`` is
        built per pair, and the full list is passed to
        ``self._append_result_collection``.

        Only global-schema columns whose name starts with ``__new__`` are
        compared. How ``self.config`` is read depends on
        ``self.segmentation_skipped``:

        * falsy: ``self.config`` maps an output field name to a comparison
          module config. For each column, only the output fields declared in
          that column's ``fields`` produce an entry in the vector.
        * truthy: ``self.config`` is indexed by column name and the whole
          column values are compared.

        Every similarity value is scaled by ``weight / max_weight``, where
        ``max_weight`` is the largest ``weight`` found in ``self.config``.

        Raises:
            ValueError: If ``self.config`` is empty (``max()`` of nothing).
            KeyError: If segmentation was skipped and a compared column has
                no entry in ``self.config``.
            ZeroDivisionError: If the largest weight is 0 and at least one
                comparison is performed.
        """
        dal = DALMongo(self.project_id)

        # Fetch the groups of candidate records and the schema to compare on.
        groups = dal.get_indexing_groups()
        segmented_schema = dal.get_global_schema()

        output_fields_schema = {}
        matched_cols = []
        for column in segmented_schema:
            if column['name'].startswith("__new__"):
                output_fields_schema[column['name']] = column['fields']
                matched_cols.append(column['name'])

        max_weight = max(
            float(module_config['weight'])
            for module_config in self.config.values())

        segmented = not self.segmentation_skipped

        # Which (output field, module config, weight) triples apply to each
        # column does not depend on the records, so it is resolved once here
        # instead of once per record pair. It is only built for the segmented
        # case so the non-segmented path never reads 'output_field'.
        field_comparisons = {}
        if segmented:
            for col in matched_cols:
                declared_fields = {
                    f['output_field'] for f in output_fields_schema[col]
                }
                # An output field missing from the column gets no entry in
                # the similarity vector.
                field_comparisons[col] = [
                    (out_field, module_config, float(module_config['weight']))
                    for out_field, module_config in self.config.items()
                    if out_field in declared_fields
                ]

        simils = []
        for group in groups:
            for r1 in group.records1:
                for r2 in group.records2:
                    # Initialize similarity vector
                    sv = SimilarityVector(r1._id, r2._id, group=group.key)
                    # could be r2.matched_cols() as well (they return the same)
                    for col in matched_cols:
                        if segmented:
                            for out_field, module_config, weight in (
                                    field_comparisons[col]):
                                # Fetch the two values to compare
                                out_field_value1 = r1.get_output_field_col(
                                    out_field, col)
                                out_field_value2 = r2.get_output_field_col(
                                    out_field, col)

                                # Loaded per comparison on purpose: the
                                # loaded module may keep state between runs.
                                module = self._load_module(module_config)

                                # Record the weighted result in the vector
                                sim_value = module.run(
                                    out_field_value1, out_field_value2)
                                sv.vector.append(
                                    sim_value * weight / max_weight)
                                sv.comparisons.append({
                                    'values':
                                    [out_field_value1, out_field_value2],
                                    'output_field':
                                    out_field
                                })
                        else:
                            comparison_module = self.config[col]

                            # Fetch the complete values of the column
                            column_value_s1 = r1.get_field_col(col)
                            column_value_s2 = r2.get_field_col(col)

                            module = self._load_module(comparison_module)

                            weight = float(comparison_module['weight'])

                            # Record the weighted result in the vector
                            sim_value = module.run(column_value_s1,
                                                   column_value_s2)
                            sv.vector.append(sim_value * weight / max_weight)
                            sv.comparisons.append(
                                {'values': [column_value_s1, column_value_s2]})
                    simils.append(sv)

        self._append_result_collection(simils)