Esempio n. 1
0
    def before_index(self, data_dict):
        """
        Customize the package dict that is sent to Solr for indexing.

        Every top-level key of ``data_dict`` is copied, except the
        ``extras_*`` keys.  Then each field of the JSON-encoded
        ``validated_data_dict`` that is declared in the dataset's scheming
        schema is (re)indexed according to its ``schema_field_type``:

        * ``fluent``: one ``<field>_<key>`` entry per key of the value dict
        * ``code``: ``<field>_desc_en`` / ``<field>_desc_fr`` labels taken
          from ``lookup_label``
        * ``date``: ISO-8601 string with a trailing ``Z``
        * fields whose name ends in ``_authors``: indexed as-is and also
          aggregated into ``authors`` / ``authors_initials``
        * anything else: the validated value as-is

        :param data_dict: package dict handed to the indexer; must hold a
            JSON string under ``validated_data_dict``
        :type data_dict: dict

        :returns: the dict to index
        :rtype: dict

        :raises ValidationError: if no schema is found for the dataset type,
            or a fluent field holds a non-empty value that is not a dict
        """
        # Sentinel passed to the date parser as ``default``; a result equal
        # to it is treated as "nothing usable was parsed" and not indexed.
        bogus_date = datetime.datetime(1, 1, 1)
        dataset_schema = scheming_get_dataset_schema(
            data_dict.get('type', 'unknown')
        )
        if dataset_schema is None:
            raise ValidationError(
                'Found no schema for following dataset :\n{dump}'.format(
                    dump=json.dumps(data_dict, indent=2)
                )
            )

        # field name -> field definition, for the fields in the schema
        field_schema = {
            field['field_name']: field
            for field in dataset_schema['dataset_fields']
        }

        # start from the incoming dict, minus the extras_* keys
        index_data_dict = {
            key: val
            for key, val in data_dict.items()
            if not key.startswith('extras_')
        }
        authors = []

        # iterate through validated data_dict fields and modify as needed
        validated_data_dict = json.loads(data_dict['validated_data_dict'])
        for item, value in validated_data_dict.items():
            # an empty validated value removes the copied top-level entry
            if not value and item in index_data_dict:
                index_data_dict.pop(item)
                continue

            fs = field_schema.get(item)
            # ignore all fields not currently in the schema
            if not fs:
                continue

            field_type = fs.get('schema_field_type', 'string')
            multivalued = fs.get('schema_multivalued', False)

            if field_type == 'fluent':
                # An empty value (None, '', {}) has nothing to unpack;
                # calling .keys() on None or '' used to blow up here.
                if not value:
                    continue
                if not isinstance(value, dict):
                    raise ValidationError(
                        'Expecting a fluent dict for {item}, '
                        'instead got {value!r}'.format(
                            item=item,
                            value=value
                        )
                    )
                for key, text in value.items():
                    label = u'{item}_{key}'.format(item=item, key=key)
                    index_data_dict[label] = text

            # for code type, the en/fr labels need to be looked up
            # and sent to Solr
            elif field_type == 'code':
                lookup_type = fs.get('lookup', '')
                if lookup_type == 'codeset':
                    lookup = fs.get('codeset_type', '')
                elif lookup_type == 'preset':
                    # drop the first four characters of the preset name
                    lookup = fs.get('preset', '')[4:]
                else:
                    lookup = fs.get('lookup', '')

                if lookup and value:
                    label_en = u'{item}_desc_en'.format(item=item)
                    label_fr = u'{item}_desc_fr'.format(item=item)
                    if multivalued:
                        desc_en = []
                        desc_fr = []
                        for code in value:
                            if not code:
                                continue
                            desc = lookup_label(lookup, code, lookup_type)
                            desc_en.append(desc[u'en'])
                            desc_fr.append(desc[u'fr'])

                        index_data_dict[item] = value
                        index_data_dict[label_en] = desc_en
                        index_data_dict[label_fr] = desc_fr
                    else:
                        desc = lookup_label(lookup, value, lookup_type)
                        index_data_dict[label_en] = desc[u'en']
                        index_data_dict[label_fr] = desc[u'fr']

            elif field_type == 'date':
                if value:
                    try:
                        date = parse(value, default=bogus_date)
                    except (TypeError, ValueError, OverflowError):
                        # unparseable date: leave the entry untouched
                        continue
                    if date != bogus_date:
                        index_data_dict[item] = date.isoformat() + 'Z'

            elif item.endswith('_authors'):
                index_data_dict[item] = value
                # a missing (None) author list contributes no authors
                authors.extend(value or [])

            else:  # all other field types, multivalued or not
                index_data_dict[item] = value

        # Aggregate once, after every field has been seen.  Doing this
        # inside the loop repeated the work per field and let a later
        # ``continue`` leave the aggregate in a stale or popped state.
        if authors:
            index_data_dict['authors'] = authors
            # empty author strings have no first letter; skip them
            index_data_dict['authors_initials'] = list(
                set(
                    strip_accents(author[0]).upper()
                    for author in authors
                    if author
                )
            )

        return index_data_dict
Esempio n. 2
0
    def before_index(self, data_dict):
        """
        Customize the package dict that is sent to Solr for indexing.

        Starts from a copy of ``data_dict`` without its ``extras_*`` keys,
        then walks the JSON-encoded ``validated_data_dict`` and (re)indexes
        every field that is declared in the dataset's scheming schema,
        according to its ``schema_field_type``:

        * ``fluent``: one ``<field>_<key>`` entry per key of the value dict
        * ``code``: the raw value plus ``<field>_desc_<key>`` labels taken
          from ``lookup_label``
        * ``date``: ISO-8601 string (to the second) with a trailing ``Z``
        * fields whose name ends in ``_authors``: indexed as-is and also
          aggregated into ``authors`` / ``authors_initials``
        * anything else: the validated value as-is

        A digit-only ``issue_number`` field is additionally indexed as the
        integer ``issue_number_int``.

        :param data_dict: package dict handed to the indexer; must hold a
            JSON string under ``validated_data_dict``
        :type data_dict: dict

        :returns: the dict to index
        :rtype: dict

        :raises ValidationError: if no schema is found for the dataset type,
            a fluent field does not hold a dict, or the lookup of a code
            field cannot be determined
        """
        dataset_schema = scheming_get_dataset_schema(data_dict.get('type'))
        if dataset_schema is None:
            # Translate the template first and interpolate afterwards;
            # formatting before calling _() makes the catalogue lookup
            # miss every time.
            raise ValidationError((_(
                'Found no schema for following datasets:\n{dump}'
            ).format(
                dump=json.dumps(data_dict, indent=2, sort_keys=True)
            ),))

        # field name -> field definition, for the fields in the schema
        field_schema = {
            s['field_name']: s for s in dataset_schema['dataset_fields']
        }

        # start from the incoming dict, minus the extras_* keys
        index_data_dict = {
            k: v
            for k, v in data_dict.iteritems()
            if not k.startswith(u'extras_')
        }

        authors = []
        # Supplies whatever date/time components the parsed string lacks.
        default_date = datetime(1, 1, 1, 8, 30, 0, 0)

        validated_data_dict = json.loads(data_dict['validated_data_dict'])

        # dataset name, only used to make error messages traceable
        name = validated_data_dict.get(u'name')

        # append dguids from the datastore
        if validated_data_dict.get(u'product_id_new'):
            index_data_dict[u'dguid_codes'] = [
                helpers.get_dguid_from_pkg_id(dguid_pkg_id)
                for dguid_pkg_id in geo.get_geodescriptors_for_package(
                    validated_data_dict[u'product_id_new']
                )
            ]
            # strip the vintages from dguids to get geodescriptors
            index_data_dict[u'geodescriptor_codes'] = [
                g[4:] if is_dguid(g) else g
                for g in index_data_dict[u'dguid_codes'] if g
            ]

        for item, value in validated_data_dict.iteritems():
            fs = field_schema.get(item)

            # Do not index any field that is not currently in the schema.
            if not fs:
                continue

            field_type = fs.get('schema_field_type', 'string')
            # TODO: the schema_multivalued schema field is not consulted
            # here (list-ness is detected from the value).  Drop it?

            # Legacy issues numbers are non-numeric, which is problematic
            # for sorting and external tools. We can't just use a Solr
            # <copyTo> directive, as it'll fail entirely on a bad value.
            # The test is on the *field* name (item); comparing the dataset
            # name meant the integer copy was practically never written.
            if (item == u'issue_number'
                    and hasattr(value, 'isdigit')
                    and value.isdigit()):
                index_data_dict['issue_number_int'] = int(value)

            # Fluent (multilingual) fields are really dictionaries, where
            # each key is the ISO language code, and the value the translated
            # text. We need to unpack these into individual solr fields
            # for per-language search.
            if field_type == 'fluent':
                if not isinstance(value, dict):
                    raise ValidationError((_(
                        '{name}: Expecting a fluent dict for {item}, '
                        'instead got {value!r}'
                    ).format(
                        name=name,
                        item=item,
                        value=value
                    ), ))

                index_data_dict.update(
                    (u'{0}_{1}'.format(item, k), v)
                    for k, v in value.iteritems()
                )

            # Numeric foreign keys that need to be looked up to retrieve
            # their multilingual labels for searching.
            elif field_type == u'code':
                index_data_dict[unicode(item)] = value

                # These codes can refer to a codeset (a dataset of type
                # 'codeset' with a particular key), a preset (a hardcoded
                # value in a Scheming schema), or another dataset (lookup).
                lookup_type = fs.get(u'lookup', '')
                if lookup_type == u'codeset':
                    lookup = fs.get(u'codeset_type', '')
                elif lookup_type == u'preset':
                    # drop the first four characters of the preset name
                    lookup = fs.get(u'preset', '')[4:]
                else:
                    lookup = fs.get(u'lookup', '')

                if not lookup:
                    raise ValidationError((_(
                        '{name}: unable to determine lookup '
                        'for {item}'
                    ).format(
                        name=name,
                        item=item
                    ), ))

                if isinstance(value, list):
                    for value_to_lookup in value:
                        if not value_to_lookup:
                            continue

                        desc = lookup_label(
                            lookup,
                            value_to_lookup,
                            lookup_type
                        )

                        # collect each non-empty label (the 'found' key is
                        # not a label) into a per-key list
                        for k, v in desc.iteritems():
                            if v and not k == u'found':
                                n = u'{item}_desc_{key}'.format(
                                    item=item,
                                    key=k
                                )
                                index_data_dict.update(
                                    {n: index_data_dict.get(n, []) + [v]}
                                )

                else:
                    desc = lookup_label(lookup, value, lookup_type)

                    index_data_dict.update((
                        u'{item}_desc_{key}'.format(
                            item=item,
                            key=k
                        ), v)
                        for k, v in desc.iteritems() if v and not k == u'found'
                    )

                # mirror the validated geodescriptor codes into dguid_codes
                if item == u'geodescriptor_codes':
                    index_data_dict[u'dguid_codes'] = \
                        list(index_data_dict[u'geodescriptor_codes'])

            elif field_type == 'date':
                try:
                    date = parse(value, default=default_date)
                except (TypeError, ValueError, OverflowError):
                    # unparseable date: keep whatever data_dict supplied
                    continue
                # isoformat()[:19] keeps YYYY-MM-DDTHH:MM:SS only
                index_data_dict[unicode(item)] = unicode(
                    date.isoformat()[:19] + u'Z'
                )

            elif item.endswith('_authors'):
                index_data_dict[unicode(item)] = value
                # a missing (None) author list contributes no authors
                authors.extend(value or [])

            else:
                index_data_dict[unicode(item)] = value

        # Aggregate once, after every field has been seen, instead of
        # rebuilding the same lists on each loop iteration.
        if authors:
            index_data_dict['authors'] = authors
            # empty author strings have no first letter; skip them
            index_data_dict['authors_initials'] = list(
                set(
                    strip_accents(author[0]).upper()
                    for author in authors
                    if author
                )
            )

        return index_data_dict