# Assumed module-level imports for these methods (not shown in this
# excerpt): datetime, json, dateutil.parser.parse, CKAN's ValidationError
# and _ (gettext), ckanext-scheming's scheming_get_dataset_schema, and
# this extension's lookup_label, strip_accents, is_dguid, helpers and
# geo helpers.
def before_index(self, data_dict):
    """
    Customize data sent to Solr.
    """
    bogus_date = datetime.datetime(1, 1, 1)

    dataset_schema = scheming_get_dataset_schema(
        data_dict.get('type', 'unknown')
    )
    if dataset_schema is None:
        raise ValidationError(
            'Found no schema for the following dataset:\n{dump}'.format(
                dump=json.dumps(data_dict, indent=2)
            )
        )

    # map each field defined in the schema by its field name
    field_schema = dict()
    for dataset_field in dataset_schema['dataset_fields']:
        field_schema[dataset_field['field_name']] = dataset_field

    index_data_dict = {}
    authors = []

    # drop extras fields
    for dict_key in data_dict:
        if not dict_key.startswith('extras_'):
            index_data_dict[dict_key] = data_dict[dict_key]

    # iterate through validated data_dict fields and modify as needed
    validated_data_dict = json.loads(data_dict['validated_data_dict'])
    for item in validated_data_dict.keys():
        value = validated_data_dict[item]
        if not value and item in index_data_dict:
            index_data_dict.pop(item)
            continue

        fs = field_schema.get(item)
        # ignore all fields not currently in the schema
        if not fs:
            continue

        field_type = fs.get('schema_field_type', 'string')
        multivalued = fs.get('schema_multivalued', False)

        if field_type == 'fluent':
            # unpack {language: text} into one field per language
            for key in value.keys():
                label = u'{item}_{key}'.format(item=item, key=key)
                index_data_dict[label] = value[key]
        # for code type, the en/fr labels need to be looked up
        # and sent to Solr
        elif field_type == 'code':
            lookup_type = fs.get('lookup', '')
            if lookup_type == 'codeset':
                lookup = fs.get('codeset_type', '')
            elif lookup_type == 'preset':
                lookup = fs.get('preset', '')[4:]
            else:
                lookup = fs.get('lookup', '')

            if lookup and value:
                label_en = u'{item}_desc_en'.format(item=item)
                label_fr = u'{item}_desc_fr'.format(item=item)
                if multivalued:
                    desc_en = []
                    desc_fr = []
                    for v in value:
                        if not v:
                            continue
                        desc = lookup_label(lookup, v, lookup_type)
                        desc_en.append(desc[u'en'])
                        desc_fr.append(desc[u'fr'])
                    index_data_dict[str(item)] = value
                    index_data_dict[label_en] = desc_en
                    index_data_dict[label_fr] = desc_fr
                else:
                    desc = lookup_label(lookup, value, lookup_type)
                    index_data_dict[label_en] = desc[u'en']
                    index_data_dict[label_fr] = desc[u'fr']
        elif field_type == 'date':
            if value:
                try:
                    date = parse(value, default=bogus_date)
                    if date != bogus_date:
                        index_data_dict[item] = date.isoformat() + 'Z'
                except ValueError:
                    continue
        elif item.endswith('_authors'):
            index_data_dict[str(item)] = value
            authors.extend(value)
        else:
            # all other field types are indexed as-is, whether
            # multivalued or not
            index_data_dict[str(item)] = value

    if authors:
        index_data_dict['authors'] = authors
        index_data_dict['authors_initials'] = list(
            set(strip_accents(i[0]).upper() for i in authors)
        )

    return index_data_dict
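# --------------------------------------------------------------------------
# Illustration (not part of the plugin): a minimal, self-contained sketch of
# the two transformations above that are easiest to get wrong -- fluent-dict
# unpacking and the bogus-date sentinel. The field names 'title' and
# 'release_date' are hypothetical.
import datetime

from dateutil.parser import parse


def demo_before_index_transforms():
    bogus_date = datetime.datetime(1, 1, 1)
    indexed = {}

    # A fluent field arrives as a {language: text} dict and is flattened
    # into one Solr field per language, e.g. title -> title_en / title_fr.
    fluent_value = {u'en': u'Population estimates',
                    u'fr': u'Estimations de la population'}
    for lang, text in fluent_value.items():
        indexed[u'title_{0}'.format(lang)] = text

    # Dates are normalized to ISO 8601 strings with a trailing 'Z' for
    # Solr; parsing against the bogus-date default and comparing against
    # it afterwards filters out strings that set no usable date fields.
    try:
        date = parse(u'2015-06-01', default=bogus_date)
        if date != bogus_date:
            indexed[u'release_date'] = date.isoformat() + 'Z'
    except ValueError:
        pass

    return indexed
    # {'title_en': u'Population estimates',
    #  'title_fr': u'Estimations de la population',
    #  'release_date': '2015-06-01T00:00:00Z'}
# --------------------------------------------------------------------------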
def before_index(self, data_dict):
    """
    Customize data sent to Solr.

    :param data_dict:
    :type data_dict: dict

    :returns: dict
    """
    dataset_schema = scheming_get_dataset_schema(data_dict.get('type'))
    if dataset_schema is None:
        raise ValidationError((_(
            'Found no schema for the following dataset:\n{dump}'.format(
                dump=json.dumps(data_dict, indent=2, sort_keys=True)
            )
        ),))

    field_schema = dict(
        (s['field_name'], s) for s in dataset_schema['dataset_fields']
    )

    # start from a copy of the package dict, minus the extras_* fields
    index_data_dict = data_dict.copy()
    for k in data_dict:
        if k.startswith(u'extras_'):
            index_data_dict.pop(k, None)

    authors = []
    default_date = datetime(1, 1, 1, 8, 30, 0, 0)
    validated_data_dict = json.loads(data_dict['validated_data_dict'])
    name = validated_data_dict.get(u'name')

    # append dguids from the datastore
    if validated_data_dict.get(u'product_id_new'):
        index_data_dict[u'dguid_codes'] = []
        for dguid_pkg_id in geo.get_geodescriptors_for_package(
                validated_data_dict[u'product_id_new']):
            index_data_dict[u'dguid_codes'].append(
                helpers.get_dguid_from_pkg_id(dguid_pkg_id))
        # strip the vintages from dguids to get geodescriptors
        index_data_dict[u'geodescriptor_codes'] = [
            g[4:] if is_dguid(g) else g
            for g in index_data_dict[u'dguid_codes'] if g
        ]

    for item, value in validated_data_dict.iteritems():
        fs = field_schema.get(item)
        # Do not index any field that is not currently in the schema.
        if not fs:
            continue

        field_type = fs.get('schema_field_type', 'string')
        # TODO: we're not using the multivalued schema field. Drop it?
        multivalued = fs.get('schema_multivalued', False)

        # Legacy issue numbers are non-numeric, which is problematic
        # for sorting and external tools. We can't just use a Solr
        # <copyField> directive, as it'll fail entirely on a bad value.
        if item == 'issue_number':
            if value and value.isdigit():
                index_data_dict['issue_number_int'] = int(value)

        # Fluent (multilingual) fields are really dictionaries, where
        # each key is the ISO language code, and the value is the
        # translated text. We need to unpack these into individual
        # Solr fields for per-language search.
        if field_type == 'fluent':
            if isinstance(value, dict):
                index_data_dict.update(
                    (u'{0}_{1}'.format(item, k), v)
                    for k, v in value.iteritems()
                )
            else:
                raise ValidationError((_(
                    '{name}: Expecting a fluent dict for {item}, '
                    'instead got {value!r}'.format(
                        name=name,
                        item=item,
                        value=value
                    )
                ),))
        # Numeric foreign keys that need to be looked up to retrieve
        # their multilingual labels for searching.
        elif field_type == u'code':
            index_data_dict[unicode(item)] = value

            # These codes can refer to a codeset (a dataset of type
            # 'codeset' with a particular key), a preset (a hardcoded
            # value in a Scheming schema), or another dataset (lookup).
            lookup_type = fs.get(u'lookup', '')
            if lookup_type == u'codeset':
                lookup = fs.get(u'codeset_type', '')
            elif lookup_type == u'preset':
                lookup = fs.get(u'preset', '')[4:]
            else:
                lookup = fs.get(u'lookup', '')

            if not lookup:
                raise ValidationError((_(
                    '{name}: unable to determine lookup '
                    'for {item}'.format(
                        name=name,
                        item=item
                    )
                ),))

            if isinstance(value, list):
                for value_to_lookup in value:
                    if not value_to_lookup:
                        continue

                    desc = lookup_label(
                        lookup,
                        value_to_lookup,
                        lookup_type
                    )
                    for k, v in desc.iteritems():
                        if v and k != u'found':
                            n = u'{item}_desc_{key}'.format(
                                item=item,
                                key=k
                            )
                            index_data_dict[n] = \
                                index_data_dict.get(n, []) + [v]
            else:
                desc = lookup_label(lookup, value, lookup_type)
                index_data_dict.update(
                    (u'{item}_desc_{key}'.format(item=item, key=k), v)
                    for k, v in desc.iteritems()
                    if v and k != u'found'
                )

            if item == u'geodescriptor_codes':
                index_data_dict[u'dguid_codes'] = list(
                    index_data_dict[u'geodescriptor_codes']
                )
        elif field_type == 'date':
            if not value:
                continue
            try:
                date = parse(value, default=default_date)
                index_data_dict[unicode(item)] = unicode(
                    date.isoformat()[:19] + u'Z'
                )
            except ValueError:
                continue
        elif item.endswith('_authors'):
            index_data_dict[unicode(item)] = value
            authors.extend(value)
        else:
            index_data_dict[unicode(item)] = value

    if authors:
        index_data_dict['authors'] = authors
        index_data_dict['authors_initials'] = list(
            set(strip_accents(i[0]).upper() for i in authors)
        )

    return index_data_dict
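# --------------------------------------------------------------------------
# Illustration (not part of the plugin): strip_accents() is imported from
# elsewhere in this extension. A common unicodedata-based implementation is
# sketched below for reference (the real helper may differ), together with
# the authors_initials derivation used by both versions of before_index.
import unicodedata


def strip_accents_sketch(s):
    """Drop combining marks so that e.g. u'\xc9thier' files under 'E'."""
    return u''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )


authors = [u'\xc9thier, Jean',   # u'Éthier, Jean'
           u'Smith, Alice',
           u'\xd8berg, Nils']    # u'Øberg, Nils'
initials = sorted(set(strip_accents_sketch(a[0]).upper() for a in authors))
# [u'E', u'S', u'\xd8'] -- U+00D8 has no NFD decomposition, so it survives.
# --------------------------------------------------------------------------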