Example 1
 def get_identifier_list_variants(self, id_list):
     """ makes different variants of identifiers
         for a list of identifiers
     """
     output_list = []
     if not isinstance(id_list, list):
         id_list = [str(id_list)]
     for identifier in id_list:
         output_list.append(identifier)
         if identifier.startswith(('http://', 'https://')):
             oc_uuid = URImanagement.get_uuid_from_oc_uri(identifier)
             if oc_uuid:
                 output_list.append(oc_uuid)
             prefix_id = URImanagement.prefix_common_uri(identifier)
             if prefix_id:
                 output_list.append(prefix_id)
         elif ':' in identifier:
             full_uri = URImanagement.convert_prefix_to_full_uri(identifier)
             output_list.append(full_uri)
         else:
             # probably an open context uuid or a slug
             m_cache = MemoryCache()
             ent = m_cache.get_entity(identifier)
             if ent:
                 full_uri = ent.uri
                 output_list.append(full_uri)
                 prefix_uri = URImanagement.prefix_common_uri(full_uri)
                 if prefix_uri != full_uri:
                     output_list.append(prefix_uri)
     return output_list
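For orientation, here is a minimal, self-contained sketch of the variant-building idea above. URIManagementStub and the URI pattern are illustrative stand-ins, not the real URImanagement API, which may differ in detail.

class URIManagementStub:
    """Illustrative stand-in for URImanagement (assumed behavior)."""
    OC_BASE = 'http://opencontext.org/subjects/'

    @classmethod
    def get_uuid_from_oc_uri(cls, uri):
        # return the trailing path segment as the "uuid" for an OC URI
        if uri.startswith(cls.OC_BASE):
            return uri[len(cls.OC_BASE):]
        return None

    @classmethod
    def prefix_common_uri(cls, uri):
        # abbreviate a common namespace with a short prefix
        if uri.startswith(cls.OC_BASE):
            return 'oc-subj:' + uri[len(cls.OC_BASE):]
        return None

uri = 'http://opencontext.org/subjects/some-uuid'
variants = [uri]
oc_uuid = URIManagementStub.get_uuid_from_oc_uri(uri)
if oc_uuid:
    variants.append(oc_uuid)
prefix_id = URIManagementStub.prefix_common_uri(uri)
if prefix_id:
    variants.append(prefix_id)
# variants: the full URI, its uuid, and a prefixed short form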
Example 2
 def __init__(self, request_dict_json=False):
     self.uuid = False
     self.uri = False  # canonical uri for the item
     self.href = False  # link to the item in the current deployment
     self.cite_uri = False  # stable / persistent uri
     self.label = False
     self.item_type = False
     self.updated = False
     self.published = False
     self.project_href = False  # link to the project in current deployment
     self.project_uri = False  # canonical uri for the project
     self.project_label = False
     self.context_href = False  # link to parent context in current deployment
     self.context_uri = False  # link to parent context canonical uri
     self.context_label = False
     self.category = False
     self.latitude = False
     self.longitude = False
     self.geojson = False
     self.early_date = False
     self.late_date = False
     self.human_remains_flagged = False  # flagged as relating to human remains
     self.thumbnail_href = False
     self.thumbnail_uri = False
     self.thumbnail_scr = False
     self.preview_scr = False
     self.fullfile_scr = False
     self.snippet = False
     self.other_attributes = False  # other attributes to the record
     # flatten list of an attribute values to single value
     self.flatten_rec_attributes = False
     # A list of (non-standard) attributes to include in a record
     self.rec_attributes = []
     self.attribute_hierarchies = {}
     self.base_url = settings.CANONICAL_HOST
     rp = RootPath()
     self.base_url = rp.get_baseurl()
     self.m_cache = MemoryCache()  # memory caching object
     self.s_cache = SearchGenerationCache()  # supplemental caching object, specific for searching
     self.request_dict_json = request_dict_json
     if request_dict_json is not False:
         self.request_dict = json.loads(request_dict_json)
     else:
         self.request_dict = False
     self.add_attribute_uris = False
     if self.request_dict and self.request_dict.get('add-attribute-uris'):
         self.add_attribute_uris = True
     self.highlighting = False
     self.recursive_count = 0
     self.min_date = False
     self.max_date = False
     self.thumbnail_data = {}
     self.media_file_data = {}
     self.string_attrib_data = {}
Example 3
    def add_entity_item_to_act_filter(
        self,
        lookup_val,
        act_filter,
        is_spatial_context=False,
        look_up_mapping_dict=None,
    ):
        """Looks up a entity item to add to an act_filter"""
        lookup_val = str(lookup_val)

        if lookup_val.startswith(configs.RELATED_ENTITY_ID_PREFIX):
            # Strip off the related property prefix. Note that this
            # is a related property.
            lookup_val = lookup_val[len(configs.RELATED_ENTITY_ID_PREFIX):]
            act_filter['oc-api:related-property'] = True

        # Map the lookup_val through the mapping dict, if one was given.
        if look_up_mapping_dict:
            lookup_val = look_up_mapping_dict.get(lookup_val, lookup_val)

        m_cache = MemoryCache()
        items = []
        if configs.REQUEST_OR_OPERATOR in lookup_val:
            lookup_list = lookup_val.split(configs.REQUEST_OR_OPERATOR)
        else:
            lookup_list = [lookup_val]

        for act_val in lookup_list:
            if is_spatial_context:
                item = m_cache.get_entity_by_context(act_val)
            else:
                item = m_cache.get_entity(act_val)
            if not item:
                continue
            items.append(item)

        if not items:
            # We didn't find any item entities, so return
            # the lookup list as the label.
            act_filter['label'] = ' OR '.join(lookup_list)
            return act_filter, None

        # Use all the item labels to make a label.
        item_labels = [item.label for item in items]
        act_filter['label'] = ' OR '.join(item_labels)

        if len(items) == 1:
            # We only have 1 item, so define it with a
            # URI and slug.
            act_filter['rdfs:isDefinedBy'] = items[0].uri
            act_filter['oc-api:filter-slug'] = items[0].slug

        return act_filter, item
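The prefix-stripping and OR-splitting logic above can be shown without the cache lookups. In this sketch the 'rel--' prefix and '||' operator are assumed stand-ins for the real configs constants.

RELATED_ENTITY_ID_PREFIX = 'rel--'  # assumed stand-in for the configs value
REQUEST_OR_OPERATOR = '||'          # assumed stand-in for the configs value

act_filter = {}
lookup_val = 'rel--bone||shell'  # hypothetical request value
if lookup_val.startswith(RELATED_ENTITY_ID_PREFIX):
    lookup_val = lookup_val[len(RELATED_ENTITY_ID_PREFIX):]
    act_filter['oc-api:related-property'] = True
lookup_list = lookup_val.split(REQUEST_OR_OPERATOR)
# when no entities are found, the raw lookup values become the label
act_filter['label'] = ' OR '.join(lookup_list)
# act_filter == {'oc-api:related-property': True, 'label': 'bone OR shell'}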
Example 4
 def get_cache_earliest_date(self):
     """ Gets and caches the earliest date
         as a datetime object!
     """
     mc = MemoryCache()
     cache_key = mc.make_memory_cache_key('early_date', 'manifest')
     early_date = mc.get_cache_object(cache_key)
     if early_date is None:
         sum_man = Manifest.objects\
                           .filter(published__gt='2001-01-01')\
                           .aggregate(Min('published'))
         early_date = sum_man['published__min']
         mc.save_cache_object(cache_key, early_date)
     return early_date
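Several examples in this listing repeat the same get-or-compute caching pattern: build a key, try the cache, and on a miss do the expensive work once and save the result. A minimal sketch of the pattern, with a plain dict standing in for MemoryCache (an assumption; the real class wraps a Redis-backed cache):

_cache = {}  # dict standing in for the Redis-backed MemoryCache

def make_cache_key(prefix, identifier):
    return '{}-{}'.format(prefix, identifier)

def get_or_compute(prefix, identifier, compute):
    key = make_cache_key(prefix, identifier)
    value = _cache.get(key)
    if value is None:
        value = compute()  # cache miss: do the expensive work once
        _cache[key] = value
    return value

# usage: the lambda stands in for the Manifest aggregation query above
early_date = get_or_compute('early_date', 'manifest', lambda: '2006-06-01')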
Example 5
 def __init__(self, proj_context_json_ld=None):
     self.m_cache = MemoryCache()
     self.context = None
     self.graph = None
     self.fail_on_missing_entities = False
     if not isinstance(proj_context_json_ld, dict):
         return
     if '@context' in proj_context_json_ld:
         self.context = proj_context_json_ld['@context']
     if '@graph' in proj_context_json_ld:
         self.graph = self.GLOBAL_VOCAB_GRAPH + proj_context_json_ld['@graph']
     else:
         self.graph = self.GLOBAL_VOCAB_GRAPH
     logger.info('Read project graph size: {}'.format(len(self.graph)))
Example 6
 def __init__(self, request_dict=False):
     rp = RootPath()
     self.base_url = rp.get_baseurl()
     self.base_search_link = '/search/'
     self.base_request = request_dict
     self.base_request_json = False
     self.base_r_full_path = False
     self.spatial_context = False
     self.testing = settings.DEBUG
     self.hierarchy_delim = '---'
     self.partial_param_val_match = False
     self.remove_start_param = True
     self.m_cache = MemoryCache()  # memory caching object
     self.SOLR_FIELD_PARAM_MAPPINGS = self.BASE_SOLR_FIELD_PARAM_MAPPINGS
     for param_key, solr_field in DCterms.DC_META_FIELDS.items():
         self.SOLR_FIELD_PARAM_MAPPINGS[solr_field] = param_key
    def _get_cache_contexts_dict(self, uuids):
        """Make a dictionary that associates uuids to context paths"""
        m_cache = MemoryCache()
        uuids_for_qs = []
        uuid_context_dict = {}
        for uuid in uuids:
            cache_key = m_cache.make_cache_key(prefix='context-path',
                                               identifier=uuid)
            context_path = m_cache.get_cache_object(cache_key)
            if context_path is None:
                uuids_for_qs.append(uuid)
            else:
                uuid_context_dict[uuid] = context_path

        if not uuids_for_qs:
            # Found them all from the cache!
            # Return without touching the database.
            return uuid_context_dict

        # Look up the remaining context paths with a single
        # database query, then cache each result.
        subject_qs = Subject.objects.filter(uuid__in=uuids_for_qs)
        for sub_obj in subject_qs:
            cache_key = m_cache.make_cache_key(prefix='context-path',
                                               identifier=str(sub_obj.uuid))
            m_cache.save_cache_object(cache_key, sub_obj.context)
            uuid_context_dict[sub_obj.uuid] = sub_obj.context

        return uuid_context_dict
    def _make_cache_geospace_obj_dict(self, uuids):
        """Make a dict of geospace objects keyed by uuid"""
        m_cache = MemoryCache()
        uuids_for_qs = []
        uuid_geo_dict = {}
        for uuid in uuids:
            cache_key = m_cache.make_cache_key(prefix='geospace-obj',
                                               identifier=uuid)
            geo_obj = m_cache.get_cache_object(cache_key)
            if geo_obj is None:
                uuids_for_qs.append(uuid)
            else:
                uuid_geo_dict[uuid] = geo_obj

        if not uuids_for_qs:
            # Found them all from the cache!
            # Return without touching the database.
            return uuid_geo_dict

        # Lookup the remaining geospace objects from a
        # database query. We order by uuid then reverse
        # of feature_id so that the lowest feature id is the
        # thing that actually gets cached.
        geospace_qs = Geospace.objects.filter(uuid__in=uuids_for_qs).exclude(
            ftype__in=['Point', 'point']).order_by('uuid', '-feature_id')
        for geo_obj in geospace_qs:
            cache_key = m_cache.make_cache_key(prefix='geospace-obj',
                                               identifier=str(geo_obj.uuid))
            m_cache.save_cache_object(cache_key, geo_obj)
            uuid_geo_dict[geo_obj.uuid] = geo_obj

        return uuid_geo_dict
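Both methods above batch their lookups: they first pull whatever they can from the cache, and only the remaining uuids go to a single database query, whose results are then cached. A self-contained sketch of that shape, with dicts standing in for both the cache and the database (both assumptions):

_cache = {}
_db = {'uuid-1': 'Turkey/Domuztepe', 'uuid-2': 'Italy/Poggio Civitate'}  # hypothetical rows

def get_context_paths(uuids):
    found = {}
    uuids_for_qs = []
    for uuid in uuids:
        path = _cache.get('context-path-{}'.format(uuid))
        if path is None:
            uuids_for_qs.append(uuid)
        else:
            found[uuid] = path
    if not uuids_for_qs:
        # Found them all from the cache; skip the database.
        return found
    for uuid in uuids_for_qs:  # stands in for one filter(uuid__in=...) query
        path = _db.get(uuid)
        if path is not None:
            _cache['context-path-{}'.format(uuid)] = path
            found[uuid] = path
    return found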
Example 9
 def __init__(self):
     self.m_cache = MemoryCache()
     self.parent_entities = None
     self.child_entities = None
     # cache prefix for the json-ldish-parents
     self.jsonldish_p_prefix = 'json-ldish-parents-{}'
     # cache prefix for list of parents
     self.p_prefix = 'lr-parents'
     # cache prefix for children of an item
     self.children_prefix = 'lr-children-{}'
     # cache prefix for full tree of child items
     self.child_tree_prefix = 'lr-child-tree-{}'
Example 10
 def __init__(self, request_dict_json=False):
     self.uuid = False
     self.uri = False  # canonical uri for the item
     self.href = False  # link to the item in the current deployment
     self.cite_uri = False  # stable / persistent uri
     self.label = False
     self.item_type = False
     self.updated = False
     self.published = False
     self.project_href = False  # link to the project in current deployment
     self.project_uri = False  # canonical uri for the project
     self.project_label = False 
     self.context_href = False  # link to parent context in current deployment
     self.context_uri = False  # link to parent context canonical uri
     self.context_label = False
     self.category = False
     self.latitude = False
     self.longitude = False
     self.geojson = False
     self.early_date = False
     self.late_date = False
     self.human_remains_flagged = False  # flagged as relating to human remains
     self.thumbnail_href = False
     self.thumbnail_uri = False
     self.thumbnail_scr = False
     self.preview_scr = False
     self.fullfile_scr = False
     self.snippet = False
     self.other_attributes = False  # other attributes to the record
     # flatten list of an attribute values to single value
     self.flatten_rec_attributes = False
     # A list of (non-standard) attributes to include in a record
     self.rec_attributes = []
     self.attribute_hierarchies = {}
     self.base_url = settings.CANONICAL_HOST
     rp = RootPath()
     self.base_url = rp.get_baseurl()
     self.m_cache = MemoryCache()  # memory caching object
     self.s_cache = SearchGenerationCache() # supplemental caching object, specific for searching
     self.request_dict_json = request_dict_json
     if request_dict_json is not False:
         self.request_dict = json.loads(request_dict_json)
     else:
         self.request_dict = False
     self.highlighting = False
     self.recursive_count = 0
     self.min_date = False
     self.max_date = False
     self.thumbnail_data = {}
     self.media_file_data = {}
     self.string_attrib_data = {}
Example 11
 def __init__(self, response_dict_json=False):
     rp = RootPath()
     self.base_url = rp.get_baseurl()
     self.uuids = []
     self.uris = []
     self.m_cache = MemoryCache()  # memory caching object
     self.s_cache = SearchGenerationCache()  # supplemental caching object, specific for searching
     self.response_dict_json = response_dict_json
     self.highlighting = False
     # make values for these fields "flat", not a list
     self.flatten_rec_fields = True
     self.total_found = False
     self.rec_start = False
     self.min_date = False
     self.max_date = False
     # flatten list of an attribute values to single value
     self.flatten_rec_attributes = False
     # A list of (non-standard) attributes to include in a record
     self.rec_attributes = []
     self.do_media_thumbs = True  # get thumbnails for records
     self.get_all_media = False  # get links to all media files for an item
Example 12
class SearchGenerationCache():
    """
    methods for using the Redis cache to
    streamline making JSON-LD search results
    """
    def __init__(self, cannonical_uris=False):
        self.m_cache = MemoryCache()

    def get_dtypes(self, entity_uri):
        """ returns an entity data type """
        cache_key = self.m_cache.make_cache_key('data-types', entity_uri)
        dtypes = self.m_cache.get_cache_object(cache_key)
        if dtypes is None:
            dtypes = self._get_dtypes_db(entity_uri)
            if dtypes:
                self.m_cache.save_cache_object(cache_key, dtypes)
        return dtypes

    def _get_dtypes_db(self, entity_uri):
        """ returns an entity data type """
        # haven't found it yet, so look in database
        lequiv = LinkEquivalence()
        return lequiv.get_data_types_from_object(entity_uri)
Example 13
def get_valid_context_slugs(paths_list):
    '''Takes a list of context paths and returns a list of
    slugs for valid paths, ignoring invalid paths.
    
    :param list paths_list: List of spatial context path
        strings.
    '''
    m_cache = MemoryCache()
    paths_list = list(paths_list)
    url_fixes = []
    for context in paths_list:
        for url_issue, rep in {'+': ' ', '%20': ' '}.items():
            if url_issue not in context:
                continue
            url_fix_context = context.replace(url_issue, rep)
            if url_fix_context in paths_list:
                # skip, we already have this context in the paths_list
                continue
            url_fixes.append(url_fix_context)
    # Add the url_fixes list, which holds substitutions that may be
    # needed to repair problematic URL encoding so that items can be
    # successfully looked up.
    paths_list += url_fixes
    valid_context_slugs = []
    for context in list(paths_list):
        # Verify that the contexts are valid
        # find and save the entity to memory
        entity = m_cache.get_entity_by_context(context)
        if not entity:
            # Skip, we couldn't find an entity for
            # this context path
            continue
        if entity.slug in valid_context_slugs:
            # Skip, we already have this entity slug in our valid list.
            continue
        valid_context_slugs.append(entity.slug)
    return valid_context_slugs
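The URL-fix expansion step is easy to see in isolation: '+' and '%20' both encode spaces, so space-substituted variants of each path get appended before lookup. A quick run with made-up paths:

paths_list = ['Turkey/Domuztepe', 'Italy/Poggio+Civitate']  # made-up inputs
url_fixes = []
for context in paths_list:
    for url_issue, rep in {'+': ' ', '%20': ' '}.items():
        if url_issue not in context:
            continue
        url_fix_context = context.replace(url_issue, rep)
        if url_fix_context not in paths_list:
            url_fixes.append(url_fix_context)
paths_list += url_fixes
# paths_list now also holds 'Italy/Poggio Civitate', the decoded variant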
Example 14
 def get_project_date_range(self, project_uuid):
     """ gets a project date range """
     mem = MemoryCache()
     key = mem.make_cache_key('proj-chrono', project_uuid)
     date_range = mem.get_cache_object(key)
     if not isinstance(date_range, dict):
         date_range = self.get_project_date_range_db(project_uuid)
         mem.save_cache_object(key, date_range)
     return date_range
Example 15
 def get_project_geo_meta(self, project_uuid):
     """ gets a geo_meta object for a project """
     mem = MemoryCache()
     key = mem.make_cache_key('proj-geo', project_uuid)
     geo_meta = mem.get_cache_object(key)
     if geo_meta is None:
         geo_meta = self.get_project_geo_meta_db(project_uuid)
         mem.save_cache_object(key, geo_meta)
     return geo_meta
Example 17
 def __init__(self, request_dict=False):
     rp = RootPath()
     self.base_url = rp.get_baseurl()
     self.base_search_link = '/search/'
     self.base_request = request_dict
     self.base_request_json = False
     self.base_r_full_path = False
     self.spatial_context = False
     self.testing = settings.DEBUG
     self.hierarchy_delim = '---'
     self.partial_param_val_match = False
     self.remove_start_param = True
     self.mem_cache_obj = MemoryCache()  # memory caching object
     self.SOLR_FIELD_PARAM_MAPPINGS = self.BASE_SOLR_FIELD_PARAM_MAPPINGS
     for param_key, solr_field in DCterms.DC_META_FIELDS.items():
         self.SOLR_FIELD_PARAM_MAPPINGS[solr_field] = param_key
Example 18
 def get_all_uuids_related_to_gazetteers(self, all_gaz_annos=None):
     """ gets ALL subject entities related to gazetteer entities """
     mc = MemoryCache()
     cache_id = mc.make_cache_key('gaz', 'uuids_all_gaz')
     uuids_all_gaz = mc.get_cache_object(cache_id)
     if uuids_all_gaz is None:
         if all_gaz_annos is None:
             all_gaz_annos = self.get_all_related_to_gazetteers()
         uuids_all_gaz = {
             'subjects': {},
             'documents': {},
             'media': {},
             'projects': {},
             'types': {}
         }
         for gaz_anno in all_gaz_annos:
             hash_id = gaz_anno.hash_id
             gaz_ent_uri = gaz_anno.object_uri
             key = gaz_anno.subject_type
             if hash_id not in uuids_all_gaz[key]:
                 gaz_ref = {
                     'uuid': gaz_anno.subject,
                     'item_type': gaz_anno.subject_type,
                     'gaz_ent_uri': gaz_ent_uri
                 }
                 if key == 'subjects':
                     # get subjects specific information for the gaz_ref
                     gaz_ref = self.subjects_specific_gaz_ref(
                         gaz_anno.subject, gaz_ent_uri)
                 uuids_all_gaz[key][hash_id] = gaz_ref
             # Gazetteer-linked types describe other items that we want to annotate
             # Look up the items described by a type so we can add to the
             # gazetteer described items
             if gaz_anno.subject_type == 'types':
                 rel_asserts = Assertion.objects\
                                        .filter(subject_type__in=self.OC_OA_TARGET_TYPES,
                                                object_uuid=gaz_anno.subject)
                 for rel_assert in rel_asserts:
                     key = rel_assert.subject_type
                     if hash_id not in uuids_all_gaz[key]:
                         gaz_ref = {
                             'uuid': rel_assert.uuid,
                             'item_type': rel_assert.subject_type,
                             'gaz_ent_uri': gaz_ent_uri
                         }
                         if key == 'subjects':
                             # get subjects specific information
                             gaz_ref = self.subjects_specific_gaz_ref(
                                 rel_assert.uuid, gaz_ent_uri)
                         uuids_all_gaz[key][hash_id] = gaz_ref
         # save this hard work to the cache
         mc.save_cache_object(cache_id, uuids_all_gaz)
     return uuids_all_gaz
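The core bookkeeping above is a dict of dicts keyed first by item type, then by annotation hash_id, so each annotation lands in its type bucket exactly once. Stripped of the database calls, with hypothetical annotation rows:

rows = [  # hypothetical LinkAnnotation rows: (hash_id, subject_type, uuid, object_uri)
    ('h1', 'subjects', 'uuid-1', 'http://www.geonames.org/1'),
    ('h2', 'media', 'uuid-2', 'http://www.geonames.org/2'),
]
uuids_all_gaz = {
    'subjects': {}, 'documents': {}, 'media': {}, 'projects': {}, 'types': {},
}
for hash_id, subject_type, uuid, gaz_ent_uri in rows:
    if hash_id not in uuids_all_gaz[subject_type]:
        uuids_all_gaz[subject_type][hash_id] = {
            'uuid': uuid,
            'item_type': subject_type,
            'gaz_ent_uri': gaz_ent_uri,
        }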
Example 19
 def get_used_gazetteer_entities(self):
     """ gets entitites in gazetteer vocabularies
         that are actually being used.
         NOTE! This checks the memnory cache first!
     """
     mc = MemoryCache()
     cache_id = mc.make_cache_key('gaz', 'used_gazetteer_ents')
     act_gaz_list = mc.get_cache_object(cache_id)
     if act_gaz_list is None:
         # cache was empty, so get this from the database
         act_gaz_list = self.get_used_gazetteer_entities_db()
         mc.save_cache_object(cache_id, act_gaz_list)
     return act_gaz_list
Example 20
 def get_geo_overlays(self):
     """Gets geo overlays for an item identified by uuid."""
     m_cache = MemoryCache()
     cache_key = m_cache.make_cache_key('geo-layers',
                                        self.uuid)
     geo_overlays = m_cache.get_cache_object(cache_key)
     if geo_overlays is not None:
         self.geo_overlays = geo_overlays
         return self.geo_overlays
     else:
         geo_overlays = self.get_geo_overlays_db()
         m_cache.save_cache_object(cache_key, geo_overlays)
     return self.geo_overlays
Example 21
 def get_all_related_to_gazetteers(self):
     """ gets ALL subject entities related to gazetteer entities """
     mc = MemoryCache()
     cache_id = mc.make_cache_key('gaz', 'all_gaz_annos')
     all_gaz_annos = mc.get_cache_object(cache_id)
     if all_gaz_annos is None:
         subject_types = self.OC_OA_TARGET_TYPES
         subject_types.append('types')
         act_gaz_list = self.get_used_gazetteer_entities()
         all_gaz_annos = LinkAnnotation.objects\
                                       .filter(subject_type__in=subject_types,
                                               object_uri__in=act_gaz_list)
         mc.save_cache_object(cache_id, all_gaz_annos)
     return all_gaz_annos
Example 22
 def get_cache_earliest_date(self):
     """ Gets and caches the earliest date
         as a datetime object!
     """
     mc = MemoryCache()
     cache_key = mc.make_cache_key('early_date', 'manifest')
     early_date = mc.get_cache_object(cache_key)
     if early_date is None:
         sum_man = Manifest.objects\
                           .filter(published__gt='2001-01-01')\
                           .aggregate(Min('published'))
         early_date = sum_man['published__min']
         mc.save_cache_object(cache_key, early_date)
     return early_date
Example 23
 def __init__(self, response_dict_json=False):
     rp = RootPath()
     self.base_url = rp.get_baseurl()
     self.uuids = []
     self.uris = []
     self.mem_cache_obj = MemoryCache()  # memory caching object
     self.response_dict_json = response_dict_json
     self.highlighting = False
     # make values for these fields "flat", not a list
     self.flatten_rec_fields = True
     self.total_found = False
     self.rec_start = False
     self.min_date = False
     self.max_date = False
     # flatten list of an attribute values to single value
     self.flatten_rec_attributes = False
     # A list of (non-standard) attributes to include in a record
     self.rec_attributes = []
     self.do_media_thumbs = True  # get thumbnails for records
     self.get_all_media = False  # get links to all media files for an item
Example 24
 def __init__(self, response_dict_json):
     rp = RootPath()
     self.base_url = rp.get_baseurl()
     self.m_cache = MemoryCache()  # memory caching object
     self.response_dict_json = response_dict_json
     self.response_dict = json.loads(response_dict_json)
     self.highlighting = False
     # make values for these fields "flat", not a list
     self.flatten_rec_fields = True
     self.geojson_recs = []
     self.non_geo_recs = []
     self.total_found = False
     self.rec_start = False
     self.min_date = False
     self.max_date = False
     # flatten list of an attribute values to single value
     self.flatten_rec_attributes = False
     # A list of (non-standard) attributes to include in a record
     self.rec_attributes = []
     self.do_complex_geo = False  # get complex (Polygons, etc.) geospatial data from database
     self.do_media_thumbs = True  # get thumbnails for records
     self.get_all_media = False  # get links to all media files for an item
Example 25
 def get_jsonldish_parents(self, uuid, add_original=True):
     """Gets parent projects for a project.
     Returns a list of dictionary objects similar to JSON-LD expectations.
     This is useful for faceted search.
     """
     m_cache = MemoryCache()
     cache_key = m_cache.make_cache_key(
         'proj-par-jsonldish_{}'.format(add_original),
         uuid
     )
     output = m_cache.get_cache_object(cache_key)
     if output is None:
         output = self._db_get_jsonldish_parents(
             uuid, add_original=add_original
         )
         m_cache.save_cache_object(cache_key, output)
     return output
Example 26
def get_containment_parent_slug(slug):
    '''Takes a slug and returns the slug of its parent. Returns 'root'
    if a slug has no parent.
        
    :param str slug: Slug identifying a subjects item.
    '''
    m_cache = MemoryCache()
    cache_key = m_cache.make_cache_key('contain-par-slug', slug)
    parent_slug = m_cache.get_cache_object(cache_key)
    if parent_slug is None:
        contain_obj = Containment()
        # Because it seems to introduce memory errors, turn off
        # caching for this class instance.
        contain_obj.use_cache = False
        parent_slug = contain_obj.get_parent_slug_by_slug(slug)
        m_cache.save_cache_object(cache_key, parent_slug)
    if parent_slug:
        return parent_slug
    return 'root'
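One design note on the fall-through above: any falsy parent slug (None or an empty string) collapses to the 'root' sentinel, so callers can treat every slug as having a parent. A tiny sketch of that behavior with hypothetical values:

def slug_or_root(parent_slug):
    # mirrors the fall-through at the end of get_containment_parent_slug
    if parent_slug:
        return parent_slug
    return 'root'

assert slug_or_root('turkey-domuztepe') == 'turkey-domuztepe'  # hypothetical slug
assert slug_or_root(None) == 'root'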
Example 27
class RecordProperties():
    """ Methods to make properties for individual record items
        useful for making geospatial feature records or
        lists of items without geospatial data
    """
    ATTRIBUTE_DELIM = '; '  # delimiter for multiple attributes

    def __init__(self, request_dict_json=False):
        self.uuid = False
        self.uri = False  # canonical uri for the item
        self.href = False  # link to the item in the current deployment
        self.cite_uri = False  # stable / persistent uri
        self.label = False
        self.item_type = False
        self.updated = False
        self.published = False
        self.project_href = False  # link to the project in current deployment
        self.project_uri = False  # canonical uri for the project
        self.project_label = False 
        self.context_href = False  # link to parent context in current deployment
        self.context_uri = False  # link to parent context canonical uri
        self.context_label = False
        self.category = False
        self.latitude = False
        self.longitude = False
        self.geojson = False
        self.early_date = False
        self.late_date = False
        self.human_remains_flagged = False  # flagged as relating to human remains
        self.thumbnail_href = False
        self.thumbnail_uri = False
        self.thumbnail_scr = False
        self.preview_scr = False
        self.fullfile_scr = False
        self.snippet = False
        self.other_attributes = False  # other attributes to the record
        # flatten list of an attribute values to single value
        self.flatten_rec_attributes = False
        # A list of (non-standard) attributes to include in a record
        self.rec_attributes = []
        self.attribute_hierarchies = {}
        self.base_url = settings.CANONICAL_HOST
        rp = RootPath()
        self.base_url = rp.get_baseurl()
        self.m_cache = MemoryCache()  # memory caching object
        self.s_cache = SearchGenerationCache() # supplemental caching object, specific for searching
        self.request_dict_json = request_dict_json
        if request_dict_json is not False:
            self.request_dict = json.loads(request_dict_json)
        else:
            self.request_dict = False
        self.highlighting = False
        self.recursive_count = 0
        self.min_date = False
        self.max_date = False
        self.thumbnail_data = {}
        self.media_file_data = {}
        self.string_attrib_data = {}

    def parse_solr_record(self, solr_rec):
        """ Parses a solr rec object """
        if isinstance(solr_rec, dict):
            self.get_item_basics(solr_rec)
            self.get_citation_uri(solr_rec)
            self.get_lat_lon(solr_rec)
            self.get_category(solr_rec)
            self.get_project(solr_rec)
            self.get_context(solr_rec)
            self.get_time(solr_rec)  # get time information, limiting date ranges to query constraints
            self.get_thumbnail(solr_rec)
            self.get_media_files(solr_rec)
            self.get_snippet(solr_rec)  # get snippet of highlighted text
            self.get_attributes(solr_rec)  # get non-standard attributes
            self.get_string_attributes(solr_rec)  # get non-standard string attributes

    def get_item_basics(self, solr_rec):
        """ get basic metadata for an item """
        output = False
        if isinstance(solr_rec, dict):
            if 'uuid' in solr_rec:
                self.uuid = solr_rec['uuid']
            if 'slug_type_uri_label' in solr_rec:
                id_parts = self.parse_solr_value_parts(solr_rec['slug_type_uri_label'])
                if id_parts is not False:
                    output = True
                    self.uri = self.make_url_from_val_string(id_parts['uri'], True)
                    self.href = self.make_url_from_val_string(id_parts['uri'], False)
                    item_type_output = URImanagement.get_uuid_from_oc_uri(self.uri, True)
                    self.item_type = item_type_output['item_type']
                    self.label = id_parts['label']
            if 'updated' in solr_rec:
                self.updated = solr_rec['updated']
            if 'published' in solr_rec:
                self.published = solr_rec['published']
            if 'human_remains' in solr_rec:
                # is the record flagged as related to human remains?
                if solr_rec['human_remains'] > 0:
                    self.human_remains_flagged = True
        return output

    def get_snippet(self, solr_rec):
        """ get a text highlighting snippet """
        if isinstance(self.highlighting, dict):
            if self.uuid is False:
                if 'uuid' in solr_rec:
                    self.uuid = solr_rec['uuid']
            if self.uuid in self.highlighting:
                if 'text' in self.highlighting[self.uuid]:
                    text_list = self.highlighting[self.uuid]['text']
                    self.snippet = ' '.join(text_list)
                    # some processing to remove fragments of HTML markup.
                    self.snippet = self.snippet.replace('<em>', '[[[[mark]]]]')
                    self.snippet = self.snippet.replace('</em>', '[[[[/mark]]]]')
                    try:
                        self.snippet = '<div>' + self.snippet + '</div>'
                        self.snippet = lxml.html.fromstring(self.snippet).text_content()
                        self.snippet = strip_tags(self.snippet)
                    except Exception:
                        self.snippet = strip_tags(self.snippet)
                    self.snippet = self.snippet.replace('[[[[mark]]]]', '<em>')
                    self.snippet = self.snippet.replace('[[[[/mark]]]]', '</em>')

    def get_citation_uri(self, solr_rec):
        """ gets the best citation / persistent uri for the item """
        if 'persistent_uri' in solr_rec:
            for p_uri in solr_rec['persistent_uri']:
                self.cite_uri = p_uri
                if 'dx.doi.org' in p_uri:
                    break  # stop looking once we have a DOI, the best

    def get_lat_lon(self, solr_rec):
        """ gets latitute and longitude information """
        if 'discovery_geolocation' in solr_rec:
            geo_strings = solr_rec['discovery_geolocation']
            geo_coords_str = geo_strings.split(',')
            # NOT GeoJSON ordering, since solr uses lat/lon ordering
            self.latitude = float(geo_coords_str[0])
            self.longitude = float(geo_coords_str[1]) 

    def get_category(self, solr_rec):
        """ Gets the most specific category for the item """
        self.recursive_count = 0
        cat_hierarchy = self.get_category_hierarchy(solr_rec)
        if len(cat_hierarchy) > 0:
            self.category = cat_hierarchy[-1]['label']

    def get_context(self, solr_rec):
        """ Get the most specific context parent for the record """
        self.recursive_count = 0
        contexts = self.extract_hierarchy(solr_rec,
                                          SolrDocument.ROOT_CONTEXT_SOLR,
                                          '___context',
                                          [])
        if len(contexts) > 0:
            self.context_label = self.make_context_path_label(contexts)
            self.context_uri = self.make_context_link(contexts, True)
            self.context_href = self.make_context_link(contexts, False)

    def get_project(self, solr_rec):
        """ Get the most specific project for the record """
        self.recursive_count = 0
        projects = self.extract_hierarchy(solr_rec,
                                          SolrDocument.ROOT_PROJECT_SOLR,
                                          '___project',
                                          [])
        if len(projects) > 0:
            self.project_label = projects[-1]['label']
            self.project_uri = self.make_url_from_val_string(projects[-1]['uri'],
                                                             True)
            self.project_href = self.make_url_from_val_string(projects[-1]['uri'],
                                                              False)

    def get_time(self, solr_rec):
        """ parses time information """
        early_list = False
        late_list = False
        if 'form_use_life_chrono_earliest' in solr_rec:
            early_list = solr_rec['form_use_life_chrono_earliest']
        if 'form_use_life_chrono_latest' in solr_rec:
            late_list = solr_rec['form_use_life_chrono_latest']
        if isinstance(early_list, list):
            date_list = early_list
        else:
            date_list = []
        if isinstance(late_list, list):
            date_list += late_list
        if len(date_list) > 0:
            min_max = self.get_list_min_max(date_list)
            self.early_date = min(min_max)
            self.late_date = max(min_max)

    def get_list_min_max(self, date_list):
        """ Returns the minimum and maximum dates
            from a date list, constrained by
            preset min and max dates
        """
        min_date = False
        max_date = False
        # print(str(date_list))
        if isinstance(date_list, list):
            date_list.sort()
            for date in date_list:
                if self.min_date is not False:
                    if date >= self.min_date \
                       and min_date is False:
                        min_date = date
                if self.max_date is not False:
                    if date <= self.max_date:
                        max_date = date
        if min_date is False:
            min_date = self.min_date
        if max_date is False:
            max_date = self.max_date
        return [min_date, max_date]

    def get_thumbnail(self, solr_rec):
        """ get media record and thumbnai if it exists """
        if 'uuid' in solr_rec:
            uuid = solr_rec['uuid']
            if uuid in self.thumbnail_data:
                if self.thumbnail_data[uuid] is not False:
                    self.thumbnail_href = self.thumbnail_data[uuid]['href']
                    self.thumbnail_uri = self.thumbnail_data[uuid]['uri']
                    self.thumbnail_scr = self.thumbnail_data[uuid]['scr']
                    rp = RootPath()
                    self.thumbnail_scr = rp.convert_to_https(self.thumbnail_scr)
            else:
                # did not precache thumbnail data, get an individual record
                self.get_thumbnail_from_database(solr_rec)

    def get_media_files(self, solr_rec):
        """ get media record and thumbnai if it exists """
        if 'uuid' in solr_rec:
            uuid = solr_rec['uuid']
            if uuid in self.media_file_data:
                if self.media_file_data[uuid] is not False:
                    rp = RootPath()
                    for file_type, file_uri in self.media_file_data[uuid].items():
                        if file_type == 'oc-gen:thumbnail':
                            self.thumbnail_scr = rp.convert_to_https(file_uri)
                        elif file_type == 'oc-gen:preview':
                            self.preview_scr = rp.convert_to_https(file_uri)
                        elif file_type == 'oc-gen:fullfile':
                            self.fullfile_scr = rp.convert_to_https(file_uri)

    def get_thumbnail_from_database(self, solr_rec):
        """ get media record and thumbnail, if it exists """
        if 'uuid' in solr_rec:
            uuid = solr_rec['uuid']
            thumb = []
            if self.item_type != 'media':
                media_item = Assertion.objects\
                                      .filter(uuid=uuid,
                                              object_type='media')[:1]
                if len(media_item) > 0:
                    muuid = media_item[0].object_uuid
                    thumb = Mediafile.objects\
                                     .filter(uuid=muuid,
                                             file_type='oc-gen:thumbnail')[:1]
            else:
                # do this for media items
                muuid = uuid
                thumb = Mediafile.objects\
                                 .filter(uuid=uuid,
                                         file_type='oc-gen:thumbnail')[:1]
            if len(thumb) > 0:
                self.thumbnail_href = self.base_url + '/media/' + muuid
                self.thumbnail_uri = settings.CANONICAL_HOST + '/media/' + muuid
                self.thumbnail_scr = thumb[0].file_uri

    def get_category_hierarchy(self, solr_rec):
        """ gets the most specific category
            information about an item
        """
        cat_hierarchy = []
        if 'item_type' in solr_rec:
            item_type = solr_rec['item_type'][0]
            root_cat_field = 'oc_gen_' + item_type + '___pred_id'
            cat_hierarchy = self.extract_hierarchy(solr_rec,
                                                   root_cat_field,
                                                   '___pred',
                                                   [])
        return cat_hierarchy

    """ The following seciton of code
        processes non-default attributes for records
    """
    def get_attributes(self, solr_rec):
        """ gets attributes for a record, based on the
            predicates requested in the search
            and optional predicates passed by a client
            with a GET request with parameter 'attributes'
        """
        qm = QueryMaker()
        solr_field_entities = {}
        for attribute in self.rec_attributes:
            entity = self.m_cache.get_entity(attribute)
            if entity:
                prop_slug = entity.slug
                # check to make sure we have the entity data type for linked fields
                if entity.data_type is False and entity.item_type == 'uri':
                    dtypes = self.s_cache.get_dtypes(entity.uri)
                    if isinstance(dtypes, list):
                        # set the data type and the act-field
                        # print('Found for ' + prop_slug + ' ' + dtypes[0])
                        entity.data_type = dtypes[0]
                field_parts = qm.make_prop_solr_field_parts(entity)
                solr_field = field_parts['prefix'] + '___pred_' + field_parts['suffix']
                # print('Found: ' + solr_field)
                # extract children of the solr_field so we know if
                # we have the most specific attributes, then we can get
                # values for the most specific attributes
                self.extract_attribute_children(solr_rec, solr_field)
        self.clean_attribute_hiearchies()
        if isinstance(self.attribute_hierarchies, dict):
            self.other_attributes = []
            for field_slug_key, values in self.attribute_hierarchies.items():
                entity = self.m_cache.get_entity(field_slug_key)
                if entity:
                    attribute_dict = LastUpdatedOrderedDict()
                    attribute_dict['property'] = entity.label
                    attribute_dict['values_list'] = []
                    attribute_dict['value'] = ''
                    string_val = False
                    delim = ''
                    for val in values:
                        if isinstance(val, str):
                            string_val = True
                            parsed_val = self.parse_solr_value_parts(val)
                            attribute_dict["values_list"].append(parsed_val['label'])
                            attribute_dict['value'] += delim + str(parsed_val['label'])
                        else:
                            attribute_dict["values_list"].append(val)
                            attribute_dict['value'] += delim + str(val)
                        delim = self.ATTRIBUTE_DELIM
                    if len(values) == 1 \
                       and string_val is False:
                        attribute_dict['value'] = values[0]
                    self.other_attributes.append(attribute_dict)

    def get_string_attributes(self, solr_rec):
        """ gets string attributes for a solr rec, from a previous database query
            needed because solr does not cache string field data
        """
        if isinstance(self.string_attrib_data, dict):
            # now add predicate attributes for string predicates, from the database
            if 'uuid' in solr_rec and 'data' in self.string_attrib_data:
                uuid = solr_rec['uuid']
                if uuid in self.string_attrib_data['data']:
                    item_data = self.string_attrib_data['data'][uuid]
                    for pred_uuid, values_list in item_data.items():
                        act_attribute = self.string_attrib_data['pred_ents'][pred_uuid]
                        act_attribute['values_list'] = values_list
                        act_attribute['value'] = self.ATTRIBUTE_DELIM.join(values_list)
                        self.other_attributes.append(act_attribute)

    def prevent_attribute_key_collision(self, item_prop_dict, prop_key):
        """ checks to make sure there's no collision between the prop_key
            and the dict that it will be added to
        """
        i = 2
        output_prop_key = prop_key
        while output_prop_key in item_prop_dict:
            output_prop_key = prop_key + '[' + str(i) + ']'
            i += 1
        return output_prop_key

    def clean_attribute_hiearchies(self):
        """ some post-processing to make sure
            we have clean attribute hierarchies
        """
        if isinstance(self.attribute_hierarchies, dict):
            # print('check: ' + str(self.attribute_hierarchies))
            temp_attribute_hierarchies = self.attribute_hierarchies
            clean_attribute_hiearchies = {}
            for solr_field_key, field_char in self.attribute_hierarchies.items():
                if field_char['most-specific']:
                    par_field_ex = solr_field_key.split('___')
                    # last two parts make the suffix, a pred-slug[-2] and a field type [-1]
                    pred_suffix = '___' + par_field_ex[-2] + '___' + par_field_ex[-1]
                    specific_ok = True
                    for val in field_char['values']:
                        if isinstance(val, str):
                            #  print('check:' + solr_field_key + ' val: ' + val)
                            parsed_val = self.parse_solr_value_parts(val)
                            check_field = parsed_val['slug'].replace('-', '_')
                            check_field += '___pred_' + parsed_val['data_type']
                            if check_field in temp_attribute_hierarchies:
                                # note a field is NOT at the most specific level
                                specific_ok = False
                            else:
                                # now check a version with the predicate as part of
                                # the solr field
                                check_field = parsed_val['slug'].replace('-', '_')
                                check_field += pred_suffix
                                if check_field in temp_attribute_hierarchies:
                                    # note a field is NOT at the most specific level
                                    specific_ok = False
                    if specific_ok:
                        # ok to add
                        # print('checked OK: ' + solr_field_key)
                        clean_attribute_hiearchies[solr_field_key] = field_char
            # now that we got rid of problem fields, lets sort these for consistent
            # rendering
            self.attribute_hierarchies = LastUpdatedOrderedDict()
            keys = LastUpdatedOrderedDict()
            # order of key types, we want id fields, followed by numeric then date
            key_types = ['___pred_id',
                         '___pred_numeric',
                         '___pred_date']
            for key_type in key_types:
                keys[key_type] = []
                for solr_field_key, field_char in clean_attribute_hiearchies.items():
                    if key_type in solr_field_key:
                        keys[key_type].append(solr_field_key)
                # sort alphabetically. Slugs are useful, since they will cluster predicates
                # from similar vocabularies
                keys[key_type].sort()
                for key in keys[key_type]:
                    field_char = clean_attribute_hiearchies[key]
                    field_ex = key.split('___')
                    # the penultimate part is the predicate
                    field_slug = field_ex[-2].replace('_', '-')
                    if field_slug not in self.attribute_hierarchies:
                        self.attribute_hierarchies[field_slug] = []
                    for val in field_char['values']:
                        if val not in self.attribute_hierarchies[field_slug]:
                            self.attribute_hierarchies[field_slug].append(val)

    def extract_attribute_children(self,
                                   solr_rec,
                                   solr_field_key):
        """ extracts ALL children from the hiearchy of
            a solr_field_key
        """
        is_field = False
        if solr_field_key not in self.attribute_hierarchies:
            # so we don't look at the same thing twice!
            if solr_field_key in solr_rec:
                is_field = True
                field_char = {'most-specific': False,
                              'values': []}
                if '___pred_numeric' in solr_field_key \
                   or '___pred_date' in solr_field_key:
                    field_char['most-specific'] = True
                    field_char['values'] = solr_rec[solr_field_key]
                elif '___pred_id' in solr_field_key:
                    # make a suffix for the parent predicate field
                    par_field_ex = solr_field_key.split('___')
                    # last two parts make the suffix, a pred-slug[-2] and a field type [-1]
                    pred_suffix = '___' + par_field_ex[-2] + '___' + par_field_ex[-1]
                    childless_children = []
                    for child_val in solr_rec[solr_field_key]:
                        # print('Child: ' + solr_field_key + ': ' + child_val)
                        parsed_path_item = self.parse_solr_value_parts(child_val)
                        new_field_prefix = parsed_path_item['slug'].replace('-', '_')
                        new_field_key = new_field_prefix + '___pred_' + parsed_path_item['data_type']
                        if parsed_path_item['data_type'] == 'id':
                            child_is_field = self.extract_attribute_children(solr_rec,
                                                                             new_field_key)
                            if child_is_field is False:
                                # now check an alternative combining the child
                                # slug with the predicate of the parent
                                new_field_key = new_field_prefix + pred_suffix
                                # print('check: ' + new_field_key)
                                child_is_field = self.extract_attribute_children(solr_rec,
                                                                                 new_field_key)
                                if child_is_field is False:
                                    childless_children.append(child_val)
                    if len(childless_children) > 0:
                        field_char['most-specific'] = True
                        field_char['values'] = childless_children
                else:
                    pass
                self.attribute_hierarchies[solr_field_key] = field_char
        return is_field

    def extract_hierarchy(self,
                          solr_rec,
                          facet_field_key,
                          facet_suffix,
                          hierarchy=None,
                          pred_field=False):
        """ extracts a hierarchy from a solr_record.
            The output is a list starting with the most
            general parent of the hiearchy,
            then going to the most specific

            This is a recursive function and
            default / starts with the root
            of the hiearchy as the facet_field_key

            This only follows a single path (not multiple paths)
        """
        alt_facet_field_key = facet_field_key
        if pred_field is not False:
            # do this to allow search of a hierarchy in a named
            # predicate field
            f_parts = facet_field_key.split('___')
            if len(f_parts) == 2:
                alt_f_parts = [f_parts[0],
                               pred_field.replace('-', '_'),
                               f_parts[1]]
                alt_facet_field_key = '___'.join(alt_f_parts)
                # print('Check: ' + facet_field_key + ', ' + alt_facet_field_key)
        if (facet_field_key in solr_rec or alt_facet_field_key in solr_rec)\
           and self.recursive_count < 20:
            self.recursive_count += 1
            if facet_field_key in solr_rec:
                path_item_val = solr_rec[facet_field_key][0]
            else:
                path_item_val = solr_rec[alt_facet_field_key][0]
            parsed_path_item = self.parse_solr_value_parts(path_item_val)
            if isinstance(parsed_path_item, dict):
                hierarchy.append(parsed_path_item)
                new_facet_field = parsed_path_item['slug'].replace('-', '_')
                new_facet_field += facet_suffix + '_' + parsed_path_item['data_type']
                # print('New hierarchy field: ' + new_facet_field)
                hierarchy = self.extract_hierarchy(solr_rec,
                                                   new_facet_field,
                                                   facet_suffix,
                                                   hierarchy)
        return hierarchy

    def make_context_path_label(self, contexts):
        """ Makes a '/' delimited context
            path for easy human readability
        """
        context_path = False
        if len(contexts) > 0:
            context_labels = []
            for context in contexts:
                context_labels.append(context['label'])
            context_path = '/'.join(context_labels)
        return context_path

    def make_context_link(self, contexts, cannonical=False):
        """ makes a URI for a context """
        context_uri = False
        if len(contexts) > 0:
            context_uri = self.make_url_from_val_string(contexts[-1]['uri'],
                                                        cannonical)
        return context_uri

    def make_url_from_val_string(self,
                                 partial_url,
                                 use_cannonical=True):
        """ parses a solr value if it has
            '___' delimiters, to get the URI part
            string.
            if it's already a URI part, it makes
            a URL
        """
        if use_cannonical:
            base_url = settings.CANONICAL_HOST
        else:
            base_url = self.base_url
        solr_parts = self.parse_solr_value_parts(partial_url)
        if isinstance(solr_parts, dict):
            partial_url = solr_parts['uri']
        if 'http://' not in partial_url \
           and 'https://' not in partial_url:
            url = base_url + partial_url
        else:
            url = partial_url
        return url

    def add_record_fields(self):
        """ adds fields to include in the GeoJSON properties """
        if 'rec-field' in self.response_dict:
            raw_rec_fields = self.response_dict['rec-field'][0]
            if ',' in raw_rec_fields:
                self.record_fields = raw_rec_fields.split(',')
            else:
                self.record_fields = [raw_rec_fields]
        else:
            self.record_fields = []
        return self.record_fields

    def parse_solr_value_parts(self, solr_value):
        """ parses a solr_value string into
            slug, solr-data-type, uri, and label
            parts
        """
        output = False
        if isinstance(solr_value, str):
            if '___' in solr_value:
                solr_ex = solr_value.split('___')
                if len(solr_ex) == 4:
                    output = {}
                    output['slug'] = solr_ex[0]
                    output['data_type'] = solr_ex[1]
                    output['uri'] = solr_ex[2]
                    output['label'] = solr_ex[3]
            else:
                output = solr_value
        else:
            output = solr_value
        return output

    def get_solr_record_uuid_type(self, solr_rec):
        """ get item uuid, label, and type from a solr_rec """
        output = False
        if isinstance(solr_rec, dict):
            output = {'uuid': False,
                      'label': False,
                      'item_type': False}
            if 'uuid' in solr_rec:
                output['uuid'] = solr_rec['uuid']
            if 'slug_type_uri_label' in solr_rec:
                id_parts = self.parse_solr_value_parts(solr_rec['slug_type_uri_label'])
                if id_parts is not False:
                    uri = self.make_url_from_val_string(id_parts['uri'], True)
                    item_type_output = URImanagement.get_uuid_from_oc_uri(uri, True)
                    output['item_type'] = item_type_output['item_type']
                    output['label'] = id_parts['label']
        return output

    def get_key_val(self, key, dict_obj):
        """ returns the value associated
            with a key, if the key exists
            else, none
        """
        output = None
        if isinstance(dict_obj, dict):
            if key in dict_obj:
                output = dict_obj[key]
        return output
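
The two methods above work together: parse_solr_value_parts unpacks a
'___'-delimited solr value, and make_url_from_val_string turns its 'uri'
part into a full URL. A minimal standalone sketch of that combined logic
(the base URL and the solr value are hypothetical):

BASE_URL = 'https://opencontext.org'  # hypothetical deployment base

def sketch_make_url(solr_value, base_url=BASE_URL):
    """ Sketch of the parse and URL composition logic shown above. """
    parts = solr_value.split('___')
    # a well-formed solr value has slug, data-type, uri, and label parts
    partial_url = parts[2] if len(parts) == 4 else solr_value
    if 'http://' in partial_url or 'https://' in partial_url:
        return partial_url
    return base_url + partial_url

# sketch_make_url('turkey___id___/subjects/turkey-uuid___Turkey')
# => 'https://opencontext.org/subjects/turkey-uuid'
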
Example No. 28
 def __init__(self):
     self.geojson_ld = False
     self.raw_related_labels = {}
     self.m_cache = MemoryCache()  # memory caching object
Example No. 29
def projects_html_view(request, spatial_context=None):
    """ returns HTML representation of projects search
    """
    mem_cache_obj = MemoryCache()
    mem_cache_obj.ping_redis_server()
    rp = RootPath()
    base_url = rp.get_baseurl()
    rd = RequestDict()
    request_dict_json = rd.make_request_dict_json(request,
                                                  spatial_context)
    if rd.security_ok is False:
        template = loader.get_template('400.html')
        context = RequestContext(request,
                                 {'abusive': True})
        return HttpResponse(template.render(context), status=400)
    elif rd.do_bot_limit:
        # redirect bot requests away from faceted search where
        # they can negatively impact performance
        cache_control(no_cache=True)
        return redirect('/projects-search/', permanent=False)
    else:
        # url and json_url needed for view templating
        url = request.get_full_path()
        if 'http://' not in url \
           and 'https://' not in url:
            url = base_url + url
        if '?' in url:
            json_url = url.replace('?', '.json?')
        else:
            json_url = url + '.json'
        # see if search results are cached. this is not done
        # with a view decorator, because we want to handle bots differently
        db_cache = DatabaseCache()
        cache_key = db_cache.make_cache_key('projects-search',
                                            request_dict_json)
        if rd.refresh_cache:
            # the request wanted to refresh the cache
            db_cache.remove_cache_object(cache_key)
        # get the search result JSON-LD, if it exists in cache
        json_ld = db_cache.get_cache_object(cache_key)
        if json_ld is None:
            # cached result is not found, so make it with a new search
            solr_s = SolrSearch()
            solr_s.is_bot = rd.is_bot  # True if bot detected
            solr_s.do_bot_limit = rd.do_bot_limit  # Toggle limits on facets for bots
            solr_s.mem_cache_obj = mem_cache_obj
            solr_s.do_context_paths = False
            solr_s.item_type_limit = 'projects'
            if solr_s.solr is not False:
                response = solr_s.search_solr(request_dict_json)
                mem_cache_obj = solr_s.mem_cache_obj  # reuse cached memory items
                m_json_ld = MakeJsonLd(request_dict_json)
                m_json_ld.base_search_link = '/projects-search/'
                # share entities already looked up. Saves database queries
                m_json_ld.mem_cache_obj = mem_cache_obj
                m_json_ld.request_full_path = request.get_full_path()
                m_json_ld.spatial_context = spatial_context
                json_ld = m_json_ld.convert_solr_json(response.raw_content)
                # now cache the resulting JSON-LD
                db_cache.save_cache_object(cache_key, json_ld)
        if json_ld is not None:
            req_neg = RequestNegotiation('text/html')
            req_neg.supported_types = ['application/json',
                                       'application/ld+json',
                                       'application/vnd.geo+json']
            if 'HTTP_ACCEPT' in request.META:
                req_neg.check_request_support(request.META['HTTP_ACCEPT'])
            if 'json' in req_neg.use_response_type:
                # content negotiation requested JSON or JSON-LD
                recon_obj = Reconciliation()
                json_ld = recon_obj.process(request.GET,
                                            json_ld)
                return HttpResponse(json.dumps(json_ld,
                                    ensure_ascii=False, indent=4),
                                    content_type=req_neg.use_response_type + "; charset=utf8")
            else:
                # now make the JSON-LD into an object suitable for HTML templating
                st = SearchTemplate(json_ld)
                st.process_json_ld()
                p_aug = ProjectAugment(json_ld)
                p_aug.process_json_ld()
                template = loader.get_template('search/view.html')
                context = RequestContext(request,
                                         {'st': st,
                                          'item_type': 'projects',
                                          'base_search_link': m_json_ld.base_search_link,
                                          'p_aug': p_aug,
                                          'url': url,
                                          'json_url': json_url,
                                          'base_url': base_url})
                if req_neg.supported:
                    return HttpResponse(template.render(context))
                else:
                    # client wanted a mimetype we don't support
                    return HttpResponse(req_neg.error_message,
                                        content_type=req_neg.use_response_type + "; charset=utf8",
                                        status=415)
        else:
            cache_control(no_cache=True)
            template = loader.get_template('500.html')
            context = RequestContext(request,
                                     {'error': 'Solr Connection Problem'})
            return HttpResponse(template.render(context), status=503)
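
Note the get-or-compute caching pattern in the view above: build a cache
key from the request, try the cache, and run the expensive solr search
only on a cache miss. A minimal sketch of that pattern (the DatabaseCache
method names come from the snippet above; the compute callable is
hypothetical):

def get_or_compute(db_cache, cache_key, compute, refresh=False):
    """ Sketch of the caching pattern used in projects_html_view. """
    if refresh:
        # the request asked to refresh the cache
        db_cache.remove_cache_object(cache_key)
    result = db_cache.get_cache_object(cache_key)
    if result is None:
        # cache miss: run the expensive computation, then store the result
        result = compute()
        db_cache.save_cache_object(cache_key, result)
    return result
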
Example No. 30
class FilterLinks():

    BASE_SOLR_FIELD_PARAM_MAPPINGS = \
        {'___project_id': 'proj',
         '___context_id': 'path',
         'obj_all___biol_term_hastaxonomy___pred_id': 'reconcile',
         '___pred_': 'prop',
         'item_type': 'type'}

    def __init__(self, request_dict=False):
        rp = RootPath()
        self.base_url = rp.get_baseurl()
        self.base_search_link = '/search/'
        self.base_request = request_dict
        self.base_request_json = False
        self.base_r_full_path = False
        self.spatial_context = False
        self.testing = settings.DEBUG
        self.hierarchy_delim = '---'
        self.partial_param_val_match = False
        self.remove_start_param = True
        self.mem_cache_obj = MemoryCache()  # memory caching object
        self.SOLR_FIELD_PARAM_MAPPINGS = self.BASE_SOLR_FIELD_PARAM_MAPPINGS
        for param_key, solr_field in DCterms.DC_META_FIELDS.items():
            self.SOLR_FIELD_PARAM_MAPPINGS[solr_field] = param_key

    def make_request_urls(self, new_rparams):
        """ makes request urls from the new request object """
        output = {}
        output['html'] = self.make_request_url(new_rparams)
        output['json'] = self.make_request_url(new_rparams, '.json')
        output['atom'] = self.make_request_url(new_rparams, '.atom')
        return output

    def make_request_url(self,
                         new_rparams,
                         doc_format=''):
        """ makes request urls from the new request object
            default doc_format is '' (HTML)
        """
        url = self.base_url + self.base_search_link
        if 'path' in new_rparams:
            if new_rparams['path'] is not None \
               and new_rparams['path'] is not False:
                # context_path = iri_to_uri(new_rparams['path'])
                context_path = new_rparams['path']
                context_path = context_path.replace(' ', '+')
                url += context_path
        url += doc_format
        param_sep = '?'
        for param, param_vals in new_rparams.items():
            if param != 'path':
                for val in param_vals:
                    quote_val = quote_plus(val)
                    quote_val = quote_val.replace('%7BSearchTerm%7D', '{SearchTerm}')
                    url += param_sep + param + '=' + quote_val
                    param_sep = '&'
        return url
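
    # A hypothetical illustration of the URL composition above:
    #
    #   new_rparams = {'path': 'Turkey/Domuztepe', 'prop': ['oc-gen-cat-object']}
    #   make_request_url(new_rparams, '.json')
    #   => self.base_url + '/search/Turkey/Domuztepe.json?prop=oc-gen-cat-object'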

    def make_request_sub(self,
                         old_request_dict,
                         rem_param_key,
                         rem_param_val,
                         sub_param_val=None):
        """ makes a dictionary object for
            request parameters WITHOUT the current fparam_key
            and fparam_vals
        """
        filter_request = LastUpdatedOrderedDict()
        for ch_param_key, ch_param_vals in old_request_dict.items():
            if ch_param_key != rem_param_key:
                # a different parameter than the one in the filter, so add
                filter_request[ch_param_key] = ch_param_vals
            else:
                if rem_param_key != 'path' and len(ch_param_vals) > 0:
                    filter_request[ch_param_key] = []
                    for ch_param_val in ch_param_vals:
                        if rem_param_val != ch_param_val:
                            # the filter value for this key is not the same
                            # as the check value for this key, so add
                            # to the filter request
                            filter_request[ch_param_key].append(ch_param_val)
                        else:
                            if sub_param_val is not None:
                                # put in the substitute value
                                filter_request[ch_param_key].append(sub_param_val)
        return filter_request
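
    # A hypothetical illustration, removing one 'prop' value while
    # keeping the rest:
    #
    #   make_request_sub({'path': 'Turkey', 'prop': ['a', 'b']}, 'prop', 'b')
    #   => {'path': 'Turkey', 'prop': ['a']}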

    def add_to_request_by_solr_field(self,
                                     solr_facet_key,
                                     new_value):
        """ uses the solr_facet_key to determine the
           request parameter
        """
        param = self.get_param_from_solr_facet_key(solr_facet_key)
        slugs = self.parse_slugs_in_solr_facet_key(solr_facet_key)
        if slugs is not False:
            add_to_value = self.hierarchy_delim.join(slugs)
        else:
            add_to_value = None
        #print('New param: ' + param + ' new val: ' + new_value + ' len:' + str(self.base_request))
        new_rparams = self.add_to_request(param,
                                          new_value,
                                          add_to_value)
        return new_rparams

    def add_to_request(self,
                       param,
                       new_value,
                       add_to_value=None):
        """ adds to the new request object a parameter and value """
        if self.base_request_json is not False:
            # start off with JSON-encoded base request parameters
            new_rparams = json.loads(self.base_request_json)
        elif self.base_r_full_path is not False:
            # start off by parsing a URL string
            new_rparams = self.make_base_params_from_url(self.base_r_full_path)
        elif self.base_request is not False:
            # start with a dictionary object of the base request
            # for some reason this often leads to memory errors
            new_rparams = self.base_request
        else:
            new_rparams = {}
        if 'start' in new_rparams and self.remove_start_param:
            # remove paging information when composing a new link
            new_rparams.pop('start', None)
        if param == 'path':
            found = self.mem_cache_obj.check_con_entity_found(new_value)
            if found:
                # convert the (slug) value into a context path
                entity = self.mem_cache_obj.get_con_entity(new_value)
                new_value = entity.context
        if param not in new_rparams:
            if param == 'path':
                new_rparams[param] = new_value
            else:
                new_rparams[param] = [new_value]
        else:
            if param == 'path':
                new_rparams['path'] = new_value
            else:
                if add_to_value is not None:
                    new_list = []
                    old_found = False
                    for old_val in new_rparams[param]:
                        old_prefix = self.remove_solr_part(old_val)
                        first_last_old_val = False
                        if self.hierarchy_delim in old_val:
                            old_val_ex = old_val.split(self.hierarchy_delim)
                            if len(old_val_ex) > 2:
                                first_last_old_val = old_val_ex[0]
                                first_last_old_val += self.hierarchy_delim
                                first_last_old_val += old_val_ex[-1]
                        if old_val == add_to_value:
                            old_found = True
                            new_list_val = old_val + self.hierarchy_delim + new_value
                        elif old_prefix == add_to_value:
                            old_found = True
                            new_list_val = old_prefix + self.hierarchy_delim + new_value
                        elif first_last_old_val == add_to_value:
                            old_found = True
                            new_list_val = old_prefix + self.hierarchy_delim + new_value
                        else:
                            new_list_val = old_val
                        new_list.append(new_list_val)
                    if old_found is False:
                        if self.partial_param_val_match:
                            for old_val in new_rparams[param]:
                                if add_to_value in old_val:
                                    old_found = True
                                    old_prefix = self.remove_solr_part(old_val)
                                    new_list_val = old_prefix + self.hierarchy_delim + new_value
                                    # add the new item
                                    new_list.append(new_list_val)
                                    # remove the old
                                    new_list.remove(old_val)
                    new_rparams[param] = new_list
                    if old_found is False:
                        new_rparams[param].append(new_value)
                else:
                    new_rparams[param].append(new_value)
        return new_rparams

    def remove_solr_part(self, old_val):
        """ removes part of a query parameter that
            is in solr query syntax, inside square
            brackets []
        """
        output = old_val
        splitter = self.hierarchy_delim + '['
        if splitter in old_val:
            old_ex = old_val.split(splitter)
            output = old_ex[0]
        return output
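
    # A hypothetical illustration, with hierarchy_delim = '---':
    #
    #   remove_solr_part('oc-gen-cat-object---[1 TO 100]')
    #   => 'oc-gen-cat-object'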

    def make_base_params_from_url(self, request_url):
        """ makes the base parameters from the url """
        rparams = {}
        url_o = urlparse(request_url)
        rparams = parse_qs(url_o.query)
        if self.spatial_context is False:
            self.spatial_context = self.get_context_from_path(url_o.path)
        rparams['path'] = self.spatial_context
        return rparams

    def get_context_from_path(self, path):
        """ geths the spatial context from a request path """
        context = False
        if '.' in path:
            pathex = path.split('.')
            path = pathex[0]
        if '/' in path:
            pathex = path.split('/')
            # print(str(pathex))
            if len(pathex) > 2:
                # remove the empty string before the leading slash
                pathex.pop(0)
                # remove the part for the search url prefix
                pathex.pop(0)
            context = '/'.join(pathex)
        return context
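
    # A hypothetical illustration:
    #
    #   get_context_from_path('/search/Turkey/Domuztepe.json')
    #   => 'Turkey/Domuztepe'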

    def get_param_from_solr_facet_key(self, solr_facet_key):
        """" returns the public parameter from the solr_facet_key """
        output = solr_facet_key
        exact_match = False
        for solr_field_part_key, param in self.SOLR_FIELD_PARAM_MAPPINGS.items():
            if solr_field_part_key == solr_facet_key:
                output = param
                exact_match = True
                break
        if exact_match is False:
            for solr_field_part_key, param in self.SOLR_FIELD_PARAM_MAPPINGS.items():
                if solr_field_part_key in solr_facet_key:
                    output = param
                    break
        return output

    def parse_slugs_in_solr_facet_key(self, solr_facet_key):
        """ returns a list of slugs encoded in a solr_facet_key
            the solr field has these slugs in reverse order
        """
        no_slug_field_list = [SolrDocument.ROOT_CONTEXT_SOLR,
                              SolrDocument.ROOT_PROJECT_SOLR,
                              SolrDocument.ROOT_LINK_DATA_SOLR,
                              SolrDocument.ROOT_PREDICATE_SOLR]
        if solr_facet_key in no_slug_field_list:
            slugs = False
        else:
            raw_slugs = []
            facet_key_list = solr_facet_key.split('___')
            list_len = len(facet_key_list)
            i = 0
            for list_item in facet_key_list:
                i += 1
                if i < list_len:
                    # last item is the suffix for the field type
                    # also replace '_' with '-' to get a slug
                    raw_slugs.append(list_item.replace('_', '-'))
            slugs = raw_slugs[::-1]
        return slugs
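
    # A hypothetical illustration; the solr field name encodes slugs in
    # reverse hierarchy order, with the field-type suffix last:
    #
    #   parse_slugs_in_solr_facet_key('parent_slug___child_slug___pred_id')
    #   => ['child-slug', 'parent-slug']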

    def prep_base_request_obj(self, request_dict):
        """ prepares a base request object from the old request object
            to use to create new requests
        """
        self.base_request = request_dict
        return self.base_request

    def get_request_param(self, param, default, as_list=False):
        """ get a string or list to use in queries from either
            the request object or the internal_request object
            so we have flexibility in doing searches without
            having to go through HTTP
        """
        output = False
        if self.request is not False:
            if as_list:
                output = self.request.GET.getlist(param)
            else:
                output = self.request.GET.get(param, default=default)
        elif self.internal_request is not False:
            if as_list:
                if param in self.internal_request:
                    param_obj = self.internal_request[param]
                    if isinstance(param_obj, list):
                        output = param_obj
                    else:
                        output = [param_obj]
            else:
                if param in self.internal_request:
                    output = self.internal_request[param]
                else:
                    output = default
        else:
            output = False
        return output

    def make_facet_dict_from_solr_field(
        self,
        solr_facet_field_key,
        facet_type,
        facet_labeling,
        range_data_type=None,
    ):
        """Makes the dict for a fact with id options."""

        if configs.FACET_STANDARD_ROOT_FIELDS.get(solr_facet_field_key):
            # We have a standard "root" field. Return the facet
            # dict object for it.
            return configs.FACET_STANDARD_ROOT_FIELDS.get(solr_facet_field_key)

        solr_slug_parts = solr_facet_field_key.split(
            SolrDocument.SOLR_VALUE_DELIM)

        # Making this dict will require some database lookups (usually
        # served from the cache), because this is not a standard root
        # solr field but a solr field deeper in a hierarchy.
        m_cache = MemoryCache()

        # The solr field parts are in reverse hierarchy order
        solr_slug_parts.reverse()

        # Iterate through the parts, skipping the first item
        # which is the most general part (the field suffix).
        items = []
        for solr_slug in solr_slug_parts[1:]:
            is_related = False
            slug = solr_slug.replace('_', '-')
            if slug.startswith(configs.RELATED_ENTITY_ID_PREFIX):
                is_related = True
                slug = slug[len(configs.RELATED_ENTITY_ID_PREFIX):]
            item = m_cache.get_entity(slug)
            if not item:
                continue

            # Add an "is_related" attribute
            item.is_related = is_related
            items.append(item)

        if not items:
            return None

        slugs_id = configs.REQUEST_PROP_HIERARCHY_DELIM.join(
            [item.slug for item in items])
        facet = LastUpdatedOrderedDict()

        if range_data_type is None:
            id_prefix = 'facet'
        else:
            id_prefix = 'range-facet'

        # note: is_related here keeps the value set in the last
        # iteration of the loop above
        if is_related:
            facet['id'] = '#{}-{}{}'.format(id_prefix,
                                            configs.RELATED_ENTITY_ID_PREFIX,
                                            slugs_id)
        else:
            facet['id'] = '#{}-{}'.format(id_prefix, slugs_id)

        labels = [item.label for item in items]
        if len(labels) == 1:
            labels.append(facet_labeling)
        # Put the last label in parentheses.
        labels[-1] = '({})'.format(labels[-1])
        facet['label'] = ' '.join(labels)
        facet['rdfs:isDefinedBy'] = items[0].uri
        facet['slug'] = items[0].slug
        facet['type'] = facet_type
        if range_data_type:
            facet['data-type'] = range_data_type
        if items[0].is_related:
            facet['oc-api:related-property'] = True
        return facet
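
For orientation, the facet dict returned above has roughly the following
shape (a hypothetical illustration: the keys follow the code above, the
values are placeholders):

facet_example = {
    'id': '#facet-slug-1---slug-2',               # id_prefix plus joined slugs
    'label': 'Label One (Label Two)',             # last label in parentheses
    'rdfs:isDefinedBy': 'https://example.org/x',  # uri of the first item
    'slug': 'slug-1',                             # slug of the first item
    'type': 'oc-api:facet-prop',                  # the facet_type argument
}
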
Example No. 32
def projects_json_view(request, spatial_context=None):
    """ API for searching Open Context, media only """
    mem_cache_obj = MemoryCache()
    mem_cache_obj.ping_redis_server()
    rd = RequestDict()
    request_dict_json = rd.make_request_dict_json(request,
                                                  spatial_context)
    if rd.security_ok is False:
        template = loader.get_template('400.html')
        context = RequestContext(request,
                                 {'abusive': True})
        return HttpResponse(template.render(context), status=400)
    elif rd.do_bot_limit:
        # redirect bot requests away from faceted search where
        # they can negatively impact performance
        cache_control(no_cache=True)
        return redirect('/projects-search/', permanent=False)
    else:
        # see if search results are cached. this is not done
        # with a view decorator, because we want to handle bots differently
        db_cache = DatabaseCache()
        cache_key = db_cache.make_cache_key('projects-search',
                                            request_dict_json)
        if rd.refresh_cache:
            # the request wanted to refresh the cache
            db_cache.remove_cache_object(cache_key)
        # get the search result JSON-LD, if it exists in cache
        json_ld = db_cache.get_cache_object(cache_key)
        if json_ld is None:
            # cached result is not found, so make it with a new search
            solr_s = SolrSearch()
            solr_s.is_bot = rd.is_bot  # True if bot detected
            solr_s.do_bot_limit = rd.do_bot_limit  # Toggle limits on facets for bots
            solr_s.do_context_paths = False
            solr_s.item_type_limit = 'projects'
            if solr_s.solr is not False:
                response = solr_s.search_solr(request_dict_json)
                m_json_ld = MakeJsonLd(request_dict_json)
                m_json_ld.base_search_link = '/projects-search/'
                # share entities already looked up. Saves database queries
                m_json_ld.entities = solr_s.entities
                m_json_ld.request_full_path = request.get_full_path()
                m_json_ld.spatial_context = spatial_context
                json_ld = m_json_ld.convert_solr_json(response.raw_content)
                # now cache the resulting JSON-LD
                db_cache.save_cache_object(cache_key, json_ld)
        if json_ld is not None:
            req_neg = RequestNegotiation('application/json')
            req_neg.supported_types = ['application/ld+json',
                                       'application/vnd.geo+json']
            if 'HTTP_ACCEPT' in request.META:
                req_neg.check_request_support(request.META['HTTP_ACCEPT'])
            if req_neg.supported:
                # requester wanted a mimetype we DO support
                if 'callback' in request.GET:
                    funct = request.GET['callback']
                    json_str = json.dumps(json_ld,
                                          ensure_ascii=False,
                                          indent=4)
                    return HttpResponse(funct + '(' + json_str + ');',
                                        content_type='application/javascript' + "; charset=utf8")
                else:
                    return HttpResponse(json.dumps(json_ld,
                                        ensure_ascii=False, indent=4),
                                        content_type=req_neg.use_response_type + "; charset=utf8")
            else:
                # client wanted a mimetype we don't support
                return HttpResponse(req_neg.error_message,
                                    status=415)
        else:
            cache_control(no_cache=True)
            template = loader.get_template('500.html')
            context = RequestContext(request,
                                     {'error': 'Solr Connection Problem'})
            return HttpResponse(template.render(context), status=503)
Example No. 33
 def __init__(self):
     self.m_cache = MemoryCache()
     self.request_full_path = ''
Example No. 34
 def get_entity(self, identifier):
     """ gets entities, but checkes first if they are in memory """
     mc = MemoryCache()
     return mc.get_entity(identifier)
Example No. 35
class ActiveFilters():

    """ Methods to show search / query filters in use """
    TEXT_SEARCH_TITLE = 'Current Text Search Filter'

    IGNORE_PARAMS = ['geodeep',
                     'chronodeep',
                     'sort',
                     'rows',
                     'start']

    def __init__(self):
        self.m_cache = MemoryCache()  # memory caching object
        self.base_search_link = '/search/'
        self.hierarchy_delim = '---'

    def add_filters_json(self, request_dict):
        """ adds JSON describing search filters """
        fl = FilterLinks()
        fl.base_search_link = self.base_search_link
        filters = []
        string_fields = []  # so we have an interface for string searches
        i = 0
        for param_key, param_vals in request_dict.items():
            if param_key == 'path':
                if param_vals:
                    i += 1
                    f_entity = self.m_cache.get_entity(param_vals)
                    label = http.urlunquote_plus(param_vals)
                    act_filter = LastUpdatedOrderedDict()
                    act_filter['id'] = '#filter-' + str(i)
                    act_filter['oc-api:filter'] = 'Context'
                    act_filter['label'] = label.replace('||', ' OR ')
                    if f_entity:
                        act_filter['rdfs:isDefinedBy'] = f_entity.uri
                    # generate a request dict without the context filter
                    rem_request = fl.make_request_sub(request_dict,
                                                      param_key,
                                                      param_vals)
                    act_filter['oc-api:remove'] = fl.make_request_url(rem_request)
                    act_filter['oc-api:remove-json'] = fl.make_request_url(rem_request, '.json')
                    filters.append(act_filter)
            else:
                for param_val in param_vals:
                    i += 1
                    remove_geodeep = False
                    act_filter = LastUpdatedOrderedDict()
                    act_filter['id'] = '#filter-' + str(i)
                    if self.hierarchy_delim in param_val:
                        all_vals = param_val.split(self.hierarchy_delim)
                    else:
                        all_vals = [param_val]
                    if param_key == 'proj':
                        # projects, only care about the last item in the parameter value
                        act_filter['oc-api:filter'] = 'Project'
                        label_dict = self.make_filter_label_dict(all_vals[-1])
                        act_filter['label'] = label_dict['label']
                        if len(label_dict['entities']) == 1:
                            act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri
                    elif param_key == 'prop':
                        # prop, the first item is the filter-label
                        # the last is the filter
                        act_filter['label'] = False
                        if len(all_vals) < 2:
                            act_filter['oc-api:filter'] = 'Description'
                            act_filter['oc-api:filter-slug'] = all_vals[0]
                        else:
                            filt_dict = self.make_filter_label_dict(all_vals[0])
                            act_filter['oc-api:filter'] = filt_dict['label']
                            if 'slug' in filt_dict:
                                act_filter['oc-api:filter-slug'] = filt_dict['slug']
                            if filt_dict['data-type'] == 'string':
                                act_filter['label'] = 'Search Term: \'' + all_vals[-1] + '\''
                        if act_filter['label'] is False:
                            label_dict = self.make_filter_label_dict(all_vals[-1])
                            act_filter['label'] = label_dict['label']
                    elif param_key == 'type':
                        act_filter['oc-api:filter'] = 'Open Context Type'
                        if all_vals[0] in QueryMaker.TYPE_MAPPINGS:
                            type_uri = QueryMaker.TYPE_MAPPINGS[all_vals[0]]
                            label_dict = self.make_filter_label_dict(type_uri)
                            act_filter['label'] = label_dict['label']
                        else:
                            act_filter['label'] = all_vals[0]
                    elif param_key == 'q':
                        act_filter['oc-api:filter'] = self.TEXT_SEARCH_TITLE
                        act_filter['label'] = 'Search Term: \'' + all_vals[0] + '\''
                    elif param_key == 'id':
                        act_filter['oc-api:filter'] = 'Identifier Lookup'
                        act_filter['label'] = 'Identifier: \'' + all_vals[0] + '\''
                    elif param_key == 'form-chronotile':
                        act_filter['oc-api:filter'] = 'Time of formation, use, or life'
                        chrono = ChronoTile()
                        dates = chrono.decode_path_dates(all_vals[0])
                        if isinstance(dates, dict):
                            act_filter['label'] = 'Time range: ' + str(dates['earliest_bce'])
                            act_filter['label'] += ' to ' + str(dates['latest_bce'])
                    elif param_key == 'form-start':
                        act_filter['oc-api:filter'] = 'Earliest formation, use, or life date'
                        try:
                            val_date = int(float(all_vals[0]))
                        except:
                            val_date = False
                        if val_date is False:
                            act_filter['label'] = '[Invalid year]'
                        elif val_date < 0:
                            act_filter['label'] = str(val_date * -1) + ' BCE'
                        else:
                            act_filter['label'] = str(val_date) + ' CE'
                    elif param_key == 'form-stop':
                        act_filter['oc-api:filter'] = 'Latest formation, use, or life date'
                        try:
                            val_date = int(float(all_vals[0]))
                        except:
                            val_date = False
                        if val_date is False:
                            act_filter['label'] = '[Invalid year]'
                        elif val_date < 0:
                            act_filter['label'] = str(val_date * -1) + ' BCE'
                        else:
                            act_filter['label'] = str(val_date) + ' CE'
                    elif param_key == 'disc-geotile':
                        act_filter['oc-api:filter'] = 'Location of discovery or observation'
                        act_filter['label'] = self.make_geotile_filter_label(all_vals[0])
                        remove_geodeep = True
                    elif param_key == 'disc-bbox':
                        act_filter['oc-api:filter'] = 'Location of discovery or observation'
                        act_filter['label'] = self.make_bbox_filter_label(all_vals[0])
                        remove_geodeep = True
                    elif param_key == 'images':
                        act_filter['oc-api:filter'] = 'Has related media'
                        act_filter['label'] = 'Linked to images'
                    elif param_key == 'other-media':
                        act_filter['oc-api:filter'] = 'Has related media'
                        act_filter['label'] = 'Linked to media (other than images)'
                    elif param_key == 'documents':
                        act_filter['oc-api:filter'] = 'Has related media'
                        act_filter['label'] = 'Linked to documents'
                    elif param_key == 'dc-subject':
                        act_filter['oc-api:filter'] = 'Has subject metadata'
                        label_dict = self.make_filter_label_dict(all_vals[-1])
                        if len(label_dict['label']) > 0:
                            act_filter['label'] = label_dict['label']
                        if 'tdar' == all_vals[-1] or 'tdar*' == all_vals[-1]:
                            act_filter['label'] = 'tDAR defined metadata record(s)'
                        if len(label_dict['entities']) == 1:
                            act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri
                            if label_dict['entities'][0].vocabulary is not False:
                                act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary
                    elif param_key == 'dc-spatial':
                        act_filter['oc-api:filter'] = 'Has spatial metadata'
                        label_dict = self.make_filter_label_dict(all_vals[-1])
                        if len(label_dict['label']) > 0:
                            act_filter['label'] = label_dict['label']
                        if len(label_dict['entities']) == 1:
                            act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri
                            if label_dict['entities'][0].vocabulary is not False:
                                act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary
                    elif param_key == 'dc-coverage':
                        act_filter['oc-api:filter'] = 'Has coverage / period metadata'
                        label_dict = self.make_filter_label_dict(all_vals[-1])
                        if len(label_dict['label']) > 0:
                            act_filter['label'] = label_dict['label']
                        if len(label_dict['entities']) == 1:
                            act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri
                            if label_dict['entities'][0].vocabulary is not False:
                                act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary
                    elif param_key == 'dc-temporal':
                        act_filter['oc-api:filter'] = 'Has temporal coverage'
                        label_dict = self.make_filter_label_dict(all_vals[-1])
                        if len(label_dict['label']) > 0:
                            act_filter['label'] = label_dict['label']
                            if len(label_dict['entities']) == 1: 
                                if label_dict['entities'][0].entity_type == 'vocabulary':
                                    act_filter['label'] = 'Concepts defined by: ' + label_dict['label']
                            elif 'periodo' in all_vals[-1]:
                                act_filter['label'] = 'PeriodO defined concepts'
                        if len(label_dict['entities']) == 1:
                            act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri
                            if label_dict['entities'][0].vocabulary is not False\
                               and label_dict['entities'][0].vocabulary != label_dict['label']:
                                act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary
                    elif param_key == 'obj':
                        act_filter['oc-api:filter'] = 'Links (in some manner) to object'
                        label_dict = self.make_filter_label_dict(all_vals[-1])
                        if len(label_dict['label']) > 0:
                            act_filter['label'] = label_dict['label']
                        if len(label_dict['entities']) == 1:
                            act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri
                            if label_dict['entities'][0].vocabulary is not False:
                                act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary
                    elif param_key == 'dc-isReferencedBy':
                        act_filter['oc-api:filter'] = 'Is referenced by'
                        label_dict = self.make_filter_label_dict(all_vals[-1])
                        if len(label_dict['label']) > 0:
                            act_filter['label'] = label_dict['label']
                        if len(label_dict['entities']) == 1:
                            act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri
                            if label_dict['entities'][0].vocabulary is not False\
                               and label_dict['entities'][0].vocab_uri != label_dict['entities'][0].uri:
                                act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary
                    elif param_key == 'linked' and all_vals[-1] == 'dinaa-cross-ref':
                        act_filter['oc-api:filter'] = 'Has cross references'
                        act_filter['label'] = 'Links to, or with, DINAA curated site files'
                    else:
                        act_filter = False
                    if act_filter is not False:
                        rem_request = fl.make_request_sub(request_dict,
                                                          param_key,
                                                          param_val)
                        if 'geodeep' in rem_request and remove_geodeep:
                            rem_request.pop('geodeep', None)    
                        act_filter['oc-api:remove'] = fl.make_request_url(rem_request)
                        act_filter['oc-api:remove-json'] = fl.make_request_url(rem_request, '.json')
                        filters.append(act_filter)
        return filters

    def make_geotile_filter_label(self, raw_geotile):
        """ parses a raw bbox parameter value to make
            a filter label
        """
        output_list = []
        if '||' in raw_geotile:
            tile_list = raw_geotile.split('||')
        else:
            tile_list = [raw_geotile]
        for tile in tile_list:
            geotile = GlobalMercator()
            coordinates = geotile.quadtree_to_lat_lon(tile)
            if coordinates is not False:
                label = 'In the region bounded by: '
                label += str(round(coordinates[0], 3))
                label += ', ' + str(round(coordinates[1], 3))
                label += ' (SW) and ' + str(round(coordinates[2], 3))
                label += ', ' + str(round(coordinates[3], 3))
                label += ' (NE)'
                output_list.append(label)
            else:
                output_list.append('[Ignored invalid geospatial tile]')
        output = '; or '.join(output_list)
        return output

    def make_bbox_filter_label(self, raw_disc_bbox):
        """ parses a raw bbox parameter value to make
            a filter label
        """
        qm = QueryMaker()
        output_list = []
        if '||' in raw_disc_bbox:
            bbox_list = raw_disc_bbox.split('||')
        else:
            bbox_list = [raw_disc_bbox]
        for bbox in bbox_list:
            if ',' in bbox:
                bbox_coors = bbox.split(',')
                bbox_valid = qm.validate_bbox_coordiantes(bbox_coors)
                if bbox_valid:
                    label = 'In the bounding-box of: Latitude '
                    label += str(bbox_coors[1])
                    label += ', Longitude ' + str(bbox_coors[0])
                    label += ' (SW) and Latitude ' + str(bbox_coors[3])
                    label += ', Longitude ' + str(bbox_coors[2])
                    label += ' (NE)'
                    output_list.append(label)
                else:
                    output_list.append('[Ignored invalid bounding-box]')
            else:
                output_list.append('[Ignored invalid bounding-box]')
        output = '; or '.join(output_list)
        return output

    def make_filter_label_dict(self, act_val):
        """ returns a dictionary object
            with a label and set of entities (in cases of OR
            searches)
        """
        related_suffix = ''
        output = {'label': False,
                  'data-type': 'id',
                  'slug': False,
                  'entities': []}
        labels = []
        if '||' in act_val:
            vals = act_val.split('||')
        else:
            vals = [act_val]
        for val in vals:
            qm = QueryMaker()
            db_val = qm.clean_related_slug(val)
            if val != db_val:
                related_suffix = ' (for related items)'
            f_entity = self.m_cache.get_entity(db_val)
            if f_entity:
                # get the solr field data type
                ent_solr_data_type = qm.get_solr_field_type(f_entity.data_type)
                if ent_solr_data_type is not False \
                   and ent_solr_data_type != 'id':
                    output['data-type'] = ent_solr_data_type
                labels.append(f_entity.label)
                output['entities'].append(f_entity)
            else:
                labels.append(val)
        output['label'] = (' OR '.join(labels)) + related_suffix
        output['slug'] = '-or-'.join(vals)
        return output
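
    # A hypothetical illustration (entity lookups assumed to fail, so
    # the raw values serve as labels):
    #
    #   make_filter_label_dict('slug-a||slug-b')
    #   => {'label': 'slug-a OR slug-b', 'data-type': 'id',
    #       'slug': 'slug-a-or-slug-b', 'entities': []}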
Example No. 36
 def __init__(self, cannonical_uris=False):
     self.m_cache = MemoryCache()
Example No. 37
class QueryMaker():

    # main item-types mapped to their slugs to get solr-facet field prefix
    TYPE_MAPPINGS = {'subjects': 'oc-gen-subjects',
                     'media': 'oc-gen-media',
                     'documents': 'oc-gen-documents',
                     'persons': 'oc-gen-persons',
                     'projects': 'oc-gen-projects',
                     'types': 'oc-gen-types',
                     'predicates': 'oc-gen-predicates'}

    TYPE_URIS = {'subjects': 'oc-gen:subjects',
                 'media': 'oc-gen:media',
                 'documents': 'oc-gen:documents',
                 'persons': 'oc-gen:persons',
                 'projects': 'oc-gen:projects',
                 'types': 'oc-gen:types',
                 'predicates': 'oc-gen:predicates'}

    def __init__(self):
        self.error = False
        self.histogram_groups = 10
        self.mem_cache_obj = MemoryCache()  # memory caching object

    def _get_context_paths(self, spatial_context):
        '''
        Takes a context path and returns an iterator over the list of possible
        contexts. Parses the boolean '||' (OR) delimiters and returns a list
        of contexts.

        For example:

        >>> _get_context_paths('Turkey/Domuztepe/I||II||Stray')

        ['Turkey/Domuztepe/I', 'Turkey/Domuztepe/II', 'Turkey/Domuztepe/Stray']

        '''
        # Split the context path by '/' and then by '||'
        context_lists = (value.split('||') for value in
                         spatial_context.split('/'))
        # Create a list of the various permutations
        context_tuple_list = list(itertools.product(*context_lists))
        # Turn the lists back into URIs
        return ('/'.join(value) for value in context_tuple_list)

    def _get_context_depth(self, spatial_context):
        '''
        Takes a context path and returns its depth as an integer. For
        example, the context '/Turkey/Domuztepe'
        would have a depth of 2.
        '''
        # Remove a possible trailing slash before calculating the depth
        return len(spatial_context.rstrip('/').split('/'))

    def _get_valid_context_slugs(self, contexts):
        '''
        Takes a list of contexts and, for valid contexts, returns a list of
        slugs
        '''
        entity = Entity()
        valid_context_slugs = []
        context_list = list(contexts)
        for context in context_list:
            # Verify that the contexts are valid
            # find and save the entity to memory
            # print('check: ' + context)
            found = self.mem_cache_obj.check_entity_found(context,
                                                          True)
            # print('found: ' + str(found))
            if found:
                entity = self.mem_cache_obj.get_entity(context,
                                                       True)
                valid_context_slugs.append(entity.slug)
        return valid_context_slugs

    def _get_parent_slug(self, slug):
        '''
        Takes a slug and returns the slug of its parent. Returns 'root' if
        a slug has no parent.
        '''
        cache_key = self.mem_cache_obj.make_memory_cache_key('par-slug', slug)
        parent_slug = self.mem_cache_obj.get_cache_object(cache_key)
        if parent_slug is None:
            contain_obj = Containment()
            contain_obj.use_cache = False  # because it seems to introduce memory errors
            parent_slug = contain_obj.get_parent_slug_by_slug(slug)
            self.mem_cache_obj.save_cache_object(cache_key, parent_slug)
        if parent_slug:
            return parent_slug
        else:
            return 'root'

    def _prepare_filter_query(self, parent_child_slug):
        # TODO docstring
        parent_child_set = parent_child_slug.split('___')
        return parent_child_set[0].replace('-', '_') + '___context_id_fq:' + \
            parent_child_set[1]

    def expand_hierarchy_options(self,
                                 path_param_val,
                                 hier_delim='---',
                                 or_delim='||'):
        """ Exapands a hiearchic path string into a
            list of listed hierachically ordered items.
            This method also makes a new hiearchic ordered
            list if there is an 'or_delim'.
        """
        if isinstance(path_param_val, list):
            inital_path_list = path_param_val
        else:
            inital_path_list = [path_param_val]
        path_list = []
        for path_string in inital_path_list:
            raw_path_list = (value.split(or_delim) for value in
                             path_string.split(hier_delim))
            # Create a list of the various permutations
            path_tuple_list = list(itertools.product(*raw_path_list))
            for item in path_tuple_list:
                path_list.append(list(item))
        return path_list
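
    # A hypothetical expansion, with the default delimiters:
    #
    #   expand_hierarchy_options('a---b||c')
    #   => [['a', 'b'], ['a', 'c']]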

    def get_solr_field_type(self, data_type, prefix=''):
        '''
        Defines whether our dynamic solr fields names for
        predicates end with ___pred_id, ___pred_numeric, etc.
        '''
        if data_type in ['@id', 'id', False]:
            return prefix + 'id'
        elif data_type in ['xsd:integer', 'xsd:double', 'xsd:boolean']:
            return prefix + 'numeric'
        elif data_type == 'xsd:string':
            return prefix + 'string'
        elif data_type == 'xsd:date':
            return prefix + 'date'
        else:
            raise Exception("Error: Unknown predicate type")

    def make_prop_solr_field_parts(self, entity):
        """ Makes a solr field for a property """
        output = {}
        output['prefix'] = entity.slug.replace('-', '_')
        output['suffix'] = self.get_solr_field_type(entity.data_type)
        return output
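
    # A hypothetical illustration, for an entity with slug 'has-taxonomy'
    # and data_type '@id':
    #
    #   make_prop_solr_field_parts(entity)
    #   => {'prefix': 'has_taxonomy', 'suffix': 'id'}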

    def process_proj(self, proj_path):
        # TODO docstring
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        project_path_lists = self.expand_hierarchy_options(proj_path)
        for proj_path_list in project_path_lists:
            i = 0
            path_list_len = len(proj_path_list)
            fq_field = SolrDocument.ROOT_PROJECT_SOLR
            fq_path_terms = []
            for proj_slug in proj_path_list:
                found = self.mem_cache_obj.check_entity_found(proj_slug, False)
                if found:
                    entity = self.mem_cache_obj.get_entity(proj_slug, False)
                    # fq_path_term = fq_field + ':' + self.make_solr_value_from_entity(entity)
                    # the below is a bit of a hack. We should have a query field
                    # as with ___pred_ to query just the slug. But this works for now
                    proj_slug = entity.slug
                    fq_path_term = fq_field + ':' + proj_slug + '*'
                else:
                    fq_path_term = fq_field + ':' + proj_slug
                fq_path_terms.append(fq_path_term)
                fq_field = proj_slug.replace('-', '_') + '___project_id'
                i += 1
                if i >= path_list_len and fq_field not in query_dict['facet.field']:
                    query_dict['facet.field'].append(fq_field)
            final_path_term = ' AND '.join(fq_path_terms)
            final_path_term = '(' + final_path_term + ')'
            fq_terms.append(final_path_term)
        fq_final = ' OR '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def process_ld_object(self, objects):
        # TODO docstring
        query_dict = {'fq': []}
        fq_terms = []
        if not isinstance(objects, list):
            objects = [objects]
        for raw_obj in objects:
            if '||' in raw_obj:
                or_objects = raw_obj.split('||')
            else:
                or_objects = [raw_obj]
            fq_or_terms = []
            for obj in or_objects:
                # find and save the entity to memory
                found = self.mem_cache_obj.check_entity_found(obj, False)
                if found:
                    entity = self.mem_cache_obj.get_entity(obj, False)
                    fq_term = 'object_uri:' + self.escape_solr_arg(entity.uri)
                    fq_term += ' OR text:"' + self.escape_solr_arg(entity.uri) + '"'
                else:
                    fq_term = 'object_uri:' + obj
                fq_or_terms.append(fq_term)
            fq_all_ors = ' OR '.join(fq_or_terms)
            fq_all_ors = '(' + fq_all_ors + ')'
            fq_terms.append(fq_all_ors)
        fq_final = ' AND '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def process_dc_term(self, dc_param, dc_terms, add_facet=False):
        # TODO docstring
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        if dc_param in DCterms.DC_META_FIELDS:
            fq_field = DCterms.DC_META_FIELDS[dc_param]
            if fq_field not in query_dict['facet.field'] and add_facet:
                query_dict['facet.field'].append(fq_field)
            add_to_fq = False
            for raw_dc_term in dc_terms:
                if '||' in raw_dc_term:
                    use_dc_terms = raw_dc_term.split('||')
                else:
                    use_dc_terms = [raw_dc_term]
                fq_path_terms = []
                for dc_term in use_dc_terms:
                    if len(dc_term) > 0:
                        add_to_fq = True
                        # check if entity exists, and/or store it in memory
                        found = self.mem_cache_obj.check_entity_found(dc_term, False)
                        if found:
                            # fq_path_term = fq_field + ':' + self.make_solr_value_from_entity(entity)
                            # the below is a bit of a hack. We should have a query field
                            # as with ___pred_ to query just the slug. But this works for now
                            entity = self.mem_cache_obj.get_entity(dc_term, False)
                            fq_path_term = fq_field + '_fq:' + entity.slug
                            if dc_param == 'dc-temporal' \
                               and entity.entity_type == 'vocabulary' \
                               and 'periodo' in entity.slug:
                                # it's a temporal vocabulary from periodo
                                # so search for specific periods contained in
                                # the vocabulary
                                fq_path_term = '(' + fq_path_term +\
                                               ' OR ' + fq_path_term + '*)'
                        else:
                            if dc_term[-1] != '*':
                                dc_term += '*'
                            fq_path_term = fq_field + ':' + dc_term
                        fq_path_terms.append(fq_path_term)
                final_path_term = ' AND '.join(fq_path_terms)
                final_path_term = '(' + final_path_term + ')'
                fq_terms.append(final_path_term)
            fq_final = ' OR '.join(fq_terms)
            fq_final = '(' + fq_final + ')'
            if add_to_fq:
                query_dict['fq'].append(fq_final)
        return query_dict

    def get_related_slug_field_prefix(self, slug):
        """ gets the field prefix for a related property
            if it is present in the slug, 
            then return the solr_field prefix otherwise
            return a '' string
        """
        field_prefix = SolrDocument.RELATED_SOLR_FIELD_PREFIX
        prefix_len = len(field_prefix)
        slug_start = slug[:prefix_len]
        if slug_start == field_prefix:
            return field_prefix
        else:
            return ''

    def clean_related_slug(self, slug):
        """ removes the field_prefix for related slugs """
        field_prefix = SolrDocument.RELATED_SOLR_FIELD_PREFIX
        prefix_len = len(field_prefix)
        slug_start = slug[:prefix_len]
        if slug_start == field_prefix:
            slug = slug[prefix_len:]
        return slug
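
    # A hypothetical illustration, assuming the related-entity prefix
    # is 'rel--':
    #
    #   clean_related_slug('rel--biol-term-hastaxonomy')
    #   => 'biol-term-hastaxonomy'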

    def correct_solr_prefix_for_fq(self, solr_f_prefix, act_field_fq):
        """ makes sure the solr prefix is on the fq if needed """
        if solr_f_prefix != '':
            if solr_f_prefix not in act_field_fq:
                act_field_fq = solr_f_prefix + act_field_fq
        return act_field_fq

    def process_prop(self, props):
        """ processes 'prop' (property) parameters
            property parameters are tricky because they
            can come in hierarchies
            that's why there's some complexity to this
        """
        # is the property for the item itself, or for a related item?
        query_dict = {'fq': [],
                      'facet.field': [],
                      'stats.field': [],
                      'prequery-stats': [],
                      'facet.range': [],
                      'hl-queries': [],
                      'ranges': {}}
        fq_terms = []
        prop_path_lists = self.expand_hierarchy_options(props)
        for prop_path_list in prop_path_lists:
            i = 0
            path_list_len = len(prop_path_list)
            fq_path_terms = []
            act_field_fq = SolrDocument.ROOT_PREDICATE_SOLR
            act_field_data_type = 'id'
            last_field_label = False  # needed for full text highlighting
            predicate_solr_slug = False
            for prop_slug in prop_path_list:
                field_prefix = self.get_related_slug_field_prefix(prop_slug)
                solr_f_prefix = field_prefix.replace('-', '_')
                db_prop_slug = self.clean_related_slug(prop_slug)
                l_prop_entity = False
                pred_prop_entity = False
                require_id_field = False
                if act_field_data_type == 'id':
                    # check entity exists, and save to memory
                    found = self.mem_cache_obj.check_entity_found(db_prop_slug, False)
                    if found:
                        entity = self.mem_cache_obj.get_entity(db_prop_slug, False)
                        last_field_label = entity.label
                        prop_slug = field_prefix + entity.slug
                        if entity.item_type == 'uri' and 'oc-gen' not in db_prop_slug:
                            if entity.entity_type == 'property':
                                pred_prop_entity = True
                                predicate_solr_slug = prop_slug.replace('-', '_')
                                l_prop_entity = True
                                children = self.mem_cache_obj.get_entity_children(entity.uri)
                                if len(children) > 0:
                                    # ok, this field has children. require it
                                    # to be treated as an ID field
                                    require_id_field = True
                        else:
                            if entity.item_type == 'predicates':
                                pred_prop_entity = True
                                predicate_solr_slug = prop_slug.replace('-', '_')
                                children = self.mem_cache_obj.get_entity_children(entity.uri)
                                if len(children) > 0:
                                    # ok, this field has children. require it
                                    # to be treated as an ID field
                                    require_id_field = True
                        if i == 0:
                            if 'oc-gen' in db_prop_slug:
                                # for open context categories / types
                                act_field_fq = self.get_parent_item_type_facet_field(entity.uri)
                                lr = LinkRecursion()
                                parents = lr.get_jsonldish_entity_parents(entity.uri)
                                if len(parents) > 1:
                                    try:
                                        p_slug = parents[-2]['slug']
                                        act_field_fq = p_slug.replace('-', '_') + '___pred_id'
                                        act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                                    except:
                                        pass
                            elif entity.item_type == 'uri':
                                act_field_fq = SolrDocument.ROOT_LINK_DATA_SOLR
                            elif entity.item_type == 'predicates':
                                temp_field_fq = self.get_parent_item_type_facet_field(entity.uri)
                                parents = self.mem_cache_obj.get_jsonldish_entity_parents(entity.uri)
                                if len(parents) > 1:
                                    try:
                                        p_slug = parents[-2]['slug']
                                        temp_field_fq = p_slug.replace('-', '_') + '___pred_id'
                                    except:
                                        print('Predicate Parent exception: '+ str(parents))
                                        temp_field_fq = False
                                if temp_field_fq is not False:
                                    act_field_fq = temp_field_fq
                                else:
                                    act_field_fq = SolrDocument.ROOT_PREDICATE_SOLR
                            else:
                                act_field_fq = SolrDocument.ROOT_PREDICATE_SOLR
                        # ---------------------------------------------------
                        # THIS PART BUILDS THE FACET-QUERY
                        # fq_path_term = fq_field + ':' + self.make_solr_value_from_entity(entity)
                        # the below is a bit of a hack. We should have a query field
                        # as with ___pred_ to query just the slug. But this works for now
                        fq_field = act_field_fq + '_fq'
                        if path_list_len >= 2 and act_field_data_type == 'id':
                            # could be an object deeper in the hierarchy, so allow the obj_all version
                            fq_path_term = '(' + fq_field + ':' + prop_slug
                            fq_path_term += ' OR obj_all___' + fq_field + ':' + prop_slug + ')'
                        else:
                            fq_path_term = fq_field + ':' + prop_slug
                        fq_path_terms.append(fq_path_term)
                        #---------------------------------------------------
                        #
                        #---------------------------------------------------
                        # THIS PART PREPARES FOR LOOPING OR FINAL FACET-FIELDS
                        #
                        # print('pred-solr-slug: ' + predicate_solr_slug)
                        field_parts = self.make_prop_solr_field_parts(entity)
                        act_field_data_type = field_parts['suffix']
                        if require_id_field:
                            act_field_data_type = 'id'
                            field_parts['suffix'] = 'id'
                        # check if the last or penultimate field has
                        # a different data-type (for linked-data)
                        if i >= (path_list_len - 2) \
                           and l_prop_entity:
                            dtypes = self.mem_cache_obj.get_dtypes(entity.uri)
                            if isinstance(dtypes, list):
                                # set the data type and the act-field
                                found = self.mem_cache_obj.check_entity_found(db_prop_slug, False)
                                if found:
                                    entity = self.mem_cache_obj.get_entity(db_prop_slug, False)
                                    entity.data_type = dtypes[0]  # store for later use
                                    self.mem_cache_obj.entities[db_prop_slug] = entity  # store for later use
                                act_field_data_type = self.get_solr_field_type(dtypes[0])
                        if predicate_solr_slug is False or pred_prop_entity:
                            act_field_fq = field_parts['prefix'] + '___pred_' + field_parts['suffix']
                            act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                            # get a facet on this field
                            if act_field_data_type != 'string':
                                # adds a prefix for related properties
                                ffield = solr_f_prefix + field_parts['prefix'] + '___pred_' + field_parts['suffix']
                                if ffield not in query_dict['facet.field'] \
                                   and i >= (path_list_len - 1):
                                    query_dict['facet.field'].append(ffield)
                        else:
                            if act_field_data_type == 'id':
                                act_field_fq = 'obj_all___' + predicate_solr_slug \
                                               + '___pred_' + field_parts['suffix']
                                # get a facet on this field
                                if predicate_solr_slug != field_parts['prefix']:
                                    # the predicate_solr_slug is not the
                                    # prefix of the current field part, meaning
                                    # the field_parts[prefix] is the type, and
                                    # we want facets for the predicate -> type
                                    ffield = field_parts['prefix'] \
                                             + '___' \
                                             + predicate_solr_slug \
                                             + '___pred_' + field_parts['suffix']
                                else:
                                    # get facets for the predicate
                                    ffield = field_parts['prefix'] \
                                             + '___pred_' \
                                             + field_parts['suffix']
                                # adds a prefix, in case of a related property
                                ffield = solr_f_prefix + ffield
                                if ffield not in query_dict['facet.field'] \
                                   and i >= (path_list_len - 1):
                                    query_dict['facet.field'].append(ffield)
                            else:
                                act_field_fq = predicate_solr_slug + '___pred_' + field_parts['suffix']
                        # -------------------------------------------
                        if act_field_data_type == 'numeric':
                            # print('Numeric field: ' + act_field)
                            act_field_fq = field_parts['prefix'] + '___pred_numeric'
                            act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                            query_dict = self.add_math_facet_ranges(query_dict,
                                                                    act_field_fq,
                                                                    entity)
                        elif act_field_data_type == 'date':
                            # print('Date field: ' + act_field)
                            act_field_fq = field_parts['prefix'] + '___pred_date'
                            act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                            query_dict = self.add_date_facet_ranges(query_dict,
                                                                    act_field_fq,
                                                                    entity)
                        # print('Current data type (' + str(i) + '): ' + act_field_data_type)
                        # print('Current field (' + str(i) + '): ' + act_field_fq)
                    i += 1
                elif act_field_data_type == 'string':
                    # case for a text search
                    # last_field_label = False  # turn off using the field label for highlighting
                    string_terms = self.prep_string_search_term(prop_slug)
                    for escaped_term in string_terms:
                        search_term = act_field_fq + ':' + escaped_term
                        if last_field_label is False:
                            query_dict['hl-queries'].append(escaped_term)
                        else:
                            query_dict['hl-queries'].append(last_field_label + ' ' + escaped_term)
                        fq_path_terms.append(search_term)
                elif act_field_data_type == 'numeric':
                    # numeric search. assume it's well formed solr numeric request
                    search_term = act_field_fq + ':' + prop_slug
                    fq_path_terms.append(search_term)
                    # now limit the numeric ranges from query to the range facets
                    query_dict = self.add_math_facet_ranges(query_dict,
                                                            act_field_fq,
                                                            False,
                                                            prop_slug)
                elif act_field_data_type == 'date':
                    # date search. assume it's well formed solr request
                    search_term = act_field_fq + ':' + prop_slug
                    fq_path_terms.append(search_term)
                    # now limit the date ranges from query to the range facets
                    query_dict = self.add_date_facet_ranges(query_dict,
                                                            act_field_fq,
                                                            False,
                                                            prop_slug)
            final_path_term = ' AND '.join(fq_path_terms)
            final_path_term = '(' + final_path_term + ')'
            fq_terms.append(final_path_term)
        fq_final = ' OR '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        return query_dict
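
    # Rough sketch of the output shape (hypothetical slugs; assumes
    # SolrDocument.ROOT_PREDICATE_SOLR == 'root___pred_id'): the first step
    # of a prop path like 'has-taxon---aves' adds an fq term such as
    #
    #   (root___pred_id_fq:has-taxon OR obj_all___root___pred_id_fq:has-taxon)
    #
    # deeper steps are AND-ed onto it, alternative paths ('||') are OR-ed,
    # and the deepest step also contributes a facet.field entry.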

    def add_math_facet_ranges(self,
                              query_dict,
                              act_field,
                              entity=False,
                              solr_query=False):
        """ this does some math for facet
            ranges for numeric fields
        """
        ok = False
        groups = self.histogram_groups
        fstart = 'f.' + act_field + '.facet.range.start'
        fend = 'f.' + act_field + '.facet.range.end'
        fgap = 'f.' + act_field + '.facet.range.gap'
        findex = 'f.' + act_field + '.facet.sort'
        fother = 'f.' + act_field + '.facet.range.other'
        finclude = 'f.' + act_field + '.facet.range.include'
        if entity is not False:
            # this is a field with no value limits
            # we need to do a stats-prequery first
            query_dict['prequery-stats'].append(act_field)
        else:
            if solr_query is not False:
                vals = []
                # get the numbers out
                q_nums_strs = re.findall(r'[-+]?\d*\.\d+|\d+', solr_query)
                for q_num_str in q_nums_strs:
                    vals.append(float(q_num_str))
                vals.sort()
                if len(vals) > 1:
                    ok = True
                    min_val = vals[0]
                    max_val = vals[-1]
        if ok:
            if act_field not in query_dict['stats.field']:
                query_dict['stats.field'].append(act_field)
            if act_field not in query_dict['facet.range']:
                query_dict['facet.range'].append(act_field)
            query_dict['ranges'][fother] = 'all'
            query_dict['ranges'][finclude] = 'all'
            query_dict['ranges'][fstart] = min_val
            query_dict['ranges'][fend] = max_val
            query_dict['ranges'][fgap] = (max_val - min_val) / groups
            query_dict['ranges'][findex] = 'index'  # sort by index, not by count
        return query_dict
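
    # Illustration: called with act_field='aaa___pred_numeric' (hypothetical)
    # and solr_query='[0 TO 100]', and assuming self.histogram_groups == 10,
    # this sets range parameters like:
    #
    #   f.aaa___pred_numeric.facet.range.start = 0.0
    #   f.aaa___pred_numeric.facet.range.end   = 100.0
    #   f.aaa___pred_numeric.facet.range.gap   = 10.0
    #
    # With an entity instead, nothing is set yet; the field is queued in
    # query_dict['prequery-stats'] so a stats pre-query can find the limits.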

    def add_date_facet_ranges(self,
                              query_dict,
                              act_field,
                              entity=False,
                              solr_query=False):
        """ this does some math for facet
            ranges for numeric fields
        """
        ok = False
        groups = 4
        fstart = 'f.' + act_field + '.facet.range.start'
        fend = 'f.' + act_field + '.facet.range.end'
        fgap = 'f.' + act_field + '.facet.range.gap'
        findex = 'f.' + act_field + '.facet.sort'
        fother = 'f.' + act_field + '.facet.range.other'
        finclude = 'f.' + act_field + '.facet.range.include'
        if entity is not False:
            # this is a field with no value limits
            # we need to do a stats-prequery first
            query_dict['prequery-stats'].append(act_field)
        else:
            if solr_query is not False:
                q_dt_strs = re.findall(r'\d{4}-\d{2}-\d{2}[T:]\d{2}:\d{2}:\d{2}', solr_query)
                if len(q_dt_strs) < 2:
                    # try a less strict regular expression to get dates
                    q_dt_strs = re.findall(r'\d{4}-\d{2}-\d{2}', solr_query)
                if len(q_dt_strs) >= 2:
                    ok = True
                    vals = []
                    for q_dt_str in q_dt_strs:
                        vals.append(q_dt_str)
                    vals.sort()
                    min_val = vals[0]
                    max_val = vals[-1]
        if ok:
            if act_field not in query_dict['stats.field']:
                query_dict['stats.field'].append(act_field)
            if act_field not in query_dict['facet.range']:
                query_dict['facet.range'].append(act_field)
            query_dict['ranges'][fother] = 'all'
            query_dict['ranges'][finclude] = 'all'
            query_dict['ranges'][fstart] = self.convert_date_to_solr_date(min_val)
            query_dict['ranges'][fend] = self.convert_date_to_solr_date(max_val)
            query_dict['ranges'][fgap] = self.get_date_difference_for_solr(min_val, max_val, groups)
            query_dict['ranges'][findex] = 'index'  # sort by index, not by count
        return query_dict

    def get_date_difference_for_solr(self, min_date, max_date, groups):
        """ Gets a solr date difference from two values """
        min_dt = self.date_convert(min_date)
        max_dt = self.date_convert(max_date)
        dif_dt = (max_dt - min_dt) / groups
        if dif_dt.days >= 366:
            solr_val = int(round((dif_dt.days / 365.25), 0))
            solr_dif = '+' + str(solr_val) + 'YEAR'
        elif dif_dt.days >= 31:
            solr_val = int(round((dif_dt.days / 30), 0))
            solr_dif = '+' + str(solr_val) + 'MONTH'
        elif dif_dt.days >= 1:
            solr_val = int(round(dif_dt.days, 0))
            solr_dif = '+' + str(solr_val) + 'DAY'
        elif (dif_dt.seconds // 3600) >= 1:
            solr_val = int(round((dif_dt.seconds // 3600), 0))
            solr_dif = '+' + str(solr_val) + 'HOUR'
        elif ((dif_dt.seconds % 3600) // 60) >= 1:
            solr_val = int(round(((dif_dt.seconds % 3600) // 60), 0))
            solr_dif = '+' + str(solr_val) + 'MINUTE'
        elif dif_dt.seconds >= 1:
            solr_val = int(round(dif_dt.seconds, 0))
            solr_dif = '+' + str(solr_val) + 'SECOND'
        else:
            solr_dif = '+1YEAR'
        return solr_dif
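
    # Worked example: min_date='2000-01-01T00:00:00',
    # max_date='2010-01-01T00:00:00', groups=4. The span is 3653 days, so
    # each group covers 913 days; 913 / 365.25 rounds to 2, giving '+2YEAR'.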

    def add_solr_gap_to_date(self, date_val, solr_gap):
        """ adds a solr gap to a date_val """
        solr_val = re.sub(r'[^\d.]', r'', solr_gap)
        solr_val = int(float(solr_val))
        dt = self.date_convert(date_val)
        if 'YEAR' in solr_gap:
            dt = dt + datetime.timedelta(days=int(round((solr_val * 365.25), 0)))
        elif 'MONTH' in solr_gap:
            dt = dt + datetime.timedelta(days=(solr_val * 30))
        elif 'DAY' in solr_gap:
            dt = dt + datetime.timedelta(days=solr_val)
        elif 'HOUR' in solr_gap:
            dt = dt + datetime.timedelta(hours=solr_val)
        elif 'MINUTE' in solr_gap:
            dt = dt + datetime.timedelta(minutes=solr_val)
        elif 'SECOND' in solr_gap:
            dt = dt + datetime.timedelta(seconds=solr_val)
        # unrecognized gap units leave the date unchanged
        return dt
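
    # Illustration: add_solr_gap_to_date('2000-01-01T00:00:00', '+2YEAR')
    # strips the gap down to the integer 2 and advances the datetime by
    # int(round(2 * 365.25)) = 730 days, approximately two years.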

    def convert_date_to_solr_date(self, date_val):
        """ Conversts a string for a date into
            a Solr formated datetime string
        """
        dt = self.date_convert(date_val)
        return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
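
    # Example: convert_date_to_solr_date('2001-05-01T12:30:00Z') returns
    # '2001-05-01T12:30:00Z'; with the date-only fallback in date_convert
    # below, '2001-05-01' becomes '2001-05-01T00:00:00Z'.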

    def make_human_readable_date(self, date_val):
        """ Converts a date value into something
            easier to read
        """
        dt = self.date_convert(date_val)
        if (dt.hour, dt.minute, dt.second) == (0, 0, 0):
            # no time component, so the date alone is easier to read
            return dt.strftime('%Y-%m-%d')
        else:
            return dt.strftime('%Y-%m-%d:%H:%M:%S')

    def date_convert(self, date_val):
        """ converts to a python datetime if not already so """
        if isinstance(date_val, str):
            date_val = date_val.replace('Z', '')
            try:
                dt = datetime.datetime.strptime(date_val, '%Y-%m-%dT%H:%M:%S')
            except ValueError:
                # date-only strings (as matched by the looser regular
                # expression in add_date_facet_ranges) lack a time part
                dt = datetime.datetime.strptime(date_val, '%Y-%m-%d')
        else:
            dt = date_val
        return dt

    def get_parent_item_type_facet_field(self, category_uri):
        """ Gets the parent facet field for a given
            category_uri. This assumes the category_uri is an entity
            that exists in the database.
        """
        output = False
        parents = LinkRecursion().get_jsonldish_entity_parents(category_uri)
        for par in parents:
            if par['slug'] in self.TYPE_MAPPINGS.values():
                # the parent exists in the Type Mappings
                output = par['slug'].replace('-', '_') + '___pred_id'
                break
        return output

    def get_parent_entity_facet_field(self, entity_uri):
        """ Gets the parent facet field for a given
            category_uri. This assumes the category_uri is an entity
            that exists in the database.
        """
        output = False;
        parents = LinkRecursion().get_jsonldish_entity_parents(entity_uri)
        if isinstance(parents, list):
            if len(parents) > 1:
                # get the penultimate field
                output = parents[-2]['slug'].replace('-', '_') + '___pred_id'
        return output

    def process_item_type(self, raw_item_type):
        """ processes 'type' (item-type) parameters into
            solr query terms and facet fields
        """
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        item_type_lists = self.expand_hierarchy_options(raw_item_type)
        for item_type_list in item_type_lists:
            item_type = item_type_list[0]  # no hierarchy in this field, just the type
            fq_term = 'item_type:' + item_type
            fq_terms.append(fq_term)
            if item_type in self.TYPE_MAPPINGS:
                act_field = self.TYPE_MAPPINGS[item_type].replace('-', '_') + '___pred_id'
                query_dict['facet.field'].append(act_field)
        fq_final = ' OR '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def process_id(self, identifier):
        # check for identifier
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        escape_id = self.escape_solr_arg(identifier)
        fq_terms.append('persistent_uri:' + escape_id)
        # now make a DOI URI in case this is just a naked DOI
        doi_uri = self.escape_solr_arg('http://dx.doi.org/' + identifier)
        fq_terms.append('persistent_uri:' + doi_uri)
        # now make an ARK URI in case this is just a naked ARK
        ark_uri = self.escape_solr_arg('http://n2t.net/' + identifier)
        fq_terms.append('persistent_uri:' + ark_uri)
        # now make an ORCID URI in case this is just a naked ORCID
        orcid_uri = self.escape_solr_arg('http://orcid.org/' + identifier)
        fq_terms.append('persistent_uri:' + orcid_uri)
        fq_terms.append('uuid:' + escape_id)
        tcheck = URImanagement.get_uuid_from_oc_uri(identifier, True)
        if tcheck is not False:
            uuid = tcheck['uuid']
            fq_terms.append('uuid:' + uuid)
        fq_final = ' OR '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        # print(fq_final)
        return query_dict
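
    # Illustration with a naked (hypothetical) DOI: process_id('10.6078/M7P848VC')
    # ORs together matches on the escaped identifier itself, its DOI, ARK and
    # ORCID URI forms (e.g. 'persistent_uri:http\://dx.doi.org/10.6078/M7P848VC'),
    # and the uuid field.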

    def process_form_use_life_chrono(self, raw_form_use_life_chrono):
        # creates facet query for form-use-life chronological tiles
        # supports OR ('||') queries in the path also
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        query_dict['facet.field'].append('form_use_life_chrono_tile')
        if '||' in raw_form_use_life_chrono:
            chrono_paths = raw_form_use_life_chrono.split('||')
        else:
            chrono_paths = [raw_form_use_life_chrono]
        for chrono_path in chrono_paths:
            if len(chrono_path) < 30:
                chrono_path += '*'
            fq_term = 'form_use_life_chrono_tile:' + chrono_path
            fq_terms.append(fq_term)
        fq_final = ' OR '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        return query_dict
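
    # Illustration (hypothetical tile paths):
    # raw_form_use_life_chrono='10M-2000-0000||10M-4000-0000' yields
    #
    #   (form_use_life_chrono_tile:10M-2000-0000* OR
    #    form_use_life_chrono_tile:10M-4000-0000*)
    #
    # paths shorter than 30 characters get a trailing wildcard so deeper
    # (more precise) tiles still match.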

    def process_form_date_chrono(self, form_use_life_date, date_type):
        # creates facet query for form-use-life dates
        query_dict = {'fq': [],
                      'facet.field': []}
        if date_type == 'start':
            qterm = '[' + str(form_use_life_date) + ' TO *]'
            fquery = 'form_use_life_chrono_earliest: ' + qterm
        else:
            qterm = '[* TO ' + str(form_use_life_date) + ']'
            fquery = 'form_use_life_chrono_latest: ' + qterm
        query_dict['fq'].append(fquery)
        return query_dict

    def process_discovery_geo(self, raw_disc_geo):
        # creates facet query for discovery geotiles
        # supports OR ('||') queries in the path also
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        query_dict['facet.field'].append('discovery_geotile')
        if '||' in raw_disc_geo:
            disc_geo_paths = raw_disc_geo.split('||')
        else:
            disc_geo_paths = [raw_disc_geo]
        for disc_path in disc_geo_paths:
            if len(disc_path) < 20:
                disc_path += '*'
            fq_term = 'discovery_geotile:' + disc_path
            fq_terms.append(fq_term)
        fq_final = ' OR '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def process_discovery_bbox(self, raw_disc_bbox):
        # creates facet query for bounding box searches
        # supports OR ('||') queries
        query_dict = {'fq': []}
        fq_terms = []
        if '||' in raw_disc_bbox:
            bbox_list = raw_disc_bbox.split('||')
        else:
            bbox_list = [raw_disc_bbox]
        for bbox in bbox_list:
            if ',' in bbox:
                # comma-separated list of coordinates
                bbox_coors = bbox.split(',')
                bbox_valid = self.validate_bbox_coordiantes(bbox_coors)
                if bbox_valid:
                    # valid bounding box, now make a solr-query
                    # note how solr expects latitude / longitude order, which
                    # is the reverse of geojson!
                    fq_term = 'discovery_geolocation:'
                    fq_term += '[' + str(bbox_coors[1]) + ',' + str(bbox_coors[0])
                    fq_term += ' TO ' + str(bbox_coors[3]) + ',' + str(bbox_coors[2])
                    fq_term += ']'
                    fq_terms.append(fq_term)
        if len(fq_terms) > 0:
            fq_final = ' OR '.join(fq_terms)
            fq_final = '(' + fq_final + ')'
            query_dict['fq'].append(fq_final)
        return query_dict
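
    # Illustration: raw_disc_bbox='-10,30,20,45' (GeoJSON order: lon,lat of
    # the lower-left corner, then lon,lat of the top-right corner) validates
    # and becomes 'discovery_geolocation:[30,-10 TO 45,20]', with latitude
    # first as solr expects.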

    def validate_bbox_coordiantes(self, bbox_coors):
        """ validates a set of bounding box coordinates """
        is_valid = False
        if len(bbox_coors) == 4:
            lower_left_valid = self.validate_geo_lon_lat(bbox_coors[0],
                                                         bbox_coors[1])
            top_right_valid = self.validate_geo_lon_lat(bbox_coors[2],
                                                        bbox_coors[3])
            # print('ok: ' + str(lower_left_valid) + ' ' + str(top_right_valid))
            if lower_left_valid and top_right_valid:
                if float(bbox_coors[0]) < float(bbox_coors[2]) and\
                   float(bbox_coors[1]) < float(bbox_coors[3]):
                    is_valid = True
        return is_valid

    def validate_geo_lon_lat(self, lon, lat):
        """ checks to see if a lon, lat pair
            are valid. Note the GeoJSON ordering
            of the coordinates
        """
        is_valid = False
        lon_valid = self.validate_geo_coordinate(lon, 'lon')
        lat_valid = self.validate_geo_coordinate(lat, 'lat')
        if lon_valid and lat_valid:
            is_valid = True
        return is_valid

    def validate_geo_coordinate(self, coordinate, coord_type):
        """ validates a geo-spatial coordinate """
        is_valid = False
        try:
            fl_coord = float(coordinate)
        except ValueError:
            fl_coord = False
        if fl_coord is not False:
            if 'lat' in coord_type:
                if fl_coord <= 90 and\
                   fl_coord >= -90:
                    is_valid = True
            elif 'lon' in coord_type:
                if fl_coord <= 180 and\
                   fl_coord >= -180:
                    is_valid = True
        return is_valid

    def make_solr_value_from_entity(self, entity, value_type='id'):
        """ makes a solr value as indexed in SolrDocument
            see _concat_solr_string_value
        """
        id_part = entity.uri
        if 'http://opencontext.org' in entity.uri:
            if '/vocabularies/' not in entity.uri:
                id_part = entity.uri.split('http://opencontext.org')[1]
        return entity.slug + '___' + value_type + '___' + \
            id_part + '___' + entity.label
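
    # Example of the '___'-delimited value this produces (hypothetical
    # entity): slug 'some-site', label 'Some Site', and uri
    # 'http://opencontext.org/subjects/abc123' yield
    # 'some-site___id___/subjects/abc123___Some Site'
    # (the host is stripped for opencontext.org uris outside /vocabularies/).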

    def _process_spatial_context(self, spatial_context=None):
        """ Processes a spatial context path into
            solr 'fq' and 'facet.field' parameters
        """
        context = {}
        if spatial_context:
            context_paths = self._get_context_paths(spatial_context)
            context_slugs = self._get_valid_context_slugs(context_paths)
            # print('Context slugs: ' + str(context_slugs))
            # If we cannot find a valid context, raise a 404
            if not context_slugs:
                raise Http404
            # Solr 'fq' parameters
            parent_child_slugs = []
            # Solr 'facet.field' parameters
            facet_field = []
            for slug in context_slugs:
                # fq parameters
                parent_child_slugs.append(self._get_parent_slug(slug) + '___' + slug)
                # facet.field parameters
                facet_field.append(slug.replace('-', '_') + '___context_id')
            # First, handle the most likely scenario of a single context
            if len(parent_child_slugs) == 1:
                context['fq'] = self._prepare_filter_query(parent_child_slugs[0])
            # Otherwise, combine multiple contexts into an OR filter
            else:
                fq_string = ' OR '.join(
                    (self._prepare_filter_query(slug_set) for slug_set
                        in parent_child_slugs)
                    )
                context['fq'] = '(' + fq_string + ')'
            context['facet.field'] = facet_field
        # No spatial context provided
        else:
            context['fq'] = None
            context['facet.field'] = ['root___context_id']
        return context

    def prep_string_search_term(self, raw_term):
        """ prepares a string search
            returns a list of search terms
            for AND queries
        """
        if '"' in raw_term:
            nq_term = raw_term.replace('"', ' ')  # get rid of quotes in the search term
            quoted_list = re.findall(r"\"(.*?)\"", raw_term)
            terms = []
            terms.append(self.escape_solr_arg(nq_term))
            for quote_item in quoted_list:
                quote_item = self.escape_solr_arg(quote_item)  # escape characters
                quote_item = '"' + quote_item + '"'  # put quotes back around it
                terms.append(quote_item)
        else:
            terms = []
            terms.append(self.escape_solr_arg(raw_term))
        return terms
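
    # Illustration: prep_string_search_term('bone "red deer"') returns two
    # AND terms: the whole phrase with its quotes stripped and solr-escaped,
    # plus '"red\ deer"' with quotes restored, so the quoted phrase must
    # match exactly.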

    def escaped_seq(self, term):
        """ Yield the next string based on the
            next character (either this char
            or escaped version """
        escaperules = {'+': r'\+',
                       '-': r'\-',
                       '&': r'\&',
                       '|': r'\|',
                       '!': r'\!',
                       '(': r'\(',
                       ')': r'\)',
                       '{': r'\{',
                       '}': r'\}',
                       '[': r'\[',
                       ']': r'\]',
                       '^': r'\^',
                       '~': r'\~',
                       '*': r'\*',
                       '?': r'\?',
                       ':': r'\:',
                       '"': r'\"',
                       ';': r'\;',
                       ' ': r'\ '}
        for char in term:
            if char in escaperules:
                yield escaperules[char]
            else:
                yield char

    def escape_solr_arg(self, term):
        """ Apply escaping to the passed in query terms
            escaping special characters like : , etc"""
        term = term.replace('\\', r'\\')   # escape \ first
        return "".join([next_str for next_str in self.escaped_seq(term)])
Example no. 38
class RecordProperties():
    """ Methods to make properties for individual record items
        useful for making geospatial feature records or
        lists of items without geospatial data
    """
    ATTRIBUTE_DELIM = '; '  # delimiter for multiple attributes

    def __init__(self, request_dict_json=False):
        self.uuid = False
        self.uri = False  # canonical uri for the item
        self.href = False  # link to the item in the current deployment
        self.cite_uri = False  # stable / persistent uri
        self.label = False
        self.item_type = False
        self.updated = False
        self.published = False
        self.project_href = False  # link to the project in current deployment
        self.project_uri = False  # canonical uri for the project
        self.project_label = False
        self.context_href = False  # link to parent context in current deployment
        self.context_uri = False  # link to parent context canonical uri
        self.context_label = False
        self.category = False
        self.latitude = False
        self.longitude = False
        self.geojson = False
        self.early_date = False
        self.late_date = False
        self.thumbnail_href = False
        self.thumbnail_uri = False
        self.thumbnail_scr = False
        self.preview_scr = False
        self.fullfile_scr = False
        self.snippet = False
        self.cite_uri = False  # stable identifier as an HTTP uri
        self.other_attributes = False  # other attributes to the record
        # flatten list of an attribute values to single value
        self.flatten_rec_attributes = False
        # A list of (non-standard) attributes to include in a record
        self.rec_attributes = []
        self.attribute_hierarchies = {}
        self.base_url = settings.CANONICAL_HOST
        rp = RootPath()
        self.base_url = rp.get_baseurl()
        self.mem_cache_obj = MemoryCache()  # memory caching object
        self.request_dict_json = request_dict_json
        if request_dict_json is not False:
            self.request_dict = json.loads(request_dict_json)
        else:
            self.request_dict = False
        self.highlighting = False
        self.recursive_count = 0
        self.min_date = False
        self.max_date = False
        self.thumbnail_data = {}
        self.media_file_data = {}
        self.string_attrib_data = {}

    def parse_solr_record(self, solr_rec):
        """ Parses a solr rec object """
        if isinstance(solr_rec, dict):
            self.get_item_basics(solr_rec)
            self.get_citation_uri(solr_rec)
            self.get_lat_lon(solr_rec)
            self.get_category(solr_rec)
            self.get_project(solr_rec)
            self.get_context(solr_rec)
            self.get_time(solr_rec)  # get time information, limiting date ranges to query constraints
            self.get_thumbnail(solr_rec)
            self.get_media_files(solr_rec)
            self.get_snippet(solr_rec)  # get snippet of highlighted text
            self.get_attributes(solr_rec)  # get non-standard attributes
            self.get_string_attributes(solr_rec)  # get non-standard string attributes

    def get_item_basics(self, solr_rec):
        """ get basic metadata for an item """
        output = False
        if isinstance(solr_rec, dict):
            if 'uuid' in solr_rec:
                self.uuid = solr_rec['uuid']
            if 'slug_type_uri_label' in solr_rec:
                id_parts = self.parse_solr_value_parts(solr_rec['slug_type_uri_label'])
                if id_parts is not False:
                    output = True
                    self.uri = self.make_url_from_val_string(id_parts['uri'], True)
                    self.href = self.make_url_from_val_string(id_parts['uri'], False)
                    item_type_output = URImanagement.get_uuid_from_oc_uri(self.uri, True)
                    self.item_type = item_type_output['item_type']
                    self.label = id_parts['label']
            if 'updated' in solr_rec:
                self.updated = solr_rec['updated']
            if 'published' in solr_rec:
                self.published = solr_rec['published']
        return output

    def get_snippet(self, solr_rec):
        """ get a text highlighting snippet """
        if isinstance(self.highlighting, dict):
            if self.uuid is False:
                if 'uuid' in solr_rec:
                    self.uuid = solr_rec['uuid']
            if self.uuid in self.highlighting:
                if 'text' in self.highlighting[self.uuid]:
                    text_list = self.highlighting[self.uuid]['text']
                    self.snippet = ' '.join(text_list)
                    # some processing to remove fragments of HTML markup.
                    self.snippet = self.snippet.replace('<em>', '[[[[mark]]]]')
                    self.snippet = self.snippet.replace('</em>', '[[[[/mark]]]]')
                    try:
                        self.snippet = '<div>' + self.snippet + '</div>'
                        self.snippet = lxml.html.fromstring(self.snippet).text_content()
                        self.snippet = strip_tags(self.snippet)
                    except:
                        self.snippet = strip_tags(self.snippet)
                    self.snippet = self.snippet.replace('[[[[mark]]]]', '<em>')
                    self.snippet = self.snippet.replace('[[[[/mark]]]]', '</em>')

    def get_citation_uri(self, solr_rec):
        """ gets the best citation / persistent uri for the item """
        if 'persistent_uri' in solr_rec:
            for p_uri in solr_rec['persistent_uri']:
                self.cite_uri = p_uri
                if 'dx.doi.org' in p_uri:
                    break  # stop looking once we have a DOI, the best

    def get_lat_lon(self, solr_rec):
        """ gets latitute and longitude information """
        if 'discovery_geolocation' in solr_rec:
            geo_strings = solr_rec['discovery_geolocation']
            geo_coords_str = geo_strings.split(',')
            # NOT geojson ordering, since solr uses lat/lon ordering
            self.latitude = float(geo_coords_str[0])
            self.longitude = float(geo_coords_str[1])

    def get_category(self, solr_rec):
        """ Gets the most specific category for the item """
        self.recursive_count = 0
        cat_hierarchy = self.get_category_hierarchy(solr_rec)
        if len(cat_hierarchy) > 0:
            self.category = cat_hierarchy[-1]['label']

    def get_context(self, solr_rec):
        """ Get the most specific context parent for the record """
        self.recursive_count = 0
        contexts = self.extract_hierarchy(solr_rec,
                                          SolrDocument.ROOT_CONTEXT_SOLR,
                                          '___context',
                                          [])
        if len(contexts) > 0:
            self.context_label = self.make_context_path_label(contexts)
            self.context_uri = self.make_context_link(contexts, True)
            self.context_href = self.make_context_link(contexts, False)

    def get_project(self, solr_rec):
        """ Get the most specific project for the record """
        self.recursive_count = 0
        projects = self.extract_hierarchy(solr_rec,
                                          SolrDocument.ROOT_PROJECT_SOLR,
                                          '___project',
                                          [])
        if len(projects) > 0:
            self.project_label = projects[-1]['label']
            self.project_uri = self.make_url_from_val_string(projects[-1]['uri'],
                                                             True)
            self.project_href = self.make_url_from_val_string(projects[-1]['uri'],
                                                              False)

    def get_time(self, solr_rec):
        """ parses time information """
        early_list = False
        late_list = False
        if 'form_use_life_chrono_earliest' in solr_rec:
            early_list = solr_rec['form_use_life_chrono_earliest']
        if 'form_use_life_chrono_latest' in solr_rec:
            late_list = solr_rec['form_use_life_chrono_latest']
        if isinstance(early_list, list):
            date_list = early_list
        else:
            date_list = []
        if isinstance(late_list, list):
            date_list += late_list
        if len(date_list) > 0:
            min_max = self.get_list_min_max(date_list)
            self.early_date = min(min_max)
            self.late_date = max(min_max)

    def get_list_min_max(self, date_list):
        """ Returns the minimum and maximum dates
            from a date list, constrained by
            preset min and max dates
        """
        min_date = False
        max_date = False
        # print(str(date_list))
        if isinstance(date_list, list):
            date_list.sort()
            for date in date_list:
                if self.min_date is not False:
                    if date >= self.min_date \
                       and min_date is False:
                        min_date = date
                if self.max_date is not False:
                    if date <= self.max_date:
                        max_date = date
        if min_date is False:
            min_date = self.min_date
        if max_date is False:
            max_date = self.max_date
        return [min_date, max_date]
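
    # Illustration: with self.min_date = -1000 and self.max_date = 1950 set
    # from the query, get_list_min_max([-2000, -500, 1800, 2000]) returns
    # [-500, 1800]; dates outside the query's bounds are ignored, and the
    # preset bounds are used as fallbacks when nothing qualifies.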

    def get_thumbnail(self, solr_rec):
        """ get media record and thumbnai if it exists """
        if 'uuid' in solr_rec:
            uuid = solr_rec['uuid']
            if uuid in self.thumbnail_data:
                if self.thumbnail_data[uuid] is not False:
                    self.thumbnail_href = self.thumbnail_data[uuid]['href']
                    self.thumbnail_uri = self.thumbnail_data[uuid]['uri']
                    self.thumbnail_scr = self.thumbnail_data[uuid]['scr']
                    rp = RootPath()
                    self.thumbnail_scr = rp.convert_to_https(self.thumbnail_scr)
            else:
                # did not precache thumbnail data, get an individual record
                self.get_thumbnail_from_database(solr_rec)

    def get_media_files(self, solr_rec):
        """ get media record and thumbnai if it exists """
        if 'uuid' in solr_rec:
            uuid = solr_rec['uuid']
            if uuid in self.media_file_data:
                if self.media_file_data[uuid] is not False:
                    rp = RootPath()
                    for file_type, file_uri in self.media_file_data[uuid].items():
                        if file_type == 'oc-gen:thumbnail':
                            self.thumbnail_scr = rp.convert_to_https(file_uri)
                        elif file_type == 'oc-gen:preview':
                            self.preview_scr = rp.convert_to_https(file_uri)
                        elif file_type == 'oc-gen:fullfile':
                            self.fullfile_scr = rp.convert_to_https(file_uri)

    def get_thumbnail_from_database(self, solr_rec):
        """ get media record and thumbnail, if it exists """
        if 'uuid' in solr_rec:
            uuid = solr_rec['uuid']
            thumb = []
            if self.item_type != 'media':
                media_item = Assertion.objects\
                                      .filter(uuid=uuid,
                                              object_type='media')[:1]
                if len(media_item) > 0:
                    muuid = media_item[0].object_uuid
                    thumb = Mediafile.objects\
                                     .filter(uuid=muuid,
                                             file_type='oc-gen:thumbnail')[:1]
            else:
                # do this for media items
                muuid = uuid
                thumb = Mediafile.objects\
                                 .filter(uuid=uuid,
                                         file_type='oc-gen:thumbnail')[:1]
            if len(thumb) > 0:
                self.thumbnail_href = self.base_url + '/media/' + muuid
                self.thumbnail_uri = settings.CANONICAL_HOST + '/media/' + muuid
                self.thumbnail_scr = thumb[0].file_uri

    def get_category_hierarchy(self, solr_rec):
        """ gets the most specific category
            informtation about
            an item
        """
        cat_hierarchy = []
        if 'item_type' in solr_rec:
            item_type = solr_rec['item_type'][0]
            root_cat_field = 'oc_gen_' + item_type + '___pred_id'
            cat_hierarchy = self.extract_hierarchy(solr_rec,
                                                   root_cat_field,
                                                   '___pred',
                                                   [])
        return cat_hierarchy

    """ The following seciton of code
        processes non-default attributes for records
    """
    def get_attributes(self, solr_rec):
        """ gets attributes for a record, based on the
            predicates requested in the search
            and optional predicates passed by a client
            with a GET request with parameter 'attributes'
        """
        qm = QueryMaker()
        for attribute in self.rec_attributes:
            entity = self.mem_cache_obj.get_entity(attribute, False)
            if entity is not False:
                prop_slug = entity.slug
                # check to make sure we have the entity data type for linked fields
                if entity.data_type is False and entity.item_type == 'uri':
                    dtypes = self.mem_cache_obj.get_dtypes(entity.uri)
                    if isinstance(dtypes, list):
                        # set the data type and the act-field
                        # print('Found for ' + prop_slug + ' ' + dtypes[0])
                        entity.data_type = dtypes[0]
                field_parts = qm.make_prop_solr_field_parts(entity)
                solr_field = field_parts['prefix'] + '___pred_' + field_parts['suffix']
                # print('Found: ' + solr_field)
                # extract children of the solr_field so we know if
                # we have the most specific attributes, then we can get
                # values for the most specific attributes
                self.extract_attribute_children(solr_rec, solr_field)
        self.clean_attribute_hiearchies()
        if isinstance(self.attribute_hierarchies, dict):
            self.other_attributes = []
            for field_slug_key, values in self.attribute_hierarchies.items():
                entity = self.mem_cache_obj.get_entity(field_slug_key, False)
                if entity is not False:
                    attribute_dict = LastUpdatedOrderedDict()
                    attribute_dict['property'] = entity.label
                    attribute_dict['values_list'] = []
                    attribute_dict['value'] = ''
                    string_val = False
                    delim = ''
                    for val in values:
                        if isinstance(val, str):
                            string_val = True
                            parsed_val = self.parse_solr_value_parts(val)
                            attribute_dict['values_list'].append(parsed_val['label'])
                            attribute_dict['value'] += delim + str(parsed_val['label'])
                        else:
                            attribute_dict['values_list'].append(val)
                            attribute_dict['value'] += delim + str(val)
                        delim = self.ATTRIBUTE_DELIM
                    if len(values) == 1 \
                       and string_val is False:
                        attribute_dict['value'] = values[0]
                    self.other_attributes.append(attribute_dict)

    def get_string_attributes(self, solr_rec):
        """ gets string attributes for a solr rec, from a previous database query
            needed because solr does not cache string field data
        """
        if isinstance(self.string_attrib_data, dict):
            # now add predicate attributes for string predicates, from the database
            if 'uuid' in solr_rec and 'data' in self.string_attrib_data:
                uuid = solr_rec['uuid']
                if uuid in self.string_attrib_data['data']:
                    item_data = self.string_attrib_data['data'][uuid]
                    for pred_uuid, values_list in item_data.items():
                        act_attribute = self.string_attrib_data['pred_ents'][pred_uuid]
                        act_attribute['values_list'] = values_list
                        act_attribute['value'] = self.ATTRIBUTE_DELIM.join(values_list)
                        self.other_attributes.append(act_attribute)

    def prevent_attribute_key_collision(self, item_prop_dict, prop_key):
        """ checks to make sure there's no collision between the prop_key
            and the dict that it will be added to
        """
        i = 2
        output_prop_key = prop_key
        while output_prop_key in item_prop_dict:
            output_prop_key = prop_key + '[' + str(i) + ']'
            i += 1
        return output_prop_key

    def clean_attribute_hiearchies(self):
        """ some post-processing to make sure
            we have clean attribute hierarchies
        """
        if isinstance(self.attribute_hierarchies, dict):
            # print('check: ' + str(self.attribute_hierarchies))
            temp_attribute_hierarchies = self.attribute_hierarchies
            clean_attribute_hiearchies = {}
            for solr_field_key, field_char in self.attribute_hierarchies.items():
                if field_char['most-specific']:
                    par_field_ex = solr_field_key.split('___')
                    # last two parts make the suffix, a pred-slug[-2] and a field type [-1]
                    pred_suffix = '___' + par_field_ex[-2] + '___' + par_field_ex[-1]
                    specific_ok = True
                    for val in field_char['values']:
                        if isinstance(val, str):
                            #  print('check:' + solr_field_key + ' val: ' + val)
                            parsed_val = self.parse_solr_value_parts(val)
                            check_field = parsed_val['slug'].replace('-', '_')
                            check_field += '___pred_' + parsed_val['data_type']
                            if check_field in temp_attribute_hierarchies:
                                # note a field is NOT at the most specific level
                                specific_ok = False
                            else:
                                # now check a version with the predicate as part of
                                # the solr field
                                check_field = parsed_val['slug'].replace('-', '_')
                                check_field += pred_suffix
                                if check_field in temp_attribute_hierarchies:
                                    # note a field is NOT at the most specific level
                                    specific_ok = False
                    if specific_ok:
                        # ok to add
                        # print('checked OK: ' + solr_field_key)
                        clean_attribute_hiearchies[solr_field_key] = field_char
            # now that we got rid of problem fields, let's sort these for
            # consistent rendering
            self.attribute_hierarchies = LastUpdatedOrderedDict()
            keys = LastUpdatedOrderedDict()
            # order of key types, we want id fields, followed by numeric then date
            key_types = ['___pred_id',
                         '___pred_numeric',
                         '___pred_date']
            for key_type in key_types:
                keys[key_type] = []
                for solr_field_key, field_char in clean_attribute_hiearchies.items():
                    if key_type in solr_field_key:
                        keys[key_type].append(solr_field_key)
                # sort alphabetically. Slugs are useful, since they will cluster
                # predicates from similar vocabularies
                keys[key_type].sort()
                for key in keys[key_type]:
                    field_char = clean_attribute_hiearchies[key]
                    field_ex = key.split('___')
                    # the penultimate part is the predicate
                    field_slug = field_ex[-2].replace('_', '-')
                    if field_slug not in self.attribute_hierarchies:
                        self.attribute_hierarchies[field_slug] = []
                    for val in field_char['values']:
                        if val not in self.attribute_hierarchies[field_slug]:
                            self.attribute_hierarchies[field_slug].append(val)

    def extract_attribute_children(self,
                                   solr_rec,
                                   solr_field_key):
        """ extracts ALL children from the hiearchy of
            a solr_field_key
        """
        is_field = False
        if solr_field_key not in self.attribute_hierarchies:
            # so we don't look at the same thing twice!
            if solr_field_key in solr_rec:
                is_field = True
                field_char = {'most-specific': False,
                              'values': []}
                if '___pred_numeric' in solr_field_key \
                   or '___pred_date' in solr_field_key:
                    field_char['most-specific'] = True
                    field_char['values'] = solr_rec[solr_field_key]
                elif '___pred_id' in solr_field_key:
                    # make a suffix from the parent predicate slug and field type
                    par_field_ex = solr_field_key.split('___')
                    # last two parts make the suffix, a pred-slug[-2] and a field type [-1]
                    pred_suffix = '___' + par_field_ex[-2] + '___' + par_field_ex[-1]
                    childless_children = []
                    for child_val in solr_rec[solr_field_key]:
                        # print('Child: ' + solr_field_key + ': ' + child_val)
                        parsed_path_item = self.parse_solr_value_parts(child_val)
                        new_field_prefix = parsed_path_item['slug'].replace('-', '_')
                        new_field_key = new_field_prefix + '___pred_' + parsed_path_item['data_type']
                        if parsed_path_item['data_type'] == 'id':
                            child_is_field = self.extract_attribute_children(solr_rec,
                                                                             new_field_key)
                            if child_is_field is False:
                                # now check an alternative combining the child
                                # slug with the predicate of the parent
                                new_field_key = new_field_prefix + pred_suffix
                                # print('check: ' + new_field_key)
                                child_is_field = self.extract_attribute_children(solr_rec,
                                                                                 new_field_key)
                                if child_is_field is False:
                                    childless_children.append(child_val)
                    if len(childless_children) > 0:
                        field_char['most-specific'] = True
                        field_char['values'] = childless_children
                else:
                    pass
                self.attribute_hierarchies[solr_field_key] = field_char
        return is_field

    def extract_hierarchy(self,
                          solr_rec,
                          facet_field_key,
                          facet_suffix,
                          hierarchy=None,
                          pred_field=False):
        """ extracts a hierarchy from a solr_record.
            The output is a list starting with the most
            general parent of the hierarchy,
            then going to the most specific

            This is a recursive function and
            default / starts with the root
            of the hierarchy as the facet_field_key

            This only follows a single path (not multiple paths)
        """
        if hierarchy is None:
            # avoid sharing a mutable default list across calls
            hierarchy = []
        alt_facet_field_key = facet_field_key
        if pred_field is not False:
            # do this to allow search of a hierarchy in a named
            # predicate field
            f_parts = facet_field_key.split('___')
            if len(f_parts) == 2:
                alt_f_parts = [f_parts[0],
                               pred_field.replace('-', '_'),
                               f_parts[1]]
                alt_facet_field_key = '___'.join(alt_f_parts)
                # print('Check: ' + facet_field_key + ', ' + alt_facet_field_key)
        if (facet_field_key in solr_rec or alt_facet_field_key in solr_rec)\
           and self.recursive_count < 20:
            self.recursive_count += 1
            if facet_field_key in solr_rec:
                path_item_val = solr_rec[facet_field_key][0]
            else:
                path_item_val = solr_rec[alt_facet_field_key][0]
            parsed_path_item = self.parse_solr_value_parts(path_item_val)
            if isinstance(parsed_path_item, dict):
                hierarchy.append(parsed_path_item)
                new_facet_field = parsed_path_item['slug'].replace('-', '_')
                new_facet_field += facet_suffix + '_' + parsed_path_item['data_type']
                # print('New hierarchy field: ' + new_facet_field)
                hierarchy = self.extract_hierarchy(solr_rec,
                                                   new_facet_field,
                                                   facet_suffix,
                                                   hierarchy)
        return hierarchy
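    # Sketch of the recursion above (hypothetical solr_rec, illustration only):
    # if solr_rec has 'root___cat_id': ['region-a___id___/subjects/1___Region A']
    # and 'region_a___cat_id': ['site-b___id___/subjects/2___Site B'], then
    # extract_hierarchy(solr_rec, 'root___cat_id', '___cat') returns the parsed
    # dicts for Region A then Site B, most general concept first.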

    def make_context_path_label(self, contexts):
        """ Makes a '/' delimited context
            path for easy human readability
        """
        context_path = False
        if len(contexts) > 0:
            context_labels = []
            for context in contexts:
                context_labels.append(context['label'])
            context_path = '/'.join(context_labels)
        return context_path

    def make_context_link(self, contexts, cannonical=False):
        """ makes a URI for a context """
        context_uri = False
        if len(contexts) > 0:
            context_uri = self.make_url_from_val_string(contexts[-1]['uri'],
                                                        cannonical)
        return context_uri

    def make_url_from_val_string(self,
                                 partial_url,
                                 use_cannonical=True):
        """ parses a solr value if it has
            '___' delimiters, to get the URI part
            string.
            If it's already a URI part, it makes
            a URL
        """
        if use_cannonical:
            base_url = settings.CANONICAL_HOST
        else:
            base_url = self.base_url
        solr_parts = self.parse_solr_value_parts(partial_url)
        if isinstance(solr_parts, dict):
            partial_url = solr_parts['uri']
        if 'http://' not in partial_url \
           and 'https://' not in partial_url:
            url = base_url + partial_url
        else:
            url = partial_url
        return url

    def add_record_fields(self):
        """ adds fields to include in the GeoJSON properties """
        if 'rec-field' in self.response_dict:
            raw_rec_fields = self.response_dict['rec-field'][0]
            if ',' in raw_rec_fields:
                self.record_fields = raw_rec_fields.split(',')
            else:
                self.record_fields = [raw_rec_fields]
        else:
            self.record_fields = []
        return self.record_fields

    def parse_solr_value_parts(self, solr_value):
        """ parses a solr_value string into
            slug, solr-data-type, uri, and label
            parts
        """
        output = False
        if isinstance(solr_value, str):
            if '___' in solr_value:
                solr_ex = solr_value.split('___')
                if len(solr_ex) == 4:
                    output = {}
                    output['slug'] = solr_ex[0]
                    output['data_type'] = solr_ex[1]
                    output['uri'] = solr_ex[2]
                    output['label'] = solr_ex[3]
            else:
                output = solr_value
        else:
            output = solr_value
        return output
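    # Usage sketch (hypothetical solr value, illustration only):
    #
    #   parts = self.parse_solr_value_parts(
    #       'oc-gen-cat-object___id___/vocabularies/oc-general/cat-object___Object')
    #   # parts == {'slug': 'oc-gen-cat-object', 'data_type': 'id',
    #   #           'uri': '/vocabularies/oc-general/cat-object',
    #   #           'label': 'Object'}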

    def get_solr_record_uuid_type(self, solr_rec):
        """ get item uuid, label, and type from a solr_rec """
        output = False
        if isinstance(solr_rec, dict):
            output = {'uuid': False,
                      'label': False,
                      'item_type': False}
            if 'uuid' in solr_rec:
                output['uuid'] = solr_rec['uuid']
            if 'slug_type_uri_label' in solr_rec:
                id_parts = self.parse_solr_value_parts(solr_rec['slug_type_uri_label'])
                if id_parts is not False:
                    uri = self.make_url_from_val_string(id_parts['uri'], True)
                    item_type_output = URImanagement.get_uuid_from_oc_uri(uri, True)
                    if isinstance(item_type_output, dict):
                        # guard against URIs that are not Open Context item URIs
                        output['item_type'] = item_type_output['item_type']
                    output['label'] = id_parts['label']
        return output

    def get_key_val(self, key, dict_obj):
        """ returns the value associated
            with a key, if the key exists
            else, none
        """
        output = None
        if isinstance(dict_obj, dict):
            if key in dict_obj:
                output = dict_obj[key]
        return output
Example no. 39
class SolrUUIDs():
    """ methods to make get UUIDs from a solr
        search result JSON document,

        also makes URIs
    """

    def __init__(self, response_dict_json=False):
        rp = RootPath()
        self.base_url = rp.get_baseurl()
        self.uuids = []
        self.uris = []
        self.mem_cache_obj = MemoryCache()  # memory caching object
        self.response_dict_json = response_dict_json
        self.highlighting = False
        # make values to these fields "flat" not a list
        self.flatten_rec_fields = True
        self.total_found = False
        self.rec_start = False
        self.min_date = False
        self.max_date = False
        # flatten list of an attribute values to single value
        self.flatten_rec_attributes = False
        # A list of (non-standard) attributes to include in a record
        self.rec_attributes = []
        self.do_media_thumbs = True  # get thumbnails for records
        self.get_all_media = False  # get links to all media files for an item

    def make_uuids_from_solr(self, solr_json):
        """ makes geojson-ld point records from a solr response """
        #first do lots of checks to make sure the solr-json is OK
        solr_recs = self.extract_solr_recs(solr_json)
        if isinstance(solr_recs, list):
            for solr_rec in solr_recs:
                if 'uuid' in solr_rec:
                    uuid = solr_rec['uuid']
                    self.uuids.append(uuid)
        return self.uuids

    def make_uris_from_solr(self, solr_json, uris_only=True):
        """ processes the solr_json to
             make GeoJSON records
        """
        solr_recs = self.extract_solr_recs(solr_json)
        if isinstance(solr_recs, list):
            if uris_only:
                self.do_media_thumbs = False
            if self.get_all_media:
                self.do_media_thumbs = False
            if 'thumbnail' in self.rec_attributes:
                self.do_media_thumbs = True
            thumbnail_data = self.get_media_thumbs(solr_recs)
            media_file_data = self.get_all_media_files(solr_recs)
            string_attrib_data = self.get_string_rec_attributes(solr_recs)
            for solr_rec in solr_recs:
                rec_props_obj = RecordProperties(self.response_dict_json)
                rec_props_obj.mem_cache_obj = self.mem_cache_obj
                rec_props_obj.min_date = self.min_date
                rec_props_obj.max_date = self.max_date
                rec_props_obj.highlighting = self.highlighting
                rec_props_obj.flatten_rec_attributes = self.flatten_rec_attributes
                rec_props_obj.rec_attributes = self.rec_attributes
                rec_props_obj.thumbnail_data = thumbnail_data
                rec_props_obj.media_file_data = media_file_data
                rec_props_obj.string_attrib_data = string_attrib_data
                item_ok = rec_props_obj.get_item_basics(solr_rec)
                if item_ok:
                    if uris_only:
                        item = rec_props_obj.uri
                    else:
                        rec_props_obj.parse_solr_record(solr_rec)
                        self.mem_cache_obj = rec_props_obj.mem_cache_obj  # add to existing list of entities, reduce lookups
                        item = self.make_item_dict_from_rec_props_obj(rec_props_obj)
                    self.uris.append(item)
        return self.uris

    def make_item_dict_from_rec_props_obj(self, rec_props_obj, cannonical=True):
        """ makes item dictionary object from a record prop obj """
        item = LastUpdatedOrderedDict()
        item['uri'] = rec_props_obj.uri
        if cannonical is False or 'href' in self.rec_attributes:
            item['href'] = rec_props_obj.href
        item['citation uri'] = rec_props_obj.cite_uri
        item['label'] = rec_props_obj.label
        item['project label'] = rec_props_obj.project_label
        if cannonical:
            item['project uri'] = rec_props_obj.project_uri
        else:
            item['project href'] = rec_props_obj.project_href
        item['context label'] = rec_props_obj.context_label
        if cannonical:
            item['context uri'] = rec_props_obj.context_uri
        else:
            item['context href'] = rec_props_obj.context_href
        item['latitude'] = rec_props_obj.latitude
        item['longitude'] = rec_props_obj.longitude
        item['early bce/ce'] = rec_props_obj.early_date
        item['late bce/ce'] = rec_props_obj.late_date
        item['item category'] = rec_props_obj.category
        if rec_props_obj.snippet is not False:
            item['snippet'] = rec_props_obj.snippet
        if rec_props_obj.thumbnail_scr is not False:
            item['thumbnail'] = rec_props_obj.thumbnail_scr
        if rec_props_obj.preview_scr is not False:
            item['preview'] = rec_props_obj.preview_scr
        if rec_props_obj.fullfile_scr is not False:
            item['primary-file'] = rec_props_obj.fullfile_scr
        item['published'] = rec_props_obj.published
        item['updated'] = rec_props_obj.updated
        if isinstance(rec_props_obj.other_attributes, list):
            for attribute in rec_props_obj.other_attributes:
                prop_key = attribute['property']
                prop_key = rec_props_obj.prevent_attribute_key_collision(item,
                                                                         prop_key)
                if self.flatten_rec_attributes:
                    if 'value' in attribute:
                        item[prop_key] = attribute['value']
                    elif 'values_list' in attribute:
                        item[prop_key] = RecordProperties.ATTRIBUTE_DELIM.join(attribute['values_list'])
                else:
                    item[prop_key] = attribute['values_list']
        return item

    def extract_solr_recs(self, solr_json):
        """ extracts solr_recs along with
           some basic metadata from solr_json
        """
        solr_recs = False
        if isinstance(solr_json, dict):
            try:
                self.total_found = solr_json['response']['numFound']
            except KeyError:
                self.total_found = False
            try:
                self.rec_start = solr_json['response']['start']
            except KeyError:
                self.rec_start = False
            try:
                self.highlighting = solr_json['highlighting']
            except KeyError:
                self.highlighting = False
            try:
                solr_recs = solr_json['response']['docs']
            except KeyError:
                solr_recs = False
        return solr_recs
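    # The solr_json argument follows the standard solr response shape, e.g.:
    #
    #   {'response': {'numFound': 125, 'start': 0, 'docs': [{'uuid': '...'}]},
    #    'highlighting': {}}  # 'highlighting' is present only when requested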

    def get_media_thumbs(self, solr_recs):
        """ gets media thumbnail items """
        thumb_results = {}
        not_media_uuids = []
        media_uuids = []
        rec_props_obj = RecordProperties(self.response_dict_json)
        for solr_rec in solr_recs:
            item = rec_props_obj.get_solr_record_uuid_type(solr_rec)
            if item is not False:
                uuid = item['uuid']
                if item['item_type'] != 'media':
                    not_media_uuids.append(uuid)
                else:
                    media_uuids.append(uuid)
                thumb_results[uuid] = False
        if len(not_media_uuids) > 0:
            if self.do_media_thumbs:
                # only get media_thumbnails if needed
                rows = self.get_thumbs_for_non_media(not_media_uuids)
                for row in rows:
                    uuid = row['uuid']
                    thumb_obj = {}
                    thumb_obj['href'] = self.base_url + '/media/' + row['media_uuid']
                    thumb_obj['uri'] = settings.CANONICAL_HOST + '/media/' + row['media_uuid']
                    thumb_obj['scr'] = row['file_uri']
                    if thumb_results[uuid] is False:
                        thumb_results[uuid] = thumb_obj
        if len(media_uuids) > 0:
            thumbs = Mediafile.objects\
                              .filter(uuid__in=media_uuids,
                                      file_type='oc-gen:thumbnail')
            for thumb in thumbs:
                uuid = thumb.uuid
                thumb_obj = {}
                thumb_obj['href'] = self.base_url + '/media/' + thumb.uuid
                thumb_obj['uri'] = settings.CANONICAL_HOST + '/media/' + thumb.uuid
                thumb_obj['scr'] = thumb.file_uri
                thumb_results[uuid] = thumb_obj
        return thumb_results

    def get_all_media_files(self, solr_recs):
        """ gets media thumbnail items """
        media_file_results = {}
        if self.get_all_media:
            media_uuids = []
            rec_props_obj = RecordProperties(self.response_dict_json)
            for solr_rec in solr_recs:
                item = rec_props_obj.get_solr_record_uuid_type(solr_rec)
                if item is not False:
                    uuid = item['uuid']
                    if item['item_type'] == 'media':
                        media_uuids.append(uuid)
                    media_file_results[uuid] = False
            if len(media_uuids) > 0:
                media_files = Mediafile.objects\
                                       .filter(uuid__in=media_uuids)
                for media_file in media_files:
                    uuid = media_file.uuid
                    if uuid not in media_file_results:
                        media_file_results[uuid] = {}
                    else:
                        if media_file_results[uuid] is False:
                            media_file_results[uuid] = {}
                    media_file_results[uuid][media_file.file_type] = media_file.file_uri
        return media_file_results

    def get_thumbs_for_non_media(self, uuid_list):
        q_uuids = self.make_query_uuids(uuid_list)
        query = ('SELECT ass.uuid AS uuid, m.file_uri AS file_uri, '
                 'm.uuid AS media_uuid '
                 'FROM oc_assertions AS ass '
                 'JOIN oc_mediafiles AS m ON ass.object_uuid = m.uuid '
                 'AND m.file_type=\'oc-gen:thumbnail\'  '
                 'WHERE ass.uuid IN (' + q_uuids + ') '
                 'GROUP BY ass.uuid,  m.file_uri, m.uuid; ')
        cursor = connection.cursor()
        cursor.execute(query)
        rows = self.dictfetchall(cursor)
        return rows

    def make_query_uuids(self, uuid_list):
        """ makes a string for uuid list query """
        uuid_q = []
        for uuid in uuid_list:
            uuid = '\'' + uuid + '\''
            uuid_q.append(uuid)
        return ', '.join(uuid_q)
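    # make_query_uuids single-quotes each value for the SQL IN clause, e.g.
    # ['a1', 'b2'] -> "'a1', 'b2'". This assumes trusted, internally generated
    # UUIDs; a parameterized alternative is sketched after this class.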

    def dictfetchall(self, cursor):
        """ Return all rows from a cursor as a dict """
        columns = [col[0] for col in cursor.description]
        return [
            dict(zip(columns, row))
            for row in cursor.fetchall()
        ]

    def get_string_rec_attributes(self, solr_recs):
        """ gets string record attributes from the database.
            The solr index does not keep string-fields in memory
        """
        output = {}
        str_attribs = {}
        for attribute in self.rec_attributes:
            entity = self.mem_cache_obj.get_entity(attribute, False)
            if entity is not False:
                prop_slug = entity.slug
                # check to make sure we have the entity data type for linked fields
                if entity.data_type is False and entity.item_type == 'uri':
                    dtypes = self.mem_cache_obj.get_dtypes(entity.uri)
                    if isinstance(dtypes, list):
                        # set the data type and the act-field
                        # print('Found for ' + prop_slug + ' ' + dtypes[0])
                        entity.data_type = dtypes[0]
                if entity.data_type == 'xsd:string':
                    str_attribs[attribute] = entity
        if len(str_attribs) > 0:
            uuid_list = []
            for solr_rec in solr_recs:
                if 'uuid' in solr_rec:
                    uuid = str(solr_rec['uuid'])
                    uuid_list.append(uuid)
            output = self.get_string_attributes(uuid_list, str_attribs)
        return output

    def get_string_attributes(self, uuid_list, str_attribute_ent_dict):
        """ Gets string attribute data for a solr dict """
        output = {}
        pred_uuid_list = []
        pred_uuid_objs = {}
        for key, entity in str_attribute_ent_dict.items():
            if isinstance(entity.uuid, str):
                # add string predicate entity uuid to the list
                pred_uuid_list.append(entity.uuid)
                pred_uuid_objs[entity.uuid] = {'rec_attribute': key,
                                               'property': entity.label,
                                               'pred_uuid': entity.uuid,
                                               'slug': entity.slug}
        if len(pred_uuid_list) > 0 and len(uuid_list) > 0:
            q_rows = self.get_string_attributes_sql(uuid_list, pred_uuid_list)
            dict_rows = {}
            for row in q_rows:
                # print(str(row))
                # group values by uuid and predicate_uuid, since multiple
                # string values can share the same subject and predicate
                uuid = row['uuid']
                pred_uuid = row['predicate_uuid']
                content = row['content']
                if uuid not in dict_rows:
                    dict_rows[uuid] = {}
                if pred_uuid not in dict_rows[uuid]:
                    dict_rows[uuid][pred_uuid] = []
                if isinstance(content, str):
                    dict_rows[uuid][pred_uuid].append(content)
                    # print(str(dict_rows[uuid][pred_uuid]))
            output = {'pred_ents': pred_uuid_objs,
                      'data': dict_rows}
        return output

    def get_string_attributes_sql(self, uuid_list, pred_uuid_list):
        """ executes SQL query to get strings for the solr uuids and predicates """
        q_uuids = self.make_query_uuids(uuid_list)
        p_uuids = self.make_query_uuids(pred_uuid_list)
        query = ('SELECT ass.uuid AS uuid, ass.predicate_uuid AS predicate_uuid, '
                 's.content AS content '
                 'FROM oc_assertions AS ass '
                 'JOIN oc_strings AS s ON ass.object_uuid = s.uuid '
                 'WHERE ass.uuid IN (' + q_uuids + ') AND '
                 'ass.predicate_uuid IN (' + p_uuids + ') '
                 'ORDER BY ass.uuid, ass.predicate_uuid, s.content; ')
        cursor = connection.cursor()
        cursor.execute(query)
        rows = self.dictfetchall(cursor)
        return rows
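As a hedged aside: the queries above interpolate quoted UUID strings. With a
PostgreSQL backend, Django's database cursor also accepts parameterized
queries, and psycopg2 adapts Python lists to SQL arrays, so an ANY(%s) variant
avoids the hand-quoting. A minimal sketch (not the original implementation;
table and column names taken from the query above):

    def get_string_attributes_sql_params(self, uuid_list, pred_uuid_list):
        """ parameterized variant of the string-attribute query """
        query = ('SELECT ass.uuid AS uuid, '
                 'ass.predicate_uuid AS predicate_uuid, '
                 's.content AS content '
                 'FROM oc_assertions AS ass '
                 'JOIN oc_strings AS s ON ass.object_uuid = s.uuid '
                 'WHERE ass.uuid = ANY(%s) '
                 'AND ass.predicate_uuid = ANY(%s) '
                 'ORDER BY ass.uuid, ass.predicate_uuid, s.content; ')
        cursor = connection.cursor()
        # psycopg2 adapts Python lists to PostgreSQL arrays for ANY(%s)
        cursor.execute(query, [uuid_list, pred_uuid_list])
        return self.dictfetchall(cursor)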
Example no. 40
class ReadProjectContextVocabGraph():
    """ Methods to read the project context vocabulary graph """

    GLOBAL_VOCAB_GRAPH = [
        {
            '@id': 'oc-pred:link',
            'owl:sameAs': 'http://opencontext.org/predicates/oc-3',
            'label': 'link',
            'slug': 'link',
            'oc-gen:predType': 'link',
            '@type': '@id'
        },
        {
            '@id': Assertion.PREDICATES_NOTE,
            'label': 'Note',
            'owl:sameAs': False,
            'slug': 'oc-gen-has-note',
            '@type': 'xsd:string'
        },
    ]

    # predicates used for equivalence, used to make
    # inferred assertions
    REL_PREDICATES_FOR_INFERRENCE = ['skos:closeMatch', 'skos:exactMatch']
    REL_MEASUREMENTS = [
        'cidoc-crm:P67_refers_to', 'oc-gen:has-technique', 'rdfs:range'
    ]
    ITEM_REL_PREDICATES = [
        'skos:closeMatch', 'skos:exactMatch', 'owl:sameAs', 'skos:related',
        'skos:broader', 'dc-terms:references', 'dc-terms:hasVersion',
        'http://nomisma.org/ontology#hasTypeSeriesItem'
    ]

    # Skip the following predicate keys when looking
    # for inferred linked data assertions in an observation.
    LINKDATA_OBS_PREDS_SKIP = [
        'id',
        'type',
        ItemKeys.PREDICATES_OCGEN_SOURCEID,
        ItemKeys.PREDICATES_OCGEN_OBSTATUS,
        ItemKeys.PREDICATES_OCGEN_OBSLABEL,
        ItemKeys.PREDICATES_OCGEN_OBSNOTE,
    ]

    def __init__(self, proj_context_json_ld=None):
        self.m_cache = MemoryCache()
        self.context = None
        self.graph = None
        self.fail_on_missing_entities = False
        if not isinstance(proj_context_json_ld, dict):
            return None
        if '@context' in proj_context_json_ld:
            self.context = proj_context_json_ld['@context']
        if '@graph' in proj_context_json_ld:
            self.graph = self.GLOBAL_VOCAB_GRAPH + proj_context_json_ld[
                '@graph']
        else:
            self.graph = self.GLOBAL_VOCAB_GRAPH
        logger.info('Read project graph size: {}'.format(len(self.graph)))

    def lookup_predicate(self, id):
        """looks up an Open Context predicate by an identifier
           (slug, uri, or uuid)
        """
        output = self.lookup_oc_descriptor(id, 'predicates')
        return output

    def lookup_type(self, id):
        """looks up an Open Context type by an identifier
           (slug, uri, or uuid)
        """
        output = self.lookup_oc_descriptor(id, 'types')
        return output

    def lookup_type_by_type_obj(self, type_obj):
        """looks up an Open Context type to get
           more information, including linked data equivalents
           by looking up a type from how it is used as
           the object of a descriptive predicate in an observation
        """
        type_ids = self.get_id_list_for_g_obj(type_obj)
        for type_id in type_ids:
            found_type_obj = self.lookup_type(type_id)
            if isinstance(found_type_obj, dict):
                return found_type_obj
        return type_obj

    def lookup_oc_descriptor(self, id, item_type):
        """looks up a predicate, or a type by an identifier
           (slug, uri, or uuid)
        """
        cache_key = self.m_cache.make_cache_key(
            'lookup_oc_descriptor_{}'.format(item_type), id)
        output = self.m_cache.get_cache_object(cache_key)
        if (output is None and isinstance(self.graph, list)
                and isinstance(id, str)):
            for g_obj in self.graph:
                id_list = self.get_id_list_for_g_obj(g_obj)
                if id not in id_list:
                    continue
                output = g_obj
                if item_type == 'predicates' and '@type' not in g_obj:
                    output['@type'] = \
                        self.get_predicate_datatype_for_graph_obj(g_obj)
                # stop at the first graph object matching the identifier
                break
            if output:
                self.m_cache.save_cache_object(cache_key, output)
        if self.fail_on_missing_entities and not output:
            raise RuntimeError('Cannot find {}, item_type: {}'.format(
                id, item_type))
        return output

    def get_predicate_datatype_for_graph_obj(self, g_obj):
        """ looks up a predicate data type for a given graph object """
        slug_uri = self.get_id_from_g_obj(g_obj)
        datatype = self.get_predicate_datatype_by_slug_uri(slug_uri)
        return datatype

    def get_id_list_for_g_obj(self, g_obj):
        """gets a list of ids for an object"""
        id_list = []
        id_keys = ['@id', 'id', 'owl:sameAs', 'slug', 'uuid']
        if isinstance(g_obj, dict):
            for id_key in id_keys:
                if id_key not in g_obj:
                    continue
                if g_obj[id_key] not in id_list:
                    id_list.append(g_obj[id_key])
        return id_list

    def get_id_from_g_obj(self, g_obj):
        """ gets the id form a g_obj, either the @id or id varient """
        id_variants = ['@id', 'id']
        id = None
        if not isinstance(g_obj, dict):
            return None
        for id_variant in id_variants:
            if id_variant not in g_obj:
                continue
            id = g_obj[id_variant]
        return id

    def get_predicate_datatype_by_slug_uri(self, slug_uri):
        """Looks up a predicate's datatype via the predicate slug URI."""
        datatype = 'xsd:string'  # Default to treating all as a string
        if (isinstance(self.context, dict) and isinstance(slug_uri, str)):
            if slug_uri not in self.context:
                return datatype
            for type_variant in ['@type', 'type']:
                if type_variant not in self.context[slug_uri]:
                    continue
                datatype = self.context[slug_uri][type_variant]
        return datatype
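    # Context lookup sketch (hypothetical context entry, illustration only):
    # with self.context == {'oc-pred:24-fabric': {'@type': 'xsd:string'}},
    # get_predicate_datatype_by_slug_uri('oc-pred:24-fabric') returns
    # 'xsd:string'; unknown slug URIs fall back to the 'xsd:string' default.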

    def get_equivalent_objects(self, info_dict):
        """ Gets equivalent linked data dicts associated with an
            info_dict.
        """
        equiv_uris = []
        equiv_objects = []
        for rel_pred in self.REL_PREDICATES_FOR_INFERRENCE:
            if rel_pred not in info_dict:
                continue
            for equiv_obj in info_dict[rel_pred]:
                equiv_uri = self.get_id_from_g_obj(equiv_obj)
                if equiv_uri and equiv_uri not in equiv_uris:
                    # Make sure that the equivalent URIs are unique.
                    equiv_uris.append(equiv_uri)
                    equiv_objects.append(equiv_obj)
        return equiv_objects

    def infer_assertions_for_item_json_ld(self, json_ld):
        """Makes a list of inferred assertions from item json ld """
        lang_obj = Languages()
        inferred_assertions = []
        if not isinstance(json_ld, dict):
            return inferred_assertions
        if ItemKeys.PREDICATES_OCGEN_HASOBS not in json_ld:
            return inferred_assertions
        unique_pred_assertions = LastUpdatedOrderedDict()
        for obs_dict in json_ld[ItemKeys.PREDICATES_OCGEN_HASOBS]:
            # Get the status of the observation, defaulting to 'active'. If
            # active, then it's OK to infer assertions, otherwise skip the
            # observation.
            obs_status = obs_dict.get(ItemKeys.PREDICATES_OCGEN_OBSTATUS,
                                      'active')
            if obs_status != 'active':
                # Skip this observation. It's there but has a deprecated
                # status.
                continue
            for obs_pred_key, obj_values in obs_dict.items():
                if obs_pred_key in self.LINKDATA_OBS_PREDS_SKIP:
                    # Skip this obs_pred_key, it is a general
                    # description of the observation, and will
                    # not have any linked assertions to infer.
                    continue
                obs_pred_info = self.lookup_predicate(obs_pred_key)
                if not obs_pred_info:
                    continue
                pred_data_type = self.get_predicate_datatype_for_graph_obj(
                    obs_pred_info)
                equiv_pred_objs = self.get_equivalent_objects(obs_pred_info)
                if not equiv_pred_objs:
                    # No linked data equivalence for the obs_pred_key
                    # so continue, skipping the rest.
                    continue
                # Start with a None assertion.
                assertion = None
                # Iterate through all the equivalent predicate objects.
                for equiv_pred_obj in equiv_pred_objs:
                    equiv_pred_uri = self.get_id_from_g_obj(equiv_pred_obj)
                    # Inferred assertions will have unique LOD predicates, with
                    # one or more values. The unique_pred_assertions dict makes
                    # sure the LOD predicates are used only once.
                    if equiv_pred_uri not in unique_pred_assertions:
                        assertion = equiv_pred_obj
                        assertion['type'] = pred_data_type
                        assertion['ld_objects'] = LastUpdatedOrderedDict()
                        assertion['oc_objects'] = LastUpdatedOrderedDict()
                        assertion['literals'] = []
                        unique_pred_assertions[equiv_pred_uri] = assertion
                        assertion = unique_pred_assertions[equiv_pred_uri]
                    if assertion and equiv_pred_uri:
                        # we have a LOD equivalent property
                        if not isinstance(obj_values, list):
                            obj_values = [obj_values]
                        for obj_val in obj_values:
                            literal_val = None
                            if not isinstance(obj_val, dict):
                                # the object of the assertion is not a dict, so it must be
                                # a literal
                                literal_val = obj_val
                                if obj_val not in assertion['literals']:
                                    assertion['literals'].append(obj_val)
                            elif 'xsd:string' in obj_val:
                                literal_val = lang_obj.get_all_value_str(
                                    obj_val['xsd:string'])
                            if literal_val and literal_val not in assertion[
                                    'literals']:
                                assertion['literals'].append(literal_val)
                            if literal_val is None:
                                # Add any linked data equivalences by looking for this
                                # type in the graph list
                                obj_val = self.lookup_type_by_type_obj(obj_val)
                                obj_uri = self.get_id_from_g_obj(obj_val)
                                equiv_obj_objs = self.get_equivalent_objects(
                                    obj_val)
                                if len(equiv_obj_objs):
                                    # We have LD equivalents for the object value
                                    for equiv_obj_obj in equiv_obj_objs:
                                        equiv_obj_uri = self.get_id_from_g_obj(
                                            equiv_obj_obj)
                                        if not biological_taxonomy_validation(
                                                equiv_pred_uri, equiv_obj_uri):
                                            # This object_uri does not belong to this
                                            # predicated uri.
                                            continue
                                        assertion['ld_objects'][
                                            equiv_obj_uri] = equiv_obj_obj
                                elif obj_uri:
                                    # We don't have LD equivalents for the object value
                                    # add to the oc_objects
                                    assertion['oc_objects'][obj_uri] = obj_val
                                unique_pred_assertions[
                                    equiv_pred_uri] = assertion
        for assertion in unique_pred_assertions.values():
            inferred_assertions.append(assertion)
        return inferred_assertions
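A short usage sketch for the inference method above (the loader name
get_item_json_ld is hypothetical; only the ReadProjectContextVocabGraph calls
come from the class itself):

    graph_reader = ReadProjectContextVocabGraph(proj_context_json_ld)
    item_json_ld = get_item_json_ld(uuid)  # hypothetical item loader
    assertions = graph_reader.infer_assertions_for_item_json_ld(item_json_ld)
    for assertion in assertions:
        # each assertion is keyed by a unique LOD predicate and carries
        # 'ld_objects', 'oc_objects', and 'literals' collected above
        print(assertion.get('label'), assertion['literals'])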
Example no. 41
class LinkRecursion():
    """
    Does recursive lookups on link annotations, especially to find hierarchies

    from opencontext_py.apps.ldata.linkannotations.recursion import LinkRecursion
    lr = LinkRecursion()
    lr.get_jsonldish_entity_parents('oc-gen:cat-bio-subj-ecofact')
    lr = LinkRecursion()
    lr.get_jsonldish_entity_parents('oc-gen:cat-arch-element')
    lr = LinkRecursion()
    lr.get_jsonldish_entity_parents('http://eol.org/pages/7680')
    lr = LinkRecursion()
    lr.get_entity_children('http://eol.org/pages/4077', True)
    """
    def __init__(self):
        self.m_cache = MemoryCache()
        self.parent_entities = None
        self.child_entities = None
        # cache prefix for the json-ldish-parents
        self.jsonldish_p_prefix = 'json-ldish-parents-{}'
        # cache prefix for list of parents
        self.p_prefix = 'lr-parents'
        # cache prefix for children of an item
        self.children_prefix = 'lr-children-{}'
        # cache prefix for full tree of child items
        self.child_tree_prefix = 'lr-child-tree-{}'

    def get_jsonldish_entity_parents(self, identifier, add_original=True):
        """
        Gets parent concepts for a given URI or UUID identified entity
        returns a list of dictionary objects similar to JSON-LD expectations
        This is useful for faceted search

        If add_original is true, add the original UUID for the entity
        that's the childmost item, at the bottom of the hierarchy
        """
        cache_key = self.m_cache.make_cache_key(
            self.jsonldish_p_prefix.format(str(add_original)),
            identifier
        )
        obj = self.m_cache.get_cache_object(cache_key)
        if obj is not None:
            return obj
        # We don't have it cached, so get from the database.
        obj = self._get_jsonldish_entity_parents_db(
            identifier,
            add_original
        )
        if obj:
            self.m_cache.save_cache_object(cache_key, obj)
        return obj
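    # Expected output shape (fields set in the _db helper below; values
    # hypothetical):
    #
    #   [{'id': 'http://opencontext.org/vocabularies/oc-general/top',
    #     'slug': 'top-concept', 'label': 'Top Concept', 'type': '@id',
    #     'ld_object_ok': True},
    #    ... child-most entity last]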

    def _get_jsonldish_entity_parents_db(self, identifier, add_original=True):
        """
        Gets parent concepts for a given URI or UUID identified entity
        returns a list of dictionary objects similar to JSON-LD expectations
        This is useful for faceted search

        If add_original is true, add the original UUID for the entity
        that's the childmost item, at the bottom of the hierarchy
        """
        output = False
        if add_original:
            # add the original identifier to the list of parents, at lowest rank
            raw_parents = (
                [identifier] +
                self.get_entity_parents(identifier, [], 0)
            )
        else:
            raw_parents = self.get_entity_parents(
                identifier,
                [],
                0
            )
        if not len(raw_parents):
            # No parents. Returns false.
            return output
        # Make the output.
        # reverse the order of the list, to make top most concept
        # first
        output = []
        for par_id in raw_parents[::-1]:
            # print('par_id is: ' + par_id)
            ent = self.m_cache.get_entity(par_id)
            if not ent:
                continue
            p_item = LastUpdatedOrderedDict()
            p_item['id'] = ent.uri
            p_item['slug'] = ent.slug
            p_item['label'] = ent.label
            if ent.data_type is not False:
                p_item['type'] = ent.data_type
            else:
                p_item['type'] = '@id'
            p_item['ld_object_ok'] = ent.ld_object_ok
            output.append(p_item)
        return output
    
    def get_entity_parents(self, identifier, parent_list=None, loop_count=0):
        """
        Gets parent concepts for a given URI or UUID identified entity
        """
        if not parent_list:
            parent_list = []
        loop_count += 1
        parent_id = self._get_parent_id(identifier)
        # print('ID: {} has parent: {}'.format(identifier, parent_id))
        if parent_id:
            if parent_id not in parent_list:
                parent_list.append(parent_id)
                # print('Parent list is: ' + str(parent_list))
            if loop_count <= 50:
                parent_list = self.get_entity_parents(parent_id, parent_list, loop_count)
        else:
            # all done, save the parents
            self.parent_entities = parent_list
        return parent_list
    
    def _get_parent_id(self, identifier):
        """Get the parent id for the current identifier, or from the cache."""
        cache_key = self.m_cache.make_cache_key(self.p_prefix,
                                                identifier)
        obj = self.m_cache.get_cache_object(cache_key)
        if obj is not None:
            return obj
        else:
            obj = self._get_parent_id_db(identifier)
            if obj:
                self.m_cache.save_cache_object(cache_key, obj)
            return obj

    def _get_parent_id_db(self, identifier):
        """Get the parent id for the current identifier """
        parent_id = None
        lequiv = LinkEquivalence()
        identifiers = lequiv.get_identifier_list_variants(identifier)
        # print('identifiers: {}'.format(identifiers))
        p_for_superobjs = LinkAnnotation.PREDS_SBJ_IS_SUB_OF_OBJ
        preds_for_superobjs = lequiv.get_identifier_list_variants(p_for_superobjs)
        p_for_subobjs = LinkAnnotation.PREDS_SBJ_IS_SUPER_OF_OBJ
        preds_for_subobjs = lequiv.get_identifier_list_variants(p_for_subobjs)
        try:
            # look for superior items in the objects of the assertion
            # sorting by sort so we can privilege a certain hierarchy path
            superobjs_anno = LinkAnnotation.objects.filter(subject__in=identifiers,
                                                           predicate_uri__in=preds_for_superobjs)\
                                                   .exclude(object_uri__in=identifiers)\
                                                   .order_by('sort', 'object_uri')[:1]
            if len(superobjs_anno) < 1:
                superobjs_anno = False
        except LinkAnnotation.DoesNotExist:
            superobjs_anno = False
        if superobjs_anno:
            parent_id = superobjs_anno[0].object_uri
            # print('Subject {} is child of {}'.format(identifiers, parent_id))
            oc_uuid = URImanagement.get_uuid_from_oc_uri(parent_id)
            if oc_uuid:
                parent_id = oc_uuid
        try:
            """
            Now look for superior entities in the subject, not the object
            sorting by sort so we can privilege a certain hierarchy path
            """
            supersubj_anno = LinkAnnotation.objects.filter(object_uri__in=identifiers,
                                                           predicate_uri__in=preds_for_subobjs)\
                                                   .exclude(subject__in=identifiers)\
                                                   .order_by('sort', 'subject')[:1]
            if len(supersubj_anno) < 1:
                supersubj_anno = False
        except LinkAnnotation.DoesNotExist:
            supersubj_anno = False
        if supersubj_anno:
            parent_id = supersubj_anno[0].subject
            # print('Subject {} is parent of {}'.format(parent_id, identifiers))
            oc_uuid = URImanagement.get_uuid_from_oc_uri(parent_id)
            if oc_uuid:
                parent_id = oc_uuid
        return parent_id

    def get_entity_children(self, identifier, recursive=True):
        cache_key = self.m_cache.make_cache_key(self.children_prefix.format(str(recursive)),
                                                identifier)
        tree_cache_key = self.m_cache.make_cache_key(self.child_tree_prefix.format(str(recursive)),
                                                     identifier)
        obj = self.m_cache.get_cache_object(cache_key)
        tree_obj = self.m_cache.get_cache_object(tree_cache_key)
        if obj is not None and tree_obj is not None:
            # print('Hit child cache on {}'.format(identifier))
            self.child_entities = tree_obj  # the full tree of child entities
            return obj
        else:
            obj = self._get_entity_children_db(identifier, recursive)
            if obj:
                # print('Hit child DB on {}'.format(identifier))
                self.m_cache.save_cache_object(cache_key, obj)
                self.m_cache.save_cache_object(tree_cache_key, self.child_entities)
            return obj
    
    def _get_entity_children_db(self, identifier, recursive=True):
        """
        Gets child concepts for a given URI or UUID identified entity
        """
        if not self.child_entities:
            self.child_entities = LastUpdatedOrderedDict()
        if identifier in self.child_entities and recursive:
            output = self.child_entities[identifier]
        else:
            act_children = []
            p_for_superobjs = LinkAnnotation.PREDS_SBJ_IS_SUB_OF_OBJ
            p_for_subobjs = LinkAnnotation.PREDS_SBJ_IS_SUPER_OF_OBJ
            lequiv = LinkEquivalence()
            identifiers = lequiv.get_identifier_list_variants(identifier)
            try:
                # look for child items in the objects of the assertion
                subobjs_anno = LinkAnnotation.objects.filter(subject__in=identifiers,
                                                             predicate_uri__in=p_for_subobjs)
                if len(subobjs_anno) < 1:
                    subobjs_anno = False
            except LinkAnnotation.DoesNotExist:
                subobjs_anno = False
            if subobjs_anno is not False:
                for sub_obj in subobjs_anno:
                    child_id = sub_obj.object_uri
                    act_children.append(child_id)
            try:
                """
                Now look for subordinate entities in the subject, not the object
                """
                subsubj_anno = LinkAnnotation.objects.filter(object_uri__in=identifiers,
                                                             predicate_uri__in=p_for_superobjs)
                if len(subsubj_anno) < 1:
                    subsubj_anno = False
            except LinkAnnotation.DoesNotExist:
                subsubj_anno = False
            if subsubj_anno is not False:
                for sub_sub in subsubj_anno:
                    child_id = sub_sub.subject
                    act_children.append(child_id)
            if len(act_children) > 0:
                identifier_children = []
                for child_id in act_children:
                    if child_id.count('/') > 1:
                        oc_uuid = URImanagement.get_uuid_from_oc_uri(child_id)
                        if oc_uuid:
                            child_id = oc_uuid
                    identifier_children.append(child_id)
                    # recursively get the children of the child
                    if recursive:
                        self.get_entity_children(child_id, recursive)
                # save the list of children of the current identified item
                if identifier not in self.child_entities:
                    self.child_entities[identifier] = identifier_children
            else:
                # save an empty list for the current identified item; it has no children
                if identifier not in self.child_entities:
                    self.child_entities[identifier] = []
            output = self.child_entities[identifier]
        return output

    def get_pred_top_rank_types(self, predicate_uuid):
        """ gets the top ranked (not a subordinate) of any other
            type for a predicate
        """
        types = False
        try:
            pred_obj = Predicate.objects.get(uuid=predicate_uuid)
        except Predicate.DoesNotExist:
            pred_obj = False
        if pred_obj is not False:
            # print('found: ' + predicate_uuid)
            if pred_obj.data_type == 'id':
                types = []
                id_list = []
                pred_types = OCtype.objects\
                                   .filter(predicate_uuid=predicate_uuid)
                for p_type in pred_types:
                    type_pars = self.get_jsonldish_entity_parents(p_type.uuid)
                    self.parent_entities = []
                    self.loop_count = 0
                    if type_pars and type_pars[0]['id'] not in id_list:
                        # so the top parent is only listed once
                        id_list.append(type_pars[0]['id'])
                        types.append(type_pars[0])
        return types
    
    def get_entity(self, identifier):
        """ Gets an entity either from the cache or from
            database lookups. This is a wrapper for the
            MemoryCache().get_entity function.
        """
        return self.m_cache.get_entity(identifier)
Example no. 42
 def __init__(self):
     self.error = False
     self.histogram_groups = 10
     self.mem_cache_obj = MemoryCache()  # memory caching object
Example no. 43
 def __init__(self, cannonical_uris=False):
     self.m_cache = MemoryCache()
Example no. 44
 def __init__(self):
     self.m_cache = MemoryCache()  # memory caching object
     self.base_search_link = '/search/'
     self.hierarchy_delim = '---'
Example no. 45
class SolrUUIDs():
    """ methods to make get UUIDs from a solr
        search result JSON document,

        also makes URIs
    """
    def __init__(self, response_dict_json=False):
        rp = RootPath()
        self.base_url = rp.get_baseurl()
        self.uuids = []
        self.uris = []
        self.m_cache = MemoryCache()  # memory caching object
        self.s_cache = SearchGenerationCache(
        )  # supplemental caching object, specific for searching
        self.response_dict_json = response_dict_json
        self.highlighting = False
        # make values to these fields "flat" not a list
        self.flatten_rec_fields = True
        self.total_found = False
        self.rec_start = False
        self.min_date = False
        self.max_date = False
        # flatten list of an attribute values to single value
        self.flatten_rec_attributes = False
        # A list of (non-standard) attributes to include in a record
        self.rec_attributes = []
        self.do_media_thumbs = True  # get thumbnails for records
        self.get_all_media = False  # get links to all media files for an item

    def make_uuids_from_solr(self, solr_json):
        """ makes geojson-ld point records from a solr response """
        #first do lots of checks to make sure the solr-json is OK
        solr_recs = self.extract_solr_recs(solr_json)
        if isinstance(solr_recs, list):
            for solr_rec in solr_recs:
                if 'uuid' in solr_rec:
                    uuid = solr_rec['uuid']
                    self.uuids.append(uuid)
        return self.uuids

    def make_uris_from_solr(self, solr_json, uris_only=True):
        """ processes the solr_json to
             make GeoJSON records
        """
        solr_recs = self.extract_solr_recs(solr_json)
        if isinstance(solr_recs, list):
            if uris_only:
                self.do_media_thumbs = False
            if self.get_all_media:
                self.do_media_thumbs = False
            if 'thumbnail' in self.rec_attributes:
                self.do_media_thumbs = True
            thumbnail_data = self.get_media_thumbs(solr_recs)
            media_file_data = self.get_all_media_files(solr_recs)
            string_attrib_data = self.get_string_rec_attributes(solr_recs)
            for solr_rec in solr_recs:
                rec_props_obj = RecordProperties(self.response_dict_json)
                rec_props_obj.min_date = self.min_date
                rec_props_obj.max_date = self.max_date
                rec_props_obj.highlighting = self.highlighting
                rec_props_obj.flatten_rec_attributes = self.flatten_rec_attributes
                rec_props_obj.rec_attributes = self.rec_attributes
                rec_props_obj.thumbnail_data = thumbnail_data
                rec_props_obj.media_file_data = media_file_data
                rec_props_obj.string_attrib_data = string_attrib_data
                item_ok = rec_props_obj.get_item_basics(solr_rec)
                if item_ok:
                    if uris_only:
                        item = rec_props_obj.uri
                    else:
                        rec_props_obj.parse_solr_record(solr_rec)
                        item = self.make_item_dict_from_rec_props_obj(
                            rec_props_obj)
                    self.uris.append(item)
        return self.uris

    def make_item_dict_from_rec_props_obj(self,
                                          rec_props_obj,
                                          cannonical=True):
        """ makes item dictionary object from a record prop obj """
        item = LastUpdatedOrderedDict()
        item['uri'] = rec_props_obj.uri
        if cannonical is False or 'href' in self.rec_attributes:
            item['href'] = rec_props_obj.href
        item['citation uri'] = rec_props_obj.cite_uri
        item['label'] = rec_props_obj.label
        item['project label'] = rec_props_obj.project_label
        if cannonical:
            item['project uri'] = rec_props_obj.project_uri
        else:
            item['project href'] = rec_props_obj.project_href
        item['context label'] = rec_props_obj.context_label
        if cannonical:
            item['context uri'] = rec_props_obj.context_uri
        else:
            item['context href'] = rec_props_obj.context_href
        item['latitude'] = rec_props_obj.latitude
        item['longitude'] = rec_props_obj.longitude
        item['early bce/ce'] = rec_props_obj.early_date
        item['late bce/ce'] = rec_props_obj.late_date
        item['item category'] = rec_props_obj.category
        if rec_props_obj.snippet is not False:
            item['snippet'] = rec_props_obj.snippet
        if rec_props_obj.thumbnail_scr is not False:
            item['thumbnail'] = rec_props_obj.thumbnail_scr
        if rec_props_obj.preview_scr is not False:
            item['preview'] = rec_props_obj.preview_scr
        if rec_props_obj.fullfile_scr is not False:
            item['primary-file'] = rec_props_obj.fullfile_scr
        item['published'] = rec_props_obj.published
        item['updated'] = rec_props_obj.updated
        if isinstance(rec_props_obj.other_attributes, list):
            for attribute in rec_props_obj.other_attributes:
                prop_key = attribute['property']
                prop_key = rec_props_obj.prevent_attribute_key_collision(
                    item, prop_key)
                if self.flatten_rec_attributes:
                    if 'value' in attribute:
                        item[prop_key] = attribute['value']
                    elif 'values_list' in attribute:
                        item[prop_key] = RecordProperties.ATTRIBUTE_DELIM.join(
                            attribute['values_list'])
                else:
                    item[prop_key] = attribute['values_list']
        return item

    def extract_solr_recs(self, solr_json):
        """ extracts solr_recs along with
           some basic metadata from solr_json
        """
        solr_recs = False
        if isinstance(solr_json, dict):
            try:
                self.total_found = solr_json['response']['numFound']
            except KeyError:
                self.total_found = False
            try:
                self.rec_start = solr_json['response']['start']
            except KeyError:
                self.rec_start = False
            try:
                self.highlighting = solr_json['highlighting']
            except KeyError:
                self.highlighting = False
            try:
                solr_recs = solr_json['response']['docs']
            except KeyError:
                solr_recs = False
        return solr_recs

    def get_media_thumbs(self, solr_recs):
        """ gets media thumbnail items """
        thumb_results = {}
        not_media_uuids = []
        media_uuids = []
        rec_props_obj = RecordProperties(self.response_dict_json)
        for solr_rec in solr_recs:
            item = rec_props_obj.get_solr_record_uuid_type(solr_rec)
            if item is not False:
                uuid = item['uuid']
                if item['item_type'] != 'media':
                    not_media_uuids.append(uuid)
                else:
                    media_uuids.append(uuid)
                thumb_results[uuid] = False
        if len(not_media_uuids) > 0:
            if self.do_media_thumbs:
                # only get media_thumbnails if needed
                rows = self.get_thumbs_for_non_media(not_media_uuids)
                for row in rows:
                    uuid = row['uuid']
                    thumb_obj = {}
                    thumb_obj['href'] = self.base_url + '/media/' + row['media_uuid']
                    thumb_obj['uri'] = settings.CANONICAL_HOST + '/media/' + row['media_uuid']
                    thumb_obj['scr'] = row['file_uri']
                    if thumb_results[uuid] is False:
                        thumb_results[uuid] = thumb_obj
        if len(media_uuids) > 0:
            thumbs = Mediafile.objects\
                              .filter(uuid__in=media_uuids,
                                      file_type='oc-gen:thumbnail')
            for thumb in thumbs:
                uuid = thumb.uuid
                thumb_obj = {}
                thumb_obj['href'] = self.base_url + '/media/' + thumb.uuid
                thumb_obj['uri'] = settings.CANONICAL_HOST + '/media/' + thumb.uuid
                thumb_obj['scr'] = thumb.file_uri
                thumb_results[uuid] = thumb_obj
        return thumb_results

    def get_all_media_files(self, solr_recs):
        """ gets media thumbnail items """
        media_file_results = {}
        if self.get_all_media:
            media_uuids = []
            rec_props_obj = RecordProperties(self.response_dict_json)
            for solr_rec in solr_recs:
                item = rec_props_obj.get_solr_record_uuid_type(solr_rec)
                if item is not False:
                    uuid = item['uuid']
                    if item['item_type'] == 'media':
                        media_uuids.append(uuid)
                    media_file_results[uuid] = False
            if len(media_uuids) > 0:
                media_files = Mediafile.objects\
                                       .filter(uuid__in=media_uuids)
                for media_file in media_files:
                    uuid = media_file.uuid
                    if uuid not in media_file_results:
                        media_file_results[uuid] = {}
                    else:
                        if media_file_results[uuid] is False:
                            media_file_results[uuid] = {}
                    media_file_results[uuid][media_file.file_type] = media_file.file_uri
        return media_file_results

    def get_thumbs_for_non_media(self, uuid_list):
        """ gets thumbnail files for non-media items, via the assertions
            that link those items to media resources
        """
        q_uuids = self.make_query_uuids(uuid_list)
        query = ('SELECT ass.uuid AS uuid, m.file_uri AS file_uri, '
                 'm.uuid AS media_uuid '
                 'FROM oc_assertions AS ass '
                 'JOIN oc_mediafiles AS m ON ass.object_uuid = m.uuid '
                 "AND m.file_type = 'oc-gen:thumbnail' "
                 'WHERE ass.uuid IN (' + q_uuids + ') '
                 'GROUP BY ass.uuid, m.file_uri, m.uuid; ')
        cursor = connection.cursor()
        # the uuids are passed as query parameters so the driver quotes them safely
        cursor.execute(query, uuid_list)
        rows = self.dictfetchall(cursor)
        return rows

    def make_query_uuids(self, uuid_list):
        """ makes a comma-separated string of %s placeholders, one per uuid,
            for a parameterized IN clause; the uuid values themselves are
            passed to cursor.execute separately to avoid SQL injection
        """
        return ', '.join(['%s'] * len(uuid_list))
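
For example, a three-item list yields the string `%s, %s, %s`; the database driver then substitutes the uuid values passed to `cursor.execute`:

uuid_list = ['uuid-a', 'uuid-b', 'uuid-c']
placeholders = ', '.join(['%s'] * len(uuid_list))
print(placeholders)  # -> %s, %s, %s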

    def dictfetchall(self, cursor):
        """ Return all rows from a cursor as a dict """
        columns = [col[0] for col in cursor.description]
        return [dict(zip(columns, row)) for row in cursor.fetchall()]
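
`dictfetchall` follows the raw-cursor recipe from the Django documentation: raw cursors return plain tuples, and `cursor.description` supplies the column names. A standalone sketch of the same pattern (the query itself is illustrative only):

from django.db import connection

with connection.cursor() as cursor:
    cursor.execute('SELECT uuid, predicate_uuid FROM oc_assertions LIMIT 5;')
    columns = [col[0] for col in cursor.description]
    rows = [dict(zip(columns, row)) for row in cursor.fetchall()]
    for row in rows:
        print(row['uuid'], row['predicate_uuid'])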

    def get_string_rec_attributes(self, solr_recs):
        """ gets string record attributes from the database.
            The solr index does not keep string-fields in memory
        """
        output = {}
        str_attribs = {}
        for attribute in self.rec_attributes:
            entity = self.m_cache.get_entity(attribute)
            if entity:
                prop_slug = entity.slug
                # check to make sure we have the entity data type for linked fields
                if entity.data_type is False and entity.item_type == 'uri':
                    dtypes = self.s_cache.get_dtypes(entity.uri)
                    if isinstance(dtypes, list):
                        # set the data type from the first data type found
                        entity.data_type = dtypes[0]
                if entity.data_type == 'xsd:string':
                    str_attribs[attribute] = entity
        if len(str_attribs) > 0:
            uuid_list = []
            for solr_rec in solr_recs:
                if 'uuid' in solr_rec:
                    uuid = str(solr_rec['uuid'])
                    uuid_list.append(uuid)
            output = self.get_string_attributes(uuid_list, str_attribs)
        return output
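
Only attributes whose entities resolve to the `xsd:string` data type trigger the database lookup; everything else is already served from solr. A toy illustration of that filtering step (the entities here are faked with a namedtuple, not the real cached entity class):

from collections import namedtuple

# stand-in for the cached entity objects; only data_type matters here
FakeEntity = namedtuple('FakeEntity', ['slug', 'data_type'])
entities = {
    'note': FakeEntity('note', 'xsd:string'),
    'count': FakeEntity('count', 'xsd:integer'),
}
str_attribs = {key: ent for key, ent in entities.items()
               if ent.data_type == 'xsd:string'}
print(list(str_attribs))  # -> ['note']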

    def get_string_attributes(self, uuid_list, str_attribute_ent_dict):
        """ Gets string attribute data for a solr dict """
        output = {}
        pred_uuid_list = []
        pred_uuid_objs = {}
        for key, entity in str_attribute_ent_dict.items():
            if isinstance(entity.uuid, str):
                # add string predicate entity uuid to the list
                pred_uuid_list.append(entity.uuid)
                pred_uuid_objs[entity.uuid] = {
                    'rec_attribute': key,
                    'property': entity.label,
                    'pred_uuid': entity.uuid,
                    'slug': entity.slug
                }
        if len(pred_uuid_list) > 0 and len(uuid_list) > 0:
            q_rows = self.get_string_attributes_sql(uuid_list, pred_uuid_list)
            dict_rows = {}
            for row in q_rows:
                # nest rows as {item_uuid: {pred_uuid: [values]}} so that
                # multiple string values for the same predicate accumulate
                uuid = row['uuid']
                pred_uuid = row['predicate_uuid']
                content = row['content']
                if uuid not in dict_rows:
                    dict_rows[uuid] = {}
                if pred_uuid not in dict_rows[uuid]:
                    dict_rows[uuid][pred_uuid] = []
                if isinstance(content, str):
                    dict_rows[uuid][pred_uuid].append(content)
            output = {'pred_ents': pred_uuid_objs, 'data': dict_rows}
        return output
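
The returned structure pairs predicate metadata (`pred_ents`) with per-record values (`data`). A sketch of reading one record's string values back out (uuids and values invented for illustration):

output = {
    'pred_ents': {
        'pred-uuid-1': {'rec_attribute': 'note', 'property': 'Note',
                        'pred_uuid': 'pred-uuid-1', 'slug': 'note'},
    },
    'data': {
        'item-uuid-1': {'pred-uuid-1': ['First note.', 'Second note.']},
    },
}
for pred_uuid, ent in output['pred_ents'].items():
    values = output['data'].get('item-uuid-1', {}).get(pred_uuid, [])
    print(ent['property'] + ':', '; '.join(values))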

    def get_string_attributes_sql(self, uuid_list, pred_uuid_list):
        """ executes SQL query to get strings for the solr uuids and predicates """
        q_uuids = self.make_query_uuids(uuid_list)
        p_uuids = self.make_query_uuids(pred_uuid_list)
        query = (
            'SELECT ass.uuid AS uuid, ass.predicate_uuid AS predicate_uuid, '
            's.content AS content '
            'FROM oc_assertions AS ass '
            'JOIN oc_strings AS s ON ass.object_uuid = s.uuid '
            'WHERE ass.uuid IN (' + q_uuids + ') AND '
            'ass.predicate_uuid IN (' + p_uuids + ') '
            'ORDER BY ass.uuid, ass.predicate_uuid, s.content; ')
        cursor = connection.cursor()
        # parameters are supplied separately so the driver quotes them safely
        cursor.execute(query, uuid_list + pred_uuid_list)
        rows = self.dictfetchall(cursor)
        return rows