Code example #1
0
File: equivalence.py  Project: rdhyee/open-context-py
 def get_identifier_list_variants(self, id_list):
     """Return alternate identifier forms for each identifier in id_list.

     Each identifier is kept, and where possible supplemented with:
     an Open Context UUID and a prefixed form (for full URIs), a full
     URI (for prefixed identifiers), or the entity's URI and prefixed
     URI (for UUIDs / slugs resolvable via the memory cache).

     :param id_list: a list of identifiers, or a single value that will
         be coerced to a one-element list of its string form
     :return: list of identifier variants
     """
     if not isinstance(id_list, list):
         id_list = [str(id_list)]
     variants = []
     for identifier in id_list:
         variants.append(identifier)
         if identifier.startswith(('http://', 'https://')):
             # A full URI: try to derive a UUID and a prefixed form.
             oc_uuid = URImanagement.get_uuid_from_oc_uri(identifier)
             if oc_uuid:
                 variants.append(oc_uuid)
             prefix_id = URImanagement.prefix_common_uri(identifier)
             if prefix_id:
                 variants.append(prefix_id)
         elif ':' in identifier:
             # A prefixed identifier: expand to a full URI.
             variants.append(
                 URImanagement.convert_prefix_to_full_uri(identifier)
             )
         else:
             # probably an open context uuid or a slug
             ent = MemoryCache().get_entity(identifier)
             if ent:
                 variants.append(ent.uri)
                 prefix_uri = URImanagement.prefix_common_uri(ent.uri)
                 if prefix_uri != ent.uri:
                     variants.append(prefix_uri)
     return variants
Code example #2
0
    def add_entity_item_to_act_filter(
        self,
        lookup_val,
        act_filter,
        is_spatial_context=False,
        look_up_mapping_dict=None,
    ):
        """Looks up a entity item to add to an act_filter.

        :param lookup_val: raw filter value; may carry a related-entity
            prefix and/or contain the OR operator between several values
        :param act_filter: dict describing the active filter; updated
            in place with label, related-property flag, and (for a
            single match) URI and slug
        :param is_spatial_context: if True, resolve values as spatial
            contexts rather than generic entities
        :param look_up_mapping_dict: optional mapping applied to
            lookup_val before entity lookup
        :return: tuple of (act_filter, last found entity or None)
        """
        lookup_val = str(lookup_val)

        if lookup_val.startswith(configs.RELATED_ENTITY_ID_PREFIX):
            # Strip off the related property prefix. Note that this
            # is a related property.
            lookup_val = lookup_val[len(configs.RELATED_ENTITY_ID_PREFIX):]
            act_filter['oc-api:related-property'] = True

        # Map the lookup_val to a mapping dict
        if look_up_mapping_dict:
            lookup_val = look_up_mapping_dict.get(lookup_val, lookup_val)

        m_cache = MemoryCache()
        items = []
        if configs.REQUEST_OR_OPERATOR in lookup_val:
            lookup_list = lookup_val.split(configs.REQUEST_OR_OPERATOR)
        else:
            lookup_list = [lookup_val]

        for act_val in lookup_list:
            if is_spatial_context:
                item = m_cache.get_entity_by_context(act_val)
            else:
                item = m_cache.get_entity(act_val)
            if not item:
                continue
            items.append(item)

        if not items:
            # We didn't find any item entities, so return
            # the lookup list as the label.
            act_filter['label'] = ' OR '.join(lookup_list)
            return act_filter, None

        # Use all the item labels to make a label.
        item_labels = [item.label for item in items]
        act_filter['label'] = ' OR '.join(item_labels)

        if len(items) == 1:
            # We only have 1 item, so define it with a
            # URI and slug.
            act_filter['rdfs:isDefinedBy'] = items[0].uri
            act_filter['oc-api:filter-slug'] = items[0].slug

        # BUG FIX: previously this returned the loop variable `item`,
        # which is None/False whenever the *last* lookup failed, even
        # though earlier lookups succeeded. Return the last entity that
        # was actually found instead.
        return act_filter, items[-1]
Code example #3
0
File: filters.py  Project: rdhyee/open-context-py
class ActiveFilters():

    """ Methods to show search / query filters in use """
    TEXT_SEARCH_TITLE = 'Current Text Search Filter'

    # Request parameters that configure a search (depth, paging, sort)
    # but do not themselves represent filters.
    IGNORE_PARAMS = ['geodeep',
                     'chronodeep',
                     'sort',
                     'rows',
                     'start']

    def __init__(self):
        self.m_cache = MemoryCache()  # memory caching object
        self.base_search_link = '/search/'
        # delimiter between levels of a hierarchic parameter value
        self.hierarchy_delim = '---'

    def add_filters_json(self, request_dict):
        """ adds JSON describing search filters

            For each request parameter that represents an active filter,
            builds an ordered dict with an id, a human-readable label,
            (when resolvable) a defining URI, and 'remove' links that
            re-issue the search without that filter.

            :param request_dict: dict of request parameters; 'path' maps
                to a single string, other keys map to lists of values
            :return: list of filter description dicts
        """
        fl = FilterLinks()
        fl.base_search_link = self.base_search_link
        filters = []
        string_fields = []  # so we have an interface for string searches
        i = 0
        for param_key, param_vals in request_dict.items():
            if param_key == 'path':
                # Spatial context path filter; param_vals is one string.
                if param_vals:
                    i += 1
                    f_entity = self.m_cache.get_entity(param_vals)
                    label = http.urlunquote_plus(param_vals)
                    act_filter = LastUpdatedOrderedDict()
                    act_filter['id'] = '#filter-' + str(i)
                    act_filter['oc-api:filter'] = 'Context'
                    act_filter['label'] = label.replace('||', ' OR ')
                    if f_entity:
                        act_filter['rdfs:isDefinedBy'] = f_entity.uri
                    # generate a request dict without the context filter
                    rem_request = fl.make_request_sub(request_dict,
                                                      param_key,
                                                      param_vals)
                    act_filter['oc-api:remove'] = fl.make_request_url(rem_request)
                    act_filter['oc-api:remove-json'] = fl.make_request_url(rem_request, '.json')
                    filters.append(act_filter)
            else:
                # All other parameters map to lists of values; each value
                # becomes its own filter entry.
                for param_val in param_vals:
                    i += 1
                    remove_geodeep = False
                    act_filter = LastUpdatedOrderedDict()
                    act_filter['id'] = '#filter-' + str(i)
                    # Split hierarchic parameter values into their levels.
                    if self.hierarchy_delim in param_val:
                        all_vals = param_val.split(self.hierarchy_delim)
                    else:
                        all_vals = [param_val]
                    if param_key == 'proj':
                        # projects, only care about the last item in the parameter value
                        act_filter['oc-api:filter'] = 'Project'
                        label_dict = self.make_filter_label_dict(all_vals[-1])
                        act_filter['label'] = label_dict['label']
                        if len(label_dict['entities']) == 1:
                            act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri
                    elif param_key == 'prop':
                        # prop, the first item is the filter-label
                        # the last is the filter
                        act_filter['label'] = False
                        if len(all_vals) < 2:
                            act_filter['oc-api:filter'] = 'Description'
                            act_filter['oc-api:filter-slug'] = all_vals[0]
                        else:
                            filt_dict = self.make_filter_label_dict(all_vals[0])
                            act_filter['oc-api:filter'] = filt_dict['label']
                            if 'slug' in filt_dict:
                                act_filter['oc-api:filter-slug'] = filt_dict['slug']
                            if filt_dict['data-type'] == 'string':
                                act_filter['label'] = 'Search Term: \'' + all_vals[-1] + '\''
                        # Fall back to labeling by the last hierarchy level.
                        if act_filter['label'] is False:
                            label_dict = self.make_filter_label_dict(all_vals[-1])
                            act_filter['label'] = label_dict['label']
                    elif param_key == 'type':
                        act_filter['oc-api:filter'] = 'Open Context Type'
                        if all_vals[0] in QueryMaker.TYPE_MAPPINGS:
                            type_uri = QueryMaker.TYPE_MAPPINGS[all_vals[0]]
                            label_dict = self.make_filter_label_dict(type_uri)
                            act_filter['label'] = label_dict['label']
                        else:
                            act_filter['label'] = all_vals[0]
                    elif param_key == 'q':
                        # Full-text search filter.
                        act_filter['oc-api:filter'] = self.TEXT_SEARCH_TITLE
                        act_filter['label'] = 'Search Term: \'' + all_vals[0] + '\''
                    elif param_key == 'id':
                        act_filter['oc-api:filter'] = 'Identifier Lookup'
                        act_filter['label'] = 'Identifier: \'' + all_vals[0] + '\''
                    elif param_key == 'form-chronotile':
                        # Chronology tile path; decode into a date range.
                        act_filter['oc-api:filter'] = 'Time of formation, use, or life'
                        chrono = ChronoTile()
                        dates = chrono.decode_path_dates(all_vals[0])
                        if isinstance(dates, dict):
                            act_filter['label'] = 'Time range: ' + str(dates['earliest_bce'])
                            act_filter['label'] += ' to ' + str(dates['latest_bce'])
                    elif param_key == 'form-start':
                        act_filter['oc-api:filter'] = 'Earliest formation, use, or life date'
                        try:
                            val_date = int(float(all_vals[0]))
                        except:
                            val_date = False
                        if val_date is False:
                            act_filter['label'] = '[Invalid year]'
                        elif val_date < 0:
                            # negative years are BCE
                            act_filter['label'] = str(val_date * -1) + ' BCE'
                        else:
                            act_filter['label'] = str(val_date) + ' CE'
                    elif param_key == 'form-stop':
                        act_filter['oc-api:filter'] = 'Latest formation, use, or life date'
                        try:
                            val_date = int(float(all_vals[0]))
                        except:
                            val_date = False
                        if val_date is False:
                            act_filter['label'] = '[Invalid year]'
                        elif val_date < 0:
                            # negative years are BCE
                            act_filter['label'] = str(val_date * -1) + ' BCE'
                        else:
                            act_filter['label'] = str(val_date) + ' CE'
                    elif param_key == 'disc-geotile':
                        act_filter['oc-api:filter'] = 'Location of discovery or observation'
                        act_filter['label'] = self.make_geotile_filter_label(all_vals[0])
                        # the geodeep param is tied to this filter; drop it too
                        remove_geodeep = True
                    elif param_key == 'disc-bbox':
                        act_filter['oc-api:filter'] = 'Location of discovery or observation'
                        act_filter['label'] = self.make_bbox_filter_label(all_vals[0])
                        remove_geodeep = True
                    elif param_key == 'images':
                        act_filter['oc-api:filter'] = 'Has related media'
                        act_filter['label'] = 'Linked to images'
                    elif param_key == 'other-media':
                        act_filter['oc-api:filter'] = 'Has related media'
                        act_filter['label'] = 'Linked to media (other than images)'
                    elif param_key == 'documents':
                        act_filter['oc-api:filter'] = 'Has related media'
                        act_filter['label'] = 'Linked to documents'
                    elif param_key == 'dc-subject':
                        # Dublin Core subject metadata filter.
                        act_filter['oc-api:filter'] = 'Has subject metadata'
                        label_dict = self.make_filter_label_dict(all_vals[-1])
                        if len(label_dict['label']) > 0:
                            act_filter['label'] = label_dict['label']
                        if 'tdar' == all_vals[-1] or 'tdar*' == all_vals[-1]:
                            act_filter['label'] = 'tDAR defined metadata record(s)'
                        if len(label_dict['entities']) == 1:
                            act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri
                            if label_dict['entities'][0].vocabulary is not False:
                                act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary
                    elif param_key == 'dc-spatial':
                        act_filter['oc-api:filter'] = 'Has spatial metadata'
                        label_dict = self.make_filter_label_dict(all_vals[-1])
                        if len(label_dict['label']) > 0:
                            act_filter['label'] = label_dict['label']
                        if len(label_dict['entities']) == 1:
                            act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri
                            if label_dict['entities'][0].vocabulary is not False:
                                act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary
                    elif param_key == 'dc-coverage':
                        act_filter['oc-api:filter'] = 'Has coverage / period metadata'
                        label_dict = self.make_filter_label_dict(all_vals[-1])
                        if len(label_dict['label']) > 0:
                            act_filter['label'] = label_dict['label']
                        if len(label_dict['entities']) == 1:
                            act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri
                            if label_dict['entities'][0].vocabulary is not False:
                                act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary
                    elif param_key == 'dc-temporal':
                        act_filter['oc-api:filter'] = 'Has temporal coverage'
                        label_dict = self.make_filter_label_dict(all_vals[-1])
                        if len(label_dict['label']) > 0:
                            act_filter['label'] = label_dict['label']
                            if len(label_dict['entities']) == 1: 
                                if label_dict['entities'][0].entity_type == 'vocabulary':
                                    act_filter['label'] = 'Concepts defined by: ' + label_dict['label']
                            # NOTE(review): this 'periodo' elif pairs with the
                            # entity-count check above, so it only fires when a
                            # label was found but there was not exactly one
                            # entity — confirm this nesting is intended.
                            elif 'periodo' in all_vals[-1]:
                                act_filter['label'] = 'PeriodO defined concepts'
                        if len(label_dict['entities']) == 1:
                            act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri
                            if label_dict['entities'][0].vocabulary is not False\
                               and label_dict['entities'][0].vocabulary != label_dict['label']:
                                act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary
                    elif param_key == 'obj':
                        act_filter['oc-api:filter'] = 'Links (in some manner) to object'
                        label_dict = self.make_filter_label_dict(all_vals[-1])
                        if len(label_dict['label']) > 0:
                            act_filter['label'] = label_dict['label']
                        if len(label_dict['entities']) == 1:
                            act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri
                            if label_dict['entities'][0].vocabulary is not False:
                                act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary
                    elif param_key == 'dc-isReferencedBy':
                        act_filter['oc-api:filter'] = 'Is referenced by'
                        label_dict = self.make_filter_label_dict(all_vals[-1])
                        if len(label_dict['label']) > 0:
                            act_filter['label'] = label_dict['label']
                        if len(label_dict['entities']) == 1:
                            act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri
                            if label_dict['entities'][0].vocabulary is not False\
                               and label_dict['entities'][0].vocab_uri != label_dict['entities'][0].uri:
                                act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary
                    elif param_key == 'linked' and all_vals[-1] == 'dinaa-cross-ref':
                        act_filter['oc-api:filter'] = 'Has cross references'
                        act_filter['label'] = 'Links to, or with, DINAA curated site files'
                    else:
                        # Unrecognized parameter: not an active filter.
                        act_filter = False
                    if act_filter is not False:
                        # Add links that re-issue the search without this
                        # filter value.
                        rem_request = fl.make_request_sub(request_dict,
                                                          param_key,
                                                          param_val)
                        if 'geodeep' in rem_request and remove_geodeep:
                            rem_request.pop('geodeep', None)    
                        act_filter['oc-api:remove'] = fl.make_request_url(rem_request)
                        act_filter['oc-api:remove-json'] = fl.make_request_url(rem_request, '.json')
                        filters.append(act_filter)
        return filters

    def make_geotile_filter_label(self, raw_geotile):
        """ parses a raw bbox parameter value to make
            a filter label

            :param raw_geotile: quadtree tile path(s), '||'-delimited for OR
            :return: human-readable label string; invalid tiles noted
        """
        output_list = []
        if '||' in raw_geotile:
            tile_list = raw_geotile.split('||')
        else:
            tile_list = [raw_geotile]
        for tile in tile_list:
            geotile = GlobalMercator()
            coordinates = geotile.quadtree_to_lat_lon(tile)
            if coordinates is not False:
                # coordinates appear to be (SW pair, NE pair) — TODO confirm
                # against GlobalMercator.quadtree_to_lat_lon.
                label = 'In the region bounded by: '
                label += str(round(coordinates[0], 3))
                label += ', ' + str(round(coordinates[1], 3))
                label += ' (SW) and ' + str(round(coordinates[2], 3))
                label += ', ' + str(round(coordinates[3], 3))
                label += ' (NE)'
                output_list.append(label)
            else:
                output_list.append('[Ignored invalid geospatial tile]')
        output = '; or '.join(output_list)
        return output

    def make_bbox_filter_label(self, raw_disc_bbox):
        """ parses a raw bbox parameter value to make
            a filter label

            :param raw_disc_bbox: 'lon,lat,lon,lat' bounding box value(s),
                '||'-delimited for OR
            :return: human-readable label string; invalid boxes noted
        """
        qm = QueryMaker()
        output_list = []
        if '||' in raw_disc_bbox:
            bbox_list = raw_disc_bbox.split('||')
        else:
            bbox_list = [raw_disc_bbox]
        for bbox in bbox_list:
            if ',' in bbox:
                bbox_coors = bbox.split(',')
                # (sic) 'coordiantes' typo is in QueryMaker's method name.
                bbox_valid = qm.validate_bbox_coordiantes(bbox_coors)
                if bbox_valid:
                    label = 'In the bounding-box of: Latitude '
                    label += str(bbox_coors[1])
                    label += ', Longitude ' + str(bbox_coors[0])
                    label += ' (SW) and Latitude ' + str(bbox_coors[3])
                    label += ', Longitude ' + str(bbox_coors[2])
                    label += ' (NE)'
                    output_list.append(label)
                else:
                    output_list.append('[Ignored invalid bounding-box]')
            else:
                output_list.append('[Ignored invalid bounding-box]')
        output = '; or '.join(output_list)
        return output

    def make_filter_label_dict(self, act_val):
        """ returns a dictionary object
            with a label and set of entities (in cases of OR
            searchs)

            :param act_val: raw filter value, possibly '||'-delimited
            :return: dict with 'label', 'data-type', 'slug', 'entities'
        """
        related_suffix = ''
        output = {'label': False,
                  'data-type': 'id',
                  'slug': False,
                  'entities': []}
        labels = []
        if '||' in act_val:
            vals = act_val.split('||')
        else:
            vals = [act_val]
        for val in vals:
            qm = QueryMaker()
            db_val = qm.clean_related_slug(val)
            if val != db_val:
                # the value carried a related-item marker
                related_suffix = ' (for related items)'
            f_entity = self.m_cache.get_entity(db_val)
            if f_entity:
                # get the solr field data type
                ent_solr_data_type = qm.get_solr_field_type(f_entity.data_type)
                if ent_solr_data_type is not False \
                   and ent_solr_data_type != 'id':
                    output['data-type'] = ent_solr_data_type
                labels.append(f_entity.label)
                output['entities'].append(f_entity)
            else:
                # unresolvable; use the raw value as its own label
                labels.append(val)
        output['label'] = (' OR '.join(labels)) + related_suffix
        output['slug'] = '-or-'.join(vals)
        return output
Code example #4
0
class RecordProperties():
    """ Methods to make properties for individual record items
        useful for making geospatial feature records or
        lists of items without geospatial data
    """
    ATTRIBUTE_DELIM = '; '  # delimiter for multiple attributes

    def __init__(self, request_dict_json=False):
        """Initialize record-property state.

        :param request_dict_json: JSON-encoded request dict, or False
            when no request context applies
        """
        self.uuid = False
        self.uri = False  # canonical uri for the item
        self.href = False  # link to the item in the current deployment
        self.cite_uri = False  # stable / persistent uri
        self.label = False
        self.item_type = False
        self.updated = False
        self.published = False
        self.project_href = False  # link to the project in current deployment
        self.project_uri = False  # canonical uri for the project
        self.project_label = False
        self.context_href = False  # link to parent context in current deployment
        self.context_uri = False  # link to parent context canonical uri
        self.context_label = False
        self.category = False
        self.latitude = False
        self.longitude = False
        self.geojson = False
        self.early_date = False
        self.late_date = False
        self.human_remains_flagged = False  # flagged as relating to human remains
        self.thumbnail_href = False
        self.thumbnail_uri = False
        self.thumbnail_scr = False
        self.preview_scr = False
        self.fullfile_scr = False
        self.snippet = False
        # NOTE: a duplicate `self.cite_uri = False` assignment was removed;
        # the attribute is already initialized above.
        self.other_attributes = False  # other attributes to the record
        # flatten list of an attribute values to single value
        self.flatten_rec_attributes = False
        # A list of (non-standard) attributes to include in a record
        self.rec_attributes = []
        self.attribute_hierarchies = {}
        # NOTE: a dead `self.base_url = settings.CANONICAL_HOST` assignment
        # was removed; rp.get_baseurl() immediately overwrote it.
        rp = RootPath()
        self.base_url = rp.get_baseurl()
        self.m_cache = MemoryCache()  # memory caching object
        self.s_cache = SearchGenerationCache()  # supplemental caching object, specific for searching
        self.request_dict_json = request_dict_json
        if request_dict_json is not False:
            self.request_dict = json.loads(request_dict_json)
        else:
            self.request_dict = False
        self.highlighting = False
        self.recursive_count = 0
        self.min_date = False
        self.max_date = False
        self.thumbnail_data = {}
        self.media_file_data = {}
        self.string_attrib_data = {}

    def parse_solr_record(self, solr_rec):
        """ Parses a solr rec object """
        if isinstance(solr_rec, dict):
            self.get_item_basics(solr_rec)
            self.get_citation_uri(solr_rec)
            self.get_lat_lon(solr_rec)
            self.get_category(solr_rec)
            self.get_project(solr_rec)
            self.get_context(solr_rec)
            self.get_time(solr_rec)  # get time information, limiting date ranges to query constaints
            self.get_thumbnail(solr_rec)
            self.get_media_files(solr_rec)
            self.get_snippet(solr_rec)  # get snippet of highlighted text
            self.get_attributes(solr_rec)  # get non-standard attributes
            self.get_string_attributes(solr_rec)  # get non-standard string attributes

    def get_item_basics(self, solr_rec):
        """ get basic metadata for an item """
        output = False
        if isinstance(solr_rec, dict):
            if 'uuid' in solr_rec:
                self.uuid = solr_rec['uuid']
            if 'slug_type_uri_label' in solr_rec:
                id_parts = self.parse_solr_value_parts(solr_rec['slug_type_uri_label'])
                if id_parts is not False:
                    output = True
                    self.uri = self.make_url_from_val_string(id_parts['uri'], True)
                    self.href = self.make_url_from_val_string(id_parts['uri'], False)
                    item_type_output = URImanagement.get_uuid_from_oc_uri(self.uri, True)
                    self.item_type = item_type_output['item_type']
                    self.label = id_parts['label']
            if 'updated' in solr_rec:
                self.updated = solr_rec['updated']
            if 'published' in solr_rec:
                self.published = solr_rec['published']
            if 'human_remains' in solr_rec:
                # is the record flagged as related to human remains ?human_remains
                if solr_rec['human_remains'] > 0:
                    self.human_remains_flagged = True
        return output

    def get_snippet(self, solr_rec):
        """ get a text highlighting snippet

            Reads solr highlighting (self.highlighting, a dict keyed by
            uuid) for this record, strips stray HTML markup fragments,
            and stores the result in self.snippet with solr's <em>
            highlight markers preserved.
        """
        if isinstance(self.highlighting, dict):
            if self.uuid is False:
                # uuid not yet set; try to get it from the solr record
                if 'uuid' in solr_rec:
                    self.uuid = solr_rec['uuid']
            if self.uuid in self.highlighting:
                if 'text' in self.highlighting[self.uuid]:
                    text_list = self.highlighting[self.uuid]['text']
                    self.snippet = ' '.join(text_list)
                    # some processing to remove fragments of HTML markup.
                    # Protect solr's <em> highlight markers with placeholder
                    # tokens so the markup stripping below keeps them.
                    self.snippet = self.snippet.replace('<em>', '[[[[mark]]]]')
                    self.snippet = self.snippet.replace('</em>', '[[[[/mark]]]]')
                    try:
                        # Wrap and parse as HTML to drop other markup.
                        self.snippet = '<div>' + self.snippet + '</div>'
                        self.snippet = lxml.html.fromstring(self.snippet).text_content()
                        self.snippet = strip_tags(self.snippet)
                    except:
                        # Parsing failed; fall back to tag stripping only.
                        # NOTE(review): if the '<div>' wrap succeeded before
                        # the failure, the wrapper remains in the snippet.
                        self.snippet = strip_tags(self.snippet)
                    # Restore the highlight markers as <em> tags.
                    self.snippet = self.snippet.replace('[[[[mark]]]]', '<em>')
                    self.snippet = self.snippet.replace('[[[[/mark]]]]', '</em>')

    def get_citation_uri(self, solr_rec):
        """ gets the best citation / persistent uri for the item """
        if 'persistent_uri' in solr_rec:
            for p_uri in solr_rec['persistent_uri']:
                self.cite_uri = p_uri
                if 'dx.doi.org' in p_uri:
                    break  # stop looking once we have a DOI, the best

    def get_lat_lon(self, solr_rec):
        """ gets latitute and longitude information """
        if 'discovery_geolocation' in solr_rec:
            geo_strings = solr_rec['discovery_geolocation']
            geo_coords_str = geo_strings.split(',')
            # NOT geojson ording, since solr uses lat/lon ordering
            self.latitude = float(geo_coords_str[0])
            self.longitude = float(geo_coords_str[1]) 

    def get_category(self, solr_rec):
        """ Gets the most specific category for the item """
        self.recursive_count = 0
        cat_hierarchy = self.get_category_hierarchy(solr_rec)
        if len(cat_hierarchy) > 0:
            self.category = cat_hierarchy[-1]['label']

    def get_context(self, solr_rec):
        """Set context label / uri / href from the record's context hierarchy."""
        # reset recursion guard used by hierarchy extraction
        self.recursive_count = 0
        contexts = self.extract_hierarchy(
            solr_rec,
            SolrDocument.ROOT_CONTEXT_SOLR,
            '___context',
            []
        )
        if contexts:
            self.context_label = self.make_context_path_label(contexts)
            self.context_uri = self.make_context_link(contexts, True)
            self.context_href = self.make_context_link(contexts, False)

    def get_project(self, solr_rec):
        """Set project label / uri / href from the record's project hierarchy."""
        # reset recursion guard used by hierarchy extraction
        self.recursive_count = 0
        projects = self.extract_hierarchy(
            solr_rec,
            SolrDocument.ROOT_PROJECT_SOLR,
            '___project',
            []
        )
        if projects:
            # the last entry is the most specific project
            last = projects[-1]
            self.project_label = last['label']
            self.project_uri = self.make_url_from_val_string(last['uri'], True)
            self.project_href = self.make_url_from_val_string(last['uri'], False)

    def get_time(self, solr_rec):
        """ parses time information """
        early_list = False
        late_list = False
        if 'form_use_life_chrono_earliest' in solr_rec:
            early_list = solr_rec['form_use_life_chrono_earliest']
        if 'form_use_life_chrono_latest' in solr_rec:
            late_list = solr_rec['form_use_life_chrono_latest']
        if isinstance(early_list, list):
            date_list = early_list
        else:
            date_list = []
        if isinstance(late_list, list):
            date_list += late_list
        if len(date_list) > 0:
            min_max = self.get_list_min_max(date_list)
            self.early_date = min(min_max)
            self.late_date = max(min_max)

    def get_list_min_max(self, date_list):
        """ Returns the minimum and maximum dates
            from a date list, constrained by
            preset min and max dates
        """
        min_date = False
        max_date = False
        # print(str(date_list))
        if isinstance(date_list, list):
            date_list.sort()
            for date in date_list:
                if self.min_date is not False:
                    if date >= self.min_date \
                       and min_date is False:
                        min_date = date
                if self.max_date is not False:
                    if date <= self.max_date:
                        max_date = date
        if min_date is False:
            min_date = self.min_date
        if max_date is False:
            max_date = self.max_date
        return [min_date, max_date]

    def get_thumbnail(self, solr_rec):
        """Set thumbnail href / uri / src from precached data, falling
        back to a database lookup when the uuid was not precached."""
        if 'uuid' not in solr_rec:
            return
        uuid = solr_rec['uuid']
        if uuid not in self.thumbnail_data:
            # did not precache thumbnail data, get an individual record
            self.get_thumbnail_from_database(solr_rec)
            return
        cached = self.thumbnail_data[uuid]
        if cached is not False:
            self.thumbnail_href = cached['href']
            self.thumbnail_uri = cached['uri']
            self.thumbnail_scr = cached['scr']
            self.thumbnail_scr = RootPath().convert_to_https(self.thumbnail_scr)

    def get_media_files(self, solr_rec):
        """Set thumbnail / preview / fullfile sources from precached
        media file data, converting each URI to https."""
        if 'uuid' not in solr_rec:
            return
        uuid = solr_rec['uuid']
        files = self.media_file_data.get(uuid, False)
        if files is False:
            return
        rp = RootPath()
        # map solr media file types to the attributes they populate
        attr_by_type = {
            'oc-gen:thumbnail': 'thumbnail_scr',
            'oc-gen:preview': 'preview_scr',
            'oc-gen:fullfile': 'fullfile_scr',
        }
        for file_type, file_uri in files.items():
            attr = attr_by_type.get(file_type)
            if attr:
                setattr(self, attr, rp.convert_to_https(file_uri))

    def get_thumbnail_from_database(self, solr_rec):
        """Look up this record's thumbnail media file in the database
        and set thumbnail href / uri / src when found."""
        if 'uuid' not in solr_rec:
            return
        uuid = solr_rec['uuid']
        thumb = []
        if self.item_type == 'media':
            # media items carry their own thumbnail files
            muuid = uuid
            thumb = Mediafile.objects\
                             .filter(uuid=uuid,
                                     file_type='oc-gen:thumbnail')[:1]
        else:
            # find a media item asserted for this record, then its thumbnail
            media_item = Assertion.objects\
                                  .filter(uuid=uuid,
                                          object_type='media')[:1]
            if len(media_item) > 0:
                muuid = media_item[0].object_uuid
                thumb = Mediafile.objects\
                                 .filter(uuid=muuid,
                                         file_type='oc-gen:thumbnail')[:1]
        if len(thumb) > 0:
            self.thumbnail_href = self.base_url + '/media/' + muuid
            self.thumbnail_uri = settings.CANONICAL_HOST + '/media/' + muuid
            self.thumbnail_scr = thumb[0].file_uri

    def get_category_hierarchy(self, solr_rec):
        """Return the category hierarchy for an item, general to specific.

        Builds the root category solr field from the record's item_type and
        walks the '___pred' hierarchy. Returns an empty list when the
        record has no 'item_type' field.
        """
        if 'item_type' not in solr_rec:
            return []
        root_cat_field = 'oc_gen_' + solr_rec['item_type'][0] + '___pred_id'
        return self.extract_hierarchy(solr_rec,
                                      root_cat_field,
                                      '___pred',
                                      [])

    """ The following seciton of code
        processes non-default attributes for records
    """
    def get_attributes(self, solr_rec):
        """ gets attributes for a record, based on the
            predicates requested in the search
            and optional predicates passed by a client
            with a GET request with parameter 'attributes'

            Populates self.other_attributes as a list of dicts with
            'property', 'values_list', and 'value' keys.
        """
        qm = QueryMaker()
        # NOTE(review): solr_field_entities appears unused in this method.
        solr_field_entities = {}
        for attribute in self.rec_attributes:
            entity = self.m_cache.get_entity(attribute)
            if entity:
                prop_slug = entity.slug
                # check to make sure we have the entity data type for linked fields
                if entity.data_type is False and entity.item_type == 'uri':
                    dtypes = self.s_cache.get_dtypes(entity.uri)
                    if isinstance(dtypes, list):
                        # set te data type and the act-field
                        # print('Found for ' + prop_slug + ' ' + dtypes[0])
                        entity.data_type = dtypes[0]
                field_parts = qm.make_prop_solr_field_parts(entity)
                solr_field = field_parts['prefix'] + '___pred_' + field_parts['suffix']
                # print('Found: ' + solr_field)
                # extract children of the solr_field so we know if
                # we have the most specific attributes, then we can get
                # values for the most specific attributes
                self.extract_attribute_children(solr_rec, solr_field)
        self.clean_attribute_hiearchies()
        if isinstance(self.attribute_hierarchies, dict):
            self.other_attributes = []
            for field_slug_key, values in self.attribute_hierarchies.items():
                entity = self.m_cache.get_entity(field_slug_key)
                if entity:
                    attribute_dict = LastUpdatedOrderedDict()
                    attribute_dict['property'] = entity.label
                    attribute_dict['values_list'] = []
                    attribute_dict['value'] = ''
                    string_val = False
                    delim = ''
                    # Build both a list of value labels and a single
                    # delimiter-joined string of the values.
                    for val in values:
                        if isinstance(val, str):
                            string_val = True
                            parsed_val = self.parse_solr_value_parts(val)
                            attribute_dict["values_list"].append(parsed_val['label'])
                            attribute_dict['value'] += delim + str(parsed_val['label'])
                        else:
                            attribute_dict["values_list"].append(val)
                            attribute_dict['value'] += delim + str(val)
                        delim = self.ATTRIBUTE_DELIM
                    # A single non-string value is kept as-is (not stringified).
                    if len(values) == 1 \
                       and string_val is False:
                        attribute_dict['value'] = values[0]
                    self.other_attributes.append(attribute_dict)

    def get_string_attributes(self, solr_rec):
        """Add string-predicate attributes gathered from a prior database query.

        Solr does not cache string field data, so these values come from
        self.string_attrib_data instead of the solr record itself.
        """
        if not isinstance(self.string_attrib_data, dict):
            return
        if 'uuid' not in solr_rec or 'data' not in self.string_attrib_data:
            return
        item_data = self.string_attrib_data['data'].get(solr_rec['uuid'])
        if item_data is None:
            return
        # Add predicate attributes for string predicates, from the database.
        for pred_uuid, values_list in item_data.items():
            act_attribute = self.string_attrib_data['pred_ents'][pred_uuid]
            act_attribute['values_list'] = values_list
            act_attribute['value'] = self.ATTRIBUTE_DELIM.join(values_list)
            self.other_attributes.append(act_attribute)

    def prevent_attribute_key_collision(self, item_prop_dict, prop_key):
        """Return a variant of prop_key not already present in item_prop_dict.

        If prop_key is taken, try 'prop_key[2]', 'prop_key[3]', ... until a
        free key is found.
        """
        candidate = prop_key
        suffix_num = 2
        while candidate in item_prop_dict:
            candidate = '{}[{}]'.format(prop_key, suffix_num)
            suffix_num += 1
        return candidate

    def clean_attribute_hiearchies(self):
        """ some post-processing to make sure
            we have clean attribute hierarchies

            Keeps only fields whose values have no more-specific child
            field present, then rebuilds self.attribute_hierarchies keyed
            by predicate slug, ordered: id fields, numeric, then date.
        """
        if isinstance(self.attribute_hierarchies, dict):
            # print('check: ' + str(self.attribute_hierarchies))
            temp_attribute_hierarchies = self.attribute_hierarchies
            clean_attribute_hiearchies = {}
            # Phase 1: keep only fields that are truly at the most
            # specific level of their hierarchy.
            for solr_field_key, field_char in self.attribute_hierarchies.items():
                if field_char['most-specific']:
                    par_field_ex = solr_field_key.split('___')
                    # last two parts make the suffix, a pred-slug[-2] and a field type [-1]
                    pred_suffix = '___' + par_field_ex[-2] + '___' + par_field_ex[-1]
                    specific_ok = True
                    for val in field_char['values']:
                        if isinstance(val, str):
                            #  print('check:' + solr_field_key + ' val: ' + val)
                            parsed_val = self.parse_solr_value_parts(val)
                            check_field = parsed_val['slug'].replace('-', '_')
                            check_field += '___pred_' + parsed_val['data_type']
                            if check_field in temp_attribute_hierarchies:
                                # note a field is NOT at the most specific level
                                specific_ok = False
                            else:
                                # now check a version with the predicate as part of
                                # the solr field
                                check_field = parsed_val['slug'].replace('-', '_')
                                check_field += pred_suffix
                                if check_field in temp_attribute_hierarchies:
                                    # note a field is NOT at the most specific level
                                    specific_ok = False
                    if specific_ok:
                        # ok to add
                        # print('checked OK: ' + solr_field_key)
                        clean_attribute_hiearchies[solr_field_key] = field_char
            # now that we got rid of problem fields, lets sort these for consistent
            # rendering
            # Phase 2: regroup by predicate slug in a stable, sorted order.
            self.attribute_hierarchies = LastUpdatedOrderedDict()
            keys = LastUpdatedOrderedDict()
            # order of key types, we want id fields, followed by numeric then date
            key_types = ['___pred_id',
                         '___pred_numeric',
                         '___pred_date']
            for key_type in key_types:
                keys[key_type] = []
                for solr_field_key, field_char in clean_attribute_hiearchies.items():
                    if key_type in solr_field_key:
                        keys[key_type].append(solr_field_key)
                # sort alphabetically. Slugs useful, since they will cluster predicates
                # from similar vocabularies
                keys[key_type].sort()
                for key in keys[key_type]:
                    field_char = clean_attribute_hiearchies[key]
                    field_ex = key.split('___')
                    # the penultimate part is the predicate
                    field_slug = field_ex[-2].replace('_', '-')
                    if field_slug not in self.attribute_hierarchies:
                        self.attribute_hierarchies[field_slug] = []
                    # De-duplicate values while preserving insertion order.
                    for val in field_char['values']:
                        if val not in self.attribute_hierarchies[field_slug]:
                            self.attribute_hierarchies[field_slug].append(val)

    def extract_attribute_children(self,
                                   solr_rec,
                                   solr_field_key):
        """ extracts ALL children from the hiearchy of
            a solr_field_key

            Records each visited field in self.attribute_hierarchies,
            marking fields whose values have no further child fields as
            'most-specific'. Returns True when solr_field_key is present
            in solr_rec, False otherwise.
        """
        is_field = False
        if solr_field_key not in self.attribute_hierarchies:
            # so we don't look at the same thing twice!
            if solr_field_key in solr_rec:
                is_field = True
                field_char = {'most-specific': False,
                              'values': []}
                # Numeric and date fields are leaves of the hierarchy.
                # (Bug fix: the second condition previously repeated
                # '___pred_numeric', so '___pred_date' fields fell into the
                # no-op else branch and lost their values, even though
                # clean_attribute_hiearchies explicitly orders
                # '___pred_date' keys.)
                if '___pred_numeric' in solr_field_key \
                   or '___pred_date' in solr_field_key:
                    field_char['most-specific'] = True
                    field_char['values'] = solr_rec[solr_field_key]
                elif '___pred_id' in solr_field_key:
                    # make a suffix for the 
                    par_field_ex = solr_field_key.split('___')
                    # last two parts make the suffix, a pred-slug[-2] and a field type [-1]
                    pred_suffix = '___' + par_field_ex[-2] + '___' + par_field_ex[-1]
                    childless_children = []
                    for child_val in solr_rec[solr_field_key]:
                        # print('Child: ' + solr_field_key + ': ' + child_val)
                        parsed_path_item = self.parse_solr_value_parts(child_val)
                        new_field_prefix = parsed_path_item['slug'].replace('-', '_')
                        new_field_key = new_field_prefix + '___pred_' + parsed_path_item['data_type']
                        if parsed_path_item['data_type'] == 'id':
                            child_is_field = self.extract_attribute_children(solr_rec,
                                                                             new_field_key)
                            if child_is_field is False:
                                # now check an alternative combining the child
                                # slug with the predicate of the parent
                                new_field_key = new_field_prefix + pred_suffix
                                # print('check: ' + new_field_key)
                                child_is_field = self.extract_attribute_children(solr_rec,
                                                                                 new_field_key)
                                if child_is_field is False:
                                    # No child fields at all: this value is a leaf.
                                    childless_children.append(child_val)
                    if len(childless_children) > 0:
                        field_char['most-specific'] = True
                        field_char['values'] = childless_children
                else:
                    pass
                self.attribute_hierarchies[solr_field_key] = field_char
        return is_field

    def extract_hierarchy(self,
                          solr_rec,
                          facet_field_key,
                          facet_suffix,
                          hierarchy=None,
                          pred_field=False):
        """ extracts a hierarchy from a solr_record.
            The output is a list starting with the most
            general parent of the hiearchy,
            then going to the most specific

            This is a recursive function and
            default / starts with the root
            of the hiearchy as the facet_field_key

            This only follows a single path (not multiple paths)

            :param hierarchy: accumulator list; None (the default) starts a
                fresh hierarchy. (Fix: previously a mutable default
                argument, `hierarchy=[]`, which Python shares across calls,
                so items from one record could leak into the next when a
                caller relied on the default.)
        """
        if hierarchy is None:
            hierarchy = []
        alt_facet_field_key = facet_field_key
        if pred_field is not False:
            # do this to allow search of hiarchy in a named
            # predicate field
            f_parts = facet_field_key.split('___')
            if len(f_parts) == 2:
                alt_f_parts = [f_parts[0],
                               pred_field.replace('-', '_'),
                               f_parts[1]]
                alt_facet_field_key = '___'.join(alt_f_parts)
                # print('Check: ' + facet_field_key + ', ' + alt_facet_field_key)
        if (facet_field_key in solr_rec or alt_facet_field_key in solr_rec)\
           and self.recursive_count < 20:
            # recursive_count caps recursion depth against cyclic data
            self.recursive_count += 1
            if facet_field_key in solr_rec:
                path_item_val = solr_rec[facet_field_key][0]
            else:
                path_item_val = solr_rec[alt_facet_field_key][0]
            parsed_path_item = self.parse_solr_value_parts(path_item_val)
            if isinstance(parsed_path_item, dict):
                hierarchy.append(parsed_path_item)
                new_facet_field = parsed_path_item['slug'].replace('-', '_')
                new_facet_field += facet_suffix + '_' + parsed_path_item['data_type']
                # print('New hierarchy field: ' + new_facet_field)
                hierarchy = self.extract_hierarchy(solr_rec,
                                                   new_facet_field,
                                                   facet_suffix,
                                                   hierarchy)
        return hierarchy

    def make_context_path_label(self, contexts):
        """Return a '/'-delimited path of context labels, or False if empty."""
        if not contexts:
            return False
        return '/'.join(context['label'] for context in contexts)

    def make_context_link(self, contexts, cannonical=False):
        """Return a URL for the most specific (last) context, or False if none."""
        if not contexts:
            return False
        # The last context in the list is the most specific one.
        return self.make_url_from_val_string(contexts[-1]['uri'], cannonical)

    def make_url_from_val_string(self,
                                 partial_url,
                                 use_cannonical=True):
        """Make a full URL from a solr value string or URI fragment.

        A '___'-delimited solr value is parsed first to extract its URI
        part. Relative fragments get prefixed with either the canonical
        host or this deployment's base URL, depending on use_cannonical.
        """
        base_url = settings.CANONICAL_HOST if use_cannonical else self.base_url
        solr_parts = self.parse_solr_value_parts(partial_url)
        if isinstance(solr_parts, dict):
            partial_url = solr_parts['uri']
        if 'http://' in partial_url or 'https://' in partial_url:
            # Already an absolute URL.
            return partial_url
        return base_url + partial_url

    def add_record_fields(self):
        """ adds fields to include in the GeoJSON properties """
        # 'rec-field' is a client-supplied, comma-delimited list of fields.
        if 'rec-field' not in self.response_dict:
            self.record_fields = []
            return self.record_fields
        raw_rec_fields = self.response_dict['rec-field'][0]
        if ',' in raw_rec_fields:
            self.record_fields = raw_rec_fields.split(',')
        else:
            self.record_fields = [raw_rec_fields]
        return self.record_fields

    def parse_solr_value_parts(self, solr_value):
        """Parse a '___'-delimited solr value into its component parts.

        Returns a dict with 'slug', 'data_type', 'uri', and 'label' keys
        for a well-formed four-part value; False for a malformed value
        containing '___'; and the input unchanged otherwise.
        """
        if not isinstance(solr_value, str):
            return solr_value
        if '___' not in solr_value:
            return solr_value
        parts = solr_value.split('___')
        if len(parts) != 4:
            return False
        return {
            'slug': parts[0],
            'data_type': parts[1],
            'uri': parts[2],
            'label': parts[3],
        }

    def get_solr_record_uuid_type(self, solr_rec):
        """Return {'uuid', 'label', 'item_type'} for a solr record dict.

        Returns False for non-dict input; missing values stay False.
        """
        if not isinstance(solr_rec, dict):
            return False
        output = {'uuid': False,
                  'label': False,
                  'item_type': False}
        if 'uuid' in solr_rec:
            output['uuid'] = solr_rec['uuid']
        if 'slug_type_uri_label' in solr_rec:
            id_parts = self.parse_solr_value_parts(solr_rec['slug_type_uri_label'])
            if id_parts is not False:
                uri = self.make_url_from_val_string(id_parts['uri'], True)
                item_type_output = URImanagement.get_uuid_from_oc_uri(uri, True)
                output['item_type'] = item_type_output['item_type']
                output['label'] = id_parts['label']
        return output

    def get_key_val(self, key, dict_obj):
        """Return dict_obj[key] if dict_obj is a dict with that key, else None."""
        if isinstance(dict_obj, dict):
            return dict_obj.get(key)
        return None
コード例 #5
0
    def make_facet_dict_from_solr_field(
        self,
        solr_facet_field_key,
        facet_type,
        facet_labeling,
        range_data_type=None,
    ):
        """Makes the facet dict (id, label, slug, type) for a solr facet field.

        :param solr_facet_field_key: solr facet field name.
        :param facet_type: value stored under the output 'type' key.
        :param facet_labeling: label appended (in parentheses) when only
            one hierarchy item is found.
        :param range_data_type: when given, makes a range facet id prefix
            and sets the output 'data-type' key.
        """

        if configs.FACET_STANDARD_ROOT_FIELDS.get(solr_facet_field_key):
            # We have a standard "root" field. Return the facet
            # dict object for it.
            return configs.FACET_STANDARD_ROOT_FIELDS.get(solr_facet_field_key)

        solr_slug_parts = solr_facet_field_key.split(
            SolrDocument.SOLR_VALUE_DELIM)

        # Making this dict will require some database (usually from
        # the cache) because it is not a standard root solr field,
        # rather it is a solr field deeper in a hierarchy.
        m_cache = MemoryCache()

        # The solr field parts are in reverse hierarchy order
        solr_slug_parts.reverse()

        # Iterate through the parts, skipping the first item
        # which is the most general part (the field suffix).
        items = []
        for solr_slug in solr_slug_parts[1:]:
            is_related = False
            slug = solr_slug.replace('_', '-')
            if slug.startswith(configs.RELATED_ENTITY_ID_PREFIX):
                is_related = True
                slug = slug[len(configs.RELATED_ENTITY_ID_PREFIX):]
            item = m_cache.get_entity(slug)
            if not item:
                continue

            # Add an "is_related" attribute
            item.is_related = is_related
            items.append(item)

        if not len(items):
            return None

        slugs_id = configs.REQUEST_PROP_HIERARCHY_DELIM.join(
            [item.slug for item in items])
        facet = LastUpdatedOrderedDict()

        if range_data_type is None:
            id_prefix = 'facet'
        else:
            id_prefix = 'range-facet'

        # NOTE(review): is_related here holds the value from the *last*
        # loop iteration, while the 'oc-api:related-property' flag below
        # uses items[0].is_related — confirm this asymmetry is intended.
        if is_related:
            facet['id'] = '#{}-{}{}'.format(id_prefix,
                                            configs.RELATED_ENTITY_ID_PREFIX,
                                            slugs_id)
        else:
            facet['id'] = '#{}-{}'.format(id_prefix, slugs_id)

        labels = [item.label for item in items]
        if len(labels) == 1:
            labels.append(facet_labeling)
        # Put the last label in parentheses.
        labels[-1] = '({})'.format(labels[-1])
        facet['label'] = ' '.join(labels)
        facet['rdfs:isDefinedBy'] = items[0].uri
        facet['slug'] = items[0].slug
        facet['type'] = facet_type
        if range_data_type:
            facet['data-type'] = range_data_type
        if items[0].is_related:
            facet['oc-api:related-property'] = True
        return facet
コード例 #6
0
 def get_entity(self, identifier):
     """Look up an entity by identifier via the memory cache."""
     return MemoryCache().get_entity(identifier)
コード例 #7
0
class RecordProperties():
    """ Methods to make properties for individual record items
        useful for making geospatial feature records or
        lists of items without geospatial data
    """
    ATTRIBUTE_DELIM = '; '  # delimiter for multiple attributes

    def __init__(self, request_dict_json=False):
        """Initialize all record properties to their defaults.

        :param request_dict_json: JSON-encoded request dict used later to
            constrain output (dates, attributes); False means no request.
        """
        self.uuid = False
        self.uri = False  # cannonical uri for the item
        self.href = False  # link to the item in the current deployment
        self.cite_uri = False  # stable / persistent uri
        self.label = False  # item label
        self.item_type = False  # Open Context item type
        self.updated = False  # last updated date
        self.published = False  # publication date
        self.project_href = False  # link to the project in current deployment
        self.project_uri = False  # cannonical uri for the project
        self.project_label = False
        self.context_href = False  # link to parent context in current deployment
        self.context_uri = False  # link to parent context cannonical uri
        self.context_label = False
        self.category = False  # most specific category label
        self.latitude = False
        self.longitude = False
        self.geojson = False
        self.early_date = False  # earliest date, within query constraints
        self.late_date = False  # latest date, within query constraints
        self.thumbnail_href = False
        self.thumbnail_uri = False
        self.thumbnail_scr = False
        self.preview_scr = False
        self.fullfile_scr = False
        self.snippet = False  # highlighted text snippet
        self.cite_uri = False  # stable identifier as an HTTP uri
        self.other_attributes = False  # other attributes to the record
        # flatten list of an attribute values to single value
        self.flatten_rec_attributes = False
        # A list of (non-standard) attributes to include in a record
        self.rec_attributes = []
        self.attribute_hierarchies = {}
        self.base_url = settings.CANONICAL_HOST
        rp = RootPath()
        # base_url is immediately overridden with the deployment base URL.
        self.base_url = rp.get_baseurl()
        self.mem_cache_obj = MemoryCache()  # memory caching object
        self.request_dict_json = request_dict_json
        if request_dict_json is not False:
            self.request_dict = json.loads(request_dict_json)
        else:
            self.request_dict = False
        self.highlighting = False  # solr highlighting response data
        self.recursive_count = 0  # guard counter for recursive hierarchy walks
        self.min_date = False  # query-constrained minimum date
        self.max_date = False  # query-constrained maximum date
        self.thumbnail_data = {}  # precached thumbnail data, keyed by uuid
        self.media_file_data = {}  # precached media file data, keyed by uuid
        self.string_attrib_data = {}  # precached string attribute data

    def parse_solr_record(self, solr_rec):
        """Populate all record properties from a solr record dict."""
        if not isinstance(solr_rec, dict):
            return
        # Basic identifiers first; later steps rely on self.uuid etc.
        self.get_item_basics(solr_rec)
        self.get_citation_uri(solr_rec)
        self.get_lat_lon(solr_rec)
        self.get_category(solr_rec)
        self.get_project(solr_rec)
        self.get_context(solr_rec)
        # Time information, limited to the query's date constraints.
        self.get_time(solr_rec)
        self.get_thumbnail(solr_rec)
        self.get_media_files(solr_rec)
        # Snippet of highlighted text.
        self.get_snippet(solr_rec)
        # Non-standard attributes, including string attributes.
        self.get_attributes(solr_rec)
        self.get_string_attributes(solr_rec)

    def get_item_basics(self, solr_rec):
        """Set uuid, uri, href, item_type, label, updated and published.

        Returns True when the slug/type/uri/label solr value parsed
        successfully, otherwise False.
        """
        output = False
        if not isinstance(solr_rec, dict):
            return output
        if 'uuid' in solr_rec:
            self.uuid = solr_rec['uuid']
        if 'slug_type_uri_label' in solr_rec:
            id_parts = self.parse_solr_value_parts(solr_rec['slug_type_uri_label'])
            if id_parts is not False:
                output = True
                # Canonical URI, and an href for the current deployment.
                self.uri = self.make_url_from_val_string(id_parts['uri'], True)
                self.href = self.make_url_from_val_string(id_parts['uri'], False)
                item_type_output = URImanagement.get_uuid_from_oc_uri(self.uri, True)
                self.item_type = item_type_output['item_type']
                self.label = id_parts['label']
        if 'updated' in solr_rec:
            self.updated = solr_rec['updated']
        if 'published' in solr_rec:
            self.published = solr_rec['published']
        return output

    def get_snippet(self, solr_rec):
        """ get a text highlighting snippet

            self.highlighting maps uuid -> {'text': [...]} from solr's
            highlighting response; the matched record's snippet is
            cleaned of HTML and its <em> highlight markers restored.
        """
        if isinstance(self.highlighting, dict):
            if self.uuid is False:
                if 'uuid' in solr_rec:
                    self.uuid = solr_rec['uuid']
            if self.uuid in self.highlighting:
                if 'text' in self.highlighting[self.uuid]:
                    text_list = self.highlighting[self.uuid]['text']
                    self.snippet = ' '.join(text_list)
                    # Protect solr's <em> highlight markers with placeholders
                    # so the HTML cleanup below does not strip them.
                    self.snippet = self.snippet.replace('<em>', '[[[[mark]]]]')
                    self.snippet = self.snippet.replace('</em>', '[[[[/mark]]]]')
                    try:
                        self.snippet = '<div>' + self.snippet + '</div>'
                        self.snippet = lxml.html.fromstring(self.snippet).text_content()
                        self.snippet = strip_tags(self.snippet)
                    except Exception:
                        # lxml can fail on malformed HTML fragments; fall back
                        # to plain tag stripping. (Fix: was a bare `except:`,
                        # which also swallowed KeyboardInterrupt/SystemExit.)
                        self.snippet = strip_tags(self.snippet)
                    self.snippet = self.snippet.replace('[[[[mark]]]]', '<em>')
                    self.snippet = self.snippet.replace('[[[[/mark]]]]', '</em>')

    def get_citation_uri(self, solr_rec):
        """ gets the best citation / persistent uri for the item """
        if 'persistent_uri' in solr_rec:
            for p_uri in solr_rec['persistent_uri']:
                self.cite_uri = p_uri
                if 'dx.doi.org' in p_uri:
                    break  # stop looking once we have a DOI, the best

    def get_lat_lon(self, solr_rec):
        """ gets latitute and longitude information """
        if 'discovery_geolocation' in solr_rec:
            geo_strings = solr_rec['discovery_geolocation']
            geo_coords_str = geo_strings.split(',')
            # NOT geojson ording, since solr uses lat/lon ordering
            self.latitude = float(geo_coords_str[0])
            self.longitude = float(geo_coords_str[1]) 

    def get_category(self, solr_rec):
        """Set self.category to the item's most specific category label."""
        self.recursive_count = 0  # reset the recursion guard
        cat_hierarchy = self.get_category_hierarchy(solr_rec)
        if cat_hierarchy:
            # The last hierarchy item is the most specific category.
            self.category = cat_hierarchy[-1]['label']

    def get_context(self, solr_rec):
        """Set context label, uri, and href from the record's context path."""
        self.recursive_count = 0  # reset the recursion guard
        contexts = self.extract_hierarchy(solr_rec,
                                          SolrDocument.ROOT_CONTEXT_SOLR,
                                          '___context',
                                          [])
        if not contexts:
            return
        self.context_label = self.make_context_path_label(contexts)
        self.context_uri = self.make_context_link(contexts, True)
        self.context_href = self.make_context_link(contexts, False)

    def get_project(self, solr_rec):
        """Set project label, uri, and href from the most specific project."""
        self.recursive_count = 0  # reset the recursion guard
        projects = self.extract_hierarchy(solr_rec,
                                          SolrDocument.ROOT_PROJECT_SOLR,
                                          '___project',
                                          [])
        if not projects:
            return
        # The last item in the hierarchy is the most specific project.
        most_specific = projects[-1]
        self.project_label = most_specific['label']
        self.project_uri = self.make_url_from_val_string(most_specific['uri'],
                                                         True)
        self.project_href = self.make_url_from_val_string(most_specific['uri'],
                                                          False)

    def get_time(self, solr_rec):
        """ parses time information """
        early_list = False
        late_list = False
        if 'form_use_life_chrono_earliest' in solr_rec:
            early_list = solr_rec['form_use_life_chrono_earliest']
        if 'form_use_life_chrono_latest' in solr_rec:
            late_list = solr_rec['form_use_life_chrono_latest']
        if isinstance(early_list, list):
            date_list = early_list
        else:
            date_list = []
        if isinstance(late_list, list):
            date_list += late_list
        if len(date_list) > 0:
            min_max = self.get_list_min_max(date_list)
            self.early_date = min(min_max)
            self.late_date = max(min_max)

    def get_list_min_max(self, date_list):
        """ Returns the minimum and maximum dates
            from a date list, constrained by
            preset min and max dates
        """
        min_date = False
        max_date = False
        # print(str(date_list))
        if isinstance(date_list, list):
            date_list.sort()
            for date in date_list:
                if self.min_date is not False:
                    if date >= self.min_date \
                       and min_date is False:
                        min_date = date
                if self.max_date is not False:
                    if date <= self.max_date:
                        max_date = date
        if min_date is False:
            min_date = self.min_date
        if max_date is False:
            max_date = self.max_date
        return [min_date, max_date]

    def get_thumbnail(self, solr_rec):
        """Populate thumbnail href/uri/scr for a record from precached data.

        Falls back to an individual database lookup when the record's
        thumbnail was not precached.
        """
        if 'uuid' not in solr_rec:
            return
        uuid = solr_rec['uuid']
        if uuid not in self.thumbnail_data:
            # No precached thumbnail data; query for this one record.
            self.get_thumbnail_from_database(solr_rec)
            return
        cached = self.thumbnail_data[uuid]
        if cached is not False:
            self.thumbnail_href = cached['href']
            self.thumbnail_uri = cached['uri']
            self.thumbnail_scr = cached['scr']
            # Normalize the source link to https.
            self.thumbnail_scr = RootPath().convert_to_https(self.thumbnail_scr)

    def get_media_files(self, solr_rec):
        """Set thumbnail/preview/fullfile links from precached media data."""
        if 'uuid' not in solr_rec:
            return
        files = self.media_file_data.get(solr_rec['uuid'])
        if not files:
            # Either no cached entry, or it was explicitly False/empty.
            return
        rp = RootPath()
        # Map each Open Context file-type URI to the attribute it populates.
        attr_for_type = {
            'oc-gen:thumbnail': 'thumbnail_scr',
            'oc-gen:preview': 'preview_scr',
            'oc-gen:fullfile': 'fullfile_scr',
        }
        for file_type, file_uri in files.items():
            attr = attr_for_type.get(file_type)
            if attr:
                setattr(self, attr, rp.convert_to_https(file_uri))

    def get_thumbnail_from_database(self, solr_rec):
        """Query the database for a record's thumbnail, if it exists.

        For non-media items, first finds a media item linked to the
        record via an Assertion, then looks up that media item's
        thumbnail file. For media items, looks up the thumbnail
        directly. On success, sets self.thumbnail_href, thumbnail_uri,
        and thumbnail_scr.
        """
        if 'uuid' in solr_rec:
            uuid = solr_rec['uuid']
            thumb = []
            if self.item_type != 'media':
                # find a media item linked to this (non-media) item
                media_item = Assertion.objects\
                                      .filter(uuid=uuid,
                                              object_type='media')[:1]
                if len(media_item) > 0:
                    muuid = media_item[0].object_uuid
                    thumb = Mediafile.objects\
                                     .filter(uuid=muuid,
                                             file_type='oc-gen:thumbnail')[:1]
            else:
                # do this for media items
                muuid = uuid
                thumb = Mediafile.objects\
                                 .filter(uuid=uuid,
                                         file_type='oc-gen:thumbnail')[:1]
            if len(thumb) > 0:
                # muuid is always bound here: thumb is only non-empty on
                # paths that assigned it above
                self.thumbnail_href = self.base_url + '/media/' + muuid
                self.thumbnail_uri = settings.CANONICAL_HOST + '/media/' + muuid
                self.thumbnail_scr = thumb[0].file_uri

    def get_category_hierarchy(self, solr_rec):
        """Return the category hierarchy for a solr record, ordered
        from the most general to the most specific category. Returns
        an empty list when the record has no item_type.
        """
        if 'item_type' not in solr_rec:
            return []
        act_item_type = solr_rec['item_type'][0]
        # category facets are rooted in an item-type specific solr field
        root_cat_field = 'oc_gen_' + act_item_type + '___pred_id'
        return self.extract_hierarchy(
            solr_rec,
            root_cat_field,
            '___pred',
            []
        )

    """ The following seciton of code
        processes non-default attributes for records
    """
    def get_attributes(self, solr_rec):
        """ gets attributes for a record, based on the
            predicates requested in the search
            and optional predicates passed by a client
            with a GET request with parameter 'attributes'

            Populates self.attribute_hierarchies from the solr record,
            then summarizes it into self.other_attributes: a list of
            dicts with 'property', 'values_list', and 'value' keys.
        """
        qm = QueryMaker()
        # NOTE(review): solr_field_entities appears unused in this method
        solr_field_entities = {}
        for attribute in self.rec_attributes:
            entity = self.mem_cache_obj.get_entity(attribute, False)
            if entity is not False:
                prop_slug = entity.slug
                # check to make sure we have the entity data type for linked fields
                if entity.data_type is False and entity.item_type == 'uri':
                    dtypes = self.mem_cache_obj.get_dtypes(entity.uri)
                    if isinstance(dtypes, list):
                        # set the data type and the act-field
                        # print('Found for ' + prop_slug + ' ' + dtypes[0])
                        entity.data_type = dtypes[0]
                field_parts = qm.make_prop_solr_field_parts(entity)
                solr_field = field_parts['prefix'] + '___pred_' + field_parts['suffix']
                # print('Found: ' + solr_field)
                # extract children of the solr_field so we know if
                # we have the most specific attributes, then we can get
                # values for the most specific attributes
                self.extract_attribute_children(solr_rec, solr_field)
        self.clean_attribute_hiearchies()
        if isinstance(self.attribute_hierarchies, dict):
            self.other_attributes = []
            for field_slug_key, values in self.attribute_hierarchies.items():
                entity = self.mem_cache_obj.get_entity(field_slug_key, False)
                if entity is not False:
                    attribute_dict = LastUpdatedOrderedDict()
                    attribute_dict['property'] = entity.label
                    attribute_dict['values_list'] = []
                    attribute_dict['value'] = ''
                    string_val = False
                    delim = ''
                    for val in values:
                        if isinstance(val, str):
                            string_val = True
                            # parse the '___' delimited solr value; use its label
                            parsed_val = self.parse_solr_value_parts(val)
                            attribute_dict["values_list"].append(parsed_val['label'])
                            attribute_dict['value'] += delim + str(parsed_val['label'])
                        else:
                            attribute_dict["values_list"].append(val)
                            attribute_dict['value'] += delim + str(val)
                        # only join with the delimiter after the first value
                        delim = self.ATTRIBUTE_DELIM
                    if len(values) == 1 \
                       and string_val is False:
                        # keep a single non-string value unflattened
                        attribute_dict['value'] = values[0]
                    self.other_attributes.append(attribute_dict)

    def get_string_attributes(self, solr_rec):
        """Add string-predicate attributes for a solr record to
        self.other_attributes.

        Values come from a previous database query (self.string_attrib_data)
        because solr does not cache string field data.
        """
        if not isinstance(self.string_attrib_data, dict):
            return
        if 'uuid' not in solr_rec or 'data' not in self.string_attrib_data:
            return
        uuid = solr_rec['uuid']
        if uuid not in self.string_attrib_data['data']:
            return
        item_data = self.string_attrib_data['data'][uuid]
        for pred_uuid, values_list in item_data.items():
            # the cached predicate entity dict gets the values added to it
            act_attribute = self.string_attrib_data['pred_ents'][pred_uuid]
            act_attribute['values_list'] = values_list
            act_attribute['value'] = self.ATTRIBUTE_DELIM.join(values_list)
            self.other_attributes.append(act_attribute)

    def prevent_attribute_key_collision(self, item_prop_dict, prop_key):
        """Return a variant of prop_key that is not already a key in
        item_prop_dict, appending '[2]', '[3]', ... until it is unique.
        """
        if prop_key not in item_prop_dict:
            return prop_key
        suffix_num = 2
        candidate = prop_key + '[' + str(suffix_num) + ']'
        while candidate in item_prop_dict:
            suffix_num += 1
            candidate = prop_key + '[' + str(suffix_num) + ']'
        return candidate

    def clean_attribute_hiearchies(self):
        """ some post-processing to make sure
            we have clean attribute hierarchies

            Keeps only fields whose values have no more-specific child
            field present, then rebuilds self.attribute_hierarchies as
            an ordered dict keyed by predicate slug, ordered id fields
            first, then numeric, then date fields.
        """
        if isinstance(self.attribute_hierarchies, dict):
            # print('check: ' + str(self.attribute_hierarchies))
            temp_attribute_hierarchies = self.attribute_hierarchies
            clean_attribute_hiearchies = {}
            for solr_field_key, field_char in self.attribute_hierarchies.items():
                if field_char['most-specific']:
                    par_field_ex = solr_field_key.split('___')
                    # last two parts make the suffix, a pred-slug[-2] and a field type [-1]
                    pred_suffix = '___' + par_field_ex[-2] + '___' + par_field_ex[-1]
                    specific_ok = True
                    for val in field_char['values']:
                        if isinstance(val, str):
                            #  print('check:' + solr_field_key + ' val: ' + val)
                            parsed_val = self.parse_solr_value_parts(val)
                            # solr fields use '_' where slugs use '-'
                            check_field = parsed_val['slug'].replace('-', '_')
                            check_field += '___pred_' + parsed_val['data_type']
                            if check_field in temp_attribute_hierarchies:
                                # note a field is NOT at the most specific level
                                specific_ok = False
                            else:
                                # now check a version with the predicate as part of
                                # the solr field
                                check_field = parsed_val['slug'].replace('-', '_')
                                check_field += pred_suffix
                                if check_field in temp_attribute_hierarchies:
                                    # note a field is NOT at the most specific level
                                    specific_ok = False
                    if specific_ok:
                        # ok to add
                        # print('checked OK: ' + solr_field_key)
                        clean_attribute_hiearchies[solr_field_key] = field_char
            # now that we got rid of problem fields, lets sort these for consistent
            # rendering
            self.attribute_hierarchies = LastUpdatedOrderedDict()
            keys = LastUpdatedOrderedDict()
            # order of key types, we want id fields, followed by numeric then date
            key_types = ['___pred_id',
                         '___pred_numeric',
                         '___pred_date']
            for key_type in key_types:
                keys[key_type] = []
                for solr_field_key, field_char in clean_attribute_hiearchies.items():
                    if key_type in solr_field_key:
                        keys[key_type].append(solr_field_key)
                # sort alphabetically. Slugs useful, since they will cluster predicates
                # from similar vocabularies
                keys[key_type].sort()
                for key in keys[key_type]:
                    field_char = clean_attribute_hiearchies[key]
                    field_ex = key.split('___')
                    # the penultimate part is the predicate
                    field_slug = field_ex[-2].replace('_', '-')
                    if field_slug not in self.attribute_hierarchies:
                        self.attribute_hierarchies[field_slug] = []
                    for val in field_char['values']:
                        # de-duplicate values across fields with the same slug
                        if val not in self.attribute_hierarchies[field_slug]:
                            self.attribute_hierarchies[field_slug].append(val)

    def extract_attribute_children(self,
                                   solr_rec,
                                   solr_field_key):
        """Extract ALL children from the hierarchy of a solr_field_key.

        Recursively walks down id-type fields, recording each visited
        field in self.attribute_hierarchies with its values and whether
        it sits at the most specific (childless) level.

        :param dict solr_rec: a solr result record (field -> values)
        :param str solr_field_key: the solr field to inspect
        :return: True if solr_field_key exists in solr_rec, else False
        """
        if solr_field_key in self.attribute_hierarchies:
            # so we don't look at the same thing twice!
            return False
        if solr_field_key not in solr_rec:
            return False
        field_char = {'most-specific': False,
                      'values': []}
        if '___pred_numeric' in solr_field_key \
           or '___pred_date' in solr_field_key:
            # Numeric and date fields are leaves in the hierarchy.
            # Bug fix: the second test used to repeat '___pred_numeric',
            # so date fields fell through and never got their values.
            field_char['most-specific'] = True
            field_char['values'] = solr_rec[solr_field_key]
        elif '___pred_id' in solr_field_key:
            # make a suffix for checking alternate child field names
            par_field_ex = solr_field_key.split('___')
            # last two parts make the suffix, a pred-slug[-2] and a field type [-1]
            pred_suffix = '___' + par_field_ex[-2] + '___' + par_field_ex[-1]
            childless_children = []
            for child_val in solr_rec[solr_field_key]:
                # print('Child: ' + solr_field_key + ': ' + child_val)
                parsed_path_item = self.parse_solr_value_parts(child_val)
                new_field_prefix = parsed_path_item['slug'].replace('-', '_')
                new_field_key = new_field_prefix + '___pred_' + parsed_path_item['data_type']
                if parsed_path_item['data_type'] == 'id':
                    child_is_field = self.extract_attribute_children(solr_rec,
                                                                     new_field_key)
                    if child_is_field is False:
                        # now check an alternative combining the child
                        # slug with the predicate of the parent
                        new_field_key = new_field_prefix + pred_suffix
                        # print('check: ' + new_field_key)
                        child_is_field = self.extract_attribute_children(solr_rec,
                                                                         new_field_key)
                        if child_is_field is False:
                            childless_children.append(child_val)
            if len(childless_children) > 0:
                field_char['most-specific'] = True
                field_char['values'] = childless_children
        # other field types are recorded with empty, non-specific values
        self.attribute_hierarchies[solr_field_key] = field_char
        return True

    def extract_hierarchy(self,
                          solr_rec,
                          facet_field_key,
                          facet_suffix,
                          hierarchy=None,
                          pred_field=False):
        """ extracts a hierarchy from a solr_record.
            The output is a list starting with the most
            general parent of the hiearchy,
            then going to the most specific

            This is a recursive function and
            default / starts with the root
            of the hiearchy as the facet_field_key

            This only follows a single path (not multiple paths)

            :param dict solr_rec: a solr result record
            :param str facet_field_key: solr field for the hierarchy root
            :param str facet_suffix: suffix used to compose child fields
            :param list hierarchy: accumulator list; a fresh list is
                created when None (the old default of [] was a mutable
                default shared across calls)
            :param pred_field: optional predicate slug to also check as
                part of an alternate field name
        """
        if hierarchy is None:
            # bug fix: avoid the shared mutable default argument
            hierarchy = []
        alt_facet_field_key = facet_field_key
        if pred_field is not False:
            # do this to allow search of hiarchy in a named
            # predicate field
            f_parts = facet_field_key.split('___')
            if len(f_parts) == 2:
                alt_f_parts = [f_parts[0],
                               pred_field.replace('-', '_'),
                               f_parts[1]]
                alt_facet_field_key = '___'.join(alt_f_parts)
                # print('Check: ' + facet_field_key + ', ' + alt_facet_field_key)
        if (facet_field_key in solr_rec or alt_facet_field_key in solr_rec)\
           and self.recursive_count < 20:
            # cap recursion depth to guard against cyclic hierarchies
            self.recursive_count += 1
            if facet_field_key in solr_rec:
                path_item_val = solr_rec[facet_field_key][0]
            else:
                path_item_val = solr_rec[alt_facet_field_key][0]
            parsed_path_item = self.parse_solr_value_parts(path_item_val)
            if isinstance(parsed_path_item, dict):
                hierarchy.append(parsed_path_item)
                new_facet_field = parsed_path_item['slug'].replace('-', '_')
                new_facet_field += facet_suffix + '_' + parsed_path_item['data_type']
                # print('New hierarchy field: ' + new_facet_field)
                hierarchy = self.extract_hierarchy(solr_rec,
                                                   new_facet_field,
                                                   facet_suffix,
                                                   hierarchy)
        return hierarchy

    def make_context_path_label(self, contexts):
        """Return a '/' delimited context path for easy human
        readability, or False when contexts is empty.
        """
        if not len(contexts):
            return False
        return '/'.join(context['label'] for context in contexts)

    def make_context_link(self, contexts, cannonical=False):
        """Return a URI for the most specific (last) context in the
        list, or False when contexts is empty.
        """
        if not len(contexts):
            return False
        last_context = contexts[-1]
        return self.make_url_from_val_string(last_context['uri'],
                                             cannonical)

    def make_url_from_val_string(self,
                                 partial_url,
                                 use_cannonical=True):
        """Make a full URL from a solr value string.

        If the value has '___' delimiters, its URI part is extracted
        first. A partial (non http/https) URI is then prefixed with
        either the canonical host or this deployment's base url.
        """
        # pick the host used to complete partial URIs
        if use_cannonical:
            act_base_url = settings.CANONICAL_HOST
        else:
            act_base_url = self.base_url
        solr_parts = self.parse_solr_value_parts(partial_url)
        if isinstance(solr_parts, dict):
            partial_url = solr_parts['uri']
        if 'http://' in partial_url or 'https://' in partial_url:
            # already a full URL
            return partial_url
        return act_base_url + partial_url

    def add_record_fields(self):
        """Set and return self.record_fields, the list of fields to
        include in the GeoJSON properties, parsed from the request's
        'rec-field' parameter.
        """
        if 'rec-field' not in self.response_dict:
            self.record_fields = []
            return self.record_fields
        raw_rec_fields = self.response_dict['rec-field'][0]
        if ',' in raw_rec_fields:
            # a comma-separated list of field names
            self.record_fields = raw_rec_fields.split(',')
        else:
            self.record_fields = [raw_rec_fields]
        return self.record_fields

    def parse_solr_value_parts(self, solr_value):
        """Parse a '___' delimited solr value string into a dict with
        slug, data_type, uri, and label parts.

        Non-strings and strings without a '___' delimiter pass through
        unchanged; a delimited string without exactly 4 parts yields
        False.
        """
        if not isinstance(solr_value, str):
            return solr_value
        if '___' not in solr_value:
            return solr_value
        solr_ex = solr_value.split('___')
        if len(solr_ex) != 4:
            # malformed delimited value
            return False
        return {
            'slug': solr_ex[0],
            'data_type': solr_ex[1],
            'uri': solr_ex[2],
            'label': solr_ex[3],
        }

    def get_solr_record_uuid_type(self, solr_rec):
        """Return a dict of item uuid, label, and item_type from a solr
        record; returns False for non-dict input.
        """
        if not isinstance(solr_rec, dict):
            return False
        output = {'uuid': False,
                  'label': False,
                  'item_type': False}
        if 'uuid' in solr_rec:
            output['uuid'] = solr_rec['uuid']
        if 'slug_type_uri_label' in solr_rec:
            id_parts = self.parse_solr_value_parts(solr_rec['slug_type_uri_label'])
            if id_parts is not False:
                # derive the item_type from the canonical uri
                uri = self.make_url_from_val_string(id_parts['uri'], True)
                item_type_output = URImanagement.get_uuid_from_oc_uri(uri, True)
                output['item_type'] = item_type_output['item_type']
                output['label'] = id_parts['label']
        return output

    def get_key_val(self, key, dict_obj):
        """Return the value for key in dict_obj, or None when dict_obj
        is not a dict or does not contain key.
        """
        if isinstance(dict_obj, dict):
            return dict_obj.get(key)
        return None
コード例 #8
0
ファイル: recursion.py プロジェクト: ekansa/open-context-py
class LinkRecursion():
    """
    Does recursive look ups on link annotations, especially to find hierarchies

    Example use:

from opencontext_py.apps.ldata.linkannotations.recursion import LinkRecursion
lr = LinkRecursion()
lr.get_jsonldish_entity_parents('oc-gen:cat-bio-subj-ecofact')
lr = LinkRecursion()
lr.get_jsonldish_entity_parents('oc-gen:cat-arch-element')
lr = LinkRecursion()
lr.get_jsonldish_entity_parents('http://eol.org/pages/7680')
lr = LinkRecursion()
lr.get_entity_children('http://eol.org/pages/4077', True)
    """
    def __init__(self):
        # shared entity / query memory cache
        self.m_cache = MemoryCache()
        # list of parent ids found by the last parent look up
        self.parent_entities = None
        # dict of {identifier: [child ids, ...]} built by child look ups
        self.child_entities = None
        # cache prefix for the json-ldish-parents
        self.jsonldish_p_prefix = 'json-ldish-parents-{}'
        # cache prefix for list of parents
        self.p_prefix = 'lr-parents'
        # cache prefix for children of an item
        self.children_prefix = 'lr-children-{}'
        # cache prefix for full tree of child items
        self.child_tree_prefix = 'lr-child-tree-{}'

    def get_jsonldish_entity_parents(self, identifier, add_original=True):
        """
        Gets parent concepts for a given URI or UUID identified entity
        returns a list of dictionary objects similar to JSON-LD expectations
        This is useful for faceted search

        If add_original is true, add the original UUID for the entity
        that's the childmost item, at the bottom of the hierarchy

        Results are cached; the cache key varies with add_original.
        """
        cache_key = self.m_cache.make_cache_key(
            self.jsonldish_p_prefix.format(str(add_original)),
            identifier
        )
        obj = self.m_cache.get_cache_object(cache_key)
        if obj is not None:
            return obj
        # We don't have it cached, so get from the database.
        obj = self._get_jsonldish_entity_parents_db(
            identifier,
            add_original
        )
        if obj:
            self.m_cache.save_cache_object(cache_key, obj)
        return obj

    def _get_jsonldish_entity_parents_db(self, identifier, add_original=True):
        """
        Gets parent concepts for a given URI or UUID identified entity
        returns a list of dictionary objects similar to JSON-LD expectations
        This is useful for faceted search

        If add_original is true, add the original UUID for the entity
        that's the childmost item, at the bottom of the hierarchy

        Returns False when no parents (and no original) can be found.
        """
        output = False
        if add_original:
            # add the original identifier to the list of parents, at lowest rank
            raw_parents = (
                [identifier] +
                self.get_entity_parents(identifier, [], 0)
            )
        else:
            raw_parents = self.get_entity_parents(
                identifier,
                [],
                0
            )
        if not len(raw_parents):
            # No parents. Returns false.
            return output
        # Make the output.
        # reverse the order of the list, to make top most concept
        # first
        output = []
        for par_id in raw_parents[::-1]:
            # print('par_id is: ' + par_id)
            ent = self.m_cache.get_entity(par_id)
            if not ent:
                # skip identifiers that don't resolve to an entity
                continue
            p_item = LastUpdatedOrderedDict()
            p_item['id'] = ent.uri
            p_item['slug'] = ent.slug
            p_item['label'] = ent.label
            if ent.data_type is not False:
                p_item['type'] = ent.data_type
            else:
                # default to a JSON-LD id type
                p_item['type'] = '@id'
            p_item['ld_object_ok'] = ent.ld_object_ok
            output.append(p_item)
        return output

    def get_entity_parents(self, identifier, parent_list=None, loop_count=0):
        """
        Gets parent concepts for a given URI or UUID identified entity

        Recursive; loop_count caps the walk at 50 levels to guard
        against cyclic hierarchies. Side effect: when the top of the
        hierarchy is reached, self.parent_entities is set.
        """
        if not parent_list:
            parent_list = []
        loop_count += 1
        parent_id = self._get_parent_id(identifier)
        # print('ID: {} has parent: {}'.format(identifier, parent_id))
        if parent_id:
            if parent_id not in parent_list:
                parent_list.append(parent_id)
                # print('Parent list is: ' + str(parent_list))
            if loop_count <= 50:
                parent_list = self.get_entity_parents(parent_id, parent_list, loop_count)
        else:
            # all done, save the parents
            self.parent_entities = parent_list
        return parent_list

    def _get_parent_id(self, identifier):
        """Get the parent id for the current identifier, or from the cache."""
        cache_key = self.m_cache.make_cache_key(self.p_prefix,
                                                identifier)
        obj = self.m_cache.get_cache_object(cache_key)
        if obj is not None:
            return obj
        else:
            obj = self._get_parent_id_db(identifier)
            if obj:
                self.m_cache.save_cache_object(cache_key, obj)
            return obj

    def _get_parent_id_db(self, identifier):
        """Get the parent id for the current identifier from the database.

        Checks both directions of link annotations: the identifier as a
        subordinate subject (parent in the object), and the identifier
        as a subordinate object (parent in the subject). Returns None
        when no parent is found.
        """
        parent_id = None
        lequiv = LinkEquivalence()
        identifiers = lequiv.get_identifier_list_variants(identifier)
        # print('identifiers: {}'.format(identifiers))
        p_for_superobjs = LinkAnnotation.PREDS_SBJ_IS_SUB_OF_OBJ
        preds_for_superobjs = lequiv.get_identifier_list_variants(p_for_superobjs)
        p_for_subobjs = LinkAnnotation.PREDS_SBJ_IS_SUPER_OF_OBJ
        preds_for_subobjs = lequiv.get_identifier_list_variants(p_for_subobjs)
        try:
            # look for superior items in the objects of the assertion
            # sorting by sort so we can privilege a certain hierarchy path
            superobjs_anno = LinkAnnotation.objects.filter(subject__in=identifiers,
                                                           predicate_uri__in=preds_for_superobjs)\
                                                   .exclude(object_uri__in=identifiers)\
                                                   .order_by('sort', 'object_uri')[:1]
            if len(superobjs_anno) < 1:
                superobjs_anno = False
        except LinkAnnotation.DoesNotExist:
            superobjs_anno = False
        if superobjs_anno:
            parent_id = superobjs_anno[0].object_uri
            # print('Subject {} is child of {}'.format(identifiers, parent_id))
            oc_uuid = URImanagement.get_uuid_from_oc_uri(parent_id)
            if oc_uuid:
                # prefer the local uuid form of an open context uri
                parent_id = oc_uuid
        try:
            """
            Now look for superior entities in the subject, not the object
            sorting by sort so we can privelage a certain hierarchy path
            """
            supersubj_anno = LinkAnnotation.objects.filter(object_uri__in=identifiers,
                                                           predicate_uri__in=preds_for_subobjs)\
                                                   .exclude(subject__in=identifiers)\
                                                   .order_by('sort', 'subject')[:1]
            if len(supersubj_anno) < 1:
                supersubj_anno = False
        except LinkAnnotation.DoesNotExist:
            supersubj_anno = False
        if supersubj_anno:
            # NOTE: this takes precedence over a parent found above
            parent_id = supersubj_anno[0].subject
            # print('Subject {} is parent of {}'.format(parent_id, identifiers))
            oc_uuid = URImanagement.get_uuid_from_oc_uri(parent_id)
            if oc_uuid:
                parent_id = oc_uuid
        return parent_id

    def get_entity_children(self, identifier, recursive=True):
        # Cached wrapper around _get_entity_children_db; also restores
        # the full child tree (self.child_entities) from its own cache.
        cache_key = self.m_cache.make_cache_key(self.children_prefix.format(str(recursive)),
                                                identifier)
        tree_cache_key = self.m_cache.make_cache_key(self.child_tree_prefix.format(str(recursive)),
                                                     identifier)
        obj = self.m_cache.get_cache_object(cache_key)
        tree_obj = self.m_cache.get_cache_object(tree_cache_key)
        if obj is not None and tree_obj is not None:
            # print('Hit child cache on {}'.format(identifier))
            self.child_entities = tree_obj  # the full tree of child entities
            return obj
        else:
            obj = self._get_entity_children_db(identifier, recursive)
            if obj:
                # print('Hit child DB on {}'.format(identifier))
                self.m_cache.save_cache_object(cache_key, obj)
                self.m_cache.save_cache_object(tree_cache_key, self.child_entities)
            return obj

    def _get_entity_children_db(self, identifier, recursive=True):
        """
        Gets child concepts for a given URI or UUID identified entity

        Checks both directions of link annotations. When recursive is
        True, also walks down to the children's children, accumulating
        the whole tree in self.child_entities. Returns the (possibly
        empty) list of direct children of identifier.
        """
        if not self.child_entities:
            self.child_entities = LastUpdatedOrderedDict()
        if identifier in self.child_entities and recursive:
            output = self.child_entities[identifier]
        else:
            act_children = []
            p_for_superobjs = LinkAnnotation.PREDS_SBJ_IS_SUB_OF_OBJ
            p_for_subobjs = LinkAnnotation.PREDS_SBJ_IS_SUPER_OF_OBJ
            lequiv = LinkEquivalence()
            identifiers = lequiv.get_identifier_list_variants(identifier)
            try:
                # look for child items in the objects of the assertion
                subobjs_anno = LinkAnnotation.objects.filter(subject__in=identifiers,
                                                             predicate_uri__in=p_for_subobjs)
                if(len(subobjs_anno) < 1):
                    subobjs_anno = False
            except LinkAnnotation.DoesNotExist:
                subobjs_anno = False
            if subobjs_anno is not False:
                for sub_obj in subobjs_anno:
                    child_id = sub_obj.object_uri
                    act_children.append(child_id)
            try:
                """
                Now look for subordinate entities in the subject, not the object
                """
                subsubj_anno = LinkAnnotation.objects.filter(object_uri__in=identifiers,
                                                             predicate_uri__in=p_for_superobjs)
                if len(subsubj_anno) < 1:
                    subsubj_anno = False
            except LinkAnnotation.DoesNotExist:
                subsubj_anno = False
            if subsubj_anno is not False:
                for sub_sub in subsubj_anno:
                    child_id = sub_sub.subject
                    act_children.append(child_id)
            if len(act_children) > 0:
                identifier_children = []
                for child_id in act_children:
                    if child_id.count('/') > 1:
                        # looks like a uri; prefer the local uuid form
                        oc_uuid = URImanagement.get_uuid_from_oc_uri(child_id)
                        if oc_uuid:
                            child_id = oc_uuid
                    identifier_children.append(child_id)
                    # recursively get the children of the child
                    if recursive:
                        self.get_entity_children(child_id, recursive)
                # save the list of children of the current identified item
                if identifier not in self.child_entities:
                    self.child_entities[identifier] = identifier_children
            else:
                # save an empty list for the current identified item. it has no children
                if identifier not in self.child_entities:
                    self.child_entities[identifier] = []
            output = self.child_entities[identifier]
        return output

    def get_pred_top_rank_types(self, predicate_uuid):
        """ gets the top ranked (not a subordinate) of any other
            type for a predicate

            Returns a list of the top parents for each of the
            predicate's types (each listed once), or False when the
            predicate does not exist or is not an id type.
        """
        types = False
        try:
            pred_obj = Predicate.objects.get(uuid=predicate_uuid)
        except Predicate.DoesNotExist:
            pred_obj = False
        if pred_obj is not False:
            # print('found: ' + predicate_uuid)
            if pred_obj.data_type == 'id':
                types = []
                id_list = []
                pred_types = OCtype.objects\
                                   .filter(predicate_uuid=predicate_uuid)
                for p_type in pred_types:
                    type_pars = self.get_jsonldish_entity_parents(p_type.uuid)
                    self.parent_entities = []
                    self.loop_count = 0
                    # NOTE(review): assumes type_pars is a non-empty list;
                    # get_jsonldish_entity_parents can return False -- confirm
                    if type_pars[0]['id'] not in id_list:
                        # so the top parent is only listed once
                        id_list.append(type_pars[0]['id'])
                        types.append(type_pars[0])
        return types

    def get_entity(self, identifier):
        """ Gets an entity either from the cache or from
            database lookups. This is a wrapper for the
            MemoryCache().get_entity function.
        """
        return self.m_cache.get_entity(identifier)
コード例 #9
0
class SolrUUIDs():
    """ Methods to get UUIDs from a solr
        search result JSON document,

        also makes URIs and fuller item-record dicts
    """
    def __init__(self, response_dict_json=False):
        rp = RootPath()
        self.base_url = rp.get_baseurl()
        self.uuids = []  # accumulated uuids from solr records
        self.uris = []  # accumulated uris (or item dicts) from solr records
        self.m_cache = MemoryCache()  # memory caching object
        self.s_cache = SearchGenerationCache(
        )  # supplemental caching object, specific for searching
        self.response_dict_json = response_dict_json
        self.highlighting = False  # solr highlighting results, if present
        # make values to these fields "flat" not a list
        self.flatten_rec_fields = True
        self.total_found = False  # solr 'numFound' for the response
        self.rec_start = False  # solr 'start' (paging offset) for the response
        self.min_date = False
        self.max_date = False
        # flatten list of an attribute values to single value
        self.flatten_rec_attributes = False
        # A list of (non-standard) attributes to include in a record
        self.rec_attributes = []
        self.do_media_thumbs = True  # get thumbnails for records
        self.get_all_media = False  # get links to all media files for an item

    def make_uuids_from_solr(self, solr_json):
        """ Makes a list of uuid strings from a solr response """
        # first do lots of checks to make sure the solr-json is OK
        solr_recs = self.extract_solr_recs(solr_json)
        if isinstance(solr_recs, list):
            for solr_rec in solr_recs:
                if 'uuid' in solr_rec:
                    uuid = solr_rec['uuid']
                    self.uuids.append(uuid)
        return self.uuids

    def make_uris_from_solr(self, solr_json, uris_only=True):
        """ Processes the solr_json to make a list of URIs
            (uris_only=True) or of fuller item-record dicts
        """
        solr_recs = self.extract_solr_recs(solr_json)
        if isinstance(solr_recs, list):
            # skip thumbnail lookups for plain URI lists or when all
            # media files get fetched anyway, unless explicitly requested
            if uris_only:
                self.do_media_thumbs = False
            if self.get_all_media:
                self.do_media_thumbs = False
            if 'thumbnail' in self.rec_attributes:
                self.do_media_thumbs = True
            thumbnail_data = self.get_media_thumbs(solr_recs)
            media_file_data = self.get_all_media_files(solr_recs)
            string_attrib_data = self.get_string_rec_attributes(solr_recs)
            for solr_rec in solr_recs:
                # copy response-level state into the per-record builder
                rec_props_obj = RecordProperties(self.response_dict_json)
                rec_props_obj.min_date = self.min_date
                rec_props_obj.max_date = self.max_date
                rec_props_obj.highlighting = self.highlighting
                rec_props_obj.flatten_rec_attributes = self.flatten_rec_attributes
                rec_props_obj.rec_attributes = self.rec_attributes
                rec_props_obj.thumbnail_data = thumbnail_data
                rec_props_obj.media_file_data = media_file_data
                rec_props_obj.string_attrib_data = string_attrib_data
                item_ok = rec_props_obj.get_item_basics(solr_rec)
                if item_ok:
                    if uris_only:
                        item = rec_props_obj.uri
                    else:
                        rec_props_obj.parse_solr_record(solr_rec)
                        item = self.make_item_dict_from_rec_props_obj(
                            rec_props_obj)
                    self.uris.append(item)
        return self.uris

    def make_item_dict_from_rec_props_obj(self,
                                          rec_props_obj,
                                          cannonical=True):
        """ Makes an item dictionary object from a record prop obj.
            cannonical [sic] selects canonical URIs over local hrefs.
        """
        item = LastUpdatedOrderedDict()
        item['uri'] = rec_props_obj.uri
        if cannonical is False or 'href' in self.rec_attributes:
            item['href'] = rec_props_obj.href
        item['citation uri'] = rec_props_obj.cite_uri
        item['label'] = rec_props_obj.label
        item['project label'] = rec_props_obj.project_label
        if cannonical:
            item['project uri'] = rec_props_obj.project_uri
        else:
            item['project href'] = rec_props_obj.project_href
        item['context label'] = rec_props_obj.context_label
        if cannonical:
            item['context uri'] = rec_props_obj.context_uri
        else:
            item['context href'] = rec_props_obj.context_href
        item['latitude'] = rec_props_obj.latitude
        item['longitude'] = rec_props_obj.longitude
        item['early bce/ce'] = rec_props_obj.early_date
        item['late bce/ce'] = rec_props_obj.late_date
        item['item category'] = rec_props_obj.category
        # optional fields, only added when present on the record
        if rec_props_obj.snippet is not False:
            item['snippet'] = rec_props_obj.snippet
        if rec_props_obj.thumbnail_scr is not False:
            item['thumbnail'] = rec_props_obj.thumbnail_scr
        if rec_props_obj.preview_scr is not False:
            item['preview'] = rec_props_obj.preview_scr
        if rec_props_obj.fullfile_scr is not False:
            item['primary-file'] = rec_props_obj.fullfile_scr
        item['published'] = rec_props_obj.published
        item['updated'] = rec_props_obj.updated
        if isinstance(rec_props_obj.other_attributes, list):
            for attribute in rec_props_obj.other_attributes:
                prop_key = attribute['property']
                # avoid clobbering a standard key with an attribute key
                prop_key = rec_props_obj.prevent_attribute_key_collision(
                    item, prop_key)
                if self.flatten_rec_attributes:
                    if 'value' in attribute:
                        item[prop_key] = attribute['value']
                    elif 'values_list' in attribute:
                        item[prop_key] = RecordProperties.ATTRIBUTE_DELIM.join(
                            attribute['values_list'])
                else:
                    item[prop_key] = attribute['values_list']
        return item

    def extract_solr_recs(self, solr_json):
        """ Extracts solr_recs along with
           some basic metadata from solr_json.
           Missing keys leave the corresponding attribute / return
           value as False.
        """
        solr_recs = False
        if isinstance(solr_json, dict):
            try:
                self.total_found = solr_json['response']['numFound']
            except KeyError:
                self.total_found = False
            try:
                self.rec_start = solr_json['response']['start']
            except KeyError:
                self.rec_start = False
            try:
                self.highlighting = solr_json['highlighting']
            except KeyError:
                self.highlighting = False
            try:
                solr_recs = solr_json['response']['docs']
            except KeyError:
                solr_recs = False
        return solr_recs

    def get_media_thumbs(self, solr_recs):
        """ Gets media thumbnail items, keyed by record uuid.
            A value of False means no thumbnail was found.
        """
        thumb_results = {}
        not_media_uuids = []
        media_uuids = []
        rec_props_obj = RecordProperties(self.response_dict_json)
        for solr_rec in solr_recs:
            item = rec_props_obj.get_solr_record_uuid_type(solr_rec)
            if item is not False:
                uuid = item['uuid']
                if item['item_type'] != 'media':
                    not_media_uuids.append(uuid)
                else:
                    media_uuids.append(uuid)
                thumb_results[uuid] = False  # default: no thumbnail yet
        if len(not_media_uuids) > 0:
            if self.do_media_thumbs:
                # only get media_thumbnails if needed
                rows = self.get_thumbs_for_non_media(not_media_uuids)
                for row in rows:
                    uuid = row['uuid']
                    thumb_obj = {}
                    thumb_obj[
                        'href'] = self.base_url + '/media/' + row['media_uuid']
                    thumb_obj[
                        'uri'] = settings.CANONICAL_HOST + '/media/' + row[
                            'media_uuid']
                    # NOTE(review): 'scr' looks like a typo for 'src', but
                    # downstream consumers expect this key -- confirm
                    thumb_obj['scr'] = row['file_uri']
                    # only keep the first thumbnail found for a uuid
                    if thumb_results[uuid] is False:
                        thumb_results[uuid] = thumb_obj
        if len(media_uuids) > 0:
            # media items link directly to their own thumbnail files
            thumbs = Mediafile.objects\
                              .filter(uuid__in=media_uuids,
                                      file_type='oc-gen:thumbnail')
            for thumb in thumbs:
                uuid = thumb.uuid
                thumb_obj = {}
                thumb_obj['href'] = self.base_url + '/media/' + thumb.uuid
                thumb_obj[
                    'uri'] = settings.CANONICAL_HOST + '/media/' + thumb.uuid
                thumb_obj['scr'] = thumb.file_uri
                thumb_results[uuid] = thumb_obj
        return thumb_results

    def get_all_media_files(self, solr_recs):
        """ Gets all media file links for media items, keyed by uuid
            and then by file type. Only populated when
            self.get_all_media is True.
        """
        media_file_results = {}
        if self.get_all_media:
            media_uuids = []
            rec_props_obj = RecordProperties(self.response_dict_json)
            for solr_rec in solr_recs:
                item = rec_props_obj.get_solr_record_uuid_type(solr_rec)
                if item is not False:
                    uuid = item['uuid']
                    if item['item_type'] == 'media':
                        media_uuids.append(uuid)
                    media_file_results[uuid] = False
            if len(media_uuids) > 0:
                media_files = Mediafile.objects\
                                       .filter(uuid__in=media_uuids)
                for media_file in media_files:
                    uuid = media_file.uuid
                    # replace the False placeholder with a dict of files
                    if uuid not in media_file_results:
                        media_file_results[uuid] = {}
                    else:
                        if media_file_results[uuid] is False:
                            media_file_results[uuid] = {}
                    media_file_results[uuid][
                        media_file.file_type] = media_file.file_uri
        return media_file_results

    def get_thumbs_for_non_media(self, uuid_list):
        """ Queries for thumbnail files linked (via assertions) to
            non-media items in uuid_list
        """
        q_uuids = self.make_query_uuids(uuid_list)
        query = ('SELECT ass.uuid AS uuid, m.file_uri AS file_uri, '
                 'm.uuid AS media_uuid '
                 'FROM oc_assertions AS ass '
                 'JOIN oc_mediafiles AS m ON ass.object_uuid = m.uuid '
                 'AND m.file_type=\'oc-gen:thumbnail\'  '
                 'WHERE ass.uuid IN (' + q_uuids + ') '
                 'GROUP BY ass.uuid,  m.file_uri, m.uuid; ')
        cursor = connection.cursor()
        cursor.execute(query)
        rows = self.dictfetchall(cursor)
        return rows

    def make_query_uuids(self, uuid_list):
        """ makes a quoted, comma-separated string for a SQL IN clause """
        # NOTE(review): values are quoted by string concatenation rather
        # than parameterized; uuids here come from solr / the database,
        # but parameterized queries would be safer -- consider migrating
        uuid_q = []
        for uuid in uuid_list:
            uuid = '\'' + uuid + '\''
            uuid_q.append(uuid)
        return ', '.join(uuid_q)

    def dictfetchall(self, cursor):
        """ Return all rows from a cursor as a dict """
        columns = [col[0] for col in cursor.description]
        return [dict(zip(columns, row)) for row in cursor.fetchall()]

    def get_string_rec_attributes(self, solr_recs):
        """ gets string record attributes from the database.
            The solr index does not keep string-fields in memory
        """
        output = {}
        str_attribs = {}
        for attribute in self.rec_attributes:
            entity = self.m_cache.get_entity(attribute)
            if entity:
                prop_slug = entity.slug
                # check to make sure we have the entity data type for linked fields
                if entity.data_type is False and entity.item_type == 'uri':
                    dtypes = self.s_cache.get_dtypes(entity.uri)
                    if isinstance(dtypes, list):
                        # set the data type and the act-field
                        # print('Found for ' + prop_slug + ' ' + dtypes[0])
                        entity.data_type = dtypes[0]
                if entity.data_type == 'xsd:string':
                    str_attribs[attribute] = entity
        if len(str_attribs) > 0:
            uuid_list = []
            for solr_rec in solr_recs:
                if 'uuid' in solr_rec:
                    uuid = str(solr_rec['uuid'])
                    uuid_list.append(uuid)
            output = self.get_string_attributes(uuid_list, str_attribs)
        return output

    def get_string_attributes(self, uuid_list, str_attribute_ent_dict):
        """ Gets string attribute data for a solr dict.
            Returns {'pred_ents': ..., 'data': ...} or {} when there
            is nothing to query.
        """
        output = {}
        pred_uuid_list = []
        pred_uuid_objs = {}
        for key, entity in str_attribute_ent_dict.items():
            if isinstance(entity.uuid, str):
                # add string predicate entity uuid to the list
                pred_uuid_list.append(entity.uuid)
                pred_uuid_objs[entity.uuid] = {
                    'rec_attribute': key,
                    'property': entity.label,
                    'pred_uuid': entity.uuid,
                    'slug': entity.slug
                }
        if len(pred_uuid_list) > 0 and len(uuid_list) > 0:
            q_rows = self.get_string_attributes_sql(uuid_list, pred_uuid_list)
            dict_rows = {}
            for row in q_rows:
                # print(str(row))
                # group content strings by uuid, then by predicate uuid,
                # since the rows come back flat from the database
                uuid = row['uuid']
                pred_uuid = row['predicate_uuid']
                content = row['content']
                if uuid not in dict_rows:
                    dict_rows[uuid] = {}
                if pred_uuid not in dict_rows[uuid]:
                    dict_rows[uuid][pred_uuid] = []
                if isinstance(content, str):
                    dict_rows[uuid][pred_uuid].append(content)
                    # print(str(dict_rows[uuid][pred_uuid]))
            output = {'pred_ents': pred_uuid_objs, 'data': dict_rows}
        return output

    def get_string_attributes_sql(self, uuid_list, pred_uuid_list):
        """ executes SQL query to get strings for the solr uuids and predicates """
        q_uuids = self.make_query_uuids(uuid_list)
        p_uuids = self.make_query_uuids(pred_uuid_list)
        # NOTE(review): there is no space between the closing paren and
        # 'ORDER BY'; PostgreSQL tolerates ')ORDER', but confirm intent
        query = (
            'SELECT ass.uuid AS uuid, ass.predicate_uuid AS predicate_uuid, '
            's.content AS content '
            'FROM oc_assertions AS ass '
            'JOIN oc_strings AS s ON ass.object_uuid = s.uuid '
            'WHERE ass.uuid IN (' + q_uuids + ') AND '
            'ass.predicate_uuid IN (' + p_uuids + ')'
            'ORDER BY ass.uuid,  ass.predicate_uuid, s.content; ')
        cursor = connection.cursor()
        cursor.execute(query)
        rows = self.dictfetchall(cursor)
        return rows
# Code example #10
class QueryMaker():
    """ Translates Open Context search request parameters
        into solr query parameters
    """

    # main item-types mapped to their slugs to get solr-facet field prefix
    TYPE_MAPPINGS = {'subjects': 'oc-gen-subjects',
                     'media': 'oc-gen-media',
                     'documents': 'oc-gen-documents',
                     'persons': 'oc-gen-persons',
                     'projects': 'oc-gen-projects',
                     'types': 'oc-gen-types',
                     'predicates': 'oc-gen-predicates',
                     'tables': 'oc-gen-tables'}

    # main item-types mapped to their prefixed URI identifiers
    TYPE_URIS = {'subjects': 'oc-gen:subjects',
                 'media': 'oc-gen:media',
                 'documents': 'oc-gen:documents',
                 'persons': 'oc-gen:persons',
                 'projects': 'oc-gen:projects',
                 'types': 'oc-gen:types',
                 'predicates': 'oc-gen:predicates',
                 'tables': 'oc-gen:tables'}

    def __init__(self):
        self.error = False  # error message, or False when no error
        self.histogram_groups = 10  # number of buckets for date histograms
        self.m_cache = MemoryCache() # memory caching object
        self.s_cache = SearchGenerationCache() # supplemental caching object, specific for searching

    def _get_context_paths(self, spatial_context):
        '''
        Takes a context path and returns an iterator with the list of possible
        contexts. Parses the list of boolean '||' (OR) and returns a list
        of contexts.

        For example:

        >>> _get_context_paths('Turkey/Domuztepe/I||II||Stray')

        ['Turkey/Domuztepe/I', 'Turkey/Domuztepe/II', 'Turkey/Domuztepe/Stray']

        '''
        # Split the context path by '/' and then by '||'
        context_lists = (value.split('||') for value in
                         spatial_context.split('/'))
        # Create a list of the various permutations
        context_tuple_list = list(itertools.product(*context_lists))
        # Turn the lists back into URIs
        return ('/'.join(value) for value in context_tuple_list)

    def _get_context_depth(self, spatial_context):
        '''
        Takes a context path and returns its depth as an interger. For
        example, the context '/Turkey/Domuztepe'
        would have a depth of 2.
        '''
        # Remove a possible trailing slash before calculating the depth
        return len(spatial_context.rstrip('/').split('/'))

    def _get_valid_context_slugs(self, contexts):
        '''
        Takes a list of contexts and, for valid contexts, returns a list of
        slugs
        '''
        valid_context_slugs = []
        context_list = list(contexts)
        for context in context_list:
            # Verify that the contexts are valid
            # find and save the enity to memory
            context = context.replace('+', ' ')
            context = context.replace('%20', ' ')
            # print('check: ' + context)
            entity = self.m_cache.get_entity_by_context(context)
            if entity:
                valid_context_slugs.append(entity.slug)
        # print('context-slugs: ' + str(valid_context_slugs))
        return valid_context_slugs

    def _get_parent_slug(self, slug):
        '''
        Takes a slug and returns the slug of its parent. Returns 'root' if
        a slug has no parent.
        '''
        cache_key = self.m_cache.make_cache_key('par-slug', slug)
        parent_slug = self.m_cache.get_cache_object(cache_key)
        if parent_slug is None:
            contain_obj = Containment()
            contain_obj.use_cache = False  # because it seems to introduce memory errors
            parent_slug = contain_obj.get_parent_slug_by_slug(slug)
            self.m_cache.save_cache_object(cache_key, parent_slug)
        if parent_slug:
            return parent_slug
        else:
            return 'root'

    def _prepare_filter_query(self, parent_child_slug):
        # TODO docstring
        parent_child_set = parent_child_slug.split('___')
        return parent_child_set[0].replace('-', '_') + '___context_id_fq:' + \
            parent_child_set[1]

    def expand_hierarchy_options(self,
                                 path_param_val,
                                 hier_delim='---',
                                 or_delim='||'):
        """ Expands a hierarchic path string into a list of
            hierarchically ordered item lists. An 'or_delim' inside a
            hierarchy level multiplies the output into one list per
            alternative.
        """
        if isinstance(path_param_val, list):
            raw_paths = path_param_val
        else:
            raw_paths = [path_param_val]
        expanded = []
        for raw_path in raw_paths:
            # Each hierarchy level may hold or_delim-separated options.
            level_options = [level.split(or_delim)
                             for level in raw_path.split(hier_delim)]
            # One ordered path per combination of level options.
            for combo in itertools.product(*level_options):
                expanded.append(list(combo))
        return expanded

    def get_solr_field_type(self, data_type, prefix=''):
        '''
        Map a predicate data_type to the suffix used in our dynamic
        solr field names (___pred_id, ___pred_numeric, etc.), with an
        optional prefix prepended. Raises Exception for unknown types.
        '''
        suffix_map = {
            '@id': 'id',
            'id': 'id',
            False: 'id',
            'xsd:integer': 'numeric',
            'xsd:double': 'numeric',
            'xsd:boolean': 'numeric',
            'xsd:string': 'string',
            'xsd:date': 'date',
        }
        if data_type not in suffix_map:
            raise Exception("Error: Unknown predicate type")
        return prefix + suffix_map[data_type]

    def make_prop_solr_field_parts(self, entity):
        """ Makes a solr field for a property """
        output = {}
        output['prefix'] = entity.slug.replace('-', '_')
        output['suffix'] = self.get_solr_field_type(entity.data_type)
        return output

    def process_proj(self, proj_path):
        """ Converts a (possibly hierarchic) project path parameter
            into solr 'fq' terms and 'facet.field' entries. Levels of
            a path are ANDed; alternate paths (from '||' options in
            proj_path) are ORed.
        """
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        project_path_lists = self.expand_hierarchy_options(proj_path)
        for proj_path_list in project_path_lists:
            i = 0
            path_list_len = len(proj_path_list)
            fq_field = SolrDocument.ROOT_PROJECT_SOLR
            fq_path_terms = []
            for proj_slug in proj_path_list:
                entity = self.m_cache.get_entity(proj_slug)
                if entity:
                    # fq_path_term = fq_field + ':' + self.make_solr_value_from_entity(entity)
                    # the below is a bit of a hack. We should have a query field
                    # as with ___pred_ to query just the slug. But this works for now
                    proj_slug = entity.slug
                    if len(proj_slug) > 56:
                        # NOTE(review): slugs over 56 chars get shortened
                        # and matched by prefix -- presumably to match a
                        # solr-side field-value limit; confirm
                        proj_slug = proj_slug[0:56]
                    fq_path_term = fq_field + ':' + proj_slug + '*'
                    if entity.par_proj_man_obj is not False and \
                       fq_field == SolrDocument.ROOT_PROJECT_SOLR:
                        # this entity has a parent object, so make sure to look for it as a child of
                        # that parent project
                        alt_fq_field = entity.par_proj_man_obj.slug.replace('-', '_') + '___project_id'
                        alt_fq_term = alt_fq_field + ':' + proj_slug + '*'
                        fq_path_term = ' (' + fq_path_term + ' OR ' + alt_fq_term + ' ) '
                else:
                    # unknown project: query the raw slug as given
                    fq_path_term = fq_field + ':' + proj_slug
                fq_path_terms.append(fq_path_term)
                # descend: the next level queries this project's children
                fq_field = proj_slug.replace('-', '_') + '___project_id'
                i += 1
                if i >= path_list_len and fq_field not in query_dict['facet.field']:
                    # facet on the children of the last project in the path
                    query_dict['facet.field'].append(fq_field)
            final_path_term = ' AND '.join(fq_path_terms)
            final_path_term = '(' + final_path_term + ')'
            fq_terms.append(final_path_term)
        fq_final = ' OR '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def process_ld_object(self, objects):
        # TODO docstring
        query_dict = {'fq': []}
        fq_terms = []
        if not isinstance(objects, list):
            objects = [objects]
        for raw_obj in objects:
            if '||' in raw_obj:
                or_objects = raw_obj.split('||')
            else:
                or_objects = [raw_obj]
            fq_or_terms = []
            for obj in or_objects:
                # find and save the entity to memory
                entity = self.m_cache.get_entity(obj)
                if entity:
                    fq_term = 'object_uri:' + self.escape_solr_arg(entity.uri)
                    fq_term += ' OR text:"' + self.escape_solr_arg(entity.uri) + '"'
                else:
                    fq_term = 'object_uri:' + obj
                fq_or_terms.append(fq_term)
            fq_all_ors = ' OR '.join(fq_or_terms)
            fq_all_ors = '(' + fq_all_ors + ')'
            fq_terms.append(fq_all_ors)
        fq_final = ' AND '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def process_dc_term(self, dc_param, dc_terms, add_facet=False):
        """ Converts a Dublin Core parameter (dc_param) and its term
            values into solr 'fq' terms and 'facet.field' entries.
            '||' inside a term value makes OR alternatives; separate
            term values are ANDed.
        """
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        if dc_param in DCterms.DC_META_FIELDS:
            fq_field = DCterms.DC_META_FIELDS[dc_param]
            if fq_field not in query_dict['facet.field'] and add_facet:
                query_dict['facet.field'].append(fq_field)
            add_to_fq = False
            for raw_dc_term in dc_terms:
                if '||' in raw_dc_term:
                    use_dc_terms = raw_dc_term.split('||')
                else:
                    use_dc_terms = [raw_dc_term]
                fq_path_terms = []
                for dc_term in use_dc_terms:
                    if len(dc_term) > 0:
                        add_to_fq = True
                        # check if entity exists, and or store in memory
                        entity = self.m_cache.get_entity(dc_term)
                        if entity:
                            # fq_path_term = fq_field + ':' + self.make_solr_value_from_entity(entity)
                            # the below is a bit of a hack. We should have a query field
                            # as with ___pred_ to query just the slug. But this works for now
                            fq_path_term = '(' + fq_field + '_fq:' + entity.slug + ')'
                            fq_path_term += ' OR (' + fq_field + ':' + entity.slug + '*)'
                            fq_path_term += ' OR (obj_all___' + fq_field + ':' + entity.slug + '___*)'
                            # NOTE(review): this '+=' appends the whole
                            # clause, wrapped in parens, onto itself,
                            # duplicating it ('A(A)'); '=' was likely
                            # intended -- confirm before changing
                            fq_path_term += '(' + fq_path_term + ')'
                            # print('vocab: ' + str(entity.vocabulary))
                            if entity.vocabulary == entity.label:
                                # the term is a whole vocabulary, so also
                                # facet on its child terms
                                par_slug_part = entity.slug.replace('-', '_')
                                child_facet_field = par_slug_part + '___' + fq_field
                                # NOTE(review): debug print left in the
                                # production path
                                print('adding: ' + child_facet_field)
                                query_dict['facet.field'].append(child_facet_field)
                            if dc_param == 'dc-temporal' \
                               and entity.entity_type == 'vocabulary' \
                               and 'periodo' in entity.slug:
                                # it's a temporal vocabulary from periodo
                                # so search for specific periods contained in
                                # the vocabulary
                                fq_path_term = '(' + fq_path_term +\
                                               ' OR ' + fq_path_term + '*)'
                        else:
                            # unknown term: match it as a prefix wildcard
                            if dc_term[-1] != '*':
                                dc_term += '*'
                            fq_path_term = fq_field + ':' + dc_term
                        fq_path_terms.append(fq_path_term)
                final_path_term = ' OR '.join(fq_path_terms)
                final_path_term = '(' + final_path_term + ')'
                fq_terms.append(final_path_term)
            fq_final = ' AND '.join(fq_terms)
            fq_final = '(' + fq_final + ')'
            if add_to_fq:
                query_dict['fq'].append(fq_final)
        return query_dict

    def get_related_slug_field_prefix(self, slug):
        """ Returns the related-property solr field prefix when the
            slug carries it, otherwise an empty '' string
        """
        field_prefix = SolrDocument.RELATED_SOLR_FIELD_PREFIX
        if slug.startswith(field_prefix):
            return field_prefix
        return ''

    def clean_related_slug(self, slug):
        """ Strips the related-property field_prefix from a slug,
            if present
        """
        field_prefix = SolrDocument.RELATED_SOLR_FIELD_PREFIX
        if slug.startswith(field_prefix):
            return slug[len(field_prefix):]
        return slug

    def correct_solr_prefix_for_fq(self, solr_f_prefix, act_field_fq):
        """ Prepends solr_f_prefix to act_field_fq when the prefix is
            non-empty and not already present in the field name
        """
        if solr_f_prefix != '' and solr_f_prefix not in act_field_fq:
            return solr_f_prefix + act_field_fq
        return act_field_fq

    def process_prop(self, props):
        """ processes 'prop' (property) parameters
            property parameters are tricky because they
            can come in hierarchies
            that's why there's some complexity to this
        """
        # is the property for the item itself, or for a related item?
        query_dict = {'fq': [],
                      'facet.field': [],
                      'stats.field': [],
                      'prequery-stats': [],
                      'facet.range': [],
                      'hl-queries': [],
                      'ranges': {}}
        fq_terms = []
        prop_path_lists = self.expand_hierarchy_options(props)
        for prop_path_list in prop_path_lists:
            i = 0
            path_list_len = len(prop_path_list)
            fq_path_terms = []
            act_field_fq = SolrDocument.ROOT_PREDICATE_SOLR
            act_field_data_type = 'id'
            last_field_label = False  # needed for full text highlighting
            predicate_solr_slug = False
            for prop_slug in prop_path_list:
                field_prefix = self.get_related_slug_field_prefix(prop_slug)
                solr_f_prefix = field_prefix.replace('-', '_')
                db_prop_slug = self.clean_related_slug(prop_slug)
                l_prop_entity = False
                pred_prop_entity = False
                require_id_field = False
                if act_field_data_type == 'id':
                    # check entity exists, and save to memory
                    entity = self.m_cache.get_entity(db_prop_slug)
                    if entity:
                        last_field_label = entity.label
                        prop_slug = field_prefix + entity.slug
                        if entity.item_type == 'uri' and not db_prop_slug.startswith('oc-gen'):
                            if entity.entity_type == 'property':
                                pred_prop_entity = True
                                predicate_solr_slug = prop_slug.replace('-', '_')
                                l_prop_entity = True
                                children = LinkRecursion().get_entity_children(entity.uri)
                                if len(children) > 0:
                                    # ok, this field has children. require it
                                    # to be treated as an ID field
                                    require_id_field = True
                        else:
                            if entity.item_type == 'predicates':
                                pred_prop_entity = True
                                predicate_solr_slug = prop_slug.replace('-', '_')
                                children = LinkRecursion().get_entity_children(entity.uri)
                                if len(children) > 0:
                                    # ok, this field has children. require it
                                    # to be treated as an ID field
                                    require_id_field = True
                        if i == 0:
                            if db_prop_slug.startswith('oc-gen'):
                                # for open context categories / types
                                act_field_fq = self.get_parent_item_type_facet_field(entity.uri)
                                lr = LinkRecursion()
                                parents = lr.get_jsonldish_entity_parents(entity.uri)
                                if len(parents) > 1:
                                    try:
                                        p_slug = parents[-2]['slug']
                                        act_field_fq = p_slug.replace('-', '_') + '___pred_id'
                                        act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                                    except:
                                        pass
                                        print('Predicate Parent exception: '+ str(parents))
                            elif entity.item_type == 'uri':
                                act_field_fq = SolrDocument.ROOT_LINK_DATA_SOLR
                            elif entity.item_type == 'predicates':
                                temp_field_fq = self.get_parent_item_type_facet_field(entity.uri)
                                lr = LinkRecursion()
                                parents = lr.get_jsonldish_entity_parents(entity.uri)
                                if len(parents) > 1:
                                    try:
                                        p_slug = parents[-2]['slug']
                                        temp_field_fq = p_slug.replace('-', '_') + '___pred_id'
                                    except:
                                        print('Predicate Parent exception: '+ str(parents))
                                        temp_field_fq = False
                                if temp_field_fq is not False:
                                    act_field_fq = temp_field_fq
                                else:
                                    act_field_fq = SolrDocument.ROOT_PREDICATE_SOLR
                            else:
                                act_field_fq = SolrDocument.ROOT_PREDICATE_SOLR
                        # ---------------------------------------------------
                        # THIS PART BUILDS THE FACET-QUERY
                        # fq_path_term = fq_field + ':' + self.make_solr_value_from_entity(entity)
                        # the below is a bit of a hack. We should have a query field
                        # as with ___pred_ to query just the slug. But this works for now
                        fq_field = act_field_fq + '_fq'
                        if path_list_len >= 2 and act_field_data_type == 'id':
                            # could be an object deeper in the hierarchy, so allow the obj_all version
                            fq_path_term = '(' + fq_field + ':' + prop_slug
                            fq_path_term += ' OR obj_all___' + fq_field + ':' + prop_slug + ')'
                        else:
                            fq_path_term = fq_field + ':' + prop_slug
                        fq_path_terms.append(fq_path_term)
                        #---------------------------------------------------
                        #
                        #---------------------------------------------------
                        # THIS PART PREPARES FOR LOOPING OR FINAL FACET-FIELDS
                        #
                        # print('pred-solr-slug: ' + predicate_solr_slug)
                        field_parts = self.make_prop_solr_field_parts(entity)
                        act_field_data_type = field_parts['suffix']
                        if require_id_field:
                            act_field_data_type = 'id'
                            field_parts['suffix'] = 'id'
                        # check if the last or penultimate field has
                        # a different data-type (for linked-data)
                        if i >= (path_list_len - 2) \
                           and l_prop_entity:
                            dtypes = self.s_cache.get_dtypes(entity.uri)
                            if isinstance(dtypes, list):
                                # set the data type and the act-field
                                act_field_data_type = self.get_solr_field_type(dtypes[0])
                        if not predicate_solr_slug or pred_prop_entity:
                            act_field_fq = field_parts['prefix'] + '___pred_' + field_parts['suffix']
                            act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                            # get a facet on this field
                            if act_field_data_type != 'string':
                                # adds a prefix for related properties
                                ffield = solr_f_prefix + field_parts['prefix'] + '___pred_' + field_parts['suffix']
                                if ffield not in query_dict['facet.field'] and \
                                   i >= (path_list_len - 1):
                                    query_dict['facet.field'].append(ffield)
                        else:
                            if act_field_data_type == 'id':
                                act_field_fq = 'obj_all___' + predicate_solr_slug \
                                               + '___pred_' + field_parts['suffix']
                                # get a facet on this field
                                if predicate_solr_slug != field_parts['prefix']:
                                    # the predicate_solr_slug is not the
                                    # prefix of the current field part, meaning
                                    # the field_parts[prefix] is the type, and
                                    # we want facets for the predicate -> type
                                    ffield = field_parts['prefix'] \
                                             + '___' \
                                             + predicate_solr_slug \
                                             + '___pred_' + field_parts['suffix']
                                else:
                                    # get facets for the predicate
                                    ffield = field_parts['prefix'] \
                                             + '___pred_' \
                                             + field_parts['suffix']
                                # adds a prefix, in case of a related property
                                ffield = solr_f_prefix + ffield
                                if ffield not in query_dict['facet.field'] \
                                   and i >= (path_list_len - 1):
                                    query_dict['facet.field'].append(ffield)
                            else:
                                act_field_fq = predicate_solr_slug + '___pred_' + field_parts['suffix']
                        # -------------------------------------------
                        if act_field_data_type == 'numeric':
                            # print('Numeric field: ' + act_field)
                            act_field_fq = field_parts['prefix'] + '___pred_numeric'
                            act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                            query_dict = self.add_math_facet_ranges(query_dict,
                                                                    act_field_fq,
                                                                    entity)
                        elif act_field_data_type == 'date':
                            # print('Date field: ' + act_field)
                            act_field_fq = field_parts['prefix'] + '___pred_date'
                            act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                            query_dict = self.add_date_facet_ranges(query_dict,
                                                                    act_field_fq,
                                                                    entity)
                        # print('Current data type (' + str(i) + '): ' + act_field_data_type)
                        # print('Current field (' + str(i) + '): ' + act_field_fq)
                    i += 1
                elif act_field_data_type == 'string':
                    # case for a text search
                    # last_field_label = False  # turn off using the field label for highlighting
                    string_terms = self.prep_string_search_term(prop_slug)
                    for escaped_term in string_terms:
                        search_term = act_field_fq + ':' + escaped_term
                        if last_field_label is False:
                            query_dict['hl-queries'].append(escaped_term)
                        else:
                            query_dict['hl-queries'].append(last_field_label + ' ' + escaped_term)
                        fq_path_terms.append(search_term)
                elif act_field_data_type == 'numeric':
                    # numeric search. assume it's well formed solr numeric request
                    search_term = act_field_fq + ':' + prop_slug
                    fq_path_terms.append(search_term)
                    # now limit the numeric ranges from query to the range facets
                    query_dict = self.add_math_facet_ranges(query_dict,
                                                            act_field_fq,
                                                            False,
                                                            prop_slug)
                elif act_field_data_type == 'date':
                    # date search. assume it's well formed solr request
                    search_term = act_field_fq + ':' + prop_slug
                    fq_path_terms.append(search_term)
                    # now limit the date ranges from query to the range facets
                    query_dict = self.add_date_facet_ranges(query_dict,
                                                            act_field_fq,
                                                            False,
                                                            prop_slug)
            final_path_term = ' AND '.join(fq_path_terms)
            final_path_term = '(' + final_path_term + ')'
            fq_terms.append(final_path_term)
        fq_final = ' OR '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def add_math_facet_ranges(self,
                              query_dict,
                              act_field,
                              entity=False,
                              solr_query=False):
        """Set up solr facet ranges for a numeric field.

        With an entity (unknown value limits) the field is queued for a
        stats pre-query. With a solr_query string, the numeric bounds
        are parsed out of the query and used to compute the facet range
        start / end / gap parameters over self.histogram_groups buckets.
        Returns the (mutated) query_dict.
        """
        groups = self.histogram_groups
        param_base = 'f.' + act_field
        fstart = param_base + '.facet.range.start'
        fend = param_base + '.facet.range.end'
        fgap = param_base + '.facet.range.gap'
        findex = param_base + '.facet.sort'
        fother = param_base + '.facet.range.other'
        finclude = param_base + '.facet.range.include'
        bounds = None
        if entity is not False:
            # no known value limits; a stats pre-query must run first
            query_dict['prequery-stats'].append(act_field)
        elif solr_query is not False:
            # pull every numeric literal out of the query string
            numbers = sorted(
                float(num_str) for num_str
                in re.findall(r'[-+]?\d*\.\d+|\d+', solr_query))
            if len(numbers) > 1:
                bounds = (numbers[0], numbers[-1])
        if bounds is not None:
            min_val, max_val = bounds
            if act_field not in query_dict['stats.field']:
                query_dict['stats.field'].append(act_field)
            if act_field not in query_dict['facet.range']:
                query_dict['facet.range'].append(act_field)
            query_dict['ranges'][fother] = 'all'
            query_dict['ranges'][finclude] = 'all'
            query_dict['ranges'][fstart] = min_val
            query_dict['ranges'][fend] = max_val
            query_dict['ranges'][fgap] = (max_val - min_val) / groups
            query_dict['ranges'][findex] = 'index'  # sort by index, not by count
        return query_dict

    def add_date_facet_ranges(self,
                              query_dict,
                              act_field,
                              entity=False,
                              solr_query=False):
        """ sets up solr facet ranges for a date field

            With an entity (unknown value limits) the field gets queued
            for a stats pre-query. With a solr_query string, the date
            bounds are parsed from the query and used to set the facet
            range start / end / gap parameters.
        """
        ok = False
        groups = 4  # number of facet range buckets for dates
        fstart = 'f.' + act_field + '.facet.range.start'
        fend = 'f.' + act_field + '.facet.range.end'
        fgap = 'f.' + act_field + '.facet.range.gap'
        findex = 'f.' + act_field + '.facet.sort'
        fother = 'f.' + act_field + '.facet.range.other'
        finclude = 'f.' + act_field + '.facet.range.include'
        if entity is not False:
            # this is a field with no value limits
            # we need to do a stats-prequery first
            query_dict['prequery-stats'].append(act_field)
        else:
            if solr_query is not False:
                q_dt_strs = re.findall(r'\d{4}-\d{2}-\d{2}[T:]\d{2}:\d{2}:\d{2}', solr_query)
                if len(q_dt_strs) < 2:
                    # try a less strict regular expression to get dates
                    q_dt_strs = re.findall(r'\d{4}-\d{2}-\d{2}', solr_query)
                if len(q_dt_strs) >= 2:
                    ok = True
                    vals = sorted(q_dt_strs)
                    min_val = vals[0]
                    # Fix: take the LAST (largest) sorted value. The
                    # previous vals[1] silently picked the second-smallest
                    # date whenever more than two dates matched.
                    max_val = vals[-1]
        if ok:
            if act_field not in query_dict['stats.field']:
                query_dict['stats.field'].append(act_field)
            if act_field not in query_dict['facet.range']:
                query_dict['facet.range'].append(act_field)
            query_dict['ranges'][fother] = 'all'
            query_dict['ranges'][finclude] = 'all'
            query_dict['ranges'][fstart] = self.convert_date_to_solr_date(min_val)
            query_dict['ranges'][fend] = self.convert_date_to_solr_date(max_val)
            query_dict['ranges'][fgap] = self.get_date_difference_for_solr(min_val, max_val, groups)
            query_dict['ranges'][findex] = 'index'  # sort by index, not by count
        return query_dict

    def get_date_difference_for_solr(self, min_date, max_date, groups):
        """Return a solr gap string (e.g. '+2YEAR', '+15DAY') covering
        one of *groups* equal slices of [min_date, max_date].

        Falls back to '+1YEAR' for a zero (or sub-second) difference.
        """
        span = (self.date_convert(max_date) - self.date_convert(min_date)) / groups
        days = span.days
        if days >= 366:
            return '+' + str(int(round(days / 365.25, 0))) + 'YEAR'
        if days >= 31:
            return '+' + str(int(round(days / 30, 0))) + 'MONTH'
        if days >= 1:
            return '+' + str(int(round(days, 0))) + 'DAY'
        hours = span.seconds // 3600
        if hours >= 1:
            return '+' + str(int(round(hours, 0))) + 'HOUR'
        minutes = (span.seconds % 3600) // 60
        if minutes >= 1:
            return '+' + str(int(round(minutes, 0))) + 'MINUTE'
        if span.seconds >= 1:
            return '+' + str(int(round(span.seconds, 0))) + 'SECOND'
        # degenerate / zero-length span
        return '+1YEAR'

    def add_solr_gap_to_date(self, date_val, solr_gap):
        """Return date_val advanced by a solr gap string like '+2MONTH'.

        The numeric part of the gap is parsed out; YEAR/MONTH are
        approximated as 365.25 / 30 days respectively. An unrecognized
        unit leaves the date unchanged.
        """
        amount = int(float(re.sub(r'[^\d.]', r'', solr_gap)))
        dt = self.date_convert(date_val)
        if 'YEAR' in solr_gap:
            delta = datetime.timedelta(days=int(round((amount * 365.25), 0)))
        elif 'MONTH' in solr_gap:
            delta = datetime.timedelta(days=(amount * 30))
        elif 'DAY' in solr_gap:
            delta = datetime.timedelta(days=amount)
        elif 'HOUR' in solr_gap:
            delta = datetime.timedelta(hours=amount)
        elif 'MINUTE' in solr_gap:
            delta = datetime.timedelta(minutes=amount)
        elif 'SECOND' in solr_gap:
            delta = datetime.timedelta(seconds=amount)
        else:
            delta = datetime.timedelta()
        return dt + delta

    def convert_date_to_solr_date(self, date_val):
        """Convert a date value (ISO string or datetime) into a
        solr-formatted datetime string ('%Y-%m-%dT%H:%M:%SZ')."""
        return self.date_convert(date_val).strftime('%Y-%m-%dT%H:%M:%SZ')

    def make_human_readable_date(self, date_val):
        """Return a compact human-readable date string.

        Gives 'YYYY-MM-DD' when the value carries no time-of-day
        component; otherwise 'YYYY-MM-DD:HH:MM:SS'.
        """
        dt = self.date_convert(date_val)
        # Fix: the original compared date_convert(date_val) against
        # itself, which is always equal, so the time portion was always
        # dropped. Compare against midnight of the same day instead.
        if dt == dt.replace(hour=0, minute=0, second=0, microsecond=0):
            return dt.strftime('%Y-%m-%d')
        return dt.strftime('%Y-%m-%d:%H:%M:%S')

    def date_convert(self, date_val):
        """Coerce an ISO 'YYYY-MM-DDTHH:MM:SS[Z]' string into a python
        datetime; non-string values pass through unchanged."""
        if not isinstance(date_val, str):
            return date_val
        return datetime.datetime.strptime(
            date_val.replace('Z', ''), '%Y-%m-%dT%H:%M:%S')

    def get_parent_item_type_facet_field(self, category_uri):
        """ Gets the parent facet field for a given category_uri.

            Walks the category's parent hierarchy and returns the solr
            facet field for the first parent found in TYPE_MAPPINGS,
            or False when none match. Assumes category_uri is an entity
            that exists in the database.
        """
        parents = LinkRecursion().get_jsonldish_entity_parents(category_uri)
        mapped_slugs = self.TYPE_MAPPINGS.values()
        for parent in parents:
            if parent['slug'] in mapped_slugs:
                # the parent exists in the Type Mappings
                return parent['slug'].replace('-', '_') + '___pred_id'
        return False

    def get_parent_entity_facet_field(self, entity_uri):
        """ Gets the parent facet field for a given entity_uri.

            Returns the solr facet field named after the penultimate
            parent of the entity, or False when there is no parent
            chain. Assumes entity_uri exists in the database.
        """
        parents = LinkRecursion().get_jsonldish_entity_parents(entity_uri)
        if isinstance(parents, list) and len(parents) > 1:
            # use the penultimate parent for the facet field
            return parents[-2]['slug'].replace('-', '_') + '___pred_id'
        return False

    def process_item_type(self, raw_item_type):
        """Build a solr query dict filtering on item_type.

        Expands '||'-delimited hierarchy options, ORs the resulting
        item_type filters together, and adds a facet field for any
        item type with a mapped root facet in TYPE_MAPPINGS.
        (Removed dead locals i / path_list_len / fq_path_terms that
        were assigned but never used.)
        """
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        item_type_lists = self.expand_hierarchy_options(raw_item_type)
        for item_type_list in item_type_lists:
            # no hierarchy in this field, just the type
            item_type = item_type_list[0]
            fq_terms.append('item_type:' + item_type)
            if item_type in self.TYPE_MAPPINGS:
                act_field = self.TYPE_MAPPINGS[item_type].replace('-', '_') + '___pred_id'
                query_dict['facet.field'].append(act_field)
        fq_final = '(' + ' OR '.join(fq_terms) + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def process_id(self, identifier):
        """Build a solr query dict matching an identifier.

        Matches the raw identifier (and its http/https variant)
        against persistent_uri and uuid fields, then rebuilds
        canonical DOI / ARK / ORCID URIs in case the identifier was
        given 'naked', and finally checks for an Open Context UUID.
        """
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        for candidate in self.make_http_https_options([identifier]):
            escaped = self.escape_solr_arg(candidate)
            fq_terms.append('persistent_uri:' + escaped)
            fq_terms.append('uuid:' + escaped)
        # Strip common ID-scheme prefixes (case-insensitive) so a full
        # DOI / ORCID URI is reduced to its naked identifier.
        for scheme_prefix in ['doi:',
                              'orcid:',
                              'http://dx.doi.org/',
                              'https://dx.doi.org/',
                              'http://doi.org/',
                              'https://doi.org/']:
            identifier = re.compile(re.escape(scheme_prefix),
                                    re.IGNORECASE).sub('', identifier)
        # Rebuild canonical URIs in case we were handed a naked ID,
        # in both http and https forms.
        candidate_uris = self.make_http_https_options([
            'http://dx.doi.org/' + identifier,  # DOI (old)
            'http://doi.org/' + identifier,  # DOI (new)
            'http://n2t.net/' + identifier,  # ARK (CDL / Merritt)
            'http://orcid.org/' + identifier  # Orcid (people)
        ])
        fq_terms += ['persistent_uri:' + self.escape_solr_arg(uri)
                     for uri in candidate_uris]
        oc_check = URImanagement.get_uuid_from_oc_uri(identifier, True)
        if oc_check is not False:
            fq_terms.append('uuid:' + oc_check['uuid'])
        query_dict['fq'].append('(' + ' OR '.join(fq_terms) + ')')
        return query_dict

    def process_form_use_life_chrono(self, raw_form_use_life_chrono):
        """Build a solr facet query for form-use-life chronology tiles.

        Supports OR ('||') delimited tile paths; paths shorter than a
        full tile string get a trailing wildcard so they match all
        deeper tiles. (Removed a dead local `i = 0` that was assigned
        but never used.)
        """
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        query_dict['facet.field'].append('form_use_life_chrono_tile')
        if '||' in raw_form_use_life_chrono:
            chrono_paths = raw_form_use_life_chrono.split('||')
        else:
            chrono_paths = [raw_form_use_life_chrono]
        for chrono_path in chrono_paths:
            if len(chrono_path) < 30:
                # partial tile path: wildcard to match deeper tiles
                chrono_path += '*'
            fq_terms.append('form_use_life_chrono_tile:' + chrono_path)
        fq_final = '(' + ' OR '.join(fq_terms) + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def process_form_date_chrono(self, form_use_life_date, date_type):
        """Build a solr fq clause for a form-use-life date limit.

        date_type 'start' filters on the earliest date (open-ended
        upward); anything else filters on the latest date.
        """
        query_dict = {'fq': [],
                      'facet.field': []}
        if date_type == 'start':
            fquery = ('form_use_life_chrono_earliest: ['
                      + str(form_use_life_date) + ' TO *]')
        else:
            fquery = ('form_use_life_chrono_latest: [* TO '
                      + str(form_use_life_date) + ']')
        query_dict['fq'].append(fquery)
        return query_dict

    def process_discovery_geo(self, raw_disc_geo):
        """Build a solr facet query for discovery geotiles.

        Supports OR ('||') delimited tile paths; paths shorter than a
        full tile string get a trailing wildcard so they match all
        deeper tiles. (Removed a dead local `i = 0` that was assigned
        but never used.)
        """
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        query_dict['facet.field'].append('discovery_geotile')
        if '||' in raw_disc_geo:
            disc_geo_paths = raw_disc_geo.split('||')
        else:
            disc_geo_paths = [raw_disc_geo]
        for disc_path in disc_geo_paths:
            if len(disc_path) < 20:
                # partial tile path: wildcard to match deeper tiles
                disc_path += '*'
            fq_terms.append('discovery_geotile:' + disc_path)
        fq_final = '(' + ' OR '.join(fq_terms) + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def process_discovery_bbox(self, raw_disc_bbox):
        """Build a solr fq for bounding-box searches; supports '||' ORs.

        Coordinates arrive in GeoJSON order (lon, lat) but solr wants
        latitude,longitude — hence the index swap below.
        """
        query_dict = {'fq': []}
        fq_terms = []
        if '||' in raw_disc_bbox:
            bbox_list = raw_disc_bbox.split('||')
        else:
            bbox_list = [raw_disc_bbox]
        for bbox in bbox_list:
            if ',' not in bbox:
                # not a comma-separated coordinate list
                continue
            coords = bbox.split(',')
            if not self.validate_bbox_coordiantes(coords):
                continue
            # valid bounding box; swap to solr's lat,lon ordering
            fq_terms.append(
                'discovery_geolocation:'
                + '[' + str(coords[1]) + ',' + str(coords[0])
                + ' TO ' + str(coords[3]) + ',' + str(coords[2]) + ']')
        if len(fq_terms) > 0:
            query_dict['fq'].append('(' + ' OR '.join(fq_terms) + ')')
        return query_dict

    def validate_bbox_coordiantes(self, bbox_coors):
        """Validate a 4-item [lon1, lat1, lon2, lat2] bounding box.

        (NOTE: the method name keeps its historical misspelling for
        compatibility with existing callers.)
        """
        if len(bbox_coors) != 4:
            return False
        lower_left_ok = self.validate_geo_lon_lat(bbox_coors[0],
                                                  bbox_coors[1])
        top_right_ok = self.validate_geo_lon_lat(bbox_coors[2],
                                                 bbox_coors[3])
        if not (lower_left_ok and top_right_ok):
            return False
        # the lower-left corner must be strictly south-west of top-right
        return bool(float(bbox_coors[0]) < float(bbox_coors[2])
                    and float(bbox_coors[1]) < float(bbox_coors[3]))

    def validate_geo_lon_lat(self, lon, lat):
        """ True when both parts of a lon, lat pair are valid.
            Note the GeoJSON ordering of the coordinates.
        """
        lon_ok = self.validate_geo_coordinate(lon, 'lon')
        lat_ok = self.validate_geo_coordinate(lat, 'lat')
        return bool(lon_ok and lat_ok)

    def validate_geo_coordinate(self, coordinate, coord_type):
        """Check that *coordinate* parses as a float inside the legal
        range for its type: 'lat' in [-90, 90], 'lon' in [-180, 180].
        Unparseable values and unknown types are invalid.
        """
        try:
            value = float(coordinate)
        except ValueError:
            return False
        if 'lat' in coord_type:
            return bool(-90 <= value <= 90)
        if 'lon' in coord_type:
            return bool(-180 <= value <= 180)
        return False

    def make_solr_value_from_entity(self, entity, value_type='id'):
        """ makes a solr value as indexed in SolrDocument
            see _concat_solr_string_value

            Open Context URIs (except vocabulary URIs) are shortened to
            their path component so they match how SolrDocument indexes
            them. (Removed an unreachable second `return output`
            statement; `output` was never defined.)
        """
        id_part = entity.uri
        if 'http://opencontext.org' in entity.uri:
            if '/vocabularies/' not in entity.uri:
                id_part = entity.uri.split('http://opencontext.org')[1]
        return entity.slug + '___' + value_type + '___' + \
            id_part + '___' + entity.label

    def _process_spatial_context(self, spatial_context=None):
        # TODO docstring
        context = {}
        if spatial_context:
            context_paths = self._get_context_paths(spatial_context)
            context_slugs = self._get_valid_context_slugs(context_paths)
            # print('Context slugs: ' + str(context_slugs))
            # If we cannot find a valid context, raise a 404
            if not context_slugs:
                raise Http404
            # Solr 'fq' parameters
            parent_child_slugs = []
            # Solr 'facet.field' parameters
            facet_field = []
            for slug in context_slugs:
                # fq parameters
                parent_child_slugs.append(self._get_parent_slug(slug) + '___' + slug)
                # facet.field parameters
                facet_field.append(slug.replace('-', '_') + '___context_id')
            # First, handle the most likely scenario of a single context
            if len(parent_child_slugs) == 1:
                context['fq'] = self._prepare_filter_query(parent_child_slugs[0])
            # Otherwise, combine multiple contexts into an OR filter
            else:
                fq_string = ' OR '.join(
                    (self._prepare_filter_query(slug_set) for slug_set
                        in parent_child_slugs)
                    )
                context['fq'] = '(' + fq_string + ')'
            context['facet.field'] = facet_field
        # No spatial context provided
        else:
            context['fq'] = None
            context['facet.field'] = ['root___context_id']
        return context

    def prep_string_search_term(self, raw_term):
        """Split a raw search string into escaped terms for AND queries.

        Quoted phrases become their own (re-quoted) terms, alongside
        one term for the whole string with the quotes blanked out.
        """
        if '"' not in raw_term:
            return [self.escape_solr_arg(raw_term)]
        # get rid of quotes in the search term, escape the remainder
        terms = [self.escape_solr_arg(raw_term.replace('"', ' '))]
        for phrase in re.findall(r"\"(.*?)\"", raw_term):
            # escape the phrase, then put the quotes back around it
            terms.append('"' + self.escape_solr_arg(phrase) + '"')
        return terms
    
    def make_http_https_options(self, terms):
        """For every http:// or https:// string in a list of terms, add
        the alternate-scheme variant right after it; anything that is
        not a list passes through unchanged."""
        if not isinstance(terms, list):
            return terms
        output_terms = []
        for term in terms:
            output_terms.append(term)
            if not isinstance(term, str):
                continue
            if 'http://' in term:
                output_terms.append(term.replace('http://', 'https://'))
            elif 'https://' in term:
                output_terms.append(term.replace('https://', 'http://'))
        return output_terms

    def escaped_seq(self, term):
        """ Yield each character of term, prefixing a backslash onto
            any solr query special character """
        # every special character escapes to backslash + itself
        specials = set('+-&|!(){}[]^~*?:"; ')
        for char in term:
            if char in specials:
                yield '\\' + char
            else:
                yield char

    def escape_solr_arg(self, term):
        """ Escape solr query special characters (:, *, ?, etc.)
            in a search term """
        # escape backslashes first so later escapes are not doubled
        term = term.replace('\\', r'\\')
        return ''.join(self.escaped_seq(term))
コード例 #11
0
ファイル: uuids.py プロジェクト: mfindeisen/open-context-py
class SolrUUIDs():
    """ methods to get UUIDs from a solr
        search result JSON document,

        also makes URIs and (optionally) richer record
        dictionaries for each solr document
    """

    def __init__(self, response_dict_json=False):
        rp = RootPath()
        self.base_url = rp.get_baseurl()
        self.uuids = []  # uuids collected from solr docs
        self.uris = []  # uris (or record dicts) built from solr docs
        self.mem_cache_obj = MemoryCache()  # memory caching object
        self.response_dict_json = response_dict_json
        self.highlighting = False  # solr 'highlighting' dict, when present
        # make values to these fields "flat" not a list
        self.flatten_rec_fields = True
        self.total_found = False  # solr response 'numFound'
        self.rec_start = False  # solr response paging 'start' offset
        self.min_date = False
        self.max_date = False
        # flatten list of an attribute values to single value
        self.flatten_rec_attributes = False
        # A list of (non-standard) attributes to include in a record
        self.rec_attributes = []
        self.do_media_thumbs = True  # get thumbnails for records
        self.get_all_media = False  # get links to all media files for an item

    def make_uuids_from_solr(self, solr_json):
        """ collects the 'uuid' field of every solr document in
            solr_json into self.uuids and returns that list
        """
        #first do lots of checks to make sure the solr-json is OK
        solr_recs = self.extract_solr_recs(solr_json)
        if isinstance(solr_recs, list):
            for solr_rec in solr_recs:
                if 'uuid' in solr_rec:
                    uuid = solr_rec['uuid']
                    self.uuids.append(uuid)
        return self.uuids

    def make_uris_from_solr(self, solr_json, uris_only=True):
        """ processes the solr_json to make a list of URIs
            (uris_only=True) or of record dictionaries with
            more item metadata (uris_only=False)
        """
        solr_recs = self.extract_solr_recs(solr_json)
        if isinstance(solr_recs, list):
            # thumbnails are skipped for plain URI output or when all
            # media files are requested, unless explicitly asked for
            # via the 'thumbnail' record attribute
            if uris_only:
                self.do_media_thumbs = False
            if self.get_all_media:
                self.do_media_thumbs = False
            if 'thumbnail' in self.rec_attributes:
                self.do_media_thumbs = True
            thumbnail_data = self.get_media_thumbs(solr_recs)
            media_file_data = self.get_all_media_files(solr_recs)
            string_attrib_data = self.get_string_rec_attributes(solr_recs)
            for solr_rec in solr_recs:
                # configure a RecordProperties helper with the
                # current output options and pre-fetched lookups
                rec_props_obj = RecordProperties(self.response_dict_json)
                rec_props_obj.mem_cache_obj = self.mem_cache_obj
                rec_props_obj.min_date = self.min_date
                rec_props_obj.max_date = self.max_date
                rec_props_obj.highlighting = self.highlighting
                rec_props_obj.flatten_rec_attributes = self.flatten_rec_attributes
                rec_props_obj.rec_attributes = self.rec_attributes
                rec_props_obj.thumbnail_data = thumbnail_data
                rec_props_obj.media_file_data = media_file_data
                rec_props_obj.string_attrib_data = string_attrib_data
                item_ok = rec_props_obj.get_item_basics(solr_rec)
                if item_ok:
                    if uris_only:
                        item = rec_props_obj.uri
                    else:
                        rec_props_obj.parse_solr_record(solr_rec)
                        self.mem_cache_obj = rec_props_obj.mem_cache_obj  # add to existing list of entities, reduce lookups
                        item = self.make_item_dict_from_rec_props_obj(rec_props_obj)
                    self.uris.append(item)
        return self.uris

    def make_item_dict_from_rec_props_obj(self, rec_props_obj, cannonical=True):
        """ makes item dictionary object from a record prop obj

            :param rec_props_obj: a parsed RecordProperties instance
            :param cannonical: when True use canonical URIs; when
                False use local hrefs for project / context links
            :return: ordered dict of the item's public attributes
        """
        item = LastUpdatedOrderedDict()
        item['uri'] = rec_props_obj.uri
        if cannonical is False or 'href' in self.rec_attributes:
            item['href'] = rec_props_obj.href
        item['citation uri'] = rec_props_obj.cite_uri
        item['label'] = rec_props_obj.label
        item['project label'] = rec_props_obj.project_label
        if cannonical:
            item['project uri'] = rec_props_obj.project_uri
        else:
            item['project href'] = rec_props_obj.project_href
        item['context label'] = rec_props_obj.context_label
        if cannonical:
            item['context uri'] = rec_props_obj.context_uri
        else:
            item['context href'] = rec_props_obj.context_href
        item['latitude'] = rec_props_obj.latitude
        item['longitude'] = rec_props_obj.longitude
        item['early bce/ce'] = rec_props_obj.early_date
        item['late bce/ce'] = rec_props_obj.late_date
        item['item category'] = rec_props_obj.category
        # the following are only present for some records, so only
        # add the keys when a value exists
        if rec_props_obj.snippet is not False:
            item['snippet'] = rec_props_obj.snippet
        if rec_props_obj.thumbnail_scr is not False:
            item['thumbnail'] = rec_props_obj.thumbnail_scr
        if rec_props_obj.preview_scr is not False:
            item['preview'] = rec_props_obj.preview_scr
        if rec_props_obj.fullfile_scr is not False:
            item['primary-file'] = rec_props_obj.fullfile_scr
        item['published'] = rec_props_obj.published
        item['updated'] = rec_props_obj.updated
        if isinstance(rec_props_obj.other_attributes, list):
            for attribute in rec_props_obj.other_attributes:
                prop_key = attribute['property']
                # avoid overwriting a standard key with an attribute key
                prop_key = rec_props_obj.prevent_attribute_key_collision(item,
                                                                         prop_key)
                if self.flatten_rec_attributes:
                    if 'value' in attribute:
                        item[prop_key] = attribute['value']
                    elif 'values_list' in attribute:
                        item[prop_key] = RecordProperties.ATTRIBUTE_DELIM.join(attribute['values_list'])
                else:
                    item[prop_key] = attribute['values_list']
        return item

    def extract_solr_recs(self, solr_json):
        """ extracts solr_recs (the 'docs' list) along with
           some basic metadata (numFound, start, highlighting)
           from solr_json; returns False when missing
        """
        solr_recs = False
        if isinstance(solr_json, dict):
            try:
                self.total_found = solr_json['response']['numFound']
            except KeyError:
                self.total_found = False
            try:
                self.rec_start = solr_json['response']['start']
            except KeyError:
                self.rec_start = False
            try:
                self.highlighting = solr_json['highlighting']
            except KeyError:
                self.highlighting = False
            try:
                solr_recs = solr_json['response']['docs']
            except KeyError:
                solr_recs = False
        return solr_recs

    def get_media_thumbs(self, solr_recs):
        """ gets media thumbnail items, keyed by record uuid;
            a value of False means no thumbnail was found
        """
        thumb_results = {}
        not_media_uuids = []
        media_uuids = []
        rec_props_obj = RecordProperties(self.response_dict_json)
        for solr_rec in solr_recs:
            item = rec_props_obj.get_solr_record_uuid_type(solr_rec)
            if item is not False:
                uuid = item['uuid']
                if item['item_type'] != 'media':
                    not_media_uuids.append(uuid)
                else:
                    media_uuids.append(uuid)
                thumb_results[uuid] = False
        if len(not_media_uuids) > 0:
            if self.do_media_thumbs:
                # only get media_thumbnails if needed
                # non-media items find their thumbnail through an
                # assertion linking them to a media item
                rows = self.get_thumbs_for_non_media(not_media_uuids)
                for row in rows:
                    uuid = row['uuid']
                    thumb_obj = {}
                    thumb_obj['href'] = self.base_url + '/media/' + row['media_uuid']
                    thumb_obj['uri'] = settings.CANONICAL_HOST + '/media/' + row['media_uuid']
                    thumb_obj['scr'] = row['file_uri']
                    # keep only the first thumbnail found per uuid
                    if thumb_results[uuid] is False:
                        thumb_results[uuid] = thumb_obj
        if len(media_uuids) > 0:
            # media items have their own thumbnail file records
            thumbs = Mediafile.objects\
                              .filter(uuid__in=media_uuids,
                                      file_type='oc-gen:thumbnail')
            for thumb in thumbs:
                uuid = thumb.uuid
                thumb_obj = {}
                thumb_obj['href'] = self.base_url + '/media/' + thumb.uuid
                thumb_obj['uri'] = settings.CANONICAL_HOST + '/media/' + thumb.uuid
                thumb_obj['scr'] = thumb.file_uri
                thumb_results[uuid] = thumb_obj
        return thumb_results

    def get_all_media_files(self, solr_recs):
        """ gets all media file links (keyed by uuid, then by
            file_type) for the media items in solr_recs; only runs
            when self.get_all_media is True
        """
        media_file_results = {}
        if self.get_all_media:
            media_uuids = []
            rec_props_obj = RecordProperties(self.response_dict_json)
            for solr_rec in solr_recs:
                item = rec_props_obj.get_solr_record_uuid_type(solr_rec)
                if item is not False:
                    uuid = item['uuid']
                    if item['item_type'] == 'media':
                        media_uuids.append(uuid)
                    media_file_results[uuid] = False
            if len(media_uuids) > 0:
                media_files = Mediafile.objects\
                                       .filter(uuid__in=media_uuids)
                for media_file in media_files:
                    uuid = media_file.uuid
                    # replace the False placeholder with a dict on
                    # the first file found for this uuid
                    if uuid not in media_file_results:
                        media_file_results[uuid] = {}
                    else:
                        if media_file_results[uuid] is False:
                            media_file_results[uuid] = {}
                    media_file_results[uuid][media_file.file_type] = media_file.file_uri
        return media_file_results

    def get_thumbs_for_non_media(self, uuid_list):
        """ queries for thumbnail media linked by assertion to
            the non-media items in uuid_list

            NOTE(review): the uuid list is interpolated into the SQL
            string (quoted by make_query_uuids); presumably these
            uuids come from solr / the database, not raw user input —
            confirm before exposing this path to untrusted values.
        """
        q_uuids = self.make_query_uuids(uuid_list)
        query = ('SELECT ass.uuid AS uuid, m.file_uri AS file_uri, '
                 'm.uuid AS media_uuid '
                 'FROM oc_assertions AS ass '
                 'JOIN oc_mediafiles AS m ON ass.object_uuid = m.uuid '
                 'AND m.file_type=\'oc-gen:thumbnail\'  '
                 'WHERE ass.uuid IN (' + q_uuids + ') '
                 'GROUP BY ass.uuid,  m.file_uri, m.uuid; ')
        cursor = connection.cursor()
        cursor.execute(query)
        rows = self.dictfetchall(cursor)
        return rows

    def make_query_uuids(self, uuid_list):
        """ makes a string for uuid list query:
            each uuid single-quoted, comma separated """
        uuid_q = []
        for uuid in uuid_list:
            uuid = '\'' + uuid + '\''
            uuid_q.append(uuid)
        return ', '.join(uuid_q)

    def dictfetchall(self, cursor):
        """ Return all rows from a cursor as a dict """
        columns = [col[0] for col in cursor.description]
        return [
            dict(zip(columns, row))
            for row in cursor.fetchall()
        ]

    def get_string_rec_attributes(self, solr_recs):
        """ gets string record attributes from the database.
            The solr index does not keep string-fields in memory
        """
        output = {}
        str_attribs = {}
        # find which of the requested attributes are string-typed
        # predicates; only those need a database lookup
        for attribute in self.rec_attributes:
            entity = self.mem_cache_obj.get_entity(attribute, False)
            if entity is not False:
                prop_slug = entity.slug
                # check to make sure we have the entity data type for linked fields
                if entity.data_type is False and entity.item_type == 'uri':
                    dtypes = self.mem_cache_obj.get_dtypes(entity.uri)
                    if isinstance(dtypes, list):
                        # set te data type and the act-field
                        # print('Found for ' + prop_slug + ' ' + dtypes[0])
                        entity.data_type = dtypes[0]
                if entity.data_type == 'xsd:string':
                    str_attribs[attribute] = entity
        if len(str_attribs) > 0:
            uuid_list = []
            for solr_rec in solr_recs:
                if 'uuid' in solr_rec:
                    uuid = str(solr_rec['uuid'])
                    uuid_list.append(uuid)
            output = self.get_string_attributes(uuid_list, str_attribs)
        return output

    def get_string_attributes(self, uuid_list, str_attribute_ent_dict):
        """ Gets string attribute data for a solr dict

            :param uuid_list: uuids of the solr records
            :param str_attribute_ent_dict: dict of attribute key ->
                string-typed predicate entity
            :return: {'pred_ents': ..., 'data': {uuid: {pred_uuid:
                [content, ...]}}} or {} when nothing applies
        """
        output = {}
        pred_uuid_list = []
        pred_uuid_objs = {}
        for key, entity in str_attribute_ent_dict.items():
            if isinstance(entity.uuid, str):
                # add string predicate entity uuid to the list
                pred_uuid_list.append(entity.uuid)
                pred_uuid_objs[entity.uuid] = {'rec_attribute': key,
                                               'property': entity.label,
                                               'pred_uuid': entity.uuid,
                                               'slug': entity.slug}
        if len(pred_uuid_list) > 0 and len(uuid_list) > 0:
            q_rows = self. get_string_attributes_sql(uuid_list, pred_uuid_list)
            dict_rows = {}
            for row in q_rows:
                # group the flat SQL rows into a nested dict:
                # uuid -> predicate_uuid -> list of string contents
                uuid = row['uuid']
                pred_uuid = row['predicate_uuid']
                content = row['content']
                if uuid not in dict_rows:
                    dict_rows[uuid] = {}
                if pred_uuid not in dict_rows[uuid]:
                    dict_rows[uuid][pred_uuid] = []
                if isinstance(content, str):
                    dict_rows[uuid][pred_uuid].append(content)
                    # print(str(dict_rows[uuid][pred_uuid]))
            output = {'pred_ents': pred_uuid_objs,
                      'data': dict_rows}
        return output

    def get_string_attributes_sql(self, uuid_list, pred_uuid_list):
        """ executes SQL query to get strings for the solr uuids and predicates

            NOTE(review): uuids are interpolated into the SQL string
            (quoted by make_query_uuids); presumably they come from
            solr / the entity cache, not raw user input — confirm.
        """
        q_uuids = self.make_query_uuids(uuid_list)
        p_uuids = self.make_query_uuids(pred_uuid_list)
        query = ('SELECT ass.uuid AS uuid, ass.predicate_uuid AS predicate_uuid, '
                 's.content AS content '
                 'FROM oc_assertions AS ass '
                 'JOIN oc_strings AS s ON ass.object_uuid = s.uuid '
                 'WHERE ass.uuid IN (' + q_uuids + ') AND '
                 'ass.predicate_uuid IN (' + p_uuids + ')'
                 'ORDER BY ass.uuid,  ass.predicate_uuid, s.content; ')
        cursor = connection.cursor()
        cursor.execute(query)
        rows = self.dictfetchall(cursor)
        return rows
# --- Code example #12 ---
class FilterLinks():
    """ Makes URLs for adding to or removing from the filter
        parameters of a search request.
    """

    # maps solr field (part) keys to public request parameters
    BASE_SOLR_FIELD_PARAM_MAPPINGS = \
        {'___project_id': 'proj',
         '___context_id': 'path',
         'obj_all___biol_term_hastaxonomy___pred_id': 'reconcile',
         '___pred_': 'prop',
         'item_type': 'type'}

    def __init__(self, request_dict=False):
        rp = RootPath()
        self.base_url = rp.get_baseurl()
        self.base_search_link = '/search/'
        self.base_request = request_dict
        self.base_request_json = False
        self.base_r_full_path = False
        self.spatial_context = False
        self.testing = settings.DEBUG
        self.hierarchy_delim = '---'
        self.partial_param_val_match = False
        self.remove_start_param = True
        self.m_cache = MemoryCache()  # memory caching object
        # copy the class-level mapping before extending it with the
        # Dublin Core fields; the original assigned the class attribute
        # itself and then mutated it, leaking per-instance additions
        # into shared class state
        self.SOLR_FIELD_PARAM_MAPPINGS = dict(self.BASE_SOLR_FIELD_PARAM_MAPPINGS)
        for param_key, solr_field in DCterms.DC_META_FIELDS.items():
            self.SOLR_FIELD_PARAM_MAPPINGS[solr_field] = param_key

    def make_request_urls(self, new_rparams):
        """ makes html, json and atom request urls
            from the new request object
        """
        output = {}
        output['html'] = self.make_request_url(new_rparams)
        output['json'] = self.make_request_url(new_rparams, '.json')
        output['atom'] = self.make_request_url(new_rparams, '.atom')
        return output

    def make_request_url(self, new_rparams, doc_format=''):
        """ makes request urls from the new request object
            default doc_format is '' (HTML)
        """
        url = self.base_url + self.base_search_link
        if 'path' in new_rparams:
            if new_rparams['path'] is not None \
               and new_rparams['path'] is not False:
                # the spatial context path becomes part of the URL path,
                # with spaces encoded as '+'
                context_path = new_rparams['path']
                context_path = context_path.replace(' ', '+')
                url += context_path
        url += doc_format
        param_list = []
        for param, param_vals in new_rparams.items():
            if param != 'path':
                for val in param_vals:
                    quote_val = quote_plus(val)
                    # keep the {SearchTerm} template literal un-encoded
                    quote_val = quote_val.replace('%7BSearchTerm%7D',
                                                  '{SearchTerm}')
                    param_item = param + '=' + quote_val
                    param_list.append(param_item)
        if len(param_list) > 0:
            # keep a consistent sort order on query parameters + values.
            param_list.sort()
            url += '?' + '&'.join(param_list)
        return url

    def make_request_sub(self,
                         old_request_dict,
                         rem_param_key,
                         rem_param_val,
                         sub_param_val=None):
        """ makes a dictionary object for
            request parameters WITHOUT the current fparam_key
            and fparam_vals; sub_param_val, if given, replaces
            the removed value
        """
        filter_request = LastUpdatedOrderedDict()
        for ch_param_key, ch_param_vals in old_request_dict.items():
            if ch_param_key != rem_param_key:
                # a different parameter than the one in the filter, so add
                filter_request[ch_param_key] = ch_param_vals
            else:
                if rem_param_key != 'path' and len(ch_param_vals) > 0:
                    filter_request[ch_param_key] = []
                    for ch_param_val in ch_param_vals:
                        if rem_param_val != ch_param_val:
                            # the filter value for this key is not the same
                            # as the check value for this key, so add
                            # to the filter request
                            filter_request[ch_param_key].append(ch_param_val)
                        else:
                            if sub_param_val is not None:
                                # put in the substitute value
                                filter_request[ch_param_key].append(
                                    sub_param_val)
        return filter_request

    def add_to_request_by_solr_field(self, solr_facet_key, new_value):
        """ uses the solr_facet_key to determine the
           request parameter, then adds new_value to the request
        """
        param = self.get_param_from_solr_facet_key(solr_facet_key)
        slugs = self.parse_slugs_in_solr_facet_key(solr_facet_key)
        if slugs is not False:
            # the slug hierarchy encoded in the facet key becomes the
            # value we append the new value to
            add_to_value = self.hierarchy_delim.join(slugs)
        else:
            add_to_value = None
        new_rparams = self.add_to_request(param, new_value, add_to_value)
        return new_rparams

    def add_to_request(self, param, new_value, add_to_value=None):
        """ adds to the new request object a parameter and value

            :param param: the public request parameter
            :param new_value: the value to add
            :param add_to_value: an existing hierarchic value that
                new_value should be appended to (with the hierarchy
                delimiter) rather than added as a new value
        """
        if self.base_request_json is not False:
            # start of with JSON encoded base request parameters
            new_rparams = json.loads(self.base_request_json)
        elif self.base_r_full_path is not False:
            # start of with parsing a URL string
            new_rparams = self.make_base_params_from_url(self.base_r_full_path)
        elif self.base_request is not False:
            # start with a dictionary object of the base request
            # NOTE(review): this aliases (and below mutates) the caller's
            # dict; the original comment said copying "often leads to
            # memory errors", so the aliasing is kept as-is — confirm
            new_rparams = self.base_request
        else:
            new_rparams = {}
        if 'start' in new_rparams and self.remove_start_param:
            # remove paging information when composing a new link
            new_rparams.pop('start', None)
        if param == 'path':
            entity = self.m_cache.get_entity(new_value)
            if entity:
                # convert the (slug) value into a context path
                new_value = entity.context
        if param not in new_rparams:
            if param == 'path':
                new_rparams[param] = new_value
            else:
                new_rparams[param] = [new_value]
        else:
            if param == 'path':
                new_rparams['path'] = new_value
            else:
                if add_to_value is not None:
                    # look for an existing value matching add_to_value
                    # (exactly, by its prefix before any solr-syntax
                    # suffix, or by its first + last hierarchy parts)
                    # and extend it with the new value
                    new_list = []
                    old_found = False
                    for old_val in new_rparams[param]:
                        old_prefix = self.remove_solr_part(old_val)
                        first_last_old_val = False
                        if self.hierarchy_delim in old_val:
                            old_val_ex = old_val.split(self.hierarchy_delim)
                            if len(old_val_ex) > 2:
                                first_last_old_val = old_val_ex[0]
                                first_last_old_val += self.hierarchy_delim
                                first_last_old_val += old_val_ex[-1]
                        if old_val == add_to_value:
                            old_found = True
                            new_list_val = old_val + self.hierarchy_delim + new_value
                        elif old_prefix == add_to_value:
                            old_found = True
                            new_list_val = old_prefix + self.hierarchy_delim + new_value
                        elif first_last_old_val == add_to_value:
                            old_found = True
                            new_list_val = old_prefix + self.hierarchy_delim + new_value
                        else:
                            new_list_val = old_val
                        new_list.append(new_list_val)
                    if old_found is False:
                        if self.partial_param_val_match:
                            # fall back to substring matching
                            for old_val in new_rparams[param]:
                                if add_to_value in old_val:
                                    old_found = True
                                    old_prefix = self.remove_solr_part(old_val)
                                    new_list_val = old_prefix + self.hierarchy_delim + new_value
                                    # add the new item
                                    new_list.append(new_list_val)
                                    # remove the old
                                    new_list.remove(old_val)
                    new_rparams[param] = new_list
                    if old_found is False:
                        new_rparams[param].append(new_value)
                else:
                    new_rparams[param].append(new_value)
        return new_rparams

    def remove_solr_part(self, old_val):
        """ removes part of a query parameter that
            is in solr query syntax, inside square
            brackets []
        """
        output = old_val
        splitter = self.hierarchy_delim + '['
        if splitter in old_val:
            old_ex = old_val.split(splitter)
            output = old_ex[0]
        return output

    def make_base_params_from_url(self, request_url):
        """ makes the base parameters from the url """
        url_o = urlparse(request_url)
        rparams = parse_qs(url_o.query)
        if self.spatial_context is False:
            self.spatial_context = self.get_context_from_path(url_o.path)
        rparams['path'] = self.spatial_context
        return rparams

    def get_context_from_path(self, path):
        """ gets the spatial context from a request path;
            returns False when the path has no context
        """
        context = False
        if '.' in path:
            # drop a document-format suffix such as '.json'
            pathex = path.split('.')
            path = pathex[0]
        if '/' in path:
            pathex = path.split('/')
            if len(pathex) > 2:
                # remove the part that's the first slash
                pathex.pop(0)
                # remove the part that's for the url of search
                pathex.pop(0)
            context = '/'.join(pathex)
        return context

    def get_param_from_solr_facet_key(self, solr_facet_key):
        """" returns the public parameter from the solr_facet_key """
        # prefer an exact match on the whole facet key; only fall back
        # to a substring match when no exact match exists
        for solr_field_part_key, param in self.SOLR_FIELD_PARAM_MAPPINGS.items():
            if solr_field_part_key == solr_facet_key:
                return param
        for solr_field_part_key, param in self.SOLR_FIELD_PARAM_MAPPINGS.items():
            if solr_field_part_key in solr_facet_key:
                return param
        return solr_facet_key

    def parse_slugs_in_solr_facet_key(self, solr_facet_key):
        """ returns a list of slugs encoded in a solr_facet_key;
            the solr field has these slugs in reverse order
        """
        no_slug_field_list = [
            SolrDocument.ROOT_CONTEXT_SOLR, SolrDocument.ROOT_PROJECT_SOLR,
            SolrDocument.ROOT_LINK_DATA_SOLR, SolrDocument.ROOT_PREDICATE_SOLR
        ]
        if solr_facet_key in no_slug_field_list:
            slugs = False
        else:
            # the last '___' part is the field-type suffix, so drop it;
            # '_' in the remaining parts stands for '-' in the slug
            raw_slugs = [list_item.replace('_', '-')
                         for list_item in solr_facet_key.split('___')[:-1]]
            slugs = raw_slugs[::-1]
        return slugs

    def prep_base_request_obj(self, request_dict):
        """ prepares a base request object from the old request object
            to use to create new requests
        """
        self.base_request = request_dict
        return self.base_request

    def get_request_param(self, param, default, as_list=False):
        """ get a string or list to use in queries from either
            the request object or the internal_request object
            so we have flexibility in doing searches without
            having to go through HTTP

            NOTE(review): this method reads self.request and
            self.internal_request, which __init__ never sets — it
            appears to belong to a different class; confirm callers
            set those attributes before use
        """
        output = False
        if self.request is not False:
            if as_list:
                output = self.request.GET.getlist(param)
            else:
                output = self.request.GET.get(param, default=default)
        elif self.internal_request is not False:
            if as_list:
                if param in self.internal_request:
                    param_obj = self.internal_request[param]
                    if isinstance(param_obj, list):
                        output = param_obj
                    else:
                        output = [param_obj]
            else:
                if param in self.internal_request:
                    output = self.internal_request[param]
                else:
                    output = default
        else:
            output = False
        return output
# --- Code example #13 ---
class QueryMaker():
    """Builds solr query parts (fq terms, facet fields, stats and
    facet-range settings) from Open Context search request parameters."""

    # main item-types mapped to their slugs to get solr-facet field prefix
    TYPE_MAPPINGS = {'subjects': 'oc-gen-subjects',
                     'media': 'oc-gen-media',
                     'documents': 'oc-gen-documents',
                     'persons': 'oc-gen-persons',
                     'projects': 'oc-gen-projects',
                     'types': 'oc-gen-types',
                     'predicates': 'oc-gen-predicates'}

    # main item-types mapped to their prefixed ('oc-gen:') URI identifiers
    TYPE_URIS = {'subjects': 'oc-gen:subjects',
                 'media': 'oc-gen:media',
                 'documents': 'oc-gen:documents',
                 'persons': 'oc-gen:persons',
                 'projects': 'oc-gen:projects',
                 'types': 'oc-gen:types',
                 'predicates': 'oc-gen:predicates'}

    def __init__(self):
        self.error = False  # set True-ish when query building hits a problem
        self.histogram_groups = 10  # number of bins for numeric facet ranges
        self.mem_cache_obj = MemoryCache()  # memory caching object

    def _get_context_paths(self, spatial_context):
        '''
        Takes a context path and returns an iterator with the list of possible
        contexts. Parses the list of boolean '||' (OR) and returns a list
        of contexts.

        For example:

        >>> _get_context_paths('Turkey/Domuztepe/I||II||Stray')

        ['Turkey/Domuztepe/I', 'Turkey/Domuztepe/II', 'Turkey/Domuztepe/Stray']

        '''
        # Split the context path by '/' and then by '||'
        context_lists = (value.split('||') for value in
                         spatial_context.split('/'))
        # Create a list of the various permutations
        context_tuple_list = list(itertools.product(*context_lists))
        # Turn the lists back into URIs
        return ('/'.join(value) for value in context_tuple_list)

    def _get_context_depth(self, spatial_context):
        '''
        Takes a context path and returns its depth as an interger. For
        example, the context '/Turkey/Domuztepe'
        would have a depth of 2.
        '''
        # Remove a possible trailing slash before calculating the depth
        return len(spatial_context.rstrip('/').split('/'))

    def _get_valid_context_slugs(self, contexts):
        '''
        Takes a list of contexts and, for valid contexts, returns a list of
        slugs
        '''
        entity = Entity()
        valid_context_slugs = []
        context_list = list(contexts)
        for context in context_list:
            # Verify that the contexts are valid
            # find and save the enity to memory
            # print('check: ' + context)
            found = self.mem_cache_obj.check_entity_found(context,
                                                          True)
            # print('found: ' + str(found))
            if found:
                entity = self.mem_cache_obj.get_entity(context,
                                                       True)
                valid_context_slugs.append(entity.slug)
        return valid_context_slugs

    def _get_parent_slug(self, slug):
        '''
        Takes a slug and returns the slug of its parent. Returns 'root' if
        a slug has no parent.
        '''
        cache_key = self.mem_cache_obj.make_memory_cache_key('par-slug', slug)
        parent_slug = self.mem_cache_obj.get_cache_object(cache_key)
        if parent_slug is None:
            contain_obj = Containment()
            contain_obj.use_cache = False  # because it seems to introduce memory errors
            parent_slug = contain_obj.get_parent_slug_by_slug(slug)
            self.mem_cache_obj.save_cache_object(cache_key, parent_slug)
        if parent_slug:
            return parent_slug
        else:
            return 'root'

    def _prepare_filter_query(self, parent_child_slug):
        # TODO docstring
        parent_child_set = parent_child_slug.split('___')
        return parent_child_set[0].replace('-', '_') + '___context_id_fq:' + \
            parent_child_set[1]

    def expand_hierarchy_options(self,
                                 path_param_val,
                                 hier_delim='---',
                                 or_delim='||'):
        """Expands a hierarchic path string (or list of such strings) into
        a list of hierarchically ordered item lists. Each 'or_delim'
        option creates an additional ordered list (a permutation).
        """
        if not isinstance(path_param_val, list):
            path_param_val = [path_param_val]
        expanded = []
        for raw_path in path_param_val:
            # each hierarchy level becomes a list of its OR options
            level_options = [level.split(or_delim)
                             for level in raw_path.split(hier_delim)]
            # one output list per permutation of the options
            expanded.extend(list(combo)
                            for combo in itertools.product(*level_options))
        return expanded

    def get_solr_field_type(self, data_type, prefix=''):
        '''
        Maps a predicate data-type to the suffix of our dynamic solr
        field names: ___pred_id, ___pred_numeric, ___pred_string, or
        ___pred_date. Raises for any unknown data-type.
        '''
        suffix_by_type = {
            '@id': 'id',
            'id': 'id',
            False: 'id',
            'xsd:integer': 'numeric',
            'xsd:double': 'numeric',
            'xsd:boolean': 'numeric',
            'xsd:string': 'string',
            'xsd:date': 'date',
        }
        if data_type not in suffix_by_type:
            raise Exception("Error: Unknown predicate type")
        return prefix + suffix_by_type[data_type]

    def make_prop_solr_field_parts(self, entity):
        """ Makes a solr field for a property """
        output = {}
        output['prefix'] = entity.slug.replace('-', '_')
        output['suffix'] = self.get_solr_field_type(entity.data_type)
        return output

    def process_proj(self, proj_path):
        """Makes a solr query dict ('fq' and 'facet.field') for a
        (possibly hierarchic) project filter path."""
        query_dict = {'fq': [], 'facet.field': []}
        all_path_terms = []
        for slug_path in self.expand_hierarchy_options(proj_path):
            depth = len(slug_path)
            act_field = SolrDocument.ROOT_PROJECT_SOLR
            path_terms = []
            for level, act_slug in enumerate(slug_path, 1):
                if self.mem_cache_obj.check_entity_found(act_slug, False):
                    ent = self.mem_cache_obj.get_entity(act_slug, False)
                    # a bit of a hack: query by slug prefix, since there is
                    # no dedicated slug-only query field (as with ___pred_)
                    act_slug = ent.slug
                    path_terms.append(act_field + ':' + act_slug + '*')
                else:
                    path_terms.append(act_field + ':' + act_slug)
                # descend: the next level queries this project's child field
                act_field = act_slug.replace('-', '_') + '___project_id'
                if level >= depth and act_field not in query_dict['facet.field']:
                    # deepest level reached; facet on the child field
                    query_dict['facet.field'].append(act_field)
            all_path_terms.append('(' + ' AND '.join(path_terms) + ')')
        query_dict['fq'].append('(' + ' OR '.join(all_path_terms) + ')')
        return query_dict

    def process_ld_object(self, objects):
        # TODO docstring
        query_dict = {'fq': []}
        fq_terms = []
        if not isinstance(objects, list):
            objects = [objects]
        for raw_obj in objects:
            if '||' in raw_obj:
                or_objects = raw_obj.split('||')
            else:
                or_objects = [raw_obj]
            fq_or_terms = []
            for obj in or_objects:
                # find and save the entity to memory
                found = self.mem_cache_obj.check_entity_found(obj, False)
                if found:
                    entity = self.mem_cache_obj.get_entity(obj, False)
                    fq_term = 'object_uri:' + self.escape_solr_arg(entity.uri)
                    fq_term += ' OR text:"' + self.escape_solr_arg(entity.uri) + '"'
                else:
                    fq_term = 'object_uri:' + obj
                fq_or_terms.append(fq_term)
            fq_all_ors = ' OR '.join(fq_or_terms)
            fq_all_ors = '(' + fq_all_ors + ')'
            fq_terms.append(fq_all_ors)
        fq_final = ' AND '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def process_dc_term(self, dc_param, dc_terms, add_facet=False):
        """ Makes a solr query dict ('fq' and 'facet.field' lists) for a
            Dublin Core metadata parameter.

            :param dc_param: key into DCterms.DC_META_FIELDS; if absent,
                an empty query dict is returned
            :param dc_terms: list of raw term values; each value may
                contain '||'-separated options
            :param add_facet: when True, also request facet counts on the
                solr field for dc_param

            NOTE(review): options split on '||' within one value are
            joined with AND, while separate dc_terms values are joined
            with OR -- confirm this boolean logic is intended.
        """
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        if dc_param in DCterms.DC_META_FIELDS:
            fq_field = DCterms.DC_META_FIELDS[dc_param]
            if fq_field not in query_dict['facet.field'] and add_facet:
                query_dict['facet.field'].append(fq_field)
            # only add an fq clause if at least one non-empty term was seen
            add_to_fq = False
            for raw_dc_term in dc_terms:
                if '||' in raw_dc_term:
                    use_dc_terms = raw_dc_term.split('||')
                else:
                    use_dc_terms = [raw_dc_term]
                fq_path_terms = []
                for dc_term in use_dc_terms:
                    if len(dc_term) > 0:
                        add_to_fq = True
                        # check if entity exists, and or store in memory
                        found = self.mem_cache_obj.check_entity_found(dc_term, False)
                        if found:
                            # fq_path_term = fq_field + ':' + self.make_solr_value_from_entity(entity)
                            # the below is a bit of a hack. We should have a query field
                            # as with ___pred_ to query just the slug. But this works for now
                            entity = self.mem_cache_obj.get_entity(dc_term, False)
                            fq_path_term = fq_field + '_fq:' + entity.slug
                            if dc_param == 'dc-temporal' \
                               and entity.entity_type == 'vocabulary' \
                               and 'periodo' in entity.slug:
                                # it's a temporal vocabulary from periodo
                                # so search for specific periods contained in
                                # the vocabulary
                                fq_path_term = '(' + fq_path_term +\
                                               ' OR ' + fq_path_term + '*)'
                        else:
                            # unknown term: fall back to a wildcard match
                            if dc_term[-1] != '*':
                                dc_term += '*'
                            fq_path_term = fq_field + ':' + dc_term
                        fq_path_terms.append(fq_path_term)
                final_path_term = ' AND '.join(fq_path_terms)
                final_path_term = '(' + final_path_term + ')'
                fq_terms.append(final_path_term)
            fq_final = ' OR '.join(fq_terms)
            fq_final = '(' + fq_final + ')'
            if add_to_fq:
                query_dict['fq'].append(fq_final)
        return query_dict

    def get_related_slug_field_prefix(self, slug):
        """ Returns the related-property solr field prefix when *slug*
            starts with it; otherwise returns an empty '' string.
        """
        rel_prefix = SolrDocument.RELATED_SOLR_FIELD_PREFIX
        return rel_prefix if slug.startswith(rel_prefix) else ''

    def clean_related_slug(self, slug):
        """ Removes the related-property field prefix from *slug*, if
            present, and returns the cleaned slug.
        """
        rel_prefix = SolrDocument.RELATED_SOLR_FIELD_PREFIX
        if slug.startswith(rel_prefix):
            return slug[len(rel_prefix):]
        return slug

    def correct_solr_prefix_for_fq(self, solr_f_prefix, act_field_fq):
        """ Prepends the related-property solr prefix to the fq field
            when a prefix is given and not already present. """
        needs_prefix = (solr_f_prefix != ''
                        and solr_f_prefix not in act_field_fq)
        return (solr_f_prefix + act_field_fq) if needs_prefix else act_field_fq

    def process_prop(self, props):
        """ Processes 'prop' (property) request parameters into a solr
            query dict.

            Property parameters are tricky because they can come in
            hierarchies (e.g. predicate---type---value), so each path is
            walked level by level, tracking the active solr field and its
            data-type as the walk descends.

            :param props: a path string, or list of path strings, in the
                format accepted by expand_hierarchy_options
            :return: dict with keys 'fq', 'facet.field', 'stats.field',
                'prequery-stats', 'facet.range', 'hl-queries', 'ranges'
        """
        # is the property for the item itself, or for a related item?
        query_dict = {'fq': [],
                      'facet.field': [],
                      'stats.field': [],
                      'prequery-stats': [],
                      'facet.range': [],
                      'hl-queries': [],
                      'ranges': {}}
        fq_terms = []
        prop_path_lists = self.expand_hierarchy_options(props)
        for prop_path_list in prop_path_lists:
            # i counts hierarchy levels processed as id-type fields
            i = 0
            path_list_len = len(prop_path_list)
            fq_path_terms = []
            act_field_fq = SolrDocument.ROOT_PREDICATE_SOLR
            act_field_data_type = 'id'
            last_field_label = False  # needed for full text highlighting
            predicate_solr_slug = False
            for prop_slug in prop_path_list:
                field_prefix = self.get_related_slug_field_prefix(prop_slug)
                solr_f_prefix = field_prefix.replace('-', '_')
                db_prop_slug = self.clean_related_slug(prop_slug)
                l_prop_entity = False
                pred_prop_entity = False
                require_id_field = False
                if act_field_data_type == 'id':
                    # check entity exists, and save to memory
                    found = self.mem_cache_obj.check_entity_found(db_prop_slug, False)
                    if found:
                        entity = self.mem_cache_obj.get_entity(db_prop_slug, False)
                        last_field_label = entity.label
                        prop_slug = field_prefix + entity.slug
                        if entity.item_type == 'uri' and 'oc-gen' not in db_prop_slug:
                            if entity.entity_type == 'property':
                                pred_prop_entity = True
                                predicate_solr_slug = prop_slug.replace('-', '_')
                                l_prop_entity = True
                                children = self.mem_cache_obj.get_entity_children(entity.uri)
                                if len(children) > 0:
                                    # ok, this field has children. require it
                                    # to be treated as an ID field
                                    require_id_field = True
                        else:
                            if entity.item_type == 'predicates':
                                pred_prop_entity = True
                                predicate_solr_slug = prop_slug.replace('-', '_')
                                children = self.mem_cache_obj.get_entity_children(entity.uri)
                                if len(children) > 0:
                                    # ok, this field has children. require it
                                    # to be treated as an ID field
                                    require_id_field = True
                        if i == 0:
                            # first level of the path; pick the root solr field
                            if 'oc-gen' in db_prop_slug:
                                # for open context categories / types
                                act_field_fq = self.get_parent_item_type_facet_field(entity.uri)
                                lr = LinkRecursion()
                                parents = lr.get_jsonldish_entity_parents(entity.uri)
                                if len(parents) > 1:
                                    try:
                                        p_slug = parents[-2]['slug']
                                        act_field_fq = p_slug.replace('-', '_') + '___pred_id'
                                        act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                                    except:  # NOTE(review): bare except silently hides errors
                                        pass
                            elif entity.item_type == 'uri':
                                act_field_fq = SolrDocument.ROOT_LINK_DATA_SOLR
                            elif entity.item_type == 'predicates':
                                temp_field_fq = self.get_parent_item_type_facet_field(entity.uri)
                                parents = self.mem_cache_obj.get_jsonldish_entity_parents(entity.uri)
                                if len(parents) > 1:
                                    try:
                                        p_slug = parents[-2]['slug']
                                        temp_field_fq = p_slug.replace('-', '_') + '___pred_id'
                                    except:  # NOTE(review): bare except silently hides errors
                                        print('Predicate Parent exception: '+ str(parents))
                                        temp_field_fq = False
                                if temp_field_fq is not False:
                                    act_field_fq = temp_field_fq
                                else:
                                    act_field_fq = SolrDocument.ROOT_PREDICATE_SOLR
                            else:
                                act_field_fq = SolrDocument.ROOT_PREDICATE_SOLR
                        # ---------------------------------------------------
                        # THIS PART BUILDS THE FACET-QUERY
                        # fq_path_term = fq_field + ':' + self.make_solr_value_from_entity(entity)
                        # the below is a bit of a hack. We should have a query field
                        # as with ___pred_ to query just the slug. But this works for now
                        fq_field = act_field_fq + '_fq'
                        if path_list_len >= 2 and act_field_data_type == 'id':
                            # could be an object deeper in the hierarchy, so allow the obj_all version
                            fq_path_term = '(' + fq_field + ':' + prop_slug
                            fq_path_term += ' OR obj_all___' + fq_field + ':' + prop_slug + ')'
                        else:
                            fq_path_term = fq_field + ':' + prop_slug
                        fq_path_terms.append(fq_path_term)
                        #---------------------------------------------------
                        #
                        #---------------------------------------------------
                        # THIS PART PREPARES FOR LOOPING OR FINAL FACET-FIELDS
                        #
                        # print('pred-solr-slug: ' + predicate_solr_slug)
                        field_parts = self.make_prop_solr_field_parts(entity)
                        act_field_data_type = field_parts['suffix']
                        if require_id_field:
                            act_field_data_type = 'id'
                            field_parts['suffix'] = 'id'
                        # check if the last or penultimate field has
                        # a different data-type (for linked-data)
                        if i >= (path_list_len - 2) \
                           and l_prop_entity:
                            dtypes = self.mem_cache_obj.get_dtypes(entity.uri)
                            if isinstance(dtypes, list):
                                # set the data type and the act-field
                                found = self.mem_cache_obj.check_entity_found(db_prop_slug, False)
                                if found:
                                    # NOTE(review): 'date_type' below looks like a typo
                                    # for 'data_type' -- confirm before changing
                                    entity = self.mem_cache_obj.get_entity(db_prop_slug, False)
                                    entity.date_type = dtypes[0]  # store for later use
                                    self.mem_cache_obj.entities[db_prop_slug] = entity  # store for later use
                                act_field_data_type = self.get_solr_field_type(dtypes[0])
                        if predicate_solr_slug is False or pred_prop_entity:
                            act_field_fq = field_parts['prefix'] + '___pred_' + field_parts['suffix']
                            act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                            # get a facet on this field
                            if act_field_data_type != 'string':
                                # adds a prefix for related properties
                                ffield = solr_f_prefix + field_parts['prefix'] + '___pred_' + field_parts['suffix']
                                if ffield not in query_dict['facet.field'] \
                                   and i >= (path_list_len - 1):
                                    query_dict['facet.field'].append(ffield)
                        else:
                            if act_field_data_type == 'id':
                                act_field_fq = 'obj_all___' + predicate_solr_slug \
                                               + '___pred_' + field_parts['suffix']
                                # get a facet on this field
                                if predicate_solr_slug != field_parts['prefix']:
                                    # the predicate_solr_slug is not the
                                    # prefix of the current field part, meaning
                                    # the field_parts[prefix] is the type, and
                                    # we want facets for the predicate -> type
                                    ffield = field_parts['prefix'] \
                                             + '___' \
                                             + predicate_solr_slug \
                                             + '___pred_' + field_parts['suffix']
                                else:
                                    # get facets for the predicate
                                    ffield = field_parts['prefix'] \
                                             + '___pred_' \
                                             + field_parts['suffix']
                                # adds a prefix, in case of a related property
                                ffield = solr_f_prefix + ffield
                                if ffield not in query_dict['facet.field'] \
                                   and i >= (path_list_len - 1):
                                    query_dict['facet.field'].append(ffield)
                            else:
                                act_field_fq = predicate_solr_slug + '___pred_' + field_parts['suffix']
                        # -------------------------------------------
                        if act_field_data_type == 'numeric':
                            # print('Numeric field: ' + act_field)
                            act_field_fq = field_parts['prefix'] + '___pred_numeric'
                            act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                            query_dict = self.add_math_facet_ranges(query_dict,
                                                                    act_field_fq,
                                                                    entity)
                        elif act_field_data_type == 'date':
                            # print('Date field: ' + act_field)
                            act_field_fq = field_parts['prefix'] + '___pred_date'
                            act_field_fq = self.correct_solr_prefix_for_fq(solr_f_prefix, act_field_fq)
                            query_dict = self.add_date_facet_ranges(query_dict,
                                                                    act_field_fq,
                                                                    entity)
                        # print('Current data type (' + str(i) + '): ' + act_field_data_type)
                        # print('Current field (' + str(i) + '): ' + act_field_fq)
                    i += 1
                elif act_field_data_type == 'string':
                    # case for a text search
                    # last_field_label = False  # turn off using the field label for highlighting
                    string_terms = self.prep_string_search_term(prop_slug)
                    for escaped_term in string_terms:
                        search_term = act_field_fq + ':' + escaped_term
                        if last_field_label is False:
                            query_dict['hl-queries'].append(escaped_term)
                        else:
                            query_dict['hl-queries'].append(last_field_label + ' ' + escaped_term)
                        fq_path_terms.append(search_term)
                elif act_field_data_type == 'numeric':
                    # numeric search. assume it's well formed solr numeric request
                    search_term = act_field_fq + ':' + prop_slug
                    fq_path_terms.append(search_term)
                    # now limit the numeric ranges from query to the range facets
                    query_dict = self.add_math_facet_ranges(query_dict,
                                                            act_field_fq,
                                                            False,
                                                            prop_slug)
                elif act_field_data_type == 'date':
                    # date search. assume it's well formed solr request
                    search_term = act_field_fq + ':' + prop_slug
                    fq_path_terms.append(search_term)
                    # now limit the date ranges from query to the range facets
                    query_dict = self.add_date_facet_ranges(query_dict,
                                                            act_field_fq,
                                                            False,
                                                            prop_slug)
            final_path_term = ' AND '.join(fq_path_terms)
            final_path_term = '(' + final_path_term + ')'
            fq_terms.append(final_path_term)
        fq_final = ' OR '.join(fq_terms)
        fq_final = '(' + fq_final + ')'
        query_dict['fq'].append(fq_final)
        return query_dict

    def add_math_facet_ranges(self,
                              query_dict,
                              act_field,
                              entity=False,
                              solr_query=False):
        """ this does some math for facet
            ranges for numeric fields
        """
        ok = False
        groups = self.histogram_groups
        fstart = 'f.' + act_field + '.facet.range.start'
        fend = 'f.' + act_field + '.facet.range.end'
        fgap = 'f.' + act_field + '.facet.range.gap'
        findex = 'f.' + act_field + '.facet.sort'
        fother = 'f.' + act_field + '.facet.range.other'
        finclude = 'f.' + act_field + '.facet.range.include'
        if entity is not False:
            # this is a field with no value limits
            # we need to do a stats-prequery first
            query_dict['prequery-stats'].append(act_field)
        else:
            if solr_query is not False:
                vals = []
                # get the numbers out
                q_nums_strs = re.findall(r'[-+]?\d*\.\d+|\d+', solr_query)
                for q_num_str in q_nums_strs:
                    vals.append(float(q_num_str))
                vals.sort()
                if len(vals) > 1:
                    ok = True
                    min_val = vals[0]
                    max_val = vals[-1]
        if ok:
            if act_field not in query_dict['stats.field']:
                query_dict['stats.field'].append(act_field)
            if act_field not in query_dict['facet.range']:
                query_dict['facet.range'].append(act_field)
            query_dict['ranges'][fother] = 'all'
            query_dict['ranges'][finclude] = 'all'
            query_dict['ranges'][fstart] = min_val
            query_dict['ranges'][fend] = max_val
            query_dict['ranges'][fgap] = (max_val - min_val) / groups
            query_dict['ranges'][findex] = 'index'  # sort by index, not by count
        return query_dict

    def add_date_facet_ranges(self,
                              query_dict,
                              act_field,
                              entity=False,
                              solr_query=False):
        """ this does some math for facet
            ranges for numeric fields
        """
        ok = False
        groups = 4
        fstart = 'f.' + act_field + '.facet.range.start'
        fend = 'f.' + act_field + '.facet.range.end'
        fgap = 'f.' + act_field + '.facet.range.gap'
        findex = 'f.' + act_field + '.facet.sort'
        fother = 'f.' + act_field + '.facet.range.other'
        finclude = 'f.' + act_field + '.facet.range.include'
        if entity is not False:
            # this is a field with no value limits
            # we need to do a stats-prequery first
            query_dict['prequery-stats'].append(act_field)
        else:
            if solr_query is not False:
                q_dt_strs = re.findall(r'\d{4}-\d{2}-\d{2}[T:]\d{2}:\d{2}:\d{2}', solr_query)
                if len(q_dt_strs) < 2:
                    # try a less strict regular expression to get dates
                    q_dt_strs = re.findall(r'\d{4}-\d{2}-\d{2}', solr_query)
                if len(q_dt_strs) >= 2:
                    ok = True
                    vals = []
                    for q_dt_str in q_dt_strs:
                        vals.append(q_dt_str)
                    vals.sort()
                    min_val = vals[0]
                    max_val = vals[1]
        if ok:
            if act_field not in query_dict['stats.field']:
                query_dict['stats.field'].append(act_field)
            if act_field not in query_dict['facet.range']:
                query_dict['facet.range'].append(act_field)
            query_dict['ranges'][fother] = 'all'
            query_dict['ranges'][finclude] = 'all'
            query_dict['ranges'][fstart] = self.convert_date_to_solr_date(min_val)
            query_dict['ranges'][fend] = self.convert_date_to_solr_date(max_val)
            query_dict['ranges'][fgap] = self.get_date_difference_for_solr(min_val, max_val, groups)
            query_dict['ranges'][findex] = 'index'  # sort by index, not by count
        return query_dict

    def get_date_difference_for_solr(self, min_date, max_date, groups):
        """ Gets a solr date difference from two values """
        min_dt = self.date_convert(min_date)
        max_dt = self.date_convert(max_date)
        dif_dt = (max_dt - min_dt) / groups
        if dif_dt.days >= 366:
            solr_val = int(round((dif_dt.days / 365.25), 0))
            solr_dif = '+' + str(solr_val) + 'YEAR'
        elif dif_dt.days >= 31:
            solr_val = int(round((dif_dt.days / 30), 0))
            solr_dif = '+' + str(solr_val) + 'MONTH'
        elif dif_dt.days >= 1:
            solr_val = int(round(dif_dt.days, 0))
            solr_dif = '+' + str(solr_val) + 'DAY'
        elif (dif_dt.seconds // 3600) >= 1:
            solr_val = int(round((dif_dt.seconds // 3600), 0))
            solr_dif = '+' + str(solr_val) + 'HOUR'
        elif ((dif_dt.seconds % 3600) // 60) >= 1:
            solr_val = int(round(((dif_dt.seconds % 3600) // 60), 0))
            solr_dif = '+' + str(solr_val) + 'MINUTE'
        elif dif_dt.seconds >= 1:
            solr_val = int(round(dif_dt.seconds, 0))
            solr_dif = '+' + str(solr_val) + 'SECOND'
        else:
            solr_dif = '+1YEAR'
        return solr_dif

    def add_solr_gap_to_date(self, date_val, solr_gap):
        """Adds a solr date-math gap (e.g. '+5YEAR') to a date value
        and returns the resulting datetime.
        """
        # Pull the numeric portion out of the gap expression.
        gap_num = int(float(re.sub(r'[^\d.]', r'', solr_gap)))
        dt = self.date_convert(date_val)
        # Check units in the same order as solr date-math precedence;
        # dict insertion order preserves that precedence.
        unit_deltas = {
            'YEAR': datetime.timedelta(days=int(round(gap_num * 365.25, 0))),
            'MONTH': datetime.timedelta(days=(gap_num * 30)),
            'DAY': datetime.timedelta(days=gap_num),
            'HOUR': datetime.timedelta(hours=gap_num),
            'MINUTE': datetime.timedelta(minutes=gap_num),
            'SECOND': datetime.timedelta(seconds=gap_num),
        }
        for unit, delta in unit_deltas.items():
            if unit in solr_gap:
                return dt + delta
        # Unrecognized unit: return the date unchanged.
        return dt

    def convert_date_to_solr_date(self, date_val):
        """Converts a date value (string or datetime) into a Solr
        formatted datetime string.
        """
        return self.date_convert(date_val).strftime('%Y-%m-%dT%H:%M:%SZ')

    def make_human_readable_date(self, date_val):
        """Converts a date value into something easier to read.

        Returns just 'YYYY-MM-DD' when the datetime has no time of day
        component; otherwise includes the time as well.
        """
        # BUG FIX: the original recomputed check_dt from the same
        # date_val and compared it to dt, so the comparison was always
        # True and the time-bearing branch was unreachable. Compare
        # the actual time-of-day components instead.
        dt = self.date_convert(date_val)
        if (dt.hour, dt.minute, dt.second) == (0, 0, 0):
            return dt.strftime('%Y-%m-%d')
        return dt.strftime('%Y-%m-%d:%H:%M:%S')

    def date_convert(self, date_val):
        """Coerces a date value into a python datetime.

        Accepts either an ISO-like string ('%Y-%m-%dT%H:%M:%S', with
        an optional trailing 'Z') or an object that is already a
        datetime, which is returned unchanged.
        """
        if not isinstance(date_val, str):
            return date_val
        return datetime.datetime.strptime(
            date_val.replace('Z', ''), '%Y-%m-%dT%H:%M:%S')

    def get_parent_item_type_facet_field(self, category_uri):
        """Gets the parent facet field for a given category_uri.

        Assumes the category_uri is an entity that exists in the
        database. Returns False when no parent slug is found in the
        type mappings.
        """
        parents = LinkRecursion().get_jsonldish_entity_parents(category_uri)
        for parent in parents:
            if parent['slug'] not in self.TYPE_MAPPINGS.values():
                continue
            # The parent exists in the type mappings, so use it.
            return parent['slug'].replace('-', '_') + '___pred_id'
        return False

    def get_parent_entity_facet_field(self, entity_uri):
        """Gets the parent facet field for a given entity_uri.

        Assumes the entity_uri is an entity that exists in the
        database. Returns False when the entity has fewer than two
        parents.
        """
        parents = LinkRecursion().get_jsonldish_entity_parents(entity_uri)
        if isinstance(parents, list) and len(parents) > 1:
            # Use the penultimate (immediate parent) entity's slug.
            return parents[-2]['slug'].replace('-', '_') + '___pred_id'
        return False

    def process_item_type(self, raw_item_type):
        """Makes a query dict ('fq' and 'facet.field') for item type
        filters.

        raw_item_type may encode multiple OR'ed item types; each adds
        an fq term, and types present in TYPE_MAPPINGS also add a
        facet field.
        """
        # Removed dead locals from the original (i, path_list_len,
        # fq_path_terms were assigned but never used).
        query_dict = {'fq': [],
                      'facet.field': []}
        fq_terms = []
        item_type_lists = self.expand_hierarchy_options(raw_item_type)
        for item_type_list in item_type_lists:
            # No hierarchy in this field, just the type.
            item_type = item_type_list[0]
            fq_terms.append('item_type:' + item_type)
            if item_type in self.TYPE_MAPPINGS:
                act_field = (self.TYPE_MAPPINGS[item_type].replace('-', '_')
                             + '___pred_id')
                query_dict['facet.field'].append(act_field)
        query_dict['fq'].append('(' + ' OR '.join(fq_terms) + ')')
        return query_dict

    def process_id(self, identifier):
        """Makes a query dict matching an identifier against
        persistent URIs and uuids.

        The identifier may be a full URI, a uuid, or a "naked" DOI,
        ARK, or ORCID, so each of those URI forms is also tried.
        """
        query_dict = {'fq': [],
                      'facet.field': []}
        escape_id = self.escape_solr_arg(identifier)
        fq_terms = ['persistent_uri:' + escape_id]
        for uri_prefix in ('http://dx.doi.org/',
                           'http://n2t.net/',
                           'http://orcid.org/'):
            fq_terms.append(
                'persistent_uri:'
                + self.escape_solr_arg(uri_prefix + identifier))
        fq_terms.append('uuid:' + escape_id)
        # The identifier may also be an Open Context URI with a uuid.
        tcheck = URImanagement.get_uuid_from_oc_uri(identifier, True)
        if tcheck is not False:
            fq_terms.append('uuid:' + tcheck['uuid'])
        query_dict['fq'].append('(' + ' OR '.join(fq_terms) + ')')
        return query_dict

    def process_form_use_life_chrono(self, raw_form_use_life_chrono):
        """Creates a facet query dict for form-use-life chronological
        tile paths.

        Supports OR ('||' delimited) queries in the path. Tile paths
        shorter than 30 characters get a trailing '*' wildcard so they
        match all deeper (more precise) tiles.
        """
        # Removed the dead `i = 0` assignment and the redundant '||'
        # presence check (str.split with no match returns [raw]).
        query_dict = {'fq': [],
                      'facet.field': []}
        query_dict['facet.field'].append('form_use_life_chrono_tile')
        fq_terms = []
        for chrono_path in raw_form_use_life_chrono.split('||'):
            if len(chrono_path) < 30:
                # Partial tile path; wildcard to match deeper tiles.
                chrono_path += '*'
            fq_terms.append('form_use_life_chrono_tile:' + chrono_path)
        query_dict['fq'].append('(' + ' OR '.join(fq_terms) + ')')
        return query_dict

    def process_form_date_chrono(self, form_use_life_date, date_type):
        """Creates a filter query dict for a form-use-life date limit.

        A 'start' date_type filters on the earliest date; anything
        else filters on the latest date.
        """
        query_dict = {'fq': [],
                      'facet.field': []}
        if date_type == 'start':
            fquery = 'form_use_life_chrono_earliest: [{} TO *]'.format(
                form_use_life_date)
        else:
            fquery = 'form_use_life_chrono_latest: [* TO {}]'.format(
                form_use_life_date)
        query_dict['fq'].append(fquery)
        return query_dict

    def process_discovery_geo(self, raw_disc_geo):
        """Creates a facet query dict for discovery geotile paths.

        Supports OR ('||' delimited) queries in the path. Tile paths
        shorter than 20 characters get a trailing '*' wildcard so they
        match all deeper (more precise) tiles.
        """
        # Removed the dead `i = 0` assignment and the redundant '||'
        # presence check (str.split with no match returns [raw]).
        query_dict = {'fq': [],
                      'facet.field': []}
        query_dict['facet.field'].append('discovery_geotile')
        fq_terms = []
        for disc_path in raw_disc_geo.split('||'):
            if len(disc_path) < 20:
                # Partial tile path; wildcard to match deeper tiles.
                disc_path += '*'
            fq_terms.append('discovery_geotile:' + disc_path)
        query_dict['fq'].append('(' + ' OR '.join(fq_terms) + ')')
        return query_dict

    def process_discovery_bbox(self, raw_disc_bbox):
        """Creates a filter query dict for bounding box searches.

        Supports OR ('||' delimited) lists of bounding boxes; invalid
        boxes are silently skipped.
        """
        query_dict = {'fq': []}
        fq_terms = []
        for bbox in raw_disc_bbox.split('||'):
            if ',' not in bbox:
                # Not a comma separated list of coordinates; skip.
                continue
            bbox_coors = bbox.split(',')
            if not self.validate_bbox_coordiantes(bbox_coors):
                continue
            # Valid bounding box, now make a solr query. Note how solr
            # expects latitude / longitude order, which is the reverse
            # of geojson!
            fq_terms.append(
                'discovery_geolocation:[{},{} TO {},{}]'.format(
                    bbox_coors[1], bbox_coors[0],
                    bbox_coors[3], bbox_coors[2]))
        if fq_terms:
            query_dict['fq'].append('(' + ' OR '.join(fq_terms) + ')')
        return query_dict

    def validate_bbox_coordiantes(self, bbox_coors):
        """Validates a set of bounding box coordinates in GeoJSON
        order: [lower-left lon, lower-left lat, top-right lon,
        top-right lat].
        """
        if len(bbox_coors) != 4:
            return False
        if not self.validate_geo_lon_lat(bbox_coors[0], bbox_coors[1]):
            return False
        if not self.validate_geo_lon_lat(bbox_coors[2], bbox_coors[3]):
            return False
        # The lower-left corner must actually be below and to the
        # left of the top-right corner.
        return (float(bbox_coors[0]) < float(bbox_coors[2])
                and float(bbox_coors[1]) < float(bbox_coors[3]))

    def validate_geo_lon_lat(self, lon, lat):
        """Checks that a lon, lat pair is valid. Note the GeoJSON
        ordering of the coordinates (longitude first).
        """
        return (self.validate_geo_coordinate(lon, 'lon')
                and self.validate_geo_coordinate(lat, 'lat'))

    def validate_geo_coordinate(self, coordinate, coord_type):
        """Validates a geospatial coordinate: a number within
        [-90, 90] for latitudes ('lat' in coord_type) or [-180, 180]
        for longitudes ('lon' in coord_type).
        """
        try:
            fl_coord = float(coordinate)
        except ValueError:
            return False
        if 'lat' in coord_type:
            return -90 <= fl_coord <= 90
        if 'lon' in coord_type:
            return -180 <= fl_coord <= 180
        # Unknown coordinate type.
        return False

    def make_solr_value_from_entity(self, entity, value_type='id'):
        """Makes a solr value as indexed in SolrDocument;
        see _concat_solr_string_value.

        Open Context URIs (except vocabularies) are shortened to their
        path part.
        """
        # Removed an unreachable `return output` that followed the
        # real return statement (output was never defined).
        id_part = entity.uri
        if 'http://opencontext.org' in entity.uri:
            if '/vocabularies/' not in entity.uri:
                id_part = entity.uri.split('http://opencontext.org')[1]
        return (entity.slug + '___' + value_type + '___' +
                id_part + '___' + entity.label)

    def _process_spatial_context(self, spatial_context=None):
        """Makes solr 'fq' and 'facet.field' entries for a spatial
        context path.

        Raises Http404 when a requested context cannot be resolved
        into any valid context slugs.
        """
        if not spatial_context:
            # No spatial context provided; facet on the root contexts.
            return {'fq': None, 'facet.field': ['root___context_id']}
        context_paths = self._get_context_paths(spatial_context)
        context_slugs = self._get_valid_context_slugs(context_paths)
        # If we cannot find a valid context, raise a 404.
        if not context_slugs:
            raise Http404
        # Solr 'fq' parameters: each context queried as
        # parent-slug___child-slug.
        parent_child_slugs = [
            self._get_parent_slug(slug) + '___' + slug
            for slug in context_slugs
        ]
        # Solr 'facet.field' parameters.
        facet_fields = [
            slug.replace('-', '_') + '___context_id'
            for slug in context_slugs
        ]
        context = {}
        if len(parent_child_slugs) == 1:
            # The most likely scenario: a single context.
            context['fq'] = self._prepare_filter_query(parent_child_slugs[0])
        else:
            # Combine multiple contexts into an OR filter.
            joined = ' OR '.join(
                self._prepare_filter_query(slug_set)
                for slug_set in parent_child_slugs
            )
            context['fq'] = '(' + joined + ')'
        context['facet.field'] = facet_fields
        return context

    def prep_string_search_term(self, raw_term):
        """Prepares a string search; returns a list of escaped search
        terms for AND queries. Double-quoted phrases become separate,
        quoted terms.
        """
        if '"' not in raw_term:
            return [self.escape_solr_arg(raw_term)]
        # Get rid of quotes in the search term for the first entry.
        nq_term = raw_term.replace('"', ' ')
        terms = [self.escape_solr_arg(nq_term)]
        for quote_item in re.findall(r"\"(.*?)\"", raw_term):
            # Escape the phrase, then put quotes back around it.
            terms.append('"' + self.escape_solr_arg(quote_item) + '"')
        return terms

    def escaped_seq(self, term):
        """Yields each character of term, backslash-escaping solr
        special characters (operators, brackets, quotes, spaces).
        """
        # Every escaped form is simply backslash + the character.
        specials = '+-&|!(){}[]^~*?:"; '
        for char in term:
            yield ('\\' + char) if char in specials else char

    def escape_solr_arg(self, term):
        """Apply escaping to the passed in query terms,
        escaping special characters like : , etc."""
        term = term.replace('\\', r'\\')   # escape \ first
        # join consumes the generator directly; the intermediate list
        # comprehension in the original was unnecessary.
        return ''.join(self.escaped_seq(term))
コード例 #14
0
ファイル: querymaker.py プロジェクト: rdhyee/open-context-py
def get_general_hierarchic_path_query_dict(
    path_list,
    root_field,
    field_suffix,
    obj_all_slug='',
    fq_solr_field_suffix='',
    attribute_field_part='',
    value_slug_length_limit=120,
):
    """Gets a solr query dict for a general hierarchic list of
    path item identifiers (usually slugs).

    :param list path_list: List of string identifiers (usually slugs)
        for entities, and possibly literals that the client provides to
        search solr.
    :param str root_field: The root dynamic field for this solr query.
        It can be a root___project_id, a root___pred_id, etc.
    :param str field_suffix: The type of solr dynamic field being
        queried. (project_id, pred_id) etc.
    :param str obj_all_slug: An optional slug to identify a more
        specific solr "obj_all" field.
    :param str fq_solr_field_suffix: Suffix for solr fields used in
        filter queries; forced to '_fq' when
        SolrDocument.DO_LEGACY_FQ is set.
    :param str attribute_field_part: Starting solr-field part for an
        attribute entity; it grows as attribute ("predicate") items
        are encountered along the path.
    :param int value_slug_length_limit: NOTE(review): not referenced
        anywhere in this function body — presumably consumed by a
        caller or vestigial; confirm before removing.
    :return dict: A dict with 'fq' and 'facet.field' lists for solr,
        or None when an item in the path cannot be resolved.
    """
    # NOTE: The goal for this function is to be as general and
    # reusable as possible for generating the solr query fq and
    # facet.fields arguments. It's intended for use with most of the
    # non-spatial-context hierarchies that we index. Because of that
    # it will be somewhat abstract and difficult to understand at
    # first.
    m_cache = MemoryCache()
    query_dict = {'fq': [], 'facet.field': []}

    if obj_all_slug:
        # This makes a more specific "obj_all" field that we use to
        # query all levels of the hierarchy in solr.
        obj_all_slug = (obj_all_slug.replace('-', '_') +
                        SolrDocument.SOLR_VALUE_DELIM)

    if SolrDocument.DO_LEGACY_FQ:
        # Doing the legacy filter query method, so add a
        # suffix of _fq to the solr field.
        fq_solr_field_suffix = '_fq'

    if field_suffix != 'pred_id':
        obj_all_field_fq = ('obj_all' + SolrDocument.SOLR_VALUE_DELIM +
                            obj_all_slug + field_suffix + fq_solr_field_suffix)
    else:
        # Don't make an obj_all_field_fq for root predicates.
        obj_all_field_fq = None

    # Now start composing fq's for the parent item field with the
    # child as a value of the parent item field.
    facet_field = root_field

    # NOTE: The attribute_field_part is a part of a solr-field
    # for cases where the attribute is an entity in the database.
    # It starts with the default value of '' because we start
    # formulating solr queries on general/universal metadata
    # attributes, not the more specific, rarely used attributes that
    # are stored in the database.
    attribute_item = None

    # Default to no solr related prefix
    use_solr_rel_prefix = ''

    last_path_index = len(path_list) - 1
    for path_index, item_id in enumerate(path_list):
        if (attribute_item is not None
                and getattr(attribute_item, 'data_type',
                            None) in configs.LITERAL_DATA_TYPES):
            # Process literals in requests, because some requests will filter according to
            # numeric, date, or string criteria.
            # NOTE(review): field_fq is defined at this point because
            # attribute_item is only ever set near the end of a prior
            # loop iteration, after field_fq has been computed.
            literal_query_dict = compose_filter_query_on_literal(
                raw_literal=item_id,
                attribute_item=attribute_item,
                field_fq=field_fq,
            )

            # Now combine the query dict for the literals with
            # the main query dict for this function
            query_dict = utilities.combine_query_dict_lists(
                part_query_dict=literal_query_dict,
                main_query_dict=query_dict,
            )
            # Skip out, because a literal query will never involve
            # children in a hierarchy path (because these are literals,
            # not entities in the database)
            return query_dict

        if not item_id:
            # It is empty or not a string, so skip out.
            return None

        use_solr_rel_prefix = ''
        if item_id.startswith(configs.RELATED_ENTITY_ID_PREFIX):
            # Strip off the prefix.
            item_id = item_id[len(configs.RELATED_ENTITY_ID_PREFIX):]
            use_solr_rel_prefix = SolrDocument.RELATED_SOLR_DOC_PREFIX

        # Add the solr-rel prefix if needed.
        obj_all_field_fq = add_rel_prefix_if_needed(obj_all_field_fq,
                                                    prefix=use_solr_rel_prefix)

        item = m_cache.get_entity(item_id)
        if not item:
            # We don't recognize the first item, and it is not
            # a literal of an attribute field. So return None.
            return None

        item_parent = get_entity_item_parent_entity(item)
        if item_parent and item_parent.get('slug'):
            # The item has a parent item, and that parent item will
            # make a solr_field for the current item.
            parent_slug_part = item_parent['slug'].replace('-', '_')
            if not attribute_field_part.startswith(parent_slug_part):
                facet_field = (
                    # Use the most immediate parent item of the item entity
                    # to identify the solr field we need to query. That
                    # most immediate item is index -1 (because the item
                    # item entity itself is not included in this list, as
                    # specified by the add_original=False arg).
                    parent_slug_part + SolrDocument.SOLR_VALUE_DELIM +
                    attribute_field_part + field_suffix)

        # If the item is a linked data entity, and we have a
        # root field field defined for project specific predicates.
        # So, change the root solr field to be the linked data root.
        if (item.item_type == 'uri'
                and facet_field == SolrDocument.ROOT_PREDICATE_SOLR):
            facet_field = SolrDocument.ROOT_LINK_DATA_SOLR

        # Add the solr related prefix for related entity searches
        # and it not already used as a prefix.
        # Add the solr-rel prefix if needed.
        facet_field = add_rel_prefix_if_needed(facet_field,
                                               prefix=use_solr_rel_prefix)

        # NOTE: If SolrDocument.DO_LEGACY_FQ, we're doing the older
        # approach of legacy "_fq" filter query fields. If this is
        # False, the field_fq does NOT have a "_fq" suffix.
        #
        # NOTE ON DO_LEGACY_FQ:
        # Add the _fq suffix to make the field_fq which is what we use
        # to as the solr field to query for the current item. Note! The
        # field_fq is different from the facet_field because when we
        # query solr for slugs, we query solr-fields that end with "_fq".
        # The solr fields that don't have "_fq" are used exclusively for
        # making facets (counts of metadata values in different documents).
        field_fq = facet_field
        if not field_fq.endswith(fq_solr_field_suffix):
            field_fq += fq_solr_field_suffix

        # Make the query for the item and the solr field associated
        # with the item's immediate parent (or root, if it has no
        # parents).
        fq_item_slug = add_rel_prefix_if_needed(utilities.fq_slug_value_format(
            item.slug),
                                                prefix=use_solr_rel_prefix)

        query_dict['fq'].append('{field_fq}:{item_slug}'.format(
            field_fq=field_fq, item_slug=fq_item_slug))
        # Now make the query for the item and the solr field
        # associated with all items in the whole hierarchy for this
        # type of solr dynamic field.
        if obj_all_field_fq:
            query_dict['fq'].append('{field_fq}:{item_slug}'.format(
                field_fq=obj_all_field_fq, item_slug=fq_item_slug))
        # Use the current item as the basis for the next solr_field
        # that will be used to query child items in the next iteration
        # of this loop.
        facet_field = (item.slug.replace('-', '_') +
                       SolrDocument.SOLR_VALUE_DELIM + attribute_field_part +
                       field_suffix)
        facet_field = add_rel_prefix_if_needed(facet_field,
                                               prefix=use_solr_rel_prefix)

        field_fq = facet_field
        if not field_fq.endswith(fq_solr_field_suffix):
            field_fq += fq_solr_field_suffix

        if ((getattr(item, 'item_type', None) == 'predicates')
                or (getattr(item, 'entity_type', None) == 'property')):
            # The current item entity is a "predicates" or a "property"
            # type of item. That means the item is a kind of attribute
            # or a "predicate" in linked-data speak, (NOT the value of
            # an attribute). The slugs for such attribute entities are
            # used in solr fields. These will be used in all of the
            # queries of child items as we iterate through this
            # path_list.

            # The current item is an attribute item, so copy it for
            # use as we continue to iterate through this path_list.
            attribute_item = item
            if False:
                # Keep for debugging but turn it off
                print('attribute item {} is a {}, {}'.format(
                    attribute_item.label, attribute_item.item_type,
                    attribute_item.data_type))

            if (getattr(attribute_item, 'data_type', None)
                    in configs.LITERAL_DATA_TYPES):
                # This attribute_item has a data type for literal
                # values.

                children = get_entity_item_children_list(item)
                if len(children):
                    # The (supposedly) literal attribute item
                    # has children so force it to have a data_type of
                    # 'id'.
                    attribute_item.data_type = 'id'

                # NOTE: Generally, we don't make facets on literal
                # attributes. However, some literal attributes are
                # actually parents of other literal atttributes, so
                # we should make facets for them.
                if path_index != last_path_index:
                    # The current item_id is not the last item in
                    # the path_list, so we do not need to check
                    # for child items.
                    facet_field = None
                elif attribute_item.data_type == 'xsd:boolean':
                    # Make sure we get the facet field, identified
                    # with the correct data type, for boolean values.
                    facet_field = utilities.rename_solr_field_for_data_type(
                        attribute_item.data_type,
                        (use_solr_rel_prefix + item.slug.replace('-', '_') +
                         SolrDocument.SOLR_VALUE_DELIM + field_suffix))
                elif attribute_item.data_type != 'id':
                    # The attribute item data type has not been reset
                    # to be 'id', b/c there are no children items to
                    # this literal attribute item. Thus, there is no
                    # need to make a facet field for it.
                    facet_field = None

                # Format the field_fq appropriately for this specific
                # data type.
                field_fq = utilities.rename_solr_field_for_data_type(
                    attribute_item.data_type,
                    (use_solr_rel_prefix + item.slug.replace('-', '_') +
                     SolrDocument.SOLR_VALUE_DELIM + field_suffix))

                # The attribute item is for a literal type field.
                # Gather numeric and date fields that need a
                # range query (solr stats / facet-range arguments).
                range_query_dict = get_range_stats_fields(
                    attribute_item, field_fq)
                # Now combine the query dict for the range fields with
                # the main query dict for this function
                query_dict = utilities.combine_query_dict_lists(
                    part_query_dict=range_query_dict,
                    main_query_dict=query_dict,
                )
            elif (attribute_item.item_type == 'predicates'
                  or (attribute_item.item_type == 'uri'
                      and getattr(item, 'entity_type', None) == 'property'
                      and not attribute_field_part)):
                # This attribute is for making descriptions with
                # non-literal values (meaning entities in the DB).
                if False:
                    # Keep for debugging, but turn it off.
                    print('Pred attribute: {}, {}'.format(
                        attribute_item.item_type,
                        getattr(item, 'entity_type', None)))
                attribute_field_part = (attribute_item.slug.replace('-', '_') +
                                        SolrDocument.SOLR_VALUE_DELIM)

                attribute_field_part = add_rel_prefix_if_needed(
                    attribute_field_part, prefix=use_solr_rel_prefix)
                # Now also update the obj_all_field_fq
                obj_all_field_fq = ('obj_all' + SolrDocument.SOLR_VALUE_DELIM +
                                    attribute_field_part + field_suffix +
                                    fq_solr_field_suffix)
                obj_all_field_fq = add_rel_prefix_if_needed(
                    obj_all_field_fq, prefix=use_solr_rel_prefix)

    # Make the facet field so solr will return any possible
    # facet values for children of the LAST item in this path_list.
    if facet_field:
        query_dict['facet.field'].append(facet_field)
    return query_dict
コード例 #15
0
ファイル: filterlinks.py プロジェクト: ekansa/open-context-py
class FilterLinks():
    """ Makes URLs for adding or removing search filters, so a new
        request can be composed relative to the current request's
        parameters (used to build faceted-search navigation links).
    """

    # Maps substrings of solr facet field names to the public request
    # (URL query string) parameters that expose them.
    BASE_SOLR_FIELD_PARAM_MAPPINGS = \
        {'___project_id': 'proj',
         '___context_id': 'path',
         'obj_all___biol_term_hastaxonomy___pred_id': 'reconcile',
         '___pred_': 'prop',
         'item_type': 'type'}

    def __init__(self, request_dict=False):
        rp = RootPath()
        self.base_url = rp.get_baseurl()
        self.base_search_link = '/search/'
        self.base_request = request_dict
        self.base_request_json = False
        self.base_r_full_path = False
        self.spatial_context = False
        self.testing = settings.DEBUG
        self.hierarchy_delim = '---'
        self.partial_param_val_match = False
        self.remove_start_param = True
        self.m_cache = MemoryCache()  # memory caching object
        # Default request objects consumed by get_request_param().
        # Previously these attributes were never initialized, so calling
        # that method without first assigning them raised AttributeError.
        self.request = False
        self.internal_request = False
        # Copy the class-level mapping before extending it. The original
        # code aliased BASE_SOLR_FIELD_PARAM_MAPPINGS, so the Dublin Core
        # additions below mutated the shared class attribute on every
        # instantiation.
        self.SOLR_FIELD_PARAM_MAPPINGS = dict(self.BASE_SOLR_FIELD_PARAM_MAPPINGS)
        for param_key, solr_field in DCterms.DC_META_FIELDS.items():
            self.SOLR_FIELD_PARAM_MAPPINGS[solr_field] = param_key

    def make_request_urls(self, new_rparams):
        """ makes HTML, JSON, and Atom request urls from the
            new request parameters object
        """
        output = {}
        output['html'] = self.make_request_url(new_rparams)
        output['json'] = self.make_request_url(new_rparams, '.json')
        output['atom'] = self.make_request_url(new_rparams, '.atom')
        return output

    def make_request_url(self,
                         new_rparams,
                         doc_format=''):
        """ makes a request url from the new request object;
            default doc_format is '' (HTML)
        """
        url = self.base_url + self.base_search_link
        if 'path' in new_rparams:
            if new_rparams['path'] is not None \
               and new_rparams['path'] is not False:
                context_path = new_rparams['path']
                # make spaces URL-safe in the context path
                context_path = context_path.replace(' ', '+')
                url += context_path
        url += doc_format
        param_list = []
        for param, param_vals in new_rparams.items():
            if param == 'path':
                # handled above as part of the url path, not the query
                continue
            for val in param_vals:
                quote_val = quote_plus(val)
                # keep the {SearchTerm} template token literal (used in
                # OpenSearch-style templated urls)
                quote_val = quote_val.replace('%7BSearchTerm%7D', '{SearchTerm}')
                param_list.append(param + '=' + quote_val)
        if len(param_list) > 0:
            # keep a consistent sort order on query parameters + values.
            param_list.sort()
            url += '?' + '&'.join(param_list)
        return url

    def make_request_sub(self,
                         old_request_dict,
                         rem_param_key,
                         rem_param_val,
                         sub_param_val=None):
        """ makes a dictionary object for
            request parameters WITHOUT the current fparam_key
            and fparam_vals; if sub_param_val is given, it is
            substituted in place of the removed value
        """
        filter_request = LastUpdatedOrderedDict()
        for ch_param_key, ch_param_vals in old_request_dict.items():
            if ch_param_key != rem_param_key:
                # a different parameter than the one in the filter, so add
                filter_request[ch_param_key] = ch_param_vals
            else:
                # NOTE: when removing the 'path' parameter it is simply
                # dropped; other parameters keep their remaining values.
                if rem_param_key != 'path' and len(ch_param_vals) > 0:
                    filter_request[ch_param_key] = []
                    for ch_param_val in ch_param_vals:
                        if rem_param_val != ch_param_val:
                            # the filter value for this key is not the same
                            # as the check value for this key, so add
                            # to the filter request
                            filter_request[ch_param_key].append(ch_param_val)
                        else:
                            if sub_param_val is not None:
                                # put in the substitute value
                                filter_request[ch_param_key].append(sub_param_val)
        return filter_request

    def add_to_request_by_solr_field(self,
                                     solr_facet_key,
                                     new_value):
        """ uses the solr_facet_key to determine the
            request parameter, then adds new_value to the request
        """
        param = self.get_param_from_solr_facet_key(solr_facet_key)
        slugs = self.parse_slugs_in_solr_facet_key(solr_facet_key)
        if slugs is not False:
            add_to_value = self.hierarchy_delim.join(slugs)
        else:
            add_to_value = None
        new_rparams = self.add_to_request(param,
                                          new_value,
                                          add_to_value)
        return new_rparams

    def add_to_request(self,
                       param,
                       new_value,
                       add_to_value=None):
        """ adds a parameter and value to the new request object;
            if add_to_value is given, new_value extends an existing
            hierarchic value (joined with self.hierarchy_delim)
        """
        if self.base_request_json is not False:
            # start off with JSON encoded base request parameters
            new_rparams = json.loads(self.base_request_json)
        elif self.base_r_full_path is not False:
            # start off with parsing a URL string
            new_rparams = self.make_base_params_from_url(self.base_r_full_path)
        elif self.base_request is not False:
            # start with a dictionary object of the base request
            # for some reason this often leads to memory errors
            new_rparams = self.base_request
        else:
            new_rparams = {}
        if 'start' in new_rparams and self.remove_start_param:
            # remove paging information when composing a new link
            new_rparams.pop('start', None)
        if param == 'path':
            entity = self.m_cache.get_entity(new_value)
            if entity:
                # convert the (slug) value into a context path
                new_value = entity.context
        if param not in new_rparams:
            if param == 'path':
                new_rparams[param] = new_value
            else:
                new_rparams[param] = [new_value]
        else:
            if param == 'path':
                new_rparams['path'] = new_value
            else:
                if add_to_value is not None:
                    new_list = []
                    old_found = False
                    for old_val in new_rparams[param]:
                        old_prefix = self.remove_solr_part(old_val)
                        first_last_old_val = False
                        if self.hierarchy_delim in old_val:
                            old_val_ex = old_val.split(self.hierarchy_delim)
                            if len(old_val_ex) > 2:
                                # first + last parts identify a deep
                                # hierarchy value for matching
                                first_last_old_val = old_val_ex[0]
                                first_last_old_val += self.hierarchy_delim
                                first_last_old_val += old_val_ex[-1]
                        if old_val == add_to_value:
                            old_found = True
                            new_list_val = old_val + self.hierarchy_delim + new_value
                        elif old_prefix == add_to_value:
                            old_found = True
                            new_list_val = old_prefix + self.hierarchy_delim + new_value
                        elif first_last_old_val == add_to_value:
                            old_found = True
                            new_list_val = old_prefix + self.hierarchy_delim + new_value
                        else:
                            new_list_val = old_val
                        new_list.append(new_list_val)
                    if old_found is False:
                        if self.partial_param_val_match:
                            # allow substring matches on existing values
                            for old_val in new_rparams[param]:
                                if add_to_value in old_val:
                                    old_found = True
                                    old_prefix = self.remove_solr_part(old_val)
                                    new_list_val = old_prefix + self.hierarchy_delim + new_value
                                    # add the new item
                                    new_list.append(new_list_val)
                                    # remove the old
                                    new_list.remove(old_val)
                    new_rparams[param] = new_list
                    if old_found is False:
                        new_rparams[param].append(new_value)
                else:
                    new_rparams[param].append(new_value)
        return new_rparams

    def remove_solr_part(self, old_val):
        """ removes part of a query parameter that
            is in solr query syntax, inside square
            brackets []
        """
        output = old_val
        splitter = self.hierarchy_delim + '['
        if splitter in old_val:
            old_ex = old_val.split(splitter)
            output = old_ex[0]
        return output

    def make_base_params_from_url(self, request_url):
        """ makes the base parameters from the url """
        url_o = urlparse(request_url)
        rparams = parse_qs(url_o.query)
        if self.spatial_context is False:
            self.spatial_context = self.get_context_from_path(url_o.path)
        rparams['path'] = self.spatial_context
        return rparams

    def get_context_from_path(self, path):
        """ gets the spatial context from a request path;
            returns False when the path holds no context
        """
        context = False
        if '.' in path:
            # strip a document-format suffix such as '.json'
            pathex = path.split('.')
            path = pathex[0]
        if '/' in path:
            pathex = path.split('/')
            if len(pathex) > 2:
                # remove the part that's the first slash
                pathex.pop(0)
                # remove the part that's for the url of search
                pathex.pop(0)
            context = '/'.join(pathex)
        return context

    def get_param_from_solr_facet_key(self, solr_facet_key):
        """" returns the public parameter from the solr_facet_key """
        output = solr_facet_key
        exact_match = False
        for solr_field_part_key, param in self.SOLR_FIELD_PARAM_MAPPINGS.items():
            if solr_field_part_key == solr_facet_key:
                output = param
                exact_match = True
                break
        if exact_match is False:
            # fall back to a substring match on the facet key
            for solr_field_part_key, param in self.SOLR_FIELD_PARAM_MAPPINGS.items():
                if solr_field_part_key in solr_facet_key:
                    output = param
                    break
        return output

    def parse_slugs_in_solr_facet_key(self, solr_facet_key):
        """ returns a list of slugs encoded in a solr_facet_key;
            the solr field has these slugs in reverse order.
            Returns False for root-level fields with no slugs.
        """
        no_slug_field_list = [SolrDocument.ROOT_CONTEXT_SOLR,
                              SolrDocument.ROOT_PROJECT_SOLR,
                              SolrDocument.ROOT_LINK_DATA_SOLR,
                              SolrDocument.ROOT_PREDICATE_SOLR]
        if solr_facet_key in no_slug_field_list:
            slugs = False
        else:
            facet_key_list = solr_facet_key.split('___')
            # every item except the last (the field-type suffix) encodes
            # a slug, with '_' standing in for '-'
            raw_slugs = [list_item.replace('_', '-')
                         for list_item in facet_key_list[:-1]]
            slugs = raw_slugs[::-1]
        return slugs

    def prep_base_request_obj(self, request_dict):
        """ prepares a base request object from the old request object
            to use to create new requests
        """
        self.base_request = request_dict
        return self.base_request

    def get_request_param(self, param, default, as_list=False):
        """ get a string or list to use in queries from either
            the request object or the internal_request object
            so we have flexibility in doing searches without
            having to go through HTTP
        """
        output = False
        if self.request is not False:
            if as_list:
                output = self.request.GET.getlist(param)
            else:
                output = self.request.GET.get(param, default=default)
        elif self.internal_request is not False:
            if as_list:
                if param in self.internal_request:
                    param_obj = self.internal_request[param]
                    if isinstance(param_obj, list):
                        output = param_obj
                    else:
                        output = [param_obj]
            else:
                if param in self.internal_request:
                    output = self.internal_request[param]
                else:
                    output = default
        else:
            output = False
        return output
コード例 #16
0
 def get_entity(self, identifier):
     """ looks up an entity for the given identifier, using the
         memory cache (which falls back to database lookups)
     """
     return MemoryCache().get_entity(identifier)