def guess_collection_of_a_record(recID, referer=None, recreate_cache_if_needed=True):
    """Return the name of a collection that record ``recID`` belongs to.

    The referer URL, when given, is examined first.  A collection named in
    its path (as matched by ``_re_collection_url``) or, for ``/search``
    URLs, in its ``cc``/``c`` query arguments is returned if it exists and
    contains the record.  In every other case the result of
    ``guess_primary_collection_of_a_record(recID)`` is returned.

    :param recID: identifier of the record to look up.
    :param referer: optional referer URL of the current request.
    :param recreate_cache_if_needed: when false, collection reclists are
        read without asking for the cache to be refreshed first.
    :return: a collection name.
    """
    if referer:
        (_scheme, hostname, path,
         _params, query, _fragment) = urlparse.urlparse(referer)

        # Requests can come from different Invenio installations, with
        # different collections: only trust referers from this site.
        if CFG_SITE_URL.find(hostname) < 0:
            return guess_primary_collection_of_a_record(recID)

        g = _re_collection_url.match(path)
        if g:
            name = urllib.unquote_plus(g.group(1))
            # Check that this collection actually exists; the stored name
            # is used from here on.
            name = Collection.query.filter_by(name=name).value('name')
            # Forward the caller's cache preference instead of always
            # falling back to get_collection_reclist's default.
            if name and recID in get_collection_reclist(
                    name,
                    recreate_cache_if_needed=recreate_cache_if_needed):
                return name
        elif path.startswith('/search'):
            # Refresh the cache at most once, then read it as is for every
            # candidate collection.
            if recreate_cache_if_needed:
                collection_reclist_cache.recreate_cache_if_needed()
            # NOTE(review): cgi.parse_qs is deprecated since Python 2.6 in
            # favour of urlparse.parse_qs -- consider switching.
            query = cgi.parse_qs(query)
            for name in query.get('cc', []) + query.get('c', []):
                name = Collection.query.filter_by(name=name).value('name')
                if name and recID in get_collection_reclist(
                        name, recreate_cache_if_needed=False):
                    return name

    return guess_primary_collection_of_a_record(recID)
def guess_collection_of_a_record(recID, referer=None, recreate_cache_if_needed=True):
    """Return the name of a collection that record ``recID`` belongs to.

    The referer URL, when given, is examined first.  A collection named in
    its path (as matched by ``_re_collection_url``) or, for ``/search``
    URLs, in its ``cc``/``c`` query arguments is returned if it exists and
    contains the record.  In every other case the result of
    ``guess_primary_collection_of_a_record(recID)`` is returned.

    :param recID: identifier of the record to look up.
    :param referer: optional referer URL of the current request.
    :param recreate_cache_if_needed: when false, collection reclists are
        read without asking for the cache to be refreshed first.
    :return: a collection name.
    """
    if referer:
        (_scheme, hostname, path,
         _params, query, _fragment) = urlparse.urlparse(referer)

        # Requests can come from different Invenio installations, with
        # different collections: only trust referers from this site.
        if CFG_SITE_URL.find(hostname) < 0:
            return guess_primary_collection_of_a_record(recID)

        g = _re_collection_url.match(path)
        if g:
            name = urllib.unquote_plus(g.group(1))
            # Check that this collection actually exists; the stored name
            # is used from here on.
            name = Collection.query.filter_by(name=name).value('name')
            # Forward the caller's cache preference instead of always
            # falling back to get_collection_reclist's default.
            if name and recID in get_collection_reclist(
                    name,
                    recreate_cache_if_needed=recreate_cache_if_needed):
                return name
        elif path.startswith('/search'):
            # Refresh the cache at most once, then read it as is for every
            # candidate collection.
            if recreate_cache_if_needed:
                collection_reclist_cache.recreate_cache_if_needed()
            # NOTE(review): cgi.parse_qs is deprecated since Python 2.6 in
            # favour of urlparse.parse_qs -- consider switching.
            query = cgi.parse_qs(query)
            for name in query.get('cc', []) + query.get('c', []):
                name = Collection.query.filter_by(name=name).value('name')
                if name and recID in get_collection_reclist(
                        name, recreate_cache_if_needed=False):
                    return name

    return guess_primary_collection_of_a_record(recID)
def get_records_that_can_be_displayed(permitted_restricted_collections,
                                      hitset_in_any_collection,
                                      current_coll=None, colls=None):
    """Return the records of the selected collections the user may see.

    :param permitted_restricted_collections: restricted collections the
        user has access to.
    :param hitset_in_any_collection: matched records; the result is
        intersected with it unless it reports itself as infinite.
    :param current_coll: current collection name; falls back to
        ``cfg['CFG_SITE_NAME']`` when empty.
    :param colls: selected collection names; defaults to
        ``[current_coll]``.
    :return: intbitset of displayable record ids.
    """
    current_coll = current_coll or cfg['CFG_SITE_NAME']
    if colls is None:
        colls = [current_coll]

    policy = cfg['CFG_WEBSEARCH_VIEWRESTRCOLL_POLICY'].strip().upper()

    # real & virtual
    current_coll_children = get_collection_allchildren(current_coll)

    # Children of the current collection that were either selected or are
    # restricted collections the user has access to, plus every selected
    # collection.  Order is irrelevant: the reclists are only unioned.
    colls_to_be_displayed = set(
        coll for coll in current_coll_children
        if coll in colls or coll in permitted_restricted_collections)
    colls_to_be_displayed.update(colls)

    # Get all records in applicable collections
    records_that_can_be_displayed = intbitset()
    for coll in colls_to_be_displayed:
        records_that_can_be_displayed |= get_collection_reclist(coll)

    if policy == 'ANY':
        # User needs to have access to at least one collection that
        # restricts the records.  We need this to be able to remove
        # records that are both in a public and restricted collection.
        permitted_recids = intbitset()
        notpermitted_recids = intbitset()
        for collection in restricted_collection_cache.cache:
            if collection in permitted_restricted_collections:
                permitted_recids |= get_collection_reclist(collection)
            else:
                notpermitted_recids |= get_collection_reclist(collection)
        notpermitted_recids -= permitted_recids
    else:
        # User needs to have access to all collections that restrict a
        # record.
        notpermitted_recids = intbitset()
        for collection in restricted_collection_cache.cache:
            if collection not in permitted_restricted_collections:
                notpermitted_recids |= get_collection_reclist(collection)

    # Remove records that can not be seen by user
    records_that_can_be_displayed -= notpermitted_recids

    # Intersect only if there are some matched records
    if not hitset_in_any_collection.is_infinite():
        records_that_can_be_displayed &= hitset_in_any_collection

    return records_that_can_be_displayed
def get_records_that_can_be_displayed(permitted_restricted_collections,
                                      hitset_in_any_collection,
                                      current_coll=None, colls=None):
    """Return the records of the selected collections the user may see.

    :param permitted_restricted_collections: restricted collections the
        user has access to.
    :param hitset_in_any_collection: matched records; the result is
        intersected with it unless it reports itself as infinite.
    :param current_coll: current collection name; falls back to
        ``cfg['CFG_SITE_NAME']`` when empty.
    :param colls: selected collection names; defaults to
        ``[current_coll]``.
    :return: intbitset of displayable record ids.
    """
    current_coll = current_coll or cfg['CFG_SITE_NAME']
    if colls is None:
        colls = [current_coll]

    policy = cfg['CFG_WEBSEARCH_VIEWRESTRCOLL_POLICY'].strip().upper()

    # real & virtual
    current_coll_children = get_collection_allchildren(current_coll)

    # Children of the current collection that were either selected or are
    # restricted collections the user has access to, plus every selected
    # collection.  Order is irrelevant: the reclists are only unioned.
    colls_to_be_displayed = set(
        coll for coll in current_coll_children
        if coll in colls or coll in permitted_restricted_collections)
    colls_to_be_displayed.update(colls)

    # Get all records in applicable collections
    records_that_can_be_displayed = intbitset()
    for coll in colls_to_be_displayed:
        records_that_can_be_displayed |= get_collection_reclist(coll)

    if policy == 'ANY':
        # User needs to have access to at least one collection that
        # restricts the records.  We need this to be able to remove
        # records that are both in a public and restricted collection.
        permitted_recids = intbitset()
        notpermitted_recids = intbitset()
        for collection in restricted_collection_cache.cache:
            if collection in permitted_restricted_collections:
                permitted_recids |= get_collection_reclist(collection)
            else:
                notpermitted_recids |= get_collection_reclist(collection)
        notpermitted_recids -= permitted_recids
    else:
        # User needs to have access to all collections that restrict a
        # record.
        notpermitted_recids = intbitset()
        for collection in restricted_collection_cache.cache:
            if collection not in permitted_restricted_collections:
                notpermitted_recids |= get_collection_reclist(collection)

    # Remove records that can not be seen by user
    records_that_can_be_displayed -= notpermitted_recids

    # Intersect only if there are some matched records
    if not hitset_in_any_collection.is_infinite():
        records_that_can_be_displayed &= hitset_in_any_collection

    return records_that_can_be_displayed
def browse_pattern_phrases(req, colls, p, f, rg, ln=CFG_SITE_LANG):
    """Return either bibliographic phrases or words indexes near ``p``.

    When field ``f`` has an index, the result of
    ``get_nearest_terms_in_idxphrase_with_collection`` is returned as is.
    Otherwise the nearest terms are taken from the bibxxx tables, retrying
    with a shorter and shorter pattern until something is found, and only
    the phrases with hits in ``colls`` are kept.

    :param req: request object, passed to ``register_exception``.
    :param colls: names of the collections to browse in.
    :param p: browse pattern; surrounding double quotes are stripped and a
        ``field:pattern`` form is split when ``f`` is empty.
    :param f: field to browse.
    :param rg: number of terms to show.
    :param ln: language (unused here, kept for interface compatibility).
    :return: in the non-indexed case a list of ``[phrase, nbhits]`` pairs,
        or ``[]`` when nothing could be found.
    """
    ## is p enclosed in quotes? (coming from exact search)
    if p.startswith('"') and p.endswith('"'):
        p = p[1:-1]

    ## okay, "real browse" follows:
    ## FIXME: the maths in the get_nearest_terms_in_bibxxx is just a test
    if not f and p.find(":") > 0:  # does 'p' contain ':'?
        f, p = p.split(":", 1)

    coll_hitset = intbitset()
    for coll_name in colls:
        coll_hitset |= get_collection_reclist(coll_name)

    index_id = get_index_id_from_field(f)
    if index_id != 0:
        return get_nearest_terms_in_idxphrase_with_collection(
            p, index_id, rg // 2, rg // 2, coll_hitset)

    # Explicit floor division: same result as "/" on Python 2 integers.
    nb_before = (rg + 1) // 2 + 1
    nb_after = (rg - 1) // 2 + 1

    browsed_phrases = get_nearest_terms_in_bibxxx(p, f, nb_before, nb_after)
    # Try again and again with a shorter and shorter pattern.  Stop once
    # the pattern is exhausted: shortening an empty string changes nothing
    # and would otherwise repeat the same lookup forever.
    while not browsed_phrases and p:
        p = p[:-1]
        try:
            browsed_phrases = get_nearest_terms_in_bibxxx(
                p, f, nb_before, nb_after)
        except Exception:
            register_exception(req=req, alert_admin=True)
            # probably there are no hits at all:
            return []

    if not browsed_phrases:
        return []

    ## keep the phrases that have hits in the selected collections:
    browsed_phrases_in_colls = []
    for phrase in browsed_phrases:
        nbhits = get_nbhits_in_bibxxx(phrase, f, coll_hitset)
        if nbhits > 0:
            browsed_phrases_in_colls.append([phrase, nbhits])

    return browsed_phrases_in_colls
def guess_primary_collection_of_a_record(recID):
    """Guess the primary collection of record ``recID`` from its 980__a values.

    The first collection whose ``dbquery`` equals one of the usual ways of
    spelling a 980 value is chosen; ``CFG_SITE_NAME`` is the fallback when
    none matches.  The guess may be bad when a collection is defined
    dynamically via dbquery.  On CERN sites a few hard-coded refinements
    are applied afterwards.
    """
    primary = CFG_SITE_NAME
    collection_ids = get_fieldvalues(recID, "980__a")

    for collection_id in collection_ids:
        quoted_id = '"' + collection_id + '"'
        # Bare and quoted spelling for each of the three field prefixes.
        candidates = tuple(
            prefix + value
            for prefix in ("collection:", "980__a:", "980:")
            for value in (collection_id, quoted_id)
        )
        rows = run_sql(
            "SELECT name FROM collection WHERE dbquery IN (%s,%s,%s,%s,%s,%s)",
            candidates)
        if rows:
            primary = rows[0][0]
            break

    if not CFG_CERN_SITE:
        return primary

    recID = int(recID)

    # dirty hack for ATLAS collections at CERN:
    if primary in ('ATLAS Communications', 'ATLAS Internal Notes'):
        atlas_subcollections = (
            'ATLAS Communications Physics',
            'ATLAS Communications General',
            'ATLAS Internal Notes Physics',
            'ATLAS Internal Notes General',
        )
        for subcollection in atlas_subcollections:
            if recID in get_collection_reclist(subcollection):
                return subcollection

    # dirty hack for FP
    FP_collections = {
        'DO': ['Current Price Enquiries', 'Archived Price Enquiries'],
        'IT': ['Current Invitation for Tenders',
               'Archived Invitation for Tenders'],
        'MS': ['Current Market Surveys', 'Archived Market Surveys'],
    }
    for collection_id in collection_ids:
        if collection_id not in FP_collections:
            continue
        for fp_name in FP_collections[collection_id]:
            if recID in get_collection_reclist(fp_name):
                return fp_name

    return primary
def guess_primary_collection_of_a_record(recID):
    """Guess the primary collection of record ``recID`` from its 980__a values.

    The first collection whose ``dbquery`` equals one of the usual ways of
    spelling a 980 value is chosen; ``CFG_SITE_NAME`` is the fallback when
    none matches.  The guess may be bad when a collection is defined
    dynamically via dbquery.  On CERN sites a few hard-coded refinements
    are applied afterwards.
    """
    primary = CFG_SITE_NAME
    collection_ids = get_fieldvalues(recID, "980__a")

    for collection_id in collection_ids:
        quoted_id = '"' + collection_id + '"'
        # Bare and quoted spelling for each of the three field prefixes.
        candidates = tuple(
            prefix + value
            for prefix in ("collection:", "980__a:", "980:")
            for value in (collection_id, quoted_id)
        )
        rows = run_sql(
            "SELECT name FROM collection WHERE dbquery IN (%s,%s,%s,%s,%s,%s)",
            candidates)
        if rows:
            primary = rows[0][0]
            break

    if not CFG_CERN_SITE:
        return primary

    recID = int(recID)

    # dirty hack for ATLAS collections at CERN:
    if primary in ("ATLAS Communications", "ATLAS Internal Notes"):
        atlas_subcollections = (
            "ATLAS Communications Physics",
            "ATLAS Communications General",
            "ATLAS Internal Notes Physics",
            "ATLAS Internal Notes General",
        )
        for subcollection in atlas_subcollections:
            if recID in get_collection_reclist(subcollection):
                return subcollection

    # dirty hack for FP
    FP_collections = {
        "DO": ["Current Price Enquiries", "Archived Price Enquiries"],
        "IT": ["Current Invitation for Tenders",
               "Archived Invitation for Tenders"],
        "MS": ["Current Market Surveys", "Archived Market Surveys"],
    }
    for collection_id in collection_ids:
        if collection_id not in FP_collections:
            continue
        for fp_name in FP_collections[collection_id]:
            if recID in get_collection_reclist(fp_name):
                return fp_name

    return primary
def get_restricted_collections_for_recid(recid, recreate_cache_if_needed=True):
    """List the restricted collections whose reclist contains ``recid``.

    Both the restricted-collection cache and the collection reclist cache
    are asked to refresh first, unless ``recreate_cache_if_needed`` is
    false; the individual reclist lookups never request a refresh.
    """
    if recreate_cache_if_needed:
        restricted_collection_cache.recreate_cache_if_needed()
        collection_reclist_cache.recreate_cache_if_needed()

    matching = []
    for coll_name in restricted_collection_cache.cache:
        reclist = get_collection_reclist(
            coll_name, recreate_cache_if_needed=False)
        if recid in reclist:
            matching.append(coll_name)
    return matching
def browse_pattern_phrases(req, colls, p, f, rg, ln=CFG_SITE_LANG):
    """Return either bibliographic phrases or words indexes near ``p``.

    When field ``f`` has an index, the result of
    ``get_nearest_terms_in_idxphrase_with_collection`` is returned as is.
    Otherwise the nearest terms are taken from the bibxxx tables, retrying
    with a shorter and shorter pattern until something is found, and only
    the phrases with hits in ``colls`` are kept.

    :param req: request object, passed to ``register_exception``.
    :param colls: names of the collections to browse in.
    :param p: browse pattern; surrounding double quotes are stripped and a
        ``field:pattern`` form is split when ``f`` is empty.
    :param f: field to browse.
    :param rg: number of terms to show.
    :param ln: language (unused here, kept for interface compatibility).
    :return: in the non-indexed case a list of ``[phrase, nbhits]`` pairs,
        or ``[]`` when nothing could be found.
    """
    ## is p enclosed in quotes? (coming from exact search)
    if p.startswith('"') and p.endswith('"'):
        p = p[1:-1]

    ## okay, "real browse" follows:
    ## FIXME: the maths in the get_nearest_terms_in_bibxxx is just a test
    if not f and p.find(":") > 0:  # does 'p' contain ':'?
        f, p = p.split(":", 1)

    coll_hitset = intbitset()
    for coll_name in colls:
        coll_hitset |= get_collection_reclist(coll_name)

    index_id = get_index_id_from_field(f)
    if index_id != 0:
        return get_nearest_terms_in_idxphrase_with_collection(
            p, index_id, rg // 2, rg // 2, coll_hitset)

    # Explicit floor division: same result as "/" on Python 2 integers.
    nb_before = (rg + 1) // 2 + 1
    nb_after = (rg - 1) // 2 + 1

    browsed_phrases = get_nearest_terms_in_bibxxx(p, f, nb_before, nb_after)
    # Try again and again with a shorter and shorter pattern.  Stop once
    # the pattern is exhausted: shortening an empty string changes nothing
    # and would otherwise repeat the same lookup forever.
    while not browsed_phrases and p:
        p = p[:-1]
        try:
            browsed_phrases = get_nearest_terms_in_bibxxx(
                p, f, nb_before, nb_after)
        except Exception:
            register_exception(req=req, alert_admin=True)
            # probably there are no hits at all:
            return []

    if not browsed_phrases:
        return []

    ## keep the phrases that have hits in the selected collections:
    browsed_phrases_in_colls = []
    for phrase in browsed_phrases:
        nbhits = get_nbhits_in_bibxxx(phrase, f, coll_hitset)
        if nbhits > 0:
            browsed_phrases_in_colls.append([phrase, nbhits])

    return browsed_phrases_in_colls
def get_restricted_collections_for_recid(recid, recreate_cache_if_needed=True):
    """List the restricted collections whose reclist contains ``recid``.

    Both the restricted-collection cache and the collection reclist cache
    are asked to refresh first, unless ``recreate_cache_if_needed`` is
    false; the individual reclist lookups never request a refresh.
    """
    if recreate_cache_if_needed:
        restricted_collection_cache.recreate_cache_if_needed()
        collection_reclist_cache.recreate_cache_if_needed()

    matching = []
    for coll_name in restricted_collection_cache.cache:
        reclist = get_collection_reclist(
            coll_name, recreate_cache_if_needed=False)
        if recid in reclist:
            matching.append(coll_name)
    return matching
def get_all_collections_of_a_record(recID, recreate_cache_if_needed=True):
    """Return the names of every collection that contains record ``recID``.

    Each collection known to the reclist cache is tested in turn, so the
    cost grows linearly with the number of collections.  The cache is
    asked to refresh once up front unless ``recreate_cache_if_needed`` is
    false.
    """
    if recreate_cache_if_needed:
        collection_reclist_cache.recreate_cache_if_needed()

    return [
        coll_name
        for coll_name in collection_reclist_cache.cache.keys()
        if recID in get_collection_reclist(
            coll_name, recreate_cache_if_needed=False)
    ]
def get_all_collections_of_a_record(recID, recreate_cache_if_needed=True):
    """Return the names of every collection that contains record ``recID``.

    Each collection known to the reclist cache is tested in turn, so the
    cost grows linearly with the number of collections.  The cache is
    asked to refresh once up front unless ``recreate_cache_if_needed`` is
    false.
    """
    if recreate_cache_if_needed:
        collection_reclist_cache.recreate_cache_if_needed()

    return [
        coll_name
        for coll_name in collection_reclist_cache.cache.keys()
        if recID in get_collection_reclist(
            coll_name, recreate_cache_if_needed=False)
    ]
def get_facets_for_query(self, qid, limit=20, parent=None):
    """Return collection facets for the records matched by query ``qid``.

    The facets are the child collections (``collection_children_r``) of a
    base collection, chosen in this order: the ``parent`` request
    argument, the ``parent`` parameter, the ``cc`` value cached for the
    query, and finally the collection with id 1.

    :param qid: query identifier whose hit set is faceted.
    :param limit: maximum number of facets returned.
    :param parent: name of the base collection, used when the request
        carries no ``parent`` argument.
    :return: list of ``(name, number_of_records, name_ln)`` tuples for the
        children with at least one matching record, sorted by decreasing
        number of records and cut to ``limit`` entries.
    """
    recIDsHitSet = self.get_recids_intbitset(qid)

    # The request argument keeps precedence; the explicit parameter is the
    # fallback instead of being silently discarded.
    parent = request.args.get('parent', parent)
    if parent is not None:
        collection = Collection.query.filter(
            Collection.name == parent).first_or_404()
    else:
        cc = search_results_cache.get(
            get_search_results_cache_key_from_qid(qid) + '::cc')
        if cc is not None:
            collection = Collection.query.filter(
                Collection.name == cc).first_or_404()
        else:
            collection = Collection.query.get(1)

    facet = []
    for c in collection.collection_children_r:
        num_records = len(get_collection_reclist(
            c.name, recreate_cache_if_needed=False
        ).intersection(recIDsHitSet))
        if num_records:
            facet.append((c.name, num_records, c.name_ln))

    return sorted(facet, key=lambda x: x[1], reverse=True)[0:limit]
def __call__(self, req, form):
    """Perform a search.

    Washes the URL arguments from ``form``, rejects POST requests, applies
    the per-user and site-wide limits, checks that the user may view every
    restricted collection involved and finally delegates to
    ``perform_request_search``.

    :param req: request object; receives the washed arguments as
        ``req.argd``.
    :param form: submitted form / URL arguments.
    :return: the search output, a login redirect for guests hitting a
        restricted collection, or a "not authorized" page.
    """
    argd = wash_search_urlargd(form)

    _ = gettext_set_language(argd['ln'])

    # Searches are only served for non-POST requests.
    if req.method == 'POST':
        raise apache.SERVER_RETURN, apache.HTTP_METHOD_NOT_ALLOWED

    uid = getUid(req)
    user_info = collect_user_info(req)
    if uid == -1:
        return page_not_authorized(req, "../",
            text=_("You are not authorized to view this area."),
            navmenuid='search')
    elif uid > 0:
        pref = get_user_preferences(uid)
        try:
            if 'rg' not in form:
                # fetch user rg preference only if not overridden via URL
                argd['rg'] = int(pref['websearch_group_records'])
        except (KeyError, ValueError):
            # Missing or malformed preference: keep the washed value.
            pass

    # Cap the group size unless the user is authorized for 'runbibedit'
    # (an authorization code of 0 means authorized).
    if argd['rg'] > CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS and acc_authorize_action(req, 'runbibedit')[0] != 0:
        argd['rg'] = CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS

    involved_collections = set()
    involved_collections.update(argd['c'])
    involved_collections.add(argd['cc'])

    # 'id'/'idb' and 'sysno'/'sysnb' are alternative ways of giving
    # 'recid'/'recidb'.
    if argd['id'] > 0:
        argd['recid'] = argd['id']
    if argd['idb'] > 0:
        argd['recidb'] = argd['idb']
    if argd['sysno']:
        tmp_recid = find_record_from_sysno(argd['sysno'])
        if tmp_recid:
            argd['recid'] = tmp_recid
    if argd['sysnb']:
        tmp_recid = find_record_from_sysno(argd['sysnb'])
        if tmp_recid:
            argd['recidb'] = tmp_recid

    if argd['recid'] > 0:
        if argd['recidb'] > argd['recid']:
            # Hack to check if among the restricted collections
            # at least a record of the range is there and
            # then if the user is not authorized for that
            # collection.
            recids = intbitset(xrange(argd['recid'], argd['recidb']))
            restricted_collection_cache.recreate_cache_if_needed()
            for collname in restricted_collection_cache.cache:
                (auth_code, auth_msg) = acc_authorize_action(user_info, VIEWRESTRCOLL, collection=collname)
                if auth_code and user_info['email'] == 'guest':
                    coll_recids = get_collection_reclist(collname)
                    if coll_recids & recids:
                        cookie = mail_cookie_create_authorize_action(VIEWRESTRCOLL, {'collection' : collname})
                        target = CFG_SITE_SECURE_URL + '/youraccount/login' + \
                                 make_canonical_urlargd({'action': cookie, 'ln' : argd['ln'], 'referer' : CFG_SITE_SECURE_URL + req.unparsed_uri}, {})
                        return redirect_to_url(req, target, norobot=True)
                # NOTE(review): this branch refuses a logged-in user as
                # soon as any restricted collection denies access, without
                # testing whether the requested range touches it -- confirm
                # that this nesting and behaviour are intended.
                elif auth_code:
                    return page_not_authorized(req, "../", \
                        text=auth_msg, \
                        navmenuid='search')
        else:
            involved_collections.add(guess_primary_collection_of_a_record(argd['recid']))

    # If any of the collection requires authentication, redirect
    # to the authentication form.
    for coll in involved_collections:
        if collection_restricted_p(coll):
            (auth_code, auth_msg) = acc_authorize_action(user_info, VIEWRESTRCOLL, collection=coll)
            if auth_code and user_info['email'] == 'guest':
                cookie = mail_cookie_create_authorize_action(VIEWRESTRCOLL, {'collection' : coll})
                target = CFG_SITE_SECURE_URL + '/youraccount/login' + \
                         make_canonical_urlargd({'action': cookie, 'ln' : argd['ln'], 'referer' : CFG_SITE_SECURE_URL + req.unparsed_uri}, {})
                return redirect_to_url(req, target, norobot=True)
            elif auth_code:
                return page_not_authorized(req, "../", \
                    text=auth_msg, \
                    navmenuid='search')

    # Check if the user has rights to set a high wildcard limit;
    # if not, replace the limit set by the user with the default one.
    if CFG_WEBSEARCH_WILDCARD_LIMIT > 0 and (argd['wl'] > CFG_WEBSEARCH_WILDCARD_LIMIT or argd['wl'] == 0):
        auth_code, auth_message = acc_authorize_action(req, 'runbibedit')
        if auth_code != 0:
            argd['wl'] = CFG_WEBSEARCH_WILDCARD_LIMIT

    # only superadmins can use verbose parameter for obtaining debug information
    if not isUserSuperAdmin(user_info):
        argd['verbose'] = 0

    # Keep all the arguments, they might be reused in the
    # search_engine itself to derive other queries
    req.argd = argd

    # mod_python does not like to return [] in case when of=id:
    out = perform_request_search(req, **argd)
    if isinstance(out, intbitset):
        return out.fastdump()
    elif out == []:
        return str(out)
    else:
        return out
def record_public_p(recID, recreate_cache_if_needed=True):
    """Tell whether record ``recID`` is public.

    A record counts as public when it can be found in the Home collection
    (``CFG_SITE_NAME``).  The result of that membership test is returned:
    truthy for a public record, falsy otherwise.
    """
    home_reclist = get_collection_reclist(
        CFG_SITE_NAME, recreate_cache_if_needed=recreate_cache_if_needed)
    return recID in home_reclist
def record_public_p(recID, recreate_cache_if_needed=True):
    """Tell whether record ``recID`` is public.

    A record counts as public when it can be found in the Home collection
    (``CFG_SITE_NAME``).  The result of that membership test is returned:
    truthy for a public record, falsy otherwise.
    """
    home_reclist = get_collection_reclist(
        CFG_SITE_NAME, recreate_cache_if_needed=recreate_cache_if_needed)
    return recID in home_reclist