Ejemplo n.º 1
0
def guess_collection_of_a_record(recID, referer=None, recreate_cache_if_needed=True):
    """Guess the collection under which a record should be shown.

    When a referer URL is given and it belongs to this installation, the
    collection named in it (either a collection page matched by
    ``_re_collection_url`` or the ``cc``/``c`` arguments of a ``/search``
    URL) is returned, provided that collection exists and contains the
    record.  In every other case the result of
    guess_primary_collection_of_a_record() is returned.

    Parameters:
      recID - identifier of the record.
      referer - URL the request came from; None or empty skips the
          referer-based guess.
      recreate_cache_if_needed - when false, the collection reclist cache
          is not asked to recreate itself while testing membership.

    Returns the collection name.
    """
    if not referer:
        return guess_primary_collection_of_a_record(recID)

    parsed_referer = urlparse.urlparse(referer)
    hostname, path, query_string = (parsed_referer[1], parsed_referer[2],
                                    parsed_referer[4])

    # Requests can come from different Invenio installations, with
    # different collections, so a foreign referer tells us nothing.
    if CFG_SITE_URL.find(hostname) < 0:
        return guess_primary_collection_of_a_record(recID)

    collection_match = _re_collection_url.match(path)
    if collection_match:
        requested = urllib.unquote_plus(collection_match.group(1))
        # Check that this collection actually exists; the value read back
        # from the database is the one used from here on.
        name = Collection.query.filter_by(name=requested).value('name')
        # Pass the caller's cache policy on: previously this branch always
        # used get_collection_reclist()'s default and silently ignored
        # recreate_cache_if_needed, unlike the '/search' branch below.
        if name and recID in get_collection_reclist(
                name, recreate_cache_if_needed=recreate_cache_if_needed):
            return name
    elif path.startswith('/search'):
        if recreate_cache_if_needed:
            # Done once here so the per-candidate lookups can skip it.
            collection_reclist_cache.recreate_cache_if_needed()
        arguments = cgi.parse_qs(query_string)
        for requested in arguments.get('cc', []) + arguments.get('c', []):
            name = Collection.query.filter_by(name=requested).value('name')
            if name and recID in get_collection_reclist(
                    name, recreate_cache_if_needed=False):
                return name

    return guess_primary_collection_of_a_record(recID)
Ejemplo n.º 2
0
def guess_collection_of_a_record(recID, referer=None, recreate_cache_if_needed=True):
    """Guess the collection under which a record should be shown.

    When a referer URL is given and it belongs to this installation, the
    collection named in it (either a collection page matched by
    ``_re_collection_url`` or the ``cc``/``c`` arguments of a ``/search``
    URL) is returned, provided that collection exists and contains the
    record.  In every other case the result of
    guess_primary_collection_of_a_record() is returned.

    Parameters:
      recID - identifier of the record.
      referer - URL the request came from; None or empty skips the
          referer-based guess.
      recreate_cache_if_needed - when false, the collection reclist cache
          is not asked to recreate itself while testing membership.

    Returns the collection name.
    """
    if not referer:
        return guess_primary_collection_of_a_record(recID)

    parsed_referer = urlparse.urlparse(referer)
    hostname, path, query_string = (parsed_referer[1], parsed_referer[2],
                                    parsed_referer[4])

    # Requests can come from different Invenio installations, with
    # different collections, so a foreign referer tells us nothing.
    if CFG_SITE_URL.find(hostname) < 0:
        return guess_primary_collection_of_a_record(recID)

    collection_match = _re_collection_url.match(path)
    if collection_match:
        requested = urllib.unquote_plus(collection_match.group(1))
        # Check that this collection actually exists; the value read back
        # from the database is the one used from here on.
        name = Collection.query.filter_by(name=requested).value('name')
        # Pass the caller's cache policy on: previously this branch always
        # used get_collection_reclist()'s default and silently ignored
        # recreate_cache_if_needed, unlike the '/search' branch below.
        if name and recID in get_collection_reclist(
                name, recreate_cache_if_needed=recreate_cache_if_needed):
            return name
    elif path.startswith('/search'):
        if recreate_cache_if_needed:
            # Done once here so the per-candidate lookups can skip it.
            collection_reclist_cache.recreate_cache_if_needed()
        arguments = cgi.parse_qs(query_string)
        for requested in arguments.get('cc', []) + arguments.get('c', []):
            name = Collection.query.filter_by(name=requested).value('name')
            if name and recID in get_collection_reclist(
                    name, recreate_cache_if_needed=False):
                return name

    return guess_primary_collection_of_a_record(recID)
Ejemplo n.º 3
0
def get_records_that_can_be_displayed(permitted_restricted_collections,
                                      hitset_in_any_collection,
                                      current_coll=None,
                                      colls=None):
    """Return the records of the selected collections the user may see.

    Parameters:
      permitted_restricted_collections - restricted collections the user
          is allowed to view.
      hitset_in_any_collection - intbitset of search hits; when it is
          infinite no intersection with the hits is performed.
      current_coll - name of the current collection; falls back to
          cfg['CFG_SITE_NAME'] when empty.
      colls - collections to take records from; defaults to
          [current_coll].

    Returns an intbitset holding the records of the applicable collections,
    minus the records hidden by restricted collections according to
    cfg['CFG_WEBSEARCH_VIEWRESTRCOLL_POLICY'], intersected with the hits.
    """
    current_coll = current_coll or cfg['CFG_SITE_NAME']
    if colls is None:
        colls = [current_coll]

    policy = cfg['CFG_WEBSEARCH_VIEWRESTRCOLL_POLICY'].strip().upper()

    # Applicable collections: children (real & virtual) of the current
    # collection that were either asked for or are restricted collections
    # the user has access to, plus every collection that was asked for.
    # Only the union of their records is needed, so order is irrelevant and
    # a set is enough.
    colls_to_be_displayed = set(
        coll for coll in get_collection_allchildren(current_coll)
        if coll in colls or coll in permitted_restricted_collections)
    colls_to_be_displayed.update(colls)

    # Get all records in applicable collections
    records_that_can_be_displayed = intbitset()
    for coll in colls_to_be_displayed:
        records_that_can_be_displayed |= get_collection_reclist(coll)

    notpermitted_recids = intbitset()
    if policy == 'ANY':
        # User needs to have access to at least one collection that
        # restricts the records.  Tracking the permitted records lets us
        # keep a record visible when it sits both in a restricted
        # collection the user may view and in one the user may not.
        permitted_recids = intbitset()
        for collection in restricted_collection_cache.cache:
            if collection in permitted_restricted_collections:
                permitted_recids |= get_collection_reclist(collection)
            else:
                notpermitted_recids |= get_collection_reclist(collection)
        notpermitted_recids -= permitted_recids
    else:
        # User needs to have access to all collections that restrict a
        # record.
        for collection in restricted_collection_cache.cache:
            if collection not in permitted_restricted_collections:
                notpermitted_recids |= get_collection_reclist(collection)

    # Remove records that can not be seen by user
    records_that_can_be_displayed -= notpermitted_recids

    # Intersect only if there are some matched records
    if not hitset_in_any_collection.is_infinite():
        records_that_can_be_displayed &= hitset_in_any_collection

    return records_that_can_be_displayed
Ejemplo n.º 4
0
def get_records_that_can_be_displayed(permitted_restricted_collections,
                                      hitset_in_any_collection,
                                      current_coll=None, colls=None):
    """Return the records of the selected collections the user may see.

    Parameters:
      permitted_restricted_collections - restricted collections the user
          is allowed to view.
      hitset_in_any_collection - intbitset of search hits; when it is
          infinite no intersection with the hits is performed.
      current_coll - name of the current collection; falls back to
          cfg['CFG_SITE_NAME'] when empty.
      colls - collections to take records from; defaults to
          [current_coll].

    Returns an intbitset holding the records of the applicable collections,
    minus the records hidden by restricted collections according to
    cfg['CFG_WEBSEARCH_VIEWRESTRCOLL_POLICY'], intersected with the hits.
    """
    current_coll = current_coll or cfg['CFG_SITE_NAME']
    if colls is None:
        colls = [current_coll]

    policy = cfg['CFG_WEBSEARCH_VIEWRESTRCOLL_POLICY'].strip().upper()

    # Applicable collections: children (real & virtual) of the current
    # collection that were either asked for or are restricted collections
    # the user has access to, plus every collection that was asked for.
    # Only the union of their records is needed, so order is irrelevant and
    # a set is enough.
    colls_to_be_displayed = set(
        coll for coll in get_collection_allchildren(current_coll)
        if coll in colls or coll in permitted_restricted_collections)
    colls_to_be_displayed.update(colls)

    # Get all records in applicable collections
    records_that_can_be_displayed = intbitset()
    for coll in colls_to_be_displayed:
        records_that_can_be_displayed |= get_collection_reclist(coll)

    notpermitted_recids = intbitset()
    if policy == 'ANY':
        # User needs to have access to at least one collection that
        # restricts the records.  Tracking the permitted records lets us
        # keep a record visible when it sits both in a restricted
        # collection the user may view and in one the user may not.
        permitted_recids = intbitset()
        for collection in restricted_collection_cache.cache:
            if collection in permitted_restricted_collections:
                permitted_recids |= get_collection_reclist(collection)
            else:
                notpermitted_recids |= get_collection_reclist(collection)
        notpermitted_recids -= permitted_recids
    else:
        # User needs to have access to all collections that restrict a
        # record.
        for collection in restricted_collection_cache.cache:
            if collection not in permitted_restricted_collections:
                notpermitted_recids |= get_collection_reclist(collection)

    # Remove records that can not be seen by user
    records_that_can_be_displayed -= notpermitted_recids

    # Intersect only if there are some matched records
    if not hitset_in_any_collection.is_infinite():
        records_that_can_be_displayed &= hitset_in_any_collection

    return records_that_can_be_displayed
Ejemplo n.º 5
0
def browse_pattern_phrases(req, colls, p, f, rg, ln=CFG_SITE_LANG):
    """Return the index phrases found around pattern p.

    Parameters:
      req - request object; only passed to register_exception().
      colls - names of the collections whose records are taken into
          account.
      p - pattern to browse around.  Enclosing double quotes are removed,
          and when f is empty a "field:pattern" value is split into f
          and p.
      f - field to browse in.
      rg - integer used to size the window of terms asked from the index
          helpers.
      ln - not used by this function; kept so existing callers keep
          working.

    Returns the value of get_nearest_terms_in_idxphrase_with_collection()
    when f has an index (non-zero index id).  Otherwise returns a list of
    [phrase, nbhits] pairs for the phrases found in bibxxx that have hits
    in the given collections, or [] when nothing can be found.
    """
    # Is p enclosed in quotes (coming from an exact search)?
    if p.startswith('"') and p.endswith('"'):
        p = p[1:-1]

    # Does 'p' carry its own field, as in "field:pattern"?
    if not f and p.find(":") > 0:
        f, p = p.split(":", 1)

    coll_hitset = intbitset()
    for coll_name in colls:
        coll_hitset |= get_collection_reclist(coll_name)

    # '//' keeps the window sizes integral regardless of the division
    # semantics in force; for ints it matches the previous '/' result.
    index_id = get_index_id_from_field(f)
    if index_id != 0:
        return get_nearest_terms_in_idxphrase_with_collection(
            p, index_id, rg // 2, rg // 2, coll_hitset)

    # FIXME: the maths in the get_nearest_terms_in_bibxxx is just a test
    first_window = (rg + 1) // 2 + 1
    second_window = (rg - 1) // 2 + 1
    browsed_phrases = get_nearest_terms_in_bibxxx(
        p, f, first_window, second_window)
    while not browsed_phrases:
        if not p:
            # Nothing left to shorten: stop here instead of asking the
            # same empty pattern over and over again.
            return []
        # try again and again with shorter and shorter pattern:
        p = p[:-1]
        try:
            browsed_phrases = get_nearest_terms_in_bibxxx(
                p, f, first_window, second_window)
        except Exception:
            register_exception(req=req, alert_admin=True)
            # probably there are no hits at all:
            return []

    # A finer per-collection check based on search_pattern() used to sit
    # here but was permanently switched off ("if 0:"), so the hit counts
    # always come from get_nbhits_in_bibxxx().
    browsed_phrases_in_colls = []
    for phrase in browsed_phrases:
        nbhits = get_nbhits_in_bibxxx(phrase, f, coll_hitset)
        if nbhits > 0:
            browsed_phrases_in_colls.append([phrase, nbhits])

    return browsed_phrases_in_colls
Ejemplo n.º 6
0
def guess_primary_collection_of_a_record(recID):
    """Guess the primary collection of a record from its 980__a values.

    Every 980__a value of the record is turned into the usual spellings of
    a collection query and looked up in the ``dbquery`` column of the
    collection table; the first value that yields a row decides the
    result.  The guess may be wrong for collections defined dynamically
    via a different dbquery.  When nothing matches, CFG_SITE_NAME is
    returned.

    When CFG_CERN_SITE is set, the result is further refined for the ATLAS
    and FP collections by testing the record against their sub-collections.
    """
    primary = CFG_SITE_NAME
    dbcollids = get_fieldvalues(recID, "980__a")

    lookup = "SELECT name FROM collection WHERE dbquery IN (%s,%s,%s,%s,%s,%s)"
    for value in dbcollids:
        # Plain and double-quoted form for each supported query prefix.
        spellings = []
        for prefix in ("collection:", "980__a:", "980:"):
            spellings.append(prefix + value)
            spellings.append(prefix + '"' + value + '"')
        rows = run_sql(lookup, tuple(spellings))
        if rows:
            primary = rows[0][0]
            break

    if not CFG_CERN_SITE:
        return primary

    recID = int(recID)

    # dirty hack for ATLAS collections at CERN:
    if primary in ('ATLAS Communications', 'ATLAS Internal Notes'):
        atlas_subcollections = (
            'ATLAS Communications Physics',
            'ATLAS Communications General',
            'ATLAS Internal Notes Physics',
            'ATLAS Internal Notes General',
        )
        for subcollection in atlas_subcollections:
            if recID in get_collection_reclist(subcollection):
                return subcollection

    # dirty hack for FP
    fp_collections = {
        'DO': ['Current Price Enquiries', 'Archived Price Enquiries'],
        'IT': ['Current Invitation for Tenders',
               'Archived Invitation for Tenders'],
        'MS': ['Current Market Surveys', 'Archived Market Surveys'],
    }
    for value in dbcollids:
        for fp_name in fp_collections.get(value, ()):
            if recID in get_collection_reclist(fp_name):
                return fp_name

    return primary
Ejemplo n.º 7
0
def guess_primary_collection_of_a_record(recID):
    """Guess the primary collection of a record from its 980__a values.

    Every 980__a value of the record is turned into the usual spellings of
    a collection query and looked up in the ``dbquery`` column of the
    collection table; the first value that yields a row decides the
    result.  The guess may be wrong for collections defined dynamically
    via a different dbquery.  When nothing matches, CFG_SITE_NAME is
    returned.

    When CFG_CERN_SITE is set, the result is further refined for the ATLAS
    and FP collections by testing the record against their sub-collections.
    """
    primary = CFG_SITE_NAME
    dbcollids = get_fieldvalues(recID, "980__a")

    lookup = "SELECT name FROM collection WHERE dbquery IN (%s,%s,%s,%s,%s,%s)"
    for value in dbcollids:
        # Plain and double-quoted form for each supported query prefix.
        spellings = []
        for prefix in ("collection:", "980__a:", "980:"):
            spellings.append(prefix + value)
            spellings.append(prefix + '"' + value + '"')
        rows = run_sql(lookup, tuple(spellings))
        if rows:
            primary = rows[0][0]
            break

    if not CFG_CERN_SITE:
        return primary

    recID = int(recID)

    # dirty hack for ATLAS collections at CERN:
    if primary in ("ATLAS Communications", "ATLAS Internal Notes"):
        atlas_subcollections = (
            "ATLAS Communications Physics",
            "ATLAS Communications General",
            "ATLAS Internal Notes Physics",
            "ATLAS Internal Notes General",
        )
        for subcollection in atlas_subcollections:
            if recID in get_collection_reclist(subcollection):
                return subcollection

    # dirty hack for FP
    fp_collections = {
        "DO": ["Current Price Enquiries", "Archived Price Enquiries"],
        "IT": ["Current Invitation for Tenders",
               "Archived Invitation for Tenders"],
        "MS": ["Current Market Surveys", "Archived Market Surveys"],
    }
    for value in dbcollids:
        for fp_name in fp_collections.get(value, ()):
            if recID in get_collection_reclist(fp_name):
                return fp_name

    return primary
Ejemplo n.º 8
0
def get_restricted_collections_for_recid(recid, recreate_cache_if_needed=True):
    """List the restricted collections that contain record recid.

    Unless recreate_cache_if_needed is false, the restricted-collection
    cache and the collection reclist cache are first asked to recreate
    themselves if needed; the membership tests then use the caches as is.
    """
    if recreate_cache_if_needed:
        restricted_collection_cache.recreate_cache_if_needed()
        collection_reclist_cache.recreate_cache_if_needed()

    matching = []
    for restricted_name in restricted_collection_cache.cache:
        reclist = get_collection_reclist(restricted_name,
                                         recreate_cache_if_needed=False)
        if recid in reclist:
            matching.append(restricted_name)
    return matching
Ejemplo n.º 9
0
def browse_pattern_phrases(req, colls, p, f, rg, ln=CFG_SITE_LANG):
    """Return the index phrases found around pattern p.

    Parameters:
      req - request object; only passed to register_exception().
      colls - names of the collections whose records are taken into
          account.
      p - pattern to browse around.  Enclosing double quotes are removed,
          and when f is empty a "field:pattern" value is split into f
          and p.
      f - field to browse in.
      rg - integer used to size the window of terms asked from the index
          helpers.
      ln - not used by this function; kept so existing callers keep
          working.

    Returns the value of get_nearest_terms_in_idxphrase_with_collection()
    when f has an index (non-zero index id).  Otherwise returns a list of
    [phrase, nbhits] pairs for the phrases found in bibxxx that have hits
    in the given collections, or [] when nothing can be found.
    """
    # Is p enclosed in quotes (coming from an exact search)?
    if p.startswith('"') and p.endswith('"'):
        p = p[1:-1]

    # Does 'p' carry its own field, as in "field:pattern"?
    if not f and p.find(":") > 0:
        f, p = p.split(":", 1)

    coll_hitset = intbitset()
    for coll_name in colls:
        coll_hitset |= get_collection_reclist(coll_name)

    # '//' keeps the window sizes integral regardless of the division
    # semantics in force; for ints it matches the previous '/' result.
    index_id = get_index_id_from_field(f)
    if index_id != 0:
        return get_nearest_terms_in_idxphrase_with_collection(
            p, index_id, rg // 2, rg // 2, coll_hitset
        )

    # FIXME: the maths in the get_nearest_terms_in_bibxxx is just a test
    first_window = (rg + 1) // 2 + 1
    second_window = (rg - 1) // 2 + 1
    browsed_phrases = get_nearest_terms_in_bibxxx(p, f, first_window, second_window)
    while not browsed_phrases:
        if not p:
            # Nothing left to shorten: stop here instead of asking the
            # same empty pattern over and over again.
            return []
        # try again and again with shorter and shorter pattern:
        p = p[:-1]
        try:
            browsed_phrases = get_nearest_terms_in_bibxxx(p, f, first_window, second_window)
        except Exception:
            register_exception(req=req, alert_admin=True)
            # probably there are no hits at all:
            return []

    # A finer per-collection check based on search_pattern() used to sit
    # here but was permanently switched off ("if 0:"), so the hit counts
    # always come from get_nbhits_in_bibxxx().
    browsed_phrases_in_colls = []
    for phrase in browsed_phrases:
        nbhits = get_nbhits_in_bibxxx(phrase, f, coll_hitset)
        if nbhits > 0:
            browsed_phrases_in_colls.append([phrase, nbhits])

    return browsed_phrases_in_colls
Ejemplo n.º 10
0
def get_restricted_collections_for_recid(recid, recreate_cache_if_needed=True):
    """List the restricted collections that contain record recid.

    Unless recreate_cache_if_needed is false, the restricted-collection
    cache and the collection reclist cache are first asked to recreate
    themselves if needed; the membership tests then use the caches as is.
    """
    if recreate_cache_if_needed:
        restricted_collection_cache.recreate_cache_if_needed()
        collection_reclist_cache.recreate_cache_if_needed()

    matching = []
    for restricted_name in restricted_collection_cache.cache:
        reclist = get_collection_reclist(restricted_name,
                                         recreate_cache_if_needed=False)
        if recid in reclist:
            matching.append(restricted_name)
    return matching
Ejemplo n.º 11
0
def get_all_collections_of_a_record(recID, recreate_cache_if_needed=True):
    """List the names of every collection that contains record recID.

    Each collection known to the reclist cache is tested in turn, so the
    cost grows linearly with the number of collections.  The cache is asked
    to recreate itself (if needed) once up front unless
    recreate_cache_if_needed is false.
    """
    if recreate_cache_if_needed:
        collection_reclist_cache.recreate_cache_if_needed()
    return [
        coll_name
        for coll_name in collection_reclist_cache.cache.keys()
        if recID in get_collection_reclist(coll_name,
                                           recreate_cache_if_needed=False)
    ]
Ejemplo n.º 12
0
def get_all_collections_of_a_record(recID, recreate_cache_if_needed=True):
    """List the names of every collection that contains record recID.

    Each collection known to the reclist cache is tested in turn, so the
    cost grows linearly with the number of collections.  The cache is asked
    to recreate itself (if needed) once up front unless
    recreate_cache_if_needed is false.
    """
    if recreate_cache_if_needed:
        collection_reclist_cache.recreate_cache_if_needed()
    return [
        coll_name
        for coll_name in collection_reclist_cache.cache.keys()
        if recID in get_collection_reclist(coll_name,
                                           recreate_cache_if_needed=False)
    ]
Ejemplo n.º 13
0
 def get_facets_for_query(self, qid, limit=20, parent=None):
     """Return the collection facet of a query.

     The child collections (``collection_children_r``) of a reference
     collection are counted against the records matched by query ``qid``.
     The reference collection is, in order of preference: the collection
     named ``parent``; the collection stored in the search results cache
     under the ``::cc`` key of this query; the collection with id 1.

     Parameters:
       qid - identifier of the query whose hits are faceted.
       limit - maximum number of facet entries returned.
       parent - name of the parent collection; when None the ``parent``
           request argument is used instead.

     Returns a list of ``(name, num_records, name_ln)`` tuples for the
     children having at least one matching record, sorted by decreasing
     number of records and cut to ``limit`` entries.  An unknown
     collection name ends up in ``first_or_404()``.
     """
     recIDsHitSet = self.get_recids_intbitset(qid)
     # An explicitly passed parent wins; it used to be thrown away and
     # replaced by the request argument unconditionally.
     if parent is None:
         parent = request.args.get('parent', None)
     if parent is not None:
         collection = Collection.query.filter(
             Collection.name == parent).first_or_404()
     else:
         cc = search_results_cache.get(
             get_search_results_cache_key_from_qid(qid) + '::cc')
         if cc is not None:
             collection = Collection.query.filter(
                 Collection.name == cc).first_or_404()
         else:
             # NOTE(review): presumably id 1 is the root collection --
             # confirm against the collection table.
             collection = Collection.query.get(1)
     facet = []
     for c in collection.collection_children_r:
         num_records = len(get_collection_reclist(
             c.name, recreate_cache_if_needed=False
         ).intersection(recIDsHitSet))
         if num_records:
             facet.append((c.name, num_records, c.name_ln))
     return sorted(facet, key=lambda x: x[1], reverse=True)[0:limit]
Ejemplo n.º 14
0
    def __call__(self, req, form):
        """Perform a search after washing the arguments and checking access.

        The form is washed with wash_search_urlargd(), the request is
        checked against the restricted collections it involves, a few
        arguments are clamped for users lacking the needed rights, and the
        search itself is delegated to perform_request_search().

        Parameters:
          req - the request object.
          form - the submitted form/URL arguments.

        Returns the output of perform_request_search() (an intbitset is
        returned as its fastdump(), an empty list as its string form), or
        a "not authorized" page / login redirect when access is denied.
        POST requests raise apache.SERVER_RETURN with
        HTTP_METHOD_NOT_ALLOWED.
        """
        argd = wash_search_urlargd(form)

        _ = gettext_set_language(argd['ln'])

        # Searching through POST is refused.
        if req.method == 'POST':
            raise apache.SERVER_RETURN, apache.HTTP_METHOD_NOT_ALLOWED

        uid = getUid(req)
        user_info = collect_user_info(req)
        if uid == -1:
            return page_not_authorized(req, "../",
                text=_("You are not authorized to view this area."),
                                       navmenuid='search')
        elif uid > 0:
            pref = get_user_preferences(uid)
            try:
                if 'rg' not in form:
                    # fetch user rg preference only if not overridden via URL
                    argd['rg'] = int(pref['websearch_group_records'])
            except (KeyError, ValueError):
                # Preference missing or not an integer: keep the washed value.
                pass

        # Cap rg unless the user is authorized for the 'runbibedit' action
        # (an authorization code of 0 means authorized here).
        if argd['rg'] > CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS and acc_authorize_action(req, 'runbibedit')[0] != 0:
            argd['rg'] = CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS

        # Collections whose restriction status has to be checked below.
        involved_collections = set()
        involved_collections.update(argd['c'])
        involved_collections.add(argd['cc'])

        # 'id'/'idb' and 'sysno'/'sysnb' are alternative ways of giving
        # 'recid'/'recidb'; a system number that resolves takes precedence.
        if argd['id'] > 0:
            argd['recid'] = argd['id']
        if argd['idb'] > 0:
            argd['recidb'] = argd['idb']
        if argd['sysno']:
            tmp_recid = find_record_from_sysno(argd['sysno'])
            if tmp_recid:
                argd['recid'] = tmp_recid
        if argd['sysnb']:
            tmp_recid = find_record_from_sysno(argd['sysnb'])
            if tmp_recid:
                argd['recidb'] = tmp_recid

        if argd['recid'] > 0:
            if argd['recidb'] > argd['recid']:
                # Hack to check if among the restricted collections
                # at least a record of the range is there and
                # then if the user is not authorized for that
                # collection.
                # NOTE(review): xrange() leaves argd['recidb'] itself out of
                # the checked range -- confirm the upper bound is meant to
                # be exclusive.
                recids = intbitset(xrange(argd['recid'], argd['recidb']))
                restricted_collection_cache.recreate_cache_if_needed()
                for collname in restricted_collection_cache.cache:
                    (auth_code, auth_msg) = acc_authorize_action(user_info, VIEWRESTRCOLL, collection=collname)
                    if auth_code and user_info['email'] == 'guest':
                        # Guests are sent to the login page, but only when
                        # the range really touches this collection.
                        coll_recids = get_collection_reclist(collname)
                        if coll_recids & recids:
                            cookie = mail_cookie_create_authorize_action(VIEWRESTRCOLL, {'collection' : collname})
                            target = CFG_SITE_SECURE_URL + '/youraccount/login' + \
                                    make_canonical_urlargd({'action': cookie, 'ln' : argd['ln'], 'referer' : CFG_SITE_SECURE_URL + req.unparsed_uri}, {})
                            return redirect_to_url(req, target, norobot=True)
                    # NOTE(review): for non-guest users this branch refuses
                    # access without testing whether the range intersects
                    # the collection -- confirm this is intended.
                    elif auth_code:
                        return page_not_authorized(req, "../", \
                            text=auth_msg, \
                            navmenuid='search')
            else:
                # Single record: its primary collection is checked together
                # with the other involved collections below.
                involved_collections.add(guess_primary_collection_of_a_record(argd['recid']))

        # If any of the collection requires authentication, redirect
        # to the authentication form.
        for coll in involved_collections:
            if collection_restricted_p(coll):
                (auth_code, auth_msg) = acc_authorize_action(user_info, VIEWRESTRCOLL, collection=coll)
                if auth_code and user_info['email'] == 'guest':
                    cookie = mail_cookie_create_authorize_action(VIEWRESTRCOLL, {'collection' : coll})
                    target = CFG_SITE_SECURE_URL + '/youraccount/login' + \
                            make_canonical_urlargd({'action': cookie, 'ln' : argd['ln'], 'referer' : CFG_SITE_SECURE_URL + req.unparsed_uri}, {})
                    return redirect_to_url(req, target, norobot=True)
                elif auth_code:
                    return page_not_authorized(req, "../", \
                        text=auth_msg, \
                        navmenuid='search')

        # Check if the user has rights to set a high wildcard limit;
        # if not, replace the limit set by the user with the default one.
        # A requested limit of 0 is treated like one above the maximum.
        if CFG_WEBSEARCH_WILDCARD_LIMIT > 0 and (argd['wl'] > CFG_WEBSEARCH_WILDCARD_LIMIT or argd['wl'] == 0):
            auth_code, auth_message = acc_authorize_action(req, 'runbibedit')
            if auth_code != 0:
                argd['wl'] = CFG_WEBSEARCH_WILDCARD_LIMIT

        # only superadmins can use verbose parameter for obtaining debug information
        if not isUserSuperAdmin(user_info):
            argd['verbose'] = 0

        # Keep all the arguments, they might be reused in the
        # search_engine itself to derive other queries
        req.argd = argd

        # mod_python does not like to return [] in case when of=id:
        out = perform_request_search(req, **argd)
        if isinstance(out, intbitset):
            return out.fastdump()
        elif out == []:
            return str(out)
        else:
            return out
Ejemplo n.º 15
0
def record_public_p(recID, recreate_cache_if_needed=True):
    """Tell whether record recID is public.

    A record is public when it can be found in the Home collection, i.e.
    the collection named CFG_SITE_NAME.  The result is the truth value of
    that membership test.
    """
    home_reclist = get_collection_reclist(
        CFG_SITE_NAME, recreate_cache_if_needed=recreate_cache_if_needed)
    return recID in home_reclist
Ejemplo n.º 16
0
def record_public_p(recID, recreate_cache_if_needed=True):
    """Tell whether record recID is public.

    A record is public when it can be found in the Home collection, i.e.
    the collection named CFG_SITE_NAME.  The result is the truth value of
    that membership test.
    """
    home_reclist = get_collection_reclist(
        CFG_SITE_NAME, recreate_cache_if_needed=recreate_cache_if_needed)
    return recID in home_reclist