Beispiel #1
0
def get_all_recids(including_deleted=True):  # 6.68s on cdsdev
    """Return the IDs of all records stored in the ``bibrec`` table.

    @param including_deleted: when False, the records matched by the
        980__:"DELETED" pattern (plus 980__:"DUMMY" when CFG_CERN_SITE is
        set) are removed from the result.
    @type including_deleted: bool
    @return: intbitset of record IDs; empty when the query returns no rows.
    """
    rows = run_sql("SELECT id FROM bibrec")
    if not rows:
        return intbitset([])
    recids = intbitset(rows)
    if including_deleted:
        return recids
    # The caller asked for deleted records to be excluded.
    if CFG_CERN_SITE:
        deleted_pattern = '980__:"DELETED" OR 980__:"DUMMY"'
    else:
        deleted_pattern = '980__:"DELETED"'
    recids.difference_update(search_pattern(p=deleted_pattern))
    return recids
Beispiel #2
0
def get_all_recids(including_deleted=True):  # 6.68s on cdsdev
    """Collect every record ID found in the ``bibrec`` table.

    @param including_deleted: pass False to drop the records matched by
        980__:"DELETED" (and 980__:"DUMMY" too when CFG_CERN_SITE is set).
    @type including_deleted: bool
    @return: intbitset of record IDs (empty if the table yields no rows).
    """
    id_rows = run_sql("SELECT id FROM bibrec")
    if not id_rows:
        return intbitset([])
    all_recids = intbitset(id_rows)
    if not including_deleted:
        # Choose the pattern that identifies deleted records on this site.
        if CFG_CERN_SITE:
            query = '980__:"DELETED" OR 980__:"DUMMY"'
        else:
            query = '980__:"DELETED"'
        removed = search_pattern(p=query)
        all_recids.difference_update(removed)
    return all_recids
Beispiel #3
0
def get_recid_and_reportnumber(recid=None,
                               reportnumber=None,
                               keep_original_reportnumber=True):
    """
    Resolve a record from either its recid or one of its report numbers.

    @param recid: record ID; takes priority over reportnumber when given.
    @param reportnumber: report number to look up when no recid is given.
    @param keep_original_reportnumber: when True and the lookup went through
        reportnumber, return that same reportnumber rather than the primary
        one stored in the record.
    @return: tuple (recid, reportnumber).
    @raises ValueError: when neither argument is given, when no record or
        more than one record matches, or when the record has no primary
        report number.
    """
    if recid:
        ## A recid was given: it wins over any reportnumber.
        recid = int(recid)
        primary_values = get_fieldvalues(recid, CFG_PRIMARY_REPORTNUMBER)
        if not primary_values:
            raise ValueError(
                "The record %s does not have a primary report number" % recid)
        ## Use whatever reportnumber is stored in the matching record.
        return recid, primary_values[0]

    if not reportnumber:
        raise ValueError(
            "At least the recid or the reportnumber must be specified")

    ## Try the primary report number field first, then any report number.
    matches = search_pattern(p='%s:"%s"' %
                             (CFG_PRIMARY_REPORTNUMBER, reportnumber))
    if not matches:
        matches = search_pattern(p='reportnumber:"%s"' % reportnumber)

    how_many = len(matches)
    if how_many > 1:
        listed = ', '.join([str(match) for match in matches])
        raise ValueError(
            'More than one record matches the reportnumber "%s": %s' %
            (reportnumber, listed))
    if how_many != 1:
        raise ValueError(
            "No records are matched by the provided reportnumber: %s" %
            reportnumber)

    recid = list(matches)[0]
    if keep_original_reportnumber:
        return recid, reportnumber
    primary_values = get_fieldvalues(recid, CFG_PRIMARY_REPORTNUMBER)
    if not primary_values:
        raise ValueError(
            "The matched record %s does not have a primary report number"
            % recid)
    return recid, primary_values[0]
Beispiel #4
0
def Get_Sysno(parameters, curdir, form, user_info=None):
    """
       **Deprecated - Use Get_Recid Instead**

       Set the global ``sysno`` for the current submission.

       If the file ``<curdir>/SN`` exists, its content becomes ``sysno``.
       Otherwise the global report number ``rn`` is searched in the
       "reportnumber" field; a single match becomes ``sysno`` and is written
       to ``<curdir>/SN``.

       @param parameters: unused here.
       @param curdir: path of the submission working directory.
       @param form: unused here.
       @param user_info: unused here.
       @return: always the empty string.
       @raises InvenioWebSubmitFunctionStop: when the report number matches
           no record or more than one record.
    """
    global rn, sysno
    # initialize sysno variable
    sysno = ""
    sn_path = "%s/SN" % curdir
    if os.path.exists(sn_path):
        # 'with' guarantees the handle is closed even if the read fails
        with open(sn_path, "r") as fp:
            sysno = fp.read()
    else:
        searchresults = list(search_pattern(req=None, p=rn, f="reportnumber"))
        if len(searchresults) == 0:
            raise InvenioWebSubmitFunctionStop(
                "<SCRIPT>document.forms[0].action=\"/submit\";document.forms[0].curpage.value=1;document.forms[0].step.value=0;user_must_confirm_before_leaving_page = false;alert('The report %s cannot be found in our database.\\nPerhaps it has not been integrated yet?\\nAnyway, you can choose another report number if you wish.\\n Or retry this action in a few minutes.');document.forms[0].submit();</SCRIPT>"
                % rn)
        elif len(searchresults) > 1:
            raise InvenioWebSubmitFunctionStop(
                "<SCRIPT>document.forms[0].action=\"/submit\";document.forms[0].curpage.value=1;document.forms[0].step.value=0;user_must_confirm_before_leaving_page = false;alert('Multiple documents have been found with report number %s\\nYou can try with another report number if you wish.\\n Or retry this action in a few minutes.');document.forms[0].submit();</SCRIPT>"
                % rn)
        else:
            sysno = searchresults[0]
        # save the result in a file so later calls can reuse it
        with open(sn_path, "w") as fp:
            fp.write(str(sysno))
    return ""
Beispiel #5
0
def query_records(params):
    """Produce record IDs from given query parameters.

    By passing the appropriate CLI options, we can query here for additional
    records.

    @param params: mapping read for the keys 'field', 'collection',
        'pattern' and (when no collection is given) 'matching'.
    @return: the matching record IDs; an empty intbitset when none of field,
        collection and pattern is set.
    """
    write_message("Querying database (records query)...")
    if not (params['field'] or params['collection'] or params['pattern']):
        return intbitset()

    if params['collection']:
        # perform_request_search is needed once '-c' has been given, since
        # search_pattern() does not support collections
        hits = perform_request_search(req=None,
                                      of='id',
                                      c=params['collection'],
                                      p=params['pattern'],
                                      f=params['field'])
        return intbitset(hits)

    # search_pattern() is preferred whenever possible, as it can search
    # even in private collections
    return search_pattern(p=params['pattern'],
                          f=params['field'],
                          m=params['matching'])
Beispiel #6
0
    def delete_record_collection_identifiers(self):
        """
        Strip this collection's identifiers from every record carrying them.

        Records whose 980__a equals the normal or the provisional collection
        name are passed through ``_modify_record`` with callbacks that drop
        exactly those 'a' subfields, and the results are handed to
        ``_upload_collection``.
        """
        from invenio.legacy.search_engine import search_pattern
        provisional_id = self.get_collection_name(provisional=True)
        normal_id = self.get_collection_name(provisional=False)
        own_identifiers = (provisional_id, normal_id)

        def never_match(code, val):
            # Test callback: always False.
            return False

        def keep_as_is(code, val):
            # Replace callback: hands the subfield back unchanged.
            return (code, val)

        def is_foreign_subfield(code, val):
            # Include callback: False only for an 'a' subfield holding one
            # of this collection's identifiers.
            return code != 'a' or val not in own_identifiers

        query = "980__a:%s OR 980__a:%s" % (normal_id, provisional_id)
        modified = [
            self._modify_record(recid, never_match, keep_as_is,
                                is_foreign_subfield)
            for recid in search_pattern(p=query)
        ]

        self._upload_collection(modified)
Beispiel #7
0
 def _run_query(self):
     """Query database for records based on rule configuration.

     The keyword arguments for ``search_pattern`` are built from
     ``_query_filters()`` and ``_query_options()``. If the result reports
     itself as infinite, the query is repeated with the filters obtained
     from ``_query_filters(force_finiteness=True)``.

     @return: the (finite) result of ``search_pattern``.
     @raises AssertionError: if the result is still infinite after the
         second attempt.
     """
     query_kwargs = {}
     query_kwargs.update(self._query_filters())
     query_kwargs.update(self._query_options())
     result = search_pattern(**query_kwargs)
     if result.is_infinite():
         # Retry with filters that are meant to force a finite result.
         query_kwargs.update(self._query_filters(force_finiteness=True))
         result = search_pattern(**query_kwargs)
     # Each tuple item is one line of the message; a missing comma would
     # silently concatenate adjacent string literals.
     assert not result.is_infinite(), '\n'.join((
         '',
         '`search_pattern` now works,',
         'alas; you must now amend',
         'my delicate workarounds.',
     ))
     return result
Beispiel #8
0
 def get_value_recids(self, value):
     """Search this field (``self.name``) for the exact, quoted value.

     Unicode values are encoded to UTF-8 before the pattern is built.

     @return: the result of ``search_pattern`` for the quoted value.
     """
     from invenio.legacy.search_engine import search_pattern
     if isinstance(value, unicode):
         value = value.encode('utf8')
     quoted_pattern = '"%s"' % (str(value),)
     return search_pattern(p=quoted_pattern, f=self.name)
Beispiel #9
0
def search_unit(query, f, m, wl=None):
    """Search in fulltext.

    When the indexer configured for 'fulltext' is SOLR or XAPIAN and the
    matching ``CFG_<INDEXER>_ENABLED`` setting is true, the query is
    delegated to that indexer and its result is returned. Otherwise phrase
    and regexp queries fall back to a word search, and everything else goes
    to ``search_unit_in_bibwords``.

    @param query: the pattern to search for.
    @param f: the field to search in.
    @param m: matching type; 'a' and 'r' are treated as phrase/regexp.
    @param wl: passed on to ``search_unit_in_bibwords``.
    @return: the hits found; an empty intbitset when the external indexer
        raises.
    """
    from invenio.legacy.search_engine import (search_unit_in_bibwords,
                                              search_pattern)
    from invenio.legacy.miscutil.solrutils_bibindex_searcher import (
        solr_get_bitset)
    from invenio.legacy.miscutil.xapianutils_bibindex_searcher import (
        xapian_get_bitset)
    from ...utils import get_idx_indexer

    def fix(p):
        # Quote phrase/regexp queries for the external indexer.
        if m and (m == 'a' or m == 'r'):  # phrase/regexp query
            if p.startswith('%') and p.endswith('%'):
                p = p[1:-1]  # fix for partial phrase
            p = '"' + p + '"'
        return p

    indexers = {
        'SOLR': solr_get_bitset,
        'XAPIAN': xapian_get_bitset,
    }
    indexer = get_idx_indexer('fulltext')
    if indexer in indexers and \
            current_app.config.get('CFG_{}_ENABLED'.format(indexer), False):
        try:
            # The indexer's answer must be returned; without the return the
            # result was discarded and the bibwords search ran instead.
            return indexers[indexer](fix(query), f, m)
        except Exception:
            # Best effort: log the failure and report no hits.
            current_app.logger.exception("Fulltext search is broken.")
            return intbitset()
    elif m == 'a' or m == 'r':
        # FIXME: workaround for not having phrase index yet
        return search_pattern(p=query, f=f, m='w')
    # FIXME raise ContinueSearch(query, f, m, wl)
    return search_unit_in_bibwords(query, f, wl=wl)
Beispiel #10
0
def query_records(params):
    """Produce record IDs from given query parameters.

    By passing the appropriate CLI options, we can query here for additional
    records.

    @param params: mapping providing 'field', 'collection' and 'pattern'
        (plus 'matching', read only when no collection is set).
    @return: the record IDs found, or an empty intbitset when field,
        collection and pattern are all unset.
    """
    write_message("Querying database (records query)...")
    nothing_requested = not (params['field'] or params['collection']
                             or params['pattern'])
    if nothing_requested:
        return intbitset()

    if not params['collection']:
        # search_pattern() is used whenever possible, as it can search
        # even in private collections
        return search_pattern(p=params['pattern'],
                              f=params['field'],
                              m=params['matching'])

    # the '-c' argument is not supported by search_pattern(), so go
    # through perform_request_search instead
    found_ids = perform_request_search(req=None,
                                       of='id',
                                       c=params['collection'],
                                       p=params['pattern'],
                                       f=params['field'])
    return intbitset(found_ids)
def bst_openaire_altmetric():
    """
    Attach Altmetric identifiers to the records found by the 0247_a search.

    For each record matched by the "0->Z" search in 0247_a, the first
    0247_a value is passed to ``Altmetric().doi()``. When that call returns
    a result, a 035 field (subfield a = the result's 'altmetric_id',
    subfield 9 = 'Altmetric') is uploaded in 'correct' mode. Records whose
    035__9 values already include 'Altmetric' are skipped.
    AltmetricHTTPException is registered via ``register_exception`` and the
    loop moves on to the next record.
    """
    recids = search_pattern(p="0->Z", f="0247_a")
    a = Altmetric()

    for recid in recids:
        try:
            # Check if we already have an Altmetric id. The field values are
            # expected to be plain strings (TODO confirm), so the test is on
            # the string itself, not on a one-element list.
            sysno_inst = get_fieldvalues(recid, "035__9")
            if 'Altmetric' in sysno_inst:
                continue

            doi_values = get_fieldvalues(recid, "0247_a")
            if not doi_values:
                # Nothing to look up for this record.
                continue
            json_res = a.doi(doi_values[0])

            if json_res:
                # Build the correction record only when there is something
                # to upload.
                rec = {}
                record_add_field(rec, "001", controlfield_value=str(recid))
                record_add_field(rec,
                                 '035',
                                 subfields=[('a',
                                             str(json_res['altmetric_id'])),
                                            ('9', 'Altmetric')])
                bibupload(rec, opt_mode='correct')
        except AltmetricHTTPException as e:
            register_exception(prefix='Altmetric error (status code %s): %s' %
                               (e.status_code, str(e)),
                               alert_admin=False)
def format_element(bfo, newline=False):
    """Print link to single proceeding if the proceeding exists.

    @param bfo: object providing ``fields('980')`` and ``field('773__w')``.
    @param newline: when True, '<br/>' is appended to the link.
    @return: an HTML link to the first matching proceedings record, or the
        empty string when this record is itself a proceeding, has no CNUM,
        or no proceedings record is found.
    """
    # A record that is itself a proceeding gets no link: it would only
    # point back to the same record.
    collection_values = [field['a'].lower()
                         for field in bfo.fields('980')
                         if 'a' in field]
    if "proceedings" in collection_values:
        return ''

    cnum = str(bfo.field('773__w'))
    if not cnum:
        # no CNUM, nothing to link to
        return ""
    # some CNUMs have "/" instead of "-" as a separator, so we change them
    cnum = cnum.replace("/", "-")
    query = "773__w:" + cnum + " and 980__a:proceedings"
    hits = search_pattern(p=query)
    if not hits:
        return ""

    recID = list(hits)[0]
    if recID == '':
        return ""
    link = '<a href="/record/' + str(recID) + '">Proceedings</a>'
    if newline:
        link += '<br/>'
    return link
Beispiel #13
0
 def get_value_recids(self, value):
     """Look up the records whose field ``self.name`` holds the value.

     The value is UTF-8 encoded when it is unicode, converted with ``str``
     and wrapped in double quotes before being searched.

     @return: whatever ``search_pattern`` returns for that quoted pattern.
     """
     from invenio.legacy.search_engine import search_pattern
     if isinstance(value, unicode):
         value = value.encode('utf8')
     text = str(value)
     return search_pattern(p='"' + text + '"', f=self.name)
def bst_openaire_altmetric():
    """
    Attach Altmetric identifiers to the records found by the 0247_a search.

    Every record matched by the "0->Z" search in 0247_a has its first
    0247_a value looked up through ``Altmetric().doi()``. If a result comes
    back, a 035 field (subfield a = the result's 'altmetric_id', subfield
    9 = 'Altmetric') is uploaded in 'correct' mode. Records whose 035__9
    values already contain 'Altmetric' are skipped. An
    AltmetricHTTPException is registered via ``register_exception`` and
    processing continues with the next record.
    """
    recids = search_pattern(p="0->Z", f="0247_a")
    a = Altmetric()

    for recid in recids:
        try:
            # Check if we already have an Altmetric id. Field values are
            # expected to be plain strings (TODO confirm), so compare with
            # the string rather than with a one-element list.
            sysno_inst = get_fieldvalues(recid, "035__9")
            if 'Altmetric' in sysno_inst:
                continue

            doi_values = get_fieldvalues(recid, "0247_a")
            if not doi_values:
                # No value available for this record; skip it.
                continue
            json_res = a.doi(doi_values[0])

            if json_res:
                # The correction record is only needed when uploading.
                rec = {}
                record_add_field(rec, "001", controlfield_value=str(recid))
                record_add_field(rec, '035', subfields=[
                    ('a', str(json_res['altmetric_id'])), ('9', 'Altmetric')]
                )
                bibupload(rec, opt_mode='correct')
        except AltmetricHTTPException as e:
            register_exception(prefix='Altmetric error (status code %s): %s' %
                               (e.status_code, str(e)), alert_admin=False)
Beispiel #15
0
def render_citesummary_prologue(req, ln, recids, collections, search_patterns,
                                searchfield, coll_recids):
    """Write the cite-summary prologue to the request.

    The prologue comes from ``tmpl_citesummary_prologue`` and is given the
    subset of recids that belongs to 'collection:citeable' together with
    the total number of recids.
    """
    citeable_hits = search_pattern(p='collection:citeable')
    req.write(
        websearch_templates.tmpl_citesummary_prologue(
            coll_recids,
            collections,
            search_patterns,
            searchfield,
            recids & citeable_hits,
            len(recids),
            ln))
Beispiel #16
0
def get_existing_records_for_reportnumber(reportnum):
    """Given a report number, return a list of recids of real (live) records
       that are associated with it.
       That is to say, if a record does not exist (perhaps it was deleted,
       for example) its recid will not be returned in the list.

       @param reportnum: the report number for which recids are to be returned.
       @type reportnum: string
       @return: list of recids.
       @rtype: list
       @note: If reportnum is not found through the "reportnumber" field,
           the function searches each tag returned by
           get_field_tags("reportnumber") instead, so that the record does
           not have to be phrase-indexed.
    """
    ## First attempt: search through the "reportnumber" field
    candidates = list(search_pattern(req=None,
                                     p=reportnum,
                                     f="reportnumber",
                                     m="e"))
    if not candidates:
        # Maybe the record has not been indexed yet? (look tag by tag)
        for tag in get_field_tags("reportnumber"):
            tag_hits = list(search_pattern(req=None,
                                           p=reportnum,
                                           f=tag,
                                           m="e"))
            candidates.extend(tag_hits)

        candidates = dict.fromkeys(candidates).keys() # Remove duplicates

    ## Keep only the recids for which record_exists() reports 1, i.e. the
    ## live records. The caller can tell from the length of the list whether
    ## zero, one or several records carry this report number.
    return [recid for recid in candidates if record_exists(recid) == 1]
Beispiel #17
0
def get_recids(recids, collections):
    """Compute recids for each column.

    @param recids: the base set of record IDs.
    @param collections: iterable of (name, definition) pairs.
    @return: dict mapping each name to recids itself when the definition is
        empty, otherwise to recids intersected with the records matching
        the definition.
    """
    per_column = {}
    for name, definition in collections:
        if definition:
            per_column[name] = recids & search_pattern(p=definition)
        else:
            per_column[name] = recids
    return per_column
Beispiel #18
0
def search_unit(query, f, m, wl=None):
    """Search in fulltext.

    Matching types 'a' and 'r' are served by a word search through
    ``search_pattern``; everything else goes to ``search_unit_in_bibwords``.
    """
    from invenio.legacy.search_engine import (search_unit_in_bibwords,
                                              search_pattern)
    if m not in ('a', 'r'):
        # FIXME raise ContinueSearch(query, f, m, wl)
        return search_unit_in_bibwords(query, f, wl=wl)
    # FIXME: workaround for not having phrase index yet
    return search_pattern(p=query, f=f, m='w')
Beispiel #19
0
def get_recids(recids, collections):
    """Compute recids for each column.

    Each (name, definition) pair of collections yields one entry: recids
    unchanged when the definition is empty, otherwise recids restricted to
    the records matching the definition.
    """
    def restrict(colldef):
        # An empty definition means "no restriction".
        if not colldef:
            return recids
        return recids & search_pattern(p=colldef)

    return dict((coll, restrict(colldef)) for coll, colldef in collections)
Beispiel #20
0
def search_unit(query, f, m, wl=None):
    """Search in fulltext.

    Phrase ('a') and regexp ('r') matching is replaced by a word search;
    any other matching type is delegated to ``search_unit_in_bibwords``.
    """
    from invenio.legacy.search_engine import (
        search_unit_in_bibwords, search_pattern
    )
    is_phrase_or_regexp = m in ('a', 'r')
    if is_phrase_or_regexp:
        # FIXME: workaround for not having phrase index yet
        return search_pattern(p=query, f=f, m='w')
    # FIXME raise ContinueSearch(query, f, m, wl)
    return search_unit_in_bibwords(query, f, wl=wl)
Beispiel #21
0
def citation(rank_method_code, related_to, hitset, rank_limit_relevance, verbose):
    """Sort records by number of citations.

    When related_to is given, only the records of hitset matching
    'refersto:<pattern>' for at least one of its patterns are ranked.
    """
    if not related_to:
        return rank_by_citations(hitset, verbose)

    from invenio.legacy.search_engine import search_pattern
    selected = intbitset()
    for pattern in related_to:
        referring = intbitset(search_pattern(p='refersto:%s' % pattern))
        selected |= hitset & referring
    return rank_by_citations(selected, verbose)
Beispiel #22
0
def citation(rank_method_code, related_to, hitset, rank_limit_relevance, verbose):
    """Sort records by number of citations.

    With a non-empty related_to, hitset is first narrowed to the records
    found by a 'refersto:<pattern>' search for any of the given patterns.
    """
    to_rank = hitset
    if related_to:
        from invenio.legacy.search_engine import search_pattern
        to_rank = intbitset()
        for related_pattern in related_to:
            query = 'refersto:%s' % related_pattern
            to_rank |= hitset & intbitset(search_pattern(p=query))
    return rank_by_citations(to_rank, verbose)
Beispiel #23
0
def is_authority_record(recID):
    """
    Tell whether recID is among the records matching 980__a:AUTHORITY.

    @param recID: the record id to check
    @type recID: int

    @return: True or False
    """
    # low-level: don't use possibly indexed logical fields !
    authority_recids = search_pattern(p='980__a:AUTHORITY')
    return recID in authority_recids
Beispiel #24
0
def oai_get_recid(identifier):
    """Returns the recid corresponding to the OAI identifier.

    Only records that can be displayed to current_user are considered, and
    the first of them for which record_exists() is positive is returned.
    Returns None if the identifier is empty or no such record is found."""
    if not identifier:
        return None
    recids = search_pattern(p=identifier, f=CFG_OAI_ID_FIELD, m='e', ap=-9)
    if not recids:
        return None
    for recid in get_records_that_can_be_displayed(current_user, recids):
        if record_exists(recid) > 0:
            return recid
    return None
Beispiel #25
0
def render_citesummary_prologue(req, ln, recids, collections, search_patterns,
                                                     searchfield, coll_recids):
    """Write the cite-summary prologue to the request.

    The template receives the recids restricted to 'collection:citeable'
    and the total count of recids, and its output is written to req.
    """
    template_args = (
        coll_recids,
        collections,
        search_patterns,
        searchfield,
        recids & search_pattern(p='collection:citeable'),
        len(recids),
        ln,
    )
    req.write(websearch_templates.tmpl_citesummary_prologue(*template_args))
Beispiel #26
0
def create_update_jobs_by_search(pattern,
                                 batch_template_file,
                                 job_directory=CFG_BIBENCODE_DAEMON_DIR_NEWJOBS):
    """ Creates the job description files to update all records that fit a
        search pattern. Be aware of the search limitations!
    @param pattern: The pattern to search for
    @type pattern: string
    @param batch_template_file: fullpath to the template for the update
    @type batch_template_file: string
    @param job_directory: fullpath to the directory storing the job files
    @type job_directory: string
    @return: whatever create_update_jobs_by_recids returns for the matches
    """
    matching_recids = search_pattern(p=pattern)
    return create_update_jobs_by_recids(matching_recids,
                                        batch_template_file,
                                        job_directory)
Beispiel #27
0
def get_recids_matching_query(p, f, config, m="e"):
    """Return set of recIDs matching query for pattern p in field f.

    When the rank method's configuration lists collections, the hits are
    limited to the records cached for those collections; otherwise the
    cached deleted records are removed from the hits.

    @param p: pattern to search for
    @type p: unicode string
    @param f: field to search in
    @type f: unicode string
    @param config: bibrank configuration
    @type config: dict
    @param m: type of matching (usually 'e' for exact or 'r' for regexp)
    @type m: string
    """
    p = p.encode("utf-8")
    f = f.encode("utf-8")
    method_function = config.get("rank_method", "function")
    collections = config.get(method_function, "collections")
    hits = search_pattern(p=p, f=f, m=m)
    if collections:
        return hits & recids_cache(collections)
    return hits - deleted_recids_cache()
Beispiel #28
0
def get_recids_matching_query(p, f, config, m='e'):
    """Return set of recIDs matching query for pattern p in field f.

    The hits are intersected with the cached records of the configured
    collections when there are any, and otherwise stripped of the cached
    deleted records.

    @param p: pattern to search for
    @type p: unicode string
    @param f: field to search in
    @type f: unicode string
    @param config: bibrank configuration
    @type config: dict
    @param m: type of matching (usually 'e' for exact or 'r' for regexp)
    @type m: string
    """
    encoded_pattern = p.encode('utf-8')
    encoded_field = f.encode('utf-8')
    function = config.get("rank_method", "function")
    collections = config.get(function, 'collections')
    matched = search_pattern(p=encoded_pattern, f=encoded_field, m=m)
    if not collections:
        return matched - deleted_recids_cache()
    return matched & recids_cache(collections)
Beispiel #29
0
def oai_get_recid(identifier):
    """Returns the recid corresponding to the OAI identifier.

    Among the matching records that can be displayed to current_user, the
    first one for which record_exists() is positive is returned. Returns
    None if the identifier is empty or no such record matches."""
    found = None
    if identifier:
        recids = search_pattern(p=identifier, f=CFG_OAI_ID_FIELD, m='e', ap=-9)
        if recids:
            visible = get_records_that_can_be_displayed(current_user, recids)
            for candidate in visible:
                if record_exists(candidate) > 0:
                    found = candidate
                    break
    return found
Beispiel #30
0
def get_recids_for_rules(rules):
    """
    Generates the final list of record IDs to load.

    For each rule the records come from its "filter_pattern" query (run
    through perform_request_search when "filter_collection" names
    collections, through search_pattern otherwise). They are then
    intersected either with the "record_ids" task option, when set, or with
    the records modified since the rule's last run.

    @param rules dict of rules {rule_name: rule_dict}
    @type rules: dict of rules

    @return dict {rule_name: array of record IDs}
    """
    override_record_ids = task_get_option("record_ids")
    recids = {}
    for rule_name, rule in rules.iteritems():
        if "filter_pattern" not in rule:
            # No pattern given: start from intbitset(trailing_bits=True),
            # presumably the set of all IDs - TODO confirm.
            result = intbitset(trailing_bits=True)
        else:
            query = rule["filter_pattern"]
            collections = None
            if "filter_collection" in rule:
                collections = rule["filter_collection"].split()
            write_message("Performing given search query: '%s'" % query)
            search_kwargs = dict(p=query,
                                 wl=rule.get('filter_limit', 0),
                                 f=rule.get('filter_field', None))
            if collections:
                result = perform_request_search(of='intbitset',
                                                c=collections,
                                                **search_kwargs)
            else:
                result = search_pattern(**search_kwargs)

        if override_record_ids is None:
            last_run = get_rule_lastrun(rule_name)
            modified_recids = get_modified_records_since(last_run)
            if "consider_deleted_records" not in rule:
                # Leave out records flagged as deleted (and, on CERN
                # sites, the dummy ones too).
                modified_recids -= search_unit_in_bibxxx(p='DELETED',
                                                         f='980__%',
                                                         type='e')
                if CFG_CERN_SITE:
                    modified_recids -= search_unit_in_bibxxx(p='DUMMY',
                                                             f='980__%',
                                                             type='e')
            result.intersection_update(modified_recids)
        else:
            result.intersection_update(override_record_ids)
        recids[rule_name] = result

    return recids
Beispiel #31
0
 def is_empty(self):
     """Tell whether this erasable object still has no records.

     Returns False at once when ``self.eresable`` is not set. Otherwise the
     records matching '980__:<collection name>' are searched: with no hit
     the result is True; with hits ``self.eresable`` is cleared, the
     session is committed and the result is False.
     """
     if not self.eresable:
         return False
     # Ensure instruments has not records.
     from invenio.legacy.search_engine import search_pattern
     query = '980__:%s' % self.get_collection_name()
     if len(search_pattern(p=query)) == 0:
         return True
     self.eresable = False
     db.session.commit()
     return False
Beispiel #32
0
 def is_empty(self):
     """Report whether this erasable object has no records attached.

     When ``self.eresable`` is false the answer is False right away. When
     it is true, a search for '980__:<collection name>' decides: no hits
     means True, any hit clears ``self.eresable``, commits the session and
     means False.
     """
     empty = False
     if self.eresable:
         # Ensure project has not records.
         from invenio.legacy.search_engine import search_pattern
         recids = search_pattern(p='980__:%s' % self.get_collection_name())
         empty = len(recids) == 0
         if not empty:
             self.eresable = False
             db.session.commit()
     return empty
Beispiel #33
0
def search_unit(query, f, m, wl=None):
    """Search for records referred to by matched records.

    @param query: either an intbitset of record IDs, used as is, or a
        pattern that is first resolved through ``search_pattern``.
    @return: the result of ``get_refersto_hitset`` for those records, or an
        empty intbitset when query is empty.
    """
    from invenio.legacy.search_engine import search_pattern
    from invenio.legacy.bibrank.citation_searcher import get_refersto_hitset

    if not query:
        return intbitset([])
    matched = query if isinstance(query, intbitset) else search_pattern(p=query)
    return get_refersto_hitset(matched)
Beispiel #34
0
def get_recid_and_reportnumber(recid=None, reportnumber=None, keep_original_reportnumber=True):
    """
    Given at least a recid or a reportnumber, this function will look into
    the system for the matching record and will return a normalized
    recid and the primary reportnumber.

    @param recid: record ID; takes priority over reportnumber when given.
    @param reportnumber: report number to look up when no recid is given.
    @param keep_original_reportnumber: when True and the lookup went through
        reportnumber, return that same reportnumber instead of the primary
        one stored in the record.
    @return: tuple (recid, reportnumber).
    @raises ValueError: in case of no record matched, more than one record
        matched, a missing primary report number, or neither argument given.
    """
    if recid:
        ## Recid specified receives priority.
        recid = int(recid)
        values = get_fieldvalues(recid, CFG_PRIMARY_REPORTNUMBER)
        if values:
            ## Let's take whatever reportnumber is stored in the matching record
            reportnumber = values[0]
            return recid, reportnumber
        else:
            raise ValueError("The record %s does not have a primary report number" % recid)
    elif reportnumber:
        ## Ok reportnumber specified, let's better try 1st with primary and then
        ## with other reportnumber
        recids = search_pattern(p='%s:"%s"' % (CFG_PRIMARY_REPORTNUMBER, reportnumber))
        if not recids:
            ## Not found as primary
            recids = search_pattern(p='reportnumber:"%s"' % reportnumber)
        if len(recids) > 1:
            ## Record IDs are integers: they must be converted to strings
            ## before joining, otherwise join() raises TypeError and hides
            ## the intended ValueError.
            matched = ', '.join([str(i) for i in recids])
            raise ValueError('More than one record matches the reportnumber "%s": %s' % (reportnumber, matched))
        elif len(recids) == 1:
            recid = list(recids)[0]
            if keep_original_reportnumber:
                return recid, reportnumber
            else:
                reportnumbers = get_fieldvalues(recid, CFG_PRIMARY_REPORTNUMBER)
                if not reportnumbers:
                    raise ValueError("The matched record %s does not have a primary report number" % recid)
                return recid, reportnumbers[0]
        else:
            raise ValueError("No records are matched by the provided reportnumber: %s" % reportnumber)
    raise ValueError("At least the recid or the reportnumber must be specified")
Beispiel #35
0
def oai_get_recid(identifier):
    """Returns the recid corresponding to the OAI identifier. Prefer a non deleted
    record if multiple recids matches but some of them are deleted (e.g. in
    case of merging). Restricted records are never returned. Returns None if
    no record matches."""
    if not identifier:
        return None
    recids = search_pattern(p=identifier, f=CFG_OAI_ID_FIELD, m='e', ap=-9)
    if not recids:
        return None
    restricted = get_all_restricted_recids()
    last_seen = None
    for last_seen in recids:
        if record_exists(last_seen) > 0 and last_seen not in restricted:
            return last_seen
    # No existing, unrestricted record was found: fall back to the last
    # matching recid, provided it is not restricted.
    if last_seen not in restricted:
        return last_seen
    return None
Beispiel #36
0
def oai_get_recid(identifier):
    """Returns the recid corresponding to the OAI identifier. Prefer a non deleted
    record if multiple recids matches but some of them are deleted (e.g. in
    case of merging). Records listed as restricted are skipped. Returns None
    if no record matches."""
    chosen = None
    if identifier:
        recids = search_pattern(p=identifier, f=CFG_OAI_ID_FIELD, m='e', ap=-9)
        if recids:
            restricted_recids = get_all_restricted_recids()
            for recid in recids:
                if record_exists(recid) > 0 and recid not in restricted_recids:
                    return recid
            # Fallback: the last recid seen, unless it is restricted.
            if recid not in restricted_recids:
                chosen = recid
    return chosen
Beispiel #37
0
def get_recids_for_rules(rules):
    """
    Generates the final list of record IDs to load.

    Each rule's records are taken from its "filter_pattern" query, using
    perform_request_search when "filter_collection" lists collections and
    search_pattern otherwise. The result is intersected with the
    "record_ids" task option when it is set, and otherwise with the records
    modified since the last run of the rule.

    @param rules dict of rules {rule_name: rule_dict}
    @type rules: dict of rules

    @return dict {rule_name: array of record IDs}
    """
    override_record_ids = task_get_option("record_ids")
    recids = {}
    for rule_name, rule in rules.iteritems():
        if "filter_pattern" not in rule:
            # Without a pattern the starting point is
            # intbitset(trailing_bits=True), presumably every ID - TODO
            # confirm.
            result = intbitset(trailing_bits=True)
        else:
            query = rule["filter_pattern"]
            if "filter_collection" in rule:
                collections = rule["filter_collection"].split()
            else:
                collections = None
            write_message("Performing given search query: '%s'" % query)
            limit = rule.get('filter_limit', 0)
            field = rule.get('filter_field', None)
            if collections:
                result = perform_request_search(
                    p=query, of='intbitset', wl=limit, f=field, c=collections
                )
            else:
                result = search_pattern(p=query, wl=limit, f=field)

        if override_record_ids is None:
            modified_recids = get_modified_records_since(
                get_rule_lastrun(rule_name)
            )
            if "consider_deleted_records" not in rule:
                # Drop records flagged as deleted (and dummy records on
                # CERN sites).
                modified_recids -= search_unit_in_bibxxx(p='DELETED', f='980__%', m='e')
                if CFG_CERN_SITE:
                    modified_recids -= search_unit_in_bibxxx(p='DUMMY', f='980__%', m='e')
            result.intersection_update(modified_recids)
        else:
            result.intersection_update(override_record_ids)
        recids[rule_name] = result

    return recids
Beispiel #38
0
def get_unlinked_records(recid_a, marc_for_b, display_in_b, upload_mode, recids_and_rns_b):
    """
    List the records whose link to recid_a has to be dropped.

    Only the 'correct' upload mode can unlink records; for any other mode
    an empty list is returned. In 'correct' mode the records currently
    found by searching for recid_a in the 'w' subfield of the relation tag
    (derived from marc_for_b) are compared with recids_and_rns_b, and those
    missing from recids_and_rns_b are returned.

    @param recid_a: record ID the other records are linked to
    @param marc_for_b: MARC specification of the relation, passed to
        _prepare_marc
    @param display_in_b: selects "0" (true) or "1" (false) as the value
        handed to _prepare_marc
    @param upload_mode: upload mode; only 'correct' triggers the lookup
    @param recids_and_rns_b: iterable of (recid, reportnumber) pairs that
        must stay linked

    @return: list of record IDs that are linked now but should not be
    """
    if upload_mode != 'correct':
        return []

    indicator_value = "0" if display_in_b else "1"
    tag, ind1, ind2 = _prepare_marc(
        marc_for_b, CFG_OTHER_RELATIONSHIP_ENTRY, indicator_value)
    currently_linked = search_pattern(
        p=str(recid_a), m='e', f=tag + ind1 + ind2 + 'w')
    still_wanted = [wanted_recid for wanted_recid, _rn in recids_and_rns_b]

    stale = []
    for linked_recid in currently_linked:
        if linked_recid not in still_wanted:
            stale.append(linked_recid)
    return stale
Beispiel #39
0
def openaire_altmetric_check_all():
    """
    Queue Altmetric updates for every record returned by the DOI search.

    The matching record IDs are cut into slices of at most MAX_RECORDS and
    one immutable openaire_altmetric_update signature is built per slice.
    The signatures are chained instead of being run in parallel, so that
    Altmetric is not overloaded.
    """
    # Records with DOI
    doi_recids = search_pattern(p="0->Z", f="0247_a")
    total = len(doi_recids)
    logger.debug("Checking Altmetric for %s records" % total)

    # Immutable signatures (.si) - see
    # http://docs.celeryproject.org/en/latest/userguide/canvas.html
    batches = [
        openaire_altmetric_update.si(list(doi_recids[start:start + MAX_RECORDS]))
        for start in xrange(0, total, MAX_RECORDS)
    ]

    chain(*batches).apply_async()
Beispiel #40
0
def is_record_matching_pattern(record_pattern, recid, curdir):
    """Given a pattern and a recid, returns True if the recid
       can be retrieved using the record_pattern. This enforces
       restrictions on which type of documents can be modified via a
       certain submission interface.
       The record_pattern can be anything that can be used by
       search_pattern to search for.
       Also, one can use variables stored locally, like <comboDEMOJRN>
       to denote the category or subcategory.
       Ex:
           reportnumber:DEMO-<comboDEMOJRN>-*
           collection:ATLANTISTIMESNEWS
           reportnumber:DEMO-<comboDEMOJRN>-* | collection:ATLANTISTIMESNEWS
       As a note, you can test your pattern, using the search engine
       and see if it retrieves the expected results.

       @param record_pattern: search pattern, possibly containing
           <variable> placeholders; an empty/None pattern disables the check
       @param recid: record ID to look for in the search result
       @param curdir: directory holding one file per local variable, named
           after the variable
       @return: True if no pattern is configured or if recid is part of the
           records matched by the expanded pattern, False otherwise
       @raise InvenioWebSubmitFunctionError: if the file of a local variable
           cannot be read, or if '<' or '>' remain in the pattern after the
           substitution (nested tags are not supported)
    """
    # if no pattern is configured, then do not do any checks
    if not record_pattern:
        return True
    # check for local variables embedded in the pattern (ex: <comboXYZ>)
    # and replace them with the value read from the corresponding file
    final_record_pattern = record_pattern
    for local_var in re.findall(r'<\w+>', record_pattern):
        file_name = local_var[1:-1].strip()
        try:
            # the context manager closes the file even if read() fails
            with open("%s/%s" % (curdir, file_name), "r") as variable_file:
                local_variable_content = variable_file.read().strip()
        except IOError:
            msg = "Record pattern badly defined. There is no local file: %s." % file_name
            raise InvenioWebSubmitFunctionError(msg)
        final_record_pattern = final_record_pattern.replace(local_var, local_variable_content)
    # check to see if nested <> tags were used, in this case throw an error -not supported
    if '<' in final_record_pattern or '>' in final_record_pattern:
        # The message has no placeholder, so it must not be %-formatted:
        # doing so raised TypeError (or UnboundLocalError when no local
        # variable had been found) instead of the intended error.
        msg = "Record pattern badly defined -> the local variables tags should be revised."
        raise InvenioWebSubmitFunctionError(msg)
    # get the list of records that match the final record pattern
    reclist = list(search_pattern(p=final_record_pattern))
    # check to see if our recid is part of this list or not
    return recid in reclist
Beispiel #41
0
def search_unit(query, f, m, wl=None):
    """Search for records in citation index.

    @param query: search pattern, or an intbitset of record IDs that is
        used as is
    @param f: not used by this implementation
    @param m: not used by this implementation
    @param wl: not used by this implementation
    @return: result of get_citedby_hitset for the matched records, or an
        empty intbitset when the query is empty or matches nothing
    """
    from invenio.legacy.search_engine import search_pattern
    from invenio.legacy.bibrank.citation_searcher import get_citedby_hitset

    if not query:
        return intbitset([])

    # A ready-made hit set is taken as is; anything else is searched for.
    matched = query if isinstance(query, intbitset) else search_pattern(p=query)
    if not matched:
        return intbitset([])
    return get_citedby_hitset(matched)
Beispiel #42
0
def search_unit(query, f, m, wl=None):
    """Search for records in citation index.

    @param query: search pattern, or an intbitset that is used directly as
        the set of matched records
    @param f: accepted but not read by this function
    @param m: accepted but not read by this function
    @param wl: accepted but not read by this function
    @return: get_citedby_hitset() of the matched records; an empty
        intbitset if there is no query or no matched record
    """
    from invenio.legacy.search_engine import search_pattern
    from invenio.legacy.bibrank.citation_searcher import get_citedby_hitset

    nothing_found = True
    if query:
        if isinstance(query, intbitset):
            seed_recids = query
        else:
            seed_recids = search_pattern(p=query)
        nothing_found = not seed_recids

    if nothing_found:
        return intbitset([])
    return get_citedby_hitset(seed_recids)
Beispiel #43
0
def create_update_jobs_by_search(pattern,
                                 batch_template_file,
                                 job_directory=CFG_BIBENCODE_DAEMON_DIR_NEWJOBS
                                 ):
    """ Creates the job description files to update all records that fit a
        search pattern. Be aware of the search limitations!
    @param pattern: The pattern to search for
    @type pattern: string
    @param batch_template_file: fullpath to the template for the update
    @type batch_template_file: string
    @param job_directory: fullpath to the directory storing the job files
    @type job_directory: string
    @return: whatever create_update_jobs_by_recids returns for the matched
        records
    """
    # The search result is handed over unchanged to the recid-based variant.
    return create_update_jobs_by_recids(search_pattern(p=pattern),
                                        batch_template_file,
                                        job_directory)
Beispiel #44
0
def openaire_altmetric_check_all():
    """
    Retrieve Altmetric information for all records.

    Records are looked up with the DOI search below, grouped into batches
    of at most MAX_RECORDS IDs, and one openaire_altmetric_update subtask
    is created per batch. The subtasks are executed as a chain, i.e. not in
    parallel, to not overload Altmetric.
    """
    # Records with DOI
    recids_with_doi = search_pattern(p="0->Z", f="0247_a")
    record_count = len(recids_with_doi)

    message = "Checking Altmetric for %s records" % record_count
    logger.debug(message)

    update_signatures = []
    for offset in xrange(0, record_count, MAX_RECORDS):
        batch = list(recids_with_doi[offset:offset + MAX_RECORDS])
        # .si() creates an immutable subtask - see
        # http://docs.celeryproject.org/en/latest/userguide/canvas.html
        update_signatures.append(openaire_altmetric_update.si(batch))

    chain(*update_signatures).apply_async()
def _get_breaking_news(lang, journal_name):
    """
    Gets the 'Breaking News' articles that are currently active according to
    start and end dates.
    """
    # CERN Bulletin only
    if not journal_name.lower() == 'cernbulletin':
        return ''
    # Look for active breaking news
    breaking_news_recids = [recid for recid in search_pattern(p='980__a:BULLETINBREAKING') \
                            if record_exists(recid) == 1]
    today = time.mktime(time.localtime())
    breaking_news = ""
    for recid in breaking_news_recids:
        temp_rec = BibFormatObject(recid)
        try:
            end_date = time.mktime(time.strptime(temp_rec.field("925__b"),
                                                 "%m/%d/%Y"))
        except:
            end_date = time.mktime(time.strptime("01/01/1970", "%m/%d/%Y"))
        if end_date < today:
            continue
        try:
            start_date = time.mktime(time.strptime(temp_rec.field("925__a"),
                                                   "%m/%d/%Y"))
        except:
            start_date = time.mktime(time.strptime("01/01/2050", "%m/%d/%Y"))
        if start_date > today:
            continue
        publish_date = temp_rec.field("269__c")
        if lang == 'fr':
            title = temp_rec.field("246_1a")
        else:
            title = temp_rec.field("245__a")
        breaking_news += '''
<h2 class="%s">%s<br/>
    <strong>
        <a href="%s/journal/popup?name=%s&amp;type=breaking_news&amp;record=%s&amp;ln=%s" target="_blank">%s</a>
    </strong>
</h2>
''' % ("", publish_date, CFG_SITE_URL, journal_name, recid, lang, title)
    if breaking_news:
        breaking_news = '<li>%s</li>' % breaking_news

    return breaking_news
def _get_breaking_news(lang, journal_name):
    """
    Gets the 'Breaking News' articles that are currently active according to
    start and end dates.
    """
    # CERN Bulletin only
    if not journal_name.lower() == 'cernbulletin':
        return ''
    # Look for active breaking news
    breaking_news_recids = [recid for recid in search_pattern(p='980__a:BULLETINBREAKING') \
                            if record_exists(recid) == 1]
    today = time.mktime(time.localtime())
    breaking_news = ""
    for recid in breaking_news_recids:
        temp_rec = BibFormatObject(recid)
        try:
            end_date = time.mktime(time.strptime(temp_rec.field("925__b"),
                                                 "%m/%d/%Y"))
        except:
            end_date = time.mktime(time.strptime("01/01/1970", "%m/%d/%Y"))
        if end_date < today:
            continue
        try:
            start_date = time.mktime(time.strptime(temp_rec.field("925__a"),
                                                   "%m/%d/%Y"))
        except:
            start_date = time.mktime(time.strptime("01/01/2050", "%m/%d/%Y"))
        if start_date > today:
            continue
        publish_date = temp_rec.field("269__c")
        if lang == 'fr':
            title = temp_rec.field("246_1a")
        else:
            title = temp_rec.field("245__a")
        breaking_news += '''
<h2 class="%s">%s<br/>
    <strong>
        <a href="%s/journal/popup?name=%s&amp;type=breaking_news&amp;record=%s&amp;ln=%s" target="_blank">%s</a>
    </strong>
</h2>
''' % ("", publish_date, CFG_SITE_URL, journal_name, recid, lang, title)
    if breaking_news:
        breaking_news = '<li>%s</li>' % breaking_news

    return breaking_news
def bst_openaire_check_rights():
    """
    Tasklet to verify access rights consistency.

    For every access right configured in CFG_ACCESS_RIGHTS_KEYS, the records
    carrying that value in 542__l are looked up and the status of each of
    their bibdocs is compared with the status expected for that access
    right. A differing status is overwritten and the fix is reported
    through write_message.

    @raise Exception: if a record with 'embargoedAccess' has no value in
        942__a (the embargo date needed to build the expected status)
    """
    # Expected bibdoc status per access right; %(date)s is the embargo date.
    status_templates = {
        'cc0': '',
        'openAccess': '',
        'closedAccess': 'status: closedAccess',
        'restrictedAccess': 'status: restrictedAccess',
        'embargoedAccess': 'firerole: deny until "%(date)s"\nallow any',
    }

    # NOTE(review): nothing is ever appended to this list, so the reporting
    # loop at the end currently prints nothing.
    errors = []

    configured_rights = dict(current_app.config['CFG_ACCESS_RIGHTS_KEYS']).keys()

    for access_rights in configured_rights:
        write_message(
            "Checking records with access rights '%s'" % access_rights)
        matching_recids = search_pattern(p=access_rights, f="542__l")

        for recid in matching_recids:
            embargo_date = ''
            if access_rights == 'embargoedAccess':
                try:
                    embargo_date = get_fieldvalues(recid, "942__a")[0]
                except IndexError:
                    raise Exception(
                        "Embargoed record %s is missing embargo date in 942__a"
                        % recid
                    )
            wanted_status = status_templates[access_rights] % {'date': embargo_date}

            for bibdoc in BibRecDocs(recid).list_bibdocs():
                current_status = bibdoc.get_status()
                if current_status == wanted_status:
                    continue
                bibdoc.set_status(wanted_status)
                write_message(
                    "Fixed record %s with wrong status. From: %s To: %s" %
                    (recid, current_status, wanted_status))

    for error in errors:
        write_message(error)
Beispiel #48
0
def bst_openaire_check_rights():
    """
    Tasklet to verify access rights consistency.

    Walks through the access rights listed in CFG_ACCESS_RIGHTS_KEYS, finds
    the records having each of them in 542__l, and makes sure every bibdoc
    of those records has the status that corresponds to the access right.
    Statuses that differ are replaced and a message is written for each
    replacement.

    @raise Exception: when an 'embargoedAccess' record has no 942__a value
    """
    embargoed = 'embargoedAccess'
    # Status each bibdoc must have, keyed by access right.
    expected_by_right = {
        'cc0': '',
        'openAccess': '',
        'closedAccess': 'status: closedAccess',
        'restrictedAccess': 'status: restrictedAccess',
        embargoed: 'firerole: deny until "%(date)s"\nallow any',
    }

    # NOTE(review): never filled in the code below; kept as in the original.
    errors = []

    rights_config = current_app.config['CFG_ACCESS_RIGHTS_KEYS']
    keys = dict(rights_config).keys()

    for access_rights in keys:
        progress = "Checking records with access rights '%s'" % access_rights
        write_message(progress)

        for r in search_pattern(p=access_rights, f="542__l"):
            substitutions = {'date': ''}
            if access_rights == embargoed:
                # The embargo date is mandatory to build the firerole.
                try:
                    substitutions['date'] = get_fieldvalues(r, "942__a")[0]
                except IndexError:
                    raise Exception(
                        "Embargoed record %s is missing embargo date in 942__a"
                        % r)
            expected_status = expected_by_right[access_rights] % substitutions

            record_docs = BibRecDocs(r)
            for doc in record_docs.list_bibdocs():
                found_status = doc.get_status()
                if found_status != expected_status:
                    doc.set_status(expected_status)
                    report = (
                        "Fixed record %s with wrong status. From: %s To: %s" %
                        (r, found_status, expected_status))
                    write_message(report)

    for e in errors:
        write_message(e)
Beispiel #49
0
def get_dependent_records_for_control_no(control_no):
    """
    Return the recIDs of the records that mention the given control number,
    leaving out the authority record(s) that own it.

    E.g. for the control number "AUTHOR:(CERN)aaa0005", the records found
    by searching for the quoted string "AUTHOR:(CERN)aaa0005" are returned,
    minus the records that
    _get_low_level_recIDs_intbitset_from_control_no reports for it.

    @param control_no: the control number for an authority record
    @type control_no: string

    @return: list of recIDs
    """
    # The record(s) whose own control number is control_no are not wanted
    # in the result.
    owner_recids = _get_low_level_recIDs_intbitset_from_control_no(control_no)
    # search_pattern is used because records from both bibliographic and
    # authority record collections have to be found.
    quoted_control_no = '"' + control_no + '"'
    mentioning_recids = search_pattern(p=quoted_control_no)
    return list(mentioning_recids - owner_recids)
def search_unit(query, f, m, wl=None):
    """Search for records in citation index excluding self-cites.

    @param query: search pattern, or an intbitset of record IDs used as is
    @param f: not used by this implementation
    @param m: not used by this implementation
    @param wl: not used by this implementation
    @return: intbitset accumulating, for each matched record, the second
        element of its get_refers_to_list entry minus the second element
        of its get_self_refers_to_list entry; empty if there is no query
    """
    from invenio.legacy.search_engine import search_pattern
    from invenio.legacy.bibrank.citation_searcher import get_refers_to_list
    from invenio.legacy.bibrank.selfcites_searcher import get_self_refers_to_list

    if not query:
        return intbitset([])

    matched = query if isinstance(query, intbitset) else search_pattern(p=query)

    collected = intbitset()
    all_entries = get_refers_to_list(matched)
    self_entries = get_self_refers_to_list(matched)
    # Both lists hold (recid, hits) pairs and are walked in parallel.
    # NOTE(review): this assumes both are ordered identically - confirm.
    for entry, self_entry in zip(all_entries, self_entries):
        collected += entry[1] - self_entry[1]
    return collected
def search_unit(query, f, m, wl=None):
    """Search for records in citation index excluding self-cites.

    @param query: search pattern, or an intbitset that is taken directly as
        the set of matched records
    @param f: accepted but not read by this function
    @param m: accepted but not read by this function
    @param wl: accepted but not read by this function
    @return: intbitset built from the get_refers_to_list results of the
        matched records, with the corresponding get_self_refers_to_list
        results removed; an empty intbitset when query is empty
    """
    from invenio.legacy.search_engine import search_pattern
    from invenio.legacy.bibrank.citation_searcher import get_refers_to_list
    from invenio.legacy.bibrank.selfcites_searcher import (
        get_self_refers_to_list)

    if not query:
        return intbitset([])

    if isinstance(query, intbitset):
        seed_recids = query
    else:
        seed_recids = search_pattern(p=query)

    without_self = intbitset()
    paired = zip(get_refers_to_list(seed_recids),
                 get_self_refers_to_list(seed_recids))
    for (_recid, hits), (_self_recid, self_hits) in paired:
        # each entry is a (recid, hits) pair; only the hits are combined
        without_self += hits - self_hits
    return without_self
Beispiel #52
0
def search_unit(query, f, m, wl=None):
    """Search for records referred to by matched records except self-refs.

    NOTE(review): the code collects `citers` from get_cited_by_list, which
    suggests it returns the records citing the matched ones - confirm that
    the summary line above describes the right direction.

    @param query: search pattern, or an intbitset of record IDs used as is
    @param f: not used by this implementation
    @param m: not used by this implementation
    @param wl: not used by this implementation
    @return: intbitset accumulating, for each matched record, the second
        element of its get_cited_by_list entry minus the second element of
        its get_self_cited_by_list entry; empty if there is no query
    """
    from invenio.legacy.search_engine import search_pattern
    from invenio.legacy.bibrank.citation_searcher import get_cited_by_list
    from invenio.legacy.bibrank.selfcites_searcher import (
        get_self_cited_by_list)

    if not query:
        return intbitset([])

    matched = query if isinstance(query, intbitset) else search_pattern(p=query)

    collected = intbitset()
    all_entries = get_cited_by_list(matched)
    self_entries = get_self_cited_by_list(matched)
    # Entries are (citee, citers) pairs; the two lists are walked in parallel.
    for entry, self_entry in zip(all_entries, self_entries):
        collected += entry[1] - self_entry[1]
    return collected
def search_unit(query, f, m, wl=None):
    """Search for records referred to by matched records except self-refs.

    NOTE(review): the body gathers `citers` via get_cited_by_list; verify
    that the direction stated in the summary line is the intended one.

    @param query: search pattern, or an intbitset that is taken directly as
        the set of matched records
    @param f: accepted but not read by this function
    @param m: accepted but not read by this function
    @param wl: accepted but not read by this function
    @return: intbitset built from the get_cited_by_list results of the
        matched records, with the corresponding get_self_cited_by_list
        results removed; an empty intbitset when query is empty
    """
    from invenio.legacy.search_engine import search_pattern
    from invenio.legacy.bibrank.citation_searcher import get_cited_by_list
    from invenio.legacy.bibrank.selfcites_searcher import (
        get_self_cited_by_list
    )

    if not query:
        return intbitset([])

    if isinstance(query, intbitset):
        seed_recids = query
    else:
        seed_recids = search_pattern(p=query)

    without_self = intbitset()
    paired = zip(get_cited_by_list(seed_recids),
                 get_self_cited_by_list(seed_recids))
    for (_citee, citing), (_self_citee, self_citing) in paired:
        # each entry is a (citee, citers) pair; only the citers are combined
        without_self += citing - self_citing
    return without_self
Beispiel #54
0
def get_unlinked_records(recid_a, marc_for_b, display_in_b, upload_mode,
                         recids_and_rns_b):
    """
    Retrieve the recids that are linked to recid_a through this relation
    (marc_for_b) but are no longer part of recids_and_rns_b, i.e. the
    links that have to go away after this update.

    Links can only disappear in 'correct' mode; every other upload mode
    gives an empty list.

    @param recid_a: record ID the other records point to
    @param marc_for_b: MARC specification of the relation, resolved with
        _prepare_marc
    @param display_in_b: when true "0" is passed to _prepare_marc,
        otherwise "1"
    @param upload_mode: upload mode of the current update
    @param recids_and_rns_b: iterable of (recid, reportnumber) pairs that
        remain linked

    @return: list of recids to unlink
    """
    unlinked_recids = []
    if upload_mode == 'correct':
        flag = "1"
        if display_in_b:
            flag = "0"
        marc_parts = _prepare_marc(marc_for_b, CFG_OTHER_RELATIONSHIP_ENTRY, flag)
        marc_tag_for_b, marc_ind1_for_b, marc_ind2_for_b = marc_parts
        # The link to recid_a is looked up in the 'w' subfield of the
        # relation field.
        link_field = marc_tag_for_b + marc_ind1_for_b + marc_ind2_for_b + 'w'
        already_linked_recids = search_pattern(p=str(recid_a), m='e',
                                               f=link_field)
        to_be_linked_recids = [kept for kept, _rn in recids_and_rns_b]
        for linked in already_linked_recids:
            if linked in to_be_linked_recids:
                continue
            unlinked_recids.append(linked)
    return unlinked_recids
Beispiel #55
0
def _get_low_level_recIDs_intbitset_from_control_no(control_no):
    """
    Return the hitlist of ALL record ID(s) of the authority records that
    carry the given (INVENIO) MARC control number (e.g. '(XYZ)abc123',
    e.g. from the 035 field).

    The lookup is a search for the quoted control number in the field named
    by CFG_BIBAUTHORITY_RECORD_CONTROL_NUMBER_FIELD.

    Note: This function does not filter out DELETED records!!! The caller
    of this function must do this itself.

    @param control_no: an (INVENIO) MARC internal control number to an authority record
    @type control_no: string

    @return: hitlist returned by search_pattern with the record ID(s) of the
        referenced authority record (should be only one)
    """
    # low-level search needed e.g. for bibindex
    field_prefix = CFG_BIBAUTHORITY_RECORD_CONTROL_NUMBER_FIELD + ":"
    # Quote the control number so that it is searched as one phrase.
    query = field_prefix + '"' + control_no + '"'
    return search_pattern(p=query)