def tmpl_progress_overview(self, ln=CFG_SITE_LANG): #TP: cele..
        """
        Generate an HTML snippet summarizing cataloguing progress.

        @param ln: interface language, propagated into the stats links.
            (BUGFIX: the old code ignored this parameter and always used
            CFG_SITE_LANG, and the "%(ln)s" placeholders inside the URLs
            were never substituted because "%" formatting is single-pass.)
        @return: HTML string.
        """
        from invenio.supervisors_config import CFG_SUPERVISORS_APPROX_GOAL
        res_total = search_pattern(p="980__a:%")
        res_done = search_pattern(p="950__s:p* or 950__s:2")
        total = len(res_total)
        done = len(res_done)
        # Guard against ZeroDivisionError on an empty installation or an
        # unset approximate goal.
        prog = 100 * done / total if total else 0
        prog2 = (100 * done / CFG_SUPERVISORS_APPROX_GOAL
                 if CFG_SUPERVISORS_APPROX_GOAL else 0)
        out = """<p>Processed: %(done)s / %(total)s""" % {"total": total, "done": done}
        out += """ (%s %%) &nbsp; """ % prog
        out += """Approximate overall progress: %(done)s / %(approxGoal)s""" % {"approxGoal": CFG_SUPERVISORS_APPROX_GOAL, "done": done}
        out += """ (%s %%)<br/>""" % prog2

        def _stats_url(timespan):
            # One customevent link per timespan, with ln actually filled in.
            return ('supervisors/customevent?ln=%(ln)s&timespan=%(span)s'
                    '&format=flot&ids=cardStructured&action_gen=Generate'
                    % {'ln': ln, 'span': timespan})

        out += """Recent progress stats: <a href="%(site_url)s/%(today)s">today</a> | 
                   <a href="%(site_url)s/%(week)s">this week</a> | 
                   <a href="%(site_url)s/%(month)s">this month</a>&nbsp;&nbsp;
                   <a href="%(site_url)s/%(lweek)s">last week</a> | 
                   <a href="%(site_url)s/%(lmonth)s">last month</a>&nbsp;&nbsp;
                   <a href="%(site_url)s/%(all)s">all</a>""" % {
            'site_url': CFG_SITE_URL,
            'today': _stats_url('today'),
            'week': _stats_url('this+week'),
            'month': _stats_url('this+month'),
            'lweek': _stats_url('last+week'),
            'lmonth': _stats_url('last+month'),
            'all': _stats_url('full+history')}

        return out
Ejemplo n.º 2
0
    def create_latest_additions_info(self, rg=CFG_WEBSEARCH_INSTANT_BROWSE, ln=CFG_SITE_LANG):
        """
        Populate self.latest_additions_info with the newest records of this
        collection, for later use by create_instant_browse().
        """
        self.latest_additions_info = []
        if not (self.nbrecs and self.reclist):
            return

        candidates = list(self.reclist)

        # FIXME: temporary hack in order to display tweaked latest
        # additions box for some CERN collections:
        if CFG_CERN_SITE:
            current_year = time.strftime("%Y", time.localtime())
            if self.name in ['CERN Yellow Reports']:
                previous_year = str(int(current_year) - 1)
                # keep only recids from this year and the previous one:
                year_query = 'year:%s or year:%s' % (current_year, previous_year)
                candidates = list(self.reclist & search_pattern(p=year_query))
            elif self.name in ['Videos']:
                # keep only recids from this year:
                candidates = list(self.reclist &
                                  search_pattern(p='year:%s' % current_year))

        count = len(candidates)
        shown = min(rg, count)

        # Walk the newest `shown` recids, newest first.
        for recid in reversed(candidates[count - shown:]):
            self.latest_additions_info.append(
                {'id': recid,
                 'format': format_record(recid, "hb", ln=ln),
                 'date': get_creation_date(recid, fmt="%Y-%m-%d<br />%H:%i")})
        return
Ejemplo n.º 3
0
def get_recid_and_reportnumber(recid=None, reportnumber=None):
    """
    Given at least a recid or a reportnumber, this function will look into
    the system for the matching record and will return a normalized
    recid and the primary reportnumber.

    @param recid: record identifier (takes priority when both are given).
    @param reportnumber: report number to match, first against the primary
        field, then against any reportnumber field.
    @return: tuple (recid, primary reportnumber).
    @raises ValueError: in case of no record matched, ambiguous match,
        missing primary report number, or neither argument supplied.
    """
    if recid:
        ## Recid specified receives priority.
        recid = int(recid)
        values = get_fieldvalues(recid, CFG_PRIMARY_REPORTNUMBER)
        if values:
            ## Let's take whatever reportnumber is stored in the matching record
            reportnumber = values[0]
            return recid, reportnumber
        else:
            raise ValueError("The record %s does not have a primary report number" % recid)
    elif reportnumber:
        ## Ok reportnumber specified, let's better try 1st with primary and then
        ## with other reportnumber
        recids = search_pattern(p='%s:"%s"' % (CFG_PRIMARY_REPORTNUMBER, reportnumber))
        if not recids:
            ## Not found as primary
            recids = search_pattern(p='reportnumber:"%s"' % reportnumber)
        if len(recids) > 1:
            # BUGFIX: recids holds integers; str.join() requires strings, so
            # the original call raised TypeError instead of this ValueError.
            raise ValueError('More than one record matches the reportnumber "%s": %s'
                             % (reportnumber, ', '.join(str(r) for r in recids)))
        elif len(recids) == 1:
            recid = list(recids)[0]
            reportnumbers = get_fieldvalues(recid, CFG_PRIMARY_REPORTNUMBER)
            if not reportnumbers:
                raise ValueError("The matched record %s does not have a primary report number" % recid)
            return recid, reportnumbers[0]
        else:
            raise ValueError("No records are matched by the provided reportnumber: %s" % reportnumber)
    raise ValueError("At least the recid or the reportnumber must be specified")
Ejemplo n.º 4
0
def get_record_ids_to_export():
    """Return the recids carrying at least one syncable identifier
    (a DOI, an arXiv id, or an identifier from CFG_OTHER_SITE)."""
    all_recids = get_all_recids()
    with_doi = search_pattern(p='doi:"**"')
    with_arxiv = search_pattern(p='035__9:"arXiv"')
    with_other = search_pattern(p='035__9:%s' % CFG_OTHER_SITE)
    exportable = with_doi | with_arxiv | with_other
    return exportable & all_recids
    def create_latest_additions_info(self, rg=CFG_WEBSEARCH_INSTANT_BROWSE, ln=CFG_SITE_LANG):
        """
        Create info about latest additions that will be used for
        create_instant_browse() later.

        Fills self.latest_additions_info with dicts of the form
        {'id': recid, 'format': brief HTML format, 'date': creation date},
        newest first, at most `rg` entries.
        """
        self.latest_additions_info = []
        if self.nbrecs and self.reclist:
            # firstly, get last 'rg' records:
            recIDs = list(self.reclist)

            # CERN hack begins: tweak latest additions for selected collections:
            # NOTE: the order of the steps below matters -- year filtering,
            # then record removal, then the movie filter, then the sort.
            if CFG_CERN_SITE:
                # alter recIDs list for some CERN collections:
                this_year = time.strftime("%Y", time.localtime())
                if self.name in ['CERN Yellow Reports','Videos']:
                    last_year = str(int(this_year) - 1)
                    # detect recIDs only from this and past year:
                    recIDs = list(self.reclist & \
                                  search_pattern(p='year:%s or year:%s' % \
                                                 (this_year, last_year)))
                elif self.name in ['VideosXXX']:
                    # detect recIDs only from this year:
                    recIDs = list(self.reclist & \
                                  search_pattern(p='year:%s' % this_year))
                elif self.name == 'CMS Physics Analysis Summaries' and \
                         1281585 in self.reclist:
                    # REALLY, REALLY temporary hack
                    # NOTE(review): hides one specific record from the
                    # latest-additions box; remove once no longer needed.
                    recIDs = list(self.reclist)
                    recIDs.remove(1281585)
                # apply special filters:
                if self.name in ['Videos']:
                    # select only videos with movies:
                    recIDs = list(intbitset(recIDs) & \
                                  search_pattern(p='collection:"PUBLVIDEOMOVIE"'))
                # sort some CERN collections specially:
                # (sorts by tag 269__c -- presumably the imprint date, so these
                # video collections come out in date order; TODO confirm)
                if self.name in ['Videos',
                                 'Video Clips',
                                 'Video Movies',
                                 'Video News',
                                 'Video Rushes',
                                 'Webcast',
                                 'ATLAS Videos',
                                 'Restricted Video Movies',
                                 'Restricted Video Rushes',
                                 'LHC First Beam Videos',
                                 'CERN openlab Videos']:
                    recIDs = sort_records(None, recIDs, '269__c')
            # CERN hack ends.

            total = len(recIDs)
            to_display = min(rg, total)

            # Iterate the last `to_display` recids in reverse (newest first).
            for idx in range(total-1, total-to_display-1, -1):
                recid = recIDs[idx]
                self.latest_additions_info.append({'id': recid,
                                                   'format': format_record(recid, "hb", ln=ln),
                                                   'date': get_creation_date(recid, fmt="%Y-%m-%d<br />%H:%i")})
        return
def do_upgrade():
    """Tag legacy records with the new Zenodo collection identifiers."""

    from invenio.search_engine import search_pattern

    # Records that are in no special state yet become provisional
    # user-zenodo submissions.
    uncurated = "980__a:0->Z AND NOT 980__a:PROVISIONAL AND NOT 980__a:PENDING AND NOT 980__a:SPAM AND NOT 980__a:REJECTED AND NOT 980__a:DUPLICATE AND NOT 980__a:DARK AND NOT 980__c:DELETED AND NOT 980__a:OPENAIRE AND NOT 980__a:curated"
    for recid in search_pattern(p=uncurated):
        migrate_record(recid, additions=[[('a', 'provisional-user-zenodo')]])

    # Curated records move to the user-zenodo collection.
    for recid in search_pattern(p="980__a:curated"):
        migrate_record(recid, substitutions=[('curated', 'user-zenodo')])
Ejemplo n.º 7
0
def get_record_ids_to_export(unmatched_only=False):
    """Return all records with identifiers to sync.

    @param unmatched_only: when True, exclude records already carrying an
        identifier from CFG_OTHER_SITE and report only DOI/arXiv matches.
    """
    candidates = get_all_recids()
    other_id = search_pattern(p='035__9:%s' % CFG_OTHER_SITE)
    doi = search_pattern(p='doi:"**"')
    arxiv = search_pattern(p='035__9:"arXiv"')
    if unmatched_only:
        return (doi | arxiv) & (candidates - other_id)
    return (doi | arxiv | other_id) & candidates
Ejemplo n.º 8
0
def get_record_ids_to_export(unmatched_only=False):
    """Return all records with identifiers to sync."""
    all_recids = get_all_recids()
    # Gather one hitset per identifier family (order matches original calls).
    other_ids = search_pattern(p='035__9:%s' % CFG_OTHER_SITE)
    dois = search_pattern(p='doi:"**"')
    arxiv_ids = search_pattern(p='035__9:"arXiv"')
    syncable = dois | arxiv_ids
    if unmatched_only:
        # Skip records already matched on the other site.
        return syncable & (all_recids - other_ids)
    return (syncable | other_ids) & all_recids
Ejemplo n.º 9
0
def get_all_recids(including_deleted=True):#6.68s on cdsdev
    """Return an intbitset of every record id in the system.

    @param including_deleted: when False, strip records flagged DELETED
        (and, on CERN sites, DUMMY as well).
    """
    rows = run_sql("SELECT id FROM bibrec")
    if not rows:
        return intbitset([])
    recids = intbitset(rows)
    if including_deleted:
        return recids
    # Exclude deleted records.
    if CFG_CERN_SITE:
        removed = search_pattern(p='980__:"DELETED" OR 980__:"DUMMY"')
    else:
        removed = search_pattern(p='980__:"DELETED"')
    recids.difference_update(removed)
    return recids
Ejemplo n.º 10
0
def get_all_recids(including_deleted=True):  #6.68s on cdsdev
    """Returns a list of all records available in the system"""
    rows = run_sql("SELECT id FROM bibrec")
    if not rows:
        return intbitset([])
    all_recs = intbitset(rows)
    if not including_deleted:  # we want to exclude deleted records
        # CERN sites additionally flag DUMMY records as deleted.
        if CFG_CERN_SITE:
            pattern = '980__:"DELETED" OR 980__:"DUMMY"'
        else:
            pattern = '980__:"DELETED"'
        all_recs.difference_update(search_pattern(p=pattern))
    return all_recs
Ejemplo n.º 11
0
def get_recid_and_reportnumber(recid=None,
                               reportnumber=None,
                               keep_original_reportnumber=True):
    """
    Given at least a recid or a reportnumber, look up the matching record
    and return a normalized (recid, primary reportnumber) pair.

    @param recid: record id; takes priority over reportnumber when given.
    @param reportnumber: report number to match (primary field first).
    @param keep_original_reportnumber: when True, echo back the caller's
        reportnumber instead of the stored primary one.
    @raises ValueError: in case of no record matched.
    """
    if recid:
        ## The recid, when given, wins over the reportnumber.
        recid = int(recid)
        primary = get_fieldvalues(recid, CFG_PRIMARY_REPORTNUMBER)
        if not primary:
            raise ValueError(
                "The record %s does not have a primary report number" % recid)
        return recid, primary[0]
    if reportnumber:
        ## Try the primary reportnumber field first, then any reportnumber.
        matches = search_pattern(p='%s:"%s"' %
                                 (CFG_PRIMARY_REPORTNUMBER, reportnumber))
        if not matches:
            ## Not found as primary
            matches = search_pattern(p='reportnumber:"%s"' % reportnumber)
        if len(matches) > 1:
            raise ValueError(
                'More than one record matches the reportnumber "%s": %s' %
                (reportnumber, ', '.join([str(i) for i in matches])))
        if not matches:
            raise ValueError(
                "No records are matched by the provided reportnumber: %s" %
                reportnumber)
        recid = list(matches)[0]
        if keep_original_reportnumber:
            return recid, reportnumber
        primary = get_fieldvalues(recid, CFG_PRIMARY_REPORTNUMBER)
        if not primary:
            raise ValueError(
                "The matched record %s does not have a primary report number"
                % recid)
        return recid, primary[0]
    raise ValueError(
        "At least the recid or the reportnumber must be specified")
Ejemplo n.º 12
0
def fetch_updated_arxiv_records(date):
    """Fetch all the arxiv records modified since the last run"""

    # Harvest the identifiers of everything modified on/after `date`.
    harvested_files = oai_harvest_query(prefix="arXiv",
                                        verb="ListIdentifiers",
                                        fro=date.strftime("%Y-%m-%d"))
    arxiv_ids = []
    for path in harvested_files:
        # Collect the ids, then drop the temporary harvest file.
        arxiv_ids.extend(_RE_ARXIV_ID.findall(open(path).read()))
        os.remove(path)

    # Map each arXiv id back to local recids.
    recids = intbitset()
    for arxiv_id in arxiv_ids:
        recids |= search_pattern(p='035__a:"oai:arXiv.org:%s"' % arxiv_id)
    return recids - search_pattern(p="980:DELETED")
Ejemplo n.º 13
0
def format_element(bfo, newline=False, show_doi=False):
    """
    Prints link to proceedings if the proceedings exist.
    If not, nothing is returned.

    @param bfo: BibFormat object of the record being formatted
    @param newline: if True, add <br /> at the end
    @param show_doi: if True, show DOI of the proceeding in brackets
    """
    # CNUM (conference number) of the record being formatted.
    cnum = str(bfo.field('111__g'))
    out = ""
    if not cnum:
        #Something is wrong, return empty string
        return out
    # All proceedings records that reference this conference via 773__w.
    search_result = search_pattern(p="773__w:" + cnum + " and 980__a:proceedings")
    if search_result:
        if len(search_result) > 1:
            # multiple proceedings: number them #1, #2, ... and optionally
            # append each one's DOI.
            proceedings = []
            for i, recID in enumerate(search_result):
                # check for the DOI and put it in brackets in the output
                doi = get_fieldvalues(recID, '0247_a')
                if show_doi and doi:
                    proceedings.append('<a href="/record/%(ID)s">#%(number)s</a> (DOI: <a href="http://dx.doi.org/%(doi)s">%(doi)s</a>)'
                                       % {'ID': recID, 'number': i+1, 'doi': doi[0]})
                else:
                    proceedings.append('<a href="/record/%(ID)s">#%(number)s</a>' % {'ID': recID, 'number': i+1})
            out = 'Proceedings: '
            out += ', '.join(proceedings)
        elif len(search_result) == 1:
            # only one proceeding
            out += '<a href="/record/' + str(search_result[0]) + '">Proceedings</a>'
        if newline:
            out += '<br/>'
    return out
Ejemplo n.º 14
0
def main():
    # Rewrite file links from the old host to a local development base.
    from_base = 'http://openaire.cern.ch/'
    to_base = 'http://localhost:4000/'
    
    # All records
    recids = search_pattern(p="0->Z", f="8564_u")
    
    # Emit a MARCXML <collection> of correcting records on stdout
    # (Python 2 print statements).
    print "<collection>"
    for recid in recids:
        # Get record information 
        touched = False
        file_links = get_fieldvalues(recid, "8564_u")

        def replace_link(x): 
            # Swap the base URL; leave unrelated links untouched.
            if x.startswith(from_base):
                return x.replace(from_base, to_base)
            else:
                return x
        
        new_file_links = map(replace_link, file_links)
        
        # Print correcting to record
        rec = {}
        record_add_field(rec, "001", controlfield_value=str(recid))
        for old_link,new_link in zip(file_links, new_file_links):
            if old_link != new_link:
                # Only records with at least one rewritten link are emitted.
                touched = True 
            record_add_field(rec, '856', ind1='4', subfields=[('u', new_link)])
        
        if touched:
            print record_xml_output(rec)
    print "</collection>"
Ejemplo n.º 15
0
def main():
    # Rewrite file links from the old host to the configured site URL.
    from_base = 'http://openaire.cern.ch'
    to_base = config.CFG_SITE_URL

    # All records
    recids = search_pattern(p="0->Z", f="8564_u")

    # Emit a MARCXML <collection> of correcting records on stdout
    # (Python 2 print statements).
    print "<collection>"
    for recid in recids:
        # Get record information
        touched = False
        file_links = get_fieldvalues(recid, "8564_u")

        # replace_link_func() returns a callable swapping from_base->to_base.
        new_file_links = map(replace_link_func(from_base, to_base), file_links)

        # Print correcting to record
        rec = {}
        record_add_field(rec, "001", controlfield_value=str(recid))
        for old_link, new_link in zip(file_links, new_file_links):
            if old_link != new_link:
                # Only records with at least one rewritten link are emitted.
                touched = True
            record_add_field(rec, '856', ind1='4', subfields=[('u', new_link)])

        if touched:
            print record_xml_output(rec)
    print "</collection>"
Ejemplo n.º 16
0
def format_element(bfo, newline=False):
    """
    Prints link to single proceeding if the proceeding exists.
    If not, nothing is returned.

    @param bfo: BibFormat object of the record being formatted
    @param newline: if True, append <br/> after the link
    """

    # check if it's not a proceeding
    # we don't want to show the link to the proceedings when this record is
    # also a proceeding (this will create link to the same record)
    for field in bfo.fields('980'):
        # field.get() replaces the deprecated has_key() check.
        if field.get('a', '').lower() == "proceedings":
            # it's a proceeding, so return nothing
            return ''

    cnum = str(bfo.field('773__w'))
    out = ""
    if not cnum:
        #No CNUM, return empty string
        return out
    # some CNUMs have "/" instead of "-" as a separator, so we change them
    cnum = cnum.replace("/", "-")
    search_result = search_pattern(p="773__w:" + cnum + " and 980__a:proceedings")
    if search_result:
        recID = list(search_result)[0]
        # BUGFIX: the old `recID != ''` guard compared an int against a
        # string and was therefore always true; the link is built whenever
        # a match exists.
        out = '<a href="/record/' + str(recID) + '">Proceedings</a>'
        if newline:
            out += '<br/>'

    return out
Ejemplo n.º 17
0
def get_data_for_definition_bibrec(column_name, recids_copy):
    '''Having a column_name and a list of recids, it returns a dictionary
    mapping each recid with its corresponding value from the column.

    Picks the cheapest SQL strategy depending on how many recids there are
    and how densely packed they are.'''
    if column_name == 'id':
        ## short-cut for recids:
        return dict(((x, x) for x in recids_copy))
    if len(recids_copy) < 400:
        # Few recids: an explicit IN list is cheapest.
        res = run_sql(
            'SELECT id, DATE_FORMAT(%s, "%%Y%%m%%d%%H%%i%%S") FROM bibrec WHERE id in (%s)'
            % (
                column_name,
                ','.join((str(r) for r in recids_copy)),
            ))
    elif max(recids_copy) - min(recids_copy) < 4000:
        # Dense range: fetch the whole interval, filter in Python.
        res = run_sql(
            'SELECT id, DATE_FORMAT(%s, "%%Y%%m%%d%%H%%i%%S") FROM bibrec WHERE id >= %s AND id <= %s'
            % (
                column_name,
                min(recids_copy),
                max(recids_copy),
            ))
        # Set membership is O(1); the old list membership was O(n) per row.
        wanted = set(recids_copy)
        res = [c for c in res if c[0] in wanted]
    else:
        # Sparse/large set: fetch everything, then drop deleted records.
        res = run_sql(
            'SELECT id, DATE_FORMAT(%s, "%%Y%%m%%d%%H%%i%%S") FROM bibrec' %
            column_name)
        deleted = search_pattern(p='980__:"DELETED"')
        res = dict(res)
        for recid in deleted:
            # BUGFIX: a deleted recid may be absent from the fetched rows;
            # a bare pop() would raise KeyError. (Also stops shadowing the
            # builtin `id`.)
            res.pop(recid, None)
    # use modified invenio.datetime to accommodate years before 1900
    return dict(res)
Ejemplo n.º 18
0
    def tokenize_for_phrases(self, recID):
        """Return the country names and country codes of the institutions
           affiliated with the authors of publication `recID`.
        """

        # Collect affiliation strings from every configured institution tag.
        institution_names = []
        for tag in self.institution_tags:
            institution_names.extend(get_fieldvalues(recID, tag))

        # Union of the record lists of all institution collections.
        institution_collection_hitset = intbitset([])
        for collection in CFG_WEBSEARCH_INSTITUTION_COLLECTIONS:
            institution_collection_hitset += get_collection_reclist(collection)

        # Resolve each non-blank affiliation string to institution recids.
        institution_ids = intbitset([])
        for name in institution_names:
            if not name.strip():
                continue
            hits = search_pattern(p=name, f=self.institution_name_field)
            institution_ids += list(hits & institution_collection_hitset)

        # Gather country tokens for every matched institution.
        tokens = []
        for instID in institution_ids:
            tokens += self._tokenize_from_country_name_tag(instID)
            tokens += self._tokenize_from_country_code_tag(instID)

        # Deduplicate before returning.
        return list(set(tokens))
Ejemplo n.º 19
0
def get_record_ids_to_export(unmatched_only=False, since=None):
    """Return all records with identifiers to sync.

    @param unmatched_only: exclude records already matched on CFG_OTHER_SITE
    @param since: when given, restrict to records modified on/after it
    """
    candidates = get_all_recids()
    other_ids = search_pattern(p='035__9:%s' % CFG_OTHER_SITE)
    if CFG_INSPIRE_SITE:
        # On INSPIRE, CDS-* notes in 595__a also count as other-site ids.
        other_ids |= search_unit(p='CDS-*', f='595__a', m='a')
    dois = search_pattern(p='doi:"**"')
    arxiv_ids = search_pattern(p='035__9:"arXiv"')
    if since:
        recently_modified = intbitset(run_sql(
            "SELECT id FROM bibrec WHERE modification_date>=%s", (since, )))
        candidates &= recently_modified
    if unmatched_only:
        return (dois | arxiv_ids) & (candidates - other_ids)
    return (dois | arxiv_ids | other_ids) & candidates
Ejemplo n.º 20
0
def query_records(params):
    """Produce record IDs from the given query parameters.

    By passing the appropriate CLI options, we can query here for
    additional records.
    """
    write_message("Querying database (records query)...")
    result = intbitset()
    if not (params['field'] or params['collection'] or params['pattern']):
        return result
    if params['collection']:
        # perform_request_search() is needed when the '-c' argument has
        # been defined, as it is not supported by search_pattern().
        result = intbitset(perform_request_search(req=None,
                                                  of='id',
                                                  c=params['collection'],
                                                  p=params['pattern'],
                                                  f=params['field']))
    else:
        # search_pattern() is preferred whenever possible, as it can
        # search even in private collections.
        result = search_pattern(p=params['pattern'],
                                f=params['field'],
                                m=params['matching'])
    return result
Ejemplo n.º 21
0
def format_element(bfo, separator='; ', nbOnly='no', searchlink='no'):
    """
    Prints the records (or number of records) citing this record

    DO NOT USE > testing, not on cdsweb
    @param nbOnly  only print number
    @param searchlink print number (if nbOnly) as a link to the search to find these items
    @param separator a separator between citations
    """
    # NOTE(review): `separator` is accepted but never used below; kept for
    # API compatibility.
    from urllib import quote

    from invenio.config import CFG_SITE_URL

    # Collect every report number (primary 037__a + additional 088__a),
    # URL-quoted for use as search patterns.
    primary_report_numbers = bfo.fields('037__a')
    additional_report_numbers = bfo.fields('088__a')

    report_numbers = primary_report_numbers
    report_numbers.extend(additional_report_numbers)
    report_numbers = [quote(rep_num) for rep_num in report_numbers]

    # A record cites this one when any of our report numbers appears in its
    # 999C5r (reference report-number) field.
    res = []
    for rep_num in report_numbers:
        res.extend(list(search_pattern(p=rep_num, f='999C5r')))

    if nbOnly.lower() == 'yes':
        if searchlink.lower() == 'yes':
            # BUGFIX: removed an unused local import of bfe_server_info that
            # was never referenced and could fail at runtime.
            return '<a href="'+CFG_SITE_URL+'/search?p=recid:'+bfo.control_field('001')+'&amp;rm=citation">'+str(len(res))+'</a>'
        else:
            return str(len(res))
    else:
        from invenio.bibformat import format_records
        return '<br/>'.join(format_records(res, 'hs'))
Ejemplo n.º 22
0
def main():
    # Rewrite file links from the old host to the configured site URL.
    from_base = 'http://openaire.cern.ch'
    to_base = config.CFG_SITE_URL

    # All records
    recids = search_pattern(p="0->Z", f="8564_u")

    # Emit a MARCXML <collection> of correcting records on stdout
    # (Python 2 print statements).
    print "<collection>"
    for recid in recids:
        # Get record information
        touched = False
        file_links = get_fieldvalues(recid, "8564_u")

        # replace_link_func() returns a callable swapping from_base->to_base.
        new_file_links = map(replace_link_func(from_base, to_base), file_links)

        # Print correcting to record
        rec = {}
        record_add_field(rec, "001", controlfield_value=str(recid))
        for old_link, new_link in zip(file_links, new_file_links):
            if old_link != new_link:
                # Only records with at least one rewritten link are emitted.
                touched = True
            record_add_field(rec, '856', ind1='4', subfields=[('u', new_link)])

        if touched:
            print record_xml_output(rec)
    print "</collection>"
Ejemplo n.º 23
0
def bst_openaire_altmetric():
    """
    Fetch Altmetric ids for records that carry a DOI (0247_a) and store
    them in 035, skipping records that already have one.
    """
    recids = search_pattern(p="0->Z", f="0247_a")
    a = Altmetric()

    for recid in recids:
        try:
            # Check if we already have an Altmetric id.
            # BUGFIX: the old test `['Altmetric'] in sysno_inst` compared a
            # list against the string elements of sysno_inst and was never
            # true, so already-processed records were re-fetched every run.
            sysno_inst = get_fieldvalues(recid, "035__9")
            if 'Altmetric' in sysno_inst:
                continue

            doi_val = get_fieldvalues(recid, "0247_a")[0]
            json_res = a.doi(doi_val)

            rec = {}
            record_add_field(rec, "001", controlfield_value=str(recid))

            if json_res:
                record_add_field(rec, '035', subfields=[('a',
                    str(json_res['altmetric_id'])), ('9', 'Altmetric')])
                bibupload(rec, opt_mode='correct')
        except AltmetricHTTPException as e:
            # `as e` works on Python 2.6+ and 3; `except E, e` is 2-only.
            register_exception(prefix='Altmetric error (status code %s): %s' %
                (e.status_code, str(e)), alert_admin=False)
Ejemplo n.º 24
0
    def tokenize_for_phrases(self, recID):
        """Return the country names and country codes of the institutions
           affiliated with the authors of publication `recID`.
        """

        # Affiliation strings from every configured institution tag.
        institution_names = []
        for tag in self.institution_tags:
            institution_names.extend(get_fieldvalues(recID, tag))

        # Union of the record lists of all institution collections.
        institution_collection_hitset = intbitset([])
        for collection in CFG_WEBSEARCH_INSTITUTION_COLLECTIONS:
            institution_collection_hitset += get_collection_reclist(collection)

        # Map each affiliation string to institution record ids.
        institution_ids = intbitset([])
        for name in institution_names:
            hits = search_pattern(p=name, f=self.institution_name_field)
            institution_ids += list(hits & institution_collection_hitset)

        # Tokens from the country name/code tags of each institution.
        tokens = []
        for instID in institution_ids:
            tokens += self._tokenize_from_country_name_tag(instID)
            tokens += self._tokenize_from_country_code_tag(instID)

        # Deduplicate before returning.
        return list(set(tokens))
def do_upgrade():
    """Tag legacy records with the new Zenodo collection identifiers."""

    from invenio.search_engine import search_pattern

    # Records in no special state yet become provisional user submissions.
    untouched_query = "980__a:0->Z AND NOT 980__a:PROVISIONAL AND NOT 980__a:PENDING AND NOT 980__a:SPAM AND NOT 980__a:REJECTED AND NOT 980__a:DUPLICATE AND NOT 980__a:DARK AND NOT 980__c:DELETED AND NOT 980__a:OPENAIRE AND NOT 980__a:curated"
    for recid in search_pattern(p=untouched_query):
        migrate_record(recid, additions=[[('a', 'provisional-user-zenodo')]])

    # Curated records are re-tagged as user-zenodo.
    for recid in search_pattern(p="980__a:curated"):
        migrate_record(recid, substitutions=[('curated', 'user-zenodo')])
Ejemplo n.º 26
0
def find_records(collection, subfields):
    """
    Find records with VOLATILE content.

    @param collection: collection to be checked
    @type  collection: string
    @param subfields: VOLATILE content in tagiic
    @type  subfields: dict
    @return: dict {recid: array of tagiic}
    """

    # sorted() works on both Python 2 and 3; the old keys()+.sort() relied
    # on dict.keys() returning a list (Python 2 only).
    sf_keys = sorted(subfields)

    recs_collection = get_collection_reclist(collection)
    recs_to_change = {}
    for tagiic in sf_keys:
        for value in subfields[tagiic]:
            # Exact-match search, restricted to the target collection.
            result = search_pattern(p=value, f=tagiic, m='e') & recs_collection
            if result:
                write_message('Update %i records with %s:"%s" -- %s' \
                              % (len(result), tagiic, value, list(result)))
            for recid in result:
                # setdefault replaces the deprecated has_key() branch.
                recs_to_change.setdefault(recid, []).append(tagiic)
    return recs_to_change
Ejemplo n.º 27
0
def Get_Sysno(parameters, curdir, form, user_info=None):
    """
       **Deprecated - Use Get_Recid Instead**

       Set the module-global ``sysno`` to the record id whose report number
       matches the module-global ``rn``.  The result is cached in the file
       ``<curdir>/SN``: if that file exists its content is reused verbatim,
       otherwise the record is looked up and the file is written.

       Raises InvenioWebSubmitFunctionStop (carrying a client-side redirect
       script) when zero or more than one record matches ``rn``.
    """
    global rn, sysno
    # initialize sysno variable
    sysno = ""
    if os.path.exists("%s/SN" % curdir):
        # Cached result from a previous step of this submission.
        fp = open("%s/SN" % curdir, "r")
        sysno = fp.read()
        fp.close()
    else:
        # Exactly one match is required; otherwise abort the submission.
        searchresults = list(search_pattern(req=None, p=rn, f="reportnumber"))
        if len(searchresults) == 0:
            raise InvenioWebSubmitFunctionStop(
                "<SCRIPT>document.forms[0].action=\"/submit\";document.forms[0].curpage.value=1;document.forms[0].step.value=0;user_must_confirm_before_leaving_page = false;document.forms[0].submit();alert('The report %s cannot be found in our database.\\nPerhaps it has not been integrated yet?\\nAnyway, you can choose another report number if you wish.\\n Or retry this action in a few minutes.');</SCRIPT>"
                % rn)
        elif len(searchresults) > 1:
            raise InvenioWebSubmitFunctionStop(
                "<SCRIPT>document.forms[0].action=\"/submit\";document.forms[0].curpage.value=1;document.forms[0].step.value=0;user_must_confirm_before_leaving_page = false;document.forms[0].submit();alert('Multiple documents have been found with report number %s\\nYou can try with another report number if you wish.\\n Or retry this action in a few minutes.');</SCRIPT>"
                % rn)
        else:
            sysno = searchresults[0]
        # save resultin a file
        fp = open("%s/SN" % curdir, "w")
        fp.write(str(sysno))
        fp.close()
    return ""
Ejemplo n.º 28
0
    def get_pdfa_record(self, path=None):
        """
        Build and return a record structure attaching the PDF/A fulltext
        found under *path* to the paper identified by its DOI.

        @param path: directory of the publisher package to inspect
        @return: bibrecord-style dict with 001/024 and FFT fields set
        """
        from invenio.search_engine import search_pattern
        xml = self.get_article(path)
        rec = {}
        journal, issn, volume, issue, first_page, last_page, year, start_date, doi = self.get_publication_information(xml)

        # Attach to the existing (non-deleted) record carrying this DOI, if any.
        recid = search_pattern(p='0247_a:"%s" AND NOT 980:"DELETED"' % (doi,))
        if recid:
            record_add_field(rec, '001', controlfield_value=recid[0])
        else:
            record_add_field(rec, '024', ind1='7', subfields=[('a', doi), ('2', 'DOI')])
            self.logger.error('Adding PDF/A. No paper with this DOI: %s. Trying to add it anyway.' % (doi,))
            register_exception(alert_admin=True, prefix="'Adding PDF/A. No paper with this DOI: %s. Trying to add it anyway.." % (doi,))

        try:
            if exists(join(path, 'main_a-2b.pdf')):
                # Proper PDF/A version supplied by the publisher.
                record_add_field(rec, 'FFT', subfields=[('a', join(path, 'main_a-2b.pdf')), ('n', 'main'), ('f', '.pdf;pdfa')])
                self.logger.debug('Adding PDF/A to record: %s' % (doi,))
            elif exists(join(path, 'main.pdf')):
                # Fall back to the plain PDF.
                record_add_field(rec, 'FFT', subfields=[('a', join(path, 'main.pdf'))])
                self.logger.debug('No PDF/A in VTEX package for record: %s' % (doi,))
            else:
                raise MissingFFTError("Record %s doesn't contain PDF file." % (doi,))
        except MissingFFTError:
            # Best effort: report the missing PDF but still return the record.
            register_exception(alert_admin=True, prefix="Elsevier paper: %s is missing PDF." % (doi,))
            self.logger.warning("Record %s doesn't contain PDF file." % (doi,))
        # BUGFIX: the built record was never returned, so callers always got
        # None -- TODO confirm no caller relied on the old (broken) behavior.
        return rec
Ejemplo n.º 29
0
def format_element(bfo, separator='; ', nbOnly='no', searchlink='no'):
    """
    Prints the records (or number of records) citing this record

    DO NOT USE > testing, not on cdsweb
    @param nbOnly  only print number
    @param searchlink print number (if nbOnly) as a link to the search to find these items
    @param separator a separator between citations
    """
    from urllib import quote

    from invenio.config import CFG_SITE_URL

    # Collect every report number (primary 037__a plus additional 088__a),
    # URL-quoted for use as a search pattern.
    report_numbers = [quote(num)
                      for num in bfo.fields('037__a') + bfo.fields('088__a')]

    citing = []
    for number in report_numbers:
        citing.extend(list(search_pattern(p=number, f='999C5r')))

    if nbOnly.lower() != 'yes':
        # Full formatted list of the citing records.
        from invenio.bibformat import format_records
        return '<br/>'.join(format_records(citing, 'hs'))
    if searchlink.lower() == 'yes':
        # Count rendered as a link to the citation search.
        from bfe_server_info import format_element as bfe_server
        return ('<a href="' + CFG_SITE_URL + '/search?p=recid:'
                + bfo.control_field('001') + '&rm=citation">'
                + str(len(citing)) + '</a>')
    return str(len(citing))
Ejemplo n.º 30
0
def bst_openaire_altmetric():
    """
    Tasklet fetching the Altmetric identifier for every record carrying a
    DOI (0247_a) and storing it in MARC 035 with $9 'Altmetric'.
    Records that already have an Altmetric id are skipped.
    """
    recids = search_pattern(p="0->Z", f="0247_a")
    a = Altmetric()

    for recid in recids:
        try:
            # Check if we already have an Altmetric id.
            # BUGFIX: get_fieldvalues() returns a list of strings, so the old
            # test ``['Altmetric'] in sysno_inst`` could never be true and
            # every record was re-fetched on each run.
            sysno_inst = get_fieldvalues(recid, "035__9")
            if 'Altmetric' in sysno_inst:
                continue

            doi_val = get_fieldvalues(recid, "0247_a")[0]
            json_res = a.doi(doi_val)

            rec = {}
            record_add_field(rec, "001", controlfield_value=str(recid))

            if json_res:
                record_add_field(rec,
                                 '035',
                                 subfields=[('a',
                                             str(json_res['altmetric_id'])),
                                            ('9', 'Altmetric')])
                bibupload(rec, opt_mode='correct')
        except AltmetricHTTPException as e:
            # Best-effort: log the failure and continue with the next record.
            register_exception(prefix='Altmetric error (status code %s): %s' %
                               (e.status_code, str(e)),
                               alert_admin=False)
Ejemplo n.º 31
0
def query_records(params):
    """Produce record IDs from given query parameters.

    By passing the appriopriate CLI options, we can query here for additional
    records.
    """
    write_message("Querying database (records query)...")
    matches = intbitset()
    has_query = params['field'] or params['collection'] or params['pattern']
    if has_query:
        if params['collection']:
            # perform_request_search() is needed when a collection ('-c') was
            # given, as search_pattern() does not support that argument.
            hits = perform_request_search(req=None,
                                          of='id',
                                          c=params['collection'],
                                          p=params['pattern'],
                                          f=params['field'])
            matches = intbitset(hits)
        else:
            # Prefer search_pattern() whenever possible, as it can search
            # even in private collections.
            matches = search_pattern(p=params['pattern'],
                                     f=params['field'],
                                     m=params['matching'])
    return matches
Ejemplo n.º 32
0
def format_element(bfo, separator='; '):
    """
    Prints the list of the "children" institutions

    @param separator: string printed between successive institution links
    """
    from invenio.search_engine import search_pattern
    from invenio.bibformat_engine import BibFormatObject

    recID = str(bfo.recID)
    out = ""
    children = []
    if not recID:
        # Something is wrong, return empty string
        return out
    # An institution is a child when it points back to this record via
    # 510__0 with subfield $w == 't'.
    all_institutions = search_pattern(p="510__0:" + str(recID))
    for institution_id in all_institutions:
        for field in BibFormatObject(institution_id).fields('510__'):
            if field.get('0') == str(recID) and field.get('w') == 't':
                children.append(institution_id)

    if children:
        out += "Subsidiary Institution: "
        for item in children:
            # get the abbreviated name of the institution
            abbrev = BibFormatObject(item).field('110__u')
            if not abbrev:
                abbrev = BibFormatObject(item).field('110__a')
            if not abbrev:
                # if no name is found, we display record ID as a text of the link
                abbrev = item
            out += '<a href="/record/' + str(item) + '">' + str(abbrev) \
                + '</a>' + separator

    # BUGFIX: strip exactly one trailing separator.  The old ``out[:-2]``
    # assumed the separator is always two characters long, truncating the
    # output for any other separator.
    if separator and out.endswith(separator):
        out = out[:-len(separator)]
    return out
Ejemplo n.º 33
0
def fetch_records_missing_arxiv_fulltext():
    """
    Returns all the record IDs for records which are supposed to have an arXiv
    fulltext but do not have it.
    """
    # Non-deleted records with an arXiv identifier, minus those that
    # already carry the fulltext.
    expected = search_pattern(p='035__9:"arXiv" - 980:DELETED')
    return expected - fetch_records_with_arxiv_fulltext()
Ejemplo n.º 34
0
def bst_prodsync(method='afs', with_citations='yes', with_claims='yes', skip_collections=''):
    """
    Synchronize to either 'afs' or 'redis'

    with_citations: yes/no, whether records that now matches a record will need to be re-exported.abs
    with_claims: yes/no, whether record involved in some new claim need to be re-exported.
    skip_collections: comma-separated-lists of values for which records having 980:VALUE should be ignored,
        e.g. skip_collections='HEP,HEPNAMES,HEPHIDDEN'
    """
    if not CFG_REDIS_HOST_LABS:
        method = 'afs'

    write_message("Prodsync started using %s method" % method)
    now = datetime.datetime.now()
    future_lastrun = now.strftime('%Y-%m-%d %H:%M:%S')
    lastrun_path = os.path.join(CFG_TMPSHAREDDIR, 'prodsync_%s_lastrun.txt' % method)
    try:
        last_run = open(lastrun_path).read().strip()
        write_message("Syncing records modified since %s" % last_run)
        with run_ro_on_slave_db():
            modified_records = intbitset(run_sql("SELECT id FROM bibrec WHERE modification_date>=%s", (last_run, )))
            compacttime = last_run.replace('-', '').replace(' ', '').replace(':', '')
            notimechangerecs = search_unit("%s->20250101000000" % compacttime, f='005', m='a')
            modified_records += notimechangerecs
            if with_citations.lower() == 'yes':
                for citee, citer in run_sql("SELECT citee, citer FROM rnkCITATIONDICT WHERE last_updated>=%s", (last_run, )):
                    modified_records.add(citer)
            if with_claims.lower() == 'yes':
                modified_records |= intbitset(run_sql("SELECT bibrec FROM aidPERSONIDPAPERS WHERE last_updated>=%s", (last_run, )))
                modified_records |= intbitset(run_sql('SELECT bibrec FROM aidPERSONIDPAPERS AS p JOIN aidPERSONIDDATA as d'
                                                      ' ON p.personid = d.personid WHERE d.tag = "canonical_name" and d.last_updated>=%s', (last_run, )))
    except IOError:
        # No lastrun file yet: default to syncing everything.
        with run_ro_on_slave_db():
            modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
        write_message("Syncing all records")

    # BUGFIX: the old code did skip_collections.split(',') followed by
    # .remove(''), which raised ValueError for any non-empty parameter
    # without a blank entry (e.g. 'HEP,HEPNAMES').  Filter blanks instead.
    for collection in [coll for coll in skip_collections.split(',') if coll]:
        modified_records -= search_pattern(p='980:%s' % collection)

    if not modified_records:
        write_message("Nothing to do")
        return True

    tot = len(modified_records)
    time_estimator = get_time_estimator(tot)
    write_message("Adding %s new or modified records" % tot)
    if method == 'afs':
        afs_sync(reversed(modified_records), time_estimator, tot, now)
        # Record the new lastrun timestamp only after a successful sync.
        with open(lastrun_path, "w") as lastrun_file:
            lastrun_file.write(future_lastrun)
        write_message("DONE!")
    else:
        if redis_sync(reversed(modified_records), time_estimator, tot):
            with open(lastrun_path, "w") as lastrun_file:
                lastrun_file.write(future_lastrun)
            write_message("DONE!")
        else:
            write_message("Skipping prodsync: Redis queue is not yet empty")
Ejemplo n.º 35
0
def render_citesummary_prologue(req, ln, recids, collections, search_patterns,
                                searchfield, d_recids):
    """Write the cite-summary prologue for *recids* to the request object."""
    # Only records in the citeable collection count as citable.
    citable = recids & search_pattern(p='collection:citeable')
    req.write(websearch_templates.tmpl_citesummary_prologue(
        d_recids, collections, search_patterns, searchfield, citable,
        len(recids), ln))
Ejemplo n.º 36
0
def get_existing_records_for_reportnumber(reportnum):
    """Given a report number, return a list of recids of real (live) records
       that are associated with it.
       That's to say if the record does not exist (perhaps deleted, for
       example) its recid will not be returned in the list.

       @param reportnum: the report number for which recids are to be returned.
       @type reportnum: string
       @return: list of recids.
       @rtype: list
       @note: If reportnum was not found in phrase indexes, the function searches
           directly in bibxxx tables via MARC tags, so that the record does not
           have to be phrase-indexed.
    """
    # First try the phrase index on the logical "reportnumber" field.
    reclist = list(search_pattern(req=None,
                                  p=reportnum,
                                  f="reportnumber",
                                  m="e"))
    if not reclist:
        # Not indexed yet?  Fall back to a direct bibxxx search on every
        # MARC tag behind the "reportnumber" logical field.
        for tag in get_field_tags("reportnumber"):
            reclist.extend(search_pattern(req=None,
                                          p=reportnum,
                                          f=tag,
                                          m="e"))
        reclist = dict.fromkeys(reclist).keys()  # Remove duplicates

    # Keep only recids whose record actually (still) exists.
    return [rec for rec in reclist if record_exists(rec) == 1]
Ejemplo n.º 37
0
def get_existing_records_for_reportnumber(reportnum):
    """Given a report number, return a list of recids of real (live) records
       that are associated with it.
       That's to say if the record does not exist (perhaps deleted, for
       example) its recid will not be returned in the list.

       @param reportnum: the report number for which recids are to be returned.
       @type reportnum: string
       @return: list of recids.
       @rtype: list
       @note: If reportnum was not found in phrase indexes, the function searches
           directly in bibxxx tables via MARC tags, so that the record does not
           have to be phrase-indexed.
    """
    existing_records = []  ## List of the report numbers of existing records

    ## Get list of records with the report-number: (first in phrase indexes)
    reclist = list(search_pattern(req=None,
                                  p=reportnum,
                                  f="reportnumber",
                                  m="e"))
    if not reclist:
        # Maybe the record has not been indexed yet? (look in bibxxx tables)
        # Search every MARC tag behind the "reportnumber" logical field.
        tags = get_field_tags("reportnumber")
        for tag in tags:
            recids = list(search_pattern(req=None,
                                         p=reportnum,
                                         f=tag,
                                         m="e"))
            reclist.extend(recids)

        reclist = dict.fromkeys(reclist).keys() # Remove duplicates

    ## Loop through all recids retrieved and testing to see whether the record
    ## actually exists or not. If none of the records exist, there is no record
    ## with this reportnumber; If more than one of the records exists, then
    ## there are multiple records with the report-number; If only one record
    ## exists, then everything is OK,
    for rec in reclist:
        rec_exists = record_exists(rec)
        if rec_exists == 1:
            ## This is a live record record the recid and augment the counter of
            ## records found:
            existing_records.append(rec)
    return existing_records
Ejemplo n.º 38
0
def get(query, from_date, **kwargs):
    """Get recids matching query and with changes."""
    # get_modified_recids() also returns a search callable; binding it to
    # ``search_pattern`` deliberately shadows any module-level name.
    recids, search_pattern = get_modified_recids(from_date)
    recids = recids.union(get_modified_bibdoc_recids(from_date))

    if query:
        # NOTE(review): this branch returns only the recid set, while the
        # branch below returns a (count, recids) tuple -- confirm callers
        # really expect this asymmetry.
        return recids.intersection(set(search_pattern(p=query)))

    return len(recids), recids
Ejemplo n.º 39
0
def get_recids(recids, collections):
    """Compute recids for each column"""
    per_column = {}
    for coll, colldef in collections:
        # An empty column definition keeps the whole base set.
        per_column[coll] = recids & search_pattern(p=colldef) if colldef else recids
    return per_column
Ejemplo n.º 40
0
def get_recids(recids, collections):
    """Compute recids for each column

    @param recids: base set of record IDs (must support ``&``)
    @param collections: iterable of (name, pattern) pairs; a falsy pattern
        means the column keeps the whole base set
    @return: dict mapping column name to its record ID set
    """
    d_recids = {}
    for coll, colldef in collections:
        if not colldef:
            d_recids[coll] = recids
        else:
            d_recids[coll] = recids & search_pattern(p=colldef)
    return d_recids
Ejemplo n.º 41
0
def get(query, from_date, **kwargs):
    """Get recids matching query and with changes."""
    # The second value from get_modified_recids() is a search callable; it
    # intentionally shadows any module-level ``search_pattern`` here.
    recids, search_pattern = get_modified_recids(from_date)
    recids = recids.union(get_modified_bibdoc_recids(from_date))

    if query:
        # NOTE(review): returns a bare set here but a (count, recids) tuple
        # below -- verify callers handle both shapes.
        return recids.intersection(set(search_pattern(p=query)))

    return len(recids), recids
Ejemplo n.º 42
0
def citation(rank_method_code, related_to, hitset, rank_limit_relevance, verbose):
    """Sort records by number of citations"""
    if not related_to:
        # No restriction: rank the whole hitset.
        return rank_by_citations(hitset, verbose)
    from invenio.search_engine import search_pattern
    related_hits = intbitset()
    for pattern in related_to:
        # Keep only hitset members referring to a match of the pattern.
        related_hits |= hitset & intbitset(search_pattern(p='refersto:%s' % pattern))
    return rank_by_citations(related_hits, verbose)
Ejemplo n.º 43
0
def get_record_ids_to_export(unmatched_only=False, since=None):
    """Return all records with identifiers to sync."""
    all_recids = get_all_recids()
    with_other_id = search_pattern(p='035__9:%s' % CFG_OTHER_SITE)
    if CFG_INSPIRE_SITE:
        with_other_id |= search_unit(p='CDS-*', f='595__a', m='a')
    with_doi = search_pattern(p='doi:"**"')
    with_arxiv = search_pattern(p='035__9:"arXiv"')
    if since:
        # Restrict to records touched since the given timestamp.
        modified = intbitset(
            run_sql("SELECT id FROM bibrec WHERE modification_date>=%s",
                    (since, )))
        all_recids = all_recids & modified
    identified = with_doi | with_arxiv
    if unmatched_only:
        # Only records not yet matched on the other site.
        return identified & (all_recids - with_other_id)
    return (identified | with_other_id) & all_recids
Ejemplo n.º 44
0
def is_authority_record(recID):
    """
    returns whether recID is an authority record

    @param recID: the record id to check
    @type recID: int

    @return: True or False
    """
    # low-level: don't use possibly indexed logical fields !
    authority_recids = search_pattern(p='980__a:AUTHORITY')
    return recID in authority_recids
Ejemplo n.º 45
0
def requestarticles(timespan):
    """Return (recids, jnlfilename) for DESY pubdb records from the last
    *timespan* days carrying a 10.3204 DOI.

    Relies on a module-level ``now`` datetime for the reference date.
    """
    startdate = now + datetime.timedelta(days=-timespan)
    stampofstartdate = '%4d%02d%02d' % (startdate.year, startdate.month,
                                        startdate.day)
    jnlfilename = 'desypubdb-%s.%s' % (stampofstartdate, timespan)
    # NOTE(review): the single quotes turn "G:(DE-HGF)POF3-620 980:unrestricted
    # not 980:deleted" into one quoted phrase for field 9131_1 -- confirm that
    # is the intended query and not a misplaced closing quote.
    recids = search_pattern(
        p="005:%s->2050 0247_a:10.3204* and not 9131_1:'G:(DE-HGF)POF3-620 980:unrestricted not 980:deleted'"
        % (stampofstartdate))
    #recids = search_pattern(p="001:295330")

    return (recids, jnlfilename)
Ejemplo n.º 46
0
def is_authority_record(recID):
    """
    returns whether recID is an authority record

    :param recID: the record id to check
    :type recID: int

    :return: True or False
    """
    # low-level: don't use possibly indexed logical fields !
    authority_hits = search_pattern(p='980__a:AUTHORITY')
    return recID in authority_hits
def get_recids_matching_query(p, f, config, m='e'):
    """Return set of recIDs matching query for pattern p in field f.

    @param p: pattern to search for
    @type p: unicode string
    @param f: field to search in
    @type f: unicode string
    @param config: bibrank configuration
    @type config: dict
    @param m: type of matching (usually 'e' for exact or 'r' for regexp)
    @type m: string
    """
    p = p.encode('utf-8')
    f = f.encode('utf-8')
    function = config.get("rank_method", "function")
    collections = config.get(function, 'collections')
    if collections:
        # Restrict hits to the configured collections.
        ret = search_pattern(p=p, f=f, m=m) & recids_cache(collections)
    else:
        # No collection restriction: just exclude deleted records.
        ret = search_pattern(p=p, f=f, m=m) - deleted_recids_cache()
    return ret
Ejemplo n.º 48
0
def create_update_jobs_by_search(pattern, batch_template_file, job_directory=CFG_BIBENCODE_DAEMON_DIR_NEWJOBS):
    """ Creates the job description files to update all records that fit a
        search pattern. Be aware of the search limitations!
    @param pattern: The pattern to search for
    @type pattern: string
    @param batch_template_file: fullpath to the template for the update
    @type batch_template_file: string
    @param job_directory: fullpath to the directory storing the job files
    @type job_directory: string
    """
    recids = search_pattern(p=pattern)
    return create_update_jobs_by_recids(recids, batch_template_file, job_directory)
Ejemplo n.º 49
0
def get_recids_matching_query(p, f, config, m='e'):
    """Return set of recIDs matching query for pattern p in field f.

    @param p: pattern to search for
    @type p: unicode string
    @param f: field to search in
    @type f: unicode string
    @param config: bibrank configuration
    @type config: dict
    @param m: type of matching (usually 'e' for exact or 'r' for regexp)
    @type m: string
    """
    p = p.encode('utf-8')
    f = f.encode('utf-8')
    function = config.get("rank_method", "function")
    collections = config.get(function, 'collections')
    if collections:
        # Restrict hits to the configured collections.
        ret = search_pattern(p=p, f=f, m=m) & recids_cache(collections)
    else:
        # No collection restriction: just exclude deleted records.
        ret = search_pattern(p=p, f=f, m=m) - deleted_recids_cache()
    return ret
Ejemplo n.º 50
0
def render_citesummary_prologue(req, ln, recids, collections, search_patterns,
                                searchfield, coll_recids):
    """Write the cite-summary prologue for *recids* to the request object."""
    # Only records in the citeable collection count as citable.
    citable = recids & search_pattern(p='collection:citeable')
    prologue = websearch_templates.tmpl_citesummary_prologue(
        coll_recids, collections, search_patterns, searchfield, citable,
        len(recids), ln)
    req.write(prologue)
Ejemplo n.º 51
0
def citation(rank_method_code, related_to, hitset, rank_limit_relevance,
             verbose):
    """Sort records by number of citations

    When *related_to* patterns are given, only the members of *hitset*
    that refer to a record matching one of the patterns are ranked;
    otherwise the whole *hitset* is ranked.
    """
    if related_to:
        from invenio.search_engine import search_pattern
        hits = intbitset()
        for pattern in related_to:
            # Keep only hitset members citing a match of this pattern.
            hits |= hitset & intbitset(
                search_pattern(p='refersto:%s' % pattern))
    else:
        hits = hitset
    return rank_by_citations(hits, verbose)
def record_get_recid(record):
    """
    Returns the recid (tag 001) of the given record, if found in the database.
    It tries to extract an OAI ID from the given record, if not successful it
    returns with errorcode 0.

    @param record: bibrecord structure

    @return: recid if found, otherwise 0 on missing OAI, -1 on OAI tag error,
                 or None if no recid found.
    """
    recid = None  # NOTE(review): never read afterwards; kept unchanged
    if record_has_field(record, "001"):
        # Trust an explicit 001 controlfield when present.
        return str(record_get_field_value(record, tag="001"))

    oai_id = None
    # FIXME: CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG is not set correctly for inspire
    # When OAI config is OK, use bibrecord.record_get_oaiid
    old_oaiid_tag = "035__z"
    try:
        # Split the configured 6-character MARC tag into its components.
        tag = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3]
        ind1 = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3]
        ind2 = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4]
        code = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5]
    except IndexError:
        sys.stderr.write("Invalid CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG")
        return -1
    # Look for an arXiv OAI id in the configured tag first.
    fieldvalues = record_get_field_values(record, tag, ind1, ind2, code)
    for fieldvalue in fieldvalues:
        if fieldvalue.startswith("oai:arXiv.org:"):
            oai_id = fieldvalue
            break
    if oai_id == None:
        # Fall back to the legacy 035__z location.
        fieldvalues = record_get_field_values(record, old_oaiid_tag[:3], \
                                              old_oaiid_tag[3], old_oaiid_tag[4], \
                                              old_oaiid_tag[5])
        for fieldvalue in fieldvalues:
            if fieldvalue.startswith("oai:arXiv.org:"):
                oai_id = fieldvalue
                break
        if oai_id == None:
            sys.stderr.write("No oai id found for record")
            return 0
    # Try the configured tag, the legacy tag, and the report number in turn;
    # accept only an unambiguous (single) hit.
    queries = ["%s__%s:%s" % (tag, code, oai_id)]
    queries.append("%s__%s:%s" % (old_oaiid_tag[:3], old_oaiid_tag[5], oai_id))
    queries.append("reportnumber:arXiv:%s" % (oai_id.split(":")[-1], ))
    for query in queries:
        hits = search_pattern(p=query).tolist()
        # Try different patterns
        if len(hits) == 1:
            return str(hits[0])
    return None
def record_get_recid(record):
    """
    Returns the recid (tag 001) of the given record, if found in the database.
    It tries to extract an OAI ID from the given record, if not successful it
    returns with errorcode 0.

    @param record: bibrecord structure

    @return: recid if found, otherwise 0 on missing OAI, -1 on OAI tag error,
                 or None if no recid found.
    """
    recid = None  # NOTE(review): never read afterwards; kept unchanged
    if record_has_field(record, "001"):
        # An explicit 001 controlfield wins immediately.
        return str(record_get_field_value(record, tag="001"))

    oai_id = None
    # FIXME: CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG is not set correctly for inspire
    # When OAI config is OK, use bibrecord.record_get_oaiid
    old_oaiid_tag = "035__z"
    try:
        # Decompose the configured 6-character MARC tag.
        tag = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3]
        ind1 = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3]
        ind2 = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4]
        code = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5]
    except IndexError:
        sys.stderr.write("Invalid CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG")
        return - 1  # same as -1; unary minus applied to 1
    # Scan the configured tag for an arXiv OAI identifier.
    fieldvalues = record_get_field_values(record, tag, ind1, ind2, code)
    for fieldvalue in fieldvalues:
        if fieldvalue.startswith("oai:arXiv.org:"):
            oai_id = fieldvalue
            break
    if oai_id == None:
        # Fall back to the legacy 035__z location.
        fieldvalues = record_get_field_values(record, old_oaiid_tag[:3], \
                                              old_oaiid_tag[3], old_oaiid_tag[4], \
                                              old_oaiid_tag[5])
        for fieldvalue in fieldvalues:
            if fieldvalue.startswith("oai:arXiv.org:"):
                oai_id = fieldvalue
                break
        if oai_id == None:
            sys.stderr.write("No oai id found for record")
            return 0
    # Probe configured tag, legacy tag, then report number; accept only an
    # unambiguous single hit.
    queries = ["%s__%s:%s" % (tag, code, oai_id)]
    queries.append("%s__%s:%s" % (old_oaiid_tag[:3], old_oaiid_tag[5], oai_id))
    queries.append("reportnumber:arXiv:%s" % (oai_id.split(":")[-1],))
    for query in queries:
        hits = search_pattern(p=query).tolist()
        # Try different patterns
        if len(hits) == 1:
            return str(hits[0])
    return None
Ejemplo n.º 54
0
def get_recids_for_rules(rules):
    """
    Generates the final list of record IDs to load.

    @param rules dict of rules {rule_name: rule_dict}
    @type rules: dict of rules

    @return dict {rule_name: array of record IDs}
    """
    # An explicit --record-ids CLI option overrides the per-rule lastrun
    # filtering below.
    override_record_ids = task_get_option("record_ids")
    recids = {}
    for rule_name, rule in rules.iteritems():
        if "filter_pattern" in rule or "filter_collection" in rule:
            query = rule.get("filter_pattern", '')
            if "filter_collection" in rule:
                collections = rule["filter_collection"].split()
            else:
                collections = None
            write_message("Performing given search query: '%s'" % query)
            if collections:
                # perform_request_search supports collection restriction.
                result = perform_request_search(p=query,
                                                of='intbitset',
                                                wl=rule.get('filter_limit', 0),
                                                f=rule.get(
                                                    'filter_field', None),
                                                c=collections)
            else:
                result = search_pattern(
                    p=query,
                    wl=rule.get('filter_limit', 0),
                    f=rule.get('filter_field', None),
                )
        else:
            # No filter configured: start from "all records".
            result = intbitset(trailing_bits=True)

        if override_record_ids is not None:
            result.intersection_update(override_record_ids)
        else:
            # Only check records modified since this rule last ran,
            # excluding deleted (and, on CERN, dummy) records by default.
            last_run = get_rule_lastrun(rule_name)
            modified_recids = get_modified_records_since(last_run)
            if not "consider_deleted_records" in rule:
                modified_recids -= search_unit_in_bibxxx(p='DELETED',
                                                         f='980__%',
                                                         type='e')
                if CFG_CERN_SITE:
                    modified_recids -= search_unit_in_bibxxx(p='DUMMY',
                                                             f='980__%',
                                                             type='e')
            result.intersection_update(modified_recids)
        recids[rule_name] = result

    return recids
Ejemplo n.º 55
0
def is_record_matching_pattern(record_pattern, recid, curdir):
    """Given a pattern and a recid, returns True if the recid
       can be retrieved using the record_pattern. This enforces
       restrictions on which type of documents can be modified via a
       certain submission interface.
       The record_pattern can be anything that can be used by
       search_pattern to search for.
       Also, one can use variables stored locally, like <comboDEMOJRN>
       to denote the category or subcategory.
       Ex:
           reportnumber:DEMO-<comboDEMOJRN>-*
           collection:ATLANTISTIMESNEWS
           reportnumber:DEMO-<comboDEMOJRN>-* | collection:ATLANTISTIMESNEWS
       As a note, you can test your pattern, using the search engine
       and see if it retrieves the expected results.

    """
    # if no pattern is configured, then do not do any checks
    if not record_pattern:
        return True
    # check for local variables embedded in the pattern (ex: <comboXYZ>)
    # and  replace them with the value read from the corresponding file
    pattern_local_variables = r'<\w+>'
    local_vars = re.findall(pattern_local_variables, record_pattern)
    final_record_pattern = record_pattern
    if local_vars:
        for local_var in local_vars:
            if record_pattern.find(local_var) > -1:
                file_name = local_var[1:-1].strip()
                try:
                    f = open("%s/%s" % (curdir, file_name), "r")
                    local_variable_content = f.read().strip()
                    final_record_pattern = final_record_pattern.replace(
                        local_var, local_variable_content)
                    f.close()
                except IOError:
                    msg = "Record pattern badly defined. There is no local file: %s." % file_name
                    raise InvenioWebSubmitFunctionError(msg)
    # check to see if nested <> tags were used, in this case throw an error -not supported
    if final_record_pattern.find('<') > -1 or final_record_pattern.find(
            '>') > -1:
        # BUGFIX: the old code did ``"...revised." % file_name`` although the
        # format string had no placeholder (TypeError at runtime) and
        # ``file_name`` could be unbound when no local variables were found.
        msg = "Record pattern badly defined -> the local variables tags should be revised."
        raise InvenioWebSubmitFunctionError(msg)
    # get the list of records that match the final record pattern and
    # check whether our recid is among them
    reclist = list(search_pattern(p=final_record_pattern))
    return recid in reclist
Ejemplo n.º 56
0
def oai_get_recid(identifier):
    """Returns the recid corresponding to the OAI identifier. Prefer a non deleted
    record if multiple recids matches but some of them are deleted (e.g. in
    case of merging). Returns None if no record matches."""
    if identifier:
        recids = search_pattern(p=identifier, f=CFG_OAI_ID_FIELD, m='e')
        if recids:
            restricted_recids = get_all_restricted_recids()
            for recid in recids:
                # Prefer the first live, non-restricted match.
                if record_exists(recid) > 0 and recid not in restricted_recids:
                    return recid
            # Fallback: ``recid`` here is the last candidate from the loop;
            # return it (even if deleted) as long as it is not restricted.
            if recid not in restricted_recids:
                return recid
    return None