Esempio n. 1
0
def get_all_recids():
    """Return the union of the record IDs of the site's relevant collections.

    The collections taken into account depend on the site flavour
    (CFG_INSPIRE_SITE or CFG_CERN_SITE); for any other site an empty
    intbitset is returned.
    """
    if CFG_INSPIRE_SITE:
        collection_names = [CFG_SITE_NAME, "Conferences"]
    elif CFG_CERN_SITE:
        collection_names = [
            CFG_SITE_NAME,
            "CERN Articles & Preprints",
            "CERN Series",
            "CERN Departments",
            "CERN Experiments",
            "CERN R&D Projects",
        ]
        # Disabled filters, kept for reference.
        # Exclude all records that are not relevant for CERN/CDS:
        # all_recids = all_recids & search_pattern(p='690c:CERN or 595:cds')
        # Exclude all records with an existing INSPIRE ID:
        # all_recids = all_recids - search_pattern(p='035:INSPIRE')
    else:
        return intbitset()

    # Fold the reclists together left to right with "|", which builds a new
    # set at every step instead of modifying a reclist in place.
    all_recids = get_collection_reclist(collection_names[0])
    for collection_name in collection_names[1:]:
        all_recids = all_recids | get_collection_reclist(collection_name)
    return all_recids
Esempio n. 2
0
def fetch_concerned_records(name):
    """Return the (record id, date) rows the task ``name`` has to process.

    With the ``new`` task option, the rows come from ``bibdocfsinfo``
    joined with ``bibrec_bibdoc``: PDF-format entries whose ``cd`` value is
    later than the date returned by ``fetch_last_updated(name)``, ordered
    by ``cd``.

    Otherwise the rows are the ``bibrec`` ids requested through the
    ``recids`` option plus all records of the collections listed in the
    ``collections`` option, ordered by id, with NULL as second column.

    @param name: identifier handed to fetch_last_updated()
    @return: the rows returned by run_sql(), or an empty list when no
        record was requested
    """
    task_update_progress("Fetching record ids")

    dummy, last_date = fetch_last_updated(name)

    if task_get_option('new'):
        # Fetch all records inserted since last run
        sql = """SELECT `id_bibrec`, `cd` FROM `bibdocfsinfo`
                 INNER JOIN `bibrec_bibdoc`
                 ON `bibdocfsinfo`.`id_bibdoc` = `bibrec_bibdoc`.`id_bibdoc`
                 WHERE `cd` > %s
                 AND format IN ('.pdf', '.PDF', '.pdf;pdfa', '.PDF;pdfa')
                 ORDER BY `cd`"""
        records = run_sql(sql, [last_date.isoformat()])
    else:
        given_recids = task_get_option('recids')
        for collection in task_get_option('collections'):
            # update() merges every record id of the collection; add()
            # would try to insert the whole reclist as a single element.
            given_recids.update(get_collection_reclist(collection))

        if given_recids:
            # One SQL placeholder per record id.
            format_strings = ','.join(['%s'] * len(given_recids))
            records = run_sql(
                """SELECT `id`, NULL FROM `bibrec`
                                 WHERE `id` IN (%s)
                                 ORDER BY `id`""" % format_strings,
                list(given_recids))
        else:
            records = []

    task_update_progress("Done fetching record ids")

    return records
Esempio n. 3
0
    def tokenize_for_phrases(self, recID):
        """Return the country tokens of the institutions affiliated with
           the authors of the publication.

           The institution names are read from the record fields listed in
           self.institution_tags, looked up in the institution collections,
           and each matching institution record contributes its country
           name and country code tokens.

           @param recID: id of the publication record
           @return: list of tokens without duplicates (order not defined)
        """

        # Get the name of the institution affiliated
        institution_names = []
        for tag in self.institution_tags:
            institution_names += get_fieldvalues(recID, tag)

        # Get the hitset of all the institutes
        institution_collection_hitset = intbitset([])
        for collection in CFG_WEBSEARCH_INSTITUTION_COLLECTIONS:
            institution_collection_hitset += get_collection_reclist(collection)

        # Search for the institution name and get a list of institution ids
        institution_ids = intbitset([])
        for name in institution_names:
            # A blank name cannot identify an institution, so do not search
            # for it.
            # NOTE(review): an empty pattern presumably matches unrelated
            # institution records -- confirm against search_pattern().
            if not name.strip():
                continue
            result_hitset = search_pattern(
                p=name,
                f=self.institution_name_field
            )
            institution_hitset = result_hitset & institution_collection_hitset
            institution_ids += list(institution_hitset)

        # Get the country tokens
        tokens = []
        for instID in institution_ids:
            tokens += self._tokenize_from_country_name_tag(instID)
            tokens += self._tokenize_from_country_code_tag(instID)

        # Remove duplicates
        tokens = list(set(tokens))

        return tokens
Esempio n. 4
0
def create_collection_bibrec(table_name, coll_name, step_size=10000, max_size=-1):
    """(Re)create ``table_name`` with the ``bibrec`` rows of a collection.

    The table is dropped if it exists, recreated from the ``SHOW CREATE
    TABLE bibrec`` statement and filled, ``step_size`` ids per INSERT, with
    the rows of the records belonging to ``coll_name``. Records are copied
    in descending id order.

    @param table_name: name of the table to create; must start with '_'
    @param coll_name: name of the collection providing the record ids
    @param step_size: maximum number of record ids per INSERT statement
    @param max_size: when positive, copy at most this many records
    @raise Exception: if ``table_name`` does not start with '_'
    """
    if not table_name.startswith('_'):
        raise Exception("By convention, temporary tables must begin with '_'. I don't want to give you tools to screw st important")

    safe_name = dbquery.real_escape_string(table_name)
    create_stmt = dbquery.run_sql("SHOW CREATE TABLE bibrec")[0][1].replace('bibrec', safe_name)
    dbquery.run_sql("DROP TABLE IF EXISTS `%s`" % safe_name)
    dbquery.run_sql(create_stmt)

    # Retrieve the collection, highest record ids first.
    recids = sorted(search_engine.get_collection_reclist(coll_name), reverse=True)

    if not recids:
        sys.stderr.write("The collection %s is empty!\n" % coll_name)

    # Never plan to copy more records than the collection holds, otherwise
    # the loop would end up issuing an INSERT with an empty IN () list.
    to_copy = len(recids)
    if max_size > 0:
        to_copy = min(max_size, to_copy)

    sys.stderr.write("Copying bibrec data, patience please...\n")
    for start in range(0, to_copy, step_size):
        # Cut the last chunk at to_copy so max_size is honoured exactly.
        chunk = recids[start:min(start + step_size, to_copy)]
        dbquery.run_sql("INSERT INTO `%s` SELECT * FROM `bibrec` WHERE bibrec.id IN (%s)" %
                        (safe_name, ','.join(map(str, chunk))))

    sys.stderr.write("Total number of records: %s Copied: %s\n" % (len(recids), to_copy))
Esempio n. 5
0
def late(req):
    """Write to ``req`` an HTML report of DOI registration vs record creation.

    For every journal collection in CFG_JOURNALS a table is written with
    one row per record: DOI (linked to dx.doi.org), title, the creation
    date stored in the ``doi`` table and the creation date of the record.

    Row background colour:
      - "#eee": no row found in the ``doi`` table for this DOI;
      - "#66FF00": the record was created before the DOI date;
      - "#FF6600": the record was created less than one day after it;
      - "#FF0000": otherwise.

    @param req: request object the page is written to
    """
    req.content_type = "text/html"
    print >> req, pageheaderonly("Late journals", req=req)
    for journal in CFG_JOURNALS:
        print >> req, "<h2>%s</h2>" % escape(get_coll_i18nname(journal))
        results = get_collection_reclist(journal)
        print >> req, "<table>"
        print >> req, "<tr><th>DOI</th><th>Title</th><th>DOI registration</th><th>Arrival in SCOAP3</th></tr>"
        for recid in results:
            creation_date = run_sql("SELECT creation_date FROM bibrec WHERE id=%s", (recid, ))[0][0]
            record = get_record(recid)
            doi = record_get_field_value(record, '024', '7', code='a')
            # NOTE(review): the title is written unescaped; presumably it
            # may already contain markup -- confirm before escaping it.
            title = record_get_field_value(record, '245', code='a')
            doi_date = run_sql("SELECT creation_date FROM doi WHERE doi=%s", (doi, ))
            background = "#eee"
            if doi_date:
                doi_date = doi_date[0][0]
                delay_days = (creation_date - doi_date).days
                if delay_days < 0:
                    background = "#66FF00"
                elif delay_days < 1:
                    background = "#FF6600"
                else:
                    background = "#FF0000"
            else:
                doi_date = ''
            # The anchor is now closed with </a> (it was left open before).
            print >> req, '<tr style="background-color: %s;"><td><a href="http://dx.doi.org/%s" target="_blank">%s</a></td><td>%s</td><td>%s</td><td>%s</td></tr>' % (
                    background,
                    escape(doi, True),
                    escape(doi),
                    title,
                    doi_date,
                    creation_date)
        print >> req, "</table>"
Esempio n. 6
0
def find_records(collection, subfields):
    """
    Find records with VOLATILE content.

    Every value of every tagiic in ``subfields`` is searched with
    search_pattern(..., m='e') and the hits are restricted to the records
    of ``collection``. Tags are processed in sorted order.

    @param collection: collection to be checked
    @type  collection: string
    @param subfields: VOLATILE content in tagiic
    @type  subfields: dict
    @return: dict {recid: array of tagiic}; a tagiic appears once per
        matching value
    """

    # sorted() replaces keys() + sort(), which only works on Python 2 lists.
    sf_keys = sorted(subfields)

    recs_collection = get_collection_reclist(collection)
    recs_to_change = {}
    for tagiic in sf_keys:
        for value in subfields[tagiic]:
            result = search_pattern(p=value, f=tagiic, m='e') & recs_collection
            if result:
                write_message('Update %i records with %s:"%s" -- %s' \
                              % (len(result), tagiic, value, list(result)))
            for recid in result:
                # setdefault() replaces the deprecated dict.has_key() test.
                recs_to_change.setdefault(recid, []).append(tagiic)
    return recs_to_change
Esempio n. 7
0
def unlinked(req):
    """
    Return a page listing the "useful" person profiles that are not linked.

    A person id is listed when it has an ``extid:`` tag in aidPERSONIDDATA
    or a ``flag=2`` row in aidPERSONIDPAPERS, has no ``extid:INSPIREID``
    tag, and its canonical name (or the person id itself when there is no
    canonical name) is not among the ``035__a`` values of the HepNames
    collection (compared lower-cased and stripped).

    @param req: request object passed on to page()
    @return: the result of page() with the HTML list as body
    """
    from invenio.dbquery import run_sql
    from invenio.search_engine import get_fieldvalues, get_collection_reclist
    useful_personids1 = intbitset(run_sql("SELECT distinct personid FROM aidPERSONIDDATA WHERE tag LIKE 'extid:%'"))
    useful_personids2 = intbitset(run_sql("SELECT distinct personid from aidPERSONIDPAPERS where flag=2"))
    linked_personids = intbitset(run_sql("SELECT personid FROM aidPERSONIDDATA WHERE tag='extid:INSPIREID'"))
    names = dict(run_sql("SELECT personid, data FROM aidPERSONIDDATA WHERE tag='canonical_name'"))
    # A set makes the membership test in the loop below constant-time.
    matched_names = set(name.lower().strip() for name in get_fieldvalues(get_collection_reclist('HepNames'), '035__a'))
    personid_to_match = (useful_personids1 | useful_personids2) - linked_personids

    body = ['<ol>']
    for personid in personid_to_match:
        name = names.get(personid, str(personid))
        if name.lower().strip() in matched_names:
            continue
        body.append('<li><a href="%(siteurl)s/author/profile/%(bai)s" target="_blank">%(bai)s</a></li>' % {
                'siteurl': escape(CFG_SITE_SECURE_URL, True),
                'bai': escape(name, True)})
    body.append('</ol>')
    body = '\n'.join(body)

    return page(req=req, body=body, title="Unlinked useful BAIs")
Esempio n. 8
0
def fetch_concerned_records(name):
    """Return the (record id, date) rows the task ``name`` has to process.

    ``fetch_last_updated(name)`` provides the last record id and last date.
    Depending on the task options the rows are:
      - ``new``: ``bibrec`` rows with a creation date not older than the
        last date and an id greater than the last record id;
      - ``modified``: the same, based on the modification date;
      - otherwise: the ids requested through the ``recids`` option plus all
        records of the collections listed in ``collections``, ordered by
        id, with NULL as second column.

    @param name: identifier handed to fetch_last_updated()
    @return: the rows returned by run_sql(), or an empty list when no
        record was requested
    """
    task_update_progress("Fetching record ids")

    last_recid, last_date = fetch_last_updated(name)

    if task_get_option('new'):
        # Fetch all records inserted since last run
        sql = "SELECT `id`, `creation_date` FROM `bibrec` " \
            "WHERE `creation_date` >= %s " \
            "AND `id` > %s " \
            "ORDER BY `creation_date`"
        records = run_sql(sql, (last_date.isoformat(), last_recid))
    elif task_get_option('modified'):
        # Fetch all records modified since last run
        sql = "SELECT `id`, `modification_date` FROM `bibrec` " \
            "WHERE `modification_date` >= %s " \
            "AND `id` > %s " \
            "ORDER BY `modification_date`"
        records = run_sql(sql, (last_date.isoformat(), last_recid))
    else:
        given_recids = task_get_option('recids')
        for collection in task_get_option('collections'):
            # update() merges every record id of the collection; add()
            # would try to insert the whole reclist as a single element.
            given_recids.update(get_collection_reclist(collection))

        if given_recids:
            # One SQL placeholder per record id.
            format_strings = ','.join(['%s'] * len(given_recids))
            records = run_sql("SELECT `id`, NULL FROM `bibrec` " \
                "WHERE `id` IN (%s) ORDER BY `id`" % format_strings,
                    list(given_recids))
        else:
            records = []

    task_update_progress("Done fetching record ids")

    return records
Esempio n. 9
0
def eprints():
    """Print accelerator-physics eprint counts per arXiv number prefix.

    For every prefix in ``date_range`` the HEP collection is searched for
    ``physics.acc-ph`` and ``acc-phys`` eprints. One line is printed with
    the prefix, the number of hits, the number of hits that are also in
    the Fermilab collection and the corresponding percentage. The grand
    total of hits is printed at the end.
    """
    total = 0
    fermilab = get_collection_reclist('Fermilab')
    print('{0:4s} {1:3s} {2:3s} {3:3s}'.format('Date', 'All', 'FNA', '%'))
    date_range = ['1904', '1905', '1906']
    for yymm in date_range:
        # str() and the zero padding let date_range also hold plain
        # integers such as range(1, 20).
        yymm = str(yymm)
        if len(yymm) == 1:
            yymm = '0' + yymm
        search = '037__c:physics.acc-ph 037__a:"arXiv:' + yymm + '*"'
        x = perform_request_search(p=search, cc='HEP')
        search = '037__c:acc-phys 037__a:"acc-phys/' + yymm + '*"'
        y = perform_request_search(p=search, cc='HEP')
        x_f = intbitset(x) & fermilab
        y_f = intbitset(y) & fermilab
        length = len(x) + len(y)
        length_f = len(x_f) + len(y_f)
        try:
            ratio = float(length_f) / float(length) * 100.0
        except ZeroDivisionError:
            # No hit at all for this prefix.
            ratio = 0
        print('{0:4s} {1:3d} {2:3d} {3:3f}'.format(yymm, length, length_f,
                                                   ratio))
        total += length
    print("Total = %s" % total)
Esempio n. 10
0
def create_collection_bibrec(table_name, coll_name, step_size=10000, maxsize=None):
    """(Re)create ``table_name`` with the ``bibrec`` rows of a collection.

    The table is dropped if it exists, recreated from the ``SHOW CREATE
    TABLE bibrec`` statement (which is also printed) and filled,
    ``step_size`` ids per INSERT, with the rows of the records belonging
    to ``coll_name``. The number of records copied so far is written to
    stderr after every INSERT.

    @param table_name: name of the table to create; must start with '_'
    @param coll_name: name of the collection providing the record ids
    @param step_size: maximum number of record ids per INSERT statement
    @param maxsize: when set to a positive number, copy at most this many
        records; None, 0 or a negative value means no limit
    @raise Exception: if ``table_name`` does not start with '_'
    """
    if not table_name.startswith('_'):
        raise Exception("By convention, temporary tables must begin with '_'. I don't want to give you tools to screw st important")

    safe_name = dbquery.real_escape_string(table_name)
    create_stmt = dbquery.run_sql("SHOW CREATE TABLE bibrec")[0][1].replace('bibrec', safe_name)
    dbquery.run_sql("DROP TABLE IF EXISTS `%s`" % safe_name)
    dbquery.run_sql(create_stmt)

    print(create_stmt)

    # now retrieve the collection
    recids = list(search_engine.get_collection_reclist(coll_name))
    total = len(recids)
    if not recids:
        sys.stderr.write("The collection %s is empty!\n" % coll_name)
    else:
        print('collection has x recs: %s' % total)

    # Apply the limit up front so that it is honoured exactly rather than
    # rounded up to a whole number of steps.
    if maxsize and maxsize > 0:
        recids = recids[:maxsize]

    sys.stderr.write("Copying bibrec data\n")
    copied = 0
    for start in range(0, len(recids), step_size):
        chunk = recids[start:start + step_size]
        dbquery.run_sql("INSERT INTO `%s` SELECT * FROM `bibrec` WHERE bibrec.id IN (%s)" %
                        (safe_name, ','.join(map(str, chunk))))
        copied += len(chunk)
        sys.stderr.write("%s\n" % copied)

    sys.stderr.write("Total number of records: %s\n" % total)
Esempio n. 11
0
    def tokenize_for_phrases(self, recID):
        """Return the country tokens of the institutions affiliated with
           the authors of the publication.

           Institution names come from the record fields listed in
           self.institution_tags; non-blank names are searched in
           self.institution_name_field, restricted to the institution
           collections, and every institution found contributes its
           country name and country code tokens.

           @param recID: id of the publication record
           @return: list of tokens without duplicates
        """
        # Names of the affiliated institutions.
        names = []
        for tag in self.institution_tags:
            names.extend(get_fieldvalues(recID, tag))

        # Records of all the institution collections.
        all_institutions = intbitset([])
        for collection in CFG_WEBSEARCH_INSTITUTION_COLLECTIONS:
            all_institutions += get_collection_reclist(collection)

        # Ids of the institution records matching the names.
        matched_ids = intbitset([])
        for name in names:
            if not name.strip():
                continue
            hits = search_pattern(p=name, f=self.institution_name_field)
            matched_ids += list(hits & all_institutions)

        # Collect the country tokens; the set drops the duplicates.
        unique_tokens = set()
        for inst_id in matched_ids:
            unique_tokens.update(self._tokenize_from_country_name_tag(inst_id))
            unique_tokens.update(self._tokenize_from_country_code_tag(inst_id))

        return list(unique_tokens)
Esempio n. 12
0
def eprints():
    """Print accelerator-physics eprint counts per arXiv number prefix.

    For every two-character prefix '01' to '19' the HEP collection is
    searched for ``physics.acc-ph`` and ``acc-phys`` eprints. One line is
    printed with the prefix, the number of hits, the number of hits that
    are also in the Fermilab collection and the corresponding percentage.
    The grand total of hits is printed at the end.
    """
    total = 0
    fermilab = get_collection_reclist('Fermilab')
    print('{0:4s} {1:3s} {2:3s} {3:3s}'.format('Date', 'All', 'FNA', '%'))
    date_range = range(1, 20)
    for yymm in date_range:
        # Turn the integers into zero-padded two-character prefixes.
        yymm = str(yymm)
        if len(yymm) == 1:
            yymm = '0' + yymm
        search = '037__c:physics.acc-ph 037__a:"arXiv:' + yymm + '*"'
        x = perform_request_search(p=search, cc='HEP')
        search = '037__c:acc-phys 037__a:"acc-phys/' + yymm + '*"'
        y = perform_request_search(p=search, cc='HEP')
        x_f = intbitset(x) & fermilab
        y_f = intbitset(y) & fermilab
        length = len(x) + len(y)
        length_f = len(x_f) + len(y_f)
        try:
            ratio = float(length_f) / float(length) * 100.0
        except ZeroDivisionError:
            # No hit at all for this prefix.
            ratio = 0
        print('{0:4s} {1:3d} {2:3d} {3:3f}'.format(yymm, length, length_f,
                                                   ratio))
        total += length
    print("Total = %s" % total)
Esempio n. 13
0
def fetch_concerned_records(name):
    """Return the (record id, date) rows the task ``name`` has to process.

    ``fetch_last_updated(name)`` provides the last record id and last date.
    Depending on the task options the rows are:
      - ``new``: ``bibrec`` rows with a creation date not older than the
        last date and an id greater than the last record id;
      - ``modified``: the same, based on the modification date;
      - otherwise: the ids requested through the ``recids`` option plus all
        records of the collections listed in ``collections``, ordered by
        id, with NULL as second column.

    @param name: identifier handed to fetch_last_updated()
    @return: the rows returned by run_sql(), or an empty list when no
        record was requested
    """
    task_update_progress("Fetching record ids")

    last_recid, last_date = fetch_last_updated(name)

    if task_get_option('new'):
        # Fetch all records inserted since last run
        sql = "SELECT `id`, `creation_date` FROM `bibrec` " \
            "WHERE `creation_date` >= %s " \
            "AND `id` > %s " \
            "ORDER BY `creation_date`"
        records = run_sql(sql, (last_date.isoformat(), last_recid))
    elif task_get_option('modified'):
        # Fetch all records modified since last run
        sql = "SELECT `id`, `modification_date` FROM `bibrec` " \
            "WHERE `modification_date` >= %s " \
            "AND `id` > %s " \
            "ORDER BY `modification_date`"
        records = run_sql(sql, (last_date.isoformat(), last_recid))
    else:
        given_recids = task_get_option('recids')
        for collection in task_get_option('collections'):
            # update() merges every record id of the collection; add()
            # would try to insert the whole reclist as a single element.
            given_recids.update(get_collection_reclist(collection))

        if given_recids:
            # One SQL placeholder per record id.
            format_strings = ','.join(['%s'] * len(given_recids))
            records = run_sql("SELECT `id`, NULL FROM `bibrec` " \
                "WHERE `id` IN (%s) ORDER BY `id`" % format_strings,
                    list(given_recids))
        else:
            records = []

    task_update_progress("Done fetching record ids")

    return records
Esempio n. 14
0
def fetch_records_missing_arxiv_fulltext():
    """
    Return the ids of the records that should have an arXiv fulltext but
    lack one: HEP records matching '035__9:"arXiv" - 980:DELETED', minus
    the records reported by fetch_records_with_arxiv_fulltext().
    """
    arxiv_recids = search_pattern(p='035__9:"arXiv" - 980:DELETED')
    hep_arxiv_recids = arxiv_recids & get_collection_reclist('HEP')
    return hep_arxiv_recids - fetch_records_with_arxiv_fulltext()
Esempio n. 15
0
def get_all_public_records(collections):
    """ Return the records of ``bibrec`` that belong to the given collections.

    @param collections: names of the collections to take records from
    @return: list of (recid, last_modification) tuples
    """
    wanted_recids = intbitset()
    for collection_name in collections:
        wanted_recids += get_collection_reclist(collection_name)

    rows = run_sql('SELECT id, modification_date FROM bibrec')

    # Keep only the rows whose id is part of one of the collections.
    public_records = []
    for recid, lastmod in rows:
        if recid in wanted_recids:
            public_records.append((recid, lastmod))
    return public_records
Esempio n. 16
0
def create_update_jobs_by_collection(batch_template_file, collection, job_directory=CFG_BIBENCODE_DAEMON_DIR_NEWJOBS):
    """ Create the job description files needed to update a whole collection.

    The records of the collection are looked up and handed over to
    create_update_jobs_by_recids(), whose result is returned.

    @param batch_template_file: fullpath to the template for the update
    @type batch_template_file: string
    @param collection: name of the collection that should be updated
    @type collection: string
    @param job_directory: fullpath to the directory storing the job files
    @type job_directory: string
    """
    return create_update_jobs_by_recids(
        get_collection_reclist(collection),
        batch_template_file,
        job_directory)
def get_all_public_records(collections):
    """ Return the records of ``bibrec`` that belong to the given collections.

    @param collections: names of the collections to take records from
    @return: list of (recid, last_modification) tuples
    """
    collection_recids = intbitset()
    for name in collections:
        collection_recids += get_collection_reclist(name)

    selected = []
    for row in run_sql('SELECT id, modification_date FROM bibrec'):
        recid, lastmod = row
        # Rows of records outside the requested collections are dropped.
        if recid not in collection_recids:
            continue
        selected.append((recid, lastmod))
    return selected
Esempio n. 18
0
def bst_dump_records():
    """Dump every exported collection to ``<CFG_WEBDIR>/dumps``.

    For each collection in CFG_EXPORTED_COLLECTIONS a gzipped XML file
    with all its records in 'xme' format and an MD5 file are written under
    hidden (dot-prefixed) names and renamed to their final names once
    complete. An HTML index linking to all dumps is produced the same way.

    Files are closed even when an error interrupts the dump, and the MD5
    file is closed explicitly before it is renamed.

    @raise OSError: if the dumps directory cannot be created
    """
    dumps_dir = os.path.join(CFG_WEBDIR, 'dumps')
    try:
        os.makedirs(dumps_dir)
    except OSError:
        # An already existing directory is fine; anything else is an error.
        if not os.path.isdir(dumps_dir):
            raise
    index_tmp_path = os.path.join(dumps_dir, '.inspire-dump.html')
    with open(index_tmp_path, "w") as html_index:
        print >> html_index, "<html><head><title>INSPIRE Dump</title></head><body><ul>"
        for collection in CFG_EXPORTED_COLLECTIONS:
            task_update_progress(collection)
            print >> html_index, """
<li><a href="%(prefix)s/dumps/%(collection)s-records.xml.gz">%(collection)s</a>
(<a href="%(prefix)s/dumps/%(collection)s-records.xml.gz.md5">MD5</a>): %(date)s</li>""" % {
                'prefix': CFG_SITE_URL,
                'collection': collection,
                'date': time.ctime()
            }
            write_message("Preparing %s-records.xml.gz" % collection)
            output_path = os.path.join(dumps_dir,
                                       '.%s-records.xml.gz' % collection)
            output = gzip.open(output_path, "w")
            try:
                print >> output, "<collection>"
                reclist = get_collection_reclist(collection)
                tot = len(reclist)
                time_estimator = get_time_estimator(tot)
                for i, recid in enumerate(reclist):
                    with run_ro_on_slave_db():
                        print >> output, format_record(recid, 'xme', user_info={})[0]
                    time_estimation = time_estimator()[1]
                    # Report progress and let the task sleep every 100 records.
                    if (i + 1) % 100 == 0:
                        task_update_progress(
                            "%s %s (%s%%) -> %s" %
                            (collection, recid, (i + 1) * 100 / tot,
                             time.strftime("%Y-%m-%d %H:%M:%S",
                                           time.localtime(time_estimation))))
                        task_sleep_now_if_required()
                print >> output, "</collection>"
            finally:
                output.close()
            write_message("Computing checksum")
            with open(output_path + '.md5', "w") as md5_file:
                print >> md5_file, calculate_md5(output_path)
            # Publish the finished files under their final names.
            os.rename(
                output_path,
                os.path.join(dumps_dir, '%s-records.xml.gz' % collection))
            os.rename(
                output_path + '.md5',
                os.path.join(dumps_dir, '%s-records.xml.gz.md5' % collection))
            write_message("DONE")
        print >> html_index, "</ul></body></html>"
    os.rename(index_tmp_path, os.path.join(dumps_dir, 'inspire-dump.html'))
Esempio n. 19
0
def get_compliance_values():
    """Fill ``compliance_check_values`` from the 591 fields of the records.

    For every record of the 'SCOAP3 Repository' collection that has 591
    fields, the value of the first subfield of each field is read as
    "<key>:<integer>" and stored as ``{key.lower(): integer}`` under the
    record id in the module-level ``compliance_check_values`` mapping.

    All 591 fields of a record are read, however many there are; a fixed
    count of three raised IndexError on records with fewer fields.

    @raise ValueError: if the text after the colon is not an integer
    """
    reclist = get_collection_reclist('SCOAP3 Repository')
    for recid in reclist:
        tmpdic = {}
        rec = get_record(recid)
        if '591' in rec:
            for field in rec['591']:
                # field[0] is the subfield list; take the value of the
                # first subfield.
                str_val = field[0][0][1]
                colon = str_val.find(':')
                key = str_val[:colon].lower()
                val = int(str_val[colon + 1:])
                tmpdic[key] = val
            compliance_check_values[recid] = tmpdic
Esempio n. 20
0
def get_compliance_values():
    """Fill ``compliance_check_values`` from the 591 fields of the records.

    For every record of the 'SCOAP3 Repository' collection that has 591
    fields, the value of the first subfield of each field is read as
    "<key>:<integer>" and stored as ``{key.lower(): integer}`` under the
    record id in the module-level ``compliance_check_values`` mapping.
    """
    for recid in get_collection_reclist('SCOAP3 Repository'):
        record = get_record(recid)
        if '591' not in record:
            continue
        values = {}
        for field in record['591']:
            text = field[0][0][1]
            # Split at the first colon: key on the left, integer on the right.
            colon = text.find(':')
            values[text[:colon].lower()] = int(text[colon + 1:])
        compliance_check_values[recid] = values
Esempio n. 21
0
def unlinked(req, orcidonly=False):
    """
    Return a page listing the "useful" person profiles that are not linked.

    A person id is listed when it has an ``extid:`` tag in aidPERSONIDDATA
    or, unless ``orcidonly`` is set, a ``flag=2`` row in aidPERSONIDPAPERS;
    has no ``extid:INSPIREID`` tag; and its canonical name (or the person
    id itself when there is no canonical name) is not among the ``035__a``
    values of the HepNames collection (compared lower-cased and stripped).

    @param req: request object passed on to page()
    @param orcidonly: when true, ignore the aidPERSONIDPAPERS rows and use
        the "Unlinked BAIs with ORCID" page title
    @return: the result of page() with the HTML list as body
    """
    from invenio.dbquery import run_sql
    from invenio.search_engine import get_fieldvalues, get_collection_reclist
    useful_personids1 = intbitset(
        run_sql(
            "SELECT distinct personid FROM aidPERSONIDDATA WHERE tag LIKE 'extid:%'"
        ))
    useful_personids2 = intbitset()
    if not orcidonly:
        useful_personids2 = intbitset(
            run_sql(
                "SELECT distinct personid from aidPERSONIDPAPERS where flag=2")
        )
    linked_personids = intbitset(
        run_sql(
            "SELECT personid FROM aidPERSONIDDATA WHERE tag='extid:INSPIREID'")
    )
    names = dict(
        run_sql(
            "SELECT personid, data FROM aidPERSONIDDATA WHERE tag='canonical_name'"
        ))
    # A set makes the membership test in the loop below constant-time.
    matched_names = set(
        name.lower().strip() for name in get_fieldvalues(
            get_collection_reclist('HepNames'), '035__a'))
    personid_to_match = (useful_personids1
                         | useful_personids2) - linked_personids

    body = ['<ol>']
    for personid in personid_to_match:
        name = names.get(personid, str(personid))
        if name.lower().strip() in matched_names:
            continue
        body.append(
            '<li><a href="%(siteurl)s/author/profile/%(bai)s" target="_blank">%(bai)s</a></li>'
            % {
                'siteurl': escape(CFG_SITE_SECURE_URL, True),
                'bai': escape(name, True)
            })
    body.append('</ol>')
    body = '\n'.join(body)

    if orcidonly:
        title = "Unlinked BAIs with ORCID"
    else:
        title = "Unlinked useful BAIs"

    return page(req=req, body=body, title=title)
Esempio n. 22
0
def get_all_recids():
    """Return the union of the record IDs of the site's relevant collections.

    The collections taken into account depend on the site flavour
    (CFG_INSPIRE_SITE or CFG_CERN_SITE); for any other site an empty
    intbitset is returned.
    """
    if CFG_INSPIRE_SITE:
        collection_names = [
            CFG_SITE_NAME,
            "Conferences",
            "For CDS",
            "CDS Hidden",
        ]
    elif CFG_CERN_SITE:
        collection_names = [
            CFG_SITE_NAME,
            "CERN Articles & Preprints",
            "CERN Series",
            "CERN Departments",
            "CERN Experiments",
            "CERN R&D Projects",
        ]
        # Disabled filters, kept for reference.
        # Exclude all records that are not relevant for CERN/CDS:
        # all_recids = all_recids & search_pattern(p='690c:CERN or 595:cds')
        # Exclude all records with an existing INSPIRE ID:
        # all_recids = all_recids - search_pattern(p='035:INSPIRE')
    else:
        return intbitset()

    # Fold the reclists together left to right with "|", which builds a new
    # set at every step instead of modifying a reclist in place.
    all_recids = get_collection_reclist(collection_names[0])
    for collection_name in collection_names[1:]:
        all_recids = all_recids | get_collection_reclist(collection_name)
    return all_recids
def get_all_public_records(collections):
    """ Return the non-restricted records of the given collections.

    Records reported by get_all_restricted_recids() are left out, and
    modification dates older than get_minimum_timestamp() are replaced by
    that minimum.

    @param collections: names of the collections to take records from
    @return: list of (recid, last_modification) tuples
    """
    restricted_recids = get_all_restricted_recids()
    oldest_timestamp = get_minimum_timestamp()

    collection_recids = intbitset()
    for collection_name in collections:
        collection_recids += get_collection_reclist(collection_name)
    public_recids = collection_recids.difference(restricted_recids)

    public_records = []
    for recid, lastmod in run_sql('SELECT id, modification_date FROM bibrec'):
        if recid not in public_recids:
            continue
        public_records.append((recid, max(lastmod, oldest_timestamp)))
    return public_records
Esempio n. 24
0
def bst_dump_records():
    """Dump every exported collection to ``<CFG_WEBDIR>/dumps``.

    For each collection in CFG_EXPORTED_COLLECTIONS a gzipped XML file
    with all its records in "xme" format and an MD5 file are written under
    hidden (dot-prefixed) names and renamed to their final names once
    complete. An HTML index linking to all dumps is produced the same way.

    Files are closed even when an error interrupts the dump, and the MD5
    file is closed explicitly before it is renamed.

    @raise OSError: if the dumps directory cannot be created
    """
    dumps_dir = os.path.join(CFG_WEBDIR, "dumps")
    try:
        os.makedirs(dumps_dir)
    except OSError:
        # An already existing directory is fine; anything else is an error.
        if not os.path.isdir(dumps_dir):
            raise
    index_tmp_path = os.path.join(dumps_dir, ".inspire-dump.html")
    with open(index_tmp_path, "w") as html_index:
        print >> html_index, "<html><head><title>INSPIRE Dump</title></head><body><ul>"
        for collection in CFG_EXPORTED_COLLECTIONS:
            task_update_progress(collection)
            print >> html_index, """
<li><a href="%(prefix)s/dumps/%(collection)s-records.xml.gz">%(collection)s</a>
(<a href="%(prefix)s/dumps/%(collection)s-records.xml.gz.md5">MD5</a>): %(date)s</li>""" % {
                "prefix": CFG_SITE_URL,
                "collection": collection,
                "date": time.ctime(),
            }
            write_message("Preparing %s-records.xml.gz" % collection)
            output_path = os.path.join(dumps_dir, ".%s-records.xml.gz" % collection)
            output = gzip.open(output_path, "w")
            try:
                print >> output, "<collection>"
                reclist = get_collection_reclist(collection)
                tot = len(reclist)
                time_estimator = get_time_estimator(tot)
                for i, recid in enumerate(reclist):
                    print >> output, format_record(recid, "xme", user_info={})[0]
                    time_estimation = time_estimator()[1]
                    # Report progress and let the task sleep every 100 records.
                    if (i + 1) % 100 == 0:
                        task_update_progress(
                            "%s %s (%s%%) -> %s"
                            % (
                                collection,
                                recid,
                                (i + 1) * 100 / tot,
                                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time_estimation)),
                            )
                        )
                        task_sleep_now_if_required()
                print >> output, "</collection>"
            finally:
                output.close()
            write_message("Computing checksum")
            with open(output_path + ".md5", "w") as md5_file:
                print >> md5_file, calculate_md5(output_path)
            # Publish the finished files under their final names.
            os.rename(output_path, os.path.join(dumps_dir, "%s-records.xml.gz" % collection))
            os.rename(output_path + ".md5", os.path.join(dumps_dir, "%s-records.xml.gz.md5" % collection))
            write_message("DONE")
        print >> html_index, "</ul></body></html>"
    os.rename(index_tmp_path, os.path.join(dumps_dir, "inspire-dump.html"))
Esempio n. 25
0
def create_update_jobs_by_collection(
        batch_template_file,
        collection,
        job_directory=CFG_BIBENCODE_DAEMON_DIR_NEWJOBS):
    """ Create the job description files needed to update a whole collection.

    The records of the collection are looked up and handed over to
    create_update_jobs_by_recids(), whose result is returned.

    @param batch_template_file: fullpath to the template for the update
    @type batch_template_file: string
    @param collection: name of the collection that should be updated
    @type collection: string
    @param job_directory: fullpath to the directory storing the job files
    @type job_directory: string
    """
    collection_recids = get_collection_reclist(collection)
    jobs = create_update_jobs_by_recids(
        collection_recids, batch_template_file, job_directory)
    return jobs
def get_all_public_records_modified_last_month(collections):
    """ Return the non-restricted records of the given collections that
    were modified during the last 31 days.

    Records reported by get_all_restricted_recids() are left out.

    @param collections: names of the collections to take records from
    @return: list of (recid, last_modification) tuples
    """
    restricted_recids = get_all_restricted_recids()
    modified_since = datetime.date.today() - datetime.timedelta(days=31)

    collection_recids = intbitset()
    for collection_name in collections:
        collection_recids += get_collection_reclist(collection_name)
    public_recids = collection_recids.difference(restricted_recids)

    rows = run_sql(
        'SELECT id, modification_date FROM bibrec WHERE modification_date > %s',
        (modified_since,))

    recent_records = []
    for recid, lastmod in rows:
        if recid in public_recids:
            recent_records.append((recid, lastmod))
    return recent_records
Esempio n. 27
0
 def _append_recid_collection_list(collection, current_recids):
     """Updated list of recids with new recids from collection

     The list is extended in place; record ids it already contains are
     not appended again.

     @param collection: (string) collection name to use to obtain record
     ids
     @param current_recids: (list) list of current record ids
     which have already been obtained from previous collection or
     recid flags
     @return: (list) current record ids with newly appended recids
     from input collection
     """
     records = get_collection_reclist(collection)
     # Membership is tested against a set instead of the growing list, so
     # the merge no longer takes quadratic time.
     seen = set(current_recids)
     for r in records:
         if r not in seen:
             seen.add(r)
             current_recids.append(r)
     return current_recids
Esempio n. 28
0
 def _append_recid_collection_list(collection, current_recids):
     """Updated list of recids with new recids from collection

     The list is extended in place; record ids it already contains are
     not appended again.

     @param collection: (string) collection name to use to obtain record
     ids
     @param current_recids: (list) list of current record ids
     which have already been obtained from previous collection or
     recid flags
     @return: (list) current record ids with newly appended recids
     from input collection
     """
     records = get_collection_reclist(collection)
     # Membership is tested against a set instead of the growing list, so
     # the merge no longer takes quadratic time.
     seen = set(current_recids)
     for r in records:
         if r not in seen:
             seen.add(r)
             current_recids.append(r)
     return current_recids
Esempio n. 29
0
def get_all_public_records(collections):
    """ Return the non-restricted records of the given collections.

    Records reported by get_all_restricted_recids() are left out, and
    modification dates older than get_minimum_timestamp() are replaced by
    that minimum.

    @param collections: names of the collections to take records from
    @return: list of (recid, last_modification) tuples
    """
    restricted = get_all_restricted_recids()
    floor_timestamp = get_minimum_timestamp()

    in_collections = intbitset()
    for name in collections:
        in_collections += get_collection_reclist(name)
    allowed = in_collections.difference(restricted)

    rows = run_sql('SELECT id, modification_date FROM bibrec')
    result = []
    for recid, lastmod in rows:
        if recid in allowed:
            # Never report a date older than the minimum timestamp.
            result.append((recid, max(lastmod, floor_timestamp)))
    return result
Esempio n. 30
0
def get_all_public_records_modified_last_month(collections):
    """ Return the non-restricted records of the given collections that
    were modified during the last 31 days.

    Records reported by get_all_restricted_recids() are left out.

    @param collections: names of the collections to take records from
    @return: list of (recid, last_modification) tuples
    """
    restricted = get_all_restricted_recids()
    today = datetime.date.today()
    cutoff = today - datetime.timedelta(days=31)

    in_collections = intbitset()
    for name in collections:
        in_collections += get_collection_reclist(name)
    allowed = in_collections.difference(restricted)

    sql = 'SELECT id, modification_date FROM bibrec WHERE modification_date > %s'
    result = []
    for recid, lastmod in run_sql(sql, (cutoff, )):
        if recid not in allowed:
            continue
        result.append((recid, lastmod))
    return result
Esempio n. 31
0
def main():
    """Print an HTML listing of the records of every journal in CFG_JOURNALS.

    For each journal a heading is printed, followed either by a
    "None yet." paragraph (empty collection) or by a list with one DOI link
    and title per record.
    """
    for journal in CFG_JOURNALS:
        name = get_coll_i18nname(journal)
        recids = get_collection_reclist(journal)
        print("<h2>%s</h2>" % escape(name))
        if not recids:
            print("<p>None yet.</p>")
            continue
        print("<p><ul>")
        for recid in recids:
            record = get_record(recid)
            raw_title = record_get_field_value(record, '245', code='a')
            title = remove_html_markup(raw_title, remove_escaped_chars_p=False).strip()
            doi = record_get_field_value(record, '024', '7', code='a')
            item = '<li><a href="http://dx.doi.org/%s" target="_blank">%s</a>: %s</li>' % (escape(doi, True), escape(doi), title)
            print(item)
        print("</ul></p>")
Esempio n. 32
0
def find_book(citation_element):
    """Look up the 'Books' record matching a citation.

    Records whose title matches citation_element['title'] are restricted to
    the 'Books' collection.  A single remaining candidate is returned
    directly (as the intbitset of candidates).  Otherwise, when the citation
    carries a 'year', the first candidate having that value in 269__c is
    returned as a one-element list.  An empty list means no match.
    """
    book_ids = get_collection_reclist('Books')
    title = citation_element['title']
    candidates = intbitset(get_recids_matching_query(title, 'title'))
    candidates &= book_ids
    if len(candidates) == 1:
        return candidates

    if 'year' not in citation_element:
        return []

    # Several (or no) candidates: disambiguate on the publication year.
    for recid in candidates:
        if any(year == citation_element['year']
               for year in get_fieldvalues(recid, '269__c')):
            return [recid]
    return []
Esempio n. 33
0
def get_all_recids():
    """Return the record IDs relevant for the configured site.

    On an INSPIRE site this is the site collection; on a CERN site it is
    the union of the site collection and the listed CERN collections; on
    any other site it is an empty intbitset.
    """
    if CFG_INSPIRE_SITE:
        return get_collection_reclist(CFG_SITE_NAME)
    if not CFG_CERN_SITE:
        return intbitset()

    cern_collections = (
        "CERN Articles & Preprints",
        "CERN Series",
        "CERN Departments",
        "CERN Experiments",
        "CERN R&D Projects",
    )
    all_recids = get_collection_reclist(CFG_SITE_NAME)
    for name in cern_collections:
        # ``|`` builds a new set, so no looked-up set is modified in place.
        all_recids = all_recids | get_collection_reclist(name)
    return all_recids
Esempio n. 34
0
    def lazy_parser(collection, left_tags, right_tags):
        """Lazily yield (left_value, right_value) pairs for a collection.

        For every record of the collection, each distinct value found in
        left_tags is paired with each distinct value found in right_tags.
        Records for which reading the left tags raises IndexError are
        skipped.
        """
        for recid in get_collection_reclist(collection):
            try:
                # Key tag
                # e.g. for journals database: 711__a
                keys = get_tag_values(recid, left_tags)
            except IndexError:
                continue

            # Value tags
            # e.g. for journals database: 130__a, 730__a and 030__a
            values = get_tag_values(recid, right_tags)

            for key in set(keys):
                for value in set(values):
                    yield key, value
Esempio n. 35
0
    def lazy_parser(collection, left_tags, right_tags):
        """Lazily yield (left_value, right_value) pairs for a collection.

        For every record of the collection, each distinct value found in
        left_tags is paired with each distinct value found in right_tags.
        Records for which reading the left tags raises IndexError are
        skipped.
        """
        for recid in get_collection_reclist(collection):
            try:
                # Key tag
                # e.g. for journals database: 130__a, 730__a and 030__a
                keys = get_tag_values(recid, left_tags)
            except IndexError:
                continue

            # Value tags
            # e.g. for journals database: 711__a
            values = get_tag_values(recid, right_tags)

            for key in set(keys):
                for value in set(values):
                    yield key, value
Esempio n. 36
0
def find_book(citation_element):
    """Look up the 'Books' record matching a citation.

    Records whose title matches citation_element['title'] are restricted to
    the 'Books' collection.  A single remaining candidate is returned
    directly (as the intbitset of candidates).  Otherwise, when the citation
    carries a 'year', the first candidate having that value in 269__c is
    returned as a one-element list.  An empty list means no match.
    """
    book_ids = get_collection_reclist('Books')
    title = citation_element['title']
    candidates = intbitset(get_recids_matching_query(title, 'title'))
    candidates &= book_ids
    if len(candidates) == 1:
        return candidates

    if 'year' not in citation_element:
        return []

    # Several (or no) candidates: disambiguate on the publication year.
    for recid in candidates:
        if any(year == citation_element['year']
               for year in get_fieldvalues(recid, '269__c')):
            return [recid]
    return []
def build_hepnames_knowledge():
    """Collect the external identifiers of every HepNames record.

    Each 035 field contributes one identifier: subfield 9 gives its type
    (upper-cased) and subfield a its value.  Fields lacking either part are
    ignored; BAI, INSPIRE, ORCID and KAKEN values failing their validator
    are reported through write_message and ignored.

    @return: one dict per record, holding 'recid', one upper-cased value
        per identifier type and, for BAI, the untouched value under
        'ORIGINAL_BAI'
    """
    knowledge = {}
    for recid in get_collection_reclist('HepNames'):
        ids = {'recid': recid}
        record = get_record(recid)
        for field in record_get_field_instances(record, '035'):
            id_type = None
            id_value = None
            for code, value in field_get_subfield_instances(field):
                code = code.strip()
                value = value.strip()
                if code == '9':
                    declared_type = value.upper()
                    if id_type and id_type != declared_type:
                        # Two different types inside the same field
                        write_message("ERROR: http://inspirehep.net/record/{recid} has invalid IDs".format(recid=recid), stream=sys.stderr)
                        break
                    id_type = declared_type
                elif code == 'a':
                    if id_value and id_value != value:
                        # Two different values inside the same field
                        write_message("ERROR: http://inspirehep.net/record/{recid} has invalid IDs".format(recid=recid), stream=sys.stderr)
                        break
                    id_value = value

            if not (id_type and id_value):
                # Incomplete IDs
                continue

            if id_type == 'BAI' and not valid_bai(id_value):
                write_message("ERROR: http://inspirehep.net/record/{recid} has invalid BAI: {value}".format(recid=recid, value=id_value), stream=sys.stderr)
                continue
            if id_type == 'INSPIRE' and not valid_inspire(id_value):
                write_message("ERROR: http://inspirehep.net/record/{recid} has invalid INSPIRE: {value}".format(recid=recid, value=id_value), stream=sys.stderr)
                continue
            if id_type == 'ORCID' and not valid_orcid(id_value):
                write_message("ERROR: http://inspirehep.net/record/{recid} has invalid ORCID: {value}".format(recid=recid, value=id_value), stream=sys.stderr)
                continue
            if id_type == 'KAKEN' and not valid_kaken(id_value):
                write_message("ERROR: http://inspirehep.net/record/{recid} has invalid KAKEN: {value}".format(recid=recid, value=id_value), stream=sys.stderr)
                continue

            ids[id_type] = id_value.upper()
            if id_type == 'BAI':
                ids['ORIGINAL_BAI'] = id_value
        knowledge[recid] = ids
    return knowledge.values()
Esempio n. 38
0
def task_parse_options(key, value, opts, args):   # pylint: disable-msg=W0613
    """Record one command-line option as a task option.

    Must be defined for bibtask to create a task.

    @param key: option flag, e.g. '-c' or '--collections'
    @param value: raw option value
    @param opts: unused
    @param args: standalone arguments; any content is an error
    @return: True
    @raise StandardError: when standalone arguments are present
    """
    if args:
        # There should be no standalone arguments for any bibcatalog job
        # This will catch args before the job is shipped to Bibsched
        raise StandardError("Error: Unrecognised argument '%s'." % args[0])

    def option_set(name):
        """Return the set stored under task option `name`, creating and
        registering a new one when the option is unset or empty."""
        stored = task_get_option(name)
        if not stored:
            stored = set()
            task_set_option(name, stored)
        return stored

    if key in ('-a', '--new'):
        task_set_option('new', True)
    elif key in ('-m', '--modified'):
        task_set_option('modified', True)
    elif key in ('-c', '--collections'):
        # The option accumulates the record IDs of the named collections.
        collection_recids = option_set('collections')
        for name in value.split(","):
            collection_recids.update(get_collection_reclist(name))
    elif key in ('-i', '--recids'):
        option_set('recids').update(split_ids(value))
    elif key in ('--tickets',):
        option_set('tickets').update(
            item.strip() for item in value.split(','))
    elif key in ('--all-tickets',):
        task_set_option('all-tickets', True)
    elif key in ('-q', '--query'):
        option_set('query').add(value)
    elif key in ('-r', '--reportnumbers'):
        option_set('reportnumbers').add(value)
    return True
def get_all_recids():
    """Return the record IDs relevant for the configured site.

    On an INSPIRE site this is the site collection; on a CERN site it is
    the union of the site collection and the listed CERN collections; on
    any other site it is an empty intbitset.
    """
    if CFG_INSPIRE_SITE:
        return get_collection_reclist(CFG_SITE_NAME)
    if not CFG_CERN_SITE:
        return intbitset()

    cern_collections = (
        "CERN Articles & Preprints",
        "CERN Series",
        "CERN Departments",
        "CERN Experiments",
        "CERN R&D Projects",
    )
    all_recids = get_collection_reclist(CFG_SITE_NAME)
    for name in cern_collections:
        # ``|`` builds a new set, so no looked-up set is modified in place.
        all_recids = all_recids | get_collection_reclist(name)
    return all_recids
Esempio n. 40
0
def task_parse_options(key, value, opts, args):  # pylint: disable-msg=W0613
    """Record one command-line option as a task option.

    Must be defined for bibtask to create a task.

    @param key: option flag, e.g. '-c' or '--collections'
    @param value: raw option value
    @param opts: unused
    @param args: standalone arguments; any content is an error
    @return: True
    @raise StandardError: when standalone arguments are present
    """
    if args:
        # There should be no standalone arguments for any bibcatalog job
        # This will catch args before the job is shipped to Bibsched
        raise StandardError("Error: Unrecognised argument '%s'." % args[0])

    def option_set(name):
        """Return the set stored under task option `name`, creating and
        registering a new one when the option is unset or empty."""
        stored = task_get_option(name)
        if not stored:
            stored = set()
            task_set_option(name, stored)
        return stored

    if key in ('-a', '--new'):
        task_set_option('new', True)
    elif key in ('-m', '--modified'):
        task_set_option('modified', True)
    elif key in ('-c', '--collections'):
        # The option accumulates the record IDs of the named collections.
        collection_recids = option_set('collections')
        for name in value.split(","):
            collection_recids.update(get_collection_reclist(name))
    elif key in ('-i', '--recids'):
        option_set('recids').update(split_ids(value))
    elif key in ('--tickets', ):
        option_set('tickets').update(
            item.strip() for item in value.split(','))
    elif key in ('--all-tickets', ):
        task_set_option('all-tickets', True)
    elif key in ('-q', '--query'):
        option_set('query').add(value)
    elif key in ('-r', '--reportnumbers'):
        option_set('reportnumbers').add(value)
    return True
Esempio n. 41
0
def late(req):
    """Write the "Late journals" HTML report to the request object.

    For every journal in CFG_JOURNALS a table is written with one row per
    record, showing its DOI, title, DOI registration date and record
    creation date.  The row colour reflects how the record creation date
    compares with the DOI registration date:

      - "#66FF00": the record was created before the DOI was registered
      - "#FF6600": the record was created less than one day after
      - "#FF0000": the record was created one day or more after
      - "#eee":    no registration date is known for the DOI

    @param req: request object; its content type is set to text/html and
        the page is written to it
    """
    req.content_type = "text/html"
    print >> req, pageheaderonly("Late journals", req=req)

    th = ("<tr><th>DOI</th><th>Title</th><th>DOI registration</th>"
          "<th>Arrival in SCOAP3</th></tr>")
    # The DOI link is closed with </a>; leaving the anchor open turned the
    # remainder of the row into link content.
    tr = ("<tr style='background-color: {0};'><td>"
          "<a href='http://dx.doi.org/{1}' target='_blank'>{2}</a></td>"
          "<td>{3}</td><td>{4}</td><td>{5}</td></tr>")

    sql_bibrec = "SELECT creation_date FROM bibrec WHERE id=%s"
    sql_doi = "SELECT creation_date FROM doi WHERE doi=%s"

    for journal in CFG_JOURNALS:
        print >> req, "<h2>%s</h2>" % escape(get_coll_i18nname(journal))
        results = get_collection_reclist(journal)
        print >> req, "<table>"
        print >> req, th
        for recid in results:
            creation_date = run_sql(sql_bibrec, (recid, ))[0][0]
            record = get_record(recid)
            doi = record_get_field_value(record, '024', '7', code='a')
            title = record_get_field_value(record, '245', code='a')
            doi_date = run_sql(sql_doi, (doi, ))
            background = "#eee"
            if doi_date:
                doi_date = doi_date[0][0]
                delay_days = (creation_date - doi_date).days
                if delay_days < 0:
                    background = "#66FF00"
                elif delay_days < 1:
                    background = "#FF6600"
                else:
                    background = "#FF0000"
            else:
                doi_date = ''
            # NOTE(review): the title is written without HTML escaping,
            # presumably because titles may carry markup - confirm.
            print >> req, tr.format(background,
                                    escape(doi, True),
                                    escape(doi),
                                    title,
                                    doi_date,
                                    creation_date)
        print >> req, "</table>"
Esempio n. 42
0
def parse_pdg_element(element, hep_collection=get_collection_reclist('HEP')):
    """Validate one element of the PDG update file and extract its data.

    Params: dict element - the element to be parsed; it must have exactly
                the keys 'inspireId' and 'pdgIdList'
            intbitset hep_collection - all recids in HEP, used for caching
                (the default is looked up once, when the function is defined)
    Return: ParseResult - Status code: Invalid for an unexpected set of keys,
                Missing when the recid is not in hep_collection, else Success
            int recid - record ID (None unless Success)
            list pdg_values - pdg_values (None unless Success)
    """
    expected_keys = set(('inspireId', 'pdgIdList'))
    if set(element.keys()) != expected_keys:
        return ParseResult.Invalid, None, None

    recid = int(element['inspireId'])
    if recid in hep_collection:
        return ParseResult.Success, recid, element['pdgIdList']
    return ParseResult.Missing, None, None
Esempio n. 43
0
def parse_pdg_element(element, hep_collection=get_collection_reclist('HEP')):
    """Validate one element of the PDG update file and extract its data.

    Params: dict element - the element to be parsed; it must have exactly
                the keys 'inspireId' and 'pdgIdList'
            intbitset hep_collection - all recids in HEP, used for caching
                (the default is looked up once, when the function is defined)
    Return: ParseResult - Status code: Invalid for an unexpected set of keys,
                Missing when the recid is not in hep_collection, else Success
            int recid - record ID (None unless Success)
            list pdg_values - pdg_values (None unless Success)
    """
    expected_keys = set(('inspireId', 'pdgIdList'))
    if set(element.keys()) != expected_keys:
        return ParseResult.Invalid, None, None

    recid = int(element['inspireId'])
    if recid in hep_collection:
        return ParseResult.Success, recid, element['pdgIdList']
    return ParseResult.Missing, None, None
Esempio n. 44
0
def bst_cnumcatchup():
    """Submit bibreformat (HB) tasks for records sharing a conference number
    with a recently modified conference or proceedings record.

    Conference numbers are read from 111__g of 'Conferences' records and
    from 773__w of records matching 980__a:Proceedings, restricted to
    records modified in the last 3 days; values of 3 characters or fewer
    are ignored.  Every record found by "find cnum <value>" is then
    submitted for reformatting, at most 500 record IDs per task.
    """
    rows = run_sql('select id from bibrec where ' +
                   'modification_date >' +
                   'DATE_SUB(CURDATE(), INTERVAL 3 DAY)')
    recently_modified = intbitset([row[0] for row in rows])

    conferences = intbitset(get_collection_reclist('Conferences')) \
        & recently_modified
    proceedings = intbitset(perform_request_search(p="980__a:Proceedings")) \
        & recently_modified

    cnums = []
    for recids, tag in ((conferences, '111__g'), (proceedings, '773__w')):
        for recid in recids:
            cnums.extend(value for value in get_fieldvalues(recid, tag)
                         if len(value) > 3)

    pending = intbitset()
    for cnum in cnums:
        pending += intbitset(perform_request_search(p="find cnum %s" % cnum))

    def submit(recids):
        """Queue one bibreformat task for the given record IDs."""
        task_low_level_submission('bibreformat',
                                  'bibreformat:bstcnumcatchup', '-o', 'HB',
                                  '-i',
                                  ','.join([str(recid) for recid in recids]))

    # Full chunks of 500 first, then whatever is left over.
    while len(pending) > 500:
        submit(pending[:500])
        pending = pending[500:]
    if pending:
        submit(pending)
Esempio n. 45
0
def bst_cnumcatchup():
    """Submit bibreformat (HB) tasks for records sharing a conference number
    with a recently modified conference or proceedings record.

    Conference numbers are read from 111__g of 'Conferences' records and
    from 773__w of records matching 980__a:Proceedings, restricted to
    records modified in the last 3 days; values of 3 characters or fewer
    are ignored.  Every record found by "find cnum <value>" is then
    submitted for reformatting, at most 500 record IDs per task.
    """
    rows = run_sql('select id from bibrec where ' +
                   'modification_date >' +
                   'DATE_SUB(CURDATE(), INTERVAL 3 DAY)')
    recently_modified = intbitset([row[0] for row in rows])

    conferences = intbitset(get_collection_reclist('Conferences')) \
        & recently_modified
    proceedings = intbitset(perform_request_search(p="980__a:Proceedings")) \
        & recently_modified

    cnums = []
    for recids, tag in ((conferences, '111__g'), (proceedings, '773__w')):
        for recid in recids:
            cnums.extend(value for value in get_fieldvalues(recid, tag)
                         if len(value) > 3)

    pending = intbitset()
    for cnum in cnums:
        pending += intbitset(perform_request_search(p="find cnum %s" % cnum))

    def submit(recids):
        """Queue one bibreformat task for the given record IDs."""
        task_low_level_submission('bibreformat',
                                  'bibreformat:bstcnumcatchup',
                                  '-o', 'HB', '-i',
                                  ','.join([str(recid) for recid in recids]))

    # Full chunks of 500 first, then whatever is left over.
    while len(pending) > 500:
        submit(pending[:500])
        pending = pending[500:]
    if pending:
        submit(pending)
Esempio n. 46
0
def bst_autocompletion_cache(collection_list=None):
    """
    Bibtasklet responsible for generating the subject and author lists used
    for the autocompletion suggestions.

    For every collection, the 200 most popular 'exactauthor' values and the
    most popular subject values per language (en, fr, es) are stored in an
    AutocompletionCache row, and progress is reported after each collection.

    @param collection_list: list of collection ids to cache.
                            If None, all the collections will be calculated.

    """

    task_update_progress("Started updating autocomplete cache")

    # MARC tag holding the subject for each supported language
    tag_dicc = {'en': '9051_a', 'fr': '9061_a', 'es': '9071_a'}

    # Identity test: only a missing argument triggers the full scan.
    if collection_list is None:
        res = run_sql("SELECT id FROM collection")
        collection_list = [row[0] for row in res]

    total = len(collection_list)
    task_update_progress("Done %s of %s" % (0, total))
    for done, collection in enumerate(collection_list, 1):
        recids = list(get_collection_reclist(get_collection_name_by_id(collection)))
        authors = get_most_popular_field_values(recids, get_field_tags('exactauthor'))[0:200]
        authors = [a[0] for a in authors]

        subjects = {}
        for ln in ['en', 'fr', 'es']:
            subject_tag = tag_dicc[ln]
            subjects[ln] = [s[0] for s in get_most_popular_field_values(recids, subject_tag)]

        ins = AutocompletionCache(id_collection=collection, authors=authors, subjects=subjects)
        db.session.merge(ins)
        db.session.flush()
        task_update_progress("Done %s of %s" % (done, total))

    db.session.close_all()
    task_update_progress("Finished updating autocomplete cache")
Esempio n. 47
0
 def lazy_parser(collection, left_tags, right_tags, volume_subfield):
     """Lazily yield (key, value) pairs for every record of a collection.

     Every non-empty value found in right_tags is first yielded mapped to
     itself.  Then, for each field matching one of left_tags that carries
     the tag's subfield, that subfield value is yielded mapped to the right
     value, suffixed with ';<volume>' when the field also carries
     volume_subfield.

     Tags are 6-character strings: 3-character field tag, two indicators
     and the subfield code.
     """
     for recid in get_collection_reclist(collection):
         record = get_record(recid)
         for right_tag in right_tags:
             right_values = record_get_field_values(
                 record, right_tag[:3], right_tag[3], right_tag[4],
                 right_tag[5])
             for right_value in right_values:
                 if not right_value:
                     continue  # Empty metadata
                 yield right_value, right_value
                 for left_tag in left_tags:
                     left_fields = record_get_field_instances(
                         record, left_tag[:3], left_tag[3], left_tag[4])
                     for left_field in left_fields:
                         subfields = dict(
                             field_get_subfield_instances(left_field))
                         if left_tag[5] not in subfields:
                             continue  # Empty field
                         if volume_subfield in subfields:
                             mapped = '%s;%s' % (right_value,
                                                 subfields[volume_subfield])
                         else:
                             mapped = right_value
                         yield subfields[left_tag[5]], mapped
Esempio n. 48
0
from invenio.search_engine import perform_request_search
from invenio.search_engine import get_all_field_values
from invenio.intbitset import intbitset

#from invenio.bibauthorid_dbinterface \
#     import _select_from_aidpersoniddata_where
from invenio.dbquery import run_sql



from hep_convert_email_to_id import find_inspire_id_from_record, \
                                    bad_id_check, \
                                    get_hepnames_anyid_from_recid

LETTER = None
# Record ID sets of three collections, looked up once when the module is
# loaded.
RECIDS_HEPN = get_collection_reclist('HepNames')
RECIDS_INST = get_collection_reclist('Institutions')
RECIDS_EXPT = get_collection_reclist('Experiments')

# Lower-cased names of the accepted identifier types.
# NOTE(review): presumably compared against 035__9 values - confirm.
GOOD_IDENTIFIERS = set([
    x.lower() for x in [
        'ARXIV', 'BAI', 'CERN', 'DESY', 'GoogleScholar', 'INSPIRE', 'JACOW',
        'KAKEN', 'ORCID', 'ResearcherID', 'SCOPUS', 'SLAC', 'Wikipedia'
    ]
])

# Base URL of the INSPIRE author profile management page.
BAI_URL = 'https://inspirehep.net/author/manage_profile/'


def bad_identifiers():
    """Looks for bad 035__9 fields"""
Esempio n. 49
0
""" Bibcheck plugin checking that reference info in 999C50 agrees with
    citation info in 999C5r and 999C5s
"""

import re
from collections import defaultdict, namedtuple

from invenio.intbitset import intbitset
from invenio.search_engine import (get_collection_reclist, search_pattern,
                                   search_unit)

# MARC tags of the reference subfields handled by this plugin, addressable
# by role: publication note, report number, DOI, cited record ID and
# curator flag.
Reftags = namedtuple('Reftags', 'pubnote repno DOI citedrecid curatorflag')
FIELDS = Reftags('999C5s', '999C5r', '999C5a', '999C50', '999C59')


# Record IDs of the HEP collection, looked up once when the module is loaded.
HEPRECS = get_collection_reclist('HEP')

# Text followed by a square-bracketed group; group 1 is the text before it.
CATEGORY = re.compile(ur'^(.*)\[[^\]]+\]')
# Leading "arXiv:" (any case, optional whitespace) followed by a non-digit.
ARXIVPREFIX = re.compile(ur'^(arXiv:(\s+)?)\D', re.I)


class Reference(object):
    """ container for various ref info """
    def __init__(self):
        self._fields = defaultdict(list)


    @staticmethod
    def normalize_repno(repno):
        """ cast repno into standard form """
Esempio n. 50
0
def _get_recids_foreach_ontology(recids=None, collections=None, taxonomy=None):
    """Returns an array containing hash objects containing the
    collection, its corresponding ontology and the records belonging to
    the given collection."""
    rec_onts = []

    # User specified record IDs.
    if recids:
        rec_onts.append({
            'ontology': taxonomy,
            'collection': None,
            'recIDs': recids,
        })
        return rec_onts

    # User specified collections.
    if collections:
        for collection in collections:
            records = get_collection_reclist(collection)
            if records:
                rec_onts.append({
                    'ontology': taxonomy,
                    'collection': collection,
                    'recIDs': records
                })
        return rec_onts

    # Use rules found in collection_clsMETHOD.
    result = run_sql(
        "SELECT clsMETHOD.name, clsMETHOD.last_updated, "
        "collection.name FROM clsMETHOD JOIN collection_clsMETHOD ON "
        "clsMETHOD.id=id_clsMETHOD JOIN collection ON "
        "id_collection=collection.id")

    for ontology, date_last_run, collection in result:
        records = get_collection_reclist(collection)
        if records:
            if not date_last_run:
                write_message("INFO: Collection %s has not been previously "
                              "analyzed." % collection,
                              stream=sys.stderr,
                              verbose=3)
                modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
            elif task_get_option('force'):
                write_message("INFO: Analysis is forced for collection %s." %
                              collection,
                              stream=sys.stderr,
                              verbose=3)
                modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
            else:
                modified_records = intbitset(
                    run_sql(
                        "SELECT id FROM bibrec "
                        "WHERE modification_date >= %s", (date_last_run, )))

            records &= modified_records
            if records:
                rec_onts.append({
                    'ontology': ontology,
                    'collection': collection,
                    'recIDs': records
                })
            else:
                write_message(
                    "WARNING: All records from collection '%s' have "
                    "already been analyzed for keywords with ontology '%s' "
                    "on %s." % (collection, ontology, date_last_run),
                    stream=sys.stderr,
                    verbose=2)
        else:
            write_message("ERROR: Collection '%s' doesn't contain any record. "
                          "Cannot analyse keywords." % collection,
                          stream=sys.stderr,
                          verbose=0)

    return rec_onts
"""

import re
from sys import argv

from invenio.search_engine import perform_request_search, get_record, \
                                  search_unit, get_all_field_values
from invenio.bibrecord import print_rec, record_get_field_instances, \
                              record_add_field
from invenio.intbitset import intbitset
from invenio.bibformat_engine import BibFormatObject
from invenio.search_engine import get_collection_reclist

from hep_convert_email_to_id_input import RECIDS, SEARCH, VERBOSE

# Record IDs of the HepNames collection, looked up once at module load.
HN = get_collection_reclist('HepNames')

# All values stored in the listed MARC subfields, concatenated.
# NOTE(review): presumably the e-mail addresses known to HepNames records
# (371/595) and to HEP author fields (100/700) - confirm.
EMAILS_HEPNAMES = get_all_field_values('371__m') + \
         get_all_field_values('371__o') + \
         get_all_field_values('595__m') + \
         get_all_field_values('595__o')
EMAILS_HEP = get_all_field_values('100__m') + get_all_field_values('700__m')

# NOTE(review): upper bound for a counter used elsewhere in this script;
# its exact meaning is not visible here.
COUNTER_MAX = 400

def generate_check_digit(base_digits):
    '''
    Taken from https://github.com/tjwds/generate-orcid-checksum
    '''
    total = 0
    for digit in str(base_digits):
Esempio n. 52
0
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
""" Bibcheck plugin checking that reference info in 999C50 agrees with
    citation info in 999C5r and 999C5s
"""

import re
from collections import defaultdict, namedtuple

from invenio.intbitset import intbitset
from invenio.search_engine import (get_collection_reclist, search_pattern,
                                   search_unit)

# MARC tags of the reference subfields handled by this plugin, addressable
# by role: publication note, report number, DOI, cited record ID and
# curator flag.
Reftags = namedtuple('Reftags', 'pubnote repno DOI citedrecid curatorflag')
FIELDS = Reftags('999C5s', '999C5r', '999C5a', '999C50', '999C59')

# Record IDs of the HEP collection, looked up once when the module is loaded.
HEPRECS = get_collection_reclist('HEP')

# Text followed by a square-bracketed group; group 1 is the text before it.
CATEGORY = re.compile(ur'^(.*)\[[^\]]+\]')
# Leading "arXiv:" (any case, optional whitespace) followed by a non-digit.
ARXIVPREFIX = re.compile(ur'^(arXiv:(\s+)?)\D', re.I)


class Reference(object):
    """ container for various ref info """
    def __init__(self):
        self._fields = defaultdict(list)

    @staticmethod
    def normalize_repno(repno):
        """ cast repno into standard form """

        # normalize "arXiv:1612.12345 [hep-th]"
Esempio n. 53
0
import sys
import re
from invenio.intbitset import intbitset
import urlparse
import pytz
import os
from md5 import md5

## Generate filtered apache logs from OpenAIRE with:
## $ cd /opt/invenio/var/log
## $ cat apache.log  apache-ssl.log | grep "GET /record/" | grep 200 | gzip > ~/eu.log.gz

## Generate this locally to OpenAIRE with:
from invenio.search_engine import get_collection_reclist, CFG_SITE_NAME
# NOTE(review): this value is replaced by the hard-coded CDS list assigned
# to eu_recids further down in this script.
eu_recids = get_collection_reclist(CFG_SITE_NAME)

#eu_recids = intbitset(eu_recids = [9, 10, 19, 23, 24, 25, 26, 773, 774, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 799, 800, 802, 803, 804, 805, 806, 808, 809, 810, 811, 812, 813, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 828, 829, 830, 832, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 858, 861, 875, 877, 878, 879, 882, 884, 885, 887, 888, 889, 890, 891, 892, 893, 894, 899, 900, 901, 902, 903, 905, 906, 907, 912, 913, 914, 915, 923, 925, 928, 930, 931, 932, 934])

## These are from CDS
## To obtain the recids from CDS fire up ipython
## from invenio.search_engine import search_pattern
## eu_recids = search_pattern(p='ec_fundedresources', f='0248_p')

eu_recids = intbitset([1111467, 1119156, 1119304, 1119305, 1123073, 1131840, 1150815, 1152380, 1153910, 1154457, 1161069, 1165141, 1166365, 1171145, 1171956, 1172330, 1174720, 1174799, 1176934, 1177572, 1178778, 1178965, 1179056, 1179975, 1180629, 1180882, 1181684, 1183307, 1185309, 1186606, 1191601, 1192007, 1194234, 1194627, 1194889, 1194911, 1195998, 1198185, 1198199, 1198803, 1199128, 1201615, 1202603, 1204323, 1204596, 1205042, 1205627, 1206034, 1206388, 1207015, 1207269, 1207509, 1208557, 1209236, 1209302, 1209573, 1210107, 1210369, 1210586, 1210726, 1211321, 1211333, 1212045, 1212628, 1212647, 1212816, 1212901, 1213091, 1213474, 1213664, 1213885, 1213943, 1213965, 1214312, 1214514, 1214626, 1214945, 1215300, 1215671, 1215675, 1216010, 1216172, 1216173, 1216174, 1216175, 1216216, 1216493, 1216578, 1216643, 1217471, 1217703, 1217803, 1217852, 1220800, 1221038, 1221231, 1221713, 1221914, 1221916, 1221919, 1222486, 1222694, 1222700, 1222838, 1223191, 1223198, 1223541, 1223722, 1223846, 1224489, 1224652, 1224810, 1225128, 1225650, 1225729, 1225965, 1226309, 1226355, 1226551, 1226713, 1226918, 1227094, 1227133, 1227326, 1227792, 1228022, 1228452, 1228934, 1229318, 1229332, 1229356, 1229432, 1229434, 1229530, 1229531, 1229574, 1229575, 1229750, 1229994, 1230077, 1230105, 1230307, 1230376, 1230425, 1230503, 1230736, 1230755, 1230960, 1231305, 1231387, 1231747, 1231901, 1233463, 1233755, 1233863, 1233949, 1234547, 1234835, 1234922, 1234924, 1234925, 1234926, 1234929, 1235127, 1235128, 1235143, 1235144, 1235145, 1235172, 1235198, 1235259, 1235329, 1235339, 1235904, 1236534, 1236897, 1236925, 1236947, 1237190, 1237309, 1237584, 1237832, 1238424, 1238451, 1238573, 1238626, 1239851, 1240666, 1240667, 1240668, 1240816, 1240817, 1240818, 1241005, 1241307, 1241907, 1242058, 1242081, 1242085, 1242526, 1243614, 1243709, 1243710, 1244371, 1244638, 1244718, 1244731, 1246307, 1246557, 1246569, 1246962, 1247395, 1247837, 1248317, 1248563, 1248581, 1248817, 1249009, 1249090, 
1249240, 1249428, 1249579, 1249582, 1249711, 1254335, 1254934, 1255033, 1255127, 1255623, 1255958, 1256429, 1256433, 1256515, 1257430, 1257907, 1258002, 1258154, 1259059, 1259461, 1259591, 1260389, 1260500, 1260579, 1260911, 1260933, 1260943, 1260944, 1260959, 1261330, 1262406, 1262655, 1262878, 1262879, 1262925, 1263511, 1263531, 1264059, 1264268, 1264540, 1264877, 1265038, 1265283, 1265490, 1265837, 1266225, 1266302, 1266406, 1266466, 1266467, 1266797, 1266811, 1267065, 1267078, 1267205, 1267609, 1268099, 1268268, 1268371, 1268394, 1268418, 1268609, 1268772, 1268841, 1269002, 1269265, 1269520, 1269604, 1269752, 1270074, 1270216, 1270325, 1270869, 1271225, 1271829, 1272125, 1272396, 1272477, 1272489, 1272590, 1272628, 1273034, 1273170, 1273211, 1273270, 1273946, 1274170, 1274383, 1274519, 1274659, 1275064, 1275577, 1275587, 1275594, 1275738, 1276020, 1276432, 1276808, 1276861, 1277106, 1277305, 1277454, 1277487, 1277731, 1277830, 1277882, 1277959, 1278031, 1278512, 1279407, 1280616, 1280784, 1280892, 1280951, 1281726, 1281730, 1282194, 1282250, 1282556, 1282605, 1283386, 1283555, 1284223, 1284800, 1285770, 1287375, 1288209, 1288422, 1289343, 1289612, 1289614, 1289851, 1290019, 1290126, 1291833, 1292549, 1292739, 1293002, 1293698, 1293904, 1294205, 1295863, 1296038, 1296499, 1297362, 1297895, 1297976, 1297977, 1298178, 1298497, 1298507, 1299652, 1300674, 1301014, 1301331, 1301701, 1302208, 1303738, 1303855, 1303952, 1304543, 1304871, 1304875, 1306249, 1307104, 1307421, 1307840, 1308076, 1310283, 1310886, 1313619, 1313622, 1313681, 1313970, 1314843, 1316237, 1316543, 1317585, 1317804, 1322393, 1323250, 1323908, 1324061, 1324645, 1325254, 1328761, 1328841, 1330864, 1331909, 1334625, 1335312, 1335824, 1336088, 1337830, 1340534, 1340535, 1341481, 1341768, 1342827, 1342828, 1343468, 1343469, 1343470, 1343471, 1343472, 1343880, 1344476, 1345361, 1347544, 1348674, 1349292, 1350832, 1351200, 1351430, 1351551, 1351789, 1352083, 1352136, 1352694, 1352711, 1352765, 1359203, 
1385890, 1405045, 1405438, 1407211, 1423019, 1426293, 1426296, 1428133, 1428524, 1428908, 1428910, 1436135, 1436386, 1439010, 1443465, 1447061, 1448194, 1449781, 1449803, 1449805, 1449806, 1456830, 1456848, 1473443, 1476018, 1476020, 1476023, 1476025])

## 128.141.95.175 - - [04/May/2012:15:47:35 +0200] "GET /record/878?ln=en HTTP/1.1" 200 5647 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.41 Safari/536.5"
# Matches "/record/<recid>", optionally followed by "/" and by
# "files/<filename>", where the filename must end in ".<extension>".
RE_PATH = re.compile(r"^/record/(?P<recid>\d+)(/(files/(?P<filename>.+\.\w+))?)?$")

# NOTE(review): presumably the salt is loaded lazily from _CFG_SALT_FILE
# (salt.txt in the current directory) - confirm against the code using it.
_CFG_SALT = None
_CFG_SALT_FILE = os.path.join('.', 'salt.txt')
def recids_cache(collections, cache={}):
    """Return the union of the record IDs of the given collections.

    The result is memoized per distinct C{collections} string, so asking
    for a different set of collections no longer returns the result of
    the first call.

    @param collections: comma-separated list of collection names
    @type collections: string
    @param cache: memo storage.  The mutable default is deliberate: it is
                  shared between calls and is what makes the memoization
                  work.  Pass an explicit dict to use a private cache.
    @type cache: dict
    @return: the record IDs belonging to at least one of the collections
    @rtype: intbitset
    """
    if collections not in cache:
        valid_recids = intbitset()
        for coll in collections.split(','):
            valid_recids += get_collection_reclist(coll)
        # Store only once the union is complete, so that a failure while
        # fetching one collection does not leave a partial result cached.
        cache[collections] = valid_recids
    return cache[collections]
Esempio n. 55
0
import sys
import re
from invenio.intbitset import intbitset
import urlparse
import pytz
import os
from md5 import md5

## Generate filtered apache logs from OpenAIRE with:
## $ cd /opt/invenio/var/log
## $ cat apache.log  apache-ssl.log | grep "GET /record/" | grep 200 | gzip > ~/eu.log.gz

## Generate this locally on OpenAIRE with:
## (presumably the record IDs of the collection named CFG_SITE_NAME)
from invenio.search_engine import get_collection_reclist, CFG_SITE_NAME
eu_recids = get_collection_reclist(CFG_SITE_NAME)
## NOTE: eu_recids is assigned again further down from a hard-coded
## intbitset literal (the CDS variant), which replaces this value.

#eu_recids = intbitset(eu_recids = [9, 10, 19, 23, 24, 25, 26, 773, 774, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 799, 800, 802, 803, 804, 805, 806, 808, 809, 810, 811, 812, 813, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 828, 829, 830, 832, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 858, 861, 875, 877, 878, 879, 882, 884, 885, 887, 888, 889, 890, 891, 892, 893, 894, 899, 900, 901, 902, 903, 905, 906, 907, 912, 913, 914, 915, 923, 925, 928, 930, 931, 932, 934])

## These are from CDS
## To obtain the recids from CDS fire up ipython
## from invenio.search_engine import search_pattern
## eu_recids = search_pattern(p='ec_fundedresources', f='0248_p')

eu_recids = intbitset([
    1111467, 1119156, 1119304, 1119305, 1123073, 1131840, 1150815, 1152380,
    1153910, 1154457, 1161069, 1165141, 1166365, 1171145, 1171956, 1172330,
    1174720, 1174799, 1176934, 1177572, 1178778, 1178965, 1179056, 1179975,
    1180629, 1180882, 1181684, 1183307, 1185309, 1186606, 1191601, 1192007,
    1194234, 1194627, 1194889, 1194911, 1195998, 1198185, 1198199, 1198803,
    1199128, 1201615, 1202603, 1204323, 1204596, 1205042, 1205627, 1206034,
Esempio n. 56
0
def _get_recids_foreach_ontology(recids=None, collections=None, taxonomy=None):
    """Return the groups of records to analyse, with their ontology.

    Three sources are tried in order, and the first one that applies wins:
      1. explicit C{recids}: a single group, with no collection;
      2. explicit C{collections}: one group per non-empty collection;
      3. the rules stored in the clsMETHOD / collection_clsMETHOD tables:
         one group per configured collection, restricted to the records
         modified since the ontology was last run (or to all records when
         it never ran or when the 'force' task option is set).

    @param recids: record IDs given by the user
    @param collections: collection names given by the user
    @type collections: list
    @param taxonomy: ontology to associate with user-given records or
                     collections (ignored when the rules tables are used)
    @return: list of dictionaries with the keys 'ontology', 'collection'
             and 'recIDs'
    @rtype: list
    """
    # User specified record IDs.
    if recids:
        return [{
            'ontology': taxonomy,
            'collection': None,
            'recIDs': recids,
        }]

    rec_onts = []

    # User specified collections.
    if collections:
        for collection in collections:
            records = get_collection_reclist(collection)
            if records:
                rec_onts.append({
                    'ontology': taxonomy,
                    'collection': collection,
                    'recIDs': records
                })
        return rec_onts

    # Use rules found in collection_clsMETHOD.
    result = run_sql("SELECT clsMETHOD.name, clsMETHOD.last_updated, "
        "collection.name FROM clsMETHOD JOIN collection_clsMETHOD ON "
        "clsMETHOD.id=id_clsMETHOD JOIN collection ON "
        "id_collection=collection.id")

    # IDs of every row in bibrec; loaded lazily and at most once, instead of
    # once per collection that needs a full scan.
    all_records = None

    for ontology, date_last_run, collection in result:
        records = get_collection_reclist(collection)
        if not records:
            write_message("ERROR: Collection '%s' doesn't contain any record. "
                "Cannot analyse keywords." % collection, stream=sys.stderr,
                verbose=0)
            continue

        if not date_last_run:
            write_message("INFO: Collection %s has not been previously "
                "analyzed." % collection, stream=sys.stderr, verbose=3)
            full_scan = True
        elif task_get_option('force'):
            write_message("INFO: Analysis is forced for collection %s." %
                collection, stream=sys.stderr, verbose=3)
            full_scan = True
        else:
            full_scan = False

        if full_scan:
            if all_records is None:
                all_records = intbitset(run_sql("SELECT id FROM bibrec"))
            modified_records = all_records
        else:
            modified_records = intbitset(run_sql("SELECT id FROM bibrec "
                "WHERE modification_date >= %s", (date_last_run, )))

        # Build a new set rather than using '&=': the object returned by
        # get_collection_reclist() may be shared with other callers (e.g. a
        # cache), and must not be shrunk in place.
        records = records & modified_records
        if records:
            rec_onts.append({
                'ontology': ontology,
                'collection': collection,
                'recIDs': records
            })
        else:
            write_message("WARNING: All records from collection '%s' have "
                "already been analyzed for keywords with ontology '%s' "
                "on %s." % (collection, ontology, date_last_run),
                stream=sys.stderr, verbose=2)

    return rec_onts
Esempio n. 57
0
def bst_create_icons(recid, icon_sizes, icon_format_mappings=None,
                     collection=None, docnames=None, add_default_icon=0, inherit_moreinfo=0):
    """BibTasklet for generating missing icons.
       @param recid: the record on which the action is being performed
       @type recid: int
       @param icon_sizes: a comma-separated list of icon sizes, ex 180,640
       @type icon_sizes: string
       @param collection: the collection name on which to run the task;
                          if recid is defined, collection will be ignored
       @type collection: string
       @param icon_format_mappings: defines for each "master" format in
                                   which format the icons should be
                                   created. If the master format is
                                   not specified here, then its icons
                                   will be created in the same format,
                                   if possible (for eg. the icons of a
                                   TIFF file would be created as TIFF,
                                   while icons of a PDF or DOC file
                                   would be created in JPG) and unless
                                   a default mapping is not provided in
                                   C{CFG_ICON_CREATION_FORMAT_MAPPINGS}.
                                   Use syntax masterextension-targetextension1,targetextension2
                                   (eg. "doc->png,jpg" or "png-jpg")
                                   Use '*' to target extensions not
                                   matched by other rules (if
                                   necessary set its value to empty ''
                                   in order to override/remove the
                                   default star rule set in
                                   C{CFG_ICON_CREATION_FORMAT_MAPPINGS}.
       @type icon_format_mappings: list
       @param docnames: the list of docnames for which we want to create an icon.
                        If not provided, consider all docnames.
                        Separate docnames using "/"
       @type docnames: list
       @param add_default_icon: if a default icon (i.e. without icon
                                size suffix, matching
                                CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT)
                                should be added (1) or not (0)
       @type add_default_icon: int
       @param inherit_moreinfo: if the added icons should also have
                                their description and comment set to
                                the same value as the "main" bibdoc
                                (1) or not (0)
       @type inherit_moreinfo: int
    """
    if recid:
        recids = [int(recid)]
    elif collection:
        from invenio.search_engine import get_collection_reclist
        recids = get_collection_reclist(collection)
    else:
        write_message("Error: no recid found.", sys.stderr)
        return 1
    try:
        add_default_icon = int(add_default_icon) and True or False
    except:
        add_default_icon = False
    try:
        inherit_moreinfo = int(inherit_moreinfo) and True or False
    except:
        inherit_moreinfo = False
    if icon_format_mappings is None:
        icon_format_mappings = []
    if isinstance(icon_format_mappings, str):
        icon_format_mappings = [icon_format_mappings]
    try:
        icon_format_mappings = dict([map(lambda x: ',' in x and x.split(',') or x, mapping.split("-", 1)) \
                                     for mapping in icon_format_mappings])
    except Exception, e:
        write_message("Error: parameter 'icon_format_mappings' not well-formed:\n%s" % e, sys.stderr)
        return 0