Exemple #1
0
def record_collect_oai_identifiers(record_xml, subjects=False):
    """
    Collects all OAI identifiers from given MARCXML.

    Returns a list of found values in the tag
    CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG.

    @param record_xml: string containing MARCXML to parse
    @param subjects: if True, also return the 650 1 7 $a subject terms

    @return list of identifiers (None on parse error); when subjects=True,
        a (identifiers, subjects) tuple instead
    """
    result = None
    (record, status_code, list_of_errors) = create_record(record_xml)
    if not status_code:
        # Error happened while parsing the MARCXML
        write_message("Error collecting OAI identifier from record: %s" %
                      ("\n".join(list_of_errors), ))
        # BUGFIX: previously the subjects lookup below ran even on a failed
        # parse, where `record` may be None/incomplete and would crash.
        if subjects:
            return result, []
        return result
    # All OK! We can get the IDs
    result = record_get_field_values(record,
                                     CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3],
                                     CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3],
                                     CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4],
                                     CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5])
    if not result:
        # No IDs found..
        write_message("No OAI IDs found in record")
    if subjects:
        subject_values = record_get_field_values(record,
                                                 tag="650",
                                                 ind1="1",
                                                 ind2="7",
                                                 code="a")
        return result, subject_values
    return result
def get_ids_from_recid(recid):
    """Return [recid, doi, eprint, other_id] + report numbers for a record."""
    record = get_record(recid)

    ## Retrieving DOI (024 7_ $a values with the "10." DOI prefix)
    doi = ""
    doi_candidates = [value
                      for value in record_get_field_values(record, "024", "7",
                                                           code="a")
                      if value.startswith("10.")]
    if len(doi_candidates) > 1:
        sys.stderr.write("WARNING: record %s have more than one DOI: %s\n"
                         % (recid, doi_candidates))
    elif len(doi_candidates) == 1:
        doi = doi_candidates[0]

    ## Retrieving arXiv eprint (035 $a, case-insensitive OAI prefix)
    eprint = ""
    oai_prefix = "oai:arXiv.org:"
    eprint_candidates = []
    for value in record_get_field_values(record, "035", code="a"):
        if value.lower().startswith(oai_prefix.lower()):
            eprint_candidates.append(value[len(oai_prefix):])
    if len(eprint_candidates) > 1:
        sys.stderr.write(
            "WARNING: record %s have more than one arXiv eprint: %s\n"
            % (recid, eprint_candidates))
    elif len(eprint_candidates) == 1:
        eprint = eprint_candidates[0]

    ## Retrieving Other service ID (035 entries tagged via $9)
    other_id = ""
    for field in record_get_field_instances(record, "035"):
        subfields = dict(field_get_subfield_instances(field))
        if subfields.get("9", "").upper() == CFG_OTHER_SITE.upper() \
                and subfields.get("a"):
            other_id = subfields["a"]

    reportnumbers = record_get_field_values(record, "037", code="a")
    return [str(recid), doi, eprint, other_id] + reportnumbers
def get_ids_from_recid(recid):
    """Get all relevant identifiers from metadata of local record.

    @param recid: id of the local record to inspect
    @return: list [recid, doi, eprint, other_id, system_number] plus any
        report numbers, each with CR/LF characters stripped.  Identifiers
        that cannot be found are returned as empty strings.
    """
    record = get_record(recid)

    # Retrieving DOI: 024 7_ $a values starting with the "10." DOI prefix.
    doi = ""
    dois = record_get_field_values(record, '024', '7', code='a')
    dois = [doi for doi in dois if doi.startswith('10.')]
    if dois:
        # On ambiguity warn, but still take the first value (as before).
        if len(dois) > 1:
            sys.stderr.write("WARNING: record %s have more than one DOI: %s\n"
                             % (recid, dois))
        doi = dois[0]

    # Retrieving arXiv eprint: 035 $a values with the OAI arXiv prefix.
    eprint = ""
    eprints = record_get_field_values(record, '035', code='a')
    eprints = [an_eprint[len('oai:arXiv.org:'):] for an_eprint in eprints
               if an_eprint.lower().startswith('oai:arxiv.org:')]
    if eprints:
        if len(eprints) > 1:
            sys.stderr.write(
                "WARNING: record %s have more than one arXiv eprint: %s\n"
                % (recid, eprints))
        eprint = eprints[0]

    # Retrieving Other service ID from 035 entries tagged with $9.
    other_id = ''
    for field in record_get_field_instances(record, '035'):
        subfields = dict(field_get_subfield_instances(field))
        if subfields.get('9', '').upper() == CFG_OTHER_SITE.upper() \
                and subfields.get('a'):
            other_id = subfields['a']
    if CFG_INSPIRE_SITE and not other_id:
        # Fall back to a CDS number embedded in the 595 note field.
        for field in record_get_field_instances(record, '595'):
            subfields = dict(field_get_subfield_instances(field))
            if "CDS" in subfields.get('a', '').upper():
                # BUGFIX: use a string default; the old `.get('a', 0)` would
                # crash on .split() if the subfield were ever missing.
                other_id = subfields.get('a', '').split("-")[-1]
                try:
                    int(other_id)
                except ValueError:
                    # Not an integer, we move on
                    other_id = ''
    reportnumbers = record_get_field_values(record, '037', code='a')

    system_number = ""
    if CFG_INSPIRE_SITE:
        # Legacy SPIRES system number lives in 970 $a as "SPIRES-<number>".
        for value in record_get_field_values(record, '970',
                                             filter_subfield_code="a",
                                             filter_subfield_value="SPIRES",
                                             filter_subfield_mode="s"):
            system_number = value.split("-")[-1]
            break  # There is typically only one

    out = [str(recid), doi, eprint, other_id, system_number] + reportnumbers
    # Identifiers must be single-line for downstream output.
    return [val.replace('\n', ' ').replace('\r', '') for val in out]
def record_get_recid(record):
    """
    Returns the recid (tag 001) of the given record, if found in the database.
    It tries to extract an OAI ID from the given record, if not successful it
    returns with errorcode 0.

    @param record: bibrecord structure

    @return: recid if found, otherwise 0 on missing OAI, -1 on OAI tag error,
                 or None if no recid found.
    """
    if record_has_field(record, "001"):
        return str(record_get_field_value(record, tag="001"))

    # FIXME: CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG is not set correctly for inspire
    # When OAI config is OK, use bibrecord.record_get_oaiid
    old_oaiid_tag = "035__z"
    try:
        tag = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3]
        ind1 = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3]
        ind2 = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4]
        code = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5]
    except IndexError:
        sys.stderr.write("Invalid CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG")
        return -1
    # Look for an arXiv OAI id: first in the configured tag, then in the
    # legacy 035__z location.
    oai_id = None
    for fieldvalue in record_get_field_values(record, tag, ind1, ind2, code):
        if fieldvalue.startswith("oai:arXiv.org:"):
            oai_id = fieldvalue
            break
    if oai_id is None:
        for fieldvalue in record_get_field_values(record, old_oaiid_tag[:3],
                                                  old_oaiid_tag[3],
                                                  old_oaiid_tag[4],
                                                  old_oaiid_tag[5]):
            if fieldvalue.startswith("oai:arXiv.org:"):
                oai_id = fieldvalue
                break
        if oai_id is None:
            sys.stderr.write("No oai id found for record")
            return 0
    # Try several search patterns until exactly one record matches.
    queries = ["%s__%s:%s" % (tag, code, oai_id)]
    queries.append("%s__%s:%s" % (old_oaiid_tag[:3], old_oaiid_tag[5], oai_id))
    queries.append("reportnumber:arXiv:%s" % (oai_id.split(":")[-1],))
    for query in queries:
        hits = search_pattern(p=query).tolist()
        if len(hits) == 1:
            return str(hits[0])
    return None
def record_get_recid(record):
    """
    Returns the recid (tag 001) of the given record, if found in the database.
    It tries to extract an OAI ID from the given record, if not successful it
    returns with errorcode 0.

    @param record: bibrecord structure

    @return: recid if found, otherwise 0 on missing OAI, -1 on OAI tag error,
                 or None if no recid found.
    """
    if record_has_field(record, "001"):
        return str(record_get_field_value(record, tag="001"))

    # FIXME: CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG is not set correctly for inspire
    # When OAI config is OK, use bibrecord.record_get_oaiid
    old_oaiid_tag = "035__z"
    try:
        tag = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3]
        ind1 = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3]
        ind2 = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4]
        code = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5]
    except IndexError:
        sys.stderr.write("Invalid CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG")
        return -1
    # Search the configured OAI-id tag for an arXiv identifier first.
    oai_id = None
    for fieldvalue in record_get_field_values(record, tag, ind1, ind2, code):
        if fieldvalue.startswith("oai:arXiv.org:"):
            oai_id = fieldvalue
            break
    if oai_id is None:
        # Fall back to the legacy 035__z location.
        for fieldvalue in record_get_field_values(record, old_oaiid_tag[:3],
                                                  old_oaiid_tag[3],
                                                  old_oaiid_tag[4],
                                                  old_oaiid_tag[5]):
            if fieldvalue.startswith("oai:arXiv.org:"):
                oai_id = fieldvalue
                break
        if oai_id is None:
            sys.stderr.write("No oai id found for record")
            return 0
    # Try the configured tag, the legacy tag, then the report number,
    # accepting only an unambiguous (single) hit.
    queries = ["%s__%s:%s" % (tag, code, oai_id),
               "%s__%s:%s" % (old_oaiid_tag[:3], old_oaiid_tag[5], oai_id),
               "reportnumber:arXiv:%s" % (oai_id.split(":")[-1], )]
    for query in queries:
        hits = search_pattern(p=query).tolist()
        if len(hits) == 1:
            return str(hits[0])
    return None
Exemple #6
0
def _detect_collections_from_marcxml_file(recs):
    """
    Extract all possible recIDs from MARCXML file and guess collections
    for these recIDs.
    """
    from invenio.bibrecord import record_get_field_values
    from invenio.search_engine import guess_collection_of_a_record
    from invenio.bibupload import find_record_from_sysno, \
                                  find_records_from_extoaiid, \
                                  find_record_from_oaiid

    dbcollids = {}
    sysno_tag = CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG
    oaiid_tag = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG
    oai_tag = CFG_OAI_ID_FIELD

    def _remember(recid):
        # Record the collection guessed for this recid (dict used as a set).
        dbcollids[guess_collection_of_a_record(int(recid))] = 1

    for rec, dummy1, dummy2 in recs:
        if not rec:
            continue
        # Direct recids from tag 001.
        for tag001 in record_get_field_values(rec, '001'):
            _remember(tag001)
        # Recids resolved from external system numbers.
        for tag_sysno in record_get_field_values(rec, tag=sysno_tag[:3],
                                                 ind1=sysno_tag[3],
                                                 ind2=sysno_tag[4],
                                                 code=sysno_tag[5]):
            matched = find_record_from_sysno(tag_sysno)
            if matched:
                _remember(matched)
        # Recids resolved from external OAI ids.
        for tag_oaiid in record_get_field_values(rec, tag=oaiid_tag[:3],
                                                 ind1=oaiid_tag[3],
                                                 ind2=oaiid_tag[4],
                                                 code=oaiid_tag[5]):
            try:
                matches = find_records_from_extoaiid(tag_oaiid)
            except Error:
                matches = []
            if matches:
                _remember(matches.pop())
        # Recids resolved from the OAI id field.
        for tag_oai in record_get_field_values(rec, tag=oai_tag[0:3],
                                               ind1=oai_tag[3],
                                               ind2=oai_tag[4],
                                               code=oai_tag[5]):
            matched = find_record_from_oaiid(tag_oai)
            if matched:
                _remember(matched)
    return dbcollids.keys()
def _detect_collections_from_marcxml_file(recs):
    """
    Extract all possible recIDs from MARCXML file and guess collections
    for these recIDs.
    """
    from invenio.bibrecord import record_get_field_values
    from invenio.search_engine import guess_collection_of_a_record
    from invenio.bibupload import find_record_from_sysno, \
                                  find_records_from_extoaiid, \
                                  find_record_from_oaiid

    seen_collections = {}
    sysno_tag = CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG
    oaiid_tag = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG
    oai_tag = CFG_OAI_ID_FIELD
    for rec, dummy1, dummy2 in recs:
        if not rec:
            continue
        # Direct recids from tag 001.
        for tag001 in record_get_field_values(rec, '001'):
            seen_collections[guess_collection_of_a_record(int(tag001))] = 1
        # Recids resolved from external system numbers.
        for tag_sysno in record_get_field_values(rec, tag=sysno_tag[:3],
                                                 ind1=sysno_tag[3],
                                                 ind2=sysno_tag[4],
                                                 code=sysno_tag[5]):
            hit = find_record_from_sysno(tag_sysno)
            if hit:
                seen_collections[guess_collection_of_a_record(int(hit))] = 1
        # Recids resolved from external OAI ids.
        for tag_oaiid in record_get_field_values(rec, tag=oaiid_tag[:3],
                                                 ind1=oaiid_tag[3],
                                                 ind2=oaiid_tag[4],
                                                 code=oaiid_tag[5]):
            try:
                hits = find_records_from_extoaiid(tag_oaiid)
            except Error:
                hits = []
            if hits:
                recid = hits.pop()
                seen_collections[guess_collection_of_a_record(int(recid))] = 1
        # Recids resolved from the OAI id field.
        for tag_oai in record_get_field_values(rec, tag=oai_tag[0:3],
                                               ind1=oai_tag[3],
                                               ind2=oai_tag[4],
                                               code=oai_tag[5]):
            hit = find_record_from_oaiid(tag_oai)
            if hit:
                seen_collections[guess_collection_of_a_record(int(hit))] = 1
    return seen_collections.keys()
def record_collect_oai_identifiers(record_xml):
    """
    Collects all OAI identifiers from given MARCXML.

    Returns a list of found values in the tag
    CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG.

    @param record_xml: string containing MARCXML to parse

    @return list of identifiers
    """
    record, status_code, list_of_errors = create_record(record_xml)
    if not status_code:
        # Parsing failed: report the errors and return None.
        write_message("Error collecting OAI identifier from record: %s" %
                     ("\n".join(list_of_errors),))
        return None
    # Parsed fine: pull every value stored under the configured OAI-id tag.
    oaiid_tag = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG
    result = record_get_field_values(record, oaiid_tag[:3], oaiid_tag[3],
                                     oaiid_tag[4], oaiid_tag[5])
    if not result:
        write_message("No OAI IDs found in record")
    return result
Exemple #9
0
 def test_compare_field_values_with_bibrecord_values(self):
     """bibfield - same value as in bibrecord"""
     from invenio.bibrecord import record_get_field_values
     from invenio.search_engine import get_record as search_engine_get_record
     rec = get_record(1)
     # Compare the bibfield title against the raw 245 $a from bibrecord.
     expected = record_get_field_values(
         search_engine_get_record(1), '245', ' ', ' ', 'a')[0]
     self.assertEqual(expected, rec['title.title'])
 def _get_approximate_address(record):
     """Best-effort address from MARC 371: street parts + city, zip, country."""
     city = record_get_field_value(record, '371', code="b")
     zipcode = record_get_field_value(record, '371', code="e")
     country = record_get_field_value(record, '371', code="d")
     parts = []
     # Keep only 371 $a values that do not already contain the zipcode.
     for value in record_get_field_values(record, '371', code='a'):
         if zipcode not in value:
             parts.append(value)
     parts.extend([city, zipcode, country])
     return [elem for elem in parts if elem]
Exemple #11
0
def check_records(records):
    """
    Add INSPIRE ID if missing.

    Resolves each record's DOI to an INSPIRE record id, using the local
    doi2inspireid table as a cache and falling back to a live query against
    inspirehep.net (successful lookups are cached).  NOTE(review): as
    written this only populates the cache table and attaches warnings --
    the resolved id is not written back into the record here; presumably a
    later step does that.  Ambiguous or conflicting matches only warn.
    """
    _init_db()
    for record in records:
        if 'INSPIRE' in record_get_field_values(record, '035', code='9'):
            ## Has already the link. Good! Let's go on.
            continue
        doi = record_get_field_value(record, '024', ind1='7', code='a')
        arxiv = record_get_field_value(record, '037', code='a')
        # Search INSPIRE by DOI, and also by the arXiv id when present.
        query = 'doi:"%s"' % doi
        if arxiv:
            query += ' or %s' % arxiv
        inspireid = run_sql("SELECT inspireid FROM doi2inspireid WHERE doi=%s", (doi,))
        if inspireid:
            inspireid = inspireid[0][0]
        else:
            # Not cached yet: throttle, then query INSPIRE's search API for
            # matching record ids (the response looks like "[id1, id2, ...]").
            sleep(2)
            inspireid = [int(elem.strip()) for elem in urlopen(create_url("http://inspirehep.net/search", {'cc': 'HEP', 'of': 'id', 'p': query})).read().strip()[1:-1].split(',') if elem.strip()]
            if len(inspireid) == 1:
                inspireid = inspireid[0]
                try:
                    run_sql("INSERT INTO doi2inspireid(doi, inspireid, creation_date) VALUES(%s, %s, NOW())", (doi, inspireid))
                except IntegrityError, err:
                    # This INSPIRE id is already bound to a different DOI:
                    # report the conflict and skip this record.
                    other_doi = run_sql("SELECT doi FROM doi2inspireid WHERE inspireid=%s", (inspireid, ))[0][0]
                    record.warn("This record with doi %s is connected with INSPIRE id %s which is already connected to doi %s" % (doi, inspireid, other_doi))
                    continue
            else:
                record.warn("More than one inspire ID matches this record: %s" % inspireid)
                continue
Exemple #12
0
def show_restricted_records(req):
    user_info = collect_user_info(req)
    if not acc_is_user_in_role(user_info, acc_get_role_id("SCOAP3")):
        return page_not_authorized(req=req)

    all_ids = [id[0] for id in run_sql("Select id from bibrec")]
    visible_ids = perform_request_search()

    deleted_and_older_and_restricted = set(all_ids) - set(visible_ids)
    restricted_ids = []
    # restricted_ids_older = []
    for id in deleted_and_older_and_restricted:
        rec = get_record(id)
        collections = record_get_field_values(rec, "980","%","%","%")
        if "DELETED" not in collections:
            year = record_get_field_values(rec, "773","%","%","y")
            title = record_get_field_values(rec, "245","%","%","a")
            if title:
                title = title[0]
            else:
                title = "No title"
            if year:
                if int(year[0]) >= 2015:
                    restricted_ids.append((id, title))
                # else:
                #    restricted_ids_older.append(id)
            else:
                restricted_ids.append((id,title))

    print "Restricted ids"
    print restricted_ids

    req.content_type = "text/html"
    req.write(pageheaderonly("Repository tools", req=req))
    req.write("<h1>Restricted records</h1>")
    req.write("<strong>Total number of possibli restricted records: {0}</strong>".format(len(restricted_ids)))
    req.write("<ol>")
    for id, title in restricted_ids:
        req.write("<li><a href='http://repo.scoap3.org/record/{1}'>{0}</a> <a href='http://repo.scoap3.org/record/edit/?ln=en#state=edit&recid={1}'>edit</a></li>".format(title, id))
    req.write("</ol>")
    # for id, title in restricted_ids:
    #    req.write("{0},".format(id))

    req.write(pagefooteronly(req=req))
    return ""
Exemple #13
0
def main(args):
    """Compare harvested OAI MARCXML against local records.

    Produces two MARCXML files next to the input file:
      - <input>.insert.xml: harvested records with no local match,
        transformed via apply_filter();
      - <input>.append.xml: deletion records whose target exists locally.

    @param args: command-line arguments; exactly one element is expected,
        the path to the MARCXML file to analyse.
    """
    config_path = CFG_ETCDIR + "/bibharvest/" + CONFIG_FILE

    if len(args) != 1:
        sys.stderr.write("Error: Missing MARCXML to analyse")
        sys.exit(1)

    input_filename = args[0]

    if not os.path.exists(input_filename):
        sys.stderr.write("Please input_xml a valid filename for input.")
        sys.exit(1)
    if not os.path.exists(config_path):
        sys.stderr.write("Please enter a valid filename for config.")
        sys.exit(1)

    load_config(config_path)

    # Hack to activate UTF-8 (Python 2 only; reload exposes setdefaultencoding)
    reload(sys)
    sys.setdefaultencoding("utf8")
    assert sys.getdefaultencoding() == "utf8"

    record_tree, header_subs = clean_oai_xml(input_filename)
    records, deleted_records = element_tree_to_record(record_tree, header_subs)
    insert_records = []
    append_records = []

    for record in records:
        # Step 1: Attempt to match the record to those already in Inspire
        recid = record['001'][0][3]
        res = attempt_record_match(recid)
        if not res:
            _print("Record %s does not exist: inserting" % (recid, ))
            # No record found
            # Step 2: Apply filter to transform CDS MARC to Inspire MARC
            insert_records.append(apply_filter(record))
            #insert_records.append(record)
        else:
            _print("Record %s found: %r" % (recid, res))

    for record in deleted_records:
        # Deletion records carry their id in 035 $a, after the last colon.
        recid = record_get_field_values(record, tag="035",
                                        code="a")[0].split(":")[-1]
        res = attempt_record_match(recid)
        if res:
            # Record exists and we should then delete it
            _print("Record %s exists. Delete it" % (recid, ))
            append_records.append(record)

    # Output results. Create new files, if necessary.
    write_record_to_file("%s.insert.xml" % (input_filename, ), insert_records)
    _print("%s.insert.xml" % (input_filename, ))
    _print("Number of records to insert:  %d\n" % (len(insert_records), ))
    write_record_to_file("%s.append.xml" % (input_filename, ), append_records)
    _print("%s.append.xml" % (input_filename, ))
    _print("Number of records to append:  %d\n" % (len(append_records), ))
def main(args):
    """Split a harvested OAI MARCXML file into insert/append batches.

    Writes <input>.insert.xml (harvested records not found locally,
    filtered through apply_filter()) and <input>.append.xml (deletion
    records whose target record exists locally).

    @param args: command-line arguments; exactly one element is expected,
        the path to the MARCXML file to analyse.
    """
    config_path = CFG_ETCDIR + "/bibharvest/" + CONFIG_FILE

    if len(args) != 1:
        sys.stderr.write("Error: Missing MARCXML to analyse")
        sys.exit(1)

    input_filename = args[0]

    if not os.path.exists(input_filename):
        sys.stderr.write("Please input_xml a valid filename for input.")
        sys.exit(1)
    if not os.path.exists(config_path):
        sys.stderr.write("Please enter a valid filename for config.")
        sys.exit(1)

    load_config(config_path)

    # Hack to activate UTF-8 (Python 2 only; reload exposes setdefaultencoding)
    reload(sys)
    sys.setdefaultencoding("utf8")
    assert sys.getdefaultencoding() == "utf8"

    record_tree, header_subs = clean_oai_xml(input_filename)
    records, deleted_records = element_tree_to_record(record_tree, header_subs)
    insert_records = []
    append_records = []

    for record in records:
        # Step 1: Attempt to match the record to those already in Inspire
        recid = record["001"][0][3]
        res = attempt_record_match(recid)
        if not res:
            _print("Record %s does not exist: inserting" % (recid,))
            # No record found
            # Step 2: Apply filter to transform CDS MARC to Inspire MARC
            insert_records.append(apply_filter(record))
            # insert_records.append(record)
        else:
            _print("Record %s found: %r" % (recid, res))

    for record in deleted_records:
        # Deletion records carry their id in 035 $a, after the last colon.
        recid = record_get_field_values(record, tag="035", code="a")[0].split(":")[-1]
        res = attempt_record_match(recid)
        if res:
            # Record exists and we should then delete it
            _print("Record %s exists. Delete it" % (recid,))
            append_records.append(record)

    # Output results. Create new files, if necessary.
    write_record_to_file("%s.insert.xml" % (input_filename,), insert_records)
    _print("%s.insert.xml" % (input_filename,))
    _print("Number of records to insert:  %d\n" % (len(insert_records),))
    write_record_to_file("%s.append.xml" % (input_filename,), append_records)
    _print("%s.append.xml" % (input_filename,))
    _print("Number of records to append:  %d\n" % (len(append_records),))
def get_template_data(record):
    """Build (queue, subject, content) for a "long author list" RT ticket.

    @param record: bibrecord structure of the paper
    @return: tuple (queue name, ticket subject, ticket body)
    """
    from invenio.config import CFG_SITE_URL
    from invenio.bibrecord import record_get_field_value, record_get_field_values

    recid = record_get_field_value(record, '001', '', '', '')
    # BUGFIX: the record argument was missing, so '037' was being passed as
    # the record and report numbers were never retrieved.
    # NOTE(review): '_' indicators may need to be '%' wildcards -- confirm
    # against the bibrecord API.
    report_numbers = record_get_field_values(record, '037', '_', '_', 'a')
    queue = "AUTHORS_long_list"
    subject = "long author list in #%s %s" % (recid, ' '.join(report_numbers))
    content = "Please update the authors in %s/record/edit/%s" % (CFG_SITE_URL, recid)
    return (queue, subject, content)
Exemple #16
0
def get_template_data(record):
    """Build (queue, subject, content) for a references RT ticket.

    @param record: bibrecord structure of the paper
    @return: tuple (queue name, ticket subject, ticket body)
    """
    from invenio.config import CFG_SITE_URL
    from invenio.bibrecord import record_get_field_value, record_get_field_values

    recid = record_get_field_value(record, '001', '', '', '')
    # BUGFIX: the record argument was missing, so '037' was being passed as
    # the record and report numbers were never retrieved.
    # NOTE(review): '_' indicators may need to be '%' wildcards -- confirm
    # against the bibrecord API.
    report_numbers = record_get_field_values(record, '037', '_', '_', 'a')
    queue = "HEP_ref"
    subject = "Refs for #%s %s" % (recid, ' '.join(report_numbers))
    content = "%s/record/edit/#state=edit&recid=%s" % (CFG_SITE_URL, recid)
    return (queue, subject, content)
def get_minimal_arxiv_id(record):
    """Return the OAI arXiv id in the given record skipping the prefixes.

    I.e. oai:arxiv.org:1234.1234 becomes 1234.1234 and oai:arxiv.org:hep-ex/2134123
    becomes hep-ex/2134123. Used for searching.
    """
    # First 035 $a value mentioning arXiv wins; keep only the last
    # colon-separated segment.
    for candidate in record_get_field_values(record, tag="035", code="a"):
        if 'arXiv' in candidate:
            return candidate.split(':')[-1]
Exemple #18
0
 def _get_approximate_address(record):
     """Collect a best-effort address from MARC field 371."""
     zipcode = record_get_field_value(record, '371', code="e")
     # Street lines ($a), skipping any that already embed the zipcode.
     components = [value
                   for value in record_get_field_values(record, '371', code='a')
                   if zipcode not in value]
     components.append(record_get_field_value(record, '371', code="b"))
     components.append(zipcode)
     components.append(record_get_field_value(record, '371', code="d"))
     return [item for item in components if item]
Exemple #19
0
def get_minimal_arxiv_id(record):
    """
    Returns the OAI arXiv id in the given record skipping the prefixes.
    I.e. oai:arxiv.org:1234.1234 becomes 1234.1234 and oai:arxiv.org:hep-ex/2134123
    becomes hep-ex/2134123. Used for searching.
    """
    field_values = record_get_field_values(record, tag="035", code="a")
    for value in field_values:
        if 'arXiv' not in value:
            continue
        # Strip everything up to and including the final colon.
        return value.split(':')[-1]
Exemple #20
0
def get_template_data(record):
    """Build (queue, subject, content) for a references RT ticket.

    @param record: bibrecord structure of the paper
    @return: tuple (queue name, ticket subject, ticket body)
    """
    from invenio.config import CFG_SITE_URL
    from invenio.bibrecord import record_get_field_value, record_get_field_values

    recid = record_get_field_value(record, '001', '', '', '')
    # BUGFIX: the record argument was missing, so '037' was being passed as
    # the record and report numbers were never retrieved.
    # NOTE(review): '_' indicators may need to be '%' wildcards -- confirm
    # against the bibrecord API.
    report_numbers = record_get_field_values(record, '037', '_', '_', 'a')
    queue = "HEP_ref"
    subject = "Refs for #%s %s" % (recid, ' '.join(report_numbers))
    content = "%s/record/edit/#state=edit&recid=%s" % (CFG_SITE_URL, recid)
    return (queue, subject, content)
Exemple #21
0
def get_template_data(record):
    """Build (queue, subject, content) for a "long author list" RT ticket.

    @param record: bibrecord structure of the paper
    @return: tuple (queue name, ticket subject, ticket body)
    """
    from invenio.config import CFG_SITE_URL
    from invenio.bibrecord import record_get_field_value, record_get_field_values

    recid = record_get_field_value(record, '001', '', '', '')
    # BUGFIX: the record argument was missing, so '037' was being passed as
    # the record and report numbers were never retrieved.
    # NOTE(review): '_' indicators may need to be '%' wildcards -- confirm
    # against the bibrecord API.
    report_numbers = record_get_field_values(record, '037', '_', '_', 'a')
    queue = "AUTHORS_long_list"
    subject = "long author list in #%s %s" % (recid, ' '.join(report_numbers))
    content = "Please update the authors in %s/record/edit/%s" % (CFG_SITE_URL,
                                                                  recid)
    return (queue, subject, content)
def retrieve_field_values(curdir,
                          field_name,
                          separator=None,
                          system_number_file='SN',
                          tag=None):
    """
    This is a handy function to retrieve values either from the current
    submission directory, when a form has been just submitted, or from
    an existing record (e.g. during MBI action).

    @param curdir: is the current submission directory.
    @type curdir: string
    @param field_name: is the form field name that might exists on disk.
    @type field_name: string
    @param separator: is an optional separator. If it exists, it will be used
        to retrieve multiple values contained in the field.
    @type separator: string
    @param system_number_file: is the name of the file on disk in curdir, that
        is supposed to contain the record id.
    @type system_number_file: string
    @param tag: is the full MARC tag (tag+ind1+ind2+code) that should
        contain values. If not specified, only values in curdir will
        be retrieved.
    @type tag: 6-chars
    @return: the field value(s).
    @rtype: list of strings.

    @note: if field_name exists in curdir it will take precedence over
        retrieving the values from the record.
    """
    field_file = os.path.join(curdir, field_name)
    if os.path.exists(field_file):
        # Value(s) submitted via the form take precedence over the record.
        # BUGFIX: use a context manager so the file handle is closed.
        with open(field_file) as handle:
            field_value = handle.read()
        if separator is not None:
            return [value.strip()
                    for value in field_value.split(separator)
                    if value.strip()]
        return [field_value.strip()]
    if tag is not None:
        system_number_file = os.path.join(curdir, system_number_file)
        if os.path.exists(system_number_file):
            # BUGFIX: close the system-number file handle too.
            with open(system_number_file) as handle:
                recid = int(handle.read().strip())
            record = get_record(recid)
            if separator:
                return record_get_field_values(record, tag[:3], tag[3],
                                               tag[4], tag[5])
            return [record_get_field_value(record, tag[:3], tag[3], tag[4],
                                           tag[5])]
    return []
def get_template_data(record):
	"""Build (queue, subject, content) for an "unknown experiment" RT ticket.

	@param record: bibrecord structure of the paper
	@return: tuple (queue name, ticket subject, ticket body)
	"""
	from invenio.config import CFG_SITE_URL
	from invenio.bibrecord import record_get_field_value, record_get_field_values

	recid = record_get_field_value(record,'001','','','')
	# BUGFIX: the record argument was missing, so '037' was being passed as
	# the record and report numbers were never retrieved.
	report_numbers = record_get_field_values(record,'037','_','_','a')
	queue = "Exp"
	subject = "unknown experiment in #%s %s" % ( recid, ' '.join(report_numbers))
	content = "This unknown experiment: \n\n\
	has appeared in this paper. Please create a record in Experiments and update the paper at\
	 %s/record/edit/%s" % ( CFG_SITE_URL, recid )
	return (queue, subject, content)
def get_sysno_from_record(record, options):
    """Function to get the system number for a record.
       In the case of a pure text MARC record being created, the
       sysno will be retrieved from 001 (i.e. the 'recid' will be returned).
       In the case of an Aleph MARC record being created, the sysno
       will be retrieved from 970__a IF this field exists.  If not,
       None will be returned.
       @param record: the internal representation of the record
        (created by bibrecord) from which the sysno is to be retrieved.
       @param options: various options about the record to be created,
        as obtained from the command line.
       @return: a string containing a 9-digit SYSNO, -OR- None in
       certain cases for an Aleph MARC record.
    """
    if options["text-marc"] != 0:
        # Text MARC: sysno is the recid (001), zero-padded to 9 digits.
        recid_values = record_get_field_values(rec=record, tag="001")
        if len(recid_values) != 1:
            # Zero or multiple 001 values are both illegal.
            return None
        sysno = recid_values[0]
        if len(sysno) < 9:
            sysno = sysno.rjust(9, "0")
        return sysno
    # Aleph MARC: sysno comes from 970__a.
    sys_values = record_get_field_values(rec=record, tag="970", code="a")
    if len(sys_values) > 1:
        ## multiple SYS is illegal - return a list of them all,
        ## let other functions decide what to do
        return sys_values
    if not sys_values:
        return None
    return sys_values[0][0:9]
def get_sysno_from_record(record, options):
    """Function to get the system number for a record.
       In the case of a pure text MARC record being created, the
       sysno will be retrieved from 001 (i.e. the 'recid' will be returned).
       In the case of an Aleph MARC record being created, the sysno
       will be retrieved from 970__a IF this field exists.  If not,
       None will be returned.
       @param record: the internal representation of the record
        (created by bibrecord) from which the sysno is to be retrieved.
       @param options: various options about the record to be created,
        as obtained from the command line.
       @return: a string containing a 9-digit SYSNO, -OR- None in
       certain cases for an Aleph MARC record.
    """
    sysno = None
    if options["text-marc"] != 0:
        recids = record_get_field_values(rec=record, tag="001")
        # anything but exactly one recid value is illegal -> leave None
        if len(recids) == 1:
            sysno = recids[0]
            # zero-pad up to the canonical 9 digits
            while len(sysno) < 9:
                sysno = "0" + sysno
    else:
        sys_values = record_get_field_values(rec=record, tag="970", code="a")
        if len(sys_values) > 1:
            # multiple SYS is illegal - hand the whole list back and let
            # the caller decide what to do with it
            return sys_values
        if sys_values:
            # truncate the SYS to its first nine characters
            sysno = sys_values[0][0:9]
    return sysno
Exemple #26
0
def get_template_data(record):
	"""Build an RT curation ticket for *record*.

	@param record: bibrecord structure of the record to curate
	@return: (queue, subject, content) tuple for the ticket
	"""
	from invenio.config import CFG_SITE_URL
	from invenio.bibrecord import record_get_field_value, record_get_field_values

	recid = record_get_field_value(record, '001', '', '', '')
	# BUGFIX: the record must be the first argument; it was missing, so
	# the tag '037' was being treated as the record itself.
	report_numbers = record_get_field_values(record, '037', '_', '_', 'a')
	postfix = ''
	if report_numbers:
		# separate the report numbers from the "(#recid)" suffix
		postfix = ' '
	queue = "HEP_cor"
	subject = "%s%s(#%s)" % (' '.join(report_numbers), postfix, recid)
	content = "Curate record here: %s/record/edit/#state=edit&recid=%s" % ( CFG_SITE_URL, recid)
	return (queue, subject, content)
Exemple #27
0
def generate_mediaexport_basket(basket_id):
    """
    Exports the content of a basket. Takes each record from a basket and
    calls either generate_mediaexport_album or generate_mediaexport.

    :param str basket_id: The basket id.
    :return: JSON string with an 'entries' list of exported media dicts.
    """
    records = get_basket_content(basket_id, format='')
    recids = [record[0] for record in records]

    output = {}
    output['entries'] = []
    for record_id in recids:
        # For each record_id return metadata
        record = get_record(record_id)
        if not record:
            # There is no record, for example when the record_id < 0 (external
            # resource). Skip it.
            continue
        report_number = record_get_field_value(record, *('037', ' ', ' ', 'a'))
        album_dict = generate_mediaexport_album(record_id, report_number, False)
        album_entries = album_dict.get('entries', None)
        if album_entries:
            output['entries'].append(album_entries)
        else:
            # If it's not an album, check if it's an image
            is_image = False
            collections = record_get_field_values(record, *('980', ' ', ' ', 'a'))
            # BUGFIX: extend, not append -- append added the whole 980__b list
            # as a single element, so the "PHOTO" substring test below ran
            # against a list instead of a string.
            collections.extend(record_get_field_values(record, *('980', ' ', ' ', 'b')))
            for collection in collections:
                if "PHOTO" in collection:
                    is_image = True
                    break
            # "tirage" = last dash-separated token of the report number
            tirage = report_number.rsplit("-", 1)[-1]
            media_dict = generate_mediaexport(record_id, is_image, report_number, tirage, False, False)
            if media_dict:
                output['entries'].append(media_dict)

    return json.dumps(output)
Exemple #28
0
def get_record_collections(recid):
    """ Returns all collections of a record, field 980
    @param recid: record id to get collections from
    @type: string

    @return: list of collections
    @rtype: list
    """
    recstruct = get_record(recid)
    # record_get_field_values already yields the values; the former
    # identity comprehension around it was a no-op.
    return list(record_get_field_values(
        recstruct, tag="980", ind1=" ", ind2=" ", code="a"))
Exemple #29
0
def _get_work_type(recstruct):
    """Get work type from MARC record.

    @param recstruct: MARC record

    @return: type of given work
    @rtype: str
    """
    work_type = record_get_field_values(recstruct, '980', '', '', 'a')
    # lower-case the 980__a values once instead of rebuilding the list
    # for every membership test below
    types_lower = [x.lower() for x in work_type]
    if 'book' in types_lower:
        return 'book'

    # 502__b holds the thesis type
    work_type_2 = record_get_field_values(recstruct, '502', '', '', 'b')
    if 'phd' in (x.lower() for x in work_type_2):
        return 'dissertation'

    if 'conferencepaper' in types_lower:
        return 'conference-paper'
    elif 'data' in types_lower:
        return 'data-set'

    published_flag = 'published' in types_lower
    # published works with journal info (773__p) are journal articles
    if (published_flag and
            record_get_field_values(recstruct, '773', '', '', 'p')):
        return 'journal-article'

    # an unpublished work with a 035 entry carrying both $a and $9
    # is classified as a working paper
    for instance in record_get_field_instances(recstruct, '035'):
        subfield_codes = [tup[0] for tup in instance[0]]
        if 'a' in subfield_codes and '9' in subfield_codes and not published_flag:
            return 'working-paper'

    return 'other'
Exemple #30
0
def get_record_collections(recid):
    """ Returns all collections of a record, field 980
    @param recid: record id to get collections from
    @type: string

    @return: list of collections
    @rtype: list
    """
    recstruct = get_record(recid)
    # the original wrapped this call in an identity comprehension, which
    # is a no-op; list() keeps the "always return a fresh list" contract
    return list(record_get_field_values(recstruct,
                                        tag="980",
                                        ind1=" ",
                                        ind2=" ",
                                        code="a"))
Exemple #31
0
def get_template_data(record):
    """Build an RT curation ticket for *record*.

    @param record: bibrecord structure of the record to curate
    @return: (queue, subject, content) tuple for the ticket
    """
    from invenio.config import CFG_SITE_URL
    from invenio.bibrecord import record_get_field_value, record_get_field_values

    recid = record_get_field_value(record, '001', '', '', '')
    # BUGFIX: pass the record as first argument; it was missing, so the
    # tag '037' was being treated as the record itself.
    report_numbers = record_get_field_values(record, '037', '_', '_', 'a')
    postfix = ''
    if report_numbers:
        # separate the report-number list from the "(#recid)" suffix
        postfix = ' '
    queue = "HEP_cor"
    subject = "%s%s(#%s)" % (' '.join(report_numbers), postfix, recid)
    content = "Curate record here: %s/record/edit/#state=edit&recid=%s" % (
        CFG_SITE_URL, recid)
    return (queue, subject, content)
Exemple #32
0
def get_ids_from_recid(recid):
    """Collect the known identifiers of record *recid*.

    @param recid: record id to inspect
    @return: list of strings [recid, doi, eprint, other_id] followed by
        any report numbers; doi/eprint/other_id are empty when absent.
    """
    record = get_record(recid)

    ## Retrieving DOI
    doi = ""
    dois = record_get_field_values(record, '024', '7', code='a')
    # keep only genuine DOIs (they always start with the "10." prefix)
    dois = [doi for doi in dois if doi.startswith('10.')]
    if len(dois) > 1:
        print >> sys.stderr, "WARNING: record %s have more than one DOI: %s" % (
            recid, dois)
    elif len(dois) == 1:
        doi = dois[0]

    ## Retrieving arXiv eprint
    eprint = ""
    eprints = record_get_field_values(record, '035', code='a')
    # match the OAI prefix case-insensitively, then strip it off
    eprints = [
        an_eprint[len('oai:arXiv.org:'):] for an_eprint in eprints
        if an_eprint.lower().startswith('oai:arxiv.org:')
    ]
    if len(eprints) > 1:
        print >> sys.stderr, "WARNING: record %s have more than one arXiv eprint: %s" % (
            recid, eprints)
    elif len(eprints) == 1:
        eprint = eprints[0]

    ## Retrieving Other service ID
    other_id = ''
    # a 035 whose $9 names the partner site carries its record id in $a
    for field in record_get_field_instances(record, '035'):
        subfields = dict(field_get_subfield_instances(field))
        if subfields.get(
                '9',
                '').upper() == CFG_OTHER_SITE.upper() and subfields.get('a'):
            other_id = subfields['a']
    reportnumbers = record_get_field_values(record, '037', code='a')
    return [str(recid), doi, eprint, other_id] + reportnumbers
def get_template_data(record):
    """Build an RT ticket flagging a paper whose experiment has no
    matching record in the Experiments collection.

    @param record: bibrecord structure of the paper
    @return: (queue, subject, content) tuple for the ticket
    """
    from invenio.config import CFG_SITE_URL
    from invenio.bibrecord import record_get_field_value, record_get_field_values

    recid = record_get_field_value(record, "001", "", "", "")
    # BUGFIX: the record must be the first argument; it was missing, so
    # the tag "037" was being treated as the record itself.
    report_numbers = record_get_field_values(record, "037", "_", "_", "a")
    queue = "Exp"
    subject = "unknown experiment in #%s %s" % (recid, " ".join(report_numbers))
    content = (
        "This unknown experiment: \n\n\
	has appeared in this paper. Please create a record in Experiments and update the paper at\
	 %s/record/edit/%s"
        % (CFG_SITE_URL, recid)
    )
    return (queue, subject, content)
Exemple #34
0
def record_is_conference(record):
    """
    Determine if the record is a new conference based on the value present
    on field 980

    @param record: record to be checked
    @type record: bibrecord object

    @return: True if record is a conference, False otherwise
    @rtype: boolean
    """
    # Conference records carry "CONFERENCES" among their 980__a values
    collection_values = record_get_field_values(record, "980", " ", " ", "a")
    return "CONFERENCES" in collection_values
Exemple #35
0
def record_is_conference(record):
    """
    Determine if the record is a new conference based on the value present
    on field 980

    @param record: record to be checked
    @type record: bibrecord object

    @return: True if record is a conference, False otherwise
    @rtype: boolean
    """
    # Scan the 980__a collection values for the conference marker
    for value in record_get_field_values(record, "980", " ", " ", "a"):
        if value == "CONFERENCES":
            return True
    return False
def output_record(data, tag_list, url=""):
    out = []
    for tag_struct in tag_list:
        tag = tag_struct[:3]
        ind1 = tag_struct[3:4]
        ind2 = tag_struct[4:5]
        if tag.startswith("00"):
            values = record_get_field_value(data, tag)
        else:
            values = record_get_field_values(data, tag, ind1=ind1, ind2=ind2, code="%")
        if url != '' and tag == '001':
            out.append("%s: %s (%s/record/%s/export/hm)\n" % (tag, str(values), url, values))
        else:
            out.append("%s: %s\n" % (tag, str(values)))
    out.append("\n")
    return "".join(out)
def check_records(records):
    """
    Add INSPIRE ID if missing

    For each record lacking an INSPIRE link in 035, look the id up by DOI
    (and arXiv id, when present) - first in the local doi2inspireid table,
    then via the remote INSPIRE search API - and cache new mappings.
    """
    _init_db()
    for record in records:
        if 'INSPIRE' in record_get_field_values(record, '035', code='9'):
            ## Has already the link. Good! Let's go on.
            continue
        doi = record_get_field_value(record, '024', ind1='7', code='a')
        arxiv = record_get_field_value(record, '037', code='a')
        query = 'doi:"%s"' % doi
        if arxiv:
            query += ' or %s' % arxiv
        # first try the local doi -> inspireid mapping table
        inspireid = run_sql("SELECT inspireid FROM doi2inspireid WHERE doi=%s",
                            (doi, ))
        if inspireid:
            inspireid = inspireid[0][0]
        else:
            # not cached yet: query the remote INSPIRE search API,
            # throttled with a 2s sleep to be polite to the service
            sleep(2)
            # the of=id output format returns a "[id1, id2, ...]" string;
            # strip the brackets and split on commas to recover the ids
            inspireid = [
                int(elem.strip()) for elem in urlopen(
                    create_url("http://inspirehep.net/search", {
                        'cc': 'HEP',
                        'of': 'id',
                        'p': query
                    })).read().strip()[1:-1].split(',') if elem.strip()
            ]
            if len(inspireid) == 1:
                inspireid = inspireid[0]
                try:
                    run_sql(
                        "INSERT INTO doi2inspireid(doi, inspireid, creation_date) VALUES(%s, %s, NOW())",
                        (doi, inspireid))
                except IntegrityError, err:
                    # this INSPIRE id is already mapped to a different DOI
                    other_doi = run_sql(
                        "SELECT doi FROM doi2inspireid WHERE inspireid=%s",
                        (inspireid, ))[0][0]
                    record.warn(
                        "This record with doi %s is connected with INSPIRE id %s which is already connected to doi %s"
                        % (doi, inspireid, other_doi))
                    continue
            else:
                # zero or several remote matches: ambiguous, skip the record
                record.warn(
                    "More than one inspire ID matches this record: %s" %
                    inspireid)
                continue
Exemple #38
0
def papers_by_country_with_affs_csv(req, country):
    """Stream a semicolon-separated CSV of papers matching *country* to *req*.

    One row per paper (title, journal, DOI, INSPIRE link) followed by one
    row per author with their affiliations.
    """
    req.content_type = 'text/csv; charset=utf-8'
    req.headers_out['content-disposition'] = ('attachment; '
                                              'filename=papers_by_country.csv')

    ## print the list of links to the articles
    count = 1
    print >> req, country
    # match the country in first-author (100) or other-author (700) fields
    search = "100__w:'%s' OR 700__w:'%s'" % (country, country)
    res = perform_request_search(p='%s' % (search, ))
    print >> req, "#;Title;Journal;DOI;Inspire record;Author;Affiliations"
    if len(res):
        for rec_id in res:
            author_count = 11
            rec = get_record(rec_id)
            title = ''
            authors = ''
            journal = ''
            doi = ''
            inspire_record = ''
            if '245' in rec:
                # strip any markup tags from the title
                title = re.sub("<.*?>", "", rec['245'][0][0][0][1])
            # NOTE(review): rec['773'] is read without a key guard; records
            # lacking a 773 field would raise KeyError here - confirm intended.
            for sub in rec['773'][0][0]:
                if 'p' in sub[0]:
                    journal = sub[1]
            doi = get_doi(rec_id)
            if '035' in rec:
                for f in rec['035'][0][0]:
                    if 'a' in f:
                        inspire_record = 'http://inspirehep.net/record/%s' % (
                            f[1], )
            print >> req, "%s;%s;%s;%s;%s;;" % (count, title, journal, doi,
                                                inspire_record)
            if '100' in rec:
                # first author plus their 100__v affiliations
                author = rec['100'][0][0][0][1]
                affiliations = record_get_field_values(rec,
                                                       tag='100',
                                                       code='v')
                print >> req, ";;;;;%s;%s" % (author, " | ".join(affiliations))
            if '700' in rec:
                # additional authors, affiliations from each field instance
                for auth in rec['700']:
                    author = auth[0][0][1]
                    affiliations = field_get_subfield_values(auth, code='v')
                    print >> req, ";;;;;%s;%s" % (author,
                                                  " | ".join(affiliations))
            count += 1
Exemple #39
0
def user_can_edit_record_collection(req, recid):
    """ Check if user has authorization to modify a collection
    the recid belongs to
    """
    def remove_volatile(field_value):
        """ Strip the volatile keyword prefix from a field value, if any """
        if field_value.startswith(VOLATILE_PREFIX):
            return field_value[len(VOLATILE_PREFIX):]
        return field_value

    # Collections the stored record already belongs to
    record_collections = get_all_collections_of_a_record(recid)

    uid = getUid(req)
    # A record still being created only lives in the edit cache: pull its
    # 980__a collection values from there as well.
    if cache_exists(recid, uid):
        record = get_cache_file_contents(recid, uid)[2]
        for value in record_get_field_values(record, '980', code="a"):
            record_collections.append(remove_volatile(value))

    # Keep only collection names the action table actually knows about
    normalized_collections = []
    for collection in record_collections:
        res = run_sql(
            """SELECT value FROM accARGUMENT
                         WHERE keyword='collection'
                         AND value=%s;""", (collection, ))
        if res:
            normalized_collections.append(res[0][0])

    if not normalized_collections:
        # No specific collection matched: check the blanket permission
        auth_code, auth_message = acc_authorize_action(req,
                                                       'runbibedit',
                                                       collection='')
        return auth_code == 0
    for collection in normalized_collections:
        auth_code, auth_message = acc_authorize_action(
            req, 'runbibedit', collection=collection)
        if auth_code == 0:
            return True
    return False
Exemple #40
0
def user_can_edit_record_collection(req, recid):
    """ Check if user has authorization to modify a collection
    the recid belongs to
    """
    def remove_volatile(field_value):
        """ Remove volatile keyword from field value """
        if field_value.startswith(VOLATILE_PREFIX):
            return field_value[len(VOLATILE_PREFIX):]
        return field_value

    # Collections the stored record already belongs to
    record_collections = get_all_collections_of_a_record(recid)

    uid = collect_user_info(req)["uid"]
    # For a record still being created, its 980__a values only exist in
    # the edit cache, so collect them from there as well.
    if cache_exists(recid, uid):
        cached_record = get_cache_contents(recid, uid)[2]
        record_collections.extend(
            remove_volatile(value)
            for value in record_get_field_values(cached_record, '980', code="a"))

    # Keep only collection names that the action table knows about
    normalized_collections = []
    for collection in record_collections:
        res = run_sql("""SELECT value FROM accARGUMENT
                         WHERE keyword='collection'
                         AND value=%s;""", (collection,))
        if res:
            normalized_collections.append(res[0][0])

    if not normalized_collections:
        # No specific collection matched: check the blanket permission
        auth_code, dummy_message = acc_authorize_action(req, 'runbibedit',
                                                       collection='')
        return auth_code == 0
    for collection in normalized_collections:
        auth_code, dummy_message = acc_authorize_action(req, 'runbibedit',
                                                        collection=collection)
        if auth_code == 0:
            return True
    return False
    def _get_approximate_address(record):
        """Return best-effort location parts for *record*.

        Conference records (111__c present) yield the comma-split venue;
        otherwise parts of 371__ are combined.  NOTE(review): reads the
        free variable `allfields` from the enclosing scope - confirm it
        is defined wherever this helper is used.
        """
        def _is_a_conference(record):
            # non-empty 111__c values mark a conference record
            return record_get_field_values(record, '111', code='c')

        if _is_a_conference(record):
            return record_get_field_value(record, '111', code='c').split(', ')
        else:
            city = record_get_field_value(record, '371', code='b') or None
            zipcode = record_get_field_value(record, '371', code='e') or None
            country = record_get_field_value(record, '371', code='d') or None
            address = [city, zipcode, country]
            # fall back to the free-text 371__a lines when any structured
            # part is missing (or when all fields were requested)
            if allfields or None in address:
                addresses = record_get_field_values(record, '371', code='a')
                if zipcode:
                    # drop 371__a lines that merely repeat the zipcode
                    addresses = [el for el in addresses if zipcode not in el]
                addresses.extend([city, zipcode, country])
                # filter out empty/None entries
                addresses = [el for el in addresses if el]
                return addresses
            return address
def retrieve_field_values(curdir, field_name, separator=None, system_number_file='SN', tag=None):
    """
    This is a handy function to retrieve values either from the current
    submission directory, when a form has been just submitted, or from
    an existing record (e.g. during MBI action).

    @param curdir: is the current submission directory.
    @type curdir: string
    @param field_name: is the form field name that might exists on disk.
    @type field_name: string
    @param separator: is an optional separator. If it exists, it will be used
        to retrieve multiple values contained in the field.
    @type separator: string
    @param system_number_file: is the name of the file on disk in curdir, that
        is supposed to contain the record id.
    @type system_number_file: string
    @param tag: is the full MARC tag (tag+ind1+ind2+code) that should
        contain values. If not specified, only values in curdir will
        be retrieved.
    @type tag: 6-chars
    @return: the field value(s).
    @rtype: list of strings.

    @note: if field_name exists in curdir it will take precedence over
        retrieving the values from the record.
    """
    field_file = os.path.join(curdir, field_name)
    if os.path.exists(field_file):
        # BUGFIX: close the file deterministically instead of leaking
        # the handle (the old open(...).read() never closed it)
        with open(field_file) as stream:
            field_value = stream.read()
        if separator is not None:
            return [value.strip() for value in field_value.split(separator) if value.strip()]
        return [field_value.strip()]
    if tag is not None:
        system_number_file = os.path.join(curdir, system_number_file)
        if os.path.exists(system_number_file):
            with open(system_number_file) as stream:
                recid = int(stream.read().strip())
            record = get_record(recid)
            if separator:
                return record_get_field_values(record, tag[:3], tag[3], tag[4], tag[5])
            return [record_get_field_value(record, tag[:3], tag[3], tag[4], tag[5])]
    return []
Exemple #43
0
def check_record(record):
    """ move 8564_u/y to 035__a/9

    Link fields (8564) whose $y equals the module-level `provenance`
    are converted into 035 identifier fields and then removed; ids are
    recognised with the module-level `kekidre` regex.
    """
    def normalized(kekid):
        """ normalize 'kekid' by stripping dashes and dropping the leading '19'
            if present
        """
        normid = kekid.replace('-', '')
        if normid.startswith('19') and len(normid) == 9:
            normid = normid[-7:]
        return normid

    # number of fields deleted so far, used to adjust later positions
    delcount = 0
    kekids = set()
    #  look up IDs already present in 035
    for kekid in record_get_field_values(record,
                                         '035',
                                         code='a',
                                         filter_subfield_code='9',
                                         filter_subfield_value=provenance):
        kekids.add(normalized(kekid))
    for pos, val in record.iterfield('8564_u',
                                     subfield_filter=('y', provenance)):
        if val:
            kekidmatch = kekidre.match(val)
            if kekidmatch:
                kekid = (kekidmatch.group(1) or '') + kekidmatch.group(2)
                # only add a new 035 when this id is not already present
                if normalized(kekid) not in kekids:
                    kekids.add(normalized(kekid))
                    subfields_to_add = (('9', provenance), ('a', kekid))
                    record_add_field(record,
                                     tag='035',
                                     ind1='_',
                                     ind2='_',
                                     subfields=subfields_to_add)
                # drop the 8564 link; earlier deletions shift positions,
                # hence the delcount correction
                record.delete_field((pos[0][0:3], pos[1] - delcount, None))
                delcount += 1
                record.set_amended("moved link for %s:%s" %
                                   (provenance, kekid))
            else:
                record.warn('no match for [%s]' % val)
    if len(kekids) > 1:
        record.warn('more than 1 KEK id present: %s' % kekids)
    def _get_approximate_address(record):
        """Return best-effort location parts for *record*: the comma-split
        venue for conference records (111__c present), otherwise parts of
        371__.  NOTE(review): uses the free variable `allfields` from the
        enclosing scope - confirm it is defined at the point of use.
        """
        def _is_a_conference(record):
            # non-empty 111__c values mark a conference record
            return record_get_field_values(record, '111', code='c')

        if _is_a_conference(record):
            return record_get_field_value(
                record, '111', code='c').split(', ')
        else:
            city = record_get_field_value(record, '371', code='b') or None
            zipcode = record_get_field_value(record, '371', code='e') or None
            country = record_get_field_value(record, '371', code='d') or None
            address = [city, zipcode, country]
            # fall back to the free-text 371__a lines when any structured
            # part is missing (or when all fields were requested)
            if allfields or None in address:
                addresses = record_get_field_values(record, '371', code='a')
                if zipcode:
                    # drop 371__a lines that merely repeat the zipcode
                    addresses = [el for el in addresses if zipcode not in el]
                addresses.extend([city, zipcode, country])
                # filter out empty/None entries
                addresses = [el for el in addresses if el]
                return addresses
            return address
def _detect_980_values_from_marcxml_file(recs):
    """
    Read MARCXML file and return list of 980 $a values found in that file.
    Useful for checking rights.
    """
    from invenio.bibrecord import record_get_field_values

    # the collection MARC tag (normally 980__a) is configured in the DB
    collection_tag = run_sql("SELECT value FROM tag, field_tag, field \
                              WHERE tag.id=field_tag.id_tag AND \
                              field_tag.id_field=field.id AND \
                              field.code='collection'")
    collection_tag = collection_tag[0][0]
    dbcollids = {}
    for rec, dummy1, dummy2 in recs:
        if not rec:
            continue
        values = record_get_field_values(rec,
                                         tag=collection_tag[:3],
                                         ind1=collection_tag[3],
                                         ind2=collection_tag[4],
                                         code=collection_tag[5])
        # the dict is used as a set to deduplicate the values
        for value in values:
            dbcollids[value] = 1
    return dbcollids.keys()
Exemple #46
0
def papers_by_country_with_affs_csv(req, country):
    """Stream a semicolon-separated CSV of papers matching *country* to *req*.

    One row per paper (title, journal, DOI, INSPIRE link) followed by one
    row per author with their affiliations.
    """
    req.content_type = 'text/csv; charset=utf-8'
    req.headers_out['content-disposition'] = ('attachment; '
                                              'filename=papers_by_country.csv')

    ## print the list of links to the articles
    count = 1
    print >> req, country
    # match the country in first-author (100) or other-author (700) fields
    search = "100__w:'%s' OR 700__w:'%s'" % (country, country)
    res = perform_request_search(p='%s' % (search,))
    print >> req, "#;Title;Journal;DOI;Inspire record;Author;Affiliations"
    if len(res):
        for rec_id in res:
            author_count = 11
            rec = get_record(rec_id)
            title = ''
            authors = ''
            journal = ''
            doi = ''
            inspire_record = ''
            if '245' in rec:
                # strip any markup tags from the title
                title = re.sub("<.*?>", "", rec['245'][0][0][0][1])
            # NOTE(review): rec['773'] is read without a key guard; records
            # lacking a 773 field would raise KeyError here - confirm intended.
            for sub in rec['773'][0][0]:
                if 'p' in sub[0]:
                    journal = sub[1]
            doi = get_doi(rec_id)
            if '035' in rec:
                for f in rec['035'][0][0]:
                    if 'a' in f:
                        inspire_record = 'http://inspirehep.net/record/%s' % (f[1],)
            print >> req, "%s;%s;%s;%s;%s;;" % (count, title, journal, doi, inspire_record)
            if '100' in rec:
                # first author plus their 100__v affiliations
                author = rec['100'][0][0][0][1]
                affiliations = record_get_field_values(rec, tag='100', code='v')
                print >> req, ";;;;;%s;%s" % (author, " | ".join(affiliations))
            if '700' in rec:
                # additional authors, affiliations from each field instance
                for auth in rec['700']:
                    author = auth[0][0][1]
                    affiliations = field_get_subfield_values(auth, code='v')
                    print >> req, ";;;;;%s;%s" % (author, " | ".join(affiliations))
            count += 1
def extract_035_id(record):
    """ Gets the value of the 035__a field
    Parameters:
     (BibRecord) record - the record to look at
    Return: the ID's remote mapping (a digit string), or None when no
    suitable 035__a value is found or the lookup fails"""
    try:
        # 035__a values whose $9 matches FILTER_VALUE (regex mode)
        field_vals = record_get_field_values(record, "035", code="a",
                                             filter_subfield_code="9",
                                             filter_subfield_value=FILTER_VALUE,
                                             filter_subfield_mode="r")
        # keep only purely-numeric ids, deduplicated
        field_vals = [x for x in set(field_vals) if x.isdigit()]
        if not field_vals:
            return None
        if len(field_vals) == 1 and field_vals[0]:
            return field_vals[0]
        elif len(field_vals) > 1:
            _print("Warning: Multiple recids found in 035 for record", 6)
            _print("Assuming local recid is %s" % field_vals[0], 6)
            return field_vals[0]
    except (KeyError, ValueError, IndexError) as exc:
        # BUGFIX: BaseException.message is deprecated (removed in Python 3);
        # use str(exc) instead.
        _print(str(exc), 5)
Exemple #48
0
def _detect_980_values_from_marcxml_file(recs):
    """
    Read MARCXML file and return list of 980 $a values found in that file.
    Useful for checking rights.
    """
    from invenio.bibrecord import record_get_field_values

    collection_tag = run_sql("SELECT value FROM tag, field_tag, field \
                              WHERE tag.id=field_tag.id_tag AND \
                              field_tag.id_field=field.id AND \
                              field.code='collection'")
    collection_tag = collection_tag[0][0]
    # split the 6-char tag spec into its MARC components once
    tag, ind1, ind2, code = (collection_tag[:3], collection_tag[3],
                             collection_tag[4], collection_tag[5])
    # the dict is used as a set to deduplicate the values
    dbcollids = {}
    for rec, dummy1, dummy2 in recs:
        if rec:
            for value in record_get_field_values(rec, tag=tag, ind1=ind1,
                                                 ind2=ind2, code=code):
                dbcollids[value] = 1
    return dbcollids.keys()
Exemple #49
0
def check_record(record):
    """ move 8564_u/y to 035__a/9

    Link fields (8564) whose $y equals the module-level `provenance`
    are converted into 035 identifier fields and then removed; ids are
    recognised with the module-level `kekidre` regex.
    """
    def normalized(kekid):
        """ normalize 'kekid' by stripping dashes and dropping the leading '19'
            if present
        """
        normid = kekid.replace('-', '')
        if normid.startswith('19') and len(normid) == 9:
            normid = normid[-7:]
        return normid
    # number of fields deleted so far, used to adjust later positions
    delcount = 0
    kekids = set()
    #  look up IDs already present in 035
    for kekid in record_get_field_values(
            record, '035', code='a',
            filter_subfield_code='9',
            filter_subfield_value=provenance):
        kekids.add(normalized(kekid))
    for pos, val in record.iterfield('8564_u',
                                     subfield_filter=('y', provenance)):
        if val:
            kekidmatch = kekidre.match(val)
            if kekidmatch:
                kekid = (kekidmatch.group(1) or '') + kekidmatch.group(2)
                # only add a new 035 when this id is not already present
                if normalized(kekid) not in kekids:
                    kekids.add(normalized(kekid))
                    subfields_to_add = (('9', provenance),
                                        ('a', kekid))
                    record_add_field(record, tag='035', ind1='_', ind2='_',
                                     subfields=subfields_to_add)
                # drop the 8564 link; earlier deletions shift positions,
                # hence the delcount correction
                record.delete_field((pos[0][0:3], pos[1] - delcount, None))
                delcount += 1
                record.set_amended(
                    "moved link for %s:%s" % (provenance, kekid))
            else:
                record.warn('no match for [%s]' % val)
    if len(kekids) > 1:
        record.warn('more than 1 KEK id present: %s' % kekids)
def get_indexable_data(record):
    """
    Returns indexable data for a Bibrecord institution record in Solr.

    @param record: bibrecord structure of an institution record
    @return: dict of Solr field name -> value(s)
    """
    # Mapped from https://twiki.cern.ch/twiki/bin/view/Inspire/DevelopmentRecordMarkupInstitutions#Field_Mapping_final
    data = {}

    data['id'] = bibrecord.record_get_field_value(record, '001')
    # first non-empty of 110__t / 110__u / 110__a
    display_name = bibrecord.record_get_field_value(record, '110', '', '', 't') or \
            bibrecord.record_get_field_value(record, '110', '', '', 'u') or \
            bibrecord.record_get_field_value(record, '110', '', '', 'a')
    data['display_name'] = display_name.decode('utf_8')
    desy_icn = bibrecord.record_get_field_value(record, '110', '', '', 'u')
    data['desy_icn'] = desy_icn.decode('utf_8')

    # collect the configured MARC tags for each Solr index, deduplicated
    for index, tags in INDEX_FIELDS.items():
        values = []
        for tag in tags:
            for value in bibrecord.record_get_field_values(
                    record, tag[:3], tag[3], tag[4], tag[5]):
                values.append(value.decode('utf_8'))
        if values:
            data[index] = list(set(values))

    # Name variants
    name_variants = get_name_variants(record)
    if name_variants:
        data['name_variants'] = name_variants

    old = bibrecord.record_get_field_value(record, '110', '', '', 'u')
    new = bibrecord.record_get_field_value(record, '110', '', '', 't')

    if old and new and old != new:
        # BUGFIX: close the log file instead of leaking the handle
        with open('etc/old_new.txt', 'a') as log:
            log.write('%s\t%s\n' % (old, new))

    return data
def get_indexable_data(record):
    """
    Return indexable data for a BibRecord institution record in Solr.

    NOTE(review): this is a duplicate definition of get_indexable_data;
    being later in the module it shadows the earlier copy. Consider
    removing one of the two.

    @param record: institution record in BibRecord structure
    @return: dict mapping Solr field names to unicode values (or lists
        of unicode values for the configured multi-valued indexes)
    """
    # Mapped from https://twiki.cern.ch/twiki/bin/view/Inspire/DevelopmentRecordMarkupInstitutions#Field_Mapping_final
    data = {}

    data['id'] = bibrecord.record_get_field_value(record, '001')
    # Display name preference order: 110__t, then 110__u, then 110__a.
    display_name = bibrecord.record_get_field_value(record, '110', '', '', 't') or \
            bibrecord.record_get_field_value(record, '110', '', '', 'u') or \
            bibrecord.record_get_field_value(record, '110', '', '', 'a')
    data['display_name'] = display_name.decode('utf_8')
    desy_icn = bibrecord.record_get_field_value(record, '110', '', '', 'u')
    data['desy_icn'] = desy_icn.decode('utf_8')

    # Collect deduplicated values for every configured index; each tag is
    # a 6-char string: MARC tag (3) + ind1 + ind2 + subfield code.
    for index, tags in INDEX_FIELDS.items():
        values = [
            value.decode('utf_8')
            for tag in tags
            for value in bibrecord.record_get_field_values(
                record, tag[:3], tag[3], tag[4], tag[5])
        ]
        if values:
            data[index] = list(set(values))

    # Name variants
    name_variants = get_name_variants(record)
    if name_variants:
        data['name_variants'] = name_variants

    old = bibrecord.record_get_field_value(record, '110', '', '', 'u')
    new = bibrecord.record_get_field_value(record, '110', '', '', 't')

    if old and new and old != new:
        # BUGFIX: use a context manager so the log file handle is closed
        # deterministically instead of leaking via open(...).write(...).
        with open('etc/old_new.txt', 'a') as log:
            log.write('%s\t%s\n' % (old, new))

    return data
def extract_035_id(record):
    """Return the record's remote identifier from 035__a, if any.

    Looks at 035__a values whose $9 subfield matches FILTER_VALUE
    (regex mode), keeps only purely numeric ones, and returns one of
    them; None when nothing usable is found or on lookup errors.

    @param record: BibRecord structure to inspect
    @return: the remote recid as a string, or None
    """
    try:
        candidates = record_get_field_values(
            record,
            "035",
            code="a",
            filter_subfield_code="9",
            filter_subfield_value=FILTER_VALUE,
            filter_subfield_mode="r")
        # Deduplicate and keep numeric identifiers only.
        candidates = [val for val in set(candidates) if val.isdigit()]
        if not candidates:
            return None
        if len(candidates) > 1:
            _print("Warning: Multiple recids found in 035 for record", 6)
            _print("Assuming local recid is %s" % candidates[0], 6)
        return candidates[0]
    except (KeyError, ValueError, IndexError) as exc:
        _print(exc.message, 5)
Example #53
0
 def lazy_parser(collection, left_tags, right_tags, volume_subfield):
     """
     Lazily yield (variant, canonical) string pairs harvested from records.

     For every record of *collection*, each non-empty value found under
     the *right_tags* is first yielded as an identity pair
     (value, value).  Then, for every *left_tags* field on the same
     record carrying subfield ``left_tag[5]``, a pair mapping that
     subfield value to the right value is yielded; when the field also
     carries *volume_subfield*, the canonical side becomes
     "right_value;volume".

     @param collection: name of the collection whose records are scanned
     @param left_tags: iterable of 6-char tag strings (tag+ind1+ind2+code)
     @param right_tags: iterable of 6-char tag strings (tag+ind1+ind2+code)
     @param volume_subfield: subfield code holding the volume, if present
     """
     for recid in get_collection_reclist(collection):
         record = get_record(recid)
         for right_tag in right_tags:
             for right_value in record_get_field_values(
                     record, right_tag[:3], right_tag[3], right_tag[4],
                     right_tag[5]):
                 if not right_value:
                     continue  # Empty metadata
                 # Identity mapping: the canonical value maps to itself.
                 yield right_value, right_value
                 for left_tag in left_tags:
                     for left_field in record_get_field_instances(
                             record, left_tag[:3], left_tag[3],
                             left_tag[4]):
                         # Map subfield code -> value for this field.
                         left_subfields = dict(
                             field_get_subfield_instances(left_field))
                         if left_tag[5] not in left_subfields:
                             continue  # Empty field
                         if volume_subfield in left_subfields:
                             # Attach the volume to the canonical side.
                             yield left_subfields[left_tag[5]], '%s;%s' % (
                                 right_value,
                                 left_subfields[volume_subfield])
                         else:
                             yield left_subfields[left_tag[5]], right_value
Example #54
0
    def _next_value(self, recid=None, xml_record=None, bibrecord=None):
        """
        Returns the next texkey for the given recid

        @param recid: id of the record where the texkey will be generated
        @type recid: int

        @param xml_record: record in xml format
        @type xml_record: string

        @param bibrecord: pre-parsed record structure, used as-is when
        neither recid nor xml_record identifies the record
        @type bibrecord: dict (BibRecord structure)

        @return: next texkey for the given recid.
        @rtype: string

        @raises TexkeyNoAuthorError: No main author (100__a) or collaboration
        (710__g) in the given recid

        @raises TexkeyNoYearError: no usable year found in 269__c, 260__c,
        773__y or 502__d
        """
        # Resolve the record: prefer an explicit XML string, then an
        # already-parsed bibrecord, finally load from the DB by recid.
        if recid is None and xml_record is not None:
            bibrecord = create_record(xml_record)[0]
        elif bibrecord is None:
            bibrecord = get_bibrecord(recid)

        main_author = record_get_field_value(bibrecord,
                                             tag="100",
                                             ind1="",
                                             ind2="",
                                             code="a")

        if not main_author:
            # Try with collaboration name
            main_author = record_get_field_value(bibrecord,
                                                 tag="710",
                                                 ind1="",
                                                 ind2="",
                                                 code="g")
            # Drop the word "collaboration" (case-insensitive) and glue
            # the remaining words together, e.g. "ATLAS Collaboration"
            # becomes "ATLAS".
            main_author = "".join([
                p for p in main_author.split() if p.lower() != "collaboration"
            ])

        if not main_author:
            # Try with corporate author
            main_author = record_get_field_value(bibrecord,
                                                 tag="110",
                                                 ind1="",
                                                 ind2="",
                                                 code="a")
            if not main_author:
                # Check if it is a Proceedings record
                collections = [
                    collection.lower() for collection in
                    record_get_field_values(bibrecord, "980", code="a")
                ]
                if "proceedings" in collections:
                    main_author = "Proceedings"
                else:
                    raise TexkeyNoAuthorError

        # Remove utf-8 special characters
        main_author = unidecode(main_author.decode('utf-8'))
        texkey_first_part = ""
        try:
            # Keep only the surname: everything before the first comma.
            texkey_first_part = main_author.split(',')[0]
        except KeyError:
            # NOTE(review): str.split() always returns a non-empty list,
            # so neither the split nor the [0] index can raise KeyError;
            # this handler looks unreachable -- confirm before removing.
            raise TexkeyNoAuthorError

        # sanitize for texkey use, require at least one letter
        texkey_first_part = re.sub(r'[^-A-Za-z0-9.:/^_;&*<>?|!$+]', '',
                                   texkey_first_part)
        if len(texkey_first_part) < 1 \
           or not re.search(r'[A-Za-z]', texkey_first_part):
            raise TexkeyNoAuthorError

        # Year fallback chain: 269__c, then 260__c, then 773__y, then
        # 502__d (presumably preprint date, imprint date, journal year
        # and thesis date -- confirm against the MARC configuration).
        year = _get_year(
            record_get_field_value(bibrecord,
                                   tag="269",
                                   ind1="",
                                   ind2="",
                                   code="c"))
        if not year:
            year = _get_year(
                record_get_field_value(bibrecord,
                                       tag="260",
                                       ind1="",
                                       ind2="",
                                       code="c"))
            if not year:
                year = _get_year(
                    record_get_field_value(bibrecord,
                                           tag="773",
                                           ind1="",
                                           ind2="",
                                           code="y"))
                if not year:
                    year = _get_year(
                        record_get_field_value(bibrecord,
                                               tag="502",
                                               ind1="",
                                               ind2="",
                                               code="d"))
                    if not year:
                        raise TexkeyNoYearError

        # NOTE(review): year is always truthy here (otherwise
        # TexkeyNoYearError was raised above), so the guard is redundant.
        texkey_second_part = ''
        if year:
            texkey_second_part = year

        # Random suffix disambiguates records with same author and year.
        texkey_third_part = _texkey_random_chars(recid)

        texkey = "%s:%s%s" % \
                 (texkey_first_part, texkey_second_part, texkey_third_part)

        tries = 0
        # Collision handling: regenerate the random suffix until the key
        # is unused or TEXKEY_MAXTRIES is exhausted.
        tries = 0
        while self._value_exists(texkey) and tries < TEXKEY_MAXTRIES:
            # Key is already in the DB, generate a new one
            texkey_third_part = _texkey_random_chars(recid, use_random=True)
            texkey = "%s:%s%s" % \
                     (texkey_first_part, texkey_second_part, texkey_third_part)
            tries += 1

        return texkey
Example #55
0
def oairepositoryupdater_task():
    """Main business logic code of oai_archive.

    Recomputes OAI set membership for every record: assigns missing OAI
    identifiers, refreshes the set / previous-set marc fields, and
    submits the changes to bibupload in chunks (unless "no_upload" is
    set).  Returns True on normal completion.
    """
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        # Report-only mode: print repository status and stop.
        print_repository_status(verbose=report)
        return True

    # Avoid piling up work: bail out if a previous oairepository upload
    # is still waiting in the scheduler queue.
    if run_sql("SELECT id FROM schTASK WHERE proc='bibupload:oairepository' AND status='WAITING'"):
        write_message("Previous requests of oairepository still being elaborated. Let's skip this execution.")
        return True

    # Snapshot the set definitions at start (debug aid at verbose>=2).
    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot), verbose=2)

    task_update_progress("Fetching records to process")

    recids_with_oaiid = search_unit_in_bibxxx(p='*', f=CFG_OAI_ID_FIELD, type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid), verbose=2)

    all_current_recids = search_unit_in_bibxxx(p='*', f=CFG_OAI_SET_FIELD, type='e')
    # Starts as "everything currently exported"; recids still wanted by
    # some set are subtracted below, leaving the ones to retire.
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported" % (len(all_current_recids)), verbose=2)

    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    # For each set, diff the records that SHOULD belong to it against
    # the records currently marked as belonging to it.
    for set_spec in all_set_specs():
        if not set_spec:
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec, f=CFG_OAI_SET_FIELD, type='e')
        write_message("%s recids should be in %s. Currently %s are in %s" % (len(should_recids), set_spec, len(current_recids), set_spec), verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s" % (len(to_add), set_spec), verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s" % (len(to_remove), set_spec), verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should be hence updated for %s" % (len(affected_recids), set_spec), verbose=2)
        all_affected_recids |= affected_recids

    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported" % len(no_more_exported_recids))

    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should updated" % (len(all_affected_recids)), verbose=2)

    if not all_affected_recids:
        write_message("Nothing to do!")
        return True

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR,
                                  prefix='oairepository_' + \
                                  time.strftime("%Y%m%d_%H%M%S_",
                                                time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")

    tot = 0
    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records." % \
                             (i, len(all_affected_recids)))

        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid, verbose=3)
            continue
        new_record = {}

        # Check if an OAI identifier is already in the record or
        # not.
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record, tag=CFG_OAI_ID_FIELD[:3], ind1=CFG_OAI_ID_FIELD[3], ind2=CFG_OAI_ID_FIELD[4], code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s" % (oai_id_entry, recid), verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s" % (oai_id_entry, recid), verbose=3)

        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(record_get_field_values(record, tag=CFG_OAI_SET_FIELD[:3], ind1=CFG_OAI_SET_FIELD[3], ind2=CFG_OAI_SET_FIELD[4], code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s" % (recid, ", ".join(current_oai_sets)), verbose=3)

        current_previous_oai_sets = set(record_get_field_values(record, tag=CFG_OAI_PREVIOUS_SET_FIELD[:3], ind1=CFG_OAI_PREVIOUS_SET_FIELD[3], ind2=CFG_OAI_PREVIOUS_SET_FIELD[4], code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message("Record %s currently doesn't belong anymore to these oai_sets: %s" % (recid, ", ".join(current_previous_oai_sets)), verbose=3)

        # Get the sets that should be in this record according to
        # settings
        updated_oai_sets = set(_set for _set, _recids in recids_for_set.iteritems()
             if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s" % (recid, ", ".join(updated_oai_sets)), verbose=3)

        updated_previous_oai_sets = set(_set for _set in (current_previous_oai_sets - updated_oai_sets) |
             (current_oai_sets - updated_oai_sets))
        write_message("Record %s now doesn't belong anymore to these oai_sets: %s" % (recid, ", ".join(updated_previous_oai_sets)), verbose=3)

        # Ok, we have the old sets and the new sets. If they are equal
        # and oai ID does not need to be added, then great, nothing to
        # change . Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!" % recid, verbose=3)
            continue # Jump to next recid

        write_message("Something has changed for record %s, let's update it!" % recid, verbose=3)
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))

        record_add_field(new_record, tag="001", controlfield_value=str(recid))
        record_add_field(new_record, tag=CFG_OAI_ID_FIELD[:3], ind1=CFG_OAI_ID_FIELD[3], ind2=CFG_OAI_ID_FIELD[4], subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        # Batch full: flush this chunk to bibupload and start a fresh
        # tmp file for the next chunk.
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                if task_get_option("notimechange"):
                    task_low_level_submission('bibupload', 'oairepository', '-c', filename, '-n', '-Noairepository', '-P', '-1')
                else:
                    task_low_level_submission('bibupload', 'oairepository', '-c', filename, '-Noairepository', '-P', '-1')
            # Prepare to save results in a tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR,
                                        prefix='oairepository_' + \
                                        time.strftime("%Y%m%d_%H%M%S_",
                                                        time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)

    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)

    # Upload the final (partial) chunk, or remove the empty tmp file.
    if tot > 0:
        if not no_upload:
            task_sleep_now_if_required(can_stop_too=True)
            if task_get_option("notimechange"):
                task_low_level_submission('bibupload', 'oairepository', '-c', filename, '-n')
            else:
                task_low_level_submission('bibupload', 'oairepository', '-c', filename)
    else:
        os.remove(filename)

    return True
        except (KeyError, IndexError) as err:
            _print('Error: Cannot process record without 001:recid')
            error_records.append(record)
            continue

        if skip_recid_check or not res:
            _print("Record %s does not exist: inserting" % (recid,))
            # No record found
            # Step 2: Appply filter to transform CDS MARC to Inspire MARC
            insert_records.append(apply_filter(record))
            #insert_records.append(record)
        else:
            _print("Record %s found: %r" % (recid, res))

    for record in deleted_records:
        recid = record_get_field_values(record, tag="035", code="a")[0].split(":")[-1]
        res = attempt_record_match(recid)
        if res:
            # Record exists and we should then delete it
            _print("Record %s exists. Delete it" % (recid,))
            append_records.append(record)

    # Output results. Create new files, if necessary.
    if input_filename[-4:].lower() == '.xml':
        input_filename = input_filename[:-4]


    write_record_to_file("%s.insert.xml" % (input_filename,), insert_records)
    _print("%s.insert.xml" % (input_filename,))
    _print("Number of records to insert:  %d\n"
           % (len(insert_records),))
def apply_filter(rec):
    """ Filters the record to be compatible within Inspire
    Parameters:
     * rec - dictionary: BibRecord structure
    Returns: dictionary, BibRecord structure

    Transforms a CDS record in place: relocates identifiers, prunes
    uninteresting fields, normalizes dates/languages/categories, builds
    FFT fields for plots and fulltext, and rewrites the 980 collection
    tags.  The same dict passed in is also returned.
    """
    # Move recid from 001 to 035 if not hidden
    cds_id = rec['001'][0][3]  # presumably the 001 controlfield value -- confirm BibRecord tuple layout
    if not 'hidden' in [x.lower() for x in record_get_field_values(rec, "980",
                                                                   code="a")]:
        record_add_field(rec, '035', subfields=[('9', 'CDS'), ('a', cds_id)])
    # Clear control fields
    record_strip_controlfields(rec)

    # Clear other uninteresting fields
    interesting_fields = ["024", "041", "035", "037", "088", "100",
                          "110", "111", "242", "245", "246", "260",
                          "269", "300", "502", "650", "653", "693",
                          "700", "710", "773", "856", "520", "500",
                          "980"]
    # NOTE: deleting while iterating keys() is safe on Python 2 only,
    # where keys() returns a list copy.
    for tag in rec.keys():
        if tag not in interesting_fields:
            record_delete_fields(rec, tag)

    # 980 Determine Collections
    collections = set([])
    for value in record_get_field_values(rec, '980', code='a'):
        if 'NOTE' in value.upper():
            collections.add('NOTE')
        if 'THESIS' in value.upper():
            collections.add('THESIS')
        if 'CONFERENCEPAPER' in value.upper():
            collections.add('ConferencePaper')


    if is_published(rec):
        collections.add("PUBLISHED")
        collections.add("CITEABLE")

    if not 'NOTE' in collections:
        # TODO: Move this to a KB
        # Report-number prefixes that also mark a record as a NOTE.
        kb = ['ATLAS-CONF-', 'CMS-PAS-', 'ATL-', 'CMS-DP-',
              'ALICE-INT-', 'LHCb-PUB-']
        values = record_get_field_values(rec, "088", code='a')
        for val, rep in product(values, kb):
            if val.startswith(rep):
                collections.add('NOTE')
                break

    # 980 Arxiv tag
    if record_get_field_values(rec, '035', filter_subfield_code="a",
                               filter_subfield_value="arXiv"):
        collections.add("arXiv")

    # 980 HEP && CORE
    collections.add('HEP')
    collections.add('CORE')

    # 980 Conference Note
    if not 'ConferencePaper' in collections:
        for value in record_get_field_values(rec, '962', code='n'):
            if value[-2:].isdigit():
                collections.add('ConferencePaper')
                break

    record_delete_fields(rec, "980")

    # 690 INTNOTE: add a weblib link for CMS internal notes.
    intnote = record_get_field_values(rec, '690', filter_subfield_code="a",
                                      filter_subfield_value='INTNOTE')
    if intnote:
        val_088 = record_get_field_values(rec, '088', filter_subfield_code="a")
        for val in val_088:
            if 'CMS' in val:
                url = ('http://weblib.cern.ch/abstract?CERN-CMS' +
                       val.split('CMS', 1)[-1])
                record_add_field(rec, '856', ind1='4', subfields=[('u', url)])

    # 041 Language: drop English (the default), translate the rest.
    languages = get_languages()
    language_fields = record_get_field_instances(rec, '041')
    record_delete_fields(rec, "041")
    for field in language_fields:
        subs = field_get_subfields(field)
        if 'a' in subs:
            if "eng" in subs['a']:
                continue
            new_value = translate_config(subs['a'][0], languages)
            new_subs = [('a', new_value)]
            record_add_field(rec, "041", subfields=new_subs)

    # 035 Externals: drop junk provenances, move SPIRES ids to 970.
    scn_035_fields = record_get_field_instances(rec, '035')
    forbidden_values = ["cercer",
                        "inspire",
                        "xx",
                        "cern annual report",
                        "cmscms",
                        "wai01"]
    for field in scn_035_fields:
        subs = field_get_subfields(field)
        if '9' in subs:
            if not 'a' in subs:
                continue
            for sub in subs['9']:
                if sub.lower() in forbidden_values:
                    break
            else:
                # No forbidden values (We did not "break")
                suffixes = [s.lower() for s in subs['9']]
                if 'spires' in suffixes:
                    new_subs = [('a', 'SPIRES-%s' % subs['a'][0])]
                    record_add_field(rec, '970', subfields=new_subs)
                    continue
        if 'a' in subs:
            for sub in subs['a']:
                if sub.lower() in forbidden_values:
                    record_delete_field(rec, tag="035",
                                        field_position_global=field[4])

    # 088 report numbers: P0/CM-P0 go to 595, the rest become 037.
    rep_088_fields = record_get_field_instances(rec, '088')
    for field in rep_088_fields:
        subs = field_get_subfields(field)
        if '9' in subs:
            for val in subs['9']:
                if val.startswith('P0') or val.startswith('CM-P0'):
                    sf = [('9', 'CERN'), ('b', val)]
                    record_add_field(rec, '595', subfields=sf)
        for key, val in field[0]:
            if key in ['a', '9'] and not val.startswith('SIS-'):
                record_add_field(rec, '037', subfields=[('a', val)])
    record_delete_fields(rec, "088")

    # 037 Externals also...
    rep_037_fields = record_get_field_instances(rec, '037')
    for field in rep_037_fields:
        subs = field_get_subfields(field)
        if 'a' in subs:
            for value in subs['a']:
                if 'arXiv' in value:
                    # Tag arXiv ids with $9 and copy the 695 category
                    # into $c.
                    new_subs = [('a', value), ('9', 'arXiv')]
                    for fld in record_get_field_instances(rec,  '695'):
                        for key, val in field_get_subfield_instances(fld):
                            if key == 'a':
                                new_subs.append(('c', val))
                                break
                    nf = create_field(subfields=new_subs)
                    record_replace_field(rec, '037', nf, field[4])
        for key, val in field[0]:
            if key in ['a', '9'] and val.startswith('SIS-'):
                record_delete_field(rec, '037', field_position_global=field[4])

    # 242 -> 246: keep translated titles as title variants.
    for field in record_get_field_instances(rec, '242'):
        record_add_field(rec, '246', subfields=field[0])
    record_delete_fields(rec, '242')

    # 269 Date normalization
    for field in record_get_field_instances(rec, '269'):
        for idx, (key, value) in enumerate(field[0]):
            if key == "c":
                field[0][idx] = ("c", convert_date_to_iso(value))
                # NOTE(review): deleting ALL 260 fields from inside this
                # per-subfield loop looks suspicious (it runs once per
                # normalized 269$c) -- confirm this is intentional.
                record_delete_fields(rec, "260")

    if not 'THESIS' in collections:
        # Non-theses: fold the imprint date (260) into 269.
        for field in record_get_field_instances(rec, '260'):
            record_add_field(rec, '269', subfields=field[0])
        record_delete_fields(rec, '260')

    # 300 page number: keep digits/dashes only, drop vague values.
    for field in record_get_field_instances(rec, '300'):
        for idx, (key, value) in enumerate(field[0]):
            if key == 'a':
                if "mult." not in value and value != " p":
                    field[0][idx] = ('a', re.sub(r'[^\d-]+', '', value))
                else:
                    record_delete_field(rec, '300',
                                        field_position_global=field[4])
                    break

    # 100 & 700 punctuate author names
    author_names = record_get_field_instances(rec, '100')
    author_names.extend(record_get_field_instances(rec, '700'))
    for field in author_names:
        subs = field_get_subfields(field)
        # 'XX' / 'YY' presumably act as placeholder values for the $i/$j
        # author-id subfields -- confirm against the CDS conventions.
        if not 'i' in subs or 'XX' in subs['i']:
            if not 'j' in subs or 'YY' in subs['j']:
                for idx, (key, value) in enumerate(field[0]):
                    if key == 'a':
                        field[0][idx] = ('a', punctuate_authorname(value))

    # 700 -> 701 Thesis supervisors
    if 'THESIS' in collections:
        for field in record_get_field_instances(rec, '700'):
            record_add_field(rec, '701', subfields=field[0])
        record_delete_fields(rec, '700')

    # 501 move subfields
    # Shift 502 subfield codes one position: a->b, b->c, c->d.
    fields_501 = record_get_field_instances(rec, '502')
    for idx, field in enumerate(fields_501):
        new_subs = []
        for key, value in field[0]:
            if key == 'a':
                new_subs.append(('b', value))
            elif key == 'b':
                new_subs.append(('c', value))
            elif key == 'c':
                new_subs.append(('d', value))
            else:
                new_subs.append((key, value))
        fields_501[idx] = field_swap_subfields(field, new_subs)

    # 650 Translate Categories
    categories = get_categories()
    category_fields = record_get_field_instances(rec, '650', ind1='1', ind2='7')
    record_delete_fields(rec, "650")
    for field in category_fields:
        for idx, (key, value) in enumerate(field[0]):
            if key == 'a':
                new_value = translate_config(value, categories)
                if new_value != value:
                    new_subs = [('2', 'INSPIRE'), ('a', new_value)]
                else:
                    new_subs = [('2', 'SzGeCERN'), ('a', value)]
                record_add_field(rec, "650", ind1="1", ind2="7",
                                 subfields=new_subs)
                break

    # 653 Free Keywords: mark every keyword as author-provided ($9).
    for field in record_get_field_instances(rec, '653', ind1='1'):
        subs = field_get_subfields(field)
        new_subs = []
        if 'a' in subs:
            for val in subs['a']:
                new_subs.extend([('9', 'author'), ('a', val)])
        new_field = create_field(subfields=new_subs, ind1='1')
        record_replace_field(rec, '653', new_field, field_position_global=field[4])

    experiments = get_experiments()
    # 693 Remove if 'not applicable'
    for field in record_get_field_instances(rec, '693'):
        subs = field_get_subfields(field)
        all_subs = subs.get('a', []) + subs.get('e', [])
        if 'not applicable' in [x.lower() for x in all_subs]:
            record_delete_field(rec, '693',
                                field_position_global=field[4])
        # Rebuild the field with the experiment name translated through
        # the "accelerator---experiment" mapping.
        new_subs = []
        experiment_a = ""
        experiment_e = ""
        for (key, value) in subs.iteritems():
            if key == 'a':
                experiment_a = value[0]
                new_subs.append((key, value[0]))
            elif key == 'e':
                experiment_e = value[0]
        experiment = "%s---%s" % (experiment_a.replace(" ", "-"),
                                  experiment_e)
        translated_experiments = translate_config(experiment,
                                                  experiments)
        new_subs.append(("e", translated_experiments))
        record_delete_field(rec, tag="693",
                            field_position_global=field[4])
        record_add_field(rec, "693", subfields=new_subs)

    # 710 Collaboration: drop $5 and "CERN. Geneva" entries.
    for field in record_get_field_instances(rec, '710'):
        subs = field_get_subfield_instances(field)
        # NOTE(review): popping from the live list while enumerating a
        # slice copy shifts indices after the first removal; two matches
        # in one field would mis-target the second pop -- confirm.
        for idx, (key, value) in enumerate(subs[:]):
            if key == '5':
                subs.pop(idx)
            elif value.startswith('CERN. Geneva'):
                subs.pop(idx)
        if len(subs) == 0:
            record_delete_field(rec, '710', field_position_global=field[4])

    # 773 journal translations
    journals = get_journals()
    for field in record_get_field_instances(rec, '773'):
        subs = field_get_subfield_instances(field)
        new_subs = []
        for idx, (key, value) in enumerate(subs):
            if key == 'p':
                new_subs.append((key, translate_config(value, journals)))
            else:
                new_subs.append((key, value))
        record_delete_field(rec, tag="773",
                            field_position_global=field[4])
        record_add_field(rec, "773", subfields=new_subs)

    # FFT (856) Dealing with graphs
    figure_counter = 0
    for field in record_get_field_instances(rec, '856', ind1='4'):
        subs = field_get_subfields(field)

        newsubs = []
        remove = False

        # Case 1: the link is a figure -> download, convert PDFs to
        # images, and attach as an FFT "Plot".
        if 'z' in subs:
            is_figure = [s for s in subs['z'] if "figure" in s.lower()]
            if is_figure and 'u' in subs:
                is_subformat = [s for s in subs['u'] if "subformat" in s.lower()]
                if not is_subformat:
                    url = subs['u'][0]
                    if url.endswith(".pdf"):
                        # We try to convert
                        fd, local_url = mkstemp(suffix=os.path.basename(url), dir=CFG_TMPSHAREDDIR)
                        os.close(fd)
                        _print("Downloading %s into %s" % (url, local_url), verbose=5)
                        plotfile = ""
                        try:
                            plotfile = download_url(url=url,
                                                    download_to_file=local_url,
                                                    timeout=30.0)
                        except InvenioFileDownloadError:
                            _print("Download failed while attempting to reach %s. Skipping.." % (url,))
                            remove = True
                        if plotfile:
                            converted = convert_images([plotfile])
                            if converted:
                                url = converted.pop()
                                _print("Successfully converted %s to %s" % (local_url, url), verbose=5)
                            else:
                                _print("Conversion failed on %s" % (local_url,))
                                url = None
                                remove = True
                    if url:
                        newsubs.append(('a', url))
                        newsubs.append(('t', 'Plot'))
                        figure_counter += 1
                        if 'y' in subs:
                            newsubs.append(('d', "%05d %s" % (figure_counter, subs['y'][0])))
                            newsubs.append(('n', subs['y'][0]))
                        else:
                            # Get basename without extension.
                            name = os.path.basename(os.path.splitext(subs['u'][0])[0])
                            newsubs.append(('d', "%05d %s" % (figure_counter, name)))
                            newsubs.append(('n', name))

        # Case 2: PDF link (non-pdfa) -> attach as fulltext FFT.
        if not newsubs and 'u' in subs:
            is_fulltext = [s for s in subs['u'] if ".pdf" in s and not "subformat=pdfa" in s]
            if is_fulltext:
                newsubs = [('t', 'INSPIRE-PUBLIC'), ('a', subs['u'][0])]

        # Case 3: zip archive -> unzip and attach every PNG as a Plot.
        if not newsubs and 'u' in subs:
            remove = True
            is_zipfile = [s for s in subs['u'] if ".zip" in s]
            if is_zipfile:
                url = is_zipfile[0]
                local_url = os.path.join(CFG_TMPSHAREDDIR, os.path.basename(url))
                _print("Downloading %s into %s" % (url, local_url), verbose=5)
                zipped_archive = ""
                try:
                    zipped_archive = download_url(url=is_zipfile[0],
                                                  download_to_file=local_url,
                                                  timeout=30.0)
                except InvenioFileDownloadError:
                    _print("Download failed while attempting to reach %s. Skipping.."
                           % (is_zipfile[0],))
                    remove = True
                if zipped_archive:
                    unzipped_archive = unzip(zipped_archive)
                    list_of_pngs = locate("*.png", unzipped_archive)
                    for png in list_of_pngs:
                        if "_vti_" in png or "__MACOSX" in png:
                            continue
                        figure_counter += 1
                        plotsubs = []
                        plotsubs.append(('a', png))
                        caption = '%05d %s' % (figure_counter, os.path.basename(png))
                        plotsubs.append(('d', caption))
                        plotsubs.append(('t', 'Plot'))
                        record_add_field(rec, 'FFT', subfields=plotsubs)

        # Case 4: leftover CERN-internal links (or ps.gz) -> drop.
        if not remove and not newsubs and 'u' in subs:
            urls = ('http://cdsweb.cern.ch', 'http://cms.cern.ch',
                    'http://cmsdoc.cern.ch', 'http://documents.cern.ch',
                    'http://preprints.cern.ch', 'http://cds.cern.ch')
            for val in subs['u']:
                if any(url in val for url in urls):
                    remove = True
                    break
                if val.endswith('ps.gz'):
                    remove = True

        if newsubs:
            record_add_field(rec, 'FFT', subfields=newsubs)
            remove = True

        if remove:
            record_delete_field(rec, '856', ind1='4',
                                field_position_global=field[4])

    # 500 - Preliminary results
    if "THESIS" not in collections:
        subs = [('a', "Preliminary results")]
        record_add_field(rec, "500", subfields=subs)

    # Re-create the 980 collection fields computed earlier.
    for collection in collections:
        record_add_field(rec, '980', subfields=[('a', collection)])

    return rec
def generate_ticket(ticket, record):
    """
    Fill in subject, body and queue of the given BibCatalogTicket.

    Builds a human-readable ticket body from the record's title, authors,
    abstract, categories, comments and external identifiers.  arXiv records
    additionally get direct abstract/PDF links and an "ARXIV:" subject.

    @param ticket: a ticket object as created by BibCatalogTicket() containing
                   the subject, body and queue to create a ticket in.
    @type ticket: record object of BibCatalogTicket.

    @param record: a recstruct object as created by bibrecord.create_record()
    @type record: record object of BibRecord.

    @return: the modified ticket object to create.
    @rtype: BibCatalogTicket
    """
    def tag_code(primary, fallback=None):
        # Resolve a tag code by name; optionally fall back to a second
        # name when the first one is not configured in this installation.
        if fallback is None:
            return load_tag_code_from_name(primary)
        try:
            return load_tag_code_from_name(primary)
        except BibCatalogTagNotFound:
            return load_tag_code_from_name(fallback)

    title_code = tag_code("title")
    abstract_code = tag_code("abstract")
    date_code = tag_code("date", "year")
    category_code = tag_code("subject")
    notes_code = tag_code("note", "comment")
    first_author_code = tag_code("first author name")
    additional_author_code = tag_code("additional author name")
    external_id_code = tag_code("ext system ID", "primary report number")

    recid = record_id_from_record(record)
    # Extra lines appended to the ticket body (arXiv links, when available).
    extra_lines = []

    arxiv_id = _get_minimal_arxiv_id(record, external_id_code)
    if arxiv_id:
        # arXiv record: link straight to the abstract and PDF, and restrict
        # categories/comments to those with arXiv provenance.
        extra_lines.append("ABSTRACT: http://arxiv.org/abs/%s" % (arxiv_id,))
        extra_lines.append("PDF: http://arxiv.org/pdf/%s" % (arxiv_id,))
        categories = record_get_value_with_provenence(
            record=record,
            provenence_code="2",
            provenence_value="arXiv",
            **split_tag_code(category_code))
        comments = record_get_value_with_provenence(
            record=record,
            provenence_code="9",
            provenence_value="arXiv",
            **split_tag_code(notes_code))
        external_ids = arxiv_id
        subject = "ARXIV:" + arxiv_id
    else:
        # Generic record: SzGeCERN-provenance categories, all notes and
        # external IDs, and a record-number based subject.
        categories = record_get_value_with_provenence(
            record=record,
            provenence_code="2",
            provenence_value="SzGeCERN",
            **split_tag_code(category_code))
        comments = record_get_field_values(rec=record,
                                           **split_tag_code(notes_code))
        external_ids = ", ".join(
            record_get_field_values(rec=record,
                                    **split_tag_code(external_id_code)))
        subject = "Record #%s %s" % (recid, external_ids)

    authors = (record_get_field_values(record,
                                       **split_tag_code(first_author_code))
               + record_get_field_values(record,
                                         **split_tag_code(additional_author_code)))

    values = {
        'external_ids': external_ids,
        'submitdate': record_get_field_value(record, **split_tag_code(date_code)),
        'extra_info': "\n".join(extra_lines),
        'title': record_get_field_value(record, **split_tag_code(title_code)),
        'comments': "; ".join(comments),
        'categories': " ".join(categories),
        'authors': " / ".join(authors[:10]),  # cap the author list at 10
        'abstract': record_get_field_value(record, **split_tag_code(abstract_code)),
        'editurl': "%s/record/edit/%s" % (CFG_SITE_URL, recid),
    }
    body = """
%(submitdate)s

External IDs: %(external_ids)s

Title: %(title)s

Authors: %(authors)s

Categories: %(categories)s

Comments: %(comments)s

%(abstract)s

%(extra_info)s

Edit the record now: %(editurl)s

""" % values

    ticket.subject = subject
    # Escape %'s so that later string formatting of the body cannot fail.
    ticket.body = body.replace('%', '%%')
    ticket.queue = "Test"
    return ticket
def oairepositoryupdater_task():
    """Main business logic code of oai_archive.

    Recomputes, for every configured OAI set, which records should be
    exported, compares that with what the record metadata currently says,
    and writes the updated OAI ID/set fields as MARCXML chunks that are
    submitted to bibupload (unless the no_upload option was given).

    @return: True on completion (also when there was nothing to do).
    """
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    # Report-only mode: print the repository status and stop.
    if report > 1:
        print_repository_status(verbose=report)
        return True

    # Debug aid: log the set definitions as they were when the task started.
    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot),
                  verbose=2)

    task_update_progress("Fetching records to process")

    # Records that already carry an OAI identifier in their metadata.
    recids_with_oaiid = search_unit_in_bibxxx(p='*',
                                              f=CFG_OAI_ID_FIELD,
                                              type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid),
                  verbose=2)

    # Records currently marked (in metadata) as belonging to some OAI set.
    # We start from all of them and subtract every record that should still
    # be exported; what remains must have its set membership retired.
    all_current_recids = search_unit_in_bibxxx(p='*',
                                               f=CFG_OAI_SET_FIELD,
                                               type='e')
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported" %
                  (len(all_current_recids)),
                  verbose=2)

    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    # For each set, diff the records that SHOULD be in it (per the set
    # definition) against the records currently tagged as being in it.
    for set_spec in all_set_specs():
        if not set_spec:
            # An empty set spec stands for the global set.
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec,
                                               f=CFG_OAI_SET_FIELD,
                                               type='e')
        write_message(
            "%s recids should be in %s. Currently %s are in %s" %
            (len(should_recids), set_spec, len(current_recids), set_spec),
            verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s" %
                      (len(to_add), set_spec),
                      verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s" %
                      (len(to_remove), set_spec),
                      verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should be hence updated for %s" %
                      (len(affected_recids), set_spec),
                      verbose=2)
        all_affected_recids |= affected_recids

    # Records that belong to a set but never got an OAI ID assigned.
    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported" %
                  len(no_more_exported_recids))

    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should updated" % (len(all_affected_recids)),
                  verbose=2)

    if not all_affected_recids:
        write_message("Nothing to do!")
        return True

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                                  prefix='oairepository_' + \
                                  time.strftime("%Y%m%d_%H%M%S_",
                                                time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")

    # Number of record corrections accumulated in the current chunk file.
    tot = 0
    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records." % \
                             (i, len(all_affected_recids)))

        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid,
                          verbose=3)
            continue
        # new_record holds only the correction (001 + OAI field) to upload.
        new_record = {}

        # Check if an OAI identifier is already in the record or
        # not.
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record,
                                              tag=CFG_OAI_ID_FIELD[:3],
                                              ind1=CFG_OAI_ID_FIELD[3],
                                              ind2=CFG_OAI_ID_FIELD[4],
                                              code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            # Mint a fresh identifier from the configured prefix + recid.
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s" %
                          (oai_id_entry, recid),
                          verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s" %
                          (oai_id_entry, recid),
                          verbose=3)

        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_SET_FIELD[:3],
                                    ind1=CFG_OAI_SET_FIELD[3],
                                    ind2=CFG_OAI_SET_FIELD[4],
                                    code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s" %
                      (recid, ", ".join(current_oai_sets)),
                      verbose=3)

        # Sets the record was previously removed from, per the metadata.
        current_previous_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_PREVIOUS_SET_FIELD[:3],
                                    ind1=CFG_OAI_PREVIOUS_SET_FIELD[3],
                                    ind2=CFG_OAI_PREVIOUS_SET_FIELD[4],
                                    code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message(
            "Record %s currently doesn't belong anymore to these oai_sets: %s"
            % (recid, ", ".join(current_previous_oai_sets)),
            verbose=3)

        # Get the sets that should be in this record according to
        # settings
        updated_oai_sets = set(_set
                               for _set, _recids in recids_for_set.iteritems()
                               if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s" %
                      (recid, ", ".join(updated_oai_sets)),
                      verbose=3)

        # A set is "previous" if the record used to be (or was marked as
        # previously being) in it and is not in it any longer.
        updated_previous_oai_sets = set(
            _set for _set in (current_previous_oai_sets - updated_oai_sets)
            | (current_oai_sets - updated_oai_sets))
        write_message(
            "Record %s now doesn't belong anymore to these oai_sets: %s" %
            (recid, ", ".join(updated_previous_oai_sets)),
            verbose=3)

        # Ok, we have the old sets and the new sets. If they are equal
        # and oai ID does not need to be added, then great, nothing to
        # change . Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!" %
                          recid,
                          verbose=3)
            continue  # Jump to next recid

        write_message("Something has changed for record %s, let's update it!" %
                      recid,
                      verbose=3)
        # Rebuild the full OAI field: ID + current sets + previous sets.
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))

        record_add_field(new_record, tag="001", controlfield_value=str(recid))
        record_add_field(new_record,
                         tag=CFG_OAI_ID_FIELD[:3],
                         ind1=CFG_OAI_ID_FIELD[3],
                         ind2=CFG_OAI_ID_FIELD[4],
                         subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        # Chunking: once the file holds CFG_OAI_REPOSITORY_MARCXML_SIZE
        # corrections, close it, submit it to bibupload, and start a new
        # tmp file for the next chunk.
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                task_low_level_submission('bibupload', 'oairepository', '-c',
                                          filename, '-n')
            # Prepare to save results in a tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                                        prefix='oairepository_' + \
                                        time.strftime("%Y%m%d_%H%M%S_",
                                                        time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)

    # Finalize the last (possibly empty) chunk.
    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)

    if not no_upload:
        task_sleep_now_if_required(can_stop_too=True)
        if tot > 0:
            task_low_level_submission('bibupload', 'oairepository', '-c',
                                      filename, '-n')
        else:
            # Last chunk had no records: drop the empty file instead of
            # submitting it.
            os.remove(filename)

    return True