Example #1
def create_ticket(recid, bibcatalog_system, queue=CFG_REFEXTRACT_TICKET_QUEUE):
    write_message("bibcatalog_system %s" % bibcatalog_system, verbose=1)
    write_message("queue %s" % queue, verbose=1)
    if bibcatalog_system and queue:

        subject = "Refs for #%s" % recid

        # Add report number in the subject
        report_number = ""
        record = get_bibrecord(recid)

        in_hep = False
        for collection_tag in record_get_field_instances(record, "980"):
            for collection in field_get_subfield_values(collection_tag, "a"):
                if collection == "HEP":
                    in_hep = True

        # Only create tickets for HEP
        if not in_hep:
            write_message("not in hep", verbose=1)
            return

        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, "c"):
                if category.startswith("astro-ph"):
                    write_message("astro-ph", verbose=1)
                    # We do not curate astro-ph
                    return

            for report_number in field_get_subfield_values(report_tag, "a"):
                subject += " " + report_number
                break

        text = "%s/record/edit/#state=edit&recid=%s" % (CFG_SITE_SECURE_URL, recid)
        bibcatalog_system.ticket_submit(subject=subject, queue=queue, text=text, recordid=recid)
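
All of these snippets lean on the same bibrecord data shape: a field instance is a tuple (subfields, ind1, ind2, controlfield_value, global_position), where subfields is a list of (code, value) pairs (see the docstring of extend_author_field in Example #37 for a real instance, and the field[4] position lookups in Example #3). Below is a minimal, self-contained sketch of what field_get_subfield_values does over that shape; the stand-in helper is hypothetical, not the Invenio implementation:

def field_get_subfield_values_sketch(field_instance, code):
    # Same contract as invenio.bibrecord.field_get_subfield_values:
    # collect the values of every subfield whose code matches.
    return [value for subfield_code, value in field_instance[0]
            if subfield_code == code]

# A hand-built 980 collection field with blank indicators:
collection_field = ([('a', 'HEP'), ('a', 'CORE')], ' ', ' ', '', 3)
assert field_get_subfield_values_sketch(collection_field, 'a') == ['HEP', 'CORE']
assert field_get_subfield_values_sketch(collection_field, '9') == []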
Example #2
def format_element(bfo, limit, separator=' ; ', extension='[...]', print_links="yes"):
    """
    Prints the list of editors of a record.

    @param limit: the maximum number of editors to display
    @param separator: the separator between editors.
    @param extension: a text printed if more editors than 'limit' exist
    @param print_links: if yes, print the editors as HTML link to their publications
    """
    from urllib import quote
    from invenio.config import CFG_BASE_URL
    from invenio import bibrecord

    authors = bibrecord.record_get_field_instances(bfo.get_record(), '100')

    editors = [bibrecord.field_get_subfield_values(author, 'a')[0]
               for author in authors
               if len(bibrecord.field_get_subfield_values(author, "e")) > 0
               and bibrecord.field_get_subfield_values(author, "e")[0] == "ed."]

    if print_links.lower() == "yes":
        editors = ['<a href="' + CFG_BASE_URL + '/search?f=author&p=' + \
                   quote(editor) + \
                   '&amp;ln='+ bfo.lang + \
                   '">' + editor + '</a>'
                   for editor in editors]

    if limit.isdigit() and len(editors) > int(limit):
        return separator.join(editors[:int(limit)]) + extension

    elif len(editors) > 0:
        return separator.join(editors)
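
The comprehension above keeps only authors whose $e subfield equals "ed.". A standalone sketch of that filter over hand-built field instances (tuple layout as assumed throughout this page; names invented), so it runs without a bfo object:

authors = [
    ([('a', 'Doe, J.'), ('e', 'ed.')], ' ', ' ', '', 1),
    ([('a', 'Smith, A.')], ' ', ' ', '', 2),
]

def subfield_values(field, code):
    # Local stand-in for bibrecord.field_get_subfield_values.
    return [v for c, v in field[0] if c == code]

editors = [subfield_values(a, 'a')[0] for a in authors
           if subfield_values(a, 'e') and subfield_values(a, 'e')[0] == 'ed.']
assert editors == ['Doe, J.']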
Example #3
def check_existing_pdg_fields(recids, pdg_data, current_records):
    _print_out("Comparing new and old PDG data for " + str(len(recids)) +
               " records...")
    records = {}
    for recid in recids:
        record_mod = {}
        record_mod['001'] = deepcopy(current_records[recid]['001'])
        record_mod['084'] = deepcopy(current_records[recid]['084'])
        fields = record_get_field_instances(record_mod, '084')
        current_pdg_data = []
        for field in fields:
            if is_pdg_field(field):
                current_pdg_data.append(
                    field_get_subfield_values(field, 'a')[0])

        current_set = set(current_pdg_data)
        new_set = set(pdg_data[recid])
        deletions = list(current_set - new_set)
        additions = list(new_set - current_set)

        if len(deletions) > 0 or len(additions) > 0:
            if len(deletions) > 0:
                for field in fields:
                    if is_pdg_field(field):
                        if field_get_subfield_values(field,
                                                     'a')[0] in deletions:
                            record_delete_field(record_mod,
                                                '084',
                                                ind1=' ',
                                                ind2=' ',
                                                field_position_global=field[4])

            for pdg_field in additions:
                position = record_add_field(record_mod, '084', ' ', ' ')
                record_add_subfield_into(record_mod,
                                         '084',
                                         '2',
                                         'PDG',
                                         field_position_global=position)
                record_add_subfield_into(record_mod,
                                         '084',
                                         '9',
                                         'PDG',
                                         field_position_global=position)
                record_add_subfield_into(record_mod,
                                         '084',
                                         'a',
                                         pdg_field,
                                         field_position_global=position)

            records[recid] = record_mod
            _print_verbose("Record #" + str(recid) + ": " +
                           str(len(deletions)) + " deletions and " +
                           str(len(additions)) + " additions.")
        else:
            _print_verbose("Nothing to change for record #" + str(recid))

    _print_out(str(len(records)) + " records to be corrected.")
    return records
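
The heart of the comparison is a plain set difference between the PDG codes already on the record and the incoming ones; a minimal sketch with invented codes:

current = set(['S027', 'S041'])
incoming = set(['S041', 'S086'])
deletions = current - incoming   # 084 fields to delete
additions = incoming - current   # 084 fields to add
assert deletions == set(['S027']) and additions == set(['S086'])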
Example #4
def main():
    verbose = '-v' in sys.argv

    recids = perform_request_search(p='-035:spirestex -035:inspiretex', cc='HEP')
    print "Found %s records to assign texkeys" % len(recids)
    processed = []
    to_process = []
    for count, recid in enumerate(recids):
        if count % 300 == 0:
            print 'done %s of %s' % (count, len(recids))

        if verbose:
            print "processing ", recid

        # Check that the record does not have already a texkey
        has_texkey = False
        recstruct = get_record(recid)
        for instance in record_get_field_instances(recstruct, tag="035", ind1="", ind2=""):
            try:
                provenance = field_get_subfield_values(instance, "9")[0]
            except IndexError:
                provenance = ""
            try:
                value = field_get_subfield_values(instance, "z")[0]
            except IndexError:
                value = ""
            provenances = ["SPIRESTeX", "INSPIRETeX"]
            if provenance in provenances and value:
                has_texkey = True
                print "INFO: Record %s has already texkey %s" % (recid, value)

        if not has_texkey:
            TexKeySeq = TexkeySeq()
            new_texkey = ""
            try:
                new_texkey = TexKeySeq.next_value(recid)
            except TexkeyNoAuthorError:
                print "WARNING: Record %s has no first author or collaboration" % recid
                continue
            xml = create_xml(recid, new_texkey)
            processed.append(recid)
            to_process.append(xml)

        if len(to_process) == 500:
            process_chunk(to_process)
            to_process = []

    if to_process:
        process_chunk(to_process)

    # Finally, index all the records processed
    if processed:
        submit_bibindex_task(processed)
Example #5
def check_one_date_per_type(fields1, fields2, final_result, type_check, subfield_list, tag):
    """Function to check if there are multiple dates of the same type"""
    logger.info("        running check_one_date_per_type")

    # I extract all the dates grouped by date type
    date_types = {}
    for field in final_result:
        date_types.setdefault(bibrecord.field_get_subfield_values(field, subfield_list[0][1])[0], []).append(
            bibrecord.field_get_subfield_values(field, subfield_list[0][0])[0]
        )
    # then I check that these dates are unique per type
    for datet in date_types:
        if len(set(date_types[datet])) > 1:
            manage_check_error('Multiple dates for type "%s" in field "%s".' % (datet, tag), type_check, logger)
    return None
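
The setdefault call above buckets date values by their type subfield; any type whose bucket holds more than one distinct date triggers the check error. A sketch with plain tuples standing in for bibrecord fields:

dates = [('date-published', '2004-05-00'),
         ('date-preprint', '2004-02-00'),
         ('date-published', '2004-06-00')]
date_types = {}
for date_type, value in dates:
    date_types.setdefault(date_type, []).append(value)
duplicated = [t for t, vals in date_types.items() if len(set(vals)) > 1]
assert duplicated == ['date-published']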
Example #6
def _create_ticket(recid, bibcatalog_system, queue):
    subject = "Refs for #%s" % recid

    if CFG_INSPIRE_SITE:
        # Add report number in the subject
        report_number = ""
        record = get_bibrecord(recid)

        in_core = False
        for collection_tag in record_get_field_instances(record, "980"):
            for collection in field_get_subfield_values(collection_tag, 'a'):
                if collection == 'CORE':
                    in_core = True
                if collection == 'arXiv':
                    # Do not create tickets for arxiv papers
                    # Tickets for arxiv papers are created in bibcatalog
                    write_message("arXiv paper", verbose=1)
                    return

        # Do not create tickets for user submissions
        for source_field in record_get_field_instances(record, "541"):
            for source in field_get_subfield_values(source_field, "c"):
                if source == "submission":
                    write_message("User submitted paper", verbose=1)
                    return

        # Only create tickets for CORE papers
        if not in_core:
            write_message("not in core papers", verbose=1)
            return

        # Do not create tickets for old records
        creation_date = run_sql("""SELECT creation_date FROM bibrec
                                   WHERE id = %s""", [recid])[0][0]
        if creation_date < datetime.now() - timedelta(days=30 * 4):
            return

        for report_tag in record_get_field_instances(record, "037"):
            for report_number in field_get_subfield_values(report_tag, 'a'):
                subject += " " + report_number
                break

    text = '%s/record/edit/#state=edit&recid=%s' % (CFG_SITE_SECURE_URL, recid)
    bibcatalog_system.ticket_submit(subject=subject,
                                    queue=queue,
                                    text=text,
                                    recordid=recid)
Example #7
def merge_records_xml(marcxml_obj):
    """Function that takes in input a marcxml string and returns containing 
    multiple records identified by the tag "collection" and for each one calls the 
    function to merge the different flavors of the same record 
    (identified by the tag "record"). """
    logger.info(' Merger started.')
    #I get the bibrecord object from the libxml2 one
    all_records = create_record_from_libxml_obj(marcxml_obj, logger)
    merged_records = []
    records_with_merging_probl = []
    for records in all_records:
        #I try to get the bibcode of the record I'm merging
        try:
            system_number_fields = records[0][FIELD_TO_MARC['system number']]
            bibcode = bibrecord.field_get_subfield_values(system_number_fields[0], SYSTEM_NUMBER_SUBFIELD)[0]
        except:
            bibcode = 'Unknown'
        logger.warn(' Merging bibcode "%s".' % bibcode)
        # Get the merged record
        try:
            merged_records.append(merge_multiple_records(records))
        except Exception, error:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            str_error_to_print = exc_type.__name__ + '\t' + str(error) + ' (Merger error)'
            logger.error(' Impossible to merge the record "%s" \t %s' % (bibcode, str_error_to_print))
            records_with_merging_probl.append((bibcode, str_error_to_print))
Example #8
def merge_record_with_template(rec, template_name):
    """ Extend the record rec with the contents of the template and return it"""
    template = get_record_template(template_name)
    if not template:
        return
    template_bibrec = create_record(template)[0]

    for field_tag in template_bibrec:
        if not record_has_field(rec, field_tag):
            for field_instance in template_bibrec[field_tag]:
                record_add_field(rec,
                                 field_tag,
                                 field_instance[1],
                                 field_instance[2],
                                 subfields=field_instance[0])
        else:
            for template_field_instance in template_bibrec[field_tag]:
                subfield_codes_template = field_get_subfield_codes(
                    template_field_instance)
                for field_instance in rec[field_tag]:
                    subfield_codes = field_get_subfield_codes(field_instance)
                    for code in subfield_codes_template:
                        if code not in subfield_codes:
                            field_add_subfield(
                                field_instance, code,
                                field_get_subfield_values(
                                    template_field_instance, code)[0])
    return rec
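
The else branch above completes existing fields rather than appending new ones: any subfield code present in the template instance but missing from the record instance is copied over. A reduced sketch of that step on hand-built instances (field layout assumed, values invented):

template_instance = ([('a', 'TEMPLATE TITLE'), ('9', 'template')], ' ', ' ', '', 1)
record_instance = ([('a', 'Real title')], ' ', ' ', '', 1)

template_codes = set(c for c, _ in template_instance[0])
record_codes = set(c for c, _ in record_instance[0])
for code in template_codes - record_codes:
    value = [v for c, v in template_instance[0] if c == code][0]
    record_instance[0].append((code, value))   # what field_add_subfield does
assert ('9', 'template') in record_instance[0]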
Example #9
def merge_record_with_template(rec, template_name, is_hp_record=False):
    """ Extend the record rec with the contents of the template and return it"""
    template = get_record_template(template_name)
    if not template:
        return
    template_bibrec = create_record(template)[0]
    # if the record is a holding pen record make all subfields volatile
    if is_hp_record:
        record_make_all_subfields_volatile(template_bibrec)
    for field_tag in template_bibrec:
        if not record_has_field(rec, field_tag):
            for field_instance in template_bibrec[field_tag]:
                record_add_field(rec, field_tag, field_instance[1],
                                 field_instance[2], subfields=field_instance[0])
        else:
            for template_field_instance in template_bibrec[field_tag]:
                subfield_codes_template = field_get_subfield_codes(template_field_instance)
                for field_instance in rec[field_tag]:
                    subfield_codes = field_get_subfield_codes(field_instance)
                    for code in subfield_codes_template:
                        if code not in subfield_codes:
                            field_add_subfield(field_instance, code,
                                               field_get_subfield_values(template_field_instance,
                                               code)[0])
    return rec
Example #10
def merge_record_with_template(rec, template_name, is_hp_record=False):
    """ Extend the record rec with the contents of the template and return it"""
    template = get_record_template(template_name)
    if not template:
        return
    template_bibrec = create_record(template)[0]
    # if the record is a holding pen record make all subfields volatile
    if is_hp_record:
        record_make_all_subfields_volatile(template_bibrec)
    for field_tag in template_bibrec:
        if not record_has_field(rec, field_tag):
            for field_instance in template_bibrec[field_tag]:
                record_add_field(rec,
                                 field_tag,
                                 field_instance[1],
                                 field_instance[2],
                                 subfields=field_instance[0])
        else:
            for template_field_instance in template_bibrec[field_tag]:
                subfield_codes_template = field_get_subfield_codes(
                    template_field_instance)
                for field_instance in rec[field_tag]:
                    subfield_codes = field_get_subfield_codes(field_instance)
                    for code in subfield_codes_template:
                        if code not in subfield_codes:
                            field_add_subfield(
                                field_instance, code,
                                field_get_subfield_values(
                                    template_field_instance, code)[0])
    record_order_subfields(rec)
    return rec
Example #11
def generate_ticket(ticket, record):
    """
    Generates a ticket to be created, filling subject, body and queue values
    of the passed BibCatalogTicket object. The enriched object is returned.

    @param ticket: a ticket object as created by BibCatalogTicket() containing
                   the subject, body and queue to create a ticket in.
    @type ticket: record object of BibCatalogTicket.

    @param record: a recstruct object as created by bibrecord.create_record()
    @type record: record object of BibRecord.

    @return: the modified ticket object to create.
    @rtype: BibCatalogTicket
    """
    recid = record_id_from_record(record)
    subject = []

    # Add report number in the subject
    report_number = ""
    for report_tag in record_get_field_instances(record, "037"):
        for report_number in field_get_subfield_values(report_tag, 'a'):
            subject.append(report_number)
            break

    subject.append("(#%s)" % (recid, ))
    text = 'Curate record here: %s/record/edit/#state=edit&recid=%s' % \
           (CFG_SITE_SECURE_URL, recid)

    ticket.subject = " ".join(subject)
    ticket.body = text.replace('%', '%%')
    ticket.queue = "HEP_curation"
    return ticket
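
The replace('%', '%%') on the body is presumably there because the ticket text is later run through %-style formatting (an assumption; the consumer is not shown here). What the escaping buys:

text = 'progress: 50% done'
escaped = text.replace('%', '%%')
assert escaped % () == text   # formatting the unescaped text would raise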
Example #12
def generate_ticket(ticket, record):
    """
    Generates a ticket to be created, filling subject, body and queue values
    of the passed BibCatalogTicket object. The enriched object is returned.

    @param ticket: a ticket object as created by BibCatalogTicket() containing
                   the subject, body and queue to create a ticket in.
    @type ticket: record object of BibCatalogTicket.

    @param record: a recstruct object as created by bibrecord.create_record()
    @type record: record object of BibRecord.

    @return: the modified ticket object to create.
    @rtype: BibCatalogTicket
    """
    recid = record_id_from_record(record)
    subject = []

    # Add report number in the subject
    report_number = ""
    for report_tag in record_get_field_instances(record, "037"):
        for report_number in field_get_subfield_values(report_tag, 'a'):
            subject.append(report_number)
            break

    subject.append("(#%s)" % (recid,))
    text = 'Curate record here: %s/record/edit/#state=edit&recid=%s' % \
           (CFG_SITE_SECURE_URL, recid)

    ticket.subject = " ".join(subject)
    ticket.body = text.replace('%', '%%')
    ticket.queue = "HEP_curation"
    return ticket
Example #13
def _create_ticket(recid, bibcatalog_system, queue):
    subject = "Refs for #%s" % recid

    if CFG_INSPIRE_SITE:
        # Add report number in the subject
        report_number = ""
        record = get_bibrecord(recid)

        in_core = False
        for collection_tag in record_get_field_instances(record, "980"):
            for collection in field_get_subfield_values(collection_tag, 'a'):
                if collection == 'CORE':
                    in_core = True
                if collection == 'arXiv':
                    # Do not create tickets for arxiv papers
                    # Tickets for arxiv papers are created in bibcatalog
                    write_message("arXiv paper", verbose=1)
                    return

        # Only create tickets for CORE papers
        if not in_core:
            write_message("not in CORE", verbose=1)
            return

        # Do not create tickets for old records
        creation_date = run_sql("""SELECT creation_date FROM bibrec
                                   WHERE id = %s""", [recid])[0][0]
        if creation_date < datetime.now() - timedelta(days=30*4):
            return

        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, 'c'):
                if category.startswith('astro-ph'):
                    write_message("astro-ph", verbose=1)
                    # We do not curate astro-ph
                    return

            for report_number in field_get_subfield_values(report_tag, 'a'):
                subject += " " + report_number
                break

    text = '%s/record/edit/#state=edit&recid=%s' % (CFG_SITE_SECURE_URL,
                                                    recid)
    bibcatalog_system.ticket_submit(subject=subject,
                                    queue=queue,
                                    text=text,
                                    recordid=recid)
Example #14
    def check_arxiv(recid):
        record = get_record(recid)

        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, 'a'):
                if category.startswith('arXiv'):
                    return True
        return False
Example #15
    def check_arxiv(recid):
        record = get_record(recid)

        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, 'a'):
                if category.startswith('arXiv'):
                    return True
        return False
Example #16
def has_field_origin(field_list, origin, code):
    """
    This function checks if any of the fields for a certain tag contains
    origin in given subfield code. I.e. $9 arXiv.
    """
    for field in field_list:
        if origin in field_get_subfield_values(field, code):
            return True
    return False
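
Hypothetical usage, assuming field_get_subfield_values from invenio.bibrecord is in scope and an invented 035 field:

fields_035 = [([('9', 'arXiv'), ('a', 'oai:arXiv.org:1234.5678')], ' ', ' ', '', 7)]
assert has_field_origin(fields_035, 'arXiv', '9') is True
assert has_field_origin(fields_035, 'INSPIRE', '9') is False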
Example #17
def has_field_origin(field_list, origin, code):
    """
    This function checks if any of the fields for a certain tag contains
    origin in given subfield code. I.e. $9 arXiv.
    """
    for field in field_list:
        if origin in field_get_subfield_values(field, code):
            return True
    return False
Example #18
def record_in_collection(record, collection):
    """
    Returns True/False if given record is in a given collection (980__a).
    """
    for collection_tag in record_get_field_instances(record, "980"):
        for coll in field_get_subfield_values(collection_tag, 'a'):
            if coll.lower() == collection.lower():
                return True
    return False
Example #19
def record_in_collection(record, collection):
    """
    Returns True/False if given record is in a given collection (980__a).
    """
    for collection_tag in record_get_field_instances(record, "980"):
        for coll in field_get_subfield_values(collection_tag, 'a'):
            if coll.lower() == collection.lower():
                return True
    return False
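
Hypothetical usage with a hand-built record dict (tag -> list of field instances, the structure bibrecord.create_record() returns); the matching is case-insensitive:

record = {'980': [([('a', 'HEP')], ' ', ' ', '', 1)]}
assert record_in_collection(record, 'hep') is True
assert record_in_collection(record, 'Thesis') is False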
Example #20
def check_pub_year_consistency(merged_record, type_check):
    """Function that checks if the publication year is consistent 
    with the year at the beginning of the bibcode"""
    logger.info('      running check_pub_year_consistency')
    #definition of the list of dates I don't want to check with this function
    dates_to_skip_from_check = ['date-preprint']
    try:
        system_number_fields = merged_record[FIELD_TO_MARC['system number']]
    except KeyError:
        manage_check_error('No System Number field!', type_check, logger)
        return None
    try:
        pub_dates_fields = merged_record[FIELD_TO_MARC['publication date']]
    except KeyError:
        manage_check_error('No Publication Date field!', type_check, logger)
        return None
    #the system number field should be unique, so if there is more than one field, I have a problem (and I cannot proceed)
    if len(system_number_fields) > 1:
        manage_check_error('There is more than one System Number!', type_check, logger)
        return None
    system_number = bibrecord.field_get_subfield_values(system_number_fields[0], SYSTEM_NUMBER_SUBFIELD)[0]
    num_dates_checked = 0
    for date_type_string in PUBL_DATE_TYPE_VAL_SUBFIELD:
        #I don't want to check the preprint date
        if date_type_string in dates_to_skip_from_check:
            continue
        #then I have to extract the right date (there can be different in the same field)
        pubdate = ''
        for field in pub_dates_fields:
            if bibrecord.field_get_subfield_values(field, PUBL_DATE_TYPE_SUBFIELD)[0] == date_type_string:
                pubdate = bibrecord.field_get_subfield_values(field, PUBL_DATE_SUBFIELD)[0]
                break
        if len(pubdate) != 0:
            num_dates_checked += 1
        else:
            continue
        #final part of the check
        if pubdate[0:4] != system_number[0:4]:
            manage_check_error('Year of "%s" not consistent with the main bibcode "%s"!' % (date_type_string, system_number), type_check, logger)
    if num_dates_checked == 0:
        manage_check_error('No dates available for this record!', type_check, logger)    
    return None
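
The final comparison works because ADS bibcodes begin with the four-digit publication year, so pubdate[0:4] and system_number[0:4] line up; with illustrative values:

pubdate = '2004-05-00'
bibcode = '2004MNRAS.348..897T'   # illustrative ADS bibcode
assert pubdate[0:4] == bibcode[0:4]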
Example #21
def format(bfo, limit, separator=' ; ', extension='[...]', print_links="yes"):
    """
    Prints the list of editors of a record.
    
    @param limit: the maximum number of editors to display
    @param separator: the separator between editors.
    @param extension: a text printed if more editors than 'limit' exist
    @param print_links: if yes, print the editors as HTML link to their publications
    """
    from urllib import quote
    from invenio.config import CFG_SITE_URL
    from invenio import bibrecord

    authors = bibrecord.record_get_field_instances(bfo.get_record(), '700')
    editors = [bibrecord.field_get_subfield_values(author, 'a')[0]
               for author in authors if \
               len(bibrecord.field_get_subfield_values(author, "e")) > 0 \
               and bibrecord.field_get_subfield_values(author, "e")[0]=="ed."
              ]
    if print_links.lower() == "yes":
        editors = ['<a href="' + CFG_SITE_URL + '/search?f=author&p=' + \
                   quote(editor) + \
                   '&amp;ln='+ bfo.lang + \
                   '">' + editor + '</a>'
                   for editor in editors if editor.strip()]

    if len(editors) == 0:
        beginning = ''
        ending = ''
    elif len(editors) == 1:
        beginning = ''
        ending = ' (ed.)'
    else:
        beginning = ''
        ending = ' (eds.)'

    if limit.isdigit() and len(editors) > int(limit):
        return beginning + separator.join(
            editors[:int(limit)]) + extension + ending

    elif len(editors) > 0:
        return beginning + separator.join(editors) + ending
Example #22
def check_arxiv_url(field, valid_arxiv_ids):
    url = field_get_subfield_values(field, 'u')
    if not url:
        return True
    url = url[0]
    # print 'url', url
    arxiv_id = extract_arxiv_id_from_url(url)
    # print 'id', arxiv_id
    if arxiv_id is None:
        return True
    else:
        return arxiv_id in valid_arxiv_ids
Example #23
def get_name_variants(record):
    """
    Return indexable values in the 410 field.
    """
    name_variants = set()
    if '410' in record:
        fields = bibrecord.record_get_field_instances(record, '410')
        for field in fields:
            values = bibrecord.field_get_subfield_values(field, 'a')
            if values:
                if 'ADS' in bibrecord.field_get_subfield_values(field, '9'):
                    # Always index field with source ADS.
                    for value in values:
                        name_variants.add(value.decode('utf_8'))
                else:
                    # Disregard uppercase space-separated fields.
                    for value in values:
                        if not re.match(r'\s*[A-Z]+\s[A-Z ]+$', value):
                            name_variants.add(value.decode('utf_8'))

    return list(name_variants)
Example #24
def get_name_variants(record):
    """
    Return indexable values in the 410 field.
    """
    name_variants = set()
    if '410' in record:
        fields = bibrecord.record_get_field_instances(record, '410')
        for field in fields:
            values = bibrecord.field_get_subfield_values(field, 'a')
            if values:
                if 'ADS' in bibrecord.field_get_subfield_values(field, '9'):
                    # Always index field with source ADS.
                    for value in values:
                        name_variants.add(value.decode('utf_8'))
                else:
                    # Disregard uppercase space-separated fields.
                    for value in values:
                        if not re.match(r'\s*[A-Z]+\s[A-Z ]+$', value):
                            name_variants.add(value.decode('utf_8'))

    return list(name_variants)
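
The regular expression in both variants skips all-uppercase, space-separated values (typically acronym expansions) unless the source is ADS; what it does and does not match:

import re
pattern = r'\s*[A-Z]+\s[A-Z ]+$'
assert re.match(pattern, 'IAU SYMPOSIUM')
assert re.match(pattern, 'Chandra X-ray Observatory') is None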
Example #25
def check_existing_pdg_fields(recids, pdg_data, current_records):
    _print_out("Comparing new and old PDG data for " + str(len(recids)) + " records...")
    records = {}
    for recid in recids:
        record_mod = {}
        record_mod['001'] = deepcopy(current_records[recid]['001'])
        record_mod['084'] = deepcopy(current_records[recid]['084'])
        fields = record_get_field_instances(record_mod, '084')
        current_pdg_data = []
        for field in fields:
            if is_pdg_field(field):
                current_pdg_data.append(field_get_subfield_values(field, 'a')[0])

        current_set = set(current_pdg_data)
        new_set = set(pdg_data[recid])
        deletions = list(current_set - new_set)
        additions = list(new_set - current_set)

        if len(deletions) > 0 or len(additions) > 0:
            if len(deletions) > 0:
                for field in fields:
                    if is_pdg_field(field):
                        if field_get_subfield_values(field, 'a')[0] in deletions:
                            record_delete_field(record_mod, '084', ind1=' ', ind2=' ',
                                                field_position_global=field[4])

            for pdg_field in additions:
                position = record_add_field(record_mod, '084', ' ', ' ')
                record_add_subfield_into(record_mod, '084', '2', 'PDG', field_position_global=position)
                record_add_subfield_into(record_mod, '084', '9', 'PDG', field_position_global=position)
                record_add_subfield_into(record_mod, '084', 'a', pdg_field, field_position_global=position)

            records[recid] = record_mod
            _print_verbose("Record #" + str(recid) + ": " + str(len(deletions)) +
                           " deletions and " + str(len(additions)) + " additons.")
        else:
            _print_verbose("Nothing to change for record #" + str(recid))

    _print_out(str(len(records)) + " records to be corrected.")
    return records
Example #26
def tarballs_by_recids(recids, sdir):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids.

    @param: recids (string): the record id or ids
    @param: sdir (string): where the tarballs should live

    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    list_of_ids = []

    if ',' in recids:
        recids = recids.split(',')
        for recid in recids:
            if '-' in recid:
                low, high = recid.split('-')
                recid = range(int(low), int(high))
                list_of_ids.extend(recid)
            else:
                recid = int(recid)
                list_of_ids.append(recid)

    else:
        if '-' in recids:
            low, high = recids.split('-')
            list_of_ids = range(int(low), int(high))
        else:
            list_of_ids = [int(recids)]

    arXiv_ids = []

    for recid in list_of_ids:
        rec = get_record(recid)
        for afieldinstance in record_get_field_instances(rec, tag='037'):
            if 'arXiv' == field_get_subfield_values(afieldinstance, '9')[0]:
                arXiv_id = field_get_subfield_values(afieldinstance, 'a')[0]
                arXiv_ids.append(arXiv_id)

    return tarballs_by_arXiv_id(arXiv_ids, sdir)
Example #27
def tarballs_by_recids(recids, sdir):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids.

    @param: recids (string): the record id or ids
    @param: sdir (string): where the tarballs should live

    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    list_of_ids = []

    if "," in recids:
        recids = recids.split(",")
        for recid in recids:
            if "-" in recid:
                low, high = recid.split("-")
                recid = range(int(low), int(high))
                list_of_ids.extend(recid)
            else:
                recid = int(recid)
                list_of_ids.append(recid)

    else:
        if "-" in recids:
            low, high = recid.split("-")
            list_of_ids = range(int(low), int(high))
        else:
            list_of_ids = int(recid)

    arXiv_ids = []

    for recid in list_of_ids:
        rec = get_record(recid)
        for afieldinstance in record_get_field_instances(rec, tag="037"):
            if "arXiv" == field_get_subfield_values(afieldinstance, "9")[0]:
                arXiv_id = field_get_subfield_values(afieldinstance, "a")[0]
                arXiv_ids.append(arXiv_id)

    return tarballs_by_arXiv_id(arXiv_ids, sdir)
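
The recid-string parsing shared by the two variants above, condensed into a standalone sketch; note that, like the originals, it expands "1-4" with range(int(low), int(high)), so the upper bound is excluded:

def parse_recids(recids):
    ids = []
    for chunk in recids.split(','):
        if '-' in chunk:
            low, high = chunk.split('-')
            ids.extend(range(int(low), int(high)))
        else:
            ids.append(int(chunk))
    return ids

assert parse_recids('7') == [7]
assert parse_recids('1-4,9') == [1, 2, 3, 9]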
Example #28
def create_ticket(recid, bibcatalog_system, queue=CFG_REFEXTRACT_TICKET_QUEUE):
    write_message('bibcatalog_system %s' % bibcatalog_system, verbose=1)
    write_message('queue %s' % queue, verbose=1)
    if bibcatalog_system and queue:

        subject = "Refs for #%s" % recid

        # Add report number in the subject
        report_number = ""
        record = get_bibrecord(recid)

        in_hep = False
        for collection_tag in record_get_field_instances(record, "980"):
            for collection in field_get_subfield_values(collection_tag, 'a'):
                if collection == 'HEP':
                    in_hep = True

        # Only create tickets for HEP
        if not in_hep:
            write_message("not in hep", verbose=1)
            return

        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, 'c'):
                if category.startswith('astro-ph'):
                    write_message("astro-ph", verbose=1)
                    # We do not curate astro-ph
                    return

            for report_number in field_get_subfield_values(report_tag, 'a'):
                subject += " " + report_number
                break

        text = '%s/record/edit/#state=edit&recid=%s' % (CFG_SITE_SECURE_URL, \
                                                        recid)
        bibcatalog_system.ticket_submit(subject=subject,
                                        queue=queue,
                                        text=text,
                                        recordid=recid)
Example #29
def format_element(bfo,
                   limit,
                   separator=' ; ',
                   extension='[...]',
                   print_links="yes"):
    """
    Prints the list of editors of a record.

    @param limit: the maximum number of editors to display
    @param separator: the separator between editors.
    @param extension: a text printed if more editors than 'limit' exist
    @param print_links: if yes, print the editors as HTML link
           to their publications
    """
    from urllib import quote
    from invenio.config import CFG_BASE_URL
    from invenio.bibrecord import field_get_subfield_values, \
        record_get_field_instances

    authors = record_get_field_instances(bfo.get_record(), '100') + \
        record_get_field_instances(bfo.get_record(), '700')
    editors = [
        field_get_subfield_values(author, 'a')[0] for author in authors
        if len(field_get_subfield_values(author, "e")) > 0
        and field_get_subfield_values(author, "e")[0] == "ed."
    ]

    if print_links.lower() == "yes":
        editors = [
            '<a href="%s/search?f=author&amp;p=%s&amp;ln=%s">%s</a>' %
            (CFG_BASE_URL, quote(editor), bfo.lang, editor)
            for editor in editors
        ]

    if limit.isdigit() and len(editors) > int(limit):
        return separator.join(editors[:int(limit)]) + extension

    elif len(editors) > 0:
        return separator.join(editors)
Example #30
def first_author_bibcode_consistency(merged_record, type_check):
    """Function that checks if the last letter of the main bibcode 
    is consistent with the first letter of the first author"""
    logger.info('      running first_author_bibcode_consistency')
    bibstems_to_skip_from_check = ['QB']
    try:
        system_number_fields = merged_record[FIELD_TO_MARC['system number']]
    except KeyError:
        manage_check_error('No System Number field!', type_check, logger)
        return None
    try:
        first_author_fields = merged_record[FIELD_TO_MARC['first author']]
    except KeyError:
        manage_check_error('No First Author field!', type_check, logger)
        return None
    #the system number field should be unique, so if there is more than one field, I have a problem (and I cannot proceed)
    if len(system_number_fields) > 1:
        manage_check_error('There is more than one System Number!', type_check, logger)
        return None
    #the first author field should be unique, so if there is more than one field, I have a problem (and I cannot proceed)
    if len(first_author_fields) > 1:
        manage_check_error('There is more than one First Author!', type_check, logger)
        return None
    system_number = bibrecord.field_get_subfield_values(system_number_fields[0], SYSTEM_NUMBER_SUBFIELD)[0]
    first_author = bibrecord.field_get_subfield_values(first_author_fields[0], AUTHOR_NAME_SUBFIELD)[0]
    #If the bibcode has a bibstem to skip, I don't do anything
    for elem in bibstems_to_skip_from_check:
        if system_number[4:4+len(elem)] == elem:
            return None
    if first_author[0].lower() != system_number[-1].lower():
        #if the last letter of the system number is a dot, then I want to give a different message
        if system_number[-1] == '.':
            manage_check_error('The main bibcode "%s" doesn\'t have an initial even if there is a First Author "%s"!' % (system_number, first_author), type_check, logger)
        else:
            manage_check_error('First Author "%s" not consistent with the main bibcode "%s"!' % (first_author, system_number), type_check, logger)
    return None
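
The check hinges on the ADS convention that a bibcode ends with the first author's initial; with invented values:

bibcode = '2004MNRAS.348..897T'   # illustrative: final character is the initial
first_author = 'Tsalmantza, P.'   # invented name for the sketch
assert first_author[0].lower() == bibcode[-1].lower()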
Example #31
def get_origin(fields):
    """function that extracts the origin of a field"""
    origins = set()
    for field in fields:
        origins.update(bibrecord.field_get_subfield_values(field, ORIGIN_SUBFIELD))

    if not origins:
        raise OriginNotFound(fields)
    elif len(origins) > 2:
        raise OriginNotFound(fields)

    origin = origins.pop().strip('; ')
    if not origin:
        raise OriginNotFound(fields)

    return origin
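
The strip('; ') call removes any mix of semicolons and spaces from both ends of the origin string, so a value made only of separators collapses to empty and triggers OriginNotFound:

assert 'ARXIV; '.strip('; ') == 'ARXIV'
assert '; ADS metadata ;'.strip('; ') == 'ADS metadata'
assert ' ; '.strip('; ') == ''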
Example #32
def papers_by_country_with_affs_csv(req, country):
    req.content_type = 'text/csv; charset=utf-8'
    req.headers_out['content-disposition'] = ('attachment; '
                                              'filename=papers_by_country.csv')

    ## print the list of links to the articles
    count = 1
    print >> req, country
    search = "100__w:'%s' OR 700__w:'%s'" % (country, country)
    res = perform_request_search(p='%s' % (search, ))
    print >> req, "#;Title;Journal;DOI;Inspire record;Author;Affiliations"
    if len(res):
        for rec_id in res:
            author_count = 11
            rec = get_record(rec_id)
            title = ''
            authors = ''
            journal = ''
            doi = ''
            inspire_record = ''
            if '245' in rec:
                title = re.sub("<.*?>", "", rec['245'][0][0][0][1])
            for sub in rec['773'][0][0]:
                if 'p' in sub[0]:
                    journal = sub[1]
            doi = get_doi(rec_id)
            if '035' in rec:
                for f in rec['035'][0][0]:
                    if 'a' in f:
                        inspire_record = 'http://inspirehep.net/record/%s' % (
                            f[1], )
            print >> req, "%s;%s;%s;%s;%s;;" % (count, title, journal, doi,
                                                inspire_record)
            if '100' in rec:
                author = rec['100'][0][0][0][1]
                affiliations = record_get_field_values(rec,
                                                       tag='100',
                                                       code='v')
                print >> req, ";;;;;%s;%s" % (author, " | ".join(affiliations))
            if '700' in rec:
                for auth in rec['700']:
                    author = auth[0][0][1]
                    affiliations = field_get_subfield_values(auth, code='v')
                    print >> req, ";;;;;%s;%s" % (author,
                                                  " | ".join(affiliations))
            count += 1
Example #33
def check_duplicate_normalized_author_names(fields1, fields2, final_result, type_check, subfield_list, tag):
    """
    Checks if there are authors with the same normalized name. This will
    prevent the correct matching of authors from one author list to the other.
    """
    logger.info("        running check_duplicate_normalized_author_names")

    author_names = set()
    for field in final_result:
        author = bibrecord.field_get_subfield_values(field, AUTHOR_NORM_NAME_SUBFIELD)[0]
        if author in author_names:
            # I don't raise an error if I have duplicated normalized author names,
            # I simply return the trusted list
            manage_check_error(
                'Duplicated normalized author name for "%s" in field "%s".' % (author, tag), type_check, logger
            )
        else:
            author_names.add(author)
    return None
Example #34
def pub_date_merger(fields1, fields2, tag):
    """function to merge dates. the peculiarity of this merge is that 
    we need to create a new field based on which date is available"""
    all_dates = take_all_no_checks(fields1, fields2, tag)
    if len(all_dates) > 0:
        #removing the main-date if present
        for date in all_dates:
            if bibrecord.field_get_subfield_values(date, PUBL_DATE_TYPE_SUBFIELD)[0] == 'main-date':
                logger.info('        Main date already available: trying to re-create it')
                del(all_dates[all_dates.index(date)])
                break
        #I need to extract the best date available
        main_pub_date = None
        main_pub_date_primary = 'False'
        #first I try to extract it from the canonical metadata
        done = False
        for date_type in PUBL_DATE_TYPE_VAL_SUBFIELD:
            if done:
                break
            for date in all_dates:
                if bibrecord.field_get_subfield_values(date, PUBL_DATE_TYPE_SUBFIELD)[0] == date_type and bibrecord.field_get_subfield_values(date, PRIMARY_METADATA_SUBFIELD)[0] == 'True':
                    main_pub_date = bibrecord.field_get_subfield_values(date, PUBL_DATE_SUBFIELD)[0]
                    main_pub_date_primary = 'True'
                    done = True
                    break
        #if I'm not successful I try with a normal metadata
        if main_pub_date is None:
            done = False
            for date_type in PUBL_DATE_TYPE_VAL_SUBFIELD:
                if done:
                    break
                for date in all_dates:
                    if bibrecord.field_get_subfield_values(date, PUBL_DATE_TYPE_SUBFIELD)[0] == date_type:
                        main_pub_date = bibrecord.field_get_subfield_values(date, PUBL_DATE_SUBFIELD)[0]
                        done = True
                        break
        #if I still don't have a main date it means that I have a date that is not in the list of expected dates
        #so I take the first one
        #P.S. I should never get to this point
        if main_pub_date is None:
            logger.info('        All the dates available are not recognized as good for a main date: picking the first available')
            main_pub_date = bibrecord.field_get_subfield_values(all_dates[0], PUBL_DATE_SUBFIELD)[0]

        #finally I append the main date to the list of dates
        all_dates.append(([(PUBL_DATE_SUBFIELD, main_pub_date), (PUBL_DATE_TYPE_SUBFIELD, 'main-date'), (ORIGIN_SUBFIELD, 'ADS metadata'), 
                           (PRIMARY_METADATA_SUBFIELD, main_pub_date_primary)],) + all_dates[0][1:])
        return all_dates
    else:
        return all_dates
Example #35
def merge_record_with_template(rec, template_name):
    """ Extend the record rec with the contents of the template and return it"""
    template = get_record_template(template_name)
    template_bibrec = create_record(template)[0]

    for field_tag in template_bibrec:
        if not record_has_field(rec, field_tag):
            for field_instance in template_bibrec[field_tag]:
                record_add_field(rec, field_tag, field_instance[1], field_instance[2], subfields=field_instance[0])
        else:
            for template_field_instance in template_bibrec[field_tag]:
                subfield_codes_template = field_get_subfield_codes(template_field_instance)
                for field_instance in rec[field_tag]:
                    subfield_codes = field_get_subfield_codes(field_instance)
                    for code in subfield_codes_template:
                        if code not in subfield_codes:
                            field_add_subfield(
                                field_instance, code, field_get_subfield_values(template_field_instance, code)[0]
                            )
    return rec
Example #36
def papers_by_country_with_affs_csv(req, country):
    req.content_type = 'text/csv; charset=utf-8'
    req.headers_out['content-disposition'] = ('attachment; '
                                              'filename=papers_by_country.csv')

    ## print the list of links to the articles
    count = 1
    print >> req, country
    search = "100__w:'%s' OR 700__w:'%s'" % (country, country)
    res = perform_request_search(p='%s' % (search,))
    print >> req, "#;Title;Journal;DOI;Inspire record;Author;Affiliations"
    if len(res):
        for rec_id in res:
            author_count = 11
            rec = get_record(rec_id)
            title = ''
            authors = ''
            journal = ''
            doi = ''
            inspire_record = ''
            if '245' in rec:
                title = re.sub("<.*?>", "", rec['245'][0][0][0][1])
            for sub in rec['773'][0][0]:
                if 'p' in sub[0]:
                    journal = sub[1]
            doi = get_doi(rec_id)
            if '035' in rec:
                for f in rec['035'][0][0]:
                    if 'a' in f:
                        inspire_record = 'http://inspirehep.net/record/%s' % (f[1],)
            print >> req, "%s;%s;%s;%s;%s;;" % (count, title, journal, doi, inspire_record)
            if '100' in rec:
                author = rec['100'][0][0][0][1]
                affiliations = record_get_field_values(rec, tag='100', code='v')
                print >> req, ";;;;;%s;%s" % (author, " | ".join(affiliations))
            if '700' in rec:
                for auth in rec['700']:
                    author = auth[0][0][1]
                    affiliations = field_get_subfield_values(auth, code='v')
                    print >> req, ";;;;;%s;%s" % (author, " | ".join(affiliations))
            count += 1
Example #37
def extend_author_field(author_field, cds_id):
    """Extend author datafield by CDS authority id and Beard tag.

    Extends the author datafield by the MARC subfields
        $$0:AUTHOR|(CDS)<cds_id>
        $$9:#BEARD#
    if $$0:AUTHOR|(CDS)<cds_id> does not exist in `author_field`.

    :param author_field:
        Example:
            # from invenio.search_engine import get_record
            # from invenio.bibrecord import record_get_field_instances
            # record = get_record(2150939)
            # author_field = record_get_field_instances(record, "100")[0]
            author_field = ([('a', 'Ellis, John'),
                             ('u', "King's Coll. London"),
                             ('u', 'CERN')], ' ', ' ', '', 32)
    :param str cds_id: sequence of numbers representing the CDS id
        Example:
            cds_id = '2108556'

    :result:
        Example:
            author_field = ([('a', 'Ellis, John'),
                             ('u', "King's Coll. London"),
                             ('u', 'CERN'),
                             ('0', 'AUTHOR|(CDS)2108556'),
                             ('9', '#BEARD#')], ' ', ' ', '', 32)

    :return: True, if `author_field` has been updated, False otherwise
    """
    cds_authority_id = "AUTHOR|(CDS){0}".format(cds_id)
    if cds_authority_id not in field_get_subfield_values(author_field, '0'):
        field_add_subfield(author_field, "0", cds_authority_id)
        field_add_subfield(author_field, "9", "#BEARD#")
        return True

    return False
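
A worked run of the docstring example, assuming field_get_subfield_values and field_add_subfield from invenio.bibrecord are in scope (field_add_subfield appends the (code, value) pair to the subfield list in place):

author_field = ([('a', 'Ellis, John'),
                 ('u', "King's Coll. London"),
                 ('u', 'CERN')], ' ', ' ', '', 32)
assert extend_author_field(author_field, '2108556') is True
assert ('0', 'AUTHOR|(CDS)2108556') in author_field[0]
assert ('9', '#BEARD#') in author_field[0]
# A second call with the same id is a no-op:
assert extend_author_field(author_field, '2108556') is False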
Example #38
            recid = retrieve_rec_id(record, "")

        if not recid or recid == -1:
            # Try again with p_r_s
            arxiv_id = get_minimal_arxiv_id(record)
            if arxiv_id:
                results = perform_request_search(p="reportnumber:%s" %
                                                 (arxiv_id, ),
                                                 of='id')
                if len(results) > 0:
                    # FIXME: Ambiguous results may happen. Now just taking first result..
                    recid = results[0]

        # 773 RefExtract PubNote extraction
        for field in record_get_field_instances(record, '773'):
            for value in field_get_subfield_values(field, 'x'):
                extract = extract_journal_reference(value)
                if extract:
                    subfields = [('x', value)]
                    if extract.get('volume', False):
                        subfields.append(('v', str(extract['volume'])))
                    if extract.get('title', False):
                        subfields.append(('p', str(extract['title'])))
                    if extract.get('year', False):
                        subfields.append(('y', str(extract['year'])))
                    if extract.get('page', False):
                        subfields.append(('c', str(extract['page'])))
                    new_field = create_field(subfields,
                                             global_position=field[4])
                    record_replace_field(record, '773', new_field, field[4])
                    break
Example #39
def task_run_core():
    """ Performs a search to find records without a texkey, generates a new
    one and uploads the changes in chunks """
    recids = perform_request_search(p='-035:spirestex -035:inspiretex', cc='HEP')

    write_message("Found %s records to assign texkeys" % len(recids))
    processed_recids = []
    xml_to_process = []
    for count, recid in enumerate(recids):
        write_message("processing recid %s" % recid)

        # Check that the record does not have already a texkey
        has_texkey = False
        recstruct = get_record(recid)
        for instance in record_get_field_instances(recstruct, tag="035", ind1="", ind2=""):
            try:
                provenance = field_get_subfield_values(instance, "9")[0]
            except IndexError:
                provenance = ""
            try:
                value = field_get_subfield_values(instance, "z")[0]
            except IndexError:
                try:
                    value = field_get_subfield_values(instance, "a")[0]
                except IndexError:
                    value = ""
            provenances = ["SPIRESTeX", "INSPIRETeX"]
            if provenance in provenances and value:
                has_texkey = True
                write_message("INFO: Record %s has already texkey %s" % (recid, value))

        if not has_texkey:
            TexKeySeq = TexkeySeq()
            new_texkey = ""
            try:
                new_texkey = TexKeySeq.next_value(recid)
            except TexkeyNoAuthorError:
                write_message("WARNING: Record %s has no first author or collaboration" % recid)
                continue
            except TexkeyNoYearError:
                write_message("WARNING: Record %s has no year" % recid)
                continue
            write_message("Created texkey %s for record %d" % (new_texkey, recid))
            xml = create_xml(recid, new_texkey)
            processed_recids.append(recid)
            xml_to_process.append(xml)

        task_update_progress("Done %d out of %d." % (count, len(recids)))
        task_sleep_now_if_required()

    # sequence ID to be used in all subsequent tasks
    sequence_id = str(random.randrange(1, 4294967296))
    if xml_to_process:
        process_chunk(xml_to_process, sequence_id)

    # Finally, index all the records processed
    #FIXME: Waiting for sequence id to be fixed
    # if processed_recids:
    #     submit_bibindex_task(processed_recids, sequence_id)

    return True
Example #40
            recid = retrieve_rec_id(record, "")

        if not recid or recid == -1:
            # Try again with p_r_s
            arxiv_id = get_minimal_arxiv_id(record)
            if arxiv_id:
                results = perform_request_search(p="reportnumber:%s" %
                                                 (arxiv_id, ),
                                                 of='id')
                if len(results) > 0:
                    # FIXME: Ambiguous results may happen. Now just taking first result..
                    recid = results[0]

        # 773 RefExtract PubNote extraction
        for field in record_get_field_instances(record, '773'):
            for value in field_get_subfield_values(field, 'x'):
                extract = extract_journal_reference(value)
                if extract:
                    subfields = [('x', value)]
                    if extract.get('volume', False):
                        subfields.append(('v', str(extract['volume'])))
                    if extract.get('title', False):
                        subfields.append(('p', str(extract['title'])))
                    if extract.get('year', False):
                        subfields.append(('y', str(extract['year'])))
                    if extract.get('page', False):
                        subfields.append(('c', str(extract['page'])))
                    new_field = create_field(subfields,
                                             global_position=field[4])
                    record_replace_field(record, '773', new_field, field[4])
                    break
Example #41
def has_field_origin(field_list, origin, code):
    """Check if any of the fields for a certain tag contains  origin in given subfield code."""
    for field in field_list:
        if origin in field_get_subfield_values(field, code):
            return True
    return False
Example #42
def tarballs_by_recids(recids,
                       sdir,
                       docname=None,
                       doctype=None,
                       docformat=None):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids. By default look for files with names matching
    the report number and with source field 'arXiv'. This can be changed
    with C{docname}, C{doctype}, C{docformat}

    @param: recids (string): the record id or ids
    @param: sdir (string): where the tarballs should live
    @param docname: select tarball for given recid(s) that match docname
    @param doctype: select tarball for given recid(s) that match doctype
    @param docformat: select tarball for given recid(s) that match docformat
    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    if not recids:
        return []

    list_of_ids = []

    if ',' in recids:
        recids = recids.split(',')
        for recid in recids:
            if '-' in recid:
                low, high = recid.split('-')
                recid = range(int(low), int(high))
                list_of_ids.extend(recid)
            else:
                recid = int(recid)
                list_of_ids.append(recid)

    else:
        if '-' in recids:
            low, high = recids.split('-')
            list_of_ids = range(int(low), int(high))
        else:
            list_of_ids = [int(recids)]

    arXiv_ids = []
    local_files = []
    for recid in list_of_ids:
        rec = get_record(recid)
        if not doctype and not docname and not docformat:
            for afieldinstance in record_get_field_instances(rec, tag='037'):
                if field_get_subfield_values(afieldinstance, '9')[:1] == ['arXiv']:
                    arXiv_id = field_get_subfield_values(afieldinstance, 'a')[0]
                    arXiv_ids.append(arXiv_id)
        else:
            bibarchive = BibRecDocs(recid)
            all_files = bibarchive.list_latest_files()
            if doctype:
                all_files = [
                    docfile for docfile in all_files
                    if docfile.get_type() == doctype
                ]
            if docname:
                all_files = [
                    docfile for docfile in all_files
                    if docfile.get_name() == docname
                ]
            if docformat:
                all_files = [
                    docfile for docfile in all_files
                    if docfile.get_format() == docformat
                ]
            local_files.extend([(docfile.get_path(), recid)
                                for docfile in all_files])

    if doctype or docname or docformat:
        return local_files

    return tarballs_by_arXiv_id(arXiv_ids, sdir)
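
A usage sketch for the accepted recid-string forms (record ids and the scratch directory are hypothetical):

# A single recid, a comma-separated list, and an inclusive range all work:
tarballs = tarballs_by_recids('1234', '/tmp/tarballs')
tarballs = tarballs_by_recids('1234,1300-1310', '/tmp/tarballs')
# With docname/doctype/docformat set, (path, recid) tuples of local files
# are returned instead of tarball locations:
local_files = tarballs_by_recids('1234', '/tmp/tarballs', doctype='arXiv')
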
                                        "9") and has_field_origin(
                                            existing_field_list, "arXiv", "9"):
                        fields_to_correct.append((tag, [field]))
                    else:
                        holdingpen = True
                    # Check for duplicates and add title update as 246
                    field_list_246 = record_get_field_instances(
                        existing_record, "246")
                    if not has_field(field, field_list_246):
                        fields_to_add.append(("246", [field]))
                else:
                    corrected_fields = []
                    if has_field_origin(new_field_list, "arXiv", "9") \
                       and has_field_origin(existing_field_list, "arXiv", "9"):
                        for field in existing_field_list:
                            if not "arXiv" in field_get_subfield_values(
                                    field, "9"):
                                corrected_fields.append(field)
                        for field in new_field_list:
                            if not has_field(field, corrected_fields):
                                corrected_fields.append(field)

                    action = get_action(tag, diff_code, action_dict)
                    if action == 'holdingpen' and not holdingpen:
                        holdingpen = True

                    if action == 'correct' or len(corrected_fields) > 0:
                        if len(corrected_fields) == 0:
                            corrected_fields = new_field_list
                        fields_to_correct.append((tag, corrected_fields))

                    if action == 'append':
                        # Truncated in the source; plausibly mirrors the
                        # 'correct' branch above:
                        fields_to_add.append((tag, new_field_list))
Example No. 46
def is_pdg_field(field):
    """Return True when both subfields $2 and $9 mark the field as PDG."""
    # Slicing avoids an IndexError when either subfield is missing.
    return (field_get_subfield_values(field, '2')[:1] == ['PDG'] and
            field_get_subfield_values(field, '9')[:1] == ['PDG'])
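
A tiny call sketch; the field tuple follows bibrecord's (subfields, ind1, ind2, value, global_position) layout, and the subfield values are hypothetical:

field = ([('2', 'PDG'), ('9', 'PDG'), ('a', 'S027M')], ' ', ' ', '', 7)
assert is_pdg_field(field)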
Example No. 48
def update_record(record_id, authors):
    """Update authors in CDS record.

    :param int record_id: record to update author datafields
        Example:
            record_id = 2150939
    :param dict authors: dictionary where keys are author full names and
        values the CDS profile ids to be updated in the given record
        Example:
            authors = {'Ellis, John': '2108556'}

    :return: string representing the record XML element containing
        author (`100`) and/or co-author (`700`) datafields. Empty string if
        nothing to update
        Example:
            '<record>
                <controlfield tag="001">2150939</controlfield>
                <datafield tag="100" ind1=" " ind2=" ">
                    <subfield code="a">Ellis, John</subfield>
                    <subfield code="u">King's Coll. London</subfield>
                    <subfield code="u">CERN</subfield>
                    <subfield code="0">AUTHOR|(CDS)2108556</subfield>
                    <subfield code="9">#BEARD#</subfield>
                </datafield>
            </record>'
    """
    record = get_record(record_id)
    record_author = record_get_field_instances(record, "100")
    record_coauthors = record_get_field_instances(record, "700")

    if len(record_author) > 1:
        print ("Oops: several '100' (main author) fields have been found in "
               "record '{0}'".format(record_id))
        return ""

    datafields = ""
    author = False
    for author_field in record_author:
        try:
            author_name = field_get_subfield_values(author_field, 'a')[0]
            try:
                cds_id = authors[author_name]
                if extend_author_field(author_field, cds_id):
                    datafields += field_xml_output(author_field, "100")
                    author = True
            except KeyError:
                pass
        except IndexError:
            # Author field (`100`) does not have subfield `a`
            pass

    if len(authors) > 1 or not author:
        for coauthor_field in record_coauthors:
            try:
                coauthor_name = field_get_subfield_values(
                    coauthor_field, 'a')[0]
                try:
                    cds_id = authors[coauthor_name]
                    if extend_author_field(coauthor_field, cds_id):
                        author = True
                except KeyError:
                    pass
            except IndexError:
                # Co-author field (`700`) does not have subfield `a`
                pass
            datafields += field_xml_output(coauthor_field, "700")

    # Nothing to update
    if not author:
        # print "No authors to update in record '{0}'".format(record_id)
        return ""

    record = ('<record><controlfield tag="001">{0}</controlfield>{1}'
              '</record>'.format(record_id, datafields))
    return record
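
A minimal call sketch reusing the values from the docstring example:

xml = update_record(2150939, {'Ellis, John': '2108556'})
if xml:
    # Hand the XML over for upload; the follow-up step is outside this snippet.
    print(xml)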
Example No. 49
def task_run_core(name=NAME):
    """ Performs a search to find records without a texkey, generates a new
    one and uploads the changes in chunks """
    recids = task_get_task_param('recids')
    if recids:
        start_date = None
        write_message("processing recids from commandline")
    else:
        start_date = datetime.now()
        recids = intbitset()
        recids |= intbitset(
            perform_request_search(p='-035:spirestex -035:inspiretex',
                                   cc='HEP'))

        if task_get_task_param('all'):
            write_message("processing all records without texkey")
        else:
            _, last_date = fetch_last_updated(name)
            recids = recids & fetch_records_modified_since(last_date)
            write_message("processing records modified since: %s" % last_date)

    write_message("Found %s records to assign texkeys" % len(recids))
    processed_recids = []
    xml_to_process = []
    for count, recid in enumerate(recids):
        write_message("processing recid %s" % recid)

        # Check that the record does not have already a texkey
        has_texkey = False
        recstruct = get_record(recid)
        for instance in record_get_field_instances(recstruct,
                                                   tag="035",
                                                   ind1="",
                                                   ind2=""):
            try:
                provenance = field_get_subfield_values(instance, "9")[0]
            except IndexError:
                provenance = ""
            try:
                value = field_get_subfield_values(instance, "a")[0]
            except IndexError:
                value = ""
            provenances = ["SPIRESTeX", "INSPIRETeX"]
            if provenance in provenances and value:
                has_texkey = True
                write_message("INFO: Record %s has already texkey %s" %
                              (recid, value))

        if not has_texkey:
            texkey_seq = TexkeySeq()
            new_texkey = ""
            try:
                new_texkey = texkey_seq.next_value(recid)
            except TexkeyNoAuthorError:
                write_message(
                    "WARNING: Record %s has no first author or collaboration" %
                    recid)
                continue
            except TexkeyNoYearError:
                write_message("WARNING: Record %s has no year" % recid)
                continue
            write_message("Created texkey %s for record %d" %
                          (new_texkey, recid))
            xml = create_xml(recid, new_texkey)
            processed_recids.append(recid)
            xml_to_process.append(xml)

        task_update_progress("Done %d out of %d." % (count + 1, len(recids)))
        task_sleep_now_if_required()

    # sequence ID to be used in all subsequent tasks
    sequence_id = str(random.randrange(1, 4294967296))
    if xml_to_process:
        process_chunk(xml_to_process, sequence_id)

    # Finally, index all the records processed
    # FIXME: Waiting for sequence id to be fixed
    # if processed_recids:
    #     submit_bibindex_task(processed_recids, sequence_id)

    if start_date:
        store_last_updated(0, start_date, name)

    return True
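
The 035 provenance check above is a self-contained pattern; a hedged refactoring sketch (the helper name is mine, not from the source):

def record_has_texkey(recstruct):
    """Return True if an 035 field already carries a SPIRESTeX/INSPIRETeX key."""
    for instance in record_get_field_instances(recstruct, tag="035",
                                               ind1="", ind2=""):
        try:
            provenance = field_get_subfield_values(instance, "9")[0]
            value = field_get_subfield_values(instance, "a")[0]
        except IndexError:
            continue
        if provenance in ("SPIRESTeX", "INSPIRETeX") and value:
            return True
    return False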
Example No. 50
def merge_creation_modification_dates(merged_record):
    """Grab all the origins in the merged record and create a merged version
    of the creation and modification date based only on the origins found."""
    # Work on a local copy to avoid side effects on the caller's record.
    record = deepcopy(merged_record)
    # Extract all the creation and modification dates.
    try:
        creat_mod = record[FIELD_TO_MARC['creation and modification date']]
    except KeyError:
        logger.warning('      No Creation-Modification field available!')
        return record
    # Then extract all the origins from all the fields except the creation
    # and modification date itself.
    origins = []
    for field_code in record:
        if field_code != FIELD_TO_MARC['creation and modification date']:
            for field in record[field_code]:
                try:
                    origin = bibrecord.field_get_subfield_values(field, ORIGIN_SUBFIELD)[0]
                    if origin != '':
                        origins.append(origin)
                # A field without an origin is a problem, but it is not
                # handled here.
                except IndexError:
                    pass
    # Deduplicate the list of origins.
    origins = list(set(origins))
    # Then, for each field in the creation and modification date, check if it
    # has an origin used in other fields and, if so, update the merged
    # creation and modification dates.
    new_creation_modification_date = {}
    for field in creat_mod:
        try:
            origin = bibrecord.field_get_subfield_values(field, ORIGIN_SUBFIELD)[0]
        except IndexError:
            origin = ''

        if origin in origins:
            # The creation and modification date has to be set or updated.
            if len(new_creation_modification_date) == 0:
                # No date collected yet: simply insert this field's values.
                new_creation_modification_date[CREATION_DATE_SUBFIELD] = bibrecord.field_get_subfield_values(field, CREATION_DATE_SUBFIELD)[0]
                new_creation_modification_date[MODIFICATION_DATE_SUBFIELD] = bibrecord.field_get_subfield_values(field, MODIFICATION_DATE_SUBFIELD)[0]
                new_creation_modification_date[ORIGIN_SUBFIELD] = origin
                new_creation_modification_date['origin_importance'] = get_origin_importance(FIELD_TO_MARC['creation and modification date'], origin)
            else:
                # Otherwise keep the oldest creation date and the newest
                # modification date.
                old_creation = new_creation_modification_date[CREATION_DATE_SUBFIELD]
                old_modification = new_creation_modification_date[MODIFICATION_DATE_SUBFIELD]
                new_creation = bibrecord.field_get_subfield_values(field, CREATION_DATE_SUBFIELD)[0]
                new_modification = bibrecord.field_get_subfield_values(field, MODIFICATION_DATE_SUBFIELD)[0]

                new_creation_modification_date[CREATION_DATE_SUBFIELD] = old_creation if old_creation <= new_creation else new_creation
                new_creation_modification_date[MODIFICATION_DATE_SUBFIELD] = old_modification if old_modification >= new_modification else new_modification
                # Finally, keep the most trusted origin.
                old_origin = new_creation_modification_date[ORIGIN_SUBFIELD]
                old_origin_import = new_creation_modification_date['origin_importance']
                new_origin_import = get_origin_importance(FIELD_TO_MARC['creation and modification date'], origin)
                new_creation_modification_date[ORIGIN_SUBFIELD] = old_origin if old_origin_import >= new_origin_import else origin
                new_creation_modification_date['origin_importance'] = old_origin_import if old_origin_import >= new_origin_import else new_origin_import
    # Then update the field (only if a merged date was actually built, to
    # avoid a KeyError on records where no origin matched).
    if new_creation_modification_date:
        record[FIELD_TO_MARC['creation and modification date']] = [([(MODIFICATION_DATE_SUBFIELD, new_creation_modification_date[MODIFICATION_DATE_SUBFIELD]),
                                   (CREATION_DATE_SUBFIELD, new_creation_modification_date[CREATION_DATE_SUBFIELD]),
                                   (ORIGIN_SUBFIELD, new_creation_modification_date[ORIGIN_SUBFIELD])], ) + creat_mod[0][1:]]
    return record
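
Reduced to plain values, the merge rule above keeps the oldest creation date and the newest modification date; a sketch assuming ISO-formatted dates, which compare correctly as strings:

pairs = [('2010-01-02', '2012-05-01'),   # (creation, modification) per origin
         ('2009-11-30', '2011-07-14')]
merged_creation = min(p[0] for p in pairs)        # '2009-11-30'
merged_modification = max(p[1] for p in pairs)    # '2012-05-01'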
Example No. 52
def convert_record(rec):
    "Convert a single REC record to JSON."

    jrec = {}

    # recid / 001
    recid = record_get_field_value(rec, tag="001")
    if recid:
        jrec['recid'] = recid

    # doi / 0247 $2 DOI
    dois = filter_field_instances(record_get_field_instances(rec, tag="024", ind1="7"),
                                  "2", "DOI")
    if dois:
        jrec['doi'] = field_get_subfield_values(dois[0], "a")[0]

    # CMS ConfDB ID / 035
    cmsconfdbids = filter_field_instances(record_get_field_instances(rec, tag="035"),
                                          "9", "CMS-ConfDB")
    if cmsconfdbids:
        jrec['cms_confdb_id'] = field_get_subfield_values(cmsconfdbids[0], "a")[0]

    # report number / 037

    # authors / 100, 700
    authors = []
    for field_instance in record_get_field_instances(rec, tag="100") + record_get_field_instances(rec, tag="700"):
        author = {}
        author_names = field_get_subfield_values(field_instance, "a")
        if author_names:
            author['name'] = author_names[0]
        author_ccids = field_get_subfield_values(field_instance, "h")
        if author_ccids:
            author['ccid'] = author_ccids[0]
        author_inspireids = field_get_subfield_values(field_instance, "i")
        if author_inspireids:
            author['inspireid'] = author_inspireids[0]
        author_affiliations = field_get_subfield_values(field_instance, "u")
        if author_affiliations:
            author['affiliation'] = author_affiliations[0]
        authors.append(author)
    if authors:
        jrec['authors'] = authors

    # collaboration / 110, 710
    collaboration_name = record_get_field_value(rec, tag="110", code="a")
    collaboration_name_additionals = record_get_field_values(rec, tag="710", code="a")
    collaboration_group = record_get_field_value(rec, tag="110", code="g")
    collaboration_recid = record_get_field_value(rec, tag="110", code="w")
    if collaboration_name or collaboration_group or collaboration_recid:
        collaboration = {}
        if collaboration_name:
            for collaboration_name_additional in collaboration_name_additionals:
                collaboration_name += ' and ' + collaboration_name_additional
            collaboration['name'] = collaboration_name
        if collaboration_group:
            collaboration['group'] = collaboration_group
        if collaboration_recid:
            collaboration['recid'] = collaboration_recid
        jrec['collaboration'] = collaboration

    # title / 245 $a
    title = record_get_field_value(rec, tag="245", code="a")
    if title:
        jrec['title'] = title

    # title subtitle / 245 $b
    title_subtitle = record_get_field_value(rec, tag="245", code="b")
    if title_subtitle:
        jrec['title_subtitle'] = title_subtitle

    # title additional / 246 $a
    title_additional = record_get_field_value(rec, tag="246", code="a")
    if title_additional:
        jrec['title_additional'] = title_additional

    # title additional subtitle / 246 $b
    title_additional_subtitle = record_get_field_value(rec, tag="246", code="b")
    if title_additional_subtitle:
        jrec['title_additional_subtitle'] = title_additional_subtitle

    # publisher / 260
    publisher = record_get_field_value(rec, tag="260", code="b")
    if publisher:
        jrec['publisher'] = publisher

    # date_published / 260
    date_published = record_get_field_value(rec, tag="260", code="c")
    if date_published:
        jrec['date_published'] = date_published

    # date_created, date_reprocessed / 264
    date_created = record_get_field_value(rec, tag="264", ind2="0", code="c")
    if date_created:
        jrec['date_created'] = date_created

    # date_reprocessed / 960
    date_reprocessed = record_get_field_value(rec, tag="960", code="c")
    if date_reprocessed:
        jrec['date_reprocessed'] = date_reprocessed

    # prepublication with reportnumber / 269, 037
    prepublication = {}
    for field_instance in record_get_field_instances(rec, tag="269"):
        prepublication_places = field_get_subfield_values(field_instance, "a")
        if prepublication_places:
            prepublication['place'] = prepublication_places[0]
        prepublication_publishers = field_get_subfield_values(field_instance, "b")
        if prepublication_publishers:
            prepublication['publisher'] = prepublication_publishers[0]
        prepublication_dates = field_get_subfield_values(field_instance, "c")
        if prepublication_dates:
            prepublication_time = time.strptime(prepublication_dates[0], "%d %b %Y")
            prepublication['date'] = time.strftime("%Y-%m-%d", prepublication_time)
        prepublication_reportnumber = record_get_field_value(rec, tag="037", code="a")
        if prepublication_reportnumber:
            prepublication['report_number'] = prepublication_reportnumber
    if prepublication:
        jrec['prepublication'] = prepublication

    # pileup / 770
    pileup = {}
    pileup_description = record_get_field_value(rec, tag="770", code="i")
    if pileup_description:
        pileup = {'description': pileup_description}
        pileup_links = []
        for field_instance in record_get_field_instances(rec, tag="770"):
            field_instance_recids = field_get_subfield_values(field_instance, 'w')
            field_instance_titles = field_get_subfield_values(field_instance, 'a')
            pileup_link = {}
            if field_instance_recids:
                pileup_link['recid'] = field_instance_recids[0]
            if field_instance_titles:
                pileup_link['title'] = field_instance_titles[0]
            if pileup_link:
                if 'links' in pileup:
                    pileup['links'].append(pileup_link)
                else:
                    pileup['links'] = [pileup_link, ]
        jrec['pileup'] = pileup

    # extent / 300
    extent = record_get_field_value(rec, tag="300", code="a")
    if extent and False:  # we decided not to retain extent field in COD3
        jrec['extent'] = extent

    # dataset_semantics / 505
    dataset_semantics = []
    for field_instance in record_get_field_instances(rec, tag="505"):
        entry = {}
        entry_variables = field_get_subfield_values(field_instance, "t")
        if entry_variables:
            entry['variable'] = entry_variables[0]
        entry_descriptions = field_get_subfield_values(field_instance, "g")
        if entry_descriptions:
            entry['description'] = entry_descriptions[0]
        dataset_semantics.append(entry)
    if dataset_semantics:
        jrec['dataset_semantics'] = dataset_semantics

    # collections / 980
    collections = record_get_field_values(rec, tag="980", code="a")
    collections.extend(record_get_field_values(rec, tag="980", code="b"))
    collections.extend(record_get_field_values(rec, tag="980", code="c"))
    if 'DELETED' in collections:
        return {} # record was deleted
    if 'Education' in collections:
        collections.remove('Education')
    if 'Research' in collections:
        collections.remove('Research')
    if collections:
        jrec['collections'] = collections

    # distribution / 256
    distribution = {}
    distribution_number_events = sum([int(x) for x in record_get_field_values(rec, tag="256", code="e")])
    if distribution_number_events:
        distribution['number_events'] = distribution_number_events
    distribution_number_files = sum([int(x) for x in record_get_field_values(rec, tag="256", code="f")])
    if distribution_number_files:
        distribution['number_files'] = distribution_number_files
    distribution_size = sum([int(x) for x in record_get_field_values(rec, tag="256", code="b")])
    if distribution_size:
        distribution['size'] = distribution_size
    formats = []
    urls_856 = ' '.join(record_get_field_values(rec, tag="856", ind1="7", code="u"))
    if '.root' in urls_856:
        formats.append('root')
    if '/AOD/' in urls_856:
        formats.append('aod')
    if '/AODSIM/' in urls_856:
        formats.append('aodsim')
    if '/RAW/' in urls_856:
        formats.append('raw')
    if 'OPERA' in ' '.join(collections):
        formats.append('csv')
    if '.tar.gz' in urls_856:
        formats.append('gz')
    fft_extensions = []
    for fft in record_get_field_values(rec, tag="FFT", code="a"):
        fft_basename, fft_extension = os.path.splitext(fft)
        if 'file-indexes' in fft_basename:
            continue
        if fft_extension == '.configFile':
            fft_extension = '.py'
        fft_extension = fft_extension[1:]  # drop the leading dot
        if fft_extension and fft_extension not in fft_extensions:
            fft_extensions.append(fft_extension)
    for fft_extension in fft_extensions:
        if fft_extension not in formats:
            formats.append(fft_extension)
    if formats:
        distribution['formats'] = formats
    if distribution:
        jrec['distribution'] = distribution

    # system_details / 538
    system_details = {}
    system_details_release = record_get_field_value(rec, tag="538", code="a")
    if system_details_release:
        system_details_release = system_details_release.replace('Recommended release for analysis: ', '')
        system_details_release = system_details_release.replace('Recommended Software Release: ', '')
        system_details_release = system_details_release.replace('Software release: ', '')
        system_details_release = system_details_release.replace('Release: ', '')
        system_details['release'] = system_details_release
    system_details_global_tag = record_get_field_value(rec, tag="538", code="b")
    if system_details_global_tag:
        system_details_global_tag = system_details_global_tag.replace('Global tag: ', '')
        system_details['global_tag'] = system_details_global_tag
    system_details_description = record_get_field_value(rec, tag="538", code="i")
    if system_details_description:
        system_details['description'] = system_details_description
    system_details_url = record_get_field_value(rec, tag="538", code="u")
    if system_details_url:
        system_details['url'] = system_details_url
    system_details_recid = record_get_field_value(rec, tag="538", code="w")
    if system_details_recid:
        system_details['recid'] = system_details_recid
    if system_details:
        jrec['system_details'] = system_details

    # abstract / 520
    abstract_description = record_get_field_value(rec, tag="520", code="a")
    if abstract_description:
        if 'http://opendata.cern.ch/' in abstract_description:
            abstract_description = abstract_description.replace('http://opendata.cern.ch/', '/')
        abstract = {'description': abstract_description}
        abstract_links = []
        for field_instance in record_get_field_instances(rec, tag="520"):
            field_instance_recids = field_get_subfield_values(field_instance, 'w')
            field_instance_urls = field_get_subfield_values(field_instance, 'u')
            abstract_link = {}
            if field_instance_recids:
                abstract_link['recid'] = field_instance_recids[0]
            if field_instance_urls:
                field_instance_url = field_instance_urls[0]
                if field_instance_url.startswith('http://opendata.cern.ch/'):
                    field_instance_url = field_instance_url.replace('http://opendata.cern.ch/', '/')
                abstract_link['url'] = field_instance_url
            if abstract_link:
                if 'links' in abstract:
                    abstract['links'].append(abstract_link)
                else:
                    abstract['links'] = [abstract_link, ]
        jrec['abstract'] = abstract

    # methodology / 567
    methodology_description = record_get_field_value(rec, tag="567", code="a")
    if methodology_description:
        if 'http://opendata.cern.ch/' in methodology_description:
            methodology_description = methodology_description.replace('http://opendata.cern.ch/', '/')
        methodology = {'description': methodology_description}
        methodology_links = []
        for field_instance in record_get_field_instances(rec, tag="567"):
            field_instance_recids = field_get_subfield_values(field_instance, 'w')
            field_instance_urls = field_get_subfield_values(field_instance, 'u')
            methodology_link = {}
            if field_instance_recids:
                methodology_link['recid'] = field_instance_recids[0]
            if field_instance_urls:
                field_instance_url = field_instance_urls[0]
                if field_instance_url.startswith('http://opendata.cern.ch/'):
                    field_instance_url = field_instance_url.replace('http://opendata.cern.ch/', '/')
                methodology_link['url'] = field_instance_url
            if methodology_link:
                if 'links' in methodology:
                    methodology['links'].append(methodology_link)
                else:
                    methodology['links'] = [methodology_link, ]
        jrec['methodology'] = methodology

    # license / 540
    license_attribution = record_get_field_value(rec, tag="540", code="a")
    if license_attribution:
        license = {'attribution': license_attribution}
        jrec['license'] = license

    # validation / 583
    validation_description = record_get_field_value(rec, tag="583", code="a")
    if validation_description:
        validation = {'description': validation_description}
        validation_links = []
        for field_instance in record_get_field_instances(rec, tag="583"):
            field_instance_recids = field_get_subfield_values(field_instance, 'w')
            field_instance_urls = field_get_subfield_values(field_instance, 'u')
            field_instance_descriptions = field_get_subfield_values(field_instance, 'y')
            validation_link = {}
            if field_instance_recids:
                validation_link['recid'] = field_instance_recids[0]
            if field_instance_urls:
                field_instance_url = field_instance_urls[0]
                if field_instance_url.startswith('http://opendata.cern.ch/'):
                    field_instance_url = field_instance_url.replace('http://opendata.cern.ch/', '/')
                validation_link['url'] = field_instance_url
            if field_instance_descriptions:
                validation_link['description'] = field_instance_descriptions[0]
            if validation_link:
                if 'links' in validation:
                    validation['links'].append(validation_link)
                else:
                    validation['links'] = [validation_link, ]
        jrec['validation'] = validation

    # use_with / 516
    use_with_description = record_get_field_value(rec, tag="516", code="a")
    if int(recid) == 221 and \
       use_with_description == 'Use this with 2011 CMS open data':
        use_with_description = 'Use this with 2011 and 2012 CMS open data'
    if use_with_description:
        if int(recid) == 560:
            use_with_description = use_with_description.replace('http://opendata.cern.ch/', '/')
        use_with = {'description': use_with_description}
        use_with_links = []
        for field_instance in record_get_field_instances(rec, tag="516"):
            field_instance_recids = field_get_subfield_values(field_instance, 'w')
            field_instance_urls = field_get_subfield_values(field_instance, 'u')
            field_instance_descriptions = field_get_subfield_values(field_instance, 'y')
            # workaround for one record:
            if not field_instance_recids and field_instance_urls and \
               field_instance_urls[0] == 'http://opendata.cern.ch/record/14':
                field_instance_recids = ["14", ]
            for field_instance_recid in field_instance_recids:
                use_with_link = {}
                use_with_link['recid'] = field_instance_recid
                if field_instance_urls:
                    field_instance_url = field_instance_urls[0]
                    if field_instance_url.startswith('http://opendata.cern.ch/'):
                        field_instance_url = field_instance_url.replace('http://opendata.cern.ch/', '/')
                    use_with_link['url'] = field_instance_url
                if field_instance_descriptions:
                    use_with_link['description'] = field_instance_descriptions[0]
                if use_with_link:
                    if 'links' in use_with:
                        use_with['links'].append(use_with_link)
                    else:
                        use_with['links'] = [use_with_link, ]
        jrec['use_with'] = use_with

    # usage / 581
    usage_description = record_get_field_value(rec, tag="581", code="a")
    if usage_description:
        if int(recid) == 560:
            usage_description = usage_description.replace('http://opendata.cern.ch/', '/')
        usage = {'description': usage_description}
        usage_links = []
        for field_instance in record_get_field_instances(rec, tag="581"):
            field_instance_recids = field_get_subfield_values(field_instance, 'w')
            field_instance_urls = field_get_subfield_values(field_instance, 'u')
            field_instance_descriptions = field_get_subfield_values(field_instance, 'y')
            usage_link = {}
            if field_instance_recids:
                usage_link['recid'] = field_instance_recids[0]
            if field_instance_urls:
                field_instance_url = field_instance_urls[0]
                if field_instance_url.startswith('http://opendata.cern.ch/'):
                    field_instance_url = field_instance_url.replace('http://opendata.cern.ch/', '/')
                if field_instance_url.startswith('http://atlas-opendata.web.cern.ch/atlas-opendata/'):
                    field_instance_url = field_instance_url.replace('http://atlas-opendata.web.cern.ch/atlas-opendata/',
                                                                    'http://opendata.atlas.cern/')
                if field_instance_url.startswith('https://github.com/katilp/pattuples2011'):
                    field_instance_url = field_instance_url.replace('https://github.com/katilp/pattuples2011',
                                                                    'https://github.com/cms-opendata-analyses/pattuples2011')
                usage_link['url'] = field_instance_url
            if field_instance_descriptions:
                usage_link['description'] = field_instance_descriptions[0]
            if usage_link:
                if 'links' in usage:
                    usage['links'].append(usage_link)
                else:
                    usage['links'] = [usage_link, ]
        jrec['usage'] = usage

    # note / 556
    note_description = record_get_field_value(rec, tag="556", code="a")
    if note_description:
        note = {'description': note_description}
        note_links = []
        for field_instance in record_get_field_instances(rec, tag="556"):
            field_instance_recids = field_get_subfield_values(field_instance, 'w')
            field_instance_urls = field_get_subfield_values(field_instance, 'u')
            field_instance_titles = field_get_subfield_values(field_instance, 'y')
            note_link = {}
            if field_instance_recids:
                note_link['recid'] = field_instance_recids[0]
            if field_instance_urls:
                field_instance_url = field_instance_urls[0]
                if field_instance_url.startswith('http://opendata.cern.ch/'):
                    field_instance_url = field_instance_url.replace('http://opendata.cern.ch/', '/')
                note_link['url'] = field_instance_url
            if field_instance_titles:
                note_link['title'] = field_instance_titles[0]
            if note_link:
                if 'links' in note:
                    note['links'].append(note_link)
                else:
                    note['links'] = [note_link, ]
        jrec['note'] = note

    # note / 500
    comment = record_get_field_value(rec, tag="500", code="a")
    if comment:
        if 'note' in jrec:
            raise StandardError('Sorry, cannot have both note/556 and note/500 fields.')
        else:
            jrec['note'] = {'description': comment}

    # generator / 593
    generator = {}
    generator_name = record_get_field_value(rec, tag="593", code="a")
    if generator_name:
        generator_name = generator_name.replace('Generators: ', '')
        generator_names = generator_name.split()
        generator['names'] = generator_names
    generator_global_tag = record_get_field_value(rec, tag="593", code="b")
    if generator_global_tag:
        generator_global_tag = generator_global_tag.replace('Global tag: ', '')
        generator['global_tag'] = generator_global_tag
    if generator:
        jrec['generator'] = generator

    # accelerator / 693
    accelerator = record_get_field_value(rec, tag="693", code="a")
    if accelerator:
        jrec['accelerator'] = accelerator

    # experiment / 693
    experiment = record_get_field_value(rec, tag="693", code="e")
    if not experiment and (recid == '60' or recid == '352'):
        experiment = 'ATLAS'
    if not experiment and (recid == '450' or recid == '451'):
        experiment = 'CMS'
    if not experiment and (recid == '452'):
        experiment = 'OPERA'
    if experiment:
        jrec['experiment'] = experiment

    # run_period / 964
    run_period = record_get_field_value(rec, tag="964", ind2="0", code="c")
    if run_period:
        if run_period == '2011RunA':
            run_period = 'Run2011A'
        jrec['run_period'] = run_period

    # generation / for simulated data
    # FIXME to be populated from DAS client? introduce structure inside methodology field

    # selection / for collision data
    # FIXME to be populated from DAS client? introduce structure inside methodology field

    # collision_information / 942
    collision_information_energy = record_get_field_value(rec, tag="942", code="e")
    collision_information_luminosity = record_get_field_value(rec, tag="942", code="l")
    collision_information_type = record_get_field_value(rec, tag="942", code="t")
    if collision_information_energy or collision_information_luminosity or collision_information_type:
        collision_information = {}
        if collision_information_energy:
            collision_information_energy = collision_information_energy.replace('Collision energy: ', '')
            collision_information_energy = collision_information_energy.replace('Collision energy:', '')
            collision_information['energy'] = collision_information_energy
        if collision_information_luminosity:
            collision_information['luminosity'] = collision_information_luminosity
        if collision_information_type:
            collision_information['type'] = collision_information_type
        else:
            if 'Data' in ' '.join(collections):
                if 'PbPb' in title:
                    collision_information['type'] = 'PbPb'
                else:
                    collision_information['type'] = 'pp'
        jrec['collision_information'] = collision_information

    # parent_dataset / 772
    parent_dataset_title = record_get_field_value(rec, tag="772", code="a")
    parent_dataset_doi = record_get_field_value(rec, tag="772", code="o")
    parent_dataset_recid = record_get_field_value(rec, tag="772", code="w")
    if parent_dataset_title or parent_dataset_doi or parent_dataset_recid:
        parent_dataset = {}
        parent_dataset['type'] = 'isChildOf'
        if parent_dataset_title:
            parent_dataset['title'] = parent_dataset_title
        if parent_dataset_doi:
            parent_dataset['doi'] = parent_dataset_doi
        if parent_dataset_recid:
            parent_dataset['recid'] = parent_dataset_recid
        if parent_dataset_title != title:
            if 'relations' in jrec:
                jrec['relations'].append(parent_dataset)
            else:
                jrec['relations'] = [parent_dataset, ]

    # code to produce files / 777
    code_to_produce_files_description = record_get_field_value(rec, tag="777", code="a")
    code_to_produce_files_recid = record_get_field_value(rec, tag="777", code="w")
    if code_to_produce_files_description or code_to_produce_files_recid:
        code_to_produce_files = {}
        code_to_produce_files['type'] = 'isProducedBy'
        if code_to_produce_files_description:
            code_to_produce_files['description'] = code_to_produce_files_description
        if code_to_produce_files_recid:
            code_to_produce_files['recid'] = code_to_produce_files_recid
        if 'relations' in jrec:
            jrec['relations'].append(code_to_produce_files)
        else:
            jrec['relations'] = [code_to_produce_files, ]

    # related dataset / 786
    for field_instance in record_get_field_instances(rec, tag="786"):
        related_dataset_descriptions = field_get_subfield_values(field_instance, "a")
        related_dataset_recids = field_get_subfield_values(field_instance, "w")
        if related_dataset_descriptions or related_dataset_recids:
            related_dataset = {}
            related_dataset['type'] = 'isPartOf'
            if related_dataset_descriptions:
                related_dataset['description'] = related_dataset_descriptions[0]
            if related_dataset_recids:
                related_dataset['recid'] = related_dataset_recids[0]
            if 'relations' in jrec:
                jrec['relations'].append(related_dataset)
            else:
                jrec['relations'] = [related_dataset, ]

    # related item / 787
    related_item_description = record_get_field_value(rec, tag="787", code="a")
    related_item_recids = record_get_field_values(rec, tag="787", code="w")
    related_item_note = record_get_field_value(rec, tag="787", code="n")
    related_item_url = record_get_field_value(rec, tag="787", code="u")
    related_item_label = record_get_field_value(rec, tag="787", code="y")
    if related_item_description and not related_item_recids:
        # workaround for a record
        if related_item_description == 'The default output of the code below is a ROOT file Mu00val.root':
            note = related_item_description
            if 'note' in jrec and 'description' in jrec['note']:
                jrec['note']['description'] += note
            else:
                jrec['note'] = {'description': note}
    if related_item_recids:
        for related_item_recid in related_item_recids:
            related_item = {}
            related_item['type'] = 'isRelatedTo'
            if related_item_description:
                related_item['description'] = related_item_description
            if related_item_recid:
                related_item['recid'] = related_item_recid
            if 'relations' in jrec:
                jrec['relations'].append(related_item)
            else:
                jrec['relations'] = [related_item, ]
    if related_item_url:
        link = {}
        if related_item_note:
            link['description'] = related_item_note
        if related_item_url:
            link['url'] = related_item_url
        if related_item_label:
            if related_item_label != related_item_url:
                link['comment'] = related_item_label
        if 'links' in jrec:
            jrec['links'].append(link)
        else:
            jrec['links'] = [link, ]

    # files / 8567
    files = []
    # Skip individual files entirely when FFT file indexes are available.
    if 'eos-file-indexes' not in " ".join(record_get_field_values(rec, 'FFT', code='a')):
        for file_instance in record_get_field_instances(rec, tag="856", ind1="7"):
            afile = {}
            file_type = field_get_subfield_values(file_instance, "2")[0]
            if file_type and file_type != 'xrootd':
                afile['type'] = file_type
            afile['uri'] = field_get_subfield_values(file_instance, "u")[0]
            afile['size'] = int(field_get_subfield_values(file_instance, "s")[0])
            afile['checksum'] = 'sha1:0000000000000000000000000000000000000000'  # FIXME detect real SHA1 of files
            files.append(afile)
    if files:
        if 'files' in jrec:
            jrec['files'].extend(files)
        else:
            jrec['files'] = files
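    # Hedged shape of one resulting entry (all values hypothetical):
    #   {'type': 'index',
    #    'uri': 'root://eospublic.cern.ch//eos/opendata/...',
    #    'size': 123456789,
    #    'checksum': 'sha1:0000000000000000000000000000000000000000'}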

    # files / FFT
    files = []
    for file_instance in record_get_field_instances(rec, tag="FFT"):

        # read FFT file properties
        file_name = field_get_subfield_values(file_instance, "a")[0]
        file_name = os.path.basename(file_name)
        try:
            file_name_alias = field_get_subfield_values(file_instance, "n")[0]
            file_name_alias = re.sub(r'^(.*?)\.', file_name_alias + '.', file_name)
        except IndexError:
            file_name_alias = ''
        file_descriptions = field_get_subfield_values(file_instance, "z")
        if file_descriptions:
            file_description = file_descriptions[0]
        else:
            file_description = ''

        #  output location that will be populated below
        file_uri = ''
        file_type = ''

        # CMS-Primary-Datasets, CMS-Simulated-Datasets
        if 'CMS-Primary-Datasets' in collections or \
           'CMS-Simulated-Datasets' in collections:
            match = re.search(r'(.*?)_(.*?)_(.*)_(AOD|RAW)_(.*)_([0-9]+)_file_index.txt$', file_name)
            if match:
                file_type = 'index'
                file_experiment, file_release, file_dataset, file_format, file_version, file_volume = match.groups()
                file_uri = 'root://eospublic.cern.ch//eos/opendata/' + \
                        file_experiment.lower() + '/' + \
                        file_release + '/' + \
                        file_dataset + '/' + \
                        file_format + '/' + \
                        file_version + '/' + \
                        'file-indexes/' + file_name
            else:
                match = re.search(r'(.*?)_(MonteCarlo[0-9]+)_(.*?)_(.*)_(AODSIM)_(.*)_([0-9]+)_file_index.txt$', file_name)
                if match:
                    file_type = 'index'
                    file_experiment, file_montecarlo, file_release, file_dataset, file_format, file_version, file_volume = match.groups()
                    file_uri = 'root://eospublic.cern.ch//eos/opendata/' + \
                            file_experiment.lower() + '/' + \
                            file_montecarlo + '/' + \
                            file_release + '/' + \
                            file_dataset + '/' + \
                            file_format + '/' + \
                            file_version + '/' + \
                            'file-indexes/' + file_name
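        # Hedged illustration with a hypothetical index file name; the URI
        # shape follows from the two branches above:
        #   'CMS_Run2010B_Mu_AOD_Apr21ReReco-v1_0_file_index.txt'
        #   -> 'root://eospublic.cern.ch//eos/opendata/cms/Run2010B/Mu/AOD'
        #      '/Apr21ReReco-v1/file-indexes/CMS_Run2010B_Mu_AOD_Apr21ReReco-v1_0_file_index.txt'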

        # cms-eventdisplay-files
        if int(recid) >= 600 and int(recid) <= 613:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/Run2010B/' + os.path.splitext(file_name)[0] + '/IG/Apr21ReReco-v1/' + file_name

        # cms-eventdisplay-files-Run2011A
        if int(recid) >= 614 and int(recid) <= 632:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/Run2011A/' + os.path.splitext(file_name)[0].replace('_Run2011A', '') + '/IG/12Oct2013-v1/' + file_name.replace('_Run2011A', '')

        # CMS-Configuration-Files
        if 'CMS-Configuration-Files' in collections:
            if file_name.endswith('configFile'):
                file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/configuration-files/2011/' + file_name + '.py'
            else:
                file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/configuration-files/2011/' + file_name

        # LHCb-Derived-Datasets
        if 'LHCb-Derived-Datasets' in collections:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/lhcb/MasterclassDatasets/D0lifetime/2014/file-indexes/' + file_name
            file_type = 'index'

        # ALICE-Derived-Datasets
        if 'ALICE-Derived-Datasets' in collections:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/alice/' + file_name
            file_type = 'index'

        # ALICE-Reconstructed-Data
        if 'ALICE-Reconstructed-Data' in collections:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/alice/' + file_name
            file_type = 'index'

        # CMS-Validated-Runs
        if 'CMS-Validated-Runs' in collections:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/validated-runs/' + date_created + '/' + file_name

        # cms-derived-csv-Run2011A
        if int(recid) == 545:
            match = re.search(r'^(.*)_(.*)_Run2011A.csv$', file_name)
            file_name_filename, file_name_dataset = match.groups()
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/Run2011A/' + file_name_dataset + '/CSV/12Oct2013-v1/' + file_name_filename + '.csv'

        # cms-tools-vm-image.xml
        if int(recid) >= 249 and int(recid) <= 250:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/environment/2010/' + file_name

        # cms-tools-vm-image-Run2011A.xml
        if int(recid) >= 251 and int(recid) <= 252:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/environment/2011/' + file_name

        # CMS-Open-Data-Instructions
        if 'CMS-Open-Data-Instructions' in collections:
            if int(recid) == 55:
                file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/documentation/outreach-exercise-2010/' + file_name
            if int(recid) == 72:
                file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/documentation/hst-programme-2016/' + file_name

        # CMS-Luminosity-Information
        if 'CMS-Luminosity-Information' in collections:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/luminosity/' + date_created + '/' + file_name

        # CMS-Trigger-Information
        if 'CMS-Trigger-Information' in collections:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/trigger-information/' + date_created + '/' + file_name

        # cms-derived-pattuples-ana
        if int(recid) == 201:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/Run2010B/Mu/PATtuples/file-indexes/' + file_name
            file_type = 'index'
        if int(recid) == 202:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/Run2010B/Electron/PATtuples/file-indexes/' + file_name
            file_type = 'index'

        # cms-derived-pattuples-ana-Run2011A
        if int(recid) == 230:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/Run2011A/DoubleMu/PATtuples/file-indexes/' + file_name
            file_type = 'index'
        if int(recid) == 231:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/Run2011A/DoubleElectron/PATtuples/file-indexes/' + file_name
            file_type = 'index'

        # cms-hamburg-files
        if int(recid) >= 203 and int(recid) <= 212:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/hep-tutorial-2012/' + file_name

        # CMS-Learning-Resources
        if 'CMS-Learning-Resources' in collections:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/hep-tutorial-2012/' + file_name

        # cms-tools-ana
        if int(recid) == 101:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/software/ayrodrig-OutreachExercise2010/' + file_name
        if int(recid) == 200:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/software/ayrodrig-pattuples2010/' + file_name

        # cms-tools-dimuon-spectrum-2010
        if int(recid) == 560:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/software/dimuon-spectrum-2010/' + file_name

        # cms-tools-dimuon-filter
        if int(recid) == 553:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/software/SUSYBSMAnalysis-RazorFilter/' + file_name
        if int(recid) == 552:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/software/dimuon-filter/' + file_name

        # cms-validation-code-Run2010B
        if int(recid) == 460:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/software/validation-2010-Mu/' + file_name
        if int(recid) == 461:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/software/validation-2010-Commissioning/' + file_name
        if int(recid) == 462:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/software/validation-2010-MinimumBias/' + file_name

        # cms-csv-files
        if int(recid) == 554:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/Run2010B/MultiJet/CSV/Apr21ReReco-v1/' + file_name_alias
        if int(recid) == 700:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/Run2010B/Mu/CSV/Apr21ReReco-v1/' + file_name_alias

        # cms-masterclass-files
        if int(recid) >= 300 and int(recid) <= 310:
            if file_name_alias.startswith('masterclass.'):
                file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/masterclass-2014/' + file_name
            else:
                file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/masterclass-2014/' + file_name_alias

        # atlas-derived-datasets
        if 'ATLAS-Derived-Datasets' in collections:
            if int(recid) == 3860:
                # atlas-all-samples
                file_uri = 'root://eospublic.cern.ch//eos/opendata/atlas/OutreachDatasets/2016-07-29/file-indexes/' + file_name
                file_type = 'index'
            elif int(recid) == 390 or int(recid) == 391:
                file_uri = 'root://eospublic.cern.ch//eos/opendata/atlas/OutreachDatasets/2016-07-29/file-indexes/' + file_name
                file_type = 'index'
            else:
                match = re.search(r'ATLAS_MasterclassDatasets_(.*)_([0-9]+)_dataset_([0-9]+)_file_index.txt$', file_name)
                file_xpath, file_year, file_number = match.groups()
                file_uri = 'root://eospublic.cern.ch//eos/opendata/atlas/MasterclassDatasets/' + file_xpath + '/' + file_year + '/' + file_number + '/file-indexes/' + file_name
                file_type = 'index'

        # ATLAS-Tools
        if 'ATLAS-Tools' in collections:
            if int(recid) == 352:
                file_uri = 'root://eospublic.cern.ch//eos/opendata/atlas/MasterclassDatasets/WPath/2015/Software/' + file_name
                file_type = 'index'
            else:
                file_type = 'index'
                if int(recid) == 3851:
                    file_name = file_name.replace('size_M_', 'size_S_')
                file_uri = 'root://eospublic.cern.ch//eos/opendata/atlas/OutreachDatasets/2016-07-29/file-indexes/' + file_name

        # ATLAS-Simulated-Datasets
        if 'ATLAS-Simulated-Datasets' in collections:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/atlas/OutreachDatasets/2016-07-29/file-indexes/' + file_name
            file_type = 'index'

        # ATLAS-Higgs-Challenge-2014
        if 'ATLAS-Higgs-Challenge-2014' in collections:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/atlas/higgs-challenge-2014/' + file_name

        # ALICE-Learning-Resources
        if 'ALICE-Learning-Resources' in collections:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/alice/documentation/' + file_name

        # cms-condition-data-Run2011A
        if 1801 <= int(recid) <= 1802:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/Run2011A/db/file-indexes/' + file_name
            file_type = 'index'

        # OPERA
        if 'OPERA' in ' '.join(collections):
            match = re.search(r'^(.*)\.(csv|zip)$', file_name)
            if match:
                file_name_base, file_name_ext = match.groups()
                if file_name_ext == 'zip':
                    file_uri_base = 'root://eospublic.cern.ch//eos/opendata/opera/datasets/multiplicity'
                elif file_name_ext == 'csv':
                    file_uri_base = 'root://eospublic.cern.ch//eos/opendata/opera/events/multiplicity'
                else:
                    raise ValueError('unexpected OPERA file extension: %s' % file_name_ext)
                file_uri = file_uri_base + '/' + file_name_base + '.' + file_name_ext

        # author lists
        if 'Author-Lists' in collections:
            match = re.search(r'^(.*)-author-list-(.*)\.pdf$', file_name)
            if match:
                file_name_exp, file_name_year = match.groups()
                file_uri = 'root://eospublic.cern.ch//eos/opendata/' + file_name_exp.lower() + '/documentation/' + file_name

        # data policies
        if 'Data-Policies' in collections:
            match = re.search(r'^(.*)-Data-Policy\.pdf$', file_name)
            if match:
                file_name_exp, = match.groups()
                file_uri = 'root://eospublic.cern.ch//eos/opendata/' + file_name_exp.lower() + '/documentation/' + file_name

        # ok, recognised enough; now generate files output
        if file_uri:
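            # assemble one file entry; the result looks roughly like
            #   {'uri': 'root://eospublic.cern.ch//eos/...', 'size': 1234,
            #    'checksum': 'sha1:<hex>', 'type': 'index', 'description': '...'}
            # (values are illustrative)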
            afile = {}
            if file_type and file_type != 'xrootd':
                afile['type'] = file_type
            afile['uri'] = file_uri
            afile['size'] = fft_file_cache_info[file_name]['size']
            if file_description:
                afile['description'] = file_description
            afile['checksum'] = 'sha1:' + fft_file_cache_info[file_name]['checksum']
            files.append(afile)
    if files:
        if 'files' in jrec:
            jrec['files'].extend(files)
        else:
            jrec['files'] = files

    # keywords / 653
    keywords = []
    keyword_values = record_get_field_values(rec, tag="653", ind1="1", code="a")
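    # hand-curated keyword enrichment for a few outreach records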
    if int(recid) in [50, 53, 54, 61, 51, 52, 57, 58, 72]:
        keyword_values.append('education')
    if int(recid) in [53, 57, 72]:
        keyword_values.append('teaching')
    for keyword in keyword_values:
        if keyword != experiment:
            keyword = keyword.lower()
            if keyword == 'masterclasses':
                keyword = 'masterclass'
            keywords.append(keyword)
    if keywords:
        if 'keywords' in jrec:
            jrec['keywords'].extend(keywords)
        else:
            jrec['keywords'] = keywords

    # topic / 655
    topic = {}
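    # note: if several 655__7 field instances exist, the last one wins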
    for field_instance in record_get_field_instances(rec, tag="655", ind2="7"):
        topic_categories = field_get_subfield_values(field_instance, "a")
        if topic_categories:
            topic['category'] = topic_categories[0]
        topic_sources = field_get_subfield_values(field_instance, "9")
        if topic_sources:
            topic['source'] = topic_sources[0]
    if topic:
        jrec['topic'] = topic

    # language / 041
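    # only 'eng' is normalised to a display name; other codes are kept as-is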
    language = record_get_field_value(rec, tag="041", code="a")
    if language:
        if language == 'eng':
            language = 'English'
        jrec['language'] = language

    # links / 8564
    links = []
    for file_instance in record_get_field_instances(rec, tag="856", ind1="4"):
        link = {}
        link_hostname = field_get_subfield_values(file_instance, "a")
        if link_hostname:
            link['hostname'] = link_hostname[0]
        link_compression_information = field_get_subfield_values(file_instance, "c")
        if link_compression_information:
            link['compression_information'] = link_compression_information[0]
        link_size = field_get_subfield_values(file_instance, "s")
        if link_size:
            link['size'] = link_size[0]
        link_url = field_get_subfield_values(file_instance, "u")
        if link_url:
            link_url = link_url[0]
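            # make portal-internal links relative and normalise known relocated URLs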
            if link_url.startswith('http://opendata.cern.ch/'):
                link_url = link_url.replace('http://opendata.cern.ch/', '/')
            if 'CMS-Learning-Resources' in collections and link_url == 'http://mattbellis.github.io/Particle-Physics-Playground/':
                link_url = 'http://particle-physics-playground.github.io/'
            if 'CMS-Learning-Resources' in collections and link_url == 'http://ippog.web.cern.ch/resources/2012/cms-hep-tutorial':
                link_url = 'http://ippog.org/resources/2012/cms-hep-tutorial'
            link['url'] = link_url
        link_description = field_get_subfield_values(file_instance, "y")
        if link_description:
            link['description'] = link_description[0]
        link_comment = field_get_subfield_values(file_instance, "z")
        if link_comment:
            link['comment'] = link_comment[0]
        links.append(link)
    if links:
        if 'links' in jrec:
            jrec['links'].extend(links)
        else:
            jrec['links'] = links

    # type, subtype / new
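    # classify the record with a primary type (Dataset, Software, Environment,
    # Documentation, Supplementaries) plus optional secondary qualifiers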
    jrec['type'] = {}
    if 'Primary-Dataset' in ' '.join(collections):
        jrec['type']['primary'] = 'Dataset'
        jrec['type']['secondary'] = ['Collision', ]
    elif '-Detector-Datasets' in ' '.join(collections):
        jrec['type']['primary'] = 'Dataset'
        jrec['type']['secondary'] = ['Derived', ]
    elif '-Detector-Events' in ' '.join(collections):
        jrec['type']['primary'] = 'Dataset'
        jrec['type']['secondary'] = ['Derived', ]
    elif 'Derived-Dataset' in ' '.join(collections):
        jrec['type']['primary'] = 'Dataset'
        jrec['type']['secondary'] = ['Derived', ]
    elif 'Reconstructed-Data' in ' '.join(collections):
        jrec['type']['primary'] = 'Dataset'
        jrec['type']['secondary'] = ['Collision', ]
    elif 'Simulated-Dataset' in ' '.join(collections):
        jrec['type']['primary'] = 'Dataset'
        jrec['type']['secondary'] = ['Simulated', ]
    elif 'Tools' in ' '.join(collections):
        if 'virtual machine' in abstract_description.lower():
            jrec['type']['primary'] = 'Environment'
            jrec['type']['secondary'] = ['VM']
        elif int(recid) in [402, ]:
            jrec['type']['primary'] = 'Environment'
            jrec['type']['secondary'] = ['VM', ]
        else:
            jrec['type']['primary'] = 'Software'
            if int(recid) in [560, 234, 553, 101, 1200, 1201, 1202, 1203]:
                jrec['type']['secondary'] = ['Analysis', ]
            elif int(recid) in [220, 3850, 212, 221]:
                jrec['type']['secondary'] = ['Framework', ]
            elif int(recid) in [550, 352, 233, 200, 552, 551]:
                jrec['type']['secondary'] = ['Tool', ]
            else:
                jrec['type']['secondary'] = []
    elif 'Validated-Runs' in ' '.join(collections):
        jrec['type']['primary'] = 'Environment'
        jrec['type']['secondary'] = ['Validation', ]
    elif 'Validation-Utilities' in ' '.join(collections):
        jrec['type']['primary'] = 'Software'
        jrec['type']['secondary'] = ['Validation', ]
    elif 'Learning-Resources' in ' '.join(collections):
        jrec['type']['primary'] = 'Documentation'
        if int(recid) in [40, 59, 60, 51, 50, 61, 53, 52, 41]:
            jrec['type']['secondary'] = ['Activities', ]
        else:
            jrec['type']['secondary'] = []
    elif 'Configuration-Files' in ' '.join(collections):
        jrec['type']['primary'] = 'Supplementaries'
        jrec['type']['secondary'] = ['Configuration', ]
    elif 'Trigger-Information' in ' '.join(collections):
        jrec['type']['primary'] = 'Supplementaries'
        jrec['type']['secondary'] = ['Trigger', ]
    elif 'Luminosity-Information' in ' '.join(collections):
        jrec['type']['primary'] = 'Supplementaries'
        jrec['type']['secondary'] = ['Luminosity', ]
    elif 'Condition-Data' in ' '.join(collections):
        jrec['type']['primary'] = 'Environment'
        jrec['type']['secondary'] = ['Condition', ]
    elif 'Open-Data-Instructions' in ' '.join(collections):
        jrec['type']['primary'] = 'Documentation'
        if int(recid) in [57, 58]:
            jrec['type']['secondary'] = ['Help', ]
        elif int(recid) in [56, ]:
            jrec['type']['secondary'] = ['Report', ]
        elif int(recid) in [70, 71]:
            jrec['type']['secondary'] = ['Guide', ]
        elif int(recid) in [72, 55]:
            jrec['type']['secondary'] = ['Activities', ]
        else:
            jrec['type']['secondary'] = []
    elif 'Data-Policies' in ' '.join(collections):
        jrec['type']['primary'] = 'Documentation'
        jrec['type']['secondary'] = ['Policy', ]
    elif 'Author-Lists' in ' '.join(collections):
        jrec['type']['primary'] = 'Documentation'
        jrec['type']['secondary'] = ['Authors', ]
    elif 'ATLAS-Higgs-Challenge-2014' in ' '.join(collections):
        if 'Dataset' in title:
            jrec['type']['primary'] = 'Dataset'
            jrec['type']['secondary'] = ['Derived', ]
        elif 'Documentation' in title:
            jrec['type']['primary'] = 'Documentation'
            jrec['type']['secondary'] = ['Activities', ]
        elif 'Video' in title:
            jrec['type']['primary'] = 'Documentation'
            jrec['type']['secondary'] = ['Activities', ]
        elif 'Software' in title:
            jrec['type']['primary'] = 'Software'
            jrec['type']['secondary'] = ['Analysis', ]
        else:
            jrec['type']['primary'] = 'FIXME'
            jrec['type']['secondary'] = []
    else:
        jrec['type']['primary'] = 'FIXME'
        jrec['type']['secondary'] = ['FIXME', ]

    return jrec
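

# A minimal sketch (not part of the original function): the long if/elif
# chain above maps a collection-name marker to a (primary, secondary) type
# pair, so the same classification could be table-driven. The table below
# covers only a few of the markers handled above, and the names used here
# (CFG_COLLECTION_TYPE_MAP, classify_record_type) are illustrative.
CFG_COLLECTION_TYPE_MAP = [
    ('Primary-Dataset', 'Dataset', ['Collision']),
    ('Derived-Dataset', 'Dataset', ['Derived']),
    ('Simulated-Dataset', 'Dataset', ['Simulated']),
    ('Condition-Data', 'Environment', ['Condition']),
    ('Data-Policies', 'Documentation', ['Policy']),
    ('Author-Lists', 'Documentation', ['Authors']),
]


def classify_record_type(collections):
    """Return {'primary': ..., 'secondary': [...]} for the first marker
    found in the record's collections, mirroring the chain above."""
    collections_str = ' '.join(collections)
    for marker, primary, secondary in CFG_COLLECTION_TYPE_MAP:
        if marker in collections_str:
            return {'primary': primary, 'secondary': list(secondary)}
    return {'primary': 'FIXME', 'secondary': ['FIXME']}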