def create_ticket(recid, bibcatalog_system, queue=CFG_REFEXTRACT_TICKET_QUEUE):
    write_message("bibcatalog_system %s" % bibcatalog_system, verbose=1)
    write_message("queue %s" % queue, verbose=1)
    if bibcatalog_system and queue:
        subject = "Refs for #%s" % recid

        # Add report number in the subject
        report_number = ""
        record = get_bibrecord(recid)

        in_hep = False
        for collection_tag in record_get_field_instances(record, "980"):
            for collection in field_get_subfield_values(collection_tag, "a"):
                if collection == "HEP":
                    in_hep = True

        # Only create tickets for HEP
        if not in_hep:
            write_message("not in hep", verbose=1)
            return

        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, "c"):
                if category.startswith("astro-ph"):
                    write_message("astro-ph", verbose=1)
                    # We do not curate astro-ph
                    return
            for report_number in field_get_subfield_values(report_tag, "a"):
                subject += " " + report_number
            break

        text = "%s/record/edit/#state=edit&recid=%s" % (CFG_SITE_SECURE_URL,
                                                        recid)
        bibcatalog_system.ticket_submit(subject=subject,
                                        queue=queue,
                                        text=text,
                                        recordid=recid)
def format_element(bfo, limit, separator=' ; ', extension='[...]',
                   print_links="yes"):
    """
    Prints the list of editors of a record.

    @param limit: the maximum number of editors to display
    @param separator: the separator between editors.
    @param extension: a text printed if more editors than 'limit' exist
    @param print_links: if yes, print the editors as HTML link to their publications
    """
    from urllib import quote
    from invenio.config import CFG_BASE_URL
    from invenio import bibrecord

    authors = bibrecord.record_get_field_instances(bfo.get_record(), '100')
    editors = [bibrecord.field_get_subfield_values(author, 'a')[0]
               for author in authors
               if len(bibrecord.field_get_subfield_values(author, "e")) > 0
               and bibrecord.field_get_subfield_values(author, "e")[0] == "ed."]

    if print_links.lower() == "yes":
        editors = ['<a href="' + CFG_BASE_URL + '/search?f=author&p=' +
                   quote(editor) + '&ln=' + bfo.lang + '">' + editor + '</a>'
                   for editor in editors]

    if limit.isdigit() and len(editors) > int(limit):
        return separator.join(editors[:int(limit)]) + extension
    elif len(editors) > 0:
        return separator.join(editors)
def check_existing_pdg_fields(recids, pdg_data, current_records):
    _print_out("Comparing new and old PDG data for " + str(len(recids)) +
               " records...")
    records = {}
    for recid in recids:
        record_mod = {}
        record_mod['001'] = deepcopy(current_records[recid]['001'])
        record_mod['084'] = deepcopy(current_records[recid]['084'])
        fields = record_get_field_instances(record_mod, '084')
        current_pdg_data = []
        for field in fields:
            if is_pdg_field(field):
                current_pdg_data.append(
                    field_get_subfield_values(field, 'a')[0])
        current_set = set(current_pdg_data)
        new_set = set(pdg_data[recid])
        deletions = list(current_set - new_set)
        additions = list(new_set - current_set)
        if len(deletions) > 0 or len(additions) > 0:
            if len(deletions) > 0:
                for field in fields:
                    if is_pdg_field(field):
                        if field_get_subfield_values(field, 'a')[0] in deletions:
                            record_delete_field(record_mod, '084', ind1=' ',
                                                ind2=' ',
                                                field_position_global=field[4])
            for pdg_field in additions:
                position = record_add_field(record_mod, '084', ' ', ' ')
                record_add_subfield_into(record_mod, '084', '2', 'PDG',
                                         field_position_global=position)
                record_add_subfield_into(record_mod, '084', '9', 'PDG',
                                         field_position_global=position)
                record_add_subfield_into(record_mod, '084', 'a', pdg_field,
                                         field_position_global=position)
            records[recid] = record_mod
            _print_verbose("Record #" + str(recid) + ": " +
                           str(len(deletions)) + " deletions and " +
                           str(len(additions)) + " additions.")
        else:
            _print_verbose("Nothing to change for record #" + str(recid))
    _print_out(str(len(records)) + " records to be corrected.")
    return records
def main():
    verbose = '-v' in sys.argv
    recids = perform_request_search(p='-035:spirestex -035:inspiretex',
                                    cc='HEP')
    print "Found %s records to assign texkeys" % len(recids)
    processed = []
    to_process = []
    for count, recid in enumerate(recids):
        if count % 300 == 0:
            print 'done %s of %s' % (count, len(recids))
        if verbose:
            print "processing ", recid

        # Check that the record does not already have a texkey
        has_texkey = False
        recstruct = get_record(recid)
        for instance in record_get_field_instances(recstruct, tag="035",
                                                   ind1="", ind2=""):
            try:
                provenance = field_get_subfield_values(instance, "9")[0]
            except IndexError:
                provenance = ""
            try:
                value = field_get_subfield_values(instance, "z")[0]
            except IndexError:
                value = ""
            provenances = ["SPIRESTeX", "INSPIRETeX"]
            if provenance in provenances and value:
                has_texkey = True
                print "INFO: Record %s already has texkey %s" % (recid, value)

        if not has_texkey:
            TexKeySeq = TexkeySeq()
            new_texkey = ""
            try:
                new_texkey = TexKeySeq.next_value(recid)
            except TexkeyNoAuthorError:
                print "WARNING: Record %s has no first author or collaboration" % recid
                continue
            xml = create_xml(recid, new_texkey)
            processed.append(recid)
            to_process.append(xml)
            if len(to_process) == 500:
                process_chunk(to_process)
                to_process = []

    if to_process:
        process_chunk(to_process)

    # Finally, index all the records processed
    if processed:
        submit_bibindex_task(processed)
def check_one_date_per_type(fields1, fields2, final_result, type_check,
                            subfield_list, tag):
    """Function to check if there are multiple dates of the same type"""
    logger.info(' running check_one_date_per_type')
    # I extract all the dates grouped by date type
    date_types = {}
    for field in final_result:
        date_types.setdefault(
            bibrecord.field_get_subfield_values(field, subfield_list[0][1])[0],
            []).append(
            bibrecord.field_get_subfield_values(field, subfield_list[0][0])[0])
    # then I check that these dates are unique per type
    for datet in date_types:
        if len(set(date_types[datet])) > 1:
            manage_check_error('Multiple dates for type "%s" in field "%s".'
                               % (datet, tag), type_check, logger)
    return None
def _create_ticket(recid, bibcatalog_system, queue):
    subject = "Refs for #%s" % recid

    if CFG_INSPIRE_SITE:
        # Add report number in the subject
        report_number = ""
        record = get_bibrecord(recid)

        in_core = False
        for collection_tag in record_get_field_instances(record, "980"):
            for collection in field_get_subfield_values(collection_tag, 'a'):
                if collection == 'CORE':
                    in_core = True
                if collection == 'arXiv':
                    # Do not create tickets for arXiv papers
                    # Tickets for arXiv papers are created in bibcatalog
                    write_message("arXiv paper", verbose=1)
                    return

        # Do not create tickets for user submissions
        for source_field in record_get_field_instances(record, "541"):
            for source in field_get_subfield_values(source_field, "c"):
                if source == "submission":
                    write_message("User submitted paper", verbose=1)
                    return

        # Only create tickets for CORE papers
        if not in_core:
            write_message("not in core papers", verbose=1)
            return

        # Do not create tickets for old records
        creation_date = run_sql("""SELECT creation_date FROM bibrec
                                   WHERE id = %s""", [recid])[0][0]
        if creation_date < datetime.now() - timedelta(days=30 * 4):
            return

        for report_tag in record_get_field_instances(record, "037"):
            for report_number in field_get_subfield_values(report_tag, 'a'):
                subject += " " + report_number
            break

    text = '%s/record/edit/#state=edit&recid=%s' % (CFG_SITE_SECURE_URL, recid)
    bibcatalog_system.ticket_submit(subject=subject,
                                    queue=queue,
                                    text=text,
                                    recordid=recid)
def merge_records_xml(marcxml_obj):
    """Function that takes as input a marcxml object containing multiple
    records (identified by the tag "collection") and, for each one, calls
    the function that merges the different flavors of the same record
    (identified by the tag "record").
    """
    logger.info(' Merger started.')
    # I get the bibrecord object from the libxml2 one
    all_records = create_record_from_libxml_obj(marcxml_obj, logger)

    merged_records = []
    records_with_merging_probl = []
    for records in all_records:
        # I try to get the bibcode of the record I'm merging
        try:
            system_number_fields = records[0][FIELD_TO_MARC['system number']]
            bibcode = bibrecord.field_get_subfield_values(
                system_number_fields[0], SYSTEM_NUMBER_SUBFIELD)[0]
        except:
            bibcode = 'Unknown'
        logger.warn(' Merging bibcode "%s".' % bibcode)
        # Get the merged record
        try:
            merged_records.append(merge_multiple_records(records))
        except Exception, error:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            str_error_to_print = exc_type.__name__ + '\t' + str(error) + \
                ' (Merger error)'
            logger.error(' Impossible to merge the record "%s" \t %s' %
                         (bibcode, str_error_to_print))
            records_with_merging_probl.append((bibcode, str_error_to_print))
def merge_record_with_template(rec, template_name):
    """ Extend the record rec with the contents of the template and return it"""
    template = get_record_template(template_name)
    if not template:
        return
    template_bibrec = create_record(template)[0]

    for field_tag in template_bibrec:
        if not record_has_field(rec, field_tag):
            for field_instance in template_bibrec[field_tag]:
                record_add_field(rec, field_tag, field_instance[1],
                                 field_instance[2],
                                 subfields=field_instance[0])
        else:
            for template_field_instance in template_bibrec[field_tag]:
                subfield_codes_template = field_get_subfield_codes(
                    template_field_instance)
                for field_instance in rec[field_tag]:
                    subfield_codes = field_get_subfield_codes(field_instance)
                    for code in subfield_codes_template:
                        if code not in subfield_codes:
                            field_add_subfield(
                                field_instance, code,
                                field_get_subfield_values(
                                    template_field_instance, code)[0])
    return rec
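A minimal usage sketch (not from the source) of merge_record_with_template(); the template name 'Default' is an assumption, and create_record() from invenio.bibrecord is used to build the recstruct the function expects:

from invenio.bibrecord import create_record

# parse a bare MARCXML record into the recstruct used throughout
marcxml = ('<record><datafield tag="245" ind1=" " ind2=" ">'
           '<subfield code="a">Some title</subfield>'
           '</datafield></record>')
rec = create_record(marcxml)[0]
# fields present in the 'Default' template but absent from rec get added;
# for shared tags, only the missing subfield codes are filled in
rec = merge_record_with_template(rec, 'Default')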
def merge_record_with_template(rec, template_name, is_hp_record=False):
    """ Extend the record rec with the contents of the template and return it"""
    template = get_record_template(template_name)
    if not template:
        return
    template_bibrec = create_record(template)[0]
    # if the record is a holding pen record make all subfields volatile
    if is_hp_record:
        record_make_all_subfields_volatile(template_bibrec)

    for field_tag in template_bibrec:
        if not record_has_field(rec, field_tag):
            for field_instance in template_bibrec[field_tag]:
                record_add_field(rec, field_tag, field_instance[1],
                                 field_instance[2],
                                 subfields=field_instance[0])
        else:
            for template_field_instance in template_bibrec[field_tag]:
                subfield_codes_template = field_get_subfield_codes(
                    template_field_instance)
                for field_instance in rec[field_tag]:
                    subfield_codes = field_get_subfield_codes(field_instance)
                    for code in subfield_codes_template:
                        if code not in subfield_codes:
                            field_add_subfield(
                                field_instance, code,
                                field_get_subfield_values(
                                    template_field_instance, code)[0])
    return rec
def merge_record_with_template(rec, template_name, is_hp_record=False):
    """ Extend the record rec with the contents of the template and return it"""
    template = get_record_template(template_name)
    if not template:
        return
    template_bibrec = create_record(template)[0]
    # if the record is a holding pen record make all subfields volatile
    if is_hp_record:
        record_make_all_subfields_volatile(template_bibrec)

    for field_tag in template_bibrec:
        if not record_has_field(rec, field_tag):
            for field_instance in template_bibrec[field_tag]:
                record_add_field(rec, field_tag, field_instance[1],
                                 field_instance[2],
                                 subfields=field_instance[0])
        else:
            for template_field_instance in template_bibrec[field_tag]:
                subfield_codes_template = field_get_subfield_codes(
                    template_field_instance)
                for field_instance in rec[field_tag]:
                    subfield_codes = field_get_subfield_codes(field_instance)
                    for code in subfield_codes_template:
                        if code not in subfield_codes:
                            field_add_subfield(
                                field_instance, code,
                                field_get_subfield_values(
                                    template_field_instance, code)[0])
    record_order_subfields(rec)
    return rec
def generate_ticket(ticket, record):
    """
    Generates a ticket to be created, filling subject, body and queue values
    of the passed BibCatalogTicket object. The enriched object is returned.

    @param ticket: a ticket object as created by BibCatalogTicket() containing
                   the subject, body and queue to create a ticket in.
    @type ticket: record object of BibCatalogTicket.

    @param record: a recstruct object as created by bibrecord.create_record()
    @type record: record object of BibRecord.

    @return: the modified ticket object to create.
    @rtype: BibCatalogTicket
    """
    recid = record_id_from_record(record)
    subject = []

    # Add report number in the subject
    report_number = ""
    for report_tag in record_get_field_instances(record, "037"):
        for report_number in field_get_subfield_values(report_tag, 'a'):
            subject.append(report_number)
        break

    subject.append("(#%s)" % (recid,))
    text = 'Curate record here: %s/record/edit/#state=edit&recid=%s' % \
           (CFG_SITE_SECURE_URL, recid)
    ticket.subject = " ".join(subject)
    ticket.body = text.replace('%', '%%')
    ticket.queue = "HEP_curation"
    return ticket
def _create_ticket(recid, bibcatalog_system, queue):
    subject = "Refs for #%s" % recid

    if CFG_INSPIRE_SITE:
        # Add report number in the subject
        report_number = ""
        record = get_bibrecord(recid)

        in_core = False
        for collection_tag in record_get_field_instances(record, "980"):
            for collection in field_get_subfield_values(collection_tag, 'a'):
                if collection == 'CORE':
                    in_core = True
                if collection == 'arXiv':
                    # Do not create tickets for arXiv papers
                    # Tickets for arXiv papers are created in bibcatalog
                    write_message("arXiv paper", verbose=1)
                    return

        # Only create tickets for CORE papers
        if not in_core:
            write_message("not in core papers", verbose=1)
            return

        # Do not create tickets for old records
        creation_date = run_sql("""SELECT creation_date FROM bibrec
                                   WHERE id = %s""", [recid])[0][0]
        if creation_date < datetime.now() - timedelta(days=30*4):
            return

        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, 'c'):
                if category.startswith('astro-ph'):
                    write_message("astro-ph", verbose=1)
                    # We do not curate astro-ph
                    return
            for report_number in field_get_subfield_values(report_tag, 'a'):
                subject += " " + report_number
            break

    text = '%s/record/edit/#state=edit&recid=%s' % (CFG_SITE_SECURE_URL, recid)
    bibcatalog_system.ticket_submit(subject=subject,
                                    queue=queue,
                                    text=text,
                                    recordid=recid)
def check_arxiv(recid):
    record = get_record(recid)
    for report_tag in record_get_field_instances(record, "037"):
        for category in field_get_subfield_values(report_tag, 'a'):
            if category.startswith('arXiv'):
                return True
    return False
def has_field_origin(field_list, origin, code):
    """
    This function checks if any of the fields for a certain tag
    contains origin in given subfield code. I.e. $9 arXiv.
    """
    for field in field_list:
        if origin in field_get_subfield_values(field, code):
            return True
    return False
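A hypothetical call with made-up data, assuming the bibrecord field tuple layout (subfields, ind1, ind2, controlfield value, global position) shown in the extend_author_field() docstring further down:

# a single 037 field whose $9 carries the origin
fields_037 = [([('a', 'arXiv:1234.5678'), ('9', 'arXiv')], ' ', ' ', '', 3)]
has_field_origin(fields_037, 'arXiv', '9')   # True
has_field_origin(fields_037, 'SPIRES', '9')  # False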
def record_in_collection(record, collection):
    """
    Returns True/False if given record is in a given
    collection (980__a).
    """
    for collection_tag in record_get_field_instances(record, "980"):
        for coll in field_get_subfield_values(collection_tag, 'a'):
            if coll.lower() == collection.lower():
                return True
    return False
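A hedged sketch with a hand-built recstruct; note the comparison is case-insensitive on both sides:

rec = {'980': [([('a', 'HEP')], ' ', ' ', '', 1)]}
record_in_collection(rec, 'hep')   # True
record_in_collection(rec, 'CORE')  # False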
def check_pub_year_consistency(merged_record, type_check):
    """Function that checks if the publication year is consistent
    with the year at the beginning of the bibcode"""
    logger.info(' running check_pub_year_consistency')
    # definition of the list of dates I don't want to check with this function
    dates_to_skip_from_check = ['date-preprint']
    try:
        system_number_fields = merged_record[FIELD_TO_MARC['system number']]
    except KeyError:
        manage_check_error('No System Number field!', type_check, logger)
        return None
    try:
        pub_dates_fields = merged_record[FIELD_TO_MARC['publication date']]
    except KeyError:
        manage_check_error('No Publication Date field!', type_check, logger)
        return None
    # the system number field should be unique, so if there is more than one
    # field, I have a problem (and I cannot proceed)
    if len(system_number_fields) > 1:
        manage_check_error('There is more than one System Number!',
                           type_check, logger)
        return None
    system_number = bibrecord.field_get_subfield_values(
        system_number_fields[0], SYSTEM_NUMBER_SUBFIELD)[0]
    num_dates_checked = 0
    for date_type_string in PUBL_DATE_TYPE_VAL_SUBFIELD:
        # I don't want to check the preprint date
        if date_type_string in dates_to_skip_from_check:
            continue
        # then I have to extract the right date
        # (there can be different ones in the same field)
        pubdate = ''
        for field in pub_dates_fields:
            if bibrecord.field_get_subfield_values(field, PUBL_DATE_TYPE_SUBFIELD)[0] == date_type_string:
                pubdate = bibrecord.field_get_subfield_values(field, PUBL_DATE_SUBFIELD)[0]
                break
        if len(pubdate) != 0:
            num_dates_checked += 1
        else:
            continue
        # final part of the check
        if pubdate[0:4] != system_number[0:4]:
            manage_check_error('Year of "%s" not consistent with the main bibcode "%s"!'
                               % (date_type_string, system_number),
                               type_check, logger)
    if num_dates_checked == 0:
        manage_check_error('No dates available for this record!',
                           type_check, logger)
    return None
def format(bfo, limit, separator=' ; ', extension='[...]', print_links="yes"):
    """
    Prints the list of editors of a record.

    @param limit: the maximum number of editors to display
    @param separator: the separator between editors.
    @param extension: a text printed if more editors than 'limit' exist
    @param print_links: if yes, print the editors as HTML link to their publications
    """
    from urllib import quote
    from invenio.config import CFG_SITE_URL
    from invenio import bibrecord

    authors = bibrecord.record_get_field_instances(bfo.get_record(), '700')
    editors = [bibrecord.field_get_subfield_values(author, 'a')[0]
               for author in authors
               if len(bibrecord.field_get_subfield_values(author, "e")) > 0
               and bibrecord.field_get_subfield_values(author, "e")[0] == "ed."]

    if print_links.lower() == "yes":
        editors = ['<a href="' + CFG_SITE_URL + '/search?f=author&p=' +
                   quote(editor) + '&ln=' + bfo.lang + '">' + editor + '</a>'
                   for editor in editors if editor.strip()]

    if len(editors) == 0:
        beginning = ''
        ending = ''
    elif len(editors) == 1:
        beginning = ''
        ending = ' (ed.)'
    else:
        beginning = ''
        ending = ' (eds.)'

    if limit.isdigit() and len(editors) > int(limit):
        return beginning + separator.join(editors[:int(limit)]) + \
            extension + ending
    elif len(editors) > 0:
        return beginning + separator.join(editors) + ending
def check_arxiv_url(field, valid_arxiv_ids):
    url = field_get_subfield_values(field, 'u')
    if not url:
        return True
    url = url[0]
    # print 'url', url
    arxiv_id = extract_arxiv_id_from_url(url)
    # print 'id', arxiv_id
    if arxiv_id is None:
        return True
    else:
        return arxiv_id in valid_arxiv_ids
def get_name_variants(record):
    """
    Return indexable values in the 410 field.
    """
    name_variants = set()
    if '410' in record:
        fields = bibrecord.record_get_field_instances(record, '410')
        for field in fields:
            values = bibrecord.field_get_subfield_values(field, 'a')
            if values:
                if 'ADS' in bibrecord.field_get_subfield_values(field, '9'):
                    # Always index field with source ADS.
                    for value in values:
                        name_variants.add(value.decode('utf_8'))
                else:
                    # Disregard uppercase space-separated fields.
                    for value in values:
                        if not re.match('\s*[A-Z]+\s[A-Z ]+$', value):
                            name_variants.add(value.decode('utf_8'))
    return list(name_variants)
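A hypothetical recstruct to illustrate the two branches: values whose field carries $9 ADS are always indexed, while all-caps space-separated variants from other sources are filtered out:

fields_410 = {'410': [([('a', 'APJ LETT'), ('9', 'ADS')], ' ', ' ', '', 1),
                      ([('a', 'APJ L')], ' ', ' ', '', 2)]}
get_name_variants(fields_410)  # [u'APJ LETT']; 'APJ L' matches the all-caps filter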
def tarballs_by_recids(recids, sdir):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids.

    @param: recids (string): the record id or ids
    @param: sdir (string): where the tarballs should live

    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    list_of_ids = []
    if ',' in recids:
        recids = recids.split(',')
        for recid in recids:
            if '-' in recid:
                low, high = recid.split('-')
                recid = range(int(low), int(high))
                list_of_ids.extend(recid)
            else:
                recid = int(recid)
                list_of_ids.append(recid)
    else:
        if '-' in recids:
            low, high = recids.split('-')
            list_of_ids = range(int(low), int(high))
        else:
            list_of_ids = [int(recids)]

    arXiv_ids = []
    for recid in list_of_ids:
        rec = get_record(recid)
        for afieldinstance in record_get_field_instances(rec, tag='037'):
            if 'arXiv' == field_get_subfield_values(afieldinstance, '9')[0]:
                arXiv_id = field_get_subfield_values(afieldinstance, 'a')[0]
                arXiv_ids.append(arXiv_id)
    return tarballs_by_arXiv_id(arXiv_ids, sdir)
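A usage sketch with an assumed scratch directory; note that each "low-high" range expands via range(int(low), int(high)), which excludes the upper bound:

# "12,40-43,99" expands to [12, 40, 41, 42, 99]
tarballs = tarballs_by_recids('12,40-43,99', '/tmp/arxiv-tarballs')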
def format_element(bfo, limit, separator=' ; ', extension='[...]',
                   print_links="yes"):
    """
    Prints the list of editors of a record.

    @param limit: the maximum number of editors to display
    @param separator: the separator between editors.
    @param extension: a text printed if more editors than 'limit' exist
    @param print_links: if yes, print the editors as HTML link to their publications
    """
    from urllib import quote
    from invenio.config import CFG_BASE_URL
    from invenio.bibrecord import field_get_subfield_values, \
        record_get_field_instances

    authors = record_get_field_instances(bfo.get_record(), '100') + \
        record_get_field_instances(bfo.get_record(), '700')
    editors = [
        field_get_subfield_values(author, 'a')[0]
        for author in authors
        if len(field_get_subfield_values(author, "e")) > 0
        and field_get_subfield_values(author, "e")[0] == "ed."
    ]

    if print_links.lower() == "yes":
        editors = [
            '<a href="%s/search?f=author&p=%s&ln=%s">%s</a>'
            % (CFG_BASE_URL, quote(editor), bfo.lang, editor)
            for editor in editors
        ]

    if limit.isdigit() and len(editors) > int(limit):
        return separator.join(editors[:int(limit)]) + extension
    elif len(editors) > 0:
        return separator.join(editors)
def first_author_bibcode_consistency(merged_record, type_check):
    """Function that checks if the last letter of the main bibcode
    is consistent with the first letter of the first author"""
    logger.info(' running first_author_bibcode_consistency')
    bibstems_to_skip_from_check = ['QB']
    try:
        system_number_fields = merged_record[FIELD_TO_MARC['system number']]
    except KeyError:
        manage_check_error('No System Number field!', type_check, logger)
        return None
    try:
        first_author_fields = merged_record[FIELD_TO_MARC['first author']]
    except KeyError:
        manage_check_error('No First Author field!', type_check, logger)
        return None
    # the system number field should be unique, so if there is more than one
    # field, I have a problem (and I cannot proceed)
    if len(system_number_fields) > 1:
        manage_check_error('There is more than one System Number!',
                           type_check, logger)
        return None
    # the first author field should be unique, so if there is more than one
    # field, I have a problem (and I cannot proceed)
    if len(first_author_fields) > 1:
        manage_check_error('There is more than one First Author!',
                           type_check, logger)
        return None
    system_number = bibrecord.field_get_subfield_values(
        system_number_fields[0], SYSTEM_NUMBER_SUBFIELD)[0]
    first_author = bibrecord.field_get_subfield_values(
        first_author_fields[0], AUTHOR_NAME_SUBFIELD)[0]
    # If the bibcode has a bibstem to skip, I don't do anything
    for elem in bibstems_to_skip_from_check:
        if system_number[4:4+len(elem)] == elem:
            return None
    if first_author[0].lower() != system_number[-1].lower():
        # if the last letter of the system number is a dot,
        # then I want to give a different message
        if system_number[-1] == '.':
            manage_check_error('The main bibcode "%s" doesn\'t have an initial '
                               'even if there is a First Author "%s"!'
                               % (system_number, first_author),
                               type_check, logger)
        else:
            manage_check_error('First Author "%s" not consistent with the '
                               'main bibcode "%s"!'
                               % (first_author, system_number),
                               type_check, logger)
    return None
def get_origin(fields):
    """function that extracts the origin of a field"""
    origins = set()
    for field in fields:
        origins.update(bibrecord.field_get_subfield_values(field,
                                                           ORIGIN_SUBFIELD))
    if not origins:
        raise OriginNotFound(fields)
    elif len(origins) > 2:
        raise OriginNotFound(fields)
    origin = origins.pop().strip('; ')
    if not origin:
        raise OriginNotFound(fields)
    return origin
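A sketch of the happy path, assuming ORIGIN_SUBFIELD is '8' (its actual value is defined elsewhere in this module):

fields = [([('8', 'ARXIV; ')], ' ', ' ', '', 1)]
get_origin(fields)  # 'ARXIV', after .strip('; ')
# no origin subfield at all, more than two distinct origins, or an origin
# that is empty after stripping all raise OriginNotFound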
def papers_by_country_with_affs_csv(req, country):
    req.content_type = 'text/csv; charset=utf-8'
    req.headers_out['content-disposition'] = ('attachment; '
                                              'filename=papers_by_country.csv')
    ## print the list of links to the articles
    count = 1
    print >> req, country
    search = "100__w:'%s' OR 700__w:'%s'" % (country, country)
    res = perform_request_search(p='%s' % (search,))
    print >> req, "#;Title;Journal;DOI;Inspire record;Author;Affiliations"
    if len(res):
        for rec_id in res:
            author_count = 11
            rec = get_record(rec_id)
            title = ''
            authors = ''
            journal = ''
            doi = ''
            inspire_record = ''
            if '245' in rec:
                title = re.sub("<.*?>", "", rec['245'][0][0][0][1])
            for sub in rec['773'][0][0]:
                if 'p' in sub[0]:
                    journal = sub[1]
            doi = get_doi(rec_id)
            if '035' in rec:
                for f in rec['035'][0][0]:
                    if 'a' in f:
                        inspire_record = 'http://inspirehep.net/record/%s' % (f[1],)
            print >> req, "%s;%s;%s;%s;%s;;" % (count, title, journal, doi,
                                                inspire_record)
            if '100' in rec:
                author = rec['100'][0][0][0][1]
                affiliations = record_get_field_values(rec, tag='100',
                                                       code='v')
                print >> req, ";;;;;%s;%s" % (author, " | ".join(affiliations))
            if '700' in rec:
                for auth in rec['700']:
                    author = auth[0][0][1]
                    affiliations = field_get_subfield_values(auth, code='v')
                    print >> req, ";;;;;%s;%s" % (author,
                                                  " | ".join(affiliations))
            count += 1
def check_duplicate_normalized_author_names(fields1, fields2, final_result,
                                            type_check, subfield_list, tag):
    """
    Checks if there are authors with the same normalized name.
    This will prevent the correct matching of authors from one
    author list to the other.
    """
    logger.info(' running check_duplicate_normalized_author_names')
    author_names = set()
    for field in final_result:
        author = bibrecord.field_get_subfield_values(
            field, AUTHOR_NORM_NAME_SUBFIELD)[0]
        if author in author_names:
            # I don't raise an error if I have duplicated normalized author
            # names, I simply return the trusted list
            manage_check_error('Duplicated normalized author name for "%s" in field "%s".'
                               % (author, tag), type_check, logger)
        else:
            author_names.add(author)
    return None
def pub_date_merger(fields1, fields2, tag):
    """function to merge dates. the peculiarity of this merge is that we need
    to create a new field based on which date is available"""
    all_dates = take_all_no_checks(fields1, fields2, tag)
    if len(all_dates) > 0:
        # removing the main-date if present
        for date in all_dates:
            if bibrecord.field_get_subfield_values(date, PUBL_DATE_TYPE_SUBFIELD)[0] == 'main-date':
                logger.info(' Main date already available: trying to re-create it')
                del(all_dates[all_dates.index(date)])
                break
        # I need to extract the best date available
        main_pub_date = None
        main_pub_date_primary = 'False'
        # first I try to extract it from the canonical metadata
        done = False
        for date_type in PUBL_DATE_TYPE_VAL_SUBFIELD:
            if done:
                break
            for date in all_dates:
                if bibrecord.field_get_subfield_values(date, PUBL_DATE_TYPE_SUBFIELD)[0] == date_type \
                        and bibrecord.field_get_subfield_values(date, PRIMARY_METADATA_SUBFIELD)[0] == 'True':
                    main_pub_date = bibrecord.field_get_subfield_values(date, PUBL_DATE_SUBFIELD)[0]
                    main_pub_date_primary = 'True'
                    done = True
                    break
        # if I'm not successful, I try with normal metadata
        if main_pub_date == None:
            done = False
            for date_type in PUBL_DATE_TYPE_VAL_SUBFIELD:
                if done:
                    break
                for date in all_dates:
                    if bibrecord.field_get_subfield_values(date, PUBL_DATE_TYPE_SUBFIELD)[0] == date_type:
                        main_pub_date = bibrecord.field_get_subfield_values(date, PUBL_DATE_SUBFIELD)[0]
                        done = True
                        break
        # if I still don't have a main date, it means that I have a date that
        # is not in the list of expected dates, so I take the first one
        # P.S. I should never get to this point
        if main_pub_date == None:
            logger.info(' All the dates available are not recognized as good for a main date: picking the first available')
            main_pub_date = bibrecord.field_get_subfield_values(all_dates[0], PUBL_DATE_SUBFIELD)[0]
        # finally I append the main date to the list of dates
        all_dates.append(([(PUBL_DATE_SUBFIELD, main_pub_date),
                           (PUBL_DATE_TYPE_SUBFIELD, 'main-date'),
                           (ORIGIN_SUBFIELD, 'ADS metadata'),
                           (PRIMARY_METADATA_SUBFIELD, main_pub_date_primary)],
                          ) + all_dates[0][1:])
        return all_dates
    else:
        return all_dates
def merge_record_with_template(rec, template_name):
    """ Extend the record rec with the contents of the template and return it"""
    template = get_record_template(template_name)
    template_bibrec = create_record(template)[0]

    for field_tag in template_bibrec:
        if not record_has_field(rec, field_tag):
            for field_instance in template_bibrec[field_tag]:
                record_add_field(rec, field_tag, field_instance[1],
                                 field_instance[2],
                                 subfields=field_instance[0])
        else:
            for template_field_instance in template_bibrec[field_tag]:
                subfield_codes_template = field_get_subfield_codes(
                    template_field_instance)
                for field_instance in rec[field_tag]:
                    subfield_codes = field_get_subfield_codes(field_instance)
                    for code in subfield_codes_template:
                        if code not in subfield_codes:
                            field_add_subfield(
                                field_instance, code,
                                field_get_subfield_values(
                                    template_field_instance, code)[0])
    return rec
def extend_author_field(author_field, cds_id):
    """Extend author datafield by CDS authority id and Beard tag.

    Extends the author datafield by the MARC subfields
        $$0:AUTHOR|(CDS)<cds_id>
        $$9:#BEARD#
    if $$0:AUTHOR|(CDS)<cds_id> does not exist in `author_field`.

    :param author_field:
        Example:
            # from invenio.search_engine import get_record
            # from invenio.bibrecord import record_get_field_instances
            # record = get_record(2150939)
            # author_field = record_get_field_instances(record, "100")[0]
            author_field = ([('a', 'Ellis, John'),
                             ('u', "King's Coll. London"),
                             ('u', 'CERN')],
                            ' ', ' ', '', 32)
    :param str cds_id: sequence of numbers representing the CDS id
        Example:
            cds_id = '2108556'
    :result:
        Example:
            author_field = ([('a', 'Ellis, John'),
                             ('u', "King's Coll. London"),
                             ('u', 'CERN'),
                             ('0', 'AUTHOR|(CDS)2108556'),
                             ('9', '#BEARD#')],
                            ' ', ' ', '', 32)
    :return: True, if `author_field` has been updated, False otherwise
    """
    cds_authority_id = "AUTHOR|(CDS){0}".format(cds_id)
    if cds_authority_id not in field_get_subfield_values(author_field, '0'):
        field_add_subfield(author_field, "0", cds_authority_id)
        field_add_subfield(author_field, "9", "#BEARD#")
        return True
    return False
def task_run_core():
    """
    Performs a search to find records without a texkey, generates a new
    one and uploads the changes in chunks
    """
    recids = perform_request_search(p='-035:spirestex -035:inspiretex',
                                    cc='HEP')
    write_message("Found %s records to assign texkeys" % len(recids))
    processed_recids = []
    xml_to_process = []
    for count, recid in enumerate(recids):
        write_message("processing recid %s" % recid)

        # Check that the record does not already have a texkey
        has_texkey = False
        recstruct = get_record(recid)
        for instance in record_get_field_instances(recstruct, tag="035",
                                                   ind1="", ind2=""):
            try:
                provenance = field_get_subfield_values(instance, "9")[0]
            except IndexError:
                provenance = ""
            try:
                value = field_get_subfield_values(instance, "z")[0]
            except IndexError:
                try:
                    value = field_get_subfield_values(instance, "a")[0]
                except IndexError:
                    value = ""
            provenances = ["SPIRESTeX", "INSPIRETeX"]
            if provenance in provenances and value:
                has_texkey = True
                write_message("INFO: Record %s already has texkey %s"
                              % (recid, value))

        if not has_texkey:
            TexKeySeq = TexkeySeq()
            new_texkey = ""
            try:
                new_texkey = TexKeySeq.next_value(recid)
            except TexkeyNoAuthorError:
                write_message("WARNING: Record %s has no first author or collaboration"
                              % recid)
                continue
            except TexkeyNoYearError:
                write_message("WARNING: Record %s has no year" % recid)
                continue
            write_message("Created texkey %s for record %d"
                          % (new_texkey, recid))
            xml = create_xml(recid, new_texkey)
            processed_recids.append(recid)
            xml_to_process.append(xml)

        task_update_progress("Done %d out of %d." % (count, len(recids)))
        task_sleep_now_if_required()

    # sequence ID to be used in all subsequent tasks
    sequence_id = str(random.randrange(1, 4294967296))
    if xml_to_process:
        process_chunk(xml_to_process, sequence_id)

    # Finally, index all the records processed
    # FIXME: Waiting for sequence id to be fixed
    # if processed_recids:
    #     submit_bibindex_task(processed_recids, sequence_id)

    return True
def tarballs_by_recids(recids, sdir, docname=None, doctype=None,
                       docformat=None):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids. By default look for files with names matching
    the report number and with source field 'arXiv'. This can be changed
    with C{docname}, C{doctype}, C{docformat}

    @param: recids (string): the record id or ids
    @param: sdir (string): where the tarballs should live
    @param docname: select tarball for given recid(s) that match docname
    @param doctype: select tarball for given recid(s) that match doctype
    @param docformat: select tarball for given recid(s) that match docformat

    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    if not recids:
        return []

    list_of_ids = []
    if ',' in recids:
        recids = recids.split(',')
        for recid in recids:
            if '-' in recid:
                low, high = recid.split('-')
                recid = range(int(low), int(high))
                list_of_ids.extend(recid)
            else:
                recid = int(recid)
                list_of_ids.append(recid)
    else:
        if '-' in recids:
            low, high = recids.split('-')
            list_of_ids = range(int(low), int(high))
        else:
            list_of_ids = [int(recids)]

    arXiv_ids = []
    local_files = []
    for recid in list_of_ids:
        rec = get_record(recid)
        if not doctype and not docname and not docformat:
            for afieldinstance in record_get_field_instances(rec, tag='037'):
                if len(field_get_subfield_values(afieldinstance, '9')) > 0:
                    if 'arXiv' == field_get_subfield_values(
                            afieldinstance, '9')[0]:
                        arXiv_id = field_get_subfield_values(
                            afieldinstance, 'a')[0]
                        arXiv_ids.append(arXiv_id)
        else:
            bibarchive = BibRecDocs(recid)
            all_files = bibarchive.list_latest_files()
            if doctype:
                all_files = [docfile for docfile in all_files
                             if docfile.get_type() == doctype]
            if docname:
                all_files = [docfile for docfile in all_files
                             if docfile.get_name() == docname]
            if docformat:
                all_files = [docfile for docfile in all_files
                             if docfile.get_format() == docformat]
            local_files.extend([(docfile.get_path(), recid)
                                for docfile in all_files])

    if doctype or docname or docformat:
        return local_files

    return tarballs_by_arXiv_id(arXiv_ids, sdir)
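A hedged call of this extended variant; when any filter is given it returns (path, recid) tuples from BibDocFile instead of tarball locations, and the doctype value 'arXiv' here is an assumption about the local document-type setup:

files = tarballs_by_recids('12', '/tmp/arxiv-tarballs', doctype='arXiv')
# -> [('/path/to/docfile', 12), ...] rather than tarballs_by_arXiv_id() output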
"9") and has_field_origin( existing_field_list, "arXiv", "9"): fields_to_correct.append((tag, [field])) else: holdingpen = True # Check for duplicates and add title update as 246 field_list_246 = record_get_field_instances( existing_record, "246") if not has_field(field, field_list_246): fields_to_add.append(("246", [field])) else: corrected_fields = [] if has_field_origin(new_field_list, "arXiv", "9") \ and has_field_origin(existing_field_list, "arXiv", "9"): for field in existing_field_list: if not "arXiv" in field_get_subfield_values( field, "9"): corrected_fields.append(field) for field in new_field_list: if not has_field(field, corrected_fields): corrected_fields.append(field) action = get_action(tag, diff_code, action_dict) if action == 'holdingpen' and not holdingpen: holdingpen = True if action == 'correct' or len(corrected_fields) > 0: if len(corrected_fields) == 0: corrected_fields = new_field_list fields_to_correct.append((tag, corrected_fields)) if action == 'append':
    recid = None
else:
    recid = retrieve_rec_id(record, "")
if not recid or recid == -1:
    # Try again with p_r_s
    arxiv_id = get_minimal_arxiv_id(record)
    if arxiv_id:
        results = perform_request_search(p="reportnumber:%s" % (arxiv_id,),
                                         of='id')
        if len(results) > 0:
            # FIXME: Ambiguous results may happen. Now just taking first result..
            recid = results[0]

# 773 RefExtract PubNote extraction
for field in record_get_field_instances(record, '773'):
    for value in field_get_subfield_values(field, 'x'):
        extract = extract_journal_reference(value)
        if extract:
            subfields = [('x', value)]
            if extract.get('volume', False):
                subfields.append(('v', str(extract['volume'])))
            if extract.get('title', False):
                subfields.append(('p', str(extract['title'])))
            if extract.get('year', False):
                subfields.append(('y', str(extract['year'])))
            if extract.get('page', False):
                subfields.append(('c', str(extract['page'])))
            new_field = create_field(subfields, global_position=field[4])
            record_replace_field(record, '773', new_field, field[4])
        break
def is_pdg_field(field):
    # membership test instead of [0]: fields without $2 or $9 would
    # otherwise raise IndexError on the bare [0] lookup
    if 'PDG' in field_get_subfield_values(field, '2'):
        if 'PDG' in field_get_subfield_values(field, '9'):
            return True
    return False
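A hypothetical 084 field satisfying both checks; 'S027.1' is a made-up stand-in for a PDG keyword value:

pdg_field = ([('2', 'PDG'), ('9', 'PDG'), ('a', 'S027.1')], ' ', ' ', '', 7)
is_pdg_field(pdg_field)  # True
is_pdg_field(([('a', 'something else')], ' ', ' ', '', 8))  # False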
def update_record(record_id, authors):
    """Update authors in CDS record.

    :param int record_id: record to update author datafields
        Example:
            record_id = 2150939
    :param dict authors: dictionary where keys are author full names and
        values the CDS profile ids to be updated in the given record
        Example:
            authors = {'Ellis, John': '2108556'}

    :return: string representing the record XML element containing author
        (`100`) and/or co-author (`700`) datafields. Empty string if nothing
        to update
        Example:
            '<record>
               <controlfield tag="001">2150939</controlfield>
               <datafield tag="100" ind1=" " ind2=" ">
                 <subfield code="a">Ellis, John</subfield>
                 <subfield code="u">King's Coll. London</subfield>
                 <subfield code="u">CERN</subfield>
                 <subfield code="0">AUTHOR|(CDS)2108556</subfield>
                 <subfield code="9">#BEARD#</subfield>
               </datafield>
             </record>'
    """
    record = get_record(record_id)
    record_author = record_get_field_instances(record, "100")
    record_coauthors = record_get_field_instances(record, "700")

    if len(record_author) > 1:
        print ("Oops: several '100' (main author) fields have been found in "
               "record '{0}'".format(record_id))
        return ""

    datafields = ""
    author = False

    for author_field in record_author:
        try:
            author_name = field_get_subfield_values(author_field, 'a')[0]
            try:
                cds_id = authors[author_name]
                if extend_author_field(author_field, cds_id):
                    datafields += field_xml_output(author_field, "100")
                    author = True
            except KeyError:
                pass
        except IndexError:
            # Author field (`100`) does not have subfield `a`
            pass

    if len(authors) > 1 or not author:
        for coauthor_field in record_coauthors:
            try:
                coauthor_name = field_get_subfield_values(
                    coauthor_field, 'a')[0]
                try:
                    cds_id = authors[coauthor_name]
                    if extend_author_field(coauthor_field, cds_id):
                        author = True
                except KeyError:
                    pass
            except IndexError:
                # Co-author field (`700`) does not have subfield `a`
                pass
            datafields += field_xml_output(coauthor_field, "700")

    # Nothing to update
    if not author:
        # print "No authors to update in record '{0}'".format(record_id)
        return ""

    record = ('<record><controlfield tag="001">{0}</controlfield>{1}'
              '</record>'.format(record_id, datafields))
    return record
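A minimal driver sketch, reusing the record id and CDS id from the docstring example above:

xml = update_record(2150939, {'Ellis, John': '2108556'})
if xml:
    print xml  # the <record> XML string built above, or "" if nothing changed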
def task_run_core(name=NAME):
    """
    Performs a search to find records without a texkey, generates a new
    one and uploads the changes in chunks
    """
    recids = task_get_task_param('recids')
    if recids:
        start_date = None
        write_message("processing recids from commandline")
    else:
        start_date = datetime.now()
        recids = intbitset()
        recids |= intbitset(perform_request_search(
            p='-035:spirestex -035:inspiretex', cc='HEP'))
        if task_get_task_param('all'):
            write_message("processing all records without texkey")
        else:
            _, last_date = fetch_last_updated(name)
            recids = recids & fetch_records_modified_since(last_date)
            write_message("processing records modified since: %s" % last_date)

    write_message("Found %s records to assign texkeys" % len(recids))
    processed_recids = []
    xml_to_process = []
    for count, recid in enumerate(recids):
        write_message("processing recid %s" % recid)

        # Check that the record does not already have a texkey
        has_texkey = False
        recstruct = get_record(recid)
        for instance in record_get_field_instances(recstruct, tag="035",
                                                   ind1="", ind2=""):
            try:
                provenance = field_get_subfield_values(instance, "9")[0]
            except IndexError:
                provenance = ""
            try:
                value = field_get_subfield_values(instance, "a")[0]
            except IndexError:
                value = ""
            provenances = ["SPIRESTeX", "INSPIRETeX"]
            if provenance in provenances and value:
                has_texkey = True
                write_message("INFO: Record %s already has texkey %s"
                              % (recid, value))

        if not has_texkey:
            TexKeySeq = TexkeySeq()
            new_texkey = ""
            try:
                new_texkey = TexKeySeq.next_value(recid)
            except TexkeyNoAuthorError:
                write_message("WARNING: Record %s has no first author or collaboration"
                              % recid)
                continue
            except TexkeyNoYearError:
                write_message("WARNING: Record %s has no year" % recid)
                continue
            write_message("Created texkey %s for record %d"
                          % (new_texkey, recid))
            xml = create_xml(recid, new_texkey)
            processed_recids.append(recid)
            xml_to_process.append(xml)

        task_update_progress("Done %d out of %d." % (count, len(recids)))
        task_sleep_now_if_required()

    # sequence ID to be used in all subsequent tasks
    sequence_id = str(random.randrange(1, 4294967296))
    if xml_to_process:
        process_chunk(xml_to_process, sequence_id)

    # Finally, index all the records processed
    # FIXME: Waiting for sequence id to be fixed
    # if processed_recids:
    #     submit_bibindex_task(processed_recids, sequence_id)

    if start_date:
        store_last_updated(0, start_date, name)

    return True
def merge_creation_modification_dates(merged_record):
    """Function that grabs all the origins in the merged record and creates
    a merged version of the creation and modification date based only on
    the found origins"""
    # I create a local copy to avoid problems
    record = deepcopy(merged_record)
    # I extract all the creation and modification dates
    try:
        creat_mod = record[FIELD_TO_MARC['creation and modification date']]
    except KeyError:
        logger.warning(' No Creation-Modification field available!')
        return record
    # then I extract all the origins from all the fields but the creation
    # and modification date
    origins = []
    for field_code in record:
        if field_code != FIELD_TO_MARC['creation and modification date']:
            for field in record[field_code]:
                try:
                    origin = bibrecord.field_get_subfield_values(
                        field, ORIGIN_SUBFIELD)[0]
                    if origin != '':
                        origins.append(origin)
                # if there is no origin this is a problem,
                # but I don't have to manage it here
                except IndexError:
                    pass
    # I unique the list
    origins = list(set(origins))
    # then for each field in creation and modification date I check if it has
    # an origin used in other fields, and if so I update creation and
    # modification dates
    new_creation_modification_date = {}
    for field in creat_mod:
        try:
            origin = bibrecord.field_get_subfield_values(
                field, ORIGIN_SUBFIELD)[0]
        except IndexError:
            origin = ''
        if origin in origins:
            # I have to put or update the creation and modification date
            if len(new_creation_modification_date) == 0:
                # if there is no creation or modification date
                # I simply insert the field
                new_creation_modification_date[CREATION_DATE_SUBFIELD] = \
                    bibrecord.field_get_subfield_values(field, CREATION_DATE_SUBFIELD)[0]
                new_creation_modification_date[MODIFICATION_DATE_SUBFIELD] = \
                    bibrecord.field_get_subfield_values(field, MODIFICATION_DATE_SUBFIELD)[0]
                new_creation_modification_date[ORIGIN_SUBFIELD] = origin
                new_creation_modification_date['origin_importance'] = \
                    get_origin_importance(FIELD_TO_MARC['creation and modification date'], origin)
            else:
                # otherwise I have to check which one is the oldest for
                # creation and newest for modification
                old_creation = new_creation_modification_date[CREATION_DATE_SUBFIELD]
                # NB: the original read CREATION_DATE_SUBFIELD here and in the
                # assignment below, clobbering the creation date with the
                # modification comparison
                old_modification = new_creation_modification_date[MODIFICATION_DATE_SUBFIELD]
                new_creation = bibrecord.field_get_subfield_values(field, CREATION_DATE_SUBFIELD)[0]
                new_modification = bibrecord.field_get_subfield_values(field, MODIFICATION_DATE_SUBFIELD)[0]
                new_creation_modification_date[CREATION_DATE_SUBFIELD] = \
                    old_creation if old_creation <= new_creation else new_creation
                new_creation_modification_date[MODIFICATION_DATE_SUBFIELD] = \
                    old_modification if old_modification >= new_modification else new_modification
                # then at the end I put as origin the most trusted origin
                old_origin = new_creation_modification_date[ORIGIN_SUBFIELD]
                old_origin_import = new_creation_modification_date['origin_importance']
                new_origin_import = get_origin_importance(FIELD_TO_MARC['creation and modification date'], origin)
                new_creation_modification_date[ORIGIN_SUBFIELD] = \
                    old_origin if old_origin_import >= new_origin_import else origin
                new_creation_modification_date['origin_importance'] = \
                    old_origin_import if old_origin_import >= new_origin_import else new_origin_import
    # then I update the field
    record[FIELD_TO_MARC['creation and modification date']] = \
        [([(MODIFICATION_DATE_SUBFIELD, new_creation_modification_date[MODIFICATION_DATE_SUBFIELD]),
           (CREATION_DATE_SUBFIELD, new_creation_modification_date[CREATION_DATE_SUBFIELD]),
           (ORIGIN_SUBFIELD, new_creation_modification_date[ORIGIN_SUBFIELD])],
          ) + creat_mod[0][1:]]
    return record
def convert_record(rec): "Convert single REC record to JSON." jrec = {} # recid / 001 recid = record_get_field_value(rec, tag="001") if recid: jrec['recid'] = record_get_field_value(rec, tag="001") # doi / 0247 $2 DOI dois = filter_field_instances(record_get_field_instances(rec, tag="024", ind1="7"), "2", "DOI") if dois: jrec['doi'] = field_get_subfield_values(dois[0], "a")[0] # CMS ConfDB ID / 035 cmsconfdbids = filter_field_instances(record_get_field_instances(rec, tag="035"), "9", "CMS-ConfDB") if cmsconfdbids: jrec['cms_confdb_id'] = field_get_subfield_values(cmsconfdbids[0], "a")[0] # report number / 037 # authors / 100, 700 authors = [] for field_instance in record_get_field_instances(rec, tag="100") + record_get_field_instances(rec, tag="700"): author = {} author_names = field_get_subfield_values(field_instance, "a") if author_names: author['name'] = author_names[0] author_ccids = field_get_subfield_values(field_instance, "h") if author_ccids: author['ccid'] = author_ccids[0] author_inspireids = field_get_subfield_values(field_instance, "i") if author_inspireids: author['inspireid'] = author_inspireids[0] author_affiliations = field_get_subfield_values(field_instance, "u") if author_affiliations: author['affiliation'] = author_affiliations[0] authors.append(author) if authors: jrec['authors'] = authors # collaboration / 110, 710 collaboration_name = record_get_field_value(rec, tag="110", code="a") collaboration_name_additionals = record_get_field_values(rec, tag="710", code="a") collaboration_group = record_get_field_value(rec, tag="110", code="g") collaboration_recid = record_get_field_value(rec, tag="110", code="w") if collaboration_name or collaboration_group or collaboration_recid: collaboration = {} if collaboration_name: for collaboration_name_additional in collaboration_name_additionals: collaboration_name += ' and ' + collaboration_name_additional collaboration['name'] = collaboration_name if collaboration_group: collaboration['group'] = collaboration_group if collaboration_recid: collaboration['recid'] = collaboration_recid jrec['collaboration'] = collaboration # title / 245 $a title = record_get_field_value(rec, tag="245", code="a") if title: jrec['title'] = title # title subtitle / 245 $b title_subtitle = record_get_field_value(rec, tag="245", code="b") if title_subtitle: jrec['title_subtitle'] = title_subtitle # title additional / 246 $a title_additional = record_get_field_value(rec, tag="246", code="a") if title_additional: jrec['title_additional'] = title_additional # title additional subtitle / 246 $b title_additional_subtitle = record_get_field_value(rec, tag="246", code="b") if title_additional_subtitle: jrec['title_additional_subtitle'] = title_additional_subtitle # publisher / 260 publisher = record_get_field_value(rec, tag="260", code="b") if publisher: jrec['publisher'] = publisher # date_published / 260 date_published = record_get_field_value(rec, tag="260", code="c") if date_published: jrec['date_published'] = date_published # date_created, date_reprocessed / 264 date_created = record_get_field_value(rec, tag="264", ind2="0", code="c") if date_created: jrec['date_created'] = date_created # date_reprocessed / 960 date_reprocessed = record_get_field_value(rec, tag="960", code="c") if date_reprocessed: jrec['date_reprocessed'] = date_reprocessed # prepublication with reportnumber / 269, 037 prepublication = {} for field_instance in record_get_field_instances(rec, tag="269"): prepublication_places = field_get_subfield_values(field_instance, "a") if 
        if prepublication_places:
            prepublication['place'] = prepublication_places[0]
        prepublication_publishers = field_get_subfield_values(field_instance, "b")
        if prepublication_publishers:
            prepublication['publisher'] = prepublication_publishers[0]
        prepublication_dates = field_get_subfield_values(field_instance, "c")
        if prepublication_dates:
            prepublication_time = time.strptime(prepublication_dates[0], "%d %b %Y")
            prepublication['date'] = time.strftime("%Y-%m-%d", prepublication_time)
    prepublication_reportnumber = record_get_field_value(rec, tag="037", code="a")
    if prepublication_reportnumber:
        prepublication['report_number'] = prepublication_reportnumber
    if prepublication:
        jrec['prepublication'] = prepublication
    # pileup / 770
    pileup = {}
    pileup_description = record_get_field_value(rec, tag="770", code="i")
    if pileup_description:
        pileup = {'description': pileup_description}
    for field_instance in record_get_field_instances(rec, tag="770"):
        field_instance_recids = field_get_subfield_values(field_instance, 'w')
        field_instance_titles = field_get_subfield_values(field_instance, 'a')
        pileup_link = {}
        if field_instance_recids:
            pileup_link['recid'] = field_instance_recids[0]
        if field_instance_titles:
            pileup_link['title'] = field_instance_titles[0]
        if pileup_link:
            if 'links' in pileup:
                pileup['links'].append(pileup_link)
            else:
                pileup['links'] = [pileup_link, ]
    if pileup:  # only emit the field when some pileup information was found
        jrec['pileup'] = pileup
    # extent / 300
    extent = record_get_field_value(rec, tag="300", code="a")
    if extent and False:  # we decided not to retain extent field in COD3
        jrec['extent'] = extent
    # dataset_semantics / 505
    dataset_semantics = []
    for field_instance in record_get_field_instances(rec, tag="505"):
        entry = {}
        entry_variables = field_get_subfield_values(field_instance, "t")
        if entry_variables:
            entry['variable'] = entry_variables[0]
        entry_descriptions = field_get_subfield_values(field_instance, "g")
        if entry_descriptions:
            entry['description'] = entry_descriptions[0]
        dataset_semantics.append(entry)
    if dataset_semantics:
        jrec['dataset_semantics'] = dataset_semantics
    # collections / 980
    collections = record_get_field_values(rec, tag="980", code="a")
    collections.extend(record_get_field_values(rec, tag="980", code="b"))
    collections.extend(record_get_field_values(rec, tag="980", code="c"))
    if 'DELETED' in collections:
        return {}  # record was deleted
    if 'Education' in collections:
        collections.remove('Education')
    if 'Research' in collections:
        collections.remove('Research')
    if collections:
        jrec['collections'] = collections
    # distribution / 256
    distribution = {}
    distribution_number_events = sum([int(x) for x in record_get_field_values(rec, tag="256", code="e")])
    if distribution_number_events:
        distribution['number_events'] = distribution_number_events
    distribution_number_files = sum([int(x) for x in record_get_field_values(rec, tag="256", code="f")])
    if distribution_number_files:
        distribution['number_files'] = distribution_number_files
    distribution_size = sum([int(x) for x in record_get_field_values(rec, tag="256", code="b")])
    if distribution_size:
        distribution['size'] = distribution_size
    formats = []
    file_urls = ' '.join(record_get_field_values(rec, tag="856", ind1="7", code="u"))
    if '.root' in file_urls:
        formats.append('root')
    if '/AOD/' in file_urls:
        formats.append('aod')
    if '/AODSIM/' in file_urls:
        formats.append('aodsim')
    if '/RAW/' in file_urls:
        formats.append('raw')
tag="856", ind1="7", code="u")): formats.append('raw') if 'OPERA' in ' '.join(collections): formats.append('csv') if '.tar.gz' in ' '.join(record_get_field_values(rec, tag="856", ind1="7", code="u")): formats.append('gz') fft_extensions = [] for fft in record_get_field_values(rec, tag="FFT", code="a"): fft_basename, fft_extension = os.path.splitext(fft) if 'file-indexes' in fft_basename: continue if fft_extension == '.configFile': fft_extension = '.py' if fft_extension not in fft_extensions: fft_extensions.append(fft_extension[1:]) for fft_extension in fft_extensions: if fft_extension: if not fft_extension in formats: formats.append(fft_extension) if formats: distribution['formats'] = formats if distribution: jrec['distribution'] = distribution # system_details / 538 system_details = {} system_details_release = record_get_field_value(rec, tag="538", code="a") if system_details_release: system_details_release = system_details_release.replace('Recommended release for analysis: ', '') system_details_release = system_details_release.replace('Recommended Software Release: ', '') system_details_release = system_details_release.replace('Software release: ', '') system_details_release = system_details_release.replace('Release: ', '') system_details['release'] = system_details_release system_details_global_tag = record_get_field_value(rec, tag="538", code="b") if system_details_global_tag: system_details_global_tag = system_details_global_tag.replace('Global tag: ', '') system_details['global_tag'] = system_details_global_tag system_details_description = record_get_field_value(rec, tag="538", code="i") if system_details_description: system_details['description'] = system_details_description system_details_url = record_get_field_value(rec, tag="538", code="u") if system_details_url: system_details['url'] = system_details_url system_details_recid = record_get_field_value(rec, tag="538", code="w") if system_details_recid: system_details['recid'] = system_details_recid if system_details: jrec['system_details'] = system_details # abstract / 520 abstract_description = record_get_field_value(rec, tag="520", code="a") if abstract_description: if 'http://opendata.cern.ch/' in abstract_description: abstract_description = abstract_description.replace('http://opendata.cern.ch/', '/') abstract = {'description': abstract_description} abstract_links = [] for field_instance in record_get_field_instances(rec, tag="520"): field_instance_recids = field_get_subfield_values(field_instance, 'w') field_instance_urls = field_get_subfield_values(field_instance, 'u') abstract_link = {} if field_instance_recids: abstract_link['recid'] = field_instance_recids[0] if field_instance_urls: field_instance_url = field_instance_urls[0] if field_instance_url.startswith('http://opendata.cern.ch/'): field_instance_url = field_instance_url.replace('http://opendata.cern.ch/', '/') abstract_link['url'] = field_instance_url if abstract_link: if abstract.has_key('links'): abstract['links'].append(abstract_link) else: abstract['links'] = [abstract_link, ] jrec['abstract'] = abstract # methodology / 567 methodology_description = record_get_field_value(rec, tag="567", code="a") if methodology_description: if 'http://opendata.cern.ch/' in methodology_description: methodology_description = methodology_description.replace('http://opendata.cern.ch/', '/') methodology = {'description': methodology_description} methodology_links = [] for field_instance in record_get_field_instances(rec, tag="567"): field_instance_recids = 
            field_instance_urls = field_get_subfield_values(field_instance, 'u')
            methodology_link = {}
            if field_instance_recids:
                methodology_link['recid'] = field_instance_recids[0]
            if field_instance_urls:
                field_instance_url = field_instance_urls[0]
                if field_instance_url.startswith('http://opendata.cern.ch/'):
                    field_instance_url = field_instance_url.replace('http://opendata.cern.ch/', '/')
                methodology_link['url'] = field_instance_url
            if methodology_link:
                if 'links' in methodology:
                    methodology['links'].append(methodology_link)
                else:
                    methodology['links'] = [methodology_link, ]
        jrec['methodology'] = methodology
    # license / 540
    license_attribution = record_get_field_value(rec, tag="540", code="a")
    if license_attribution:
        jrec['license'] = {'attribution': license_attribution}
    # validation / 583
    validation_description = record_get_field_value(rec, tag="583", code="a")
    if validation_description:
        validation = {'description': validation_description}
        for field_instance in record_get_field_instances(rec, tag="583"):
            field_instance_recids = field_get_subfield_values(field_instance, 'w')
            field_instance_urls = field_get_subfield_values(field_instance, 'u')
            field_instance_descriptions = field_get_subfield_values(field_instance, 'y')
            validation_link = {}
            if field_instance_recids:
                validation_link['recid'] = field_instance_recids[0]
            if field_instance_urls:
                field_instance_url = field_instance_urls[0]
                if field_instance_url.startswith('http://opendata.cern.ch/'):
                    field_instance_url = field_instance_url.replace('http://opendata.cern.ch/', '/')
                validation_link['url'] = field_instance_url
            if field_instance_descriptions:
                validation_link['description'] = field_instance_descriptions[0]
            if validation_link:
                if 'links' in validation:
                    validation['links'].append(validation_link)
                else:
                    validation['links'] = [validation_link, ]
        jrec['validation'] = validation
    # use_with / 516
    use_with_description = record_get_field_value(rec, tag="516", code="a")
    if int(recid) == 221 and \
       use_with_description == 'Use this with 2011 CMS open data':
        use_with_description = 'Use this with 2011 and 2012 CMS open data'
    if use_with_description:
        if int(recid) == 560:
            use_with_description = use_with_description.replace('http://opendata.cern.ch/', '/')
        use_with = {'description': use_with_description}
        for field_instance in record_get_field_instances(rec, tag="516"):
            field_instance_recids = field_get_subfield_values(field_instance, 'w')
            field_instance_urls = field_get_subfield_values(field_instance, 'u')
            field_instance_descriptions = field_get_subfield_values(field_instance, 'y')
            # workaround for one record:
            if not field_instance_recids and field_instance_urls and \
               field_instance_urls[0] == 'http://opendata.cern.ch/record/14':
                field_instance_recids = ["14", ]
            for field_instance_recid in field_instance_recids:
                use_with_link = {}
                use_with_link['recid'] = field_instance_recid
                if field_instance_urls:
                    field_instance_url = field_instance_urls[0]
                    if field_instance_url.startswith('http://opendata.cern.ch/'):
                        field_instance_url = field_instance_url.replace('http://opendata.cern.ch/', '/')
                    use_with_link['url'] = field_instance_url
                if field_instance_descriptions:
                    use_with_link['description'] = field_instance_descriptions[0]
                if use_with_link:
                    if 'links' in use_with:
                        use_with['links'].append(use_with_link)
                    else:
                        use_with['links'] = [use_with_link, ]
        jrec['use_with'] = use_with
    # usage / 581
    usage_description = record_get_field_value(rec, tag="581", code="a")
code="a") if usage_description: if int(recid) == 560: usage_description = usage_description.replace('http://opendata.cern.ch/', '/') usage = {'description': usage_description} usage_links = [] for field_instance in record_get_field_instances(rec, tag="581"): field_instance_recids = field_get_subfield_values(field_instance, 'w') field_instance_urls = field_get_subfield_values(field_instance, 'u') field_instance_descriptions = field_get_subfield_values(field_instance, 'y') usage_link = {} if field_instance_recids: usage_link['recid'] = field_instance_recids[0] if field_instance_urls: field_instance_url = field_instance_urls[0] if field_instance_url.startswith('http://opendata.cern.ch/'): field_instance_url = field_instance_url.replace('http://opendata.cern.ch/', '/') if field_instance_url.startswith('http://atlas-opendata.web.cern.ch/atlas-opendata/'): field_instance_url = field_instance_url.replace('http://atlas-opendata.web.cern.ch/atlas-opendata/', 'http://opendata.atlas.cern/') if field_instance_url.startswith('https://github.com/katilp/pattuples2011'): field_instance_url = field_instance_url.replace('https://github.com/katilp/pattuples2011', 'https://github.com/cms-opendata-analyses/pattuples2011') usage_link['url'] = field_instance_url if field_instance_descriptions: usage_link['description'] = field_instance_descriptions[0] if usage_link: if usage.has_key('links'): usage['links'].append(usage_link) else: usage['links'] = [usage_link, ] jrec['usage'] = usage # note / 556 note_description = record_get_field_value(rec, tag="556", code="a") if note_description: note = {'description': note_description} note_links = [] for field_instance in record_get_field_instances(rec, tag="556"): field_instance_recids = field_get_subfield_values(field_instance, 'w') field_instance_urls = field_get_subfield_values(field_instance, 'u') field_instance_titles = field_get_subfield_values(field_instance, 'y') note_link = {} if field_instance_recids: note_link['recid'] = field_instance_recids[0] if field_instance_urls: field_instance_url = field_instance_urls[0] if field_instance_url.startswith('http://opendata.cern.ch/'): field_instance_url = field_instance_url.replace('http://opendata.cern.ch/', '/') note_link['url'] = field_instance_url if field_instance_urls: note_link['title'] = field_instance_titles[0] if note_link: if note.has_key('links'): note['links'].append(note_link) else: note['links'] = [note_link, ] jrec['note'] = note # note / 500 comment = record_get_field_value(rec, tag="500", code="a") if comment: if jrec.has_key('note'): raise StandardError('Sorry, cannot have both note/556 and note/500 fields.') else: jrec['note'] = {'description': comment} # generator / 593 generator = {} generator_name = record_get_field_value(rec, tag="593", code="a") if generator_name: generator_name = generator_name.replace('Generators: ', '') generator_names = generator_name.split() generator['names'] = generator_names generator_global_tag = record_get_field_value(rec, tag="593", code="b") if generator_global_tag: generator_global_tag = generator_global_tag.replace('Global tag: ', '') generator['global_tag'] = generator_global_tag if generator: jrec['generator'] = generator # accelerator / 693 accelerator = record_get_field_value(rec, tag="693", code="a") if accelerator: jrec['accelerator'] = accelerator # experiment / 693 experiment = record_get_field_value(rec, tag="693", code="e") if not experiment and (recid == '60' or recid == '352'): experiment = 'ATLAS' if not experiment and (recid == '450' or recid == '451'): 
    if not experiment and recid == '452':
        experiment = 'OPERA'
    if experiment:
        jrec['experiment'] = experiment
    # run_period / 964
    run_period = record_get_field_value(rec, tag="964", ind2="0", code="c")
    if run_period:
        if run_period == '2011RunA':
            run_period = 'Run2011A'
        jrec['run_period'] = run_period
    # generation / for simulated data
    # FIXME to be populated from DAS client? introduce structure inside methodology field
    # selection / for collision data
    # FIXME to be populated from DAS client? introduce structure inside methodology field
    # collision_information / 942
    collision_information_energy = record_get_field_value(rec, tag="942", code="e")
    collision_information_luminosity = record_get_field_value(rec, tag="942", code="l")
    collision_information_type = record_get_field_value(rec, tag="942", code="t")
    if collision_information_energy or collision_information_luminosity or collision_information_type:
        collision_information = {}
        if collision_information_energy:
            collision_information_energy = collision_information_energy.replace('Collision energy: ', '')
            collision_information_energy = collision_information_energy.replace('Collision energy:', '')
            collision_information['energy'] = collision_information_energy
        if collision_information_luminosity:
            collision_information['luminosity'] = collision_information_luminosity
        if collision_information_type:
            collision_information['type'] = collision_information_type
        else:
            if 'Data' in ' '.join(collections):
                if 'PbPb' in title:
                    collision_information['type'] = 'PbPb'
                else:
                    collision_information['type'] = 'pp'
        jrec['collision_information'] = collision_information
    # parent_dataset / 772
    parent_dataset_title = record_get_field_value(rec, tag="772", code="a")
    parent_dataset_doi = record_get_field_value(rec, tag="772", code="o")
    parent_dataset_recid = record_get_field_value(rec, tag="772", code="w")
    if parent_dataset_title or parent_dataset_doi or parent_dataset_recid:
        parent_dataset = {}
        parent_dataset['type'] = 'isChildOf'
        if parent_dataset_title:
            parent_dataset['title'] = parent_dataset_title
        if parent_dataset_doi:
            parent_dataset['doi'] = parent_dataset_doi
        if parent_dataset_recid:
            parent_dataset['recid'] = parent_dataset_recid
        if parent_dataset_title != title:
            if 'relations' in jrec:
                jrec['relations'].append(parent_dataset)
            else:
                jrec['relations'] = [parent_dataset, ]
    # code to produce files / 777
    code_to_produce_files_description = record_get_field_value(rec, tag="777", code="a")
    code_to_produce_files_recid = record_get_field_value(rec, tag="777", code="w")
    if code_to_produce_files_description or code_to_produce_files_recid:
        code_to_produce_files = {}
        code_to_produce_files['type'] = 'isProducedBy'
        if code_to_produce_files_description:
            code_to_produce_files['description'] = code_to_produce_files_description
        if code_to_produce_files_recid:
            code_to_produce_files['recid'] = code_to_produce_files_recid
        if 'relations' in jrec:
            jrec['relations'].append(code_to_produce_files)
        else:
            jrec['relations'] = [code_to_produce_files, ]
    # related dataset / 786
    for field_instance in record_get_field_instances(rec, tag="786"):
        related_dataset_descriptions = field_get_subfield_values(field_instance, "a")
        related_dataset_recids = field_get_subfield_values(field_instance, "w")
        if related_dataset_descriptions or related_dataset_recids:
            related_dataset = {}
            related_dataset['type'] = 'isPartOf'
            if related_dataset_descriptions:
                related_dataset['description'] = related_dataset_descriptions[0]
            if related_dataset_recids:
                related_dataset['recid'] = related_dataset_recids[0]
            if 'relations' in jrec:
                jrec['relations'].append(related_dataset)
            else:
                jrec['relations'] = [related_dataset, ]
    # related item / 787
    related_item_description = record_get_field_value(rec, tag="787", code="a")
    related_item_recids = record_get_field_values(rec, tag="787", code="w")
    related_item_note = record_get_field_value(rec, tag="787", code="n")
    related_item_url = record_get_field_value(rec, tag="787", code="u")
    related_item_label = record_get_field_value(rec, tag="787", code="y")
    if related_item_description and not related_item_recids:
        # workaround for a record
        if related_item_description == 'The default output of the code below is a ROOT file Mu00val.root':
            note = related_item_description
            if 'note' in jrec and 'description' in jrec['note']:
                jrec['note']['description'] += note
            else:
                jrec['note'] = {'description': note}
    if related_item_recids:
        for related_item_recid in related_item_recids:
            related_item = {}
            related_item['type'] = 'isRelatedTo'
            if related_item_description:
                related_item['description'] = related_item_description
            if related_item_recid:
                related_item['recid'] = related_item_recid
            if 'relations' in jrec:
                jrec['relations'].append(related_item)
            else:
                jrec['relations'] = [related_item, ]
    if related_item_url:
        link = {}
        if related_item_note:
            link['description'] = related_item_note
        link['url'] = related_item_url
        if related_item_label and related_item_label != related_item_url:
            link['comment'] = related_item_label
        if 'links' in jrec:
            jrec['links'].append(link)
        else:
            jrec['links'] = [link, ]
    # files / 8567
    files = []
    have_file_indexes = 'eos-file-indexes' in " ".join(record_get_field_values(rec, 'FFT', code='a'))
    for file_instance in record_get_field_instances(rec, tag="856", ind1="7"):
        if have_file_indexes:
            continue  # remove individual ROOT files when we have file indexes
        afile = {}
        file_type = field_get_subfield_values(file_instance, "2")[0]
        if file_type and file_type != 'xrootd':
            afile['type'] = file_type
        afile['uri'] = field_get_subfield_values(file_instance, "u")[0]
        afile['size'] = int(field_get_subfield_values(file_instance, "s")[0])
        afile['checksum'] = 'sha1:0000000000000000000000000000000000000000'  # FIXME detect real SHA1 of files
        files.append(afile)
    if files:
        if 'files' in jrec:
            jrec['files'].extend(files)
        else:
            jrec['files'] = files
    # files / FFT
    files = []
    for file_instance in record_get_field_instances(rec, tag="FFT"):
        # read FFT file properties
        file_name = field_get_subfield_values(file_instance, "a")[0]
        file_name = os.path.basename(file_name)
        try:
            file_name_alias = field_get_subfield_values(file_instance, "n")[0]
            file_name_alias = re.sub(r'^(.*?)\.', file_name_alias + '.', file_name)
        except IndexError:
            file_name_alias = ''
        file_descriptions = field_get_subfield_values(file_instance, "z")
        if file_descriptions:
            file_description = file_descriptions[0]
        else:
            file_description = ''
        # output location that will be populated below
        file_uri = ''
        file_type = ''
        # CMS-Primary-Datasets, CMS-Simulated-Datasets
        if 'CMS-Primary-Datasets' in collections or \
           'CMS-Simulated-Datasets' in collections:
            match = re.search(r'(.*?)_(.*?)_(.*)_(AOD|RAW)_(.*)_([0-9]+)_file_index\.txt$', file_name)
            if match:
                file_type = 'index'
                file_experiment, file_release, file_dataset, file_format, file_version, file_volume = match.groups()
                file_uri = 'root://eospublic.cern.ch//eos/opendata/' + \
                           file_experiment.lower() + '/' + \
                           file_release + '/' + \
                           file_dataset + '/' + \
                           file_format + '/' + \
                           file_version + '/' + \
                           'file-indexes/' + file_name
            else:
                match = re.search(r'(.*?)_(MonteCarlo[0-9]+)_(.*?)_(.*)_(AODSIM)_(.*)_([0-9]+)_file_index\.txt$', file_name)
                if match:
                    file_type = 'index'
                    file_experiment, file_montecarlo, file_release, file_dataset, file_format, file_version, file_volume = match.groups()
                    file_uri = 'root://eospublic.cern.ch//eos/opendata/' + \
                               file_experiment.lower() + '/' + \
                               file_montecarlo + '/' + \
                               file_release + '/' + \
                               file_dataset + '/' + \
                               file_format + '/' + \
                               file_version + '/' + \
                               'file-indexes/' + file_name
        # cms-eventdisplay-files
        if int(recid) >= 600 and int(recid) <= 613:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/Run2010B/' + os.path.splitext(file_name)[0] + '/IG/Apr21ReReco-v1/' + file_name
        # cms-eventdisplay-files-Run2011A
        if int(recid) >= 614 and int(recid) <= 632:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/Run2011A/' + os.path.splitext(file_name)[0].replace('_Run2011A', '') + '/IG/12Oct2013-v1/' + file_name.replace('_Run2011A', '')
        # CMS-Configuration-Files
        if 'CMS-Configuration-Files' in collections:
            if file_name.endswith('configFile'):
                file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/configuration-files/2011/' + file_name + '.py'
            else:
                file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/configuration-files/2011/' + file_name
        # LHCb-Derived-Datasets
        if 'LHCb-Derived-Datasets' in collections:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/lhcb/MasterclassDatasets/D0lifetime/2014/file-indexes/' + file_name
            file_type = 'index'
        # ALICE-Derived-Datasets
        if 'ALICE-Derived-Datasets' in collections:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/alice/' + file_name
            file_type = 'index'
        # ALICE-Reconstructed-Data
        if 'ALICE-Reconstructed-Data' in collections:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/alice/' + file_name
            file_type = 'index'
        # CMS-Validated-Runs
        if 'CMS-Validated-Runs' in collections:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/validated-runs/' + date_created + '/' + file_name
        # cms-derived-csv-Run2011A
        if int(recid) == 545:
            match = re.search(r'^(.*)_(.*)_Run2011A\.csv$', file_name)
            file_name_filename, file_name_dataset = match.groups()
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/Run2011A/' + file_name_dataset + '/CSV/12Oct2013-v1/' + file_name_filename + '.csv'
        # cms-tools-vm-image.xml
        if int(recid) >= 249 and int(recid) <= 250:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/environment/2010/' + file_name
        # cms-tools-vm-image-Run2011A.xml
        if int(recid) >= 251 and int(recid) <= 252:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/environment/2011/' + file_name
        # CMS-Open-Data-Instructions
        if 'CMS-Open-Data-Instructions' in collections:
            if int(recid) == 55:
                file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/documentation/outreach-exercise-2010/' + file_name
            if int(recid) == 72:
                file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/documentation/hst-programme-2016/' + file_name
        # CMS-Luminosity-Information
        if 'CMS-Luminosity-Information' in collections:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/luminosity/' + date_created + '/' + file_name
        # CMS-Trigger-Information
        if 'CMS-Trigger-Information' in collections:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/trigger-information/' + date_created + '/' + file_name
        # cms-derived-pattuples-ana
        if int(recid) == 201:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/Run2010B/Mu/PATtuples/file-indexes/' + file_name
            file_type = 'index'
        if int(recid) == 202:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/Run2010B/Electron/PATtuples/file-indexes/' + file_name
            file_type = 'index'
        # cms-derived-pattuples-ana-Run2011A
        if int(recid) == 230:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/Run2011A/DoubleMu/PATtuples/file-indexes/' + file_name
            file_type = 'index'
        if int(recid) == 231:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/Run2011A/DoubleElectron/PATtuples/file-indexes/' + file_name
            file_type = 'index'
        # cms-hamburg-files
        if int(recid) >= 203 and int(recid) <= 212:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/hep-tutorial-2012/' + file_name
        # CMS-Learning-Resources
        if 'CMS-Learning-Resources' in collections:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/hep-tutorial-2012/' + file_name
        # cms-tools-ana
        if int(recid) == 101:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/software/ayrodrig-OutreachExercise2010/' + file_name
        if int(recid) == 200:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/software/ayrodrig-pattuples2010/' + file_name
        # cms-tools-dimuon-spectrum-2010
        if int(recid) == 560:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/software/dimuon-spectrum-2010/' + file_name
        # cms-tools-dimuon-filter
        if int(recid) == 553:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/software/SUSYBSMAnalysis-RazorFilter/' + file_name
        if int(recid) == 552:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/software/dimuon-filter/' + file_name
        # cms-validation-code-Run2010B
        if int(recid) == 460:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/software/validation-2010-Mu/' + file_name
        if int(recid) == 461:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/software/validation-2010-Commissioning/' + file_name
        if int(recid) == 462:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/software/validation-2010-MinimumBias/' + file_name
        # cms-csv-files
        if int(recid) == 554:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/Run2010B/MultiJet/CSV/Apr21ReReco-v1/' + file_name_alias
        if int(recid) == 700:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/Run2010B/Mu/CSV/Apr21ReReco-v1/' + file_name_alias
        # cms-masterclass-files
        if int(recid) >= 300 and int(recid) <= 310:
            if file_name_alias.startswith('masterclass.'):
                file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/masterclass-2014/' + file_name
            else:
                file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/masterclass-2014/' + file_name_alias
        # atlas-derived-datasets
        if 'ATLAS-Derived-Datasets' in collections:
            if int(recid) == 3860:  # atlas-all-samples
                file_uri = 'root://eospublic.cern.ch//eos/opendata/atlas/OutreachDatasets/2016-07-29/file-indexes/' + file_name
                file_type = 'index'
            elif int(recid) == 390 or int(recid) == 391:
                file_uri = 'root://eospublic.cern.ch//eos/opendata/atlas/OutreachDatasets/2016-07-29/file-indexes/' + file_name
                file_type = 'index'
            else:
                match = re.search(r'ATLAS_MasterclassDatasets_(.*)_([0-9]+)_dataset_([0-9]+)_file_index\.txt$', file_name)
                file_xpath, file_year, file_number = match.groups()
                file_uri = 'root://eospublic.cern.ch//eos/opendata/atlas/MasterclassDatasets/' + file_xpath + '/' + file_year + '/' + file_number + '/file-indexes/' + file_name
                file_type = 'index'
        # ATLAS-Tools
        if 'ATLAS-Tools' in collections:
            if int(recid) == 352:
                file_uri = 'root://eospublic.cern.ch//eos/opendata/atlas/MasterclassDatasets/WPath/2015/Software/' + file_name
                file_type = 'index'
            else:
                file_type = 'index'
                if int(recid) == 3851:
                    file_name = file_name.replace('size_M_', 'size_S_')
                file_uri = 'root://eospublic.cern.ch//eos/opendata/atlas/OutreachDatasets/2016-07-29/file-indexes/' + file_name
        # ATLAS-Simulated-Datasets
        if 'ATLAS-Simulated-Datasets' in collections:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/atlas/OutreachDatasets/2016-07-29/file-indexes/' + file_name
            file_type = 'index'
        # ATLAS-Higgs-Challenge-2014
        if 'ATLAS-Higgs-Challenge-2014' in collections:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/atlas/higgs-challenge-2014/' + file_name
        # ALICE-Learning-Resources
        if 'ALICE-Learning-Resources' in collections:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/alice/documentation/' + file_name
        # cms-condition-data-Run2011A
        if int(recid) >= 1801 and int(recid) <= 1802:
            file_uri = 'root://eospublic.cern.ch//eos/opendata/cms/Run2011A/db/file-indexes/' + file_name
            file_type = 'index'
        # OPERA
        if 'OPERA' in ' '.join(collections):
            match = re.search(r'^(.*)\.(csv|zip)$', file_name)  # escape the dot so only a real extension matches
            if match:
                file_name_base, file_name_ext = match.groups()
                if file_name_ext == 'zip':
                    file_uri_base = 'root://eospublic.cern.ch//eos/opendata/opera/datasets/multiplicity'
                elif file_name_ext == 'csv':
                    file_uri_base = 'root://eospublic.cern.ch//eos/opendata/opera/events/multiplicity'
                else:
                    raise StandardError('Not expected.')
                file_uri = file_uri_base + '/' + file_name_base + '.' + file_name_ext
        # author lists
        if 'Author-Lists' in collections:
            match = re.search(r'^(.*)-author-list-(.*)\.pdf$', file_name)
            if match:
                file_name_exp, file_name_year = match.groups()
                file_uri = 'root://eospublic.cern.ch//eos/opendata/' + file_name_exp.lower() + '/documentation/' + file_name
        # data policies
        if 'Data-Policies' in collections:
            match = re.search(r'^(.*)-Data-Policy\.pdf$', file_name)
            if match:
                file_name_exp, = match.groups()
                file_uri = 'root://eospublic.cern.ch//eos/opendata/' + file_name_exp.lower() + '/documentation/' + file_name
        # ok, recognised enough; now generate files output
        if file_uri:
            afile = {}
            if file_type and file_type != 'xrootd':
                afile['type'] = file_type
            afile['uri'] = file_uri
            afile['size'] = fft_file_cache_info[file_name]['size']
            if file_description:
                afile['description'] = file_description
            afile['checksum'] = 'sha1:' + fft_file_cache_info[file_name]['checksum']
            files.append(afile)
    if files:
        if 'files' in jrec:
            jrec['files'].extend(files)
        else:
            jrec['files'] = files
    # keywords / 653
    keywords = []
    keyword_values = record_get_field_values(rec, tag="653", ind1="1", code="a")
    if int(recid) in [50, 53, 54, 61, 51, 52, 57, 58, 72]:
        keyword_values.append('education')
    if int(recid) in [53, 57, 72]:
        keyword_values.append('teaching')
    for keyword in keyword_values:
        if keyword != experiment:
            keyword = keyword.lower()
            if keyword == 'masterclasses':
                keyword = 'masterclass'
            keywords.append(keyword)
    if keywords:
        if 'keywords' in jrec:
            jrec['keywords'].extend(keywords)
        else:
            jrec['keywords'] = keywords
    # topic / 655
    topic = {}
    for field_instance in record_get_field_instances(rec, tag="655", ind2="7"):
        topic_categories = field_get_subfield_values(field_instance, "a")
        if topic_categories:
            topic['category'] = topic_categories[0]
        topic_sources = field_get_subfield_values(field_instance, "9")
        if topic_sources:
            topic['source'] = topic_sources[0]
    if topic:
        jrec['topic'] = topic
    # language / 041
    language = record_get_field_value(rec, tag="041", code="a")
    if language:
        if language == 'eng':
            language = 'English'
        jrec['language'] = language
    # links / 8564
    links = []
    for file_instance in record_get_field_instances(rec, tag="856", ind1="4"):
        link = {}
        link_hostname = field_get_subfield_values(file_instance, "a")
field_get_subfield_values(file_instance, "a") if link_hostname: link['hostname'] = link_hostname[0] link_compression_information = field_get_subfield_values(file_instance, "c") if link_compression_information: link['compression_information'] = link_compression_information[0] link_size = field_get_subfield_values(file_instance, "s") if link_size: link['size'] = link_size[0] link_url = field_get_subfield_values(file_instance, "u") if link_url: link_url = link_url[0] if link_url.startswith('http://opendata.cern.ch/'): link_url = link_url.replace('http://opendata.cern.ch/', '/') if 'CMS-Learning-Resources' in collections and link_url == 'http://mattbellis.github.io/Particle-Physics-Playground/': link_url = 'http://particle-physics-playground.github.io/' if 'CMS-Learning-Resources' in collections and link_url == 'http://ippog.web.cern.ch/resources/2012/cms-hep-tutorial': link_url = 'http://ippog.org/resources/2012/cms-hep-tutorial' link['url'] = link_url link_description = field_get_subfield_values(file_instance, "y") if link_description: link['description'] = link_description[0] link_comment = field_get_subfield_values(file_instance, "z") if link_comment: link['comment'] = link_comment[0] links.append(link) if links: if jrec.has_key('links'): jrec['links'].extend(links) else: jrec['links'] = links # type, subtype / new jrec['type'] = {} if 'Primary-Dataset' in ' '.join(collections): jrec['type']['primary'] = 'Dataset' jrec['type']['secondary'] = ['Collision', ] elif '-Detector-Datasets' in ' '.join(collections): jrec['type']['primary'] = 'Dataset' jrec['type']['secondary'] = ['Derived', ] elif '-Detector-Events' in ' '.join(collections): jrec['type']['primary'] = 'Dataset' jrec['type']['secondary'] = ['Derived', ] elif 'Derived-Dataset' in ' '.join(collections): jrec['type']['primary'] = 'Dataset' jrec['type']['secondary'] = ['Derived', ] elif 'Reconstructed-Data' in ' '.join(collections): jrec['type']['primary'] = 'Dataset' jrec['type']['secondary'] = ['Collision', ] elif 'Simulated-Dataset' in ' '.join(collections): jrec['type']['primary'] = 'Dataset' jrec['type']['secondary'] = ['Simulated', ] elif 'Tools' in ' '.join(collections): if 'virtual machine' in abstract_description.lower(): jrec['type']['primary'] = 'Environment' jrec['type']['secondary'] = ['VM'] elif int(recid) in [402, ]: jrec['type']['primary'] = 'Environment' jrec['type']['secondary'] = ['VM', ] else: jrec['type']['primary'] = 'Software' if int(recid) in [560, 234, 553, 101, 1200, 1201, 1202, 1203]: jrec['type']['secondary'] = ['Analysis', ] elif int(recid) in [220, 3850, 212, 221]: jrec['type']['secondary'] = ['Framework', ] elif int(recid) in [550, 352, 233, 200, 552, 1201, 551]: jrec['type']['secondary'] = ['Tool', ] else: jrec['type']['secondary'] = [] elif 'Validated-Runs' in ' '.join(collections): jrec['type']['primary'] = 'Environment' jrec['type']['secondary'] = ['Validation', ] elif 'Validation-Utilities' in ' '.join(collections): jrec['type']['primary'] = 'Software' jrec['type']['secondary'] = ['Validation', ] elif 'Learning-Resources' in ' '.join(collections): jrec['type']['primary'] = 'Documentation' if int(recid) in [40, 59, 60, 51, 50, 61, 53, 52, 41]: jrec['type']['secondary'] = ['Activities', ] else: jrec['type']['secondary'] = [] elif 'Configuration-Files' in ' '.join(collections): jrec['type']['primary'] = 'Supplementaries' jrec['type']['secondary'] = ['Configuration', ] elif 'Trigger-Information' in ' '.join(collections): jrec['type']['primary'] = 'Supplementaries' jrec['type']['secondary'] = ['Trigger', ] 
    elif 'Luminosity-Information' in collections_blob:
        jrec['type']['primary'] = 'Supplementaries'
        jrec['type']['secondary'] = ['Luminosity', ]
    elif 'Condition-Data' in collections_blob:
        jrec['type']['primary'] = 'Environment'
        jrec['type']['secondary'] = ['Condition', ]
    elif 'Open-Data-Instructions' in collections_blob:
        jrec['type']['primary'] = 'Documentation'
        if int(recid) in [57, 58]:
            jrec['type']['secondary'] = ['Help', ]
        elif int(recid) in [56, ]:
            jrec['type']['secondary'] = ['Report', ]
        elif int(recid) in [70, 71]:
            jrec['type']['secondary'] = ['Guide', ]
        elif int(recid) in [72, 55]:
            jrec['type']['secondary'] = ['Activities', ]
        else:
            jrec['type']['secondary'] = []
    elif 'Data-Policies' in collections_blob:
        jrec['type']['primary'] = 'Documentation'
        jrec['type']['secondary'] = ['Policy', ]
    elif 'Author-Lists' in collections_blob:
        jrec['type']['primary'] = 'Documentation'
        jrec['type']['secondary'] = ['Authors', ]
    elif 'ATLAS-Higgs-Challenge-2014' in collections_blob:
        if 'Dataset' in title:
            jrec['type']['primary'] = 'Dataset'
            jrec['type']['secondary'] = ['Derived', ]
        elif 'Documentation' in title or 'Video' in title:
            jrec['type']['primary'] = 'Documentation'
            jrec['type']['secondary'] = ['Activities', ]
        elif 'Software' in title:
            jrec['type']['primary'] = 'Software'
            jrec['type']['secondary'] = ['Analysis', ]
        else:
            jrec['type']['primary'] = 'FIXME'
            jrec['type']['secondary'] = []
    else:
        jrec['type']['primary'] = 'FIXME'
        jrec['type']['secondary'] = ['FIXME', ]
    return jrec
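
# A minimal conversion driver for convert_record(), assuming the Invenio 1.x
# bibrecord API, where create_records() parses MARCXML into a list of
# (record, status, errors) tuples; the file name arguments are placeholders.
import json

from invenio.bibrecord import create_records

def convert_marcxml_file(marcxml_path, json_path):
    "Convert all records in a MARCXML file and dump them as one JSON list."
    with open(marcxml_path) as fdesc:
        marcxml = fdesc.read()
    jrecords = []
    for rec, status, errors in create_records(marcxml):
        if not status:
            continue  # this record failed to parse
        jrec = convert_record(rec)
        if jrec:  # convert_record() returns {} for DELETED records
            jrecords.append(jrec)
    with open(json_path, 'w') as fdesc:
        json.dump(jrecords, fdesc, indent=2)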