def get_parent_blog(recid):
    """
    Return the recid of the blog that a post or comment belongs to.

    The record's collection is taken from its first 980__a value. A record
    whose collection is 'BLOG' is returned unchanged. For a 'COMMENT' the
    lookup is first redirected to the comment's parent post. The blog is
    then read from the first 760__w value of the record being looked up.

    @param recid: comment or post recid
    @type recid: int
    @return: parent blog recid, or None when there is no non-empty
        760__w value
    @rtype: int
    """
    collection = get_fieldvalues(recid, '980__a')[0]
    if collection == 'BLOG':
        return recid
    if collection == 'COMMENT':
        # Continue the 760__w lookup on the comment's parent post.
        recid = get_parent_post(recid)

    blog_links = get_fieldvalues(recid, '760__w')
    if blog_links and blog_links[0]:
        return int(blog_links[0])
    return None
def book_information_from_MARC(recid):
    """
    Retrieve book's information from MARC

    Each piece of information is the concatenation of all values found in
    the listed subfields, in the order the subfields are listed.

    @param recid: identify the record. Primary key of bibrec.
    @type recid: int

    @return tuple with title, year, author, isbn and editor.
    """
    def joined(separator, *marc_tags):
        """Join the values of all given tags with `separator`."""
        values = []
        for marc_tag in marc_tags:
            values.extend(get_fieldvalues(recid, marc_tag))
        return separator.join(values)

    book_title = joined(' ', "245__a", "245__b", "245__n", "245__p")
    book_year = joined(' ', "260__c")
    book_author = joined(' ', "100__a", "100__u")
    book_isbn = joined(' ', "020__a")
    book_editor = joined(' , ', "260__a", "260__b")

    return (book_title, book_year, book_author, book_isbn, book_editor)
def create_signature_blocks(record_id):
    """Build the signature blocks of every author of a record.

    The names are read from 100__a followed by 700__a; each one is passed
    to create_signature_block() and falsy results are dropped.

    :param int record_id: record-id
        Example:
            record_id = 1369415

    :return: list of strings representing phonetic blocks for author's and
        co-author's full names. Empty list, if no author's found
        Example:
            [u'ELj', u'MCLAGHLANm', u'VARBASTj']
    """
    names = list(get_fieldvalues(record_id, "100__a"))
    names.extend(get_fieldvalues(record_id, "700__a"))

    candidate_blocks = [create_signature_block(name) for name in names]
    return [block for block in candidate_blocks if block]
def book_information_from_MARC(recid):
    """
    Retrieve book's information from MARC

    - title: delegated to book_title_from_MARC()
    - year: all 260__c values concatenated without separator
    - author: all 100__a, 700__a and 721__a values joined with '; '
    - isbn: all 020__a values joined with ', '
    - editor: all 260__a then 260__b values joined with ', '

    @param recid: identify the record. Primary key of bibrec.
    @type recid: int

    @return tuple with title, year, author, isbn and editor.
    """
    book_title = book_title_from_MARC(recid)

    book_year = ''.join(get_fieldvalues(recid, "260__c"))

    # Collect the names first and join once, instead of appending a
    # separator after every value and trimming the last one afterwards.
    author_names = []
    for tag in ('100__a', '700__a', '721__a'):
        author_names.extend(get_fieldvalues(recid, tag))
    book_author = '; '.join(author_names)

    book_isbn = ', '.join(get_fieldvalues(recid, "020__a"))

    book_editor = ', '.join(get_fieldvalues(recid, "260__a") +
                            get_fieldvalues(recid, "260__b"))

    return (book_title, book_year, book_author, book_isbn, book_editor)
def build_issns_from_local_site():
    """
    Retrieves the ISSNs from the local database.
    Store the "journal name -> issn" relation.

    Normalize journal names a little bit:
        - strip whitespace chars (left and right)
        - all lower case
        - remove "[Online]" suffix

    Print the result as Python dict structure.
    """
    online_suffix = "[online]"
    built_issns = {}
    #built_issns = issns # Uncomment this to extend existing issns dict
                         # (e.g. in case of manual addition)
    for rec_id in perform_request_search(cc='Periodicals', of='id'):
        names = get_fieldvalues(rec_id, '210__%')
        issns = get_fieldvalues(rec_id, '022__a')
        if not issns:
            # Nothing to map the names of this record to.
            continue
        issn = issns[0]  # There should be only one ISSN
        for name in names:
            # Depending on how journal names are entered into the database,
            # you might want to do some processing before saving:
            normalized = name.lower().strip()
            if normalized.endswith(online_suffix):
                normalized = normalized[:-8].rstrip()
            built_issns[normalized] = issn

    pprint.PrettyPrinter(indent=4).pprint(built_issns)
def us_affiliations(req):
    """
    Write an HTML page listing the distinct US affiliations of the site.

    Affiliations are read from 100__u, 100__v, 700__u and 700__v of every
    record, kept when they mention the USA under one of several spellings,
    de-duplicated, and written one per line (separated by '<br />'),
    ordered by the affiliation name stripped of country names and of
    generic words such as "University" or "Department".

    @param req: request object the page is written to
    @return: empty string (the page is written to req directly)
    """
    from invenio.search_engine_utils import get_fieldvalues

    req.content_type = "text/html"
    print >> req, pageheaderonly("USA affiliations", req=req)

    # The same full-site search feeds all four lookups: run it once
    # instead of once per tag.
    all_recids = perform_request_search(p="*")
    candidates = []
    for tag in ('100__u', '100__v', '700__u', '700__v'):
        candidates.extend(get_fieldvalues(all_recids, tag, False))

    def _find_usa(x):
        return ("United States of America" in x or
                "United States" in x or
                "USA" in x or
                "U.S.A" in x)

    affiliations = set(filter(_find_usa, candidates))

    # Words removed from an affiliation to build its sort key only; the
    # affiliation itself is displayed unmodified.
    replaces = [('United States of America', ''),
                ("United States", ''),
                ("USA", ''),
                ("U.S.A", ''),
                ("University", ''),
                ("State", ''),
                ('Department of Physics and Astronomy', ""),
                ('Department of Physics', ""),
                ('Department', ''),
                (",", '')]
    keyed_affiliations = [(affiliation,
                           multi_replace(affiliation, replaces).strip())
                          for affiliation in affiliations]

    for affiliation, dummy_key in sorted(keyed_affiliations,
                                         key=lambda pair: pair[1]):
        req.write(affiliation + '<br />')

    req.write(pagefooteronly(req=req))
    return ""
def _record_in_files_p(recid, filenames):
    """Search XML files for given record.

    The record is identified by its recid and, when present, by the first
    value of its OAIID_TAG and SYSNO_TAG fields (-1 is passed on for an
    identifier the record does not have).

    @param recid: id of the record to look for
    @param filenames: paths of the XML files to inspect
    @return: True as soon as a parsed record matches according to
        _record_has_id_p(), False when no file contains a match. Files
        raising IOError are skipped.
    """
    # Get id tags of record in question
    rec_oaiid = rec_sysno = -1
    rec_oaiid_tag = get_fieldvalues(recid, OAIID_TAG)
    if rec_oaiid_tag:
        rec_oaiid = rec_oaiid_tag[0]
    rec_sysno_tag = get_fieldvalues(recid, SYSNO_TAG)
    if rec_sysno_tag:
        rec_sysno = rec_sysno_tag[0]

    # For each record in each file, compare ids and abort if match is found
    for filename in filenames:
        try:
            file_ = open(filename)
            try:
                records = create_records(file_.read(), 0, 0)
            finally:
                # Close on every path, including the early `return True`
                # below and a failure while parsing.
                file_.close()
            for parsed in records:
                record, all_good = parsed[:2]
                if record and all_good:
                    if _record_has_id_p(record, recid, rec_oaiid, rec_sysno):
                        return True
        except IOError:
            continue
    return False
def check_records(records, amend_case=False):
    """
    Amend author names (100__a and 700__a) using the HepNames collection.

    For each name with at least three words, the name is rewritten as
    "first-names family-name", searched as an author in HepNames, and
    compared with the 100__a/400__a values of the hits. When the searched
    name ends with the family name of one of those values, the field is
    rewritten as "family name, first names" with that family name.
    Results (including "no change") are memoised in NAME_CACHE.

    @param records: iterable of records offering iterfields() and
        amend_field()
    @param amend_case: when False (default), a proposed value that differs
        from the current one only by letter case is not applied; when True
        such case-only differences are applied as well
    """
    for record in records:
        for position, value in record.iterfields(['100__a', '700__a']):
            value = value.decode('utf8')
            new_value = NAME_CACHE.get(value)
            if new_value is None:
                search_value = value
                if ',' in value:
                    # "Family, First" -> "First Family"
                    splitted_values = search_value.split(',', 1)
                    search_value = u"%s %s" % (splitted_values[1].strip(),
                                               splitted_values[0].strip())
                original_family_name = value.split(',')[0].strip()
                search_value = RE_SPACES.sub(' ', search_value).strip()
                if len(search_value.split()) < 3:
                    # Simple name
                    continue
                i = perform_request_search(p=u'author:"%s"' % search_value,
                                           cc='HepNames')
                possible_values = get_fieldvalues(i, '100__a', sort=False) + \
                    get_fieldvalues(i, '400__a', sort=False)
                for correct_value in possible_values:
                    correct_value = correct_value.decode('utf8')
                    if search_value.lower().endswith(
                            " " + correct_value.lower().split(',')[0]):
                        family_name = correct_value.split(',')[0].strip()
                        if len(family_name) < len(original_family_name):
                            # Never shrink the family name already recorded.
                            continue
                        first_name = \
                            search_value[:-(len(family_name) + 1)].strip()
                        new_value = u'%s, %s' % (family_name, first_name)
                        NAME_CACHE[value] = new_value
                        break
                else:
                    # No candidate matched: remember that this name stays.
                    NAME_CACHE[value] = value
            if new_value:
                if amend_case:
                    # Case-only differences count as a change here, so
                    # only an exactly identical value is skipped.
                    if new_value == value:
                        continue
                elif new_value.lower() == value.lower():
                    continue
                record.amend_field(position, new_value.encode('utf8'))
def get_authors_from_record(
        recID, tags,
        use_bibauthorid=CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID):
    """Get all authors for a record

    With use_bibauthorid the result of get_personids_from_record() is
    returned. Otherwise the three kinds of author names (first author,
    additional authors, alternative author names) are read from the MARC
    tags given in `tags` and each name is looked up in its bibxxx table;
    the raw run_sql() results are collected in a set. No more names are
    added to the set once it holds more than CFG_SELFCITES_AUTHOR_LIMIT
    entries.
    """
    if use_bibauthorid:
        return get_personids_from_record(recID)

    # (bibxxx table suffix, key of the MARC tag in `tags`)
    sources = (('10x', 'first_author'),
               ('70x', 'additional_author'),
               ('72x', 'alternative_author_name'))

    authors = set()
    for table, tag_key in sources:
        for author in get_fieldvalues(recID, tags[tag_key]):
            if len(authors) > CFG_SELFCITES_AUTHOR_LIMIT:
                break
            authors.add(run_sql("SELECT id FROM bib%s WHERE value = %%s"
                                % table, (author, )))
    return authors
def get_video_thumbnail(recid):
    """
    Returns the URL and ALT text for a video thumbnail of a given record

    The first 8564_z value naming a thumbnail kind selects the 8564_u
    (URL) and 8564_y (ALT text) values found at the same position.

    @param recid: id of the record
    @return: tuple (url, description), or ("", "") when no 8564_z value
        names a thumbnail
    """
    thumbnail_kinds = ('SUGGESTIONTHUMB', 'BIGTHUMB', 'THUMB',
                       'SMALLTHUMB', 'POSTER')
    comments = get_fieldvalues(recid, '8564_z')
    descriptions = get_fieldvalues(recid, '8564_y')
    urls = get_fieldvalues(recid, '8564_u')

    thumbnail_positions = [pos for pos, comment in enumerate(comments)
                           if comment in thumbnail_kinds]
    if not thumbnail_positions:
        return ("", "")
    # NOTE(review): the three lists are indexed in parallel; this assumes
    # every 8564_ field carries z, y and u together -- confirm.
    first = thumbnail_positions[0]
    return (urls[first], descriptions[first])
def get_journal_info(recid, tags):
    """
    Build the formatted publication strings of a record.

    The first journal title (p), volume (v), year (y) and first page (c)
    are read from the MARC tags in tags['publication']. When every one of
    these codes used by tags['publication_format'] has a value, the list
    holds the formatted publication, then a variant with the alternative
    volume (if get_alt_volume() yields one), then one variant per CODENS
    knowledge-base mapping of the journal title.

    @param recid: id of the record
    @param tags: dict with the 'publication' tags and 'publication_format'
    @return: list of formatted publication strings, possibly empty
    """
    # TODO: handle records with multiple journals
    record_info = []
    publication_tags = tags['publication']

    # Format code -> first value found, e.g.
    # c->444 y->1999 p->"journal of foo", v->20
    tagsvalues = {}
    for code, tag_key in (('p', 'journal'), ('v', 'volume'), ('y', 'year')):
        found = get_fieldvalues(recid, publication_tags[tag_key])
        if found:
            tagsvalues[code] = found[0]

    found = get_fieldvalues(recid, publication_tags['pages'])
    if found:
        # if the page numbers have "x-y" take just x
        first_page = found[0]
        dash_at = first_page.find("-")
        if dash_at > 0:
            first_page = first_page[:dash_at]
        tagsvalues["c"] = first_page

    # check if we have the required data
    missing = [code for code in tags['publication_format']
               if code in ('p', 'v', 'y', 'c') and code not in tagsvalues]
    if missing:
        return record_info

    def add_publication(values):
        """Format `values` and append the result to the output list."""
        record_info.append(format_journal(tags['publication_format'], values))

    add_publication(tagsvalues)

    alt_volume = get_alt_volume(tagsvalues['v'])
    if alt_volume:
        with_alt_volume = tagsvalues.copy()
        with_alt_volume['v'] = alt_volume
        add_publication(with_alt_volume)

    # Add codens
    for coden in get_kb_mappings('CODENS', value=tagsvalues['p']):
        with_coden = tagsvalues.copy()
        with_coden['p'] = coden['key']
        add_publication(with_coden)

    return record_info
def get_authors_from_record(recID, tags):
    """Get all authors for a record

    Authors come in three kinds (first author, additional authors and
    alternative author names), each stored under its own MARC tag, so the
    values of all three tags are merged.

    Returns the set of all distinct values.
    """
    authors = set()
    for tag_key in ('first_author',
                    'additional_author',
                    'alternative_author_name'):
        authors.update(get_fieldvalues(recID, tags[tag_key]))
    return authors
def format_element(bfo):
    """
    Returns all the links used as references in a post

    Every 856_0 field is rendered as a link (subfield u is the URL, y the
    link text defaulting to the URL, z the optional title attribute). When
    a search for the URL in 520__u finds a record, a pointer to the first
    matching record in the archive is added below the link.

    @param bfo: BibFormat object of the record being formatted
    @return: HTML snippet, or an empty string when the record has no
        856_0 field
    """
    links = bfo.fields('856_0')
    if not links:
        return ""

    current_language = bfo.lang
    # Collect the fragments and join once at the end.
    pieces = ['<h4>%s:</h4>' % cfg_messages["in_issue"][current_language]]
    for link in links:
        link_url = link.get('u')
        link_data = link.get('y', link_url)
        link_title = link.get('z', '')
        if link_title:
            title_attribute = ' title="%s"' % link_title
        else:
            title_attribute = ''
        pieces.append("""<div class="litem"><a href="%s"%s>%s</a></div>"""
                      % (link_url, title_attribute, link_data))

        # differentiate between links to sources inside
        # the archive and sources outside
        recid_in_archive = perform_request_search(p=link_url, f='520__u')
        if recid_in_archive:
            pieces.append("""<div style="padding-left:20px;"><h4>This content is also available in the archive: </h4>""")
            try:
                title = get_fieldvalues(recid_in_archive[0], "245__a")[0]
            except Exception:
                # Best effort: a missing title or a failed lookup must not
                # break formatting. Unlike a bare `except`, this does not
                # swallow interpreter-exit requests.
                title = "Untitled"
            pieces.append("""<span class="moreinfo"><a href="%s/record/%s">%s</a></span></div></br>"""
                          % (CFG_SITE_URL, recid_in_archive[0], title))

    return ''.join(pieces)
def tokenize_for_phrases(self, recID):
    """Get the country names and country codes of the institutions
       affiliated with the authors of the publication

    @param recID: id of the publication record
    @return: list of distinct country tokens (order not significant)
    """

    # Get the name of the institution affiliated
    institution_names = []
    for tag in self.institution_tags:
        institution_names += get_fieldvalues(recID, tag)

    # Get the hitset of all the institutes
    institution_collection_hitset = intbitset([])
    for collection in CFG_WEBSEARCH_INSTITUTION_COLLECTIONS:
        institution_collection_hitset += get_collection_reclist(collection)

    # Search for the institution name and get a list of institution ids
    institution_ids = intbitset([])
    for name in institution_names:
        if not name.strip():
            # A blank name identifies no institution; do not search
            # for it.
            continue
        result_hitset = search_pattern(
            p=name,
            f=self.institution_name_field
        )
        institution_hitset = result_hitset & institution_collection_hitset
        institution_ids += list(institution_hitset)

    # Get the country tokens
    tokens = []
    for instID in institution_ids:
        tokens += self._tokenize_from_country_name_tag(instID)
        tokens += self._tokenize_from_country_code_tag(instID)

    # Remove duplicates
    tokens = list(set(tokens))

    return tokens
def get_index_strings_by_control_no(control_no):
    """Collect the index-relevant strings of the authority record(s)
    referenced by `control_no`.

    The subfields to read are those configured in
    CFG_BIBAUTHORITY_AUTHORITY_SUBFIELDS_TO_INDEX for the authority type
    of the control number; an unconfigured type contributes nothing.

    :param control_no: a (INVENIO) MARC internal control_no to an
        authority record
    :type control_no: string (e.g. 'author:(ABC)1234')

    :return: list of index-relevant strings from the referenced authority
        record
    """
    from invenio.bibindex_engine import list_union

    index_strings = []

    # Several authority records may share one control number.
    for authority_recid in get_low_level_recIDs_from_control_no(control_no):
        subfields_to_index = CFG_BIBAUTHORITY_AUTHORITY_SUBFIELDS_TO_INDEX.get(
            get_type_from_control_no(control_no), [])
        for subfield_tag in subfields_to_index:
            index_strings = list_union(
                get_fieldvalues(authority_recid, subfield_tag),
                index_strings)

    return index_strings
def task_run_core():
    """
    run daemon

    For every expired loan, pick the recall template matching the number
    of overdue letters already sent, mark the loan as expired and send
    the letter to the borrower.

    @return: 1
    """
    for (borrower_id, loan_id, recid) in get_expired_loan():
        (number_of_letters, date_letters) = get_overdue_letters_info(loan_id)

        if number_of_letters == 0:
            template_key = 'RECALL1'
        elif number_of_letters == 1 and send_second_recall(date_letters):
            template_key = 'RECALL2'
        elif number_of_letters == 2 and send_third_recall(date_letters):
            template_key = 'RECALL3'
        else:
            # Every remaining case also gets the third recall.
            template_key = 'RECALL3'
        content = generate_email_body(
            CFG_BIBCIRCULATION_TEMPLATES[template_key], loan_id)

        subject = "LOAN RECALL: " + ''.join(get_fieldvalues(recid, "245__a"))

        update_expired_loan(loan_id)
        send_overdue_letter(borrower_id, subject, content)

    return 1
def tokenize_for_phrases(self, recID):
    """Return the country names and country codes of the institutions
       the authors of the publication are affiliated with.

    Institution names are read from self.institution_tags, resolved to
    records of CFG_WEBSEARCH_INSTITUTION_COLLECTIONS by searching
    self.institution_name_field, and each institution found contributes
    its country-name and country-code tokens. Blank names are skipped.
    """
    # Names of the affiliated institutions
    names = []
    for marc_tag in self.institution_tags:
        names.extend(get_fieldvalues(recID, marc_tag))

    # All records belonging to the institution collections
    all_institutions = intbitset([])
    for collection_name in CFG_WEBSEARCH_INSTITUTION_COLLECTIONS:
        all_institutions += get_collection_reclist(collection_name)

    # Records matching one of the names that are institutions
    institution_ids = intbitset([])
    for name in names:
        if not name.strip():
            continue
        matches = search_pattern(p=name, f=self.institution_name_field)
        institution_ids += list(matches & all_institutions)

    # Country tokens of each institution
    tokens = []
    for institution_id in institution_ids:
        tokens += self._tokenize_from_country_name_tag(institution_id)
        tokens += self._tokenize_from_country_code_tag(institution_id)

    # Remove duplicates
    return list(set(tokens))
def main(): to_update = [] to_update_recids = [] recids = perform_request_search(p="970__a:'SPIRES'") for done, recid in enumerate(recids): if done % 50 == 0: print 'done %s of %s' % (done + 1, len(recids)) existing_fields = set(get_fieldvalues(recid, '980__a')) if 'HEP' in existing_fields: continue xml = create_our_record(recid) to_update.append(xml) to_update_recids.append(recid) if len(to_update) == 1000 or done + 1 == len(recids) and len(to_update) > 0: task_id = submit_task(to_update) print 'submitted task id %s' % task_id wait_for_task(task_id) task_id = submit_bibindex_task(to_update_recids) print 'submitted task id %s' % task_id wait_for_task(task_id) to_update = [] to_update_recids = []
def create_xml(recid):
    """
    Searches for duplicate instances of 773 and keeps the good one.

    Only records with exactly two identical 773__p values are handled.
    For those, the subfields of all 773__ instances are merged into one
    field: duplicated (code, value) pairs are kept once, the value
    'To appear in the proceedings of' is dropped, and of all 'c'
    subfields only the longest value is kept (appended last).

    @param recid: id of the record to check
    @return: print_rec() output of a record holding the 001 controlfield
        and the merged 773__ field, or None when the record does not
        have two identical 773__p values
    """
    tag = '773__'
    tag_value = tag + 'p'
    journal = get_fieldvalues(recid, tag_value)
    if len(journal) == 2 and journal[0] == journal[1]:
        record = get_record(recid)
        correct_record = {}
        record_add_field(correct_record, '001', \
            controlfield_value=str(recid))
        # tag[0:3] is the field number, tag[3] and tag[4] the indicators
        field_instances = record_get_field_instances(record, \
            tag[0:3], tag[3], tag[4])
        correct_subfields = []
        # Longest 'c' value seen so far; False until one is found
        c_value = False
        for field_instance in field_instances:
            # field_instance[0] is iterated as (code, value) pairs
            for code, value in field_instance[0]:
                if value == 'To appear in the proceedings of':
                    # This value is never copied to the merged field
                    pass
                elif (code, value) not in correct_subfields:
                    if code == 'c':
                        # 'c' is not appended here: only the longest value
                        # survives and it is added after the loop
                        if c_value:
                            if len(value) > len(c_value):
                                c_value = value
                        else:
                            c_value = value
                    else:
                        correct_subfields.append((code, value))
        if c_value:
            correct_subfields.append(('c', c_value))
        record_add_field(correct_record, tag[0:3], tag[3], tag[4], \
            subfields=correct_subfields)
        return print_rec(correct_record)
    return None
def get_record_year(recid):
    """
    Return the values of the first tag of DATE_TAGS set on the record.

    @param recid: id of the record
    @return: the non-empty result of get_fieldvalues() for the first tag
        that has values; otherwise the (empty) result for the last tag,
        or an empty list when DATE_TAGS is empty
    """
    date_values = []
    for date_tag in DATE_TAGS:
        date_values = get_fieldvalues(recid, date_tag)
        if date_values:
            return date_values
    return date_values
def setUp(self, recid=RECID, arxiv_id=ARXIV_ID):
    """
    Upload a minimal arXiv record and replace the OAI harvesting entry
    points with local stand-ins.

    The original oai_harvest_daemon.oai_harvest_get and
    oai_harvest_dblayer.get_oai_src are saved on self (as
    self.oai_harvest_get and self.get_oai_src) before being replaced.

    @param recid: id of the record to upload (written to field 001)
    @param arxiv_id: arXiv identifier stored in 037__a as "arXiv:<id>"
    """
    self.recid = recid
    self.arxiv_id = arxiv_id
    # Interpolated into ARXIV_OAI_RESPONSE by the mocked harvester below
    self.arxiv_version = 1
    self.bibupload_xml = """<record> <controlfield tag="001">%s</controlfield> <datafield tag="037" ind1=" " ind2=" "> <subfield code="a">arXiv:%s</subfield> <subfield code="9">arXiv</subfield> <subfield code="c">hep-ph</subfield> </datafield> </record>""" % (recid, arxiv_id)

    bibtask.setup_loggers()
    bibtask.task_set_task_param('verbose', 0)
    recs = bibupload.xml_marc_to_records(self.bibupload_xml)
    status, dummy, err = bibupload.bibupload(recs[0], opt_mode='correct')
    # The fixture is only usable if the upload succeeded and left exactly
    # one 037__a value on the record
    assert status == 0, err.strip()
    assert len(get_fieldvalues(recid, '037__a')) == 1

    def mocked_oai_harvest_get(prefix, baseurl, harvestpath,
                               verb, identifier):
        # Ignores all arguments: writes ARXIV_OAI_RESPONSE (filled in with
        # the current self.arxiv_version) to a fresh temporary file and
        # returns its path in a list
        temp_fd, temp_path = mkstemp()
        os.write(temp_fd, ARXIV_OAI_RESPONSE % self.arxiv_version)
        os.close(temp_fd)
        return [temp_path]

    # Keep the real function on self before patching it
    self.oai_harvest_get = oai_harvest_daemon.oai_harvest_get
    oai_harvest_daemon.oai_harvest_get = mocked_oai_harvest_get

    def mocked_get_oai_src(params={}):
        # `params` is never read or modified; a single source with an
        # empty baseurl is always returned
        return [{'baseurl': ''}]

    # Keep the real function on self before patching it
    self.get_oai_src = oai_harvest_dblayer.get_oai_src
    oai_harvest_dblayer.get_oai_src = mocked_get_oai_src
def task_run_core():
    """
    run daemon

    Sends a recall letter for each expired loan. The letter template
    depends on how many overdue letters were already sent for the loan;
    the loan is updated before the letter goes out.

    @return: 1
    """
    def recall_template(number_of_letters, date_letters):
        """Return the template of the recall to send next."""
        if number_of_letters == 0:
            return CFG_BIBCIRCULATION_TEMPLATES['RECALL1']
        if number_of_letters == 1 and send_second_recall(date_letters):
            return CFG_BIBCIRCULATION_TEMPLATES['RECALL2']
        if number_of_letters == 2 and send_third_recall(date_letters):
            return CFG_BIBCIRCULATION_TEMPLATES['RECALL3']
        # Fallback for all other cases: third recall as well.
        return CFG_BIBCIRCULATION_TEMPLATES['RECALL3']

    expired_loans = get_expired_loan()

    for (borrower_id, loan_id, recid) in expired_loans:
        (letters_sent, letter_dates) = get_overdue_letters_info(loan_id)
        content = generate_email_body(
            recall_template(letters_sent, letter_dates), loan_id)

        title = ''.join(get_fieldvalues(recid, "245__a"))
        subject = "LOAN RECALL: " + title

        update_expired_loan(loan_id)
        send_overdue_letter(borrower_id, subject, content)

    return 1
def book_title_from_MARC(recid):
    """
    Retrieve book's title from MARC

    The title is made of all values of 245__a, 245__b, 245__n and 245__p
    (in that order) separated by single spaces.

    @param recid: identify the record. Primary key of bibrec.
    @type recid: int

    @return book's title
    """
    title_parts = []
    for subfield_code in "abnp":
        title_parts.extend(get_fieldvalues(recid, "245__" + subfield_code))
    return ' '.join(title_parts)
def format_element(bfo, newline=False, show_doi=False):
    """
    Prints link to proceedings if the proceedings exist.
    If not, nothing is returned.

    The proceedings are the records found for the conference number
    (111__g) in 773__w within the proceedings collection. A single hit is
    linked as "Proceedings"; several hits are listed as numbered links.

    @param newline: if True, add <br /> at the end
    @param show_doi: if True, show DOI of the proceeding in brackets
    """
    cnum = str(bfo.field('111__g'))
    if not cnum:
        # Something is wrong, return empty string
        return ""

    search_result = perform_request_search(
        p="773__w:" + cnum + " and 980__a:proceedings")
    if not search_result:
        return ""

    if len(search_result) == 1:
        # only one proceeding
        out = '<a href="/record/' + str(search_result[0]) + \
              '">Proceedings</a>'
    else:
        # multiple proceedings
        numbered_links = []
        for index, recID in enumerate(search_result):
            # check for the DOI and put it in brackets in the output
            doi = get_fieldvalues(recID, '0247_a')
            if show_doi and doi:
                link = '<a href="/record/%(ID)s">#%(number)s</a> (DOI: <a href="https://doi.org/%(doi)s">%(doi)s</a>)' % \
                    {'ID': recID, 'number': index + 1, 'doi': doi[0]}
            else:
                link = '<a href="/record/%(ID)s">#%(number)s</a>' % \
                    {'ID': recID, 'number': index + 1}
            numbered_links.append(link)
        out = 'Proceedings: ' + ', '.join(numbered_links)

    if newline:
        out += '<br/>'
    return out
def update_references(recid, overwrite=True):
    """Update references for a record

    First, we extract references from a record.
    Then, we are not updating the record directly but adding a bibupload
    task in -c mode which takes care of updating the record.

    Parameters:
    * recid: the id of the record
    * overwrite: when False, refuse to touch a record that already has a
      999 field

    Raises RecordHasReferences when overwrite is disabled and the record
    has a 999 field, or (in any mode) when the record has a 999C59 value.
    """
    if not overwrite:
        # Check for references in record
        record = get_record(recid)
        if record and record_has_field(record, '999'):
            raise RecordHasReferences('Record has references and overwrite '
                                      'mode is disabled: %s' % recid)

    if get_fieldvalues(recid, '999C59'):
        raise RecordHasReferences('Record has been curated: %s' % recid)

    # Parse references
    references_xml = extract_references_from_record_xml(recid)

    # Save new record to file
    (temp_fd, temp_path) = mkstemp(prefix=CFG_REFEXTRACT_FILENAME,
                                   dir=CFG_TMPSHAREDDIR)
    temp_file = os.fdopen(temp_fd, 'w')
    try:
        temp_file.write(references_xml.encode('utf-8'))
    finally:
        # Do not leak the descriptor when encoding or writing fails.
        temp_file.close()

    # Update record
    task_low_level_submission('bibupload', 'refextract', '-P', '5',
                              '-c', temp_path)
def format_element(bfo, newline=False, show_doi=False):
    """
    Prints link to proceedings if the proceedings exist.
    If not, nothing is returned.

    The proceedings are the records found for the conference number
    (111__g) in 773__w within the proceedings collection. A single hit is
    linked as "Proceedings"; several hits are listed as numbered links.

    @param newline: if True, add <br /> at the end
    @param show_doi: if True, show DOI of the proceeding in brackets
    """
    cnum = str(bfo.field('111__g'))
    out = ""
    if not cnum:
        # Something is wrong, return empty string
        return out
    search_result = search_pattern(
        p="773__w:" + cnum + " and 980__a:proceedings")
    if search_result:
        if len(search_result) > 1:
            # multiple proceedings
            proceedings = []
            for i, recID in enumerate(search_result):
                # check for the DOI and put it in brackets in the output
                doi = get_fieldvalues(recID, '0247_a')
                if show_doi and doi:
                    # DOIs are linked through the https://doi.org/
                    # resolver rather than the legacy http://dx.doi.org/.
                    proceedings.append('<a href="/record/%(ID)s">#%(number)s</a> (DOI: <a href="https://doi.org/%(doi)s">%(doi)s</a>)'
                                       % {'ID': recID, 'number': i + 1,
                                          'doi': doi[0]})
                else:
                    proceedings.append('<a href="/record/%(ID)s">#%(number)s</a>'
                                       % {'ID': recID, 'number': i + 1})
            out = 'Proceedings: '
            out += ', '.join(proceedings)
        elif len(search_result) == 1:
            # only one proceeding
            out += '<a href="/record/' + str(search_result[0]) + \
                   '">Proceedings</a>'
        if newline:
            out += '<br/>'
    return out
def update_references(recid, overwrite=True):
    """Update references for a record

    First, we extract references from a record.
    Then, we are not updating the record directly but adding a bibupload
    task in -c mode which takes care of updating the record.

    Parameters:
    * recid: the id of the record
    * overwrite: when False, a record that already has a 999 field is
      left alone

    Raises RecordHasReferences when overwrite is disabled and the record
    has a 999 field, or (in any mode) when the record has a 999C59 value.
    """
    if not overwrite:
        # Check for references in record
        record = get_record(recid)
        if record and record_has_field(record, "999"):
            raise RecordHasReferences("Record has references and overwrite "
                                      "mode is disabled: %s" % recid)

    if get_fieldvalues(recid, "999C59"):
        raise RecordHasReferences("Record has been curated: %s" % recid)

    # Parse references
    references_xml = extract_references_from_record_xml(recid)

    # Save new record to file
    (temp_fd, temp_path) = mkstemp(prefix=CFG_REFEXTRACT_FILENAME,
                                   dir=CFG_TMPSHAREDDIR)
    temp_file = os.fdopen(temp_fd, "w")
    try:
        temp_file.write(references_xml.encode("utf-8"))
    finally:
        # Release the descriptor even if encoding or writing fails.
        temp_file.close()

    # Update record
    task_low_level_submission("bibupload", "refextract", "-P", "5",
                              "-c", temp_path)
def get_index_strings_by_control_no(control_no):
    """extracts the index-relevant strings from the authority record
    referenced by the 'control_no' parameter and returns it as a list of
    strings

    The subfields read are the ones configured in
    CFG_BIBAUTHORITY_AUTHORITY_SUBFIELDS_TO_INDEX for the authority type
    of the control number.

    @param control_no: a (INVENIO) MARC internal control_no to an
        authority record
    @type control_no: string (e.g. 'author:(ABC)1234')

    @return: list of index-relevant strings from the referenced authority
        record; empty when no record is found or when the authority type
        has no subfields configured
    """

    from invenio.bibindex_engine import list_union

    #return value
    string_list = []
    #1. get recID and authority type corresponding to control_no
    rec_IDs = get_low_level_recIDs_from_control_no(control_no)
    #2. concatenate and return all the info from the interesting fields
    #   for this record
    for rec_id in rec_IDs:  # in case we get multiple authority records
        # Default to no subfields: without it an unconfigured authority
        # type yields None and the loop below raises TypeError.
        for tag in CFG_BIBAUTHORITY_AUTHORITY_SUBFIELDS_TO_INDEX.get(
                get_type_from_control_no(control_no), []):
            new_strings = get_fieldvalues(rec_id, tag)
            string_list = list_union(new_strings, string_list)
    #return
    return string_list
def _record_in_files_p(recid, filenames):
    """Search XML files for given record.

    The record is identified by its recid and, when present, by the first
    value of its OAIID_TAG and SYSNO_TAG fields.

    Two strategies exist, chosen by CFG_BIBEDIT_QUEUE_CHECK_METHOD:
    'regexp' searches the raw file content for the identifiers (fast, but
    may not be precise); anything else parses the file with
    create_records() and compares identifiers with _record_has_id_p()
    (accurate, but may be slow).

    @param recid: id of the record to look for
    @param filenames: paths of the XML files to inspect
    @return: True as soon as a match is found, False otherwise. Files
        raising IOError are skipped.
    """
    # Get id tags of record in question
    rec_oaiid = rec_sysno = -1
    rec_oaiid_tag = get_fieldvalues(recid, OAIID_TAG)
    if rec_oaiid_tag:
        rec_oaiid = rec_oaiid_tag[0]
    rec_sysno_tag = get_fieldvalues(recid, SYSNO_TAG)
    if rec_sysno_tag:
        rec_sysno = rec_sysno_tag[0]

    use_regexp = CFG_BIBEDIT_QUEUE_CHECK_METHOD == "regexp"
    id_patterns = []
    if use_regexp:
        # The patterns do not depend on the file: compile them once.
        # Identifiers are escaped so that characters such as '.' in an
        # OAI id are matched literally.
        datafield_template = (
            r'<datafield tag="%s" ind1=" " ind2=" ">'
            r'(\s*<subfield code="a">\s*'
            r'|\s*<subfield code="9">\s*.*\s*</subfield>\s*<subfield code="a">\s*)'
            r'%s')
        id_patterns.append(re.compile(
            '<controlfield tag="001">%s</controlfield>'
            % re.escape(str(recid))))
        if rec_oaiid_tag:
            id_patterns.append(re.compile(
                datafield_template % (OAIID_TAG[0:3], re.escape(rec_oaiid))))
        if rec_sysno_tag:
            id_patterns.append(re.compile(
                datafield_template % (SYSNO_TAG[0:3], re.escape(rec_sysno))))

    # For each record in each file, compare ids and abort if match is found
    for filename in filenames:
        try:
            file_ = open(filename)
            try:
                file_content = file_.read()
            finally:
                # Closed on every path, including the early returns below.
                file_.close()

            if use_regexp:
                # check via regexp: this is fast, but may not be precise
                for id_pattern in id_patterns:
                    if id_pattern.search(file_content):
                        return True
            else:
                # by default, check via bibrecord: this is accurate, but
                # may be slow
                for parsed in create_records(file_content, 0, 0):
                    record, all_good = parsed[:2]
                    if record and all_good:
                        if _record_has_id_p(record, recid,
                                            rec_oaiid, rec_sysno):
                            return True
        except IOError:
            continue
    return False
def _record_in_files_p(recid, filenames):
    """Search XML files for given record.

    A file matches when it mentions the record's recid or, when the
    record has them, the first value of its OAIID_TAG or SYSNO_TAG field.

    With CFG_BIBEDIT_QUEUE_CHECK_METHOD set to 'regexp' the identifiers
    are searched in the raw file content (fast, but may not be precise).
    Otherwise each file is parsed with create_records() and every valid
    record is compared with _record_has_id_p() (accurate, but may be
    slow).

    @param recid: id of the record to look for
    @param filenames: paths of the XML files to inspect
    @return: True as soon as a match is found, False otherwise. Files
        raising IOError are skipped.
    """
    # Get id tags of record in question
    rec_oaiid = rec_sysno = -1
    rec_oaiid_tag = get_fieldvalues(recid, OAIID_TAG)
    if rec_oaiid_tag:
        rec_oaiid = rec_oaiid_tag[0]
    rec_sysno_tag = get_fieldvalues(recid, SYSNO_TAG)
    if rec_sysno_tag:
        rec_sysno = rec_sysno_tag[0]

    check_by_regexp = CFG_BIBEDIT_QUEUE_CHECK_METHOD == 'regexp'
    matchers = []
    if check_by_regexp:
        # Built once for all files. re.escape() keeps regexp
        # metacharacters contained in an identifier from being
        # interpreted as pattern syntax.
        subfield_a_of = (
            r'<datafield tag="%s" ind1=" " ind2=" ">'
            r'(\s*<subfield code="a">\s*'
            r'|\s*<subfield code="9">\s*.*\s*</subfield>\s*<subfield code="a">\s*)'
            r'%s')
        matchers.append(re.compile(
            '<controlfield tag="001">%s</controlfield>'
            % re.escape(str(recid))))
        if rec_oaiid_tag:
            matchers.append(re.compile(
                subfield_a_of % (OAIID_TAG[0:3], re.escape(rec_oaiid))))
        if rec_sysno_tag:
            matchers.append(re.compile(
                subfield_a_of % (SYSNO_TAG[0:3], re.escape(rec_sysno))))

    # For each record in each file, compare ids and abort if match is found
    for filename in filenames:
        try:
            file_ = open(filename)
            try:
                file_content = file_.read()
            finally:
                # The content is fully in memory: release the handle now
                # so that no return path below can leak it.
                file_.close()

            if check_by_regexp:
                for matcher in matchers:
                    if matcher.search(file_content):
                        return True
            else:
                records = create_records(file_content, 0, 0)
                for parsed in records:
                    record, all_good = parsed[:2]
                    if record and all_good:
                        if _record_has_id_p(record, recid,
                                            rec_oaiid, rec_sysno):
                            return True
        except IOError:
            continue
    return False
def get_authors_from_record(recID, tags,
                            use_bibauthorid=CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID):
    """Get all authors for a record

    With use_bibauthorid the result of get_personids_from_record() is
    returned. Otherwise the names stored under the three author MARC tags
    (first author, additional authors, alternative author names) are
    collected in that order and the hashes of the first 20 are returned
    as a set.
    """
    if use_bibauthorid:
        return get_personids_from_record(recID)

    author_names = list(chain(
        get_fieldvalues(recID, tags['first_author']),
        get_fieldvalues(recID, tags['additional_author']),
        get_fieldvalues(recID, tags['alternative_author_name'])))

    authors = set()
    for author_name in author_names[:20]:
        authors.add(hash(author_name))
    return authors
def test_bibauthority_get_dependent_records_for_control_no(self):
    """bibauthority - test get_dependent_records_for_control_no()"""
    # Record 118 must have, over all of its control numbers, at least
    # one dependent record.
    control_nos = get_fieldvalues(
        118, CFG_BIBAUTHORITY_RECORD_CONTROL_NUMBER_FIELD)
    dependent_counts = [len(get_dependent_records_for_control_no(control_no))
                        for control_no in control_nos]
    self.assertTrue(sum(dependent_counts))
def is_periodical(recid):
    """
    Tell whether one of the record's 690C_a values is exactly 'PERI'.

    @param recid: id of the record
    @return: True or False
    """
    return 'PERI' in get_fieldvalues(recid, "690C_a")
def get_authors_from_record(recID, tags,
                            use_bibauthorid=CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID):
    """Get all authors for a record

    With use_bibauthorid the result of get_personids_from_record() is
    returned. Otherwise the values of the three author MARC tags (first
    author, additional authors, alternative author names, in that order)
    are gathered and the set of the hashes of the first 21 values is
    returned.
    """
    if use_bibauthorid:
        return get_personids_from_record(recID)

    names = []
    for tag_key in ('first_author',
                    'additional_author',
                    'alternative_author_name'):
        names.extend(get_fieldvalues(recID, tags[tag_key]))
    return set([hash(name) for name in names[:21]])
def main(): bibcatalog_system = BibCatalogSystemRT() max_id = run_sql("SELECT max(id) FROM bibrec")[0][0] recids = xrange(1, max_id + 1) for done, recid in enumerate(recids): if recid < 1124295: continue if recid >= 1183878: break if get_fieldvalues(recid, '999C6a') \ and not get_fieldvalues(recid, '999C59'): print '* processing', recid create_ticket(recid, bibcatalog_system) if (done + 1) % 25 == 0: print 'done %s of %s' % (done + 1, len(recids))
def goto(type, document='', number=0, lang='en', modif=0):
    """Resolve a shortcut to the URL of the matching PDF file.

    @param type: 'SSR' (Staff Rules and Regulations), 'OPER-CIRC'
        (Operational Circulars) or 'ADMIN-CIRC' (Administrative Circulars)
    @param document: for circulars, the key of the wanted document, e.g.
        'en', 'fr', 'annex_en' (OPER-CIRC) or 'annex-en' (ADMIN-CIRC)
    @param number: for circulars, the circular number used in the
        report-number search
    @param lang: for 'SSR', passed to make_cern_ssr_docname()
    @param modif: for 'SSR', passed to make_cern_ssr_docname()
    @return: URL of the '.pdf' file; None for any other type
    """
    today = time.strftime('%Y-%m-%d')

    if type == 'SSR':
        ## We would like a CERN Staff Rules and Regulations
        validity = "925__a:1996-01-01->%s 925__b:%s->9999-99-99" % (today,
                                                                     today)
        recid = perform_request_search(cc='Staff Rules and Regulations',
                                       f=validity)[-1]
        reportnumber = get_fieldvalues(recid, '037__a')[0]
        edition = int(reportnumber[-2:])  ## e.g. CERN-STAFF-RULES-ED08
        bibrecdocs = BibRecDocs(recid)
        ssr_docname = make_cern_ssr_docname(lang, edition, modif)
        return bibrecdocs.get_bibdoc(ssr_docname).get_file('.pdf').get_url()

    # The two kinds of circulars only differ by collection, report-number
    # pattern and the separator used inside the document keys.
    if type == "OPER-CIRC":
        collection = "Operational Circulars"
        pattern = "reportnumber=\"CERN-OPER-CIRC-%s-*\""
        separator = '_'
    elif type == 'ADMIN-CIRC':
        collection = "Administrative Circulars"
        pattern = "reportnumber=\"CERN-ADMIN-CIRC-%s-*\""
        separator = '-'
    else:
        return None

    # Order matters: the first rule with a matching substring wins
    # (e.g. 'annexe' must be tried before the more general 'annex').
    rules = (
        (('implementation',), 'implementation' + separator + 'en'),
        (('application',), 'implementation' + separator + 'fr'),
        (('archiving',), 'archiving' + separator + 'en'),
        (('archivage',), 'archiving' + separator + 'fr'),
        (('annexe', 'annexes_fr'), 'annex' + separator + 'fr'),
        (('annexes_en', 'annex'), 'annex' + separator + 'en'),
        (('_en_', '_eng_', '_angl_'), 'en'),
        (('_fr_',), 'fr'),
    )

    recid = perform_request_search(cc=collection, p=pattern % number,
                                   sf="925__a")[-1]
    documents = {}
    bibrecdocs = BibRecDocs(recid)
    for docname in bibrecdocs.get_bibdoc_names():
        ldocname = docname.lower()
        for needles, key in rules:
            if any(needle in ldocname for needle in needles):
                _register_document(documents, docname, key)
                break
    wanted = documents[document]
    return bibrecdocs.get_bibdoc(wanted).get_file('.pdf').get_url()
def search_result_info(recid):
    """Return a short label for a record.

    @param recid: record identifier
    @return: the first 037__a value (report number) of the record, or
        '#' followed by the recid when the record has none
    """
    report_numbers = get_fieldvalues(recid, '037__a')
    if report_numbers:
        return report_numbers[0]
    return "#" + str(recid)
def get_authors_from_record(recID, tags):
    """Collect the authors of a record.

    The result of get_personids_from_bibrec() is used when it is not
    empty.  Otherwise the author names are read from the three MARC tags
    holding them (first author, additional authors, alternative names).

    @param recID: record identifier
    @param tags: mapping providing the 'first_author', 'additional_author'
        and 'alternative_author_name' tag names
    @return: the person ids, or the set of distinct author names
    """
    person_ids = get_personids_from_bibrec(recID)
    if person_ids:
        return person_ids

    first_authors = get_fieldvalues(recID, tags['first_author'])
    additional_authors = get_fieldvalues(recID, tags['additional_author'])
    alternative_names = get_fieldvalues(recID,
                                        tags['alternative_author_name'])
    return set(first_authors).union(additional_authors, alternative_names)
def search_result_info(recid):
    """Return a short label for a record.

    @param recid: record identifier
    @return: the first 037__a value (report number) of the record, or
        '#' followed by the recid when the record has none
    """
    numbers = get_fieldvalues(recid, '037__a')
    if not numbers:
        return "#" + str(recid)
    return numbers[0]
def get_video_duration(recid):
    """Return the duration of a video in human readable form.

    @param recid: record identifier
    @return: the first 950__d value converted with timecode_to_seconds()
        and formatted by human_readable_time(); an empty string when the
        record has no 950__d value
    """
    timecodes = get_fieldvalues(recid, '950__d')
    if not timecodes:
        return ""
    seconds = timecode_to_seconds(timecodes[0])
    return human_readable_time(seconds)
def extract_arxiv_ids_from_recid(recid):
    """Yield the arXiv ids found among a record's report numbers.

    Only 037__a values starting with 'arXiv' are considered; the id is the
    part following the first colon.

    @param recid: record identifier
    @raise InvalidReportNumber: when such a value contains no colon
    """
    for candidate in get_fieldvalues(recid, '037__a'):
        if candidate.startswith('arXiv'):
            # Extract arxiv id
            try:
                yield candidate.split(':')[1]
            except IndexError:
                raise InvalidReportNumber(candidate)
def check_record_for_refextract(recid):
    """Decide whether references may be extracted again for a record.

    @param recid: record identifier
    @return: True when the extraction can be submitted, False otherwise
    """
    if get_fieldvalues(recid, '999C6v'):
        # References extracted by refextract.
        # If they have been curated (999C59 present) we must not touch
        # them (to put in the HP and create ticket in the future);
        # otherwise we can safely extract from the new pdf.
        return not get_fieldvalues(recid, '999C59')

    # Without any reference in the record we can safely extract new ones.
    # Otherwise it is an old record, with either no curated references or
    # references curated by SLAC. We cannot distinguish, so we do nothing.
    return not get_fieldvalues(recid, '999C5_')
def get_control_nos_from_recID(recID):
    """
    Look up the control numbers stored on a record.

    @param recID: record ID
    @type recID: int

    @return: values of the CFG_BIBAUTHORITY_RECORD_CONTROL_NUMBER_FIELD
        field, fetched with repetitive_values=False
    """
    control_numbers = get_fieldvalues(
        recID,
        CFG_BIBAUTHORITY_RECORD_CONTROL_NUMBER_FIELD,
        repetitive_values=False)
    return control_numbers
def check_records(records, amend_case=False):
    """Normalise author names (100__a / 700__a) against HepNames.

    For every author value that is not yet in NAME_CACHE, the name is
    rewritten as "given family", searched in the HepNames collection and
    compared with the 100__a / 400__a values of the hits.  When a hit's
    family name is a suffix of the searched name, the value is rebuilt as
    "family, given".  The outcome (or the unchanged value when nothing
    matched) is stored in NAME_CACHE.

    @param records: iterable of records providing iterfields() and
        amend_field()
    @param amend_case: when False (default) a proposed value differing
        from the current one only by letter case is ignored; when True
        such a value is applied and only exact matches are skipped
    """
    for record in records:
        for position, value in record.iterfields(['100__a', '700__a']):
            value = value.decode('utf8')
            new_value = NAME_CACHE.get(value)
            if new_value is None:
                search_value = value
                if ',' in value:
                    # "Family, Given" -> "Given Family"
                    splitted_values = search_value.split(',', 1)
                    search_value = u"%s %s" % (splitted_values[1].strip(),
                                               splitted_values[0].strip())
                original_family_name = value.split(',')[0].strip()
                search_value = RE_SPACES.sub(' ', search_value).strip()
                if len(search_value.split()) < 3:
                    # Simple name
                    continue
                i = perform_request_search(p=u'author:"%s"' % search_value,
                                           cc='HepNames')
                possible_values = get_fieldvalues(i, '100__a', sort=False) \
                    + get_fieldvalues(i, '400__a', sort=False)
                for correct_value in possible_values:
                    correct_value = correct_value.decode('utf8')
                    if search_value.lower().endswith(
                            " " + correct_value.lower().split(',')[0]):
                        family_name = correct_value.split(',')[0].strip()
                        if len(family_name) < len(original_family_name):
                            # Never shorten the family name already there
                            continue
                        first_name = \
                            search_value[:-(len(family_name) + 1)].strip()
                        new_value = u'%s, %s' % (family_name, first_name)
                        NAME_CACHE[value] = new_value
                        break
                else:
                    # Nothing matched: remember it to avoid searching again
                    NAME_CACHE[value] = value
            if new_value:
                if amend_case:
                    # Case-only differences are wanted: skip exact matches
                    if new_value == value:
                        continue
                elif new_value.lower() == value.lower():
                    continue
                record.amend_field(position, new_value.encode('utf8'))
def bst_hal():
    """Attach HAL identifiers to the records that match a HAL document.

    Records returned by get_record_ids_to_export() and not already in
    get_hal_records() are matched against the DOI and arXiv maps returned
    by get_hal_maps(), using their 0247__a and 037__a values.  When exactly
    one HAL document matches, a 035 field (subfield a = HAL id,
    subfield 9 = 'HAL') is queued for upload.  Records matching several
    distinct HAL documents are reported on stderr and skipped.

    @return: True
    """
    doi_map, arxiv_map = get_hal_maps()
    matchable_records = get_record_ids_to_export()
    write_message("Total matchable records: %s" % len(matchable_records))
    hal_records = get_hal_records()
    write_message("Already matched records: %s" % len(hal_records))
    bibupload = ChunkedBibUpload(mode='a', notimechange=True, user='******')
    tot_records = matchable_records - hal_records
    write_message("Records to be checked: %s" % len(tot_records))
    for i, recid in enumerate(tot_records):
        if i % 1000 == 0:
            write_message("%s records done out of %s" %
                          (i, len(tot_records)))
            task_sleep_now_if_required()
        dois = get_fieldvalues(recid, tag='0247__a', sort=False)
        arxivs = get_fieldvalues(recid, tag='037__a', sort=False)
        matched_hal = [doi_map[doi] for doi in dois if doi in doi_map]
        matched_hal += [
            arxiv_map[arxiv] for arxiv in arxivs if arxiv in arxiv_map
        ]

        # Let's assert that we matched only one single hal document at most.
        # The same HAL entry can be reached through both a DOI and an arXiv
        # id, so distinct entries are counted (by identity) rather than the
        # raw number of hits.
        matched_hal_id = set(id(entry) for entry in matched_hal)
        if len(matched_hal_id) > 1:
            write_message(
                "WARNING: record %s matches more than 1 HAL record: %s" %
                (recid, matched_hal),
                stream=sys.stderr)
            continue
        elif not matched_hal:
            continue

        hal_id = matched_hal[0]['halId_s']
        rec = {}
        record_add_field(rec, '001', controlfield_value=str(recid))
        record_add_field(rec, '035', subfields=[('a', hal_id), ('9', 'HAL')])
        write_message("Record %s matched HAL record %s" % (recid, hal_id))
        bibupload.add(record_xml_output(rec))
    return True
def holdings(recid):
    """Render the holdings page of a record.

    The action is "proposal" when the first acquisition identifier of the
    record starts with 'AMZ' and db.has_copies() returns False for it;
    in every other case it is "borrowal".

    @param recid: record identifier
    @return: the rendered 'record_holdings.html' template
    """
    action = "borrowal"
    acquisition_src = get_fieldvalues(recid, AMZ_ACQUISITION_IDENTIFIER_TAG)
    if acquisition_src and acquisition_src[0].startswith('AMZ'):
        # Copies are only looked up for 'AMZ' acquisitions
        if db.has_copies(recid) == False:
            action = "proposal"

    holdings_information = perform_get_holdings_information(
        recid, request, action=action, ln=g.ln)
    return render_template('record_holdings.html',
                           holdings_information=holdings_information)
def get_item_info_for_search_result(recid):
    """
    Gather the MARC details shown next to an item in a search result.

    @param recid: identify the record. Primary key of bibrec.
    @type recid: int

    @return: tuple (author, editor, copies): author joins the 100__a and
        100__u values with a space, editor joins the 260__a, 260__b and
        260__c values with ' , ', copies joins the 964__a values with a
        space or, when that is empty, is db.get_number_copies(recid)
    """
    author_parts = get_fieldvalues(recid, "100__a")
    author_parts = author_parts + get_fieldvalues(recid, "100__u")

    editor_parts = get_fieldvalues(recid, "260__a")
    editor_parts = editor_parts + get_fieldvalues(recid, "260__b")
    editor_parts = editor_parts + get_fieldvalues(recid, "260__c")

    # Fall back on the database count when the record carries no 964__a
    copies = ' '.join(get_fieldvalues(recid, "964__a")) \
        or db.get_number_copies(recid)

    return (' '.join(author_parts), ' , '.join(editor_parts), copies)
def record_can_overwrite_refs(recid):
    """Decide whether the references of a record may be overwritten.

    @param recid: record identifier
    @return: True when new references can be submitted, False otherwise
    """
    if get_fieldvalues(recid, '999C6v'):
        # References extracted by refextract.
        # If one of the 999C59 values is 'curator' they have been curated
        # (to put in the HP and create ticket in the future); otherwise we
        # can safely extract from the new pdf.
        markers = get_fieldvalues(recid, '999C59', repetitive_values=False)
        curated = any(marker.lower().strip() == 'curator'
                      for marker in markers if marker.strip())
        return not curated

    # Without any reference in the record we can safely extract new ones.
    # Otherwise it is an old record, with either no curated references or
    # references curated by SLAC. We cannot distinguish, so we do nothing.
    return not get_fieldvalues(recid, '999C5_')