def test_parse_tag(self):
    """ bibformat - result of parsing tags"""
    # Each entry pairs a raw tag with the [tag, ind1, ind2, subfield]
    # list that parse_tag is expected to return for it.
    expectations = (
        ('245COc', ['245', 'C', 'O', 'c']),
        ('245C_c', ['245', 'C', '', 'c']),
        ('245__c', ['245', '', '', 'c']),
        ('245__$$c', ['245', '', '', 'c']),
        ('245__$c', ['245', '', '', 'c']),
        ('245 $c', ['245', '', '', 'c']),
        ('245 $$c', ['245', '', '', 'c']),
        ('245__.c', ['245', '', '', 'c']),
        ('245 .c', ['245', '', '', 'c']),
        ('245C_$c', ['245', 'C', '', 'c']),
        ('245CO$$c', ['245', 'C', 'O', 'c']),
        ('245CO.c', ['245', 'C', 'O', 'c']),
        ('245$c', ['245', '', '', 'c']),
        ('245.c', ['245', '', '', 'c']),
        ('245$$c', ['245', '', '', 'c']),
        ('245__%', ['245', '', '', '%']),
        ('245__$$%', ['245', '', '', '%']),
        ('245__$%', ['245', '', '', '%']),
        ('245 $%', ['245', '', '', '%']),
        ('245 $$%', ['245', '', '', '%']),
        ('245$%', ['245', '', '', '%']),
        ('245.%', ['245', '', '', '%']),
        ('245_O.%', ['245', '', 'O', '%']),
        ('245.%', ['245', '', '', '%']),
        ('245$$%', ['245', '', '', '%']),
        ('2%5$$a', ['2%5', '', '', 'a']),
        ('2%%%%a', ['2%%', '%', '%', 'a']),
        ('2%%__a', ['2%%', '', '', 'a']),
        ('2%%a', ['2%%', '', '', 'a']),
    )
    for raw_tag, expected in expectations:
        actual = bibformat_utils.parse_tag(raw_tag)
        self.assertEqual(actual, expected)
def format_element(bfo, tag, limit, instances_separator=" ",
                   subfields_separator=" ", extension=""):
    """
    Prints the given field of a record.
    If tag is in range [001, 010], this element assumes
    that it accesses a control field. Else it considers it
    accesses a data field.

    @param tag: the tag code of the field that is to be printed
    @param instances_separator: a separator between instances of field
    @param subfields_separator: a separator between subfields of an instance
    @param limit: the maximum number of values to display.
    @param extension: a text printed at the end if 'limit' has been exceeded
    """
    p_tag = parse_tag(tag)
    if not p_tag[0].isdigit():
        # Not a numeric tag: neither a control nor a data field
        return ''
    if int(p_tag[0]) in range(0, 11):
        # Control field
        return bfo.control_field(tag)

    # Data field: get the instances without subcode.
    # Unneeded subcodes are filtered out below.
    ind1 = p_tag[1] or '_'
    ind2 = p_tag[2] or '_'
    values = bfo.fields(p_tag[0] + ind1 + ind2)  # list of dicts

    # A limit that is not made of digits only means "no limit"
    max_values = None
    if limit.isdigit():
        max_values = int(limit)

    shown = 0                # number of values printed so far
    limit_exceeded = False   # True once a value had to be left out
    instances_out = []       # retain each instance output
    for instance in values:
        filtered_values = [value for (subcode, value) in instance.items()
                           if p_tag[3] in ('', '%', subcode)]
        if not filtered_values:
            continue
        if max_values is not None and \
               shown + len(filtered_values) > max_values:
            # At least one value does not fit any more: keep what still
            # fits and remember that something was dropped, so that
            # 'extension' is printed.  (The flag is needed because the
            # number of printed values can never itself exceed the limit.)
            limit_exceeded = True
            filtered_values = filtered_values[:max_values - shown]
            if filtered_values:  # do not append empty list!
                instances_out.append(
                    subfields_separator.join(filtered_values))
            break  # No need to go further
        instances_out.append(subfields_separator.join(filtered_values))
        shown += len(filtered_values)

    ext_out = ''
    if limit_exceeded:
        ext_out = extension
    return instances_separator.join(instances_out) + ext_out
def get_authors_tags(config=CITATION_CONFIG):
    """
    Read the author-related tags from the config.

    Returns a dict mapping each option name ('first_author',
    'additional_author', 'alternative_author_name', 'collaboration_name')
    to its value after parse_tag() and tagify().
    """
    # The section holding the options is named by [rank_method] function
    section = config.get("rank_method", "function")
    option_names = (
        'first_author',
        'additional_author',
        'alternative_author_name',
        'collaboration_name',
    )
    return dict((name, tagify(parse_tag(config.get(section, name))))
                for name in option_names)
def get_self_citations(new_record_list, citationdic, initial_selfcitdict,
                       config):
    """Check which items have been cited by one of the authors of the
    citing item: go through id's in new_record_list, use citationdic to get
    citations, update "selfcites". Selfcites is originally
    initial_selfcitdict. Return selfcites.

    @param new_record_list: record ids to examine
    @param citationdic: dict of recid -> list of records citing it
    @param initial_selfcitdict: dict of recid -> list of self-citing
        records; it is updated in place and returned
    @param config: config object with a [rank_method] "function" option
        naming the section that holds the author tag options
    @return: the updated dict (initial_selfcitdict untouched if an author
        tag option is missing from the config)
    """
    # get the tags for main author, coauthors, ext authors from config
    raw_tags = {}
    for t in ('first_author', 'additional_author', 'alternative_author_name'):
        try:
            raw_tags[t] = config.get(config.get("rank_method", "function"), t)
        except Exception:
            register_exception(prefix="attribute " + t + " missing in config",
                               alert_admin=True)
            return initial_selfcitdict

    # parse the tags
    mainauthortag = tagify(parse_tag(raw_tags['first_author']))
    coauthortag = tagify(parse_tag(raw_tags['additional_author']))
    extauthortag = tagify(parse_tag(raw_tags['alternative_author_name']))

    selfcites = initial_selfcitdict
    for i, k in enumerate(new_record_list):
        if i % 1000 == 0:
            mesg = "Selfcites done " + str(i) + " of " + \
                   str(len(new_record_list)) + " records"
            write_message(mesg)
            task_update_progress(mesg)

        if k not in citationdic:
            # nobody cites k: no need to look up its authors
            continue

        # All authors of k.  The lists are merged with extend(): append()
        # would nest them and co-authors would never be matched.
        authorlist = get_fieldvalues(k, mainauthortag)
        authorlist.extend(get_fieldvalues(k, coauthortag))
        authorlist.extend(get_fieldvalues(k, extauthortag))
        authors = set(authorlist)

        # go through the "x-cites-this" list
        for c in citationdic[k]:
            # get authors of c
            cauthorlist = get_fieldvalues(c, mainauthortag)
            cauthorlist.extend(get_fieldvalues(c, coauthortag))
            cauthorlist.extend(get_fieldvalues(c, extauthortag))
            if not any(ca in authors for ca in cauthorlist):
                continue
            # found a shared author: c is a self-citation of k
            citers = selfcites.get(k)
            if not citers:
                # new key, or an existing but empty entry
                selfcites[k] = [c]
            elif c not in citers:
                # add only if not there already
                citers.append(c)

    mesg = "Selfcites done fully"
    write_message(mesg)
    task_update_progress(mesg)

    return selfcites
def get_citation_informations(recid_list, config):
    """Scan the records of recid_list for report numbers, references
    (999C5x -fields) and publication info.

    Returns a list of 4 dictionaries, all keyed by recid:
      [0] recid -> list of report numbers of the record
          (primary followed by additional ones)
      [1] recid -> list of report numbers the record refers to
      [2] recid -> list of journal-style references of the record
      [3] recid -> publication info string of the record itself
    examples: [ {} {} {} {} ]
              [ {5: ['SUT-DP-92-70-5']},
                {93: ['astro-ph/9812088']},
                {93: ['Phys. Rev. Lett. 96 (2006) 081301']}, {} ]
    Four empty dictionaries are returned when a mandatory option is
    missing from the config.

    NB: stuff here is for analysing new or changed records.
    see "ref_analyzer" for more.
    """
    begin_time = os.times()[4]
    d_reports_numbers = {}  # recid -> institute-given-report-code
    d_references_report_numbers = {}  # recid -> ['astro-ph/xyz']
    d_references_s = {}  # recid -> entries of this rec's bibliography
    d_records_s = {}  # recid -> this record's publication info

    # The section name must be read inside the try: reading it first for
    # the debug message would raise before the error could be reported.
    try:
        function = config.get("rank_method", "function")
    except Exception:
        register_exception(
            prefix="cfg section [rank_method] has no attribute called function",
            alert_admin=True)
        # we cannot continue
        return [{}, {}, {}, {}]
    write_message("config function " + function, verbose=9)

    # Mandatory options, with the start of the error prefix used for each
    mandatory_options = (
        ("primary_report_number", "cfg section "),
        ("additional_report_number", "config error. cfg section "),
        ("reference_via_report_number", "config error. cfg section "),
        ("reference_via_pubinfo", "config error. cfg section "),
    )
    raw_tags = {}
    for option, prefix_start in mandatory_options:
        try:
            raw_tags[option] = config.get(function, option)
        except Exception:
            register_exception(prefix=prefix_start + function +
                               " has no attribute " + option,
                               alert_admin=True)
            return [{}, {}, {}, {}]

    # 037a: contains (often) the "hep-ph/0501084" tag of THIS record
    p_record_pri_number_tag = tagify(
        parse_tag(raw_tags["primary_report_number"]))
    # 088a: additional short identifier for the record
    p_record_add_number_tag = tagify(
        parse_tag(raw_tags["additional_report_number"]))
    # 999C5r. this is in the reference list, refers to other records.
    # Looks like: hep-ph/0408002
    p_reference_number_tag = tagify(
        parse_tag(raw_tags["reference_via_report_number"]))
    # 999C5s. A standardized way of writing a reference in the reference
    # list. Like: Nucl. Phys. B 710 (2000) 371
    p_reference_tag = tagify(parse_tag(raw_tags["reference_via_pubinfo"]))

    # fields needed to construct the pubinfo for this record
    publication_pages_tag = ""
    publication_year_tag = ""
    publication_journal_tag = ""
    publication_volume_tag = ""
    publication_format_string = "p v (y) c"
    try:
        publication_pages_tag = tagify(
            parse_tag(config.get(function, "pubinfo_journal_page")))
        publication_year_tag = tagify(
            parse_tag(config.get(function, "pubinfo_journal_year")))
        publication_journal_tag = tagify(
            parse_tag(config.get(function, "pubinfo_journal_title")))
        publication_volume_tag = tagify(
            parse_tag(config.get(function, "pubinfo_journal_volume")))
        publication_format_string = config.get(function,
                                               "pubinfo_journal_format")
    except Exception:
        # Best effort: these options are optional.  Tags left empty
        # simply disable the publication info below.
        pass

    # print values for tags for debugging
    if task_get_task_param('verbose') >= 9:
        write_message("tag values")
        write_message("p_record_pri_number_tag " +
                      str(p_record_pri_number_tag))
        write_message("p_reference_tag " + str(p_reference_tag))
        write_message("publication_journal_tag " +
                      str(publication_journal_tag))
        write_message("publication_format_string is " +
                      publication_format_string)

    build_pubinfo = publication_pages_tag and publication_journal_tag and \
        publication_volume_tag and publication_year_tag and \
        publication_format_string

    done = 0  # for status reporting
    numrecs = len(recid_list)

    # perform quick check to see if there are some records with
    # reference tags, because otherwise get.cit.inf would be slow even
    # if there is nothing to index:
    if run_sql("SELECT value FROM bib%sx WHERE tag=%%s LIMIT 1" %
               p_reference_tag[0:2], (p_reference_tag,)) or \
       run_sql("SELECT value FROM bib%sx WHERE tag=%%s LIMIT 1" %
               p_reference_number_tag[0:2], (p_reference_number_tag,)):
        for recid in recid_list:
            if done % 10 == 0:
                task_sleep_now_if_required()
                # in fact we can sleep any time here
            if done % 1000 == 0:
                mesg = "get cit.inf done " + str(done) + " of " + str(numrecs)
                write_message(mesg)
                task_update_progress(mesg)
            done += 1

            l_report_numbers = get_fieldvalues(recid, p_record_pri_number_tag)
            l_report_numbers.extend(
                get_fieldvalues(recid, p_record_add_number_tag))
            d_reports_numbers[recid] = l_report_numbers

            reference_report_numbers = get_fieldvalues(
                recid, p_reference_number_tag)
            if reference_report_numbers:
                d_references_report_numbers[recid] = reference_report_numbers

            references_s = get_fieldvalues(recid, p_reference_tag)
            write_message(str(recid) + "'s " + str(p_reference_tag) +
                          " values " + str(references_s), verbose=9)
            if references_s:
                d_references_s[recid] = references_s

            # get a combination of
            # journal vol (year) pages
            if not build_pubinfo:
                continue
            # we store the tags and their values here
            # like c->444 y->1999 p->"journal of foo",v->20
            tagsvalues = {"p": "", "y": "", "c": "", "v": ""}
            tmp = get_fieldvalues(recid, publication_journal_tag)
            if tmp:
                tagsvalues["p"] = tmp[0]
            tmp = get_fieldvalues(recid, publication_volume_tag)
            if tmp:
                tagsvalues["v"] = tmp[0]
            tmp = get_fieldvalues(recid, publication_year_tag)
            if tmp:
                tagsvalues["y"] = tmp[0]
            tmp = get_fieldvalues(recid, publication_pages_tag)
            if tmp:
                # if the page numbers have "x-y" take just x
                pages = tmp[0]
                hpos = pages.find("-")
                if hpos > 0:
                    pages = pages[:hpos]
                tagsvalues["c"] = pages

            # format the publ infostring according to the format
            pieces = []
            ok = True
            for current in publication_format_string:
                if current in tagsvalues:
                    # one of the supported placeholders: p, c, v, y
                    if not tagsvalues[current]:
                        ok = False  # it was needed and not found
                        break
                    pieces.append(tagsvalues[current])
                else:
                    # just add the character in the format string
                    pieces.append(current)
            if ok:
                publ = "".join(pieces)
                write_message("d_records_s (publication info) for " +
                              str(recid) + " is " + publ, verbose=9)
                d_records_s[recid] = publ
    else:
        mesg = "Warning: there are no records with tag values for "
        mesg += p_reference_number_tag + " or " + p_reference_tag + \
                ". Nothing to do."
        write_message(mesg)

    mesg = "get cit.inf done fully"
    write_message(mesg)
    task_update_progress(mesg)

    citation_informations = [d_reports_numbers,
                             d_references_report_numbers,
                             d_references_s,
                             d_records_s]
    end_time = os.times()[4]
    write_message("Execution time for generating citation info from record: %.2f sec" % \
                  (end_time - begin_time))
    return citation_informations
def get_author_citations(updated_redic_list, citedbydict, initial_author_dict,
                         config):
    """Traverses citedbydict in order to build "which author is quoted where"
    dict. The keys of this are author names. An entry like
    "Apollinaire"->[1,2,3] means Apollinaire is cited in records 1,2 and 3.

    Input: citedbydict, updated_redic_list = records to be searched,
           initial_author_dict: the dicts from the database.
    Output: authorciteddict. It is initially set to initial_author_dict
            (which is updated in place).  initial_author_dict is returned
            untouched if an author tag option is missing from the config.
    """
    # sorry bout repeated code to get the tags
    tagvals = {}
    for t in ('first_author', 'additional_author', 'alternative_author_name'):
        try:
            tagvals[t] = config.get(config.get("rank_method", "function"), t)
        except Exception:
            register_exception(prefix="attribute "+t+" missing in config",
                               alert_admin=True)
            return initial_author_dict

    # parse the tags
    mainauthortag = tagify(parse_tag(tagvals['first_author']))
    coauthortag = tagify(parse_tag(tagvals['additional_author']))
    extauthortag = tagify(parse_tag(tagvals['alternative_author_name']))
    if task_get_task_param('verbose') >= 9:
        write_message("mainauthortag "+mainauthortag)
        write_message("coauthortag "+coauthortag)
        write_message("extauthortag "+extauthortag)

    author_cited_in = initial_author_dict
    if not citedbydict:
        return author_cited_in

    def _credit_authors(recid, citers):
        """Record 'citers' as citing every author of record 'recid'."""
        authors = get_fieldvalues(recid, mainauthortag)
        authors.extend(get_fieldvalues(recid, coauthortag))
        authors.extend(get_fieldvalues(recid, extauthortag))
        for a in authors:
            if not a:
                # an empty name must not become a key of the result
                continue
            if a in author_cited_in:
                # add all citers that are not there already
                known = author_cited_in[a]
                for citer in citers:
                    if citer not in known:
                        known.append(citer)
            else:
                # Store a copy: sharing the list held by citedbydict would
                # let later appends leak into citedbydict and into the
                # entries of the other authors of the same record.
                author_cited_in[a] = list(citers)

    write_message("Checking records referred to in new records")
    for i, u in enumerate(updated_redic_list):
        if i % 1000 == 0:
            mesg = "Author ref done "+str(i)+" of "+str(len(updated_redic_list))+" records"
            write_message(mesg)
            task_update_progress(mesg)
        if u in citedbydict:
            # use an empty list when the stored value is None
            _credit_authors(u, citedbydict[u] or [])

    mesg = "Author ref done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # go through the dictionary again: all keys but search only if new
    # records are cited
    write_message("Checking authors in new records")
    updated_recids = set(updated_redic_list)  # built once, not per record
    for i, k in enumerate(list(citedbydict.keys())):
        if i % 1000 == 0:
            mesg = "Author cit done "+str(i)+" of "+str(len(citedbydict.keys()))+" records"
            write_message(mesg)
            task_update_progress(mesg)
        these_cite_k = citedbydict[k]
        if these_cite_k is None:
            these_cite_k = []  # verify it is an empty list, not None
        # do things only if these_cite_k contains any new stuff
        if updated_recids.intersection(these_cite_k):
            _credit_authors(k, these_cite_k)

    mesg = "Author cit done fully"
    write_message(mesg)
    task_update_progress(mesg)

    return author_cited_in
def test_parse_tag(self):
    """ bibformat - result of parsing tags"""
    # Raw tags and, in the same order, the [tag, ind1, ind2, subfield]
    # lists that parse_tag is expected to return for them.
    raw_tags = [
        "245COc", "245C_c", "245__c", "245__$$c", "245__$c",
        "245 $c", "245 $$c", "245__.c", "245 .c", "245C_$c",
        "245CO$$c", "245CO.c", "245$c", "245.c", "245$$c",
        "245__%", "245__$$%", "245__$%", "245 $%", "245 $$%",
        "245$%", "245.%", "245_O.%", "245.%", "245$$%",
        "2%5$$a", "2%%%%a", "2%%__a", "2%%a",
    ]
    expected_results = [
        ["245", "C", "O", "c"],
        ["245", "C", "", "c"],
        ["245", "", "", "c"],
        ["245", "", "", "c"],
        ["245", "", "", "c"],
        ["245", "", "", "c"],
        ["245", "", "", "c"],
        ["245", "", "", "c"],
        ["245", "", "", "c"],
        ["245", "C", "", "c"],
        ["245", "C", "O", "c"],
        ["245", "C", "O", "c"],
        ["245", "", "", "c"],
        ["245", "", "", "c"],
        ["245", "", "", "c"],
        ["245", "", "", "%"],
        ["245", "", "", "%"],
        ["245", "", "", "%"],
        ["245", "", "", "%"],
        ["245", "", "", "%"],
        ["245", "", "", "%"],
        ["245", "", "", "%"],
        ["245", "", "O", "%"],
        ["245", "", "", "%"],
        ["245", "", "", "%"],
        ["2%5", "", "", "a"],
        ["2%%", "%", "%", "a"],
        ["2%%", "", "", "a"],
        ["2%%", "", "", "a"],
    ]
    for raw_tag, expected in zip(raw_tags, expected_results):
        self.assertEqual(bibformat_utils.parse_tag(raw_tag), expected)
def get_citation_informations(recid_list, config):
    """Scan the records of recid_list for report numbers, references
    (999C5x -fields) and publication info.  Records found in
    INTBITSET_OF_DELETED_RECORDS are skipped.

    Returns a list of 4 dictionaries, all keyed by recid:
      [0] recid -> list of report numbers of the record
          (primary followed by additional ones)
      [1] recid -> list of report numbers the record refers to
      [2] recid -> list of journal-style references of the record
      [3] recid -> publication info string of the record itself
    examples: [ {} {} {} {} ]
              [ {5: ['SUT-DP-92-70-5']},
                {93: ['astro-ph/9812088']},
                {93: ['Phys. Rev. Lett. 96 (2006) 081301']}, {} ]
    Four empty dictionaries are returned when a mandatory option is
    missing from the config.

    NB: stuff here is for analysing new or changed records.
    see "ref_analyzer" for more.
    """
    begin_time = os.times()[4]
    d_reports_numbers = {}  # recid -> institute-given-report-code
    d_references_report_numbers = {}  # recid -> ['astro-ph/xyz']
    d_references_s = {}  # recid -> entries of this rec's bibliography
    d_records_s = {}  # recid -> this record's publication info

    # The section name must be read inside the try: reading it first for
    # the debug message would raise before the error could be reported.
    try:
        function = config.get("rank_method", "function")
    except Exception:
        register_exception(
            prefix="cfg section [rank_method] has no attribute called function",
            alert_admin=True)
        # we cannot continue
        return [{}, {}, {}, {}]
    write_message("config function "+function, verbose=9)

    def _mandatory_option(option, prefix_start):
        """Return the raw option value, or None after reporting it missing."""
        try:
            return config.get(function, option)
        except Exception:
            register_exception(prefix=prefix_start+function+
                               " has no attribute "+option,
                               alert_admin=True)
            return None

    record_pri_number_tag = _mandatory_option(
        "primary_report_number", "cfg section ")
    if record_pri_number_tag is None:
        return [{}, {}, {}, {}]
    record_add_number_tag = _mandatory_option(
        "additional_report_number", "config error. cfg section ")
    if record_add_number_tag is None:
        return [{}, {}, {}, {}]
    reference_number_tag = _mandatory_option(
        "reference_via_report_number", "config error. cfg section ")
    if reference_number_tag is None:
        return [{}, {}, {}, {}]
    reference_tag = _mandatory_option(
        "reference_via_pubinfo", "config error. cfg section ")
    if reference_tag is None:
        return [{}, {}, {}, {}]

    # 037a: contains (often) the "hep-ph/0501084" tag of THIS record
    p_record_pri_number_tag = tagify(parse_tag(record_pri_number_tag))
    # 088a: additional short identifier for the record
    p_record_add_number_tag = tagify(parse_tag(record_add_number_tag))
    # 999C5r. this is in the reference list, refers to other records.
    # Looks like: hep-ph/0408002
    p_reference_number_tag = tagify(parse_tag(reference_number_tag))
    # 999C5s. A standardized way of writing a reference in the reference
    # list. Like: Nucl. Phys. B 710 (2000) 371
    p_reference_tag = tagify(parse_tag(reference_tag))

    # fields needed to construct the pubinfo for this record
    publication_pages_tag = ""
    publication_year_tag = ""
    publication_journal_tag = ""
    publication_volume_tag = ""
    publication_format_string = "p v (y) c"
    try:
        tag = config.get(function, "pubinfo_journal_page")
        publication_pages_tag = tagify(parse_tag(tag))
        tag = config.get(function, "pubinfo_journal_year")
        publication_year_tag = tagify(parse_tag(tag))
        tag = config.get(function, "pubinfo_journal_title")
        publication_journal_tag = tagify(parse_tag(tag))
        tag = config.get(function, "pubinfo_journal_volume")
        publication_volume_tag = tagify(parse_tag(tag))
        publication_format_string = config.get(function,
                                               "pubinfo_journal_format")
    except Exception:
        # Best effort: these options are optional.  Tags left empty
        # simply disable the publication info below.
        pass

    # print values for tags for debugging
    if task_get_task_param('verbose') >= 9:
        write_message("tag values")
        write_message("p_record_pri_number_tag "+str(p_record_pri_number_tag))
        write_message("p_reference_tag "+str(p_reference_tag))
        write_message("publication_journal_tag "+str(publication_journal_tag))
        write_message("publication_format_string is "+publication_format_string)

    want_pubinfo = publication_pages_tag and publication_journal_tag and \
        publication_volume_tag and publication_year_tag and \
        publication_format_string
    # placeholder of the format string -> tag holding its value
    pubinfo_tags = (("p", publication_journal_tag),
                    ("v", publication_volume_tag),
                    ("y", publication_year_tag),
                    ("c", publication_pages_tag))

    done = 0  # for status reporting
    numrecs = len(recid_list)

    # perform quick check to see if there are some records with
    # reference tags, because otherwise get.cit.inf would be slow even
    # if there is nothing to index:
    if run_sql("SELECT value FROM bib%sx WHERE tag=%%s LIMIT 1" %
               p_reference_tag[0:2], (p_reference_tag,)) or \
       run_sql("SELECT value FROM bib%sx WHERE tag=%%s LIMIT 1" %
               p_reference_number_tag[0:2], (p_reference_number_tag,)):
        for recid in recid_list:
            if done % 10 == 0:
                task_sleep_now_if_required()
                # in fact we can sleep any time here
            if done % 1000 == 0:
                mesg = "get cit.inf done "+str(done)+" of "+str(numrecs)
                write_message(mesg)
                task_update_progress(mesg)
            done = done+1

            if recid in INTBITSET_OF_DELETED_RECORDS:
                # do not treat this record since it was deleted; we
                # skip it like this in case it was only soft-deleted
                # e.g. via bibedit (i.e. when collection tag 980 is
                # DELETED but other tags like report number or journal
                # publication info remained the same, so the calls to
                # get_fieldvalues() below would return old values)
                continue

            pri_report_numbers = get_fieldvalues(recid,
                                                 p_record_pri_number_tag)
            add_report_numbers = get_fieldvalues(recid,
                                                 p_record_add_number_tag)
            pri_report_numbers.extend(add_report_numbers)
            d_reports_numbers[recid] = pri_report_numbers

            reference_report_numbers = get_fieldvalues(
                recid, p_reference_number_tag)
            if reference_report_numbers:
                d_references_report_numbers[recid] = reference_report_numbers

            # fetched once only (it used to be queried twice per record)
            references_s = get_fieldvalues(recid, p_reference_tag)
            write_message(str(recid)+"'s "+str(p_reference_tag)+" values "+
                          str(references_s), verbose=9)
            if references_s:
                d_references_s[recid] = references_s

            # get a combination of
            # journal vol (year) pages
            if not want_pubinfo:
                continue
            # we store the tags and their values here
            # like c->444 y->1999 p->"journal of foo",v->20
            tagsvalues = {}
            for placeholder, field_tag in pubinfo_tags:
                found = get_fieldvalues(recid, field_tag)
                tagsvalues[placeholder] = found and found[0] or ""
            # if the page numbers have "x-y" take just x
            hpos = tagsvalues["c"].find("-")
            if hpos > 0:
                tagsvalues["c"] = tagsvalues["c"][:hpos]

            # format the publ infostring according to the format
            pieces = []
            ok = True
            for current in publication_format_string:
                if current in tagsvalues:
                    # one of the supported placeholders: p, c, v, y
                    if not tagsvalues[current]:
                        ok = False  # it was needed and not found
                        break
                    pieces.append(tagsvalues[current])
                else:
                    # just add the character in the format string
                    pieces.append(current)
            if ok:
                publ = "".join(pieces)
                write_message("d_records_s (publication info) for "+
                              str(recid)+" is "+publ, verbose=9)
                d_records_s[recid] = publ
    else:
        mesg = "Warning: there are no records with tag values for "
        mesg += p_reference_number_tag+" or "+p_reference_tag+". Nothing to do."
        write_message(mesg)

    mesg = "get cit.inf done fully"
    write_message(mesg)
    task_update_progress(mesg)

    citation_informations = [d_reports_numbers,
                             d_references_report_numbers,
                             d_references_s,
                             d_records_s]
    end_time = os.times()[4]
    write_message("Execution time for generating citation info from record: %.2f sec" % \
                  (end_time - begin_time))
    return citation_informations
def format_element(bfo, tag, limit, instances_separator=" ",
                   subfields_separator=" ", extension="", output_pattern=""):
    """
    Prints the given field of a record.
    If tag is in range [001, 010], this element assumes
    that it accesses a control field. Else it considers it
    accesses a data field.

    <p>For eg. consider the following metadata:
    <pre>
 100__ $$aCalatroni, S$$uCERN
 245__ $$aStatus of the EP Simulations and Facilities for the SPL
 700__ $$aFerreira, L$$uCERN
 700__ $$aMacatrao, M$$uCERN
 700__ $$aSkala, A$$uCERN
 700__ $$aSosin, M$$uCERN
 700__ $$ade Waele, R$$uCERN
 700__ $$aWithofs, Y$$uKHLim, Diepenbeek
    </pre>
    The following calls to bfe_field would print:
    <pre>
    <BFE_FIELD tag="700" instances_separator="<br/>" subfields_separator=" - ">

    Ferreira, L - CERN
    Macatrao, M - CERN
    Skala, A - CERN
    Sosin, M - CERN
    de Waele, R - CERN
    Withofs, Y - KHLim, Diepenbeek
    </pre>
    </p>

    <p>For more advanced formatting, the <code>output_pattern</code>
    parameter can be used to output the subfields of each instance in
    the specified way. For eg. consider the following metadata:
    <pre>
 775__ $$b15. Aufl.$$c1995-1996$$nv.1$$pGrundlagen und Werkstoffe$$w317999
 775__ $$b12. Aufl.$$c1963$$w278898
 775__ $$b14. Aufl.$$c1983$$w107899
 775__ $$b13. Aufl.$$c1974$$w99635
    </pre>
    with the following <code>output_pattern</code>:
    <pre>
    <a href="/record/%(w)s">%(b)s (%(c)s) %(n)s %(p)s</a>
    </pre>
    would print:<br/>

    <a href="/record/317999">15. Aufl. (1995-1996) v.1 Grundlagen und Werkstoffe</a><br/>
    <a href="/record/278898">12. Aufl. (1963) </a><br/>
    <a href="/record/107899">14. Aufl. (1983) </a><br/>
    <a href="/record/99635">13. Aufl. (1974) </a>

    <br/>(<code>instances_separator="<br/>"</code> set for
    readability)<br/> The output pattern must follow <a
    href="http://docs.python.org/library/stdtypes.html#string-formatting-operations">Python
    string formatting</a> syntax. The format must use parenthesized
    notation to map to the subfield code. This currently restricts the
    support of <code>output_pattern</code> to non-repeatable
    subfields</p>

    @param tag: the tag code of the field that is to be printed
    @param instances_separator: a separator between instances of field
    @param subfields_separator: a separator between subfields of an instance
    @param limit: the maximum number of values to display.
    @param extension: a text printed at the end if 'limit' has been exceeded
    @param output_pattern: when specified, prints the subfields of each instance according to pattern specified as parameter (following Python string formatting convention)
    """
    p_tag = parse_tag(tag)
    if not p_tag[0].isdigit():
        # Not a numeric tag: neither a control nor a data field
        return ''
    if int(p_tag[0]) in range(0, 11):
        # Control field
        return bfo.control_field(tag)

    # Data field: get the instances without subcode.
    # Unneeded subcodes are filtered out below.
    ind1 = p_tag[1] or '_'
    ind2 = p_tag[2] or '_'
    values = bfo.fields(p_tag[0] + ind1 + ind2)  # list of dicts

    # A limit that is not made of digits only means "no limit"
    max_values = None
    if limit.isdigit():
        max_values = int(limit)

    def render(instance, subfield_values):
        """Return the text for one instance, or None if it cannot be built."""
        if not output_pattern:
            return subfields_separator.join(subfield_values)
        try:
            return output_pattern % DictNoKeyError(instance)
        except Exception:
            # Best effort: an instance the pattern cannot be applied to
            # is left out rather than aborting the whole element.
            return None

    shown = 0                # number of values counted so far
    limit_exceeded = False   # True once a value had to be left out
    instances_out = []       # retain each instance output
    for instance in values:
        filtered_values = [value for (subcode, value) in instance.items()
                           if p_tag[3] in ('', '%', subcode)]
        if not filtered_values:
            continue
        must_stop = False
        if max_values is not None and \
               shown + len(filtered_values) > max_values:
            # At least one value does not fit any more: keep what still
            # fits and remember that something was dropped, so that
            # 'extension' is printed.  (The flag is needed because the
            # number of counted values can never itself exceed the limit.)
            limit_exceeded = True
            must_stop = True
            filtered_values = filtered_values[:max_values - shown]
        if filtered_values:  # do not append empty list!
            text = render(instance, filtered_values)
            if text is not None:
                instances_out.append(text)
            shown += len(filtered_values)
        if must_stop:
            break  # No need to go further

    ext_out = ''
    if limit_exceeded:
        ext_out = extension
    return instances_separator.join(instances_out) + ext_out
def get_tags_config(config):
    """Fetch needs config from our config file"""
    # Probably "citation" unless this file gets renamed
    function = config.get("rank_method", "function")
    write_message("config function %s" % function, verbose=9)

    def read_tag(option):
        """Return the tagified option, or None when the option is absent."""
        try:
            raw = config.get(function, option)
        except ConfigParser.NoOptionError:
            return None
        return tagify(parse_tag(raw))

    tags = {}

    # 037a: contains (often) the "hep-ph/0501084" tag of THIS record
    tags['record_pri_number'] = read_tag("primary_report_number")

    # 088a: additional short identifier for the record
    tags['record_add_number'] = read_tag("additional_report_number")

    # 999C5r. this is in the reference list, refers to other records.
    # Looks like: hep-ph/0408002
    tags['refs_report_number'] = read_tag("reference_via_report_number")

    # 999C5s. this is in the reference list, refers to other records.
    # Looks like: Phys.Rev.,A21,78
    tags['refs_journal'] = read_tag("reference_via_pubinfo")

    # 999C5a. this is in the reference list, refers to other records.
    # Looks like: 10.1007/BF03170733
    tags['refs_doi'] = read_tag("reference_via_doi")

    # Fields needed to construct the journals for this record.
    # All four options are read first; if any is absent the whole
    # 'publication' entry is None.
    publication_options = (
        ('pages', "pubinfo_journal_page"),
        ('year', "pubinfo_journal_year"),
        ('journal', "pubinfo_journal_title"),
        ('volume', "pubinfo_journal_volume"),
    )
    try:
        raw_publication = [(name, config.get(function, option))
                           for name, option in publication_options]
    except ConfigParser.NoOptionError:
        tags['publication'] = None
    else:
        publication = {}
        for name, raw in raw_publication:
            publication[name] = tagify(parse_tag(raw))
        tags['publication'] = publication

    # Fields needed to lookup the DOIs
    tags['doi'] = get_field_tags('doi')

    # 999C5s. A standardized way of writing a reference in the reference list.
    # Like: Nucl. Phys. B 710 (2000) 371
    try:
        publication_format = config.get(function, "pubinfo_journal_format")
    except ConfigParser.NoOptionError:
        publication_format = CFG_JOURNAL_PUBINFO_STANDARD_FORM
    tags['publication_format'] = publication_format

    # Print values of tags for debugging
    write_message("tag values: %r" % [tags], verbose=9)

    return tags
def format_element(bfo, tag, limit, instances_separator=" ",
                   subfields_separator=" ", extension="", output_pattern=""):
    """
    Prints the given field of a record.
    If tag is in range [001, 010], this element assumes
    that it accesses a control field. Else it considers it
    accesses a data field.

    <p>For eg. consider the following metadata:
    <pre>
    100__ $$aCalatroni, S$$uCERN
    245__ $$aStatus of the EP Simulations and Facilities for the SPL
    700__ $$aFerreira, L$$uCERN
    700__ $$aMacatrao, M$$uCERN
    700__ $$aSkala, A$$uCERN
    700__ $$aSosin, M$$uCERN
    700__ $$ade Waele, R$$uCERN
    700__ $$aWithofs, Y$$uKHLim, Diepenbeek
    </pre>
    The following calls to bfe_field would print:
    <pre>
    <BFE_FIELD tag="700" instances_separator="<br/>" subfields_separator=" - ">

    Ferreira, L - CERN
    Macatrao, M - CERN
    Skala, A - CERN
    Sosin, M - CERN
    de Waele, R - CERN
    Withofs, Y - KHLim, Diepenbeek
    </pre>
    </p>

    <p>For more advanced formatting, the <code>output_pattern</code>
    parameter can be used to output the subfields of each instance in
    the specified way. For eg. consider the following metadata:
    <pre>
    775__ $$b15. Aufl.$$c1995-1996$$nv.1$$pGrundlagen und Werkstoffe$$w317999
    775__ $$b12. Aufl.$$c1963$$w278898
    775__ $$b14. Aufl.$$c1983$$w107899
    775__ $$b13. Aufl.$$c1974$$w99635
    </pre>
    with the following <code>output_pattern</code>:
    <pre>
    <a href="/record/%(w)s">%(b)s (%(c)s) %(n)s %(p)s</a>
    </pre>
    would print:<br/>

    <a href="/record/317999">15. Aufl. (1995-1996) v.1 Grundlagen und Werkstoffe</a><br/>
    <a href="/record/278898">12. Aufl. (1963) </a><br/>
    <a href="/record/107899">14. Aufl. (1983) </a><br/>
    <a href="/record/99635">13. Aufl. (1974) </a>

    <br/>(<code>instances_separator="<br/>"</code> set for
    readability)<br/> The output pattern must follow <a
    href="http://docs.python.org/library/stdtypes.html#string-formatting-operations">Python
    string formatting</a> syntax. The format must use parenthesized
    notation to map to the subfield code. This currently restricts the
    support of <code>output_pattern</code> to non-repeatable
    subfields</p>

    @param tag: the tag code of the field that is to be printed
    @param instances_separator: a separator between instances of field
    @param subfields_separator: a separator between subfields of an instance
    @param limit: the maximum number of values to display. A value that cannot be converted to an integer (including an empty or missing one), or 0, means no limit.
    @param extension: a text printed at the end if 'limit' has been exceeded
    @param output_pattern: when specified, prints the subfields of each instance according to pattern specified as parameter (following Python string formatting convention)
    @param bfo: BibFormatObject which represents the record to format.
    """
    # An unusable limit means "no limit". TypeError is caught as well as
    # ValueError so that None (or any other non-string, non-number)
    # does not abort the formatting.
    try:
        limit = int(limit)
    except (TypeError, ValueError):
        limit = 0

    # Check if data or control field
    p_tag = parse_tag(tag)
    if p_tag[0].isdigit() and int(p_tag[0]) <= 10:
        return bfo.control_field(tag)

    # Empty indicators are replaced by '_' before the field is queried.
    if p_tag[1] == '':
        p_tag[1] = '_'
    if p_tag[2] == '':
        p_tag[2] = '_'
    field_spec = ''.join(p_tag)

    # With a pattern the instances are fetched through bfo.fields(),
    # otherwise through bfo.fields_ordered().
    if output_pattern:
        values = bfo.fields(field_spec)
    else:
        values = bfo.fields_ordered(field_spec)

    # values may be empty; otherwise the type of its first element
    # decides how the instances are rendered.
    if not values:
        return ''

    first_value = values[0]
    if isinstance(first_value, list):
        if not limit:
            flos = [subfields_separator.join(instance) for instance in values]
        else:
            flos = []  # Final list of strings
            seen = 0   # Number of values met so far, current instance included
            for instance in values:
                seen += len(instance)
                if seen > limit:
                    # limit - seen is negative here: the slice drops the
                    # (seen - limit) surplus values of this instance.
                    flos.append(
                        subfields_separator.join(instance[:limit - seen])
                        + extension)
                    break
                flos.append(subfields_separator.join(instance))
    elif isinstance(first_value, dict):
        # Note that 'limit' and 'extension' are not applied in this branch.
        flos = [output_pattern % DictNoKeyError(instance)
                for instance in values]
    else:
        flos = values

    return instances_separator.join(flos)
def get_author_citations(updated_redic_list, citedbydict, initial_author_dict, config):
    """Traverses citedbydict in order to build "which author is quoted where" dict.

    The keys of the result are author names. An entry like
    "Apollinaire"->[1,2,3] means Apollinaire is cited in records 1, 2 and 3.

    @param updated_redic_list: the records to be searched
    @param citedbydict: dict mapping a record to the list of records citing
        it (a None value is treated as an empty list). It is only read:
        the lists it holds are copied before being stored in the result.
    @param initial_author_dict: the author dict from the database. It is
        updated in place and is also the returned object.
    @param config: configuration object; the options 'first_author',
        'additional_author' and 'alternative_author_name' are read from the
        section named by the "function" option of "rank_method".
    @return: the updated author dict; initial_author_dict untouched when one
        of the options cannot be read (the problem is then reported through
        register_exception). Empty author names are ignored.
    """
    #sorry bout repeated code to get the tags
    raw_tags = []
    for option in ('first_author', 'additional_author', 'alternative_author_name'):
        try:
            raw_tags.append(config.get(config.get("rank_method", "function"), option))
        except Exception:
            # Any failure to read the option is reported and aborts the
            # update, but interpreter-level signals are left alone.
            register_exception(prefix="attribute " + option + " missing in config",
                               alert_admin=True)
            return initial_author_dict

    #parse the tags
    mainauthortag, coauthortag, extauthortag = [tagify(parse_tag(raw_tag))
                                                for raw_tag in raw_tags]
    if task_get_task_param('verbose') >= 9:
        write_message("mainauthortag " + mainauthortag)
        write_message("coauthortag " + coauthortag)
        write_message("extauthortag " + extauthortag)
    author_tags = (mainauthortag, coauthortag, extauthortag)

    author_cited_in = initial_author_dict
    if citedbydict:
        write_message("Checking records referred to in new records")
        total = len(updated_redic_list)
        for done, recid in enumerate(updated_redic_list):
            if done % 1000 == 0:
                mesg = "Author ref done " + str(done) + " of " + str(total) + " records"
                write_message(mesg)
                task_update_progress(mesg)
            if recid in citedbydict:
                citers = citedbydict[recid] or []  # None means no citer
                _add_citers_to_authors(
                    author_cited_in,
                    _get_record_author_names(recid, author_tags),
                    citers)
        mesg = "Author ref done fully"
        write_message(mesg)
        task_update_progress(mesg)

        #go through the dictionary again: all keys but search only if new records are cited
        write_message("Checking authors in new records")
        updated_recids = set(updated_redic_list)  # built once, not per record
        total = len(citedbydict)
        for done, recid in enumerate(citedbydict):
            if done % 1000 == 0:
                mesg = "Author cit done " + str(done) + " of " + str(total) + " records"
                write_message(mesg)
                task_update_progress(mesg)
            citers = citedbydict[recid] or []  # None means no citer
            #do things only if the citers contain any new stuff
            if updated_recids.intersection(citers):
                _add_citers_to_authors(
                    author_cited_in,
                    _get_record_author_names(recid, author_tags),
                    citers)
        mesg = "Author cit done fully"
        write_message(mesg)
        task_update_progress(mesg)

    return author_cited_in


def _get_record_author_names(recid, author_tags):
    """Return the values of all the given author tags for record recid."""
    names = []
    for author_tag in author_tags:
        names.extend(get_fieldvalues(recid, author_tag))
    return names


def _add_citers_to_authors(author_cited_in, authors, citers):
    """Add citers to the entry of each non-empty author name, without duplicates."""
    for author in authors:
        if not author:
            continue
        known_citers = author_cited_in.get(author)
        if known_citers is None:
            # Store a copy: keeping the caller's list would let later
            # appends leak into citedbydict and into other authors' entries.
            author_cited_in[author] = list(citers)
            continue
        for citer in citers:
            if citer not in known_citers:
                known_citers.append(citer)
def get_self_citations(new_record_list, citationdic, initial_selfcitdict, config):
    """Check which items have been cited by one of the authors of the
    citing item: go through id's in new_record_list, use citationdic to get
    citations, update "selfcites".

    A citing record counts as a self-citation when it shares at least one
    non-empty author name (first author, additional author or alternative
    author name) with the cited record.

    @param new_record_list: ids of the records to examine
    @param citationdic: dict mapping a record to the list of records citing
        it (a None value is treated as an empty list)
    @param initial_selfcitdict: initial self-citation dict, mapping a record
        to the list of its self-citing records. It is updated in place and
        is also the returned object.
    @param config: configuration object; the options 'first_author',
        'additional_author' and 'alternative_author_name' are read from the
        section named by the "function" option of "rank_method".
    @return: the updated self-citation dict; initial_selfcitdict untouched
        when one of the options cannot be read (the problem is then reported
        through register_exception).
    """
    #get the tags for main author, coauthors, ext authors from config
    raw_tags = []
    for option in ('first_author', 'additional_author', 'alternative_author_name'):
        try:
            raw_tags.append(config.get(config.get("rank_method", "function"), option))
        except Exception:
            register_exception(prefix="attribute "+option+" missing in config",
                               alert_admin=True)
            return initial_selfcitdict

    #parse the tags
    author_tags = [tagify(parse_tag(raw_tag)) for raw_tag in raw_tags]

    selfcites = initial_selfcitdict
    total = len(new_record_list)
    for done, recid in enumerate(new_record_list):
        if done % 1000 == 0:
            mesg = "Selfcites done "+str(done)+" of "+str(total)+" records"
            write_message(mesg)
            task_update_progress(mesg)

        # Nothing to do for a record nobody cites; checking this first
        # avoids fetching its author fields for nothing.
        if recid not in citationdic:
            continue

        # All author names of the cited record in one flat collection.
        # The names must be merged with extend(): appending the lists as
        # single elements would hide co-authors and alternative names from
        # the membership test below.
        authors = set([name for name
                       in _get_selfcite_author_names(recid, author_tags)
                       if name])

        #get the "x-cites-this" list
        for citer in citationdic[recid] or []:
            shares_author = False
            for citer_author in _get_selfcite_author_names(citer, author_tags):
                if citer_author in authors:
                    shares_author = True
                    break
            if not shares_author:
                continue
            known = selfcites.get(recid)
            if not known:
                # No entry yet, or an empty/None one: start a fresh list.
                selfcites[recid] = [citer]
            elif citer not in known:
                #add only if not there already
                known.append(citer)

    mesg = "Selfcites done fully"
    write_message(mesg)
    task_update_progress(mesg)

    return selfcites


def _get_selfcite_author_names(recid, author_tags):
    """Return the values of all the given author tags for record recid."""
    names = []
    for author_tag in author_tags:
        names.extend(get_fieldvalues(recid, author_tag))
    return names
def format_element(bfo, tag, limit, instances_separator=" ",
                   subfields_separator=" ", extension="", output_pattern=""):
    """
    Prints the given field of a record.
    If tag is in range [001, 010], this element assumes
    that it accesses a control field. Else it considers it
    accesses a data field.

    <p>For eg. consider the following metadata:
    <pre>
    100__ $$aCalatroni, S$$uCERN
    245__ $$aStatus of the EP Simulations and Facilities for the SPL
    700__ $$aFerreira, L$$uCERN
    700__ $$aMacatrao, M$$uCERN
    700__ $$aSkala, A$$uCERN
    700__ $$aSosin, M$$uCERN
    700__ $$ade Waele, R$$uCERN
    700__ $$aWithofs, Y$$uKHLim, Diepenbeek
    </pre>
    The following calls to bfe_field would print:
    <pre>
    <BFE_FIELD tag="700" instances_separator="<br/>" subfields_separator=" - ">

    Ferreira, L - CERN
    Macatrao, M - CERN
    Skala, A - CERN
    Sosin, M - CERN
    de Waele, R - CERN
    Withofs, Y - KHLim, Diepenbeek
    </pre>
    </p>

    <p>For more advanced formatting, the <code>output_pattern</code>
    parameter can be used to output the subfields of each instance in
    the specified way. For eg. consider the following metadata:
    <pre>
    775__ $$b15. Aufl.$$c1995-1996$$nv.1$$pGrundlagen und Werkstoffe$$w317999
    775__ $$b12. Aufl.$$c1963$$w278898
    775__ $$b14. Aufl.$$c1983$$w107899
    775__ $$b13. Aufl.$$c1974$$w99635
    </pre>
    with the following <code>output_pattern</code>:
    <pre>
    <a href="/record/%(w)s">%(b)s (%(c)s) %(n)s %(p)s</a>
    </pre>
    would print:<br/>

    <a href="/record/317999">15. Aufl. (1995-1996) v.1 Grundlagen und Werkstoffe</a><br/>
    <a href="/record/278898">12. Aufl. (1963) </a><br/>
    <a href="/record/107899">14. Aufl. (1983) </a><br/>
    <a href="/record/99635">13. Aufl. (1974) </a>

    <br/>(<code>instances_separator="<br/>"</code> set for
    readability)<br/> The output pattern must follow <a
    href="http://docs.python.org/library/stdtypes.html#string-formatting-operations">Python
    string formatting</a> syntax. The format must use parenthesized
    notation to map to the subfield code. This currently restricts the
    support of <code>output_pattern</code> to non-repeatable
    subfields</p>

    @param tag: the tag code of the field that is to be printed
    @param instances_separator: a separator between instances of field
    @param subfields_separator: a separator between subfields of an instance
    @param limit: the maximum number of values to display. Only a string made of digits (or a non-negative integer) is taken into account; anything else means no limit.
    @param extension: a text printed at the end if 'limit' has been exceeded
    @param output_pattern: when specified, prints the subfields of each instance according to pattern specified as parameter (following Python string formatting convention)
    """
    # Check if data or control field
    p_tag = parse_tag(tag)
    if not p_tag[0].isdigit():
        return ''
    if int(p_tag[0]) <= 10:
        return bfo.control_field(tag)

    # Get values without subcode.
    # We will filter unneeded subcode later
    indicator1 = p_tag[1] or '_'
    indicator2 = p_tag[2] or '_'
    values = bfo.fields(p_tag[0] + indicator1 + indicator2)  # Values will
                                                             # always be a
                                                             # list of
                                                             # dicts

    wanted_subcode = p_tag[3]
    any_subcode = wanted_subcode in ('', '%')

    # Work out the maximum number of values, None meaning "no limit".
    # Integers are accepted next to digit strings so that a caller
    # passing limit=5 does not fail on the missing isdigit() method.
    max_values = None
    if isinstance(limit, int):
        if limit >= 0:
            max_values = limit
    elif hasattr(limit, 'isdigit') and limit.isdigit():
        max_values = int(limit)

    shown = 0               # Number of values output so far
    limit_exceeded = False  # True once a value had to be left out
    instances_out = []      # Retain each instance output
    for instance in values:
        selected = [value for (subcode, value) in instance.items()
                    if any_subcode or subcode == wanted_subcode]
        if not selected:
            # No corresponding subcode in this instance
            continue

        if max_values is not None:
            room = max_values - shown
            if len(selected) > room:
                # Some values do not fit. This has to be remembered
                # here: 'shown' only counts what is really output, so
                # it can never grow past the limit by itself.
                limit_exceeded = True
                selected = selected[:room]  # Takes only needed one
                if not selected:
                    break  # do not append empty list!

        if output_pattern:
            # A pattern that cannot be applied to this instance is
            # skipped on purpose (best effort); its values still count.
            try:
                instances_out.append(output_pattern % DictNoKeyError(instance))
            except Exception:
                pass
        else:
            instances_out.append(subfields_separator.join(selected))
        shown += len(selected)

        if limit_exceeded:
            break  # No need to go further

    formatted = instances_separator.join(instances_out)
    if limit_exceeded:
        formatted += extension
    return formatted