def citations_nb_counts():
    """Return the citation count for the record named by the current view.

    The record id is read from ``request.view_args['recid']``.

    Returns:
        ``None`` when the view has no ``recid`` argument; ``0`` when
        ``CFG_BIBRANK_SHOW_CITATION_LINKS`` is falsy; otherwise the number
        of citing records.  When ``CFG_INSPIRE_SITE`` is set, only citers
        that are also matched by the ``collection:citeable`` search are
        counted.
    """
    recid = request.view_args.get("recid")
    if recid is None:
        return

    # Imported lazily, and only once we know there is a record to look at.
    from intbitset import intbitset
    from invenio.legacy.bibrank.citation_searcher import (
        get_cited_by,
        get_cited_by_count,
    )

    if not CFG_BIBRANK_SHOW_CITATION_LINKS:
        return 0

    if not CFG_INSPIRE_SITE:
        return get_cited_by_count(recid)

    from invenio.legacy.search_engine import search_unit

    citing = intbitset(get_cited_by(recid))
    citeable = search_unit(p="citeable", f="collection")
    # Count only the citers that are themselves citeable.
    return len(citing & citeable)
def citations_nb_counts():
    """Count the records citing the record `recid` of the current request.

    `recid` is taken from ``request.view_args``.  If it is absent the
    function returns ``None``.  If citation links are switched off
    (``CFG_BIBRANK_SHOW_CITATION_LINKS``) it returns ``0``.  On an INSPIRE
    site (``CFG_INSPIRE_SITE``) the citers are intersected with the result
    of searching ``citeable`` in ``collection`` before counting; elsewhere
    the count comes straight from ``get_cited_by_count``.
    """
    view_args = request.view_args
    recid = view_args.get('recid')
    if recid is None:
        return

    # Deferred imports: kept inside the function, in the original order.
    from intbitset import intbitset
    from invenio.legacy.bibrank.citation_searcher import (get_cited_by,
                                                          get_cited_by_count)

    if not CFG_BIBRANK_SHOW_CITATION_LINKS:
        return 0

    if CFG_INSPIRE_SITE:
        from invenio.legacy.search_engine import search_unit
        cited_by = intbitset(get_cited_by(recid))
        citeable = search_unit(p='citeable', f='collection')
        return len(cited_by & citeable)

    return get_cited_by_count(recid)
def deleted_recids_cache(cache={}):
    """Return the memoized result of searching for ``DELETED`` in ``980``.

    The mutable default ``cache`` is intentional: the dict is created once
    when the function is defined, so the search result stored under
    ``'deleted_records'`` is reused by every later call that does not pass
    its own ``cache``.

    Args:
        cache: dict used as the memo store; callers normally omit it.

    Returns:
        The value stored under ``cache['deleted_records']`` (computed with
        ``search_unit`` on the first call for a given ``cache``).
    """
    key = 'deleted_records'
    if key in cache:
        return cache[key]

    # First call for this cache: run the search once and remember it.
    cache[key] = search_unit(p='DELETED', f='980', m='a')
    return cache[key]
def deleted_recids_cache(cache={}):
    """Return the memoized result of searching ``DELETED`` in ``collection``.

    ``cache`` deliberately defaults to a single shared dict (evaluated once
    at definition time), which turns it into a process-wide memo: the
    search runs only when ``"deleted_records"`` is not yet stored in it.

    Args:
        cache: dict used as the memo store; callers normally omit it.

    Returns:
        The value stored under ``cache["deleted_records"]``.
    """
    memo_key = "deleted_records"
    if memo_key in cache:
        return cache[memo_key]

    # Not cached yet: perform the search once and keep the result.
    cache[memo_key] = search_unit(p="DELETED", f="collection", m="a")
    return cache[memo_key]
def ref_analyzer(citation_informations, dicts,
                 updated_recids, tags, do_catchup=True):
    """Analyze the citation informations and calculate the citation weight
    and cited by list dictionary.

    The dictionaries in `dicts` are updated in place.

    Args:
        citation_informations: pair ``(records_info, references_info)``.
            Each is a dict with the keys ``'report-numbers'``,
            ``'journals'`` and ``'doi'``, mapping a recid to an iterable of
            strings.  ``references_info`` holds what a record cites,
            ``records_info`` holds the record's own identifiers.
        dicts: dict with the keys ``'cites_weight'``, ``'cites'``,
            ``'refs'``, ``'selfcites'``, ``'selfrefs'`` and
            ``'authorcites'``.
        updated_recids: iterable of recids whose entries are dropped in
            phase 0 before being recomputed.
        tags: mapping providing ``'refs_report_number'``, ``'refs_journal'``
            and ``'refs_doi'``, the fields searched in the catch-up phases.
        do_catchup: when true, the updated records are also removed from
            the "cited by" dictionary in phase 0.

    Returns:
        Tuple ``(citations_weight, citations, references, selfcites,
        selfrefs, authorcites)``.
    """
    citations_weight = dicts['cites_weight']
    citations = dicts['cites']
    references = dicts['refs']
    selfcites = dicts['selfcites']
    selfrefs = dicts['selfrefs']
    authorcites = dicts['authorcites']

    def step(msg_prefix, recid, done, total):
        """Per-record bookkeeping: yield to the scheduler and log progress."""
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)

    def add_to_dicts(citer, cited):
        """Record that `citer` cites `cited` in the shared dictionaries."""
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == cited:
            return

        if cited not in citations_weight:
            citations_weight[cited] = 0

        # Citations and citations weight
        if citer not in citations.setdefault(cited, []):
            citations[cited].append(citer)
            citations_weight[cited] += 1

        # References
        if cited not in references.setdefault(citer, []):
            references[citer].append(cited)

    # dict of recid -> institute_give_publ_id
    records_info, references_info = citation_informations

    t1 = os.times()[4]

    write_message("Phase 0: temporarily remove changed records from "
                  "citation dictionaries; they will be filled later")
    if do_catchup:
        for somerecid in updated_recids:
            try:
                del citations[somerecid]
            except KeyError:
                pass

    # NOTE(review): SOURCE lost its indentation; this loop is assumed to be
    # unconditional, since a separate loop would be pointless otherwise.
    for somerecid in updated_recids:
        try:
            del references[somerecid]
        except KeyError:
            pass

    # Try to find references based on 999C5r
    # e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in bibliography
    write_message("Phase 1: Report numbers references")
    done = 0
    for thisrecid, refnumbers in iteritems(references_info['report-numbers']):
        step("Report numbers references", thisrecid, done,
             len(references_info['report-numbers']))
        done += 1

        for refnumber in (r for r in refnumbers if r):
            field = 'reportnumber'
            refnumber = standardize_report_number(refnumber)
            # Search for "hep-th/5644654 or such" in existing records
            recids = get_recids_matching_query(p=refnumber, f=field)
            write_message("These match searching %s in %s: %s" %
                          (refnumber, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, refnumber)
            else:
                remove_from_missing(refnumber)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', refnumber)
                msg = "Whoops: record '%d' report number value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, refnumber, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    # Try to find references based on 999C5s
    # e.g. Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: Journal references")
    done = 0
    for thisrecid, refs in iteritems(references_info['journals']):
        step("Journal references", thisrecid, done,
             len(references_info['journals']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'journal'

            # check reference value to see whether it is well formed:
            if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
                store_citation_warning('not-well-formed', p)
                msg = "Whoops, record '%d' reference value '%s' " \
                      "is not well formed; skipping it." % (thisrecid, p)
                write_message(msg, stream=sys.stderr)
                continue  # skip this ill-formed value

            recids = search_unit(p, field) - INTBITSET_OF_DELETED_RECORDS
            write_message("These match searching %s in %s: %s"
                          % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' reference value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]

    # Try to find references based on 999C5a
    # e.g. 10.1007/BF03170733
    write_message("Phase 3: DOI references")
    done = 0
    for thisrecid, refs in iteritems(references_info['doi']):
        step("DOI references", thisrecid, done, len(references_info['doi']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'doi'

            recids = get_recids_matching_query(p, field)
            write_message("These match searching %s in %s: %s"
                          % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' DOI value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t4 = os.times()[4]

    # Search for stuff like CERN-TH-4859/87 in list of refs
    write_message("Phase 4: report numbers catchup")
    done = 0
    for thisrecid, reportcodes in iteritems(records_info['report-numbers']):
        step("Report numbers catchup", thisrecid, done,
             len(records_info['report-numbers']))
        done += 1

        for reportcode in (r for r in reportcodes if r):
            if reportcode.startswith('arXiv'):
                # arXiv numbers are matched by regexp so that an optional
                # bracketed suffix after the number is tolerated.
                std_reportcode = standardize_report_number(reportcode)
                report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \
                    re.escape(std_reportcode)
                recids = get_recids_matching_query(report_pattern,
                                                   tags['refs_report_number'],
                                                   'r')
            else:
                recids = get_recids_matching_query(reportcode,
                                                   tags['refs_report_number'],
                                                   'e')
            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # Find this record's pubinfo in other records' bibliography
    write_message("Phase 5: journals catchup")
    done = 0
    t5 = os.times()[4]
    for thisrecid, rec_journals in iteritems(records_info['journals']):
        step("Journals catchup", thisrecid, done,
             len(records_info['journals']))
        done += 1

        for journal in rec_journals:
            journal = journal.replace("\"", "")
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5s
            recids = search_unit(p=journal, f=tags['refs_journal'], m='a') \
                - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s"
                          % (journal, tags['refs_journal'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 6: DOI catchup")
    done = 0
    t6 = os.times()[4]
    for thisrecid, dois in iteritems(records_info['doi']):
        step("DOI catchup", thisrecid, done, len(records_info['doi']))
        done += 1

        for doi in dois:
            # Search for this record's DOI in the DOI reference field
            # (tags['refs_doi'], e.g. 999C5a) of the other records
            recids = search_unit(p=doi, f=tags['refs_doi'], m='a') \
                - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s"
                          % (doi, tags['refs_doi'], list(recids)), verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 7: remove empty lists from dicts")

    # Remove empty lists in citation and reference.
    # Iterate over a snapshot of the keys: deleting entries while iterating
    # the live keys view raises RuntimeError on Python 3.
    for k in list(citations.keys()):
        if not citations[k]:
            del citations[k]

    for k in list(references.keys()):
        if not references[k]:
            del references[k]

    if task_get_task_param('verbose') >= 3:
        # Print only X first to prevent flood
        write_message("citation_list (x is cited by y):")
        write_message(dict(islice(iteritems(citations), 10)))
        write_message("size: %s" % len(citations))
        write_message("reference_list (x cites y):")
        write_message(dict(islice(iteritems(references), 10)))
        write_message("size: %s" % len(references))
        write_message("selfcitedbydic (x is cited by y and one of the "
                      "authors of x same as y's):")
        write_message(dict(islice(iteritems(selfcites), 10)))
        write_message("size: %s" % len(selfcites))
        write_message("selfdic (x cites y and one of the authors of x "
                      "same as y's):")
        write_message(dict(islice(iteritems(selfrefs), 10)))
        write_message("size: %s" % len(selfrefs))
        write_message("authorcitdic (author is cited in recs):")
        write_message(dict(islice(iteritems(authorcites), 10)))
        write_message("size: %s" % len(authorcites))

    t7 = os.times()[4]

    write_message("Execution time for analyzing the citation information "
                  "generating the dictionary:")
    write_message("... checking ref report numbers: %.2f sec" % (t2-t1))
    write_message("... checking ref journals: %.2f sec" % (t3-t2))
    write_message("... checking ref DOI: %.2f sec" % (t4-t3))
    write_message("... checking rec report numbers: %.2f sec" % (t5-t4))
    write_message("... checking rec journals: %.2f sec" % (t6-t5))
    write_message("... checking rec DOI: %.2f sec" % (t7-t6))
    write_message("... total time of ref_analyze: %.2f sec" % (t7-t1))

    return citations_weight, citations, references, selfcites, \
        selfrefs, authorcites
deserialize_via_marshal from invenio.modules.indexer.tokenizers.BibIndexJournalTokenizer import \ CFG_JOURNAL_PUBINFO_STANDARD_FORM, \ CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK from invenio.legacy.search_engine import search_pattern, search_unit from invenio.legacy.bibrecord import get_fieldvalues from invenio.modules.formatter.utils import parse_tag from invenio.modules.knowledge.api import get_kb_mappings from invenio.legacy.bibsched.bibtask import write_message, task_get_option, \ task_update_progress, task_sleep_now_if_required, \ task_get_task_param from invenio.ext.logging import register_exception from invenio.legacy.bibindex.engine import get_field_tags INTBITSET_OF_DELETED_RECORDS = search_unit(p='DELETED', f='980', m='a') re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK = re.compile(CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK) def get_recids_matching_query(p, f, m='e'): """Return set of recIDs matching query for pattern p in field f.""" return search_pattern(p=p, f=f, m=m) - INTBITSET_OF_DELETED_RECORDS def get_citation_weight(rank_method_code, config, chunk_size=20000): """return a dictionary which is used by bibrank daemon for generating the index of sorted research results by citation information """ begin_time = time.time() quick = task_get_option("quick") != "no"