Python search_unit Examples, invenio.legacy.search_engine.search_unit Python Examples

Example #1

0

Show file

File: utils.py Project: jirikuncar/invenio

def citations_nb_counts():
    """Get number of citations for the record `recid`."""
    recid = request.view_args.get("recid")
    if recid is None:
        return

    from intbitset import intbitset
    from invenio.legacy.bibrank.citation_searcher import get_cited_by, get_cited_by_count

    if CFG_BIBRANK_SHOW_CITATION_LINKS:
        if CFG_INSPIRE_SITE:
            from invenio.legacy.search_engine import search_unit

            citers_recids = intbitset(get_cited_by(recid))
            citeable_recids = search_unit(p="citeable", f="collection")
            return len(citers_recids & citeable_recids)
        else:
            return get_cited_by_count(recid)
    return 0

Example #2

0

Show file

def citations_nb_counts():
    """Get number of citations for the record `recid`."""
    recid = request.view_args.get('recid')
    if recid is None:
        return

    from intbitset import intbitset
    from invenio.legacy.bibrank.citation_searcher import (get_cited_by,
                                                          get_cited_by_count)

    if CFG_BIBRANK_SHOW_CITATION_LINKS:
        if CFG_INSPIRE_SITE:
            from invenio.legacy.search_engine import search_unit
            citers_recids = intbitset(get_cited_by(recid))
            citeable_recids = search_unit(p='citeable', f='collection')
            return len(citers_recids & citeable_recids)
        else:
            return get_cited_by_count(recid)
    return 0

Example #3

0

Show file

File: citation_indexer.py Project: chokribr/invenio-1

def deleted_recids_cache(cache={}):
    if 'deleted_records' not in cache:
        cache['deleted_records'] = search_unit(p='DELETED', f='980', m='a')
    return cache['deleted_records']

Example #4

0

Show file

File: citation_indexer.py Project: jirikuncar/invenio

def deleted_recids_cache(cache={}):
    if "deleted_records" not in cache:
        cache["deleted_records"] = search_unit(p="DELETED", f="collection", m="a")
    return cache["deleted_records"]

Example #5

0

Show file

File: citation_indexer.py Project: Theer108/invenio

def deleted_recids_cache(cache={}):
    if 'deleted_records' not in cache:
        cache['deleted_records'] = search_unit(p='DELETED', f='980', m='a')
    return cache['deleted_records']

Example #6

0

Show file

File: citation_indexer.py Project: mhellmic/b2share

def ref_analyzer(citation_informations, dicts,
                 updated_recids, tags, do_catchup=True):
    """Analyze the citation informations and calculate the citation weight
       and cited by list dictionary.
    """
    citations_weight = dicts['cites_weight']
    citations = dicts['cites']
    references = dicts['refs']
    selfcites = dicts['selfcites']
    selfrefs = dicts['selfrefs']
    authorcites = dicts['authorcites']

    def step(msg_prefix, recid, done, total):
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)

    def add_to_dicts(citer, cited):
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == cited:
            return
        if cited not in citations_weight:
            citations_weight[cited] = 0
        # Citations and citations weight
        if citer not in citations.setdefault(cited, []):
            citations[cited].append(citer)
            citations_weight[cited] += 1
        # References
        if cited not in references.setdefault(citer, []):
            references[citer].append(cited)

    # dict of recid -> institute_give_publ_id
    records_info, references_info = citation_informations

    t1 = os.times()[4]

    write_message("Phase 0: temporarily remove changed records from " \
                  "citation dictionaries; they will be filled later")
    if do_catchup:
        for somerecid in updated_recids:
            try:
                del citations[somerecid]
            except KeyError:
                pass

    for somerecid in updated_recids:
        try:
            del references[somerecid]
        except KeyError:
            pass

    # Try to find references based on 999C5r
    # e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in bibliography
    write_message("Phase 1: Report numbers references")
    done = 0
    for thisrecid, refnumbers in iteritems(references_info['report-numbers']):
        step("Report numbers references", thisrecid, done,
                                        len(references_info['report-numbers']))
        done += 1

        for refnumber in (r for r in refnumbers if r):
            field = 'reportnumber'
            refnumber = standardize_report_number(refnumber)
            # Search for "hep-th/5644654 or such" in existing records
            recids = get_recids_matching_query(p=refnumber, f=field)
            write_message("These match searching %s in %s: %s" % \
                                   (refnumber, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, refnumber)
            else:
                remove_from_missing(refnumber)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', refnumber)
                msg = "Whoops: record '%d' report number value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, refnumber, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    # Try to find references based on 999C5s
    # e.g. Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: Journal references")
    done = 0
    for thisrecid, refs in iteritems(references_info['journals']):
        step("Journal references", thisrecid, done,
                                              len(references_info['journals']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'journal'

            # check reference value to see whether it is well formed:
            if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
                store_citation_warning('not-well-formed', p)
                msg = "Whoops, record '%d' reference value '%s' " \
                      "is not well formed; skipping it." % (thisrecid, p)
                write_message(msg, stream=sys.stderr)
                continue  # skip this ill-formed value

            recids = search_unit(p, field) - INTBITSET_OF_DELETED_RECORDS
            write_message("These match searching %s in %s: %s" \
                                 % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' reference value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]

    # Try to find references based on 999C5a
    # e.g. 10.1007/BF03170733
    write_message("Phase 3: DOI references")
    done = 0
    for thisrecid, refs in iteritems(references_info['doi']):
        step("DOI references", thisrecid, done, len(references_info['doi']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'doi'

            recids = get_recids_matching_query(p, field)
            write_message("These match searching %s in %s: %s" \
                                 % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' DOI value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]: # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t4 = os.times()[4]

    # Search for stuff like CERN-TH-4859/87 in list of refs
    write_message("Phase 4: report numbers catchup")
    done = 0
    for thisrecid, reportcodes in iteritems(records_info['report-numbers']):
        step("Report numbers catchup", thisrecid, done,
                                           len(records_info['report-numbers']))
        done += 1

        for reportcode in (r for r in reportcodes if r):
            if reportcode.startswith('arXiv'):
                std_reportcode = standardize_report_number(reportcode)
                report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \
                                                re.escape(std_reportcode)
                recids = get_recids_matching_query(report_pattern,
                                                   tags['refs_report_number'],
                                                   'r')
            else:
                recids = get_recids_matching_query(reportcode,
                                                   tags['refs_report_number'],
                                                   'e')
            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # Find this record's pubinfo in other records' bibliography
    write_message("Phase 5: journals catchup")
    done = 0
    t5 = os.times()[4]
    for thisrecid, rec_journals in iteritems(records_info['journals']):
        step("Journals catchup", thisrecid, done,
                                                 len(records_info['journals']))
        done += 1

        for journal in rec_journals:
            journal = journal.replace("\"", "")
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5s
            recids = search_unit(p=journal, f=tags['refs_journal'], m='a') \
                                                - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s" \
                    % (journal, tags['refs_journal'], list(recids)), verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 6: DOI catchup")
    done = 0
    t6 = os.times()[4]
    for thisrecid, dois in iteritems(records_info['doi']):
        step("DOI catchup", thisrecid, done, len(records_info['doi']))
        done += 1

        for doi in dois:
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5a
            recids = search_unit(p=doi, f=tags['refs_doi'], m='a') \
                                                - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s" \
                            % (doi, tags['refs_doi'], list(recids)), verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 7: remove empty lists from dicts")

    # Remove empty lists in citation and reference
    keys = citations.keys()
    for k in keys:
        if not citations[k]:
            del citations[k]

    keys = references.keys()
    for k in keys:
        if not references[k]:
            del references[k]

    if task_get_task_param('verbose') >= 3:
        # Print only X first to prevent flood
        write_message("citation_list (x is cited by y):")
        write_message(dict(islice(iteritems(citations), 10)))
        write_message("size: %s" % len(citations))
        write_message("reference_list (x cites y):")
        write_message(dict(islice(iteritems(references), 10)))
        write_message("size: %s" % len(references))
        write_message("selfcitedbydic (x is cited by y and one of the " \
                      "authors of x same as y's):")
        write_message(dict(islice(iteritems(selfcites), 10)))
        write_message("size: %s" % len(selfcites))
        write_message("selfdic (x cites y and one of the authors of x " \
                      "same as y's):")
        write_message(dict(islice(iteritems(selfrefs), 10)))
        write_message("size: %s" % len(selfrefs))
        write_message("authorcitdic (author is cited in recs):")
        write_message(dict(islice(iteritems(authorcites), 10)))
        write_message("size: %s" % len(authorcites))

    t7 = os.times()[4]

    write_message("Execution time for analyzing the citation information " \
                  "generating the dictionary:")
    write_message("... checking ref report numbers: %.2f sec" % (t2-t1))
    write_message("... checking ref journals: %.2f sec" % (t3-t2))
    write_message("... checking ref DOI: %.2f sec" % (t4-t3))
    write_message("... checking rec report numbers: %.2f sec" % (t5-t4))
    write_message("... checking rec journals: %.2f sec" % (t6-t5))
    write_message("... checking rec DOI: %.2f sec" % (t7-t6))
    write_message("... total time of ref_analyze: %.2f sec" % (t7-t1))

    return citations_weight, citations, references, selfcites, \
                                                        selfrefs, authorcites

Example #7

0

Show file

File: citation_indexer.py Project: mhellmic/b2share

                            deserialize_via_marshal
from invenio.modules.indexer.tokenizers.BibIndexJournalTokenizer import \
    CFG_JOURNAL_PUBINFO_STANDARD_FORM, \
    CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK
from invenio.legacy.search_engine import search_pattern, search_unit
from invenio.legacy.bibrecord import get_fieldvalues
from invenio.modules.formatter.utils import parse_tag
from invenio.modules.knowledge.api import get_kb_mappings
from invenio.legacy.bibsched.bibtask import write_message, task_get_option, \
                     task_update_progress, task_sleep_now_if_required, \
                     task_get_task_param
from invenio.ext.logging import register_exception
from invenio.legacy.bibindex.engine import get_field_tags


INTBITSET_OF_DELETED_RECORDS = search_unit(p='DELETED', f='980', m='a')

re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK = re.compile(CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK)

def get_recids_matching_query(p, f, m='e'):
    """Return set of recIDs matching query for pattern p in field f."""
    return search_pattern(p=p, f=f, m=m) - INTBITSET_OF_DELETED_RECORDS


def get_citation_weight(rank_method_code, config, chunk_size=20000):
    """return a dictionary which is used by bibrank daemon for generating
    the index of sorted research results by citation information
    """
    begin_time = time.time()

    quick = task_get_option("quick") != "no"