Python parse_tag Examples, invenio.bibformat_utils.parse_tag Python Examples

Example #1

0

Show file

File: bibformat_engine_tests.py Project: epfl-si/invenio-infoscience

    def test_parse_tag(self):
        """bibformat - result of parsing tags

        Each entry below pairs one spelling of a tag with the four-item
        list that ``bibformat_utils.parse_tag`` is expected to return
        for it.
        """
        expectations = (
            # Indicators given explicitly, as '_' or as blanks
            ('245COc',   ['245', 'C', 'O', 'c']),
            ('245C_c',   ['245', 'C', '', 'c']),
            ('245__c',   ['245', '', '', 'c']),
            ('245__$$c', ['245', '', '', 'c']),
            ('245__$c',  ['245', '', '', 'c']),
            ('245  $c',  ['245', '', '', 'c']),
            ('245  $$c', ['245', '', '', 'c']),
            ('245__.c',  ['245', '', '', 'c']),
            ('245  .c',  ['245', '', '', 'c']),
            ('245C_$c',  ['245', 'C', '', 'c']),
            ('245CO$$c', ['245', 'C', 'O', 'c']),
            ('245CO.c',  ['245', 'C', 'O', 'c']),
            # Indicators left out entirely
            ('245$c',    ['245', '', '', 'c']),
            ('245.c',    ['245', '', '', 'c']),
            ('245$$c',   ['245', '', '', 'c']),
            # '%' in the subfield position
            ('245__%',   ['245', '', '', '%']),
            ('245__$$%', ['245', '', '', '%']),
            ('245__$%',  ['245', '', '', '%']),
            ('245  $%',  ['245', '', '', '%']),
            ('245  $$%', ['245', '', '', '%']),
            ('245$%',    ['245', '', '', '%']),
            ('245.%',    ['245', '', '', '%']),
            ('245_O.%',  ['245', '', 'O', '%']),
            ('245.%',    ['245', '', '', '%']),
            ('245$$%',   ['245', '', '', '%']),
            # '%' inside the tag number and indicators
            ('2%5$$a',   ['2%5', '', '', 'a']),
            ('2%%%%a',   ['2%%', '%', '%', 'a']),
            ('2%%__a',   ['2%%', '', '', 'a']),
            ('2%%a',     ['2%%', '', '', 'a']),
        )

        for raw_tag, expected in expectations:
            self.assertEqual(bibformat_utils.parse_tag(raw_tag), expected)

Example #2

0

Show file

File: bibformat_engine_unit_tests.py Project: AlbertoPeon/invenio

    def test_parse_tag(self):
        """bibformat - result of parsing tags

        Every row is ``(input, part0, part1, part2, part3)``: the input
        string is handed to ``bibformat_utils.parse_tag`` and the result
        must equal the list ``[part0, part1, part2, part3]``.
        """
        rows = [
            ('245COc',   '245', 'C', 'O', 'c'),
            ('245C_c',   '245', 'C', '',  'c'),
            ('245__c',   '245', '',  '',  'c'),
            ('245__$$c', '245', '',  '',  'c'),
            ('245__$c',  '245', '',  '',  'c'),
            ('245  $c',  '245', '',  '',  'c'),
            ('245  $$c', '245', '',  '',  'c'),
            ('245__.c',  '245', '',  '',  'c'),
            ('245  .c',  '245', '',  '',  'c'),
            ('245C_$c',  '245', 'C', '',  'c'),
            ('245CO$$c', '245', 'C', 'O', 'c'),
            ('245CO.c',  '245', 'C', 'O', 'c'),
            ('245$c',    '245', '',  '',  'c'),
            ('245.c',    '245', '',  '',  'c'),
            ('245$$c',   '245', '',  '',  'c'),
            ('245__%',   '245', '',  '',  '%'),
            ('245__$$%', '245', '',  '',  '%'),
            ('245__$%',  '245', '',  '',  '%'),
            ('245  $%',  '245', '',  '',  '%'),
            ('245  $$%', '245', '',  '',  '%'),
            ('245$%',    '245', '',  '',  '%'),
            ('245.%',    '245', '',  '',  '%'),
            ('245_O.%',  '245', '',  'O', '%'),
            ('245.%',    '245', '',  '',  '%'),
            ('245$$%',   '245', '',  '',  '%'),
            ('2%5$$a',   '2%5', '',  '',  'a'),
            ('2%%%%a',   '2%%', '%', '%', 'a'),
            ('2%%__a',   '2%%', '',  '',  'a'),
            ('2%%a',     '2%%', '',  '',  'a'),
        ]

        for given, part0, part1, part2, part3 in rows:
            outcome = bibformat_utils.parse_tag(given)
            self.assertEqual(outcome, [part0, part1, part2, part3])

Example #3

0

Show file

def format_element(bfo, tag, limit, instances_separator=" ",
           subfields_separator=" ", extension=""):
    """
    Prints the given field of a record.
    If the numeric part of the tag is at most 010, this element assumes
    that it accesses a control field. Else it considers it
    accesses a data field. If the first part of the parsed tag is not
    made of digits only, an empty string is returned.

    @param bfo: the object used to read the record; only its
                control_field() and fields() methods are called here
    @param tag: the tag code of the field that is to be printed
    @param instances_separator: a separator between instances of field
    @param subfields_separator: a separator between subfields of an instance
    @param limit: the maximum number of values to display, given as a
                  string of digits. Any other string means "no limit".
    @param extension: a text printed at the end if 'limit' has been exceeded
    @return: the joined values, followed by 'extension' when at least
             one matching value had to be left out because of 'limit'
    """
    # Check if data or control field
    p_tag = parse_tag(tag)
    if not p_tag[0].isdigit():
        return ''
    if int(p_tag[0]) in range(0, 11):
        return bfo.control_field(tag)

    # Get values without subcode; unneeded subcodes are filtered below.
    # Empty indicators are queried as '_'.
    ind1 = p_tag[1] or '_'
    ind2 = p_tag[2] or '_'
    # Values is expected to be a list of dicts (subcode -> value)
    values = bfo.fields(p_tag[0] + ind1 + ind2)

    wanted_subcode = p_tag[3]
    any_subcode = wanted_subcode in ('', '%')

    max_values = None  # None means "no limit"
    if limit.isdigit():
        max_values = int(limit)

    shown = 0               # number of values kept so far
    limit_exceeded = False  # True once a matching value had to be dropped
    instances_out = []      # Retain each instance output
    for instance in values:
        filtered_values = [value for (subcode, value) in instance.items()
                           if any_subcode or wanted_subcode == subcode]
        if not filtered_values:
            continue

        if max_values is not None:
            remaining = max_values - shown
            if len(filtered_values) > remaining:
                # This instance does not fit entirely: keep what fits,
                # remember that something was left out, and stop.
                # (The counter alone can never go past the limit, so it
                # cannot be used to detect this situation.)
                limit_exceeded = True
                filtered_values = filtered_values[:remaining]
                if filtered_values:  # do not append empty list!
                    instances_out.append(
                        subfields_separator.join(filtered_values))
                break

        instances_out.append(subfields_separator.join(filtered_values))
        shown += len(filtered_values)

    ext_out = ''
    if limit_exceeded:
        ext_out = extension

    return instances_separator.join(instances_out) + ext_out

Example #4

0

Show file

File: search_engine_summarizer.py Project: bopopescu/invenio_new

def get_authors_tags(config=CITATION_CONFIG):
    """
    Return a dict of tags for the author-related options of the config.

    The section to read is the value of option "function" in section
    "rank_method". Each of the four option names below is read from that
    section, passed through parse_tag() and then tagify(), and stored
    under its option name.
    """
    section = config.get("rank_method", "function")
    author_options = (
        'first_author',
        'additional_author',
        'alternative_author_name',
        'collaboration_name',
    )
    return dict((option, tagify(parse_tag(config.get(section, option))))
                for option in author_options)

Example #5

0

Show file

File: search_engine_summarizer.py Project: Kennethhole/Invenio-1

def get_authors_tags(config=CITATION_CONFIG):
    """
    Look up the main author, coauthor, alternative author and
    collaboration tags in the config.

    The options are read from the section named by the "function" option
    of section "rank_method"; each raw value goes through parse_tag() and
    tagify(). The result maps option name -> tagified value.
    """
    method_section = config.get("rank_method", "function")

    result = {}
    for option_name in ('first_author', 'additional_author',
                        'alternative_author_name', 'collaboration_name'):
        raw_value = config.get(method_section, option_name)
        parsed_value = parse_tag(raw_value)
        result[option_name] = tagify(parsed_value)
    return result

Example #6

0

Show file

File: bibrank_citation_indexer.py Project: robk5uj/invenio

def get_self_citations(new_record_list, citationdic, initial_selfcitdict,
                       config):
    """Check which items have been cited by one of the authors of the
       citing item: go through id's in new_record_list, use citationdic to get citations,
       update "selfcites". Selfcites is originally initial_selfcitdict. Return selfcites.

       @param new_record_list: ids of the records to examine
       @param citationdic: dict of record id -> list of ids citing it
       @param initial_selfcitdict: dict of record id -> list of self-citing
              ids; it is updated in place and returned
       @param config: config object read with config.get(section, option)
       @return: the updated self-citation dict, or initial_selfcitdict
                untouched if one of the author tags is missing in config
    """
    #get the tags for main author, coauthors, ext authors from config
    tag_names = ['first_author', 'additional_author', 'alternative_author_name']
    raw_tags = {}
    for t in tag_names:
        try:
            raw_tags[t] = config.get(config.get("rank_method", "function"), t)
        except Exception:
            register_exception(prefix="attribute " + t + " missing in config",
                               alert_admin=True)
            return initial_selfcitdict

    #parse the tags
    mainauthortag = tagify(parse_tag(raw_tags['first_author']))
    coauthortag = tagify(parse_tag(raw_tags['additional_author']))
    extauthortag = tagify(parse_tag(raw_tags['alternative_author_name']))

    selfcites = initial_selfcitdict
    total = len(new_record_list)
    for i, k in enumerate(new_record_list):
        if i % 1000 == 0:
            mesg = "Selfcites done " + str(i) + " of " + str(total) + " records"
            write_message(mesg)
            task_update_progress(mesg)

        # Nothing cites k: no need to look up its authors at all
        if k not in citationdic:
            continue

        #get the authors of k. The three value lists are merged into one
        #flat collection, so that coauthors and alternative names can
        #match too (nesting the lists would make them never match).
        authors = set(get_fieldvalues(k, mainauthortag))
        authors.update(get_fieldvalues(k, coauthortag))
        authors.update(get_fieldvalues(k, extauthortag))

        #go through the "x-cites-this" list
        for c in citationdic[k] or []:
            #get authors of c
            cauthorlist = list(get_fieldvalues(c, mainauthortag))
            cauthorlist.extend(get_fieldvalues(c, coauthortag))
            cauthorlist.extend(get_fieldvalues(c, extauthortag))

            if not any(ca in authors for ca in cauthorlist):
                continue

            #found a shared author: c is a self-citation of k
            known = selfcites.get(k)
            if known:
                #add only if not there already
                if c not in known:
                    known.append(c)
            else:
                #new key, or an entry that was empty so far
                selfcites[k] = [c]

    mesg = "Selfcites done fully"
    write_message(mesg)
    task_update_progress(mesg)

    return selfcites

Example #7

0

Show file

File: bibrank_citation_indexer.py Project: robk5uj/invenio

def _build_publication_info(format_string, tagsvalues):
    """Expand format_string, replacing the letters p, c, v and y by their
       value in tagsvalues and copying any other character as is.
       Return None as soon as a needed value is empty.
    """
    pieces = []
    for current in format_string:
        if current in ("p", "c", "v", "y"):
            if not tagsvalues[current]:
                return None  #it was needed and not found
            pieces.append(tagsvalues[current])
        else:
            pieces.append(current)  #just add the character in the format string
    return "".join(pieces)


def get_citation_informations(recid_list, config):
    """scans the collections searching references (999C5x -fields) and
       citations for items in the recid_list
       returns a list of 4 dictionaries that contains the citation information
       of cds records, in this order:
         - recid -> report numbers of the record (primary + additional)
         - recid -> report numbers found in the record's references
         - recid -> publication-info strings found in the record's references
         - recid -> publication-info string built for the record itself
       examples: [ {} {} {} {} ]
                 [ {5: 'SUT-DP-92-70-5'},
                   { 93: ['astro-ph/9812088']},
                   { 93: ['Phys. Rev. Lett. 96 (2006) 081301'] }, {} ]
        If a required option is missing from config, the problem is
        registered and four empty dictionaries are returned.
        NB: stuff here is for analysing new or changed records.
        see "ref_analyzer" for more.
    """
    begin_time = os.times()[4]
    d_reports_numbers = {}  #dict of recid -> institute-given-report-code
    d_references_report_numbers = {}  #dict of recid -> ['astro-ph/xyz']
    d_references_s = {}  #dict of recid -> list_of_the_entries_of_this_recs_bibliography
    d_records_s = {}  #dict of recid -> this_records_publication_info
    citation_informations = []

    try:
        function = config.get("rank_method", "function")
    except Exception:
        register_exception(
            prefix="cfg section [rank_method] has no attribute called function",
            alert_admin=True)
        #we cannot continue
        return [{}, {}, {}, {}]
    # Only log the value once we know it could be read
    write_message("config function " + function, verbose=9)

    # Required options, with the start of the message registered if missing
    required_options = (
        ("primary_report_number", "cfg section "),
        ("additional_report_number", "config error. cfg section "),
        ("reference_via_report_number", "config error. cfg section "),
        ("reference_via_pubinfo", "config error. cfg section "),
    )
    raw_tags = {}
    for option, prefix_start in required_options:
        try:
            raw_tags[option] = config.get(function, option)
        except Exception:
            register_exception(prefix=prefix_start + function +
                               " has no attribute " + option,
                               alert_admin=True)
            return [{}, {}, {}, {}]

    p_record_pri_number_tag = tagify(parse_tag(raw_tags["primary_report_number"]))
    #037a: contains (often) the "hep-ph/0501084" tag of THIS record
    p_record_add_number_tag = tagify(parse_tag(raw_tags["additional_report_number"]))
    #088a: additional short identifier for the record
    p_reference_number_tag = tagify(parse_tag(raw_tags["reference_via_report_number"]))
    #999C5r. this is in the reference list, refers to other records. Looks like: hep-ph/0408002
    p_reference_tag = tagify(parse_tag(raw_tags["reference_via_pubinfo"]))
    #999C5s. A standardized way of writing a reference in the reference list. Like: Nucl. Phys. B 710 (2000) 371

    #fields needed to construct the pubinfo for this record
    publication_pages_tag = ""
    publication_year_tag = ""
    publication_journal_tag = ""
    publication_volume_tag = ""
    publication_format_string = "p v (y) c"
    try:
        tag = config.get(function, "pubinfo_journal_page")
        publication_pages_tag = tagify(parse_tag(tag))
        tag = config.get(function, "pubinfo_journal_year")
        publication_year_tag = tagify(parse_tag(tag))
        tag = config.get(function, "pubinfo_journal_title")
        publication_journal_tag = tagify(parse_tag(tag))
        tag = config.get(function, "pubinfo_journal_volume")
        publication_volume_tag = tagify(parse_tag(tag))
        publication_format_string = config.get(function,
                                               "pubinfo_journal_format")
    except Exception:
        # Best effort on purpose: these options are optional. Whatever
        # could not be read keeps its default, and the publication info
        # is only built below when all four tags are set.
        write_message("pubinfo options are not all set in cfg section " +
                      function, verbose=9)

    #print values for tags for debugging
    if task_get_task_param('verbose') >= 9:
        write_message("tag values")
        write_message("p_record_pri_number_tag " +
                      str(p_record_pri_number_tag))
        write_message("p_reference_tag " + str(p_reference_tag))
        write_message("publication_journal_tag " +
                      str(publication_journal_tag))
        write_message("publication_format_string is " +
                      publication_format_string)
    done = 0  #for status reporting
    numrecs = len(recid_list)

    # The same for every record, so decide it once
    build_pubinfo = (publication_pages_tag and publication_journal_tag and
                     publication_volume_tag and publication_year_tag and
                     publication_format_string)

    # perform quick check to see if there are some records with
    # reference tags, because otherwise get.cit.inf would be slow even
    # if there is nothing to index.
    # NOTE: the table name is derived from the first two characters of
    # the configured tag; the tag value itself is passed as a parameter.
    if run_sql("SELECT value FROM bib%sx WHERE tag=%%s LIMIT 1" % p_reference_tag[0:2],
               (p_reference_tag,)) or \
       run_sql("SELECT value FROM bib%sx WHERE tag=%%s LIMIT 1" % p_reference_number_tag[0:2],
               (p_reference_number_tag,)):
        for recid in recid_list:
            if done % 10 == 0:
                task_sleep_now_if_required()
                #in fact we can sleep any time here

            if done % 1000 == 0:
                mesg = "get cit.inf done " + str(done) + " of " + str(numrecs)
                write_message(mesg)
                task_update_progress(mesg)
            done = done + 1

            l_report_numbers = get_fieldvalues(recid,
                                               p_record_pri_number_tag)
            l_report_numbers.extend(
                get_fieldvalues(recid, p_record_add_number_tag))
            d_reports_numbers[recid] = l_report_numbers

            reference_report_numbers = get_fieldvalues(recid,
                                                       p_reference_number_tag)
            if reference_report_numbers:
                d_references_report_numbers[recid] = reference_report_numbers

            # fetched once per record
            references_s = get_fieldvalues(recid, p_reference_tag)
            write_message(str(recid) + "'s " + str(p_reference_tag) +
                          " values " + str(references_s),
                          verbose=9)
            if references_s:
                d_references_s[recid] = references_s

            #get a combination of
            #journal vol (year) pages
            if build_pubinfo:
                #we store the tags and their values here
                #like c->444 y->1999 p->"journal of foo",v->20
                tagsvalues = {"p": "", "y": "", "c": "", "v": ""}
                tmp = get_fieldvalues(recid, publication_journal_tag)
                if tmp:
                    tagsvalues["p"] = tmp[0]
                tmp = get_fieldvalues(recid, publication_volume_tag)
                if tmp:
                    tagsvalues["v"] = tmp[0]
                tmp = get_fieldvalues(recid, publication_year_tag)
                if tmp:
                    tagsvalues["y"] = tmp[0]
                tmp = get_fieldvalues(recid, publication_pages_tag)
                if tmp:
                    #if the page numbers have "x-y" take just x
                    pages = tmp[0]
                    hpos = pages.find("-")
                    if hpos > 0:
                        pages = pages[:hpos]
                    tagsvalues["c"] = pages
                #format the publ infostring according to the format
                publ = _build_publication_info(publication_format_string,
                                               tagsvalues)
                if publ is not None:
                    write_message("d_records_s (publication info) for " +
                                  str(recid) + " is " + publ,
                                  verbose=9)
                    d_records_s[recid] = publ
    else:
        mesg = "Warning: there are no records with tag values for "
        mesg += p_reference_number_tag + " or " + p_reference_tag + ". Nothing to do."
        write_message(mesg)

    mesg = "get cit.inf done fully"
    write_message(mesg)
    task_update_progress(mesg)

    citation_informations.append(d_reports_numbers)
    citation_informations.append(d_references_report_numbers)
    citation_informations.append(d_references_s)
    citation_informations.append(d_records_s)
    end_time = os.times()[4]
    write_message("Execution time for generating citation info from record: %.2f sec" % \
                  (end_time - begin_time))
    return citation_informations

Example #8

0

Show file

File: bibrank_citation_indexer.py Project: Kennethhole/Invenio-1

def _merge_author_citers(author_cited_in, authors, citers):
    """For every non-empty name in authors, add to author_cited_in[name]
       the elements of citers that are not there already.
    """
    for a in authors:
        if not a:
            continue
        known = author_cited_in.get(a)
        if known is None:
            # First time we meet this author. Store a copy, so that later
            # additions do not modify the caller's list of citers.
            author_cited_in[a] = list(citers)
            continue
        for citer in citers:
            if citer not in known:
                known.append(citer)


def get_author_citations(updated_redic_list, citedbydict, initial_author_dict, config):
    """Traverses citedbydict in order to build "which author is quoted where" dict.
       The keys of this are author names. An entry like "Apollinaire"->[1,2,3] means
       Apollinaire is cited in records 1,2 and 3.
       Input: citedbydict, updated_redic_list = records to be searched, initial_author_dict:
              the dicts from the database.
       Output: authorciteddict. It is initially set to initial_author_dict
               (which is updated in place). If one of the author tags is
               missing in config, initial_author_dict is returned untouched.
    """

    #sorry bout repeated code to get the tags
    tags = ['first_author', 'additional_author', 'alternative_author_name']
    tagvals = {}
    for t in tags:
        try:
            tagvals[t] = config.get(config.get("rank_method", "function"), t)
        except Exception:
            register_exception(prefix="attribute "+t+" missing in config", alert_admin=True)
            return initial_author_dict

    #parse the tags
    mainauthortag = tagify(parse_tag(tagvals['first_author']))
    coauthortag = tagify(parse_tag(tagvals['additional_author']))
    extauthortag = tagify(parse_tag(tagvals['alternative_author_name']))
    if task_get_task_param('verbose') >= 9:
        write_message("mainauthortag "+mainauthortag)
        write_message("coauthortag "+coauthortag)
        write_message("extauthortag "+extauthortag)

    def authors_of(recid):
        """Main, additional and alternative author values of recid."""
        authors = get_fieldvalues(recid, mainauthortag)
        authors.extend(get_fieldvalues(recid, coauthortag))
        authors.extend(get_fieldvalues(recid, extauthortag))
        return authors

    author_cited_in = initial_author_dict
    if citedbydict:
        write_message("Checking records referred to in new records")
        num_updated = len(updated_redic_list)
        for i, u in enumerate(updated_redic_list):
            if i % 1000 == 0:
                mesg = "Author ref done "+str(i)+" of "+str(num_updated)+" records"
                write_message(mesg)
                task_update_progress(mesg)

            if u in citedbydict:
                these_cite_k = citedbydict[u]
                if these_cite_k is None:
                    these_cite_k = [] #verify it is an empty list, not None
                _merge_author_citers(author_cited_in, authors_of(u),
                                     these_cite_k)

        mesg = "Author ref done fully"
        write_message(mesg)
        task_update_progress(mesg)

        #go through the dictionary again: all keys but search only if new records are cited
        write_message("Checking authors in new records")
        updated_set = set(updated_redic_list)
        num_cited = len(citedbydict)
        for i, k in enumerate(citedbydict.keys()):
            if i % 1000 == 0:
                mesg = "Author cit done "+str(i)+" of "+str(num_cited)+" records"
                write_message(mesg)
                task_update_progress(mesg)

            these_cite_k = citedbydict[k]
            if these_cite_k is None:
                these_cite_k = [] #verify it is an empty list, not None
            #do things only if these_cite_k contains any new stuff
            if updated_set.intersection(these_cite_k):
                _merge_author_citers(author_cited_in, authors_of(k),
                                     these_cite_k)

        mesg = "Author cit done fully"
        write_message(mesg)
        task_update_progress(mesg)

    return author_cited_in

Example #9

0

Show file

File: bibformat_engine_tests.py Project: kaplun/Invenio-OpenAIRE

    def test_parse_tag(self):
        """bibformat - result of parsing tags

        Runs ``bibformat_utils.parse_tag`` on every sample below and
        compares the outcome with the four-item list written next to it.
        """
        samples = [
            ("245COc", ["245", "C", "O", "c"]),
            ("245C_c", ["245", "C", "", "c"]),
            ("245__c", ["245", "", "", "c"]),
            ("245__$$c", ["245", "", "", "c"]),
            ("245__$c", ["245", "", "", "c"]),
            ("245  $c", ["245", "", "", "c"]),
            ("245  $$c", ["245", "", "", "c"]),
            ("245__.c", ["245", "", "", "c"]),
            ("245  .c", ["245", "", "", "c"]),
            ("245C_$c", ["245", "C", "", "c"]),
            ("245CO$$c", ["245", "C", "O", "c"]),
            ("245CO.c", ["245", "C", "O", "c"]),
            ("245$c", ["245", "", "", "c"]),
            ("245.c", ["245", "", "", "c"]),
            ("245$$c", ["245", "", "", "c"]),
            ("245__%", ["245", "", "", "%"]),
            ("245__$$%", ["245", "", "", "%"]),
            ("245__$%", ["245", "", "", "%"]),
            ("245  $%", ["245", "", "", "%"]),
            ("245  $$%", ["245", "", "", "%"]),
            ("245$%", ["245", "", "", "%"]),
            ("245.%", ["245", "", "", "%"]),
            ("245_O.%", ["245", "", "O", "%"]),
            ("245.%", ["245", "", "", "%"]),
            ("245$$%", ["245", "", "", "%"]),
            ("2%5$$a", ["2%5", "", "", "a"]),
            ("2%%%%a", ["2%%", "%", "%", "a"]),
            ("2%%__a", ["2%%", "", "", "a"]),
            ("2%%a", ["2%%", "", "", "a"]),
        ]

        for sample in samples:
            tag_string, wanted = sample
            parsed_tag = bibformat_utils.parse_tag(tag_string)
            self.assertEqual(parsed_tag, wanted)

Example #10

0

Show file

File: bibrank_citation_indexer.py Project: Kennethhole/Invenio-1

def get_citation_informations(recid_list, config):
    """Collect the citation-related field values of the given records.

    Scans the records in recid_list for references (999C5x fields) and
    for the identifiers other records may use to cite them.

    Returns a list of four dictionaries, each keyed by recid:
      [0] recid -> report numbers of the record (primary + additional)
      [1] recid -> report numbers found in the record's reference list
      [2] recid -> publication infos found in the record's reference list
      [3] recid -> publication info string built for the record itself
    examples: [ {} {} {} {} ]
              [ {5: ['SUT-DP-92-70-5']},
                { 93: ['astro-ph/9812088']},
                { 93: ['Phys. Rev. Lett. 96 (2006) 081301'] }, {} ]

    If the rank method function or one of the four mandatory tag
    options cannot be read from config, the problem is registered via
    register_exception() and four empty dictionaries are returned.

    NB: stuff here is for analysing new or changed records.
    see "ref_analyzer" for more.
    """
    begin_time = os.times()[4]
    d_reports_numbers = {} #dict of recid -> institute-given-report-code
    d_references_report_numbers = {} #dict of recid -> ['astro-ph/xyz']
    d_references_s = {} #dict of recid -> list_of_the_entries_of_this_recs_bibliography
    d_records_s = {} #dict of recid -> this_records_publication_info

    try:
        function = config.get("rank_method", "function")
    except Exception:
        register_exception(prefix="cfg section [rank_method] has no attribute called function", alert_admin=True)
        #we cannot continue
        return [{}, {}, {}, {}]
    # Log only once the lookup is known to have succeeded, so that a
    # missing option is reported above instead of raising here.
    write_message("config function "+function, verbose=9)

    # Mandatory options, paired with the beginning of the error prefix
    # registered when the option cannot be read.
    mandatory_options = (
        ("primary_report_number", "cfg section "),
        ("additional_report_number", "config error. cfg section "),
        ("reference_via_report_number", "config error. cfg section "),
        ("reference_via_pubinfo", "config error. cfg section "),
    )
    raw_tags = {}
    for option, prefix_start in mandatory_options:
        try:
            raw_tags[option] = config.get(function, option)
        except Exception:
            register_exception(prefix=prefix_start+function+" has no attribute "+option, alert_admin=True)
            return [{}, {}, {}, {}]

    p_record_pri_number_tag = tagify(parse_tag(raw_tags["primary_report_number"]))
    #037a: contains (often) the "hep-ph/0501084" tag of THIS record
    p_record_add_number_tag = tagify(parse_tag(raw_tags["additional_report_number"]))
    #088a: additional short identifier for the record
    p_reference_number_tag = tagify(parse_tag(raw_tags["reference_via_report_number"]))
    #999C5r. this is in the reference list, refers to other records. Looks like: hep-ph/0408002
    p_reference_tag = tagify(parse_tag(raw_tags["reference_via_pubinfo"]))
    #999C5s. A standardized way of writing a reference in the reference list. Like: Nucl. Phys. B 710 (2000) 371

    #fields needed to construct the pubinfo for this record
    publication_pages_tag = ""
    publication_year_tag = ""
    publication_journal_tag = ""
    publication_volume_tag = ""
    publication_format_string = "p v (y) c"
    try:
        tag = config.get(function, "pubinfo_journal_page")
        publication_pages_tag = tagify(parse_tag(tag))
        tag = config.get(function, "pubinfo_journal_year")
        publication_year_tag = tagify(parse_tag(tag))
        tag = config.get(function, "pubinfo_journal_title")
        publication_journal_tag = tagify(parse_tag(tag))
        tag = config.get(function, "pubinfo_journal_volume")
        publication_volume_tag = tagify(parse_tag(tag))
        publication_format_string = config.get(function, "pubinfo_journal_format")
    except Exception:
        # Best effort: the pubinfo options are optional. Any tag left
        # empty here disables the publication info construction below.
        pass

    #print values for tags for debugging
    if task_get_task_param('verbose') >= 9:
        write_message("tag values")
        write_message("p_record_pri_number_tag "+str(p_record_pri_number_tag))
        write_message("p_reference_tag "+str(p_reference_tag))
        write_message("publication_journal_tag "+str(publication_journal_tag))
        write_message("publication_format_string is "+publication_format_string)
    done = 0 #for status reporting
    numrecs = len(recid_list)

    # perform quick check to see if there are some records with
    # reference tags, because otherwise get.cit.inf would be slow even
    # if there is nothing to index:
    if run_sql("SELECT value FROM bib%sx WHERE tag=%%s LIMIT 1" % p_reference_tag[0:2],
               (p_reference_tag,)) or \
       run_sql("SELECT value FROM bib%sx WHERE tag=%%s LIMIT 1" % p_reference_number_tag[0:2],
               (p_reference_number_tag,)):
        pubinfo_enabled = publication_pages_tag and publication_journal_tag and \
            publication_volume_tag and publication_year_tag and publication_format_string
        for recid in recid_list:
            if done % 10 == 0:
                task_sleep_now_if_required()
                #in fact we can sleep any time here

            if done % 1000 == 0:
                mesg = "get cit.inf done "+str(done)+" of "+str(numrecs)
                write_message(mesg)
                task_update_progress(mesg)
            done = done+1

            if recid in INTBITSET_OF_DELETED_RECORDS:
                # do not treat this record since it was deleted; we
                # skip it like this in case it was only soft-deleted
                # e.g. via bibedit (i.e. when collection tag 980 is
                # DELETED but other tags like report number or journal
                # publication info remained the same, so the calls to
                # get_fieldvalues() below would return old values)
                continue

            pri_report_numbers = get_fieldvalues(recid, p_record_pri_number_tag)
            add_report_numbers = get_fieldvalues(recid, p_record_add_number_tag)
            reference_report_numbers = get_fieldvalues(recid, p_reference_number_tag)

            l_report_numbers = pri_report_numbers
            l_report_numbers.extend(add_report_numbers)
            d_reports_numbers[recid] = l_report_numbers

            if reference_report_numbers:
                d_references_report_numbers[recid] = reference_report_numbers

            # fetched once only: the values were previously queried
            # twice in a row for the same record and tag
            references_s = get_fieldvalues(recid, p_reference_tag)
            write_message(str(recid)+"'s "+str(p_reference_tag)+" values "+str(references_s), verbose=9)
            if references_s:
                d_references_s[recid] = references_s

            #get a combination of
            #journal vol (year) pages
            if pubinfo_enabled:
                #we store the format letters and their values here
                #like c->444 y->1999 p->"journal of foo",v->20
                tagsvalues = {"p": "", "y": "", "c": "", "v": ""}
                for key, field_tag in (("p", publication_journal_tag),
                                       ("v", publication_volume_tag),
                                       ("y", publication_year_tag)):
                    tmp = get_fieldvalues(recid, field_tag)
                    if tmp:
                        tagsvalues[key] = tmp[0]
                tmp = get_fieldvalues(recid, publication_pages_tag)
                if tmp:
                    #if the page numbers have "x-y" take just x
                    pages = tmp[0]
                    hpos = pages.find("-")
                    if hpos > 0:
                        pages = pages[:hpos]
                    tagsvalues["c"] = pages
                #format the publ infostring according to the format
                publ = ""
                ok = True
                for current in publication_format_string:
                    #these are supported
                    if current in ("p", "c", "v", "y"):
                        if not tagsvalues[current]:
                            ok = False
                            break #it was needed and not found
                        #add the value in the string
                        publ += tagsvalues[current]
                    else:
                        publ += current #just add the character in the format string
                if ok:
                    write_message("d_records_s (publication info) for "+str(recid)+" is "+publ, verbose=9)
                    d_records_s[recid] = publ
    else:
        mesg = "Warning: there are no records with tag values for "
        mesg += p_reference_number_tag+" or "+p_reference_tag+". Nothing to do."
        write_message(mesg)

    mesg = "get cit.inf done fully"
    write_message(mesg)
    task_update_progress(mesg)

    citation_informations = [d_reports_numbers,
                             d_references_report_numbers,
                             d_references_s,
                             d_records_s]
    end_time = os.times()[4]
    write_message("Execution time for generating citation info from record: %.2f sec" % \
                  (end_time - begin_time))
    return citation_informations

Example #11

0

Show file

File: bfe_field.py Project: chokribr/inveniotest

def format_element(bfo,
                   tag,
                   limit,
                   instances_separator=" ",
                   subfields_separator=" ",
                   extension="",
                   output_pattern=""):
    """
    Prints the given field of a record.
    If tag is in range [001, 010], this element assumes
    that it accesses a control field. Else it considers it
    accesses a data field.

    <p>For eg. consider the following metadata:
    <pre>
 100__ $$aCalatroni, S$$uCERN
 245__ $$aStatus of the EP Simulations and Facilities for the SPL
 700__ $$aFerreira, L$$uCERN
 700__ $$aMacatrao, M$$uCERN
 700__ $$aSkala, A$$uCERN
 700__ $$aSosin, M$$uCERN
 700__ $$ade Waele, R$$uCERN
 700__ $$aWithofs, Y$$uKHLim, Diepenbeek
    </pre>
    The following calls to bfe_field would print:
    <pre>
    &lt;BFE_FIELD tag="700" instances_separator="&lt;br/>" subfields_separator=" - ">

    Ferreira, L - CERN
    Macatrao, M - CERN
    Skala, A - CERN
    Sosin, M - CERN
    de Waele, R - CERN
    Withofs, Y - KHLim, Diepenbeek
    </pre>
    </p>

    <p>For more advanced formatting, the <code>output_pattern</code>
    parameter can be used to output the subfields of each instance in
    the specified way. For eg. consider the following metadata:
    <pre>
 775__ $$b15. Aufl.$$c1995-1996$$nv.1$$pGrundlagen und Werkstoffe$$w317999
 775__ $$b12. Aufl.$$c1963$$w278898
 775__ $$b14. Aufl.$$c1983$$w107899
 775__ $$b13. Aufl.$$c1974$$w99635
    </pre>
    with the following <code>output_pattern</code>:

    <pre>
    &lt;a href="/record/%(w)s">%(b)s (%(c)s) %(n)s %(p)s&lt;/a>
    </pre>
    would print:<br/>

    <a href="/record/317999">15. Aufl. (1995-1996) v.1 Grundlagen und Werkstoffe</a><br/>
    <a href="/record/278898">12. Aufl. (1963) </a><br/>
    <a href="/record/107899">14. Aufl. (1983) </a><br/>
    <a href="/record/99635">13. Aufl. (1974) </a>

    <br/>(<code>instances_separator="&lt;br/>"</code> set for
    readability)<br/> The output pattern must follow <a
    href="http://docs.python.org/library/stdtypes.html#string-formatting-operations">Python
    string formatting</a> syntax. The format must use parenthesized
    notation to map to the subfield code. This currently restricts the
    support of <code>output_pattern</code> to non-repeatable
    subfields</p>

    @param tag: the tag code of the field that is to be printed
    @param instances_separator: a separator between instances of field
    @param subfields_separator: a separator between subfields of an instance
    @param limit: the maximum number of values to display.
    @param extension: a text printed at the end if 'limit' has been exceeded
    @param output_pattern: when specified, prints the subfields of each instance according to pattern specified as parameter (following Python string formatting convention)
    """
    # Check if data or control field
    p_tag = parse_tag(tag)
    if not p_tag[0].isdigit():
        return ''
    if int(p_tag[0]) in range(0, 11):
        return bfo.control_field(tag)

    # Get values without subcode.
    # We will filter unneeded subcode later
    if p_tag[1] == '':
        p_tag[1] = '_'
    if p_tag[2] == '':
        p_tag[2] = '_'
    # Values will always be a list of dicts
    values = bfo.fields(p_tag[0] + p_tag[1] + p_tag[2])

    # 'limit' is only honoured when it is made of digits
    has_limit = limit.isdigit()
    max_values = 0
    if has_limit:
        max_values = int(limit)

    wanted_subcode = p_tag[3]
    x = 0  # Number of values output so far
    limit_exceeded = False
    instances_out = []  # Retain each instance output
    for instance in values:
        filtered_values = [value for (subcode, value) in instance.iteritems()
                           if wanted_subcode in ('', '%', subcode)]
        if not filtered_values:
            # No corresponding subcode in this instance
            continue

        if has_limit and x + len(filtered_values) > max_values:
            # More values are available than allowed: remember it so
            # that 'extension' gets printed, and keep only the values
            # that still fit (x never goes above max_values).
            limit_exceeded = True
            filtered_values = filtered_values[:max_values - x]
            if not filtered_values:
                break  # do not append empty list!

        if output_pattern:
            try:
                instances_out.append(output_pattern %
                                     DictNoKeyError(instance))
            except Exception:
                # Best effort: an instance that cannot be rendered
                # with the given pattern is left out of the output.
                pass
        else:
            instances_out.append(subfields_separator.join(filtered_values))
        x += len(filtered_values)

        if limit_exceeded:
            break  # No need to go further

    ext_out = ''
    if limit_exceeded:
        ext_out = extension

    return instances_separator.join(instances_out) + ext_out

Example #12

0

Show file

File: bibrank_citation_indexer.py Project: labordoc/labordoc-next

def get_tags_config(config):
    """Read the MARC tags used by the citation indexer from config.

    Returns a dictionary with the keys 'record_pri_number',
    'record_add_number', 'refs_report_number', 'refs_journal',
    'refs_doi', 'publication', 'doi' and 'publication_format'.
    A tag whose option is absent from the config is stored as None.
    """
    # Probably "citation" unless this file gets renamed
    function = config.get("rank_method", "function")
    write_message("config function %s" % function, verbose=9)

    # (key in the result, option name in the config section)
    single_tag_options = (
        # 037a: contains (often) the "hep-ph/0501084" tag of THIS record
        ('record_pri_number', "primary_report_number"),
        # 088a: additional short identifier for the record
        ('record_add_number', "additional_report_number"),
        # 999C5r. this is in the reference list, refers to other records.
        # Looks like: hep-ph/0408002
        ('refs_report_number', "reference_via_report_number"),
        # 999C5s. this is in the reference list, refers to other records.
        # Looks like: Phys.Rev.,A21,78
        ('refs_journal', "reference_via_pubinfo"),
        # 999C5a. this is in the reference list, refers to other records.
        # Looks like: 10.1007/BF03170733
        ('refs_doi', "reference_via_doi"),
    )

    tags = {}
    for key, option in single_tag_options:
        try:
            raw_tag = config.get(function, option)
        except ConfigParser.NoOptionError:
            tags[key] = None
        else:
            tags[key] = tagify(parse_tag(raw_tag))

    # Fields needed to construct the journals for this record:
    # all four options must be present, otherwise none is used
    try:
        raw_pages = config.get(function, "pubinfo_journal_page")
        raw_year = config.get(function, "pubinfo_journal_year")
        raw_journal = config.get(function, "pubinfo_journal_title")
        raw_volume = config.get(function, "pubinfo_journal_volume")
    except ConfigParser.NoOptionError:
        tags['publication'] = None
    else:
        publication = {}
        publication['pages'] = tagify(parse_tag(raw_pages))
        publication['year'] = tagify(parse_tag(raw_year))
        publication['journal'] = tagify(parse_tag(raw_journal))
        publication['volume'] = tagify(parse_tag(raw_volume))
        tags['publication'] = publication

    # Fields needed to lookup the DOIs
    tags['doi'] = get_field_tags('doi')

    # 999C5s. A standardized way of writing a reference in the reference list.
    # Like: Nucl. Phys. B 710 (2000) 371
    try:
        publication_format = config.get(function, "pubinfo_journal_format")
    except ConfigParser.NoOptionError:
        publication_format = CFG_JOURNAL_PUBINFO_STANDARD_FORM
    tags['publication_format'] = publication_format

    # Print values of tags for debugging
    write_message("tag values: %r" % [tags], verbose=9)

    return tags

Example #13

0

Show file

File: bfe_field.py Project: aw-bib/tind-invenio

def format_element(bfo, tag, limit, instances_separator=" ", subfields_separator=" ", extension="",
                   output_pattern=""):
    """
    Prints the given field of a record.
    If tag is in range [001, 010], this element assumes
    that it accesses a control field. Else it considers it
    accesses a data field.

    <p>For eg. consider the following metdata:
    <pre>
 100__ $$aCalatroni, S$$uCERN
 245__ $$aStatus of the EP Simulations and Facilities for the SPL
 700__ $$aFerreira, L$$uCERN
 700__ $$aMacatrao, M$$uCERN
 700__ $$aSkala, A$$uCERN
 700__ $$aSosin, M$$uCERN
 700__ $$ade Waele, R$$uCERN
 700__ $$aWithofs, Y$$uKHLim, Diepenbeek
    </pre>
    The following calls to bfe_field would print:
    <pre>
    &lt;BFE_FIELD tag="700" instances_separator="&lt;br/>" subfields_separator=" - ">

    Ferreira, L - CERN
    Macatrao, M - CERN
    Skala, A - CERN
    Sosin, M - CERN
    de Waele, R - CERN
    Withofs, Y - KHLim, Diepenbeek
    </pre>
    </p>

    <p>For more advanced formatting, the <code>output_pattern</code>
    parameter can be used to output the subfields of each instance in
    the specified way. For eg. consider the following metadata:
    <pre>
 775__ $$b15. Aufl.$$c1995-1996$$nv.1$$pGrundlagen und Werkstoffe$$w317999
 775__ $$b12. Aufl.$$c1963$$w278898
 775__ $$b14. Aufl.$$c1983$$w107899
 775__ $$b13. Aufl.$$c1974$$w99635
    </pre>
    with the following <code>output_pattern</code>:

    <pre>
    &lt;a href="/record/%(w)s">%(b)s (%(c)s) %(n)s %(p)s&lt;/a>
    </pre>
    would print:<br/>

    <a href="/record/317999">15. Aufl. (1995-1996) v.1 Grundlagen und Werkstoffe</a><br/>
    <a href="/record/278898">12. Aufl. (1963) </a><br/>
    <a href="/record/107899">14. Aufl. (1983) </a><br/>
    <a href="/record/99635">13. Aufl. (1974) </a>

    <br/>(<code>instances_separator="&lt;br/>"</code> set for
    readability)<br/> The output pattern must follow <a
    href="http://docs.python.org/library/stdtypes.html#string-formatting-operations">Python
    string formatting</a> syntax. The format must use parenthesized
    notation to map to the subfield code. This currently restricts the
    support of <code>output_pattern</code> to non-repeatable
    subfields</p>

    @param tag: the tag code of the field that is to be printed
    @param instances_separator: a separator between instances of field
    @param subfields_separator: a separator between subfields of an instance
    @param limit: the maximum number of values to display.
    @param extension: a text printed at the end if 'limit' has been exceeded
    @param output_pattern: when specified, prints the subfields of each instance according to
     pattern specified as parameter (following Python string formatting convention)
    @param bfo: BibFormatObject which represents the record to format.
    """
    # A limit that is not an integer literal means "no limit" (0)
    try:
        max_values = int(limit)
    except ValueError:
        max_values = 0

    # Control fields are delegated to the record object right away
    parsed = parse_tag(tag)
    field_code = parsed[0]
    if field_code.isdigit() and 0 <= int(field_code) <= 10:
        return bfo.control_field(tag)

    # Unspecified indicators are spelled '_' when querying the record
    for position in (1, 2):
        if parsed[position] == '':
            parsed[position] = '_'
    field_spec = ''.join(parsed)

    # With an output pattern the instances are fetched with fields(),
    # otherwise with fields_ordered()
    if output_pattern:
        values = bfo.fields(field_spec)
    else:
        values = bfo.fields_ordered(field_spec)

    # Nothing found: nothing to print
    if not values:
        return ''

    # The type of the first element decides how the instances are rendered
    first = values[0]
    if not isinstance(first, list):
        if isinstance(first, dict):
            # One rendered pattern per instance
            rendered = [output_pattern % DictNoKeyError(instance) for instance in values]
        else:
            # Neither list nor dict: the values are joined as they are
            rendered = values
        return instances_separator.join(rendered)

    if not max_values:
        # No limit: every instance is printed in full
        return instances_separator.join(
            [subfields_separator.join(instance) for instance in values])

    rendered = []
    seen = 0  # number of subfield values met so far
    for instance in values:
        seen += len(instance)
        if seen > max_values:
            # Drop the values in excess from the end of this instance,
            # append the extension and stop
            kept = instance[:max_values - seen]
            rendered.append(subfields_separator.join(kept) + extension)
            break
        rendered.append(subfields_separator.join(instance))

    return instances_separator.join(rendered)

Example #14

0

Show file

File: bibrank_citation_indexer.py Project: robk5uj/invenio

def _merge_author_citers(recid, citers, author_tags, author_cited_in):
    """Register `citers` as citing every author of record `recid`.

    The authors are the values of the fields listed in author_tags.
    author_cited_in is updated in place: citers missing from a known
    author's list are appended, and an author not seen before gets
    its own copy of `citers`. Empty author names are ignored.
    """
    authors = []
    for author_tag in author_tags:
        authors.extend(get_fieldvalues(recid, author_tag))

    for author in authors:
        if not author:
            continue
        known_citers = author_cited_in.get(author)
        if known_citers is None:
            # copy, so that later appends do not leak into the caller's
            # citation dictionary
            author_cited_in[author] = list(citers)
            continue
        #add all elements in citers that are not there already
        for citer in citers:
            if citer not in known_citers:
                known_citers.append(citer)


def get_author_citations(updated_redic_list, citedbydict, initial_author_dict,
                         config):
    """Traverses citedbydict in order to build "which author is quoted where" dict.
       The keys of this are author names. An entry like "Apollinaire"->[1,2,3] means
       Apollinaire is cited in records 1,2 and 3.
       Input: citedbydict, updated_redic_list = records to be searched, initial_author_dict:
              the dicts from the database.
       Output: authorciteddict. It is initially set to initial_author_dict,
               which is updated in place and returned. If one of the author
               tags is missing in config, the problem is registered and
               initial_author_dict is returned unchanged.
    """

    #sorry bout repeated code to get the tags
    tags = ['first_author', 'additional_author', 'alternative_author_name']
    tagvals = {}
    for t in tags:
        try:
            tagvals[t] = config.get(config.get("rank_method", "function"), t)
        except Exception:
            register_exception(prefix="attribute " + t + " missing in config",
                               alert_admin=True)
            return initial_author_dict

    #parse the tags
    mainauthortag = tagify(parse_tag(tagvals['first_author']))
    coauthortag = tagify(parse_tag(tagvals['additional_author']))
    extauthortag = tagify(parse_tag(tagvals['alternative_author_name']))
    if task_get_task_param('verbose') >= 9:
        write_message("mainauthortag " + mainauthortag)
        write_message("coauthortag " + coauthortag)
        write_message("extauthortag " + extauthortag)
    author_tags = (mainauthortag, coauthortag, extauthortag)

    author_cited_in = initial_author_dict
    if citedbydict:
        i = 0  #just a counter for debug
        write_message("Checking records referred to in new records")
        for u in updated_redic_list:
            if i % 1000 == 0:
                mesg = "Author ref done " + str(i) + " of " + str(
                    len(updated_redic_list)) + " records"
                write_message(mesg)
                task_update_progress(mesg)
            i = i + 1

            if u in citedbydict:
                these_cite_k = citedbydict[u]
                if these_cite_k is None:
                    these_cite_k = []  #verify it is an empty list, not None
                _merge_author_citers(u, these_cite_k, author_tags,
                                     author_cited_in)

        mesg = "Author ref done fully"
        write_message(mesg)
        task_update_progress(mesg)

        #go through the dictionary again: all keys but search only if new records are cited
        write_message("Checking authors in new records")
        updated_records = set(updated_redic_list)  # built once, not per key
        i = 0
        for k in citedbydict:
            if i % 1000 == 0:
                mesg = "Author cit done " + str(i) + " of " + str(
                    len(citedbydict)) + " records"
                write_message(mesg)
                task_update_progress(mesg)
            i = i + 1

            these_cite_k = citedbydict[k]
            if these_cite_k is None:
                these_cite_k = []  #verify it is an empty list, not None
            #do things only if these_cite_k contains any new stuff
            if updated_records.intersection(these_cite_k):
                _merge_author_citers(k, these_cite_k, author_tags,
                                     author_cited_in)

        mesg = "Author cit done fully"
        write_message(mesg)
        task_update_progress(mesg)

    return author_cited_in

Example #15

0

Show file

File: bibrank_citation_indexer.py Project: Kennethhole/Invenio-1

def get_self_citations(new_record_list, citationdic, initial_selfcitdict, config):
    """Check which items have been cited by one of the authors of the
       citing item: go through id's in new_record_list, use citationdic to get citations,
       update "selfcites". Selfcites is originally initial_selfcitdict. Return selfcites.

       A citation counts as a self-citation when the cited and the citing
       record share at least one value among their first author,
       additional author and alternative author name fields.
       initial_selfcitdict is updated in place. If one of the author tags
       is missing in config, the problem is registered and
       initial_selfcitdict is returned unchanged.
    """
    #get the tags for main author, coauthors, ext authors from config
    tags = ['first_author', 'additional_author', 'alternative_author_name']
    tagvals = {}
    for t in tags:
        try:
            tagvals[t] = config.get(config.get("rank_method", "function"), t)
        except Exception:
            register_exception(prefix="attribute "+t+" missing in config", alert_admin=True)
            return initial_selfcitdict

    #parse the tags
    author_tags = (tagify(parse_tag(tagvals['first_author'])),
                   tagify(parse_tag(tagvals['additional_author'])),
                   tagify(parse_tag(tagvals['alternative_author_name'])))

    selfcites = initial_selfcitdict
    i = 0 #just for debugging ..
    for k in new_record_list:
        if i % 1000 == 0:
            mesg = "Selfcites done "+str(i)+" of "+str(len(new_record_list))+" records"
            write_message(mesg)
            task_update_progress(mesg)
        i = i+1

        #nothing to do for records nobody cites
        if k not in citationdic:
            continue

        #get the authors of k; the values of all three fields are
        #flattened into one collection so that each name can be matched
        authors_of_k = set()
        for author_tag in author_tags:
            authors_of_k.update(get_fieldvalues(k, author_tag))
        if not authors_of_k:
            continue

        #go through the "x-cites-this" list
        for c in citationdic[k] or []:
            #get authors of c
            cauthorlist = []
            for author_tag in author_tags:
                cauthorlist.extend(get_fieldvalues(c, author_tag))
            if not authors_of_k.intersection(cauthorlist):
                continue
            #found!
            citers = selfcites.get(k)
            if citers is None:
                #new key for selfcites
                selfcites[k] = [c]
            elif c not in citers:
                #add only if not there already
                citers.append(c)

    mesg = "Selfcites done fully"
    write_message(mesg)
    task_update_progress(mesg)

    return selfcites

Example #16

0

Show file

File: bibrank_citation_indexer.py Project: AlbertoPeon/invenio

def get_tags_config(config):
    """Build the dictionary of MARC tags used by the citation indexer.

    The rank method name is read from the ``rank_method``/``function``
    option of ``config``; every other option is then looked up in the
    section bearing that name.  An option that is missing from the
    section yields ``None`` for the corresponding key (or the standard
    pubinfo form for ``publication_format``).

    @param config: parsed configuration exposing ``get(section, option)``
    @return: dict with the keys ``record_pri_number``,
        ``record_add_number``, ``refs_report_number``, ``refs_journal``,
        ``refs_doi``, ``publication``, ``doi`` and ``publication_format``
    """
    # Probably "citation" unless this file gets renamed
    method_name = config.get("rank_method", "function")
    write_message("config function %s" % method_name, verbose=9)

    # (key in the result, option name in the config section)
    single_tag_options = (
        # 037a: contains (often) the "hep-ph/0501084" tag of THIS record
        ('record_pri_number', "primary_report_number"),
        # 088a: additional short identifier for the record
        ('record_add_number', "additional_report_number"),
        # 999C5r. this is in the reference list, refers to other records.
        # Looks like: hep-ph/0408002
        ('refs_report_number', "reference_via_report_number"),
        # 999C5s. this is in the reference list, refers to other records.
        # Looks like: Phys.Rev.,A21,78
        ('refs_journal', "reference_via_pubinfo"),
        # 999C5a. this is in the reference list, refers to other records.
        # Looks like: 10.1007/BF03170733
        ('refs_doi', "reference_via_doi"),
    )

    tags = {}
    for key, option in single_tag_options:
        try:
            raw_tag = config.get(method_name, option)
        except ConfigParser.NoOptionError:
            tags[key] = None
        else:
            tags[key] = tagify(parse_tag(raw_tag))

    # Fields needed to construct the journals for this record.
    # All four options must be present; if any one is missing the whole
    # 'publication' entry is None.
    journal_options = (
        ('pages', "pubinfo_journal_page"),
        ('year', "pubinfo_journal_year"),
        ('journal', "pubinfo_journal_title"),
        ('volume', "pubinfo_journal_volume"),
    )
    try:
        raw_journal_tags = [(part, config.get(method_name, option))
                            for part, option in journal_options]
    except ConfigParser.NoOptionError:
        tags['publication'] = None
    else:
        tags['publication'] = dict((part, tagify(parse_tag(raw_tag)))
                                   for part, raw_tag in raw_journal_tags)

    # Fields needed to lookup the DOIs
    tags['doi'] = get_field_tags('doi')

    # 999C5s. A standardized way of writing a reference in the reference list.
    # Like: Nucl. Phys. B 710 (2000) 371
    try:
        pubinfo_format = config.get(method_name, "pubinfo_journal_format")
    except ConfigParser.NoOptionError:
        pubinfo_format = CFG_JOURNAL_PUBINFO_STANDARD_FORM
    tags['publication_format'] = pubinfo_format

    # Print values of tags for debugging
    write_message("tag values: %r" % [tags], verbose=9)

    return tags

Example #17

0

Show file

File: bfe_field.py Project: AlbertoPeon/invenio

def format_element(bfo, tag, limit, instances_separator=" ",
           subfields_separator=" ", extension="", output_pattern=""):
    """
    Prints the given field of a record.
    If tag is in range [001, 010], this element assumes
    that it accesses a control field. Else it considers it
    accesses a data field.

    <p>For eg. consider the following metadata:
    <pre>
 100__ $$aCalatroni, S$$uCERN
 245__ $$aStatus of the EP Simulations and Facilities for the SPL
 700__ $$aFerreira, L$$uCERN
 700__ $$aMacatrao, M$$uCERN
 700__ $$aSkala, A$$uCERN
 700__ $$aSosin, M$$uCERN
 700__ $$ade Waele, R$$uCERN
 700__ $$aWithofs, Y$$uKHLim, Diepenbeek
    </pre>
    The following calls to bfe_field would print:
    <pre>
    &lt;BFE_FIELD tag="700" instances_separator="&lt;br/>" subfields_separator=" - ">

    Ferreira, L - CERN
    Macatrao, M - CERN
    Skala, A - CERN
    Sosin, M - CERN
    de Waele, R - CERN
    Withofs, Y - KHLim, Diepenbeek
    </pre>
    </p>

    <p>For more advanced formatting, the <code>output_pattern</code>
    parameter can be used to output the subfields of each instance in
    the specified way. For eg. consider the following metadata:
    <pre>
 775__ $$b15. Aufl.$$c1995-1996$$nv.1$$pGrundlagen und Werkstoffe$$w317999
 775__ $$b12. Aufl.$$c1963$$w278898
 775__ $$b14. Aufl.$$c1983$$w107899
 775__ $$b13. Aufl.$$c1974$$w99635
    </pre>
    with the following <code>output_pattern</code>:

    <pre>
    &lt;a href="/record/%(w)s">%(b)s (%(c)s) %(n)s %(p)s&lt;/a>
    </pre>
    would print:<br/>

    <a href="/record/317999">15. Aufl. (1995-1996) v.1 Grundlagen und Werkstoffe</a><br/>
    <a href="/record/278898">12. Aufl. (1963) </a><br/>
    <a href="/record/107899">14. Aufl. (1983) </a><br/>
    <a href="/record/99635">13. Aufl. (1974) </a>

    <br/>(<code>instances_separator="&lt;br/>"</code> set for
    readability)<br/> The output pattern must follow <a
    href="http://docs.python.org/library/stdtypes.html#string-formatting-operations">Python
    string formatting</a> syntax. The format must use parenthesized
    notation to map to the subfield code. This currently restricts the
    support of <code>output_pattern</code> to non-repeatable
    subfields</p>

    @param tag: the tag code of the field that is to be printed
    @param instances_separator: a separator between instances of field
    @param subfields_separator: a separator between subfields of an instance
    @param limit: the maximum number of values to display.
    @param extension: a text printed at the end if 'limit' has been exceeded
    @param output_pattern: when specified, prints the subfields of each instance according to pattern specified as parameter (following Python string formatting convention)
    """
    # parse_tag() yields [tag, ind1, ind2, subfield code]
    p_tag = parse_tag(tag)
    field_code = p_tag[0]

    # Tags that are not purely numeric (e.g. containing a wildcard)
    # are not handled by this element.
    if not field_code.isdigit():
        return ''

    # Check if data or control field
    if int(field_code) in range(0, 11):
        return  bfo.control_field(tag)

    # Get values without subcode; unneeded subcodes are filtered
    # later.  Empty indicators are spelled '_' when querying the record.
    ind1 = p_tag[1] or '_'
    ind2 = p_tag[2] or '_'
    wanted_subcode = p_tag[3]
    values = bfo.fields(field_code + ind1 + ind2) # Always a list of dicts

    # 'limit' arrives as a string; a non-numeric value means "no limit"
    max_values = None
    if limit.isdigit():
        max_values = int(limit)

    shown = 0               # Number of values output so far
    limit_exceeded = False  # True once at least one value had to be dropped
    instances_out = []      # Retain each instance output
    for instance in values:
        filtered_values = [value for (subcode, value) in instance.items()
                           if wanted_subcode in ('', '%')
                           or wanted_subcode == subcode]
        if not filtered_values:
            # No corresponding subcode in this instance
            continue

        if max_values is not None and \
               shown + len(filtered_values) > max_values:
            # Some values do not fit. Remember it explicitly: the
            # counter alone can never grow past the limit since the
            # values are truncated before being counted, so it cannot
            # be used to decide whether 'extension' must be printed.
            limit_exceeded = True
            filtered_values = filtered_values[:max_values - shown]
            if not filtered_values:
                break # Limit already reached: do not append empty output

        if output_pattern:
            try:
                instances_out.append(output_pattern % DictNoKeyError(instance))
            except Exception:
                # Best effort: an instance that cannot be rendered with
                # the given pattern (e.g. malformed pattern) is skipped.
                pass
        else:
            instances_out.append(subfields_separator.join(filtered_values))
        shown += len(filtered_values)

        if limit_exceeded:
            break # No need to go further

    ext_out = ''
    if limit_exceeded:
        ext_out = extension

    return instances_separator.join(instances_out) + ext_out