Exemple #1
0
 def test_EJOURNALS_keys(self):
     """bibknowledge - test left/right rules (key lookups)"""
     from invenio.modules.knowledge.api import get_kbr_keys, get_kbr_values

     kb_name = "EJOURNALS"

     # Key lookup for "Acta" is expected to yield two entries.
     acta_keys = get_kbr_keys(kb_name, "Acta")
     self.assertEqual(2, len(acta_keys))

     # An empty search key with searchtype 'e' is expected to match nothing.
     empty_key_matches = get_kbr_values(kb_name, '', searchtype='e')
     self.assertEqual(0, len(empty_key_matches))

     # With searchtype 's', omitting the key and passing an empty key
     # explicitly are both expected to yield the same 327 values.
     without_key = get_kbr_values(kb_name, searchtype='s')
     self.assertEqual(327, len(without_key))
     with_empty_key = get_kbr_values(kb_name, searchkey='', searchtype='s')
     self.assertEqual(327, len(with_empty_key))
Exemple #2
0
def crossref_translate_title(record):
    """
    Replace the journal title of every 773 field with the Inspire
    specific abbreviation found in the JOURNALS knowledge base.

    The record is updated in place through record_modify_subfield();
    fields whose title has no knowledge base match are left untouched.
    The function itself does not return anything.

    @param record: record whose 773 fields are translated
    """
    # Usually a record carries a single 773 field, but every instance
    # is processed in case there are several.
    for field_instance in record_get_field_instances(record, '773'):
        # The value of the field's first subfield is used as the title.
        journal_title = field_instance[0][0][1]
        kb_matches = get_kbr_values("JOURNALS", journal_title, searchtype='e')
        if not kb_matches:
            continue
        # The lookup yields a list; only its first value is needed.
        abbreviation = kb_matches[0][0]
        global_position = field_instance[4]
        record_modify_subfield(
            rec=record,
            tag='773',
            subfield_code='p',
            value=abbreviation,
            subfield_position=0,
            field_position_global=global_position,
        )
Exemple #3
0
def crossref_translate_title(record):
    """
    Translate the title stored in each 773 field of the record into the
    Inspire specific abbreviation (looked up in the JOURNALS knowledge
    base).

    The change is applied to the given record in place via
    record_modify_subfield(); no value is returned. A field is skipped
    when the knowledge base has no entry for its title.

    @param record: record to update
    """
    # There is probably only one 773 field, but all instances are handled.
    instances = record_get_field_instances(record, '773')
    for instance in instances:
        subfields = instance[0]
        # The first subfield's value is taken as the title to translate.
        original_title = subfields[0][1]
        translations = get_kbr_values("JOURNALS", original_title,
                                      searchtype='e')
        if translations:
            # A list comes back; the first value of the first entry is used.
            translated_title = translations[0][0]
            record_modify_subfield(rec=record, tag='773',
                                   subfield_code='p',
                                   value=translated_title,
                                   subfield_position=0,
                                   field_position_global=instance[4])
Exemple #4
0
    def kb(self, kb, string, default=""):
        """
        Translate "string" through the knowledge base "kb".

        The lookup is done with get_kbr_values() using searchtype 'e'.
        When "string" is empty, or the lookup yields no usable first
        value, 'default' is returned instead (empty string if not given).

        :param kb: a knowledge base name
        :param string: the string we want to translate
        :param default: value returned if 'string' is empty or not found
        @return: the first value found for "string" in "kb", or 'default'
        """
        # Nothing to translate: skip the lookup entirely.
        if not string:
            return default

        matches = get_kbr_values(kb, searchkey=string, searchtype='e')

        try:
            first_match = matches[0]
            translated = first_match[0]
        except IndexError:
            # No match (or an empty first entry): fall back to the default.
            translated = default
        return translated
Exemple #5
0
def get_synonym_terms(term, kbr_name, match_type, use_memoise=False):
    """Return list of synonyms for TERM by looking in KBR_NAME.

    Depending on MATCH_TYPE only the leading part of TERM is looked up;
    the part that was cut off is appended again to every synonym found.

    :param term: search-time term or index-time term
    :param kbr_name: knowledge base name
    :param match_type: specifies how the term matches against the KBR
        before doing the lookup.  Could be `exact' (default),
        'leading_to_comma', `leading_to_number'.
    :param use_memoise: can we memoise while doing lookups?
    :return: list of term synonyms (a real list, without duplicates)
    """
    from invenio.legacy.bibindex.engine_config import \
        CFG_BIBINDEX_SYNONYM_MATCH_TYPE as MATCH_TYPES

    # exact match is default: look up the whole term, nothing is cut off
    term_for_lookup = term
    term_remainder = ''

    # but maybe match different term: pick the pattern that splits the
    # term into "part to look up" and "remainder to re-append"
    if match_type == MATCH_TYPES['leading_to_comma']:
        split_pattern = r'^(.*?)(\s*,.*)$'
    elif match_type == MATCH_TYPES['leading_to_number']:
        split_pattern = r'^(.*?)(\s*\d.*)$'
    else:
        split_pattern = None
    if split_pattern is not None:
        mmm = re.match(split_pattern, term)
        if mmm:
            term_for_lookup, term_remainder = mmm.groups()

    # FIXME: workaround: escaping SQL wild-card signs, since KBR's
    # exact search is doing LIKE query, so would match everything:
    # NOTE(review): '_' is also a LIKE wild-card and is not escaped
    # here -- confirm whether that matters for the KBR lookup.
    term_for_lookup = term_for_lookup.replace('%', '\\%')

    # OK, now find synonyms; dict keys are used to drop duplicates:
    dterms = {}
    for kbr_values in get_kbr_values(kbr_name,
                                     searchkey=term_for_lookup,
                                     searchtype='e',
                                     use_memoise=use_memoise):
        for kbr_value in kbr_values:
            dterms[kbr_value + term_remainder] = 1

    # return a real list of term synonyms: on Python 3 ``dict.keys()``
    # is only a view, which callers cannot index or concatenate.
    return list(dterms)
Exemple #6
0
def get_synonym_terms(term, kbr_name, match_type, use_memoise=False):
    """Return list of synonyms for TERM by looking in KBR_NAME.

    For the non-exact match types, TERM is split into a leading part,
    which is looked up in the knowledge base, and a remainder, which is
    appended unchanged to each synonym that is found.

    :param term: search-time term or index-time term
    :param kbr_name: knowledge base name
    :param match_type: specifies how the term matches against the KBR
        before doing the lookup.  Could be `exact' (default),
        'leading_to_comma', `leading_to_number'.
    :param use_memoise: can we memoise while doing lookups?
    :return: list of term synonyms (always a list, duplicates removed)
    """
    from invenio.legacy.bibindex.engine_config import \
        CFG_BIBINDEX_SYNONYM_MATCH_TYPE as MATCH_TYPES

    # exact match is default:
    term_for_lookup = term
    term_remainder = ''

    # but maybe match different term:
    mmm = None
    if match_type == MATCH_TYPES['leading_to_comma']:
        # everything before the first comma is looked up
        mmm = re.match(r'^(.*?)(\s*,.*)$', term)
    elif match_type == MATCH_TYPES['leading_to_number']:
        # everything before the first digit is looked up
        mmm = re.match(r'^(.*?)(\s*\d.*)$', term)
    if mmm:
        term_for_lookup = mmm.group(1)
        term_remainder = mmm.group(2)

    # FIXME: workaround: escaping SQL wild-card signs, since KBR's
    # exact search is doing LIKE query, so would match everything:
    # NOTE(review): the '_' wild-card is left unescaped -- verify
    # whether it needs the same treatment.
    term_for_lookup = term_for_lookup.replace('%', '\\%')

    # OK, now find synonyms (dict keys de-duplicate the results):
    dterms = {}
    lookup_rows = get_kbr_values(kbr_name,
                                 searchkey=term_for_lookup,
                                 searchtype='e',
                                 use_memoise=use_memoise)
    for kbr_values in lookup_rows:
        for kbr_value in kbr_values:
            dterms[kbr_value + term_remainder] = 1

    # Materialise the keys: ``dict.keys()`` is a view object on
    # Python 3, while this function is documented to return a list.
    return list(dterms)
Exemple #7
0
 def test_EJOURNALS_values(self):
     """bibknowledge - test a left/right rule (value lookup)"""
     from invenio.modules.knowledge.api import get_kbr_values

     # Looking up "Astron." in EJOURNALS is expected to give 29 values.
     astron_values = get_kbr_values("EJOURNALS", "Astron.")
     expected_count = 29
     self.assertEqual(expected_count, len(astron_values))
Exemple #8
0
def compare_metadata(metadata, rec):
    """
    Compare a record with the metadata returned by crossref.

    Every field that disagrees increases a "difference" score and queues
    a message. The queued messages are reported through rec.set_invalid()
    only when the accumulated score is greater than 4.

    @param metadata: mapping holding the crossref values under the keys
        "title", "issn", "page", "author", "issue", "year" and "volume";
        a check is skipped when its crossref value is the empty string
    @param rec: record being checked; its values are read with
        get_value() and problems are reported with rec.set_invalid()
    """
    confidence_different = 0
    msgs = []

    # Check title
    title_crossref = metadata["title"]
    title_record = get_value(rec, "773__p")
    volume_extra = ""
    if title_crossref != "" and title_record is not None:
        # Remove Volume number from the title
        title_crossref = re.sub(":.*$", "", title_crossref)
        if re.search(" [A-Z]$", title_crossref):
            # A trailing single capital letter is cut off the title and
            # kept as a prefix for the volume comparison further down.
            volume_extra = title_crossref[-1]
            title_crossref = title_crossref[:-2]
        title_crossref = re.sub(" (Section|Volume)$", "", title_crossref)
        # get_kbr_values() returns a list of rows, not a string, so the
        # abbreviation has to be taken from the first row. Without a
        # match the cleaned crossref title itself is compared.
        kb_rows = get_kbr_values("JOURNALS", title_crossref, searchtype='e')
        if kb_rows:
            abbr_title = kb_rows[0][0]
        else:
            abbr_title = title_crossref
        title_similarity = compare_str(abbr_title, title_record)
        confidence_different += (1 - title_similarity) * 2
        if title_similarity < 0.6:
            msgs.append(
                "Incorrect journal name (773__p) or wrongly assigned DOI")

    # Check issn
    issn_crossref = metadata["issn"]
    issn_record = get_value(rec, "022__a")
    if issn_crossref != "" and issn_record is not None and issn_crossref != issn_record:
        confidence_different += 3
        msgs.append("Invalid ISSN (022__a) or wrongly assigned DOI")

    # Check page number (only the part before the first "-" is compared)
    page_crossref = metadata["page"]
    page_record = get_value(rec, "773__c")
    if page_record is not None and page_crossref != "":
        page_record = page_record.split("-")[0]
        page_crossref = page_crossref.split("-")[0]
        if page_record != page_crossref:
            confidence_different += 3
            msgs.append("Invalid page number (773__c) or wrongly assigned DOI")

    # Check author
    author_crossref = metadata["author"]
    author_record = get_value(rec, "100__a")
    if author_crossref != "" and author_record is not None:
        author_similarity = compare_str(author_crossref, author_record)
        confidence_different += (1 - author_similarity) * 1.5
        if author_similarity < 0.7:
            msgs.append("Invalid author (100__a) or wrongly assigned DOI")

    # Check issue
    issue_crossref = metadata["issue"]
    issue_record = get_value(rec, "773__n")
    if issue_crossref != "" and issue_record is not None and issue_crossref != issue_record:
        confidence_different += 2
        msgs.append("Invalid issue (773__n) or wrongly assigned DOI")

    # Check year
    year_crossref = metadata["year"]
    year_record = get_value(rec, "773__y")
    if year_crossref != "" and year_record is not None and year_crossref != year_record:
        confidence_different += 2
        msgs.append("Invalid year (773__y) or wrongly assigned DOI")

    # Check volume
    volume_crossref = metadata["volume"]
    volume_record = get_value(rec, "773__v")
    if volume_crossref != "" and volume_record is not None:
        # Re-attach the letter that was stripped from the title, if any.
        volume_crossref = volume_extra + volume_crossref
        if volume_crossref != volume_record:
            confidence_different += 2
            msgs.append("Invalid volume (773__v) or wrongly assigned DOI")

    # Report only when enough evidence has accumulated.
    if confidence_different > 4:
        for msg in msgs:
            rec.set_invalid(msg)
Exemple #9
0
def compare_metadata(metadata, rec):
    """
    Compare a record with the metadata returned by crossref.

    Each mismatch adds to a difference score and collects a message;
    the messages are passed to rec.set_invalid() only if the final score
    exceeds 4.

    @param metadata: mapping with the crossref values, read under the
        keys "title", "issn", "page", "author", "issue", "year" and
        "volume"; an empty string value disables the matching check
    @param rec: record to check, read through get_value() and flagged
        through rec.set_invalid()
    """
    confidence_different = 0
    msgs = []

    # Check title
    title_crossref = metadata["title"]
    title_record = get_value(rec, "773__p")
    volume_extra = ""
    if title_crossref != "" and title_record is not None:
        # Remove Volume number from the title
        title_crossref = re.sub(":.*$", "", title_crossref)
        if re.search(" [A-Z]$", title_crossref):
            # The trailing capital letter is removed from the title and
            # remembered so it can be prefixed to the crossref volume.
            volume_extra = title_crossref[-1]
            title_crossref = title_crossref[:-2]
        title_crossref = re.sub(" (Section|Volume)$", "", title_crossref)
        # The knowledge base lookup returns a list of rows; compare_str()
        # needs the abbreviation string held in the first row. When the
        # journal is unknown, fall back to the cleaned crossref title.
        abbr_title = title_crossref
        kb_matches = get_kbr_values("JOURNALS", title_crossref,
                                    searchtype='e')
        if kb_matches:
            abbr_title = kb_matches[0][0]
        title_similarity = compare_str(abbr_title, title_record)
        confidence_different += (1 - title_similarity)*2
        if title_similarity < 0.6:
            msgs.append("Incorrect journal name (773__p) or wrongly assigned DOI")

    # Check issn
    issn_crossref = metadata["issn"]
    issn_record = get_value(rec, "022__a")
    if issn_crossref != "" and issn_record is not None and issn_crossref != issn_record:
        confidence_different += 3
        msgs.append("Invalid ISSN (022__a) or wrongly assigned DOI")

    # Check page number: compare what precedes the first "-" on each side
    page_crossref = metadata["page"]
    page_record = get_value(rec, "773__c")
    if page_record is not None and page_crossref != "":
        page_record = page_record.split("-")[0]
        page_crossref = page_crossref.split("-")[0]
        if page_record != page_crossref:
            confidence_different += 3
            msgs.append("Invalid page number (773__c) or wrongly assigned DOI")

    # Check author
    author_crossref = metadata["author"]
    author_record = get_value(rec, "100__a")
    if author_crossref != "" and author_record is not None:
        author_similarity = compare_str(author_crossref, author_record)
        confidence_different += (1 - author_similarity)*1.5
        if author_similarity < 0.7:
            msgs.append("Invalid author (100__a) or wrongly assigned DOI")

    # Check issue
    issue_crossref = metadata["issue"]
    issue_record = get_value(rec, "773__n")
    if issue_crossref != "" and issue_record is not None and issue_crossref != issue_record:
        confidence_different += 2
        msgs.append("Invalid issue (773__n) or wrongly assigned DOI")

    # Check year
    year_crossref = metadata["year"]
    year_record = get_value(rec, "773__y")
    if year_crossref != "" and year_record is not None and year_crossref != year_record:
        confidence_different += 2
        msgs.append("Invalid year (773__y) or wrongly assigned DOI")

    # Check volume
    volume_crossref = metadata["volume"]
    volume_record = get_value(rec, "773__v")
    if volume_crossref != "" and volume_record is not None:
        # Prefix the letter taken from the title (empty if there was none).
        volume_crossref = volume_extra + volume_crossref
        if volume_crossref != volume_record:
            confidence_different += 2
            msgs.append("Invalid volume (773__v) or wrongly assigned DOI")

    # Flag the record only when the accumulated difference is large enough.
    if confidence_different > 4:
        for msg in msgs:
            rec.set_invalid(msg)