Exemple #1
0
def match_unit(record, p, f=None, m='a', wl=None):
    """Match record to basic match unit.

    Recursively walks ``record`` and reports whether any leaf value
    matches the search value ``p``.

    :param record: value to inspect; lists and the values of dicts are
        searched recursively, any other value is compared through
        ``str(record)``.
    :param p: search value. With ``m == 'e'`` it is compared for equality,
        otherwise it is used as a regular expression (strings are compiled
        with ``re.compile``).
    :param f: optional field name; when given, only the keys of ``record``
        resolved for that field (plus ``f`` itself) are searched.
    :param m: matching type; ``'e'`` means exact match, every other value
        means regular expression search.
    :param wl: not read by this function; it is only forwarded to the
        recursive calls.
    :returns: ``True`` when a match is found, ``False`` otherwise. When
        ``record`` is ``None`` the result is ``p is None``.
    """
    if record is None:
        return p is None

    if f is not None:
        # The field-name helpers are needed on this branch only; all
        # recursive calls run with ``f=None`` and skip these imports.
        from invenio.modules.jsonalchemy.parser import (
            guess_legacy_field_names,
        )
        from invenio.legacy.bibindex.engine_utils import get_field_tags

        fields = (get_field_tags(f, 'nonmarc') +
                  guess_legacy_field_names(f, 'marc', 'recordext')[f] + [f])
        # Candidate keys absent from the record are ignored; the generator
        # lets ``any`` stop at the first matching field.
        return any(match_unit(record[field], p, f=None, m=m, wl=wl)
                   for field in fields if field in record)

    # compile search value only once for non exact search
    if m != 'e' and isinstance(p, six.string_types):
        p = re.compile(p)

    # Generators (not lists) so that ``any`` short-circuits on the first hit.
    if isinstance(record, list):
        return any(match_unit(item, p, f=f, m=m, wl=wl) for item in record)
    if isinstance(record, dict):
        return any(match_unit(item, p, f=f, m=m, wl=wl)
                   for item in record.values())

    if m == 'e':
        return str(record) == p
    return p.search(str(record)) is not None
Exemple #2
0
def match_unit(record, p, f=None, m='a', wl=None):
    """Match record to basic match unit.

    Searches ``record`` (a scalar, a list, or a dict, possibly nested) for
    a leaf value matching ``p``.

    :param record: value to inspect. Lists and dict values are descended
        into; other values are converted with ``str`` before comparison.
    :param p: search value: compared for equality when ``m == 'e'``,
        otherwise used as a regular expression (string values are
        compiled first).
    :param f: optional field name restricting the search to the record
        keys resolved for that field, plus ``f`` itself.
    :param m: matching type, ``'e'`` for exact match; any other value
        selects regular expression search.
    :param wl: unused here apart from being passed on to recursive calls.
    :returns: ``True`` if something matched, else ``False``; for a ``None``
        record, whether ``p`` is ``None`` as well.
    """
    if record is None:
        return p is None

    if f is not None:
        # Imported lazily: only this branch uses the helpers, and the
        # recursion below always continues with ``f=None``.
        from invenio.modules.jsonalchemy.parser import (
            guess_legacy_field_names,
        )
        from invenio.legacy.bibindex.engine_utils import get_field_tags

        candidates = (
            get_field_tags(f, 'nonmarc') +
            guess_legacy_field_names(f, 'marc', 'recordext')[f] + [f])
        for field in candidates:
            if field not in record:
                continue
            if match_unit(record[field], p, f=None, m=m, wl=wl):
                return True
        return False

    # compile search value only once for non exact search
    if m != 'e' and isinstance(p, six.string_types):
        p = re.compile(p)

    children = None
    if isinstance(record, list):
        children = record
    elif isinstance(record, dict):
        children = record.values()
    if children is not None:
        # Generator expression: stop descending as soon as one child matches.
        return any(match_unit(child, p, f=f, m=m, wl=wl)
                   for child in children)

    if m == 'e':
        return str(record) == p
    return p.search(str(record)) is not None
Exemple #3
0
def format_element(bfo,
                   name,
                   tag_name='',
                   tag='',
                   kb='',
                   kb_default_output='',
                   var='',
                   protocol='googlescholar'):
    """Prints a custom field in a way suitable to be used in HTML META
    tags.  In particular conforms to Google Scholar harvesting protocol as
    defined http://scholar.google.com/intl/en/scholar/inclusion.html and
    Open Graph http://ogp.me/

    @param tag_name: the name, from tag table, of the field to be exported
    looks initially for names prefixed by "meta-"<tag_name>
    then looks for exact name, then falls through to "tag"
    @param tag: the MARC tag to be exported (only if not defined by tag_name). Comma-separated list of tags.
    @param name: name to be displayed in the meta headers, labelling this value.
    @param kb: a knowledge base through which to process the retrieved value if necessary.
    @param kb_default_output: when a '<code>kb</code>' is specified and no match for value is found, what shall we
               return? Either return the given parameter or specify "{value}" to return the retrieved
               value before processing though kb.
    @param var: the name of a variable to output instead of field from metadata.
                Allowed values are those supported by bfe_server_info and
                bfe_client_info. Overrides <code>tag_name</code> and <code>tag</code>
    @param protocol: the protocol this tag is aimed at. Can be used to switch on/off support for a given "protocol". Can take values among 'googlescholar', 'opengraph'
    @return: the meta tags built by create_metatag, one per distinct value and
             joined with newlines; an empty string when the protocol is
             switched off or when no tag could be determined.
    @see: bfe_server_info.py, bfe_client_info.py
    """
    if protocol == 'googlescholar' and not CFG_WEBSEARCH_ENABLE_GOOGLESCHOLAR:
        return ""
    elif protocol == 'opengraph' and not CFG_WEBSEARCH_ENABLE_OPENGRAPH:
        return ""

    if var:
        # delegate to bfe_server_info or bfe_client_info:
        value = server_info(bfo, var)
        if value.startswith("Unknown variable: "):
            # Oops variable was not defined there
            value = client_info(bfo, var)
        if value.startswith("Unknown variable: "):
            return ""
        return create_metatag(name=name,
                              content=cgi.escape(value, True)) or ""

    matched_by_tag_name_p = False
    tags = []
    if tag_name:
        # First check for special meta named tags
        tags = get_field_tags("meta-" + tag_name)
        if not tags:
            # then check for regular tags
            tags = get_field_tags(tag_name)
        matched_by_tag_name_p = bool(tags)
    if not tags and tag:
        # fall back to explicit marc tag (split yields [tag] without comma)
        tags = tag.split(',')
    if not tags:
        return ''

    if protocol == 'googlescholar' and \
            (tags == ['100__a'] or tags == ['700__a']):
        # Authors for Google Scholar: remove names that are not purely
        # author (thesis director, coordinator, etc). Assume that
        # existence of $e subfield is a sign. Since this assumption
        # might be wrong, put some strong conditions in order to get
        # into this branch, with easy way to bypass.
        subfield_code = tags[0][-1]
        # Instances lacking the requested subfield are skipped instead of
        # raising KeyError.
        values = [field_instance[subfield_code]
                  for field_instance in bfo.fields(tags[0][:-1], escape=9)
                  if 'e' not in field_instance
                  and subfield_code in field_instance]
    else:
        # Standard fetching of values
        values = [bfo.fields(marctag, escape=9) for marctag in tags]

    if name == 'citation_dissertation_institution':
        if CFG_CERN_SITE and 'THESIS' in bfo.fields('980__a'):
            # Work on a copy so the list returned by bfo.fields is not
            # extended in place.
            authors = list(bfo.fields('100__', escape=9))
            authors.extend(bfo.fields('700__', escape=9))
            values = [field_instance['u'] for field_instance in authors
                      if 'e' not in field_instance and 'u' in field_instance]
        elif tag == '100__u' and not matched_by_tag_name_p:
            # TODO: find way to map correctly this tag
            values = []

    # Flatten: each entry may be a plain value, a dict of subfields, or a
    # list of either.
    out = []
    for value in values:
        if isinstance(value, list):
            for val in value:
                if isinstance(val, dict):
                    out.extend(val.values())
                else:
                    out.append(val)
        elif isinstance(value, dict):
            out.extend(value.values())
        else:
            out.append(value)

    if name == 'citation_date':
        out = [item.replace('-', '/') for item in out]
    elif name == 'citation_publication_date':
        for item in out:
            # Stop at first match
            parsed_date = parse_date_for_googlescholar(item)
            if parsed_date:
                out = [parsed_date]
                break

    # Remove duplicates while keeping the first occurrence of each value, so
    # that the emitted tags follow the order of the record (e.g. authors).
    seen = set()
    unique_out = []
    for item in out:
        if item not in seen:
            seen.add(item)
            unique_out.append(item)
    out = unique_out

    if kb:
        if kb_default_output == "{value}":
            out = [bfo.kb(kb, value, value) for value in out]
        else:
            out = [bfo.kb(kb, value, kb_default_output) for value in out]
    return '\n'.join(
        [create_metatag(name=name, content=value) for value in out])
def get_tags_config(config):
    """Collect the tags needed by the rank method from ``config``.

    The config section to read is the one named by the ``function`` option
    of the ``rank_method`` section.  Each optional tag option is parsed with
    ``parse_tag`` and ``tagify``; an option missing from the section yields
    ``None`` (for ``publication``, a single missing journal option makes the
    whole entry ``None``).  ``publication_format`` falls back to
    ``CFG_JOURNAL_PUBINFO_STANDARD_FORM``.

    @param config: parsed configuration object offering ``get(section, option)``
    @return: dict of tag settings keyed by purpose
    """
    # Probably "citation" unless this file gets renamed
    section = config.get("rank_method", "function")
    write_message("config function %s" % section, verbose=9)

    # (result key, config option), looked up in this order
    single_tag_options = (
        # 037a: contains (often) the "hep-ph/0501084" tag of THIS record
        ('record_pri_number', "primary_report_number"),
        # 088a: additional short identifier for the record
        ('record_add_number', "additional_report_number"),
        # 999C5r. this is in the reference list, refers to other records.
        # Looks like: hep-ph/0408002
        ('refs_report_number', "reference_via_report_number"),
        # 999C5s. this is in the reference list, refers to other records.
        # Looks like: Phys.Rev.,A21,78
        ('refs_journal', "reference_via_pubinfo"),
        # 999C5a. this is in the reference list, refers to other records.
        # Looks like: 10.1007/BF03170733
        ('refs_doi', "reference_via_doi"),
        # 999C50. this is in the reference list, refers to other records.
        # Looks like: 1205
        ('refs_record_id', "reference_via_record_id"),
        # 999C5i. this is in the reference list, refers to other records.
        # Looks like: 9781439520031
        ('refs_isbn', "reference_via_isbn"),
    )

    tags = {}
    for result_key, option_name in single_tag_options:
        try:
            raw_tag = config.get(section, option_name)
        except ConfigParser.NoOptionError:
            tags[result_key] = None
        else:
            tags[result_key] = tagify(parse_tag(raw_tag))

    # Fields needed to construct the journals for this record
    journal_options = (
        ('pages', "pubinfo_journal_page"),
        ('year', "pubinfo_journal_year"),
        ('journal', "pubinfo_journal_title"),
        ('volume', "pubinfo_journal_volume"),
    )
    try:
        raw_journal_tags = [(part, config.get(section, option_name))
                            for part, option_name in journal_options]
    except ConfigParser.NoOptionError:
        tags['publication'] = None
    else:
        tags['publication'] = dict(
            (part, tagify(parse_tag(raw_tag)))
            for part, raw_tag in raw_journal_tags)

    # Fields needed to lookup the DOIs
    tags['doi'] = get_field_tags('doi')

    # Fields needed to lookup the ISBN
    tags['isbn'] = get_field_tags('isbn')

    # 999C5s. A standardized way of writing a reference in the reference list.
    # Like: Nucl. Phys. B 710 (2000) 371
    try:
        publication_format = config.get(section, "pubinfo_journal_format")
    except ConfigParser.NoOptionError:
        publication_format = CFG_JOURNAL_PUBINFO_STANDARD_FORM
    tags['publication_format'] = publication_format

    # Print values of tags for debugging
    write_message("tag values: %r" % [tags], verbose=9)

    return tags
def get_tags_config(config):
    """Build the dictionary of tag settings used by the rank method.

    Reads the section named by ``rank_method``/``function`` in ``config``.
    Optional tag options that are absent give ``None``; present ones are
    passed through ``parse_tag`` and ``tagify``.  The ``publication`` entry
    is ``None`` as soon as one of its four journal options is absent, and
    ``publication_format`` defaults to ``CFG_JOURNAL_PUBINFO_STANDARD_FORM``.

    @param config: configuration object providing ``get(section, option)``
    @return: dict mapping setting names to their tags
    """
    # Probably "citation" unless this file gets renamed
    function = config.get("rank_method", "function")
    write_message("config function %s" % function, verbose=9)

    def optional_tag(option):
        """Return the tagified value of ``option``, or None when unset."""
        try:
            raw = config.get(function, option)
        except ConfigParser.NoOptionError:
            return None
        return tagify(parse_tag(raw))

    tags = {}

    # 037a: contains (often) the "hep-ph/0501084" tag of THIS record
    tags["record_pri_number"] = optional_tag("primary_report_number")

    # 088a: additional short identifier for the record
    tags["record_add_number"] = optional_tag("additional_report_number")

    # 999C5r. this is in the reference list, refers to other records.
    # Looks like: hep-ph/0408002
    tags["refs_report_number"] = optional_tag("reference_via_report_number")

    # 999C5s. this is in the reference list, refers to other records.
    # Looks like: Phys.Rev.,A21,78
    tags["refs_journal"] = optional_tag("reference_via_pubinfo")

    # 999C5a. this is in the reference list, refers to other records.
    # Looks like: 10.1007/BF03170733
    tags["refs_doi"] = optional_tag("reference_via_doi")

    # 999C50. this is in the reference list, refers to other records.
    # Looks like: 1205
    tags["refs_record_id"] = optional_tag("reference_via_record_id")

    # 999C5i. this is in the reference list, refers to other records.
    # Looks like: 9781439520031
    tags["refs_isbn"] = optional_tag("reference_via_isbn")

    # Fields needed to construct the journals for this record
    try:
        pages = config.get(function, "pubinfo_journal_page")
        year = config.get(function, "pubinfo_journal_year")
        journal = config.get(function, "pubinfo_journal_title")
        volume = config.get(function, "pubinfo_journal_volume")
    except ConfigParser.NoOptionError:
        tags["publication"] = None
    else:
        publication = {}
        publication["pages"] = tagify(parse_tag(pages))
        publication["year"] = tagify(parse_tag(year))
        publication["journal"] = tagify(parse_tag(journal))
        publication["volume"] = tagify(parse_tag(volume))
        tags["publication"] = publication

    # Fields needed to lookup the DOIs
    tags["doi"] = get_field_tags("doi")

    # Fields needed to lookup the ISBN
    tags["isbn"] = get_field_tags("isbn")

    # 999C5s. A standardized way of writing a reference in the reference list.
    # Like: Nucl. Phys. B 710 (2000) 371
    try:
        journal_format = config.get(function, "pubinfo_journal_format")
    except ConfigParser.NoOptionError:
        journal_format = CFG_JOURNAL_PUBINFO_STANDARD_FORM
    tags["publication_format"] = journal_format

    # Print values of tags for debugging
    write_message("tag values: %r" % [tags], verbose=9)

    return tags
Exemple #6
0
def format_element(bfo, name, tag_name='', tag='', kb='', kb_default_output='', var='', protocol='googlescholar'):
    """Prints a custom field in a way suitable to be used in HTML META
    tags.  In particular conforms to Google Scholar harvesting protocol as
    defined http://scholar.google.com/intl/en/scholar/inclusion.html and
    Open Graph http://ogp.me/

    @param tag_name: the name, from tag table, of the field to be exported
    looks initially for names prefixed by "meta-"<tag_name>
    then looks for exact name, then falls through to "tag"
    @param tag: the MARC tag to be exported (only if not defined by tag_name). Comma-separated list of tags.
    @param name: name to be displayed in the meta headers, labelling this value.
    @param kb: a knowledge base through which to process the retrieved value if necessary.
    @param kb_default_output: when a '<code>kb</code>' is specified and no match for value is found, what shall we
               return? Either return the given parameter or specify "{value}" to return the retrieved
               value before processing though kb.
    @param var: the name of a variable to output instead of field from metadata.
                Allowed values are those supported by bfe_server_info and
                bfe_client_info. Overrides <code>tag_name</code> and <code>tag</code>
    @param protocol: the protocol this tag is aimed at. Can be used to switch on/off support for a given "protocol". Can take values among 'googlescholar', 'opengraph'
    @return: newline-joined output of create_metatag for every distinct value,
             in order of first appearance; an empty string when the protocol
             is disabled or no tag applies.
    @see: bfe_server_info.py, bfe_client_info.py
    """
    if protocol == 'googlescholar' and not CFG_WEBSEARCH_ENABLE_GOOGLESCHOLAR:
        return ""
    elif protocol == 'opengraph' and not CFG_WEBSEARCH_ENABLE_OPENGRAPH:
        return ""

    if var:
        # delegate to bfe_server_info or bfe_client_info:
        value = server_info(bfo, var)
        if value.startswith("Unknown variable: "):
            # Oops variable was not defined there
            value = client_info(bfo, var)
        if value.startswith("Unknown variable: "):
            return ""
        return create_metatag(name=name, content=cgi.escape(value, True)) or ""

    matched_by_tag_name_p = False
    tags = []
    if tag_name:
        # First check for special meta named tags
        tags = get_field_tags("meta-" + tag_name)
        if not tags:
            # then check for regular tags
            tags = get_field_tags(tag_name)
        matched_by_tag_name_p = bool(tags)
    if not tags and tag:
        # fall back to explicit marc tag; without a comma split gives [tag]
        tags = tag.split(',')
    if not tags:
        return ''

    if protocol == 'googlescholar' and \
            (tags == ['100__a'] or tags == ['700__a']):
        # Authors for Google Scholar: remove names that are not purely
        # author (thesis director, coordinator, etc). Assume that
        # existence of $e subfield is a sign. Since this assumption
        # might be wrong, put some strong conditions in order to get
        # into this branch, with easy way to bypass.
        wanted_subfield = tags[0][-1]
        values = []
        for field_instance in bfo.fields(tags[0][:-1], escape=9):
            if 'e' in field_instance:
                continue
            # Skip instances without the wanted subfield rather than
            # failing with KeyError.
            if wanted_subfield in field_instance:
                values.append(field_instance[wanted_subfield])
    else:
        # Standard fetching of values
        values = [bfo.fields(marctag, escape=9) for marctag in tags]

    if name == 'citation_dissertation_institution':
        if CFG_CERN_SITE and 'THESIS' in bfo.fields('980__a'):
            # Copy first: the list returned by bfo.fields is left untouched.
            authors = list(bfo.fields('100__', escape=9))
            authors.extend(bfo.fields('700__', escape=9))
            values = [field_instance['u'] for field_instance in authors
                      if 'e' not in field_instance and 'u' in field_instance]
        elif tag == '100__u' and not matched_by_tag_name_p:
            # TODO: find way to map correctly this tag
            values = []

    # Flatten values: entries are plain values, dicts of subfields, or
    # lists of either.
    out = []
    for value in values:
        if isinstance(value, list):
            for val in value:
                if isinstance(val, dict):
                    out.extend(val.values())
                else:
                    out.append(val)
        elif isinstance(value, dict):
            out.extend(value.values())
        else:
            out.append(value)

    if name == 'citation_date':
        out = [item.replace('-', '/') for item in out]
    elif name == 'citation_publication_date':
        for item in out:
            # Stop at first match
            parsed_date = parse_date_for_googlescholar(item)
            if parsed_date:
                out = [parsed_date]
                break

    # Remove duplicates, keeping the first occurrence so the output order
    # follows the record instead of depending on dict ordering.
    already_seen = set()
    deduplicated = []
    for item in out:
        if item in already_seen:
            continue
        already_seen.add(item)
        deduplicated.append(item)
    out = deduplicated

    if kb:
        if kb_default_output == "{value}":
            out = [bfo.kb(kb, value, value) for value in out]
        else:
            out = [bfo.kb(kb, value, kb_default_output) for value in out]
    return '\n'.join([create_metatag(name=name, content=value) for value in out])