Example #1
def replace_references(recid, uid=None, txt=None, url=None):
    """Replace references for a record

    The record itself is not updated; the MARC XML of the document with the
    updated references is returned.

    Parameters:
    * recid: the id of the record
    * uid: the id of the user (used to read the user's cached record)
    * txt: references in plain text, if already available
    * url: url of a document to extract the references from
    """
    # Parse references
    if txt is not None:
        references_xml = extract_references_from_string_xml(txt, is_only_references=True)
    elif url is not None:
        references_xml = extract_references_from_url_xml(url)
    else:
        references_xml = extract_references_from_record_xml(recid)
    references = create_record(references_xml)

    dummy1, dummy2, record, dummy3, dummy4, dummy5, dummy6 = get_cache_contents(recid, uid)
    out_xml = None

    references_to_add = record_get_field_instances(references[0], tag="999", ind1="C", ind2="5")
    refextract_status = record_get_field_instances(references[0], tag="999", ind1="C", ind2="6")

    if references_to_add:
        # Replace 999 fields
        record_delete_fields(record, "999")
        record_add_fields(record, "999", references_to_add)
        record_add_fields(record, "999", refextract_status)
        # Update record references
        out_xml = record_xml_output(record)

    return out_xml
Example #2
def crossref_normalize_name(record):
    """
    Changes the format of the authors' names (often given with initials) to the
    proper, unified one, using the bibauthor_name_utils tools
    @return: changed record
    """
    # pattern for removing the spaces between two initials
    pattern_initials = '([A-Z]\\.)\\s([A-Z]\\.)'
    # first, change the main author
    for field in record_get_field_instances(record, '100'):
        main_author = field[0][0][1]
        new_author = create_normalized_name(split_name_parts(main_author))
        # remove spaces between initials
        # two iterations are required
        for _ in range(2):
            new_author = re.sub(pattern_initials, r'\g<1>\g<2>', new_author)
        position = field[4]
        record_modify_subfield(rec=record, tag='100', subfield_code='a',
        value=new_author, subfield_position=0, field_position_global=position)

    # then, change additional authors
    for field in record_get_field_instances(record, '700'):
        author = field[0][0][1]
        new_author = create_normalized_name(split_name_parts(author))
        for _ in range(2):
            new_author = re.sub(pattern_initials, r'\g<1>\g<2>', new_author)
        position = field[4]
        record_modify_subfield(rec=record, tag='700', subfield_code='a',
            value=new_author, subfield_position=0, field_position_global=position)
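
A quick illustration (plain re module, nothing Invenio-specific) of why the substitution above is run twice: with three or more initials the matches overlap, so a single pass only collapses every other space. The sample name is made up.

import re

pattern_initials = '([A-Z]\\.)\\s([A-Z]\\.)'
name = 'Smith, A. B. C.'
once = re.sub(pattern_initials, r'\g<1>\g<2>', name)
# once  == 'Smith, A.B. C.'  -- the space before "C." survives the first pass
twice = re.sub(pattern_initials, r'\g<1>\g<2>', once)
# twice == 'Smith, A.B.C.'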
Example #3
def crossref_normalize_name(record):
    """
    Changes the format of the authors' names (often given with initials) to the
    proper, unified one, using the bibauthor_name_utils tools
    @return: changed record
    """
    # pattern for removing the spaces between two initials
    pattern_initials = '([A-Z]\\.)\\s([A-Z]\\.)'
    # first, change the main author
    for field in record_get_field_instances(record, '100'):
        main_author = field[0][0][1]
        new_author = create_normalized_name(split_name_parts(main_author))
        # remove spaces between initials
        # two iterations are required
        for _ in range(2):
            new_author = re.sub(pattern_initials, r'\g<1>\g<2>', new_author)
        position = field[4]
        record_modify_subfield(rec=record, tag='100', subfield_code='a',
        value=new_author, subfield_position=0, field_position_global=position)

    # then, change additional authors
    for field in record_get_field_instances(record, '700'):
        author = field[0][0][1]
        new_author = create_normalized_name(split_name_parts(author))
        for _ in range(2):
            new_author = re.sub(pattern_initials, r'\g<1>\g<2>', new_author)
        position = field[4]
        record_modify_subfield(rec=record, tag='700', subfield_code='a',
            value=new_author, subfield_position=0, field_position_global=position)
Example #4
def replace_references(recid):
    """Replace references for a record

    The record itself is not updated, the marc xml of the document with updated
    references is returned

    Parameters:
    * recid: the id of the record
    """
    # Parse references
    references_xml = extract_references_from_record_xml(recid)
    references = create_record(references_xml)
    # Record marc xml
    record = get_record(recid)

    if references[0]:
        fields_to_add = record_get_field_instances(references[0],
                                                   tag='999',
                                                   ind1='%',
                                                   ind2='%')
        # Replace 999 fields
        record_delete_fields(record, '999')
        record_add_fields(record, '999', fields_to_add)
        # Update record references
        out_xml = record_xml_output(record)
    else:
        out_xml = None

    return out_xml
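
A minimal usage sketch for this variant; the record id is hypothetical and a legacy Invenio installation (where the function above and the record's fulltext are available) is assumed.

# Regenerate the 999 reference fields of record 1234 (illustrative recid).
new_xml = replace_references(1234)
# new_xml is the updated MARCXML string, or None if no references were extracted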
Example #5
def translate_fieldvalues_from_latex(record, tag, code='', encoding='utf-8'):
    """
    Given a record and field tag, this function will modify the record by
    translating the subfield values of found fields from LaTeX to chosen
    encoding for all the subfields with given code (or all if no code is given).

    :param record: record to modify, in BibRec style structure
    :type record: dict

    :param tag: tag of fields to modify
    :type tag: string

    :param code: restrict the translation to a given subfield code
    :type code: string

    :param encoding: character encoding for the new value. Defaults to UTF-8.
    :type encoding: string
    """
    field_list = record_get_field_instances(record, tag)
    for field in field_list:
        subfields = field[0]
        subfield_index = 0
        for subfield_code, subfield_value in subfields:
            if code == '' or subfield_code == code:
                newvalue = translate_latex2unicode(subfield_value).encode(
                    encoding)
                record_modify_subfield(record,
                                       tag,
                                       subfield_code,
                                       newvalue,
                                       subfield_index,
                                       field_position_global=field[4])
            subfield_index += 1
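
A hedged usage sketch: translate LaTeX markup in every 245 $a subfield of one record. The recid is illustrative, and get_record is assumed to be the same record loader used in the other examples on this page.

rec = get_record(1234)                                   # hypothetical recid
translate_fieldvalues_from_latex(rec, '245', code='a')   # modifies rec in place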
Example #6
def format_element(bfo, limit, separator=' ; ', extension='[...]', print_links="yes"):
    """
    Prints the list of editors of a record.

    @param limit: the maximum number of editors to display
    @param separator: the separator between editors.
    @param extension: a text printed if more editors than 'limit' exist
    @param print_links: if yes, print the editors as HTML links to their publications
    """
    from urllib import quote
    from invenio.config import CFG_SITE_URL
    from invenio.legacy import bibrecord

    authors = bibrecord.record_get_field_instances(bfo.get_record(), '100')

    editors = [bibrecord.field_get_subfield_values(author, 'a')[0]
               for author in authors if len(bibrecord.field_get_subfield_values(author, "e")) > 0 and bibrecord.field_get_subfield_values(author, "e")[0]=="ed." ]

    if print_links.lower() == "yes":
        editors = ['<a href="' + CFG_SITE_URL + '/search?f=author&p=' + \
                   quote(editor) + \
                   '&amp;ln='+ bfo.lang + \
                   '">' + editor + '</a>'
                   for editor in editors]

    if limit.isdigit() and len(editors) > int(limit):
        return separator.join(editors[:int(limit)]) + extension

    elif len(editors) > 0:
        return separator.join(editors)
Example #7
def translate_fieldvalues_from_latex(record, tag, code='', encoding='utf-8'):
    """
    Given a record and field tag, this function will modify the record by
    translating the subfield values of found fields from LaTeX to chosen
    encoding for all the subfields with given code (or all if no code is given).

    :param record: record to modify, in BibRec style structure
    :type record: dict

    :param tag: tag of fields to modify
    :type tag: string

    :param code: restrict the translation to a given subfield code
    :type code: string

    :param encoding: character encoding for the new value. Defaults to UTF-8.
    :type encoding: string
    """
    field_list = record_get_field_instances(record, tag)
    for field in field_list:
        subfields = field[0]
        subfield_index = 0
        for subfield_code, subfield_value in subfields:
            if code == '' or subfield_code == code:
                newvalue = translate_latex2unicode(
                    subfield_value
                ).encode(encoding)
                record_modify_subfield(record, tag, subfield_code, newvalue,
                                       subfield_index,
                                       field_position_global=field[4])
            subfield_index += 1
Example #8
def record_get_value_with_provenence(record, provenence_value, provenence_code,
                                     tag, ind1=" ", ind2=" ", code=""):
    """
    Retrieves the value of the given field(s) with given provenence code/value
    combo.

    For example:

    If one would like to extract all subject categories (65017 $a) with a given
    provenence, in this case "arXiv" in $9:

    65017 $ahep-ph$9arXiv
    65017 $ahep-th$9arXiv
    65017 $aMath$9INSPIRE

    this function would return ["hep-ph", "hep-th"]

    Returns a list of subfield values.
    """
    fields = record_get_field_instances(record, tag, ind1, ind2)
    final_values = []
    for subfields, dummy1, dummy2, dummy3, dummy4 in fields:
        for subfield_code, value in subfields:
            if subfield_code == provenence_code and value == provenence_value:
                # We have a hit. Stop to look for right value
                break
        else:
            # No hits.. continue to next field
            continue
        for subfield_code, value in subfields:
            if subfield_code == code:
                # This is the value we are looking for with the correct provenence
                final_values.append(value)
    return final_values
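
A small self-contained sketch reproducing the docstring example above; it assumes the legacy bibrecord import path shown elsewhere on this page, and the MARCXML snippet is made up.

from invenio.legacy.bibrecord import create_record

marcxml = """<record>
  <datafield tag="650" ind1="1" ind2="7">
    <subfield code="a">hep-ph</subfield>
    <subfield code="9">arXiv</subfield>
  </datafield>
  <datafield tag="650" ind1="1" ind2="7">
    <subfield code="a">Math</subfield>
    <subfield code="9">INSPIRE</subfield>
  </datafield>
</record>"""
rec = create_record(marcxml)[0]
values = record_get_value_with_provenence(rec, "arXiv", "9",
                                          tag="650", ind1="1", ind2="7", code="a")
# values == ["hep-ph"]  -- only the arXiv-provenance subject is returned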
Example #9
def format_element(bfo, limit, separator=' ; ', extension='[...]', print_links="yes"):
    """
    Prints the list of editors of a record.

    @param limit: the maximum number of editors to display
    @param separator: the separator between editors.
    @param extension: a text printed if more editors than 'limit' exist
    @param print_links: if yes, print the editors as HTML links to their publications
    """
    from urllib import quote
    from invenio.config import CFG_BASE_URL
    from invenio.legacy import bibrecord

    authors = bibrecord.record_get_field_instances(bfo.get_record(), '100')

    editors = [bibrecord.field_get_subfield_values(author, 'a')[0]
               for author in authors if len(bibrecord.field_get_subfield_values(author, "e")) > 0 and bibrecord.field_get_subfield_values(author, "e")[0]=="ed." ]

    if print_links.lower() == "yes":
        editors = ['<a href="' + CFG_BASE_URL + '/search?f=author&p=' + \
                   quote(editor) + \
                   '&amp;ln='+ bfo.lang + \
                   '">' + editor + '</a>'
                   for editor in editors]

    if limit.isdigit() and len(editors) > int(limit):
        return separator.join(editors[:int(limit)]) + extension

    elif len(editors) > 0:
        return separator.join(editors)
Example #10
def _create_ticket(recid, bibcatalog_system, queue):
    subject = "Refs for #%s" % recid

    if CFG_INSPIRE_SITE:
        # Add the report number to the subject
        report_number = ""
        record = get_bibrecord(recid)

        in_core = False
        for collection_tag in record_get_field_instances(record, "980"):
            for collection in field_get_subfield_values(collection_tag, 'a'):
                if collection == 'CORE':
                    in_core = True
                if collection == 'arXiv':
                    # Do not create tickets for arXiv papers
                    # Tickets for arXiv papers are created in bibcatalog
                    write_message("arXiv paper", verbose=1)
                    return

        # Only create tickets for HEP
        if not in_core:
            write_message("not in hep", verbose=1)
            return

        # Do not create tickets for old records
        creation_date = run_sql(
            """SELECT creation_date FROM bibrec
                                   WHERE id = %s""", [recid])[0][0]
        if creation_date < datetime.now() - timedelta(days=30 * 4):
            return

        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, 'c'):
                if category.startswith('astro-ph'):
                    write_message("astro-ph", verbose=1)
                    # We do not curate astro-ph
                    return

            for report_number in field_get_subfield_values(report_tag, 'a'):
                subject += " " + report_number
                break

    text = '%s/record/edit/#state=edit&recid=%s' % (CFG_SITE_SECURE_URL, recid)
    bibcatalog_system.ticket_submit(subject=subject,
                                    queue=queue,
                                    text=text,
                                    recordid=recid)
Example #11
def _create_ticket(recid, bibcatalog_system, queue):
    subject = "Refs for #%s" % recid

    if CFG_INSPIRE_SITE:
        # Add the report number to the subject
        report_number = ""
        record = get_bibrecord(recid)

        in_core = False
        for collection_tag in record_get_field_instances(record, "980"):
            for collection in field_get_subfield_values(collection_tag, 'a'):
                if collection == 'CORE':
                    in_core = True
                if collection == 'arXiv':
                    # Do not create tickets for arXiv papers
                    # Tickets for arXiv papers are created in bibcatalog
                    write_message("arXiv paper", verbose=1)
                    return

        # Only create tickets for HEP
        if not in_core:
            write_message("not in hep", verbose=1)
            return

        # Do not create tickets for old records
        creation_date = run_sql("""SELECT creation_date FROM bibrec
                                   WHERE id = %s""", [recid])[0][0]
        if creation_date < datetime.now() - timedelta(days=30*4):
            return

        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, 'c'):
                if category.startswith('astro-ph'):
                    write_message("astro-ph", verbose=1)
                    # We do not curate astro-ph
                    return

            for report_number in field_get_subfield_values(report_tag, 'a'):
                subject += " " + report_number
                break

    text = '%s/record/edit/#state=edit&recid=%s' % (CFG_SITE_SECURE_URL,
                                                    recid)
    bibcatalog_system.ticket_submit(subject=subject,
                                    queue=queue,
                                    text=text,
                                    recordid=recid)
Example #12
    def check_arxiv(recid):
        record = get_record(recid)

        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, 'a'):
                if category.startswith('arXiv'):
                    return True
        return False
Example #13
    def check_arxiv(recid):
        record = get_record(recid)

        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, 'a'):
                if category.startswith('arXiv'):
                    return True
        return False
Example #14
def record_in_collection(record, collection):
    """
    Returns True/False if given record is in a given collection (980__a).
    """
    for collection_tag in record_get_field_instances(record, "980"):
        for coll in field_get_subfield_values(collection_tag, 'a'):
            if coll.lower() == collection.lower():
                return True
    return False
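
An illustrative membership check, mirroring the CORE/HEP tests used by the ticket-creation examples on this page; the recid is hypothetical.

rec = get_record(1234)                     # hypothetical recid
in_hep = record_in_collection(rec, 'HEP')  # True if any 980__a equals "HEP" (case-insensitive)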
Example #15
def record_in_collection(record, collection):
    """
    Returns True/False if given record is in a given collection (980__a).
    """
    for collection_tag in record_get_field_instances(record, "980"):
        for coll in field_get_subfield_values(collection_tag, 'a'):
            if coll.lower() == collection.lower():
                return True
    return False
Example #16
def rule_change_conf_num(header, record):
    substitutes = {
        "C78-09-18xxx": "C78-09-18.2"
    }
    for field in record_get_field_instances(record, '773'):
        for idx, (code, value) in enumerate(field[0]):
            if code == 'w' and value in substitutes.keys():
                field[0][idx] = ('w', substitutes[value])
    return record
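
A hedged, self-contained sketch of the rule above: a tiny hand-built record whose 773 $w carries the obsolete conference number is rewritten in place. The import path is the legacy one used elsewhere on this page; the header argument is not used by the rule, so None is passed.

from invenio.legacy.bibrecord import create_record

marcxml = """<record>
  <datafield tag="773" ind1=" " ind2=" ">
    <subfield code="w">C78-09-18xxx</subfield>
  </datafield>
</record>"""
rec = create_record(marcxml)[0]
rec = rule_change_conf_num(None, rec)
# the 773 field now carries ('w', 'C78-09-18.2')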
Example #17
def replace_references(recid, uid=None, txt=None, url=None):
    """Replace references for a record

    The record itself is not updated; the MARC XML of the document with the
    updated references is returned.

    Parameters:
    * recid: the id of the record
    * uid: the id of the user (used to read the user's cached record)
    * txt: references in plain text, if already available
    * url: url of a document to extract the references from
    """
    # Parse references
    if txt is not None:
        references_xml = extract_references_from_string_xml(
            txt, is_only_references=True)
    elif url is not None:
        references_xml = extract_references_from_url_xml(url)
    else:
        references_xml = extract_references_from_record_xml(recid)
    references = create_record(references_xml)

    dummy1, dummy2, record, dummy3, dummy4, dummy5, dummy6 = get_cache_contents(
        recid, uid)
    out_xml = None

    references_to_add = record_get_field_instances(references[0],
                                                   tag='999',
                                                   ind1='C',
                                                   ind2='5')
    refextract_status = record_get_field_instances(references[0],
                                                   tag='999',
                                                   ind1='C',
                                                   ind2='6')

    if references_to_add:
        # Replace 999 fields
        record_delete_fields(record, '999')
        record_add_fields(record, '999', references_to_add)
        record_add_fields(record, '999', refextract_status)
        # Update record references
        out_xml = record_xml_output(record)

    return out_xml
Example #18
def record_find_matching_fields(key,
                                rec,
                                tag="",
                                ind1=" ",
                                ind2=" ",
                                exact_match=False):
    """
    This utility function will look for any field values containing (or equal
    to, if an exact match is wanted) the given keyword string. The found fields will be
    returned as a list of field instances per tag. The fields to search can be
    narrowed down to tag/indicator level.

    @param key: keyword to search for
    @type key: string

    @param rec: a record structure as returned by bibrecord.create_record()
    @type rec: dict

    @param tag: a 3 characters long string
    @type tag: string

    @param ind1: a 1 character long string
    @type ind1: string

    @param ind2: a 1 character long string
    @type ind2: string

    @return: a list of found fields in a tuple per tag: (tag, field_instances) where
        field_instances is a list of (subfields, ind1, ind2, value, field_position_global)
        and subfields is a list of (code, value)
    @rtype: list
    """
    if not tag:
        all_field_instances = rec.items()
    else:
        all_field_instances = [
            (tag, record_get_field_instances(rec, tag, ind1, ind2))
        ]
    matching_field_instances = []
    for current_tag, field_instances in all_field_instances:
        found_fields = []
        for field_instance in field_instances:
            # Get values to match: controlfield_value + subfield values
            values_to_match = [field_instance[3]] + \
                              [val for dummy_code, val in field_instance[0]]
            if exact_match and key in values_to_match:
                found_fields.append(field_instance)
            else:
                for value in values_to_match:
                    if value.find(key) > -1:
                        found_fields.append(field_instance)
                        break
        if len(found_fields) > 0:
            matching_field_instances.append((current_tag, found_fields))
    return matching_field_instances
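
A hypothetical lookup using the helper above: find every 999C5 reference field of a record that mentions a given report number. The recid, the search key, and the use of get_record as the loader are all illustrative.

rec = get_record(1234)
hits = record_find_matching_fields("CERN-TH-2002-042", rec,
                                   tag="999", ind1="C", ind2="5")
# hits is a list of (tag, field_instances) tuples; here at most one entry, for tag "999"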
Example #19
def _get_record_linking_fields(recid_b, recid_a, tag, ind1, ind2):
    """
    Returns the fields (defined by tag, ind1, ind2) in record (given
    by recid_b) that do not link to another given record (recid_a).
    """
    fields = []
    rec = create_record(format_record(recid_b, "xm"))[0]
    for field_instance in record_get_field_instances(rec, tag=tag, ind1=ind1, ind2=ind2):
        if not ('w', str(recid_a)) in field_instance[0]:
            fields.append(field_instance)
    return fields
Example #20
def rule_create_fft(header, record):
    for field in record_get_field_instances(record, '856', ind1='4'):
        url = None
        for code, value in field_get_subfield_instances(field):
            if code == 'u':
                url = value
                break
        if url:
            subs = [('a', url), ('t', 'INSPIRE-PUBLIC'), ('d', 'Fulltext')]
            record_add_field(record, 'FFT', subfields=subs)
    return record
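
A hedged, self-contained sketch of this rule: one 856 4_ field with a $u URL is turned into an FFT field pointing at the fulltext. The URL is made up and the import path is the legacy one used elsewhere on this page.

from invenio.legacy.bibrecord import create_record

marcxml = """<record>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="u">http://example.org/fulltext.pdf</subfield>
  </datafield>
</record>"""
rec = create_record(marcxml)[0]
rec = rule_create_fft(None, rec)   # the header argument is not used by the rule
# rec now contains an FFT field with the a/t/d subfields built above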
Example #21
    def process_record(self, record):
        """@see: BaseFieldCommand.process_record"""

        # if the tag is empty, we don't make any changes
        if self._tag == "" or self._tag == None:
            return

        matching_field_instances = \
            bibrecord.record_get_field_instances(record, self._tag,
                                                 self._ind1, self._ind2)
        for current_field in matching_field_instances:
            self._apply_subfield_commands_to_field(record, current_field[4])
Example #22
def create_ticket(recid, bibcatalog_system, queue=CFG_REFEXTRACT_TICKET_QUEUE):
    write_message('bibcatalog_system %s' % bibcatalog_system, verbose=1)
    write_message('queue %s' % queue, verbose=1)
    if bibcatalog_system and queue:

        subject = "Refs for #%s" % recid

        # Add the report number to the subject
        report_number = ""
        record = get_bibrecord(recid)

        in_hep = False
        for collection_tag in record_get_field_instances(record, "980"):
            for collection in field_get_subfield_values(collection_tag, 'a'):
                if collection == 'HEP':
                    in_hep = True

        # Only create tickets for HEP
        if not in_hep:
            write_message("not in hep", verbose=1)
            return

        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, 'c'):
                if category.startswith('astro-ph'):
                    write_message("astro-ph", verbose=1)
                    # We do not curate astro-ph
                    return

            for report_number in field_get_subfield_values(report_tag, 'a'):
                subject += " " + report_number
                break

        text = '%s/record/edit/#state=edit&recid=%s' % (CFG_SITE_SECURE_URL, \
                                                        recid)
        bibcatalog_system.ticket_submit(subject=subject,
                                        queue=queue,
                                        text=text,
                                        recordid=recid)
Example #23
def _get_record_linking_fields(recid_b, recid_a, tag, ind1, ind2):
    """
    Returns the fields (defined by tag, ind1, ind2) in record (given
    by recid_b) that do not link to another given record (recid_a).
    """
    fields = []
    rec = create_record(format_record(recid_b, "xm"))[0]
    for field_instance in record_get_field_instances(rec,
                                                     tag=tag,
                                                     ind1=ind1,
                                                     ind2=ind2):
        if not ('w', str(recid_a)) in field_instance[0]:
            fields.append(field_instance)
    return fields
Example #24
def record_find_matching_fields(key, rec, tag="", ind1=" ", ind2=" ",
                                exact_match=False):
    """
    This utility function will look for any field values containing (or equal
    to, if an exact match is wanted) the given keyword string. The found fields will be
    returned as a list of field instances per tag. The fields to search can be
    narrowed down to tag/indicator level.

    @param key: keyword to search for
    @type key: string

    @param rec: a record structure as returned by bibrecord.create_record()
    @type rec: dict

    @param tag: a 3 characters long string
    @type tag: string

    @param ind1: a 1 character long string
    @type ind1: string

    @param ind2: a 1 character long string
    @type ind2: string

    @return: a list of found fields in a tuple per tag: (tag, field_instances) where
        field_instances is a list of (subfields, ind1, ind2, value, field_position_global)
        and subfields is a list of (code, value)
    @rtype: list
    """
    if not tag:
        all_field_instances = rec.items()
    else:
        all_field_instances = [(tag, record_get_field_instances(rec, tag, ind1, ind2))]
    matching_field_instances = []
    for current_tag, field_instances in all_field_instances:
        found_fields = []
        for field_instance in field_instances:
            # Get values to match: controlfield_value + subfield values
            values_to_match = [field_instance[3]] + \
                              [val for dummy_code, val in field_instance[0]]
            if exact_match and key in values_to_match:
                found_fields.append(field_instance)
            else:
                for value in values_to_match:
                    if value.find(key) > -1:
                        found_fields.append(field_instance)
                        break
        if len(found_fields) > 0:
            matching_field_instances.append((current_tag, found_fields))
    return matching_field_instances
Example #25
def crossref_translate_title(record):
    """
    Convert the record's title to the Inspire specific abbreviation
    of the title (using JOURNALS knowledge base)
    @return: changed record
    """
    # probably there is only one 773 field
    # but just in case let's treat it as a list
    for field in record_get_field_instances(record, '773'):
        title = field[0][0][1]
        new_title = get_kbr_values("JOURNALS", title, searchtype='e')
        if new_title:
            # returned value is a list, and we need only the first value
            new_title = new_title[0][0]
            position = field[4]
            record_modify_subfield(rec=record, tag='773', subfield_code='p', \
            value=new_title, subfield_position=0, field_position_global=position)
Example #26
def crossref_translate_title(record):
    """
    Convert the record's title to the Inspire specific abbreviation
    of the title (using JOURNALS knowledge base)
    @return: changed record
    """
    # probably there is only one 773 field
    # but just in case let's treat it as a list
    for field in record_get_field_instances(record, '773'):
        title = field[0][0][1]
        new_title = get_kbr_values("JOURNALS", title, searchtype='e')
        if new_title:
            # returned value is a list, and we need only the first value
            new_title = new_title[0][0]
            position = field[4]
            record_modify_subfield(rec=record, tag='773', subfield_code='p',
                                   value=new_title, subfield_position=0,
                                   field_position_global=position)
Example #27
def references_nb_counts():
    """Get number of references for the record `recid`."""
    recid = request.view_args.get("recid")
    if recid is None:
        return

    from invenio.legacy.bibrecord import record_get_field_instances
    from invenio.modules.search.models import Field
    from invenio.modules.records.api import get_record

    if not CFG_CERN_SITE:
        reftag = ""
        reftags = list(Field.get_field_tags("reference"))
        if reftags:
            reftag = reftags[0]
        tmprec = get_record(recid)
        if reftag and len(reftag) > 4:
            return len(record_get_field_instances(tmprec, reftag[0:3], reftag[3], reftag[4]))
    return 0
Example #28
def get_record_provenance(recid):
    """
    Return the provenance XML representation of a record, suitable to be put
    in the about tag.
    """
    record = get_record(recid)
    provenances = record_get_field_instances(
        record, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3],
        CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3],
        CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4])
    out = ""
    for provenance in provenances:
        base_url = identifier = datestamp = metadata_namespace = origin_description = harvest_date = altered = ""
        for (code, value) in provenance[0]:
            if code == CFG_OAI_PROVENANCE_BASEURL_SUBFIELD:
                base_url = value
            elif code == CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5]:
                identifier = value
            elif code == CFG_OAI_PROVENANCE_DATESTAMP_SUBFIELD:
                datestamp = value
            elif code == CFG_OAI_PROVENANCE_METADATANAMESPACE_SUBFIELD:
                metadata_namespace = value
            elif code == CFG_OAI_PROVENANCE_ORIGINDESCRIPTION_SUBFIELD:
                origin_description = value
            elif code == CFG_OAI_PROVENANCE_HARVESTDATE_SUBFIELD:
                harvest_date = value
            elif code == CFG_OAI_PROVENANCE_ALTERED_SUBFIELD:
                altered = value
        if base_url:
            out += """<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">"""
            out += X.originDescription(
                harvestDate=harvest_date, altered=altered)(
                    X.baseURL()(base_url),
                    X.identifier()(identifier),
                    X.datestamp()(datestamp),
                    X.metadataNamespace()(metadata_namespace),
                    origin_description
                    and X.originDescription(origin_description)
                    or ''  ## This is already XML
                )
            out += """</provenance>"""
    return out
Example #29
def get_record_provenance(recid):
    """
    Return the provenance XML representation of a record, suitable to be put
    in the about tag.
    """
    record = get_record(recid)
    provenances = record_get_field_instances(
        record,
        CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3],
        CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3],
        CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4],
    )
    out = ""
    for provenance in provenances:
        base_url = identifier = datestamp = metadata_namespace = origin_description = harvest_date = altered = ""
        for (code, value) in provenance[0]:
            if code == CFG_OAI_PROVENANCE_BASEURL_SUBFIELD:
                base_url = value
            elif code == CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5]:
                identifier = value
            elif code == CFG_OAI_PROVENANCE_DATESTAMP_SUBFIELD:
                datestamp = value
            elif code == CFG_OAI_PROVENANCE_METADATANAMESPACE_SUBFIELD:
                metadata_namespace = value
            elif code == CFG_OAI_PROVENANCE_ORIGINDESCRIPTION_SUBFIELD:
                origin_description = value
            elif code == CFG_OAI_PROVENANCE_HARVESTDATE_SUBFIELD:
                harvest_date = value
            elif code == CFG_OAI_PROVENANCE_ALTERED_SUBFIELD:
                altered = value
        if base_url:
            out += """<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">"""
            out += X.originDescription(harvestDate=harvest_date, altered=altered)(
                X.baseURL()(base_url),
                X.identifier()(identifier),
                X.datestamp()(datestamp),
                X.metadataNamespace()(metadata_namespace),
                origin_description and X.originDescription(origin_description) or "",  ## This is already XML
            )
            out += """</provenance>"""
    return out
Example #30
def references_nb_counts():
    """Get number of references for the record `recid`."""
    recid = request.view_args.get('recid')
    if recid is None:
        return

    from invenio.legacy.bibrecord import record_get_field_instances
    from invenio.modules.search.models import Field
    from invenio.modules.records.api import get_record

    if not CFG_CERN_SITE:
        reftag = ""
        reftags = list(Field.get_field_tags("reference"))
        if reftags:
            reftag = reftags[0]
        tmprec = get_record(recid)
        if reftag and len(reftag) > 4:
            return len(
                record_get_field_instances(tmprec, reftag[0:3], reftag[3],
                                           reftag[4]))
    return 0
Example #31
def tarballs_by_recids(recids, sdir):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids.

    @param: recids (string): the record id or ids
    @param: sdir (string): where the tarballs should live

    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    list_of_ids = []

    if ',' in recids:
        recids = recids.split(',')
        for recid in recids:
            if '-' in recid:
                low, high = recid.split('-')
                recid = range(int(low), int(high))
                list_of_ids.extend(recid)
            else:
                recid = int(recid)
                list_of_ids.append(recid)

    else:
        if '-' in recids:
            low, high = recids.split('-')
            list_of_ids = range(int(low), int(high))
        else:
            list_of_ids = [int(recids)]

    arXiv_ids = []

    for recid in list_of_ids:
        rec = get_record(recid)
        for afieldinstance in record_get_field_instances(rec, tag='037'):
            if 'arXiv' == field_get_subfield_values(afieldinstance, '9')[0]:
                arXiv_id = field_get_subfield_values(afieldinstance, 'a')[0]
                arXiv_ids.append(arXiv_id)

    return tarballs_by_arXiv_id(arXiv_ids, sdir)
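
A hedged usage sketch: the recid argument is a plain string that may mix single ids and ranges, exactly as the parsing above expects. The ids and the scratch directory are made up.

tarballs = tarballs_by_recids('1234,1240-1245', '/tmp/arxiv-tarballs')
# tarballs is the list of local tarball paths returned by tarballs_by_arXiv_id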
Example #32
def record_get_value_with_provenence(record,
                                     provenence_value,
                                     provenence_code,
                                     tag,
                                     ind1=" ",
                                     ind2=" ",
                                     code=""):
    """
    Retrieves the value of the given field(s) with given provenence code/value
    combo.

    For example:

    If one would like to extract all subject categories (65017 $a) with a given
    provenence, in this case "arXiv" in $9:

    65017 $ahep-ph$9arXiv
    65017 $ahep-th$9arXiv
    65017 $aMath$9INSPIRE

    this function would return ["hep-ph", "hep-th"]

    Returns a list of subfield values.
    """
    fields = record_get_field_instances(record, tag, ind1, ind2)
    final_values = []
    for subfields, dummy1, dummy2, dummy3, dummy4 in fields:
        for subfield_code, value in subfields:
            if subfield_code == provenence_code and value == provenence_value:
                # We have a hit. Stop to look for right value
                break
        else:
            # No hits.. continue to next field
            continue
        for subfield_code, value in subfields:
            if subfield_code == code:
                # This is the value we are looking for with the correct provenence
                final_values.append(value)
    return final_values
Example #33
    def tokenize(self, recID):
        phrases = []
        try:
            rec = get_record(recID)

            for rule in self.rules:
                tag_to_index, necessary_tag, necessary_value = rule
                core_tag = tag_to_index[0:3]
                ind = tag_to_index[3:5]
                sub_tag = tag_to_index[5]

                fields = [dict(instance[0]) for instance in record_get_field_instances(rec, core_tag, ind[0], ind[1])]
                for field in fields:
                    tag_condition = necessary_tag and field.has_key(necessary_tag) or necessary_tag == ''
                    value_condition = necessary_value and field.get(necessary_tag, '') == necessary_value or \
                                      necessary_value == ''
                    if tag_condition and field.has_key(sub_tag) and value_condition:
                        phrases.append(field[sub_tag])
            return phrases
        except KeyError:
            return []
        return phrases
Example #34
    def find_modified_tags(self, common_tags, record1, record2):
        """
        For each tag common to Record1 and Record2, checks for modifications
        at field-level, indicator-level and subfield-level.

        Returns a dictionary of tags and corresponding fields from Record1
        that have been found to have been modified.
        """

        result = {}
        for tag in common_tags:
            # retrieve tag instances of record1 and record2
            rec1_tag_val = record_get_field_instances(record1, tag, '%', '%')
            rec2_tag_val = record_get_field_instances(record2, tag, '%', '%')
            if rec1_tag_val:
                rec1_ind = self.group_tag_values_by_indicator(rec1_tag_val)
            if rec2_tag_val:
                rec2_ind = self.group_tag_values_by_indicator(rec2_tag_val)

            # NOTE: At this point rec1_ind and rec2_ind will be dictionary
            # Key ==> (ind1, ind2) tuple
            # Val ==> list of data_tuple => [dt1,dt2]
            # dt(n) => ([sfl],ind1,ind2,ctrlfield,fn)

            # Generating 3 different dictionaries
            # common/added/deleted ind pairs in record1 based on record2
            (com_ind, add_ind, del_ind) = self.compare_tags_by_ind(rec1_ind, rec2_ind)

            if add_ind:
                for ind_pair in add_ind:
                    for data_tuple in add_ind[ind_pair]:
                        subfield_list = data_tuple[0]
                        record_add_field(result, tag, ind_pair[0], ind_pair[1], '', subfields=subfield_list)

            # Indicators that are deleted from record1 w.r.t record2 will be added with special code
            if del_ind:
                for ind_pair in del_ind:
                    record_add_field(result, tag, ind_pair[0], ind_pair[1], '', [(CFG_BIBUPLOAD_DELETE_CODE, CFG_BIBUPLOAD_DELETE_VALUE)])

            # Common modified fields. Identifying changes at subfield level
            if com_ind:
                for ind_pair in com_ind:
                    # NOTE: sf_rec1 and sf_rec2 are list of list of subfields
                    # A simple list comparison is sufficient in this scenario
                    # Any change in the order of fields or changes in subfields
                    # will cause the entire list of data_tuple for that ind_pair
                    # to be copied from record1(upload) to result.
                    if tag in CFG_BIBUPLOAD_CONTROLFIELD_TAGS:
                        cf_rec1 = [data_tuple[3] for data_tuple in rec1_ind[ind_pair]]
                        cf_rec2 = [data_tuple[3] for data_tuple in rec2_ind[ind_pair]]
                        if cf_rec1 != cf_rec2:
                            for data_tuple in com_ind[ind_pair]:
                                record_add_field(result, tag, controlfield_value=data_tuple[3])
                    else:
                        sf_rec1 = [data_tuple[0] for data_tuple in rec1_ind[ind_pair]]
                        sf_rec2 = [data_tuple[0] for data_tuple in rec2_ind[ind_pair]]
                        if sf_rec1 != sf_rec2:
                            # change at subfield level / re-ordered fields
                            for data_tuple in com_ind[ind_pair]:
                                # com_ind will have data_tuples of record1(upload) and not record2
                                subfield_list = data_tuple[0]
                                record_add_field(result, tag, ind_pair[0], ind_pair[1], '', subfields=subfield_list)

        return result
Example #35
def merge_field_group(rec1, rec2, fnum, ind1="", ind2="", merge_conflicting_fields=False):
    """Merges non-conflicting fields from 'rec2' to 'rec1' for a specific tag.
    the second record.
    @param rec1: First record (a record dictionary structure)
    @param rec2: Second record (a record dictionary structure)
    @param fnum: a 3 characters long string indicating field tag number
    @param ind1: a 1 character long string
    @param ind2: a 1 character long string
    @param merge_conflicting_fields: whether to merge conflicting fields or not
    """
    ### Check if merging goes for all indicators and set a boolean
    merging_all_indicators = not ind1 and not ind2

    ### check if there is no field in rec2 to be merged in rec1
    if not record_has_field(rec2, fnum):
        return

    ### get fields of rec2
    if merging_all_indicators:
        fields2 = record_get_field_instances(rec2, fnum, "%", "%")
    else:
        fields2 = record_get_field_instances(rec2, fnum, ind1, ind2)
    if len(fields2) == 0:
        return

    ### check if field in rec1 doesn't even exist
    if not record_has_field(rec1, fnum):
        record_add_fields(rec1, fnum, fields2)
        return

    ### compare the fields, get diffs for given indicators
    alldiffs = record_field_diff(rec1[fnum], rec2[fnum], fnum, match_subfields, ind1, ind2)

    ### check if fields are the same
    if alldiffs is None:
        return  # nothing to merge

    ### find the diffing for the fields of the given indicators

    alldiffs = alldiffs[1]  # keep only the list of diffs by indicators (without the 'c')

    if merging_all_indicators:
        # combine the diffs for each indicator to one list
        diff = _combine_diffs(alldiffs)
    else:  # diffing for one indicator
        for diff in alldiffs:  # look for indicator pair in diff result
            if diff[0] == (ind1, ind2):
                break
        else:
            raise Exception, "Indicators not in diff result."
        diff = diff[1]  # keep only the list of diffs (without the indicator tuple)

    ### proceed to merging fields in a new field list
    fields1, fields2 = rec1[fnum], rec2[fnum]
    new_fields = []
    if merge_conflicting_fields == False:  # merge non-conflicting fields
        for m in diff:  # for every match of fields in the diff
            if m[0] is not None:  # if rec1 has a field in the diff, keep it
                new_fields.append(deepcopy(fields1[m[0]]))
            else:  # else take the field from rec2
                new_fields.append(deepcopy(fields2[m[1]]))
    else:  # merge all fields
        for m in diff:  # for every match of fields in the diff
            if m[1] is not None:  # if rec2 has a field, add it
                new_fields.append(deepcopy(fields2[m[1]]))
                if m[0] is not None and fields1[m[0]][0] != fields2[m[1]][0]:
                    # if the fields are not the same then add the field of rec1
                    new_fields.append(deepcopy(fields1[m[0]]))
            else:
                new_fields.append(deepcopy(fields1[m[0]]))

    ### delete existing fields
    record_delete_field(rec1, fnum, ind1, ind2)
    ## find where the new_fields should be inserted in rec1 (insert_index)
    if merging_all_indicators:
        insert_index = 0
    else:
        insert_index = None
        ind_pair = (ind1, ind2)
        first_last_dict = _first_and_last_index_for_each_indicator(rec1.get(fnum, []))
        # find the indicator pair which is just before the one which will be inserted
        indicators = first_last_dict.keys()
        indicators.sort()
        ind_pair_before = None
        for pair in indicators:
            if pair > ind_pair:
                break
            else:
                ind_pair_before = pair
        if ind_pair_before is None:  # if no smaller indicator pair exists
            insert_index = 0  # insertion will take place at the beginning
        else:  # else insert after the last field index of the previous indicator pair
            insert_index = first_last_dict[ind_pair_before][1] + 1

    ### add the new (merged) fields in correct 'in_field_index' position
    record_add_fields(rec1, fnum, new_fields, insert_index)
    return
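
A hedged usage sketch showing the call shape only (the two records are tiny hand-built structures; the exact merge outcome depends on the record_field_diff helpers this function relies on, which are not shown on this page).

from invenio.legacy.bibrecord import create_record

rec1 = create_record("""<record>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="a">Smith, J.</subfield>
  </datafield>
</record>""")[0]
rec2 = create_record("""<record>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="a">Doe, A.</subfield>
  </datafield>
</record>""")[0]

# Merge the 700 field group of rec2 into rec1 across all indicator pairs.
merge_field_group(rec1, rec2, '700')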
Example #36
def merge_field_group(rec1,
                      rec2,
                      fnum,
                      ind1='',
                      ind2='',
                      merge_conflicting_fields=False):
    """Merges non-conflicting fields from 'rec2' to 'rec1' for a specific tag.
    the second record.
    @param rec1: First record (a record dictionary structure)
    @param rec2: Second record (a record dictionary structure)
    @param fnum: a 3 characters long string indicating field tag number
    @param ind1: a 1 character long string
    @param ind2: a 1 character long string
    @param merge_conflicting_fields: whether to merge conflicting fields or not
    """
    ### Check if merging goes for all indicators and set a boolean
    merging_all_indicators = not ind1 and not ind2

    ### check if there is no field in rec2 to be merged in rec1
    if not record_has_field(rec2, fnum):
        return

    ### get fields of rec2
    if merging_all_indicators:
        fields2 = record_get_field_instances(rec2, fnum, '%', '%')
    else:
        fields2 = record_get_field_instances(rec2, fnum, ind1, ind2)
    if len(fields2) == 0:
        return

    ### check if field in rec1 doesn't even exist
    if not record_has_field(rec1, fnum):
        record_add_fields(rec1, fnum, fields2)
        return

    ### compare the fields, get diffs for given indicators
    alldiffs = record_field_diff(rec1[fnum], rec2[fnum], fnum, match_subfields,
                                 ind1, ind2)

    ### check if fields are the same
    if alldiffs is None:
        return  #nothing to merge

    ### find the diffing for the fields of the given indicators

    alldiffs = alldiffs[
        1]  #keep only the list of diffs by indicators (without the 'c')

    if merging_all_indicators:
        #combine the diffs for each indicator to one list
        diff = _combine_diffs(alldiffs)
    else:  #diffing for one indicator
        for diff in alldiffs:  #look for indicator pair in diff result
            if diff[0] == (ind1, ind2):
                break
        else:
            raise Exception, "Indicators not in diff result."
        diff = diff[
            1]  #keep only the list of diffs (without the indicator tuple)

    ### proceed to merging fields in a new field list
    fields1, fields2 = rec1[fnum], rec2[fnum]
    new_fields = []
    if merge_conflicting_fields == False:  #merge non-conflicting fields
        for m in diff:  #for every match of fields in the diff
            if m[0] is not None:  #if rec1 has a field in the diff, keep it
                new_fields.append(deepcopy(fields1[m[0]]))
            else:  #else take the field from rec2
                new_fields.append(deepcopy(fields2[m[1]]))
    else:  #merge all fields
        for m in diff:  #for every match of fields in the diff
            if m[1] is not None:  #if rec2 has a field, add it
                new_fields.append(deepcopy(fields2[m[1]]))
                if m[0] is not None and fields1[m[0]][0] != fields2[m[1]][0]:
                    #if the fields are not the same then add the field of rec1
                    new_fields.append(deepcopy(fields1[m[0]]))
            else:
                new_fields.append(deepcopy(fields1[m[0]]))

    ### delete existing fields
    record_delete_field(rec1, fnum, ind1, ind2)
    ## find where the new_fields should be inserted in rec1 (insert_index)
    if merging_all_indicators:
        insert_index = 0
    else:
        insert_index = None
        ind_pair = (ind1, ind2)
        first_last_dict = _first_and_last_index_for_each_indicator(
            rec1.get(fnum, []))
        #find the indicator pair which is just before the one which will be inserted
        indicators = first_last_dict.keys()
        indicators.sort()
        ind_pair_before = None
        for pair in indicators:
            if pair > ind_pair:
                break
            else:
                ind_pair_before = pair
        if ind_pair_before is None:  #if no smaller indicator pair exists
            insert_index = 0  #insertion will take place at the beginning
        else:  #else insert after the last field index of the previous indicator pair
            insert_index = first_last_dict[ind_pair_before][1] + 1

    ### add the new (merged) fields in correct 'in_field_index' position
    record_add_fields(rec1, fnum, new_fields, insert_index)
    return
Example #37
def tarballs_by_recids(recids,
                       sdir,
                       docname=None,
                       doctype=None,
                       docformat=None):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids. By default look for files with names matching
    the report number and with source field 'arXiv'. This can be changed
    with C{docname}, C{doctype}, C{docformat}

    @param: recids (string): the record id or ids
    @param: sdir (string): where the tarballs should live
    @param docname: select tarball for given recid(s) that match docname
    @param doctype: select tarball for given recid(s) that match doctype
    @param docformat: select tarball for given recid(s) that match docformat
    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    if not recids:
        return []

    list_of_ids = []

    if ',' in recids:
        recids = recids.split(',')
        for recid in recids:
            if '-' in recid:
                low, high = recid.split('-')
                recid = range(int(low), int(high))
                list_of_ids.extend(recid)
            else:
                recid = int(recid)
                list_of_ids.append(recid)

    else:
        if '-' in recids:
            low, high = recids.split('-')
            list_of_ids = range(int(low), int(high))
        else:
            list_of_ids = [int(recids)]

    arXiv_ids = []
    local_files = []
    for recid in list_of_ids:
        rec = get_record(recid)
        if not doctype and not docname and not docformat:
            for afieldinstance in record_get_field_instances(rec, tag='037'):
                if len(field_get_subfield_values(afieldinstance, '9')) > 0:
                    if 'arXiv' == field_get_subfield_values(
                            afieldinstance, '9')[0]:
                        arXiv_id = field_get_subfield_values(
                            afieldinstance, 'a')[0]
                        arXiv_ids.append(arXiv_id)
        else:
            bibarchive = BibRecDocs(recid)
            all_files = bibarchive.list_latest_files()
            if doctype:
                all_files = [
                    docfile for docfile in all_files
                    if docfile.get_type() == doctype
                ]
            if docname:
                all_files = [
                    docfile for docfile in all_files
                    if docfile.get_name() == docname
                ]
            if docformat:
                all_files = [
                    docfile for docfile in all_files
                    if docfile.get_format() == docformat
                ]
            local_files.extend([(docfile.get_path(), recid)
                                for docfile in all_files])

    if doctype or docname or docformat:
        return local_files

    return tarballs_by_arXiv_id(arXiv_ids, sdir)
Example #38
def task_run_core():
    """Perform a search to find records without a texkey.

    generates a new one and uploads the changes in chunks
    """
    recids = perform_request_search(
        p='-035:spirestex -035:inspiretex', cc='HEP')

    write_message("Found %s records to assign texkeys" % len(recids))
    processed_recids = []
    xml_to_process = []
    for count, recid in enumerate(recids):
        write_message("processing recid %s" % recid)

        # Check that the record does not have already a texkey
        has_texkey = False
        recstruct = get_record(recid)
        for instance in record_get_field_instances(recstruct, tag="035",
                                                   ind1="", ind2=""):
            try:
                provenance = field_get_subfield_values(instance, "9")[0]
            except IndexError:
                provenance = ""
            try:
                value = field_get_subfield_values(instance, "z")[0]
            except IndexError:
                try:
                    value = field_get_subfield_values(instance, "a")[0]
                except IndexError:
                    value = ""
            provenances = ["SPIRESTeX", "INSPIRETeX"]
            if provenance in provenances and value:
                has_texkey = True
                write_message(
                    "INFO: Record %s has already texkey %s" % (recid, value))

        if not has_texkey:
            TexKeySeq = TexkeySeq()
            new_texkey = ""
            try:
                new_texkey = TexKeySeq.next_value(recid)
            except TexkeyNoAuthorError:
                write_message((
                    "WARNING: Record %s has no first author or "
                    "collaboration") % recid)
                continue
            except TexkeyNoYearError:
                write_message("WARNING: Record %s has no year" % recid)
                continue
            write_message("Created texkey %s for record %d" %
                          (new_texkey, recid))
            xml = create_xml(recid, new_texkey)
            processed_recids.append(recid)
            xml_to_process.append(xml)

        task_update_progress("Done %d out of %d." % (count, len(recids)))
        task_sleep_now_if_required()

    # sequence ID to be used in all subsequent tasks
    sequence_id = str(random.randrange(1, 4294967296))
    if xml_to_process:
        process_chunk(xml_to_process, sequence_id)

    # Finally, index all the records processed
    # FIXME: Waiting for sequence id to be fixed
    # if processed_recids:
    #     submit_bibindex_task(processed_recids, sequence_id)

    return True
Example #39
    def find_modified_tags(self, common_tags, record1, record2):
        """
        For each tag common to Record1 and Record2, checks for modifications
        at field-level, indicator-level and subfield-level.

        Returns a dictionary of tags and corresponding fields from Record1
        that have been found to have been modified.
        """

        result = {}
        for tag in common_tags:
            # retrieve tag instances of record1 and record2
            rec1_tag_val = record_get_field_instances(record1, tag, '%', '%')
            rec2_tag_val = record_get_field_instances(record2, tag, '%', '%')
            if rec1_tag_val:
                rec1_ind = self.group_tag_values_by_indicator(rec1_tag_val)
            if rec2_tag_val:
                rec2_ind = self.group_tag_values_by_indicator(rec2_tag_val)

            # NOTE: At this point rec1_ind and rec2_ind will be dictionary
            # Key ==> (ind1, ind2) tuple
            # Val ==> list of data_tuple => [dt1,dt2]
            # dt(n) => ([sfl],ind1,ind2,ctrlfield,fn)

            # Generating 3 different dictionaries
            # common/added/deleted ind pairs in record1 based on record2
            (com_ind, add_ind,
             del_ind) = self.compare_tags_by_ind(rec1_ind, rec2_ind)

            if add_ind:
                for ind_pair in add_ind:
                    for data_tuple in add_ind[ind_pair]:
                        subfield_list = data_tuple[0]
                        record_add_field(result,
                                         tag,
                                         ind_pair[0],
                                         ind_pair[1],
                                         '',
                                         subfields=subfield_list)

            # Indicators that are deleted from record1 w.r.t record2 will be added with special code
            if del_ind:
                for ind_pair in del_ind:
                    record_add_field(result, tag, ind_pair[0], ind_pair[1], '',
                                     [(CFG_BIBUPLOAD_DELETE_CODE,
                                       CFG_BIBUPLOAD_DELETE_VALUE)])

            # Common modified fields. Identifying changes at subfield level
            if com_ind:
                for ind_pair in com_ind:
                    # NOTE: sf_rec1 and sf_rec2 are list of list of subfields
                    # A simple list comparison is sufficient in this scenario
                    # Any change in the order of fields or changes in subfields
                    # will cause the entire list of data_tuple for that ind_pair
                    # to be copied from record1(upload) to result.
                    if tag in CFG_BIBUPLOAD_CONTROLFIELD_TAGS:
                        cf_rec1 = [
                            data_tuple[3] for data_tuple in rec1_ind[ind_pair]
                        ]
                        cf_rec2 = [
                            data_tuple[3] for data_tuple in rec2_ind[ind_pair]
                        ]
                        if cf_rec1 != cf_rec2:
                            for data_tuple in com_ind[ind_pair]:
                                record_add_field(
                                    result,
                                    tag,
                                    controlfield_value=data_tuple[3])
                    else:
                        sf_rec1 = [
                            data_tuple[0] for data_tuple in rec1_ind[ind_pair]
                        ]
                        sf_rec2 = [
                            data_tuple[0] for data_tuple in rec2_ind[ind_pair]
                        ]
                        if sf_rec1 != sf_rec2:
                            # change at subfield level / re-ordered fields
                            for data_tuple in com_ind[ind_pair]:
                                # com_ind will have data_tuples of record1(upload) and not record2
                                subfield_list = data_tuple[0]
                                record_add_field(result,
                                                 tag,
                                                 ind_pair[0],
                                                 ind_pair[1],
                                                 '',
                                                 subfields=subfield_list)

        return result
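The two helpers used above, group_tag_values_by_indicator and
compare_tags_by_ind, are not part of this excerpt. Below is a minimal,
self-contained sketch of what they are assumed to do: group BibRecord field
instances by their (ind1, ind2) pair, then split the pairs into
common/added/deleted. The data_tuple layout follows the NOTE in the code;
the helper and variable names here are illustrative only.

def group_by_indicators(field_instances):
    # Group BibRecord-style data tuples ([subfields], ind1, ind2, ctrl, pos)
    # by their (ind1, ind2) indicator pair.
    grouped = {}
    for data_tuple in field_instances:
        ind_pair = (data_tuple[1], data_tuple[2])
        grouped.setdefault(ind_pair, []).append(data_tuple)
    return grouped

def compare_by_indicators(rec1_ind, rec2_ind):
    # Indicator pairs present in both records, present only in record1
    # (added w.r.t. record2), and present only in record2 (deleted from
    # record1 w.r.t. record2).
    common = dict((k, v) for k, v in rec1_ind.items() if k in rec2_ind)
    added = dict((k, v) for k, v in rec1_ind.items() if k not in rec2_ind)
    deleted = dict((k, v) for k, v in rec2_ind.items() if k not in rec1_ind)
    return common, added, deleted

# Tiny example with fake 999 field instances:
rec1_999 = [([('a', 'ref 1')], 'C', '5', '', 1)]
rec2_999 = [([('a', 'ref 1')], 'C', '5', '', 1),
            ([('m', 'note')], 'C', '6', '', 2)]
com, add, dele = compare_by_indicators(group_by_indicators(rec1_999),
                                       group_by_indicators(rec2_999))
# com -> {('C', '5'): [...]}, add -> {}, dele -> {('C', '6'): [...]}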
Exemple #40
0
def task_run_core():
    """ Performs a search to find records without a texkey, generates a new
    one and uploads the changes in chunks """
    recids = perform_request_search(p='-035:spirestex -035:inspiretex',
                                    cc='HEP')

    write_message("Found %s records to assign texkeys" % len(recids))
    processed_recids = []
    xml_to_process = []
    for count, recid in enumerate(recids):
        write_message("processing recid %s" % recid)

        # Check that the record does not already have a texkey
        has_texkey = False
        recstruct = get_record(recid)
        for instance in record_get_field_instances(recstruct,
                                                   tag="035",
                                                   ind1="",
                                                   ind2=""):
            try:
                provenance = field_get_subfield_values(instance, "9")[0]
            except IndexError:
                provenance = ""
            try:
                value = field_get_subfield_values(instance, "z")[0]
            except IndexError:
                try:
                    value = field_get_subfield_values(instance, "a")[0]
                except IndexError:
                    value = ""
            provenances = ["SPIRESTeX", "INSPIRETeX"]
            if provenance in provenances and value:
                has_texkey = True
                write_message("INFO: Record %s has already texkey %s" %
                              (recid, value))

        if not has_texkey:
            TexKeySeq = TexkeySeq()
            new_texkey = ""
            try:
                new_texkey = TexKeySeq.next_value(recid)
            except TexkeyNoAuthorError:
                write_message(
                    "WARNING: Record %s has no first author or collaboration" %
                    recid)
                continue
            except TexkeyNoYearError:
                write_message("WARNING: Record %s has no year" % recid)
                continue
            write_message("Created texkey %s for record %d" %
                          (new_texkey, recid))
            xml = create_xml(recid, new_texkey)
            processed_recids.append(recid)
            xml_to_process.append(xml)

        task_update_progress("Done %d out of %d." % (count, len(recids)))
        task_sleep_now_if_required()

    # sequence ID to be used in all subsequent tasks
    sequence_id = str(random.randrange(1, 4294967296))
    if xml_to_process:
        process_chunk(xml_to_process, sequence_id)

    # Finally, index all the records processed
    # FIXME: Waiting for sequence id to be fixed
    # if processed_recids:
    #     submit_bibindex_task(processed_recids, sequence_id)

    return True
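For readability, the 035 provenance check performed in the loop above can be
read as a small standalone predicate. This is a sketch for illustration only:
record_has_texkey and the hand-written field instances below are assumptions,
not part of the module.

TEXKEY_PROVENANCES = ("SPIRESTeX", "INSPIRETeX")

def record_has_texkey(field_035_instances):
    # Return the existing texkey if some 035 field carries a SPIRESTeX or
    # INSPIRETeX provenance in $9 and a value in $z or $a, else None.
    for subfields, _ind1, _ind2, _ctrl, _position in field_035_instances:
        sf = dict(subfields)
        if sf.get('9') in TEXKEY_PROVENANCES and (sf.get('z') or sf.get('a')):
            return sf.get('z') or sf.get('a')
    return None

# Example: one 035 field carrying an INSPIRETeX key
instances = [([('9', 'INSPIRETeX'), ('a', 'Smith:2010abc')], '', '', '', 1)]
print(record_has_texkey(instances))   # Smith:2010abc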
Exemple #41
0
def tarballs_by_recids(recids, sdir, docname=None, doctype=None, docformat=None):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids. By default look for files with names matching
    the report number and with source field 'arXiv'. This can be changed
    with C{docname}, C{doctype}, C{docformat}

    @param recids: the record id or ids (string)
    @param sdir: where the tarballs should live (string)
    @param docname: select tarball for given recid(s) that match docname
    @param doctype: select tarball for given recid(s) that match doctype
    @param docformat: select tarball for given recid(s) that match docformat
    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    if not recids:
        return []

    list_of_ids = []

    if ',' in recids:
        recids = recids.split(',')
        for recid in recids:
            if '-' in recid:
                low, high = recid.split('-')
                recid = range(int(low), int(high))
                list_of_ids.extend(recid)
            else:
                recid = int(recid)
                list_of_ids.append(recid)

    else:
        if '-' in recids:
            low, high = recids.split('-')
            list_of_ids = range(int(low), int(high))
        else:
            list_of_ids = [int(recids)]

    arXiv_ids = []
    local_files = []
    for recid in list_of_ids:
        rec = get_record(recid)
        if not doctype and not docname and not docformat:
            for afieldinstance in record_get_field_instances(rec, tag='037'):
                if len(field_get_subfield_values(afieldinstance, '9')) > 0:
                    if 'arXiv' == field_get_subfield_values(afieldinstance, '9')[0]:
                        arXiv_id = field_get_subfield_values(afieldinstance, 'a')[0]
                        arXiv_ids.append(arXiv_id)
        else:
            bibarchive = BibRecDocs(recid)
            all_files = bibarchive.list_latest_files()
            if doctype:
                all_files = [docfile for docfile in all_files if
                             docfile.get_type() == doctype]
            if docname:
                all_files = [docfile for docfile in all_files if
                             docfile.get_name() == docname]
            if docformat:
                all_files = [docfile for docfile in all_files if
                             docfile.get_format() == docformat]
            local_files.extend([(docfile.get_path(), recid) for docfile in all_files])

    if doctype or docname or docformat:
        return local_files

    return tarballs_by_arXiv_id(arXiv_ids, sdir)
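The recid parsing at the top of the function accepts a comma-separated list
of ids and 'low-high' ranges. A standalone sketch of that parsing, easy to
test in isolation, is shown below; note that, exactly as in the code above,
a range expands with range(low, high), so the upper bound is excluded.
parse_recids is an illustrative name, not part of the module.

def parse_recids(recids):
    # Turn a string such as '1,3-6,9' into a list of ints: [1, 3, 4, 5, 9].
    list_of_ids = []
    for chunk in recids.split(','):
        if '-' in chunk:
            low, high = chunk.split('-')
            list_of_ids.extend(range(int(low), int(high)))
        else:
            list_of_ids.append(int(chunk))
    return list_of_ids

print(parse_recids('1,3-6,9'))   # [1, 3, 4, 5, 9]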
Exemple #42
0
    def fields(self, tag, escape=0, repeatable_subfields_p=False):
        """
        Returns the list of values corresponding to "tag".

        If tag has an undefined subcode (such as 999C5),
        the function returns a list of dictionaries, whose keys
        are the subcodes and the values are the values of tag.subcode.
        If the tag has a subcode, simply returns list of values
        corresponding to tag.
        Eg. for given MARC::
            999C5 $a value_1a $b value_1b
            999C5 $b value_2b
            999C5 $b value_3b $b value_3b_bis

            >>> bfo.fields('999C5b')
            >>> ['value_1b', 'value_2b', 'value_3b', 'value_3b_bis']
            >>> bfo.fields('999C5')
            >>> [{'a':'value_1a', 'b':'value_1b'},
                {'b':'value_2b'},
                {'b':'value_3b'}]

        By default the function returns only one value for each
        subfield (that is, it considers repeatable subfields as not
        allowed). This is why, in the above example, 'value_3b_bis' is
        not shown for bfo.fields('999C5') (note that it is not
        defined which of value_3b or value_3b_bis is returned). This
        keeps the function simple to use, as most of the time
        subfields are not repeatable (this way we get a string
        instead of a list). You can allow repeatable subfields by
        setting the 'repeatable_subfields_p' parameter to True. In
        this mode, the above example would return:
            >>> bfo.fields('999C5b', repeatable_subfields_p=True)
            >>> ['value_1b', 'value_2b', 'value_3b']
            >>> bfo.fields('999C5', repeatable_subfields_p=True)
            >>> [{'a':['value_1a'], 'b':['value_1b']},
                {'b':['value_2b']},
                {'b':['value_3b', 'value_3b_bis']}]

        NOTICE THAT THE RETURNED STRUCTURE IS DIFFERENT.  Also note
        that whatever the value of 'repeatable_subfields_p' is,
        bfo.fields('999C5b') always shows all fields, even repeatable
        ones. This is because the parameter has no impact on the
        returned structure (it is always a list).

        The 'escape' parameter allows escaping special characters
        in the fields. The value of escape can be:
                      0. No escaping
                      1. Escape all HTML characters
                      2. Remove unsafe HTML tags (Eg. keep <br />)
                      3. Mix of mode 1 and 2. If value of field starts with
                      <!-- HTML -->, then use mode 2. Else use mode 1.
                      4. Remove all HTML tags
                      5. Same as 2, with more tags allowed (like <img>)
                      6. Same as 3, with more tags allowed (like <img>)
                      7. Mix of mode 0 and mode 1. If field_value
                      starts with <!--HTML-->, then use mode 0.
                      Else use mode 1.
                      8. Same as mode 1, but also escape double-quotes
                      9. Same as mode 4, but also escape double-quotes

        :param tag: the MARC code of a field
        :param escape: escaping mode to apply to the returned values
            (0-9, see the list above); 0 means no escaping
        :param repeatable_subfields_p: if True, return a list of values
            for each subfield code in the dictionaries
        @return: values of field tag in record
        """

        if self.get_record() is None:
            # Case where BibRecord could not parse object
            return []

        p_tag = parse_tag(tag)
        if p_tag[3] != "":
            # Subcode has been defined. Simply returns list of values
            values = record_get_field_values(self.get_record(),
                                             p_tag[0],
                                             p_tag[1],
                                             p_tag[2],
                                             p_tag[3])
            if escape == 0:
                return values
            else:
                return [escape_field(value, escape) for value in values]

        else:
            # Subcode is undefined. Returns list of dicts.
            # However it might be the case of a control field.

            instances = record_get_field_instances(self.get_record(),
                                                   p_tag[0],
                                                   p_tag[1],
                                                   p_tag[2])
            if repeatable_subfields_p:
                list_of_instances = []
                for instance in instances:
                    instance_dict = {}
                    for subfield in instance[0]:
                        if subfield[0] not in instance_dict:
                            instance_dict[subfield[0]] = []
                        if escape == 0:
                            instance_dict[subfield[0]].append(subfield[1])
                        else:
                            instance_dict[subfield[0]].append(escape_field(subfield[1], escape))
                    list_of_instances.append(instance_dict)
                return list_of_instances
            else:
                if escape == 0:
                    return [dict(instance[0]) for instance in instances]
                else:
                    return [dict([(subfield[0], escape_field(subfield[1], escape))
                                   for subfield in instance[0]])
                            for instance in instances]
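A quick way to see why repeated subfields collapse when
repeatable_subfields_p is False: the non-repeatable branch above builds a
plain dict from the subfield list, so a duplicated subfield code keeps only
one value, while the repeatable branch accumulates a list per code. A
minimal sketch, mimicking the '999C5 $b value_3b $b value_3b_bis' field from
the docstring:

subfields = [('b', 'value_3b'), ('b', 'value_3b_bis')]

# repeatable_subfields_p=False: plain dict, one value per subfield code
print(dict(subfields))               # {'b': 'value_3b_bis'}

# repeatable_subfields_p=True: every occurrence is kept
grouped = {}
for code, value in subfields:
    grouped.setdefault(code, []).append(value)
print(grouped)                       # {'b': ['value_3b', 'value_3b_bis']}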