Python record_xml_outputの例、invenio.legacy.bibrecord.record_xml_output Pythonの例

コード例 #1

0

ファイルを表示

ファイル: tasks.py プロジェクト: egabancho/zenodo

def bibupload(record=None, collection=None, file_prefix="", mode="-c"):
    """
    General purpose function that will write a MARCXML file and call bibupload
    on it.
    """
    if collection is None and record is None:
        return

    (file_out, filename) = open_temp_file(file_prefix)

    if collection is not None:
        file_out.write("<collection>")
        tot = 0
        for rec in collection:
            file_out.write(record_xml_output(rec))
            tot += 1
            if tot == MAX_RECORDS:
                file_out.write("</collection>")
                file_out.close()
                logger.debug("Submitting bibupload %s -n %s" % (mode, filename))
                task_low_level_submission('bibupload', 'openaire', mode, filename, '-n')

                (file_out, filename) = open_temp_file(file_prefix)
                file_out.write("<collection>")
                tot = 0
        file_out.write("</collection>")
    elif record is not None:
        tot = 1
        file_out.write(record_xml_output(record))

    file_out.close()
    if tot > 0:
        logger.debug("Submitting bibupload %s -n %s" % (mode, filename))
        task_low_level_submission('bibupload', 'openaire', mode, filename, '-n')

コード例 #2

0

ファイルを表示

ファイル: Link_Records.py プロジェクト: mhellmic/b2share

def _prepare_marcxml(recid_a, rn_a, recid_b, rn_b, what_is_a_for_b, what_is_b_for_a, display_in_a=True, display_in_b=True):
    record_a = {}
    record_b = {}
    record_add_field(record_a, "001", controlfield_value=str(recid_a))
    record_add_field(record_a, CFG_OTHER_RELATIONSHIP_ENTRY, ind1=display_in_a and "0" or "1", subfields=[('i', what_is_b_for_a), ('r', rn_b), ('w', str(recid_b))])
    record_add_field(record_b, "001", controlfield_value=str(recid_b))
    record_add_field(record_b, CFG_OTHER_RELATIONSHIP_ENTRY, ind1=display_in_b and "0" or "1", subfields=[('i', what_is_a_for_b), ('r', rn_a), ('w', str(recid_a))])
    return "<collection>\n%s\n%s</collection>" % (record_xml_output(record_a), record_xml_output(record_b))

コード例 #3

0

ファイルを表示

def save_xml_record(recid, uid, xml_record='', to_upload=True, to_merge=False,
                    task_name="bibedit", sequence_id=None):
    """Write XML record to file. Default behaviour is to read the record from
    a BibEdit cache file, filter out the unchanged volatile subfields,
    write it back to an XML file and then pass this file to BibUpload.

    @param xml_record: give XML as string in stead of reading cache file
    @param to_upload: pass the XML file to BibUpload
    @param to_merge: prepare an XML file for BibMerge to use

    """
    if not xml_record:
        # Read record from cache file.
        cache = get_cache_contents(recid, uid)
        if cache:
            record = cache[2]
            used_changes = cache[4]
            xml_record = record_xml_output(record)
            delete_cache(recid, uid)
            delete_disabled_changes(used_changes)
    else:
        record = create_record(xml_record)[0]

    # clean the record from unfilled volatile fields
    record_strip_empty_volatile_subfields(record)
    record_strip_empty_fields(record)

    # order subfields alphabetically before saving the record
    record_order_subfields(record)

    xml_to_write = wash_for_xml(record_xml_output(record))

    # Write XML file.
    if not to_merge:
        fd, file_path = tempfile.mkstemp(dir=cfg['CFG_BIBEDIT_CACHEDIR'],
                                         prefix="%s_" % cfg['CFG_BIBEDIT_FILENAME'],
                                         suffix="_%s_%s.xml" % (recid, uid))
        f = os.fdopen(fd, 'w')
        f.write(xml_to_write)
        f.close()
    else:
        file_path = '%s_%s.xml' % (_get_file_path(recid, uid),
                                   cfg['CFG_BIBEDIT_TO_MERGE_SUFFIX'])
        xml_file = open(file_path, 'w')
        xml_file.write(xml_to_write)
        xml_file.close()

    user_name = get_user_info(uid)[1]
    if to_upload:
        args = ['bibupload', user_name, '-P', '5', '-r',
                file_path, '-u', user_name]
        if task_name == "bibedit":
            args.extend(['--name', 'bibedit'])
        if sequence_id:
            args.extend(["-I", sequence_id])
        args.extend(['--email-logs-on-error'])
        task_low_level_submission(*args)
    return True

コード例 #4

0

ファイルを表示

ファイル: utils.py プロジェクト: Theer108/invenio

def save_xml_record(recid, uid, xml_record='', to_upload=True, to_merge=False,
                    task_name="bibedit", sequence_id=None):
    """Write XML record to file. Default behaviour is to read the record from
    a BibEdit cache file, filter out the unchanged volatile subfields,
    write it back to an XML file and then pass this file to BibUpload.

    @param xml_record: give XML as string in stead of reading cache file
    @param to_upload: pass the XML file to BibUpload
    @param to_merge: prepare an XML file for BibMerge to use

    """
    if not xml_record:
        # Read record from cache file.
        cache = get_cache_contents(recid, uid)
        if cache:
            record = cache[2]
            used_changes = cache[4]
            xml_record = record_xml_output(record)
            delete_cache(recid, uid)
            delete_disabled_changes(used_changes)
    else:
        record = create_record(xml_record)[0]

    # clean the record from unfilled volatile fields
    record_strip_empty_volatile_subfields(record)
    record_strip_empty_fields(record)

    # order subfields alphabetically before saving the record
    record_order_subfields(record)

    xml_to_write = wash_for_xml(record_xml_output(record))

    # Write XML file.
    if not to_merge:
        fd, file_path = tempfile.mkstemp(dir=cfg['CFG_BIBEDIT_CACHEDIR'],
                                         prefix="%s_" % cfg['CFG_BIBEDIT_FILENAME'],
                                         suffix="_%s_%s.xml" % (recid, uid))
        f = os.fdopen(fd, 'w')
        f.write(xml_to_write)
        f.close()
    else:
        file_path = '%s_%s.xml' % (_get_file_path(recid, uid),
                                   cfg['CFG_BIBEDIT_TO_MERGE_SUFFIX'])
        xml_file = open(file_path, 'w')
        xml_file.write(xml_to_write)
        xml_file.close()

    user_name = get_user_info(uid)[1]
    if to_upload:
        args = ['bibupload', user_name, '-P', '5', '-r',
                file_path, '-u', user_name]
        if task_name == "bibedit":
            args.extend(['--name', 'bibedit'])
        if sequence_id:
            args.extend(["-I", sequence_id])
        args.extend(['--email-logs-on-error'])
        task_low_level_submission(*args)
    return True

コード例 #5

0

ファイルを表示

def compare_references(test, a, b):
    from invenio.legacy.bibrecord import create_record, record_xml_output, \
        record_delete_field
    ## Let's normalize records to remove the Invenio refextract signature
    a = create_record(a)[0]
    b = create_record(b)[0]
    record_delete_field(a, '999', 'C', '6')
    a = record_xml_output(a)
    b = record_xml_output(b)
    test.assertEqual(a, b)

コード例 #6

0

ファイルを表示

ファイル: test_textminer_documents.py プロジェクト: mhellmic/b2share

def compare_references(test, a, b):
    from invenio.legacy.bibrecord import create_record, record_xml_output, \
        record_delete_field
    ## Let's normalize records to remove the Invenio refextract signature
    a = create_record(a)[0]
    b = create_record(b)[0]
    record_delete_field(a, '999', 'C', '6')
    a = record_xml_output(a)
    b = record_xml_output(b)
    test.assertEqual(a, b)

コード例 #7

0

ファイルを表示

ファイル: engine.py プロジェクト: SCOAP3/invenio

def _get_formated_record(record_id, output_format, update_commands, language, outputTags="",
                         checked=True, displayed_records=None):
    """Returns a record in a given format

    @param record_id: the ID of record to format
    @param output_format: an output format code (or short identifier for the output format)
    @param update_commands: list of commands used to update record contents
    @param language: the language to use to format the record
    @param outputTags: the tags to be shown to the user
    @param checked: is the record checked by the user?
    @param displayed_records: records to be displayed on a given page

    @returns: record formated to be displayed or None
    """
    if update_commands and checked:
        # Modify the bibrecord object with the appropriate actions
        updated_record = _get_updated_record(record_id, update_commands)

    textmarc_options = {"aleph-marc":0, "correct-mode":1, "append-mode":0,
                        "delete-mode":0, "insert-mode":0, "replace-mode":0,
                        "text-marc":1}

    if record_id not in displayed_records:
        return

    old_record = search_engine.get_record(recid=record_id)
    old_record_textmarc = xmlmarc2textmarc.create_marc_record(old_record, sysno="", options=textmarc_options)
    if "hm" == output_format:
        if update_commands and checked:
            updated_record_textmarc = xmlmarc2textmarc.create_marc_record(updated_record, sysno="", options=textmarc_options)
            result = _get_record_diff(old_record_textmarc, updated_record_textmarc, outputTags, record_id)
        else:
            filter_tags = "All tags" not in outputTags and outputTags
            result = ['<pre>']
            for line in old_record_textmarc.splitlines():
                if not filter_tags or line.split()[0].replace('_', '') in outputTags:
                    result.append("%09d " % record_id + line.strip())
            result.append('</pre>')
            result = '\n'.join(result)
    else:
        if update_commands and checked:
            # No coloring of modifications in this case
            xml_record = bibrecord.record_xml_output(updated_record)
        else:
            xml_record = bibrecord.record_xml_output(old_record)
        result = bibformat.format_record(recID=None,
                                        of=output_format,
                                        xml_record=xml_record,
                                        ln=language)
    return result

コード例 #8

0

ファイルを表示

ファイル: task.py プロジェクト: pombredanne/invenio-3

def upload_amendments(records, holdingpen):
    """ Upload a modified record """

    if task_get_option("no_upload", False) or len(records) == 0:
        return

    xml = '<collection xmlns="http://www.loc.gov/MARC21/slim">'
    for record in records:
        xml += record_xml_output(record)
    xml += "</collection>"

    tmp_file_fd, tmp_file = mkstemp(
        suffix='.xml',
        prefix="bibcheckfile_%s" % time.strftime("%Y-%m-%d_%H:%M:%S"),
        dir=CFG_TMPSHAREDDIR
    )
    os.write(tmp_file_fd, xml)
    os.close(tmp_file_fd)
    os.chmod(tmp_file, 0644)
    if holdingpen:
        flag = "-o"
    else:
        flag = "-r"
    task = task_low_level_submission('bibupload', 'bibcheck', flag, tmp_file)
    write_message("Submitted bibupload task %s" % task)

コード例 #9

0

ファイルを表示

ファイル: test_jsonext.py プロジェクト: SDSG-Invenio/zenodo

    def test_marc_export(self):
        from invenio.modules.records.api import Record
        from invenio.legacy.bibrecord import create_record, record_xml_output

        rec = Record(json=test_record, master_format='marc')

        # Needed to properly set authors when generating MARC
        first = rec['authors'][0]
        additional = rec['authors'][1:]
        rec['_first_author'] = first
        rec['_additional_authors'] = additional

        output_marc = record_xml_output(
            create_record(rec.legacy_export_as_marc())[0]
        )
        try:
            self.assertEqual(test_marc, output_marc)
        except AssertionError:
            # Print diff in case of errors.
            import difflib
            diff = "".join(difflib.unified_diff(
                test_marc.splitlines(1),
                output_marc.splitlines(1)
            ))
            raise AssertionError(diff)

        form_json = rec.produce('json_for_form')
        for k, v in test_form_json.items():
            self.assertEqual(form_json[k], test_form_json[k])

コード例 #10

0

ファイルを表示

ファイル: utils.py プロジェクト: jiangmin9/invenio

def replace_references(recid, uid=None, txt=None, url=None):
    """Replace references for a record

    The record itself is not updated, the marc xml of the document with updated
    references is returned

    Parameters:
    * recid: the id of the record
    * txt: references in text mode
    * inspire: format of ther references
    """
    # Parse references
    if txt is not None:
        references_xml = extract_references_from_string_xml(txt, is_only_references=True)
    elif url is not None:
        references_xml = extract_references_from_url_xml(url)
    else:
        references_xml = extract_references_from_record_xml(recid)
    references = create_record(references_xml)

    dummy1, dummy2, record, dummy3, dummy4, dummy5, dummy6 = get_cache_contents(recid, uid)
    out_xml = None

    references_to_add = record_get_field_instances(references[0], tag="999", ind1="C", ind2="5")
    refextract_status = record_get_field_instances(references[0], tag="999", ind1="C", ind2="6")

    if references_to_add:
        # Replace 999 fields
        record_delete_fields(record, "999")
        record_add_fields(record, "999", references_to_add)
        record_add_fields(record, "999", refextract_status)
        # Update record references
        out_xml = record_xml_output(record)

    return out_xml

コード例 #11

0

ファイルを表示

    def test_marc_export(self):
        from invenio.modules.records.api import Record
        from invenio.legacy.bibrecord import create_record, record_xml_output

        rec = Record(json=test_record, master_format='marc')

        # Needed to properly set authors when generating MARC
        first = rec['authors'][0]
        additional = rec['authors'][1:]
        rec['_first_author'] = first
        rec['_additional_authors'] = additional

        output_marc = record_xml_output(
            create_record(rec.legacy_export_as_marc())[0]
        )
        try:
            self.assertEqual(test_marc, output_marc)
        except AssertionError:
            # Print diff in case of errors.
            import difflib
            diff = "".join(difflib.unified_diff(
                test_marc.splitlines(1),
                output_marc.splitlines(1)
            ))
            raise AssertionError(diff)

        form_json = rec.produce('json_for_form')
        for k, v in test_form_json.items():
            self.assertEqual(form_json[k], test_form_json[k])

コード例 #12

0

ファイルを表示

ファイル: api.py プロジェクト: chokribr/invenio-1

def replace_references(recid):
    """Replace references for a record

    The record itself is not updated, the marc xml of the document with updated
    references is returned

    Parameters:
    * recid: the id of the record
    """
    # Parse references
    references_xml = extract_references_from_record_xml(recid)
    references = create_record(references_xml)
    # Record marc xml
    record = get_record(recid)

    if references[0]:
        fields_to_add = record_get_field_instances(references[0],
                                                   tag='999',
                                                   ind1='%',
                                                   ind2='%')
        # Replace 999 fields
        record_delete_fields(record, '999')
        record_add_fields(record, '999', fields_to_add)
        # Update record references
        out_xml = record_xml_output(record)
    else:
        out_xml = None

    return out_xml

コード例 #13

0

ファイルを表示

ファイル: utils.py プロジェクト: mhellmic/b2share

def save_xml_record(recid, uid, xml_record='', to_upload=True, to_merge=False):
    """Write XML record to file. Default behaviour is to read the record from
    a BibEdit cache file, filter out the unchanged volatile subfields,
    write it back to an XML file and then pass this file to BibUpload.

    @param xml_record: give XML as string in stead of reading cache file
    @param to_upload: pass the XML file to BibUpload
    @param to_merge: prepare an XML file for BibMerge to use

    """
    if not xml_record:
        # Read record from cache file.
        cache = get_cache_file_contents(recid, uid)
        if cache:
            record = cache[2]
            used_changes = cache[4]
            xml_record = record_xml_output(record)
            delete_cache_file(recid, uid)
            delete_disabled_changes(used_changes)
    else:
        record = create_record(xml_record)[0]

    # clean the record from unfilled volatile fields
    record_strip_empty_volatile_subfields(record)
    record_strip_empty_fields(record)

    # order subfields alphabetically before saving the record
    record_order_subfields(record)

    xml_to_write = wash_for_xml(record_xml_output(record))

    # Write XML file.
    if not to_merge:
        file_path = '%s.xml' % _get_file_path(recid, uid)
    else:
        file_path = '%s_%s.xml' % (_get_file_path(recid, uid),
                                   CFG_BIBEDIT_TO_MERGE_SUFFIX)
    xml_file = open(file_path, 'w')
    xml_file.write(xml_to_write)
    xml_file.close()

    user_name = get_user_info(uid)[1]
    if to_upload:
        # Pass XML file to BibUpload.
        task_low_level_submission('bibupload', 'bibedit', '-P', '5', '-r',
                                  file_path, '-u', user_name)
    return True

コード例 #14

0

ファイルを表示

ファイル: utils.py プロジェクト: osub3/invenio

def create_marcxml(record):
    """Create MARCXML based on type of input variable."""
    from invenio_records.api import Record
    if isinstance(record, six.string_types):
        return record
    elif isinstance(record, Record):
        return record.legacy_export_as_marc()
    else:
        return record_xml_output(record)

コード例 #15

0

ファイルを表示

ファイル: utils.py プロジェクト: SCOAP3/invenio

def create_marcxml(record):
    """Create MARCXML based on type of input variable."""
    from invenio.modules.records.api import Record
    if isinstance(record, six.string_types):
        return record
    elif isinstance(record, Record):
        return record.legacy_export_as_marc()
    else:
        return record_xml_output(record)

コード例 #16

0

ファイルを表示

ファイル: Link_Records.py プロジェクト: SCOAP3/invenio

def _prepare_marcxml(recid_a, rn_a, recids_and_rns_b, what_is_a_for_b, what_is_b_for_a, display_in_a=True, display_in_b=True, marc_for_a=None, marc_for_b=None, upload_mode='append', consider_empty_p=False):
    output = '<collection>'
    record_a = {}
    record_b = {}
    if what_is_b_for_a is not None:
        marc_tag_for_a, marc_ind1_for_a, marc_ind2_for_a = \
          _prepare_marc(marc_for_a, CFG_OTHER_RELATIONSHIP_ENTRY, display_in_a and "0" or "1")
        record_add_field(record_a, "001", controlfield_value=str(recid_a))
        if upload_mode == 'correct' and not recids_and_rns_b and consider_empty_p:
            # Add empty field in order to account for cases where all
            # linkings are removed by the submitter
            record_add_field(record_a, marc_tag_for_a, ind1=marc_ind1_for_a, ind2=marc_ind2_for_a)
        for recid_b, rn_b in recids_and_rns_b:
            record_add_field(record_a, marc_tag_for_a, ind1=marc_ind1_for_a, ind2=marc_ind2_for_a,
                             subfields=[('i', what_is_b_for_a), ('r', rn_b), ('w', str(recid_b))])
        output += record_xml_output(record_a)

    if what_is_a_for_b is not None:
        marc_tag_for_b, marc_ind1_for_b, marc_ind2_for_b = \
          _prepare_marc(marc_for_b, CFG_OTHER_RELATIONSHIP_ENTRY, display_in_b and "0" or "1")
        for recid_b, rn_b in recids_and_rns_b:
            record_b = {}
            record_add_field(record_b, "001", controlfield_value=str(recid_b))
            if upload_mode == 'correct':
                original_linking_fields = _get_record_linking_fields(recid_b, recid_a, marc_tag_for_b, marc_ind1_for_b, marc_ind2_for_b)
                record_add_fields(record_b, marc_tag_for_b, original_linking_fields)
            record_add_field(record_b, marc_tag_for_b, ind1=marc_ind1_for_b, ind2=marc_ind2_for_b,
                             subfields=[('i', what_is_a_for_b), ('r', rn_a), ('w', str(recid_a))])
            output += record_xml_output(record_b)
        # Remove linking in remote records where adequate
        if consider_empty_p:
            unlinked_recids = get_unlinked_records(recid_a, marc_for_b, display_in_b, upload_mode, recids_and_rns_b)
            for recid_b in unlinked_recids:
                record_b = {}
                record_add_field(record_b, "001", controlfield_value=str(recid_b))
                original_linking_fields = _get_record_linking_fields(recid_b, recid_a, marc_tag_for_b, marc_ind1_for_b, marc_ind2_for_b)
                if not original_linking_fields:
                    # Add empty field in order to account for cases where all
                    # linkings are removed by the submitter
                    record_add_field(record_b, marc_tag_for_b, ind1=marc_ind1_for_b, ind2=marc_ind2_for_b)
                record_add_fields(record_b, marc_tag_for_b, original_linking_fields)
                output += record_xml_output(record_b)
    output += '</collection>'
    return output

コード例 #17

0

ファイルを表示

ファイル: utils.py プロジェクト: mhellmic/b2share

def modify_record_timestamp(revision_xml, last_revision_ts):
    """ Modify tag 005 to add the revision passed as parameter.
    @param revision_xml: marcxml representation of the record to modify
    @type revision_xml: string
    @param last_revision_ts: timestamp to add to 005 tag
    @type last_revision_ts: string

    @return: marcxml with 005 tag modified
    """
    recstruct = create_record(revision_xml)[0]
    record_modify_controlfield(recstruct, "005", last_revision_ts,
                                field_position_local=0)
    return record_xml_output(recstruct)

コード例 #18

0

ファイルを表示

ファイル: record_massage.py プロジェクト: GRArmstrong/invenio-inspire-tools

def output_records(container):
    try:
        strio = codecs.open(sys.argv[2], mode='w', encoding='utf-8')
        _print(TF.YELLOW + "Writing to %s" % (sys.argv[2],) + TF.END)
    except Exception:
        strio = sys.stdout
        _print(TF.YELLOW + "Writing to StdOut" + TF.END)

    stream = Streamer(strio, '\n')
    stream.write(MARCXML_COLLECTION_HEADER)
    for record in container:
        marcxml = record_xml_output(record)
        stream.write(marcxml)
    stream.write(MARCXML_COLLECTION_FOOTER)

コード例 #19

0

ファイルを表示

ファイル: tasks.py プロジェクト: LibrarPotter/zenodo

def bibupload(record=None, collection=None, file_prefix="", mode="-c"):
    """
    General purpose function that will write a MARCXML file and call bibupload
    on it.
    """
    if collection is None and record is None:
        return

    (file_out, filename) = open_temp_file(file_prefix)

    if collection is not None:
        file_out.write("<collection>")
        tot = 0
        for rec in collection:
            file_out.write(record_xml_output(rec))
            tot += 1
            if tot == MAX_RECORDS:
                file_out.write("</collection>")
                file_out.close()
                logger.debug(
                    "Submitting bibupload %s -n %s" % (mode, filename))
                task_low_level_submission(
                    'bibupload', 'openaire', mode, filename, '-n')

                (file_out, filename) = open_temp_file(file_prefix)
                file_out.write("<collection>")
                tot = 0
        file_out.write("</collection>")
    elif record is not None:
        tot = 1
        file_out.write(record_xml_output(record))

    file_out.close()
    if tot > 0:
        logger.debug("Submitting bibupload %s -n %s" % (mode, filename))
        task_low_level_submission(
            'bibupload', 'openaire', mode, filename, '-n')

コード例 #20

0

ファイルを表示

ファイル: b2share_marc_handler.py プロジェクト: mhellmic/b2share

def create_marc(form, sub_id, email):
    """
    Generates MARC data used by Invenio from the filled out form, then
    submits it to the Invenio system.
    """
    rec = {}
    recid = create_recid()
    record_add_field(rec, '001', controlfield_value=str(recid))
    add_basic_fields(rec, form, email)
    add_domain_fields(rec, form)
    add_file_info(rec, form, email, sub_id, recid)
    checksum = create_checksum(rec, sub_id)
    add_epic_pid(rec, recid, checksum)
    marc = record_xml_output(rec)

    return recid, marc

コード例 #21

0

ファイルを表示

def modify_record_timestamp(revision_xml, last_revision_ts):
    """ Modify tag 005 to add the revision passed as parameter.
    @param revision_xml: marcxml representation of the record to modify
    @type revision_xml: string
    @param last_revision_ts: timestamp to add to 005 tag
    @type last_revision_ts: string

    @return: marcxml with 005 tag modified
    """
    recstruct = create_record(revision_xml)[0]
    if "005" in recstruct:
        record_modify_controlfield(recstruct, "005", last_revision_ts,
                                   field_position_local=0)
    else:
        record_add_field(recstruct, '005', controlfield_value=last_revision_ts)
    return record_xml_output(recstruct)

コード例 #22

0

ファイルを表示

ファイル: b2share_marc_handler.py プロジェクト: cjhak/b2share

def create_marc(form, sub_id, email, meta):
    """
    Generates MARC data used by Invenio from the filled out form, then
    submits it to the Invenio system.
    """
    rec = {}
    recid = create_recid()

    record_add_field(rec, '001', controlfield_value=str(recid))
    add_basic_fields(rec, form, meta)
    record_add_field(rec, '856', ind1='0', subfields=[('f', email)])

    add_domain_fields(rec, form, meta)
    add_file_info(rec, form, email, sub_id, recid)
    checksum = create_checksum(rec, sub_id)
    add_epic_pid(rec, recid, checksum)
    marc = record_xml_output(rec)

    return recid, marc

コード例 #23

0

ファイルを表示

def replace_references(recid, uid=None, txt=None, url=None):
    """Replace references for a record

    The record itself is not updated, the marc xml of the document with updated
    references is returned

    Parameters:
    * recid: the id of the record
    * txt: references in text mode
    * inspire: format of ther references
    """
    # Parse references
    if txt is not None:
        references_xml = extract_references_from_string_xml(
            txt, is_only_references=True)
    elif url is not None:
        references_xml = extract_references_from_url_xml(url)
    else:
        references_xml = extract_references_from_record_xml(recid)
    references = create_record(references_xml)

    dummy1, dummy2, record, dummy3, dummy4, dummy5, dummy6 = get_cache_contents(
        recid, uid)
    out_xml = None

    references_to_add = record_get_field_instances(references[0],
                                                   tag='999',
                                                   ind1='C',
                                                   ind2='5')
    refextract_status = record_get_field_instances(references[0],
                                                   tag='999',
                                                   ind1='C',
                                                   ind2='6')

    if references_to_add:
        # Replace 999 fields
        record_delete_fields(record, '999')
        record_add_fields(record, '999', references_to_add)
        record_add_fields(record, '999', refextract_status)
        # Update record references
        out_xml = record_xml_output(record)

    return out_xml

コード例 #24

0

ファイルを表示

ファイル: arxiv.py プロジェクト: jalavik/invenio-oaiharvester

    def _author_list(obj, eng):
        from invenio.legacy.bibrecord import create_records, record_xml_output
        from invenio.legacy.bibconvert.xslt_engine import convert
        from invenio.utils.plotextractor.api import get_tarball_from_arxiv
        from invenio.utils.plotextractor.cli import get_defaults
        from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
        from invenio.utils.plotextractor.converter import untar
        from invenio.utils.shell import Timeout

        from ..utils import find_matching_files

        identifiers = obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP'), "")
        if "_result" not in obj.extra_data:
            obj.extra_data["_result"] = {}
        if "tarball" not in obj.extra_data["_result"]:
            extract_path = os.path.join(
                cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
                str(eng.uuid)
            )
            tarball = get_tarball_from_arxiv(
                obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
                extract_path
            )
            if tarball is None:
                obj.log.error("No tarball found")
                return
        else:
            tarball = obj.extra_data["_result"]["tarball"]

        # FIXME
        tarball = str(tarball)
        sub_dir, dummy = get_defaults(tarball,
                                      cfg['CFG_TMPDIR'], "")

        try:
            untar(tarball, sub_dir)
            obj.log.info("Extracted tarball to: {0}".format(sub_dir))
        except Timeout:
            eng.log.error('Timeout during tarball extraction on %s' % (
                obj.extra_data["_result"]["tarball"]))

        xml_files_list = find_matching_files(sub_dir, ["xml"])

        obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

        authors = ""

        for xml_file in xml_files_list:
            xml_file_fd = open(xml_file, "r")
            xml_content = xml_file_fd.read()
            xml_file_fd.close()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info("Found a match for author extraction")
                authors = convert(xml_content, stylesheet)
                authorlist_record = create_records(authors)
                if len(authorlist_record) == 1:
                    if authorlist_record[0][0] is None:
                        eng.log.error("Error parsing authorlist record for id: %s" % (
                            identifiers,))
                    authorlist_record = authorlist_record[0][0]

                author_xml = record_xml_output(authorlist_record)
                if author_xml:
                    updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
                                  + record_xml_output(authorlist_record) + '</collection>'
                    new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
                    obj.data["authors"] = new_dict_representation["authors"]
                    obj.update_task_results(
                        "authors",
                        [{
                            "name": "authors",
                            "results": new_dict_representation["authors"]
                        }]
                    )
                    obj.update_task_results(
                        "number_of_authors",
                        [{
                            "name": "number_of_authors",
                            "results": new_dict_representation["number_of_authors"]
                        }]
                    )
                    break

コード例 #25

0

ファイルを表示

def author_list(obj, eng):
    """Perform the special authorlist extraction step.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.oaiharvest.utils import (translate_fieldvalues_from_latex,
                                                 find_matching_files)
    from invenio.legacy.bibrecord import create_records, record_xml_output
    from invenio.legacy.bibconvert.xslt_engine import convert
    from invenio.utils.plotextractor.cli import get_defaults
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.plotextractor.getter import harvest_single
    from invenio.modules.workflows.errors import WorkflowError
    from invenio.utils.plotextractor.converter import untar
    from invenio.utils.shell import Timeout

    identifiers = obj.data["system_control_number"]["value"]
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}
    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg['CFG_TMPSHAREDDIR'],
            str(eng.uuid)
        )
        if not os.path.exists(extract_path):
            os.makedirs(extract_path)
        tarball, pdf = harvest_single(
            obj.data["system_control_number"]["value"], extract_path,
            ["tarball"])
        tarball = str(tarball)
        if tarball is None:
            raise WorkflowError(str(
                "Error harvesting tarball from id: %s %s" % (
                    identifiers, extract_path)), eng.uuid, id_object=obj.id)
        obj.extra_data["_result"]["tarball"] = tarball

    sub_dir, dummy = get_defaults(obj.extra_data["_result"]["tarball"],
                                  cfg['CFG_TMPDIR'], "")

    try:
        untar(obj.extra_data["_result"]["tarball"], sub_dir)
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))
    except Timeout:
        eng.log.error('Timeout during tarball extraction on %s' % (
            obj.extra_data["_result"]["tarball"]))

    xml_files_list = find_matching_files(sub_dir, ["xml"])

    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    authors = ""

    for xml_file in xml_files_list:
        xml_file_fd = open(xml_file, "r")
        xml_content = xml_file_fd.read()
        xml_file_fd.close()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")

            a_stylesheet = obj.extra_data["repository"]["arguments"].get(
                "a_stylesheet"
            ) or "authorlist2marcxml.xsl"
            authors = convert(xml_content, a_stylesheet)
            authorlist_record = create_records(authors)
            if len(authorlist_record) == 1:
                if authorlist_record[0][0] is None:
                    eng.log.error("Error parsing authorlist record for id: %s" % (
                        identifiers,))
                authorlist_record = authorlist_record[0][0]
                # Convert any LaTeX symbols in authornames
            translate_fieldvalues_from_latex(authorlist_record, '100', code='a')
            translate_fieldvalues_from_latex(authorlist_record, '700', code='a')

            updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
                          + record_xml_output(authorlist_record) + '</collection>'
            if not None == updated_xml:
                # We store the path to the directory  the tarball contents live
                # Read and grab MARCXML from plotextractor run
                new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
                obj.data['authors'] = new_dict_representation["authors"]
                obj.data['number_of_authors'] = new_dict_representation[
                    "number_of_authors"]
                obj.add_task_result("authors", new_dict_representation["authors"])
                obj.add_task_result("number_of_authors",
                                    new_dict_representation["number_of_authors"])
                break

コード例 #26

0

ファイルを表示

ファイル: arxiv.py プロジェクト: jmartinm/invenio-oaiharvester

    def _author_list(obj, eng):
        from invenio.legacy.bibrecord import create_records, record_xml_output
        from invenio.legacy.bibconvert.xslt_engine import convert
        from invenio.utils.plotextractor.api import get_tarball_from_arxiv
        from invenio.utils.plotextractor.cli import get_defaults
        from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
        from invenio.utils.plotextractor.converter import untar
        from invenio.utils.shell import Timeout

        from ..utils import find_matching_files

        identifiers = obj.data.get(
            cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP'), "")
        if "_result" not in obj.extra_data:
            obj.extra_data["_result"] = {}
        if "tarball" not in obj.extra_data["_result"]:
            extract_path = os.path.join(
                cfg.get('OAIHARVESTER_STORAGEDIR',
                        cfg.get('CFG_TMPSHAREDDIR')), str(eng.uuid))
            tarball = get_tarball_from_arxiv(
                obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
                extract_path)
            if tarball is None:
                obj.log.error("No tarball found")
                return
        else:
            tarball = obj.extra_data["_result"]["tarball"]

        # FIXME
        tarball = str(tarball)
        sub_dir, dummy = get_defaults(tarball, cfg['CFG_TMPDIR'], "")

        try:
            untar(tarball, sub_dir)
            obj.log.info("Extracted tarball to: {0}".format(sub_dir))
        except Timeout:
            eng.log.error('Timeout during tarball extraction on %s' %
                          (obj.extra_data["_result"]["tarball"]))

        xml_files_list = find_matching_files(sub_dir, ["xml"])

        obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

        authors = ""

        for xml_file in xml_files_list:
            xml_file_fd = open(xml_file, "r")
            xml_content = xml_file_fd.read()
            xml_file_fd.close()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info("Found a match for author extraction")
                authors = convert(xml_content, stylesheet)
                authorlist_record = create_records(authors)
                if len(authorlist_record) == 1:
                    if authorlist_record[0][0] is None:
                        eng.log.error(
                            "Error parsing authorlist record for id: %s" %
                            (identifiers, ))
                    authorlist_record = authorlist_record[0][0]

                author_xml = record_xml_output(authorlist_record)
                if author_xml:
                    updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
                                  + record_xml_output(authorlist_record) + '</collection>'
                    new_dict_representation = convert_marcxml_to_bibfield(
                        updated_xml)
                    obj.data["authors"] = new_dict_representation["authors"]
                    obj.update_task_results(
                        "authors",
                        [{
                            "name": "authors",
                            "results": new_dict_representation["authors"]
                        }])
                    obj.update_task_results("number_of_authors", [{
                        "name":
                        "number_of_authors",
                        "results":
                        new_dict_representation["number_of_authors"]
                    }])
                    break

コード例 #27

0

ファイルを表示

ファイル: updater.py プロジェクト: mhellmic/b2share

def oairepositoryupdater_task():
    """Main business logic code of oai_archive"""
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        print_repository_status(verbose=report)
        return True

    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot), verbose=2)

    task_update_progress("Fetching records to process")

    recids_with_oaiid = search_unit_in_bibxxx(p='*', f=CFG_OAI_ID_FIELD, type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid), verbose=2)

    all_current_recids = search_unit_in_bibxxx(p='*', f=CFG_OAI_SET_FIELD, type='e')
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported" % (len(all_current_recids)), verbose=2)

    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    for set_spec in all_set_specs():
        if not set_spec:
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec, f=CFG_OAI_SET_FIELD, type='e')
        write_message("%s recids should be in %s. Currently %s are in %s" % (len(should_recids), set_spec, len(current_recids), set_spec), verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s" % (len(to_add), set_spec), verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s" % (len(to_remove), set_spec), verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should be hence updated for %s" % (len(affected_recids), set_spec), verbose=2)
        all_affected_recids |= affected_recids

    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported" % len(no_more_exported_recids))

    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should updated" % (len(all_affected_recids)), verbose=2)

    if not all_affected_recids:
        write_message("Nothing to do!")
        return True

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                                  prefix='oairepository_' + \
                                  time.strftime("%Y%m%d_%H%M%S_",
                                                time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")

    tot = 0
    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records." % \
                             (i, len(all_affected_recids)))

        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid, verbose=3)
            continue
        new_record = {}

        # Check if an OAI identifier is already in the record or
        # not.
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record, tag=CFG_OAI_ID_FIELD[:3], ind1=CFG_OAI_ID_FIELD[3], ind2=CFG_OAI_ID_FIELD[4], code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s" % (oai_id_entry, recid), verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s" % (oai_id_entry, recid), verbose=3)

        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(record_get_field_values(record, tag=CFG_OAI_SET_FIELD[:3], ind1=CFG_OAI_SET_FIELD[3], ind2=CFG_OAI_SET_FIELD[4], code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s" % (recid, ", ".join(current_oai_sets)), verbose=3)

        current_previous_oai_sets = set(record_get_field_values(record, tag=CFG_OAI_PREVIOUS_SET_FIELD[:3], ind1=CFG_OAI_PREVIOUS_SET_FIELD[3], ind2=CFG_OAI_PREVIOUS_SET_FIELD[4], code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message("Record %s currently doesn't belong anymore to these oai_sets: %s" % (recid, ", ".join(current_previous_oai_sets)), verbose=3)

        # Get the sets that should be in this record according to
        # settings
        updated_oai_sets = set(_set for _set, _recids in iteritems(recids_for_set)
             if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s" % (recid, ", ".join(updated_oai_sets)), verbose=3)

        updated_previous_oai_sets = set(_set for _set in (current_previous_oai_sets - updated_oai_sets) |
             (current_oai_sets - updated_oai_sets))
        write_message("Record %s now doesn't belong anymore to these oai_sets: %s" % (recid, ", ".join(updated_previous_oai_sets)), verbose=3)

        # Ok, we have the old sets and the new sets. If they are equal
        # and oai ID does not need to be added, then great, nothing to
        # change . Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!" % recid, verbose=3)
            continue # Jump to next recid

        write_message("Something has changed for record %s, let's update it!" % recid, verbose=3)
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))

        record_add_field(new_record, tag="001", controlfield_value=str(recid))
        record_add_field(new_record, tag=CFG_OAI_ID_FIELD[:3], ind1=CFG_OAI_ID_FIELD[3], ind2=CFG_OAI_ID_FIELD[4], subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                if task_get_option("notimechange"):
                    task_low_level_submission('bibupload', 'oairepository', '-c', filename, '-n')
                else:
                    task_low_level_submission('bibupload', 'oairepository', '-c', filename)
            # Prepare to save results in a tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                                        prefix='oairepository_' + \
                                        time.strftime("%Y%m%d_%H%M%S_",
                                                        time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)

    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)

    if tot > 0:
        if not no_upload:
            task_sleep_now_if_required(can_stop_too=True)
            if task_get_option("notimechange"):
                task_low_level_submission('bibupload', 'oairepository', '-c', filename, '-n')
            else:
                task_low_level_submission('bibupload', 'oairepository', '-c', filename)
    else:
        os.remove(filename)

    return True

コード例 #28

0

ファイルを表示

ファイル: engine.py プロジェクト: mhellmic/b2share

def perform_request_record(requestType, uid, data):
    """Handle 'major' record related requests.
    Handle retrieving, submitting or cancelling the merging session.
    """
    #TODO add checks before submission and cancel, replace get_bibrecord call
    result = {
        'resultCode': 0,
        'resultText': ''
        }
    recid1 = data["recID1"]
    record1 = _get_record(recid1, uid, result)
    if result['resultCode'] != 0: #if record not accessible return error information
        return result

    if requestType == 'submit':
        if 'duplicate' in data:
            recid2 = data['duplicate']
            record2 = _get_record_slave(recid2, result, 'recid', uid)
            if result['resultCode'] != 0: #return in case of error
                return result
            # mark record2 as deleted
            record_add_field(record2, '980', ' ', ' ', '', [('c', 'DELETED')])
            # mark record2 as duplicate of record1
            record_add_field(record2, '970', ' ', ' ', '', [('d', str(recid1))])

            # submit record2 to be deleted
            xml_record2 = record_xml_output(record2)
            save_xml_record(recid2, uid, xml_record2)

            #submit record1
            xml_record1 = record_xml_output(record1)
            save_xml_record(recid1, uid, xml_record1)

            result['resultText'] = 'Records submitted'
            return result

        #submit record1 from cache
        save_xml_record(recid1, uid)

        # Delete cache file if it exists
        if cache_exists(recid1, uid):
            delete_cache_file(recid1, uid)

        result['resultText'] = 'Record submitted'
        return result

    elif requestType == 'cancel':
        delete_cache_file(recid1, uid)
        result['resultText'] = 'Cancelled'
        return result

    recid2 = data["recID2"]
    mode = data['record2Mode']
    record2 = _get_record_slave(recid2, result, mode, uid)
    if result['resultCode'] != 0: #if record not accessible return error information
        return result

    if requestType == 'getRecordCompare':
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(record1, record2)
        result['resultText'] = 'Records compared'

    elif requestType == 'recCopy':
        copy_R2_to_R1(record1, record2)
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(record1, record2)
        result['resultText'] = 'Record copied'

    elif requestType == 'recMerge':
        merge_record(record1, record2, merge_conflicting_fields=True)
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(record1, record2)
        result['resultText'] = 'Records merged'

    elif requestType == 'recMergeNC':
        merge_record(record1, record2, merge_conflicting_fields=False)
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(record1, record2)
        result['resultText'] = 'Records merged'

    else:
        result['resultCode'], result['resultText'] = 1, 'Wrong request type'

    return result

コード例 #29

0

ファイルを表示

ファイル: engine.py プロジェクト: derekstrom/invenio

def format_with_format_template(format_template_filename, bfo,
                                verbose=0, format_template_code=None, qid="",
                                extra_context=None):
    """ Format a record given a
    format template.

    Returns a formatted version of the record represented by bfo,
    in the language specified in bfo, and with the specified format template.

    If format_template_code is provided, the template will not be loaded from
    format_template_filename (but format_template_filename will still be used to
    determine if bft or xsl transformation applies). This allows to preview format
    code without having to save file on disk.

    :param format_template_filename: the dilename of a format template
    :param bfo: the object containing parameters for the current formatting
    :param format_template_code: if not empty, use code as template instead of reading format_template_filename (used for previews)
    :param verbose: the level of verbosity from 0 to 9 (O: silent,
                                                       5: errors,
                                                       7: errors and warnings,
                                                       9: errors and warnings, stop if error (debug mode ))
    @return: formatted text
    """
    if format_template_code is not None:
        format_content = str(format_template_code)
    elif not format_template_filename.endswith("." + CFG_BIBFORMAT_FORMAT_JINJA_TEMPLATE_EXTENSION):
        format_content = get_format_template(format_template_filename)['code']

    if format_template_filename.endswith("." + CFG_BIBFORMAT_FORMAT_JINJA_TEMPLATE_EXTENSION):
        evaluated_format = '<!-- empty -->'
        #try:
        from functools import wraps
        from invenio.modules.records.api import \
            create_record as new_create_record, \
            get_record as new_get_record
        from flask_login import current_user
        from invenio.base.helpers import unicodifier

        def _format_record(recid, of='hb', user_info=current_user, *args, **kwargs):
            from invenio.modules.formatter import format_record
            return format_record(recid, of, user_info=user_info, *args, **kwargs)

        # Fixes unicode problems in Jinja2 templates.
        def encode_utf8(f):
            @wraps(f)
            def wrapper(*args, **kwds):
                return unicodifier(f(*args, **kwds))
            return wrapper

        if bfo.xml_record is None:
            record = new_get_record(bfo.recID)
        else:
            record = new_create_record(bfo.xml_record, master_format='marc')
            bfo.recID = bfo.recID if bfo.recID else 0
        record.__getitem__ = encode_utf8(record.__getitem__)
        record.get = encode_utf8(record.get)

        evaluated_format = render_template_to_string(
            'format/record/'+format_template_filename,
            recid=bfo.recID,
            record=record,
            format_record=_format_record,
            qid=qid,
            bfo=bfo, **(extra_context or {})).encode('utf-8')
        needs_2nd_pass = False
    else:
        from invenio.modules.records.api import get_record as new_get_record

        #.xsl
        if bfo.xml_record:
            # bfo was initialized with a custom MARCXML
            xml_record = '<?xml version="1.0" encoding="UTF-8"?>\n' + \
                         record_xml_output(bfo.record)
        else:
            # Fetch MARCXML. On-the-fly xm if we are now formatting in xm
            xml_record = '<?xml version="1.0" encoding="UTF-8"?>\n' + \
                         new_get_record(bfo.recID).legacy_export_as_marc()

        # Transform MARCXML using stylesheet
        evaluated_format = xslt.format(xml_record, template_source=format_content).decode('utf-8')
        needs_2nd_pass = False

    return evaluated_format, needs_2nd_pass

コード例 #30

0

ファイルを表示

ファイル: Link_Records.py プロジェクト: chokribr/invenio-1

def _prepare_marcxml(recid_a,
                     rn_a,
                     recids_and_rns_b,
                     what_is_a_for_b,
                     what_is_b_for_a,
                     display_in_a=True,
                     display_in_b=True,
                     marc_for_a=None,
                     marc_for_b=None,
                     upload_mode='append',
                     consider_empty_p=False):
    output = '<collection>'
    record_a = {}
    record_b = {}
    if what_is_b_for_a is not None:
        marc_tag_for_a, marc_ind1_for_a, marc_ind2_for_a = \
          _prepare_marc(marc_for_a, CFG_OTHER_RELATIONSHIP_ENTRY, display_in_a and "0" or "1")
        record_add_field(record_a, "001", controlfield_value=str(recid_a))
        if upload_mode == 'correct' and not recids_and_rns_b and consider_empty_p:
            # Add empty field in order to account for cases where all
            # linkings are removed by the submitter
            record_add_field(record_a,
                             marc_tag_for_a,
                             ind1=marc_ind1_for_a,
                             ind2=marc_ind2_for_a)
        for recid_b, rn_b in recids_and_rns_b:
            record_add_field(record_a,
                             marc_tag_for_a,
                             ind1=marc_ind1_for_a,
                             ind2=marc_ind2_for_a,
                             subfields=[('i', what_is_b_for_a), ('r', rn_b),
                                        ('w', str(recid_b))])
        output += record_xml_output(record_a)

    if what_is_a_for_b is not None:
        marc_tag_for_b, marc_ind1_for_b, marc_ind2_for_b = \
          _prepare_marc(marc_for_b, CFG_OTHER_RELATIONSHIP_ENTRY, display_in_b and "0" or "1")
        for recid_b, rn_b in recids_and_rns_b:
            record_b = {}
            record_add_field(record_b, "001", controlfield_value=str(recid_b))
            if upload_mode == 'correct':
                original_linking_fields = _get_record_linking_fields(
                    recid_b, recid_a, marc_tag_for_b, marc_ind1_for_b,
                    marc_ind2_for_b)
                record_add_fields(record_b, marc_tag_for_b,
                                  original_linking_fields)
            record_add_field(record_b,
                             marc_tag_for_b,
                             ind1=marc_ind1_for_b,
                             ind2=marc_ind2_for_b,
                             subfields=[('i', what_is_a_for_b), ('r', rn_a),
                                        ('w', str(recid_a))])
            output += record_xml_output(record_b)
        # Remove linking in remote records where adequate
        if consider_empty_p:
            unlinked_recids = get_unlinked_records(recid_a, marc_for_b,
                                                   display_in_b, upload_mode,
                                                   recids_and_rns_b)
            for recid_b in unlinked_recids:
                record_b = {}
                record_add_field(record_b,
                                 "001",
                                 controlfield_value=str(recid_b))
                original_linking_fields = _get_record_linking_fields(
                    recid_b, recid_a, marc_tag_for_b, marc_ind1_for_b,
                    marc_ind2_for_b)
                if not original_linking_fields:
                    # Add empty field in order to account for cases where all
                    # linkings are removed by the submitter
                    record_add_field(record_b,
                                     marc_tag_for_b,
                                     ind1=marc_ind1_for_b,
                                     ind2=marc_ind2_for_b)
                record_add_fields(record_b, marc_tag_for_b,
                                  original_linking_fields)
                output += record_xml_output(record_b)
    output += '</collection>'
    return output

コード例 #31

0

ファイルを表示

ファイル: engine.py プロジェクト: SCOAP3/invenio

def perform_request_record(requestType, uid, data):
    """Handle 'major' record related requests.
    Handle retrieving, submitting or cancelling the merging session.
    """
    #TODO add checks before submission and cancel, replace get_bibrecord call
    result = {
        'resultCode': 0,
        'resultText': ''
        }
    recid1 = data["recID1"]
    record1 = _get_record(recid1, uid, result)
    if result['resultCode'] != 0: #if record not accessible return error information
        return result

    if requestType == 'submit':
        if 'duplicate' in data:
            recid2 = data['duplicate']
            record2 = _get_record_slave(recid2, result, 'recid', uid)
            if result['resultCode'] != 0: #return in case of error
                return result
            (errcode, message) = check_doi_status_after_merge(data["recID1"], data['duplicate'],
                                                              record1, record2,
                                                              record2_marked_as_duplicate_p=data.has_key('duplicate'),
                                                              submit_confirmed_p=data.get('additional_data', {'confirmed_submit': False}).get('confirmed_submit', False))
            if errcode:
                result['resultCode'] = errcode
                result['resultText'] = message
                return result

            # mark record2 as deleted
            record_add_field(record2, '980', ' ', ' ', '', [('c', 'DELETED')])
            # mark record2 as duplicate of record1
            record_add_field(record2, '970', ' ', ' ', '', [('d', str(recid1))])
            # add recid of deleted record to master record
            record_add_field(record1, '981', ' ', ' ', '', [('a', str(recid2))])

            # To ensure updates happen in order, use a seq id
            sequence_id = str(random.randrange(1, 4294967296))

            # submit record2 to be deleted
            xml_record2 = record_xml_output(record2)
            save_xml_record(recid2, uid, xml_record2, task_name="bibmerge",
                            sequence_id=sequence_id)

            # submit record1
            xml_record1 = record_xml_output(record1)
            save_xml_record(recid1, uid, xml_record1, task_name="bibmerge",
                            sequence_id=sequence_id)

            # Delete cache file if it exists
            if cache_exists(recid1, uid):
                delete_cache(recid1, uid)

            result['resultText'] = 'Records submitted'
            return result

        (errcode, message) = check_doi_status_after_merge(data["recID1"], data["recID2"],
                                                          record1, None,
                                                          submit_confirmed_p=data.get('additional_data', {'confirmed_submit': False}).get('confirmed_submit', False))
        if errcode:
            result['resultCode'] = errcode
            result['resultText'] = message
            return result

        #submit record1 from cache
        save_xml_record(recid1, uid, task_name="bibmerge")

        # Delete cache file if it exists
        if cache_exists(recid1, uid):
            delete_cache(recid1, uid)

        result['resultText'] = 'Record submitted'
        return result

    elif requestType == 'cancel':
        delete_cache(recid1, uid)
        result['resultText'] = 'Cancelled'
        return result

    recid2 = data["recID2"]
    mode = data['record2Mode']
    record2 = _get_record_slave(recid2, result, mode, uid)
    if result['resultCode'] != 0: #if record not accessible return error information
        return result

    if requestType == 'getRecordCompare':
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(record1, record2)
        result['resultText'] = 'Records compared'

    elif requestType == 'recCopy':
        copy_R2_to_R1(record1, record2)
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(record1, record2)
        result['resultText'] = 'Record copied'

    elif requestType == 'recMerge':
        merge_record(record1, record2, merge_conflicting_fields=True)
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(record1, record2)
        result['resultText'] = 'Records merged'

    elif requestType == 'recMergeNC':
        merge_record(record1, record2, merge_conflicting_fields=False)
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(record1, record2)
        result['resultText'] = 'Records merged'

    else:
        result['resultCode'], result['resultText'] = 1, 'Wrong request type'

    return result

コード例 #32

0

ファイルを表示

def oairepositoryupdater_task():
    """Main business logic code of oai_archive"""
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        print_repository_status(verbose=report)
        return True

    if run_sql(
            "SELECT id FROM schTASK WHERE proc='bibupload:oairepository' AND status='WAITING'"
    ):
        write_message(
            "Previous requests of oairepository still being elaborated. Let's skip this execution."
        )
        return True

    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot),
                  verbose=2)

    task_update_progress("Fetching records to process")

    recids_with_oaiid = search_unit_in_bibxxx(p='*',
                                              f=CFG_OAI_ID_FIELD,
                                              type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid),
                  verbose=2)

    all_current_recids = search_unit_in_bibxxx(p='*',
                                               f=CFG_OAI_SET_FIELD,
                                               type='e')
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported" %
                  (len(all_current_recids)),
                  verbose=2)

    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    for set_spec in all_set_specs():
        if not set_spec:
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec,
                                               f=CFG_OAI_SET_FIELD,
                                               type='e')
        write_message(
            "%s recids should be in %s. Currently %s are in %s" %
            (len(should_recids), set_spec, len(current_recids), set_spec),
            verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s" %
                      (len(to_add), set_spec),
                      verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s" %
                      (len(to_remove), set_spec),
                      verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should be hence updated for %s" %
                      (len(affected_recids), set_spec),
                      verbose=2)
        all_affected_recids |= affected_recids

    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported" %
                  len(no_more_exported_recids))

    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should updated" % (len(all_affected_recids)),
                  verbose=2)

    if not all_affected_recids:
        write_message("Nothing to do!")
        return True

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR,
                                  prefix='oairepository_' + \
                                  time.strftime("%Y%m%d_%H%M%S_",
                                                time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")

    tot = 0
    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records." % \
                             (i, len(all_affected_recids)))

        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid,
                          verbose=3)
            continue
        new_record = {}

        # Check if an OAI identifier is already in the record or
        # not.
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record,
                                              tag=CFG_OAI_ID_FIELD[:3],
                                              ind1=CFG_OAI_ID_FIELD[3],
                                              ind2=CFG_OAI_ID_FIELD[4],
                                              code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s" %
                          (oai_id_entry, recid),
                          verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s" %
                          (oai_id_entry, recid),
                          verbose=3)

        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_SET_FIELD[:3],
                                    ind1=CFG_OAI_SET_FIELD[3],
                                    ind2=CFG_OAI_SET_FIELD[4],
                                    code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s" %
                      (recid, ", ".join(current_oai_sets)),
                      verbose=3)

        current_previous_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_PREVIOUS_SET_FIELD[:3],
                                    ind1=CFG_OAI_PREVIOUS_SET_FIELD[3],
                                    ind2=CFG_OAI_PREVIOUS_SET_FIELD[4],
                                    code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message(
            "Record %s currently doesn't belong anymore to these oai_sets: %s"
            % (recid, ", ".join(current_previous_oai_sets)),
            verbose=3)

        # Get the sets that should be in this record according to
        # settings
        updated_oai_sets = set(_set
                               for _set, _recids in iteritems(recids_for_set)
                               if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s" %
                      (recid, ", ".join(updated_oai_sets)),
                      verbose=3)

        updated_previous_oai_sets = set(
            _set for _set in (current_previous_oai_sets - updated_oai_sets)
            | (current_oai_sets - updated_oai_sets))
        write_message(
            "Record %s now doesn't belong anymore to these oai_sets: %s" %
            (recid, ", ".join(updated_previous_oai_sets)),
            verbose=3)

        # Ok, we have the old sets and the new sets. If they are equal
        # and oai ID does not need to be added, then great, nothing to
        # change . Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!" %
                          recid,
                          verbose=3)
            continue  # Jump to next recid

        write_message("Something has changed for record %s, let's update it!" %
                      recid,
                      verbose=3)
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))

        record_add_field(new_record, tag="001", controlfield_value=str(recid))
        record_add_field(new_record,
                         tag=CFG_OAI_ID_FIELD[:3],
                         ind1=CFG_OAI_ID_FIELD[3],
                         ind2=CFG_OAI_ID_FIELD[4],
                         subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                if task_get_option("notimechange"):
                    task_low_level_submission('bibupload', 'oairepository',
                                              '-c', filename, '-n',
                                              '-Noairepository', '-P', '-1')
                else:
                    task_low_level_submission('bibupload', 'oairepository',
                                              '-c', filename,
                                              '-Noairepository', '-P', '-1')
            # Prepare to save results in a tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR,
                                        prefix='oairepository_' + \
                                        time.strftime("%Y%m%d_%H%M%S_",
                                                        time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)

    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)

    if tot > 0:
        if not no_upload:
            task_sleep_now_if_required(can_stop_too=True)
            if task_get_option("notimechange"):
                task_low_level_submission('bibupload', 'oairepository', '-c',
                                          filename, '-n')
            else:
                task_low_level_submission('bibupload', 'oairepository', '-c',
                                          filename)
    else:
        os.remove(filename)

    return True

コード例 #33

0

ファイルを表示

ファイル: engine.py プロジェクト: chokribr/invenio-1

def perform_request_record(requestType, uid, data):
    """Handle 'major' record related requests.
    Handle retrieving, submitting or cancelling the merging session.
    """
    #TODO add checks before submission and cancel, replace get_bibrecord call
    result = {'resultCode': 0, 'resultText': ''}
    recid1 = data["recID1"]
    record1 = _get_record(recid1, uid, result)
    if result[
            'resultCode'] != 0:  #if record not accessible return error information
        return result

    if requestType == 'submit':
        if 'duplicate' in data:
            recid2 = data['duplicate']
            record2 = _get_record_slave(recid2, result, 'recid', uid)
            if result['resultCode'] != 0:  #return in case of error
                return result
            (errcode, message) = check_doi_status_after_merge(
                data["recID1"],
                data['duplicate'],
                record1,
                record2,
                record2_marked_as_duplicate_p=data.has_key('duplicate'),
                submit_confirmed_p=data.get('additional_data', {
                    'confirmed_submit': False
                }).get('confirmed_submit', False))
            if errcode:
                result['resultCode'] = errcode
                result['resultText'] = message
                return result

            # mark record2 as deleted
            record_add_field(record2, '980', ' ', ' ', '', [('c', 'DELETED')])
            # mark record2 as duplicate of record1
            record_add_field(record2, '970', ' ', ' ', '',
                             [('d', str(recid1))])
            # add recid of deleted record to master record
            record_add_field(record1, '981', ' ', ' ', '',
                             [('a', str(recid2))])

            # To ensure updates happen in order, use a seq id
            sequence_id = str(random.randrange(1, 4294967296))

            # submit record2 to be deleted
            xml_record2 = record_xml_output(record2)
            save_xml_record(recid2,
                            uid,
                            xml_record2,
                            task_name="bibmerge",
                            sequence_id=sequence_id)

            # submit record1
            xml_record1 = record_xml_output(record1)
            save_xml_record(recid1,
                            uid,
                            xml_record1,
                            task_name="bibmerge",
                            sequence_id=sequence_id)

            # Delete cache file if it exists
            if cache_exists(recid1, uid):
                delete_cache(recid1, uid)

            result['resultText'] = 'Records submitted'
            return result

        (errcode, message) = check_doi_status_after_merge(
            data["recID1"],
            data["recID2"],
            record1,
            None,
            submit_confirmed_p=data.get('additional_data', {
                'confirmed_submit': False
            }).get('confirmed_submit', False))
        if errcode:
            result['resultCode'] = errcode
            result['resultText'] = message
            return result

        #submit record1 from cache
        save_xml_record(recid1, uid, task_name="bibmerge")

        # Delete cache file if it exists
        if cache_exists(recid1, uid):
            delete_cache(recid1, uid)

        result['resultText'] = 'Record submitted'
        return result

    elif requestType == 'cancel':
        delete_cache(recid1, uid)
        result['resultText'] = 'Cancelled'
        return result

    recid2 = data["recID2"]
    mode = data['record2Mode']
    record2 = _get_record_slave(recid2, result, mode, uid)
    if result[
            'resultCode'] != 0:  #if record not accessible return error information
        return result

    if requestType == 'getRecordCompare':
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(
            record1, record2)
        result['resultText'] = 'Records compared'

    elif requestType == 'recCopy':
        copy_R2_to_R1(record1, record2)
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(
            record1, record2)
        result['resultText'] = 'Record copied'

    elif requestType == 'recMerge':
        merge_record(record1, record2, merge_conflicting_fields=True)
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(
            record1, record2)
        result['resultText'] = 'Records merged'

    elif requestType == 'recMergeNC':
        merge_record(record1, record2, merge_conflicting_fields=False)
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(
            record1, record2)
        result['resultText'] = 'Records merged'

    else:
        result['resultCode'], result['resultText'] = 1, 'Wrong request type'

    return result

コード例 #34

0

ファイルを表示

def tweet_to_record(tweet, query):
    """
    Transform a tweet into a record.
    @note: you may want to highly customize this.
    """
    rec = {}
    ## Let's normalize the body of the tweet.
    text = tweet.text.encode('UTF-8')
    text = text.replace('&gt;', '>')
    text = text.replace('&lt;', '<')
    text = text.replace('&quot;', "'")
    text = text.replace('&amp;', '&')

    ## Let's add the creation date
    try:
        creation_date = time.strptime(tweet.created_at,
                                      '%a, %d %b %Y %H:%M:%S +0000')
    except ValueError:
        creation_date = time.strptime(tweet.created_at,
                                      '%a %b %d %H:%M:%S +0000 %Y')
    record_add_field(rec, '260__c',
                     time.strftime('%Y-%m-%dZ%H:%M:%ST', creation_date))

    ## Let's add the Tweet ID
    record_add_field(rec, '970', subfields=[('a', str(tweet.id))])

    ## Let's add the body of the tweet as an abstract
    record_add_field(rec, '520', subfields=[('a', text)])

    ## Let's re-add the body of the tweet as a title.
    record_add_field(rec, '245', subfields=[('a', text)])

    ## Let's fetch information about the user
    try:
        user = _TWITTER_API.GetUser(tweet.from_user)

        ## Let's add the user name as author of the tweet
        record_add_field(rec,
                         '100',
                         subfields=[('a', str(user.name.encode('UTF-8')))])

        ## Let's fetch the icon of the user profile, and let's upload it as
        ## an image (and an icon of itself)
        record_add_field(rec,
                         'FFT',
                         subfields=[
                             ('a', user.profile.image_url.encode('UTF-8')),
                             ('x', user.profile.image_url.encode('UTF-8'))
                         ])
    except Exception as err:
        write_message("WARNING: issue when fetching the user: %s" % err,
                      stream=sys.stderr)
    if hasattr(tweet, 'iso_language_code'):
        ## Let's add the language of the Tweet if available (also this depends)
        ## on the kind of Twitter API call we used
        record_add_field(rec,
                         '045',
                         subfields=[('a',
                                     tweet.iso_language_code.encode('UTF-8'))])

    ## Let's tag this record as a TWEET so that later we can build a collection
    ## out of these records.
    record_add_field(rec, '980', subfields=[('a', 'TWEET'), ('b', query)])

    ## Some smart manipulations: let's parse out URLs and tags from the body
    ## of the Tweet.
    for url in _RE_GET_HTTP.findall(text):
        url = url[0]
        record_add_field(rec, '856', '4', subfields=[('u', url)])

    for tag in _RE_TAGS.findall(text):
        ## And here we add the keywords.
        record_add_field(rec,
                         '653',
                         '1',
                         subfields=[('a', tag), ('9', 'TWITTER')])

    ## Finally we shall serialize everything to MARCXML
    return record_xml_output(rec)

コード例 #35

0

ファイルを表示

ファイル: marcxml_tasks.py プロジェクト: mhellmic/b2share

def author_list(obj, eng):
    """
    Performs the special authorlist extraction step (Mostly INSPIRE/CERN related).

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.oaiharvest.utils import (translate_fieldvalues_from_latex,
                                                 find_matching_files)
    from invenio.legacy.bibrecord import create_records, record_xml_output
    from invenio.legacy.bibconvert.xslt_engine import convert
    from invenio.utils.plotextractor.cli import get_defaults

    identifiers = obj.data["system_number_external"]["value"]
    bibtask.task_sleep_now_if_required()
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}
    if "tarball" not in obj.extra_data["_result"]:
        extract_path = plotextractor_getter.make_single_directory(cfg['CFG_TMPSHAREDDIR'], eng.uuid)
        tarball, pdf = plotextractor_getter.harvest_single(obj.data["system_number_external"]["value"], extract_path, ["tarball"])
        tarball = str(tarball)
        if tarball is None:
            raise workflows_error.WorkflowError(str("Error harvesting tarball from id: %s %s" % (identifiers, extract_path)),
                                                eng.uuid,
                                                id_object=obj.id)
        obj.extra_data["_result"]["tarball"] = tarball

    sub_dir, dummy = get_defaults(obj.extra_data["_result"]["tarball"], cfg['CFG_TMPDIR'], "")

    try:
        untar(obj.extra_data["_result"]["tarball"], sub_dir)
    except Timeout:
        eng.log.error('Timeout during tarball extraction on %s' % (obj.extra_data["_result"]["tarball"]))

    xml_files_list = find_matching_files(sub_dir, ["xml"])

    authors = ""

    for xml_file in xml_files_list:
        xml_file_fd = open(xml_file, "r")
        xml_content = xml_file_fd.read()
        xml_file_fd.close()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if not match == []:
            authors += match[0]
            # Generate file to store conversion results
    if authors is not '':
        authors = convert(authors, "authorlist2marcxml.xsl")
        authorlist_record = create_records(authors)
        if len(authorlist_record) == 1:
            if authorlist_record[0][0] is None:
                eng.log.error("Error parsing authorlist record for id: %s" % (identifiers,))
            authorlist_record = authorlist_record[0][0]
            # Convert any LaTeX symbols in authornames
        translate_fieldvalues_from_latex(authorlist_record, '100', code='a')
        translate_fieldvalues_from_latex(authorlist_record, '700', code='a')

        updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' + record_xml_output(authorlist_record) \
                      + '</collection>'
        if not None == updated_xml:
            # We store the path to the directory  the tarball contents live
            # Read and grab MARCXML from plotextractor run
            new_dict_representation = records_api.create_record(updated_xml, master_format="marc").dumps()
            obj.data['authors'] = new_dict_representation["authors"]
            obj.data['number_of_authors'] = new_dict_representation["number_of_authors"]
            obj.add_task_result("authors", new_dict_representation["authors"])
            obj.add_task_result("number_of_authors", new_dict_representation["number_of_authors"])

コード例 #36

0

ファイルを表示

ファイル: bst_twitter_fetcher.py プロジェクト: mhellmic/b2share

def tweet_to_record(tweet, query):
    """
    Transform a tweet into a record.
    @note: you may want to highly customize this.
    """
    rec = {}
    ## Let's normalize the body of the tweet.
    text = tweet.text.encode('UTF-8')
    text = text.replace('&gt;', '>')
    text = text.replace('&lt;', '<')
    text = text.replace('&quot;', "'")
    text = text.replace('&amp;', '&')

    ## Let's add the creation date
    try:
        creation_date = time.strptime(tweet.created_at, '%a, %d %b %Y %H:%M:%S +0000')
    except ValueError:
        creation_date = time.strptime(tweet.created_at, '%a %b %d %H:%M:%S +0000 %Y')
    record_add_field(rec, '260__c', time.strftime('%Y-%m-%dZ%H:%M:%ST', creation_date))

    ## Let's add the Tweet ID
    record_add_field(rec, '970', subfields=[('a', str(tweet.id))])

    ## Let's add the body of the tweet as an abstract
    record_add_field(rec, '520', subfields=[('a', text)])

    ## Let's re-add the body of the tweet as a title.
    record_add_field(rec, '245', subfields=[('a', text)])

    ## Let's fetch information about the user
    try:
        user = _TWITTER_API.GetUser(tweet.from_user)

        ## Let's add the user name as author of the tweet
        record_add_field(rec, '100', subfields=[('a', str(user.name.encode('UTF-8')))])

        ## Let's fetch the icon of the user profile, and let's upload it as
        ## an image (and an icon of itself)
        record_add_field(rec, 'FFT', subfields=[('a', user.profile.image_url.encode('UTF-8')), ('x', user.profile.image_url.encode('UTF-8'))])
    except Exception as err:
        write_message("WARNING: issue when fetching the user: %s" % err, stream=sys.stderr)
    if hasattr(tweet, 'iso_language_code'):
            ## Let's add the language of the Tweet if available (also this depends)
        ## on the kind of Twitter API call we used
        record_add_field(rec, '045', subfields=[('a', tweet.iso_language_code.encode('UTF-8'))])

    ## Let's tag this record as a TWEET so that later we can build a collection
    ## out of these records.
    record_add_field(rec, '980', subfields=[('a', 'TWEET'), ('b', query)])

    ## Some smart manipulations: let's parse out URLs and tags from the body
    ## of the Tweet.
    for url in _RE_GET_HTTP.findall(text):
        url = url[0]
        record_add_field(rec, '856', '4', subfields=[('u', url)])

    for tag in _RE_TAGS.findall(text):
        ## And here we add the keywords.
        record_add_field(rec, '653', '1', subfields=[('a', tag), ('9', 'TWITTER')])

    ## Finally we shall serialize everything to MARCXML
    return record_xml_output(rec)