def _record_in_files_p(recid, filenames):
    """Search XML files for given record.

    @param recid: record ID to look for
    @param filenames: iterable of MARCXML file paths to scan
    @return: True as soon as one file contains a record whose 001,
        OAI id or system number matches; False otherwise
    """
    # Get id tags of record in question; -1 means "tag not present".
    rec_oaiid = rec_sysno = -1
    rec_oaiid_tag = get_fieldvalues(recid, OAIID_TAG)
    if rec_oaiid_tag:
        rec_oaiid = rec_oaiid_tag[0]
    rec_sysno_tag = get_fieldvalues(recid, SYSNO_TAG)
    if rec_sysno_tag:
        rec_sysno = rec_sysno_tag[0]

    # For each record in each file, compare ids and abort if match is found
    for filename in filenames:
        try:
            # 'with' guarantees the handle is closed even on a match;
            # the original leaked it when returning True early.
            with open(filename) as file_:
                records = create_records(file_.read(), 0, 0)
        except IOError:
            # Unreadable file: skip it and keep scanning the rest.
            continue
        for record, all_good in (rec[:2] for rec in records):
            if record and all_good:
                if _record_has_id_p(record, recid, rec_oaiid, rec_sysno):
                    return True
    return False
def _create_marc(records_xml):
    """Creates MARC from MARCXML.

    @param records_xml: MARCXML containing information about the records

    @return: string containing information about the records
    in ALEPH MARC format, wrapped in a <pre> element
    """
    aleph_marc_output = "<pre>"

    records = bibrecord.create_records(records_xml)
    for (record, status_code, list_of_errors) in records:
        # The system number is in field 970a
        # By this reason it should exist in the MARC XML
        # otherwise it will be None in the output ALEPH marc
        sysno_options = {"text-marc": 0}
        sysno = xmlmarc2textmarclib.get_sysno_from_record(record,
                                                          sysno_options)

        # Identity comparison with None (PEP 8) instead of '== None'.
        if sysno is None:
            sysno = ""

        options = {"aleph-marc": 1, "correct-mode": 1, "append-mode": 0,
                   "delete-mode": 0, "insert-mode": 0, "replace-mode": 0,
                   "text-marc": 0}
        aleph_record = xmlmarc2textmarclib.create_marc_record(record,
                                                              sysno,
                                                              options)
        aleph_marc_output += aleph_record

    aleph_marc_output += "</pre>"
    return aleph_marc_output
 def test_restricted_collections_remote(self):
     """bibmatch - check restricted collections remote search"""
     records = create_records(self.recxml5)
     # Jekyll should have access
     dummy1, matchedrecs, dummy2, dummy3 = match_records(
         records,
         qrystrs=[("", "[088__a]")],
         collections=["Theses"],
         server_url="https://invenio-demo.cern.ch",
         user="******",
         password="******",
         verbose=0)
     self.assertEqual(1, len(matchedrecs))
     # Hyde should not have access
     nomatchrecs, dummy1, dummy2, dummy3 = match_records(
         records,
         qrystrs=[("", "[088__a]")],
         collections=["Theses"],
         server_url="https://invenio-demo.cern.ch",
         user="******",
         password="******",
         verbose=0)
     self.assertEqual(1, len(nomatchrecs))
 def test_check_textmarc(self):
     """bibmatch - check textmarc as input"""
     marcxml = transform_input_to_marcxml("", self.textmarc)
     records = create_records(marcxml)
     # Both textmarc records should match locally.
     dummy1, matchedrecs, dummy3, dummy4 = match_records(records, verbose=0)
     self.assertEqual(2, len(matchedrecs))
 def test_check_remote(self):
     """bibmatch - check remote match (Invenio demo site)"""
     records = create_records(self.recxml6)
     # Match against the public Invenio demo installation.
     dummy1, matchedrecs, dummy3, dummy4 = match_records(
         records, server_url="http://invenio-demo.cern.ch", verbose=0)
     self.assertEqual(1, len(matchedrecs))
def main():
    """Filter a MARCXML file down to a chosen set of tags.

    Reads three CLI arguments (comma-separated tag list, input MARCXML
    path, output MARCXML path) and writes a new <collection> keeping
    only the requested tags of each parsed record.
    """
    usage = """ Usage: $ %s [tags_csv] [marcxml_in] [marcxml_out]
  tags_csv      Tags to preserve as CSVs
  marcxml_in    MARCXML file to read from
  marcxml_out   MARCXML file to write""" % (PROGRAM_NAME,)
    if len(argv) == 4:
        tags = argv[1].split(',')
        fin = argv[2]
        fout = argv[3]
    else:
        print(usage)
        return

    with open(fin) as handle:
        records = create_records(handle.read())


    xmlout = ('<?xml version="1.0"?>\n' +
              '<collection xmlns="http://www.loc.gov/MARC21/slim">\n')

    for record, err, reason in records:
        # NOTE(review): `err` is compared to the STRING '0' — bibrecord
        # status codes are usually ints, so this error branch may never
        # fire and broken (None) records could reach record_xml_output;
        # confirm the actual type of `err` before relying on this path.
        if err == '0':
            print('Error: Could not create record\n' + reason)
        else:
            xmlout += record_xml_output(record, tags=tags) + '\n'

    with open(fout, 'w') as handle:
        handle.write(xmlout + '</collection>\n')
    def _filter_records_fields(self, records_xml, output_fields):
        """Leaves in the records only fields that are necessary.
        All the other fields are removed from the records.

        @param records_xml: MARC XML containing all the information about the records
        @param output_fields: list of fields that should remain in the records

        @return: MARC XML with records containing only fields that are
        in output_fields list.
        """
        # Work on a copy: the original appended to the caller's list, so
        # repeated calls kept accumulating extra "001"/"970" entries.
        output_fields = list(output_fields)
        # Add 001/970 to the output fields. 970 is necessary for system number
        # extraction when exporting in aleph marc. When we add more formats,
        # we can add it optionally only when exporting aleph marc.
        output_fields.append("001")
        output_fields.append("970")

        records = bibrecord.create_records(records_xml)
        output_records = []

        for (record, status_code, list_of_errors) in records:
            record = self._filter_fields(record, output_fields)
            # do not return empty records
            if not self._is_record_empty(record):
                output_records.append(record)

        output_xml = bibrecord.print_recs(output_records)

        return output_xml
def _create_marc(records_xml):
    """Convert MARCXML records to text MARC.

    @param records_xml: MARCXML containing information about the records

    @return: string with the text-MARC representation of every record,
    concatenated in input order
    """
    # Fixed conversion options: plain text MARC, correct-mode.
    conversion_options = {"aleph-marc": 0, "correct-mode": 1,
                          "append-mode": 0, "delete-mode": 0,
                          "insert-mode": 0, "replace-mode": 0,
                          "text-marc": 1}

    chunks = []
    for record, dummy_status, dummy_errors in bibrecord.create_records(records_xml):
        # No system number is supplied for text-MARC output.
        chunks.append(xmlmarc2textmarc.create_marc_record(record, "",
                                                          conversion_options))
    return "".join(chunks)
 def test_check_collection(self):
     """bibmatch - check collection"""
     records = create_records(self.recxml4)
     # Wrong collection: the record must not match.
     nomatchrecs = match_records(records, collections=["Books"], verbose=0)[0]
     self.assertEqual(1, len(nomatchrecs))
     # Right collection: the record must match.
     matchedrecs = match_records(records, collections=["Articles"], verbose=0)[1]
     self.assertEqual(1, len(matchedrecs))
 def test_check_textmarc(self):
     """bibmatch - check textmarc as input"""
     marcxml = transform_input_to_marcxml("", self.textmarc)
     records = create_records(marcxml)
     # Remote match; element 1 of the result tuple holds the matches.
     matchedrecs = match_records(
         records, server_url="http://invenio-demo.cern.ch")[1]
     self.assertEqual(2, len(matchedrecs))
    def test_check_existing(self):
        """bibmatch - check existing record"""
        # Non-fuzzy searching will not find it.
        nonmatchedrecs = match_records(create_records(self.recxml4),
                                       verbose=0, fuzzy=False)[0]
        self.assertEqual(1, len(nonmatchedrecs))

        # Fuzzy searching should find it.
        matchedrecs = match_records(create_records(self.recxml4),
                                    verbose=0, fuzzy=True)[1]
        self.assertEqual(1, len(matchedrecs))

        # Searches returning several hits must still be validated:
        # this one yields 4 hits but only 1 real match.
        matchedrecs = match_records(create_records(self.recxml6),
                                    verbose=0)[1]
        self.assertEqual(1, len(matchedrecs))
 def test_check_altered(self):
     """bibmatch - check altered match"""
     records = create_records(self.recxml4)
     # The input record starts out without a 001 controlfield.
     self.assertFalse(record_has_field(records[0][0], '001'))
     matchedrecs = match_records(records, modify=1, verbose=0)[1]
     # With modify=1 the matcher injects the 001 field.
     self.assertTrue(record_has_field(matchedrecs[0][0], '001'))
def process_bibcodes_to_delete(extraction_directory, upload_mode):
    """Build and upload the deletion MarcXML for BIBCODES_TO_DELETE_LIST.

    Creates one MARCXML document marking every bibcode in the module-level
    BIBCODES_TO_DELETE_LIST as deleted (970__a = bibcode, 980__c = DELETED),
    records the bibcodes as done, then uploads the XML according to
    upload_mode ('concurrent' or 'bibupload').  Always returns True,
    even when upload_mode is unsupported (only an error is logged).
    """
    logger.info("In function %s" % (inspect.stack()[0][3],))

    #I create an unique file for all the bibcodes to delete:
    #I don't think it's necessary to split the content in groups, since the XML is really simple

    #I create the base object for the tree
    doc = libxml2.newDoc("1.0")
    root = doc.newChild(None, "collection", None)

    #then for each bibcode to delete I create the proper record
    for bibcode in BIBCODES_TO_DELETE_LIST:
        record = root.newChild(None, 'record', None)
        #I add to the record the 2 necessary datafields
        d970 = record.newChild(None, 'datafield', None)
        d970.setProp('tag', '970')
        d970.setProp('ind1', '')
        d970.setProp('ind2', '')
        #I create the subfield tag
        # NOTE: '&' is pre-escaped by hand — newChild appears to take raw
        # content without escaping it; confirm against libxml2 docs.
        sub = d970.newChild(None, 'subfield', bibcode.replace('&', '&amp;'))
        sub.setProp("code", "a")
        d980 = record.newChild(None, 'datafield', None)
        d980.setProp('tag', '980')
        d980.setProp('ind1', '')
        d980.setProp('ind2', '')
        #I create the subfield tag
        sub = d980.newChild(None, 'subfield', "DELETED")
        sub.setProp("code", "c")

    #I extract the node
    marcxml_string = doc.serialize('UTF-8', 1)
    #I remove the data (libxml2 documents must be freed explicitly)
    doc.freeDoc()
    del doc
    #I write the bibcodes in the done bibcodes file
    w2f = write_files.WriteFile(extraction_directory, logger)
    w2f.write_done_bibcodes_to_file(BIBCODES_TO_DELETE_LIST)
    del w2f
    
    if upload_mode == 'concurrent':
        #I transform the xml in bibrecords
        bibrecord_object = [elem[0] for elem in bibrecord.create_records(marcxml_string)]
        #I upload the result with option append
        logger.warning('Upload of records to delete started.')
        bibupload_merger(bibrecord_object, logger, 'append')
        logger.warning('Upload of records to delete ended.')
    elif upload_mode == 'bibupload':
        filepath = os.path.join(settings.BASE_OUTPUT_PATH, extraction_directory, settings.BASE_BIBRECORD_FILES_DIR, settings.BIBCODE_TO_DELETE_OUT_NAME)
        with open(filepath, 'w') as marcxml_to_del_file:
            marcxml_to_del_file.write(marcxml_string)
        task_low_level_submission('bibupload', 'admin', '-a', filepath)
        logger.warning('File "%s" submitted to bibupload.' % filepath)
    else:
        logger.error('Upload mode "%s" not supported! File not uploaded' % upload_mode)
    return True
 def test_check_qrystr(self):
     """bibmatch - check querystrings"""
     operator = "and"
     # The legacy '||' syntax and the new bracketed syntax must compile
     # to the same query.
     querystring = Querystring(operator)
     records = create_records(self.recxml3)
     old_query = querystring.create_query(records[0], "title||author")
     new_query = querystring.create_query(records[0], "[title] %s [author]" % (operator,))
     self.assertEqual(old_query, new_query)
def inject_recid(data):
    """Ensure every matched record carries its record id in field 001.

    For each MARCXML snippet in ``data``, parse the first record and, if
    it lacks a 001 controlfield, add one using the recid extracted from
    the match comments via ``re_matched_recid``.

    @return: list of the (possibly updated) bibrec structures
    """
    updated_records = []
    for snippet in data:
        bibrec = create_records(snippet)[0][0]
        if not record_has_field(bibrec, '001'):
            recid = re_matched_recid.findall(snippet)[0][1]
            record_add_field(bibrec, tag='001', controlfield_value=recid)
        updated_records.append(bibrec)
    return updated_records
def parse_resultfile(data):
    """Pair each record with the recids/queries found in its match comments.

    @param data: iterable of MARCXML snippets, one per matched record
    @return: list of ((recids, queries), original_record) tuples
    """
    # Leftover debug statements (print + sys.exit(0)) removed: they
    # aborted the whole process on the first iteration, so the function
    # could never return normally.
    pairs = []
    for match in data:
        orig_record = create_records(match)[0]
        recids = re_matched_recid.findall(match)
        queries = re_matched_query.findall(match)
        pairs.append(((recids, queries), orig_record))
    return pairs
def _check_client_can_submit_file(client_ip="", metafile="", req=None, webupload=0, ln=CFG_SITE_LANG):
    """
    Is this client able to upload such a FILENAME?
    check 980 $a values and collection tags in the file to see if they are among the
    permitted ones as specified by CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS and ACC_AUTHORIZE_ACTION.
    Useful to make sure that the client does not override other records by
    mistake.

    @param client_ip: IP of the uploading client
    @param metafile: MARCXML content to inspect
    @param req: request object, used for authorization and user info
    @param webupload: when true, return (code, message) tuples instead of booleans
    @param ln: language for the error messages
    """
    _ = gettext_set_language(ln)
    recs = create_records(metafile, 0, 0)
    user_info = collect_user_info(req)

    permitted_dbcollids = _get_client_authorized_collections(client_ip)
    # A wildcard entry authorizes this client for everything.
    if '*' in permitted_dbcollids:
        if not webupload:
            return True
        else:
            return (0, " ")

    # Every 980 $a value in the file must be a permitted collection.
    filename_tag980_values = _detect_980_values_from_marcxml_file(recs)
    for filename_tag980_value in filename_tag980_values:
        if not filename_tag980_value:
            if not webupload:
                return False
            else:
                return (1, "Invalid collection in tag 980")
        if not webupload:
            if filename_tag980_value not in permitted_dbcollids:
                return False
        else:
            auth_code, auth_message = acc_authorize_action(req, 'runbatchuploader', collection=filename_tag980_value)
            if auth_code != 0:
                error_msg = _("The user '%(x_user)s' is not authorized to modify collection '%(x_coll)s'") % \
                            {'x_user': user_info['nickname'], 'x_coll': filename_tag980_value}
                return (auth_code, error_msg)

    # Same check for the collections of the record IDs referenced in the file.
    filename_rec_id_collections = _detect_collections_from_marcxml_file(recs)

    for filename_rec_id_collection in filename_rec_id_collections:
        if not webupload:
            if filename_rec_id_collection not in permitted_dbcollids:
                return False
        else:
            auth_code, auth_message = acc_authorize_action(req, 'runbatchuploader', collection=filename_rec_id_collection)
            if auth_code != 0:
                error_msg = _("The user '%(x_user)s' is not authorized to modify collection '%(x_coll)s'") % \
                            {'x_user': user_info['nickname'], 'x_coll': filename_rec_id_collection}
                return (auth_code, error_msg)
    if not webupload:
        return True
    else:
        return (0, " ")
Example #18
0
def create_oaiharvest_log_str(task_id, oai_src_id, xml_content):
    """
    Function which creates the harvesting logs
    @param task_id bibupload task id
    @param oai_src_id id of the OAI source the records were harvested from
    @param xml_content MARCXML string containing the harvested records
    """
    try:
        records = create_records(xml_content)
        for record in records:
            # record is a (bibrec, status, errors) tuple; the OAI id is
            # extracted from the bibrec structure at index 0.
            oai_id = record_extract_oai_id(record[0])
            query = "INSERT INTO oaiHARVESTLOG (id_oaiHARVEST, oai_id, date_harvested, bibupload_task_id) VALUES (%s, %s, NOW(), %s)"
            run_sql(query, (str(oai_src_id), str(oai_id), str(task_id)))
    except Exception, msg:
        # NOTE(review): broad catch reduces any DB/parse error to a
        # console message; consider logging and re-raising instead.
        print "Logging exception : %s   " % (str(msg), )
def get_result_invenio_xmltransformer(xmlstring):
    """Transform ADS XML to MARCXML and parse it two ways for comparison.

    Applies the AdsXML2MarcXML_v2.xsl stylesheet to ``xmlstring``, then
    parses the result both with the project's own libxml2-based parser
    and with Invenio's bibrecord, returning both results so callers can
    cross-check them.

    @return: (result_xml_transformer, result_invenio) tuple
    """
    xmlobj = libxml2.parseDoc(xmlstring)
    # Stylesheet path is relative to the current working directory —
    # presumably the test runner's; TODO confirm.
    xslt = '../misc/AdsXML2MarcXML_v2.xsl'
    stylesheet = libxslt.parseStylesheetDoc(libxml2.parseFile(xslt))
    xml_transformed_object = stylesheet.applyStylesheet(xmlobj, None)
    marcxml = xml_transformed_object.serialize(encoding='utf-8')
    #result with internal function
    result_xml_transformer = x.create_record_from_libxml_obj(xml_transformed_object, logger)
    #result with function from invenio
    # Each <collection> element is extracted and parsed separately.
    regex = re.compile('<collection>.*?</collection>', re.DOTALL)
    record_xmls = regex.findall(marcxml)
    result_invenio = [[res[0] for res in bibrecord.create_records(xml)] for xml in record_xmls]
    # NOTE(review): the libxml2/libxslt documents created above are never
    # freed (freeDoc); confirm whether this leaks in long-running use.
    return (result_xml_transformer, result_invenio)
def retrieve_records(results):
    last_url = ""
    records = []
    search_params = dict(p="", of="xm")
    for url, recid in results:
        if url != last_url:
            server = InvenioConnector(url)
        search_params["p"] = "001:%s" % (recid,)
        res = server.search_with_retry(**search_params)
	time.sleep(1.0)
        if res != []:
            records.append(create_records(res)[0])
        else:
            print "Problem with record: %s" % (recid,)
    return records
def perform_basic_upload_checks(xml_record):
    """ Performs tests that would provoke the bibupload task to fail with
    an exit status 1, to prevent batchupload from crashing while alarming
    the user about the issue

    @param xml_record: MARCXML string to validate
    @return: list of error message strings (empty when all checks pass)
    """
    from bibupload import writing_rights_p

    errors = []
    # BibUpload must be able to write fulltext files on this host.
    if not writing_rights_p():
        errors.append("Error: BibUpload does not have rights to write fulltext files.")
    parsed = create_records(xml_record, 1, 1)
    if not parsed:
        errors.append("Error: Cannot parse MARCXML file.")
    elif parsed[0][0] is None:
        # Parsing produced no usable record structure.
        errors.append("Error: MARCXML file has wrong format: %s" % parsed)
    return errors
Example #22
0
def _record_in_files_p(recid, filenames):
    """Search XML files for given record.

    @param recid: record ID to look for
    @param filenames: iterable of MARCXML file paths to scan
    @return: True if any file contains a record with a matching 001,
        OAI id or system number; False otherwise
    """
    # Get id tags of record in question; -1 means "tag not present".
    rec_oaiid = rec_sysno = -1
    rec_oaiid_tag = get_fieldvalues(recid, OAIID_TAG)
    if rec_oaiid_tag:
        rec_oaiid = rec_oaiid_tag[0]
    rec_sysno_tag = get_fieldvalues(recid, SYSNO_TAG)
    if rec_sysno_tag:
        rec_sysno = rec_sysno_tag[0]

    # For each record in each file, compare ids and abort if match is found
    for filename in filenames:
        try:
            if CFG_BIBEDIT_QUEUE_CHECK_METHOD == "regexp":
                # check via regexp: this is fast, but may not be precise
                # (raw strings so \s is a regex escape, not a string one)
                re_match_001 = re.compile(r'<controlfield tag="001">%s</controlfield>' % (recid))
                re_match_oaiid = re.compile(
                    r'<datafield tag="%s" ind1=" " ind2=" ">(\s*<subfield code="a">\s*|\s*<subfield code="9">\s*.*\s*</subfield>\s*<subfield code="a">\s*)%s'
                    % (OAIID_TAG[0:3], rec_oaiid)
                )
                re_match_sysno = re.compile(
                    r'<datafield tag="%s" ind1=" " ind2=" ">(\s*<subfield code="a">\s*|\s*<subfield code="9">\s*.*\s*</subfield>\s*<subfield code="a">\s*)%s'
                    % (SYSNO_TAG[0:3], rec_sysno)
                )
                # 'with' closes the handle; the original leaked it.
                with open(filename) as file_:
                    file_content = file_.read()
                if re_match_001.search(file_content):
                    return True
                if rec_oaiid_tag and re_match_oaiid.search(file_content):
                    return True
                if rec_sysno_tag and re_match_sysno.search(file_content):
                    return True
            else:
                # by default, check via bibrecord: this is accurate, but may be slow
                with open(filename) as file_:
                    records = create_records(file_.read(), 0, 0)
                for record, all_good in (rec[:2] for rec in records):
                    if record and all_good:
                        if _record_has_id_p(record, recid, rec_oaiid, rec_sysno):
                            return True
        except IOError:
            # Unreadable file: skip it and keep scanning the rest.
            continue
    return False
def _first_pattern_hit(patterns, text):
    """Scan ``patterns`` over ``text`` and return the found id as [id], or [].

    Mirrors the original scan semantics exactly: for tuple matches the
    last match's first non-empty group wins; for string matches the
    first match wins; the first pattern that yields anything stops the scan.
    """
    found = []
    for pattern in patterns:
        for hit in pattern.findall(text):
            if type(hit) is tuple:
                for part in hit:
                    if part != "":
                        found = [part]
                        break
            elif type(hit) is str:
                found = [hit]
                break
        if found:
            break
    return found

def parse_noresultfile(data, recid_patterns=(re_original_id,), sysno_patterns=None):
    """
    This function will look for the original recid in 001 and any matching recids
    from given regular expression patterns in the textmarc format of given record.

    Returns a list of BibRec structure with found recids for original and matching records.

    @param data: iterable of MARCXML snippets, one per record
    @param recid_patterns: regexps matched against the text-marc form
    @param sysno_patterns: regexps for system numbers; may be None (treated
        as empty — the original crashed with a TypeError on the default)
    """
    record_pairs = []
    sysno_gen = get_sysno_generator()
    options = {'text-marc': 1, 'aleph-marc': 0}
    # Guard against the None default, which was previously iterated directly.
    if sysno_patterns is None:
        sysno_patterns = ()
    for marcxml in data:
        original_record_bibrec = create_records(marcxml)[0][0]
        rec_id = record_get_field_value(original_record_bibrec, '001')
        sysno = sysno_gen.next()
        original_record_marc = create_marc_record(original_record_bibrec, sysno, options)
        # Shared helper replaces two near-identical inline scan loops.
        matching_result_recids = _first_pattern_hit(recid_patterns, original_record_marc)
        matching_result_sysnos = _first_pattern_hit(sysno_patterns, original_record_marc)
        record_pairs.append((rec_id, matching_result_recids, matching_result_sysnos))
    return record_pairs
Example #24
0
def read_record(filename):
    """
    Read template as record

    @param filename: base_name of template file
    @type  filename: string
    @yield: record structure for every template record whose parse
        status flag is non-zero (i.e. parsed successfully)
    """

    path_templates = CFG_BIBEDIT_RECORD_TEMPLATES_PATH
    # Wrap in an EncodedFile so the template XML is read as UTF-8.
    template = codecs.EncodedFile(codecs.open('%s%s%s' %
               (path_templates, os.sep, filename), mode='r'), 'utf8')
    xmlrecords = template.read()
    recs = create_records(xmlrecords, verbose=1)
    template.close()
    for recordtuple in recs:
        # recordtuple is (record, status, errors); status 0 marks a
        # record that failed to parse, so only yield the others.
        if recordtuple[1] != 0:
            yield recordtuple[0]
    def test_restricted_collections_local(self):
        """bibmatch - check restricted collections local search"""
        records = create_records(self.recxml5)
        # Anonymous user: no access to the restricted collection.
        nomatchrecs = match_records(records,
                                    qrystrs=[("", "[088__a]")],
                                    collections=["Theses"],
                                    verbose=0)[0]
        self.assertEqual(1, len(nomatchrecs))

        # Jekyll should have access.
        matchedrecs = match_records(records,
                                    qrystrs=[("", "[088__a]")],
                                    collections=["Theses"],
                                    user="******",
                                    password="******",
                                    verbose=0)[1]
        self.assertEqual(1, len(matchedrecs))
 def test_restricted_collections_local(self):
     """bibmatch - check restricted collections local search"""
     records = create_records(self.recxml5)
     # Jekyll should have access
     [dummy1, matchedrecs, dummy2, dummy3] = match_records(records, \
                                                           qrystrs=[("", "[088__a]")], \
                                                           collections=["Theses"], \
                                                           user="******",
                                                           password="******")
     self.assertEqual(1, len(matchedrecs))
     # Hyde should not have access
     [nomatchrecs, dummy1, dummy2, dummy3] = match_records(records, \
                                                           qrystrs=[("", "[088__a]")], \
                                                           collections=["Theses"], \
                                                           user="******", \
                                                           password="******",
                                                           verbose=0)
     # Fixed: assert on the non-matching set of the second call; the
     # original re-checked the stale `matchedrecs` from the first call.
     self.assertEqual(1, len(nomatchrecs))
Example #27
0
    def test_restricted_collections_local(self):
        """bibmatch - check restricted collections local search"""
        records = create_records(self.recxml5)
        # Anonymous search must not match the restricted collection.
        nomatchrecs = match_records(records,
                                    qrystrs=[("", "[088__a]")],
                                    collections=["Theses"],
                                    verbose=0)[0]
        self.assertEqual(1, len(nomatchrecs))

        if MECHANIZE_AVAILABLE:
            # Jekyll should have access.
            matchedrecs = match_records(records,
                                        qrystrs=[("", "[088__a]")],
                                        collections=["Theses"],
                                        user="******",
                                        password="******",
                                        verbose=0,
                                        insecure_login=True)[1]
            self.assertEqual(1, len(matchedrecs))
    def test_restricted_collections_local(self):
        """bibmatch - check restricted collections local search"""
        records = create_records(self.recxml5)
        # Anonymous search: should not have access.
        result = match_records(records,
                               qrystrs=[("", "[088__a]")],
                               collections=["Theses"],
                               verbose=0)
        self.assertEqual(1, len(result[0]))

        if MECHANIZE_AVAILABLE:
            # Jekyll should have access.
            result = match_records(records,
                                   qrystrs=[("", "[088__a]")],
                                   collections=["Theses"],
                                   user="******",
                                   password="******",
                                   verbose=0,
                                   insecure_login=True)
            self.assertEqual(1, len(result[1]))
def _check_client_can_submit_file(client_ip="", metafile="", req=None, webupload=0):
    """
    Is this client able to upload such a FILENAME?
    check 980 $a values and collection tags in the file to see if they are among the
    permitted ones as specified by CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS and ACC_AUTHORIZE_ACTION.
    Useful to make sure that the client does not override other records by
    mistake.

    @param client_ip: IP of the uploading client (key into the robot-rights map)
    @param metafile: MARCXML content to inspect
    @param req: request object, used for authorization when webupload is set
    @param webupload: when true, return (code, message) tuples instead of booleans
    """
    from invenio.bibrecord import create_records

    recs = create_records(metafile, 0, 0)

    # Every 980 $a value in the file must be one the client may touch.
    filename_tag980_values = _detect_980_values_from_marcxml_file(recs)
    for filename_tag980_value in filename_tag980_values:
        if not filename_tag980_value:
            if not webupload:
                return False
            else:
                return(1, "Invalid tag 980 value")
        if not webupload:
            # NOTE(review): an unknown client_ip raises KeyError here —
            # confirm callers guarantee the IP is present in the map.
            if not filename_tag980_value in CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS[client_ip]:
                return False
        else:
            auth_code, auth_message = acc_authorize_action(req, 'runbatchuploader', collection=filename_tag980_value)
            if auth_code != 0:
                return (auth_code, auth_message)

    # Same check for the collections of record IDs referenced in the file.
    filename_rec_id_collections = _detect_collections_from_marcxml_file(recs)
    for filename_rec_id_collection in filename_rec_id_collections:
        if not webupload:
            if not filename_rec_id_collection in CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS[client_ip]:
                return False
        else:
            auth_code, auth_message = acc_authorize_action(req, 'runbatchuploader', collection=filename_rec_id_collection)
            if auth_code != 0:
                return (auth_code, auth_message)
    if not webupload:
        return True
    else:
        return (0, " ")
def parse_resultfile(data, recid_patterns=(re_original_id,), recids=None,
                     sysno_patterns=None, preserved_tags=None):
    """
    This function will look for the original recid and any matching recids in a
    BibMatch result file containing references to matching records in comments before
    every record in MARCXML format.

    Returns a list of BibRec structure with found recids for original and matching records.

    @param data: iterable of MARCXML snippets with match comments
    @param recid_patterns: regexps used to dig the original recid out of
        the text-marc form when no 001 field is present
    @param recids: optional explicit list of matched recids, indexed in
        step with data (overrides the recids found in the comments)
    @param sysno_patterns: accepted for interface compatibility; unused here
    @param preserved_tags: tags whose fields should be carried over from
        the original record
    """
    record_pairs = []
    sysno_gen = get_sysno_generator()
    options = {'text-marc': 1, 'aleph-marc': 0}
    # None defaults replace the original mutable-list defaults; behavior
    # is identical since the lists were only read, never mutated.
    if preserved_tags is None:
        preserved_tags = []
    for index, match in enumerate(data):
        original_record_bibrec = create_records(match)[0][0]
        if record_has_field(original_record_bibrec, '001'):
            rec_id = record_get_field_value(original_record_bibrec, '001')
        else:
            # No 001 field: derive a recid from the text-marc rendering.
            sysno = sysno_gen.next()
            original_record_marc = create_marc_record(original_record_bibrec, sysno, options)
            rec_id = ""
            for pattern in recid_patterns:
                matches = pattern.findall(original_record_marc)
                if len(matches) > 0:
                    rec_id = matches[0]
                    break
        if recids:
            matching_result_recids = [recids[index]]
        else:
            matching_result_recids = re_matched_recid.findall(match)
        matching_result_sysnos = []
        preserved_fields = {}
        # Leftover debug prints removed here.
        for tag in preserved_tags:
            try:
                preserved_fields[tag] = original_record_bibrec[tag]
            except KeyError:
                # The original record simply lacks this tag; skip it.
                pass
        record_pairs.append((rec_id, matching_result_recids, matching_result_sysnos, preserved_fields))
    return record_pairs
    def _create_marc(self, records_xml):
        """Convert MARCXML records to text MARC.

        @param records_xml: MARCXML containing information about the records

        @return: string with the text-MARC representation of the records,
        concatenated in input order
        """
        conversion_options = {"aleph-marc": 0, "correct-mode": 1,
                              "append-mode": 0, "delete-mode": 0,
                              "insert-mode": 0, "replace-mode": 0,
                              "text-marc": 1}
        pieces = []
        for record, dummy_status, dummy_errors in bibrecord.create_records(records_xml):
            # The system number is extracted from the record itself.
            sysno = xmlmarc2textmarc.get_sysno_from_record(record,
                                                           {"text-marc": 1})
            pieces.append(xmlmarc2textmarc.create_marc_record(record, sysno,
                                                              conversion_options))
        return "".join(pieces)
 def test_check_remote(self):
     """bibmatch - check remote match (Invenio demo site)"""
     records = create_records(self.recxml3)
     # A single record should match on the remote demo installation.
     matchedrecs = match_records(
         records, server_url="http://invenio-demo.cern.ch")[1]
     self.assertEqual(1, len(matchedrecs))
Example #33
0
 def test_check_ambiguous(self):
     """bibmatch - check an ambiguous record"""
     records = create_records(self.recxml3)
     # Element 2 of the result tuple holds the ambiguous records.
     ambigrecs = match_records(records, verbose=0)[2]
     self.assertEqual(1, len(ambigrecs))
Example #34
0
def validate_matches(bibmatch_recid, record, server, result_recids, \
                     collections="", verbose=0, ascii_mode=False):
    """
    Perform record validation on a set of matches. This function will
    try to find any search-result that "really" is a correct match, based on
    various methods defined in a given rule-set. See more about rule-sets in
    validate_match() function documentation.

    This function will return a tuple containing a list of all record IDs
    satisfying the count of field matching needed for exact matches and a
    similar list for fuzzy matches that has less fields matching then the
    threshold. Records that are not matching at all are simply left out of
    the lists.

    @param bibmatch_recid: Current record number. Used for logging.
    @type bibmatch_recid: int

    @param record: bibrec structure of original record
    @type record: dict

    @param server: InvenioConnector object to matched record source repository
    @type server: InvenioConnector object

    @param result_recids: the list of record ids from search result.
    @type result_recids: list

    @param collections: list of collections to search, if specified
    @type collections: list

    @param verbose: be loud
    @type verbose: int

    @param ascii_mode: True to transform values to its ascii representation
    @type ascii_mode: bool

    @return: tuple of (exact match record IDs, fuzzy match record IDs)
    @rtype: tuple of two lists

    @raise BibMatchValidationError: when the validation rule-set is
        mis-configured, or when candidate records cannot be fetched.
    """
    matches_found = []
    fuzzy_matches_found = []

    # Generate final rule-set by analyzing the record
    final_ruleset = get_validation_ruleset(record)
    if not final_ruleset:
        # BUGFIX: the concatenated message parts were missing a separating
        # space ("rule-set.Please").
        raise BibMatchValidationError("Bad configuration rule-set. " \
                                      "Please check that CFG_BIBMATCH_MATCH_VALIDATION_RULESETS" \
                                      " is formed correctly.")

    if verbose > 8:
        sys.stderr.write(
            "\nStart record validation:\n\nFinal validation ruleset used:\n")
        pp = pprint.PrettyPrinter(stream=sys.stderr, indent=2)
        pp.pprint(final_ruleset)
    CFG_BIBMATCH_LOGGER.info("Final validation ruleset used: %s" %
                             (final_ruleset, ))

    # Fetch all records in MARCXML and convert to BibRec
    found_record_list = []
    query = " OR ".join(["001:%d" % (recid, ) for recid in result_recids])

    if collections:
        search_params = dict(p=query, of="xm", c=collections)
    else:
        search_params = dict(p=query, of="xm")
    CFG_BIBMATCH_LOGGER.info("Fetching records to match: %s" %
                             (str(search_params), ))
    result_marcxml = server.search_with_retry(**search_params)
    # Check if record was found
    if result_marcxml:
        found_record_list = [r[0] for r in create_records(result_marcxml)]
        # Check if BibRecord generation was successful
        if not found_record_list:
            # Error fetching records. Unable to validate. Abort.
            raise BibMatchValidationError("Error retrieving MARCXML for possible matches from %s. Aborting." \
                                          % (server.server_url,))
        if len(found_record_list) < len(result_recids):
            # Error fetching all records. Will still continue.
            sys.stderr.write("\nError retrieving all MARCXML for possible matched records from %s.\n" \
                              % (server.server_url,))

    # Validate records one-by-one, adding any matches to the list of matching
    # record IDs. enumerate() replaces the manual current_index counter.
    for current_index, matched_record in enumerate(found_record_list, 1):
        recid = record_get_field_values(matched_record, tag="001")[0]
        if verbose > 8:
            sys.stderr.write("\n Validating matched record #%d (%s):\n" % \
                             (current_index, recid))
        CFG_BIBMATCH_LOGGER.info("Matching of record %d: Comparing to matched record %s" % \
                                 (bibmatch_recid, recid))
        match_ratio = validate_match(record, matched_record, final_ruleset, \
                                     verbose, ascii_mode)
        if match_ratio == 1.0:
            # All matches were a success, this is an exact match
            CFG_BIBMATCH_LOGGER.info(
                "Matching of record %d: Exact match found -> %s" %
                (bibmatch_recid, recid))
            matches_found.append(recid)
        elif match_ratio >= CFG_BIBMATCH_FUZZY_MATCH_VALIDATION_LIMIT:
            # This means that some matches failed, but some succeeded as well. That's fuzzy...
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Fuzzy match found -> %s" % \
                                     (bibmatch_recid, recid))
            fuzzy_matches_found.append(recid)
        else:
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Not a match" %
                                     (bibmatch_recid, ))

    # Return list of matching record IDs
    return matches_found, fuzzy_matches_found
 def test_check_fuzzy(self):
     """bibmatch - check fuzzily matched record"""
     records = create_records(self.recxml1)
     # match_records returns [new, matched, ambiguous, fuzzy]; only the
     # fuzzy bucket is inspected here.
     [dummy1, dummy2, dummy3, fuzzyrecs] = match_records(records, \
                                                         verbose=0)
     self.assertEqual(1, len(fuzzyrecs))
Example #36
0
    def get_number_of_records_found(self):
        """Return how many records the stored search result contains."""
        # create_records yields one tuple per record in self._result.
        return len(bibrecord.create_records(self._result))
def get_institution_records(path):
    """
    Returns all institution records in a BibRecord structure.

    @param path: path to a MARCXML file containing institution records
    @return: list of BibRecord dicts, one per record in the file
    """
    # BUGFIX: the original opened the file without ever closing it;
    # a context manager releases the handle deterministically.
    with open(path) as marcxml_file:
        records_xml = marcxml_file.read()
    return [res[0] for res in bibrecord.create_records(records_xml)]
Example #38
0
    # Read the input either from stdin or from the file given on the
    # command line (f_input).
    if not f_input:
        for line_in in sys.stdin:
            read_list.append(line_in)
    else:
        f = open(f_input)
        for line_in in f:
            read_list.append(line_in)
        f.close()
    file_read = "".join(read_list)

    # Detect input type
    if not file_read.startswith('<'):
        # Not xml, assume type textmarc
        file_read = transform_input_to_marcxml(f_input, file_read)

    records = create_records(file_read)

    if len(records) == 0:
        # Nothing to match: exit cleanly rather than with an error.
        if verbose:
            sys.stderr.write("\nBibMatch: Input file contains no records.\n")
        sys.exit(0)

    if verbose:
        sys.stderr.write("read %d records" % (len(records), ))
        sys.stderr.write("\nBibMatch: Matching ...")

    # Perform the actual matching run with all CLI-provided parameters.
    match_results = match_records(records, qrystrs, search_mode, operator,
                                  verbose, server_url, modify, sleeptime,
                                  clean, collections, user, password)

    # set the output according to print..
Example #39
0
def recxml2recmarc(xmltext, options, sysno_generator=get_sysno_generator()):
    """The function that processes creating the records from
       an XML string, and prints these records to the
       standard output stream.
       @param xmltext: An XML MARC record in string form.
       @param options: Various options about the record to be
        created, as passed from the command line.
       @param sysno_generator: A static parameter to act as an Aleph
        system number generator. Do not provide a value for this - it
        will be assigned upon first call to this function.
    """
    # NOTE: the default argument above is evaluated once at definition time
    # on purpose - the same generator (and hence a continuous SYS sequence)
    # is shared across all calls to this function.
    rec_count = 0  ## Counter used to record the number of the rec
    ## that is being processed. Used in error messages
    ## for the user, when a record cannot be processed

    ## create internal records structure from xmltext:
    records = create_records(xmltext, 1, 1)

    ## now loop through each record, get its sysno, and convert it:
    for rec_tuple in records:
        rec_count += 1
        ## Get the record-dictionary itself from the record-tuple:
        record = rec_tuple[0]

        if record is None:
            ## if the record is None, there was probably a problem
            ## with the MARC XML. Display a warning message on stderr and
            ## move past this record:
            sys.stderr.write("E: Unable to process record number %s; The XML " \
                             " may be broken for this record.\n" \
                             % str(rec_count))
            continue

        ## From the record, get the SYS if running in ALEPH-MARC mode, or
        ## the recid (001) if running in TEXT-MARC mode:
        sysno = get_sysno_from_record(record, options)

        if sysno is None:
            ## No 'sysno' was found in the record:
            if options["text-marc"] == 1:
                ## 'sysno' (001) (which is actually the recid) is mandatory
                ## for the creation of TEXT-MARC. Report the error and skip
                ## past the record:
                sys.stderr.write("E: Record number %s has no 'recid' (001). " \
                                 "This field is mandatory for the " \
                                 "creation of TEXT MARC. The record has been " \
                                 "skipped.\n" % str(rec_count))
                continue
            elif options["aleph-marc"] ==  1 and \
                     1 in (options["append-mode"], options["delete-mode"], \
                           options["correct-mode"], options["replace-mode"]):
                ## When creating ALEPH-MARC that will be used to manipulate
                ## a record in some way (i.e. correct, append, delete, replace),
                ## the ALEPH SYS (970__a in MARC XML) is mandatory. Report the
                ## error and skip past the record:
                sys.stderr.write("E: Record number %s has no ALEPH 'SYS' " \
                                 "(970__a). This field is mandatory for the " \
                                 "creation of ALEPH MARC that is used for the" \
                                 " manipulation of records (i.e. replace, " \
                                 "correct, append, delete). The record has " \
                                 "been skipped.\n" % str(rec_count))
                continue
        elif options["aleph-marc"] == 1 and type(sysno) in (list, tuple):
            ## multiple values for SYS (970__a) in ALEPH-MARC mode are not
            ## permitted. Report the error and skip past the record:
            sys.stderr.write("E: Multiple values have been found for the " \
                             "ALEPH SYS (970__a) in record number %s. This " \
                             "is not permitted when running in ALEPH-MARC " \
                             "mode. The record has been skipped." \
                             % str(rec_count))
            continue

        if options["aleph-marc"] == 1 and options["insert-mode"] == 1:
            ## Creating an ALEPH "insert" record. Since the resulting record
            ## should be treated as a new insert into ALEPH, any 'sysno' that
            ## may have been found in the MARC XML record cannot be used -
            ## that would be dangerous. Therefore, set 'sysno' to None and
            ## create a random sysno:
            sysno = None
            try:
                ## Python 2 generator protocol; under Python 3 this would
                ## be written next(sysno_generator).
                sysno = sysno_generator.next()
            except StopIteration:
                ## generator counter has overstepped the MAX ALEPH SYS!
                ## Without a SYS, we cannot create ALEPH MARC
                sys.stderr.write("""E: Maximum ALEPH SYS has been """ \
                                 """reached - unable to continue.\n""")
                sys.exit(1)

        ## No problems were encountered with SYS or recid. Display the
        ## translated record:
        rec_out = create_marc_record(record, sysno, options)
        sys.stdout.write(rec_out)
        sys.stdout.flush()
Example #40
0
 def test_check_existing(self):
     """bibmatch - check existing record"""
     records = create_records(self.recxml3)
     # With all defaults, the fixture should land in the matched bucket.
     [dummy1, matchedrecs, dummy2, dummy3] = match_records(records)
     self.assertEqual(1, len(matchedrecs))
Example #41
0
 def test_check_ambiguous(self):
     """bibmatch - check an ambiguous record"""
     records = create_records(self.recxml1)
     # A query on the author field (100__a) alone is presumably too
     # unspecific, so the record ends up in the ambiguous bucket.
     [dummy1, dummy2, ambigrecs,
      dummy3] = match_records(records, qrystrs=[("", "[100__a]")])
     self.assertEqual(1, len(ambigrecs))
        sys.stderr.write("Error: Missing MARCXML to analyse")
        print usage
        sys.exit(1)

    input_filename = args[0]

    # Validate both command-line paths before doing any work.
    if not os.path.exists(input_filename):
        sys.stderr.write("Please enter a valid filename for input.")
        sys.exit(1)
    if not os.path.exists(config_path):
        sys.stderr.write("Please enter a valid filename for config.")
        sys.exit(1)

    # Transform MARCXML to record structure
    try:
        records = create_records(open_marc_file(input_filename))
    # NOTE(review): bare except swallows every error (including SystemExit
    # on Python 2); catching specific exceptions would aid debugging.
    except:
        sys.stderr.write("bibupload.xml_marc_to_records failed on file: %s" %
                         (input_filename, ))
        sys.exit(3)
    action_dict = read_actions_configuration_file(config_path)
    insert_records = []
    append_records = []
    correct_records = []
    holdingpen_records = []

    for rec in records:
        record = rec[0]
        # Perform various checks to determine a suitable action to be taken for
        # that particular record. Whether it will be inserted, discarded or replacing
        # existing records
Example #43
0
 def test_check_new(self):
     """bibmatch - check a new record"""
     records = create_records(self.recxml2)
     # match_records returns [new, matched, ambiguous, fuzzy]; only the
     # new-record bucket is inspected here.
     [newrecs, dummy1, dummy2, dummy3] = match_records(records, \
                                                       verbose=0)
     self.assertEqual(1, len(newrecs))
Example #44
0
def _check_client_can_submit_file(client_ip="",
                                  metafile="",
                                  req=None,
                                  webupload=0,
                                  ln=CFG_SITE_LANG):
    """
    Is this client able to upload such a FILENAME?
    check 980 $a values and collection tags in the file to see if they are among the
    permitted ones as specified by CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS and ACC_AUTHORIZE_ACTION.
    Useful to make sure that the client does not override other records by
    mistake.

    @param client_ip: IP address the upload request originates from
    @param metafile: MARCXML content being uploaded
    @param req: mod_python request object, used for the authorization checks
    @param webupload: if true, return (error_code, message) tuples instead
        of plain booleans
    @param ln: language for localized error messages
    @return: True/False when webupload is false, otherwise an
        (error_code, message) tuple where code 0 means success
    """
    _ = gettext_set_language(ln)
    recs = create_records(metafile, 0, 0)
    user_info = collect_user_info(req)

    permitted_dbcollids = _get_client_authorized_collections(client_ip)
    if not permitted_dbcollids:
        # No IP-based rights: fall back to bearer-token authorization.
        if _check_authorization_bearer(req):
            permitted_dbcollids = ['*']

    if '*' in permitted_dbcollids:
        # Wildcard: the client may touch any collection.
        if not webupload:
            return True
        else:
            return (0, " ")

    # Check every 980 $a value found in the file.
    filename_tag980_values = _detect_980_values_from_marcxml_file(recs)
    for filename_tag980_value in filename_tag980_values:
        if not filename_tag980_value:
            if not webupload:
                return False
            else:
                return (1, "Invalid collection in tag 980")
        denial = _collection_access_denial(req, filename_tag980_value,
                                           permitted_dbcollids, user_info,
                                           webupload, _)
        if denial is not None:
            return denial

    # Check the collections the record ids in the file belong to.
    filename_rec_id_collections = _detect_collections_from_marcxml_file(recs)
    for filename_rec_id_collection in filename_rec_id_collections:
        denial = _collection_access_denial(req, filename_rec_id_collection,
                                           permitted_dbcollids, user_info,
                                           webupload, _)
        if denial is not None:
            return denial

    if not webupload:
        return True
    else:
        return (0, " ")


def _collection_access_denial(req, collection, permitted_dbcollids,
                              user_info, webupload, _):
    """Return the denial value for accessing *collection*, or None if allowed.

    In non-webupload mode a denial is False; in webupload mode it is an
    (auth_code, localized_message) tuple.  Factored out because the same
    check was duplicated for tag-980 values and record-id collections.
    """
    if not webupload:
        if collection not in permitted_dbcollids:
            return False
        return None
    auth_code, dummy_auth_message = acc_authorize_action(
        req, 'runbatchuploader', collection=collection)
    if auth_code != 0:
        error_msg = _("The user '%(x_user)s' is not authorized to modify collection '%(x_coll)s'") % \
                    {'x_user': user_info['nickname'], 'x_coll': collection}
        return (auth_code, error_msg)
    return None
def get_institution_records(path):
    """
    Returns all institution records in a BibRecord structure.

    @param path: path to a MARCXML file containing institution records
    @return: list of BibRecord dicts, one per record in the file
    """
    # BUGFIX: the original opened the file without ever closing it;
    # a context manager releases the handle deterministically.
    with open(path) as marcxml_file:
        records_xml = marcxml_file.read()
    return [res[0] for res in bibrecord.create_records(records_xml)]
Example #46
0
 def test_check_fuzzy(self):
     """bibmatch - check fuzzily matched record"""
     records = create_records(self.recxml1)
     # match_records returns [new, matched, ambiguous, fuzzy]; only the
     # fuzzy bucket is inspected here.
     [dummy1, dummy2, dummy3, fuzzyrecs] = match_records(records, \
                                                         verbose=0)
     self.assertEqual(1, len(fuzzyrecs))
Example #47
0
    input_filename = args[0]

    # Validate both command-line paths before doing any work.
    if not os.path.exists(input_filename):
        sys.stderr.write("Please enter a valid filename for input.")
        sys.exit(1)
    if not os.path.exists(config_path):
        sys.stderr.write("Please enter a valid filename for config.")
        sys.exit(1)

    # Read and wash incoming data
    file_data = open_marc_file(input_filename)
    washed_data = wash_for_xml(wash_for_utf8(file_data))

    # Transform MARCXML to record structure
    records = create_records(washed_data)
    action_dict = read_actions_configuration_file(config_path)
    insert_records = []
    append_records = []
    correct_records = []
    holdingpen_records = []

    for rec in records:
        record = rec[0]
        if record is None:
            # rec[2] carries the parsing error details from create_records.
            sys.stderr.write("Record is None: %s" % (rec[2], ))
            sys.exit(1)
        # Perform various checks to determine a suitable action to be taken for
        # that particular record. Whether it will be inserted, discarded or replacing
        # existing records
        #
Example #48
0
 def test_check_completeness(self):
     """bibmatch - check query completeness"""
     records = create_records(self.recxml4)
     # Querying on report number (088__a) and system number (035__a)
     # together is presumably still ambiguous for this fixture.
     [dummy1, dummy2, ambigrecs,
      dummy3] = match_records(records, qrystrs=[("", "[088__a] [035__a]")])
     self.assertEqual(1, len(ambigrecs))