def test_rules(app, db, field, expected):
    """Check that one source record converts to the expected metadata.

    ``field`` is passed through ``fix_grantor`` and ``fix_language``,
    converted with ``old_nusl.do`` and round-tripped through
    ``ThesisMetadataSchemaV1`` (load, then dump). The dumped result is
    printed for debugging and compared with ``expected``.

    If loading raises ``ValidationError`` the traceback is printed and the
    unvalidated conversion output is dumped instead, so the comparison
    still runs.
    """
    record = fix_language(fix_grantor(field))
    converted = old_nusl.do(record)
    schema = ThesisMetadataSchemaV1()

    try:
        loaded = schema.load(converted).data
    except ValidationError:
        # Keep going with the raw conversion output; the final assert
        # decides whether the test passes.
        traceback.print_exc()
        loaded = converted

    dumped = schema.dump(loaded).data

    print("\n\n")
    print("MARSHMALLOWED")
    print(dumped)
    print("\n")
    pprint(dumped)
    print("\n\n")

    assert dumped == expected
def test_rules_oai(app, db):
    """Run the conversion pipeline over the records in an OAI XML fixture.

    Every record split out of ``oai_nusl_listrecords.xml`` is passed through
    ``fix_grantor``, converted with ``old_nusl.do`` and round-tripped through
    ``ThesisMetadataSchemaV1``. The test makes no assertions of its own; it
    fails only if one of those steps raises, and prints the intermediate
    results for inspection.
    """
    # NOTE(review): absolute path to a developer machine - the test cannot
    # run elsewhere; consider resolving it relative to the tests directory.
    xml_path = (
        '/home/semtex/Projekty/nusl/invenio-initial-theses-conversion/tests'
        '/xml_files/oai_nusl_listrecords.xml'
    )
    # Build the whole list inside the ``with`` block so the stream is fully
    # consumed before the file handle is closed.
    with open(xml_path, 'rb') as xml_file:
        array = [create_record(data)
                 for data in split_stream_oai_nusl(xml_file)]

    for field in array:
        rec = fix_grantor(field)
        transformed = old_nusl.do(rec)
        schema = ThesisMetadataSchemaV1()
        marshmallowed = schema.load(transformed).data
        marshmallowed = schema.dump(marshmallowed).data
        print(transformed)
        print("------------MARSHMALLOWED---------------------", marshmallowed)
def test_rules_3(app, db):
    """Run the conversion pipeline over the ``keywords_pipe.xml`` fixture.

    Every record is passed through ``fix_grantor``, ``fix_keywords`` and
    ``fix_language``, converted with ``old_nusl.do`` and round-tripped
    through ``ThesisMetadataSchemaV1``. The test makes no assertions of its
    own; it fails only if one of those steps raises, and prints each result.
    """
    # NOTE(review): absolute path to a developer machine - the test cannot
    # run elsewhere; consider resolving it relative to the tests directory.
    xml_path = (
        '/home/semtex/Projekty/nusl/invenio-initial-theses-conversion/tests'
        '/xml_files/keywords_pipe.xml'
    )
    # Build the whole list inside the ``with`` block so the stream is fully
    # consumed before the file handle is closed.
    with open(xml_path, 'rb') as xml_file:
        array = [create_record(data) for data in split_stream(xml_file)]

    for field in array:
        rec = fix_grantor(field)
        rec = fix_keywords(rec)
        rec = fix_language(rec)
        transformed = old_nusl.do(rec)
        schema = ThesisMetadataSchemaV1()
        marshmallowed = schema.load(transformed).data
        marshmallowed = schema.dump(marshmallowed).data
        print(marshmallowed)
Example #4
0
def data_loop_collector(break_on_error, error_counts, error_documents, gen,
                        processed_ids, stop):
    """Convert, validate and import records taken from ``gen``.

    For each XML element yielded by ``gen`` the record id is read from the
    MARC21 controlfield with tag ``001`` (a random UUID is used when there is
    none), the element is converted with ``create_record``, fixed up,
    transformed with ``old_nusl.do``, validated with ``nusl_theses.validate``
    and imported with ``nusl_theses.import_old_nusl_record``.

    Args:
        break_on_error: When true, any exception raised while processing a
            record is re-raised after being logged; otherwise it is only
            logged and the loop moves on to the next record.
        error_counts: Mapping from validation field name to a counter,
            incremented with ``+=`` (presumably a ``defaultdict(int)`` -
            verify against the caller).
        error_documents: Mapping from validation field name to a list of
            record ids, appended to (presumably a ``defaultdict(list)`` -
            verify against the caller).
        gen: Iterable of XML elements exposing ``iter()``.
        processed_ids: Set of record ids seen so far; updated in place.
        stop: Record number at which the loop stops, converted with
            ``int()``. The loop breaks as soon as the 1-based counter
            reaches this value, before that record is processed.

    Returns:
        None. Returns early when a record id is seen for the second time.
    """
    # Document types (field 980__ subfield a) that are imported. Records
    # with another doctype are skipped; records without field 980__ are not.
    thesis_doctypes = (
        'bakalarske_prace',
        'diplomove_prace',
        'disertacni_prace',
        'habilitacni_prace',
        'rigorozni_prace',
    )

    for i, data in enumerate(gen, 1):
        print("Record number: ", i)
        if i >= int(stop):
            break

        # ``.get`` instead of indexing: this lookup runs outside the
        # try-block below, so a controlfield without a ``tag`` attribute
        # would otherwise abort the run regardless of ``break_on_error``.
        for cf in data.iter('{http://www.loc.gov/MARC21/slim}controlfield'):
            if cf.attrib.get('tag') == '001':
                recid = cf.text
                break
        else:
            # No controlfield 001 found: fall back to a random id.
            recid = str(uuid.uuid4())

        ch.setRecord(data, recid)

        if recid in processed_ids:
            logging.warning(
                'Record with id %s already parsed, probably end of stream',
                recid)
            return

        processed_ids.add(recid)
        marshmallowed = None
        try:
            # Convert the XML element to a GroupableOrderedDict.
            rec = create_record(data)

            # Only university theses are processed; skip other doctypes.
            if rec.get('980__') and \
                    rec['980__'].get('a') not in thesis_doctypes:
                continue

            # Fix data before transformation into JSON
            rec = fix_language(rec)
            rec = fix_grantor(rec)  # unify the grantor under field 7102
            # NOTE(review): fix_keywords(rec) was disabled in the original;
            # confirm whether it should be applied here.

            # Convert the GroupableOrderedDict to a plain dict.
            transformed = old_nusl.do(rec)
            ch.setTransformedRecord(transformed)
            try:
                # Validate the data against Marshmallow and the JSON schema.
                marshmallowed = nusl_theses.validate(transformed)
            except ValidationError as e:
                error_counts[e.field_name] += 1
                error_documents[e.field_name].append(recid)
                if e.field_name not in IGNORED_ERROR_FIELDS:
                    # Re-raised into the outer handler, which logs it and
                    # honours ``break_on_error``.
                    raise
                continue

            # Store the record in the database / Invenio.
            nusl_theses.import_old_nusl_record(marshmallowed)
        except Exception:
            logging.exception('Error in transformation')
            logging.error('data %s', marshmallowed)
            if break_on_error:
                raise