def test_rules(app, db, field, expected):
    """Convert one MARC record and compare the serialized result to ``expected``.

    The record is passed through ``fix_grantor`` and ``fix_language``
    (``fix_keywords`` is not applied here), transformed with ``old_nusl.do``
    and round-tripped through ``ThesisMetadataSchemaV1``.

    If loading raises ``ValidationError`` the traceback is printed and the
    raw transformed data is dumped instead, so the final comparison still
    runs.

    ``app`` and ``db`` are not used in the body; presumably they are pytest
    fixtures requested for their side effects, and ``field`` / ``expected``
    presumably come from parametrization -- confirm against the test setup.
    """
    record = fix_language(fix_grantor(field))
    converted = old_nusl.do(record)
    thesis_schema = ThesisMetadataSchemaV1()

    try:
        loaded = thesis_schema.load(converted).data
    except ValidationError:
        # Report the problem but keep going with the unvalidated payload.
        traceback.print_exc()
        loaded = converted

    dumped = thesis_schema.dump(loaded).data

    # Debug output of the serialized record (plain, then pretty-printed).
    print("\n\n")
    print("MARSHMALLOWED")
    print(dumped)
    print("\n")
    pprint(dumped)
    print("\n\n")

    assert dumped == expected
def test_rules_oai(app, db):
    """Run every record of the OAI ListRecords sample through the conversion.

    Each record is fixed with ``fix_grantor``, transformed with
    ``old_nusl.do`` and round-tripped through ``ThesisMetadataSchemaV1``;
    the transformed and serialized data are printed. There are no
    assertions, so the test fails only if one of these steps raises.

    ``app`` and ``db`` are not used in the body; presumably they are pytest
    fixtures requested for their side effects -- confirm against conftest.
    """
    # NOTE(review): this is an absolute path on a developer machine; the test
    # cannot run elsewhere until it is made relative to the tests directory.
    with open(
            '/home/semtex/Projekty/nusl/invenio-initial-theses-conversion/tests'
            '/xml_files/oai_nusl_listrecords.xml',
            'rb') as xml_file:
        # Build the list inside the ``with`` block so the stream is fully
        # consumed before the file is closed.
        array = [create_record(data)
                 for data in split_stream_oai_nusl(xml_file)]

    # The schema does not depend on the record, so create it once.
    schema = ThesisMetadataSchemaV1()
    for field in array:
        rec = fix_grantor(field)
        transformed = old_nusl.do(rec)
        marshmallowed = schema.load(transformed).data
        marshmallowed = schema.dump(marshmallowed).data
        print(transformed)
        print("------------MARSHMALLOWED---------------------", marshmallowed)
def test_rules_3(app, db):
    """Run every record of the ``keywords_pipe.xml`` sample through the conversion.

    Each record is fixed with ``fix_grantor``, ``fix_keywords`` and
    ``fix_language``, transformed with ``old_nusl.do`` and round-tripped
    through ``ThesisMetadataSchemaV1``; the serialized result is printed.
    There are no assertions, so the test fails only if one of these steps
    raises.

    ``app`` and ``db`` are not used in the body; presumably they are pytest
    fixtures requested for their side effects -- confirm against conftest.
    """
    # NOTE(review): this is an absolute path on a developer machine; the test
    # cannot run elsewhere until it is made relative to the tests directory.
    with open(
            '/home/semtex/Projekty/nusl/invenio-initial-theses-conversion/tests'
            '/xml_files/keywords_pipe.xml',
            'rb') as xml_file:
        # Build the list inside the ``with`` block so the stream is fully
        # consumed before the file is closed.
        array = [create_record(data) for data in split_stream(xml_file)]

    # The schema does not depend on the record, so create it once.
    schema = ThesisMetadataSchemaV1()
    for field in array:
        rec = fix_grantor(field)
        rec = fix_keywords(rec)
        rec = fix_language(rec)
        transformed = old_nusl.do(rec)
        marshmallowed = schema.load(transformed).data
        marshmallowed = schema.dump(marshmallowed).data
        print(marshmallowed)
def data_loop_collector(break_on_error, error_counts, error_documents, gen, processed_ids, stop):
    """Convert, validate and import records yielded by ``gen``.

    For every item the record id is taken from the MARC ``controlfield``
    with tag ``001`` (a random UUID4 string is used when no such field is
    found). The item is then converted with ``create_record``, fixed,
    transformed with ``old_nusl.do``, validated with
    ``nusl_theses.validate`` and passed to
    ``nusl_theses.import_old_nusl_record``.

    :param break_on_error: when truthy, an exception caught while processing
        a record is re-raised after being logged; otherwise the loop moves on
        to the next record.
    :param error_counts: mapping updated with ``error_counts[field] += 1``
        for each ``ValidationError``; presumably a ``defaultdict(int)`` or
        ``Counter`` -- confirm against the caller.
    :param error_documents: mapping updated with
        ``error_documents[field].append(recid)``; presumably a
        ``defaultdict(list)`` -- confirm against the caller.
    :param gen: iterable of XML elements supporting ``.iter(tag)``.
    :param processed_ids: collection of already seen record ids; it is
        checked with ``in`` and extended with ``.add``.
    :param stop: converted with ``int``; the loop breaks as soon as the
        1-based record counter reaches this value, so at most
        ``int(stop) - 1`` records are handled.
    :return: ``None``. Returns early when a record id repeats.
    """
    i = 0
    for data in gen:
        i += 1
        print("Record number: ", i)
        # The counter is checked before any work, so record number ``stop``
        # itself is not processed.
        if i >= int(stop):
            break
        for cf in data.iter('{http://www.loc.gov/MARC21/slim}controlfield'):
            if cf.attrib['tag'] == '001':
                recid = cf.text
                break
        else:
            # No controlfield with tag 001 was found: fall back to a random id.
            recid = str(uuid.uuid4())

        ch.setRecord(data, recid)

        if recid in processed_ids:
            logging.warning(
                'Record with id %s already parsed, probably end of stream',
                recid)
            return

        processed_ids.add(recid)
        # Kept outside the ``try`` so the error handler below can log it even
        # when the failure happens before validation.
        marshmallowed = None
        try:
            # Convert the XML into a GroupableOrderedDict
            rec = create_record(data)
            # Check whether the doctype is a university thesis; other types
            # are not processed. Records without a truthy '980__' field are
            # not skipped by this check.
            if rec.get('980__') and rec['980__'].get('a') not in (
                    'bakalarske_prace',
                    'diplomove_prace',
                    'disertacni_prace',
                    'habilitacni_prace',
                    'rigorozni_prace'):
                continue

            # Fix data before transformation into JSON
            rec = fix_language(rec)
            rec = fix_grantor(rec)  # Unify the grantor under field 7102
            # rec = fix_keywords(rec)

            transformed = old_nusl.do(
                rec)  # Convert the GroupableOrderedDict into a dict
            ch.setTransformedRecord(transformed)

            try:
                # Validate the data against the Marshmallow and JSON schemas
                marshmallowed = nusl_theses.validate(transformed)
            except ValidationError as e:
                error_counts[e.field_name] += 1
                error_documents[e.field_name].append(recid)
                # Errors on ignored fields only skip the record; any other
                # validation error is re-raised into the outer handler.
                if e.field_name not in IGNORED_ERROR_FIELDS:
                    raise
                continue

            # Store the record in the database / Invenio
            nusl_theses.import_old_nusl_record(marshmallowed)
        except Exception as e:
            logging.exception('Error in transformation')
            logging.error('data %s', marshmallowed)
            if break_on_error:
                raise