def test_rules(app, db, field, expected):
    """Check that one source record converts to the expected thesis metadata.

    The record is passed through ``fix_grantor`` and ``fix_language``
    (``fix_keywords`` is not applied here), transformed with ``old_nusl.do``
    and then loaded and dumped through ``ThesisMetadataSchemaV1``.  If loading
    raises ``ValidationError`` the traceback is printed and the unvalidated
    transformation result is dumped instead, so the final comparison still
    runs.

    :param app: not used in the body (presumably a pytest fixture).
    :param db: not used in the body (presumably a pytest fixture).
    :param field: the source record to convert.
    :param expected: the value the dumped result must equal.
    """
    fixed = fix_language(fix_grantor(field))
    transformed = old_nusl.do(fixed)
    schema = ThesisMetadataSchemaV1()
    try:
        loaded = schema.load(transformed).data
    except ValidationError:
        # Report the validation problem but keep going with the raw data.
        traceback.print_exc()
        loaded = transformed
    dumped = schema.dump(loaded).data

    # Debug output: a banner, the plain repr, then a pretty-printed copy.
    print("\n\n", "MARSHMALLOWED", dumped, "\n", sep="\n")
    pprint(dumped)
    print("\n\n")

    assert dumped == expected
def run(url, break_on_error, cache_dir, clean_output_dir, start):
    """Collect the distinct organisational units found in field ``7102_``.

    Records come from ``url_nusl_data_generator`` when ``url`` starts with
    ``http`` and from ``file_nusl_data_generator`` otherwise.  For every
    record, each ``7102_`` entry contributes a
    ``(university, faculty, department, language)`` tuple built from its
    subfields ``a``, ``g``, ``b`` and ``9``.  The collected set is written to
    ``departments.json`` as a list when the function exits, whether it ends
    normally, returns early or propagates an exception.

    Processing stops as soon as a record id is seen for the second time.

    :param url: source location; passed to the selected data generator.
    :param break_on_error: when truthy, an exception raised while processing
        a record is re-raised after being logged; otherwise it is only logged.
    :param cache_dir: passed to the selected data generator.
    :param clean_output_dir: when truthy, ``ERROR_DIR`` is removed if it exists.
    :param start: passed to the selected data generator.
    """
    controlfield_tag = '{http://www.loc.gov/MARC21/slim}controlfield'
    seen_ids = set()
    units = set()

    if clean_output_dir and os.path.exists(ERROR_DIR):
        shutil.rmtree(ERROR_DIR)

    make_generator = (url_nusl_data_generator if url.startswith('http')
                      else file_nusl_data_generator)
    records = make_generator(start, url, cache_dir)

    try:
        for data in records:
            # The record id is the text of controlfield 001; a random UUID
            # stands in when the record has no such field.
            for controlfield in data.iter(controlfield_tag):
                if controlfield.attrib['tag'] == '001':
                    recid = controlfield.text
                    break
            else:
                recid = str(uuid.uuid4())

            if recid in seen_ids:
                logging.warning('Record with id %s already parsed, probably end of stream', recid)
                return
            seen_ids.add(recid)

            try:
                rec = fix_grantor(create_record(data))
                if "7102_" not in rec:
                    continue
                for unit in aslist(rec["7102_"]):
                    if not isinstance(unit, dict):
                        print("Org unit is not an object: ", unit, recid)
                        continue

                    university, faculty, department, language = (
                        unit.get("a"), unit.get("g"), unit.get("b"), unit.get("9"))

                    # Only keep units whose hierarchy is complete from the top.
                    if faculty and not university:
                        logging.error("No university for faculty: %s", rec)
                    elif department and not faculty:
                        logging.error("No faculty for department: %s", rec)
                    else:
                        units.add((university, faculty, department, language))
            except Exception:
                logging.exception('Error in transformation')
                if break_on_error:
                    raise
    finally:
        # Always persist whatever has been collected so far.
        with open("departments.json", "w") as f:
            json.dump(list(units), f)
def test_rules_oai(app, db):
    """Run every record of the OAI ListRecords fixture through the conversion.

    Each record is passed through ``fix_grantor``, transformed with
    ``old_nusl.do`` and loaded and dumped through ``ThesisMetadataSchemaV1``.
    The test makes no assertions; it passes when no step raises, and prints
    the transformed and dumped data for inspection.

    :param app: not used in the body (presumably a pytest fixture).
    :param db: not used in the body (presumably a pytest fixture).
    """
    # NOTE(review): the fixture path is absolute and machine-specific.
    with open(
            '/home/semtex/Projekty/nusl/invenio-initial-theses-conversion/tests'
            '/xml_files/oai_nusl_listrecords.xml',
            'rb') as xml_file:
        # Build all records while the file is still open, so the handle is
        # closed deterministically instead of being leaked.
        array = [create_record(data)
                 for data in split_stream_oai_nusl(xml_file)]
    for field in array:
        rec = fix_grantor(field)
        transformed = old_nusl.do(rec)
        schema = ThesisMetadataSchemaV1()
        marshmallowed = schema.load(transformed).data
        marshmallowed = schema.dump(marshmallowed).data
        print(transformed)
        print("------------MARSHMALLOWED---------------------", marshmallowed)
def test_rules_3(app, db):
    """Run every record of the ``keywords_pipe.xml`` fixture through the conversion.

    Each record is passed through ``fix_grantor``, ``fix_keywords`` and
    ``fix_language``, transformed with ``old_nusl.do`` and loaded and dumped
    through ``ThesisMetadataSchemaV1``.  The test makes no assertions; it
    passes when no step raises, and prints the dumped data for inspection.

    :param app: not used in the body (presumably a pytest fixture).
    :param db: not used in the body (presumably a pytest fixture).
    """
    # NOTE(review): the fixture path is absolute and machine-specific.
    with open(
            '/home/semtex/Projekty/nusl/invenio-initial-theses-conversion/tests'
            '/xml_files/keywords_pipe.xml',
            'rb') as xml_file:
        # Build all records while the file is still open, so the handle is
        # closed deterministically instead of being leaked.
        array = [create_record(data) for data in split_stream(xml_file)]
    for field in array:
        rec = fix_grantor(field)
        rec = fix_keywords(rec)
        rec = fix_language(rec)
        transformed = old_nusl.do(rec)
        schema = ThesisMetadataSchemaV1()
        marshmallowed = schema.load(transformed).data
        marshmallowed = schema.dump(marshmallowed).data
        print(marshmallowed)
# Example no. 5
# 0
def data_loop_collector(break_on_error, error_counts, error_documents, gen,
                        processed_ids, stop):
    """Convert, validate and import the records produced by ``gen``.

    Records are counted from 1 and the loop ends as soon as the counter
    reaches ``int(stop)``, so the record with that number is not processed.
    The function returns early when a record id is met that is already in
    ``processed_ids``.

    Records that have a ``980__`` field whose ``a`` subfield is not one of the
    thesis document types are skipped.  The remaining records are fixed,
    transformed with ``old_nusl.do``, validated with ``nusl_theses.validate``
    and handed to ``nusl_theses.import_old_nusl_record``.

    :param break_on_error: when truthy, an exception raised while processing
        a record is re-raised after being logged; otherwise it is only logged.
    :param error_counts: mapping incremented per failing field name on a
        ``ValidationError``.
    :param error_documents: mapping of field name to a list; the record id is
        appended on a ``ValidationError``.
    :param gen: iterable of XML record elements.
    :param processed_ids: set of record ids already handled; updated in place.
    :param stop: record number at which the loop ends (converted with ``int``).
    """
    controlfield_tag = '{http://www.loc.gov/MARC21/slim}controlfield'
    # Document types that denote a university thesis.
    thesis_doctypes = (
        'bakalarske_prace',
        'diplomove_prace',
        'disertacni_prace',
        'habilitacni_prace',
        'rigorozni_prace')

    for number, data in enumerate(gen, 1):
        print("Record number: ", number)
        if number >= int(stop):
            break

        # The record id is the text of controlfield 001; a random UUID stands
        # in when the record has no such field.
        for controlfield in data.iter(controlfield_tag):
            if controlfield.attrib['tag'] == '001':
                recid = controlfield.text
                break
        else:
            recid = str(uuid.uuid4())

        # ``ch`` is defined elsewhere in the module; it is told which record
        # is current (presumably for log context — confirm at its definition).
        ch.setRecord(data, recid)

        if recid in processed_ids:
            logging.warning(
                'Record with id %s already parsed, probably end of stream',
                recid)
            return
        processed_ids.add(recid)

        marshmallowed = None
        try:
            # Convert the XML into a GroupableOrderedDict.
            rec = create_record(data)

            # Only university theses are processed; other doctypes are skipped.
            if rec.get('980__') and rec['980__'].get('a') not in thesis_doctypes:
                continue

            # Fix the data before the transformation into JSON.
            rec = fix_language(rec)
            rec = fix_grantor(rec)  # unify the grantor under field 7102

            # Convert the GroupableOrderedDict into a plain dict.
            transformed = old_nusl.do(rec)
            ch.setTransformedRecord(transformed)

            try:
                # Validate the data against the Marshmallow and JSON schemas.
                marshmallowed = nusl_theses.validate(transformed)
            except ValidationError as err:
                failed_field = err.field_name
                error_counts[failed_field] += 1
                error_documents[failed_field].append(recid)
                if failed_field not in IGNORED_ERROR_FIELDS:
                    # Handled by the outer ``except`` like any other failure.
                    raise
                continue

            # Store the record in the database / Invenio.
            nusl_theses.import_old_nusl_record(marshmallowed)
        except Exception:
            logging.exception('Error in transformation')
            logging.error('data %s', marshmallowed)
            if break_on_error:
                raise