Example #1
 def convert_marc_xml(self, db_update_obj):
     merged = None  # avoid UnboundLocalError when the source is not a MARC file
     # check if it is a marc file
     if self.source.endswith('.mrc') or self.source.endswith('.marc'):
         # for each .mrc file create a sub-folder based on timestamp to store converted MARC/XML files
         subfolder_name = '%s_%s' %(self.source.replace("Webapp/source/MARC/", "").replace("Webapp/source/BIBFRAME/", "").split('.')[0], datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
         db_update_obj.master_file = subfolder_name
         db_update_obj.save()
         subfolder = os.path.join(self.folder, subfolder_name)
         if not os.path.exists(subfolder):
             os.makedirs(subfolder)
         # create the BIBFRAME converter object
         BIBFRAME = XML_BIBFRAME(subfolder_name)
         output = os.path.join(subfolder, '')
         with open(self.source, "rb") as marc_file:
             reader = MARCReader(marc_file, to_unicode=True, force_utf8=False, utf8_handling='ignore')
             for i, record in enumerate(reader):
                 #print ("converting record number %s to XML" %(str(i)))
                 if record.title():
                     ti = record.title()
                     ti = ti.replace("/", "")
                     ti = ti.replace(" ", "_")
                     ti = ti[0:50]
                     writer = XMLWriter(open(output + ti + '.xml','wb'))
                     writer.write(record)
                     writer.close()
                 else:
                     writer = XMLWriter(open(output + 'unknownTitle' + str(i) + '.xml', 'wb'))
                     writer.write(record)
                     writer.close()
         #convert MARC/XML to BIBFRAME
         db_update_obj.stage = "MARC-XML_to_BIBFRAME"
         db_update_obj.save()
         BIBFRAME.convert_to_BIBFRAME(i, db_update_obj)
         # merge the BIBFRAME files into one (per the master MARC file) for ease of processing
         merged = BIBFRAME.merger()
     return merged
Example #2
from pymarc import XMLWriter

def writeRecordToFile(record, filename):
    writer = None
    try:
        writer = XMLWriter(open(filename, 'wb'))
        writer.write(record)
    except Exception:
        print("Error: Cannot write metadata to " + filename)
    finally:
        if writer is not None:
            writer.close()  # Important!
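A minimal usage sketch for writeRecordToFile (assuming the classic, pre-5.0 pymarc API used throughout these examples; the record built here is purely illustrative):

from pymarc import Record, Field

record = Record()
record.add_field(Field(tag='245', indicators=['0', '0'],
                       subfields=['a', 'An example title']))
writeRecordToFile(record, 'example.xml')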
Example #3
from pymarc import MARCReader, XMLWriter

def main():
    files = parser.parse_args().files  # assumes a module-level argparse parser (see below)
    for filename in files:
        print(filename)
        # MARC files must be opened in binary mode for MARCReader
        with open(filename, 'rb') as fp:
            reader = MARCReader(fp)
            writer = XMLWriter(open(filename.split('.')[0] + '.xml', 'wb'))
            for record in reader:
                writer.write(record)
            writer.close()  # Important!
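Example #3 reads from a module-level parser that is not shown; a minimal definition it could rely on (an assumption, using argparse):

import argparse

parser = argparse.ArgumentParser(
    description='Convert MARC files to MARCXML')
parser.add_argument('files', nargs='+', help='MARC file(s) to convert')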
Example #4
def upload_success(filename):
    if request.method == 'POST':
        file = "uploads/" + filename
        if not os.path.exists("processed"):
            os.makedirs("processed")
        with open(file, "rb") as infile:
            ts = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
            reader = MARCReader(infile,
                                to_unicode=True,
                                force_utf8=False,
                                utf8_handling='ignore')
            files = []
            for i, record in enumerate(reader):
                print("converting record number " + str(i) + " to XML")
                if record.title():
                    ti = record.title()
                    ti = ti.replace("/", "")
                    ti = ti.replace(" ", "_")
                    ti = ti[0:50]
                    writer = XMLWriter(open('processed/' + ti + '.xml', 'wb'))
                    writer.write(record)
                    writer.close()
                    files.append(ti)
                else:
                    writer = XMLWriter(
                        open('processed/' + 'unknownTitle' + str(i) + '.xml',
                             'wb'))
                    writer.write(record)
                    writer.close()
                    files.append('unknownTitle' + str(i))
            #print (file + " was successfully converted to XML")
            xslt = ET.parse("marc2bibframe2-master/xsl/marc2bibframe2.xsl")
            tf = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
            #print("walltime:", datetime.strptime(tf, '%H:%M:%S') - datetime.strptime(ts, '%H:%M:%S'))
            for i, file in enumerate(files):
                #print ("start processing record number " + str(i))
                #print ("starting BIBFRAME transformation")
                f = "processed/" + file + ".xml"
                dom = ET.parse(f)
                transform = ET.XSLT(xslt)
                newdom = transform(dom)
                if not os.path.exists("BIB"):
                    os.makedirs("BIB")
                with open("BIB/" + file + ".xml", "w+") as oo:
                    oo.write(str(newdom).replace('<?xml version="1.0"?>', ''))
                #print ("starting enrichment process")
                main("BIB/" + file + ".xml")
                tf = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
                #print("walltime:", datetime.strptime(tf, '%H:%M:%S') - datetime.strptime(ts, '%H:%M:%S'))
        return redirect(url_for('process_success', files=file))
    return ''' 
Example #7
 def convert_marc_xml(self):
     # for each .marc file in the "marc" folder, convert marc to MARC/XML and then to BIBFRAME
     for index, files in enumerate(os.listdir(self.source)):
         # check if it is a marc file
         if files.endswith('.mrc') or files.endswith('.marc'):
             # for each .mrc file create a sub-folder based on timestamp to store converted MARC/XML files
             subfolder_name = '%s_%s' % (
                 files.split('.')[0], datetime.fromtimestamp(
                     time.time()).strftime('%Y-%m-%d %H:%M:%S'))
             subfolder = os.path.join(self.folder, subfolder_name)
             if not os.path.exists(subfolder):
                 os.makedirs(subfolder)
             # create the BIBFRAME converter object
             BIBFRAME = XML_BIBFRAME(subfolder_name)
             file = os.path.join(self.source, files)
             output = os.path.join(subfolder, '')
             with open(file, "rb") as marc_file:
                 reader = MARCReader(marc_file,
                                     to_unicode=True,
                                     force_utf8=False,
                                     utf8_handling='ignore')
                 for i, record in enumerate(reader):
                     print(
                         "converting record number %s of file number %s to XML"
                         % (str(i), str(index + 1)))
                     if record.title():
                         ti = record.title()
                         ti = ti.replace("/", "")
                         ti = ti.replace(" ", "_")
                         ti = ti[0:50]
                         writer = XMLWriter(open(output + ti + '.xml',
                                                 'wb'))
                         writer.write(record)
                         writer.close()
                     else:
                         writer = XMLWriter(
                             open(output + 'unknownTitle' + str(i) + '.xml',
                                  'wb'))
                         writer.write(record)
                         writer.close()
             #convert MARC/XML to BIBFRAME
             BIBFRAME.convert_to_BIBFRAME(i)
             # merge the BIBFRAME files into one (per the master MARC file) for ease of processing
             BIBFRAME.merger()
Example #8
    def convert(self, filename):
        """Convert file into zero or more MARC records."""
        g = Graph()
        with open(filename, 'r') as fh:
            g.parse(file=fh, format='n3')

        # Get JSON-LD object in PyLD form
        jld = pyld_jsonld_from_rdflib_graph(g)
        # print(json.dumps(jld, indent=2, sort_keys=True))

        # Manipulate the JSON in some way (say switch @context
        # to a reference) and output
        # -- no easy way to do this is rdflib
        with open('cache/biblioteko_context.json', 'r') as cfh:
            context = json.load(cfh)
        with open('cache/biblioteko_frame.json', 'r') as ffh:
            frame = json.load(ffh)
        comp = jsonld.compact(jld, ctx=context)
        comp = jsonld.compact(jsonld.frame(comp, frame),
                              ctx=context,
                              options={'graph': True})
        if (self.dump_json):
            sys.stderr.write("Framed and compacted JSON-LD:\n" +
                             json.dumps(comp, indent=2, sort_keys=True) + '\n')
        self.jsonld = comp

        memory = BytesIO()
        writer = XMLWriter(memory)
        for obj in self.jsonld['@graph']:
            if (obj.get('type') == 'bf:Instance'):
                writer.write(self.make_marc(obj))
            else:
                logging.info(
                    "Ignoring object %s type %s" %
                    (obj.get('id', 'NO-URI'), obj.get('type', 'NO-TYPE')))
        writer.close(close_fh=False)  # Important!
        xml = memory.getvalue().decode(encoding='UTF-8')
        # Dumb semi pretty print
        xml = re.sub(r'(</\w+>)', r'\1\n', xml)
        xml = re.sub(r'(\?>)', r'\1\n', xml)
        xml = re.sub(r'(<record>)', r'\n\1\n', xml)
        return xml
Example #9
from io import BytesIO
import re
from pymarc import XMLWriter

def marc_record_to_xml_string(record):
    b = BytesIO()
    writer = XMLWriter(b)
    writer.write(record)
    writer.close(close_fh=False)

    # Transform the record from bytes to string.
    b.seek(0, 0)
    marc = b.read().decode('UTF-8')

    # Remove the XML declaration and collection wrapper.
    marc = re.sub(r'^.*<collection[^>]*>', '', marc)
    marc = re.sub(r'</collection>$', '', marc)
    # Remove tab characters.
    marc = re.sub(r'\t', '', marc)

    # Verify cleaning worked:
    if not marc.startswith('<record'):
        print("Error: record failed to create clean XML.")
        return False
    else:
        return marc
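A minimal round-trip sketch for marc_record_to_xml_string (again assuming the classic, pre-5.0 pymarc API; the field values are illustrative):

from pymarc import Record, Field

record = Record()
record.add_field(Field(tag='245', indicators=['0', '0'],
                       subfields=['a', 'Some title']))
xml = marc_record_to_xml_string(record)
if xml:
    print(xml)  # begins with '<record' once the wrapper is stripped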
Example #10
    def create_MARC_XML(self):
        """
        Creates a MARC XML format file from the given dataframe
        :return:
        """
        df = self.marc_data
        #  MARCXML file
        output_file = self.data_path_processed / (
            self.collection_id + "_final_" + self.dt_now + ".xml")
        writer = XMLWriter(open(output_file, "wb"))

        # MarcEdit MRK file
        output_file_mrk = self.data_path_processed / (
            self.collection_id + "_finalMRK_" + self.dt_now + ".txt")
        mrk_file = open(output_file_mrk, "w", encoding="utf8")

        start_time = time.time()
        counter = 1

        for index, row in df.iterrows():

            record = Record()

            # add control field
            record.add_field(Field(tag="001", data=str(index)))

            for col in df:
                # if field is empty, skip
                if str(row[col]) == "":
                    continue
                # leader
                elif col == "LDR":
                    l = record.leader
                    l.record_status = "c"  # c - Corrected or revised
                    l.type_of_record = "p"  # p - Mixed materials

                    # Bibliographic level
                    if row["351"] in ("File Record", "Item Record"):
                        l.bibliographic_level = "c"
                    else:
                        l.bibliographic_level = "d"

                    l.coding_scheme = "a"  # flag saying this record is utf8
                    l.cataloging_form = "a"
                    continue

                # 008
                elif col == "008":
                    field = Field(tag="008", data=row[col])
                    record.add_field(field)
                    continue

                # extract field name
                field = col[:3]

                # extract indicators
                if col.find("_") == -1:
                    col_name = "{:<5}".format(col)
                    ind = [col_name[3], col_name[4]]
                else:
                    col_name = "{:<5}".format(col[:col.find("_")])
                    ind = [col_name[3], col_name[4]]

                # extract sub-fields
                subfields_data = list()
                subfields_prep = list(filter(None, str(row[col]).split("$$")))
                for subfield in subfields_prep:
                    if subfield == "":
                        continue
                    subfields_data.append(subfield[0])
                    subfields_data.append(subfield[1:])

                # print('field:', field)
                # if not ind:
                #     print('no indicators')
                # else:
                #     print('indicators:', ind)
                # # print('subfields:', subfields_data)

                record.add_field(
                    Field(tag=field, indicators=ind, subfields=subfields_data))

            counter += 1
            # mrk_file.write(record.as_marc())
            writer.write(record)
        writer.close()
        mrk_file.close()
        run_time = time.time() - start_time

        return counter, run_time
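The subfield handling in create_MARC_XML assumes cell values encoded like '$$aMain title$$bSubtitle'; a standalone sketch of that split (the sample value is illustrative):

cell = "$$aMain title$$bA subtitle"
subfields_data = []
for subfield in filter(None, cell.split("$$")):
    subfields_data.append(subfield[0])   # subfield code, e.g. 'a'
    subfields_data.append(subfield[1:])  # subfield value
# subfields_data == ['a', 'Main title', 'b', 'A subtitle']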
Example #11
                subfields = [ 'w', wval,
                              'a', str(label) ]
            else:
                subfields = [ 'a', str(label) ]
            
            rec.add_field(
                Field(
                    tag='550',
                    indicators = [' ', ' '],
                    subfields = subfields
                )
            )    
    
    # skos:note -> 680
    for note in sorted(g.objects(conc, SKOS.note), key=lambda v:v.language):
        rec.add_field(
            Field(
                tag='680',
                indicators = [' ', ' '],
                subfields=[ 'i', str(note),
                            '9', format_language(note.language) ]
            )
        )
    
    # TODO 750 links to ysa/allars/lcsh - need to look up labels for the URIs

    writer.write(rec)


writer.close()
Example #12
def convert(cs, language, g):

    vocId = cs.get("vocabulary_code")

    # helper variables for the somewhat complicated constants, cast/converted to appropriate types
    helper_variables = {
        "vocCode" : (cs.get("vocabulary_code") + "/" + LANGUAGES[language] \
            if cs.getboolean("multilanguage", fallback=False) \
            else vocId),
        "groupingClasses" : [URIRef(x) for x in cs.get("groupingClasses", fallback=",".join(GROUPINGCLASSES)).split(",")],
        "groupingClassesDefault" : [URIRef(x) for x in cs.parser.get("DEFAULT", "groupingClasses", fallback=",".join(GROUPINGCLASSES)).split(",")],
        'modificationDates': cs.get("modificationDates", fallback=None),
        'keepModified' : cs.get("keepModifiedAfter", fallback=None),
        'keepGroupingClasses' : cs.getboolean("keepGroupingClasses", fallback=False),
        'defaultOutputFileName' : "yso2marc-" + cs.name.lower() + "-" + language + ".mrcx"
    }

    if helper_variables['keepModified']:
        helper_variables['keepModifiedLimit'] = False \
        if cs.get("keepModifiedAfter", fallback=KEEPMODIFIEDAFTER).lower() == "all" \
        else datetime.date(datetime.strptime(cs.get("keepModifiedAfter"), "%Y-%m-%d"))

    if cs.get("output", fallback=None):
        parts = cs.get("languages").split(",")
        if len(parts) > 1:
            output = cs.get("output")
            if len(output.split(".")) > 1:
                helper_variables["outputFileName"] = ".".join(
                    output.split(".")
                    [:-1]) + "-" + language + "." + output.split(".")[-1]
            else:
                helper_variables["outputFileName"] = output + "-" + language
    if not "outputFileName" in helper_variables:
        helper_variables["outputFileName"] = cs.get(
            "output", fallback=helper_variables["defaultOutputFileName"])

    # modified_dates is a dict keyed by record id; the value is a tuple of the
    # record's last modification date and an MD5 digest of the record contents
    if helper_variables['modificationDates']:
        if os.path.isfile(helper_variables['modificationDates']):
            with open(helper_variables['modificationDates'],
                      'rb') as pickle_file:
                try:
                    modified_dates = pickle.load(pickle_file)
                except EOFError:
                    logging.error(
                        "The file %s for modification dates is empty " %
                        helper_variables['modificationDates'])
                    sys.exit(2)
        else:
            modified_dates = {}

    logging.info(
        "Processing vocabulary with vocabulary code '%s' in language '%s'" %
        (vocId, language))
    incrementor = 0
    writer_records_counter = 0
    ET_namespaces = {"marcxml": "http://www.loc.gov/MARC21/slim"}

    handle = open(
        cs.get("output", fallback=helper_variables["defaultOutputFileName"]),
        "wb")
    writer = XMLWriter(handle)

    # collect prefLabels so that altLabels that appear as a prefLabel in another concept can be filtered out
    pref_labels = set()
    for conc in g.subjects(RDF.type, SKOS.Concept):
        pref_label = g.preferredLabel(conc, lang=language)
        if pref_label:
            pref_labels.add(str(pref_label[0][1]))

    # only these MTS concept groups are included; no MARC21 record is made for the group name itself
    ids = {"occupations": ['m2332'], "titles": ['m121', 'm3764']}

    uris = {}
    for key in ids:
        uris[key] = set()
        for id in ids[key]:
            uris[key].add(MTS + id)

    for group in g.subjects(RDF.type, ISOTHES.ConceptGroup):
        for key in uris:
            if any(str(group).endswith(uri) for uri in uris[key]):
                get_member_groups(g, group, uris[key])

    concs = []
    if helper_variables['keepModified']:
        for uri in modified_dates:
            if modified_dates[uri][0] >= helper_variables['keepModifiedLimit']:
                concs.append(URIRef(uri))
    else:
        for conc in g.subjects(RDF.type, SKOS.Concept):
            concs.append(conc)

    # ids of the created concepts, used with the modification_dates list to detect deleted concepts
    created_concepts = set()

    for concept in concs:
        # MARC21 records are created only for occupations and titles
        if not (concept in uris['occupations'] or concept in uris['titles']):
            continue
        created_concepts.add(str(concept))
        incrementor += 1
        if incrementor % 1000 == 0:
            logging.info("Processing %sth concept" % (incrementor))

        # skip grouping concepts
        if not helper_variables['keepGroupingClasses']:
            if any(conceptType in helper_variables["groupingClasses"]
                   for conceptType in g.objects(concept, RDF.type)):
                continue

        rec = Record()

        rec.leader = cs.get("leaderNew", fallback=LEADERNEW)

        # 024 other standard identifiers - the concept URI is stored here
        rec.add_field(
            Field(tag='024',
                  indicators=['7', ' '],
                  subfields=['a', concept, '2', "uri"]))

        # 040 cataloging organization
        rec.add_field(
            Field(tag='040',
                  indicators=[' ', ' '],
                  subfields=[
                      'a',
                      cs.get("creatorAgency", fallback=CREATOR_AGENCY), 'b',
                      LANGUAGES[language], 'f', helper_variables["vocCode"]
                  ]))

        valueProps = sorted(getValues(g,
                                      concept,
                                      SKOS.prefLabel,
                                      language=language),
                            key=lambda o: o.value)
        if len(valueProps) == 0:
            logging.warning(
                "Could not find preflabel for concept %s in language %s. Skipping the whole concept."
                % (concept, language))
            continue
        elif len(valueProps) != 1:
            logging.warning(
                "Multiple prefLabels detected for concept %s in language %s. Choosing the first."
                % (concept, language))

        if concept in uris['occupations']:
            tag = "174"
            subfield_code = "a"
        elif concept in uris['titles']:
            tag = "168"
            subfield_code = "d"

        rec.add_field(
            Field(tag=tag,
                  indicators=[' ', ' '],
                  subfields=[
                      subfield_code,
                      decomposedÅÄÖtoUnicodeCharacters(
                          unicodedata.normalize(NORMALIZATION_FORM,
                                                str(valueProps[0].value)))
                  ]))

        # skos:altLabel -> 467, 474
        # 450 see reference
        # do not generate 45X fields that occur as 15X fields in another concept, except for altLabels
        seen_values = set()

        for valueProp in sorted(getValues(g,
                                          concept, [SKOS.altLabel],
                                          language=language),
                                key=lambda o: str(o.value)):
            if valueProp.prop != SKOS.altLabel and str(
                    valueProp.value) in pref_labels:
                continue
            if valueProp.prop == SKOS.hiddenLabel:
                if str(valueProp.value) in seen_values:
                    continue
            seen_values.add(str(valueProp.value))
            if concept in uris['occupations']:
                tag = "474"
                subfield_code = "a"
            elif concept in uris['titles']:
                tag = "468"
                subfield_code = "d"

            rec.add_field(
                Field(tag=tag,
                      indicators=[' ', ' '],
                      subfields=[
                          subfield_code,
                          decomposedÅÄÖtoUnicodeCharacters(
                              unicodedata.normalize(NORMALIZATION_FORM,
                                                    str(valueProp.value)))
                      ]))

        valueProps = getValues(g, concept, [
            SKOS.prefLabel, SKOS.exactMatch, SKOS.closeMatch, SKOS.broadMatch,
            SKOS.narrowMatch, SKOS.relatedMatch
        ])

        fields = []  # collect the fields here; they are sorted at the end

        for valueProp in valueProps:
            if valueProp.prop == SKOS.prefLabel:
                # filter out labels in the same language, which already went to the 1XX fields
                # exceptionally, valueProp.value here already contains the desired literal
                # (cf. the other cases, where it is a node)
                if valueProp.value.language == language:
                    continue

            else:
                # keep only references to the same vocabulary
                continue

            if concept in uris['occupations']:
                tag = "774"
                subfield_code = "a"
            elif concept in uris['titles']:
                tag = "768"
                subfield_code = "d"

            sub2 = "mts" + "/" + LANGUAGES[valueProp.value.language]
            fields.append(
                Field(tag=tag,
                      indicators=[' ', ' '],
                      subfields=[
                          subfield_code,
                          decomposedÅÄÖtoUnicodeCharacters(
                              unicodedata.normalize(NORMALIZATION_FORM,
                                                    str(valueProp.value))),
                          '4', 'EQ', '2', sub2, '0', concept
                      ]))

        # sort fields and add them
        for sorted_field in sorted(fields,
                                   key=lambda o: (o.tag, o.value().lower())):
            rec.add_field(sorted_field)

        writer_records_counter += 1
        writer.write(rec)

        if helper_variables['modificationDates']:
            md5 = hashlib.md5()
            md5.update(str.encode(str(rec)))
            hash = md5.hexdigest()
            if str(concept) in modified_dates:
                if not hash == modified_dates[str(concept)][1]:
                    modified_dates[str(concept)] = (date.today(), hash)
            else:
                modified_dates[str(concept)] = (date.today(), hash)

    # generate deleted concepts when fetching the modified concepts:
    # if a record is in the file given by the modification_dates parameter
    # but not in the graph, it is interpreted as a deleted record
    # MTS has no deprecation dates

    if helper_variables['keepModified']:
        concs = []
        for conc in g.subjects(RDF.type, SKOS.Concept):
            if conc in uris['occupations'] or conc in uris['titles']:
                concs.append(str(conc))
        for conc in modified_dates:
            if conc not in concs:
                # if the hash (second value of the tuple) has not been cleared yet, create a deprecated concept
                if modified_dates[conc][1]:
                    rec = Record()
                    rec.leader = cs.get("leaderDeleted0",
                                        fallback=LEADERDELETED0)
                    rec.add_field(
                        Field(tag='024',
                              indicators=['7', ' '],
                              subfields=['a', conc, '2', "uri"]))
                    modified_dates[conc] = (date.today(), "")
                    writer_records_counter += 1
                    writer.write(rec)

    if handle is not sys.stdout:
        writer.close()

    if helper_variables['modificationDates']:
        with open(helper_variables['modificationDates'], 'wb') as output:
            pickle.dump(modified_dates, output, pickle.HIGHEST_PROTOCOL)

    # when all concepts have been created, finally re-serialize them in pretty-printed XML form
    #if not helper_variables['keepModified']:
    parser = ET.XMLParser(remove_blank_text=True, strip_cdata=False)
    file_path = helper_variables["outputFileName"]
    tree = ET.parse(file_path, parser)
    e = tree.getroot()
    handle = open(
        cs.get("output", fallback=helper_variables["defaultOutputFileName"]),
        "wb")
    handle.write(
        ET.tostring(e,
                    encoding='UTF-8',
                    pretty_print=True,
                    xml_declaration=True))

    if handle is not sys.stdout:
        handle.close()

    # log some information about the conversion
    logging.info("Processed %s concepts. Wrote %s MARCXML records." %
                 (incrementor, writer_records_counter))

    if cs.get("outputSpecified", fallback=None) == None:
        outputChannel = sys.stdout.buffer
        with open(
                cs.get("output",
                       fallback=helper_variables['defaultOutputFileName']),
                "rb") as f:
            shutil.copyfileobj(f, outputChannel)
    if cs.get("outputSpecified", fallback=None) == None:
        os.remove(
            cs.get("output",
                   fallback=helper_variables['defaultOutputFileName']))

    logging.info("Conversion completed: %s" %
                 datetime.now().replace(microsecond=0).isoformat())
Example #13
def compare_records(args):
    """
    input_file_1: Vertailtavien MARC-tietueiden tiedostonimi
    input_file_2: Tiedostonimi MARC-tietueille, joista tallennetaan muokatut ja uudet
    output_file: muokatuista ja uusista tietueista muodostetun MARCXML-tiedoston nimi
    pickle_file: tallettaa muutospäivämäärät pickle-tiedostoon date_1 ja date_2 parametrien mukaan
    date_1: alkuperäinen päivämäärä
    date_2: muutospäivämäärä
    """
    #git rev-list -1 --before="2019-08-23 23:59" master
    #git log
    
    input_file_1 = args.first_input_file
    input_file_2 = args.second_input_file 
    mrcx_file = args.output_mrcx
    pickle_file = args.output_pkl
    date_1 = args.original_date
    date_2 = args.modified_date
    
    modified_records = 0    
    new_records = 0
    all_records = {}
    
    loglevel = logging.INFO
    logger = logging.getLogger()
    logger.setLevel(loglevel)
    
    if date_1:
        old_date = datetime.date(datetime.strptime(date_1, "%Y-%m-%d"))
    else:
        old_date = date.fromtimestamp(os.path.getmtime(input_file_1))
    
    if date_2:
        new_date = datetime.date(datetime.strptime(date_2, "%Y-%m-%d"))
    else:
        new_date = date.fromtimestamp(os.path.getmtime(input_file_2))    
    
    writer = XMLWriter(open(mrcx_file, "wb"))
    records = parse_xml_to_array(input_file_1)
    
    old_records_dict = {}
    for record in records:
        md5 = hashlib.md5()        
        md5.update(str.encode(str(record)))
        for field in record.get_fields('024'):
            old_records_dict.update({field['a']: md5.hexdigest()})
    records = parse_xml_to_array(input_file_2)

    for record in records:
        record_id = None
        modified = False
        modified_date = old_date
        for field in record.get_fields('024'):
            record_id = field['a']
        # fingerprint every record so that new records also get a hash
        md5 = hashlib.md5()
        md5.update(str.encode(str(record)))
        hash = md5.hexdigest()
        if record_id:
            if record_id in old_records_dict:
                old_hash = old_records_dict[record_id]
                if old_hash != hash:
                    modified = True
                    modified_records += 1
            else:
                modified = True
                new_records += 1
        else:
            logging.warning("Record id missing")
        if modified:
            writer.write(record)
            modified_date = new_date
        all_records[record_id] = (modified_date, hash)
    
    logging.info("Number of modified records: %s"%modified_records)
    logging.info("Number of new records: %s"%new_records)
    
    if pickle_file:
        with open(pickle_file, 'wb') as output:
            pickle.dump(all_records, output, pickle.HIGHEST_PROTOCOL)
    
    writer.close()
   
    parser = ET.XMLParser(remove_blank_text=True, strip_cdata=False)
    tree = ET.parse(mrcx_file, parser)
    e = tree.getroot()
    with open(mrcx_file, "wb") as handle:
        handle.write(ET.tostring(e, encoding='UTF-8', pretty_print=True, xml_declaration=True))
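The change detection in this example hinges on fingerprinting each record's serialized form; the core idea in isolation (the function name is illustrative):

import hashlib

def record_fingerprint(record):
    # Hash the str() serialization of a pymarc Record, as compare_records does;
    # any change to a field changes the digest.
    return hashlib.md5(str(record).encode()).hexdigest()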
Example #14
from pymarc import XMLWriter

def writeMARCXML(record, filename):
    # Write out record in MARCXML format
    writer = XMLWriter(open(filename, 'wb'))
    writer.write(record)
    writer.close()  # Important!
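A usage sketch that converts a whole .mrc file with writeMARCXML, one output file per record (assuming pymarc's MARCReader; the input path is illustrative):

from pymarc import MARCReader

with open('records.mrc', 'rb') as fh:
    for i, record in enumerate(MARCReader(fh)):
        writeMARCXML(record, 'record_%d.xml' % i)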
Example #15
def main():
    global inputfile, target, mapping
    parser = argparse.ArgumentParser(
        description='Process and map classification authority records for BCUR.'
    )
    parser.add_argument('-i',
                        '--inputfiles',
                        type=str,
                        nargs='+',
                        help='one or more file(s) to be processed',
                        required=True)
    parser.add_argument('-o',
                        '--outputfile',
                        type=str,
                        nargs=1,
                        help='name of the output file',
                        required=True)
    parser.add_argument('-m',
                        '--map',
                        type=str,
                        nargs=1,
                        help='map target code',
                        required=True,
                        choices=valid_targets)

    args = parser.parse_args()

    targetcode = args.map[0]

    # For musi and musg records, found records are copied as-is, no field mapping.
    if targetcode in ('musi', 'musg'):
        mapping = False

    outputfile = args.outputfile[0]

    # Open a new XML document in which target records will be stored
    global writer
    writer = XMLWriter(open(outputfile, 'wb'))

    # Record start processing time
    tstart = datetime.datetime.now()

    # Print header row in case log is opened as CSV
    print("Notice,Champ,Contenu du champ,Message")

    # Loop through the list of input files and call the mapping function
    for infile in args.inputfiles:
        inputfile = infile
        target = targetcode
        if mapping:
            print(
                f"----- Traitement du fichier {inputfile} avec mapping {target} -----"
            )
        else:
            print(
                f"----- Traitement du fichier {inputfile} sans mapping -----")

        # This applies the mapping function to each record in inputfile
        map_xml(record_map, inputfile)

        if targetcode == 'vddoc':
            # For vddoc, also look for vddoc-la
            target = 'vddoc-la'
            map_xml(record_map, inputfile)

    # Calculate the total time elapsed
    tdiff = datetime.datetime.now() - tstart

    # Close the output document
    writer.close()

    print(
        f'Routine terminée en {tdiff.total_seconds()} secondes. Résultat enregistré dans {outputfile}'
    )