def formatToAnnotation(folia_file: str):
    doc = folia.Document(file=folia_file)
    doc.declare(
        folia.EntitiesLayer,
        "https://raw.githubusercontent.com/BaptisteBlouin/ttl/master/namedentities.ontonotesset.ttl",
    )
    doc.save(folia_file)
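# A minimal usage sketch for the function above (assumption: the "corpus/" directory
# and its *.folia.xml files are hypothetical, not part of the original snippet).
# It applies the entity-set declaration to every FoLiA file found, modifying each in place.
import glob

for path in glob.glob("corpus/*.folia.xml"):
    formatToAnnotation(path)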
def process(inputfile, outputfile, metadatadir, oztfile):
    docid = os.path.basename(inputfile).split('.')[0]
    metadatafile = os.path.join(metadatadir, docid + '.json')
    hasmetadata = os.path.exists(metadatafile)
    doc = folia.Document(file=inputfile)
    doc.provenance.append(
        folia.Processor.create("PICCL/nederlab.nf/addmetadata.py"))
    if hasmetadata:
        with open(metadatafile, 'r') as f:
            data = json.load(f)
            for key, value in sorted(data.items()):
                doc.metadata[key] = value
        print("Added metadata from JSON file for " + docid, file=sys.stderr)
    if oztfile:
        addsubmetadata_ozt(doc, oztfile, metadatadir)
    assignids(doc)
    doc.provenance.processors[-1].append(
        folia.Processor.create(name="foliaid",
                               version=TOOLVERSION,
                               src="https://github.com/proycon/foliatools"))
    if outputfile == '-':
        print(doc.xmlstring())
    else:
        doc.save(outputfile)
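# A hypothetical call of process() above (a sketch; the input path and metadata
# directory are assumptions, not from the original). Passing '-' as the output file
# prints the enriched FoLiA XML to standard output; None skips the OZT submetadata step.
process("input/mydoc_01.folia.xml", "-", "metadata/", None)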
def validator(self):
    try:
        import folia.main as folia  # soft-dependency, not too big a deal if it is not present, but no metadata extraction then
        # this loads a whole FoLiA document into memory! which is a bit of a waste of memory and a performance hog!
        try:
            doc = folia.Document(file=str(self.file))
        except Exception as e:
            self['validation_error'] = str(e)
            self.valid = False
            return False
        self['version'] = doc.version
        for annotationtype, annotationset in doc.annotations:
            key = folia.annotationtype2str(annotationtype).lower() + "-annotation"
            if annotationset is None:
                annotationset = "no-set"
            if key in self and annotationset not in [
                    x.strip() for x in self[key].split(',')
            ]:
                self[key] += "," + annotationset
            else:
                self[key] = annotationset
    except ImportError as e:
        self['validation_error'] = str(e)
        return False
    return True
def compare(fs_i, f_o='data/comparison.csv'):
    """
    fs_i: list of input (FoLiA) files (full path, with extension) (str)
    f_o: output (CSV) file (full path, with extension) (str)
    ...
    """
    # https://pynlpl.readthedocs.io/en/latest/folia.html#reading-folia
    docs_i = list()
    speeches_i = list()
    for f in fs_i:
        print(f)
        docs_i.append(folia.Document(file=f))
    for d in docs_i:
        # list of lists of utterances
        speeches_i.append([u for u in d[0]])
    with open(f_o, 'w', encoding='utf-8', newline='') as f:
        print(f_o)
        writer = csv.writer(f)
        empty_lines = [
            [],
            [],
        ]
        for u in zip(*speeches_i):
            if len(set(len(uu) for uu in u)) == 1:
                # all utterances in u have the same number of tokens
                flag_diff = [True] * len(u[0])
                # annotation contents are different for every token across the utterances in u
                morphos_list = list()
                for i in range(len(u)):
                    morphos = [fs_i[i]]
                    for w in u[i]:
                        morphos.append('\n'.join(get_annotation(w)))
                    morphos_list.append(morphos)
                for i in range(len(u[0])):
                    if len(set(m[i + 1] for m in morphos_list)) == 1:
                        flag_diff[i] = False
                        for j in range(len(morphos_list)):
                            morphos_list[j][i + 1] = str()
                if any(flag_diff):
                    writer.writerow([u[0].id, u[0].begintime, u[0].endtime])
                    # words = [''] + [w.text() for w in u[0]]
                    words = [''] + [w.text(hidden=True) for w in u[0]]
                    writer.writerow(words)
                    writer.writerows(morphos_list)
                    writer.writerows(empty_lines)
            else:
                for i in range(len(u)):
                    words = ['']
                    morphos = [fs_i[i]]
                    for w in u[i]:
                        # words.append(w.text())
                        words.append(w.text(hidden=True))
                        morphos.append('\n'.join(get_annotation(w)))
                    writer.writerows([words, morphos])
                writer.writerows(empty_lines)
def read(self, collected_file: CollectedFile) -> Iterable[Document]:
    try:
        doc = folia.Document(string=collected_file.content,
                             autodeclare=True,
                             loadsetdefinitions=False)
        self.tokenize(doc)
        doc_metadata = self.get_metadata_dict(doc.metadata.items())
        yield Document(collected_file,
                       list(self.get_utterances(doc, doc_metadata)),
                       doc_metadata)
    except Exception as e:
        raise Exception(collected_file.relpath + "/" +
                        collected_file.filename) from e
def convert(f_i, f_o=None):
    """
    f_i: input (ELAN) files (full path, with extension) (str)
    f_o: output (FoLiA) file (full path, with extension) (str)
    ...
    """
    doc_i = Eaf(f_i)
    if not f_o:
        f_o = '.'.join([f_i.rpartition('.')[0], 'folia.xml'])
    # https://foliapy.readthedocs.io/en/latest/folia.html#editing-folia
    # https://foliapy.readthedocs.io/en/latest/folia.html#adding-structure
    # https://foliapy.readthedocs.io/en/latest/folia.html#structure-annotation-types
    # print(os.path.basename(f_o))
    id_doc_o = os.path.basename(f_o).partition('.')[0]
    print(id_doc_o)
    # doc_o = folia.Document(id=os.path.basename(f_o))
    doc_o = folia.Document(id=id_doc_o)
    # https://github.com/proycon/folia/blob/master/foliatools/conllu2folia.py
    # future:
    # https://foliapy.readthedocs.io/en/latest/folia.html#declarations
    # https://foliapy.readthedocs.io/en/latest/folia.html#provenance-information
    # doc_o.declare(folia.LemmaAnnotation, set=SET_LEMMA)
    # processor_mystem as a single processor for all annotation performed by this script
    processor_mystem = doc_o.declare(folia.LemmaAnnotation,
                                     set=SET_LEMMA,
                                     processor=folia.Processor(name="Mystem+"))
    # doc_o.declare(folia.PosAnnotation, set=SET_POS)
    doc_o.declare(folia.PosAnnotation, set=SET_POS, processor=processor_mystem)
    # doc_o.declare(folia.SyntacticUnit, set=SET_SU, annotator="BiRCh group")
    doc_o.declare(folia.Description, processor=processor_mystem)
    doc_o.declare(folia.Comment, processor=processor_mystem)
    doc_o.declare(folia.Utterance, processor=processor_mystem)
    doc_o.declare(folia.Word, processor=processor_mystem)
    doc_o.declare(folia.Hiddenword)
    # folia.Speech cannot be declared as an annotation type
    speech = doc_o.append(folia.Speech)
    for aa in create_conversation(get_aas(doc_i)):
        utterance = speech.append(folia.Utterance,
                                  id=aa[0],
                                  speaker=aa[1],
                                  begintime=aa[2],
                                  endtime=aa[3],
                                  processor=processor_mystem)
        # https://docs.python.org/3/library/string.html#formatspec
        utterance.append(folia.Word,
                         '{}:'.format(aa[1].upper()),
                         processor=processor_mystem)
        # aa[4]: utterance text
        tokens = get_tokens(aa[4])
        len_tokens = len(tokens)
        for i in range(len_tokens):
            t = tokens[i]
            # consider the previous token in morphological analysis
            # pre_t = None
            # if i:
            #     pre_t = tokens[i-1]
            pre_t = [None, None]
            if i > 1:
                pre_t = [tokens[i - 2], tokens[i - 1]]
            elif i == 1:
                pre_t[1] = tokens[i - 1]
            token = utterance.append(folia.Word, t, processor=processor_mystem)
            if i < (len_tokens - 1):
                t = ' '.join([t, tokens[i + 1]])
            # lemma, pos, features = analyze_morphology(t)
            lemma, pos, features = analyze_morphology(pre_t, t)
            if lemma:
                token.append(folia.LemmaAnnotation,
                             cls=lemma,
                             set=SET_LEMMA,
                             processor=processor_mystem
                             # annotator='Mystem+'
                             )
            if pos:
                an_pos = token.append(folia.PosAnnotation,
                                      cls=pos,
                                      set=SET_POS,
                                      processor=processor_mystem
                                      # annotator='Mystem+'
                                      )
                if features:
                    # https://foliapy.readthedocs.io/en/latest/folia.html#features
                    an_pos.append(folia.Description,
                                  value=re.sub(r'=', r',', features),
                                  processor=processor_mystem
                                  # annotator='Mystem+'
                                  )
                    an_pos.append(folia.Comment,
                                  value=' '.join(['Mystem+ features:', features]),
                                  processor=processor_mystem
                                  # annotator='Mystem+'
                                  )
    doc_o.save(f_o)
clamdata = clam.common.data.getclamdata(datafile)

#You now have access to all data. A few properties at your disposition now are:
# clamdata.system_id, clamdata.project, clamdata.user, clamdata.status, clamdata.parameters, clamdata.inputformats, clamdata.outputformats, clamdata.input, clamdata.output

clam.common.status.write(statusfile, "Starting...")

count = defaultdict(int)

for i, inputfile in enumerate(clamdata.input):
    #Update our status message to let CLAM know what we're doing
    clam.common.status.write(
        statusfile,
        "Processing " + os.path.basename(str(inputfile)) + "...",
        round((i / float(len(clamdata.input))) * 100))
    doc = folia.Document(file=str(inputfile))
    for pos in doc.select(folia.PosAnnotation):
        count[pos.cls] += 1

#Write overall statistics output for this file
f = io.open(outputdir + 'posfreqlist.tsv', 'w', encoding='utf-8')
f.write(dicttotext(count))
f.close()

#A nice status message to indicate we're done
clam.common.status.write(statusfile, "Done", 100)  # status update

sys.exit(0)  #non-zero exit codes indicate an error and will be picked up by CLAM as such!
def compare(fs_i, f_o='data/comparison.csv'):
    """
    fs_i: list of input (FoLiA) files (full path, with extension) (str)
    f_o: output (CSV) file (full path, with extension) (str)
    ...
    """
    # https://pynlpl.readthedocs.io/en/latest/folia.html#reading-folia
    docs_i = list()
    speeches_i = list()
    for f in fs_i:
        print(f)
        docs_i.append(folia.Document(file=f))
    for d in docs_i:
        # list of lists of utterances
        speeches_i.append([u for u in d[0] if isinstance(u, folia.Utterance)])
    ids_utterance = get_ids(*speeches_i)
    with open(f_o, 'w', encoding='utf-8', newline='') as f:
        print(f_o)
        writer = csv.writer(f)
        empty_lines = [
            [],
            [],
        ]
        for i in ids_utterance:
            if i in docs_i[0] and i in docs_i[1]:
                u1 = docs_i[0][i]
                u2 = docs_i[1][i]
                morphos_list = list()
                ids_token = get_ids(u1, u2)
                flag_diff = [True] * len(ids_token)
                for j in range(2):
                    # 1st element of morphos is the file name
                    morphos = [fs_i[j]]
                    for k in ids_token:
                        if k in docs_i[j]:
                            morphos.append('\n'.join(
                                get_annotation(docs_i[j][k], True)).strip())
                        else:
                            morphos.append('')
                    morphos_list.append(morphos)
                for j in range(len(ids_token)):
                    # j+1 because 1st element of morphos is the file name
                    if len(set(m[j + 1] for m in morphos_list)) == 1:
                        flag_diff[j] = False
                        # len(morphos_list) is 2 here
                        for k in range(len(morphos_list)):
                            morphos_list[k][j + 1] = str()
                if any(flag_diff):
                    writer.writerow(
                        [i, docs_i[0][i].begintime, docs_i[0][i].endtime])
                    writer.writerow([''] + ids_token)
                    writer.writerows(morphos_list)
                    writer.writerows(empty_lines)
            else:
                # the utterance only occurs in one of the two documents
                if i in docs_i[0]:
                    j = 0
                else:
                    j = 1
                u = docs_i[j][i]
                # w: either normal word or hidden word
                ids_token_plus = [''] + [w.id for w in u]
                # 1st element of morphos is the file name
                morphos = [fs_i[j]]
                # w: either normal word or hidden word
                for w in u:
                    morphos.append('\n'.join(get_annotation(w, True)).strip())
                writer.writerows([ids_token_plus, morphos])
                writer.writerows(empty_lines)
<text xml:id="untitleddoc.text"> <p xml:id="untitleddoc.p.1"> <s xml:id="untitleddoc.p.1.s.1"> <t>Hello world.</t> </s> <s xml:id="untitleddoc.p.1.s.2"> <t>This is a test.</t> </s> </p> <p xml:id="untitleddoc.p.2"> <s xml:id="untitleddoc.p.2.s.1"> <t>The capital of The Netherlands is Amsterdam.</t> </s> <s xml:id="untitleddoc.p.2.s.2"> <t>The government, however, always convenes in The Hague.</t> </s> </p> </text> </FoLiA> """ import spacy import folia.main as folia from spacy2folia import spacy2folia nlp = spacy.load("en_core_web_sm") foliadoc = folia.Document(string=text) foliadoc = spacy2folia.convert_folia(foliadoc, nlp) print(foliadoc.xmlstring())
def read_folia_xml(filename):
    doc = folia.Document(file=filename, encoding='utf-8')
    return doc
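# A minimal usage sketch (the filename "example.folia.xml" is hypothetical, not from
# the original). It loads a document with the helper above and prints every word with
# its id; select() is the standard FoLiA traversal method for picking elements by type.
doc = read_folia_xml("example.folia.xml")
for word in doc.select(folia.Word):
    print(word.id, word.text())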
def test(self):
    if debug:
        print("Testing " + filename, file=sys.stderr)
    if erroneous:
        self.assertRaises(Exception,
                          folia.Document,
                          file=filename,
                          autodeclare=False,
                          deepvalidation=deepvalidation)
    else:
        folia.Document(file=filename, deepvalidation=deepvalidation)
def main():
    parser = argparse.ArgumentParser(
        description="BabelEnte: Entity extractioN, Translation and Evaluation using BabelFy",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k', '--apikey', '--key', type=str,
                        help="Babelnet API key",
                        action='store', default="", required=False)
    parser.add_argument('-s', '--sourcelang', type=str,
                        help="Source language code",
                        action='store', default="EN", required=False)
    parser.add_argument('-t', '--targetlang', type=str,
                        help="Target language code",
                        action='store', default="", required=False)
    parser.add_argument('-S', '--source', type=str,
                        help="Source sentences (plain text, one per line, utf-8)",
                        action='store', default="", required=False)
    parser.add_argument('-T', '--target', type=str,
                        help="Target sentences (plain text, one per line, utf-8)",
                        action='store', default="", required=False)
    parser.add_argument('-r', '--recall',
                        help="Compute recall as well using Babel.net (results in many extra queries!)",
                        action='store_true', required=False)
    parser.add_argument('-o', '--outputdir', type=str,
                        help="Output directory when processing FoLiA documents (set to /dev/null to skip output altogether)",
                        action='store', default="./", required=False)
    parser.add_argument('-d', '--debug', help="Debug",
                        action='store_true', required=False)
    parser.add_argument('--nodup',
                        help="Filter out duplicate entities in evaluation",
                        action='store_true', required=False)
    parser.add_argument('--evalfile', type=str,
                        help="(Re)evaluate the supplied json file (output of babelente)",
                        action='store', default="", required=False)
    parser.add_argument('--anntype', type=str,
                        help="Annotation Type: Allows to restrict the disambiguated entries to only named entities (NAMED_ENTITIES), word senses (CONCEPTS) or both (ALL).",
                        action='store', required=False)
    parser.add_argument('--annres', type=str,
                        help="Annotation Resource: Allows to restrict the disambiguated entries to only WordNet (WN), Wikipedia (WIKI) or BabelNet (BN)",
                        action='store', required=False)
    parser.add_argument('--th', type=float,
                        help="Cutting Threshold (BabelFy)",
                        action='store', required=False)
    parser.add_argument('--match', type=str,
                        help="Select the candidate extraction strategy, i.e., either only exact matching (EXACT_MATCHING) or both exact and partial matching (PARTIAL_MATCHING)",
                        action='store', required=False)
    parser.add_argument('--mcs', type=str,
                        help="Use this to enable or disable the most common sense backoff strategy for BabelFy (values: ON, OFF, ON_WITH_STOPWORDS)",
                        action='store', required=False)
    parser.add_argument('--dens',
                        help="Enable the densest subgraph heuristic during the disambiguation pipeline.",
                        action='store_true', required=False)
    parser.add_argument('--cands', type=str,
                        help="Use this parameter to obtain as a result of the disambiguation procedure a scored list of candidates (ALL) or only the top ranked one (TOP); if ALL is selected then the --mcs and --th parameters will not be taken into account.",
                        action='store', required=False)
    parser.add_argument('--postag', type=str,
                        help="Use this parameter to change the tokenization and pos-tagging pipeline for your input text. Values: STANDARD, NOMINALIZE_ADJECTIVES, INPUT_FRAGMENTS_AS_NOUNS, CHAR_BASED_TOKENIZATION_ALL_NOUN",
                        action='store', required=False)
    parser.add_argument('--extaida',
                        help="Extend the candidates sets with the aida_means relations from YAGO.",
                        action='store_true', required=False)
    parser.add_argument('--overlap', type=str,
                        help="Resolve overlapping entities, can be set to allow (default), longest, score, globalscore, coherencescore",
                        action='store', default='allow', required=False)
    parser.add_argument('--cache', type=str,
                        help="Cache file, stores queries to prevent excessive querying of BabelFy (warning: not suitable for parallel usage!)",
                        action='store', required=False)
    parser.add_argument('--dryrun', help="Do not query",
                        action='store_true', required=False)
    parser.add_argument('inputfiles', nargs='*',
                        help='FoLiA input documents, use with -s to choose source language. For tramooc style usage: use -S/-T or --evalfile instead of this.')
    #hidden power options:
    parser.add_argument('--foliaset', type=str, help=argparse.SUPPRESS, action='store',
                        default="https://raw.githubusercontent.com/proycon/babelente/master/setdefinitions/babelente.babelnet.ttl",
                        required=False)
    parser.add_argument('--foliarelationset', type=str, help=argparse.SUPPRESS, action='store',
                        default="https://raw.githubusercontent.com/proycon/babelente/master/setdefinitions/babelente.relations.ttl",
                        required=False)
    parser.add_argument('--foliametricset', type=str, help=argparse.SUPPRESS, action='store',
                        default="https://raw.githubusercontent.com/proycon/babelente/master/setdefinitions/babelente.metrics.ttl",
                        required=False)
    args = parser.parse_args()

    if not args.source and not args.target and not args.evalfile and not args.inputfiles:
        print("ERROR: For Tramooc style usage, specify either --source/-S (with or without --target/-T), or --evalfile.", file=sys.stderr)
        print("       For entity extraction & linking on FoLiA documents, just specify one or more FoLiA documents", file=sys.stderr)
        print("       along with --sourcelang to choose source language.", file=sys.stderr)
        print("       See babelente -h for full usage instructions.", file=sys.stderr)
        sys.exit(2)
    if args.target and not args.source:
        print("ERROR: Specify --source/-S as well when --target/-T is used. See babelente -h for usage instructions.", file=sys.stderr)
        sys.exit(2)
    if (args.target or args.source or args.inputfiles) and not args.apikey:
        print("ERROR: Specify an API key (--apikey). Get one on http://babelnet.org/", file=sys.stderr)
        sys.exit(2)
    if args.target and not args.targetlang:
        print("ERROR: Specify a target language (-t).", file=sys.stderr)
        sys.exit(2)

    if args.inputfiles:
        if not args.sourcelang:
            print("ERROR: Specify a source language (-s)", file=sys.stderr)
            sys.exit(2)
        first = True
        textdoc = False
        for filename in args.inputfiles:
            if not os.path.exists(filename):
                print("ERROR: No such file: " + filename)
                sys.exit(2)
            if filename[-4:].lower() == ".xml":
                #FoLiA based, extraction only
                print("Loading FoLiA document " + filename + " ...", file=sys.stderr)
                doc = folia.Document(file=filename)
                if first:
                    print("[")
                    first = False
                processfolia(doc, args, None)
                if args.outputdir != '/dev/null':
                    outputname = os.path.basename(filename)
                    if outputname.endswith('.folia.xml'):
                        outputname = outputname.replace('.folia.xml', '.babelente.folia.xml')
                    elif outputname.endswith('.xml'):
                        outputname = outputname.replace('.xml', '.babelente.folia.xml')
                    else:
                        outputname = outputname + '.babelente.folia.xml'
                    doc.save(os.path.join(args.outputdir, outputname))
            else:
                #text-based
                textdoc = True
                print("Loading text document " + filename + " ...", file=sys.stderr)
                args.source = filename
                break
        if not textdoc:
            print("{}")
            print("]")
            return True

    #Tramooc-style extraction, translation and evaluation
    with open(args.source, 'r', encoding='utf-8') as f:
        sourcelines = [stripmultispace(l) for l in f.readlines()]
    if args.target:
        with open(args.target, 'r', encoding='utf-8') as f:
            targetlines = [stripmultispace(l) for l in f.readlines()]
        if len(sourcelines) != len(targetlines):
            print("ERROR: Expected the same number of lines in source and target files, but got "
                  + str(len(sourcelines)) + " vs " + str(len(targetlines)),
                  file=sys.stderr)
            sys.exit(2)

    if args.cache:
        if os.path.exists(args.cache):
            print("Loading cache from " + args.cache, file=sys.stderr)
            with open(args.cache, 'rb') as f:
                cache = pickle.load(f)
        else:
            print("Creating new cache " + args.cache, file=sys.stderr)
            cache = {
                'source': {},
                'target': {},
                'synsets_source': {},
                'synsets_target': {}
            }
    else:
        cache = None

    evaluation = None
    if args.evalfile:
        with open(args.evalfile, 'rb') as f:
            data = json.load(f)
            sourceentities = data['sourceentities']
            targetentities = data['targetentities']
        print("Evaluating...", file=sys.stderr)
        evaluation = evaluate(sourceentities, targetentities, sourcelines, targetlines,
                              args.recall, args.targetlang, args.apikey, args.nodup,
                              None if cache is None else cache['synsets_source'],
                              args.debug)
    else:
        print("Extracting source entities...", file=sys.stderr)
        sourceentities = [
            entity for entity in findentities(sourcelines, args.sourcelang, args,
                                              None if cache is None else cache['source'])
            if entity['isEntity'] and 'babelSynsetID' in entity
        ]  #with sanity check
        if args.target:
            print("Extracting target entities...", file=sys.stderr)
            targetentities = [
                entity for entity in findentities(targetlines, args.targetlang, args,
                                                  None if cache is None else cache['target'])
                if entity['isEntity'] and 'babelSynsetID' in entity
            ]  #with sanity check
            print("Evaluating...", file=sys.stderr)
            evaluation = evaluate(sourceentities, targetentities, sourcelines, targetlines,
                                  args.recall, args.targetlang, args.apikey, args.nodup,
                                  None if cache is None else cache['synsets_target'],
                                  args.debug)
        else:
            print(json.dumps({'entities': sourceentities}, indent=4, ensure_ascii=False))
            #MAYBE TODO: add coverage?
    if evaluation is not None:
        print(json.dumps({
            'sourceentities': sourceentities,
            'targetentities': targetentities,
            'evaluation': evaluation
        }, indent=4, ensure_ascii=False))
        #output summary to stderr (info is all in JSON stdout output as well)
        print("PRECISION(macro)=" + str(round(evaluation['precision'], 3)),
              "RECALL(macro)=" + str(round(evaluation['recall'], 3)),
              file=sys.stderr)
        print("PRECISION(micro)=" + str(round(evaluation['microprecision'], 3)),
              "RECALL(micro)=" + str(round(evaluation['microrecall'], 3)),
              file=sys.stderr)
        print("SOURCECOVERAGE=" + str(round(evaluation['sourcecoverage'], 3)),
              "TARGETCOVERAGE=" + str(round(evaluation['targetcoverage'], 3)),
              file=sys.stderr)
        print("SOURCEENTITIES=" + str(len(sourceentities)),
              "TARGETENTITIES=" + str(len(targetentities)))
        print("MATCHES=" + str(evaluation['matches']), file=sys.stderr)
        print("TRANSLATABLEENTITIES=" + str(evaluation['translatableentities']),
              file=sys.stderr)

    if cache is not None:
        with open(args.cache, 'wb') as f:
            pickle.dump(cache, f)
def process(file, **kwargs):
    selectlang = kwargs.get('language', None)
    doc = folia.Document(file=file,
                         processor=folia.Processor.create("wikiente", version=VERSION))
    if not doc.declared(folia.Sentence):
        print("ERROR: Document contains no sentence annotation, but this is required for wikiente",
              file=sys.stderr)
        sys.exit(2)
    for sentence in doc.sentences():
        if kwargs.get('debug') and sentence.id:
            print("Processing sentence ", sentence.id, file=sys.stderr)
        if selectlang:
            foundlang = getlanguage(sentence)
            if foundlang is None:
                print("(no language information, skipping sentence ", repr(sentence), ")",
                      file=sys.stderr)
                continue
            elif foundlang.cls != selectlang:
                print("(skipping, language doesn't match, expected ", selectlang,
                      " found ", foundlang.cls, ")",
                      file=sys.stderr)
                continue
        text = sentence.text(cls=kwargs.get('textclass', 'current'),
                             retaintokenisation=True)
        if kwargs.get('debug'):
            print("Processing: ", text, file=sys.stderr)
        try:
            entities = spotlight.annotate(os.path.join(kwargs.get('server'), "annotate"),
                                          text,
                                          confidence=kwargs.get('confidence', 0.5))
        except ConnectionError as e:
            print("WARNING: Connection Error", str(e), file=sys.stderr)
            if kwargs.get('ignore'):
                continue
            else:
                sys.exit(2)
        except spotlight.SpotlightException as e:
            print("WARNING: Spotlight exception", str(e), file=sys.stderr)
            continue
        except HTTPError as e:
            print("ERROR: HTTP exception", str(e), file=sys.stderr)
            if kwargs.get('ignore'):
                continue
            else:
                sys.exit(2)
        for rawentity in entities:
            if kwargs.get('debug'):
                print(rawentity, file=sys.stderr)
            wordspan = None
            try:
                wordspan = sentence.resolveoffsets(
                    rawentity['offset'],
                    rawentity['offset'] + len(rawentity['surfaceForm']),
                    cls=kwargs.get('textclass', 'current'))
            except folia.InconsistentText as e:
                print("WARNING: ", str(e), file=sys.stderr)
            if not wordspan:
                print("WARNING: Unable to resolve entity", rawentity['surfaceForm'],
                      file=sys.stderr)
            else:
                mode = kwargs.get('mode', 1)
                if mode == 1:
                    cls = rawentity['URI']
                    entityset = ENTITYSET_MODE_1
                elif mode == 2:
                    cls = getclass(rawentity['types'].split(','))
                    if cls is None:
                        print("WARNING: Resolved entity does not specify any known types, skipping: ",
                              rawentity['surfaceForm'], file=sys.stderr)
                        continue
                    entityset = ENTITYSET_MODE_2
                else:
                    raise ValueError("Invalid mode")
                entity = wordspan[0].add(folia.Entity, *wordspan, cls=cls, set=entityset)
                if kwargs.get('metrics'):
                    for key, value in rawentity.items():
                        if key not in ('URI', 'offset', 'surfaceForm'):
                            entity.append(folia.Metric, set=METRIC_SET, cls=key,
                                          value=str(value))
                if mode == 2:
                    entity.append(folia.Relation,
                                  cls="dbpedia",
                                  href=rawentity['URI'],
                                  set=RELATIONSET,
                                  format="application/rdf+xml")
    if kwargs.get('output', None) == '-':
        print(doc.xmlstring())
    else:
        doc.save(kwargs.get('output', None))
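# A hypothetical invocation of the wikiente process() function above (a sketch; the
# input file, the local DBpedia Spotlight endpoint and the output name are assumptions,
# not from the original). Mode 1 stores the resolved DBpedia URI directly as the entity class.
process("example.folia.xml",
        server="http://localhost:2222/rest",
        mode=1,
        confidence=0.5,
        output="example.wikiente.folia.xml")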
def process(filename, outputdir, metadata, oztmetadata, oztcount, ignore):
    assert os.path.exists(filename)
    doc = folia.Document(file=filename)
    doc.provenance.append(folia.Processor.create("dbnl_ozt_fix.py"))
    found = 0
    if doc.id not in metadata:
        if doc.id + '_01' in metadata:
            doc.id = doc.id + '_01'
            print("WARNING: Document ID did not have _01 suffix and did not match with metadata, added it manually so it matches again",
                  file=sys.stderr)
        elif ignore:
            print("WARNING: Document not found in Nederlab metadata! Ignoring this and passing the document as-is!!!",
                  file=sys.stderr)
            doc.save(os.path.join(
                outputdir,
                os.path.basename(doc.filename.replace(".xml.gz", ".xml"))))
            return
        else:
            raise Exception("Document not found in metadata")
    for key, value in metadata[doc.id].items():
        if key not in ('title', 'ingestTime', 'updateTime', 'processingMethod') and value:
            doc.metadata[key] = value
    if doc.id in oztcount:
        unmatched = 0
        assigned_ozt_divs = set()
        for div in doc.select(folia.Division, False):
            if div.cls in ("chapter", "act"):
                found += 1
                seq_id = str(found).zfill(4)
                ozt_id = doc.id + "_" + seq_id
                if ozt_id not in oztmetadata:
                    unmatched += 1  #false positive
                    if div.id in assigned_ozt_divs:
                        print(f"ERROR: No metadata was found for {ozt_id}, we expected an independent title but this is not one! We can't skip this element ({div.id}) because it was already assigned to an earlier division and would cause a conflict! This may be indicative of a problem where the metadata is out of sync with the actual files! Please investigate.",
                              file=sys.stderr)
                        sys.exit(6)
                    else:
                        print(f"WARNING: No metadata was found for {ozt_id}, we expected an independent title but this is not one! Skipping...",
                              file=sys.stderr)
                        div.metadata = None  #unassign any metadata
                        continue
                print(f"Found {ozt_id}, reassigning identifiers...", file=sys.stderr)
                div.id = ozt_id + ".text"
                div.metadata = ozt_id + ".metadata"
                doc.submetadata[ozt_id + ".metadata"] = folia.NativeMetaData()
                doc.submetadatatype[ozt_id + ".metadata"] = "native"
                assigned_ozt_divs.add(div.id)
                for key, value in oztmetadata[ozt_id].items():
                    if key not in ('ingestTime', 'updateTime', 'processingMethod') and value:
                        doc.submetadata[ozt_id + ".metadata"][key] = value
                reassignids(div)
            elif div.metadata:
                div.metadata = None
        expected = oztcount[doc.id]
        if found != expected + unmatched:
            raise Exception(
                f"Found {found} OZT chapters for {doc.id}, expected {expected} ( + {unmatched} unmatched)")
    else:
        print(f"Document {doc.id} has no independent titles, skipping...", file=sys.stderr)
    obsolete_submetadata = [key for key, value in doc.submetadata.items() if not value]
    for key in obsolete_submetadata:
        del doc.submetadata[key]
        del doc.submetadatatype[key]
    print("Saving document", file=sys.stderr)
    doc.save(os.path.join(
        outputdir,
        os.path.basename(doc.filename.replace(".xml.gz", ".xml"))))