def makeTf():
    tfPathV = f'{TF_PATH}/{VERSION}'
    if os.path.exists(tfPathV):
        rmtree(tfPathV)
    TF = Fabric(locations=[tfPathV])
    TF.save(nodeFeatures=nodeFeatures, edgeFeatures=edgeFeatures, metaData=metaData)
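# A minimal round-trip check (my own sketch, not part of the source): re-scan the directory
# that makeTf() has just written with a fresh Fabric instance and load only the warp
# features. The function name `checkTf` and the decision to load nothing but the warp
# features are assumptions.
def checkTf():
    TFcheck = Fabric(locations=[f'{TF_PATH}/{VERSION}'], silent=True)
    api = TFcheck.load('', silent=True)  # '' loads just otype and oslots
    if api:
        print(f'{api.F.otype.maxSlot} slots, {api.F.otype.maxNode} nodes in total')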
def genTrees(version):
    C = setVersion(version)
    bhsa = C.bhsa
    sp = C.sp
    rela = C.rela
    ptyp = C.ptyp
    ctyp = C.ctyp
    g_word_utf8 = C.g_word_utf8
    tfDir = C.tfDir

    TF = Fabric(locations=f"{GH}/{ORG}", modules=bhsa)
    api = TF.load(f"{sp} {rela} {ptyp} {ctyp} {g_word_utf8} mother")
    E = api.E
    F = api.F
    Fs = api.Fs

    def getTag(node):
        otype = F.otype.v(node)
        tag = TYPE_TABLE[otype]
        if tag == "P":
            tag = Fs(ptyp).v(node)
        elif tag == "C":
            tag = ccrTable[Fs(rela).v(node)]
        isWord = tag == ""
        pos = POS_TABLE[Fs(sp).v(node)] if isWord else None
        slot = node if isWord else None
        text = f'"{Fs(g_word_utf8).v(node)}"' if isWord else None
        return (tag, pos, slot, text, isWord)

    def getTagN(node):
        otype = F.otype.v(node)
        tag = TYPE_TABLE[otype]
        if tag == "P":
            tag = Fs(ptyp).v(node)
        elif tag == "C":
            tag = ccrTable[Fs(rela).v(node)]
        isWord = tag == ""
        if not isWord:
            tag += "{" + str(node) + "}"
        pos = POS_TABLE[Fs(sp).v(node)] if isWord else None
        slot = node if isWord else None
        text = f'"{Fs(g_word_utf8).v(node)}"' if isWord else None
        return (tag, pos, slot, text, isWord)

    treeTypes = ("sentence", "clause", "phrase", "subphrase", "word")
    (rootType, leafType, clauseType, phraseType) = (
        treeTypes[0],
        treeTypes[-1],
        treeTypes[1],
        treeTypes[2],
    )
    ccrTable = dict((c[0], c[1][1]) for c in CCR_INFO.items())
    ccrClass = dict((c[0], c[1][0]) for c in CCR_INFO.items())

    tree = Tree(
        TF,
        otypes=treeTypes,
        phraseType=phraseType,
        clauseType=clauseType,
        ccrFeature=rela,
        ptFeature=ptyp,
        posFeature=sp,
        motherFeature="mother",
    )
    tree.restructureClauses(ccrClass)
    results = tree.relations()

    TF.info("Ready for processing")

    skip = set()
    TF.info("Verifying whether all slots are preserved under restructuring")
    TF.info(f"Expected mismatches: {EXPECTED_MISMATCHES.get(version, '??')}")

    errors = []
    # i = 10
    for snode in F.otype.s(rootType):
        declaredSlots = set(E.oslots.s(snode))
        results = {}
        thisgood = {}
        for kind in ("e", "r"):
            results[kind] = set(
                lt for lt in tree.getLeaves(snode, kind) if F.otype.v(lt) == leafType
            )
            thisgood[kind] = declaredSlots == results[kind]
            # if not thisgood[kind]:
            #     print(f"{kind} D={declaredSlots}\n L={results[kind]}")
            #     i -= 1
            #     if i == 0:
            #         break
        if False in thisgood.values():
            errors.append((snode, thisgood["e"], thisgood["r"]))

    nErrors = len(errors)
    if nErrors:
        TF.error(f"{len(errors)} mismatches:")
        mine = min(20, len(errors))
        skip |= {e[0] for e in errors}
        for (s, e, r) in errors[0:mine]:
            TF.error(
                (
                    f"{s} embedding: {'OK' if e else 'XX'};"
                    f" restructd: {'OK' if r else 'XX'}"
                ),
                tm=False,
            )
    else:
        TF.info(f"{len(errors)} mismatches")

    TF.info(f"Exporting {rootType} trees to TF")
    s = 0
    chunk = 10000
    sc = 0
    treeData = {}
    treeDataN = {}
    for node in F.otype.s(rootType):
        if node in skip:
            continue
        (treeRep, wordsRep, bSlot) = tree.writeTree(
            node, "r", getTag, rev=False, leafNumbers=True
        )
        (treeNRep, wordsNRep, bSlotN) = tree.writeTree(
            node, "r", getTagN, rev=False, leafNumbers=True
        )
        treeData[node] = treeRep
        treeDataN[node] = treeNRep
        s += 1
        sc += 1
        if sc == chunk:
            TF.info(f"{s} trees composed")
            sc = 0
    TF.info(f"{s} trees composed")

    nodeFeatures = dict(tree=treeData, treen=treeDataN)
    metaData = dict(
        tree=dict(
            valueType="str",
            description="penn treebank representation for sentences",
            converter="Dirk Roorda",
            convertor="trees.ipynb",
            url="https://github.com/etcbc/trees/trees.ipynb",
            coreData="BHSA",
            coreVersion=version,
        ),
        treen=dict(
            valueType="str",
            description="penn treebank representation for sentences with node numbers included",
            converter="Dirk Roorda",
            convertor="trees.ipynb",
            url="https://github.com/etcbc/trees/trees.ipynb",
            coreData="BHSA",
            coreVersion=version,
        ),
    )

    TF.info("Writing tree feature to TF")
    TFw = Fabric(locations=tfDir, silent=True)
    TFw.save(nodeFeatures=nodeFeatures, edgeFeatures={}, metaData=metaData)
def generateTf(bookAcro, content):
    if os.path.exists(TF_PATH):
        rmtree(TF_PATH)
    os.makedirs(TF_PATH)

    print("Slicing content into features")
    cur = collections.Counter()
    curSlot = 0
    context = []
    nodeFeatures = collections.defaultdict(dict)
    edgeFeatures = collections.defaultdict(lambda: collections.defaultdict(set))
    oSlots = collections.defaultdict(set)

    for acro in allAcrosSeq:
        thisBookInfo = bookAcro[acro]
        bookName = thisBookInfo["bookName"]
        witness = thisBookInfo.get("witness", None)
        chapters = content[bookName]

        cur["book"] += 1
        bookNode = ("book", cur["book"])
        nodeFeatures["book"][bookNode] = acro
        nodeFeatures["book@en"][bookNode] = bookName
        if witness is not None:
            nodeFeatures["witness"][bookNode] = witness
        context.append(("book", cur["book"]))

        for chapterNum in chapters:
            verses = chapters[chapterNum]

            cur["chapter"] += 1
            nodeFeatures["chapter"][("chapter", cur["chapter"])] = chapterNum
            nodeFeatures["book"][("chapter", cur["chapter"])] = acro
            if witness is not None:
                nodeFeatures["witness"][("chapter", cur["chapter"])] = witness
            context.append(("chapter", cur["chapter"]))

            for verseNum in verses:
                words = verses[verseNum].strip().split()

                cur["verse"] += 1
                nodeFeatures["verse"][("verse", cur["verse"])] = verseNum
                nodeFeatures["chapter"][("verse", cur["verse"])] = chapterNum
                nodeFeatures["book"][("verse", cur["verse"])] = acro
                if witness is not None:
                    nodeFeatures["witness"][("verse", cur["verse"])] = witness
                context.append(("verse", cur["verse"]))

                for elem in splitWords(words):
                    if len(elem) != 2:
                        print(bookName, chapterNum, verseNum, words)
                        continue
                    (word, punc) = elem
                    wSyc = TR.to_syriac(word)
                    pSyc = TR.to_syriac(punc)

                    curSlot += 1
                    wordNode = ("word", curSlot)
                    nodeFeatures["word_etcbc"][wordNode] = word
                    nodeFeatures["word"][wordNode] = wSyc
                    nodeFeatures["trailer_etcbc"][wordNode] = punc
                    nodeFeatures["trailer"][wordNode] = pSyc
                    for (nt, curNode) in context:
                        oSlots[(nt, curNode)].add(curSlot)

                context.pop()
            context.pop()
        context.pop()

    if len(context):
        print("Context:", context)

    print(f"\n{curSlot:>7} x slot")
    for (nodeType, amount) in sorted(cur.items(), key=lambda x: (x[1], x[0])):
        print(f"{amount:>7} x {nodeType}")

    nValues = reduce(operator.add, (len(values) for values in nodeFeatures.values()), 0)
    print(f"{len(nodeFeatures)} node features with {nValues} values")
    print(f"{len(oSlots)} nodes linked to slots")

    print("Compiling TF data")
    print("Building warp feature otype")
    nodeOffset = {"word": 0}
    oType = {}
    n = 1
    for k in range(n, curSlot + 1):
        oType[k] = "word"
    n = curSlot + 1
    for (nodeType, amount) in sorted(cur.items(), key=lambda x: (x[1], x[0])):
        nodeOffset[nodeType] = n - 1
        for k in range(n, n + amount):
            oType[k] = nodeType
        n = n + amount
    print(f"{len(oType)} nodes")

    print("Filling in the nodes for features")
    newNodeFeatures = collections.defaultdict(dict)
    for (ft, featureData) in nodeFeatures.items():
        newFeatureData = {}
        for ((nodeType, node), value) in featureData.items():
            newFeatureData[nodeOffset[nodeType] + node] = value
        newNodeFeatures[ft] = newFeatureData
    newOslots = {}
    for ((nodeType, node), slots) in oSlots.items():
        newOslots[nodeOffset[nodeType] + node] = slots

    nodeFeatures = newNodeFeatures
    nodeFeatures["otype"] = oType
    edgeFeatures["oslots"] = newOslots

    print(f'Node features: {" ".join(nodeFeatures)}')
    print(f'Edge features: {" ".join(edgeFeatures)}')

    metaData = {
        "": commonMetaData,
        "otext": oText,
        "oslots": dict(valueType="str"),
        "book@en": langMetaData["en"],
    }
    for ft in set(nodeFeatures) | set(edgeFeatures):
        metaData.setdefault(ft, {})["valueType"] = "int" if ft in numFeatures else "str"
        metaData[ft]["description"] = (
            specificMetaData[ft] if ft in specificMetaData else "?"
        )

    TF = Fabric(locations=TF_PATH, silent=True)
    TF.save(nodeFeatures=nodeFeatures, edgeFeatures=edgeFeatures, metaData=metaData)
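# Quick sanity check (a sketch of mine, not part of the conversion): re-scan TF_PATH with a
# fresh Fabric instance and compare the node counts per type with what generateTf() printed
# during slicing. Only the warp features are needed for this.
TFcheck = Fabric(locations=TF_PATH, silent=True)
apiCheck = TFcheck.load("", silent=True)
if apiCheck:
    for nodeType in apiCheck.F.otype.all:
        print(f"{len(apiCheck.F.otype.s(nodeType)):>7} x {nodeType}")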
nodeFeatures = {
    'otype': otype,
    'name': name,
}
edgeFeatures = {
    'oslots': oslots,
}
metaData = {
    '': {
        'name': 'testset',
    },
    'otype': {
        'valueType': 'str',
    },
    'oslots': {
        'valueType': 'str',
    },
    'sign': {
        'valueType': 'str',
    },
    'name': {
        'valueType': 'str',
    },
}

# SAVE THE CORPUS AS TF

TF.save(nodeFeatures=nodeFeatures, edgeFeatures=edgeFeatures, metaData=metaData)
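# Round-trip sketch (my own, with a hypothetical output directory): re-scan the place where
# the test set was saved with a fresh Fabric instance, load the `name` feature, and inspect
# the first slot. `testsetDir` is an assumed variable, not defined in the source.
TFtest = Fabric(locations=testsetDir, silent=True)
apiTest = TFtest.load('name', silent=True)
if apiTest:
    print(f'{apiTest.F.otype.maxSlot} signs')
    print(f'name of the first sign: {apiTest.F.name.v(1)}')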
metaData = {
    '': dict(
        createdBy='Ernst Boogert and Dirk Roorda',
    ),
    'otext': {
        'sectionFeatures': 'book,chapter,verse',
        'sectionTypes': 'book,chapter,verse',
        'fmt:text-orig-full': '{orig} ',
        'fmt:text-orig-main': '{main} ',
        'fmt:text-orig-plain': '{plain} ',
    },
    'book@en': {
        'valueType': 'str',
        'language': 'English',
        'languageCode': 'en',
        'languageEnglish': 'english',
    },
}

sorted(data.nodeFeatures.keys())

for nf in data.nodeFeatures:
    metaData.setdefault(nf, {})['valueType'] = 'int' if nf in numberFeatures else 'str'
for ef in data.edgeFeatures:
    metaData.setdefault(ef, {})['valueType'] = 'int' if ef in numberFeatures else 'str'

TF.save(nodeFeatures=data.nodeFeatures, edgeFeatures=data.edgeFeatures, metaData=metaData)
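# Illustration (mine, under assumptions): after the dataset saved above has been loaded
# again, the three `fmt:text-orig-*` templates from `otext` drive T.text(). The location is
# hypothetical; the features `orig`, `main` and `plain` are implied by the format templates.
TFpat = Fabric(locations='~/github/patristics/tf', silent=True)  # hypothetical location
apiPat = TFpat.load('orig main plain', silent=True)
if apiPat:
    firstVerse = apiPat.F.otype.s('verse')[0]
    slots = apiPat.E.oslots.s(firstVerse)
    print(apiPat.T.text(slots, fmt='text-orig-plain'))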
"description": "similarity between words, as a percentage of the common material wrt the combined material", }, } simData = {} for ((f, t), d) in similarity.items(): simData.setdefault(f, {})[t] = d FOLDER_SIM = "sim/tf" path = f"{ORG}/{REPO}/{FOLDER_SIM}" location = f"{GH_BASE}/{path}" module = VERSION TF.save(edgeFeatures=dict(sim=simData), metaData=metaData, location=location, module=module) # --- # All chapters: # # * *use* # * [share](share.ipynb) # * [app](app.ipynb) # * [repo](repo.ipynb) # * [compose](compose.ipynb) # # --- # # CC-BY Dirk Roorda
def parseCorpus():
    annotSpecs = SyrNT.ANNOTATIONS
    cur = collections.Counter()
    curSlot = 0
    context = []
    nodeFeatures = collections.defaultdict(dict)
    edgeFeatures = collections.defaultdict(lambda: collections.defaultdict(set))
    oSlots = collections.defaultdict(set)

    (bookEn, verseLabels) = getVerseLabels()
    (prevBook, prevChapter) = (None, None)
    lexemes = set()

    for p in readCorpus():
        (book, chapter, verse) = verseLabels[cur['verse']]
        if book != prevBook:
            print(f'\t{bookEn[book]:<15} current:'
                  f' b={cur["book"]:>2}'
                  f' c={cur["chapter"]:>3}'
                  f' v={cur["verse"]:>4}'
                  f' w={curSlot:>6}')
            if prevChapter is not None:
                context.pop()
                prevChapter = None
            if prevBook is not None:
                context.pop()
            cur['book'] += 1
            prevBook = book
            bookNode = ('book', cur['book'])
            nodeFeatures['book'][bookNode] = book
            nodeFeatures['book@en'][bookNode] = bookEn[book]
            context.append(('book', cur['book']))
        if chapter != prevChapter:
            if prevChapter is not None:
                context.pop()
            cur['chapter'] += 1
            prevChapter = chapter
            nodeFeatures['chapter'][('chapter', cur['chapter'])] = chapter
            nodeFeatures['book'][('chapter', cur['chapter'])] = book
            context.append(('chapter', cur['chapter']))

        cur['verse'] += 1
        nodeFeatures['verse'][('verse', cur['verse'])] = verse
        nodeFeatures['chapter'][('verse', cur['verse'])] = chapter
        nodeFeatures['book'][('verse', cur['verse'])] = book

        (ln, line) = p
        words = line.split()
        context.append(('verse', cur['verse']))
        for word in words:
            curSlot += 1
            (wordTrans, annotationStr) = word.split('|', 1)
            wordSyr = wordTrans.translate(tosyr)
            wordEtcbc = TR.from_syriac(wordSyr)
            annotations = annotationStr.split('#')
            wordNode = ('word', curSlot)
            nodeFeatures['word_sedra'][wordNode] = wordTrans
            nodeFeatures['word_etcbc'][wordNode] = wordEtcbc
            nodeFeatures['word'][wordNode] = wordTrans.translate(tosyr)
            for ((feature, values), data) in zip(annotSpecs, annotations):
                value = data if values is None else values[int(data)]
                if values is None:
                    nodeFeatures[f'{feature}_sedra'][wordNode] = value
                    value = value.translate(tosyr)
                    valueEtcbc = TR.from_syriac(value)
                    nodeFeatures[f'{feature}_etcbc'][wordNode] = valueEtcbc
                nodeFeatures[feature][wordNode] = (
                    value if feature in numFeatures
                    else NA_VALUE if value in NA_VALUES
                    else value
                )
            lexeme = nodeFeatures['lexeme'][wordNode]
            if lexeme not in lexemes:
                lexemes.add(lexeme)
                cur['lexeme'] += 1
                lexNode = ('lexeme', cur['lexeme'])
                nodeFeatures['lexeme'][lexNode] = lexeme
                nodeFeatures['lexeme_sedra'][lexNode] = (
                    nodeFeatures['lexeme_sedra'][wordNode]
                )
                nodeFeatures['lexeme_etcbc'][lexNode] = (
                    nodeFeatures['lexeme_etcbc'][wordNode]
                )
            context.append(('lexeme', cur['lexeme']))
            for (nt, curNode) in context:
                oSlots[(nt, curNode)].add(curSlot)
            context.pop()
        context.pop()
    context.pop()
    context.pop()
    print('')

    if SHOW:
        if LIMIT == -1:
            for ft in sorted(nodeFeatures):
                print(ft)
                for n in range(1, 5):
                    for ntp in ('book', 'chapter', 'verse', 'word'):
                        if (ntp, n) in nodeFeatures[ft]:
                            print(f'\t"{nodeFeatures[ft][(ntp, n)]}"')
        else:
            print(nodeFeatures)
            print(oSlots)

    if len(context):
        print('Context:', context)

    print(f'\n{curSlot:>7} x slot')
    for (nodeType, amount) in sorted(cur.items(), key=lambda x: (x[1], x[0])):
        print(f'{amount:>7} x {nodeType}')

    nValues = reduce(operator.add, (len(values) for values in nodeFeatures.values()), 0)
    print(f'{len(nodeFeatures)} node features with {nValues} values')
    print(f'{len(oSlots)} nodes linked to slots')

    print('Compiling TF data')
    print('Building warp feature otype')
    nodeOffset = {'word': 0}
    oType = {}
    n = 1
    for k in range(n, curSlot + 1):
        oType[k] = 'word'
    n = curSlot + 1
    for (nodeType, amount) in sorted(cur.items(), key=lambda x: (x[1], x[0])):
        nodeOffset[nodeType] = n - 1
        for k in range(n, n + amount):
            oType[k] = nodeType
        n = n + amount
    print(f'{len(oType)} nodes')

    print('Filling in the nodes for features')
    newNodeFeatures = collections.defaultdict(dict)
    for (ft, featureData) in nodeFeatures.items():
        newFeatureData = {}
        for ((nodeType, node), value) in featureData.items():
            newFeatureData[nodeOffset[nodeType] + node] = value
        newNodeFeatures[ft] = newFeatureData
    newOslots = {}
    for ((nodeType, node), slots) in oSlots.items():
        newOslots[nodeOffset[nodeType] + node] = slots

    nodeFeatures = newNodeFeatures
    nodeFeatures['otype'] = oType
    edgeFeatures['oslots'] = newOslots

    print(f'Node features: {" ".join(nodeFeatures)}')
    print(f'Edge features: {" ".join(edgeFeatures)}')

    metaData = {
        '': commonMetaData,
        'otext': oText,
        'oslots': dict(valueType='str'),
        'book@en': langMetaData['en'],
    }
    for ft in set(nodeFeatures) | set(edgeFeatures):
        metaData.setdefault(ft, {})['valueType'] = 'int' if ft in numFeatures else 'str'
        metaData[ft]['description'] = (
            specificMetaData[ft] if ft in specificMetaData else '?'
        )

    print('Remove existing TF directory')
    rmtree(TF_PATH)
    print('Save TF dataset')
    TF = Fabric(locations=TF_PATH, silent=True)
    TF.save(nodeFeatures=nodeFeatures, edgeFeatures=edgeFeatures, metaData=metaData)
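# Post-conversion check (my sketch, with assumed feature choices): re-scan TF_PATH and print
# the words of the first verse, both in ETCBC transliteration and in Syriac script.
TFsyr = Fabric(locations=TF_PATH, silent=True)
apiSyr = TFsyr.load('word word_etcbc', silent=True)
if apiSyr:
    firstVerse = apiSyr.F.otype.s('verse')[0]
    for w in apiSyr.E.oslots.s(firstVerse):
        print(apiSyr.F.word_etcbc.v(w), apiSyr.F.word.v(w))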
# In[26]:


nodeFeatures = dict(tree=treeData, treen=treeDataN)
for f in nodeFeatures:
    metaData[f]["valueType"] = "str"


# In[27]:


utils.caption(4, "Writing tree feature to TF")
TFw = Fabric(locations=thisTempTf, silent=True)
TFw.save(nodeFeatures=nodeFeatures, edgeFeatures={}, metaData=metaData)


# # Diffs
#
# Check differences with previous versions.

# In[ ]:


utils.checkDiffs(thisTempTf, thisTf, only=set(nodeFeatures))


# # Deliver
#
# Copy the new TF features from the temporary location where they have been created to their final destination.
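# In[ ]:


# A sketch of the deliver step (my own, not the pipeline's helper): copy the freshly written
# .tf files from the temporary location to their final destination. The ETCBC pipeline
# `utils` module may offer a dedicated function for this; the copy below is an assumption.
from shutil import copyfile

os.makedirs(thisTf, exist_ok=True)
for feature in nodeFeatures:
    copyfile(f"{thisTempTf}/{feature}.tf", f"{thisTf}/{feature}.tf")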