def loadTf():
    print(f'Load TF dataset for the first time')
    TF = Fabric(locations=TF_PATH, modules=[''])
    api = TF.load('')
    allFeatures = TF.explore(silent=False, show=True)
    loadableFeatures = allFeatures['nodes'] + allFeatures['edges']
    TF.load(loadableFeatures, add=True)
    return api


print('All done')
def loadTf():
    TF = Fabric(locations=[OUT_DIR])
    allFeatures = TF.explore(silent=True, show=True)
    loadableFeatures = allFeatures['nodes'] + allFeatures['edges']
    api = TF.load(loadableFeatures, silent=False)
    if api:
        print(f'max node = {api.F.otype.maxNode}')
        print(api.F.root.freqList()[0:20])
def loadTf():
    TF = Fabric(locations=[f'{TF_PATH}/{VERSION}'])
    allFeatures = TF.explore(silent=True, show=True)
    loadableFeatures = allFeatures['nodes'] + allFeatures['edges']
    api = TF.load(loadableFeatures)
    if api:
        print(f'max node = {api.F.otype.maxNode}')
        print(api.F.root.freqList()[0:20])
def loadTf(outDir):
    TF = Fabric(locations=[outDir])
    allFeatures = TF.explore(silent=True, show=True)
    loadableFeatures = allFeatures["nodes"] + allFeatures["edges"]
    api = TF.load(loadableFeatures, silent=False)
    if api:
        print(f"max node = {api.F.otype.maxNode}")
        print("Frequencies of words")
        for (word, n) in api.F.letters.freqList()[0:20]:
            print(f"{n:>6} x {word}")
def gather(locations, modules):
    TF = Fabric(locations=locations, modules=modules, silent=True)
    api = TF.load(FEATURES, silent=True)
    for node in api.F.otype.s('book'):
        book = api.T.sectionFromNode(node)[0]
        print(book)
        dump_book(api, book)
    with open(os.path.join(DATADIR, 'verse_nodes.pkl'), 'wb') as f:
        pickle.dump(VERSE_NODES, f)
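The constants FEATURES and DATADIR, the list VERSE_NODES, and the helper dump_book are not shown in this excerpt. The following is a hypothetical sketch of what they could look like, purely for illustration; the names, feature list, and directory are assumptions, not the original code.

import os
import pickle

FEATURES = 'otype book chapter verse g_word_utf8'  # assumed feature list
DATADIR = 'data'                                   # assumed output directory
VERSE_NODES = []


def dump_book(api, book):
    """Hypothetical helper: collect the verse nodes belonging to one book."""
    for verse in api.F.otype.s('verse'):
        if api.T.sectionFromNode(verse)[0] == book:
            VERSE_NODES.append(verse)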
def load_tf(self):
    '''
    Loads an instance of TF if necessary.
    '''
    # load BHSA Hebrew data
    TF = Fabric(bhsa_data_paths, silent=True)
    tf_api = TF.load('''
        function lex vs language
        pdp freq_lex gloss domain ls
        heads prep_obj mother rela
        typ sp sem_domain sem_domain_code
    ''', silent=True)
    self.tf_api = tf_api
def load_tf_bhsa(self):
    '''
    Loads a TF instance of the BHSA dataset.
    '''
    TF = Fabric(
        locations='~/github',
        modules=[
            'etcbc/bhsa/tf/c',
            'semantics/phase1/tf/c',
        ],  # modify paths here for your system
        silent=True,
    )
    api = TF.load('''
        book chapter verse
        function lex vs language
        pdp freq_lex gloss domain ls
        heads
    ''', silent=True)
    B = Bhsa(api, '4. Semantic Space Construction', version='c')
    return api, B
def main():
    TF = Fabric(modules=['hebrew/etcbc4c'],
                locations='~/VersionControl/etcbc-data',
                silent=True)
    api = TF.load('language g_word_utf8 lex_utf8 vs vt gn nu ps', silent=True)
    api.makeAvailableIn(globals())

    data = Databank()
    for n in N():
        try:
            handle(n, data)
        except (KeyError, ValueError):
            pass
    print(len(data.verbs), len(data.roots))

    with open('etcbc-verbs.csv', 'w') as csvverbs:
        verbwr = csv.writer(csvverbs, quoting=csv.QUOTE_MINIMAL)
        # verbwr.writerow(['id', 'verb', 'root', 'stem', 'tense', 'person', 'gender', 'number', 'active'])
        i = VERB_STARTID
        for verb in data.verbs:
            verbwr.writerow([
                i, verb.verb, verb.root, verb.stem, verb.tense,
                verb.person if verb.person is not None else 'NULL',
                verb.gender if verb.gender is not None else 'NULL',
                verb.number if verb.number is not None else 'NULL',
                1,
            ])
            i += 1

    with open('etcbc-roots.csv', 'w') as csvroots:
        rootwr = csv.writer(csvroots, quoting=csv.QUOTE_MINIMAL)
        # rootwr.writerow(['id', 'root', 'root_kind_id'])
        i = ROOT_STARTID
        for root in data.roots:
            rootwr.writerow([i, root.lex, 1])
            i += 1
from tf.fabric import Fabric
import collections
import sys

# https://etcbc.github.io/bhsa/features/hebrew/4b/features/comments/g_lex_utf8.html
TF = Fabric(locations='/home/chaim/github/text-fabric-data', modules=['hebrew/etcbc4c'])
# TF = Fabric(locations='c:/josh/text-fabric-data/text-fabric-data', modules=['hebrew/etcbc4c'])
api = TF.load(
    'sp lex g_word g_word_utf8 trailer_utf8 ls typ rela function qere_utf8 qere'
)
api.makeAvailableIn(globals())

F = api.F
T = api.T
C = api.C
L = api.L

# print(sorted(T.formats))


def print_original_words():
    for i in range(1, 12):
        print(api.T.text([i], 'text-orig-full'))

# for w in F.otype.s('word'):
#     word, part_of_speech = F.g_word.v(w), F.sp.v(w)
#     print(word, part_of_speech)
#     if w == 14:
# # Just to see whether everything loads and the precomputing of extra information works out.
# Moreover, if you want to work with these features, the precomputing has already been done,
# and everything is quicker in subsequent runs.
#
# We issue a load statement to trigger the precomputing of extra data.
# Note that all features specified in the text formats of the `otext` config feature
# will be loaded, as well as the features for sections.
#
# At that point we have access to the full list of features.
# We grab them and are going to load them all!

# In[5]:

utils.caption(4, 'Load and compile standard TF features')
TF = Fabric(locations=thisTf, modules=[''])
api = TF.load('')

utils.caption(4, 'Load and compile all other TF features')
allFeatures = TF.explore(silent=False, show=True)
loadableFeatures = allFeatures['nodes'] + allFeatures['edges']
api = TF.load(loadableFeatures)
api.makeAvailableIn(globals())

# # Examples

# In[12]:

utils.caption(4, 'Basic test')
utils.caption(4, 'First verse in all formats')
for fmt in T.formats:
    utils.caption(0, '{}'.format(fmt), continuation=True)
# In[14]:

utils.caption(4, "Load the existing TF dataset")
TF = Fabric(locations=[coreTf, thisTf], modules=[""])

# We instruct the API to load data.

# In[8]:

# In[15]:

api = TF.load("""
    function rela typ
    g_word_utf8 trailer_utf8
    lex prs uvf sp pdp ls vs vt
    nametype gloss
    book chapter verse label number
    s_manual f_correction
    valence predication grammatical original lexical semantic
    mother
""")
api.makeAvailableIn(globals())

# # Indicators
#
# Here we specify by what features we recognize key constituents.
# We use predominantly features that come from the correction/enrichment workflow.

# In[9]:

# pf ... : predication feature
# gf_... : grammatical feature
otextInfo = dict(line[1:].split('=', 1) for line in LEX_FORMATS.strip('\n').split('\n'))
for x in sorted(otextInfo.items()):
    utils.caption(0, '{:<30} = "{}"'.format(*x))

# # Lexicon preparation
# We add lexical data.
# The lexical data will not be added as features of words, but as features of lexemes.
# The lexemes will be added as fresh nodes, of a new type `lex`.

# In[8]:

utils.caption(4, 'Load the existing TF dataset')
TF = Fabric(locations=thisTf, modules=[''])
vocLex = ' g_voc_lex g_voc_lex_utf8 ' if DO_VOCALIZED_LEXEME else ''
api = TF.load('lex lex_utf8 language sp ls gn ps nu st oslots {} {}'.format(
    vocLex, EXTRA_OVERLAP))
api.makeAvailableIn(globals())

# # Text pass
# We map the values in the language feature to standardized ISO values: `arc` and `hbo`.
# We run over all word occurrences, grab the language and lexeme identifier, and create for each
# unique pair a new lexeme node.
#
# We remember the mapping between nodes and lexemes.
#
# This stage does not yet involve the lexical files.

# In[9]:

utils.caption(4, 'Collect lexemes from the text')
if thisOtext == '':
    utils.caption(0, 'No additional text formats provided')
    otextInfo = {}
else:
    utils.caption(0, 'New text formats')
    otextInfo = dict(line[1:].split('=', 1) for line in thisOtext.strip('\n').split('\n'))
    for x in sorted(otextInfo.items()):
        utils.caption(0, '{:<30} = "{}"'.format(*x))

# In[7]:

utils.caption(4, 'Load the existing TF dataset')
TF = Fabric(locations=thisTf, modules=[''])
api = TF.load('label g_word g_cons trailer_utf8')
api.makeAvailableIn(globals())

# # Verse labels
# The ketiv-qere files deal with different verse labels.
# We make a mapping between verse labels and nodes.

# In[8]:

utils.caption(0, 'Mapping between verse labels and verse nodes')
nodeFromLabel = {}
for vs in F.otype.s('verse'):
    lab = F.label.v(vs)
    nodeFromLabel[lab] = vs
import os
import json
import codecs
import csv
# from flask import request

from sblgnt_back.controller import translate as tr
from sblgnt_back.lib import vcodeparser as vp

SBLGNT = 'sblgnt'
TG = Fabric(modules=SBLGNT, silent=False)
gnt = TG.load('''
    book chapter verse
    g_word trailer
    otext otype
    psp Case Gender Mood Number Person Tense Voice
    UnicodeLemma gloss strong transliteration
    ClType function
''')

# Load the translation text
# Quote the translation from our own json files


def json_to_verse(ver, book, chp, bib):
    path = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
    if bib == "old":
        book_code = {
            "Genesis": 0,
            "Exodus": 1,
sys.path.append('scripts')
from build_tables import build_sample_tables

# fire up Text-Fabric with BHSA data
TF = Fabric(snakemake.input['tf_mods'], silent='deep')
features = """
    sp pdp vs vt ps gn nu
    lex language gloss
    voc_lex voc_lex_utf8
    function number label
    typ code rela mother domain txt
    genre sense
    nhead funct_assoc
"""
bhsa = TF.load(features, silent='deep')
F, E, T, L, Fs = bhsa.F, bhsa.E, bhsa.T, bhsa.L, bhsa.Fs

# load GBI Hebrew data
with open(snakemake.input.bhsa2gbi, 'rb') as infile:
    bhsa2gbi = pickle.load(infile)

# preprocess data
bookmap = get_book_maps(bhsa)
loca_lexs = get_loca_assocs(bhsa)


def join_on(nodes, jchar='_', default=''):
    """Join words on a char and ensure they are pre/appended with that char.

    The pre/appending provides easy-to-match word boundaries.
if SCRIPT:
    (good, work) = utils.mustRun(None,
                                 '{}/.tf/{}.tfx'.format(thisTf, newFeatures[0]),
                                 force=FORCE)
    if not good:
        stop(good=False)
    if not work:
        stop(good=True)

# # Load existing data

# In[17]:

utils.caption(4, 'Loading relevant features')
TF = Fabric(locations=thisTf, modules=[''])
api = TF.load('book')
api.makeAvailableIn(globals())

nodeFeatures = {}
nodeFeatures['book@la'] = {}

bookNodes = []
for b in F.otype.s('book'):
    bookNodes.append(b)
    nodeFeatures['book@la'][b] = F.book.v(b)

for (langCode, langBookNames) in bookNames.items():
    nodeFeatures['book@{}'.format(langCode)] = dict(zip(bookNodes, langBookNames))

utils.caption(0, '{} book name features created'.format(len(nodeFeatures)))
def ingest_french(paths):
    """Match the French data to our dataset."""
    # load the French dataset
    with open(paths['source'], 'r') as infile:
        reader = csv.reader(infile, delimiter='\t')
        french_data = list(reader)

    # load the BHSA Hebrew data for matching the Hebrew text
    TF = Fabric(locations=paths['bhsa'])
    API = TF.load('g_word_utf8')
    F, T, L = API.F, API.T, API.L

    # match the Hebrew verbs in the French data with the
    # Hebrew verbs in BHSA;
    # we treat the ref strings as unique IDs;
    # we use 2 dicts: one to hold ID-to-BHSA-node mappings,
    # another to hold the IDs to the French data
    french2bhsa = {}
    french2data = {}
    frenchverses = {}
    for row in french_data:

        # parse French data
        wid = row[0]
        hb_txt, hb_lex, hb_tag, hb_prev = row[1:5]
        fr_words, fr_verse = row[5:7]
        bk, ch, vs, sg, wnum = parse_refstring(wid)
        french2data[wid] = {
            'wid': wid,
            'hebrew': hb_txt,
            'hebrew_parse': hb_tag,
            'french': fr_words,
        }

        # look up BHSA data and get the verse node
        tf_book = int2book[bk]
        vrs_node = T.nodeFromSection((tf_book, ch, vs))
        if vrs_node is None:
            raise Exception((tf_book, ch, vs), wid, hb_txt)

        # save the French verse text
        ref_string = str((tf_book, ch, vs))
        frenchverses[ref_string] = fr_verse
        french2data[wid]['ref'] = ref_string

        # get the closest matching word from the verse;
        # NB we iterate over the verse words in reversed order
        # so that if there are 2+ words with equivalent distances,
        # we always end on the one that is first in the verse;
        # the match is then added to a set so that it is not
        # available for subsequent matches
        french2bhsa[wid] = BhsaWord(0, float('inf'))  # initialize with dummy
        matched = set()
        for word_node in reversed(L.d(vrs_node, 'word')):
            if word_node in matched:
                continue
            bhsa_txt = T.text(word_node)
            dist = levdist(bhsa_txt, hb_txt)
            if french2bhsa[wid].dist > dist:
                french2bhsa[wid] = BhsaWord(word_node, dist)
        matched.add(french2bhsa[wid].node)

    # iterate over both French dicts and assemble
    # into one BHSA dict
    bhsa2french = {}
    for wid, bhsa_word in french2bhsa.items():
        bhsa_node = bhsa_word.node
        if bhsa_node != 0:
            bhsa2french[bhsa_node] = french2data[wid]

    # the linking is complete
    with open(paths['out'], 'w') as outfile:
        json.dump(bhsa2french, outfile, indent=2, ensure_ascii=False)
    with open(paths['out_verses'], 'w') as outfile:
        json.dump(frenchverses, outfile, indent=2, ensure_ascii=False)
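The levdist function called above is not defined in this excerpt. A minimal sketch follows, assuming it is a plain Levenshtein edit distance between the two surface strings; the original may use a library implementation instead.

def levdist(a, b):
    """Levenshtein edit distance between strings a and b (assumed helper)."""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(
                prev[j] + 1,               # deletion
                curr[j - 1] + 1,           # insertion
                prev[j - 1] + (ca != cb),  # substitution
            ))
        prev = curr
    return prev[-1]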
provenanceMetadata = dict(
    dataset='BHSA',
    datasetName='Biblia Hebraica Stuttgartensia Amstelodamensis',
    version=VERSION,
    author='Eep Talstra Centre for Bible and Computer',
    encoders='Constantijn Sikkel (QDF), and Dirk Roorda (TF)',
    website='https://shebanq.ancient-data.org',
    email='*****@*****.**',
)

# In[7]:

utils.caption(4, 'Load the existing TF dataset')
TF = Fabric(locations=thisTf, modules=[''])
api = TF.load('label number')
api.makeAvailableIn(globals())

# # Clause atom identifiers in .px
# We must map the way the clause_atoms are identified in the `.px` files
# to nodes in TF.

# In[8]:

utils.caption(0, '\tLabeling clause_atoms')
labelNumberFromNode = {}
nodeFromLabelNumber = {}
for n in N():
    otype = F.otype.v(n)
    if otype == 'book':
from tf.fabric import Fabric
import collections
import sys

# https://etcbc.github.io/bhsa/features/hebrew/4b/features/comments/g_lex_utf8.html
# TF = Fabric(locations='/home/chaim/github/text-fabric-data', modules=['hebrew/etcbc4c'])
TF = Fabric(locations='c:/josh/text-fabric-data/text-fabric-data', modules=['hebrew/etcbc4c'])
api = TF.load('qere_utf8 qere lex0 g_word_utf8 g_word')
api.makeAvailableIn(globals())

F = api.F
T = api.T
C = api.C
L = api.L


def print_original_words():
    for i in range(1, 12):
        print(api.T.text([i], 'text-orig-full'))

# for w in F.otype.s('word'):
#     word, part_of_speech = F.g_word.v(w), F.sp.v(w)
#     print(word, part_of_speech)
#     if w == 14:
#         break

import sys
if SCRIPT:
    (good, work) = utils.mustRun(None,
                                 '{}/.tf/{}.tfx'.format(thisTf, newFeatures[0]),
                                 force=FORCE)
    if not good:
        stop(good=False)
    if not work:
        stop(good=True)

# # Collect
#
# We collect the statistics.

# In[6]:

utils.caption(4, 'Loading relevant features')
TF = Fabric(locations=thisTf, modules=[''])
api = TF.load('{} {} {}'.format(LANG_FEATURE, LEX_FEATURE, OCC_FEATURE))
api.makeAvailableIn(globals())

hasLex = 'lex' in set(F.otype.all)

# In[7]:

utils.caption(0, 'Counting occurrences')
wstats = {
    'freqs': {
        'lex': collections.defaultdict(lambda: collections.Counter()),
        'occ': collections.defaultdict(lambda: collections.Counter()),
    },
    'ranks': {
        'lex': collections.defaultdict(lambda: {}),
        'occ': collections.defaultdict(lambda: {}),
from bottle import Bottle
from tf.fabric import Fabric

### set up app - we're going to use it for gzip middleware ###
app = Bottle()

### load up TF ###
TF = Fabric(locations='../text-fabric-data', modules='hebrew/etcbc4c')
api = TF.load('''
    book chapter verse
    sp nu gn ps vt vs st
    otype det
    g_word_utf8 trailer_utf8
    lex_utf8 lex voc_utf8
    g_prs_utf8 g_uvf_utf8
    prs_gn prs_nu prs_ps
    g_cons_utf8
    gloss sdbh lxxlexeme
    accent accent_quality
    tab typ
''')
api.makeAvailableIn(globals())


### WORD API ###

def remove_na_and_empty_and_unknown(list_to_reduce):
    templist = list_to_reduce
    keys_to_remove = set()
    for key, value in templist.items():
import json
import pickle
import collections
from tf.fabric import Fabric
from pathlib import Path
from verb_form import get_verbform

# load basic BHSA data with Text-Fabric
TF = Fabric(snakemake.input, silent='deep')
bhsa = TF.load('pdp lex vt language', silent='deep')
F, L = bhsa.F, bhsa.L

# load GBI data for verb_form creation
with open(snakemake.input.bhsa2gbi, 'rb') as infile:
    bhsa2gbi = pickle.load(infile)

# loop through all verbs stored in the BHSA
# and select those forms specified by the wildcard
samples = []
for node in F.pdp.s('verb'):

    # skip non-hebrew words
    if F.language.v(node) != 'Hebrew':
        continue

    verb_form = get_verbform(node, bhsa, bhsa2gbi)
    get_form = snakemake.wildcards.verb

    # handle cohortatives / jussives
    if get_form == 'yqtl' and verb_form in {'jussM', 'cohoM'}:
        samples.append(node)
# @app.after_request
# def set_response_headers(r):
#     r.headers['Cache-Control'] = 'public, max-age=3600'
#     return r

### Load up TF ###
ETCBC = 'hebrew/etcbc4c'
TF = Fabric(locations='text-fabric-data', modules=ETCBC)
# api = TF.load('book')
api = TF.load('''
    book chapter verse
    nu gn ps vt vs st
    otype typ function det pdp
    qere_utf8 qere_trailer_utf8
    g_word_utf8 trailer_utf8
    lex_utf8 lex voc_utf8
    g_prs_utf8 g_uvf_utf8
    prs_gn prs_nu prs_ps
    g_cons_utf8
    gloss phono
''')
api.makeAvailableIn(globals())

# related to the kml files
book_abb = {
    "Genesis": "gen",
    "Exodus": "exod",
    "Leviticus": "lev",
    "Numbers": "num",
    "Deuteronomy": "deut",
import sys
import unittest

from tf.fabric import Fabric

# LOAD THE TEST CORPUS

TF = Fabric('tf')
api = TF.load('sign name')
F = api.F
S = api.S

# MAKE CUSTOM SETS OF NODES

Sign = set(range(1, F.otype.maxSlot + 1))
Node = set(range(1, F.otype.maxNode + 1))
sets = dict(Sign=Sign, Node=Node)

# RUN A QUERY, OPTIONALLY WITH CUSTOM SETS


def query(template, sets=None):
    return (tuple(S.search(template)) if sets is None
            else tuple(S.search(template, sets=sets)))


# DEFINE THE TESTS

relationKey = {
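A hypothetical usage of the query helper above (not part of the original test file): when the sets argument is passed, the custom set names Sign and Node can be used in search templates in the same way as ordinary node types.

# every member of the custom Sign set, i.e. every slot node
allSigns = query('''
Sign
''', sets=sets)
print(f'{len(allSigns)} signs found')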
else:
    locations = {}

if not locations:
    raise Exception('Please add your data paths in bhsa.py line 30.')

for path in locations:
    if not os.path.exists(path):
        raise Exception(
            f'You need an extra datamodule in {os.path.dirname(path)}. '
            f'Do "git pull {locations[path]}" to this location.'
        )

# load TF and BHSA data
TF = Fabric(locations=locations.keys(), modules='2017', silent=True)
api = TF.load('''
    otype language
    book chapter verse
    function domain
    typ pdp kind tree
    crossref
''', silent=True)
api.makeAvailableIn(globals())  # globalize TF methods

# define book groups & names
lbh_books = ('Song_of_songs', 'Ecclesiastes', 'Esther', 'Daniel', 'Ezra',
             'Nehemiah', '1_Chronicles', '2_Chronicles')
sbh_books = ('Genesis', 'Exodus', 'Leviticus', 'Deuteronomy', 'Joshua',
             'Judges', '1_Samuel', '2_Samuel', '1_Kings', '2_Kings')
test_books = ('Jonah', 'Ruth')

all_books = tuple(T.sectionFromNode(b)[0]
ENTRY_HEB = "voc_lex_utf8"
PHONO_TRAILER = "phono_trailer"
LANGUAGE = "languageISO"

# In[ ]:

TF = Fabric(locations=[thisRepo, phonoRepo], modules=[tfDir])
api = TF.load(f"""
    g_cons g_cons_utf8 g_word g_word_utf8 trailer_utf8
    {QERE} {QERE_TRAILER}
    {LANGUAGE} lex g_lex lex_utf8
    sp pdp ls
    {ENTRY} {ENTRY_HEB}
    vt vs gn nu ps st
    nme pfm prs uvf vbe vbs
    gloss nametype root ls
    pargr
    phono {PHONO_TRAILER}
    function typ rela txt det code tab
    number
    freq_lex freq_occ rank_lex rank_occ
    book chapter verse
""")
api.makeAvailableIn(globals())

# In[6]:

hasLex = "lex" in set(F.otype.all)

# # Data model
import os, sys, collections
from tf.fabric import Fabric

# locations = '~/github/etcbc'
locations = '/home/oem/text-fabric-data/etcbc'
coreModule = 'bhsa'
sources = [coreModule, 'phono']
# version = '2017'
version = 'c'
tempDir = os.path.expanduser(f'{locations}/{coreModule}/_temp/{version}/r')
tableFile = f'{tempDir}/{coreModule}{version}.txt'

modules = [f'{s}/tf/{version}' for s in sources]
TF = Fabric(locations=locations, modules=modules)

api = TF.load('')
api = TF.load((
    'suffix_person', 'tab', 'trailer', 'trailer_utf8', 'txt', 'typ', 'uvf',
    'vbe', 'vbs', 'verse', 'voc_lex', 'voc_lex_utf8', 'vs', 'vt',
    'distributional_parent', 'functional_parent', 'mother', 'oslots',
))
allFeatures = TF.explore(silent=False, show=True)
loadableFeatures = allFeatures['nodes'] + allFeatures['edges']
del api
api = TF.load(loadableFeatures)
api.makeAvailableIn(globals())
print('done')
VERSION = "0.2"
TF_PATH = f"{TF_DIR}/{VERSION}"
TF = Fabric(locations=TF_PATH)
# -

# We ask for a list of all features:

allFeatures = TF.explore(silent=True, show=True)
loadableFeatures = allFeatures["nodes"] + allFeatures["edges"]
loadableFeatures

# We load all features:

api = TF.load(loadableFeatures, silent=False)

# You see that all files are marked with a `T`.
#
# That means that Text-Fabric loads the features by reading the plain text `.tf` files.
# But after reading, it makes a binary equivalent and stores it as a `.tfx`
# file in the hidden `.tf` directory next to it.
#
# Furthermore, you see some lines marked with `C`. Here Text-Fabric is computing derived data,
# mostly about sections, the order of nodes, and the relative positions of nodes
# with respect to the slots they are linked to.
#
# The results of this pre-computation are also stored in that hidden `.tf` directory.
#
# The next time, Text-Fabric loads the data from their binary `.tfx` files, which is much faster.
# And the pre-computation step will be skipped.
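A minimal sketch (not part of the snippet above) of the effect described in those comments: once the `.tfx` files and the precomputed data exist, creating a fresh Fabric for the same location and loading the same features is typically much faster than the very first load. The helper name below is an illustration, not from the original.

import time
from tf.fabric import Fabric


def timedLoad():
    # a fresh Fabric instance, pointing at the same dataset as above
    TF = Fabric(locations=TF_PATH, silent=True)
    t0 = time.time()
    TF.load(loadableFeatures, silent=True)
    return time.time() - t0


# the first call after the .tf files were (re)written is the slow, cold one;
# later calls read the cached binary .tfx files and skip the precomputation
print(f'load took {timedLoad():.1f}s')
print(f'load took {timedLoad():.1f}s')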
def genTrees(version):
    C = setVersion(version)
    bhsa = C.bhsa
    sp = C.sp
    rela = C.rela
    ptyp = C.ptyp
    ctyp = C.ctyp
    g_word_utf8 = C.g_word_utf8
    tfDir = C.tfDir

    TF = Fabric(locations=f"{GH}/{ORG}", modules=bhsa)
    api = TF.load(f"{sp} {rela} {ptyp} {ctyp} {g_word_utf8} mother")
    E = api.E
    F = api.F
    Fs = api.Fs

    def getTag(node):
        otype = F.otype.v(node)
        tag = TYPE_TABLE[otype]
        if tag == "P":
            tag = Fs(ptyp).v(node)
        elif tag == "C":
            tag = ccrTable[Fs(rela).v(node)]
        isWord = tag == ""
        pos = POS_TABLE[Fs(sp).v(node)] if isWord else None
        slot = node if isWord else None
        text = f'"{Fs(g_word_utf8).v(node)}"' if isWord else None
        return (tag, pos, slot, text, isWord)

    def getTagN(node):
        otype = F.otype.v(node)
        tag = TYPE_TABLE[otype]
        if tag == "P":
            tag = Fs(ptyp).v(node)
        elif tag == "C":
            tag = ccrTable[Fs(rela).v(node)]
        isWord = tag == ""
        if not isWord:
            tag += "{" + str(node) + "}"
        pos = POS_TABLE[Fs(sp).v(node)] if isWord else None
        slot = node if isWord else None
        text = f'"{Fs(g_word_utf8).v(node)}"' if isWord else None
        return (tag, pos, slot, text, isWord)

    treeTypes = ("sentence", "clause", "phrase", "subphrase", "word")
    (rootType, leafType, clauseType, phraseType) = (
        treeTypes[0],
        treeTypes[-1],
        treeTypes[1],
        treeTypes[2],
    )
    ccrTable = dict((c[0], c[1][1]) for c in CCR_INFO.items())
    ccrClass = dict((c[0], c[1][0]) for c in CCR_INFO.items())

    tree = Tree(
        TF,
        otypes=treeTypes,
        phraseType=phraseType,
        clauseType=clauseType,
        ccrFeature=rela,
        ptFeature=ptyp,
        posFeature=sp,
        motherFeature="mother",
    )
    tree.restructureClauses(ccrClass)
    results = tree.relations()
    TF.info("Ready for processing")

    skip = set()
    TF.info("Verifying whether all slots are preserved under restructuring")
    TF.info(f"Expected mismatches: {EXPECTED_MISMATCHES.get(version, '??')}")

    errors = []
    # i = 10
    for snode in F.otype.s(rootType):
        declaredSlots = set(E.oslots.s(snode))
        results = {}
        thisgood = {}
        for kind in ("e", "r"):
            results[kind] = set(
                lt for lt in tree.getLeaves(snode, kind)
                if F.otype.v(lt) == leafType
            )
            thisgood[kind] = declaredSlots == results[kind]
            # if not thisgood[kind]:
            #     print(f"{kind} D={declaredSlots}\n  L={results[kind]}")
            # i -= 1
            # if i == 0: break
        if False in thisgood.values():
            errors.append((snode, thisgood["e"], thisgood["r"]))
    nErrors = len(errors)
    if nErrors:
        TF.error(f"{len(errors)} mismatches:")
        mine = min(20, len(errors))
        skip |= {e[0] for e in errors}
        for (s, e, r) in errors[0:mine]:
            TF.error(
                (f"{s} embedding: {'OK' if e else 'XX'};"
                 f" restructd: {'OK' if r else 'XX'}"),
                tm=False,
            )
    else:
        TF.info(f"{len(errors)} mismatches")

    TF.info(f"Exporting {rootType} trees to TF")
    s = 0
    chunk = 10000
    sc = 0
    treeData = {}
    treeDataN = {}
    for node in F.otype.s(rootType):
        if node in skip:
            continue
        (treeRep, wordsRep, bSlot) = tree.writeTree(
            node, "r", getTag, rev=False, leafNumbers=True
        )
        (treeNRep, wordsNRep, bSlotN) = tree.writeTree(
            node, "r", getTagN, rev=False, leafNumbers=True
        )
        treeData[node] = treeRep
        treeDataN[node] = treeNRep
        s += 1
        sc += 1
        if sc == chunk:
            TF.info(f"{s} trees composed")
            sc = 0
    TF.info(f"{s} trees composed")

    nodeFeatures = dict(tree=treeData, treen=treeDataN)
    metaData = dict(
        tree=dict(
            valueType="str",
            description="penn treebank representation for sentences",
            converter="Dirk Roorda",
            convertor="trees.ipynb",
            url="https://github.com/etcbc/trees/trees.ipynb",
            coreData="BHSA",
            coreVersion=version,
        ),
        treen=dict(
            valueType="str",
            description="penn treebank representation for sentences with node numbers included",
            converter="Dirk Roorda",
            convertor="trees.ipynb",
            url="https://github.com/etcbc/trees/trees.ipynb",
            coreData="BHSA",
            coreVersion=version,
        ),
    )

    TF.info("Writing tree feature to TF")
    TFw = Fabric(locations=tfDir, silent=True)
    TFw.save(nodeFeatures=nodeFeatures, edgeFeatures={}, metaData=metaData)
sp = "part_of_speech" if VERSION == "3" else "sp"
rela = "clause_constituent_relation" if VERSION == "3" else "rela"
ptyp = "phrase_type" if VERSION == "3" else "typ"
ctyp = "clause_atom_type" if VERSION == "3" else "typ"
g_word_utf8 = "text" if VERSION == "3" else "g_word_utf8"
# -

# In[7]:

api = TF.load(
    f"""
    {sp}
    {rela}
    {ptyp}
    {ctyp}
    {g_word_utf8}
    mother
"""
)
api.makeAvailableIn(globals())

# We are going to make convenient labels for constituents, words and clauses, based on
# the types of textual objects and the features `sp` and `rela`.

# ## Node types

# In[8]: