def loadTf():
    TF = Fabric(locations=[f'{TF_PATH}/{VERSION}'])
    allFeatures = TF.explore(silent=True, show=True)
    loadableFeatures = allFeatures['nodes'] + allFeatures['edges']
    api = TF.load(loadableFeatures)
    if api:
        print(f'max node = {api.F.otype.maxNode}')
        print(api.F.root.freqList()[0:20])
def makeTf():
    tfPathV = f'{TF_PATH}/{VERSION}'
    if os.path.exists(tfPathV):
        rmtree(tfPathV)
    TF = Fabric(locations=[tfPathV])
    TF.save(nodeFeatures=nodeFeatures, edgeFeatures=edgeFeatures, metaData=metaData)
def loadTf():
    TF = Fabric(locations=[OUT_DIR])
    allFeatures = TF.explore(silent=True, show=True)
    loadableFeatures = allFeatures['nodes'] + allFeatures['edges']
    api = TF.load(loadableFeatures, silent=False)
    if api:
        print(f'max node = {api.F.otype.maxNode}')
        print(api.F.root.freqList()[0:20])
def loadTf(outDir):
    TF = Fabric(locations=[outDir])
    allFeatures = TF.explore(silent=True, show=True)
    loadableFeatures = allFeatures["nodes"] + allFeatures["edges"]
    api = TF.load(loadableFeatures, silent=False)
    if api:
        print(f"max node = {api.F.otype.maxNode}")
        print("Frequencies of words")
        for (word, n) in api.F.letters.freqList()[0:20]:
            print(f"{n:>6} x {word}")
def gather(locations, modules):
    TF = Fabric(locations=locations, modules=modules, silent=True)
    api = TF.load(FEATURES, silent=True)
    for node in api.F.otype.s('book'):
        book = api.T.sectionFromNode(node)[0]
        print(book)
        dump_book(api, book)
    with open(os.path.join(DATADIR, 'verse_nodes.pkl'), 'wb') as f:
        pickle.dump(VERSE_NODES, f)
def convert(source, ocred, pages, versionTf):
    global pageNums
    global SRC_FILE
    global TYPE_MAP
    global HAS_TOC
    global OCRED
    global U
    global VERSION_TF

    U = UChar()
    pageNums = parseNums(pages)
    workInfo = WORKS[source]
    dest = getTfDest(source, versionTf)
    (SRC_FILE, OCRED) = getFile(source, ocred)
    HAS_TOC = workInfo.get("toc", False)
    TYPE_MAP = TYPE_MAPS[OCRED]
    VERSION_TF = versionTf

    cv = CV(Fabric(locations=dest))

    return cv.walk(
        director,
        slotType,
        otext=otext[None] | otext[OCRED],
        generic=generic(source),
        intFeatures=intFeatures[None] | intFeatures[OCRED],
        featureMeta=featureMeta[None] | featureMeta[OCRED],
        generateTf=True,
    )
def load_tf(self):
    '''
    Loads an instance of TF if necessary.
    '''
    # load BHSA Hebrew data
    TF = Fabric(bhsa_data_paths, silent=True)
    tf_api = TF.load('''
        function lex vs language
        pdp freq_lex gloss domain ls
        heads prep_obj mother rela
        typ sp sem_domain sem_domain_code
    ''', silent=True)
    self.tf_api = tf_api
def load_tf_bhsa(self):
    '''
    Loads a TF instance of the BHSA dataset.
    '''
    TF = Fabric(
        locations='~/github',
        modules=['etcbc/bhsa/tf/c', 'semantics/phase1/tf/c'],  # modify paths here for your system
        silent=True,
    )
    api = TF.load('''
        book chapter verse
        function lex vs language
        pdp freq_lex gloss domain ls
        heads
    ''', silent=True)
    B = Bhsa(api, '4. Semantic Space Construction', version='c')
    return api, B
def main():
    TF = Fabric(modules=['hebrew/etcbc4c'], locations='~/VersionControl/etcbc-data', silent=True)
    api = TF.load('language g_word_utf8 lex_utf8 vs vt gn nu ps', silent=True)
    api.makeAvailableIn(globals())

    data = Databank()
    for n in N():
        try:
            handle(n, data)
        except (KeyError, ValueError):
            pass
    print(len(data.verbs), len(data.roots))

    with open('etcbc-verbs.csv', 'w') as csvverbs:
        verbwr = csv.writer(csvverbs, quoting=csv.QUOTE_MINIMAL)
        # verbwr.writerow(['id', 'verb', 'root', 'stem', 'tense', 'person', 'gender', 'number', 'active'])
        i = VERB_STARTID
        for verb in data.verbs:
            verbwr.writerow([
                i,
                verb.verb,
                verb.root,
                verb.stem,
                verb.tense,
                verb.person if verb.person is not None else 'NULL',
                verb.gender if verb.gender is not None else 'NULL',
                verb.number if verb.number is not None else 'NULL',
                1,
            ])
            i += 1

    with open('etcbc-roots.csv', 'w') as csvroots:
        rootwr = csv.writer(csvroots, quoting=csv.QUOTE_MINIMAL)
        # rootwr.writerow(['id', 'root', 'root_kind_id'])
        i = ROOT_STARTID
        for root in data.roots:
            rootwr.writerow([i, root.lex, 1])
            i += 1
def loadTf():
    print('Load TF dataset for the first time')
    TF = Fabric(locations=TF_PATH, modules=[''])
    api = TF.load('')
    allFeatures = TF.explore(silent=False, show=True)
    loadableFeatures = allFeatures['nodes'] + allFeatures['edges']
    TF.load(loadableFeatures, add=True)
    return api


print('All done')
import sys

from tf.fabric import Fabric
from book_formats import get_book_maps, etcbc2sbl, etcbc2abbr
from verb_form import get_verbform, get_cl_verbform
from modify_domain import permissive_q
from synvar_carc import in_dep_calc as clause_relator
from modify_cltype import simplify_cl_type
from tag_args import clause_objects, get_loca_assocs, clause_locas, clause_time, clause_args

# NB: the working directory when this script is executed is /workflow;
# because we have some utilities that we want to run from the directory
# above it, we need to append that to the path
sys.path.append('scripts')
from build_tables import build_sample_tables

# fire up Text-Fabric with BHSA data
TF = Fabric(snakemake.input['tf_mods'], silent='deep')
features = """
    sp pdp vs vt ps gn nu
    lex language gloss voc_lex voc_lex_utf8
    function number label
    typ code rela mother domain txt
    genre sense nhead funct_assoc
"""
bhsa = TF.load(features, silent='deep')
F, E, T, L, Fs = bhsa.F, bhsa.E, bhsa.T, bhsa.L, bhsa.Fs

# load GBI Hebrew data
with open(snakemake.input.bhsa2gbi, 'rb') as infile:
        afterWord += ' '
    rest = splitPunc(w[pA:]) if pA < len(w) else ()
    return ((preWord, word, afterWord),) + rest


def plainCaps(w):
    return ''.join(x.upper() for x in normalize(NFD, w) if category(x)[0] not in dia)


for word in (word1, word2):
    print(splitPunc(word))

tm = Timestamp()
TF = Fabric(locations=TF_DIR)


class Data:
    def __init__(self, bookEn):
        self.bookEn = bookEn
        self.tfFromXml = {}
        self.xmlFromTf = {}
        self.nodeNum = 1
        self.maxSlot = 0
        self.maxNode = 0
        self.paths = {}
        self.nodeFeatures = collections.defaultdict(dict)
        self.edgeFeatures = collections.defaultdict(dict)
from kimsbible import app
from kimsbible.lib import lib as kb
from kimsbible.lib import vcodeparser as vp
from kimsbible.lib import db
from kimsbible.lib.config import google_map_api, kml_url

# @app.after_request
# def set_response_headers(r):
#     r.headers['Cache-Control'] = 'public, max-age=3600'
#     return r

### Load up TF ###
ETCBC = 'hebrew/etcbc4c'
TF = Fabric(locations='text-fabric-data', modules=ETCBC)

# api = TF.load('book')
api = TF.load('''
    book chapter verse
    nu gn ps vt vs st
    otype typ function det pdp
    qere_utf8 qere_trailer_utf8
    g_word_utf8 trailer_utf8
    lex_utf8 lex voc_utf8
    g_prs_utf8 g_uvf_utf8
    prs_gn prs_nu prs_ps g_cons_utf8
    gloss phono
''')
api.makeAvailableIn(globals())
from morphological_lists import book_index, generous_name, book_abbreviation
from bottle import Bottle, hook, route, get, post, request, response, redirect, run, template, static_file
import paste.gzipper
from itertools import chain
from loadParallelText import getPTextFromRefPairArray
from tf.fabric import Fabric

### set up app - we're going to use it for gzip middleware ###
app = Bottle()

### load up TF ###
TF = Fabric(locations='../text-fabric-data', modules='hebrew/etcbc4c')
api = TF.load('''
    book chapter verse
    sp nu gn ps vt vs st
    otype det
    g_word_utf8 trailer_utf8
    lex_utf8 lex voc_utf8
    g_prs_utf8 g_uvf_utf8
    prs_gn prs_nu prs_ps g_cons_utf8
    gloss sdbh lxxlexeme
    accent accent_quality
    tab typ
''')
api.makeAvailableIn(globals())
import os, sys, re, collections
from tf.fabric import Fabric

sourceDir = os.path.expanduser('~/github/etcbc-data')
mqlFile = '{}/{}'.format(sourceDir, 'synvar.mql')
targetDir = os.path.expanduser('~/github/text-fabric-data')
tfDir = '{}/hebrew/extrabiblical'.format(targetDir)
if not os.path.exists(tfDir):
    os.makedirs(tfDir)

TF = Fabric(tfDir)

slotType = 'word'
enums = dict()
objectTypes = dict()
tables = dict()
curMonads = None
curId = None
edgeF = dict()
nodeF = dict()


def setFromSpec(spec):
    covered = set()
    for r_str in spec.split(','):
        bounds = r_str.split('-')
        if len(bounds) == 1:
            covered.add(int(r_str))
        else:
            b = int(bounds[0])
            e = int(bounds[1])
def generateTf(bookAcro, content):
    if os.path.exists(TF_PATH):
        rmtree(TF_PATH)
    os.makedirs(TF_PATH)

    print("Slicing content into features")
    cur = collections.Counter()
    curSlot = 0
    context = []
    nodeFeatures = collections.defaultdict(dict)
    edgeFeatures = collections.defaultdict(lambda: collections.defaultdict(set))
    oSlots = collections.defaultdict(set)

    for acro in allAcrosSeq:
        thisBookInfo = bookAcro[acro]
        bookName = thisBookInfo["bookName"]
        witness = thisBookInfo.get("witness", None)
        chapters = content[bookName]

        cur["book"] += 1
        bookNode = ("book", cur["book"])
        nodeFeatures["book"][bookNode] = acro
        nodeFeatures["book@en"][bookNode] = bookName
        if witness is not None:
            nodeFeatures["witness"][bookNode] = witness
        context.append(("book", cur["book"]))

        for chapterNum in chapters:
            verses = chapters[chapterNum]
            cur["chapter"] += 1
            nodeFeatures["chapter"][("chapter", cur["chapter"])] = chapterNum
            nodeFeatures["book"][("chapter", cur["chapter"])] = acro
            if witness is not None:
                nodeFeatures["witness"][("chapter", cur["chapter"])] = witness
            context.append(("chapter", cur["chapter"]))

            for verseNum in verses:
                words = verses[verseNum].strip().split()
                cur["verse"] += 1
                nodeFeatures["verse"][("verse", cur["verse"])] = verseNum
                nodeFeatures["chapter"][("verse", cur["verse"])] = chapterNum
                nodeFeatures["book"][("verse", cur["verse"])] = acro
                if witness is not None:
                    nodeFeatures["witness"][("verse", cur["verse"])] = witness
                context.append(("verse", cur["verse"]))

                for elem in splitWords(words):
                    if len(elem) != 2:
                        print(bookName, chapterNum, verseNum, words)
                        continue
                    (word, punc) = elem
                    wSyc = TR.to_syriac(word)
                    pSyc = TR.to_syriac(punc)
                    curSlot += 1
                    wordNode = ("word", curSlot)
                    nodeFeatures["word_etcbc"][wordNode] = word
                    nodeFeatures["word"][wordNode] = wSyc
                    nodeFeatures["trailer_etcbc"][wordNode] = punc
                    nodeFeatures["trailer"][wordNode] = pSyc
                    for (nt, curNode) in context:
                        oSlots[(nt, curNode)].add(curSlot)

                context.pop()
            context.pop()
        context.pop()

    if len(context):
        print("Context:", context)

    print(f"\n{curSlot:>7} x slot")
    for (nodeType, amount) in sorted(cur.items(), key=lambda x: (x[1], x[0])):
        print(f"{amount:>7} x {nodeType}")

    nValues = reduce(operator.add, (len(values) for values in nodeFeatures.values()), 0)
    print(f"{len(nodeFeatures)} node features with {nValues} values")
    print(f"{len(oSlots)} nodes linked to slots")

    print("Compiling TF data")
    print("Building warp feature otype")

    nodeOffset = {"word": 0}
    oType = {}
    n = 1
    for k in range(n, curSlot + 1):
        oType[k] = "word"
    n = curSlot + 1
    for (nodeType, amount) in sorted(cur.items(), key=lambda x: (x[1], x[0])):
        nodeOffset[nodeType] = n - 1
        for k in range(n, n + amount):
            oType[k] = nodeType
        n = n + amount
    print(f"{len(oType)} nodes")

    print("Filling in the nodes for features")
    newNodeFeatures = collections.defaultdict(dict)
    for (ft, featureData) in nodeFeatures.items():
        newFeatureData = {}
        for ((nodeType, node), value) in featureData.items():
            newFeatureData[nodeOffset[nodeType] + node] = value
        newNodeFeatures[ft] = newFeatureData
    newOslots = {}
    for ((nodeType, node), slots) in oSlots.items():
        newOslots[nodeOffset[nodeType] + node] = slots

    nodeFeatures = newNodeFeatures
    nodeFeatures["otype"] = oType
    edgeFeatures["oslots"] = newOslots

    print(f'Node features: {" ".join(nodeFeatures)}')
    print(f'Edge features: {" ".join(edgeFeatures)}')

    metaData = {
        "": commonMetaData,
        "otext": oText,
        "oslots": dict(valueType="str"),
        "book@en": langMetaData["en"],
    }
    for ft in set(nodeFeatures) | set(edgeFeatures):
        metaData.setdefault(ft, {})["valueType"] = "int" if ft in numFeatures else "str"
        metaData[ft]["description"] = (
            specificMetaData[ft] if ft in specificMetaData else "?"
        )

    TF = Fabric(locations=TF_PATH, silent=True)
    TF.save(nodeFeatures=nodeFeatures, edgeFeatures=edgeFeatures, metaData=metaData)
import os, sys, collections
from tf.fabric import Fabric

# locations = '~/github/etcbc'
locations = '/home/oem/text-fabric-data/etcbc'
coreModule = 'bhsa'
sources = [coreModule, 'phono']
# version = '2017'
version = 'c'
tempDir = os.path.expanduser(f'{locations}/{coreModule}/_temp/{version}/r')
tableFile = f'{tempDir}/{coreModule}{version}.txt'
modules = [f'{s}/tf/{version}' for s in sources]

TF = Fabric(locations=locations, modules=modules)
api = TF.load('')
api = TF.load((
    'suffix_person', 'tab', 'trailer', 'trailer_utf8', 'txt', 'typ', 'uvf',
    'vbe', 'vbs', 'verse', 'voc_lex', 'voc_lex_utf8', 'vs', 'vt',
    'distributional_parent', 'functional_parent', 'mother', 'oslots',
))
allFeatures = TF.explore(silent=False, show=True)
loadableFeatures = allFeatures['nodes'] + allFeatures['edges']
del api
api = TF.load(loadableFeatures)
api.makeAvailableIn(globals())
print('done')
def webPipelineSingle(pipeline, version, force=False, kinds={"mql", "mysql"}):
    good = True
    if "mql" in kinds:
        caption(1, "Aggregate MQL for version {}".format(version))
        for key in ["repoOrder"]:
            if key not in pipeline:
                caption(0, "\tERROR: no {} declared in the pipeline".format(key))
                good = False
        if not good:
            return False

        repoOrder = pipeline["repoOrder"].strip().split()
        resultRepo = repoOrder[0]
        # addedRepos = repoOrder[1:]
        resultRepoDir = "{}/{}".format(githubBase, resultRepo)
        thisTempDir = "{}/_temp/{}".format(resultRepoDir, version)
        tempShebanqDir = "{}/shebanq".format(thisTempDir)
        shebanqDir = "{}/shebanq/{}".format(resultRepoDir, version)
        if not os.path.exists(shebanqDir):
            os.makedirs(shebanqDir)

        dbName = "shebanq_etcbc{}".format(version)
        mqlUFile = "{}/{}.mql".format(tempShebanqDir, dbName)
        mqlZFile = "{}/{}.mql.bz2".format(shebanqDir, dbName)
        xmU = os.path.exists(mqlUFile)
        # xmZ = os.path.exists(mqlZFile)

        uptodate = True
        referenceFile = mqlUFile if xmU else mqlZFile
        if not os.path.exists(referenceFile):
            uptodate = False
            caption(0, "\tWork to do because {} does not exist".format(referenceFile))
        else:
            tmR = os.path.getmtime(referenceFile)
            for (i, repo) in enumerate(repoOrder):
                tfxDir = "{}/{}/tf/{}/.tf".format(githubBase, repo, version)
                if not os.path.exists(tfxDir):
                    uptodate = False
                    caption(0, "\tWork to do because the tf in {} is fresh".format(repo))
                    caption(0, "\t\t{}".format(tfxDir))
                    break
                if os.path.getmtime(tfxDir) > tmR:
                    uptodate = False
                    caption(0, "\tWork to do because the tf in {} is recently compiled".format(repo))
                    caption(0, "\t\t{}".format(tfxDir))
                    break

        if uptodate and force:
            caption(0, "\tWork to do because you forced me to!")
            uptodate = False
        if not uptodate:
            caption(1, "Using TF to make an MQL export")
            locations = []
            for (i, repo) in enumerate(repoOrder):
                locations.append("{}/{}/tf/{}".format(githubBase, repo, version))
            TF = Fabric(locations=locations, modules=[""])
            TF.exportMQL(dbName, tempShebanqDir)
        else:
            caption(0, "\tAlready up to date")

        caption(0, "\tbzipping {}".format(mqlUFile))
        caption(0, "\tand delivering as {} ...".format(mqlZFile))
        bzip(mqlUFile, mqlZFile)
        caption(0, "\tDone")

    if "mysql" in kinds:
        caption(1, "Create Mysql passage db for version {}".format(version))
        runNb(pipelineRepo, programDir, "passageFromTf", force=force, VERSION=version)
        caption(0, "\tDone")

    return True
import sys
import unittest

from tf.fabric import Fabric

# LOAD THE TEST CORPUS

TF = Fabric('tf')
api = TF.load('sign name')
F = api.F
S = api.S

# MAKE CUSTOM SETS OF NODES

Sign = set(range(1, F.otype.maxSlot + 1))
Node = set(range(1, F.otype.maxNode + 1))
sets = dict(Sign=Sign, Node=Node)

# RUN A QUERY, OPTIONALLY WITH CUSTOM SETS

def query(template, sets=None):
    return (
        tuple(S.search(template))
        if sets is None
        else tuple(S.search(template, sets=sets))
    )

# DEFINE THE TESTS

relationKey = {
# +
import os

from tf.fabric import Fabric

# +
GH_BASE = os.path.expanduser("~/github")
ORG = "annotation"
REPO = "banks"
FOLDER = "tf"
TF_DIR = f"{GH_BASE}/{ORG}/{REPO}/{FOLDER}"

VERSION = "0.2"

TF_PATH = f"{TF_DIR}/{VERSION}"
TF = Fabric(locations=TF_PATH)
# -

# We ask for a list of all features:

allFeatures = TF.explore(silent=True, show=True)
loadableFeatures = allFeatures["nodes"] + allFeatures["edges"]
loadableFeatures

# We load all features:

api = TF.load(loadableFeatures, silent=False)

# You see that all files are marked with a `T`.
#
# That means that Text-Fabric loads the features by reading the plain text `.tf` files.
if SCRIPT:
    (good, work) = utils.mustRun(
        None, '{}/.tf/{}.tfx'.format(thisTf, newFeatures[0]), force=FORCE)
    if not good:
        stop(good=False)
    if not work:
        stop(good=True)

# # Load existing data

# In[17]:

utils.caption(4, 'Loading relevant features')

TF = Fabric(locations=thisTf, modules=[''])
api = TF.load('book')
api.makeAvailableIn(globals())

nodeFeatures = {}
nodeFeatures['book@la'] = {}

bookNodes = []
for b in F.otype.s('book'):
    bookNodes.append(b)
    nodeFeatures['book@la'][b] = F.book.v(b)

for (langCode, langBookNames) in bookNames.items():
    nodeFeatures['book@{}'.format(langCode)] = dict(zip(bookNodes, langBookNames))

utils.caption(0, '{} book name features created'.format(len(nodeFeatures)))
if getpass.getuser() == 'etien':
    locations = etien_path
elif getpass.getuser() == 'cody':
    locations = cody_path
else:
    locations = {}

if not locations:
    raise Exception('Please add your data paths in bhsa.py line 30.')

for path in locations:
    if not os.path.exists(path):
        raise Exception(
            f'You need an extra datamodule in {os.path.dirname(path)}. '
            f'Do "git pull {locations[path]}" to this location.'
        )

# load TF and BHSA data
TF = Fabric(locations=locations.keys(), modules='2017', silent=True)
api = TF.load('''
    otype language
    book chapter verse
    function domain typ pdp
    kind tree crossref
''', silent=True)
api.makeAvailableIn(globals())  # globalize TF methods

# define book groups & names
lbh_books = ('Song_of_songs', 'Ecclesiastes', 'Esther', 'Daniel', 'Ezra',
             'Nehemiah', '1_Chronicles', '2_Chronicles')
def ingest_french(paths):
    """Match the French data to our dataset."""

    # load the French dataset
    with open(paths['source'], 'r') as infile:
        reader = csv.reader(infile, delimiter='\t')
        french_data = list(reader)

    # load the BHSA Hebrew data for matching the Hebrew text
    TF = Fabric(locations=paths['bhsa'])
    API = TF.load('g_word_utf8')
    F, T, L = API.F, API.T, API.L

    # match the Hebrew verbs in the French data with the
    # Hebrew verbs in BHSA;
    # we treat the ref strings as unique IDs;
    # we use 2 dicts: one to hold ID-to-BHSA-node mappings,
    # another to hold the IDs-to-French data
    french2bhsa = {}
    french2data = {}
    frenchverses = {}
    for row in french_data:

        # parse French data
        wid = row[0]
        hb_txt, hb_lex, hb_tag, hb_prev = row[1:5]
        fr_words, fr_verse = row[5:7]
        bk, ch, vs, sg, wnum = parse_refstring(wid)
        french2data[wid] = {
            'wid': wid,
            'hebrew': hb_txt,
            'hebrew_parse': hb_tag,
            'french': fr_words,
        }

        # look up BHSA data and get the verse node
        tf_book = int2book[bk]
        vrs_node = T.nodeFromSection((tf_book, ch, vs))
        if vrs_node is None:
            raise Exception((tf_book, ch, vs), wid, hb_txt)

        # save the French verse text
        ref_string = str((tf_book, ch, vs))
        frenchverses[ref_string] = fr_verse
        french2data[wid]['ref'] = ref_string

        # get the closest matching word from the verse;
        # NB we iterate over the verse words in reversed order
        # so that if there are 2+ words with equivalent distances,
        # we always end on the one that is first in the verse;
        # the match is then added to a set so that it is not
        # available for subsequent matches
        french2bhsa[wid] = BhsaWord(0, float('inf'))  # initialize with dummy
        matched = set()
        for word_node in reversed(L.d(vrs_node, 'word')):
            if word_node in matched:
                continue
            bhsa_txt = T.text(word_node)
            dist = levdist(bhsa_txt, hb_txt)
            if french2bhsa[wid].dist > dist:
                french2bhsa[wid] = BhsaWord(word_node, dist)
        matched.add(french2bhsa[wid].node)

    # iterate over both french dicts and assemble
    # into one BHSA dict
    bhsa2french = {}
    for wid, bhsa_word in french2bhsa.items():
        bhsa_node = bhsa_word.node
        if bhsa_node != 0:
            bhsa2french[bhsa_node] = french2data[wid]

    # the linking is complete
    with open(paths['out'], 'w') as outfile:
        json.dump(bhsa2french, outfile, indent=2, ensure_ascii=False)
    with open(paths['out_verses'], 'w') as outfile:
        json.dump(frenchverses, outfile, indent=2, ensure_ascii=False)
os.makedirs(thisTempSource)

utils.caption(0, 'bunzipping {} ...'.format(mqlzFile))
utils.bunzip(mqlzFile, mqlFile)
utils.caption(0, 'Done')

if os.path.exists(thisTempTf):
    rmtree(thisTempTf)
os.makedirs(thisTempTf)

# # MQL to Text-Fabric
# Transform the collected information into feature-like data structures, and write it all
# out to `.tf` files.

# In[8]:

TF = Fabric(locations=thisTempTf, silent=True)
TF.importMQL(mqlFile, slotType=slotType, otext=otextInfo, meta=featureMetaData)

# # Rename features
# We rename the features mentioned in the RENAME dictionary.

# In[8]:

if RENAME is None:
    utils.caption(4, 'Rename features: nothing to do')
else:
    utils.caption(4, 'Renaming {} features in {}'.format(len(RENAME), thisTempTf))
    for (srcFeature, dstFeature) in RENAME:
        srcPath = '{}/{}.tf'.format(thisTempTf, srcFeature)
        dstPath = '{}/{}.tf'.format(thisTempTf, dstFeature)
from tf.fabric import Fabric

TF = Fabric('tf', '')

maxSlot = 10
halfSlot = int(round(maxSlot / 2))

otype = {i: 'sign' for i in range(1, maxSlot + 1)}
oslots = {}
name = {i: chr(i + ord('a') - 1) for i in range(1, maxSlot + 1)}

p = 0

# GENERATE NODES OF TYPE PART

# create a part with that name and linked to those slots
def addPart(nm, signs):
    mySlots = set(s for s in signs if 1 <= s <= maxSlot)
    if not mySlots:
        return
    global p
    p += 1
    node = maxSlot + p
    otype[node] = 'part'
    oslots[node] = mySlots
    name[node] = nm
from tf.fabric import Fabric
import collections
import sys

# https://etcbc.github.io/bhsa/features/hebrew/4b/features/comments/g_lex_utf8.html

TF = Fabric(locations='/home/chaim/github/text-fabric-data', modules=['hebrew/etcbc4c'])
# TF = Fabric(locations='c:/josh/text-fabric-data/text-fabric-data', modules=['hebrew/etcbc4c'])
api = TF.load(
    'sp lex g_word g_word_utf8 trailer_utf8 ls typ rela function qere_utf8 qere'
)
api.makeAvailableIn(globals())

F = api.F
T = api.T
C = api.C
L = api.L

# print(sorted(T.formats))


def print_original_words():
    for i in range(1, 12):
        print(api.T.text([i], 'text-orig-full'))

    # for w in F.otype.s('word'):
    #     word, part_of_speech = F.g_word.v(w), F.sp.v(w)
    #     print(word, part_of_speech)
    #     if w == 14:
else:
    utils.caption(0, 'New text formats')
    otextInfo = dict(
        line[1:].split('=', 1) for line in LEX_FORMATS.strip('\n').split('\n'))
    for x in sorted(otextInfo.items()):
        utils.caption(0, '{:<30} = "{}"'.format(*x))

# # Lexicon preparation
# We add lexical data.
# The lexical data will not be added as features of words, but as features of lexemes.
# The lexemes will be added as fresh nodes, of a new type `lex`.

# In[8]:

utils.caption(4, 'Load the existing TF dataset')
TF = Fabric(locations=thisTf, modules=[''])

vocLex = ' g_voc_lex g_voc_lex_utf8 ' if DO_VOCALIZED_LEXEME else ''
api = TF.load('lex lex_utf8 language sp ls gn ps nu st oslots {} {}'.format(
    vocLex, EXTRA_OVERLAP))
api.makeAvailableIn(globals())

# # Text pass
# We map the values in the language feature to standardized ISO values: `arc` and `hbo`.
# We run over all word occurrences, grab the language and lexeme identifier, and create for each
# unique pair a new lexeme node.
#
# We remember the mapping between nodes and lexemes.
#
# This stage does not yet involve the lexical files.

# In[9]:
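# The code cell for this text pass is not included above; the following is only a
# minimal sketch of the step described in the comments, under assumed names:
# `languageFromCode` (mapping of BHSA language values to ISO codes), `nodeFromLex`,
# `lexFromNode`, and `maxNode` are illustrative and not part of the original notebook.

languageFromCode = {'Hebrew': 'hbo', 'Aramaic': 'arc'}

nodeFromLex = {}   # (ISO language, lexeme identifier) -> newly assigned lex node
lexFromNode = {}   # newly assigned lex node -> (ISO language, lexeme identifier)
maxNode = F.otype.maxNode

for w in F.otype.s('word'):
    lang = languageFromCode.get(F.language.v(w), F.language.v(w))
    key = (lang, F.lex.v(w))
    if key not in nodeFromLex:
        maxNode += 1
        nodeFromLex[key] = maxNode
        lexFromNode[maxNode] = key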
# We call up TF and let it look into the directory where the output has to land,
# in this case a subdirectory of the tutorials repo on annotation.

# +
# TF_DIR = os.path.expanduser('~/Downloads/banks/tf')  # if you want it in your Downloads directory instead

BASE = os.path.expanduser('~/github')
ORG = 'annotation'
REPO = 'banks'
RELATIVE = 'tf'
TF_DIR = os.path.expanduser(f'{BASE}/{ORG}/{REPO}/{RELATIVE}')

VERSION = '0.2'
TF_PATH = f'{TF_DIR}/{VERSION}'
TF = Fabric(locations=TF_PATH, silent=True)
# -

# ## TF configuration
#
# A Text-Fabric dataset is a bunch of individual `.tf` files that start with a little bit of metadata and then contain
# a stream of data, typically the values of a single feature for each node or edge in the graph.
#
# We specify the metadata bit by bit.
#
# ### slot type
#
# A crucial design aspect of each TF dataset is its granularity. What are the slots?
#
# Words, morphemes, characters?
#
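# As an illustration of what such a configuration can look like (a sketch only:
# the slot type and the feature names `letters`, `punc`, `title`, `number` are
# assumptions for a word-based corpus, not necessarily the choices made later):

slotType = 'word'  # every slot node represents one word

otext = {
    'fmt:text-orig-full': '{letters}{punc} ',   # how plain text is rendered from slot features
    'sectionTypes': 'book,chapter',             # node types that serve as sections
    'sectionFeatures': 'title,number',          # features that identify those sections
}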
def genTrees(version):
    C = setVersion(version)
    bhsa = C.bhsa
    sp = C.sp
    rela = C.rela
    ptyp = C.ptyp
    ctyp = C.ctyp
    g_word_utf8 = C.g_word_utf8
    tfDir = C.tfDir

    TF = Fabric(locations=f"{GH}/{ORG}", modules=bhsa)
    api = TF.load(f"{sp} {rela} {ptyp} {ctyp} {g_word_utf8} mother")
    E = api.E
    F = api.F
    Fs = api.Fs

    def getTag(node):
        otype = F.otype.v(node)
        tag = TYPE_TABLE[otype]
        if tag == "P":
            tag = Fs(ptyp).v(node)
        elif tag == "C":
            tag = ccrTable[Fs(rela).v(node)]
        isWord = tag == ""
        pos = POS_TABLE[Fs(sp).v(node)] if isWord else None
        slot = node if isWord else None
        text = f'"{Fs(g_word_utf8).v(node)}"' if isWord else None
        return (tag, pos, slot, text, isWord)

    def getTagN(node):
        otype = F.otype.v(node)
        tag = TYPE_TABLE[otype]
        if tag == "P":
            tag = Fs(ptyp).v(node)
        elif tag == "C":
            tag = ccrTable[Fs(rela).v(node)]
        isWord = tag == ""
        if not isWord:
            tag += "{" + str(node) + "}"
        pos = POS_TABLE[Fs(sp).v(node)] if isWord else None
        slot = node if isWord else None
        text = f'"{Fs(g_word_utf8).v(node)}"' if isWord else None
        return (tag, pos, slot, text, isWord)

    treeTypes = ("sentence", "clause", "phrase", "subphrase", "word")
    (rootType, leafType, clauseType, phraseType) = (
        treeTypes[0],
        treeTypes[-1],
        treeTypes[1],
        treeTypes[2],
    )
    ccrTable = dict((c[0], c[1][1]) for c in CCR_INFO.items())
    ccrClass = dict((c[0], c[1][0]) for c in CCR_INFO.items())

    tree = Tree(
        TF,
        otypes=treeTypes,
        phraseType=phraseType,
        clauseType=clauseType,
        ccrFeature=rela,
        ptFeature=ptyp,
        posFeature=sp,
        motherFeature="mother",
    )
    tree.restructureClauses(ccrClass)
    results = tree.relations()
    TF.info("Ready for processing")

    skip = set()
    TF.info("Verifying whether all slots are preserved under restructuring")
    TF.info(f"Expected mismatches: {EXPECTED_MISMATCHES.get(version, '??')}")

    errors = []
    # i = 10
    for snode in F.otype.s(rootType):
        declaredSlots = set(E.oslots.s(snode))
        results = {}
        thisgood = {}
        for kind in ("e", "r"):
            results[kind] = set(
                lt for lt in tree.getLeaves(snode, kind) if F.otype.v(lt) == leafType
            )
            thisgood[kind] = declaredSlots == results[kind]
            # if not thisgood[kind]:
            #     print(f"{kind} D={declaredSlots}\n  L={results[kind]}")
            # i -= 1
            # if i == 0: break
        if False in thisgood.values():
            errors.append((snode, thisgood["e"], thisgood["r"]))
    nErrors = len(errors)
    if nErrors:
        TF.error(f"{len(errors)} mismatches:")
        mine = min(20, len(errors))
        skip |= {e[0] for e in errors}
        for (s, e, r) in errors[0:mine]:
            TF.error(
                (f"{s} embedding: {'OK' if e else 'XX'};"
                 f" restructd: {'OK' if r else 'XX'}"),
                tm=False,
            )
    else:
        TF.info(f"{len(errors)} mismatches")

    TF.info(f"Exporting {rootType} trees to TF")
    s = 0
    chunk = 10000
    sc = 0
    treeData = {}
    treeDataN = {}
    for node in F.otype.s(rootType):
        if node in skip:
            continue
        (treeRep, wordsRep, bSlot) = tree.writeTree(
            node, "r", getTag, rev=False, leafNumbers=True)
        (treeNRep, wordsNRep, bSlotN) = tree.writeTree(
            node, "r", getTagN, rev=False, leafNumbers=True)
        treeData[node] = treeRep
        treeDataN[node] = treeNRep
        s += 1
        sc += 1
        if sc == chunk:
            TF.info(f"{s} trees composed")
            sc = 0
    TF.info(f"{s} trees composed")

    nodeFeatures = dict(tree=treeData, treen=treeDataN)
    metaData = dict(
        tree=dict(
            valueType="str",
            description="penn treebank representation for sentences",
            converter="Dirk Roorda",
            convertor="trees.ipynb",
            url="https://github.com/etcbc/trees/trees.ipynb",
            coreData="BHSA",
            coreVersion=version,
        ),
        treen=dict(
            valueType="str",
            description="penn treebank representation for sentences with node numbers included",
            converter="Dirk Roorda",
            convertor="trees.ipynb",
            url="https://github.com/etcbc/trees/trees.ipynb",
            coreData="BHSA",
            coreVersion=version,
        ),
    )

    TF.info("Writing tree feature to TF")
    TFw = Fabric(locations=tfDir, silent=True)
    TFw.save(nodeFeatures=nodeFeatures, edgeFeatures={}, metaData=metaData)
    stop(good=False)
if not work:
    stop(good=True)

# # Loading the feature data
#
# We load the features we need from the BHSA core database and from the valence module,
# as far as generated by the
# [enrich](https://github.com/ETCBC/valence/blob/master/programs/enrich.ipynb) notebook.

# In[7]:

# In[14]:

utils.caption(4, "Load the existing TF dataset")
TF = Fabric(locations=[coreTf, thisTf], modules=[""])

# We instruct the API to load data.

# In[8]:

# In[15]:

api = TF.load("""
    function rela typ
    g_word_utf8 trailer_utf8
    lex prs uvf sp pdp ls vs vt
    nametype gloss
    book chapter verse label number
    s_manual f_correction
    valence predication grammatical original lexical semantic
    mother