def copyServerSingle(
    pipeline, user, server, remoteDir, version, kinds=frozenset({"mql", "mysql"})
):
    """Copy the database dumps for one data version to a server via scp.

    pipeline: dict with at least a "repoOrder" key (space-separated repo names;
        the first repo holds the aggregated results).
    user, server, remoteDir: scp destination ``user@server:remoteDir``.
    version: data version whose dumps are sent.
    kinds: which dumps to send — "mql" (bzipped MQL dump) and/or "mysql"
        (gzipped passage SQL dump).
        BUG FIX: the default was a mutable set literal; a frozenset avoids the
        shared-mutable-default pitfall while accepting the same membership tests.

    Returns True if every requested scp succeeded.
    """
    repoOrder = pipeline["repoOrder"].strip().split()
    resultRepo = repoOrder[0]
    dbDir = "{}/{}/shebanq/{}".format(githubBase, resultRepo, version)
    dbFile = "shebanq_etcbc{}.mql.bz2".format(version)
    pdbFile = "shebanq_passage{}.sql.gz".format(version)
    address = "{}@{}:{}".format(user, server, remoteDir)
    good = True
    for theFile in (dbFile, pdbFile):
        # Skip the files whose kind was not requested.
        if theFile == dbFile and "mql" not in kinds:
            continue
        if theFile == pdbFile and "mysql" not in kinds:
            continue
        if theFile == dbFile:
            caption(1, "Sending MQL database for version {} to server".format(version))
        else:
            caption(
                1, "Sending passage database for version {} to server".format(version)
            )
        caption(0, "\t{}".format(theFile))
        caption(0, "\tscp {}/{} {}/{}".format(dbDir, theFile, address, theFile))
        if not run("scp {}/{} {}/{}".format(dbDir, theFile, address, theFile)):
            good = False
        caption(0, "\tdone")
    return good
def __repr__(self):
    """Return a readable multi-section dump of this object: imports (if any),
    the statechart, then actions and transitions (if any)."""
    parts = []
    if len(self.importations) > 0:
        parts.append(utils.caption("Imports"))
        parts.extend(str(imp) + "\n" for imp in self.importations)
        parts.append("\n")
    parts.append(utils.caption("Statechart"))
    parts.append(self.stateChart() + "\n")
    if len(self.actions) > 0:
        parts.append(utils.caption("Actions"))
        parts.extend(str(act) + "\n" for act in self.actions)
        parts.append("\n")
    if len(self.transitions) > 0:
        parts.append(utils.caption("Transitions"))
        parts.extend(str(tr) + "\n" for tr in self.transitions)
        parts.append("\n")
    return "".join(parts)
def showParagraphs(verseNode):
    """Print, one line per clause atom of *verseNode*, its instruction code,
    paragraph number, and text (as continuation lines)."""
    for clauseAtom in L.d(verseNode, otype='clause_atom'):
        instruction = F.instruction.v(clauseAtom)
        paragraph = F.pargr.v(clauseAtom)
        text = T.text(L.d(clauseAtom, otype='word'))
        line = '\t\t{:<3} {:>12} {}'.format(instruction, paragraph, text)
        utils.caption(0, line, continuation=True)
def showLex(w):
    """Print a summary line for lexeme node *w* (language, lexeme, number of
    occurrences) followed by all its feature values, sorted by feature name."""
    featureValues = {feat: Fs(feat).v(w) for feat in features}
    nOccurrences = len(L.d(w, otype='word'))
    utils.caption(
        0,
        '\t{} - {} - {}x'.format(F.language.v(w), F.lex.v(w), nOccurrences),
    )
    for feat in sorted(featureValues):
        utils.caption(0, '\t\t{:<15} = {}'.format(feat, featureValues[feat]))
def runNb(repo, dirName, nb, force=False, **parameters):
    """Convert notebook *nb* in ``repo/dirName`` to a Python script and run it.

    The script is executed with ``exec`` in a namespace seeded with SCRIPT,
    FORCE, NAME and the supplied parameters; the script is expected to signal
    its status by raising SystemExit (exit code 0 means success).
    Returns True on success.
    """
    caption(3, "Run notebook [{}/{}] with parameters:".format(repo, nb))
    for (param, value) in sorted(parameters.items()):
        caption(0, "\t{:<20} = {}".format(param, value))
    location = "{}/{}/{}".format(githubBase, repo, dirName)
    nbFile = "{}/{}.ipynb".format(location, nb)
    pyFile = "{}/{}.py".format(location, nb)
    # Export the notebook (nbformat v4) to a plain Python script on disk.
    nbObj = nbformat.read(nbFile, 4)
    pyScript = py.from_notebook_node(nbObj)[0]
    with open(pyFile, "w") as s:
        s.write(pyScript)
    os.chdir(location)
    good = True
    with open(pyFile) as s:
        # NOTE(review): seeding the exec namespace by writing into locals() is
        # CPython-specific — extra keys survive only because they are not real
        # fast locals. Works here, but fragile; confirm on the targeted
        # Python version.
        locals()["SCRIPT"] = True
        locals()["FORCE"] = force
        locals()["NAME"] = repo
        for (param, value) in parameters.items():
            locals()[param] = value
        try:
            exec(s.read(), locals())
        except SystemExit as inst:
            # The script communicates success/failure via sys.exit(code).
            good = inst.args[0] == 0
    caption(0, "{} {}".format("SUCCESS" if good else "FAILURE", nb))
    caption(3, "[{}/{}]".format(repo, nb), good=good)
    return good
def run(cmd):
    """Run *cmd* in a shell, echo its stdout then stderr via caption,
    and return True iff the exit code is 0.

    BUG FIX: the original read stdout to EOF and only then stderr; if the
    child filled the stderr pipe buffer first, both processes deadlocked.
    communicate() drains both pipes concurrently.
    """
    p = Popen(
        [cmd],
        stdin=PIPE,
        stdout=PIPE,
        stderr=PIPE,
        shell=True,
        universal_newlines=True,
    )
    (out, err) = p.communicate()
    # keepends=True preserves the trailing newline each caption call got
    # when iterating the pipe line by line.
    for line in out.splitlines(keepends=True):
        caption(0, line)
    for line in err.splitlines(keepends=True):
        caption(0, line)
    return p.returncode == 0
def checkRepo(repo, repoConfig, force=False, **parameters):
    """Validate the task items of *repoConfig*: each item must name a task,
    and every standard parameter must be supplied. Errors are reported via
    caption; returns True when everything checks out."""
    ok = True
    for entry in repoConfig:
        taskName = entry.get("task", None)
        if taskName is None:
            caption(0, "ERROR: missing task name in item {}".format(entry))
            ok = False
        for param in standardParams:
            if param in parameters:
                continue
            caption(
                0,
                "ERROR: {} needs parameter {} which is not supplied".format(
                    taskName,
                    param,
                ),
            )
            ok = False
    return ok
def runRepo(repo, repoConfig, force=False, **parameters):
    """Run all configured tasks (notebooks) of *repo* in order, stopping at
    the first failure. Tasks may override parameters and may be omitted for
    specific versions. Returns True when every executed task succeeded."""
    caption(2, "Make repo [{}]".format(repo))
    # Ship the shared utils.py from the pipeline repo into the target repo.
    srcUtils = "{}/{}/{}".format(githubBase, pipelineRepo, utilsScript)
    dstUtils = "{}/{}/{}".format(githubBase, repo, utilsScript)
    copy(srcUtils, dstUtils)
    good = True
    for item in repoConfig:
        task = item["task"]
        omittedVersions = item.get("omit", set())
        # Start from the caller's parameters, then apply task-level overrides.
        runParams = dict(parameters)
        runParams.update(item.get("params", {}))
        version = runParams.get("VERSION", "UNKNOWN")
        if version in omittedVersions:
            caption(3, "[{}/{}] skipped in version [{}]".format(repo, task, version))
            continue
        good = runNb(repo, programDir, task, force=force, **runParams)
        if not good:
            break
    caption(2, "[{}]".format(repo), good=good)
    return good
def runRepos(repoOrder, repoConfig, repos=None, force=False, **parameters):
    """Validate and then run the repos listed in *repoOrder* (space-separated),
    optionally restricted to *repos*. All selected repos are validated first;
    nothing runs unless all of them pass. Returns True on overall success.
    """
    good = True
    doRepos = []
    for repo in repoOrder.strip().split():
        if repos is not None and repo not in repos:
            caption(1, f"Skipping {repo} because it is not in the repos parameter")
            continue
        if repo not in repoConfig:
            caption(0, "ERROR: missing configuration for repo {}".format(repo))
            good = False
            # BUG FIX: without this continue, the checkRepo call below raised
            # KeyError on repoConfig[repo] instead of reporting the error.
            continue
        if not checkRepo(repo, repoConfig[repo], **parameters):
            good = False
        if good:
            doRepos.append(repo)
    if not good:
        return False
    for repo in doRepos:
        good = runRepo(repo, repoConfig[repo], force=force, **parameters)
        if not good:
            break
    return good
def showExceptions(cases):
    """Report inconsistent (language, lexeme) cases via caption, showing at
    most 10 and summarizing the remainder; report full consistency when
    *cases* is empty."""
    total = len(cases)
    if total == 0:
        utils.caption(0, '\tFully consistent')
        return
    utils.caption(0, '\t{} inconsistent cases'.format(total))
    maxShown = 10
    for (pos, key) in enumerate(cases):
        if pos == maxShown:
            utils.caption(0, '\t\t...and {} more.'.format(total - maxShown))
            break
        (lan, lex) = key
        values = ', '.join(sorted(cases[key]))
        utils.caption(0, '\t\t{}-{}: {}'.format(lan, lex, values))
def showKq(w):
    """Print the ketiv/qere forms of word node *w*: Hebrew and transcription,
    each with its trailing material, in both plain and qere variants."""
    rows = (
        ('hebrew', F.g_word_utf8.v(w) + F.trailer_utf8.v(w)),
        ('hebrew qere', F.qere_utf8.v(w) + F.qere_trailer_utf8.v(w)),
        ('transcription', F.g_word.v(w) + F.trailer.v(w)),
        ('transcription qere', F.qere.v(w) + F.qere_trailer.v(w)),
    )
    for (label, text) in rows:
        utils.caption(0, '{:<20} {}'.format(label, text))
def readLex(lan):
    """Parse the lexicon file for language *lan* into a dict.

    Each non-comment line is ``"entry" feat=val:feat=val:...``; escaped colons
    (``\\:``) inside values are protected during splitting. Bare numeric
    features become the pseudo-feature ``_n``. For verbs, a leading ``to ``
    is stripped from the gloss. Parse errors are collected and reported as a
    single caption at the end.

    Returns: dict mapping entry -> {feature: value}.
    """
    errors = []
    lexItems = {}
    ln = 0
    with open(lexFile[lan], encoding='utf-8') as lexInfile:
        for line in lexInfile:
            ln += 1
            line = line.rstrip()
            line = line.split('#')[0]  # strip comments
            if line == '':
                continue
            (entry, featurestr) = line.split(sep=None, maxsplit=1)
            entry = entry.strip('"')
            if entry in lexItems:
                errors.append('duplicate lexical entry {} in line {}.\n'.format(
                    entry, ln))
                continue
            featurestr = featurestr.strip(':')
            # Protect escaped colons with a placeholder before splitting.
            featurestr = featurestr.replace('\\:', chr(254))
            featurelst = featurestr.split(':')
            features = {}
            for feature in featurelst:
                comps = feature.split('=', maxsplit=1)
                if len(comps) == 1:
                    if feature.strip().isnumeric():
                        # A bare number is stored under the pseudo-feature _n.
                        comps = ('_n', feature.strip())
                    else:
                        errors.append(
                            'feature without value for lexical entry {} in line {}: {}\n'
                            .format(
                                entry,
                                ln,
                                feature,
                            ))
                        continue
                (key, value) = comps
                value = value.replace(chr(254), ':')  # restore escaped colons
                if key in features:
                    errors.append(
                        'duplicate feature for lexical entry {} in line {}: {}={}\n'
                        .format(
                            entry,
                            ln,
                            key,
                            value,
                        ))
                    continue
                features[key] = value.replace('\\', '/')
            # Normalize verb glosses: drop a leading "to ".
            if 'sp' in features and features['sp'] == 'verb':
                if 'gl' in features:
                    gloss = features['gl']
                    if gloss.startswith('to '):
                        features['gl'] = gloss[3:]
            lexItems[entry] = features
    nErrors = len(errors)
    if nErrors:
        # BUG FIX: the format string has three placeholders (language, count,
        # plural suffix) but only two arguments were supplied, so reporting
        # errors itself raised IndexError.
        utils.caption(
            0,
            'Lexicon [{}]: {} error{}'.format(lan, nErrors, '' if nErrors == 1 else 's'))
    return lexItems
# In[7]:

# Provenance metadata stamped into the generated TF features.
provenanceMetadata = dict(
    dataset='BHSA',
    version=VERSION,
    datasetName='Biblia Hebraica Stuttgartensia Amstelodamensis',
    author='Eep Talstra Centre for Bible and Computer',
    encoders='Constantijn Sikkel (QDF), and Dirk Roorda (TF)',
    website='https://shebanq.ancient-data.org',
    email='*****@*****.**',
)

lexType = 'lex'

# BUG FIX: `LEX_FORMATS is ''` tested object identity, which is
# implementation-dependent for strings (and a SyntaxWarning on modern
# Python); string equality is the intended test.
if LEX_FORMATS == '':
    utils.caption(0, 'No additional text formats provided')
    otextInfo = {}
else:
    utils.caption(0, 'New text formats')
    otextInfo = dict(
        line[1:].split('=', 1)
        for line in LEX_FORMATS.strip('\n').split('\n')
    )
    for x in sorted(otextInfo.items()):
        utils.caption(0, '{:<30} = "{}"'.format(*x))

# # Lexicon preparation
# We add lexical data.
# The lexical data will not be added as features of words, but as features of lexemes.
# The lexemes will be added as fresh nodes, of a new type `lex`.

# In[8]:
# In[6]: provenanceMetadata = dict( dataset='BHSA', datasetName='Biblia Hebraica Stuttgartensia Amstelodamensis', version=VERSION, author='Eep Talstra Centre for Bible and Computer', encoders='Constantijn Sikkel (QDF), and Dirk Roorda (TF)', website='https://shebanq.ancient-data.org', email='*****@*****.**', ) # In[7]: utils.caption(4, 'Load the existing TF dataset') TF = Fabric(locations=thisTf, modules=['']) api = TF.load('label number') api.makeAvailableIn(globals()) # # Clause atom identifiers in .px # We must map the way the clause_atoms are identified in the `.px` files # to nodes in TF. # In[8]: utils.caption(0, '\tLabeling clause_atoms') labelNumberFromNode = {} nodeFromLabelNumber = {} for n in N():
def importLocalSingle(pipeline, version, kinds=frozenset({"mql", "mysql"})):
    """Import the databases for *version* into the local MySQL/Emdros setup.

    kinds selects what to import: "mql" (drop + reimport the MQL database)
    and/or "mysql" (the passage database).
    BUG FIXES:
      * dbDir was only computed inside the "mql" branch, so calling with
        kinds={"mysql"} alone raised NameError — it is now computed up front;
      * the first, immediately-overwritten assignments of dbDir/dbName were
        dead code and are removed;
      * the mutable default set is replaced by a frozenset.

    Returns True on success, False as soon as any step fails.
    """
    good = True
    repoOrder = pipeline["repoOrder"].strip().split()
    resultRepo = repoOrder[0]
    # Both kinds read their dump files from the _temp shebanq directory.
    dbDir = "{}/{}/_temp/{}/shebanq".format(githubBase, resultRepo, version)
    if "mql" in kinds:
        caption(1, "Import MQL db for version {} locally".format(version))
        dbName = "shebanq_etcbc{}".format(version)
        caption(1, "Drop database {}".format(dbName))
        if not run('mysql -u root -e "drop database if exists {};"'.format(dbName)):
            caption(1, "Drop database failed for {}".format(dbName), good=False)
            return False
        caption(1, "Importing MQL {} ...".format(dbName))
        if not run("mql -n -b m -u root -e UTF8 < {}/{}.mql".format(dbDir, dbName)):
            caption(1, "Import mql failed for {}".format(dbName), good=False)
            return False
        caption(1, "Imported MQL {}".format(dbName))
    if "mysql" in kinds:
        caption(1, "Importing passage db for version {} ...".format(version))
        pdbName = "shebanq_passage{}".format(version)
        if not run("mysql -u root < {}/{}.sql".format(dbDir, pdbName)):
            caption(1, "Import mysql failed for {}".format(pdbName), good=False)
            return False
        caption(1, "Imported passage db for version {}".format(version))
    return good
# Bail out early in script mode: abort on failure, or stop successfully
# when there is no work to do (both flags come from an earlier mustRun check).
if not good:
    stop(good=False)
if not work:
    stop(good=True)

# In[5]:

# Make sure the output directories exist.
for path in (thisMysql, thisTempMysql):
    if not os.path.exists(path):
        os.makedirs(path)

# # Collect
#
# We collect the data from the TF repos.

# In[6]:

utils.caption(4, 'Loading relevant features')

# The older versions (4, 4b) use different feature names for qere and
# lexicon entries; select the right set for this VERSION.
if VERSION in {'4', '4b'}:
    QERE = 'g_qere_utf8'
    QERE_TRAILER = 'qtrailer_utf8'
    ENTRY = 'g_entry'
    ENTRY_HEB = 'g_entry_heb'
    PHONO_TRAILER = 'phono_sep'
else:
    QERE = 'qere_utf8'
    QERE_TRAILER = 'qere_trailer_utf8'
    ENTRY = 'voc_lex'
    ENTRY_HEB = 'voc_lex_utf8'
    PHONO_TRAILER = 'phono_trailer'

TF = Fabric(locations=[thisRepo, phonoRepo], modules=[tfDir])
'2017': ''' @fmt:text-orig-full={qere_utf8/g_word_utf8}{qere_trailer_utf8/trailer_utf8} @fmt:text-orig-full-ketiv={g_word_utf8}{trailer_utf8} @fmt:text-trans-full={qere/g_word}{qere_trailer/trailer} @fmt:text-trans-full-ketiv={g_word}{trailer}''', '2016': ''' @fmt:text-orig-full={qere_utf8/g_word_utf8}{qere_trailer_utf8/trailer_utf8} @fmt:text-orig-full-ketiv={g_word_utf8}{trailer_utf8} @fmt:text-trans-full={qere/g_word}{qere_trailer/trailer} @fmt:text-trans-full-ketiv={g_word}{trailer}''', } thisOtext = oText.get(VERSION, '') if thisOtext is '': utils.caption(0, 'No additional text formats provided') otextInfo = {} else: utils.caption(0, 'New text formats') otextInfo = dict(line[1:].split('=', 1) for line in thisOtext.strip('\n').split('\n')) for x in sorted(otextInfo.items()): utils.caption(0, '{:<30} = "{}"'.format(*x)) # In[7]: utils.caption(4, 'Load the existing TF dataset') TF = Fabric(locations=thisTf, modules=['']) api = TF.load('label g_word g_cons trailer_utf8') api.makeAvailableIn(globals())
# In script mode, check whether the "tree" feature file is outdated;
# stop on failure, or stop successfully when nothing needs to be done.
if SCRIPT:
    (good, work) = utils.mustRun(
        None, "{}/.tf/{}.tfx".format(thisTf, "tree"), force=FORCE
    )
    if not good:
        stop(good=False)
    if not work:
        stop(good=True)

# # Load the TF data

# In[5]:

utils.caption(4, "Load the existing TF dataset")
TF = Fabric(locations=coreTf, modules=[""])

# # Load data
# We load the some features of the
# [BHSA](https://github.com/etcbc/bhsa) data.
# See the [feature documentation](https://etcbc.github.io/bhsa/features/hebrew/2017/0_home.html) for more info.

# In[6]:

# Version 3 used long descriptive feature names; later versions use short ones.
sp = "part_of_speech" if VERSION == "3" else "sp"
rela = "clause_constituent_relation" if VERSION == "3" else "rela"
ptyp = "phrase_type" if VERSION == "3" else "typ"
ctyp = "clause_atom_type" if VERSION == "3" else "typ"
# Stop early in script mode: abort on failure, or finish successfully
# when the target is already up to date.
if not good:
    stop(good=False)
if not work:
    stop(good=True)

# # Loading the feature data
#
# We load the features we need from the BHSA core database and from the valence module,
# as far as generated by the
# [enrich](https://github.com/ETCBC/valence/blob/master/programs/enrich.ipynb) notebook.

# In[7]:

# In[14]:

utils.caption(4, "Load the existing TF dataset")
TF = Fabric(locations=[coreTf, thisTf], modules=[""])

# We instruct the API to load data.

# In[8]:

# In[15]:

# NOTE: the feature list in this load call continues beyond this chunk.
api = TF.load("""
    function rela typ
    g_word_utf8 trailer_utf8
    lex prs uvf sp pdp ls vs vt
    nametype gloss
    book chapter verse label number
    s_manual f_correction
    valence predication grammatical original lexical semantic
def runVersion(pipeline, repos=None, version=None, force=False):
    """Run the whole pipeline for one data *version*.

    Validates that the pipeline declares defaults, versions, repoOrder and
    repoConfig; resolves the version (falling back to the declared default);
    assembles the parameter values from standard params, defaults, and
    version-specific settings; then delegates to runRepos.
    Returns True on success.

    BUG FIX: one error message contained a stray, never-filled ``{}``
    placeholder which was printed literally; it is removed. A dead
    ``versionInfo`` assignment inside the validation loop (immediately
    recomputed after validation) is also removed.
    """
    caption(1, "Make version [{}]".format(version))
    good = True
    for key in ("defaults", "versions", "repoOrder", "repoConfig"):
        if key not in pipeline:
            if key == "defaults":
                if version is None:
                    caption(
                        0,
                        "ERROR: no version given and no known default section in pipeline",
                    )
                    good = False
            else:
                caption(0, "ERROR: no {} declared in the pipeline".format(key))
                good = False
        elif key == "defaults":
            if version is None:
                if "VERSION" not in pipeline["defaults"]:
                    caption(
                        0,
                        "ERROR: no version given and no default version specified in pipeline",
                    )
                    good = False
                else:
                    version = pipeline["defaults"]["VERSION"]
        elif key == "versions":
            if version not in pipeline["versions"]:
                if version is not None:
                    caption(
                        0, "ERROR: version {} not declared in pipeline".format(version)
                    )
                good = False
    if not good:
        return False
    defaults = pipeline.get("defaults", {})
    repoOrder = pipeline["repoOrder"]
    repoConfig = pipeline["repoConfig"]
    versionInfo = pipeline["versions"][version]
    # Resolve every standard parameter: VERSION comes from the version itself,
    # the rest from the version section with the defaults as fallback.
    paramValues = dict()
    for param in standardParams:
        if param == "VERSION":
            value = version
        else:
            value = versionInfo.get(param, defaults.get(param, None))
        if value is None:
            caption(0, "ERROR: no value or default value for {}".format(param))
            good = False
        else:
            paramValues[param] = value
    # Add non-standard parameters; version-specific values win over defaults.
    for (param, value) in defaults.items():
        if param in standardParams:
            continue
        paramValues[param] = value
    for (param, value) in versionInfo.items():
        if param in standardParams:
            continue
        paramValues[param] = value
    if not good:
        return False
    good = runRepos(repoOrder, repoConfig, repos=repos, force=force, **paramValues)
    caption(1, "[{}]".format(version), good=good)
    return good
# Locations: the repos live under ~/github/etcbc; derived paths for the
# temp and the versioned TF output directories.
repoBase = os.path.expanduser('~/github/etcbc')
thisRepo = '{}/{}'.format(repoBase, CORE_NAME)
thisTemp = '{}/_temp/{}'.format(thisRepo, VERSION)
thisTempTf = '{}/tf'.format(thisTemp)
thisTf = '{}/tf/{}'.format(thisRepo, VERSION)

# # Collect
#
# We collect the book names.

# In[15]:

utils.caption(4, 'Book names')

# Metadata stamped into the generated TF features ('' = all features).
metaData = {
    '': dict(
        dataset='BHSA',
        version=VERSION,
        datasetName='Biblia Hebraica Stuttgartensia Amstelodamensis',
        author='Eep Talstra Centre for Bible and Computer',
        provenance='book names from wikipedia and other sources',
        encoders='Dirk Roorda (TF)',
        website='https://shebanq.ancient-data.org',
        email='*****@*****.**',
    ),
}
# In[5]: for path in (thisMysql, thisTempMysql): if not os.path.exists(path): os.makedirs(path) # # Collect # # We collect the data from the TF repos. # In[6]: # In[ ]: utils.caption(4, "Loading relevant features") # In[ ]: if VERSION in {"4", "4b"}: QERE = "g_qere_utf8" NO_QERE = "" QERE_TRAILER = "qtrailer_utf8" ENTRY = "g_entry" ENTRY_HEB = "g_entry_heb" PHONO_TRAILER = "phono_sep" LANGUAGE = "language" else: QERE = "qere_utf8" NO_QERE = None QERE_TRAILER = "qere_trailer_utf8"
# In script mode, only run when the first new feature file is outdated.
if SCRIPT:
    (good, work) = utils.mustRun(
        None, '{}/.tf/{}.tfx'.format(thisTf, newFeatures[0]), force=FORCE)
    if not good:
        stop(good=False)
    if not work:
        stop(good=True)

# # Collect
#
# We collect the statistics.

# In[6]:

utils.caption(4, 'Loading relevant features')
TF = Fabric(locations=thisTf, modules=[''])
api = TF.load('{} {} {}'.format(LANG_FEATURE, LEX_FEATURE, OCC_FEATURE))
api.makeAvailableIn(globals())

# Does this dataset already have lexeme nodes?
hasLex = 'lex' in set(F.otype.all)

# In[7]:

utils.caption(0, 'Counting occurrences')
# Nested counters of lexeme and occurrence frequencies
# (the dict literal continues beyond this chunk).
wstats = {
    'freqs': {
        'lex': collections.defaultdict(lambda: collections.Counter()),
        'occ': collections.defaultdict(lambda: collections.Counter()),
    },
def webPipelineSingle(pipeline, version, force=False, kinds=frozenset({"mql", "mysql"})):
    """Produce the web-facing deliverables for one data *version*.

    For "mql": aggregate the TF data of all repos into one MQL export (only
    when any repo's TF is newer than the existing export, or when *force*),
    then bzip and deliver it. For "mysql": generate the passage database by
    running the passageFromTf notebook.
    BUG FIX: the mutable default set is replaced by a frozenset.
    Returns True unless the pipeline configuration is incomplete.
    """
    good = True
    if "mql" in kinds:
        caption(1, "Aggregate MQL for version {}".format(version))
        for key in ["repoOrder"]:
            if key not in pipeline:
                caption(0, "\tERROR: no {} declared in the pipeline".format(key))
                good = False
        if not good:
            return False
        repoOrder = pipeline["repoOrder"].strip().split()
        resultRepo = repoOrder[0]
        # addedRepos = repoOrder[1:]
        resultRepoDir = "{}/{}".format(githubBase, resultRepo)
        thisTempDir = "{}/_temp/{}".format(resultRepoDir, version)
        tempShebanqDir = "{}/shebanq".format(thisTempDir)
        shebanqDir = "{}/shebanq/{}".format(resultRepoDir, version)
        if not os.path.exists(shebanqDir):
            os.makedirs(shebanqDir)
        dbName = "shebanq_etcbc{}".format(version)
        mqlUFile = "{}/{}.mql".format(tempShebanqDir, dbName)
        mqlZFile = "{}/{}.mql.bz2".format(shebanqDir, dbName)
        xmU = os.path.exists(mqlUFile)
        # Freshness check: compare the export (uncompressed if present,
        # else the compressed delivery) against each repo's TF directory.
        uptodate = True
        referenceFile = mqlUFile if xmU else mqlZFile
        if not os.path.exists(referenceFile):
            uptodate = False
            caption(0, "\tWork to do because {} does not exist".format(referenceFile))
        else:
            tmR = os.path.getmtime(referenceFile)
            for (i, repo) in enumerate(repoOrder):
                tfxDir = "{}/{}/tf/{}/.tf".format(githubBase, repo, version)
                if not os.path.exists(tfxDir):
                    uptodate = False
                    caption(
                        0, "\tWork to do because the tf in {} is fresh".format(repo)
                    )
                    caption(0, "\t\t{}".format(tfxDir))
                    break
                if os.path.getmtime(tfxDir) > tmR:
                    uptodate = False
                    caption(
                        0,
                        "\tWork to do because the tf in {} is recently compiled".format(
                            repo
                        ),
                    )
                    caption(0, "\t\t{}".format(tfxDir))
                    break
        if uptodate and force:
            caption(0, "\tWork to do because you forced me to!")
            uptodate = False
        if not uptodate:
            caption(1, "Using TF to make an MQL export")
            locations = []
            for (i, repo) in enumerate(repoOrder):
                locations.append("{}/{}/tf/{}".format(githubBase, repo, version))
            TF = Fabric(locations=locations, modules=[""])
            TF.exportMQL(dbName, tempShebanqDir)
        else:
            caption(0, "\tAlready up to date")
        caption(0, "\tbzipping {}".format(mqlUFile))
        caption(0, "\tand delivering as {} ...".format(mqlZFile))
        bzip(mqlUFile, mqlZFile)
        caption(0, "\tDone")
    if "mysql" in kinds:
        caption(1, "Create Mysql passage db for version {}".format(version))
        runNb(pipelineRepo, programDir, "passageFromTf", force=force, VERSION=version)
        caption(0, "\tDone")
    return True
@fmt:text-trans-plain={surface_consonants}
@sectionFeatures=book,chapter,verse
@sectionTypes=book,chapter,verse
''',
}

# The next function selects the proper otext material, falling back on a default if nothing
# appropriate has been specified in `oText`.

# In[6]:

# NOTE: the `is` test below is deliberate: oText.get returns the very object
# oText[''] when VERSION is absent, so identity distinguishes "fell back to
# the default" from a version whose value merely equals the default.
thisOtext = oText.get(VERSION, oText[''])
if thisOtext is oText['']:
    utils.caption(
        0,
        'WARNING: no otext feature info provided, using a meager default value'
    )
    otextInfo = {}
else:
    utils.caption(0, 'INFO: otext feature information found')
    otextInfo = dict(
        line[1:].split('=', 1)
        for line in thisOtext.strip('\n').split('\n')
    )
    for x in sorted(otextInfo.items()):
        utils.caption(0, '\t{:<20} = "{}"'.format(*x))

# # Overview
#
# The program has several stages:
#
# 1. **prepare** the source (utils.bunzip if needed)
# 1. **convert** convert the MQL file into a text-fabric dataset
def copyVersion(pipeline, fromVersion, toVersion):
    """Copy the data of *fromVersion* to *toVersion* across all repos.

    For every repo in repoOrder, each configured data directory is removed at
    the destination (if present) and copied over from the source version.
    For "tf" directories the feature metadata is updated to the new version.
    Returns False when the pipeline lacks required keys, None otherwise."""
    caption(1, "Copy version {} ==> {}".format(fromVersion, toVersion))
    missingKeys = [k for k in ("repoOrder", "repoDataDirs") if k not in pipeline]
    for key in missingKeys:
        caption(0, "ERROR: no {} declared in the pipeline".format(key))
    if missingKeys:
        return False
    for repo in pipeline["repoOrder"].strip().split():
        caption(2, "Repo {}".format(repo))
        if repo not in pipeline["repoDataDirs"]:
            caption(0, "Not specified which data directories I should copy over")
            continue
        for dataDir in pipeline["repoDataDirs"][repo].strip().split():
            srcDir = "{}/{}/{}/{}".format(githubBase, repo, dataDir, fromVersion)
            dstDir = "{}/{}/{}/{}".format(githubBase, repo, dataDir, toVersion)
            caption(
                0,
                "\tCopy {}/{} ==> {}/{}".format(
                    dataDir, fromVersion, dataDir, toVersion
                ),
            )
            # Clear the destination so copytree starts from a clean slate.
            if os.path.exists(dstDir):
                caption(0, "\t\tremoving existing {}/{}".format(dataDir, toVersion))
                rmtree(dstDir)
            else:
                caption(0, "\t\tno existing {}/{}".format(dataDir, toVersion))
            if not os.path.exists(srcDir):
                caption(0, "\t\tNo data found in {}/{}".format(dataDir, fromVersion))
                continue
            caption(
                0,
                "\t\tputting data in place from {}/{}".format(dataDir, fromVersion),
            )
            copytree(srcDir, dstDir)
            if dataDir == "tf":
                caption(
                    0,
                    "\t\tadapting version in metadata of tf features to {}".format(
                        toVersion
                    ),
                )
                updateFeatures(dstDir, toVersion)
        caption(2, "Repo {} done".format(repo))
    caption(1, "Version {} ==> {} copied".format(fromVersion, toVersion))