Example #1
def loadTf():
    TF = Fabric(locations=[f'{TF_PATH}/{VERSION}'])
    allFeatures = TF.explore(silent=True, show=True)
    loadableFeatures = allFeatures['nodes'] + allFeatures['edges']
    api = TF.load(loadableFeatures)
    if api:
        print(f'max node = {api.F.otype.maxNode}')
        print(api.F.root.freqList()[0:20])
Example #2
def makeTf():
    tfPathV = f'{TF_PATH}/{VERSION}'
    if os.path.exists(tfPathV):
        rmtree(tfPathV)
    TF = Fabric(locations=[tfPathV])
    TF.save(nodeFeatures=nodeFeatures,
            edgeFeatures=edgeFeatures,
            metaData=metaData)
Example #3
def loadTf():
    TF = Fabric(locations=[OUT_DIR])
    allFeatures = TF.explore(silent=True, show=True)
    loadableFeatures = allFeatures['nodes'] + allFeatures['edges']
    api = TF.load(loadableFeatures, silent=False)
    if api:
        print(f'max node = {api.F.otype.maxNode}')
        print(api.F.root.freqList()[0:20])
Example #4
def loadTf(outDir):
    TF = Fabric(locations=[outDir])
    allFeatures = TF.explore(silent=True, show=True)
    loadableFeatures = allFeatures["nodes"] + allFeatures["edges"]
    api = TF.load(loadableFeatures, silent=False)
    if api:
        print(f"max node = {api.F.otype.maxNode}")
        print("Frequencies of words")
        for (word, n) in api.F.letters.freqList()[0:20]:
            print(f"{n:>6} x {word}")
Example #5
def gather(locations, modules):
    TF = Fabric(locations=locations, modules=modules, silent=True)
    api = TF.load(FEATURES, silent=True)

    for node in api.F.otype.s('book'):
        book = api.T.sectionFromNode(node)[0]
        print(book)
        dump_book(api, book)

    with open(os.path.join(DATADIR, 'verse_nodes.pkl'), 'wb') as f:
        pickle.dump(VERSE_NODES, f)
Example #6
def convert(source, ocred, pages, versionTf):
    global pageNums
    global SRC_FILE
    global TYPE_MAP
    global HAS_TOC
    global OCRED
    global U
    global VERSION_TF

    U = UChar()

    pageNums = parseNums(pages)

    workInfo = WORKS[source]
    dest = getTfDest(source, versionTf)
    (SRC_FILE, OCRED) = getFile(source, ocred)
    HAS_TOC = workInfo.get("toc", False)
    TYPE_MAP = TYPE_MAPS[OCRED]
    VERSION_TF = versionTf

    cv = CV(Fabric(locations=dest))

    return cv.walk(
        director,
        slotType,
        otext=otext[None] | otext[OCRED],
        generic=generic(source),
        intFeatures=intFeatures[None] | intFeatures[OCRED],
        featureMeta=featureMeta[None] | featureMeta[OCRED],
        generateTf=True,
    )
Example #7
    def load_tf(self):
        '''
        Loads an instance of TF if necessary.
        '''

        # load BHSA Hebrew data
        TF = Fabric(bhsa_data_paths, silent=True)
        tf_api = TF.load('''
                        function lex vs language
                        pdp freq_lex gloss domain ls
                        heads prep_obj mother rela
                        typ sp sem_domain sem_domain_code
                      ''',
                         silent=True)

        self.tf_api = tf_api
Example #8
    def load_tf_bhsa(self):
        '''
        Loads a TF instance of the BHSA dataset.
        '''
        TF = Fabric(
            locations='~/github',
            modules=['etcbc/bhsa/tf/c', 'semantics/phase1/tf/c'
                     ],  # modify paths here for your system
            silent=True)
        api = TF.load('''
                        book chapter verse
                        function lex vs language
                        pdp freq_lex gloss domain ls
                        heads
                      ''',
                      silent=True)

        B = Bhsa(api, '4. Semantic Space Construction', version='c')

        return api, B
Example #9
def main():
    TF = Fabric(modules=['hebrew/etcbc4c'],
                locations='~/VersionControl/etcbc-data',
                silent=True)
    api = TF.load('language g_word_utf8 lex_utf8 vs vt gn nu ps', silent=True)
    api.makeAvailableIn(globals())

    data = Databank()

    for n in N():
        try:
            handle(n, data)
        except (KeyError, ValueError):
            pass

    print(len(data.verbs), len(data.roots))

    with open('etcbc-verbs.csv', 'w') as csvverbs:
        verbwr = csv.writer(csvverbs, quoting=csv.QUOTE_MINIMAL)
        #verbwr.writerow(['id', 'verb','root','stem','tense','person','gender','number','active'])
        i = VERB_STARTID
        for verb in data.verbs:
            verbwr.writerow([
                i, verb.verb, verb.root, verb.stem, verb.tense,
                verb.person if verb.person is not None else 'NULL',
                verb.gender if verb.gender is not None else 'NULL',
                verb.number if verb.number is not None else 'NULL', 1
            ])
            i += 1

    with open('etcbc-roots.csv', 'w') as csvroots:
        rootwr = csv.writer(csvroots, quoting=csv.QUOTE_MINIMAL)
        #rootwr.writerow(['id', 'root', 'root_kind_id'])
        i = ROOT_STARTID
        for root in data.roots:
            rootwr.writerow([i, root.lex, 1])
            i += 1
Example #10
def loadTf():
    print('Load TF dataset for the first time')
    TF = Fabric(locations=TF_PATH, modules=[''])
    api = TF.load('')
    allFeatures = TF.explore(silent=False, show=True)
    loadableFeatures = allFeatures['nodes'] + allFeatures['edges']
    TF.load(loadableFeatures, add=True)
    print('All done')
    return api
Example #11
import sys

from tf.fabric import Fabric
from book_formats import get_book_maps, etcbc2sbl, etcbc2abbr
from verb_form import get_verbform, get_cl_verbform
from modify_domain import permissive_q
from synvar_carc import in_dep_calc as clause_relator
from modify_cltype import simplify_cl_type
from tag_args import clause_objects, get_loca_assocs, clause_locas, clause_time, clause_args

# NB that working directory when script is executed is
# /workflow; because we have some utilities that we want
# to run from above directory, we need to append it to path
sys.path.append('scripts')
from build_tables import build_sample_tables

# fire up Text-Fabric with BHSA data
TF = Fabric(snakemake.input['tf_mods'], silent='deep')
features = """
sp pdp vs vt ps gn nu
lex language gloss voc_lex voc_lex_utf8
function number label 
typ code rela mother domain txt 
genre
sense
nhead
funct_assoc
"""
bhsa = TF.load(features, silent='deep')
F, E, T, L, Fs = bhsa.F, bhsa.E, bhsa.T, bhsa.L, bhsa.Fs

# load GBI Hebrew data
with open(snakemake.input.bhsa2gbi, 'rb') as infile:
Example #12
        afterWord += ' '

    rest = splitPunc(w[pA:]) if pA < len(w) else ()
    return ((preWord, word, afterWord), ) + rest


def plainCaps(w):
    return ''.join(x.upper() for x in normalize(NFD, w)
                   if category(x)[0] not in dia)


for word in (word1, word2):
    print(splitPunc(word))

tm = Timestamp()
TF = Fabric(locations=TF_DIR)


class Data:
    def __init__(self, bookEn):
        self.bookEn = bookEn
        self.tfFromXml = {}
        self.xmlFromTf = {}
        self.nodeNum = 1
        self.maxSlot = 0
        self.maxNode = 0
        self.paths = {}
        self.nodeFeatures = collections.defaultdict(dict)
        self.edgeFeatures = collections.defaultdict(dict)

Example #13
from kimsbible import app
from kimsbible.lib import lib as kb
from kimsbible.lib import vcodeparser as vp
from kimsbible.lib import db

from kimsbible.lib.config import google_map_api, kml_url

from tf.fabric import Fabric

# @app.after_request
# def set_response_headers(r):
#     r.headers['Cache-Control'] = 'public, max-age=3600'
#     return r

### Load up TF ###
ETCBC = 'hebrew/etcbc4c'
TF = Fabric(locations='text-fabric-data', modules=ETCBC)
#api = TF.load('book')

api = TF.load('''
    book chapter verse
    nu gn ps vt vs st
    otype typ function
    det pdp qere_utf8 qere_trailer_utf8
    g_word_utf8 trailer_utf8
    lex_utf8 lex voc_utf8
    g_prs_utf8 g_uvf_utf8
    prs_gn prs_nu prs_ps g_cons_utf8
    gloss phono 
''')

api.makeAvailableIn(globals())
Example #14
from morphological_lists import book_index, generous_name, book_abbreviation
from bottle import Bottle, hook, route, get, post, request, response, redirect, run, template, static_file
import paste.gzipper
from itertools import chain

from loadParallelText import getPTextFromRefPairArray

from tf.fabric import Fabric

### set up app - we're going to use it for gzip middleware ###

app = Bottle()

### load up TF ###

TF = Fabric(locations='../text-fabric-data', modules='hebrew/etcbc4c')
api = TF.load('''
	book chapter verse
	sp nu gn ps vt vs st
	otype
	det
	g_word_utf8 trailer_utf8
	lex_utf8 lex voc_utf8
	g_prs_utf8 g_uvf_utf8
	prs_gn prs_nu prs_ps g_cons_utf8
	gloss sdbh lxxlexeme
	accent accent_quality
	tab typ
''')
api.makeAvailableIn(globals())
Example #15
import os, sys, re, collections
from tf.fabric import Fabric

sourceDir = os.path.expanduser('~/github/etcbc-data')
mqlFile = '{}/{}'.format(sourceDir, 'synvar.mql')
targetDir = os.path.expanduser('~/github/text-fabric-data')
tfDir = '{}/hebrew/extrabiblical'.format(targetDir)
if not os.path.exists(tfDir): os.makedirs(tfDir)

TF = Fabric(tfDir)

slotType = 'word'

enums = dict()
objectTypes = dict()
tables = dict()
curMonads = None
curId = None
edgeF = dict()
nodeF = dict()


def setFromSpec(spec):
    covered = set()
    for r_str in spec.split(','):
        bounds = r_str.split('-')
        if len(bounds) == 1:
            covered.add(int(r_str))
        else:
            b = int(bounds[0])
            e = int(bounds[1])
Example #16
def generateTf(bookAcro, content):
    if os.path.exists(TF_PATH):
        rmtree(TF_PATH)
    os.makedirs(TF_PATH)

    print("Slicing content into features")

    cur = collections.Counter()
    curSlot = 0
    context = []
    nodeFeatures = collections.defaultdict(dict)
    edgeFeatures = collections.defaultdict(lambda: collections.defaultdict(set))
    oSlots = collections.defaultdict(set)
    for acro in allAcrosSeq:
        thisBookInfo = bookAcro[acro]
        bookName = thisBookInfo["bookName"]
        witness = thisBookInfo.get("witness", None)
        chapters = content[bookName]

        cur["book"] += 1
        bookNode = ("book", cur["book"])
        nodeFeatures["book"][bookNode] = acro
        nodeFeatures["book@en"][bookNode] = bookName
        if witness is not None:
            nodeFeatures["witness"][bookNode] = witness
        context.append(("book", cur["book"]))

        for chapterNum in chapters:
            verses = chapters[chapterNum]

            cur["chapter"] += 1
            nodeFeatures["chapter"][("chapter", cur["chapter"])] = chapterNum
            nodeFeatures["book"][("chapter", cur["chapter"])] = acro
            if witness is not None:
                nodeFeatures["witness"][("chapter", cur["chapter"])] = witness
            context.append(("chapter", cur["chapter"]))

            for verseNum in verses:
                words = verses[verseNum].strip().split()

                cur["verse"] += 1
                nodeFeatures["verse"][("verse", cur["verse"])] = verseNum
                nodeFeatures["chapter"][("verse", cur["verse"])] = chapterNum
                nodeFeatures["book"][("verse", cur["verse"])] = acro
                if witness is not None:
                    nodeFeatures["witness"][("verse", cur["verse"])] = witness
                context.append(("verse", cur["verse"]))
                for elem in splitWords(words):
                    if len(elem) != 2:
                        print(bookName, chapterNum, verseNum, words)
                        continue
                    (word, punc) = elem
                    wSyc = TR.to_syriac(word)
                    pSyc = TR.to_syriac(punc)

                    curSlot += 1
                    wordNode = ("word", curSlot)
                    nodeFeatures["word_etcbc"][wordNode] = word
                    nodeFeatures["word"][wordNode] = wSyc
                    nodeFeatures["trailer_etcbc"][wordNode] = punc
                    nodeFeatures["trailer"][wordNode] = pSyc
                    for (nt, curNode) in context:
                        oSlots[(nt, curNode)].add(curSlot)
                context.pop()
            context.pop()
        context.pop()

    if len(context):
        print("Context:", context)

    print(f"\n{curSlot:>7} x slot")
    for (nodeType, amount) in sorted(cur.items(), key=lambda x: (x[1], x[0])):
        print(f"{amount:>7} x {nodeType}")

    nValues = reduce(operator.add, (len(values) for values in nodeFeatures.values()), 0)
    print(f"{len(nodeFeatures)} node features with {nValues} values")
    print(f"{len(oSlots)} nodes linked to slots")

    print("Compiling TF data")
    print("Building warp feature otype")

    nodeOffset = {"word": 0}
    oType = {}
    n = 1
    for k in range(n, curSlot + 1):
        oType[k] = "word"
    n = curSlot + 1
    for (nodeType, amount) in sorted(cur.items(), key=lambda x: (x[1], x[0])):
        nodeOffset[nodeType] = n - 1
        for k in range(n, n + amount):
            oType[k] = nodeType
        n = n + amount
    print(f"{len(oType)} nodes")

    print("Filling in the nodes for features")

    newNodeFeatures = collections.defaultdict(dict)
    for (ft, featureData) in nodeFeatures.items():
        newFeatureData = {}
        for ((nodeType, node), value) in featureData.items():
            newFeatureData[nodeOffset[nodeType] + node] = value
        newNodeFeatures[ft] = newFeatureData
    newOslots = {}
    for ((nodeType, node), slots) in oSlots.items():
        newOslots[nodeOffset[nodeType] + node] = slots

    nodeFeatures = newNodeFeatures
    nodeFeatures["otype"] = oType
    edgeFeatures["oslots"] = newOslots

    print(f'Node features: {" ".join(nodeFeatures)}')
    print(f'Edge features: {" ".join(edgeFeatures)}')

    metaData = {
        "": commonMetaData,
        "otext": oText,
        "oslots": dict(valueType="str"),
        "book@en": langMetaData["en"],
    }
    for ft in set(nodeFeatures) | set(edgeFeatures):
        metaData.setdefault(ft, {})["valueType"] = "int" if ft in numFeatures else "str"
        metaData[ft]["description"] = (
            specificMetaData[ft] if ft in specificMetaData else "?"
        )

    TF = Fabric(locations=TF_PATH, silent=True)
    TF.save(nodeFeatures=nodeFeatures, edgeFeatures=edgeFeatures, metaData=metaData)
Example #17
import os, sys, collections
from tf.fabric import Fabric

# locations = '~/github/etcbc'
locations = '/home/oem/text-fabric-data/etcbc'
coreModule = 'bhsa'
sources = [coreModule, 'phono']
# version = '2017'
version = 'c'
tempDir = os.path.expanduser(f'{locations}/{coreModule}/_temp/{version}/r')
tableFile = f'{tempDir}/{coreModule}{version}.txt'

modules = [f'{s}/tf/{version}' for s in sources]
TF = Fabric(locations=locations, modules=modules)

api = TF.load('')
api = TF.load(
    ('suffix_person', 'tab', 'trailer', 'trailer_utf8', 'txt', 'typ', 'uvf',
     'vbe', 'vbs', 'verse', 'voc_lex', 'voc_lex_utf8', 'vs', 'vt',
     'distributional_parent', 'functional_parent', 'mother', 'oslots'))
allFeatures = TF.explore(silent=False, show=True)
loadableFeatures = allFeatures['nodes'] + allFeatures['edges']
del api
api = TF.load(loadableFeatures)
api.makeAvailableIn(globals())

print('done')
Example #18
def webPipelineSingle(pipeline, version, force=False, kinds={"mql", "mysql"}):
    good = True

    if "mql" in kinds:
        caption(1, "Aggregate MQL for version {}".format(version))
        for key in ["repoOrder"]:
            if key not in pipeline:
                caption(0, "\tERROR: no {} declared in the pipeline".format(key))
                good = False
        if not good:
            return False

        repoOrder = pipeline["repoOrder"].strip().split()

        resultRepo = repoOrder[0]
        # addedRepos = repoOrder[1:]

        resultRepoDir = "{}/{}".format(githubBase, resultRepo)

        thisTempDir = "{}/_temp/{}".format(resultRepoDir, version)
        tempShebanqDir = "{}/shebanq".format(thisTempDir)
        shebanqDir = "{}/shebanq/{}".format(resultRepoDir, version)
        if not os.path.exists(shebanqDir):
            os.makedirs(shebanqDir)

        dbName = "shebanq_etcbc{}".format(version)

        mqlUFile = "{}/{}.mql".format(tempShebanqDir, dbName)
        mqlZFile = "{}/{}.mql.bz2".format(shebanqDir, dbName)

        xmU = os.path.exists(mqlUFile)
        # xmZ = os.path.exists(mqlZFile)

        uptodate = True

        referenceFile = mqlUFile if xmU else mqlZFile

        if not os.path.exists(referenceFile):
            uptodate = False
            caption(0, "\tWork to do because {} does not exist".format(referenceFile))
        else:
            tmR = os.path.getmtime(referenceFile)
            for (i, repo) in enumerate(repoOrder):
                tfxDir = "{}/{}/tf/{}/.tf".format(githubBase, repo, version)
                if not os.path.exists(tfxDir):
                    uptodate = False
                    caption(
                        0, "\tWork to do because the tf in {} is fresh".format(repo)
                    )
                    caption(0, "\t\t{}".format(tfxDir))
                    break
                if os.path.getmtime(tfxDir) > tmR:
                    uptodate = False
                    caption(
                        0,
                        "\tWork to do because the tf in {} is recently compiled".format(
                            repo
                        ),
                    )
                    caption(0, "\t\t{}".format(tfxDir))
                    break

        if uptodate and force:
            caption(0, "\tWork to do because you forced me to!")
            uptodate = False
        if not uptodate:
            caption(1, "Using TF to make an MQL export")
            locations = []
            for (i, repo) in enumerate(repoOrder):
                locations.append("{}/{}/tf/{}".format(githubBase, repo, version))

            TF = Fabric(locations=locations, modules=[""])
            TF.exportMQL(dbName, tempShebanqDir)
        else:
            caption(0, "\tAlready up to date")

        caption(0, "\tbzipping {}".format(mqlUFile))
        caption(0, "\tand delivering as {} ...".format(mqlZFile))
        bzip(mqlUFile, mqlZFile)
        caption(0, "\tDone")

    if "mysql" in kinds:
        caption(1, "Create Mysql passage db for version {}".format(version))
        runNb(pipelineRepo, programDir, "passageFromTf", force=force, VERSION=version)
        caption(0, "\tDone")

    return True
Example #19
import sys
import unittest

from tf.fabric import Fabric

# LOAD THE TEST CORPUS

TF = Fabric('tf')
api = TF.load('sign name')
F = api.F
S = api.S

# MAKE CUSTOM SETS OF NODES

Sign = set(range(1, F.otype.maxSlot + 1))
Node = set(range(1, F.otype.maxNode + 1))

sets = dict(Sign=Sign, Node=Node)

# RUN A QUERY, OPTIONALLY WITH CUSTOM SETS


def query(template, sets=None):

    return (tuple(S.search(template)) if sets is None else tuple(
        S.search(template, sets=sets)))


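# A hypothetical invocation (not part of the original test file): with the
# custom sets passed in, the set names `Sign` and `Node` can be used as node
# types in the search template.

exampleResults = query('''
Node
  Sign
''', sets=sets)
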
# DEFINE THE TESTS

relationKey = {
Example #20
# +
import os

from tf.fabric import Fabric

# +
GH_BASE = os.path.expanduser("~/github")
ORG = "annotation"
REPO = "banks"
FOLDER = "tf"
TF_DIR = f"{GH_BASE}/{ORG}/{REPO}/{FOLDER}"

VERSION = "0.2"

TF_PATH = f"{TF_DIR}/{VERSION}"
TF = Fabric(locations=TF_PATH)
# -

# We ask for a list of all features:

allFeatures = TF.explore(silent=True, show=True)
loadableFeatures = allFeatures["nodes"] + allFeatures["edges"]
loadableFeatures

# We load all features:

api = TF.load(loadableFeatures, silent=False)

# You see that all files are marked with a `T`.
#
# That means that Text-Fabric loads the features by reading the plain text `.tf` files.
Example #21
if SCRIPT:
    (good,
     work) = utils.mustRun(None,
                           '{}/.tf/{}.tfx'.format(thisTf, newFeatures[0]),
                           force=FORCE)
    if not good: stop(good=False)
    if not work: stop(good=True)

# # Load existing data

# In[17]:

utils.caption(4, 'Loading relevant features')

TF = Fabric(locations=thisTf, modules=[''])
api = TF.load('book')
api.makeAvailableIn(globals())

nodeFeatures = {}
nodeFeatures['book@la'] = {}

bookNodes = []
for b in F.otype.s('book'):
    bookNodes.append(b)
    nodeFeatures['book@la'][b] = F.book.v(b)

for (langCode, langBookNames) in bookNames.items():
    nodeFeatures['book@{}'.format(langCode)] = dict(
        zip(bookNodes, langBookNames))
utils.caption(0, '{} book name features created'.format(len(nodeFeatures)))
Example #22
if getpass.getuser() == 'etien':
    locations = etien_path
elif getpass.getuser() == 'cody':
    locations = cody_path
else:
    locations = {}
if not locations:
    raise Exception('Please add your data paths in bhsa.py line 30.')
for path in locations:
    if not os.path.exists(path):
        raise Exception(
            f'You need an extra datamodule in {os.path.dirname(path)}. Do "git pull {locations[path]}" to this location.'
        )

# load TF and BHSA data
TF = Fabric(locations=locations.keys(), modules='2017', silent=True)
api = TF.load('''
              otype language
              book chapter verse
              function domain
              typ pdp kind tree
              crossref
              ''',
              silent=True)

api.makeAvailableIn(globals())  # globalize TF methods

# define book groups & names

lbh_books = ('Song_of_songs', 'Ecclesiastes', 'Esther', 'Daniel', 'Ezra',
             'Nehemiah', '1_Chronicles', '2_Chronicles')
Example #23
def ingest_french(paths):
    """Match the French data to our dataset."""

    # load the French dataset
    with open(paths['source'], 'r') as infile:
        reader = csv.reader(infile, delimiter='\t')
        french_data = list(reader)

    # load the BHSA Hebrew data for matching the Hebrew text
    TF = Fabric(locations=paths['bhsa'])
    API = TF.load('g_word_utf8')
    F, T, L = API.F, API.T, API.L

    # match the Hebrew verbs in the French data with the
    # Hebrew verbs in BHSA
    # we treat the ref strings as unique ID's
    # we use 2 dicts; one to hold ID 2 BHSA node mappings
    # another to hold the IDs 2 french data
    french2bhsa = {}
    french2data = {}
    frenchverses = {}

    for row in french_data:

        # parse French data
        wid = row[0]
        hb_txt, hb_lex, hb_tag, hb_prev = row[1:5]
        fr_words, fr_verse = row[5:7]
        bk, ch, vs, sg, wnum = parse_refstring(wid)
        french2data[wid] = {
            'wid': wid,
            'hebrew': hb_txt,
            'hebrew_parse': hb_tag,
            'french': fr_words,
        }

        # look up BHSA data and get the verse node
        tf_book = int2book[bk]
        vrs_node = T.nodeFromSection((tf_book, ch, vs))
        if vrs_node is None:
            raise Exception((tf_book, ch, vs), wid, hb_txt)

        # save the French verse text
        ref_string = str((tf_book, ch, vs))
        frenchverses[ref_string] = fr_verse
        french2data[wid]['ref'] = ref_string

        # get the closest matching word from the verse;
        # NB we iterate over the verse words in reversed order
        # so that if there are 2+ words with equivalent distances,
        # we always end on the one that is first in the verse;
        # the match is then added to a set so that it is not
        # available for subsequent matches
        french2bhsa[wid] = BhsaWord(0, float('inf'))  # initialize with dummy
        matched = set()
        for word_node in reversed(L.d(vrs_node, 'word')):
            if word_node in matched:
                continue
            bhsa_txt = T.text(word_node)
            dist = levdist(bhsa_txt, hb_txt)
            if french2bhsa[wid].dist > dist:
                french2bhsa[wid] = BhsaWord(word_node, dist)
        matched.add(french2bhsa[wid].node)

    # iterate over both french dicts and assemble
    # into one BHSA dict
    bhsa2french = {}
    for wid, bhsa_word in french2bhsa.items():
        bhsa_node = bhsa_word.node
        if bhsa_node != 0:
            bhsa2french[bhsa_node] = french2data[wid]

    # the linking is complete
    with open(paths['out'], 'w') as outfile:
        json.dump(bhsa2french, outfile, indent=2, ensure_ascii=False)

    with open(paths['out_verses'], 'w') as outfile:
        json.dump(frenchverses, outfile, indent=2, ensure_ascii=False)
Example #24
    os.makedirs(thisTempSource)

utils.caption(0, 'bunzipping {} ...'.format(mqlzFile))
utils.bunzip(mqlzFile, mqlFile)
utils.caption(0, 'Done')

if os.path.exists(thisTempTf): rmtree(thisTempTf)
os.makedirs(thisTempTf)

# # MQL to Text-Fabric
# Transform the collected information in feature-like datastructures, and write it all
# out to `.tf` files.

# In[8]:

TF = Fabric(locations=thisTempTf, silent=True)
TF.importMQL(mqlFile, slotType=slotType, otext=otextInfo, meta=featureMetaData)

# # Rename features
# We rename the features mentioned in the RENAME dictionary.

# In[8]:

if RENAME is None:
    utils.caption(4, 'Rename features: nothing to do')
else:
    utils.caption(4,
                  'Renaming {} features in {}'.format(len(RENAME), thisTempTf))
    for (srcFeature, dstFeature) in RENAME:
        srcPath = '{}/{}.tf'.format(thisTempTf, srcFeature)
        dstPath = '{}/{}.tf'.format(thisTempTf, dstFeature)
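        # The notebook is truncated here; a plausible completion (an assumption,
        # not the original code) moves each feature file to its new name:
        if os.path.exists(srcPath):
            os.rename(srcPath, dstPath)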
Example #25
from tf.fabric import Fabric

TF = Fabric('tf', '')

maxSlot = 10
halfSlot = int(round(maxSlot / 2))

otype = {i: 'sign' for i in range(1, maxSlot + 1)}
oslots = {}

name = {i: chr(i + ord('a') - 1) for i in range(1, maxSlot + 1)}

p = 0


# GENERATE NODES OF TYPE PART

# create a part with that name and linked to those slots

def addPart(nm, signs):
  mySlots = set(s for s in signs if 1 <= s <= maxSlot)
  if not mySlots:
    return

  global p
  p += 1
  node = maxSlot + p
  otype[node] = 'part'
  oslots[node] = mySlots
  name[node] = nm
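
# A hypothetical usage of addPart (the actual part definitions are not shown in
# this excerpt): create two parts covering the lower and upper halves of the slots.

addPart('lower', range(1, halfSlot + 1))
addPart('upper', range(halfSlot + 1, maxSlot + 1))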
Example #26
from tf.fabric import Fabric
import collections
import sys
# https://etcbc.github.io/bhsa/features/hebrew/4b/features/comments/g_lex_utf8.html

TF = Fabric(locations='/home/chaim/github/text-fabric-data',
            modules=['hebrew/etcbc4c'])
#TF = Fabric(locations='c:/josh/text-fabric-data/text-fabric-data', modules=['hebrew/etcbc4c'])
api = TF.load(
    'sp lex g_word g_word_utf8 trailer_utf8 ls typ rela function qere_utf8 qere'
)
api.makeAvailableIn(globals())

F = api.F
T = api.T
C = api.C
L = api.L

#print(sorted(T.formats))


def print_original_words():

    for i in range(1, 12):
        print(api.T.text([i], 'text-orig-full'))


# for w in F.otype.s('word'):
#     word, part_of_speech = F.g_word.v(w), F.sp.v(w)
#     print(word, part_of_speech)
#     if w == 14:
Example #27
else:
    utils.caption(0, 'New text formats')
    otextInfo = dict(line[1:].split('=', 1)
                     for line in LEX_FORMATS.strip('\n').split('\n'))
    for x in sorted(otextInfo.items()):
        utils.caption(0, '{:<30} = "{}"'.format(*x))

# # Lexicon preparation
# We add lexical data.
# The lexical data will not be added as features of words, but as features of lexemes.
# The lexemes will be added as fresh nodes, of a new type `lex`.

# In[8]:

utils.caption(4, 'Load the existing TF dataset')
TF = Fabric(locations=thisTf, modules=[''])
vocLex = ' g_voc_lex g_voc_lex_utf8 ' if DO_VOCALIZED_LEXEME else ''
api = TF.load('lex lex_utf8 language sp ls gn ps nu st oslots {} {}'.format(
    vocLex, EXTRA_OVERLAP))
api.makeAvailableIn(globals())

# # Text pass
# We map the values in the language feature to standardized ISO values: `arc` and `hbo`.
# We run over all word occurrences, grab the language and lexeme identifier, and create for each
# unique pair a new lexeme node.
#
# We remember the mapping between nodes and lexemes.
#
# This stage does not yet involve the lexical files.

# In[9]:
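
# The cell that implements this text pass is not included in this excerpt.
# A minimal sketch of the idea (an assumption, not the original code; the ISO
# mapping and the variable names are illustrative):

import collections

isoFromLanguage = {'Hebrew': 'hbo', 'Aramaic': 'arc'}  # assumed value mapping

lexFromNode = {}                              # word node -> (language, lexeme)
nodesFromLex = collections.defaultdict(set)   # (language, lexeme) -> word nodes

for w in F.otype.s('word'):
    lan = isoFromLanguage.get(F.language.v(w), F.language.v(w))
    lex = F.lex.v(w)
    lexFromNode[w] = (lan, lex)
    nodesFromLex[(lan, lex)].add(w)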
Example #28
# We call up TF and let it look into the directory where the output has to land,
# in this case a subdirectory of the tutorials repo on annotation.

# +
# TF_DIR = os.path.expanduser('~/Downloads/banks/tf')  # if you want it in your Downloads directory instead
BASE = os.path.expanduser('~/github')
ORG = 'annotation'
REPO = 'banks'
RELATIVE = 'tf'

TF_DIR = os.path.expanduser(f'{BASE}/{ORG}/{REPO}/{RELATIVE}')

VERSION = '0.2'

TF_PATH = f'{TF_DIR}/{VERSION}'
TF = Fabric(locations=TF_PATH, silent=True)
# -

# ## TF configuration
#
# A Text-Fabric dataset is a bunch of individual `.tf` files that start with a little bit of metadata and then contain 
# a stream of data, typically the values of a single feature for each node or edge in the graph.
#
# We specify the metadata bit by bit.
#
# ### slot type
#
# A crucial design aspect of each TF dataset is its granularity. What are the slots?
#
# Words, morphemes, characters?
#
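
# The excerpt breaks off here. A minimal sketch of the kind of choice being
# described (illustrative values, not necessarily those of the original tutorial):

slotType = 'word'  # assumed: one slot per word

generic = {
    # metadata attached to every feature (placeholder values)
    'compiler': 'your name here',
    'source': 'description of the source material',
}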
Example #29
def genTrees(version):
    C = setVersion(version)
    bhsa = C.bhsa
    sp = C.sp
    rela = C.rela
    ptyp = C.ptyp
    ctyp = C.ctyp
    g_word_utf8 = C.g_word_utf8
    tfDir = C.tfDir

    TF = Fabric(locations=f"{GH}/{ORG}", modules=bhsa)
    api = TF.load(f"{sp} {rela} {ptyp} {ctyp} {g_word_utf8} mother")

    E = api.E
    F = api.F
    Fs = api.Fs

    def getTag(node):
        otype = F.otype.v(node)
        tag = TYPE_TABLE[otype]
        if tag == "P":
            tag = Fs(ptyp).v(node)
        elif tag == "C":
            tag = ccrTable[Fs(rela).v(node)]
        isWord = tag == ""
        pos = POS_TABLE[Fs(sp).v(node)] if isWord else None
        slot = node if isWord else None
        text = f'"{Fs(g_word_utf8).v(node)}"' if isWord else None
        return (tag, pos, slot, text, isWord)

    def getTagN(node):
        otype = F.otype.v(node)
        tag = TYPE_TABLE[otype]
        if tag == "P":
            tag = Fs(ptyp).v(node)
        elif tag == "C":
            tag = ccrTable[Fs(rela).v(node)]
        isWord = tag == ""
        if not isWord:
            tag += "{" + str(node) + "}"
        pos = POS_TABLE[Fs(sp).v(node)] if isWord else None
        slot = node if isWord else None
        text = f'"{Fs(g_word_utf8).v(node)}"' if isWord else None
        return (tag, pos, slot, text, isWord)

    treeTypes = ("sentence", "clause", "phrase", "subphrase", "word")
    (rootType, leafType, clauseType, phraseType) = (
        treeTypes[0],
        treeTypes[-1],
        treeTypes[1],
        treeTypes[2],
    )
    ccrTable = dict((c[0], c[1][1]) for c in CCR_INFO.items())
    ccrClass = dict((c[0], c[1][0]) for c in CCR_INFO.items())

    tree = Tree(
        TF,
        otypes=treeTypes,
        phraseType=phraseType,
        clauseType=clauseType,
        ccrFeature=rela,
        ptFeature=ptyp,
        posFeature=sp,
        motherFeature="mother",
    )

    tree.restructureClauses(ccrClass)
    results = tree.relations()
    TF.info("Ready for processing")

    skip = set()
    TF.info("Verifying whether all slots are preserved under restructuring")
    TF.info(f"Expected mismatches: {EXPECTED_MISMATCHES.get(version, '??')}")

    errors = []
    # i = 10
    for snode in F.otype.s(rootType):
        declaredSlots = set(E.oslots.s(snode))
        results = {}
        thisgood = {}
        for kind in ("e", "r"):
            results[kind] = set(lt for lt in tree.getLeaves(snode, kind)
                                if F.otype.v(lt) == leafType)
            thisgood[kind] = declaredSlots == results[kind]
            # if not thisgood[kind]:
            #    print(f"{kind} D={declaredSlots}\n  L={results[kind]}")
            #    i -= 1
        # if i == 0: break
        if False in thisgood.values():
            errors.append((snode, thisgood["e"], thisgood["r"]))
    nErrors = len(errors)
    if nErrors:
        TF.error(f"{len(errors)} mismatches:")
        mine = min(20, len(errors))
        skip |= {e[0] for e in errors}
        for (s, e, r) in errors[0:mine]:
            TF.error(
                (f"{s} embedding: {'OK' if e else 'XX'};"
                 f" restructd: {'OK' if r else 'XX'}"),
                tm=False,
            )
    else:
        TF.info(f"{len(errors)} mismatches")

    TF.info(f"Exporting {rootType} trees to TF")
    s = 0
    chunk = 10000
    sc = 0
    treeData = {}
    treeDataN = {}
    for node in F.otype.s(rootType):
        if node in skip:
            continue
        (treeRep, wordsRep, bSlot) = tree.writeTree(node,
                                                    "r",
                                                    getTag,
                                                    rev=False,
                                                    leafNumbers=True)
        (treeNRep, wordsNRep, bSlotN) = tree.writeTree(node,
                                                       "r",
                                                       getTagN,
                                                       rev=False,
                                                       leafNumbers=True)
        treeData[node] = treeRep
        treeDataN[node] = treeNRep
        s += 1
        sc += 1
        if sc == chunk:
            TF.info(f"{s} trees composed")
            sc = 0
    TF.info(f"{s} trees composed")

    nodeFeatures = dict(tree=treeData, treen=treeDataN)
    metaData = dict(
        tree=dict(
            valueType="str",
            description="penn treebank represententation for sentences",
            converter="Dirk Roorda",
            convertor="trees.ipynb",
            url="https://github.com/etcbc/trees/trees.ipynb",
            coreData="BHSA",
            coreVersion=version,
        ),
        treen=dict(
            valueType="str",
            description=
            "penn treebank representation for sentences with node numbers included",
            converter="Dirk Roorda",
            convertor="trees.ipynb",
            url="https://github.com/etcbc/trees/trees.ipynb",
            coreData="BHSA",
            coreVersion=version,
        ),
    )
    TF.info("Writing tree feature to TF")
    TFw = Fabric(locations=tfDir, silent=True)
    TFw.save(nodeFeatures=nodeFeatures, edgeFeatures={}, metaData=metaData)
Example #30
        stop(good=False)
    if not work:
        stop(good=True)

# # Loading the feature data
#
# We load the features we need from the BHSA core database and from the valence module,
# as far as generated by the
# [enrich](https://github.com/ETCBC/valence/blob/master/programs/enrich.ipynb) notebook.

# In[7]:

# In[14]:

utils.caption(4, "Load the existing TF dataset")
TF = Fabric(locations=[coreTf, thisTf], modules=[""])

# We instruct the API to load data.

# In[8]:

# In[15]:

api = TF.load("""
    function rela typ
    g_word_utf8 trailer_utf8
    lex prs uvf sp pdp ls vs vt nametype gloss
    book chapter verse label number
    s_manual f_correction
    valence predication grammatical original lexical semantic
    mother