Esempio n. 1
0
def makeTf():
    """Save the module-level feature data as a TF dataset for ``VERSION``.

    The target directory is ``TF_PATH/VERSION``.  If it already exists it is
    removed first, so the saved dataset never mixes with older files.
    """
    versionDir = f'{TF_PATH}/{VERSION}'

    # Start from a clean slate: drop whatever a previous run left behind.
    if os.path.exists(versionDir):
        rmtree(versionDir)

    writer = Fabric(locations=[versionDir])
    writer.save(
        nodeFeatures=nodeFeatures,
        edgeFeatures=edgeFeatures,
        metaData=metaData,
    )
Esempio n. 2
0
def genTrees(version):
    """Compose tree strings for all sentences and save them as TF features.

    For the given data version the BHSA features are loaded, a ``Tree``
    object is built over the sentence / clause / phrase / subphrase / word
    hierarchy and clauses are restructured according to ``CCR_INFO``.
    Every sentence whose leaves (in both the embedding "e" and the
    restructured "r" view) coincide with its declared slots is written out
    twice: once with plain tags (feature ``tree``) and once with node numbers
    appended to the non-word tags (feature ``treen``).  Sentences that fail
    this check are reported and skipped.  Both features are saved to the
    ``tfDir`` that belongs to the version.

    Parameters
    ----------
    version: the data version; passed to ``setVersion`` and recorded in the
        metadata as ``coreVersion``.
    """
    C = setVersion(version)
    bhsa = C.bhsa
    sp = C.sp
    rela = C.rela
    ptyp = C.ptyp
    ctyp = C.ctyp
    g_word_utf8 = C.g_word_utf8
    tfDir = C.tfDir

    TF = Fabric(locations=f"{GH}/{ORG}", modules=bhsa)
    api = TF.load(f"{sp} {rela} {ptyp} {ctyp} {g_word_utf8} mother")

    E = api.E
    F = api.F
    Fs = api.Fs

    treeTypes = ("sentence", "clause", "phrase", "subphrase", "word")
    rootType = treeTypes[0]
    leafType = treeTypes[-1]
    clauseType = treeTypes[1]
    phraseType = treeTypes[2]

    # CCR_INFO maps a clause relation to a pair: (class, tag).
    ccrTable = {relation: info[1] for (relation, info) in CCR_INFO.items()}
    ccrClass = {relation: info[0] for (relation, info) in CCR_INFO.items()}

    def getTag(node):
        """Return (tag, pos, slot, text, isWord) for a node of the tree.

        Phrases take their tag from the phrase type feature, clauses from the
        clause relation (via ``ccrTable``).  A node whose tag is empty counts
        as a word; only words get a part-of-speech, a slot and a text.
        """
        tag = TYPE_TABLE[F.otype.v(node)]
        if tag == "P":
            tag = Fs(ptyp).v(node)
        elif tag == "C":
            tag = ccrTable[Fs(rela).v(node)]
        isWord = tag == ""
        if isWord:
            pos = POS_TABLE[Fs(sp).v(node)]
            slot = node
            text = f'"{Fs(g_word_utf8).v(node)}"'
        else:
            pos = slot = text = None
        return (tag, pos, slot, text, isWord)

    def getTagN(node):
        """Like ``getTag``, but non-word tags carry the node number in braces."""
        (tag, pos, slot, text, isWord) = getTag(node)
        if not isWord:
            tag += "{" + str(node) + "}"
        return (tag, pos, slot, text, isWord)

    tree = Tree(
        TF,
        otypes=treeTypes,
        phraseType=phraseType,
        clauseType=clauseType,
        ccrFeature=rela,
        ptFeature=ptyp,
        posFeature=sp,
        motherFeature="mother",
    )

    tree.restructureClauses(ccrClass)
    # The return value is not used in this function; the call itself is kept.
    # NOTE(review): presumably it prepares state inside `tree` - confirm.
    tree.relations()
    TF.info("Ready for processing")

    TF.info("Verifying whether all slots are preserved under restructuring")
    TF.info(f"Expected mismatches: {EXPECTED_MISMATCHES.get(version, '??')}")

    # A sentence is sound when, in both views, its word leaves are exactly
    # the slots that oslots declares for it.
    errors = []
    for snode in F.otype.s(rootType):
        declaredSlots = set(E.oslots.s(snode))
        preserved = {}
        for kind in ("e", "r"):
            leaves = {
                leaf
                for leaf in tree.getLeaves(snode, kind)
                if F.otype.v(leaf) == leafType
            }
            preserved[kind] = declaredSlots == leaves
        if not (preserved["e"] and preserved["r"]):
            errors.append((snode, preserved["e"], preserved["r"]))

    # Sentences that failed the check are left out of the export.
    skip = {snode for (snode, _, _) in errors}

    nErrors = len(errors)
    if nErrors:
        TF.error(f"{nErrors} mismatches:")
        # Show at most the first 20 offenders.
        for (snode, embeddingOk, restructuredOk) in errors[:20]:
            TF.error(
                (f"{snode} embedding: {'OK' if embeddingOk else 'XX'};"
                 f" restructd: {'OK' if restructuredOk else 'XX'}"),
                tm=False,
            )
    else:
        TF.info(f"{nErrors} mismatches")

    TF.info(f"Exporting {rootType} trees to TF")
    chunk = 10000  # progress message after every `chunk` trees
    composed = 0
    treeData = {}
    treeDataN = {}
    for node in F.otype.s(rootType):
        if node in skip:
            continue
        # writeTree yields a triple; only the tree string is stored.
        (treeRep, _wordsRep, _bSlot) = tree.writeTree(
            node, "r", getTag, rev=False, leafNumbers=True
        )
        (treeNRep, _wordsNRep, _bSlotN) = tree.writeTree(
            node, "r", getTagN, rev=False, leafNumbers=True
        )
        treeData[node] = treeRep
        treeDataN[node] = treeNRep
        composed += 1
        if composed % chunk == 0:
            TF.info(f"{composed} trees composed")
    TF.info(f"{composed} trees composed")

    def featureMeta(description):
        """Metadata for one tree feature; only the description differs."""
        return dict(
            valueType="str",
            description=description,
            converter="Dirk Roorda",
            convertor="trees.ipynb",
            url="https://github.com/etcbc/trees/trees.ipynb",
            coreData="BHSA",
            coreVersion=version,
        )

    nodeFeatures = dict(tree=treeData, treen=treeDataN)
    # The description strings are reproduced exactly as they were, since they
    # end up in the saved feature files.
    metaData = dict(
        tree=featureMeta("penn treebank represententation for sentences"),
        treen=featureMeta(
            "penn treebank represententation for sentences with node numbers included"
        ),
    )
    TF.info("Writing tree feature to TF")
    TFw = Fabric(locations=tfDir, silent=True)
    TFw.save(nodeFeatures=nodeFeatures, edgeFeatures={}, metaData=metaData)
Esempio n. 3
0
def generateTf(bookAcro, content):
    """Turn the parsed corpus into Text-Fabric features and save them.

    The books in ``allAcrosSeq`` are walked in order.  For every book,
    chapter, verse and word, feature values are recorded under provisional
    keys ``(nodeType, sequenceNumber)``, and every word is registered as a
    slot of the book, chapter and verse it sits in.  Afterwards the
    provisional keys are mapped to plain node numbers (words first, then the
    other node types), ``otype`` and ``oslots`` are added, metadata is
    assembled and the dataset is saved in ``TF_PATH``, which is emptied first.

    Parameters
    ----------
    bookAcro: mapping from book acronym to a dict with key ``bookName`` and,
        optionally, ``witness``.
    content: nested mapping: book name, then chapter number, then verse
        number, leading to the verse text as a string.
    """
    # Always write into a fresh, empty output directory.
    if os.path.exists(TF_PATH):
        rmtree(TF_PATH)
    os.makedirs(TF_PATH)

    print("Slicing content into features")

    counts = collections.Counter()  # nodes created so far, per node type
    slotCount = 0  # words (slots) created so far
    openNodes = []  # the book / chapter / verse currently being filled
    nodeFeatures = collections.defaultdict(dict)
    edgeFeatures = collections.defaultdict(lambda: collections.defaultdict(set))
    slotsOf = collections.defaultdict(set)

    for acro in allAcrosSeq:
        bookInfo = bookAcro[acro]
        bookName = bookInfo["bookName"]
        witness = bookInfo.get("witness", None)
        chapters = content[bookName]

        counts["book"] += 1
        bookKey = ("book", counts["book"])
        nodeFeatures["book"][bookKey] = acro
        nodeFeatures["book@en"][bookKey] = bookName
        if witness is not None:
            nodeFeatures["witness"][bookKey] = witness
        openNodes.append(bookKey)

        for chapterNum in chapters:
            verses = chapters[chapterNum]

            counts["chapter"] += 1
            chapterKey = ("chapter", counts["chapter"])
            nodeFeatures["chapter"][chapterKey] = chapterNum
            nodeFeatures["book"][chapterKey] = acro
            if witness is not None:
                nodeFeatures["witness"][chapterKey] = witness
            openNodes.append(chapterKey)

            for verseNum in verses:
                words = verses[verseNum].strip().split()

                counts["verse"] += 1
                verseKey = ("verse", counts["verse"])
                nodeFeatures["verse"][verseKey] = verseNum
                nodeFeatures["chapter"][verseKey] = chapterNum
                nodeFeatures["book"][verseKey] = acro
                if witness is not None:
                    nodeFeatures["witness"][verseKey] = witness
                openNodes.append(verseKey)

                for elem in splitWords(words):
                    if len(elem) != 2:
                        # Not a (word, punctuation) pair: report and leave out.
                        print(bookName, chapterNum, verseNum, words)
                        continue
                    (word, punc) = elem
                    wSyc = TR.to_syriac(word)
                    pSyc = TR.to_syriac(punc)

                    slotCount += 1
                    wordKey = ("word", slotCount)
                    nodeFeatures["word_etcbc"][wordKey] = word
                    nodeFeatures["word"][wordKey] = wSyc
                    nodeFeatures["trailer_etcbc"][wordKey] = punc
                    nodeFeatures["trailer"][wordKey] = pSyc
                    # The word is a slot of every node that is open right now.
                    for openKey in openNodes:
                        slotsOf[openKey].add(slotCount)
                openNodes.pop()  # verse done
            openNodes.pop()  # chapter done
        openNodes.pop()  # book done

    # Sanity check: everything that was opened should be closed again.
    if openNodes:
        print("Context:", openNodes)

    # Node types ordered by increasing count, ties broken by name.  The same
    # order is used for the report and for handing out node numbers.
    typesByCount = sorted(counts.items(), key=operator.itemgetter(1, 0))

    print(f"\n{slotCount:>7} x slot")
    for (nodeType, amount) in typesByCount:
        print(f"{amount:>7} x {nodeType}")

    nValues = sum(len(values) for values in nodeFeatures.values())
    print(f"{len(nodeFeatures)} node features with {nValues} values")
    print(f"{len(slotsOf)} nodes linked to slots")

    print("Compiling TF data")
    print("Building warp feature otype")

    # Words occupy node numbers 1..slotCount; each further node type gets the
    # next consecutive range.  nodeOffset[t] + sequenceNumber = node number.
    nodeOffset = {"word": 0}
    oType = dict.fromkeys(range(1, slotCount + 1), "word")
    firstFree = slotCount + 1
    for (nodeType, amount) in typesByCount:
        nodeOffset[nodeType] = firstFree - 1
        for nodeNumber in range(firstFree, firstFree + amount):
            oType[nodeNumber] = nodeType
        firstFree += amount
    print(f"{len(oType)} nodes")

    print("Filling in the nodes for features")

    # Replace the provisional (nodeType, seq) keys by real node numbers.
    numberedFeatures = collections.defaultdict(dict)
    for (ft, featureData) in nodeFeatures.items():
        numberedFeatures[ft] = {
            nodeOffset[nodeType] + seq: value
            for ((nodeType, seq), value) in featureData.items()
        }
    numberedOslots = {
        nodeOffset[nodeType] + seq: slots
        for ((nodeType, seq), slots) in slotsOf.items()
    }

    nodeFeatures = numberedFeatures
    nodeFeatures["otype"] = oType
    edgeFeatures["oslots"] = numberedOslots

    print(f'Node features: {" ".join(nodeFeatures)}')
    print(f'Edge features: {" ".join(edgeFeatures)}')

    # Note: metaData["book@en"] is the very object langMetaData["en"], so the
    # loop below adds valueType and description to that object as well.
    metaData = {
        "": commonMetaData,
        "otext": oText,
        "oslots": dict(valueType="str"),
        "book@en": langMetaData["en"],
    }
    for ft in set(nodeFeatures) | set(edgeFeatures):
        featureMeta = metaData.setdefault(ft, {})
        featureMeta["valueType"] = "int" if ft in numFeatures else "str"
        featureMeta["description"] = specificMetaData.get(ft, "?")

    writer = Fabric(locations=TF_PATH, silent=True)
    writer.save(
        nodeFeatures=nodeFeatures,
        edgeFeatures=edgeFeatures,
        metaData=metaData,
    )
Esempio n. 4
0
# Feature data, keyed by feature name.
nodeFeatures = dict(otype=otype, name=name)
edgeFeatures = dict(oslots=oslots)

# Metadata: general information under the empty key, followed by one entry
# per feature that declares its value type.
# NOTE(review): 'sign' gets metadata here although neither nodeFeatures nor
# edgeFeatures above contains a feature of that name - confirm this is meant.
metaData = {'': {'name': 'testset'}}
metaData.update(
    (feature, {'valueType': 'str'})
    for feature in ('otype', 'oslots', 'sign', 'name')
)

# SAVE THE CORPUS AS TF

TF.save(
    nodeFeatures=nodeFeatures,
    edgeFeatures=edgeFeatures,
    metaData=metaData,
)
Esempio n. 5
0
# Fixed metadata: general information (empty key), the text configuration
# (sections and text formats) and the English book name feature.
metaData = {
    '': {'createdBy': 'Ernst Boogert and Dirk Roorda'},
    'otext': {
        'sectionFeatures': 'book,chapter,verse',
        'sectionTypes': 'book,chapter,verse',
        'fmt:text-orig-full': '{orig} ',
        'fmt:text-orig-main': '{main} ',
        'fmt:text-orig-plain': '{plain} ',
    },
    'book@en': dict(
        valueType='str',
        language='English',
        languageCode='en',
        languageEnglish='english',
    ),
}

# The result of this expression is not stored or printed here.
sorted(data.nodeFeatures.keys())

# Every feature gets a value type: 'int' when it is listed in numberFeatures,
# 'str' otherwise.  Entries that exist already are extended, not replaced.
for nf in data.nodeFeatures:
    metaData.setdefault(nf, {}).update(
        valueType='int' if nf in numberFeatures else 'str')
for ef in data.edgeFeatures:
    metaData.setdefault(ef, {}).update(
        valueType='int' if ef in numberFeatures else 'str')

TF.save(
    nodeFeatures=data.nodeFeatures,
    edgeFeatures=data.edgeFeatures,
    metaData=metaData,
)
Esempio n. 6
0
        "description":
        "similarity between words, as a percentage of the common material wrt the combined material",
    },
}

# Regroup the flat mapping (from, to) -> value into the nested shape
# from -> {to: value}.
simData = {}
for ((f, t), d) in similarity.items():
    if f not in simData:
        simData[f] = {}
    simData[f][t] = d

# Destination of the feature: <GH_BASE>/<ORG>/<REPO>/sim/tf, in a module
# named after the version.
FOLDER_SIM = "sim/tf"
path = f"{ORG}/{REPO}/{FOLDER_SIM}"
location = f"{GH_BASE}/{path}"
module = VERSION

TF.save(
    edgeFeatures={"sim": simData},
    metaData=metaData,
    location=location,
    module=module,
)

# ---
# All chapters:
#
# * *use*
# * [share](share.ipynb)
# * [app](app.ipynb)
# * [repo](repo.ipynb)
# * [compose](compose.ipynb)
#
# ---
#
# CC-BY Dirk Roorda
Esempio n. 7
0
def parseCorpus():
    """Parse the SyrNT corpus and save it as a Text-Fabric dataset.

    Every item delivered by ``readCorpus()`` is one verse; its book, chapter
    and verse labels come from ``getVerseLabels()``.  Each word has the form
    ``transcription|annotations`` with the annotations separated by ``#`` and
    interpreted according to ``SyrNT.ANNOTATIONS``.  Feature values are first
    collected under provisional keys ``(nodeType, sequenceNumber)`` for the
    node types book, chapter, verse, word and lexeme; words are the slots.
    After that, real node numbers are handed out, ``otype`` and ``oslots``
    are built, metadata is assembled, and the dataset is written to
    ``TF_PATH`` after removing what was there.

    Each distinct lexeme value gets one lexeme node, and every word is linked
    to the node of its own lexeme.
    """
    annotSpecs = SyrNT.ANNOTATIONS
    cur = collections.Counter()  # nodes created so far, per node type
    curSlot = 0
    context = []  # the book and chapter (and verse) that are currently open
    nodeFeatures = collections.defaultdict(dict)
    edgeFeatures = collections.defaultdict(
        lambda: collections.defaultdict(set))
    oSlots = collections.defaultdict(set)
    (bookEn, verseLabels) = getVerseLabels()
    (prevBook, prevChapter) = (None, None)
    # lexeme value -> sequence number of its lexeme node.  A mapping (not
    # just a set of seen values) is needed so that a repeated lexeme is
    # linked to its own node rather than to the most recently created one.
    lexemes = {}
    for p in readCorpus():
        # The number of verses seen so far indexes the label of this verse.
        (book, chapter, verse) = verseLabels[cur['verse']]
        if book != prevBook:
            print(f'\t{bookEn[book]:<15} current:'
                  f' b={cur["book"]:>2}'
                  f' c={cur["chapter"]:>3}'
                  f' v={cur["verse"]:>4}'
                  f' w={curSlot:>6}')
            # Close the chapter and the book that were open, if any.
            if prevChapter is not None:
                context.pop()
                prevChapter = None
            if prevBook is not None:
                context.pop()
            cur['book'] += 1
            prevBook = book
            bookNode = ('book', cur['book'])
            nodeFeatures['book'][bookNode] = book
            nodeFeatures['book@en'][bookNode] = bookEn[book]
            context.append(bookNode)
        if chapter != prevChapter:
            if prevChapter is not None:
                context.pop()
            cur['chapter'] += 1
            prevChapter = chapter
            chapterNode = ('chapter', cur['chapter'])
            nodeFeatures['chapter'][chapterNode] = chapter
            nodeFeatures['book'][chapterNode] = book
            context.append(chapterNode)

        cur['verse'] += 1
        verseNode = ('verse', cur['verse'])
        nodeFeatures['verse'][verseNode] = verse
        nodeFeatures['chapter'][verseNode] = chapter
        nodeFeatures['book'][verseNode] = book
        (_lineNumber, line) = p
        words = line.split()
        context.append(verseNode)
        for word in words:
            curSlot += 1
            (wordTrans, annotationStr) = word.split('|', 1)
            wordSyr = wordTrans.translate(tosyr)
            wordEtcbc = TR.from_syriac(wordSyr)
            annotations = annotationStr.split('#')
            wordNode = ('word', curSlot)
            nodeFeatures['word_sedra'][wordNode] = wordTrans
            nodeFeatures['word_etcbc'][wordNode] = wordEtcbc
            nodeFeatures['word'][wordNode] = wordSyr
            for ((feature, values), data) in zip(annotSpecs, annotations):
                # Without a value list the annotation is taken literally;
                # otherwise it is an index into that list.
                value = data if values is None else values[int(data)]
                if values is None:
                    nodeFeatures[f'{feature}_sedra'][wordNode] = value
                    value = value.translate(tosyr)
                    valueEtcbc = TR.from_syriac(value)
                    nodeFeatures[f'{feature}_etcbc'][wordNode] = valueEtcbc
                nodeFeatures[feature][wordNode] = (
                    value if feature in numFeatures else
                    NA_VALUE if value in NA_VALUES else value)

            lexeme = nodeFeatures['lexeme'][wordNode]
            lexSeq = lexemes.get(lexeme)
            if lexSeq is None:
                # First occurrence: create the lexeme node and copy the
                # lexeme representations from this word.
                cur['lexeme'] += 1
                lexSeq = cur['lexeme']
                lexemes[lexeme] = lexSeq
                lexNode = ('lexeme', lexSeq)
                nodeFeatures['lexeme'][lexNode] = lexeme
                nodeFeatures['lexeme_sedra'][lexNode] = (
                    nodeFeatures['lexeme_sedra'][wordNode])
                nodeFeatures['lexeme_etcbc'][lexNode] = (
                    nodeFeatures['lexeme_etcbc'][wordNode])

            # The word is a slot of the open book, chapter and verse, and of
            # the node of its own lexeme.
            for nodeKey in context:
                oSlots[nodeKey].add(curSlot)
            oSlots[('lexeme', lexSeq)].add(curSlot)
        context.pop()  # verse done

    # Close the last chapter and book; nothing is open if the corpus was
    # empty.
    if prevChapter is not None:
        context.pop()
    if prevBook is not None:
        context.pop()

    print('')

    if SHOW:
        if LIMIT == -1:
            # Show the values of the first few nodes of each type.
            for ft in sorted(nodeFeatures):
                print(ft)
                for n in range(1, 5):
                    for ntp in ('book', 'chapter', 'verse', 'word'):
                        if (ntp, n) in nodeFeatures[ft]:
                            print(f'\t"{nodeFeatures[ft][(ntp, n)]}"')
        else:
            print(nodeFeatures)
            print(oSlots)

    # Sanity check: everything that was opened should be closed again.
    if context:
        print('Context:', context)

    # Node types ordered by increasing count, ties broken by name.  The same
    # order is used for the report and for handing out node numbers.
    typesByCount = sorted(cur.items(), key=lambda x: (x[1], x[0]))

    print(f'\n{curSlot:>7} x slot')
    for (nodeType, amount) in typesByCount:
        print(f'{amount:>7} x {nodeType}')

    nValues = sum(len(values) for values in nodeFeatures.values())
    print(f'{len(nodeFeatures)} node features with {nValues} values')
    print(f'{len(oSlots)} nodes linked to slots')

    print('Compiling TF data')
    print('Building warp feature otype')
    # Words occupy node numbers 1..curSlot; each further node type gets the
    # next consecutive range.  nodeOffset[t] + sequenceNumber = node number.
    nodeOffset = {'word': 0}
    oType = {}
    n = 1
    for k in range(n, curSlot + 1):
        oType[k] = 'word'
    n = curSlot + 1
    for (nodeType, amount) in typesByCount:
        nodeOffset[nodeType] = n - 1
        for k in range(n, n + amount):
            oType[k] = nodeType
        n = n + amount
    print(f'{len(oType)} nodes')

    print('Filling in the nodes for features')
    # Replace the provisional (nodeType, seq) keys by real node numbers.
    newNodeFeatures = collections.defaultdict(dict)
    for (ft, featureData) in nodeFeatures.items():
        newFeatureData = {}
        for ((nodeType, node), value) in featureData.items():
            newFeatureData[nodeOffset[nodeType] + node] = value
        newNodeFeatures[ft] = newFeatureData
    newOslots = {}
    for ((nodeType, node), slots) in oSlots.items():
        newOslots[nodeOffset[nodeType] + node] = slots

    nodeFeatures = newNodeFeatures
    nodeFeatures['otype'] = oType
    edgeFeatures['oslots'] = newOslots

    print(f'Node features: {" ".join(nodeFeatures)}')
    print(f'Edge features: {" ".join(edgeFeatures)}')

    metaData = {
        '': commonMetaData,
        'otext': oText,
        'oslots': dict(valueType='str'),
        'book@en': langMetaData['en'],
    }
    for ft in set(nodeFeatures) | set(edgeFeatures):
        metaData.setdefault(
            ft, {})['valueType'] = 'int' if ft in numFeatures else 'str'
        metaData[ft]['description'] = (specificMetaData[ft]
                                       if ft in specificMetaData else '?')

    print('Remove existing TF directory')
    try:
        rmtree(TF_PATH)
    except FileNotFoundError:
        # Nothing to remove, e.g. on the very first run.
        pass
    print('Save TF dataset')
    TF = Fabric(locations=TF_PATH, silent=True)
    TF.save(nodeFeatures=nodeFeatures,
            edgeFeatures=edgeFeatures,
            metaData=metaData)
Esempio n. 8
0
# In[26]:


# The two tree features, keyed by feature name.
nodeFeatures = {"tree": treeData, "treen": treeDataN}

# Both features hold string values.
for f in nodeFeatures:
    metaData[f].update(valueType="str")


# In[27]:


utils.caption(4, "Writing tree feature to TF")
TFw = Fabric(locations=thisTempTf, silent=True)
TFw.save(
    nodeFeatures=nodeFeatures,
    edgeFeatures={},
    metaData=metaData,
)


# # Diffs
# 
# Check differences with previous versions.

# In[ ]:


# Only the features produced here are compared.
utils.checkDiffs(thisTempTf, thisTf, only=set(nodeFeatures))


# # Deliver
# 
# Copy the new TF features from the temporary location where they have been created to their final destination.
Esempio n. 9
0
def generateTf(bookAcro, content):
  """Turn the parsed corpus into Text-Fabric features and save them.

  The books in ``allAcrosSeq`` are walked in order.  For every book,
  chapter, verse and word, feature values are recorded under provisional
  keys ``(nodeType, sequenceNumber)``, and every word is registered as a
  slot of the book, chapter and verse it sits in.  Afterwards the
  provisional keys are mapped to plain node numbers (words first, then the
  other node types), ``otype`` and ``oslots`` are added, metadata is
  assembled and the dataset is saved in ``TF_PATH``, which is emptied first.

  Parameters
  ----------
  bookAcro: mapping from book acronym to a dict with key ``bookName`` and,
    optionally, ``witness``.
  content: nested mapping: book name, then chapter number, then verse
    number, leading to the verse text as a string.
  """
  # Always write into a fresh, empty output directory.
  if os.path.exists(TF_PATH):
    rmtree(TF_PATH)
  os.makedirs(TF_PATH)

  print('Slicing content into features')

  counts = collections.Counter()  # nodes created so far, per node type
  slotCount = 0  # words (slots) created so far
  openNodes = []  # the book / chapter / verse currently being filled
  nodeFeatures = collections.defaultdict(dict)
  edgeFeatures = collections.defaultdict(
    lambda: collections.defaultdict(set)
  )
  slotsOf = collections.defaultdict(set)

  for acro in allAcrosSeq:
    bookInfo = bookAcro[acro]
    bookName = bookInfo['bookName']
    witness = bookInfo.get('witness', None)
    chapters = content[bookName]

    counts['book'] += 1
    bookKey = ('book', counts['book'])
    nodeFeatures['book'][bookKey] = acro
    nodeFeatures['book@en'][bookKey] = bookName
    if witness is not None:
      nodeFeatures['witness'][bookKey] = witness
    openNodes.append(bookKey)

    for chapterNum in chapters:
      verses = chapters[chapterNum]

      counts['chapter'] += 1
      chapterKey = ('chapter', counts['chapter'])
      nodeFeatures['chapter'][chapterKey] = chapterNum
      nodeFeatures['book'][chapterKey] = acro
      if witness is not None:
        nodeFeatures['witness'][chapterKey] = witness
      openNodes.append(chapterKey)

      for verseNum in verses:
        words = verses[verseNum].strip().split()

        counts['verse'] += 1
        verseKey = ('verse', counts['verse'])
        nodeFeatures['verse'][verseKey] = verseNum
        nodeFeatures['chapter'][verseKey] = chapterNum
        nodeFeatures['book'][verseKey] = acro
        if witness is not None:
          nodeFeatures['witness'][verseKey] = witness
        openNodes.append(verseKey)

        for elem in splitWords(words):
          if len(elem) != 2:
            # Not a (word, punctuation) pair: report and leave out.
            print(bookName, chapterNum, verseNum, words)
            continue
          (word, punc) = elem
          wSyc = TR.to_syriac(word)
          pSyc = TR.to_syriac(punc)

          slotCount += 1
          wordKey = ('word', slotCount)
          nodeFeatures['word_etcbc'][wordKey] = word
          nodeFeatures['word'][wordKey] = wSyc
          nodeFeatures['trailer_etcbc'][wordKey] = punc
          nodeFeatures['trailer'][wordKey] = pSyc
          # The word is a slot of every node that is open right now.
          for openKey in openNodes:
            slotsOf[openKey].add(slotCount)
        openNodes.pop()  # verse done
      openNodes.pop()  # chapter done
    openNodes.pop()  # book done

  # Sanity check: everything that was opened should be closed again.
  if openNodes:
    print('Context:', openNodes)

  # Node types ordered by increasing count, ties broken by name.  The same
  # order is used for the report and for handing out node numbers.
  typesByCount = sorted(counts.items(), key=operator.itemgetter(1, 0))

  print(f'\n{slotCount:>7} x slot')
  for (nodeType, amount) in typesByCount:
    print(f'{amount:>7} x {nodeType}')

  nValues = sum(len(values) for values in nodeFeatures.values())
  print(f'{len(nodeFeatures)} node features with {nValues} values')
  print(f'{len(slotsOf)} nodes linked to slots')

  print('Compiling TF data')
  print('Building warp feature otype')

  # Words occupy node numbers 1..slotCount; each further node type gets the
  # next consecutive range.  nodeOffset[t] + sequenceNumber = node number.
  nodeOffset = {'word': 0}
  oType = dict.fromkeys(range(1, slotCount + 1), 'word')
  firstFree = slotCount + 1
  for (nodeType, amount) in typesByCount:
    nodeOffset[nodeType] = firstFree - 1
    for nodeNumber in range(firstFree, firstFree + amount):
      oType[nodeNumber] = nodeType
    firstFree += amount
  print(f'{len(oType)} nodes')

  print('Filling in the nodes for features')

  # Replace the provisional (nodeType, seq) keys by real node numbers.
  numberedFeatures = collections.defaultdict(dict)
  for (ft, featureData) in nodeFeatures.items():
    numberedFeatures[ft] = {
      nodeOffset[nodeType] + seq: value
      for ((nodeType, seq), value) in featureData.items()
    }
  numberedOslots = {
    nodeOffset[nodeType] + seq: slots
    for ((nodeType, seq), slots) in slotsOf.items()
  }

  nodeFeatures = numberedFeatures
  nodeFeatures['otype'] = oType
  edgeFeatures['oslots'] = numberedOslots

  print(f'Node features: {" ".join(nodeFeatures)}')
  print(f'Edge features: {" ".join(edgeFeatures)}')

  # Note: metaData['book@en'] is the very object langMetaData['en'], so the
  # loop below adds valueType and description to that object as well.
  metaData = {
    '': commonMetaData,
    'otext': oText,
    'oslots': dict(valueType='str'),
    'book@en': langMetaData['en'],
  }
  for ft in set(nodeFeatures) | set(edgeFeatures):
    featureMeta = metaData.setdefault(ft, {})
    featureMeta['valueType'] = 'int' if ft in numFeatures else 'str'
    featureMeta['description'] = specificMetaData.get(ft, '?')

  writer = Fabric(locations=TF_PATH, silent=True)
  writer.save(
    nodeFeatures=nodeFeatures,
    edgeFeatures=edgeFeatures,
    metaData=metaData,
  )