Esempio n. 1
0
def genTrees(version):
    C = setVersion(version)
    bhsa = C.bhsa
    sp = C.sp
    rela = C.rela
    ptyp = C.ptyp
    ctyp = C.ctyp
    g_word_utf8 = C.g_word_utf8
    tfDir = C.tfDir

    TF = Fabric(locations=f"{GH}/{ORG}", modules=bhsa)
    api = TF.load(f"{sp} {rela} {ptyp} {ctyp} {g_word_utf8} mother")

    E = api.E
    F = api.F
    Fs = api.Fs

    def getTag(node):
        otype = F.otype.v(node)
        tag = TYPE_TABLE[otype]
        if tag == "P":
            tag = Fs(ptyp).v(node)
        elif tag == "C":
            tag = ccrTable[Fs(rela).v(node)]
        isWord = tag == ""
        pos = POS_TABLE[Fs(sp).v(node)] if isWord else None
        slot = node if isWord else None
        text = f'"{Fs(g_word_utf8).v(node)}"' if isWord else None
        return (tag, pos, slot, text, isWord)

    def getTagN(node):
        otype = F.otype.v(node)
        tag = TYPE_TABLE[otype]
        if tag == "P":
            tag = Fs(ptyp).v(node)
        elif tag == "C":
            tag = ccrTable[Fs(rela).v(node)]
        isWord = tag == ""
        if not isWord:
            tag += "{" + str(node) + "}"
        pos = POS_TABLE[Fs(sp).v(node)] if isWord else None
        slot = node if isWord else None
        text = f'"{Fs(g_word_utf8).v(node)}"' if isWord else None
        return (tag, pos, slot, text, isWord)

    treeTypes = ("sentence", "clause", "phrase", "subphrase", "word")
    (rootType, leafType, clauseType, phraseType) = (
        treeTypes[0],
        treeTypes[-1],
        treeTypes[1],
        treeTypes[2],
    )
    ccrTable = dict((c[0], c[1][1]) for c in CCR_INFO.items())
    ccrClass = dict((c[0], c[1][0]) for c in CCR_INFO.items())

    tree = Tree(
        TF,
        otypes=treeTypes,
        phraseType=phraseType,
        clauseType=clauseType,
        ccrFeature=rela,
        ptFeature=ptyp,
        posFeature=sp,
        motherFeature="mother",
    )

    tree.restructureClauses(ccrClass)
    results = tree.relations()
    TF.info("Ready for processing")

    skip = set()
    TF.info("Verifying whether all slots are preserved under restructuring")
    TF.info(f"Expected mismatches: {EXPECTED_MISMATCHES.get(version, '??')}")

    errors = []
    # i = 10
    for snode in F.otype.s(rootType):
        declaredSlots = set(E.oslots.s(snode))
        results = {}
        thisgood = {}
        for kind in ("e", "r"):
            results[kind] = set(lt for lt in tree.getLeaves(snode, kind)
                                if F.otype.v(lt) == leafType)
            thisgood[kind] = declaredSlots == results[kind]
            # if not thisgood[kind]:
            #    print(f"{kind} D={declaredSlots}\n  L={results[kind]}")
            #    i -= 1
        # if i == 0: break
        if False in thisgood.values():
            errors.append((snode, thisgood["e"], thisgood["r"]))
    nErrors = len(errors)
    if nErrors:
        TF.error(f"{len(errors)} mismatches:")
        mine = min(20, len(errors))
        skip |= {e[0] for e in errors}
        for (s, e, r) in errors[0:mine]:
            TF.error(
                (f"{s} embedding: {'OK' if e else 'XX'};"
                 f" restructd: {'OK' if r else 'XX'}"),
                tm=False,
            )
    else:
        TF.info(f"{len(errors)} mismatches")

    TF.info(f"Exporting {rootType} trees to TF")
    s = 0
    chunk = 10000
    sc = 0
    treeData = {}
    treeDataN = {}
    for node in F.otype.s(rootType):
        if node in skip:
            continue
        (treeRep, wordsRep, bSlot) = tree.writeTree(node,
                                                    "r",
                                                    getTag,
                                                    rev=False,
                                                    leafNumbers=True)
        (treeNRep, wordsNRep, bSlotN) = tree.writeTree(node,
                                                       "r",
                                                       getTagN,
                                                       rev=False,
                                                       leafNumbers=True)
        treeData[node] = treeRep
        treeDataN[node] = treeNRep
        s += 1
        sc += 1
        if sc == chunk:
            TF.info(f"{s} trees composed")
            sc = 0
    TF.info(f"{s} trees composed")

    nodeFeatures = dict(tree=treeData, treen=treeDataN)
    metaData = dict(
        tree=dict(
            valueType="str",
            description="penn treebank represententation for sentences",
            converter="Dirk Roorda",
            convertor="trees.ipynb",
            url="https://github.com/etcbc/trees/trees.ipynb",
            coreData="BHSA",
            coreVersion=version,
        ),
        treen=dict(
            valueType="str",
            description=
            "penn treebank represententation for sentences with node numbers included",
            converter="Dirk Roorda",
            convertor="trees.ipynb",
            url="https://github.com/etcbc/trees/trees.ipynb",
            coreData="BHSA",
            coreVersion=version,
        ),
    )
    TF.info("Writing tree feature to TF")
    TFw = Fabric(locations=tfDir, silent=True)
    TFw.save(nodeFeatures=nodeFeatures, edgeFeatures={}, metaData=metaData)
Esempio n. 2
0
#    when we take the mother relationship into account.

# In[14]:


# 1
expectedSentences = {
    "3": 71354,
    "4": 66045,
    "4b": 63586,
    "2016": 63570,
    "2017": 63711,
}
TF.info(
    "Counting {}s ... (expecting {})".format(
        rootType, expectedSentences.get(VERSION, "??")
    )
)
TF.info("There are {} {}s".format(len(list(F.otype.s(rootType))), rootType))


# In[15]:


# 2
TF.info("Checking parents of {}s ... (expecting none)".format(rootType))
exceptions = set()
for node in F.otype.s(rootType):
    if node in parent:
        exceptions.add(node)
if len(exceptions) == 0: