Exemple #1
0
 def test_RealPred_str(self):
     """
     Converting a RealPred to a string with str() should give back
     the string it was parsed from, leading underscore included
     """
     for pred_string in ('_the_q', '_cat_n_1'):
         parsed = RealPred.from_string(pred_string)
         self.assertEqual(str(parsed), pred_string)
Exemple #2
0
 def test_RealPred_str(self):
     """
     Converting a RealPred to a string with str() should give back
     the string it was parsed from, leading underscore included
     """
     for pred_string in ('_the_q', '_cat_n_1'):
         parsed = RealPred.from_string(pred_string)
         self.assertEqual(str(parsed), pred_string)
Exemple #3
0
from scipy.special import expit
from numpy import outer, zeros_like, zeros, array
from math import log

from pydmrs.components import RealPred
from utils import make_shared, is_verb

# Width of a full predicate vector (D) and a second constant (C)
D = 800
C = 40

# Each word2vec matrix fills one half of a predicate vector
half = D // 2

# Load the vocabulary and index it, both by string and by parsed RealPred
# NOTE(review): pickle is not among the imports visible here - confirm it is imported
with open('/anfs/bigdisc/gete2/wikiwoods/core-5-vocab.pkl', 'rb') as f:
    preds = pickle.load(f)
ind = dict(zip(preds, range(len(preds))))
pred_index = {RealPred.from_string(name): position
              for name, position in ind.items()}

# One row per predicate: the noun matrix is written at offset 0,
# the verb matrix at offset half
pred_wei = make_shared(zeros((len(preds), D)))
for filename, offset in (
        ('/anfs/bigdisc/gete2/wikiwoods/word2vec/matrix_nouns400', 0),
        ('/anfs/bigdisc/gete2/wikiwoods/word2vec/matrix_verbs400', half),
):
    with open(filename) as f:
        for line in f:
            # Each line holds a predicate, then its whitespace-separated vector entries
            pred, vecstr = line.strip().split(None, 1)
            vec = array(vecstr.split())
            pred_wei[ind[pred], offset:offset + half] = vec
# Make vectors longer (av. sum 1.138 over av. 44.9 nonzero entries)
# An average entry is then 0.2, so a predicate is expit(0.2*30 - 3) = 0.95 true
pred_wei *= 8

DATA = '/anfs/bigdisc/gete2/wikiwoods/core-5'
Exemple #4
0
 def test_RealPred_from_string(self):
     """
     RealPred.from_string should build the same RealPred with or without
     a trailing '_rel', and should reject malformed input
     """
     valid_cases = (
         # Two slots
         ('_the_q_rel', '_the_q', ('the', 'q')),
         # Three slots
         ('_cat_n_1_rel', '_cat_n_1', ('cat', 'n', '1')),
         # Intermediate underscores in lemma
         ('_nowhere_near_x_deg_rel', '_nowhere_near_x_deg', ('nowhere_near', 'x', 'deg')),
     )
     for with_rel, without_rel, slots in valid_cases:
         parsed_rel = RealPred.from_string(with_rel)
         parsed_plain = RealPred.from_string(without_rel)
         self.assertEqual(RealPred(*slots), parsed_rel)
         self.assertEqual(RealPred(*slots), parsed_plain)
         self.assertIsInstance(parsed_rel, RealPred)
         self.assertIsInstance(parsed_plain, RealPred)
     # Too few slots, no leading underscore, or not a string
     for malformed in ("_the_rel", "_the", "udef_q_rel"):
         self.assertRaises(ValueError, RealPred.from_string, malformed)
     self.assertRaises(TypeError, RealPred.from_string, 1)
Exemple #5
0
 def test_RealPred_from_string(self):
     """
     RealPred.from_string should build the same RealPred with or without
     a trailing '_rel', and should reject malformed input
     """
     valid_cases = (
         # Two slots
         ('_the_q_rel', '_the_q', ('the', 'q')),
         # Three slots
         ('_cat_n_1_rel', '_cat_n_1', ('cat', 'n', '1')),
         # Intermediate underscores in lemma
         ('_nowhere_near_x_deg_rel', '_nowhere_near_x_deg', ('nowhere_near', 'x', 'deg')),
     )
     for with_rel, without_rel, slots in valid_cases:
         parsed_rel = RealPred.from_string(with_rel)
         parsed_plain = RealPred.from_string(without_rel)
         self.assertEqual(RealPred(*slots), parsed_rel)
         self.assertEqual(RealPred(*slots), parsed_plain)
         self.assertIsInstance(parsed_rel, RealPred)
         self.assertIsInstance(parsed_plain, RealPred)
     # Too few slots, no leading underscore, or not a string
     for malformed in ("_the_rel", "_the", "udef_q_rel"):
         self.assertRaises(ValueError, RealPred.from_string, malformed)
     self.assertRaises(TypeError, RealPred.from_string, 1)
Exemple #6
0
def loads_xml(bytestring, encoding=None, cls=ListDmrs, **kwargs):
    """
    Build a DMRS graph from the XML of a single "<dmrs>...</dmrs>" element
    (a "<dmrslist>...</dmrslist>" wrapper is not handled yet)

    bytestring: the XML as bytes; if encoding is given, a string is expected
        instead, and is encoded with that encoding before being parsed
    encoding: encoding to apply to a string argument (default None)
    cls: type of graph to build (default ListDmrs); it is called as
        cls(**kwargs) and its Node attribute is used to build the nodes
    kwargs: passed on to cls

    The top-level attributes cfrom, cto, ident and index are read as integers
    when present, and surface is kept as a string.
    A link whose 'from' is 0 is not added as a link: its 'to' identifies
    the top node.

    Returns the populated graph
    Raises PydmrsValueError for an unexpected tag, either directly under the
        top-level element or inside a node or link
    Warns with PydmrsWarning when a realpred is given as one whole string,
        or when a RealPred string is found in a <gpred> tag
    """
    def optional_int(element, attribute):
        # Integer value of an attribute, or None if the attribute is absent
        if attribute in element.attrib:
            return int(element.get(attribute))
        return None

    if encoding:
        bytestring = bytestring.encode(encoding)
    root = ET.XML(bytestring)

    dmrs = cls(**kwargs)

    # Graph-level attributes
    dmrs.cfrom = optional_int(root, 'cfrom')
    dmrs.cto = optional_int(root, 'cto')
    dmrs.surface = root.get('surface')
    dmrs.ident = optional_int(root, 'ident')
    index_id = optional_int(root, 'index')
    top_id = None

    for child in root:
        if child.tag not in ('node', 'link'):
            raise PydmrsValueError(child.tag)

        if child.tag == 'node':
            nodeid = optional_int(child, 'nodeid')
            cfrom = optional_int(child, 'cfrom')
            cto = optional_int(child, 'cto')
            surface = child.get('surface')
            base = child.get('base')
            carg = child.get('carg')

            pred = None
            sortinfo = None
            for part in child:
                tag = part.tag
                if tag == 'realpred':
                    try:
                        pred = RealPred(part.get('lemma'), part.get('pos'), part.get('sense'))
                    except PydmrsValueError:
                        # The whole pred name may be under 'lemma', rather than split between 'lemma', 'pos', 'sense'
                        pred = RealPred.from_string(part.get('lemma'))
                        warn("RealPred given as string rather than lemma, pos, sense", PydmrsWarning)
                elif tag == 'gpred':
                    try:
                        pred = GPred.from_string(part.text)
                    except PydmrsValueError:
                        # The string may actually be for a RealPred, not a GPred
                        pred = RealPred.from_string(part.text)
                        warn("RealPred string found in a <gpred> tag", PydmrsWarning)
                elif tag == 'sortinfo':
                    sortinfo = part.attrib
                else:
                    raise PydmrsValueError(tag)

            new_node = cls.Node(nodeid=nodeid, pred=pred, carg=carg, sortinfo=sortinfo,
                                cfrom=cfrom, cto=cto, surface=surface, base=base)
            dmrs.add_node(new_node)
            continue

        # Otherwise the child is a link
        start = int(child.get('from'))
        end = int(child.get('to'))

        if start == 0:
            # A link from 0 only records which node is the top
            top_id = end
            continue

        labels = {'rargname': None, 'post': None}
        for part in child:
            if part.tag not in labels:
                raise PydmrsValueError(part.tag)
            labels[part.tag] = part.text
        dmrs.add_link(Link(start, end, labels['rargname'], labels['post']))

    # Look up top and index only after all nodes have been added
    if top_id:
        dmrs.top = dmrs[top_id]
    if index_id:
        dmrs.index = dmrs[index_id]

    return dmrs