コード例 #1
0
ファイル: adjsAndAdverbs.py プロジェクト: christianbuck/nlu
def main(sentenceId, jsonFile, tokens, ww, wTags, depParse, inAMR, alignment, completed):
    """Attach modifier-like dependents (adjectives, adverbs, numbers, determiners) to their heads.

    Every dependency edge that is not yet marked in ``completed[1]`` and whose
    relation is one of ``amod``, ``advmod``, ``dep``, ``num``, ``number`` or
    ``det`` is turned into one triple ``(head variable, role, dependent variable)``:

    * ``quant`` for ``num`` / ``number`` relations,
    * ``age`` when the dependent's concept contains an ``AGE`` component
      (the ``-AGE`` marker is then removed from the concept name),
    * ``mod`` otherwise.

    The articles "the", "a" and "an" are marked as completed without adding
    anything to the AMR.  Variables missing from ``alignment`` are created with
    ``new_concept_from_token``.  Finally every concept name is passed through
    ``simplify_adv``.

    Returns the tuple ``(depParse, amr, alignment, completed)``; ``completed``
    is updated in place and ``amr`` is the graph returned by the last
    ``new_amr_from_old`` call (or ``inAMR`` if no triple was added).
    """
    graph = inAMR
    for edges in depParse:
        if edges is None:
            continue
        for edge in edges:
            edge_key = (edge['gov_idx'], edge['dep_idx'])
            if completed[1][edge_key]:
                continue
            head_idx, mod_idx = edge_key
            relation = edge['rel']
            if relation not in ('amod', 'advmod', 'dep', 'num', 'number', 'det'):
                continue

            if relation == 'det' and edge['dep'].lower() in ('the', 'a', 'an'):
                # Articles contribute nothing: mark the token and the edge as done.
                completed[0][mod_idx] = True
                completed[1][edge_key] = True
                continue

            # Variable aligned to the head token; 0 counts as a valid variable.
            head_var = alignment[:head_idx]
            if not (head_var or head_var == 0):
                assert not completed[0][head_idx], (depParse[head_idx], graph)
                head_var = new_concept_from_token(graph, alignment, head_idx, depParse, wTags)
                completed[0][head_idx] = True

            # Variable aligned to the modifier token.
            mod_var = alignment[:mod_idx]
            if not (mod_var or mod_var == 0):
                mod_var = new_concept_from_token(graph, alignment, mod_idx, depParse, wTags)
                completed[0][mod_idx] = True

            head_name = str(head_var)
            mod_name = str(mod_var)
            if relation in ('num', 'number'):
                # TODO: for plain values, don't create a variable
                role = 'quant'
            elif 'AGE' in graph.get_concept(mod_name).split('-'):
                role = 'age'
                graph.node_to_concepts[mod_name] = graph.node_to_concepts[mod_name].replace('-AGE', '')
            else:
                role = 'mod'

            graph = new_amr_from_old(graph, new_triples=[(head_name, role, mod_name)])
            completed[1][edge_key] = True

    # Simplify adverbs to adjectives based on the lexicon.
    for var in graph.node_to_concepts.keys():
        graph.node_to_concepts[var] = simplify_adv(graph.node_to_concepts[var])

    return depParse, graph, alignment, completed
コード例 #2
0
ファイル: auxes.py プロジェクト: christianbuck/nlu
def main(sentenceId, jsonFile, tokens, ww, wTags, depParse, inAMR, alignment, completed):
    """Turn modal auxiliaries into AMR predicates that take the main verb as argument.

    Only dependency edges with relation ``aux`` or ``auxpass`` that are not yet
    marked in ``completed[1]`` are considered.  An auxiliary whose part of
    speech is not ``MD`` is simply marked as completed.  For a modal, the
    predicate is looked up in ``MODALS`` by the auxiliary's word form, and the
    triple ``(modal variable, ACTION_ARG[predicate], head variable)`` is added
    to the AMR; missing variables are created with ``new_concept_from_token``.

    Returns the tuple ``(depParse, amr, alignment, completed)``; ``completed``
    is updated in place.
    """
    graph = inAMR
    for edges in depParse:
        if edges is None:
            continue
        for edge in edges:
            edge_key = (edge['gov_idx'], edge['dep_idx'])
            if completed[1][edge_key]:
                continue
            if edge['rel'] not in ('aux', 'auxpass'):
                continue

            aux_idx = edge_key[1]
            if wTags[aux_idx]["PartOfSpeech"] != 'MD':
                # BE or HAVE auxiliary: nothing to add, just mark it as handled.
                completed[0][aux_idx] = True
                completed[1][edge_key] = True
                continue

            modal_pred = MODALS[edge["dep"]]

            # Variable for the modal itself; 0 counts as a valid variable.
            modal_var = alignment[:aux_idx]
            if not (modal_var or modal_var == 0):
                assert not completed[0][aux_idx]
                modal_var = new_concept_from_token(
                    graph, alignment, aux_idx, depParse, wTags,
                    concept=pipeline.token2concept(modal_pred))
                completed[0][aux_idx] = True

            # Variable for the auxiliary's head (the governed predicate).
            head_idx = edge["gov_idx"]
            head_var = alignment[:head_idx]
            if not (head_var or head_var == 0):
                head_var = new_concept_from_token(graph, alignment, head_idx, depParse, wTags)
                completed[0][head_idx] = True

            modal_triple = (str(modal_var), ACTION_ARG[modal_pred], str(head_var))
            graph = new_amr_from_old(graph, new_triples=[modal_triple])

            completed[1][edge_key] = True

    return depParse, graph, alignment, completed
コード例 #3
0
ファイル: timex.py プロジェクト: christianbuck/nlu
def main(sentenceId, jsonFile, tokens, ww, wTags, depParse, inAMR, alignment, completed):
    """Add concepts and triples for the time expressions of a sentence.

    Time expressions are loaded with ``pipeline.loadTimex(jsonFile)`` as
    ``(tid, start, end, raw_timex)`` tuples, where ``raw_timex`` is XML parsed
    into a ``Timex3Entity``.  For each expression:

    * a main concept is created for the head token of the span
      ``start..end`` (inclusive);
    * if the entity has a wrapper, the main concept is unlinked from the head
      token and a wrapper concept (suffixed with ``-<type>``) is aligned there
      instead, pointing to the main concept via ``op1``; otherwise the
      ``-<type>`` suffix is appended to the main concept itself;
    * each entry of ``t.date_entity`` becomes a triple on the main concept;
      string values get their own concept, other values are kept as literals;
    * every token in the span, and every dependency edge with both ends in the
      span, is marked as completed.

    A warning is printed to stderr (when ``config.verbose`` or ``config.warn``
    is set) if the expression has no main concept, or is a ``date-entity`` /
    ``temporal-quantity`` that produced no new triple.

    NOTE(review): relies on ``iteritems`` and ``basestring``, i.e. Python 2.

    Returns the tuple ``(depParse, amr, alignment, completed)``; ``completed``
    is updated in place and all triples are added to the AMR in one step.
    """
    amr = inAMR
    new_triples = set()
    nNewTrip = 0    # size of new_triples after the previously processed expression

    time_expressions = pipeline.loadTimex(jsonFile)
    for tid, start, end, raw_timex in time_expressions:
        t = Timex3Entity(ElementTree.fromstring(raw_timex))
        h = choose_head(range(start,end+1), depParse)

        mc = new_concept_from_token(amr, alignment, h, depParse, wTags, concept=pipeline.token2concept(t.main_concept))

        if t.wrapper is not None:
            # The wrapper concept, not the main concept, is aligned to the head token.
            alignment.unlink(mc, h)
            wc = new_concept_from_token(amr, alignment, h, depParse, wTags, concept=pipeline.token2concept(t.wrapper)+'-'+t.type)
            new_triples.add((str(wc), 'op1', str(mc)))
        else:
            amr.node_to_concepts[str(mc)] += '-'+t.type

        if 'weekday' in t.date_entity:
            wd = int(t.date_entity['weekday'])
            wd_name = weekdays[wd] # e.g. 'friday'
            x = new_concept(pipeline.token2concept(wd_name), amr)
            new_triples.add((str(mc), 'weekday', str(x)))
        if 'dayperiod' in t.date_entity:
            dp = t.date_entity['dayperiod']
            dp_name = dayperiods[dp]    # e.g. 'afternoon'
            x = new_concept(pipeline.token2concept(dp_name), amr)
            new_triples.add((str(mc), 'dayperiod', str(x)))

        for k, v in t.date_entity.iteritems():
            if k in ['weekday','dayperiod']: continue   # handled above
            if isinstance(v,basestring):
                v = pipeline.token2concept(str(v))
                x = new_concept(v, amr)
                x = str(x)
            else:   # leave literal numeric values alone
                x = v
            new_triples.add((str(mc), k, x))

        for i in range(start, end+1): # for now mark everything as completed
            completed[0][i] = True
        for i,j in completed[1]:
            # only values of existing keys are overwritten, so the dict size is unchanged
            if i >= start and i <= end and j >= start and j <= end:
                completed[1][(i,j)] = True

        # Explicit check rather than assert/except, so the warning is still
        # emitted when assertions are disabled (python -O).
        handled = t.main_concept and (t.main_concept not in ['date-entity','temporal-quantity'] or len(new_triples)>nNewTrip)
        if not handled:
            if config.verbose or config.warn: print('Warning: Unhandled time expression', file=sys.stderr)
        nNewTrip = len(new_triples)

    amr = new_amr_from_old(amr, new_triples=list(new_triples))

    # TODO: mark all internal dependencies as completed?
    return depParse, amr, alignment, completed
コード例 #4
0
ファイル: nprop.py プロジェクト: christianbuck/nlu
def main(sentenceId, jsonFile, tokens, ww, wTags, depParse, inAMR, alignment, completed):
    """Add nominal propositions loaded via ``pipeline.loadNProp`` to the AMR.

    Works in two passes over the propositions:

    1. Predicates.  Unless ``config.fullNombank`` is set, propositions whose
       roleset has no ``verbalize.nompred2verbpred`` mapping are skipped.  For
       each remaining proposition a separate predicate concept (roleset with
       ``.`` replaced by ``-n-``) is created and recorded in a local
       head-index map rather than in ``alignment``; the plain noun variable of
       the head token is linked to it with a ``-PRED`` triple.
    2. Arguments.  For every ``ARG*`` argument with a usable span, the span's
       head is chosen, the role is normalised with ``common_arg`` and a triple
       from the predicate variable to the argument is recorded.  The head
       token, and any dependency edge between predicate head and argument
       head (in either direction), is marked as completed.

    Returns the tuple ``(depParse, amr, alignment, completed)``; ``completed``
    is updated in place and all triples are added to the AMR in one step.
    """
    amr = inAMR
    pending = set()     # triples to add to the AMR at the end

    props = pipeline.loadNProp(jsonFile)

    # head token index -> variable of the nominal predicate (not reflected in the alignment)
    pred_vars = {}

    # Pass 1: add all predicates first, so the roleset properly goes into the AMR.
    for prop in props:
        _lemma, frame = prop["baseform"], prop["frame"]

        if not (config.fullNombank or verbalize.nompred2verbpred(frame)):
            # TODO: maybe add just the pred stem & non-core args that map to AMR role names?
            continue

        rel_args = set(tuple(arg) for arg in prop["args"] if arg[0] == 'rel')
        assert len(rel_args) == 1
        rel_arg = next(iter(rel_args))
        assert rel_arg[2] == rel_arg[3]     # multiword predicates?
        pred_head = rel_arg[2]

        # Instead of aligning the noun predicate to the noun in the sentence,
        # the predicate is introduced separately so that the plain noun
        # concept can be its argument.
        pred_var = pred_vars.get(pred_head)
        pred_concept = pipeline.token2concept(frame.replace('.', '-n-'))
        if pred_var or pred_var == 0:
            # A predicate variable was already recorded for this head by an
            # earlier proposition: overwrite its concept name.
            amr.node_to_concepts[str(pred_var)] = pred_concept
        else:
            pred_var = new_concept(pred_concept, amr)   # no alignment here, pred_vars is used instead
            noun_var = alignment[:pred_head]
            if not (noun_var or noun_var == 0):
                noun_var = new_concept_from_token(amr, alignment, pred_head, depParse, wTags)
            pending.add((str(noun_var), '-PRED', str(pred_var)))
            pred_vars[pred_head] = pred_var

        completed[0][pred_head] = True

    # Pass 2: now handle arguments.
    for prop in props:
        _lemma, frame = prop["baseform"], prop["frame"]

        rel_arg = [arg for arg in prop["args"] if arg[0] == 'rel'][0]
        pred_head = rel_arg[2]
        if pred_head not in pred_vars:
            continue    # predicate was skipped in pass 1

        pred_var = pred_vars[pred_head]

        for role, _treenode, first, last, _yield in prop["args"]:
            if first is None or last is None:
                continue    # TODO: special PropBank cases that need further work
            if role in ['rel', 'Support']:
                continue
            assert role[:3] == 'ARG'
            arg_head = choose_head(range(first, last + 1), depParse)
            if arg_head is None:
                continue    # TODO: improve coverage of complex spans

            # Handle general proposition arguments.
            aligned_name = str(alignment[:arg_head])
            if aligned_name in amr.node_to_concepts:
                role, amr.node_to_concepts[aligned_name] = common_arg(role, amr.get_concept(aligned_name))
            else:
                dep_rels = [dep["rel"] for dep in depParse[arg_head]]
                role = common_arg(role, drels=dep_rels)

            if isinstance(role, tuple):
                # common_arg resolved the argument to a (role, constant) pair.
                role, val = role
                assert isinstance(val, Atom)
                pending.add((str(pred_var), role, val))
            else:
                arg_var = amrget(amr, alignment, arg_head, depParse, wTags)
                pending.add((str(pred_var), role, str(arg_var)))

            completed[0][arg_head] = True

            # If the SRL argument link corresponds to a dependency edge
            # (in either direction), mark that edge as complete.
            for edge in ((pred_head, arg_head), (arg_head, pred_head)):
                if edge in completed[1]:
                    completed[1][edge] = True

    amr = new_amr_from_old(amr, new_triples=list(pending))

    return depParse, amr, alignment, completed
コード例 #5
0
ファイル: vprop.py プロジェクト: christianbuck/nlu
def main(sentenceId, jsonFile, tokens, ww, wTags, depParse, inAMR, alignment, completed):
    """Add verbal propositions loaded via ``pipeline.loadVProp`` to the AMR.

    Works in two passes over the propositions:

    1. Predicates.  The head token of each proposition's ``rel`` argument gets
       a concept named after the roleset (``.`` replaced by ``-``) unless the
       alignment already has a variable for it.  A newly created predicate
       whose proposition has a single argument, or whose second argument is a
       ``LINK*`` entry, receives a ``-DUMMY`` triple.
    2. Arguments.  For every ``ARG*`` argument with a usable span, the span's
       head is chosen, the role is normalised with ``common_arg`` and a triple
       from the predicate variable to the argument is recorded.  ``ARGM-MOD``
       arguments add no triple; those other than "will" are left for a later
       module.  The head token, and any dependency edge between predicate head
       and argument head (in either direction), is marked as completed.

    Returns the tuple ``(depParse, amr, alignment, completed)``; ``completed``
    is updated in place and all triples are added to the AMR in one step.
    """
    amr = inAMR
    pending = set()     # triples to add to the AMR at the end

    props = pipeline.loadVProp(jsonFile)

    # Pass 1: add all predicates first, so the roleset properly goes into the AMR.
    for prop in props:
        _lemma, frame = prop["baseform"], prop["frame"]

        rel_args = set(tuple(arg[:5]) for arg in prop["args"] if arg[0] == 'rel')
        assert len(rel_args) == 1
        rel_arg = next(iter(rel_args))
        assert rel_arg[2] == rel_arg[3]     # multiword predicates?
        pred_head = rel_arg[2]
        if pred_head is None:
            continue    # TODO: improve coverage of complex spans

        pred_var = alignment[:pred_head]
        if not (pred_var or pred_var == 0):
            pred_var = new_concept_from_token(
                amr, alignment, pred_head, depParse, wTags,
                concept=pipeline.token2concept(frame.replace('.', '-')))
            if len(prop["args"]) == 1 or prop["args"][1][0].startswith('LINK'):
                pending.add((str(pred_var), '-DUMMY', ''))
        completed[0][pred_head] = True

    # Pass 2: now handle arguments.
    for prop in props:
        _lemma, frame = prop["baseform"], prop["frame"]

        rel_arg = [arg for arg in prop["args"] if arg[0] == 'rel'][0]
        pred_head = rel_arg[2]
        if pred_head is None:
            continue    # TODO: improve coverage of complex spans
        pred_var = alignment[:pred_head]

        for role, _treenode, first, last, surface, _ in prop["args"]:
            if first is None or last is None:
                continue    # TODO: special PropBank cases that need further work
            if role in ['rel', 'LINK-PCR', 'LINK-SLC']:
                continue
            assert role[:3] == 'ARG'
            if first == last and depParse[first] is None:
                continue    # TODO: is this appropriate? e.g. in wsj_0003.0
            arg_head = choose_head(range(first, last + 1), depParse)
            if arg_head is None:
                continue    # TODO: temporary?
            # Variable aligned to the argument's head token, if any.
            arg_var = alignment[:arg_head]

            # Handle general proposition arguments.
            aligned_name = str(arg_var)
            if aligned_name in amr.node_to_concepts:
                role, amr.node_to_concepts[aligned_name] = common_arg(role, amr.get_concept(aligned_name))
            else:
                dep_rels = [dep["rel"] for dep in depParse[arg_head]]
                role = common_arg(role, drels=dep_rels)

            # Verb-specific argument types.
            if role == 'ARGM-MOD':
                if surface != 'will':
                    continue    # handle modal in a later module
                # 'will' is skipped as an auxiliary: no triple, but the
                # completion bookkeeping below still runs.
            elif isinstance(role, tuple):
                # common_arg resolved the argument to a (role, constant) pair.
                role, val = role
                assert isinstance(val, Atom)
                pending.add((str(pred_var), role, val))
            else:
                if not (arg_var or arg_var == 0):   # need a new variable
                    arg_var = new_concept_from_token(amr, alignment, arg_head, depParse, wTags)
                pending.add((str(pred_var), role, str(arg_var)))

            completed[0][arg_head] = True

            # If the SRL argument link corresponds to a dependency edge
            # (in either direction), mark that edge as complete.
            for edge in ((pred_head, arg_head), (arg_head, pred_head)):
                if edge in completed[1]:
                    completed[1][edge] = True

    amr = new_amr_from_old(amr, new_triples=list(pending))

    return depParse, amr, alignment, completed
コード例 #6
0
ファイル: nes.py プロジェクト: christianbuck/nlu
def main(sentenceId, jsonFile, tokens, ww, wTags, depParse, inAMR, alignment, completed):
    """Add named entities and numeric expressions loaded via ``pipeline.loadBBN`` to the AMR.

    Each entity is a tuple ``(i, j, name, coarse, fine, raw)`` covering tokens
    ``i..j`` (inclusive).  Entities whose raw markup starts with ``<TIMEX`` are
    skipped (the timex module handles them).  Otherwise:

    * ``<NUMEX`` entities of class MONEY, CARDINAL or PERCENT take the head
      token's ``NormalizedNamedEntityTag`` as value.  A leading ``<``, ``>``,
      ``<=``, ``>=`` or ``~`` becomes a wrapper concept (``less-than``,
      ``more-than``, ...) aligned to the head token.  The value is attached
      with ``value`` (PERCENT) or ``quant``, and MONEY additionally gets a
      ``unit`` triple for the currency.  ORDINAL is left untouched.
    * classes ending in ``_DESC`` get a concept for the head word plus a
      ``-DUMMY`` triple so the concept participates in some triple.
    * all other entities get a ``-FALLBACK`` class concept with a ``name``
      node whose ``opN`` triples hold the quoted words of the name.  For
      persons, a preceding title (mr, mrs, sir, ...) is absorbed into the name.

    Finally every token of the span is marked as completed, along with the
    parent edges of all non-head tokens.

    Raises ``AssertionError`` for a MONEY/PERCENT value that does not match
    the expected pattern, for an unsupported NUMEX class, and (unless
    assertions are disabled) for a span token that was already completed.

    Returns the tuple ``(depParse, amr, alignment, completed)``; ``completed``
    is updated in place and all triples are added to the AMR in one step.
    """
    amr = inAMR
    triples = set() # to add to the AMR

    entities = pipeline.loadBBN(jsonFile)
    for i,j,name,coarse,fine,raw in entities:

        if raw.startswith('<TIMEX'): continue  # use the timex module (sutime output) instead

        h = choose_head(range(i,j+1), depParse,
                        fallback=lambda frontier: max(frontier) if len(frontier)==2 and ww[min(frontier)]=='than' else False)
                        # ^ dirty hack: in 'more than 3 times' (wsj_0003.12), [more than 3] is a value expression
                        # but 'than' and '3' both attach to 'times' in the dependency parse.

        x = alignment[:h] # variable aligned to the head token of the span, if any

        if raw.startswith('<NUMEX'):
            if coarse in ['MONEY','CARDINAL','PERCENT']:
                # get normalized value from Stanford tools
                v = wTags[h]["NormalizedNamedEntityTag"]

                wrapper = None
                if v[0] in '<>~':
                    if len(v)==1:
                        print('Warning: Unexpected NormalizedNamedEntityTag:',v,'for',raw, file=sys.stderr)
                    else:
                        # split the comparison prefix (one or two characters) from the value
                        if v[1]=='=':
                            reln = v[:2]
                            v = v[2:]
                        else:
                            reln = v[0]
                            v = v[1:]
                        concept = {'<': 'less-than', '>': 'more-than', '<=': 'no-more-than', '>=': 'at-least', '~': 'about'}[reln]
                        wrapper = new_concept_from_token(amr, alignment, h, depParse, wTags, concept=concept)

                if coarse=='MONEY':
                    m = re.match(r'^([\$¥£])(\d+\.\d+(E-?\d+)?)$', v)
                    if not m:
                        # explicit raise: a bare `assert False` is stripped under -O,
                        # which would surface as an AttributeError on m.group below
                        raise AssertionError(v)
                    u = m.group(1)  # currency symbol
                    v = m.group(2)  # amount
                elif coarse=='PERCENT':
                    m = re.match(r'^%(\d+\.\d+(E-?\d+)?)$', v)
                    if not m:
                        raise AssertionError(v)
                    v = m.group(1)

                # use a numeric literal where possible, as an int when the value is integral
                try:
                    v = float(v)
                    if str(v).endswith('.0'):
                        v = int(v)
                except ValueError:
                    pass    # not numeric: keep the string value

                if (wrapper is None or coarse=='MONEY') and not (x or x==0): # need a new variable
                    kind = {'MONEY': 'monetary-quantity', 'PERCENT': 'percentage-entity'}.get(coarse, coarse.upper())
                    if wrapper is None: # if there is a wrapper concept (e.g. 'more-than'), it is aligned, so don't provide an alignment for x
                        x = new_concept_from_token(amr, alignment, h, depParse, wTags, concept=kind)
                    else:
                        x = new_concept(kind, amr)

                if (x or x==0):
                    triples.add((str(x), 'value' if coarse=='PERCENT' else 'quant', v))
                    if wrapper is not None:
                        triples.add((str(wrapper), 'op1', str(x)))
                elif wrapper is not None:
                    triples.add((str(wrapper), 'op1', v))   # e.g. more-than :op1 41

                if coarse=='MONEY':
                    # NOTE(review): looking up u.encode('utf-8') in a dict keyed by plain
                    # string literals presumably relies on Python 2 str semantics - confirm
                    y = new_concept({'$': 'dollar', '¥': 'yen', '£': 'pound'}[u.encode('utf-8')], amr)
                    triples.add((str(x), 'unit', str(y)))
            elif coarse=='ORDINAL':
                pass    # skip--no special treatment in AMR guidelines, though the normalized value could be used
            else:
                # unsupported NUMEX class; raised explicitly so it is not silently
                # accepted when assertions are disabled
                raise AssertionError((i,j,raw))
        elif coarse.endswith('_DESC'):
            # make the phrase head word the AMR head concept
            # (could be a multiword term, like Trade Representative)
            if not (x or x==0): # need a new variable
                x = new_concept_from_token(amr, alignment, h, depParse, wTags)
                triples.add((str(x), '-DUMMY', '')) # ensure the concept participates in some triple so it is printed
        else:
            if coarse.lower()=='person' and i>0 and ww[i-1] and ww[i-1].lower() in ['mr','mr.','mister','master','sir','mrs','mrs.','miss']:
                # Extend the NE to include formal titles that do not get concepts
                name = ww[i-1]+' '+name
                i -= 1

            if not (x or x==0): # need a new variable
                ne_class = fine.lower().replace('other','') or coarse.lower()
                concept, amr_name = amrify(ne_class, name)
                x = new_concept_from_token(amr, alignment, h, depParse, wTags,
                                concept=pipeline.token2concept(concept)+'-FALLBACK')
                # -FALLBACK indicates extra information not in the sentence (NE class)
                n = new_concept('name', amr)
                triples.add((str(x), 'name', str(n)))
                for iw,w in enumerate(amr_name.split()):
                    triples.add((str(n), 'op'+str(iw+1), '"'+w+'"'))

        for k in range(i,j+1):
            assert not completed[0][k]
            completed[0][k] = True
            if k!=h:
                for link in parent_edges(depParse[k]):
                    completed[1][link] = True  # we don't need to attach non-head parts of names anywhere else

    amr = new_amr_from_old(amr, new_triples=list(triples))

    return depParse, amr, alignment, completed