Exemple #1
0
    def vis_parse(self, sent):  #{{{
        """Parse ``sent`` greedily, yielding the state before each attachment.

        ``ROOT`` is prepended to the sentence. On every round each adjacent
        pair of still-pending tokens is scored in both directions
        (``scr[0]``: left token becomes the child of the right one,
        ``scr[1]``: the opposite) and the highest-scoring attachment is
        applied. Features and scores are cached per left-token ``'id'``.

        Side effect: every token of ``sent`` and ``ROOT`` gets
        ``tok['s'] = tok['form']``.

        :param sent: list of token dicts; ``'id'`` and ``'form'`` are read.
        :return: generator of ``(self.oracle, sent, parsed, deps, scores)``.
            One tuple is produced before every attachment and a last one
            when a single pending token remains. ``scores`` maps position
            pairs ``(i, i + 1)`` / ``(i + 1, i)`` of ``parsed`` to the two
            scores of that pair; it is empty when no round was played.
        """
        deps = DependenciesCollection()
        sent = [ROOT] + sent
        parsed = sent[:]
        for tok in parsed:
            tok['s'] = tok['form']
        fcache = {}
        scache = {}
        # Bound up front so the closing yield also works for an empty
        # sentence, where the loop body never executes.
        scores = {}
        while len(parsed) > 1:
            # find best action
            best = float('-inf')
            best_pair = None
            scores = {}
            for i, (tok1, tok2) in enumerate(zip(parsed, parsed[1:])):
                tid = tok1['id']
                if tid in fcache:
                    feats = fcache[tid]
                else:
                    feats = self.featExt.extract(parsed, deps, i, sent)
                    fcache[tid] = feats
                if tid in scache:
                    s1, s2 = scache[tid]
                else:
                    scr = self.scorer.get_scores(feats)
                    s1 = scr[0]
                    s2 = scr[1]
                    scache[tid] = s1, s2
                # strict '>' keeps the earliest candidate on ties
                if s1 > best:
                    best = s1
                    best_pair = (tok1, tok2)
                if s2 > best:
                    best = s2
                    best_pair = (tok2, tok1)
                scores[(i, i + 1)] = s1
                scores[(i + 1, i)] = s2

            c, p = best_pair
            # Drop cached features/scores around the parent: positions
            # i-4 .. i+3 of ``parsed`` (slice end is exclusive, and the last
            # token is never a cache key because it is never a left token).
            i = parsed.index(p)
            frm = max(i - 4, 0)
            to = min(i + 4, len(parsed) - 1)
            for tok in parsed[frm:to]:
                fcache.pop(tok['id'], None)
                scache.pop(tok['id'], None)
            # expose the state as it is *before* the attachment is applied
            yield (self.oracle, sent, parsed, deps, scores)
            # apply action; a new list is built so that previously yielded
            # ``parsed`` lists are left untouched
            deps.add(p, c)
            parsed = [x for x in parsed if x != c]
        yield (self.oracle, sent, parsed, deps, scores)
Exemple #2
0
 def _build_gold(self, sent):
     """Collect the gold dependencies stored on the tokens of ``sent``.

     The first element of ``sent`` is skipped (presumably the ROOT token --
     confirm against the caller). Every other token is added as the child
     of ``sent[token['parent']]``.

     :param sent: sequence of token dicts carrying a ``'parent'`` index.
     :return: a ``DependenciesCollection`` holding one arc per token.
     """
     gold = DependenciesCollection()
     for child in sent[1:]:
         head = sent[child['parent']]
         gold.add(head, child)
     return gold
Exemple #3
0
 def vis_parse(self, sent): #{{{
    """Parse ``sent`` greedily, yielding the state before each attachment.

    ``ROOT`` is prepended to the sentence. On every round each adjacent
    pair of still-pending tokens is scored in both directions
    (``scr[0]``: left token becomes the child of the right one,
    ``scr[1]``: the opposite) and the highest-scoring attachment is
    applied. Features and scores are cached per left-token ``'id'``.

    Side effect: every token of ``sent`` and ``ROOT`` gets
    ``tok['s'] = tok['form']``.

    :param sent: list of token dicts; ``'id'`` and ``'form'`` are read.
    :return: generator of ``(self.oracle, sent, parsed, deps, scores)``.
        One tuple is produced before every attachment and a last one when
        a single pending token remains. ``scores`` maps position pairs
        ``(i, i+1)`` / ``(i+1, i)`` of ``parsed`` to the two scores of that
        pair; it is empty when no round was played.
    """
    deps = DependenciesCollection()
    sent = [ROOT] + sent
    parsed = sent[:]
    for tok in parsed:
       tok['s'] = tok['form']
    fcache = {}
    scache = {}
    # Bound up front so the closing yield also works for an empty sentence,
    # where the loop body never executes.
    scores = {}
    while len(parsed) > 1:
       # find best action
       best = float('-inf')
       best_pair = None
       scores = {}
       for i, (tok1, tok2) in enumerate(zip(parsed, parsed[1:])):
          tid = tok1['id']
          if tid in fcache:
             feats = fcache[tid]
          else:
             feats = self.featExt.extract(parsed, deps, i, sent)
             fcache[tid] = feats
          if tid in scache:
             s1, s2 = scache[tid]
          else:
             scr = self.scorer.get_scores(feats)
             s1 = scr[0]
             s2 = scr[1]
             scache[tid] = s1, s2
          # strict '>' keeps the earliest candidate on ties
          if s1 > best:
             best = s1
             best_pair = (tok1, tok2)
          if s2 > best:
             best = s2
             best_pair = (tok2, tok1)
          scores[(i, i + 1)] = s1
          scores[(i + 1, i)] = s2

       c, p = best_pair
       # Drop cached features/scores around the parent: positions i-4 .. i+3
       # of ``parsed`` (slice end is exclusive, and the last token is never a
       # cache key because it is never a left token).
       i = parsed.index(p)
       frm = max(i - 4, 0)
       to = min(i + 4, len(parsed) - 1)
       for tok in parsed[frm:to]:
          fcache.pop(tok['id'], None)
          scache.pop(tok['id'], None)
       # expose the state as it is *before* the attachment is applied
       yield (self.oracle, sent, parsed, deps, scores)
       # apply action; a new list is built so that previously yielded
       # ``parsed`` lists are left untouched
       deps.add(p, c)
       parsed = [x for x in parsed if x != c]
    yield (self.oracle, sent, parsed, deps, scores)
Exemple #4
0
    def parse_labeled(self, sent):  #{{{
        """Greedily build a labeled dependency structure for ``sent``.

        ``ROOT`` is prepended to the sentence. For each adjacent pair of
        pending tokens the scorer output is enumerated as
        ``(action id, score)``; ``self.id_to_action_mapper[aid]`` yields a
        ``(direction, label)`` pair. Direction 0 attaches the left token
        under the right one, direction 1 the opposite. For every pair only
        the best-scoring label per direction is kept, and the overall best
        attachment is applied until a single pending token remains.

        :param sent: list of token dicts; ``'id'`` is used as cache key.
        :return: the ``DependenciesCollection`` holding all labeled arcs.
        :raises ValueError: from ``max`` if a direction has no action at all.
        """
        id_to_action_mapper = self.id_to_action_mapper
        deps = DependenciesCollection()
        sent = [ROOT] + sent
        parsed = sent[:]
        scache = {}
        fe = self.featExt.extract
        gscore = self.scorer.get_scores
        lp = len(parsed)
        while lp > 1:
            # find best action
            _pairs = []
            for i, (tok1,
                    tok2) in enumerate(izip(parsed, islice(parsed, 1, None))):
                tid = tok1['id']
                if tid in scache:
                    (max_score_0, max_score_1, max_lbl_0,
                     max_lbl_1) = scache[tid]
                else:
                    feats = fe(parsed, deps, i, sent)
                    scr = gscore(feats)
                    scored = [(score, id_to_action_mapper[aid])
                              for (aid, score) in enumerate(scr)]
                    s0 = [(s, lbl) for (s, (dr, lbl)) in scored if dr == 0]
                    s1 = [(s, lbl) for (s, (dr, lbl)) in scored if dr == 1]
                    max_score_0, max_lbl_0 = max(s0)
                    max_score_1, max_lbl_1 = max(s1)
                    # only the per-direction winners are cached
                    scache[tid] = (max_score_0, max_score_1, max_lbl_0,
                                   max_lbl_1)
                # last field is the position of the would-be parent in parsed
                _pairs.append((max_score_0, tok1, tok2, max_lbl_0, i + 1))
                _pairs.append((max_score_1, tok2, tok1, max_lbl_1, i))

            best, c, p, lbl, locidx = max(_pairs)
            # Drop cached scores around the parent: positions
            # locidx-4 .. locidx+3 (slice end is exclusive).
            frm = max(locidx - 4, 0)
            to = min(locidx + 4, lp - 1)
            for tok in parsed[frm:to]:
                scache.pop(tok['id'], None)
            # apply action
            deps.add(p, c, lbl)
            parsed.remove(c)
            lp -= 1
        return deps
Exemple #5
0
    def parse(self, sent):  # {{{
        """Greedily build an unlabeled dependency structure for ``sent``.

        ``ROOT`` is prepended to the sentence. Each adjacent pair of pending
        tokens is scored in both directions (``scr[0]``: left token becomes
        the child of the right one, ``scr[1]``: the opposite); the best
        attachment is applied and the child is removed from the pending
        list, until a single pending token remains. Extracted features and
        scores are cached per left-token ``'id'``.

        :param sent: list of token dicts; ``'id'`` is used as cache key.
        :return: the ``DependenciesCollection`` holding all arcs.
        """
        deps = DependenciesCollection()
        sent = [ROOT] + sent
        parsed = sent[:]
        scache = {}
        fcache = {}
        fe = self.featExt.extract
        gscore = self.scorer.get_scores
        lp = len(parsed)
        while lp > 1:
            # find best action
            _pairs = []
            for i, (tok1, tok2) in enumerate(izip(parsed, islice(parsed, 1, None))):
                tid = tok1['id']
                if tid in fcache:
                    feats = fcache[tid]
                else:
                    feats = fe(parsed, deps, i, sent)
                    fcache[tid] = feats
                if tid in scache:
                    s1, s2 = scache[tid]
                else:
                    scr = gscore(feats)
                    s1 = scr[0]
                    s2 = scr[1]
                    scache[tid] = s1, s2

                # last field is the position of the would-be parent in parsed
                _pairs.append((s1, tok1, tok2, i + 1))
                _pairs.append((s2, tok2, tok1, i))

            best, c, p, locidx = max(_pairs)
            # Drop cached features/scores around the parent: positions
            # locidx-4 .. locidx+3 (slice end is exclusive).
            frm = max(locidx - 4, 0)
            to = min(locidx + 4, lp - 1)
            for tok in parsed[frm:to]:
                fcache.pop(tok['id'], None)
                scache.pop(tok['id'], None)
            # apply action
            deps.add(p, c)
            parsed.remove(c)
            lp -= 1
        return deps
Exemple #6
0
 def _get_state(self,
                pending,
                features=[],
                score=float('-inf'),
                clas=None,
                deps=DependenciesCollection(),
                valid=True):
     """
     state in beam
     :param pending: list of token
     :param features: global features until prv action
     :param score: score of this state
     :param clas: class of prev action
     :param deps: current deps
     :param valid: is this state valid
     :return: a dict
     """
     # copy pending
     pending = list(pending)
     # copy features
     features = copy.copy(features)
     # copy deps
     deps = copy.copy(deps)
     return {
         'pending': pending,
         'features': features,
         'score': score,
         'cls': clas,
         'deps': deps,
         'valid': valid
     }
 def __init__(self, sent):
    """Start an empty configuration over ``sent``.

    :param sent: the sentence; stored as-is and also handed to
        ``DependenciesCollection``.
    """
    # containers that start out empty
    self.stack = []
    # the input and the dependency store built from it
    self.sent = sent
    self.deps = DependenciesCollection(sent)
    # counters start at zero
    self.i = 0
    # history of what was done so far
    self.actions = []
    self._action_scores = []
    self.cost = 0
Exemple #8
0
   def parse(self, sent): #{{{
      """Greedily build an unlabeled dependency structure for ``sent``.

      ``ROOT`` is prepended to the sentence. Each adjacent pair of pending
      tokens is scored in both directions (``scr[0]``: left token becomes
      the child of the right one, ``scr[1]``: the opposite); the best
      attachment is applied and the child is removed from the pending list,
      until a single pending token remains. Scores are cached per left-token
      ``'id'``; features are re-extracted whenever a score is missing.

      :param sent: list of token dicts; ``'id'`` is used as cache key.
      :return: the ``DependenciesCollection`` holding all arcs.
      """
      deps = DependenciesCollection()
      sent = [ROOT] + sent
      parsed = sent[:]
      scache = {}
      fe = self.featExt.extract
      gscore = self.scorer.get_scores
      lp = len(parsed)
      while lp > 1:
         # find best action
         _pairs = []
         for i, (tok1, tok2) in enumerate(izip(parsed, islice(parsed, 1, None))):
            tid = tok1['id']
            if tid in scache:
               s1, s2 = scache[tid]
            else:
               feats = fe(parsed, deps, i, sent)
               scr = gscore(feats)
               s1 = scr[0]
               s2 = scr[1]
               scache[tid] = s1, s2

            # last field is the position of the would-be parent in parsed
            _pairs.append((s1, tok1, tok2, i + 1))
            _pairs.append((s2, tok2, tok1, i))

         best, c, p, locidx = max(_pairs)
         # Drop cached scores around the parent: positions
         # locidx-4 .. locidx+3 (slice end is exclusive).
         frm = max(locidx - 4, 0)
         to = min(locidx + 4, lp - 1)
         for tok in parsed[frm:to]:
            scache.pop(tok['id'], None)
         # apply action
         deps.add(p, c)
         parsed.remove(c)
         lp -= 1
      return deps
Exemple #9
0
    def beam_parse(self, sent):  # {{{
        """Parse ``sent`` keeping several candidate states per step.

        ``ROOT`` is prepended to the sentence. The search starts from one
        state (empty caches, empty deps, all tokens pending) and performs
        one expansion round per token to attach. In each round every state
        of the current beam offers both attachment directions of each
        adjacent pending pair to a fresh ``Beam(self.beam_width)``; the next
        beam is whatever ``beam.get_beams()`` returns.

        :param sent: list of token dicts; ``'id'`` is used as cache key.
        :return: the ``"deps"`` entry of the last state in the final beam.
        """
        start_deps = DependenciesCollection()
        sent = [ROOT] + sent
        extract = self.featExt.extract
        get_scores = self.scorer.get_scores
        global_beam = [{
            "scache": {},
            "fcache": {},
            "deps": start_deps,
            "parsed": sent[:],
            "features": [],
            "score": 0
        }]
        # one round per attachment: len(sent) - 1 tokens have to find a head
        for _ in range(len(sent) - 1):
            beam = Beam(self.beam_width)
            for state in global_beam:
                pending = state['parsed']
                feat_cache = state['fcache']
                score_cache = state['scache']
                state_deps = state['deps']
                neighbours = izip(pending, islice(pending, 1, None))
                for pos, (left, right) in enumerate(neighbours):
                    key = left['id']
                    if key not in feat_cache:
                        feat_cache[key] = extract(pending, state_deps, pos, sent)
                    feats = feat_cache[key]
                    if key not in score_cache:
                        scr = get_scores(feats)
                        score_cache[key] = scr[0], scr[1]
                    left_under_right, right_under_left = score_cache[key]
                    # the 4th argument is the position of the would-be parent
                    beam.add_to_beam(left_under_right, left, right, pos + 1,
                                     state_deps, pending, feat_cache,
                                     score_cache, feats)
                    beam.add_to_beam(right_under_left, right, left, pos,
                                     state_deps, pending, feat_cache,
                                     score_cache, feats)

            global_beam = beam.get_beams()

        return global_beam[-1]["deps"]
Exemple #10
0
   def train(self, sent): #{{{
      """Run one training pass of the greedy parser over ``sent``.

      ``ROOT`` is prepended and ``self.scorer.tick()`` is called once. On
      every round all adjacent pending pairs are scored in both directions
      (class 0: left token becomes the child of the right one, class 1: the
      opposite) and the top-scoring candidate is checked with
      ``self.oracle.allow_connection``. An allowed candidate is applied.
      Otherwise the scorer is updated with ``-1`` for the predicted
      candidate and ``+1`` for the best-scoring allowed alternative, and the
      round is replayed with a cleared score cache.

      Training stops when a single pending token is left, or after more
      than 200 updates, in which case a diagnostic is printed.

      :param sent: list of token dicts; ``'id'`` and ``'form'`` are read.
      :return: ``None``.
      """
      updates = 0
      sent = [ROOT] + sent
      self.scorer.tick()
      deps = DependenciesCollection()
      parsed = sent[:]
      fcache = {}
      scache = {}
      while len(parsed) > 1: #{{{
         # score every candidate attachment
         scored = []
         for i, (tok1, tok2) in enumerate(zip(parsed, parsed[1:])):
            tid = tok1['id']
            if tid in fcache:
               feats = fcache[tid]
            else:
               feats = self.featExt.extract(parsed, deps, i, sent)
               fcache[tid] = feats
            if tid in scache:
               s1, s2 = scache[tid]
            else:
               scores = self.scorer.get_scores(feats)
               s1 = scores[0]
               s2 = scores[1]
               scache[tid] = s1, s2
            scored.append((s1, 0, feats, tok1, tok2))
            scored.append((s2, 1, feats, tok2, tok1))
         # highest score first; the sort is stable, so ties keep list order
         scored = sorted(scored, key=lambda item: -item[0])
         _, cls, f, c, p = scored[0]

         if self.oracle.allow_connection(sent, deps, p, c):
            # Drop cached features/scores around the parent: positions
            # i-4 .. i+3 of ``parsed`` (slice end is exclusive).
            i = parsed.index(p)
            frm = max(i - 4, 0)
            to = min(i + 4, len(parsed) - 1)
            for tok in parsed[frm:to]:
               fcache.pop(tok['id'], None)
               scache.pop(tok['id'], None)
            deps.add(p, c)
            parsed = [x for x in parsed if x != c]
         else:
            scache = {} # clear the cache -- numbers changed..
            # find best allowable pair
            # NOTE(review): if no alternative is allowed the loop ends
            # without ``break`` and the lowest-scored candidate is used for
            # the positive update -- confirm the oracle rules this out.
            for _, gcls, gf, gold_c, gold_p in scored[1:]:
               if self.oracle.allow_connection(sent, deps, gold_p, gold_c):
                  break

            self.scorer.add(f, cls, -1)
            self.scorer.add(gf, gcls, 1)

            updates += 1
            if updates > 200:
               print("STUCK, probably because of incomplete feature set")
               print(" ".join([x['form'] for x in sent]))
               print(" ".join([x['form'] for x in parsed]))
               return
Exemple #11
0
    def train(self, sent):
        """Run one training pass of the greedy parser over ``sent``.

        ``ROOT`` is prepended and ``self.scorer.tick()`` is called once. On
        every round all adjacent pending pairs are scored in both
        directions (class 0: left token becomes the child of the right one,
        class 1: the opposite) and the top-scoring candidate is checked
        with ``self.oracle.allow_connection``. An allowed candidate is
        applied. Otherwise the scorer is updated with ``-1`` for the
        predicted candidate and ``+1`` for the best-scoring allowed
        alternative, and the round is replayed with a cleared score cache.

        Training stops when a single pending token is left, or after more
        than 200 updates, in which case a diagnostic is printed.

        :param sent: list of token dicts; ``'id'`` and ``'form'`` are read.
        :return: ``None``.
        """
        updates = 0
        sent = [ROOT] + sent
        self.scorer.tick()
        deps = DependenciesCollection()
        parsed = sent[:]
        fcache = {}
        scache = {}
        while len(parsed) > 1:
            # score every candidate attachment
            scored = []
            for i, (tok1, tok2) in enumerate(zip(parsed, parsed[1:])):
                tid = tok1["id"]
                if tid in fcache:
                    feats = fcache[tid]
                else:
                    feats = self.featExt.extract(parsed, deps, i, sent)
                    fcache[tid] = feats
                if tid in scache:
                    s1, s2 = scache[tid]
                else:
                    scores = self.scorer.get_scores(feats)
                    s1 = scores[0]
                    s2 = scores[1]
                    scache[tid] = s1, s2
                scored.append((s1, 0, feats, tok1, tok2))
                scored.append((s2, 1, feats, tok2, tok1))
            # sort from largest to smallest score (key is -s); the sort is
            # stable, so ties keep list order
            scored = sorted(scored, key=lambda item: -item[0])
            _, cls, f, c, p = scored[0]

            if self.oracle.allow_connection(sent, deps, p, c):
                # Drop cached features/scores around the parent: positions
                # i-4 .. i+3 of ``parsed`` (slice end is exclusive).
                i = parsed.index(p)
                frm = max(i - 4, 0)
                to = min(i + 4, len(parsed) - 1)
                for tok in parsed[frm:to]:
                    fcache.pop(tok['id'], None)
                    scache.pop(tok['id'], None)
                deps.add(p, c)
                parsed = [x for x in parsed if x != c]

            else:
                scache = {}  # clear the cache -- numbers changed..
                # find best allowable pair
                # NOTE(review): if no alternative is allowed the loop ends
                # without ``break`` and the lowest-scored candidate is used
                # for the positive update -- confirm the oracle rules this
                # out.
                for _, gcls, gf, gold_c, gold_p in scored[1:]:
                    if self.oracle.allow_connection(sent, deps, gold_p, gold_c):
                        break

                self.scorer.add(f, cls, -1)
                self.scorer.add(gf, gcls, 1)

                updates += 1
                if updates > 200:
                    print("STUCK, probably because of incomplete feature set")
                    print(" ".join([x['form'] for x in sent]))
                    print(" ".join([x['form'] for x in parsed]))
                    return
Exemple #12
0
    def train_labeled(self, sent, iter_number, explore_policy=None):  #{{{
        """Run one labeled training pass of the greedy parser over ``sent``.

        ``ROOT`` is prepended and ``self.scorer.tick()`` is called once. On
        every round each adjacent pending pair yields one candidate per
        action id returned by the scorer;
        ``self.id_to_action_mapper[aid]`` gives ``(direction, label)``.
        Direction 0 attaches the left token under the right one, direction
        1 the opposite. The top-scoring candidate is checked with
        ``self.oracle.action_cost``:

        * cost 0: the attachment is applied;
        * otherwise the scorer is updated with ``-1`` for the predicted
          action and ``+1`` for the best-scoring zero-cost alternative and
          the score cache is cleared. The wrong attachment is still
          applied when ``explore_policy.should_explore(iter_number)`` is
          true; if not, the round is replayed.

        Training stops when a single pending token is left, or after more
        than 200 updates, in which case a diagnostic is printed.

        :param sent: list of token dicts; ``'id'`` and ``'form'`` are read.
        :param iter_number: passed to ``explore_policy.should_explore``.
        :param explore_policy: optional object with ``should_explore``;
            with ``None`` wrong actions are never applied.
        :return: ``None``.
        """
        id_to_action_mapper = self.id_to_action_mapper
        updates = 0
        sent = [ROOT] + sent
        self.scorer.tick()
        deps = DependenciesCollection()
        parsed = sent[:]
        fcache = {}
        scache = {}
        while len(parsed) > 1:  #{{{
            # score every candidate (pair, direction, label)
            scored = []
            for i, (tok1, tok2) in enumerate(zip(parsed, parsed[1:])):
                tid = tok1['id']
                if tid in fcache:
                    feats = fcache[tid]
                else:
                    feats = self.featExt.extract(parsed, deps, i, sent)
                    fcache[tid] = feats
                if tid in scache:
                    scores = scache[tid]
                else:
                    scores = self.scorer.get_scores(feats)
                    scache[tid] = scores
                for aid, score in scores.iteritems():
                    dr, lbl = id_to_action_mapper[aid]
                    if dr == 0:
                        scored.append((score, (aid, lbl), feats, tok1, tok2))
                    else:
                        assert (dr == 1)
                        scored.append((score, (aid, lbl), feats, tok2, tok1))
            # highest score first; the sort is stable, so ties keep list order
            scored = sorted(scored, key=lambda item: -item[0])
            _, cls, f, c, p = scored[0]
            cost = self.oracle.action_cost(parsed, p, c, cls[1])
            correct = (cost == 0)
            if not correct:
                scache = {}  # clear the cache -- numbers changed..
                # find best allowable pair
                # NOTE(review): if no alternative has zero cost the loop
                # ends without ``break`` and the lowest-scored candidate is
                # used for the positive update -- confirm the oracle rules
                # this out.
                for _, gcls, gf, gold_c, gold_p in scored[1:]:
                    if self.oracle.action_cost(parsed, gold_p, gold_c, gcls[1]) == 0:
                        break

                self.scorer.add(f, cls[0], -1)
                self.scorer.add(gf, gcls[0], 1)

                updates += 1
                if updates > 200:
                    # single-argument print: same output on Python 2 and 3
                    print("%s %s %s" % (
                        "STUCK, probably because of incomplete feature set",
                        id_to_action_mapper[cls[0]],
                        id_to_action_mapper[gcls[0]]))
                    print(" ".join([x['form'] for x in sent]))
                    print(" ".join([x['form'] for x in parsed]))
                    return
            if correct or (explore_policy
                           and explore_policy.should_explore(iter_number)):
                # Drop cached features/scores around the parent: positions
                # i-4 .. i+3 of ``parsed`` (slice end is exclusive).
                i = parsed.index(p)
                frm = max(i - 4, 0)
                to = min(i + 4, len(parsed) - 1)
                for tok in parsed[frm:to]:
                    fcache.pop(tok['id'], None)
                    scache.pop(tok['id'], None)
                deps.add(p, c, cls[1])
                parsed = [x for x in parsed if x != c]
Exemple #13
0
    def parse_with_span_constraints(self, sent, spans):  #{{{
        """Greedily parse ``sent`` while respecting span constraints.

        spans is a list of tuples of the form (s,e), where s and e are
        integers: s is the index of the first token in the span and e is
        the index of the last token in the span. Indices refer to the
        sentence *after* ``ROOT`` has been prepended. Spans whose ``e`` lies
        beyond the sentence are ignored.

        spans may not overlap or contain each other (this is not verified).

        The constraint is that all tokens in each span must share a head,
        and only that head may have children outside of the span.

        Side effect: tokens covered by a span get ``tok['span_id']`` set to
        the span's index in ``spans``; a ``'span_id'`` left over from an
        earlier call is removed first.

        :param sent: list of token dicts; ``'id'`` is used as cache key.
        :param spans: list of ``(s, e)`` index pairs, see above.
        :return: the ``DependenciesCollection`` holding all arcs.
        :raises ValueError: if the constraints leave no pair to attach.
        """
        deps = DependenciesCollection()
        sent = [ROOT] + sent
        parsed = sent[:]
        # Token dicts are shared with the caller, so tags written by a
        # previous call would otherwise leak into this one (wrong constraints,
        # or a KeyError for a span id that no longer exists).
        for tok in sent:
            tok.pop('span_id', None)
        # span id -> attachments that still have to happen inside the span;
        # -1 stands for "not in any span"
        remaining_toks_in_span = {-1: 0}
        for sid, (s, e) in enumerate(spans):
            if e >= len(sent): continue
            remaining_toks_in_span[sid] = (e - s)
            for tok in sent[s:e + 1]:
                tok['span_id'] = sid
        scache = {}
        fe = self.featExt.extract
        gscore = self.scorer.get_scores
        lp = len(parsed)
        while lp > 1:
            # find best action
            _pairs = []
            for i, (tok1,
                    tok2) in enumerate(izip(parsed, islice(parsed, 1, None))):
                # if tok1,tok2 not allowed by the span constraints, skip.
                # in order to be allowed, we need either:
                #  tok1 and tok2 inside the same span.
                #  tok1 and tok2 not inside any span.
                #  a single token in a span is not considered to be inside a span.
                sid1 = tok1.get('span_id', -1)
                sid2 = tok2.get('span_id', -1)
                if sid1 != sid2:
                    if remaining_toks_in_span[
                            sid1] > 0 or remaining_toks_in_span[sid2] > 0:
                        continue
                tid = tok1['id']
                if tid in scache:
                    s1, s2 = scache[tid]
                else:
                    feats = fe(parsed, deps, i, sent)
                    scr = gscore(feats)
                    s1 = scr[0]
                    s2 = scr[1]
                    scache[tid] = s1, s2

                # last field is the position of the would-be parent in parsed
                _pairs.append((s1, tok1, tok2, i + 1))
                _pairs.append((s2, tok2, tok1, i))

            if not _pairs:
                # same exception type ``max`` would raise, with a usable message
                raise ValueError(
                    "span constraints leave no adjacent pair that may be attached")
            best, c, p, locidx = max(_pairs)
            # Drop cached scores around the parent: positions
            # locidx-4 .. locidx+3 (slice end is exclusive).
            frm = max(locidx - 4, 0)
            to = min(locidx + 4, lp - 1)
            for tok in parsed[frm:to]:
                scache.pop(tok.get('id'), None)
            # apply action
            deps.add(p, c)
            parsed.remove(c)
            remaining_toks_in_span[c.get('span_id', -1)] -= 1
            lp -= 1
        return deps