Example #1
  def oracle_trace(self, document):
    """Replay the document's gold action sequence, printing each step."""
    assert len(document.gold) > 0, "No gold actions"
    state = ParserState(document, self)
    for gold in document.gold:
      print("Taking gold action", gold)
      print("On state:", state)

      gold_index = self.actions.indices.get(gold, None)
      assert gold_index is not None, "Unknown gold action: %r" % gold
      assert state.is_allowed(gold_index), "Disallowed gold action: %r" % gold
      state.advance(gold)

    print("Final state after", len(document.gold), "actions:", state)
Example #2
def main():
    trainingCorpus = ParsedConllFile(keepMalformed=False, projectivize=True)

    # read the corpus, closing the file when done
    with open(trainingFile, 'r', encoding='utf-8') as f:
        trainingCorpus.read(f.read())

    # make fake model params, enough for lexicon builder
    # we still need feature_maps to use ParserState
    modelParams = ModelParameters('')
    modelParams.trainingFile = trainingFile
    modelParams.cfg = {'projectivizeTrainingSet': True}

    lexicon = Lexicon(modelParams)
    lexicon.compute()

    sentence = trainingCorpus.sentences[0]

    parser_state = ParserState(sentence, lexicon.getFeatureMaps())

    # necessary for initializing and pushing root
    # (only initialize transition_state_class once!)
    # keep arc_state in sync with parser_state
    arc_state = transition_state_class(parser_state)

    dynamicOracleTrainTest(parser_state)


if __name__ == '__main__':
    main()
Example #3
def test_parse():
    """Simple tests for the PartialParse.parse function.
    Warning: these are not exhaustive.
    """
    sentence = [
        Token(i + 1, f) for i, f in enumerate(["parse", "this", "sentence"])
    ]
    state = ParserState(stack=[ROOT], buffer=sentence)
    dependencies = state.parse(["S", "S", "S", "LA", "RA", "RA"])
    dependencies = [(a[0].form, a[1].form) for a in sorted(dependencies)]
    expected = [('ROOT', 'parse'), ('parse', 'sentence'), ('sentence', 'this')]
    assert dependencies == expected, \
        f"parse test resulted in dependencies {dependencies}, expected {expected}"
    assert [t.form for t in sentence] == ["parse", "this", "sentence"], \
        "parse test failed: the input sentence should not be modified"
    print("parse test passed!")
Example #4
    def parse(self, sentences, model, conllu=False):
        """
        @param sentences: a list of (list Token).
        @param model: a trained parser model.
        @param conllu: if True prints the parsed sentences in CoNLL-U format.
        """
        vsentences = self.vectorize(sentences)

        UAS = LAS = all_tokens = 0.0
        for sent, vsent in zip(sentences, vsentences):
            if not conllu: print('.', end='')  # show progress
            state = ParserState([self.root_token], vsent, [])  # FIXME
            while state.buffer or len(state.stack) > 1:
                feats = state.extract_features(self)
                trans = model.predict([feats])[0].argmax()
                if not state.step(trans):
                    break  # if transition is not feasible
            if conllu:
                for j, t in enumerate(sent):
                    head = deprel = 0
                    for arc in state.arcs:
                        if arc[1].id == t.id:
                            head = arc[0].id
                            deprel = arc[2]
                            break
                    print('\t'.join([
                        str(j + 1), t.form, '_', t.pos, '_', '_',
                        str(head), self.id2dep[deprel], '_', '_'
                    ]))
                print()
            for arc in state.arcs:
                pred_h = arc[0].id
                gold_h = arc[1].head
                UAS += pred_h == gold_h
                pred_l = arc[2]
                gold_l = arc[1].deprel
                LAS += pred_h == gold_h and pred_l == gold_l
                all_tokens += 1
        UAS /= all_tokens
        LAS /= all_tokens
        return UAS, LAS
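UAS here is the fraction of arcs whose predicted head matches the gold head, and LAS additionally requires the predicted label to match; e.g. 8 correct heads out of 10 tokens gives UAS = 0.8. A hedged usage sketch, where `parser`, `model`, and `dev_sentences` are assumed names:

# Hypothetical evaluation call (names assumed, not defined in the example).
uas, las = parser.parse(dev_sentences, model, conllu=False)
print('UAS: %.2f%%  LAS: %.2f%%' % (uas * 100, las * 100))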
Example #5
  def advanceSentence(self, i):
      self.logger.debug('Slot(%d): advance sentence' % i)
      assert 0 <= i < self.batch_size
      if self.sentence_batch.advanceSentence(i):
          self.parser_states[i] = ParserState(
              self.sentence_batch.sentence(i), self.feature_maps)
          # necessary for initializing and pushing root
          # keep arc_states in sync with parser_states
          self.arc_states[i] = \
              self.transition_state_class(self.parser_states[i])
      else:
          self.parser_states[i] = None
          self.arc_states[i] = None
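In the surrounding readers this method is typically invoked once per slot to (re)fill the whole batch; a minimal sketch of such a loop, where the method name `refill` is an assumption:

  def refill(self):
      # Hypothetical driver: advance every slot; exhausted slots become None.
      for i in range(self.batch_size):
          self.advanceSentence(i)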
Example #6
  def advanceSentence(self, i):
      assert 0 <= i < self.batch_size
      if self.sentence_batch.advanceSentence(i):
          self.parser_states[i] = ParserState(
              self.sentence_batch.sentence(i), self.feature_maps)
          # necessary for initializing and pushing root
          # keep arc_states in sync with parser_states
          self.arc_states[i] = \
              self.transition_state_class(self.parser_states[i])
      else:
          self.parser_states[i] = None
          self.arc_states[i] = None
      if self.state(i) is not None:
          self.docids_.insert(0, self.state(i).sentence().docid())
Example #7
    def create_features(self, sentences):
        """
        Build training instances.
        @return: list(features), list(action) for each state while parsing each sentence
        """
        train_x, train_y = [], []
        with tqdm(total=len(sentences)) as prog:
            for sent in sentences:
                # arcs = [(head, dependent, deprel)]
                state = ParserState([self.root_token], sent, [])  # FIXME
                while state.buffer or len(state.stack) > 1:
                    gold_t = state.get_oracle()
                    if gold_t is None:
                        break
                    train_x.append(state.extract_features(self))
                    train_y.append(gold_t)
                    state.step(gold_t)  # perform transition
                prog.update(1)

        return train_x, train_y
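The two parallel lists can be fed straight into a classifier; a hedged sketch using scikit-learn, where the fixed-length numeric feature vectors and integer transition ids are assumptions about `extract_features` and `get_oracle`:

# Hypothetical training step on the extracted instances (names assumed).
from sklearn.linear_model import LogisticRegression

train_x, train_y = parser.create_features(train_sentences)
model = LogisticRegression(max_iter=1000).fit(train_x, train_y)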
Example #8
  def forward(self, document, train=False, debug=False):
    # Compute LSTM outputs for all tokens.
    lr_out, rl_out, lstm_features = self._lstm_outputs(document)

    # Run FF unit.
    state = ParserState(document, self.spec)
    actions = self.spec.actions
    cascade = self.spec.cascade
    ff_activations = []

    if train:
      losses = Losses()

      # Translate the gold actions into their cascade equivalents.
      cascade_gold = cascade.translate(document.gold)
      gold_index = 0
      while not state.done:
        # Compute the hidden layer once for all cascade delegates.
        ff_activation, _ = self._ff_activation(
            lr_out, rl_out, ff_activations, state)
        cascading = True
        delegate_index = 0   # assume we start the cascade at delegate 0
        while cascading:
          # Get the gold action for the delegate and compute loss w.r.t. it.
          gold = cascade_gold[gold_index]
          step_loss = cascade.loss(delegate_index, state, ff_activation, gold)
          losses.add(delegate_index, step_loss)

          # If the gold action was a CASCADE, move to the next delegate.
          if gold.is_cascade():
            delegate_index = gold.delegate
          else:
            state.advance(gold)
            cascading = False
          gold_index += 1

      return losses
    else:
      if document.size() == 0: return state

      shift = actions.action(actions.shift())
      stop = actions.action(actions.stop())
      disallowed_counts = [0] * cascade.size()
      total_counts = [0] * cascade.size()
      trace = Trace(self.spec, state, lstm_features) if debug else None
      while not state.done:
        # Compute the FF activation once for all cascade delegates.
        ff_activation, ff_features = self._ff_activation(
            lr_out, rl_out, ff_activations, state, debug=debug)
        if trace:
          trace.start_step(state, ff_features)

        # Store the last CASCADE action in a cascade.
        delegate_index = 0
        last = None
        while True:
          # Get the highest scoring action from the cascade delegate.
          # Note: We don't have to do any filtering or checking here, we
          # can just return the top-scoring action.
          best = cascade.predict(delegate_index, state, last, ff_activation)
          final = best
          if best.is_cascade():
            delegate_index = best.delegate
            last = best
          else:
            # If the action isn't allowed or can't be applied to the state,
            # then default to SHIFT or STOP.
            index = actions.index(best)
            total_counts[delegate_index] += 1
            if actions.disallowed[index] or not state.is_allowed(index):
              disallowed_counts[delegate_index] += 1
              final = shift
              if state.current == state.end:
                final = stop

          if trace:
            trace.action(best, final)
          if not final.is_cascade():
            # Apply the action and stop the cascade.
            state.advance(final)
            break

      return state, disallowed_counts, total_counts, trace
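Both modes share the entry point: `train=True` returns the accumulated `Losses` for the gold cascade, while inference returns the final state plus cascade statistics. A minimal sketch of the two calls, where `model` and `document` are assumed to come from the surrounding training loop:

# Hedged usage sketch; `model` is an instance of the class defining forward().
losses = model.forward(document, train=True)           # accumulate gold losses
state, disallowed, total, trace = model.forward(
    document, train=False, debug=True)                 # traced inference pass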
Example #9
    def setupParser(self, mode):
        hiddenLayerSizes = self.modelParams.cfg['hiddenLayerSizes']
        featureStrings = self.modelParams.cfg['featureStrings']
        embeddingSizes = self.modelParams.cfg['embeddingSizes']
        batchSize = self.modelParams.cfg['batchSize']
        transitionSystem = self.modelParams.cfg['transitionSystem']

        if transitionSystem == 'arc-standard':
            self.transitionSystem = ArcStandardTransitionSystem()
        elif transitionSystem == 'arc-eager':
            self.transitionSystem = ArcEagerTransitionSystem()
        else:
            raise ValueError(
                'transition system must be arc-standard or arc-eager')

        assert len(hiddenLayerSizes) > 0, 'must have at least one hidden layer'
        assert len(featureStrings) == len(set(featureStrings)), \
            'duplicate feature string detected'

        if mode == 'train':
            # determine if we have to compute or read the lexicon
            self.logger.info('Computing lexicon from training corpus...')
            self.modelParams.lexicon.compute()
            self.logger.info('Done building lexicon')
            self.modelParams.lexicon.write()
        elif mode == 'evaluate':
            self.logger.info('Reading lexicon from trained model...')
            self.modelParams.lexicon.read()
        else:
            raise ValueError('invalid mode: ' + mode)

        self.featureMaps = self.modelParams.lexicon.getFeatureMaps()

        self.logger.info('Feature strings: ' + str(featureStrings))

        # Get major type groups in sorted order by constructing a null parser
        # state and extracting features, and then concatenating the similar
        # types
        extractor = SparseFeatureExtractor(featureStrings, self.featureMaps)
        fvec = extractor.extract(
            ParserState(ParsedConllSentence(docid=None), self.featureMaps),
            doLogging=False)

        featureTypeInstances = fvec.types
        self.featureMajorTypeGroups, _ = fvec.concatenateSimilarTypes()

        # index: major feature type index
        # values: feature names under that type
        self.featureNames = [[] for _ in self.featureMajorTypeGroups]

        self.logger.info('Detected major feature groups (in alphabetical '
                         'order): ' + str(self.featureMajorTypeGroups))

        self.featureDomainSizes = []
        #self.featureEmbeddings = []

        # For now, use all same embedding sizes
        self.featureEmbeddingSizes = \
            [embeddingSizes[t] for t in self.featureMajorTypeGroups]

        self.BAG_OF_FEATURES_LEN = 0

        for instance in featureTypeInstances:
            major_type_index = self.featureMajorTypeGroups.index(
                instance.major_type)

            self.featureNames[major_type_index].append(instance.name)

            self.BAG_OF_FEATURES_LEN += (
                self.featureEmbeddingSizes[major_type_index])

        for i, major_type in enumerate(self.featureMajorTypeGroups):
            domain_size = self.featureMaps[major_type].getDomainSize(
                includeSpecial=True)

            self.logger.info('')
            self.logger.info('Feature group \'%s\'' % major_type)
            self.logger.info('... domain size: %d' % domain_size)
            self.logger.info('... embedding size: %d' %
                             self.featureEmbeddingSizes[i])
            #self.logger.info('... feature count: %d' %
            #                 len(self.featureNames[i]))
            self.logger.info('... features')

            for fname in self.featureNames[i]:
                self.logger.info('....... %s' % fname)

            self.logger.info('... total group embedding size: %d' %
                             (len(self.featureNames[i]) *
                              self.featureEmbeddingSizes[i]))

            self.logger.info('... initializing random normal embeddings...')
            self.featureDomainSizes.append(domain_size)

        assert len(self.featureDomainSizes) == len(self.featureEmbeddingSizes)
        #assert len(self.featureDomainSizes) == len(self.featureEmbeddings)
        assert len(self.featureDomainSizes) == len(self.featureNames)

        self.logger.info('')
        self.logger.info('Batch size (number of parser states): %d' %
                         batchSize)
        self.logger.info('Total feature count: %d' %
                         len(featureTypeInstances))
        self.logger.info('Total bag of features length per state: %d' %
                         self.BAG_OF_FEATURES_LEN)
        self.logger.info('Total features input size: %d' %
                         (batchSize * self.BAG_OF_FEATURES_LEN))

        # for actions, we don't encode UNKNOWN, ROOT, or OUTSIDE
        # we only encode the number of base values
        self.ACTION_COUNT = self.transitionSystem.numActions(
            self.featureMaps['label'].getDomainSize(includeSpecial=False))

        self.logger.info('Total action count: %d' % self.ACTION_COUNT)