Example #1
0
    def generate_actions(self, state):
        """
        Lazily produce every zero-cost action available in the given state.

        When no edges remain, or both buffer and stack are empty, only Finish
        (plus Reduce if the stack is non-empty) is offered.  Otherwise the top
        of the stack is examined for node-creating actions, binary edge actions
        and, failing those, a Swap; Shift is the fallback while the buffer is
        non-empty and no edge action was found.

        :param state: current State of the parser
        :return: generator of Action items to perform
        """
        if not (self.edges_remaining and (state.buffer or state.stack)):
            yield Actions.Finish
            if state.stack:
                yield Actions.Reduce
            return

        # Reset here; presumably flipped by the create_*_action helpers (not visible in this block)
        self.edge_found = False
        if state.stack:
            top = state.stack[-1]
            incoming = self.edges_remaining.intersection(top.orig_node.incoming)
            outgoing = self.edges_remaining.intersection(top.orig_node.outgoing)
            if not (incoming or outgoing):
                # No remaining edge touches the stack top
                yield Actions.Reduce
                return

            # Unary actions that create new nodes
            for in_edge in incoming:
                if in_edge.parent.ID in self.nodes_remaining and not in_edge.attrib.get("remote"):
                    yield self.create_node_action(in_edge, in_edge.parent, Actions.Node)

            for out_edge in outgoing:
                if out_edge.child.attrib.get("implicit"):
                    yield self.create_node_action(out_edge, out_edge.child, Actions.Implicit)

            if len(state.stack) > 1:
                second = state.stack[-2]
                # Binary edges between the two topmost stack items
                for in_edge in incoming:
                    if in_edge.parent.ID == second.node_id:
                        yield self.create_edge_action(in_edge, Action.RIGHT)

                for out_edge in outgoing:
                    if out_edge.child.ID == second.node_id:
                        yield self.create_edge_action(out_edge, Action.LEFT)

                if not self.edge_found:
                    # Decide whether a swap is needed, and how far (if compound swap is enabled).
                    # Map each node on the other end of a remaining edge to that edge;
                    # incoming entries are added last, so they win on duplicate IDs.
                    related = {out_edge.child.ID: out_edge for out_edge in outgoing}
                    related.update((in_edge.parent.ID, in_edge) for in_edge in incoming)
                    distance = None  # How many stack nodes to swap
                    # Walk down the stack, skipping the top two items
                    for depth, below in enumerate(state.stack[-3::-1], start=1):
                        edge = related.pop(below.node_id, None)
                        if edge is None:
                            continue
                        if Config().no_swap:
                            # Unreachable without swapping, so give up on this edge
                            self.remove(edge)
                            continue
                        if distance is None and Config().compound_swap:
                            distance = depth  # Remember only the first match
                        if not related:
                            # Every related node has been located in the stack
                            yield Actions.Swap(distance)
                            return

        if state.buffer and not self.edge_found:
            yield Actions.Shift
Example #2
0
File: node.py Project: viksit/ucca
 def add_to_l1(self, l1, parent, tag, terminals, train):
     """
     Create the core.Node for this parser node while the final Passage is assembled.

     A node with text is attached to the parent's node through a Terminal edge;
     a node whose single outgoing edge leads to a punctuation terminal is created
     via l1.add_punct; any other node is created via l1.add_fnode.

     :param l1: Layer1 of the passage
     :param parent: parent node (its `node` attribute is what the new node is attached to)
     :param tag: edge tag to link to parent
     :param terminals: all terminals strings in the passage
     :param train: in training, so keep original node IDs in the "remarks" field
     """
     if Config().verify:
         assert self.node is None or self.text is not None, \
             "Trying to create the same node twice: %s, parent: %s" % (self.node.ID, parent)

     # Only a node with exactly one outgoing edge can be a punctuation wrapper
     only_edge = None
     if len(self.outgoing) == 1:
         only_edge = self.outgoing[0]

     if self.text:
         # Word terminal (punctuation was already created by add_punct for the parent)
         if self.node is None and parent.node is not None:
             terminal_edge = parent.node.add(EdgeTags.Terminal, terminals[self.index])
             self.node = terminal_edge.child
     elif only_edge and only_edge.child.text and layer0.is_punct(terminals[only_edge.child.index]):
         if Config().verify:
             assert tag == EdgeTags.Punctuation, "Tag for %s is %s" % (parent.node_id, tag)
             assert only_edge.tag == EdgeTags.Terminal, "Tag for %s is %s" % (self.node_id, only_edge.tag)
         self.node = l1.add_punct(parent.node, terminals[only_edge.child.index])
         # The terminal child is taken from the first edge of the new punctuation node
         only_edge.child.node = self.node[0].child
     else:
         # The usual case: a regular node
         self.node = l1.add_fnode(parent.node, tag, implicit=self.implicit)

     if train and self.node is not None and self.node_id is not None:
         # Keep the original node ID for reference
         self.node.extra["remarks"] = self.node_id
Example #3
0
File: tune.py Project: viksit/ucca
 def run(self):
     """
     Run the parser once with this object's hyper-parameters and keep its scores.

     Copies ``learning_rate`` and ``decay_factor`` onto the global Config, calls
     ``parse.main()`` and stores the return value in ``self.scores``.

     :raises AssertionError: if the command-line arguments are insufficient,
                             or if the parser returned no scores
     """
     assert Config().args.train and Config().args.passages or Config().args.folds, \
         "insufficient parameters given to parser"
     print("Running with %s" % self)
     Config().learning_rate = self.learning_rate
     Config().decay_factor = self.decay_factor
     self.scores = parse.main()
     # Validate the attribute that was just assigned, so a run that produced
     # no scores is actually detected
     assert self.scores is not None, "parser failed to produce score"
Example #4
0
def dense_features_wrapper(wrapper):
    """
    Wrap a fresh DenseFeatureExtractor with the given wrapper.

    The wrapper receives one keyword per feature suffix; each value is a pair
    whose first item is read from Config and whose second item is a fixed integer.

    :param wrapper: callable accepting the extractor followed by the keyword pairs
    :return: whatever `wrapper` returns
    """
    from features.dense_features import DenseFeatureExtractor
    extractor = DenseFeatureExtractor()
    # Insertion order matches the keyword order the wrapper will see
    settings = {
        "w": (Config().word_vectors, 10000),
        "t": (Config().tag_dim, 100),
        "e": (Config().label_dim, 15),
        "p": (Config().punct_dim, 5),
        "x": (Config().gap_dim, 3),
    }
    return wrapper(extractor, **settings)
Example #5
0
File: parse.py Project: viksit/ucca
    def parse_passage(self, train=False):
        """
        Run the transition loop on the current state until it is finished.

        :param train: if True, update the model from the oracle's actions and
                      follow one of them whenever the prediction is wrong;
                      otherwise just follow the classifier
        :raises ParserException: if the oracle fails during training, or if the
                                 chosen action is an invalid transition
        """
        if Config().verbose:
            print("  initial state: %s" % self.state)
        done = False
        while not done:
            if Config().check_loops:
                self.check_loop(print_oracle=train)

            gold_actions = []
            if self.oracle is not None:
                try:
                    gold_actions = self.oracle.get_actions(self.state)
                except (AttributeError, AssertionError) as err:
                    # Outside training an oracle failure is tolerated:
                    # parsing continues with no gold actions
                    if train:
                        raise ParserException(
                            "Error in oracle during training") from err

            feats = self.feature_extractor.extract_features(self.state)
            predicted = self.predict_action(feats, gold_actions)  # sets self.scores
            chosen = predicted
            if gold_actions:
                if predicted in gold_actions:
                    self.correct_count += 1
                elif train:
                    # Update towards the highest-scoring gold action
                    if len(gold_actions) == 1:
                        best_gold = gold_actions[0]
                    else:
                        gold_ids = [a.id for a in gold_actions]
                        best_gold = gold_actions[self.scores[gold_ids].argmax()]
                    rate = self.learning_rate
                    if best_gold.is_swap:
                        rate *= Config().importance
                    self.model.update(feats, predicted.id, best_gold.id, rate)
                    # Follow a randomly chosen gold action instead of the wrong prediction
                    chosen = Config().random.choice(gold_actions)
            else:
                gold_actions = "?"  # Placeholder shown in verbose output

            self.action_count += 1
            try:
                self.state.transition(chosen)
            except AssertionError as err:
                raise ParserException("Invalid transition (%s): %s" %
                                      (chosen, err)) from err

            if Config().verbose:
                if self.oracle is None:
                    print("  action: %-15s %s" % (chosen, self.state))
                else:
                    gold_text = "|".join(str(gold) for gold in gold_actions)
                    print("  predicted: %-15s true: %-15s taken: %-15s %s" %
                          (predicted, gold_text, chosen, self.state))
                for line in self.state.log:
                    print("    " + line)

            done = self.state.finished  # True once the action taken was Finish
Example #6
0
 def __init__(self, passage):
     """
     Collect the nodes and edges of the passage that are still to be created.

     `nodes_remaining` holds the IDs of all layer-1 nodes except the first head;
     `edges_remaining` holds the edges of every passage node.  Linkage, implicit
     and remote items are left out when the matching Config "no_*" flag is set.

     :param passage: passage to read nodes and edges from
     """
     l1 = passage.layer(layer1.LAYER_ID)

     def keep_node(node):
         # The first head of layer 1 is never tracked
         if node is l1.heads[0]:
             return False
         if Config().no_linkage and node.tag == layer1.NodeTags.Linkage:
             return False
         return not (Config().no_implicit and node.attrib.get("implicit"))

     def keep_edge(edge):
         if Config().no_linkage and edge.tag in (
                 layer1.EdgeTags.LinkRelation, layer1.EdgeTags.LinkArgument):
             return False
         if Config().no_implicit and edge.child.attrib.get("implicit"):
             return False
         return not (Config().no_remote and edge.attrib.get("remote"))

     self.nodes_remaining = {node.ID for node in l1.all if keep_node(node)}
     self.edges_remaining = {edge
                             for node in passage.nodes.values()
                             for edge in node
                             if keep_edge(edge)}
     self.passage = passage
     self.edge_found = False
     self.log = None
Example #7
0
 def __init__(self, feature_extractor, **kwargs):
     """
     Wrap a feature extractor and prepare one embedding table per feature suffix.

     :param feature_extractor: extractor to wrap
     :param kwargs: maps each suffix to a sequence whose first item is either an
                    int (vector size; vectors are drawn randomly on first lookup)
                    or something to load word2vec-format vectors from
     """
     self.feature_extractor = feature_extractor
     self.sizes = {}
     self.embedding = {}
     for suffix, dims in kwargs.items():
         spec = dims[0]
         if not isinstance(spec, int):
             # Pre-trained vectors, with one random vector passed to the wrapper as `unk`
             print("Loading word vectors from '%s'..." % spec)
             w2v = Word2Vec.load_word2vec_format(spec)
             unk = Config().random.normal(size=w2v.vector_size)
             self.sizes[suffix] = w2v.vector_size
             self.embedding[suffix] = Word2VecWrapper(w2v, unk)
             continue
         self.sizes[suffix] = spec
         # Bind the size as a default argument so each table keeps its own value
         self.embedding[suffix] = defaultdict(
             lambda s=spec: Config().random.normal(size=s))
Example #8
0
File: state.py Project: viksit/ucca
 def fix_terminal_tags(self, terminals):
     """
     Overwrite the tag of each given terminal with that of the matching original terminal.

     :param terminals: terminals to fix, paired positionally with self.terminals
     """
     for fixed, original in zip(terminals, self.terminals):
         if fixed.tag == original.tag:
             continue
         if Config().verbose:
             print("%s is the wrong tag for terminal: %s" % (fixed.tag, fixed.text),
                   file=sys.stderr)
         fixed.tag = original.tag
Example #9
0
File: parse.py Project: viksit/ucca
 def __init__(self, model_file=None, model_type="sparse"):
     """
     Set up counters, the feature extractor and the classifier.

     :param model_file: model file name (only stored here)
     :param model_type: classifier type name handed to models.create_model
     """
     # Objects that are re-created at each parse
     self.state = None
     self.oracle = None
     self.scores = None  # NumPy array of action scores at each action
     self.state_hash_history = None  # For loop checking

     # Per-passage and overall counters
     self.action_count = self.correct_count = 0
     self.total_actions = self.total_correct = 0

     extractor, classifier = models.create_model(model_type, Actions().all)
     self.feature_extractor = extractor
     self.model = classifier
     self.model_file = model_file
     self.learning_rate = Config().learning_rate
     self.decay_factor = Config().decay_factor

     def ignore_node(n):
         # Used in verify_passage to optionally ignore a mismatch in linkage nodes.
         # Config is consulted on every call; with no_linkage off the result is None.
         if Config().no_linkage:
             return n.tag == layer1.NodeTags.Linkage
         return None

     self.ignore_node = ignore_node
Example #10
0
File: parse.py Project: viksit/ucca
def evaluate_passage(guessed_passage, ref_passage):
    """
    Evaluate a predicted passage against its reference and print the unlabeled F1.

    :param guessed_passage: predicted passage (verbose output is off when it is None)
    :param ref_passage: reference passage
    :return: the score object returned by evaluation.evaluate
    """
    show_details = Config().verbose and guessed_passage is not None
    score = evaluation.evaluate(guessed_passage, ref_passage,
                                verbose=show_details, units=False, errors=False)
    f1 = score.average_unlabeled_f1()
    print("F1=%.3f" % f1, flush=True)
    return score
Example #11
0
File: parse.py Project: viksit/ucca
def main():
    """
    Run the parser: either k-fold cross-validation or a plain train/dev/test run.

    With `args.folds` set, all passages are shuffled and dealt into k folds;
    fold i is used as dev, fold i+1 (cyclically) as test and the rest for training.
    Otherwise train, dev and test passages are read from their own arguments.

    :return: aggregated scores, or None if none were produced
    """
    args = Config().args
    print("Running parser with %s" % Config())
    scores = None
    if Config().test_scores:
        # Start the test scores file with a header row
        with open(Config().test_scores, "w") as f:
            print(",".join(evaluation.Scores.field_titles()), file=f)

    if args.folds is None:
        # Simple train/dev/test by given arguments
        train_arg, dev_arg, test_arg = args.train, args.dev, args.passages
        train_passages = util.read_files_and_dirs(train_arg)
        dev_passages = util.read_files_and_dirs(dev_arg)
        test_passages = util.read_files_and_dirs(test_arg)
        return train_test(train_passages, dev_passages, test_passages, args)

    k = args.folds
    fold_scores = []
    all_passages = list(util.read_files_and_dirs(args.passages))
    assert len(all_passages) >= k,\
        "%d folds are not possible with only %d passages" % (k, len(all_passages))
    Config().random.shuffle(all_passages)
    folds = [all_passages[start::k] for start in range(k)]
    for index, dev_passages in enumerate(folds):
        print("Fold %d of %d:" % (index + 1, k))
        test_passages = folds[(index + 1) % k]
        train_passages = []
        for fold in folds:
            if fold is dev_passages or fold is test_passages:
                continue
            train_passages.extend(fold)
        fold_score = train_test(train_passages, dev_passages, test_passages, args,
                                "_%d" % index)
        if fold_score is not None:
            fold_scores.append(fold_score)

    if fold_scores:
        scores = evaluation.Scores.aggregate(fold_scores)
        per_fold = ", ".join("%.3f" % s.average_unlabeled_f1()
                             for s in fold_scores)
        print("Average unlabeled test F1 score for each fold: " + per_fold)
        print("Aggregated scores across folds:\n")
        scores.print()
    return scores
Example #12
0
File: parse.py Project: viksit/ucca
 def pos_tag(state):
     """
     Function to pass to State to POS tag the tokens when created
     :param state: State object to modify
     """
     # Flatten the token groups into one list
     flat_tokens = []
     for group in state.tokens:
         flat_tokens.extend(group)
     # NOTE(review): `pos_tag` below presumably resolves to an external tagger
     # in the enclosing module scope rather than to this function - confirm
     words, tags = zip(*pos_tag(flat_tokens))
     if Config().verbose:
         print(" ".join("%s/%s" % pair for pair in zip(words, tags)))
     for node, tag in zip(state.nodes, tags):
         node.pos_tag = tag
Example #13
0
File: state.py Project: viksit/ucca
 def add_node(self, *args, **kwargs):
     """
     Create a new Node (not core.Node) in the temporary representation and log it.

     The node's first constructor argument is the current number of nodes.

     :param args: ordinal arguments for Node()
     :param kwargs: keyword arguments for Node()
     :return: the newly added node
     """
     next_index = len(self.nodes)
     created = Node(next_index, *args, **kwargs)
     if Config().verify:
         assert created not in self.nodes, "Node already exists"
     self.nodes.append(created)
     self.log.append("node: %s" % created)
     return created
Example #14
0
File: state.py Project: viksit/ucca
 def assert_possible_edge():
     """
     Raise AssertionError if the edge described by `action` may not be created.

     `self` and `action` are taken from the enclosing scope.
     """
     parent, child = self.get_parent_child(action)
     assert_possible_parent(parent)
     assert_possible_child(child)
     if parent is self.root:
         if Config().constraints:
             assert child.text is None, "Root may not have terminal children, but is being added '%s'" % child
             assert action.tag in Constraints.TopLevel, "The root may not have %s edges" % action.tag
     # A former multiple_edges option compared whole Edge objects here;
     # it was removed as not useful, so only parent/child pairs are checked.
     assert child not in parent.children, "Edge must not already exist: %s->%s" % (parent, child)
     assert parent not in child.descendants, "Detected cycle created by edge: %s->%s" % (parent, child)
Example #15
0
File: util.py Project: viksit/ucca
def read_passages(files):
    """
    Lazily yield passages from a mix of file names and Passage objects.

    A file that cannot be read as a passage file is converted instead, using
    the converter registered for its extension (plain text by default).

    :param files: iterable of files or Passage objects
    :return: generator of passages from all files given
    :raises IOError: if an item is neither a Passage nor an existing file
    """
    for item in files:
        if isinstance(item, core.Passage):
            # Not really a file, but a Passage
            passage = item
        else:
            if not os.path.exists(item):
                raise IOError("File not found: %s" % item)
            try:
                passage = ioutil.file2passage(item)  # XML or binary format
            except (IOError, ParseError):
                # Failed to read as passage file, so convert from another format
                base, ext = os.path.splitext(os.path.basename(item))
                converter = convert.FROM_FORMAT.get(ext.lstrip("."), convert.from_text)
                with open(item) as f:
                    yield from converter(f, passage_id=base, split=Config().split)
                continue
        if Config().split:
            yield from convert.split2segments(passage, is_sentences=Config().sentences)
        else:
            yield passage
def main():
    """
    Initialise the global configuration and run the ParseTron synchronisation.

    Logs an error and returns early if the configuration cannot be initialised.
    """
    gl.config = Config(gl.basic_file)
    ok: bool = gl.config.InitConfig()
    if not ok:
        # Adjacent string literals keep the message on one line; a backslash
        # continuation inside the literal would embed the next source line's
        # indentation in the logged text.
        logging.error("Failed to init config, "
                      "please recheck config files and contents.")
        return

    # Try to connect and create the tables

    # Parse and insert into the tables

    parse_tron = ParseTron(gl.config)
    parse_tron.sync()
Example #17
0
File: parse.py Project: viksit/ucca
def train_test(train_passages,
               dev_passages,
               test_passages,
               args,
               model_suffix=""):
    """
    Train a parser, then parse (and optionally evaluate and write) the test passages.

    :param train_passages: passages to train on (may be empty)
    :param dev_passages: passages passed to training as the dev set
    :param test_passages: passages to parse after training
    :param args: parsed command-line arguments
    :param model_suffix: text inserted before the extension of the model file name
    :return: aggregated test scores, or None if none were aggregated
    """
    train = bool(train_passages)
    model_file = args.model
    if model_file is not None:
        base, ext = os.path.splitext(model_file)
        model_file = base + model_suffix + ext
    parser = Parser(model_file=model_file, model_type=args.classifier)
    parser.train(train_passages,
                 dev=dev_passages,
                 iterations=args.iterations,
                 folds=args.folds)
    if not test_passages:
        return None

    if args.train or args.folds:
        print("Evaluating on test passages")
    passage_scores = []
    for guessed, reference in parser.parse(test_passages):
        if args.evaluate or train:
            passage_scores.append(evaluate_passage(guessed, reference))
        if guessed is not None and not args.nowrite:
            util.write_passage(guessed, args)

    # In verbose mode a single passage score is not aggregated
    if not passage_scores or (args.verbose and len(passage_scores) <= 1):
        return None

    scores = evaluation.Scores.aggregate(passage_scores)
    print("\nAverage F1 score on test: %.3f" %
          scores.average_unlabeled_f1())
    print("Aggregated scores:")
    scores.print()
    if Config().test_scores:
        with open(Config().test_scores, "a") as f:
            print(",".join(scores.fields()), file=f)
    return scores
Example #18
0
File: state.py Project: viksit/ucca
 def assert_possible_parent(node):
     """
     Raise AssertionError if `node` may not get the outgoing edge described by `action`.

     `action` is taken from the enclosing scope.

     :param node: candidate parent node
     :raises AssertionError: if the node is a terminal or implicit, or (with
                             constraints enabled) a tag constraint is violated
     """
     assert node.text is None, "Terminals may not have children: %s" % node.text
     # Report the node actually being checked; it need not be the top of the stack
     assert not node.implicit, "Implicit nodes may not have children: %s" % node
     if Config().constraints:
         assert action.tag not in Constraints.UniqueOutgoing or action.tag not in node.outgoing_tags, \
             "Outgoing edge tag %s must be unique, but %s already has one" % (
                 action.tag, node)
         assert action.tag not in Constraints.MutuallyExclusiveOutgoing or not \
             node.outgoing_tags & Constraints.MutuallyExclusiveOutgoing, \
             "Outgoing edge tags %s are mutually exclusive, but %s already has %s and is being added %s" % (
                 Constraints.MutuallyExclusiveOutgoing, node, node.outgoing_tags, action.tag)
         assert action.tag in Constraints.ChildlessOutgoing or not \
             node.incoming_tags & Constraints.ChildlessIncoming, \
             "Units with incoming %s edges may not have children, and %s has incoming %s" % (
                 Constraints.ChildlessIncoming, node, node.incoming_tags)
Example #19
0
File: state.py Project: viksit/ucca
 def assert_possible_child(node):
     """
     Raise AssertionError if `node` may not get the incoming edge described by `action`.

     `self` and `action` are taken from the enclosing scope.

     :param node: candidate child node
     """
     assert node is not self.root, "The root may not have parents"
     child_is_terminal = node.text is not None
     assert child_is_terminal == (action.tag == EdgeTags.Terminal), \
         "Edge tag must be %s iff child is terminal, but node is %s and edge tag is %s" % (
             EdgeTags.Terminal, node, action.tag)
     if not Config().constraints:
         return
     assert not (action.tag in Constraints.UniqueIncoming and
                 action.tag in node.incoming_tags), \
         "Incoming edge tag %s must be unique, but %s already has one" % (
             action.tag, node)
     assert not (action.tag in Constraints.ChildlessIncoming and
                 not node.outgoing_tags <= Constraints.ChildlessOutgoing), \
         "Units with incoming %s edges may not have children, but %s has %d" % (
             Constraints.ChildlessIncoming, node, len(node.children))
     # A second parent is fine only via remote edges or tags that allow multiple incoming
     assert action.remote or action.tag in Constraints.possible_multiple_incoming() or \
         all(existing.remote or existing.tag in Constraints.possible_multiple_incoming()
             for existing in node.incoming), \
         "Multiple parents only allowed if they are remote or linkage edges: %s, %s" % (
             action, node)
Example #20
0
def create_model(model_type, labels):
    """
    Instantiate the feature extractor and classifier for the requested model type.

    Imports are done per branch, so only the modules of the chosen type are loaded.

    :param model_type: one of "sparse", "dense" or "nn"
    :param labels: labels handed to the classifier constructor
    :return: (feature extractor, model) pair
    :raises ValueError: for any other model type
    """
    if model_type == "sparse":
        from classifiers.sparse_perceptron import SparsePerceptron
        from features.sparse_features import SparseFeatureExtractor
        extractor = SparseFeatureExtractor()
        return extractor, SparsePerceptron(labels, min_update=Config().min_update)
    if model_type == "dense":
        from features.embedding import FeatureEmbedding
        from classifiers.dense_perceptron import DensePerceptron
        extractor = dense_features_wrapper(FeatureEmbedding)
        return extractor, DensePerceptron(labels, num_features=extractor.num_features())
    if model_type == "nn":
        from features.indexer import FeatureIndexer
        from classifiers.neural_network import NeuralNetwork
        extractor = dense_features_wrapper(FeatureIndexer)
        return extractor, NeuralNetwork(labels, inputs=extractor.feature_types)
    raise ValueError("Invalid model type: '%s'" % model_type)
Example #21
0
File: state.py Project: viksit/ucca
 def transition(self, action):
     """
     Main part of the parser: apply action given by oracle or classifier.

     Resets ``self.log``, changes the stack, buffer and graph according to the
     action type, and finally records the action in ``self.actions``.

     :param action: Action object to apply
     :raises ValueError: if the action is of none of the known types
     :raises AssertionError: if verification is enabled and the stack and buffer
                             overlap (assert_node_ratio may presumably raise too)
     """
     action.apply()
     self.log = []
     if action.is_type(Actions.Shift):  # Push buffer head to stack; shift buffer
         self.stack.append(self.buffer.popleft())
     elif action.is_type(Actions.Node):  # Create new parent node and add to the buffer
         parent = self.add_node(action.orig_node)
         self.update_swap_index(parent)
         self.add_edge(Edge(parent, self.stack[-1], action.tag))
         self.buffer.appendleft(parent)
     elif action.is_type(Actions.Implicit):  # Create new child node and add to the buffer
         child = self.add_node(action.orig_node, implicit=True)
         self.update_swap_index(child)
         self.add_edge(Edge(self.stack[-1], child, action.tag))
         self.buffer.appendleft(child)
     elif action.is_type(Actions.Reduce):  # Pop stack (no more edges to create with this node)
         self.stack.pop()
     elif action.is_type(Actions.LeftEdge, Actions.LeftRemote, Actions.RightEdge, Actions.RightRemote):
         parent, child = self.get_parent_child(action)
         self.add_edge(Edge(parent, child, action.tag, remote=action.remote))
     elif action.is_type(Actions.Swap):  # Place second (or more) stack item back on the buffer
         distance = action.tag or 1
         s = slice(-distance - 1, -1)  # The `distance` items just below the stack top
         self.log.append("%s <--> %s" % (", ".join(map(str, self.stack[s])), self.stack[-1]))
         self.buffer.extendleft(reversed(self.stack[s]))  # extendleft reverses the order
         del self.stack[s]
     elif action.is_type(Actions.Finish):  # Nothing left to do
         self.finished = True
     else:
         # Format rather than concatenate: `str + action` would itself fail
         # with TypeError for a non-string action and hide the real problem
         raise ValueError("Invalid action: %s" % (action,))
     if Config().verify:
         intersection = set(self.stack).intersection(self.buffer)
         assert not intersection, "Stack and buffer overlap: %s" % intersection
     self.assert_node_ratio()
     self.actions.append(action)
Example #22
0
File: parse.py Project: viksit/ucca
    def train(self, passages, dev=None, iterations=1, folds=None):
        """
        Train the parser on the given passages, keeping the best model seen.

        After each iteration the model is finalized; if dev passages are given
        it is scored on them, and it replaces the best model (and is saved to
        the model file, if any) when the score is at least the best so far.
        Without dev passages every iteration's finalized model is kept.

        :param passages: iterable of passages to train on
        :param dev: iterable of passages to tune on
        :param iterations: number of iterations to perform
        :param folds: whether we are inside cross-validation with this many folds
        :return: trained model
        """
        if not passages:
            # Nothing to train on; use a pre-trained model if a file was given
            if self.model_file is not None:
                self.model.load(self.model_file, util)
                Actions().all = self.model.labels
            return self.model

        top_score = 0
        top_model = None
        keep_model = True
        final = False
        if Config().dev_scores:
            # Start the dev scores file with a header row
            with open(Config().dev_scores, "w") as f:
                print(",".join(["iteration"] +
                               evaluation.Scores.field_titles()),
                      file=f)

        for iteration in range(iterations):
            if final:
                break
            final = iteration == iterations - 1
            print("Training iteration %d of %d: " %
                  (iteration + 1, iterations))
            passages = [
                passage for _, passage in self.parse(passages, mode="train")
            ]
            unfinalized = self.model  # Kept so training can resume from it
            self.model = unfinalized.finalize()  # Dev evaluation uses the finalized model
            if not final:
                self.learning_rate *= self.decay_factor
                Config().random.shuffle(passages)
            elif folds is None:
                # Free some memory, as these are not needed any more
                del passages[:]

            if dev:
                print("Evaluating on dev passages")
                evaluated = [(reference, evaluate_passage(predicted, reference))
                             for predicted, reference in self.parse(dev, mode="dev")]
                dev, scores = zip(*evaluated)
                dev = list(dev)
                scores = evaluation.Scores.aggregate(scores)
                score = scores.average_unlabeled_f1()
                print("Average unlabeled F1 score on dev: %.3f" % score)
                if Config().dev_scores:
                    with open(Config().dev_scores, "a") as f:
                        print(",".join([str(iteration)] + scores.fields()),
                              file=f)
                keep_model = score >= top_score
                if keep_model:
                    print("Better than previous best score (%.3f)" %
                          top_score)
                    top_score = score
                else:
                    print("Not better than previous best score (%.3f)" %
                          top_score)
                if score >= 1:
                    # Score cannot go any better, so no point in more training
                    final = True
                if final and folds is None:
                    del dev[:]  # Free more memory

            if keep_model or top_model is None:
                top_model = self.model  # This is the finalized model
                if self.model_file is not None:
                    top_model.save(self.model_file, util)
            if not final:
                self.model = unfinalized  # Restore non-finalized model

        print("Trained %d iterations" % iterations)

        self.model = top_model
        return self.model
Example #23
0
File: state.py Project: viksit/ucca
 def assert_node_ratio(self, extra=0):
     """
     Assert that the node ratio stays within Config().max_nodes_ratio.

     :param extra: passed on to self.node_ratio
     :raises AssertionError: if the ratio exceeds the configured maximum
     """
     limit = Config().max_nodes_ratio
     assert self.node_ratio(extra=extra) <= limit, \
         "Reached maximum ratio (%.3f) of non-terminals to terminals" % limit
Example #24
0
File: state.py Project: viksit/ucca
 def assert_height(self):
     """
     Assert that the height of the root stays within Config().max_height.

     :raises AssertionError: if the root's height exceeds the configured maximum
     """
     limit = Config().max_height
     assert self.root.height <= limit, \
         "Reached maximum graph height (%d)" % limit
Example #25
0
File: state.py Project: viksit/ucca
    def assert_valid(self, action):
        """
        Raise AssertionError if the action is invalid in the current state
        :param action: action to check for validity
        """
        def assert_possible_node():
            """Raise AssertionError if a new node may not be created in this state."""
            # When labeled (in training), there must be an original node to refer to
            assert not self.labeled or action.orig_node is not None, \
                "May only create real nodes during training"
            self.assert_node_ratio(extra=1)
            self.assert_height()

        def assert_possible_parent(node):
            """
            Raise AssertionError if `node` may not get the outgoing edge described by `action`.

            :param node: candidate parent node
            :raises AssertionError: if the node is a terminal or implicit, or (with
                                    constraints enabled) a tag constraint is violated
            """
            assert node.text is None, "Terminals may not have children: %s" % node.text
            # Report the node actually being checked; it need not be the top of the stack
            assert not node.implicit, "Implicit nodes may not have children: %s" % node
            if Config().constraints:
                assert action.tag not in Constraints.UniqueOutgoing or action.tag not in node.outgoing_tags, \
                    "Outgoing edge tag %s must be unique, but %s already has one" % (
                        action.tag, node)
                assert action.tag not in Constraints.MutuallyExclusiveOutgoing or not \
                    node.outgoing_tags & Constraints.MutuallyExclusiveOutgoing, \
                    "Outgoing edge tags %s are mutually exclusive, but %s already has %s and is being added %s" % (
                        Constraints.MutuallyExclusiveOutgoing, node, node.outgoing_tags, action.tag)
                assert action.tag in Constraints.ChildlessOutgoing or not \
                    node.incoming_tags & Constraints.ChildlessIncoming, \
                    "Units with incoming %s edges may not have children, and %s has incoming %s" % (
                        Constraints.ChildlessIncoming, node, node.incoming_tags)

        def assert_possible_child(node):
            """
            Raise AssertionError if `node` may not get the incoming edge described by `action`.

            :param node: candidate child node
            """
            assert node is not self.root, "The root may not have parents"
            child_is_terminal = node.text is not None
            assert child_is_terminal == (action.tag == EdgeTags.Terminal), \
                "Edge tag must be %s iff child is terminal, but node is %s and edge tag is %s" % (
                    EdgeTags.Terminal, node, action.tag)
            if Config().constraints:
                assert not (action.tag in Constraints.UniqueIncoming and
                            action.tag in node.incoming_tags), \
                    "Incoming edge tag %s must be unique, but %s already has one" % (
                        action.tag, node)
                assert not (action.tag in Constraints.ChildlessIncoming and
                            not node.outgoing_tags <= Constraints.ChildlessOutgoing), \
                    "Units with incoming %s edges may not have children, but %s has %d" % (
                        Constraints.ChildlessIncoming, node, len(node.children))
                # A second parent is fine only via remote edges or tags that allow multiple incoming
                assert action.remote or action.tag in Constraints.possible_multiple_incoming() or \
                    all(existing.remote or existing.tag in Constraints.possible_multiple_incoming()
                        for existing in node.incoming), \
                    "Multiple parents only allowed if they are remote or linkage edges: %s, %s" % (
                        action, node)
                # Commented out due to passage 106, unit 1.300
                # assert not node.incoming_tags or (action.tag in Constraints.LinkerIncoming) == (
                #     node.incoming_tags <= Constraints.LinkerIncoming), \
                #     "Linker units may only have incoming edges with tags from %s, but %s is being added '%s'" % (
                #         Constraints.LinkerIncoming, node, action.tag)

        def assert_possible_edge():
            """Raise AssertionError if the edge described by `action` may not be created."""
            parent, child = self.get_parent_child(action)
            assert_possible_parent(parent)
            assert_possible_child(child)
            if parent is self.root:
                if Config().constraints:
                    assert child.text is None, "Root may not have terminal children, but is being added '%s'" % child
                    assert action.tag in Constraints.TopLevel, "The root may not have %s edges" % action.tag
            # A former multiple_edges option compared whole Edge objects here;
            # it was removed as not useful, so only parent/child pairs are checked.
            assert child not in parent.children, "Edge must not already exist: %s->%s" % (parent, child)
            assert parent not in child.descendants, "Detected cycle created by edge: %s->%s" % (parent, child)

        if action.is_type(Actions.Finish):
            if not Config().no_swap:  # Without swap, the oracle may be incapable even of single action
                assert self.root.outgoing, \
                    "Root must have at least one child at the end of the parse, but has none"
        elif action.is_type(Actions.Shift):
            assert self.buffer, "Buffer must not be empty in order to shift from it"
        else:  # Unary actions
            assert self.actions, "First action must be Shift, but was %s" % action
            assert self.stack, "Action requires non-empty stack: %s" % action
            s0 = self.stack[-1]
            if action.is_type(Actions.Node):
                assert_possible_child(s0)
                assert_possible_node()
            elif action.is_type(Actions.Implicit):
                assert_possible_parent(s0)
                assert_possible_node()
            elif action.is_type(Actions.Reduce):
                assert s0 is not self.root or s0.outgoing, "May not reduce the root without children"
                # Commented out due to passage 126, unit 1.338
                # assert not s0.outgoing_tags & Constraints.SceneSufficientOutgoing and \
                #     not s0.incoming_tags & Constraints.SceneSufficientIncoming or \
                #     s0.outgoing_tags & Constraints.SceneNecessaryOutgoing, \
                #     "May not reduce a scene before it has any outgoing edge of %s (it has only %s)" % (
                #         Constraints.SceneNecessaryOutgoing, s0.outgoing_tags)
                # Commented out due to passage 126, unit 1.60
                # assert s0.incoming_tags == Constraints.LinkerIncoming or not \
                #     s0.incoming_tags & Constraints.LinkerIncoming, \
                #     "May not reduce a linker before it has all incoming edges of %s (it has only %s)" % (
                #         Constraints.LinkerIncoming, s0.incoming_tags)
            else:  # Binary actions
                assert len(self.stack) > 1, "Action requires at least two stack elements: %s" % action
                if action.is_type(Actions.LeftEdge, Actions.RightEdge, Actions.LeftRemote, Actions.RightRemote):
                    assert_possible_edge()
                elif action.is_type(Actions.Swap):
                    # A regular swap is possible since the stack has at least two elements;
                    # A compound swap is possible if the stack is longer than the distance
                    distance = action.tag or 1
                    assert 1 <= distance < len(self.stack), "Invalid swap distance: %d" % distance
                    swapped = self.stack[-distance - 1]
                    # To prevent swap loops: only swap if the nodes are currently in their original order
                    assert self.swappable(s0, swapped),\
                        "Swapping already-swapped nodes: %s (swap index %d) <--> %s (swap index %d)" % (
                            swapped, swapped.swap_index, s0, s0.swap_index)
                else:
                    raise Exception("Invalid action: %s" % action)
Example #26
0
File: parse.py Project: viksit/ucca
    def parse(self, passages, mode="test"):
        """
        Parse given passages, printing per-passage progress and timing as it goes.
        After the input is exhausted, a summary is printed if more than one passage was parsed.
        :param passages: iterable of passages to parse
        :param mode: "train", "test" or "dev".
                     If "train", use oracle to train on given passages.
                     Otherwise, just parse with classifier.
        :return: generator of pairs of (parsed passage, original passage)
        :raise AssertionError: if mode is invalid, or if training on an unannotated passage
        :raise ParserException: re-raised from parse_passage when in "train" mode;
                                in other modes it is logged and the passage is marked as failed
        """
        train = mode == "train"
        dev = mode == "dev"
        test = mode == "test"
        assert train or dev or test, "Invalid parse mode: %s" % mode
        passage_word = "sentence" if Config().sentences else \
                       "paragraph" if Config().paragraphs else \
                       "passage"
        self.total_actions = 0
        self.total_correct = 0
        total_duration = 0
        total_tokens = 0
        num_passages = 0
        for passage in passages:
            l0 = passage.layer(layer0.LAYER_ID)
            num_tokens = len(l0.all)
            total_tokens += num_tokens
            l1 = passage.layer(layer1.LAYER_ID)
            # A passage counts as labeled when layer 1 holds more than a single item
            labeled = len(l1.all) > 1
            assert not train or labeled, "Cannot train on unannotated passage"
            print("%s %-7s" % (passage_word, passage.ID),
                  end=Config().line_end,
                  flush=True)
            started = time.time()
            # Per-passage state, reset before every parse
            self.action_count = 0
            self.correct_count = 0
            self.state = State(passage, callback=self.pos_tag)
            self.state_hash_history = set()
            self.oracle = Oracle(passage) if train else None
            failed = False
            try:
                self.parse_passage(
                    train)  # This is where the actual parsing takes place
            except ParserException as e:
                if train:
                    raise
                Config().log("%s %s: %s" % (passage_word, passage.ID, e))
                if not test:
                    print("failed")
                failed = True
            # When training without verification, the original passage is yielded as the prediction
            predicted_passage = passage
            if not train or Config().verify:
                predicted_passage = self.state.create_passage(
                    assert_proper=Config().verify)
            duration = time.time() - started
            total_duration += duration
            if train:  # We have an oracle to verify by
                if not failed and Config().verify:
                    self.verify_passage(passage, predicted_passage, train)
                if self.action_count:
                    print("%-16s" %
                          ("%d%% (%d/%d)" %
                           (100 * self.correct_count / self.action_count,
                            self.correct_count, self.action_count)),
                          end=Config().line_end)
            print("%0.3fs" % duration, end="")
            # The clock may report zero elapsed time for very fast passages; avoid dividing by it
            tokens_per_second = num_tokens / duration if duration else 0
            print("%-15s" % ("" if failed else " (%d tokens/s)" % tokens_per_second),
                  end="")
            print(Config().line_end, end="")
            if train:
                print(Config().line_end, flush=True)
            self.total_correct += self.correct_count
            self.total_actions += self.action_count
            num_passages += 1
            yield predicted_passage, passage

        if num_passages > 1:
            print("Parsed %d %ss" % (num_passages, passage_word))
            if self.oracle and self.total_actions:
                print("Overall %d%% correct transitions (%d/%d) on %s" %
                      (100 * self.total_correct / self.total_actions,
                       self.total_correct, self.total_actions, mode))
            # Same zero-duration guard as for the per-passage rate
            average_tokens_per_second = total_tokens / total_duration if total_duration else 0
            print(
                "Total time: %.3fs (average time/%s: %.3fs, average tokens/s: %d)"
                % (total_duration, passage_word, total_duration / num_passages,
                   average_tokens_per_second),
                flush=True)
Example #27
0
File: parse.py Project: viksit/ucca
            dev_passages = folds[i]
            test_passages = folds[(i + 1) % k]
            train_passages = [
                passage for fold in folds
                if fold is not dev_passages and fold is not test_passages
                for passage in fold
            ]
            s = train_test(train_passages, dev_passages, test_passages, args,
                           "_%d" % i)
            if s is not None:
                fold_scores.append(s)
        if fold_scores:
            scores = evaluation.Scores.aggregate(fold_scores)
            print("Average unlabeled test F1 score for each fold: " +
                  ", ".join("%.3f" % s.average_unlabeled_f1()
                            for s in fold_scores))
            print("Aggregated scores across folds:\n")
            scores.print()
    else:  # Simple train/dev/test by given arguments
        train_passages, dev_passages, test_passages = [
            util.read_files_and_dirs(arg)
            for arg in (args.train, args.dev, args.passages)
        ]
        scores = train_test(train_passages, dev_passages, test_passages, args)
    return scores


# Script entry point: run main(), then call close() on the Config instance.
# Note that close() is not reached if main() raises.
if __name__ == "__main__":
    main()
    Config().close()
Example #28
0
 def init(self):
     """
     Initialize self.all with the actions that are known up front:
     Reduce, Shift and Finish, followed by the swap actions selected by the configuration.
     With compound_swap, one Swap action is created per value in range(1, Action.MAX_SWAP);
     with no_swap, no swap action is added; otherwise the plain Swap action is added.
     """
     # edge and node action will be created as they are returned by the oracle
     if Config().compound_swap:
         # Build a real list: on Python 3, map() returns an iterator,
         # which cannot be concatenated to a list
         swap_actions = [Actions.Swap(distance) for distance in range(1, Action.MAX_SWAP)]
     elif Config().no_swap:
         swap_actions = []
     else:
         swap_actions = [Actions.Swap]
     self.all = [Actions.Reduce, Actions.Shift, Actions.Finish] + swap_actions
Example #29
0
 def __init__(self, *args, **kwargs):
     """
     Set up the test case: initialize the base class, create the Config with
     the arguments "", "-m", "test", and load the passage from the standard XML test file
     """
     super(ParserTests, self).__init__(*args, **kwargs)
     Config("", "-m", "test")
     xml_root = TestUtil.load_xml('test_files/standard3.xml')
     self.passage = convert.from_standard(xml_root)
Example #30
0
 def possible_multiple_incoming(cls):
     """
     :return: an empty tuple if the no_linkage option is set in Config,
              otherwise cls.PossibleMultipleIncoming
     """
     if Config().no_linkage:
         return ()
     return cls.PossibleMultipleIncoming