def generate_actions(self, state):
    """
    Determine all zero-cost action according to current state

    Candidate actions are derived from the edges still listed in
    self.edges_remaining that touch the node on top of the stack.
    self.edge_found is reset to False before the stack is examined.
    :param state: current State of the parser
    :return: generator of Action items to perform
    """
    if not self.edges_remaining or not state.buffer and not state.stack:
        # No edges left to create, or both the buffer and the stack are empty
        yield Actions.Finish
        if state.stack:
            yield Actions.Reduce
        return

    # NOTE(review): presumably set to True by create_edge_action / create_node_action - confirm
    self.edge_found = False
    if state.stack:
        s0 = state.stack[-1]
        # Remaining edges that enter / leave the original node of the stack top
        incoming = self.edges_remaining.intersection(s0.orig_node.incoming)
        outgoing = self.edges_remaining.intersection(s0.orig_node.outgoing)
        if not incoming and not outgoing:
            # Nothing more to attach to the stack top
            yield Actions.Reduce
            return
        else:
            # Check for actions to create new nodes
            for edge in incoming:
                if edge.parent.ID in self.nodes_remaining and not edge.attrib.get("remote"):
                    yield self.create_node_action(edge, edge.parent, Actions.Node)

            for edge in outgoing:
                if edge.child.attrib.get("implicit"):
                    yield self.create_node_action(edge, edge.child, Actions.Implicit)

            if len(state.stack) > 1:
                s1 = state.stack[-2]
                # Check for actions to create binary edges
                for edge in incoming:
                    if edge.parent.ID == s1.node_id:
                        yield self.create_edge_action(edge, Action.RIGHT)
                for edge in outgoing:
                    if edge.child.ID == s1.node_id:
                        yield self.create_edge_action(edge, Action.LEFT)

                if not self.edge_found:
                    # Check if a swap is necessary, and how far (if compound swap is enabled)
                    # Maps the ID of the node at the other end of each edge to that edge
                    related = dict([(edge.child.ID, edge) for edge in outgoing] +
                                   [(edge.parent.ID, edge) for edge in incoming])
                    distance = None  # Swap distance (how many nodes in the stack to swap)
                    for i, s in enumerate(state.stack[-3::-1]):  # Skip top two, they are not related
                        edge = related.pop(s.node_id, None)
                        if edge is not None:
                            if Config().no_swap:  # We have no chance to reach it, so stop trying
                                self.remove(edge)
                                continue
                            if distance is None and Config().compound_swap:  # Save the first one
                                distance = i + 1
                            if not related:  # All related nodes are in the stack
                                yield Actions.Swap(distance)
                                return

    # Shift is offered only when there is something to shift and no edge action was found
    if state.buffer and not self.edge_found:
        yield Actions.Shift
def add_to_l1(self, l1, parent, tag, terminals, train):
    """
    Create the core.Node for this node while the final Passage is being built
    :param l1: Layer1 of the passage
    :param parent: parent node (its core node is read from parent.node)
    :param tag: edge tag to link to parent
    :param terminals: all terminals strings in the passage
    :param train: in training, so keep original node IDs in the "remarks" field
    """
    if Config().verify:
        assert self.node is None or self.text is not None,\
            "Trying to create the same node twice: %s, parent: %s" % (self.node.ID, parent)

    # The single outgoing edge, if there is exactly one
    only_edge = None
    if len(self.outgoing) == 1:
        only_edge = self.outgoing[0]

    if self.text:
        # Word terminal (Punctuation was already created by add_punct for the parent)
        if self.node is None and parent.node is not None:
            terminal_edge = parent.node.add(EdgeTags.Terminal, terminals[self.index])
            self.node = terminal_edge.child
    elif only_edge and only_edge.child.text and layer0.is_punct(terminals[only_edge.child.index]):
        # Unit whose only child is a punctuation terminal
        if Config().verify:
            assert tag == EdgeTags.Punctuation, "Tag for %s is %s" % (parent.node_id, tag)
            assert only_edge.tag == EdgeTags.Terminal, "Tag for %s is %s" % (self.node_id, only_edge.tag)
        self.node = l1.add_punct(parent.node, terminals[only_edge.child.index])
        only_edge.child.node = self.node[0].child
    else:
        # Ordinary case
        self.node = l1.add_fnode(parent.node, tag, implicit=self.implicit)

    keep_original_id = train and self.node is not None and self.node_id is not None
    if keep_original_id:
        # Keep the original node ID for reference
        self.node.extra["remarks"] = self.node_id
def run(self):
    """
    Run the parser once with this object's learning rate and decay factor

    Copies self.learning_rate and self.decay_factor into the global Config,
    calls parse.main() and stores its return value in self.scores.
    :raise AssertionError: if neither (train and passages) nor folds were given,
                           or if parse.main() returned None
    """
    assert Config().args.train and Config().args.passages or Config().args.folds, \
        "insufficient parameters given to parser"
    print("Running with %s" % self)
    Config().learning_rate = self.learning_rate
    Config().decay_factor = self.decay_factor
    self.scores = parse.main()
    # Validate the value that was just stored (self.scores); self.score is never set here
    assert self.scores is not None, "parser failed to produce score"
def dense_features_wrapper(wrapper):
    """
    Wrap a DenseFeatureExtractor with the given wrapper class/callable
    :param wrapper: callable taking the extractor followed by keyword arguments,
                    each a pair whose first item is read from Config
    :return: whatever wrapper returns
    """
    from features.dense_features import DenseFeatureExtractor
    extractor = DenseFeatureExtractor()
    # One (Config value, number) pair per feature-type suffix
    params = {
        "w": (Config().word_vectors, 10000),
        "t": (Config().tag_dim, 100),
        "e": (Config().label_dim, 15),
        "p": (Config().punct_dim, 5),
        "x": (Config().gap_dim, 3),
    }
    return wrapper(extractor, **params)
def parse_passage(self, train=False):
    """
    Internal method to parse a single passage

    Loops until self.state.finished: extracts features, predicts an action,
    optionally updates the model against the oracle's actions, and applies
    the chosen action to self.state.
    :param train: use oracle to train on given passages, or just parse with classifier?
    :raise ParserException: if the oracle fails during training, or the chosen
                            transition is rejected by the state
    """
    if Config().verbose:
        print(" initial state: %s" % self.state)
    while True:
        if Config().check_loops:
            self.check_loop(print_oracle=train)

        true_actions = []
        if self.oracle is not None:
            try:
                true_actions = self.oracle.get_actions(self.state)
            except (AttributeError, AssertionError) as e:
                # Outside training an oracle failure is tolerated: true_actions stays empty
                if train:
                    raise ParserException(
                        "Error in oracle during training") from e

        features = self.feature_extractor.extract_features(self.state)
        predicted_action = self.predict_action(
            features, true_actions)  # sets self.scores
        action = predicted_action
        if not true_actions:
            true_actions = "?"  # Placeholder, only used for the verbose printout below
        elif predicted_action in true_actions:
            self.correct_count += 1
        elif train:
            # Wrong prediction during training: update towards the highest-scoring true action
            best_true_action = true_actions[0] if len(true_actions) == 1 else \
                true_actions[self.scores[[a.id for a in true_actions]].argmax()]
            rate = self.learning_rate
            if best_true_action.is_swap:
                rate *= Config().importance
            self.model.update(features, predicted_action.id,
                              best_true_action.id, rate)
            # Continue the parse with a randomly chosen true action
            action = Config().random.choice(true_actions)
        self.action_count += 1
        try:
            self.state.transition(action)
        except AssertionError as e:
            raise ParserException("Invalid transition (%s): %s" % (action, e)) from e
        if Config().verbose:
            if self.oracle is None:
                print(" action: %-15s %s" % (action, self.state))
            else:
                print(" predicted: %-15s true: %-15s taken: %-15s %s" %
                      (predicted_action, "|".join(
                          str(true_action) for true_action in true_actions),
                       action, self.state))
            for line in self.state.log:
                print(" " + line)
        if self.state.finished:
            return  # action is FINISH
def __init__(self, passage):
    """
    Collect the node IDs and edges of the passage that still have to be created
    :param passage: passage whose layer 1 nodes and edges are collected,
                    filtered by the no_linkage / no_implicit / no_remote options
    """
    l1 = passage.layer(layer1.LAYER_ID)

    node_ids = set()
    for node in l1.all:
        if node is l1.heads[0]:
            continue  # The first head is never listed
        if Config().no_linkage and node.tag == layer1.NodeTags.Linkage:
            continue
        if Config().no_implicit and node.attrib.get("implicit"):
            continue
        node_ids.add(node.ID)
    self.nodes_remaining = node_ids

    linkage_tags = (layer1.EdgeTags.LinkRelation, layer1.EdgeTags.LinkArgument)
    edges = set()
    for node in passage.nodes.values():
        for edge in node:
            if Config().no_linkage and edge.tag in linkage_tags:
                continue
            if Config().no_implicit and edge.child.attrib.get("implicit"):
                continue
            if Config().no_remote and edge.attrib.get("remote"):
                continue
            edges.add(edge)
    self.edges_remaining = edges

    self.passage = passage
    self.edge_found = False
    self.log = None
def __init__(self, feature_extractor, **kwargs):
    """
    Set up one embedding table per feature-type suffix
    :param feature_extractor: extractor to wrap
    :param kwargs: suffix -> sequence whose first item is either an int
                   (embedding size, random vectors are drawn on demand) or
                   something passed to Word2Vec.load_word2vec_format
    """
    self.feature_extractor = feature_extractor
    self.sizes = {}
    self.embedding = {}
    for suffix, dims in kwargs.items():
        first = dims[0]
        if not isinstance(first, int):
            # Not a size: load pre-trained vectors
            print("Loading word vectors from '%s'..." % first)
            vectors = Word2Vec.load_word2vec_format(first)
            unknown = Config().random.normal(size=vectors.vector_size)
            self.sizes[suffix] = vectors.vector_size
            self.embedding[suffix] = Word2VecWrapper(vectors, unknown)
            continue

        self.sizes[suffix] = first

        def draw(s=first):
            # Default argument binds the size for this suffix
            return Config().random.normal(size=s)

        self.embedding[suffix] = defaultdict(draw)
def fix_terminal_tags(self, terminals):
    """
    Overwrite the tag of every given terminal that differs from the stored original
    :param terminals: terminals to fix, paired positionally with self.terminals
    """
    for current, original in zip(terminals, self.terminals):
        if current.tag == original.tag:
            continue
        if Config().verbose:
            print("%s is the wrong tag for terminal: %s" % (current.tag, current.text),
                  file=sys.stderr)
        current.tag = original.tag
def __init__(self, model_file=None, model_type="sparse"):
    """
    Create a parser with a fresh feature extractor and model
    :param model_file: file name stored in self.model_file
    :param model_type: model type name handed to models.create_model
    """
    # Per-parse objects, created anew for each passage
    self.state = None
    self.oracle = None
    self.scores = None  # NumPy array of action scores at each action

    # Per-passage and overall counters
    self.action_count = 0
    self.correct_count = 0
    self.total_actions = 0
    self.total_correct = 0

    extractor, model = models.create_model(model_type, Actions().all)
    self.feature_extractor = extractor
    self.model = model
    self.model_file = model_file

    self.learning_rate = Config().learning_rate
    self.decay_factor = Config().decay_factor
    self.state_hash_history = None  # For loop checking

    def ignore_node(n):
        # Used in verify_passage to optionally ignore a mismatch in linkage nodes
        if Config().no_linkage:
            return n.tag == layer1.NodeTags.Linkage
        return None

    self.ignore_node = ignore_node
def evaluate_passage(guessed_passage, ref_passage):
    """
    Evaluate a predicted passage against its reference and print the unlabeled F1
    :param guessed_passage: predicted passage (verbose output is disabled when None)
    :param ref_passage: reference passage
    :return: the score object returned by evaluation.evaluate
    """
    be_verbose = Config().verbose and guessed_passage is not None
    score = evaluation.evaluate(guessed_passage, ref_passage,
                                verbose=be_verbose, units=False, errors=False)
    f1 = score.average_unlabeled_f1()
    print("F1=%.3f" % f1, flush=True)
    return score
def main():
    """
    Entry point: train and test either by cross-validation folds or by the given sets
    :return: aggregated scores, or None if no scores were produced
    """
    args = Config().args
    print("Running parser with %s" % Config())
    scores = None
    if Config().test_scores:
        # Start the test scores file with a header line
        with open(Config().test_scores, "w") as f:
            print(",".join(evaluation.Scores.field_titles()), file=f)

    if args.folds is None:
        # Simple train/dev/test by given arguments
        train_passages = util.read_files_and_dirs(args.train)
        dev_passages = util.read_files_and_dirs(args.dev)
        test_passages = util.read_files_and_dirs(args.passages)
        scores = train_test(train_passages, dev_passages, test_passages, args)
        return scores

    # Cross-validation
    k = args.folds
    fold_scores = []
    all_passages = list(util.read_files_and_dirs(args.passages))
    assert len(all_passages) >= k,\
        "%d folds are not possible with only %d passages" % (k, len(all_passages))
    Config().random.shuffle(all_passages)
    folds = [all_passages[start::k] for start in range(k)]
    for i in range(k):
        print("Fold %d of %d:" % (i + 1, k))
        dev_passages = folds[i]
        test_passages = folds[(i + 1) % k]
        # Everything outside the dev and test folds is training data
        train_passages = []
        for fold in folds:
            if fold is dev_passages or fold is test_passages:
                continue
            train_passages.extend(fold)
        fold_result = train_test(train_passages, dev_passages, test_passages, args, "_%d" % i)
        if fold_result is not None:
            fold_scores.append(fold_result)
    if fold_scores:
        scores = evaluation.Scores.aggregate(fold_scores)
        per_fold = ", ".join("%.3f" % s.average_unlabeled_f1() for s in fold_scores)
        print("Average unlabeled test F1 score for each fold: " + per_fold)
        print("Aggregated scores across folds:\n")
        scores.print()
    return scores
def pos_tag(state):
    """
    Function to pass to State to POS tag the tokens when created
    NOTE(review): the inner pos_tag call is expected to resolve to a module-level
    tagger rather than to this function - confirm this is defined inside a class
    :param state: State object to modify
    """
    # Flatten the nested token lists into a single list
    flat = []
    for group in state.tokens:
        flat.extend(group)
    words, tags = zip(*pos_tag(flat))
    if Config().verbose:
        print(" ".join("%s/%s" % pair for pair in zip(words, tags)))
    for node, tag in zip(state.nodes, tags):
        node.pos_tag = tag
def add_node(self, *args, **kwargs):
    """
    Called during parsing to add a new Node (not core.Node) to the temporary representation
    :param args: ordinal arguments for Node()
    :param kwargs: keyword arguments for Node()
    :return: the newly created Node
    """
    next_index = len(self.nodes)  # Passed as the first argument of Node()
    created = Node(next_index, *args, **kwargs)
    if Config().verify:
        assert created not in self.nodes, "Node already exists"
    self.nodes.append(created)
    self.log.append("node: %s" % created)
    return created
def assert_possible_edge():
    """
    Assert that the edge described by the enclosing action may be created
    :raise AssertionError: if the parent, the child or the edge itself is not allowed
    """
    head, dependent = self.get_parent_child(action)
    assert_possible_parent(head)
    assert_possible_child(dependent)
    root_constraints = head is self.root and Config().constraints
    if root_constraints:
        assert dependent.text is None, \
            "Root may not have terminal children, but is being added '%s'" % dependent
        assert action.tag in Constraints.TopLevel, \
            "The root may not have %s edges" % action.tag
    # A multiple_edges option (comparing whole Edge objects against parent.outgoing)
    # was removed because it is not useful right now
    assert dependent not in head.children, \
        "Edge must not already exist: %s->%s" % (head, dependent)
    assert head not in dependent.descendants, \
        "Detected cycle created by edge: %s->%s" % (head, dependent)
def read_passages(files):
    """
    :param files: iterable of files or Passage objects
    :return: generator of passages from all files given
    :raise IOError: if an item is neither a Passage nor an existing file
    """
    for item in files:
        if isinstance(item, core.Passage):
            # Already a Passage, nothing to read
            passage = item
        elif not os.path.exists(item):
            raise IOError("File not found: %s" % item)
        else:
            try:
                passage = ioutil.file2passage(item)  # XML or binary format
            except (IOError, ParseError):
                # Not a passage file: convert according to the file extension
                stem, extension = os.path.splitext(os.path.basename(item))
                converter = convert.FROM_FORMAT.get(extension.lstrip("."), convert.from_text)
                with open(item) as f:
                    yield from converter(f, passage_id=stem, split=Config().split)
                continue
        if not Config().split:
            yield passage
        else:
            yield from convert.split2segments(passage, is_sentences=Config().sentences)
def main():
    """
    Initialise the global configuration and run the ParseTron synchronisation

    Stores a Config built from gl.basic_file in gl.config. If InitConfig()
    reports failure, an error is logged and the function returns without
    creating a ParseTron.
    """
    gl.config = Config(gl.basic_file)
    ok: bool = gl.config.InitConfig()
    if not ok:
        # Adjacent literals instead of a backslash continuation inside the string,
        # which leaked the backslash/indentation into the logged message
        logging.error("Failed to init config, "
                      "please recheck config files and contents.")
        return
    # Try to connect and create the tables
    # Parse and insert into the tables
    parse_tron = ParseTron(gl.config)
    parse_tron.sync()
def train_test(train_passages, dev_passages, test_passages, args, model_suffix=""):
    """
    Train a parser, then parse (and optionally evaluate and write) the test passages
    :param train_passages: passages to train on
    :param dev_passages: passages passed to training as dev
    :param test_passages: passages to parse after training
    :param args: argument object (model, classifier, iterations, folds, train,
                 evaluate, nowrite and verbose are read)
    :param model_suffix: inserted between the model file name and its extension
    :return: aggregated test scores, or None if none were aggregated
    """
    scores = None
    train = bool(train_passages)

    model_file = args.model
    if model_file is not None:
        stem, extension = os.path.splitext(model_file)
        model_file = stem + model_suffix + extension

    p = Parser(model_file=model_file, model_type=args.classifier)
    p.train(train_passages, dev=dev_passages, iterations=args.iterations, folds=args.folds)

    if not test_passages:
        return scores

    if args.train or args.folds:
        print("Evaluating on test passages")
    passage_scores = []
    should_evaluate = args.evaluate or train
    for guessed_passage, ref_passage in p.parse(test_passages):
        if should_evaluate:
            passage_scores.append(evaluate_passage(guessed_passage, ref_passage))
        if guessed_passage is not None and not args.nowrite:
            util.write_passage(guessed_passage, args)

    if passage_scores and (not args.verbose or len(passage_scores) > 1):
        scores = evaluation.Scores.aggregate(passage_scores)
        print("\nAverage F1 score on test: %.3f" % scores.average_unlabeled_f1())
        print("Aggregated scores:")
        scores.print()
        if Config().test_scores:
            # Append one line of scores to the test scores file
            with open(Config().test_scores, "a") as f:
                print(",".join(scores.fields()), file=f)
    return scores
def assert_possible_parent(node):
    """
    Assert that the given node may receive a new outgoing edge tagged action.tag
    (action comes from the enclosing scope)
    :param node: candidate parent node
    :raise AssertionError: if node is a terminal or implicit, or (with
                           Config().constraints) if a tag constraint is violated
    """
    assert node.text is None, "Terminals may not have children: %s" % node.text
    # Report the node being checked; it is not necessarily the stack top
    assert not node.implicit, "Implicit nodes may not have children: %s" % node
    if Config().constraints:
        assert action.tag not in Constraints.UniqueOutgoing or action.tag not in node.outgoing_tags, \
            "Outgoing edge tag %s must be unique, but %s already has one" % (
                action.tag, node)
        assert action.tag not in Constraints.MutuallyExclusiveOutgoing or not \
            node.outgoing_tags & Constraints.MutuallyExclusiveOutgoing, \
            "Outgoing edge tags %s are mutually exclusive, but %s already has %s and is being added %s" % (
                Constraints.MutuallyExclusiveOutgoing, node, node.outgoing_tags, action.tag)
        assert action.tag in Constraints.ChildlessOutgoing or not \
            node.incoming_tags & Constraints.ChildlessIncoming, \
            "Units with incoming %s edges may not have children, and %s has incoming %s" % (
                Constraints.ChildlessIncoming, node, node.incoming_tags)
def assert_possible_child(node):
    """
    Assert that the given node may receive a new incoming edge tagged action.tag
    (self and action come from the enclosing scope)
    :param node: candidate child node
    :raise AssertionError: if node is the root, if the Terminal tag does not match
                           the node having text, or (with Config().constraints)
                           if a tag constraint is violated
    """
    assert node is not self.root, "The root may not have parents"
    is_terminal = node.text is not None
    has_terminal_tag = action.tag == EdgeTags.Terminal
    assert is_terminal == has_terminal_tag, \
        "Edge tag must be %s iff child is terminal, but node is %s and edge tag is %s" % (
            EdgeTags.Terminal, node, action.tag)
    if not Config().constraints:
        return
    assert not (action.tag in Constraints.UniqueIncoming and
                action.tag in node.incoming_tags), \
        "Incoming edge tag %s must be unique, but %s already has one" % (
            action.tag, node)
    assert not (action.tag in Constraints.ChildlessIncoming and
                not node.outgoing_tags <= Constraints.ChildlessOutgoing), \
        "Units with incoming %s edges may not have children, but %s has %d" % (
            Constraints.ChildlessIncoming, node, len(node.children))
    # A second parent is fine if the new edge, or every existing one, is remote or linkage
    assert action.remote or action.tag in Constraints.possible_multiple_incoming() or \
        all(e.remote or e.tag in Constraints.possible_multiple_incoming()
            for e in node.incoming), \
        "Multiple parents only allowed if they are remote or linkage edges: %s, %s" % (
            action, node)
def create_model(model_type, labels):
    """
    Build the feature extractor and classifier for the requested model type
    :param model_type: one of "sparse", "dense" or "nn"
    :param labels: labels handed to the classifier
    :return: pair of (feature extractor, model)
    :raise ValueError: if model_type is not a known type
    """
    if model_type not in ("sparse", "dense", "nn"):
        raise ValueError("Invalid model type: '%s'" % model_type)

    # Imports are local so that only the selected model's modules are loaded
    if model_type == "sparse":
        from classifiers.sparse_perceptron import SparsePerceptron
        from features.sparse_features import SparseFeatureExtractor
        extractor = SparseFeatureExtractor()
        classifier = SparsePerceptron(labels, min_update=Config().min_update)
    elif model_type == "dense":
        from features.embedding import FeatureEmbedding
        from classifiers.dense_perceptron import DensePerceptron
        extractor = dense_features_wrapper(FeatureEmbedding)
        classifier = DensePerceptron(labels, num_features=extractor.num_features())
    else:  # "nn"
        from features.indexer import FeatureIndexer
        from classifiers.neural_network import NeuralNetwork
        extractor = dense_features_wrapper(FeatureIndexer)
        classifier = NeuralNetwork(labels, inputs=extractor.feature_types)
    return extractor, classifier
def transition(self, action):
    """
    Main part of the parser: apply action given by oracle or classifier

    Resets self.log, updates the stack/buffer/graph according to the action type,
    checks the node ratio and appends the action to self.actions.
    :param action: Action object to apply
    :raise Exception: if the action matches none of the known action types
    :raise AssertionError: if a verification or node ratio check fails
    """
    action.apply()
    self.log = []
    if action.is_type(Actions.Shift):  # Push buffer head to stack; shift buffer
        self.stack.append(self.buffer.popleft())
    elif action.is_type(Actions.Node):  # Create new parent node and add to the buffer
        parent = self.add_node(action.orig_node)
        self.update_swap_index(parent)
        self.add_edge(Edge(parent, self.stack[-1], action.tag))
        self.buffer.appendleft(parent)
    elif action.is_type(Actions.Implicit):  # Create new child node and add to the buffer
        child = self.add_node(action.orig_node, implicit=True)
        self.update_swap_index(child)
        self.add_edge(Edge(self.stack[-1], child, action.tag))
        self.buffer.appendleft(child)
    elif action.is_type(Actions.Reduce):  # Pop stack (no more edges to create with this node)
        self.stack.pop()
    elif action.is_type(Actions.LeftEdge, Actions.LeftRemote, Actions.RightEdge, Actions.RightRemote):
        parent, child = self.get_parent_child(action)
        self.add_edge(Edge(parent, child, action.tag, remote=action.remote))
    elif action.is_type(Actions.Swap):  # Place second (or more) stack item back on the buffer
        distance = action.tag or 1
        s = slice(-distance - 1, -1)  # The `distance` items just below the stack top
        self.log.append("%s <--> %s" % (", ".join(map(str, self.stack[s])), self.stack[-1]))
        self.buffer.extendleft(reversed(self.stack[s]))  # extendleft reverses the order
        del self.stack[s]
    elif action.is_type(Actions.Finish):  # Nothing left to do
        self.finished = True
    else:
        # %-format the action: "str" + action raises TypeError for a non-string action
        raise Exception("Invalid action: %s" % action)
    if Config().verify:
        intersection = set(self.stack).intersection(self.buffer)
        assert not intersection, "Stack and buffer overlap: %s" % intersection
    self.assert_node_ratio()
    self.actions.append(action)
def train(self, passages, dev=None, iterations=1, folds=None):
    """
    Train parser on given passages

    After each iteration the model is finalized; if dev passages are given it is
    evaluated on them, and the finalized model is kept (and saved to
    self.model_file, if set) when its dev score is at least the best so far.
    :param passages: iterable of passages to train on
    :param dev: iterable of passages to tune on
    :param iterations: number of iterations to perform
    :param folds: whether we are inside cross-validation with this many folds
    :return: trained model
    """
    if not passages:
        if self.model_file is not None:  # Nothing to train on; pre-trained model given
            self.model.load(self.model_file, util)
            Actions().all = self.model.labels
        return self.model

    best_score = 0
    best_model = None
    save_model = True  # Stays True throughout when there is no dev set
    last = False  # Set on the final iteration, or earlier when the dev score reaches 1
    if Config().dev_scores:
        # Start the dev scores file with a header line
        with open(Config().dev_scores, "w") as f:
            print(",".join(["iteration"] + evaluation.Scores.field_titles()), file=f)
    for iteration in range(iterations):
        if last:
            break
        last = iteration == iterations - 1
        print("Training iteration %d of %d: " % (iteration + 1, iterations))
        # Consuming the parse generator in "train" mode performs the training
        passages = [
            passage for _, passage in self.parse(passages, mode="train")
        ]
        model = self.model  # Save non-finalize model
        self.model = self.model.finalize(
        )  # To evaluate finalized model on dev
        if last:
            if folds is None:  # Free some memory, as these are not needed any more
                del passages[:]
        else:
            self.learning_rate *= self.decay_factor
            Config().random.shuffle(passages)
        if dev:
            print("Evaluating on dev passages")
            # Materialise dev as a list so it can be iterated again on the next iteration
            dev, scores = zip(
                *[(passage, evaluate_passage(predicted_passage, passage))
                  for predicted_passage, passage in self.parse(
                      dev, mode="dev")])
            dev = list(dev)
            scores = evaluation.Scores.aggregate(scores)
            score = scores.average_unlabeled_f1()
            print("Average unlabeled F1 score on dev: %.3f" % score)
            if Config().dev_scores:
                with open(Config().dev_scores, "a") as f:
                    print(",".join([str(iteration)] + scores.fields()), file=f)
            if score >= best_score:
                print("Better than previous best score (%.3f)" % best_score)
                best_score = score
                save_model = True
            else:
                print("Not better than previous best score (%.3f)" % best_score)
                save_model = False
            if score >= 1:  # Score cannot go any better, so no point in more training
                last = True
            if last and folds is None:  # Free more memory
                del dev[:]
        if save_model or best_model is None:
            best_model = self.model  # This is the finalized model
            if self.model_file is not None:
                best_model.save(self.model_file, util)
        if not last:
            self.model = model  # Restore non-finalized model

    # NOTE(review): prints the requested number of iterations, even if the loop stopped early
    print("Trained %d iterations" % iterations)

    self.model = best_model
    return self.model
def assert_node_ratio(self, extra=0):
    """
    Assert that the node ratio does not exceed Config().max_nodes_ratio
    :param extra: passed through to self.node_ratio
    :raise AssertionError: if the ratio is above the configured maximum
    """
    limit = Config().max_nodes_ratio
    current = self.node_ratio(extra=extra)
    assert current <= limit, \
        "Reached maximum ratio (%.3f) of non-terminals to terminals" % limit
def assert_height(self):
    """
    Assert that the height of the root does not exceed Config().max_height
    :raise AssertionError: if self.root.height is above the configured maximum
    """
    limit = Config().max_height
    current = self.root.height
    assert current <= limit, \
        "Reached maximum graph height (%d)" % limit
def assert_valid(self, action):
    """
    Raise AssertionError if the action is invalid in the current state

    Finish and Shift are checked directly; every other action requires at least
    one previous action and a non-empty stack, and edge/swap actions require at
    least two stack elements.
    :param action: action to check for validity
    :raise AssertionError: if the action violates one of the checks
    :raise Exception: if the action is of an unrecognized type
    """
    def assert_possible_node():
        if self.labeled:  # We're in training, so we must have an original node to refer to
            assert action.orig_node is not None, "May only create real nodes during training"
        self.assert_node_ratio(extra=1)
        self.assert_height()

    def assert_possible_parent(node):
        assert node.text is None, "Terminals may not have children: %s" % node.text
        # Report the node being checked; it is not necessarily the stack top (s0)
        assert not node.implicit, "Implicit nodes may not have children: %s" % node
        if Config().constraints:
            assert action.tag not in Constraints.UniqueOutgoing or action.tag not in node.outgoing_tags, \
                "Outgoing edge tag %s must be unique, but %s already has one" % (
                    action.tag, node)
            assert action.tag not in Constraints.MutuallyExclusiveOutgoing or not \
                node.outgoing_tags & Constraints.MutuallyExclusiveOutgoing, \
                "Outgoing edge tags %s are mutually exclusive, but %s already has %s and is being added %s" % (
                    Constraints.MutuallyExclusiveOutgoing, node, node.outgoing_tags, action.tag)
            assert action.tag in Constraints.ChildlessOutgoing or not \
                node.incoming_tags & Constraints.ChildlessIncoming, \
                "Units with incoming %s edges may not have children, and %s has incoming %s" % (
                    Constraints.ChildlessIncoming, node, node.incoming_tags)

    def assert_possible_child(node):
        assert node is not self.root, "The root may not have parents"
        assert (node.text is not None) == (action.tag == EdgeTags.Terminal), \
            "Edge tag must be %s iff child is terminal, but node is %s and edge tag is %s" % (
                EdgeTags.Terminal, node, action.tag)
        if Config().constraints:
            assert action.tag not in Constraints.UniqueIncoming or \
                action.tag not in node.incoming_tags, \
                "Incoming edge tag %s must be unique, but %s already has one" % (
                    action.tag, node)
            assert action.tag not in Constraints.ChildlessIncoming or \
                node.outgoing_tags <= Constraints.ChildlessOutgoing, \
                "Units with incoming %s edges may not have children, but %s has %d" % (
                    Constraints.ChildlessIncoming, node, len(node.children))
            assert action.remote or action.tag in Constraints.possible_multiple_incoming() or \
                all(e.remote or e.tag in Constraints.possible_multiple_incoming() for e in node.incoming), \
                "Multiple parents only allowed if they are remote or linkage edges: %s, %s" % (
                    action, node)
            # Commented out due to passage 106, unit 1.300
            # assert not node.incoming_tags or (action.tag in Constraints.LinkerIncoming) == (
            #     node.incoming_tags <= Constraints.LinkerIncoming), \
            #     "Linker units may only have incoming edges with tags from %s, but %s is being added '%s'" % (
            #         Constraints.LinkerIncoming, node, action.tag)

    def assert_possible_edge():
        parent, child = self.get_parent_child(action)
        assert_possible_parent(parent)
        assert_possible_child(child)
        if parent is self.root and Config().constraints:
            assert child.text is None, "Root may not have terminal children, but is being added '%s'" % child
            assert action.tag in Constraints.TopLevel, "The root may not have %s edges" % action.tag
        # if Config().multiple_edges:  # Removed this option because it is not useful right now
        #     edge = Edge(parent, child, action.tag, remote=action.remote)
        #     assert edge not in parent.outgoing, "Edge must not already exist: %s" % edge
        # else:
        assert child not in parent.children, "Edge must not already exist: %s->%s" % (parent, child)
        assert parent not in child.descendants, "Detected cycle created by edge: %s->%s" % (parent, child)

    if action.is_type(Actions.Finish):
        if not Config().no_swap:  # Without swap, the oracle may be incapable even of single action
            assert self.root.outgoing, \
                "Root must have at least one child at the end of the parse, but has none"
    elif action.is_type(Actions.Shift):
        assert self.buffer, "Buffer must not be empty in order to shift from it"
    else:  # Unary actions
        assert self.actions, "First action must be Shift, but was %s" % action
        assert self.stack, "Action requires non-empty stack: %s" % action
        s0 = self.stack[-1]
        if action.is_type(Actions.Node):
            assert_possible_child(s0)
            assert_possible_node()
        elif action.is_type(Actions.Implicit):
            assert_possible_parent(s0)
            assert_possible_node()
        elif action.is_type(Actions.Reduce):
            assert s0 is not self.root or s0.outgoing, "May not reduce the root without children"
            # Commented out due to passage 126, unit 1.338
            # assert not s0.outgoing_tags & Constraints.SceneSufficientOutgoing and \
            #     not s0.incoming_tags & Constraints.SceneSufficientIncoming or \
            #     s0.outgoing_tags & Constraints.SceneNecessaryOutgoing, \
            #     "May not reduce a scene before it has any outgoing edge of %s (it has only %s)" % (
            #         Constraints.SceneNecessaryOutgoing, s0.outgoing_tags)
            # Commented out due to passage 126, unit 1.60
            # assert s0.incoming_tags == Constraints.LinkerIncoming or not \
            #     s0.incoming_tags & Constraints.LinkerIncoming, \
            #     "May not reduce a linker before it has all incoming edges of %s (it has only %s)" % (
            #         Constraints.LinkerIncoming, s0.incoming_tags)
        else:  # Binary actions
            assert len(self.stack) > 1, "Action requires at least two stack elements: %s" % action
            if action.is_type(Actions.LeftEdge, Actions.RightEdge, Actions.LeftRemote, Actions.RightRemote):
                assert_possible_edge()
            elif action.is_type(Actions.Swap):
                # A regular swap is possible since the stack has at least two elements;
                # A compound swap is possible if the stack is longer than the distance
                distance = action.tag or 1
                assert 1 <= distance < len(self.stack), "Invalid swap distance: %d" % distance
                swapped = self.stack[-distance - 1]
                # To prevent swap loops: only swap if the nodes are currently in their original order
                assert self.swappable(s0, swapped),\
                    "Swapping already-swapped nodes: %s (swap index %d) <--> %s (swap index %d)" % (
                        swapped, swapped.swap_index, s0, s0.swap_index)
            else:
                raise Exception("Invalid action: %s" % action)
def parse(self, passages, mode="test"):
    """
    Parse given passages

    For each passage a new State is created (and an Oracle in "train" mode),
    parse_passage is run, and timing/accuracy information is printed.
    :param passages: iterable of passages to parse
    :param mode: "train", "test" or "dev".
                 If "train", use oracle to train on given passages.
                 Otherwise, just parse with classifier.
    :return: generator of pairs of (parsed passage, original passage)
    :raise ParserException: re-raised from parse_passage in "train" mode only
    """
    train = mode == "train"
    dev = mode == "dev"
    test = mode == "test"
    assert train or dev or test, "Invalid parse mode: %s" % mode
    # Word used for a single parsing unit in the printed messages
    passage_word = "sentence" if Config().sentences else \
                   "paragraph" if Config().paragraphs else \
                   "passage"
    self.total_actions = 0
    self.total_correct = 0
    total_duration = 0
    total_tokens = 0
    num_passages = 0
    for passage in passages:
        l0 = passage.layer(layer0.LAYER_ID)
        num_tokens = len(l0.all)
        total_tokens += num_tokens
        l1 = passage.layer(layer1.LAYER_ID)
        # A passage counts as annotated when layer 1 holds more than one node
        labeled = len(l1.all) > 1
        assert not train or labeled, "Cannot train on unannotated passage"
        print("%s %-7s" % (passage_word, passage.ID), end=Config().line_end, flush=True)
        started = time.time()
        self.action_count = 0
        self.correct_count = 0
        self.state = State(passage, callback=self.pos_tag)
        self.state_hash_history = set()
        self.oracle = Oracle(passage) if train else None
        failed = False
        try:
            self.parse_passage(
                train)  # This is where the actual parsing takes place
        except ParserException as e:
            if train:
                raise
            # Outside training a failure is logged and the partial parse is still used
            Config().log("%s %s: %s" % (passage_word, passage.ID, e))
            if not test:
                print("failed")
            failed = True
        # In training without verification the original passage is yielded as the prediction
        predicted_passage = passage
        if not train or Config().verify:
            predicted_passage = self.state.create_passage(
                assert_proper=Config().verify)
        duration = time.time() - started
        total_duration += duration
        if train:  # We have an oracle to verify by
            if not failed and Config().verify:
                self.verify_passage(passage, predicted_passage, train)
            if self.action_count:
                print("%-16s" % ("%d%% (%d/%d)" %
                                 (100 * self.correct_count / self.action_count,
                                  self.correct_count, self.action_count)),
                      end=Config().line_end)
        print("%0.3fs" % duration, end="")
        # NOTE(review): num_tokens / duration divides by zero if duration is 0 - confirm acceptable
        print("%-15s" % ("" if failed else " (%d tokens/s)" %
                         (num_tokens / duration)), end="")
        print(Config().line_end, end="")
        if train:
            print(Config().line_end, flush=True)
        self.total_correct += self.correct_count
        self.total_actions += self.action_count
        num_passages += 1
        yield predicted_passage, passage

    # Summary is printed only when more than one passage was parsed
    if num_passages > 1:
        print("Parsed %d %ss" % (num_passages, passage_word))
        if self.oracle and self.total_actions:
            print("Overall %d%% correct transitions (%d/%d) on %s" %
                  (100 * self.total_correct / self.total_actions,
                   self.total_correct, self.total_actions, mode))
        print(
            "Total time: %.3fs (average time/%s: %.3fs, average tokens/s: %d)"
            % (total_duration, passage_word, total_duration / num_passages,
               total_tokens / total_duration), flush=True)
dev_passages = folds[i] test_passages = folds[(i + 1) % k] train_passages = [ passage for fold in folds if fold is not dev_passages and fold is not test_passages for passage in fold ] s = train_test(train_passages, dev_passages, test_passages, args, "_%d" % i) if s is not None: fold_scores.append(s) if fold_scores: scores = evaluation.Scores.aggregate(fold_scores) print("Average unlabeled test F1 score for each fold: " + ", ".join("%.3f" % s.average_unlabeled_f1() for s in fold_scores)) print("Aggregated scores across folds:\n") scores.print() else: # Simple train/dev/test by given arguments train_passages, dev_passages, test_passages = [ util.read_files_and_dirs(arg) for arg in (args.train, args.dev, args.passages) ] scores = train_test(train_passages, dev_passages, test_passages, args) return scores if __name__ == "__main__": main() Config().close()
def init(self):
    """
    Initialise self.all with the actions that are known in advance

    Always contains Reduce, Shift and Finish, followed by:
    one Swap per distance in range(1, Action.MAX_SWAP) if Config().compound_swap,
    nothing if Config().no_swap, and the plain Swap otherwise.
    """
    # edge and node action will be created as they are returned by the oracle
    if Config().compound_swap:
        # Build a real list: on Python 3 map() is an iterator and cannot be added to a list
        swaps = [Actions.Swap(distance) for distance in range(1, Action.MAX_SWAP)]
    elif Config().no_swap:
        swaps = []
    else:
        swaps = [Actions.Swap]
    self.all = [Actions.Reduce, Actions.Shift, Actions.Finish] + swaps
def __init__(self, *args, **kwargs):
    """
    Set up the test case: configure the parser for testing and load the test passage
    :param args: ordinal arguments for the parent class
    :param kwargs: keyword arguments for the parent class
    """
    super(ParserTests, self).__init__(*args, **kwargs)
    Config("", "-m", "test")
    xml_root = TestUtil.load_xml('test_files/standard3.xml')
    self.passage = convert.from_standard(xml_root)
def possible_multiple_incoming(cls):
    """
    :return: an empty tuple if Config().no_linkage is set,
             otherwise cls.PossibleMultipleIncoming
    """
    if Config().no_linkage:
        return ()
    return cls.PossibleMultipleIncoming