def _cut_valid_data(self):
    """Put aside part of the training set for validation."""
    train_size = len(self.train_trees)

    # we have multiple references
    if self.multiple_refs:
        num_refs, refs_stored = self._check_multiple_ref_type(train_size)

        # data stored "serially" (all different instances next to each other,
        # then again in the same order)
        if refs_stored == 'serial':
            train_tree_chunks = list(chunk_list(self.train_trees, train_size / num_refs))
            train_da_chunks = list(chunk_list(self.train_das, train_size / num_refs))
            self.valid_trees = [[chunk[i] for chunk in train_tree_chunks]
                                for i in xrange(train_size / num_refs - self.validation_size,
                                                train_size / num_refs)]
            self.valid_das = train_da_chunks[0][-self.validation_size:]
            self.train_trees = sum([chunk[:-self.validation_size]
                                    for chunk in train_tree_chunks], [])
            self.train_das = sum([chunk[:-self.validation_size]
                                  for chunk in train_da_chunks], [])
        # data stored in "parallel" (all synonymous instances next to each other)
        else:
            self.valid_trees = list(chunk_list(self.train_trees[-self.validation_size * num_refs:],
                                               num_refs))
            self.valid_das = self.train_das[-self.validation_size * num_refs::num_refs]
            self.train_trees = self.train_trees[:-self.validation_size * num_refs]
            self.train_das = self.train_das[:-self.validation_size * num_refs]

    # single validation reference
    else:
        # make "reference lists" of length 1 to accommodate functions working
        # with multiple references
        self.valid_trees = [[tree] for tree in self.train_trees[-self.validation_size:]]
        self.valid_das = self.train_das[-self.validation_size:]
        self.train_trees = self.train_trees[:-self.validation_size]
        self.train_das = self.train_das[:-self.validation_size]
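# A minimal, self-contained sketch (not part of the class above) of the two
# multiple-reference layouts that _cut_valid_data untangles. The local
# chunk_list below is a stand-in with the semantics assumed above (split a
# list into consecutive fixed-size chunks); the toy data is made up.
def _ref_layout_demo():
    def chunk_list(lst, n):
        # consecutive chunks of size n (assumed behaviour of the real chunk_list)
        return [lst[i:i + n] for i in xrange(0, len(lst), n)]

    # serial storage: all instances once, then all of them again (2 references)
    serial = ['a1', 'b1', 'c1', 'a2', 'b2', 'c2']
    chunks = chunk_list(serial, len(serial) / 2)
    # regroup so that each instance carries its own list of references
    grouped = [[chunk[i] for chunk in chunks] for i in xrange(len(serial) / 2)]
    assert grouped == [['a1', 'a2'], ['b1', 'b2'], ['c1', 'c2']]

    # parallel storage: the references of each instance are already adjacent
    parallel = ['a1', 'a2', 'b1', 'b2', 'c1', 'c2']
    assert chunk_list(parallel, 2) == [['a1', 'a2'], ['b1', 'b2'], ['c1', 'c2']]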
def _load_valid_data(self, valid_data_paths):
    """Load validation data from separate files (comma-separated list of files with
    DAs, trees, and optionally contexts is expected)."""
    # parse validation data file specification
    valid_data_paths = valid_data_paths.split(',')
    if len(valid_data_paths) == 3:  # with contexts (this does not determine if they're used)
        valid_das_file, valid_trees_file, valid_context_file = valid_data_paths
    else:
        valid_das_file, valid_trees_file = valid_data_paths

    # load the validation data
    log_info('Reading DAs from ' + valid_das_file + '...')
    self.valid_das = read_das(valid_das_file)
    self.valid_trees = self._load_trees(valid_trees_file, selector=self.ref_selectors)
    if self.use_context:
        self.valid_das = self._load_contexts(self.valid_das, valid_context_file)

    # reorder validation data for multiple references (see also _cut_valid_data)
    valid_size = len(self.valid_trees)
    if self.multiple_refs:
        num_refs, refs_stored = self._check_multiple_ref_type(valid_size)

        # serial: different instances next to each other, then synonymous in the same order
        if refs_stored == 'serial':
            valid_tree_chunks = list(chunk_list(self.valid_trees, valid_size / num_refs))
            self.valid_trees = [[chunk[i] for chunk in valid_tree_chunks]
                                for i in xrange(valid_size / num_refs)]
            if len(self.valid_das) > len(self.valid_trees):
                self.valid_das = self.valid_das[0:valid_size / num_refs]
        # parallel: synonymous instances next to each other
        elif refs_stored == 'parallel':
            self.valid_trees = list(chunk_list(self.valid_trees, num_refs))
            if len(self.valid_das) > len(self.valid_trees):
                self.valid_das = self.valid_das[::num_refs]
    # no multiple references; make lists of size 1 to simplify working with the data
    else:
        self.valid_trees = [[tree] for tree in self.valid_trees]
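# A minimal standalone sketch (hypothetical helper, not used by the method
# above) mirroring how the comma-separated validation spec is parsed: two
# paths mean DAs + trees, three paths add a context file. The file names in
# the usage note are made up.
def _parse_valid_data_spec(valid_data_paths):
    """Split a 'das,trees[,contexts]' file specification into its parts."""
    parts = valid_data_paths.split(',')
    if len(parts) == 3:
        return parts[0], parts[1], parts[2]
    if len(parts) == 2:
        return parts[0], parts[1], None
    raise ValueError('Expected 2 or 3 comma-separated paths: %s' % valid_data_paths)

# usage (hypothetical file names):
#     _parse_valid_data_spec('valid-das.txt,valid-trees.yaml.gz,valid-ctx.txt')
#     -> ('valid-das.txt', 'valid-trees.yaml.gz', 'valid-ctx.txt')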
def sample_gen(args):
    from pytreex.core.document import Document
    opts, files = getopt(args, 'r:n:o:w:')
    num_to_generate = 1
    oracle_eval_file = None
    fname_ttrees_out = None
    for opt, arg in opts:
        if opt == '-n':
            num_to_generate = int(arg)
        elif opt == '-o':
            oracle_eval_file = arg
        elif opt == '-w':
            fname_ttrees_out = arg

    if len(files) != 2:
        sys.exit(__doc__)
    fname_cand_model, fname_da_test = files

    # load model
    log_info('Initializing...')
    candgen = RandomCandidateGenerator.load_from_file(fname_cand_model)
    ranker = candgen
    tgen = SamplingPlanner({'candgen': candgen, 'ranker': ranker})

    # generate
    log_info('Generating...')
    gen_doc = Document()
    das = read_das(fname_da_test)
    for da in das:
        for _ in xrange(num_to_generate):  # repeat generation n times
            tgen.generate_tree(da, gen_doc)

    # evaluate if needed
    if oracle_eval_file is not None:
        log_info('Evaluating oracle F1...')
        log_info('Loading gold data from ' + oracle_eval_file)
        gold_trees = ttrees_from_doc(read_ttrees(oracle_eval_file), tgen.language, tgen.selector)
        gen_trees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)
        log_info('Gold data loaded.')
        correct, predicted, gold = 0, 0, 0
        for gold_tree, gen_chunk in zip(gold_trees, chunk_list(gen_trees, num_to_generate)):
            # find the best of the n predicted trees for this DA (in terms of F1)
            _, tc, tp, tg = max([(f1_from_counts(c, p, g), c, p, g)
                                 for c, p, g in [corr_pred_gold(gold_tree, gen_tree)
                                                 for gen_tree in gen_chunk]],
                                key=lambda x: x[0])
            correct += tc
            predicted += tp
            gold += tg
        # report the pooled oracle scores
        log_info("Oracle Precision: %.6f, Recall: %.6f, F1: %.6f" %
                 p_r_f1_from_counts(correct, predicted, gold))

    # write output
    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)
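# A hedged usage sketch for sample_gen (all file names hypothetical): the two
# positional arguments are the stored candidate-generator model and a test DA
# file; -n sets the number of trees sampled per DA, -o turns on oracle F1
# evaluation against a gold t-tree file, and -w writes the sampled trees out.
# The -r option is accepted by getopt above but currently ignored.
#
#     sample_gen(['-n', '5',
#                 '-o', 'gold-trees.yaml.gz',
#                 '-w', 'sampled-trees.yaml.gz',
#                 'candgen-model.pickle.gz', 'test-das.txt'])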