# Third-party imports used by the functions in this section. Project-specific helpers such as
# py_str, get_dataset, datasets, cfg, graphs_df, timer and FormulaVisitor are assumed to be
# provided by the surrounding modules.
import logging

import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm


def initial_evaluation(model_logit, questions_all, problems_all, batch_size, print_each_problem=False):
    print('Initial evaluation...')
    ds_individual = datasets.questions.individual.dict_to_dataset(questions_all, problems_all)
    print(f'Evaluating with batch size {batch_size}...')
    eval_res = model_logit.evaluate(datasets.questions.batch.batch(ds_individual, batch_size), return_dict=True)
    print(f'Evaluation result with batch size {batch_size}: {eval_res}')
    ds_batches = datasets.questions.batch.batch(ds_individual, 1)
    print('Evaluating with batch size 1...')
    eval_res = model_logit.evaluate(ds_batches, return_dict=True)
    print(f'Evaluation result with batch size 1: {eval_res}')
    batch_sizes = []
    batch_losses = []
    logit_lists = []
    for batch in tqdm(ds_batches, disable=print_each_problem):
        problem_names = [py_str(p) for p in batch['problems']]
        eval_res = model_logit.evaluate(tf.data.Dataset.from_tensors(batch), return_dict=True, verbose=0)
        call_res = model_logit(batch, training=False)
        if print_each_problem:
            print(f'{problem_names}: {eval_res}, {call_res}')
        batch_sizes.append(len(problem_names))
        batch_losses.append(eval_res['loss'])
        logit_lists.append(call_res.flat_values)
    print('Weighted average loss (expected overall loss): ', np.average(batch_losses, weights=batch_sizes))
    print('Mean of batch losses: ', np.mean(batch_losses))
    # 0.69 is approximately log(2), the loss of an uninformed binary classifier.
    print('Median of batch losses (expected: 0.69): ', np.median(batch_losses))
    logits = tf.concat(logit_lists, axis=0)
    print('Mean of question logits (expected: 0): ', np.mean(logits))
    print('Mean of question accuracies (expected: 0.5): ', np.mean(logits.numpy() > 0))
    accs = []
    for logit in logit_lists:
        n_correct = np.count_nonzero(logit.numpy() > 0)
        acc = n_correct / len(logit)
        accs.append(acc)
    print('Weighted average accuracy (expected overall accuracy): ', np.average(accs, weights=batch_sizes))
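# Illustrative sketch (not from the original module): why the per-batch losses above are combined
# with np.average weighted by batch size. With equally weighted examples, the overall mean loss
# equals the batch-size-weighted average of per-batch mean losses, while the plain mean of the
# batch losses generally differs.
def _example_weighted_batch_loss():
    per_example_losses = [np.array([0.2, 0.4, 0.6]), np.array([1.0])]
    batch_losses = [l.mean() for l in per_example_losses]  # [0.4, 1.0]
    batch_sizes = [len(l) for l in per_example_losses]  # [3, 1]
    overall_loss = np.concatenate(per_example_losses).mean()  # 0.55
    assert np.isclose(np.average(batch_losses, weights=batch_sizes), overall_loss)
    assert not np.isclose(np.mean(batch_losses), overall_loss)  # 0.7 != 0.55
    return overall_loss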
def is_sufficiently_small_py(problem):
    # k and graphs_df are captured from the enclosing scope; graphs_df is indexed by problem name
    # and presumably holds the graphification records produced by graphify below.
    if cfg.gcn.max_problem_nodes[k] is None:
        # No node limit is configured for k, so every problem is acceptable.
        return True
    num_nodes = graphs_df['graph_nodes'][py_str(problem)]
    if pd.notna(num_nodes) and num_nodes <= cfg.gcn.max_problem_nodes[k]:
        return True
    # Unknown (NaN) or excessive node count: reject the problem.
    return False
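# Illustrative sketch (assumption, not from the original module): is_sufficiently_small_py runs
# eager Python (a pandas lookup), so filtering a tf.data.Dataset of problem names with it
# presumably goes through tf.py_function.
def _example_filter_problems(problems_dataset):
    return problems_dataset.filter(
        lambda problem: tf.py_function(is_sufficiently_small_py, [problem], tf.bool))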
def graphify(self, problem, cache=True):
    # TODO: Time the whole call (all inclusive).
    problem = py_str(problem)
    logging.debug(f'Graphifying problem {problem}...')
    record = {'problem': problem, 'error': None, 'clausify_cached': cache}
    nodes_lower_bound = self.nodes_lower_bound(problem)
    if self.max_number_of_nodes is not None and nodes_lower_bound > self.max_number_of_nodes:
        record['error'] = 'nodes_from_tptp_header'
        record['graph_nodes_lower_bound'] = nodes_lower_bound
        return None, record
    with timer() as t:
        clausify_result = self.clausifier.clausify(problem, cache=cache)
    record.update({
        'clausify_returncode': clausify_result.returncode,
        'clausify_time': t.elapsed,
        'clausify_time_original': clausify_result.time_elapsed
    })
    if clausify_result.returncode != 0 or clausify_result.clauses is None or clausify_result.symbols is None:
        logging.debug(f'Failed to graphify problem {problem}: clausification failed.')
        record['error'] = 'clausify'
        return None, record
    symbol_types = ('predicate', 'function')
    symbols = {symbol_type: clausify_result.symbols_of_type(symbol_type) for symbol_type in symbol_types}
    record['num_clauses'] = len(clausify_result.clauses)
    record.update({f'num_{symbol_type}': len(symbols[symbol_type]) for symbol_type in symbol_types})
    with timer() as t:
        try:
            g = self.clausify_result_to_graph(clausify_result)
        except FormulaVisitor.NumNodesError as e:
            # The graph would be too large (too many nodes).
            logging.debug(f'Failed to graphify problem {problem}.', exc_info=True)
            record['error'] = 'node_count'
            g = None
            record['graph_nodes_lower_bound'] = e.actual
    record['graph_time'] = t.elapsed
    if g is not None:
        record['graph_nodes'] = g.num_nodes()
        record['graph_nodes_lower_bound'] = g.num_nodes()
        record.update({f'graph_nodes_{ntype}': g.num_nodes(ntype) for ntype in g.ntypes})
        record['graph_edges'] = sum(g.num_edges(canonical_etype) for canonical_etype in g.canonical_etypes)
    logging.debug(f'Problem {problem} graphified.')
    return g, record
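# Illustrative sketch (assumption, not from the original module): the record dicts returned by
# graphify can be collected into a pandas DataFrame indexed by problem name, which matches the
# graphs_df['graph_nodes'][problem] lookup used by is_sufficiently_small_py above.
def _example_collect_graph_records(graphifier, problems):
    graphs = {}
    records = []
    for problem in problems:
        g, record = graphifier.graphify(problem)
        if g is not None:
            graphs[record['problem']] = g
        records.append(record)
    graphs_df = pd.DataFrame.from_records(records, index='problem')
    return graphs, graphs_df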
def get_datasets_split(patterns, validation_split, max_problems=None):
    problems_all = get_dataset(patterns)
    logging.info('Number of problems available: %d', problems_all.cardinality())
    logging.debug('Leading 10 problems: %s', [py_str(p) for p in problems_all.take(10)])
    if max_problems is not None:
        problems_all = problems_all.take(max_problems)
        logging.info('Number of problems taken: %d', problems_all.cardinality())
    assert 0 <= validation_split <= 1
    problems_validation_count = tf.cast(tf.cast(problems_all.cardinality(), tf.float32) * validation_split, tf.int64)
    problems = {
        'validation': problems_all.take(problems_validation_count),
        'train': problems_all.skip(problems_validation_count)
    }
    for k in problems:
        logging.info(f'Number of {k} problems: %d', problems[k].cardinality())
    return problems
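# Illustrative sketch (assumption, not from the original module): typical use of
# get_datasets_split. The glob pattern, split fraction and problem limit are made-up values.
def _example_get_splits():
    problems = get_datasets_split(['TPTP/Problems/**/*.p'], validation_split=0.2, max_problems=1000)
    train_ds = problems['train']
    validation_ds = problems['validation']
    return train_ds, validation_ds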
def gen():
    # Closure: problems, questions, dtype and normalize come from the enclosing scope.
    for problem in problems:
        try:
            q = tf.constant(questions[py_str(problem)], dtype=dtype)
            # Let n be the number of symbols in the problem.
            n_symbols = tf.shape(q)[1]
            n_symbols = tf.cast(n_symbols, q.dtype)
            tf.debugging.assert_greater_equal(n_symbols, tf.reduce_max(q))
            tf.debugging.assert_less_equal(-n_symbols, tf.reduce_min(q))
            if normalize:
                # We scale the question matrix by a factor that makes questions from problems of various sizes commensurable.
                # We set the factor so that if symbol cost is 1 for all symbols, then precedence cost is 1 for all precedences.
                # Note that 2 / (n * (n + 1)) is the reciprocal of 1 + 2 + ... + n.
                q *= 2 / (n_symbols * (n_symbols + 1))
                tf.debugging.assert_greater_equal(2 / (n_symbols + 1), tf.reduce_max(q))
                tf.debugging.assert_less_equal(-2 / (n_symbols + 1), tf.reduce_min(q))
            yield {'problem': problem, 'questions': q}
        except KeyError:
            # No questions are available for this problem; skip it.
            pass
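# Illustrative sketch (assumption, not from the original module): gen is presumably wrapped into
# a tf.data.Dataset. The questions value is a matrix whose second dimension indexes symbols
# (n_symbols = tf.shape(q)[1]), so its shape varies from problem to problem.
def _example_questions_dataset():
    return tf.data.Dataset.from_generator(
        gen,
        output_signature={
            'problem': tf.TensorSpec(shape=(), dtype=tf.string),
            'questions': tf.TensorSpec(shape=(None, None), dtype=dtype),
        })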
def _predict_one(self, problem):
    problem = py_str(problem)
    try:
        # self.costs maps problem names to precomputed costs.
        return self.costs[problem], True
    except KeyError:
        # Unknown problem: fall back to a placeholder cost vector and signal invalidity.
        return self.invalid_costs(), False