def score_operation(self, operation) -> float:
    """Score an operation as the mean of value-level Jaccard and syntactic similarity."""
    # A Constant trivially reproduces its target, so it gets a perfect score.
    if isinstance(operation, Constant):
        return 1.0
    produced = Column(values=operation.transform())
    expected = Column(values=operation.target_token.values)
    value_overlap = values_jaccard(produced, expected)
    syntax_overlap = syntactic_sim(produced, expected)
    return (value_overlap + syntax_overlap) / 2.0
def score_operation(self, operation) -> float:
    """Score how well *operation* reproduces its target token.

    A ``Constant`` is considered a perfect match (1.0); any other
    operation is scored by the learned similarity model comparing the
    transformed values against the target token's values.
    """
    if isinstance(operation, Constant):
        return 1.0
    transformed_column = Column(values=operation.transform())
    target_column = Column(values=operation.target_token.values)
    # Removed a stale commented-out debug print of the operation and the
    # first few transformed/target values.
    return self.scoring_model.predict_similarity(transformed_column, target_column)
def train_scoring_model(self, example_patterns_by_groups: List[List[Pattern]]):
    """Train the column-similarity scoring model from example pattern groups.

    Within each group, every sufficiently large token is split in half via
    2-fold cross-validation; both halves become columns that share a label.
    Same-label column pairs are positives, cross-label pairs negatives.
    """
    labeled_pairs = []
    # KFold without shuffling is stateless/deterministic, so one instance
    # can be reused for every token (hoisted out of the loops).
    kf = KFold(n_splits=2)
    # The original enumerated the groups only to immediately shadow the
    # index with `idx = 0`; the group index was never used.
    for example_patterns in example_patterns_by_groups:
        labeled_cols: List[Tuple[str, Column]] = []
        label_counter = 0
        for pattern in example_patterns:
            for token in pattern.tokens:
                # KFold needs at least n_splits samples; skip tiny tokens.
                if len(token.values) <= 2:
                    continue
                label_counter += 1
                label = str(label_counter)
                # Each fold's held-out half becomes one labeled column, so
                # every token yields two same-label columns — a guaranteed
                # positive pair for training.
                for _, fold_indices in kf.split(token.values):
                    fold_values = [token.values[index] for index in fold_indices]
                    labeled_cols.append((label, Column(label, label, fold_values)))
        # Full cross product (including self-pairs), mirroring the original.
        for label1, col1 in labeled_cols:
            for label2, col2 in labeled_cols:
                labeled_pairs.append((col1, col2, label1 == label2))
    try:
        self.scoring_model.train_from_pairs(labeled_pairs)
    except Exception as e:
        # Best-effort: a training failure is logged, not propagated.
        logger.error(e)
def to_cols(self):
    """Materialise each token of this pattern as a position-labelled Column."""
    return [
        Column(str(position), str(position), token.values)
        for position, token in enumerate(self.tokens)
    ]
def text_jaccard(col1: Column, col2: Column) -> float:
    """Jaccard similarity between the space-separated tokens of two columns' text."""
    tokens1 = col1.text().split(" ")
    tokens2 = col2.text().split(" ")
    return jaccard(np.array(tokens1), np.array(tokens2))
def text_cosine(col1: Column, col2: Column) -> float:
    """Cosine similarity between the full concatenated text of two columns."""
    text1 = col1.text()
    text2 = col2.text()
    return cosine(text1, text2)
def learn(self, original_tree: PatternTree, target_tree: PatternTree):
    """Learn a mapping from *original_tree* to *target_tree*.

    For each pattern layer (3 down to 1), greedily pairs every original
    node with its best-scoring target node, scores the resulting
    transformed column against the target column, and keeps the layer
    with the highest overall score. Returns the validated
    (original, transformed, validation) tuples and the mean pair score.
    """
    # Fit the scoring model on token columns drawn from both trees.
    self.token_mapper.train_scoring_model(
        original_tree.get_patterns_by_layers([1, 2, 3, 4], in_groups=True)
        + target_tree.get_patterns_by_layers([1, 2, 3, 4], in_groups=True)
    )
    final_node_to_result: Dict[PatternNode, TransformedResult] = {}
    target_col = Column(values=target_tree.values)
    best_score, best_level = 0, 0
    for layer in [3, 2, 1]:
        transformed_col = Column()
        # Larger patterns (more tokens) are matched first.
        original_nodes = sorted(
            original_tree.node_in_layers[layer],
            key=lambda x: len(x.value.tokens),
            reverse=True,
        )
        transformed_nodes = sorted(
            target_tree.node_in_layers[layer],
            key=lambda x: len(x.value.tokens),
            reverse=True,
        )
        node_to_result: Dict[PatternNode, TransformedResult] = {}
        for original_node in original_nodes:
            transformed_results = []
            for target_node in transformed_nodes:
                # token_mapper.learn returns (original/target value pairs, score).
                result = self.token_mapper.learn(
                    original_node.value, target_node.value
                )
                transformed_results.append(
                    TransformedResult(
                        original_target_pairs=result[0], score=result[1]
                    )
                )
            # Keep only the best-scoring target node for this original node.
            node_to_result[original_node] = max(
                transformed_results, key=lambda x: x.score
            )
            # value_tuples[1] is the tuple of transformed values.
            # NOTE(review): raises IndexError if original_target_pairs is
            # empty — presumably the mapper always returns >= 1 pair; verify.
            value_tuples = list(
                zip(*node_to_result[original_node].original_target_pairs)
            )
            transformed_col.extend_values(list(value_tuples[1]))
        # Score the whole layer's transformed column against the target column.
        pattern_score = self.token_mapper.scoring_model.predict_similarity(
            target_col, transformed_col
        )
        if pattern_score > best_score:
            best_score = pattern_score
            best_level = layer
            final_node_to_result = node_to_result.copy()
    validated_original_to_transformed_tuples = []
    scores = []
    for node, result in final_node_to_result.items():
        for original_value, transformed_value in result.original_target_pairs:
            validation_result = self.validator.validate_result(
                transformed_value,
                original_tree,
                target_tree,
                result.score,
                best_level,
            )
            validated_original_to_transformed_tuples.append(
                (original_value, transformed_value, validation_result)
            )
            scores.append(result.score)
    # NOTE(review): if no layer ever beats best_score=0, `scores` is empty
    # and this divides by zero — confirm at least one layer always scores > 0.
    return validated_original_to_transformed_tuples, sum(scores) * 1.0 / len(scores)
def learn_top_k(self, original_tree: PatternTree, target_tree: PatternTree, k: int):
    """Learn the top-*k* candidate mappings from *original_tree* to *target_tree*.

    Like ``learn`` but keeps up to *k* ranked transformation candidates
    per node, picks the best layer by either model similarity ("sim")
    or mean pattern score, and returns, per pattern:
    (original_value, [candidate transformed values], [validation results]),
    the per-pattern scores, and an overall boolean validation flag.
    """
    self.token_mapper.train_scoring_model(
        original_tree.get_patterns_by_layers([1, 2, 3, 4], in_groups=True)
        + target_tree.get_patterns_by_layers([1, 2, 3, 4], in_groups=True)
    )
    final_node_to_results: Dict[PatternNode, List[TransformedResult]] = {}
    target_col = Column(values=target_tree.values)
    # best_score starts at -1 so even an all-zero layer is accepted once.
    best_score, best_level = -1, 0
    for layer in range(len(PATTERNS_BY_LEVEL) - 1, 0, -1):
        logger.debug("Mapping level: %s" % layer)
        transformed_col = Column()
        # Sort by coverage (more values first), tie-broken by fewer tokens.
        original_nodes = sorted(
            original_tree.node_in_layers[layer],
            key=lambda x: (len(x.value.values), -len(x.value.tokens)),
            reverse=True,
        )
        transformed_nodes = sorted(
            target_tree.node_in_layers[layer],
            key=lambda x: (len(x.value.values), -len(x.value.tokens)),
            reverse=True,
        )
        node_to_results = {}
        pattern_scores = []
        logger.debug(
            "Num combinations: %s * %s = %s"
            % (
                len(original_nodes),
                len(transformed_nodes),
                len(original_nodes) * len(transformed_nodes),
            )
        )
        # Only the top-ranked nodes get the expensive mapping search.
        num_possible_nodes = 10
        for original_node in original_nodes[:num_possible_nodes]:
            token_mapping_results = []
            for target_node in transformed_nodes:
                # Returns parallel lists: k candidate pair-lists and k scores.
                validated_values_by_pattern, scores_by_pattern = self.token_mapper.learn_top_k(
                    original_node.value, target_node.value, k
                )
                token_mapping_results.append(
                    [
                        TransformedResult(*result)
                        for result in zip(
                            validated_values_by_pattern, scores_by_pattern
                        )
                    ]
                )
            # Choose the target node whose BEST (first) candidate scores highest.
            node_to_results[original_node] = max(
                token_mapping_results, key=lambda x: x[0].score
            )
            pattern_scores.append(
                max([x.score for x in node_to_results[original_node]])
            )
            # Extend the layer's transformed column with the top candidate's
            # transformed values (value_tuples[1]).
            value_tuples = list(
                zip(*node_to_results[original_node][0].original_target_pairs)
            )
            transformed_col.extend_values(list(value_tuples[1]))
        # Remaining nodes get a zero-score identity-to-empty placeholder.
        for original_node in original_nodes[num_possible_nodes:]:
            # NOTE(review): this inner loop overwrites node_to_results each
            # iteration, so only ONE placeholder survives regardless of k —
            # looks like it was meant to accumulate k results; confirm intent.
            for i in range(k):
                transform_result = TransformedResult(
                    [(str_value, "") for str_value in original_node.value.values],
                    0.0,
                )
                node_to_results[original_node] = [transform_result]
            value_tuples = list(
                zip(*node_to_results[original_node][0].original_target_pairs)
            )
            transformed_col.extend_values(list(value_tuples[1]))
        if self.mapping_method == "sim":
            assert isinstance(self.token_mapper.scoring_model, MultiBinary)
            # Model similarity, down-weighted by the fraction of non-empty
            # transformed values.
            pattern_score = (
                self.token_mapper.scoring_model.predict_similarity(
                    target_col, transformed_col
                )
                * len([x for x in transformed_col.values if x])
                * 1.0
                / len(transformed_col.values)
            )
        else:
            pattern_score = np.mean(pattern_scores)
        if pattern_score > best_score:
            best_score = pattern_score
            best_level = layer
            final_node_to_results = node_to_results.copy()
    # One slot per mapped node; filled index-parallel with the dict order.
    validated_values_by_pattern = [[] for _ in range(len(final_node_to_results))]
    scores_by_pattern = [[] for _ in range(len(final_node_to_results))]
    idx = 0
    full_validation_result = False
    for node, results in final_node_to_results.items():
        # Overall flag: any node whose best candidate validates makes it True.
        full_validation_result = (
            full_validation_result
            or self.validator.validate_results(
                results[0].original_target_pairs,
                target_tree,
                results[0].score,
                0 if len(results) == 1 else results[1].score,
                1,
            )[0]
        )
        validated_values_by_pattern[idx] = []
        for idx1, result in enumerate(results):
            for idx2, (original_value, transformed_value) in enumerate(
                result.original_target_pairs
            ):
                # Score margin passed to the validator: the sole candidate is
                # fully trusted (1 vs 0); middle candidates use their own score
                # vs the next one; the last candidate gets (0, 0).
                if len(results) == 1:
                    current_score = 1
                    next_score = 0
                elif idx1 < len(results) - 1:
                    current_score = result.score
                    next_score = results[idx1 + 1].score
                else:
                    current_score = 0
                    next_score = 0
                validation_result = self.validator.validate_results(
                    [(original_value, transformed_value)],
                    target_tree,
                    current_score,
                    next_score,
                    1,
                )
                if idx1 == 0:
                    # First candidate creates the per-value entry.
                    validated_values_by_pattern[idx].append(
                        (original_value, [transformed_value], [validation_result])
                    )
                else:
                    # Later candidates append to the entry created above; all
                    # candidates must agree on the original value.
                    current_values = validated_values_by_pattern[idx][idx2][0]
                    assert (
                        original_value == current_values
                    ), f"Original value should be the same f{original_value} vs {current_values}"
                    validated_values_by_pattern[idx][idx2][1].append(
                        transformed_value
                    )
                    validated_values_by_pattern[idx][idx2][2].append(
                        validation_result
                    )
            scores_by_pattern[idx].append(result.score)
        idx += 1
    return validated_values_by_pattern, scores_by_pattern, full_validation_result