Example #1
0
    def _load_data_from_sample(hyperparameters: Dict[str, Any],
                               metadata: Dict[str, Any],
                               raw_sample: Dict[str, Any],
                               result_holder: Dict[str, Any],
                               is_train: bool = True) -> bool:
        """Extract per-supernode type-class targets from a raw sample.

        Delegates to the parent loader first; if the parent keeps the
        sample, collects the class index of every usable supernode
        annotation into ``result_holder['variable_target_class']``.

        Args:
            hyperparameters: Model hyperparameter dictionary.
            metadata: Dataset metadata used for type-index lookup.
            raw_sample: Raw sample dict; must contain a 'supernodes'
                mapping whose values carry an 'annotation' entry.
            result_holder: Output dict populated in place.
            is_train: When True, annotations flagged by
                ``ignore_type_annotation`` are skipped.

        Returns:
            True iff the parent loader kept the sample and at least one
            target annotation was collected.
        """
        keep_sample = super(Path2Annotation,
                            Path2Annotation)._load_data_from_sample(
                                hyperparameters, metadata, raw_sample,
                                result_holder, is_train)
        if not keep_sample:
            return False

        target_class = []  # type: List[int]
        # The supernode keys are not needed here, so iterate over the
        # values only (the original iterated .items() and discarded keys).
        for annotation_data in raw_sample['supernodes'].values():
            annotation = annotation_data['annotation']
            # During training, skip annotations the model should not learn.
            if is_train and ignore_type_annotation(annotation):
                continue
            target_class.append(
                TypeClassificationModel._get_idx_for_type(
                    annotation, metadata, hyperparameters))

        result_holder['variable_target_class'] = np.array(target_class,
                                                          dtype=np.uint16)
        return len(target_class) > 0
Example #2
0
    def _load_data_from_sample(hyperparameters: Dict[str, Any],
                               metadata: Dict[str, Any],
                               raw_sample: Dict[str, Any],
                               result_holder: Dict[str, Any],
                               is_train: bool = True) -> bool:
        """Collect target node indices, type strings and type-class ids.

        Runs the parent loader first; on success, walks the sample's
        supernodes and records, for every annotation that is kept, its
        integer node index, the raw annotation string, and the
        corresponding type-class index.

        Returns:
            True iff the parent kept the sample and at least one
            annotated target node was collected.
        """
        # Guard-style delegation to the parent loader.
        if not super(Graph2HybridMetric,
                     Graph2HybridMetric)._load_data_from_sample(
                         hyperparameters, metadata, raw_sample,
                         result_holder, is_train):
            return False

        node_idxs = []  # type: List[int]
        annotations = []  # type: List[str]
        annotation_class_ids = []  # type: List[int]
        for raw_idx, supernode in raw_sample['supernodes'].items():
            supernode_idx = int(raw_idx)
            annotation = supernode['annotation']
            # During training, skip annotations flagged as ignorable.
            if is_train and ignore_type_annotation(annotation):
                continue
            node_idxs.append(supernode_idx)
            annotations.append(annotation)
            annotation_class_ids.append(
                TypeClassificationModel._get_idx_for_type(
                    annotation, metadata, hyperparameters))

        result_holder['target_node_idxs'] = np.array(node_idxs,
                                                     dtype=np.uint16)
        result_holder['target_type'] = annotations
        result_holder['variable_target_class'] = np.array(
            annotation_class_ids, dtype=np.uint16)
        return len(node_idxs) > 0
    def _load_data_from_sample(hyperparameters: Dict[str, Any],
                               metadata: Dict[str, Any],
                               raw_sample: Dict[str, Any],
                               result_holder: Dict[str, Any],
                               is_train: bool = True) -> bool:
        """Build token-sequence tensors for the hybrid sequence model.

        After the parent loader accepts the sample, this (1) maps token
        nodes to their supernodes via OCCURRENCE_OF edges (with a CHILD-edge
        fallback for supernodes that have no token of their own), (2)
        selects the annotated variables to predict, (3) embeds the token
        sequence via ``TokenEmbedder``, and (4) stores index/target arrays
        in ``result_holder``.

        Returns:
            False when the parent rejects the sample, no usable annotated
            variable remains, or the token sequence is longer than
            ``hyperparameters['max_seq_len']``; otherwise True.
        """
        keep_sample = super(Sequence2HybridMetric,
                            Sequence2HybridMetric)._load_data_from_sample(
                                hyperparameters, metadata, raw_sample,
                                result_holder, is_train)
        if not keep_sample:
            return False

        token_node_idxs = set(raw_sample['token-sequence'])
        # Maps a token node index to the supernode it is an occurrence of.
        node_idx_to_supernode_idx = {}  #  type: Dict[int, int]
        for from_idx, to_idxs in raw_sample['edges']['OCCURRENCE_OF'].items():
            from_idx = int(from_idx)
            if from_idx not in token_node_idxs:
                # Some source nodes of OCCURRENCE_OF edges are not tokens
                # themselves (e.g. attribute-like nodes). Fall back to their
                # rightmost CHILD, which is asserted to be a token node.
                if str(from_idx) in raw_sample['edges']['CHILD']:
                    right_token_idx = max(
                        raw_sample['edges']['CHILD'][str(from_idx)])
                    assert right_token_idx in token_node_idxs
                    from_idx = right_token_idx
                else:
                    continue
            # NOTE(review): when to_idxs has more than one element, only the
            # last one survives for this from_idx — presumably each token
            # belongs to exactly one supernode; confirm with the data format.
            for to_idx in to_idxs:
                node_idx_to_supernode_idx[from_idx] = to_idx

        supernodes_with_related_nodes = set(node_idx_to_supernode_idx.values())

        variable_types = []  # type: List[str]
        variable_type_idxs = []  # type: List[int]
        ignored_supernodes = set()
        # Maps a kept supernode's node index to its dense variable index.
        supernode_idxs_to_annotated_variable_idx = {}  # type: Dict[int, int]
        for node_idx, supernode_data in raw_sample['supernodes'].items():
            node_idx = int(node_idx)
            annotation = supernode_data['annotation']
            # Skip annotations the model should not learn (training only).
            if ignore_type_annotation(annotation) and is_train:
                ignored_supernodes.add(node_idx)
                continue
            # Skip supernodes no token maps to — they would never be seen.
            if node_idx not in supernodes_with_related_nodes:
                ignored_supernodes.add(node_idx)
                continue

            # Dense variable indices are assigned in insertion order.
            variable_idx = len(supernode_idxs_to_annotated_variable_idx)
            variable_types.append(annotation)
            variable_type_idxs.append(
                TypeClassificationModel._get_idx_for_type(
                    annotation, metadata, hyperparameters))
            supernode_idxs_to_annotated_variable_idx[node_idx] = variable_idx

        if len(variable_types) == 0:
            return False

        token_idx, variable_idx = [], []

        def create_token_sequence():
            # Yields the node payload for each token while, as a side
            # effect, recording (sequence position, variable index) pairs
            # for tokens that belong to an annotated variable.
            for i, node_idx in enumerate(raw_sample['token-sequence']):
                supernode_idx = node_idx_to_supernode_idx.get(node_idx)
                if supernode_idx is not None:
                    annotated_variable_idxs = supernode_idxs_to_annotated_variable_idx.get(
                        supernode_idx)
                    if annotated_variable_idxs is not None:
                        token_idx.append(i)
                        variable_idx.append(annotated_variable_idxs)
                yield raw_sample['nodes'][node_idx]

        token_sequence = list(create_token_sequence())
        if len(token_sequence) > hyperparameters['max_seq_len']:
            return False

        # Did we see at least one token per variable?
        assert len(np.unique(variable_idx)) == len(variable_types)

        TokenEmbedder.load_data_from_sample('token', metadata, token_sequence,
                                            result_holder, hyperparameters,
                                            is_train)

        result_holder['sequence_length'] = len(token_sequence)
        result_holder['variable_token_idxs'] = np.array(token_idx,
                                                        dtype=np.uint32)
        result_holder['variable_idxs'] = np.array(variable_idx,
                                                  dtype=np.uint32)
        result_holder['target_type'] = variable_types
        result_holder['variable_target_class'] = np.array(variable_type_idxs,
                                                          dtype=np.uint32)
        result_holder['ignored_supernodes'] = ignored_supernodes
        return keep_sample