Example #1
0
    def process_qnode(self, kw: KgtkWriter, current_process_node_id: str,
                      each_node_attributes: EACH_NODE_ATTRIBUTES) -> bool:
        """Write the sentence row for one node when it has any attribute data.

        Args:
            kw: Writer that receives the generated row.
            current_process_node_id: Identifier placed in the first column.
            each_node_attributes: Attributes collected for the node; looked up
                by key here and passed on to ``attribute_to_sentence``.

        Returns:
            True when a row was written, False when the attributes were empty
            or every attribute value was falsy.
        """
        # A node with no attributes, or only falsy attribute values, is skipped.
        if not each_node_attributes or not any(
                each_node_attributes[name] for name in each_node_attributes):
            return False

        sentence, reason = self.attribute_to_sentence(each_node_attributes,
                                                      current_process_node_id)
        output_row = [
            current_process_node_id,
            self.sentence_label,
            KgtkFormat.stringify(sentence),
        ]
        if self.explain:
            # The explanation travels as an extra trailing column.
            output_row.append(KgtkFormat.stringify(reason))
        kw.write(output_row)
        return True
Example #2
0
 def write_row(self, ew: KgtkWriter, node1: str, label: str, node2: str):
     """Write one (node1, label, node2) row and count it.

     When ``self.idbuilder`` is set, the row is first passed through its
     ``build`` method together with the current ``self.output_line_count``
     and the result is written instead.
     """
     edge: typing.List[str] = [node1, label, node2]
     id_builder = self.idbuilder
     if id_builder is not None:
         # The count *before* this row is what the builder receives.
         edge = id_builder.build(edge, self.output_line_count)
     ew.write(edge)
     self.output_line_count += 1
Example #3
0
File: filter.py Project: yyht/kgtk
    def single_object_filter_inverted(
        kr: KgtkReader,
        kw: KgtkWriter,
        rw: typing.Optional[KgtkWriter],
        obj_idx: int,
        obj_filter: typing.Set[str],
    ):
        """Write rows whose object column differs from a single filter value.

        Rows whose ``row[obj_idx]`` equals the filter value are rejected and,
        when ``rw`` is not None, written there instead.

        ``verbose`` and ``error_file`` are not defined in this block; they are
        read from an enclosing scope.

        Args:
            kr: Source of input rows (iterated once).
            kw: Writer for rows that pass the filter.
            rw: Optional writer for rejected rows.
            obj_idx: Index of the column compared against the filter value.
            obj_filter: Filter set; only one element of it is used.

        Raises:
            IndexError: If ``obj_filter`` is empty.
        """
        if verbose:
            print("Applying a single object filter inverted",
                  file=error_file,
                  flush=True)

        # Only one element is taken from the set; presumably the caller picks
        # this function for one-element filters -- verify against the caller.
        obj_filter_value: str = list(obj_filter)[0]

        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            if row[obj_idx] != obj_filter_value:
                kw.write(row)
                output_line_count += 1
            else:
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1

        if verbose:
            # Diagnostics go to error_file, like the start message above, so
            # they never land on stdout.
            print("Read %d rows, rejected %d rows, wrote %d rows." %
                  (input_line_count, reject_line_count, output_line_count),
                  file=error_file,
                  flush=True)
Example #4
0
    def process_row(self,
                    input_key: str,
                    row: typing.List[str],
                    line_number: int,
                    idb: typing.Optional[KgtkIdBuilder],
                    ew: KgtkWriter,
                    flush: bool = False):
        """Fold one input row into the record being built for its key.

        When a record is pending and either ``flush`` is set or ``input_key``
        differs from the pending key, the pending record is compacted and, if
        ``self.current_row`` is not None afterwards, written to ``ew`` (passed
        through ``idb.build`` with ``line_number`` when ``idb`` is given).

        A flush call stops there. Otherwise the row starts a new record via
        ``expand_row`` or is merged into the pending one via ``merge_row``.

        Note: This code assumes that row lengths do not vary.
        """
        pending_key = self.current_key
        if pending_key is not None and (flush or pending_key != input_key):
            # The key is changing (or we are flushing): emit the finished record.
            self.compact_row()
            finished = self.current_row
            if finished is not None:
                ew.write(finished if idb is None else idb.
                         build(finished, line_number))
                self.output_line_count += 1
            self.current_key = None
            self.current_row = None

        if flush:
            # A flush request carries no row of its own.
            return

        if self.current_key is not None:
            # Same key as the pending record: merge into it.
            self.merge_row(row)
            return

        # No pending record: this row starts one.
        self.current_key = input_key
        self.expand_row(row)
Example #5
0
File: filter.py Project: yyht/kgtk
    def single_predicate_filter(
        kr: KgtkReader,
        kw: KgtkWriter,
        rw: typing.Optional[KgtkWriter],
        pred_idx: int,
        pred_filter: typing.Set[str],
    ):
        """Write rows whose predicate column equals a single filter value.

        Rows whose ``row[pred_idx]`` differs from the filter value are
        rejected and, when ``rw`` is not None, written there instead.

        ``verbose`` and ``error_file`` are not defined in this block; they are
        read from an enclosing scope.

        Args:
            kr: Source of input rows (iterated once).
            kw: Writer for rows that pass the filter.
            rw: Optional writer for rejected rows.
            pred_idx: Index of the column compared against the filter value.
            pred_filter: Filter set; only one element of it is used.

        Raises:
            IndexError: If ``pred_filter`` is empty.
        """
        if verbose:
            print("Applying a single predicate filter",
                  file=error_file,
                  flush=True)

        # Only one element is taken from the set; presumably the caller picks
        # this function for one-element filters -- verify against the caller.
        pred_filter_value: str = list(pred_filter)[0]

        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            if row[pred_idx] == pred_filter_value:
                kw.write(row)
                output_line_count += 1
            else:
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1

        if verbose:
            # Diagnostics go to error_file, like the start message above, so
            # they never land on stdout.
            print("Read %d rows, rejected %d rows, wrote %d rows." %
                  (input_line_count, reject_line_count, output_line_count),
                  file=error_file,
                  flush=True)
Example #6
0
    def process_cacheing_filter(self, input_kr: KgtkReader,
                                filter_kr: KgtkReader,
                                input_key_columns: typing.List[int],
                                filter_key_columns: typing.List[int],
                                ew: KgtkWriter):
        """Filter the input rows against a key set built from the filter file.

        The filter file's keys are collected with ``self.extract_key_set``.
        Each input row's key is built with ``self.build_key``; the row is
        written to ``ew`` when the key is in the set, or, when ``self.invert``
        is true, when it is not. Input order is preserved because rows are
        written as they are read.

        Args:
            input_kr: Source of the rows to filter.
            filter_kr: Source of the rows that supply the key set.
            input_key_columns: Column indexes forming an input row's key.
            filter_key_columns: Column indexes forming a filter row's key.
            ew: Writer that receives the selected rows.
        """
        if self.verbose:
            # All diagnostics go to error_file so they never land on stdout.
            print("Processing by cacheing the filter file's key set..",
                  file=self.error_file,
                  flush=True)
            print("Building the filter key set from %s" %
                  self.filter_file_path,
                  file=self.error_file,
                  flush=True)
        key_set: typing.Set[str] = self.extract_key_set(
            filter_kr, "filter", filter_key_columns)
        if self.verbose or self.very_verbose:
            print("There are %d entries in the filter key set." % len(key_set),
                  file=self.error_file,
                  flush=True)
            if self.very_verbose:
                print("Keys: %s" % " ".join(key_set),
                      file=self.error_file,
                      flush=True)

        if self.verbose:
            print("Filtering records from %s" % self.input_file_path,
                  file=self.error_file,
                  flush=True)
        input_line_count: int = 0
        output_line_count: int = 0

        # One loop serves both modes: a row is written when its membership in
        # the key set matches what the mode asks for.
        want_member: bool = not self.invert
        row: typing.List[str]
        for row in input_kr:
            input_line_count += 1
            input_key: str = self.build_key(row, input_key_columns)
            if (input_key in key_set) == want_member:
                ew.write(row)
                output_line_count += 1

        if self.verbose:
            print("Read %d records, wrote %d records." %
                  (input_line_count, output_line_count),
                  file=self.error_file,
                  flush=True)
Example #7
0
    def process_row(self,
                    input_key: str,
                    row: typing.List[str],
                    line_number: int,
                    idb: typing.Optional[KgtkIdBuilder],
                    ew: KgtkWriter,
                    flush: bool = False):
        """Fold one input row into the record being built for its key.

        When a record is pending and either ``flush`` is set or ``input_key``
        differs from the pending key, the pending record is compacted and, if
        ``self.current_row`` is not None afterwards, written to ``ew`` (passed
        through ``idb.build`` with ``line_number`` when ``idb`` is given).

        A flush call stops there. Otherwise the row starts a new record via
        ``expand_row`` or is merged into the pending one via ``merge_row``.

        With ``self.very_verbose`` set, each step is traced to
        ``self.error_file``.

        Note: This code assumes that row lengths do not vary.
        """
        if self.very_verbose:
            print("Input key %s" % repr(input_key), file=self.error_file, flush=True)
        if self.current_key is not None:
            if self.very_verbose:
                # This branch runs when a key IS pending, so report that key.
                print("Current key %s" % repr(self.current_key), file=self.error_file, flush=True)
            # We have a record being built.  Write it?
            if flush or self.current_key != input_key:
                if self.very_verbose:
                    if flush:
                        print("flush", file=self.error_file, flush=True)
                    else:
                        print("current_key %s != input_key %s" % (repr(self.current_key), repr(input_key)), file=self.error_file, flush=True)
                # self.current_key != input_key means that the key is changing.
                self.compact_row()
                if self.current_row is not None:
                    if self.very_verbose:
                        print("writing %s" % repr(self.field_separator.join(self.current_row)), file=self.error_file, flush=True)
                    if idb is None:
                        ew.write(self.current_row)
                    else:
                        ew.write(idb.build(self.current_row, line_number))
                    self.output_line_count += 1
                self.current_key = None
                self.current_row = None

        if flush:
            # This was a flush request.  We're done.
            return

        # Are we starting a new key?
        if self.current_key is None:
            # Save the new row.
            self.current_key = input_key
            if self.very_verbose:
                # Report the key after it is assigned; before, it is always None.
                print("New current_key %s" % repr(self.current_key), file=self.error_file, flush=True)
                print("Expand row %s" % self.field_separator.join(row), file=self.error_file, flush=True)
            self.expand_row(row)
        else:
            # Merge into an existing row.
            if self.very_verbose:
                print("Merge row", file=self.error_file, flush=True)
            self.merge_row(row)
Example #8
0
    def write_output_row(
        self,
        ew: KgtkWriter,
        row: typing.List[str],
        new_columns: int,
        input_select_column_idx: int,
        label_select_column_idx: int,
        labels: typing.Mapping[str, str],
        lifted_column_idxs: typing.List[int],
        lifted_output_column_idxs: typing.List[int],
    ) -> bool:
        """Copy a row, lift labels into it where allowed, and write it.

        The output row is a copy of ``row`` extended by ``new_columns`` empty
        cells. For each index in ``lifted_column_idxs`` whose cell value is a
        key of ``labels``, the label is stored at the matching index from
        ``lifted_output_column_idxs``.

        Lifting is skipped when the row is a label record (its
        ``label_select_column_idx`` cell equals
        ``self.label_select_column_value``) or when an input select value is
        configured and the row's ``input_select_column_idx`` cell differs
        from it. Label records are also not written when
        ``self.remove_label_records`` is true. A negative column index
        disables the corresponding check.

        Args:
            ew: Writer that receives the output row.
            row: The input row; it is not modified.
            new_columns: Number of empty cells appended to the copy.
            input_select_column_idx: Column checked against
                ``self.input_select_column_value``; also the column that gets
                ``self.output_select_column_value`` after a successful lift.
            label_select_column_idx: Column that marks label records.
            labels: Maps cell values to their labels.
            lifted_column_idxs: Columns whose values are looked up.
            lifted_output_column_idxs: Columns receiving the labels, parallel
                to ``lifted_column_idxs``.

        Returns:
            True when the row was written, False when it was suppressed.
        """
        output_row: typing.List[str] = row.copy()
        if new_columns > 0:
            output_row.extend([""] * new_columns)
        output_select_column_idx: int = input_select_column_idx

        do_write: bool = True
        do_lift: bool = True
        # A stray per-row debug print to stdout used to sit here; it is removed
        # so that this method writes nothing but the output row.
        if (label_select_column_idx >= 0 and
                row[label_select_column_idx] == self.label_select_column_value):
            # Don't lift label columns, if we have stored labels in the input records.
            do_lift = False
            if self.remove_label_records:
                do_write = False
        if (input_select_column_idx >= 0
                and self.input_select_column_value is not None and
                row[input_select_column_idx] != self.input_select_column_value):
            # Not selected for lifting into.
            do_lift = False

        if do_lift:
            # Lift the specified columns in this row.
            did_lift: bool = False
            for idx, lifted_column_idx in enumerate(lifted_column_idxs):
                label_key: str = row[lifted_column_idx]
                if label_key in labels:
                    output_row[
                        lifted_output_column_idxs[idx]] = labels[label_key]
                    did_lift = True  # What if we want to note if we lifted all columns?
            if (did_lift and output_select_column_idx >= 0
                    and self.output_select_column_value is not None):
                # Mark the row as lifted by rewriting its select column.
                output_row[
                    output_select_column_idx] = self.output_select_column_value

        if do_write:
            ew.write(output_row)
        return do_write
Example #9
0
    def pass_group_through(self, kw: KgtkWriter,
                           uninvolvedw: typing.Optional[KgtkWriter],
                           node1_group: typing.List[typing.List[str]],
                           new_id_column: bool):
        """Pass a group of rows through without unreifying them.

        Every row is written unchanged to ``uninvolvedw`` when that writer is
        given. The same row is then written to ``kw``; if ``new_id_column`` is
        true, a copy with one empty trailing cell is written instead, leaving
        the original row untouched. ``self.output_line_count`` grows by one
        per row.

        TODO: Perhaps we'd like to build an ID value at the same time?
        """
        for original in node1_group:
            if uninvolvedw is not None:
                uninvolvedw.write(original)

            # Pad a fresh list so the caller's row is never modified.
            outgoing = (original + [""]) if new_id_column else original
            kw.write(outgoing)
            self.output_line_count += 1
Example #10
0
    def process_row(self,
                    input_key: str,
                    row: typing.List[str],
                    line_number: int,
                    idb: typing.Optional[KgtkIdBuilder],
                    ew: KgtkWriter,
                    flush: bool = False):
        """Fold one input row into the record being built for its key.

        When a record is pending and either ``flush`` is set or ``input_key``
        differs from the pending key, the pending record is compacted and, if
        ``self.current_row`` is not None afterwards, written to ``ew`` (passed
        through ``idb.build`` with ``line_number`` when ``idb`` is given).

        A flush call stops there. Otherwise the first row of a key is simply
        remembered as ``self.current_row``; later rows with the same key are
        merged, with ``expand_row`` called first when
        ``self.current_row_lists`` is still None.

        Note: This code assumes that row lengths do not vary.
        """
        pending_key = self.current_key
        if pending_key is not None and (flush or pending_key != input_key):
            # The key is changing (or we are flushing): emit the finished record.
            self.compact_row()
            finished = self.current_row
            if finished is not None:
                ew.write(finished if idb is None else idb.
                         build(finished, line_number))
            self.current_key = None
            self.current_row = None

        if flush:
            # A flush request carries no row of its own.
            return

        if self.current_key is not None:
            # A further row for the pending key: expand lazily, then merge.
            if self.current_row_lists is None:
                self.expand_row()
            self.merge_row(row)
            return

        # First row for this key.  If the next row has a different key, this
        # one is written out with a minimum of handling.
        self.current_key = input_key
        self.current_row = row
Example #11
0
    def write_output_row(self, ew: KgtkWriter, row: typing.List[str],
                         new_columns: int, label_column_idx: int,
                         labels: typing.Mapping[str, str],
                         lifted_column_idxs: typing.List[int],
                         lifted_output_column_idxs: typing.List[int]):
        """Copy a row, lift labels into the copy, and write it.

        The output row is a copy of ``row`` extended by ``new_columns`` empty
        cells. Unless the row is a label record (``label_column_idx`` is
        non-negative and that cell equals ``self.label_column_value``), each
        cell named by ``lifted_column_idxs`` is looked up in ``labels`` and,
        when found, its label is stored at the parallel index from
        ``lifted_output_column_idxs``. The row is always written to ``ew``.
        """
        padded: typing.List[str] = row.copy()
        if new_columns > 0:
            padded += [""] * new_columns

        # Label records already carry their label, so nothing is lifted.
        is_label_record = (label_column_idx >= 0
                           and row[label_column_idx] == self.label_column_value)
        if not is_label_record:
            for position, source_idx in enumerate(lifted_column_idxs):
                candidate: str = row[source_idx]
                if candidate in labels:
                    found = labels[candidate]
                    padded[lifted_output_column_idxs[position]] = found

        ew.write(padded)
Example #12
0
 def process(self, kr: KgtkReader, kw: KgtkWriter):
     """Write every row from ``kr`` to ``kw`` after passing it through ``self.build``.

     ``build`` receives the row and its 1-based position in ``kr``.
     """
     for position, record in enumerate(kr, start=1):
         kw.write(self.build(record, position))
Example #13
0
    def process_cacheing_input_preserving_order(
            self, input_kr: KgtkReader, filter_kr: KgtkReader,
            input_key_columns: typing.List[int],
            filter_key_columns: typing.List[int], ew: KgtkWriter):
        """Filter cached input rows against the filter file, keeping input order.

        This algorithm preserves the input file's record order in the output
        file, at the cost of building each input row's key a second time.

        The input is cached and its key set built with
        ``self.extract_key_set_and_cache``. The filter file is then streamed:
        normally a key is selected when it occurs in both files; with
        ``self.invert`` set, the selection starts as every input key and each
        key seen in the filter file is removed. Finally the cached rows whose
        key is selected are written to ``ew``.

        Args:
            input_kr: Source of the rows to filter (cached in memory).
            filter_kr: Source of the filter rows (streamed).
            input_key_columns: Column indexes forming an input row's key.
            filter_key_columns: Column indexes forming a filter row's key.
            ew: Writer that receives the selected rows.
        """
        if self.verbose:
            # All diagnostics go to error_file so they never land on stdout.
            print(
                "Processing by cacheing the input file while preserving record order.",
                file=self.error_file,
                flush=True)

        # Step one:  read the input file, cache it, and build the input key set
        if self.verbose:
            print("Building the input key set from %s" % self.input_file_path,
                  file=self.error_file,
                  flush=True)
        input_key_set: typing.Set[str]
        input_cache: typing.List[typing.List[str]]
        input_key_set, input_cache = self.extract_key_set_and_cache(
            input_kr, "input", input_key_columns)
        input_line_count: int = len(input_cache)
        if self.verbose or self.very_verbose:
            print("There are %d rows in the input cache." % input_line_count,
                  file=self.error_file,
                  flush=True)
            print("There are %d entries in the input key set." %
                  len(input_key_set),
                  file=self.error_file,
                  flush=True)
            if self.very_verbose:
                print("Keys: %s" % " ".join(input_key_set),
                      file=self.error_file,
                      flush=True)

        # Step two: read the filter file and act on the key_set.
        if self.verbose:
            print("Applying the filter from %s" % self.filter_file_path,
                  file=self.error_file,
                  flush=True)
        output_key_set: typing.Set[str]
        filter_key: str
        filter_line_count: int = 0
        row: typing.List[str]
        if self.invert:
            # Start from every input key and drop the ones the filter names.
            # The input key set is reused in place; it is not needed again.
            output_key_set = input_key_set
            for row in filter_kr:
                filter_line_count += 1
                output_key_set.discard(
                    self.build_key(row, filter_key_columns))
        else:
            output_key_set = set()
            for row in filter_kr:
                filter_line_count += 1
                filter_key = self.build_key(row, filter_key_columns)
                if filter_key in input_key_set:
                    output_key_set.add(filter_key)
        if self.verbose:
            print("Read %d rows from the filter file." % filter_line_count,
                  file=self.error_file,
                  flush=True)
            print("There are %d entries in the output key set." %
                  len(output_key_set),
                  file=self.error_file,
                  flush=True)

        # Step three: read the input rows from the cache and write only the
        # ones with keys in the output key set.
        output_line_count: int = 0
        for row in input_cache:
            input_key: str = self.build_key(row, input_key_columns)
            if input_key in output_key_set:
                ew.write(row)
                output_line_count += 1
        if self.verbose:
            print("Wrote %d rows to the output file." % output_line_count,
                  file=self.error_file,
                  flush=True)
Example #14
0
    def process_cacheing_input(self, input_kr: KgtkReader,
                               filter_kr: KgtkReader,
                               input_key_columns: typing.List[int],
                               filter_key_columns: typing.List[int],
                               ew: KgtkWriter):
        """Filter cached input rows against the filter file, writing by key.

        The input rows are grouped in memory by the key from
        ``self.build_key``. The filter file is then streamed: normally a
        group is selected when its key occurs in the filter file; with
        ``self.invert`` set, every group is selected except those whose key
        occurs there. The selected groups are written to ``ew`` in sorted key
        order, each group in input order.

        Args:
            input_kr: Source of the rows to filter (cached in memory).
            filter_kr: Source of the filter rows (streamed).
            input_key_columns: Column indexes forming an input row's key.
            filter_key_columns: Column indexes forming a filter row's key.
            ew: Writer that receives the selected rows.
        """
        if self.verbose:
            # All diagnostics go to error_file so they never land on stdout.
            print("Processing by cacheing the input file.",
                  file=self.error_file,
                  flush=True)
        input_line_count: int = 0
        filter_line_count: int = 0
        output_line_count: int = 0

        # Map key values to lists of input and output data.
        inputmap: typing.MutableMapping[str,
                                        typing.List[typing.List[str]]] = {}
        outputmap: typing.MutableMapping[str,
                                         typing.List[typing.List[str]]] = {}

        if self.verbose:
            print("Reading the input data from %s" % self.input_file_path,
                  file=self.error_file,
                  flush=True)
        row: typing.List[str]
        for row in input_kr:
            input_line_count += 1
            input_key: str = self.build_key(row, input_key_columns)
            # Start a new group for an unseen key, then append to the group.
            inputmap.setdefault(input_key, []).append(row)

        if self.verbose:
            print("Applying the filter from %s" % self.filter_file_path,
                  file=self.error_file,
                  flush=True)
        filter_key: str
        if self.invert:
            # Start from every group and drop the ones the filter names.
            # The input map is reused in place; it is not needed again.
            outputmap = inputmap
            for row in filter_kr:
                filter_line_count += 1
                outputmap.pop(self.build_key(row, filter_key_columns), None)
        else:
            for row in filter_kr:
                filter_line_count += 1
                filter_key = self.build_key(row, filter_key_columns)
                if filter_key in inputmap:
                    outputmap[filter_key] = inputmap[filter_key]

        if self.verbose:
            print("Writing the output data to %s" % self.output_file_path,
                  file=self.error_file,
                  flush=True)

        # To simplify debugging, write the output data in sorted order (keys,
        # then input order).
        key: str
        for key in sorted(outputmap):
            for row in outputmap[key]:
                ew.write(row)
                output_line_count += 1

        if self.verbose:
            print(
                "Read %d input records, read %d filter records, wrote %d records."
                % (input_line_count, filter_line_count, output_line_count),
                file=self.error_file,
                flush=True)
Example #15
0
File: filter.py Project: yyht/kgtk
    def general_filter(kr: KgtkReader, kw: KgtkWriter,
                       rw: typing.Optional[KgtkWriter], subj_idx: int,
                       subj_filter: typing.Set[str], pred_idx: int,
                       pred_filter: typing.Set[str], obj_idx: int,
                       obj_filter: typing.Set[str]):
        """Filter rows on any combination of subject, predicate and object sets.

        A filter applies only when its set is non-empty. Each applied filter
        marks the row as ``keep`` when the column value is in the set and as
        ``reject`` otherwise. With ``or_pattern`` a row passes when any
        applied filter kept it; without it a row passes when no applied
        filter rejected it. ``invert`` flips that decision. Passing rows go
        to ``kw``; the rest go to ``rw`` when it is not None.

        ``verbose``, ``error_file``, ``invert`` and ``or_pattern`` are not
        defined in this block; they are read from an enclosing scope.

        Args:
            kr: Source of input rows (iterated once).
            kw: Writer for rows that pass.
            rw: Optional writer for rejected rows.
            subj_idx: Column index tested against ``subj_filter``.
            subj_filter: Accepted subject values; empty disables the test.
            pred_idx: Column index tested against ``pred_filter``.
            pred_filter: Accepted predicate values; empty disables the test.
            obj_idx: Column index tested against ``obj_filter``.
            obj_filter: Accepted object values; empty disables the test.
        """
        if verbose:
            print("Applying a general filter", file=error_file, flush=True)

        apply_subj_filter: bool = len(subj_filter) > 0
        apply_pred_filter: bool = len(pred_filter) > 0
        apply_obj_filter: bool = len(obj_filter) > 0
        input_line_count: int = 0
        reject_line_count: int = 0
        output_line_count: int = 0
        subj_filter_keep_count: int = 0
        pred_filter_keep_count: int = 0
        obj_filter_keep_count: int = 0
        subj_filter_reject_count: int = 0
        pred_filter_reject_count: int = 0
        obj_filter_reject_count: int = 0

        row: typing.List[str]
        for row in kr:
            input_line_count += 1

            keep: bool = False
            reject: bool = False
            if apply_subj_filter:
                if row[subj_idx] in subj_filter:
                    keep = True
                    subj_filter_keep_count += 1
                else:
                    reject = True
                    subj_filter_reject_count += 1

            if apply_pred_filter:
                if row[pred_idx] in pred_filter:
                    keep = True
                    pred_filter_keep_count += 1
                else:
                    reject = True
                    pred_filter_reject_count += 1

            if apply_obj_filter:
                if row[obj_idx] in obj_filter:
                    keep = True
                    obj_filter_keep_count += 1
                else:
                    reject = True
                    obj_filter_reject_count += 1

            # "not" binds looser than "^", so the OR-pattern test has always
            # meant not (keep ^ invert); the parentheses only make it explicit.
            if (not (keep ^ invert)) if or_pattern else (reject ^ invert):
                if rw is not None:
                    rw.write(row)
                reject_line_count += 1
            else:
                kw.write(row)
                output_line_count += 1

        if verbose:
            # Diagnostics go to error_file, like the start message above, so
            # they never land on stdout.
            print("Read %d rows, rejected %d rows, wrote %d rows." %
                  (input_line_count, reject_line_count, output_line_count),
                  file=error_file,
                  flush=True)
            print("Keep counts: subject=%d, predicate=%d, object=%d." %
                  (subj_filter_keep_count, pred_filter_keep_count,
                   obj_filter_keep_count),
                  file=error_file,
                  flush=True)
            print("Reject counts: subject=%d, predicate=%d, object=%d." %
                  (subj_filter_reject_count, pred_filter_reject_count,
                   obj_filter_reject_count),
                  file=error_file,
                  flush=True)