Beispiel #1
0
    def add_entity_label(self, node_id: str, node_label: str):
        """Remember a label for ``node_id``, preferring plain English labels.

        A label that starts with a single or double quote is split with
        ``KgtkFormat.destringify`` into text, language and language suffix;
        any other label is treated as text with no language.

        * A label whose language is ``"en"`` with an empty suffix always
          replaces whatever is stored for the node (only the text is stored),
          so the last English label read wins.
        * Any other label is stored only if the node has no label yet, and it
          is stored exactly as received (``node_label``, not the split text),
          so the first such label read wins.

        The four ``*_labels_*`` counters on ``self`` are updated to reflect
        which of these cases occurred.
        """
        if node_label.startswith(("'", '"')):
            label_text, lang, lang_suffix = KgtkFormat.destringify(node_label)
        else:
            label_text, lang, lang_suffix = node_label, "", ""

        is_plain_english: bool = lang == "en" and lang_suffix == ""
        already_labelled: bool = node_id in self.node_labels

        if is_plain_english:
            # English always overwrites; only the bookkeeping differs.
            if already_labelled:
                self.english_labels_reloaded += 1
            else:
                self.english_labels_loaded += 1
            self.node_labels[node_id] = label_text
            return

        if already_labelled:
            # Keep the earlier label, whatever its language was.
            self.non_english_labels_ignored += 1
            return

        self.node_labels[node_id] = node_label
        self.non_english_labels_loaded += 1
Beispiel #2
0
    def add_attr_to_map(
        self,
        attr_map: typing.MutableMapping[str, typing.Mapping[str, str]],
        attr: str,
        who: str,
    ):
        """Store a language-qualified string in ``attr_map`` under its language.

        ``attr`` is wrapped in a ``KgtkValue`` and must validate as a
        language-qualified string.  It is then split with
        ``KgtkFormat.destringify``; the language and language suffix are
        concatenated to form the map key, and the entry written is
        ``{"language": <key>, "value": <text>}``.  An existing entry for the
        same key is overwritten.

        Args:
            attr_map: Mapping to update in place.
            attr: The KGTK value to record.
            who: Identifier used only in error messages.

        Raises:
            ValueError: If ``attr`` is not a valid language-qualified string,
                or if its language part is empty.
        """
        kv: KgtkValue = KgtkValue(attr,
                                  options=self.value_options,
                                  parse_fields=False,
                                  error_file=self.error_file,
                                  verbose=self.verbose)
        if not kv.is_language_qualified_string(validate=True):
            raise ValueError("Invalid attr %s for %s" % (attr, who))

        text: str
        language: str
        language_suffix: str
        text, language, language_suffix = KgtkFormat.destringify(kv.value)
        if not language:
            raise ValueError("No attr language in %s for %s" % (attr, who))
        lang: str = language + language_suffix
        attr_map[lang] = {"language": lang, "value": text}
Beispiel #3
0
def load_property_labels_file(
    input_files: typing.List[str],
    error_file: typing.TextIO,
    reader_options: KgtkReaderOptions,
    value_options: KgtkValueOptions,
    label_filter: typing.List[str],
    verbose: bool = False,
):
    """Read label edges from KGTK files into a ``node1 -> label`` dict.

    Each file is opened with ``KgtkReader.open``.  The node1 and node2
    columns must be identifiable; the label column is required only when
    ``label_filter`` is non-empty, in which case rows whose label column is
    not in ``label_filter`` are skipped.

    For each remaining row the node2 value is the candidate label.  A value
    starting with a quote is split with ``KgtkFormat.destringify``.  A label
    whose language is ``"en"`` with no suffix always replaces the stored
    entry (text only), so the last English label read wins.  Any other label
    is stored, exactly as read, only if the node has no entry yet.

    Args:
        input_files: Paths of the KGTK files to read, in order.
        error_file: Stream for diagnostics; also passed to the reader.
        reader_options: Options passed to ``KgtkReader.open``.
        value_options: Value options passed to ``KgtkReader.open``.
        label_filter: Accepted label-column values; empty means accept all.
        verbose: Passed to ``KgtkReader.open``.

    Returns:
        The mapping from node1 value to chosen label.

    Raises:
        KGTKException: If a required column cannot be identified in a file.
    """
    labels_dict: typing.MutableMapping[str, str] = {}

    # Built once so the per-row membership test is O(1) rather than a list
    # scan repeated for every row of every file.
    wanted_labels: typing.FrozenSet[str] = frozenset(label_filter)

    for each_file in input_files:
        kr: KgtkReader = KgtkReader.open(
            Path(each_file),
            error_file=error_file,
            options=reader_options,
            value_options=value_options,
            verbose=verbose,
        )
        # try/finally so the reader is closed even when a required column is
        # missing or processing a row raises.
        try:
            # Report every missing column before failing, not just the first.
            fail: bool = False
            if kr.node1_column_idx < 0:
                fail = True
                print("Cannot determine which column is node1 in %s" %
                      each_file,
                      file=error_file,
                      flush=True)
            if wanted_labels and kr.label_column_idx < 0:
                fail = True
                print("Cannot determine which column is label in %s" %
                      each_file,
                      file=error_file,
                      flush=True)
            if kr.node2_column_idx < 0:
                fail = True
                print("Cannot determine which column is node2 in %s" %
                      each_file,
                      file=error_file,
                      flush=True)
            if fail:
                raise KGTKException("Cannot identify a required column in %s" %
                                    each_file)

            row: typing.List[str]
            for row in kr:
                if wanted_labels and row[
                        kr.label_column_idx] not in wanted_labels:
                    continue

                node_id: str = row[kr.node1_column_idx]
                node_label: str = row[kr.node2_column_idx]
                text: str
                language: str
                language_suffix: str
                if node_label.startswith(("'", '"')):
                    text, language, language_suffix = KgtkFormat.destringify(
                        node_label)
                else:
                    text = node_label
                    language = ""
                    language_suffix = ""

                # Take the last-read English label, otherwise the first-read
                # non-English label.
                if language == "en" and language_suffix == "":
                    labels_dict[node_id] = text
                elif node_id not in labels_dict:
                    labels_dict[node_id] = node_label
        finally:
            kr.close()
    return labels_dict
Beispiel #4
0
    def process_qual_datavalue(self, value: str, qual_row: typing.List[str],
                               datatype: str):
        """Build the ``datavalue`` mapping for a qualifier value.

        The result always has a ``"type"`` entry, copied from
        ``qual_row[self.qual_val_type_idx]``, and a ``"value"`` entry holding
        a nested mapping whose keys depend on the kind of ``value``:

        * non-empty entity type column: ``entity-type``, ``id``, ``numeric-id``
        * number: ``amount`` and ``unit`` (``"1"``)
        * quantity: ``amount``, ``unit``, optional ``lowerBound`` /
          ``higherBound``
        * language-qualified string: ``text`` and ``language``
        * string: ``type`` (``"string"``) and ``value``
        * date and time: ``time``, ``timezone``, ``before``, ``after``,
          ``precision``, ``calendarmodel``
        * location coordinates: ``latitude``, ``longitude``, ``altitude``,
          ``precision``, ``globe``
        * anything else: treated as a symbol and emitted as a string

        Args:
            value: The KGTK value to convert.
            qual_row: The qualifier row; read at the ``self.qual_*_idx``
                positions for value type, entity type, calendar and precision.
            datatype: Not used by this method.

        Returns:
            The ``datavalue`` mapping described above.

        Raises:
            ValueError: If the parsed value lacks a field its kind requires,
                if a date/time is not in Z time, if the numeric part of an
                entity id is not an integer, or if the precision column of a
                coordinate row is not a float.
        """
        datavalue: typing.MutableMapping[str, typing.Union[str, typing.Mapping[
            str, typing.Optional[typing.Union[str, int, float]]]]] = dict()
        datavalue["type"] = qual_row[self.qual_val_type_idx]

        valuemap: typing.MutableMapping[str, typing.Optional[typing.Union[
            str, int, float]]] = dict()
        datavalue["value"] = valuemap

        entity_type: str = qual_row[self.qual_entity_type_idx]
        if len(entity_type) > 0:
            valuemap["entity-type"] = entity_type
            valuemap["id"] = value

            # TODO: Is this the right thing to do for Q16097-F1?
            # Drop the one-character prefix and anything from the first "-".
            numeric_id: str = value[1:]
            if "-" in numeric_id:
                numeric_id = numeric_id[:numeric_id.index("-")]
            valuemap["numeric-id"] = int(numeric_id)
            return datavalue

        kv = KgtkValue(value,
                       options=self.value_options,
                       parse_fields=True,
                       error_file=self.error_file,
                       verbose=self.verbose)
        if not kv.validate():
            # An invalid value is reported but deliberately not fatal.
            print("Warning: Invalid KGTK value '%s'" % value,
                  file=self.error_file,
                  flush=True)
        if kv.fields is None:
            raise ValueError("KGTK value %s is missing fields." % value)

        if kv.is_number():
            if kv.fields.numberstr is None:
                raise ValueError("number is missing numberstr for %s." % value)

            valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign
            valuemap["unit"] = "1"
            return datavalue

        if kv.is_quantity():
            if kv.fields.numberstr is None:
                raise ValueError("quantity is missing numberstr for %s." %
                                 value)
            valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign

            if kv.fields.units_node is None:
                # TODO: Research this further.  Why did we get here?  Is it because import_wikidata
                # dropped the units?
                valuemap["unit"] = "undefined"
            else:
                valuemap[
                    "unit"] = "http://www.wikidata.org/entity/" + kv.fields.units_node

            if kv.fields.low_tolerancestr is not None and len(
                    kv.fields.low_tolerancestr) > 0:
                valuemap[
                    "lowerBound"] = kv.fields.low_tolerancestr  # TODO: add plus sign

            # NOTE(review): the Wikibase JSON quantity field is "upperBound";
            # "higherBound" is kept as-is - confirm what consumers expect.
            if kv.fields.high_tolerancestr is not None and len(
                    kv.fields.high_tolerancestr) > 0:
                valuemap[
                    "higherBound"] = kv.fields.high_tolerancestr  # TODO: add plus sign
            return datavalue

        if kv.is_language_qualified_string():
            text: str
            language: str
            language_suffix: str
            text, language, language_suffix = KgtkFormat.destringify(
                value)  # TODO: KgtkValue should do this to text
            language += language_suffix
            valuemap["text"] = text
            valuemap["language"] = language
            return datavalue

        if kv.is_string():
            valuemap["type"] = "string"
            valuemap["value"] = KgtkFormat.unstringify(
                value)  # TODO: KgtkValue should do this to text
            return datavalue

        if kv.is_date_and_times():
            if kv.fields.zonestr is None:
                raise ValueError("timezone is missing from %s." % value)
            if kv.fields.zonestr != "Z":
                raise ValueError("Only Z-time is supported, error in %s." %
                                 value)

            if kv.fields.date_and_time is None:
                raise ValueError("date_and_time is missing from %s." % value)
            valuemap["time"] = kv.fields.date_and_time
            valuemap["timezone"] = 0
            valuemap["before"] = 0
            valuemap["after"] = 0

            if kv.fields.precision is None:
                raise ValueError(
                    "date_and_time precision is missing from %s." % value)
            valuemap["precision"] = kv.fields.precision

            valuemap[
                "calendarmodel"] = "http://www.wikidata.org/entity/" + qual_row[
                    self.qual_calendar_idx]
            return datavalue

        if kv.is_location_coordinates():
            if kv.fields.latitude is None:
                raise ValueError("latitude is missing from %s" % value)
            valuemap["latitude"] = kv.fields.latitude

            if kv.fields.longitude is None:
                raise ValueError("longitude is missing from %s" % value)
            valuemap["longitude"] = kv.fields.longitude

            # Key was previously misspelled "altitide".
            valuemap["altitude"] = None  # deprecated

            valuemap["precision"] = float(qual_row[self.qual_precision_idx])

            valuemap["globe"] = "http://www.wikidata.org/entity/Q2"
            return datavalue

        # Default: convert the symbol to a string.  The value is wrapped in
        # double quotes before being handed to unstringify.
        valuemap["type"] = "string"
        valuemap["value"] = KgtkFormat.unstringify(
            '"' + value + '"')  # TODO: KgtkValue should do this to text
        return datavalue
Beispiel #5
0
    def process_edge_datavalue(self, value: str, edge_row: typing.List[str],
                               datatype: str):
        """Build the ``datavalue`` mapping for an edge value.

        The result always has a ``"type"`` entry, copied from
        ``edge_row[self.edge_val_type_idx]``, and a ``"value"`` entry holding
        a nested mapping whose keys depend on the kind of ``value``:

        * non-empty entity type column: ``entity-type``, ``id``, ``numeric-id``
        * number: ``amount`` and ``unit`` (``"1"``)
        * quantity: ``amount``, ``unit``, optional ``lowerBound`` /
          ``higherBound``
        * language-qualified string: ``text`` and ``language``
        * string: ``type`` (``"string"``) and ``value``
        * date and time: ``time``, ``timezone``, ``before``, ``after``,
          ``precision``, ``calendarmodel``
        * location coordinates: ``latitude``, ``longitude``, ``altitude``,
          ``globe``, and ``precision`` when the precision column parses as a
          float
        * anything else: treated as a symbol and emitted as a string

        Args:
            value: The KGTK value to convert.
            edge_row: The edge row; read at the ``self.edge_*_idx`` positions
                for value type, entity type, calendar and precision.
            datatype: Not used by this method.

        Returns:
            The ``datavalue`` mapping described above.

        Raises:
            ValueError: If the parsed value lacks a field its kind requires,
                if a date/time is not in Z time, or if the numeric part of an
                entity id is not an integer.
        """
        datavalue: typing.MutableMapping[str, typing.Union[str, typing.Mapping[
            str, typing.Optional[typing.Union[str, int, float]]]]] = dict()
        datavalue["type"] = edge_row[self.edge_val_type_idx]

        valuemap: typing.MutableMapping[str, typing.Optional[typing.Union[
            str, int, float]]] = dict()
        datavalue["value"] = valuemap

        entity_type: str = edge_row[self.edge_entity_type_idx]
        if len(entity_type) > 0:
            valuemap["entity-type"] = entity_type
            valuemap["id"] = value

            # TODO: Is this the right thing to do?
            # Drop the one-character prefix and anything from the first "-".
            numeric_id: str = value[1:]
            if "-" in numeric_id:
                numeric_id = numeric_id[:numeric_id.index("-")]
            valuemap["numeric-id"] = int(numeric_id)
            return datavalue

        kv = KgtkValue(value,
                       options=self.value_options,
                       parse_fields=True,
                       error_file=self.error_file,
                       verbose=self.verbose)
        if not kv.validate():
            # An invalid value is reported but deliberately not fatal.
            print("Warning: Invalid KGTK value '%s'" % value,
                  file=self.error_file,
                  flush=True)
        if kv.fields is None:
            raise ValueError("KGTK value '%s' is missing fields." % value)

        if kv.is_number():
            if kv.fields.numberstr is None:
                raise ValueError("number is missing numberstr.")

            valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign
            valuemap["unit"] = "1"
            return datavalue

        if kv.is_quantity():
            if kv.fields.numberstr is None:
                raise ValueError("quantity is missing numberstr.")
            valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign

            if kv.fields.units_node is None:
                # TODO: research this further.
                # The key was previously misspelled "init", which left the
                # quantity without any "unit" entry.
                valuemap["unit"] = "undefined"
            else:
                valuemap[
                    "unit"] = "http://www.wikidata.org/entity/" + kv.fields.units_node

            if kv.fields.low_tolerancestr is not None and len(
                    kv.fields.low_tolerancestr) > 0:
                valuemap[
                    "lowerBound"] = kv.fields.low_tolerancestr  # TODO: add plus sign

            # NOTE(review): the Wikibase JSON quantity field is "upperBound";
            # "higherBound" is kept as-is - confirm what consumers expect.
            if kv.fields.high_tolerancestr is not None and len(
                    kv.fields.high_tolerancestr) > 0:
                valuemap[
                    "higherBound"] = kv.fields.high_tolerancestr  # TODO: add plus sign
            return datavalue

        if kv.is_language_qualified_string():
            text: str
            language: str
            language_suffix: str
            text, language, language_suffix = KgtkFormat.destringify(
                value)  # TODO: KgtkValue should do this to text
            language += language_suffix
            valuemap["text"] = text
            valuemap["language"] = language
            return datavalue

        if kv.is_string():
            valuemap["type"] = "string"
            valuemap["value"] = KgtkFormat.unstringify(
                value)  # TODO: KgtkValue should do this to text
            return datavalue

        if kv.is_date_and_times():
            if kv.fields.zonestr is None:
                raise ValueError("timezone is missing.")
            if kv.fields.zonestr != "Z":
                raise ValueError("Only Z-time is supported.")

            if kv.fields.date_and_time is None:
                raise ValueError("date_and_time is missing.")
            valuemap["time"] = kv.fields.date_and_time
            valuemap["timezone"] = 0
            valuemap["before"] = 0
            valuemap["after"] = 0

            if kv.fields.precision is None:
                raise ValueError("date_and_time precision is missing.")
            valuemap["precision"] = kv.fields.precision

            valuemap[
                "calendarmodel"] = "http://www.wikidata.org/entity/" + edge_row[
                    self.edge_calendar_idx]
            return datavalue

        # The method must be called: the bare attribute is a bound method,
        # which is always truthy, so every remaining value used to be treated
        # as coordinates and the default branch below was unreachable.
        if kv.is_location_coordinates():
            if kv.fields.latitude is None:
                raise ValueError("latitude is missing")
            valuemap["latitude"] = kv.fields.latitude

            if kv.fields.longitude is None:
                raise ValueError("longitude is missing")
            valuemap["longitude"] = kv.fields.longitude

            # Key was previously misspelled "altitide".
            valuemap["altitude"] = None  # deprecated

            # TODO: Validate that it's OK to have location coordinates without precision.
            precision: str = edge_row[self.edge_precision_idx]
            if len(precision) > 0:
                try:
                    valuemap["precision"] = float(precision)
                except ValueError:
                    # A malformed precision is reported and then omitted.
                    print("Invalid precision '%s'" % precision,
                          file=self.error_file,
                          flush=True)

            valuemap["globe"] = "http://www.wikidata.org/entity/Q2"
            return datavalue

        # Default: convert the symbol to a string.  The value is not quoted
        # at this point, so it is wrapped in double quotes before being
        # handed to unstringify.
        valuemap["type"] = "string"
        valuemap["value"] = KgtkFormat.unstringify(
            '"' + value + '"')  # TODO: KgtkValue should do this to text
        return datavalue