def add_entity_label(self, node_id: str, node_label: str):
    """Remember a display label for ``node_id`` in ``self.node_labels``.

    Selection policy: the last-read plain English label (language ``en``
    with no language suffix) always wins; otherwise the first-read label of
    any other kind is kept and later ones are ignored.

    Labels that start with a single or double quote are parsed with
    ``KgtkFormat.destringify``; anything else is treated as plain text with
    no language.

    Note that an English label is stored as its parsed text, whereas a
    non-English label is stored exactly as it was passed in.

    The matching counter on ``self`` (``english_labels_loaded``,
    ``english_labels_reloaded``, ``non_english_labels_loaded`` or
    ``non_english_labels_ignored``) is incremented for every call.
    """
    if node_label.startswith(("'", '"')):
        label_text, lang, lang_suffix = KgtkFormat.destringify(node_label)
    else:
        label_text, lang, lang_suffix = node_label, "", ""

    already_known: bool = node_id in self.node_labels

    if lang == "en" and lang_suffix == "":
        # English labels always overwrite whatever was stored before.
        if already_known:
            self.english_labels_reloaded += 1
        else:
            self.english_labels_loaded += 1
        self.node_labels[node_id] = label_text
        return

    if already_known:
        # Never displace an existing label with a non-English one.
        self.non_english_labels_ignored += 1
        return

    self.node_labels[node_id] = node_label
    self.non_english_labels_loaded += 1
def add_attr_to_map(
    self,
    attr_map: typing.MutableMapping[str, typing.Mapping[str, str]],
    attr: str,
    who: str,
):
    """Add a language-qualified string attribute to ``attr_map``.

    ``attr`` is validated as a language-qualified string, split into its
    text, language and language suffix, and stored under the key
    ``language + language_suffix`` as
    ``{"language": <key>, "value": <text>}``.  An existing entry for the
    same key is overwritten.

    Args:
        attr_map: The mapping to update in place.
        attr: The KGTK-formatted attribute value.
        who: Identifies the owner of the attribute; used only in error
            messages.

    Raises:
        ValueError: If ``attr`` is not a valid language-qualified string,
            or if its language part is empty.
    """
    parsed: KgtkValue = KgtkValue(
        attr,
        options=self.value_options,
        parse_fields=False,
        error_file=self.error_file,
        verbose=self.verbose,
    )
    is_lq_string = parsed.is_language_qualified_string(validate=True)
    if not is_lq_string:
        raise ValueError("Invald attr %s for %s" % (attr, who))

    pieces = KgtkFormat.destringify(parsed.value)
    attr_text, attr_language, attr_suffix = pieces
    if not len(attr_language):
        raise ValueError("No attr language in %s for %s" % (attr, who))

    # The map key and the "language" field are the same combined tag.
    lang_key: str = attr_language + attr_suffix
    entry = {"language": lang_key, "value": attr_text}
    attr_map[lang_key] = entry
def load_property_labels_file(
    input_files: typing.List[str],
    error_file: typing.TextIO,
    reader_options: KgtkReaderOptions,
    value_options: KgtkValueOptions,
    label_filter: typing.List[str],
    verbose: bool = False,
):
    """Load node labels from one or more KGTK edge files.

    For every row, the node1 column is the node ID and the node2 column is
    its label.  The last-read plain English label (language ``en`` with no
    language suffix) wins; otherwise the first-read label of any other kind
    is kept.  English labels are stored as parsed text, other labels are
    stored as they appear in the file.

    Args:
        input_files: Paths of the files to read, processed in order.
        error_file: Stream that receives diagnostics.
        reader_options: Options passed to ``KgtkReader.open``.
        value_options: Value options passed to ``KgtkReader.open``.
        label_filter: When non-empty, only rows whose label column value is
            in this list are used (and a label column is then required).
        verbose: Passed through to ``KgtkReader.open``.

    Returns:
        A mapping from node ID to label.

    Raises:
        KGTKException: If a required column cannot be identified in a file.
    """
    labels_dict: typing.MutableMapping[str, str] = {}

    # Build the membership set once instead of scanning the list per row.
    filtering: bool = len(label_filter) > 0
    wanted_labels: typing.Set[str] = set(label_filter)

    for each_file in input_files:
        kr: KgtkReader = KgtkReader.open(
            Path(each_file),
            error_file=error_file,
            options=reader_options,
            value_options=value_options,
            verbose=verbose,
        )
        # Close the reader on every exit path, including the column-check
        # failure below and any error raised while iterating or parsing.
        try:
            fail: bool = False
            if kr.node1_column_idx < 0:
                fail = True
                print("Cannot determine which column is node1 in %s" % each_file,
                      file=error_file, flush=True)
            if filtering and kr.label_column_idx < 0:
                fail = True
                print("Cannot determine which column is label in %s" % each_file,
                      file=error_file, flush=True)
            if kr.node2_column_idx < 0:
                fail = True
                print("Cannot determine which column is node2 in %s" % each_file,
                      file=error_file, flush=True)
            if fail:
                raise KGTKException("Cannot identify a required column in %s" % each_file)

            row: typing.List[str]
            for row in kr:
                if filtering and row[kr.label_column_idx] not in wanted_labels:
                    continue

                node_id: str = row[kr.node1_column_idx]
                node_label: str = row[kr.node2_column_idx]

                text: str
                language: str
                language_suffix: str
                if node_label.startswith(("'", '"')):
                    text, language, language_suffix = KgtkFormat.destringify(node_label)
                else:
                    text, language, language_suffix = node_label, "", ""

                # Take the last-read English label, otherwise the
                # first-read non-English label.
                if language == "en" and language_suffix == "":
                    labels_dict[node_id] = text
                elif node_id not in labels_dict:
                    labels_dict[node_id] = node_label
        finally:
            kr.close()

    return labels_dict
def process_qual_datavalue(self, value: str, qual_row: typing.List[str], datatype: str):
    """Build the Wikidata-style ``datavalue`` mapping for a qualifier value.

    The result is ``{"type": <value type>, "value": <valuemap>}`` where the
    value type is read from ``qual_row[self.qual_val_type_idx]`` and the
    contents of ``valuemap`` depend on the kind of ``value``:

    * entity (``qual_row[self.qual_entity_type_idx]`` is non-empty):
      ``entity-type``, ``id`` and ``numeric-id``;
    * number / quantity: ``amount``, ``unit`` and optional
      ``lowerBound`` / ``upperBound``;
    * language-qualified string: ``text`` and ``language``;
    * string: ``type`` and ``value``;
    * date and time: ``time``, ``timezone``, ``before``, ``after``,
      ``precision`` and ``calendarmodel``;
    * location coordinates: ``latitude``, ``longitude``, ``altitude``,
      ``precision`` and ``globe``;
    * anything else is converted to a string.

    Args:
        value: The KGTK-formatted value to convert.
        qual_row: The qualifier row the value came from.
        datatype: Not used by this method.

    Returns:
        The ``datavalue`` mapping described above.

    Raises:
        ValueError: If the parsed value lacks a required field, if a date
            is not in Z-time, or if an entity ID or the coordinate
            precision cannot be converted to a number.
    """
    datavalue: typing.MutableMapping[str, typing.Union[str, typing.Mapping[
        str, typing.Optional[typing.Union[str, int, float]]]]] = dict()
    datavalue["type"] = qual_row[self.qual_val_type_idx]

    valuemap: typing.MutableMapping[str, typing.Optional[typing.Union[
        str, int, float]]] = dict()
    datavalue["value"] = valuemap

    entity_type: str = qual_row[self.qual_entity_type_idx]
    if len(entity_type) > 0:
        valuemap["entity-type"] = entity_type
        valuemap["id"] = value

        # TODO: Is this the right thing to do for Q16097-F1?
        # Drop the one-letter prefix and anything from the first "-" on.
        numeric_id: str = value[1:].partition("-")[0]
        valuemap["numeric-id"] = int(numeric_id)
        return datavalue

    kv = KgtkValue(value,
                   options=self.value_options,
                   parse_fields=True,
                   error_file=self.error_file,
                   verbose=self.verbose)
    if not kv.validate():
        # Invalid values are reported but still processed (best effort).
        print("Warning: Invalid KGTK value '%s'" % value,
              file=self.error_file, flush=True)
    if kv.fields is None:
        raise ValueError("KGTK value %s is missing fields." % value)

    if kv.is_number():
        if kv.fields.numberstr is None:
            raise ValueError("number is missing numberstr for %s." % value)
        valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign
        valuemap["unit"] = "1"
        return datavalue

    if kv.is_quantity():
        if kv.fields.numberstr is None:
            raise ValueError("quantity is missing numberstr for %s." % value)
        valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign

        if kv.fields.units_node is None:
            # TODO: Research this further.  Why did we get here?  Is it
            # because import_wikidata dropped the units?
            valuemap["unit"] = "undefined"
        else:
            valuemap["unit"] = "http://www.wikidata.org/entity/" + kv.fields.units_node

        if kv.fields.low_tolerancestr is not None and len(kv.fields.low_tolerancestr) > 0:
            valuemap["lowerBound"] = kv.fields.low_tolerancestr  # TODO: add plus sign

        if kv.fields.high_tolerancestr is not None and len(kv.fields.high_tolerancestr) > 0:
            # Wikibase JSON names this key "upperBound" (not "higherBound").
            valuemap["upperBound"] = kv.fields.high_tolerancestr  # TODO: add plus sign
        return datavalue

    if kv.is_language_qualified_string():
        text: str
        language: str
        language_suffix: str
        # TODO: KgtkValue should do this to text
        text, language, language_suffix = KgtkFormat.destringify(value)
        language += language_suffix
        valuemap["text"] = text
        valuemap["language"] = language
        return datavalue

    if kv.is_string():
        valuemap["type"] = "string"
        # TODO: KgtkValue should do this to text
        valuemap["value"] = KgtkFormat.unstringify(value)
        return datavalue

    if kv.is_date_and_times():
        if kv.fields.zonestr is None:
            raise ValueError("timezone is missing from %s." % value)
        if kv.fields.zonestr != "Z":
            raise ValueError("Only Z-time is supported, error in %s." % value)
        if kv.fields.date_and_time is None:
            raise ValueError("date_and_time is missing from %s." % value)
        valuemap["time"] = kv.fields.date_and_time
        valuemap["timezone"] = 0
        valuemap["before"] = 0
        valuemap["after"] = 0

        if kv.fields.precision is None:
            raise ValueError("date_and_time precision is missing from %s." % value)
        valuemap["precision"] = kv.fields.precision
        valuemap["calendarmodel"] = (
            "http://www.wikidata.org/entity/" + qual_row[self.qual_calendar_idx])
        return datavalue

    if kv.is_location_coordinates():
        if kv.fields.latitude is None:
            raise ValueError("latitude is missing from %s" % value)
        valuemap["latitude"] = kv.fields.latitude

        if kv.fields.longitude is None:
            raise ValueError("longitude is missing from %s" % value)
        valuemap["longitude"] = kv.fields.longitude

        # Wikibase JSON spells this key "altitude"; the value is deprecated.
        valuemap["altitude"] = None
        # NOTE(review): unlike process_edge_datavalue, an empty or
        # malformed precision raises ValueError here — confirm intended.
        valuemap["precision"] = float(qual_row[self.qual_precision_idx])
        valuemap["globe"] = "http://www.wikidata.org/entity/Q2"
        return datavalue

    # Default: convert the symbol to a string.
    valuemap["type"] = "string"
    # TODO: KgtkValue should do this to text
    valuemap["value"] = KgtkFormat.unstringify('"' + value + '"')
    return datavalue
def process_edge_datavalue(self, value: str, edge_row: typing.List[str], datatype: str):
    """Build the Wikidata-style ``datavalue`` mapping for an edge value.

    The result is ``{"type": <value type>, "value": <valuemap>}`` where the
    value type is read from ``edge_row[self.edge_val_type_idx]`` and the
    contents of ``valuemap`` depend on the kind of ``value``:

    * entity (``edge_row[self.edge_entity_type_idx]`` is non-empty):
      ``entity-type``, ``id`` and ``numeric-id``;
    * number / quantity: ``amount``, ``unit`` and optional
      ``lowerBound`` / ``upperBound``;
    * language-qualified string: ``text`` and ``language``;
    * string: ``type`` and ``value``;
    * date and time: ``time``, ``timezone``, ``before``, ``after``,
      ``precision`` and ``calendarmodel``;
    * location coordinates: ``latitude``, ``longitude``, ``altitude``,
      optional ``precision`` and ``globe``;
    * anything else is converted to a string.

    Args:
        value: The KGTK-formatted value to convert.
        edge_row: The edge row the value came from.
        datatype: Not used by this method.

    Returns:
        The ``datavalue`` mapping described above.

    Raises:
        ValueError: If the parsed value lacks a required field, if a date
            is not in Z-time, or if an entity ID cannot be converted to a
            number.
    """
    datavalue: typing.MutableMapping[str, typing.Union[str, typing.Mapping[
        str, typing.Optional[typing.Union[str, int, float]]]]] = dict()
    datavalue["type"] = edge_row[self.edge_val_type_idx]

    valuemap: typing.MutableMapping[str, typing.Optional[typing.Union[
        str, int, float]]] = dict()
    datavalue["value"] = valuemap

    entity_type: str = edge_row[self.edge_entity_type_idx]
    if len(entity_type) > 0:
        valuemap["entity-type"] = entity_type
        valuemap["id"] = value

        # TODO: Is this the right thing to do?
        # Drop the one-letter prefix and anything from the first "-" on.
        numeric_id: str = value[1:].partition("-")[0]
        valuemap["numeric-id"] = int(numeric_id)
        return datavalue

    kv = KgtkValue(value,
                   options=self.value_options,
                   parse_fields=True,
                   error_file=self.error_file,
                   verbose=self.verbose)
    if not kv.validate():
        # Invalid values are reported but still processed (best effort).
        print("Warning: Invalid KGTK value '%s'" % value,
              file=self.error_file, flush=True)
    if kv.fields is None:
        raise ValueError("KGTK value '%s' is missing fields." % value)

    if kv.is_number():
        if kv.fields.numberstr is None:
            raise ValueError("number is missing numberstr.")
        valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign
        valuemap["unit"] = "1"
        return datavalue

    if kv.is_quantity():
        if kv.fields.numberstr is None:
            raise ValueError("quantity is missing numberstr.")
        valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign

        if kv.fields.units_node is None:
            # TODO: research this further.
            # The key must be "unit" (it was previously misspelled "init",
            # which left the quantity without any unit entry).
            valuemap["unit"] = "undefined"
        else:
            valuemap["unit"] = "http://www.wikidata.org/entity/" + kv.fields.units_node

        if kv.fields.low_tolerancestr is not None and len(kv.fields.low_tolerancestr) > 0:
            valuemap["lowerBound"] = kv.fields.low_tolerancestr  # TODO: add plus sign

        if kv.fields.high_tolerancestr is not None and len(kv.fields.high_tolerancestr) > 0:
            # Wikibase JSON names this key "upperBound" (not "higherBound").
            valuemap["upperBound"] = kv.fields.high_tolerancestr  # TODO: add plus sign
        return datavalue

    if kv.is_language_qualified_string():
        text: str
        language: str
        language_suffix: str
        # TODO: KgtkValue should do this to text
        text, language, language_suffix = KgtkFormat.destringify(value)
        language += language_suffix
        valuemap["text"] = text
        valuemap["language"] = language
        return datavalue

    if kv.is_string():
        valuemap["type"] = "string"
        # TODO: KgtkValue should do this to text
        valuemap["value"] = KgtkFormat.unstringify(value)
        return datavalue

    if kv.is_date_and_times():
        if kv.fields.zonestr is None:
            raise ValueError("timezone is missing.")
        if kv.fields.zonestr != "Z":
            raise ValueError("Only Z-time is supported.")
        if kv.fields.date_and_time is None:
            raise ValueError("date_and_time is missing.")
        valuemap["time"] = kv.fields.date_and_time
        valuemap["timezone"] = 0
        valuemap["before"] = 0
        valuemap["after"] = 0

        if kv.fields.precision is None:
            raise ValueError("date_and_time precision is missing.")
        valuemap["precision"] = kv.fields.precision
        valuemap["calendarmodel"] = (
            "http://www.wikidata.org/entity/" + edge_row[self.edge_calendar_idx])
        return datavalue

    # The method must be *called*: a bare bound method is always truthy,
    # which sent every remaining value down the coordinates path.
    if kv.is_location_coordinates():
        if kv.fields.latitude is None:
            raise ValueError("latitude is missing")
        valuemap["latitude"] = kv.fields.latitude

        if kv.fields.longitude is None:
            raise ValueError("longitude is missing")
        valuemap["longitude"] = kv.fields.longitude

        # Wikibase JSON spells this key "altitude"; the value is deprecated.
        valuemap["altitude"] = None

        # TODO: Validate that it's OK to have location coordinates without precision.
        precision: str = edge_row[self.edge_precision_idx]
        if len(precision) > 0:
            try:
                valuemap["precision"] = float(precision)
            except ValueError:
                # Best effort: report the bad precision and omit it.
                print("Invalid precision '%s'" % precision,
                      file=self.error_file, flush=True)
        valuemap["globe"] = "http://www.wikidata.org/entity/Q2"
        return datavalue

    # Default: convert the symbol to a string.  The value is not quoted at
    # this point, so wrap it in quotes before unstringifying, exactly as
    # process_qual_datavalue does (presumably unstringify expects a quoted
    # KGTK string — verify against KgtkFormat).
    valuemap["type"] = "string"
    # TODO: KgtkValue should do this to text
    valuemap["value"] = KgtkFormat.unstringify('"' + value + '"')
    return datavalue