def join_tsv(self,
             values: typing.List[str],
             unquoted: bool = False,
             unescape_pipe: bool = True,
             csvlike: bool = False,
             ) -> str:
    """Join KGTK field values into a single tab-separated line.

    values: the KGTK-formatted column values for one row.
    unquoted: when True, strip the quotes/sigil from string and
        language-qualified string values (losing any language code).
    unescape_pipe: passed through to KgtkFormat.unstringify(...).
    csvlike: when True (and unquoted is False), unstringify string values
        and re-quote them CSV-style (embedded '"' doubled, wrapped in '"').

    Returns the joined line (no trailing newline).  Raises ValueError when
    a string value cannot be unstringified; an error message is printed to
    self.error_file first.
    """
    def unstringify_safely(val: str) -> str:
        # Shared by the `unquoted` and `csvlike` paths (previously duplicated).
        #
        # What if the value is a list?  unstringify(...) will be
        # unhappy.  The following hack protects strings (but not
        # language-qualified strings) against errors, introducing
        # an ambiguity when exporting lists:
        val = val.replace('"|"', '|')
        try:
            return KgtkFormat.unstringify(
                val, unescape_pipe=unescape_pipe)  # Lose the language code.
        except ValueError as e:
            print("KgtkWriter: File %s: Error unstringifying %s" %
                  (repr(self.file_path), repr(val)),
                  file=self.error_file,
                  flush=True)
            raise e

    line: str = ""
    value: str
    for value in values:
        # TODO: Complain if the value is a KGTK List.
        if value.startswith(KgtkFormat.DATE_AND_TIMES_SIGIL):
            value = self.reformat_datetime(value)

        elif value.startswith((KgtkFormat.STRING_SIGIL,
                               KgtkFormat.LANGUAGE_QUALIFIED_STRING_SIGIL)):
            if unquoted:
                value = unstringify_safely(value)
            elif csvlike:
                # CSV-style quoting, even though this is a TSV line.
                value = unstringify_safely(value)
                value = '"' + value.replace('"', '""') + '"'
            else:
                value = value.replace("\\|", "|")
        else:
            value = value.replace("\\|", "|")

        if len(line) > 0:
            line += "\t"
        line += value
    return line
def join_csv(self,
             values: typing.List[str],
             unquoted: bool = False,
             ) -> str:
    """Join KGTK field values into a single comma-separated (CSV) line.

    values: the KGTK-formatted column values for one row.
    unquoted: when True, emit unstringified string values bare instead of
        re-quoting them CSV-style.

    Returns the joined line (no trailing newline).  Raises ValueError when
    a string value cannot be unstringified; for consistency with
    join_tsv(...), an error message is printed to self.error_file first.
    """
    line: str = ""
    value: str
    for value in values:
        # TODO: Complain if the value is a KGTK List.
        if value.startswith(KgtkFormat.DATE_AND_TIMES_SIGIL):
            value = self.reformat_datetime(value)

        elif value.startswith((KgtkFormat.STRING_SIGIL,
                               KgtkFormat.LANGUAGE_QUALIFIED_STRING_SIGIL)):
            # What if the value is a list?  unstringify(...) will be
            # unhappy.  The following hack protects strings (but not
            # language-qualified strings) against errors, introducing
            # an ambiguity when exporting lists:
            value = value.replace('"|"', '|')
            try:
                value = KgtkFormat.unstringify(
                    value)  # Lose the language code.
            except ValueError as e:
                # Report the failure before re-raising, matching join_tsv(...).
                print("KgtkWriter: File %s: Error unstringifying %s" %
                      (repr(self.file_path), repr(value)),
                      file=self.error_file,
                      flush=True)
                raise e

            # TODO: Complain if internal newline or carriage return.
            if not unquoted:
                value = '"' + value.replace('"', '""') + '"'
        else:
            value = value.replace("\\|", "|")
            if '"' in value or ',' in value:
                # A symbol with an internal double quote or comma: turn it into a string.
                value = '"' + value.replace('"', '""') + '"'

        if len(line) > 0:
            line += ","
        line += value
    return line
def process_qnode_edge_qualifier(
        self, statement: typing.MutableMapping[str, typing.Any],
        edge_id: str, qualifier_row: typing.List[str]):
    """Attach one qualifier (from a KGTK qualifier row) to `statement`.

    The qualifier is appended to statement["qualifiers"][property], creating
    the intermediate containers on first use.  "somevalue"/"novalue" node2
    values become snaktypes; anything else is converted to a datavalue.
    """
    # Create the per-statement qualifier index on first use.
    qualifiers = statement.setdefault("qualifiers", dict())

    prop: str = qualifier_row[self.qual_label_idx]
    proplist: typing.List[typing.Mapping[str, typing.Any]] = \
        qualifiers.setdefault(prop, list())

    qualifier: typing.MutableMapping[str, typing.Any] = dict()
    proplist.append(qualifier)
    qualifier["property"] = prop

    datatype: str = qualifier_row[self.qual_wikidatatype_idx]
    qualifier["datatype"] = datatype

    # The hash column, when present, is stored as a KGTK string.
    datahash: str = qualifier_row[self.qual_datahash_idx]
    if len(datahash) > 0:
        qualifier["hash"] = KgtkFormat.unstringify(datahash)

    value: str = qualifier_row[self.qual_node2_idx]
    if value in ("somevalue", "novalue"):
        qualifier["snaktype"] = value
    else:
        qualifier["datavalue"] = self.process_qual_datavalue(
            value, qualifier_row, datatype)
def add_sitelink(self, result: typing.MutableMapping[str, typing.Any],
                 edge_id: str,
                 qualifier_rows: typing.List[typing.List[str]]):
    """Assemble one sitelink record from its qualifier rows.

    Scans `qualifier_rows` for "site", "title" and "badge" labels, then
    stores {"site", "title", "badges"} under result["sitelinks"][site].
    Raises ValueError when the site or the title is missing.
    """
    if "sitelinks" not in result:
        result["sitelinks"] = dict()
    sitelinks: typing.MutableMapping[str, typing.Mapping[str, typing.Union[
        str, typing.List[str]]]] = result["sitelinks"]

    site: str = ""
    title: str = ""
    badges: typing.List[str] = list()

    row: typing.List[str]
    for row in qualifier_rows:
        kind: str = row[self.qual_label_idx]
        if kind == "site":
            site = row[self.qual_node2_idx]
        elif kind == "title":
            # Titles are stored as KGTK strings; strip the quoting.
            title = KgtkFormat.unstringify(row[self.qual_node2_idx])
        elif kind == "badge":
            badges.append(row[self.qual_node2_idx])

    if len(site) == 0:
        # TODO: give a better error message.
        raise ValueError("Missing sitelink site for %s" % edge_id)
    if len(title) == 0:
        # TODO: give a better error message.
        raise ValueError("Missing sitelink title for %s" % edge_id)

    sitelinks[site] = {"site": site, "title": title, "badges": badges}
def reformat_value_for_json(self, value: str) -> typing.Union[str, int, float, bool]:
    """Convert a KGTK value into a natural JSON value.

    Strings are unstringified (losing any language code), the KGTK
    true/false symbols become booleans, and integer literals (optionally
    signed) become ints.  Anything else is passed through unchanged.
    """
    # TODO: Complain if the value is a KGTK List.
    if value.startswith((KgtkFormat.STRING_SIGIL,
                         KgtkFormat.LANGUAGE_QUALIFIED_STRING_SIGIL)):
        # What if the value is a list?  unstringify(...) will be
        # unhappy.  The following hack protects strings (but not
        # language-qualified strings) against errors, introducing
        # an ambiguity when exporting lists:
        return KgtkFormat.unstringify(
            value.replace('"|"', '|'))  # Lose the language code.

    if value == KgtkFormat.TRUE_SYMBOL:
        return True
    if value == KgtkFormat.FALSE_SYMBOL:
        return False

    # Integer literal, with an optional leading sign.
    digits: str = value[1:] if value.startswith(("+", "-")) else value
    if digits.isdigit():
        return int(value)

    # TODO: process floating point numbers.
    # TODO: process datetimes
    # TODO: process geolocations
    return value
def process_row(
    self,
    node_id: str,
    node_property: str,
    node_value: str,
    each_node_attributes: EACH_NODE_ATTRIBUTES,
):
    """Fold one KGTK edge (node_id, node_property, node_value) into the
    per-node attribute accumulator `each_node_attributes`.

    Values are unstringified, resolved to labels, and routed into the
    accumulator roles declared for the property in
    self.properties_reversed; unknown properties are optionally recorded
    under HAS_PROPERTIES when self.add_all_properties is set.
    """
    if self.very_verbose:
        print("Processing row (%s, %s, %s)" %
              (repr(node_id), repr(node_property), repr(node_value)),
              file=self.error_file,
              flush=True)

    # CMR: the following code looks like it was intended to remove
    # any language code and language suffix.  It would have the
    # side effect of removing location coordinates entirely.
    #
    # remove @ mark
    # if "@" in node_value and node_value[0] != "@":
    #     node_value = node_value[:node_value.index("@")]

    # CMR: Better to use KgtkFormat.unstringify(node_value), as it will remove escapes from
    # internal double or single quotes.
    #
    # remove extra double quote " and single quote '
    # while len(node_value) >= 3 and node_value[0] == '"' and node_value[-1] == '"':
    #     node_value = node_value[1:-1]
    # while len(node_value) >= 3 and node_value[0] == "'" and node_value[-1] == "'":
    #     node_value = node_value[1:-1]
    if node_value.startswith(("'", '"')):
        # NOTE(review): may raise on a malformed string value -- unstringify
        # is not wrapped in try/except here.
        node_value = KgtkFormat.unstringify(node_value)

    # in case we meet an empty value, skip it
    if node_value == "":
        self._logger.warning(
            """Skip line ({}, {}, {}) because of empty value.""".format(
                node_id, node_property, node_value))
        return

    if self.very_verbose:
        print("Revised node_value = %s" % repr(node_value),
              file=self.error_file,
              flush=True)

    if node_property in self.properties_reversed:
        if self.very_verbose:
            print("node_property %s is in self.properties_reversed" %
                  repr(node_property),
                  file=self.error_file,
                  flush=True)
        # Copy so the discards below don't mutate the shared role set.
        roles = self.properties_reversed[node_property].copy()
        node_value = self.get_real_label_name(node_value)
        if self.very_verbose:
            print("node_value label = %s" % repr(node_value),
                  file=self.error_file,
                  flush=True)

        # if we get property_values, it should be saved to isa-properties part
        if self.PROPERTY_VALUES in roles:
            if self.very_verbose:
                print("property_values is in roles",
                      file=self.error_file,
                      flush=True)

            # for property values part, changed to be "{property} {value}"
            node_value_combine = self.get_real_label_name(
                node_property) + " " + self.get_real_label_name(node_value)
            if self.very_verbose:
                print("node_value_combine = %s" % repr(node_value_combine),
                      file=self.error_file,
                      flush=True)
            if each_node_attributes is None:
                raise ValueError("each_node_attributes is missing")
            property_values: typing.Optional[
                Lexicalize.ATTRIBUTE_TYPES] = each_node_attributes[
                    self.PROPERTY_VALUES]
            if isinstance(property_values, list):
                property_values.append(node_value_combine)
            else:
                raise ValueError(
                    'each_node_attributes["property_values"] is not a list.'
                )
            if self.very_verbose:
                print('each_node_attributes["property_values"] = %s' %
                      repr(property_values),
                      file=self.error_file,
                      flush=True)

            # remove those 2 roles in case we have duplicate using of this node later
            roles.discard(self.PROPERTY_VALUES)
            roles.discard(self.HAS_PROPERTIES)

        # Record the value under every remaining role for this property.
        for each_role in roles:
            attrs: Lexicalize.ATTRIBUTE_TYPES = each_node_attributes[
                each_role]
            if isinstance(attrs, set):
                attrs.add(node_value)
            elif isinstance(attrs, list):
                attrs.append(node_value)
            else:
                raise ValueError(
                    'each_node_attributes[%s] is not a list or set.' %
                    repr(each_role))
            if self.very_verbose:
                print("%s: %s" % (each_role, repr(attrs)),
                      file=self.error_file,
                      flush=True)

    elif self.add_all_properties:  # add remained properties if need all properties
        if self.very_verbose:
            print("self.add_all_properties is True",
                  file=self.error_file,
                  flush=True)
        attrs2: Lexicalize.ATTRIBUTE_TYPES = each_node_attributes[
            self.HAS_PROPERTIES]
        if isinstance(attrs2, list):
            attrs2.append(self.get_real_label_name(node_property))
            if self.very_verbose:
                print("has_properties: %s" % repr(attrs2),
                      file=self.error_file,
                      flush=True)
        else:
            raise ValueError(
                'each_node_attributes["has_properties"] is not a list.')
    return
def process_qual_datavalue(self, value: str, qual_row: typing.List[str],
                           datatype: str):
    """Build a Wikidata-style "datavalue" mapping for a qualifier's node2
    value.

    value: the KGTK node2 value of the qualifier edge.
    qual_row: the full qualifier row (indexed via self.qual_*_idx).
    datatype: the wikidatatype column value (currently unused here).

    Returns a dict {"type": ..., "value": {...}} whose inner mapping is
    filled according to the kind of KGTK value detected.
    """
    datavalue: typing.MutableMapping[str, typing.Union[str, typing.Mapping[
        str, typing.Optional[typing.Union[str, int, float]]]]] = dict()
    datavalue["type"] = qual_row[self.qual_val_type_idx]

    valuemap: typing.MutableMapping[str, typing.Optional[typing.Union[
        str, int, float]]] = dict()
    datavalue["value"] = valuemap

    # A non-empty entity type marks an entity reference (Qxxx/Pxxx):
    # short-circuit without parsing the value as a KgtkValue.
    entity_type: str = qual_row[self.qual_entity_type_idx]
    if len(entity_type) > 0:
        valuemap["entity-type"] = entity_type
        valuemap["id"] = value

        # TODO: Is this the right thing to do for Q16097-F1?
        numeric_id: str = value[1:]
        if "-" in numeric_id:
            numeric_id = numeric_id[:numeric_id.index("-")]
        valuemap["numeric-id"] = int(numeric_id)
        return datavalue

    kv = KgtkValue(value,
                   options=self.value_options,
                   parse_fields=True,
                   error_file=self.error_file,
                   verbose=self.verbose)
    if not kv.validate():
        # Invalid values are reported but still processed best-effort.
        # raise ValueError("Invalid KGTK value '%s'" % value)
        print("Warning: Invalid KGTK value '%s'" % value,
              file=self.error_file,
              flush=True)
    if kv.fields is None:
        raise ValueError("KGTK value %s is missing fields." % value)

    if kv.is_number():
        if kv.fields.numberstr is None:
            raise ValueError("number is missing numberstr for %s." % value)
        valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign
        valuemap["unit"] = "1"
        return datavalue

    if kv.is_quantity():
        if kv.fields.numberstr is None:
            raise ValueError("quantity is missing numberstr for %s." % value)
        valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign

        if kv.fields.units_node is None:
            # TODO: Research this further.  Why did we get here?  Is it because import_wikidata
            # dropped the units?
            #
            # raise ValueError("quantity is missing units_node for %s in: %s" % (value, " ".join(qual_row)))
            valuemap["unit"] = "undefined"
        else:
            valuemap[
                "unit"] = "http://www.wikidata.org/entity/" + kv.fields.units_node

        if kv.fields.low_tolerancestr is not None and len(
                kv.fields.low_tolerancestr) > 0:
            valuemap[
                "lowerBound"] = kv.fields.low_tolerancestr  # TODO: add plus sign
        if kv.fields.high_tolerancestr is not None and len(
                kv.fields.high_tolerancestr) > 0:
            # NOTE(review): the Wikidata JSON format spells this key
            # "upperBound"; "higherBound" looks like a typo -- confirm
            # against consumers before changing.
            valuemap[
                "higherBound"] = kv.fields.high_tolerancestr  # TODO: add plus sign
        return datavalue

    if kv.is_language_qualified_string():
        text: str
        language: str
        language_suffix: str
        text, language, language_suffix = KgtkFormat.destringify(
            value)  # TODO: KgtkValue should do this to text
        language += language_suffix
        valuemap["text"] = text
        valuemap["language"] = language
        return datavalue

    if kv.is_string():
        # NOTE(review): this writes "type"/"value" into the inner value map,
        # not into `datavalue` -- confirm downstream expectations.
        valuemap["type"] = "string"
        valuemap["value"] = KgtkFormat.unstringify(
            value)  # TODO: KgtkValue should do this to text
        return datavalue

    if kv.is_date_and_times():
        if kv.fields.zonestr is None:
            raise ValueError("timezone is missing from %s." % value)
        if kv.fields.zonestr != "Z":
            raise ValueError("Only Z-time is supported, error in %s." % value)

        if kv.fields.date_and_time is None:
            raise ValueError("date_and_time is missing from %s." % value)
        valuemap["time"] = kv.fields.date_and_time

        valuemap["timezone"] = 0
        valuemap["before"] = 0
        valuemap["after"] = 0

        if kv.fields.precision is None:
            raise ValueError(
                "date_and_time precision is missing from %s." % value)
        valuemap["precision"] = kv.fields.precision

        valuemap[
            "calendarmodel"] = "http://www.wikidata.org/entity/" + qual_row[
                self.qual_calendar_idx]
        return datavalue

    if kv.is_location_coordinates():
        if kv.fields.latitude is None:
            raise ValueError("latitude is missing from %s" % value)
        valuemap["latitude"] = kv.fields.latitude

        if kv.fields.longitude is None:
            raise ValueError("longitude is missing from %s" % value)
        valuemap["longitude"] = kv.fields.longitude

        # NOTE(review): "altitide" looks like a typo for "altitude" -- kept
        # as-is since it is an emitted JSON key; confirm before changing.
        valuemap["altitide"] = None  # deprecated
        valuemap["precision"] = float(qual_row[self.qual_precision_idx])
        valuemap["globe"] = "http://www.wikidata.org/entity/Q2"
        return datavalue

    # Default: convert the symbol to a string.
    valuemap["type"] = "string"
    valuemap["value"] = KgtkFormat.unstringify(
        '"' + value + '"')  # TODO: KgtkValue should do this to text
    return datavalue
def process_edge_datavalue(self, value: str, edge_row: typing.List[str],
                           datatype: str):
    """Build a Wikidata-style "datavalue" mapping for an edge's node2 value.

    value: the KGTK node2 value of the edge.
    edge_row: the full edge row (indexed via self.edge_*_idx).
    datatype: the wikidatatype column value (currently unused here).

    Returns a dict {"type": ..., "value": {...}} whose inner mapping is
    filled according to the kind of KGTK value detected.
    """
    datavalue: typing.MutableMapping[str, typing.Union[str, typing.Mapping[
        str, typing.Optional[typing.Union[str, int, float]]]]] = dict()
    datavalue["type"] = edge_row[self.edge_val_type_idx]

    valuemap: typing.MutableMapping[str, typing.Optional[typing.Union[
        str, int, float]]] = dict()
    datavalue["value"] = valuemap

    # A non-empty entity type marks an entity reference (Qxxx/Pxxx):
    # short-circuit without parsing the value as a KgtkValue.
    entity_type: str = edge_row[self.edge_entity_type_idx]
    if len(entity_type) > 0:
        valuemap["entity-type"] = entity_type
        valuemap["id"] = value

        # TODO: Is this the right thing to do?
        numeric_id: str = value[1:]
        if "-" in numeric_id:
            numeric_id = numeric_id[:numeric_id.index("-")]
        valuemap["numeric-id"] = int(numeric_id)
        return datavalue

    kv = KgtkValue(value,
                   options=self.value_options,
                   parse_fields=True,
                   error_file=self.error_file,
                   verbose=self.verbose)
    if not kv.validate():
        # Invalid values are reported but still processed best-effort.
        # raise ValueError("Invalid KGTK value '%s'" % value)
        print("Warning: Invalid KGTK value '%s'" % value,
              file=self.error_file,
              flush=True)
    if kv.fields is None:
        raise ValueError("KGTK value '%s' is missing fields." % value)

    if kv.is_number():
        if kv.fields.numberstr is None:
            raise ValueError("number is missing numberstr.")
        valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign
        valuemap["unit"] = "1"
        return datavalue

    if kv.is_quantity():
        if kv.fields.numberstr is None:
            raise ValueError("quantity is missing numberstr.")
        valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign

        if kv.fields.units_node is None:
            # TODO: research this further.
            #
            # raise ValueError("quantity is missing units_node for %s." % value)
            #
            # FIX: was `valuemap["init"]` -- a typo for "unit" (the
            # qualifier variant of this routine writes "unit" here).
            valuemap["unit"] = "undefined"
        else:
            valuemap[
                "unit"] = "http://www.wikidata.org/entity/" + kv.fields.units_node

        if kv.fields.low_tolerancestr is not None and len(
                kv.fields.low_tolerancestr) > 0:
            valuemap[
                "lowerBound"] = kv.fields.low_tolerancestr  # TODO: add plus sign
        if kv.fields.high_tolerancestr is not None and len(
                kv.fields.high_tolerancestr) > 0:
            # NOTE(review): the Wikidata JSON format spells this key
            # "upperBound"; "higherBound" looks like a typo -- confirm
            # against consumers before changing.
            valuemap[
                "higherBound"] = kv.fields.high_tolerancestr  # TODO: add plus sign
        return datavalue

    if kv.is_language_qualified_string():
        text: str
        language: str
        language_suffix: str
        text, language, language_suffix = KgtkFormat.destringify(
            value)  # TODO: KgtkValue should do this to text
        language += language_suffix
        valuemap["text"] = text
        valuemap["language"] = language
        return datavalue

    if kv.is_string():
        valuemap["type"] = "string"
        valuemap["value"] = KgtkFormat.unstringify(
            value)  # TODO: KgtkValue should do this to text
        return datavalue

    if kv.is_date_and_times():
        if kv.fields.zonestr is None:
            raise ValueError("timezone is missing.")
        if kv.fields.zonestr != "Z":
            raise ValueError("Only Z-time is supported.")

        if kv.fields.date_and_time is None:
            raise ValueError("date_and_time is missing.")
        valuemap["time"] = kv.fields.date_and_time

        valuemap["timezone"] = 0
        valuemap["before"] = 0
        valuemap["after"] = 0

        if kv.fields.precision is None:
            raise ValueError("date_and_time precision is missing.")
        valuemap["precision"] = kv.fields.precision

        valuemap[
            "calendarmodel"] = "http://www.wikidata.org/entity/" + edge_row[
                self.edge_calendar_idx]
        return datavalue

    # FIX: was `if kv.is_location_coordinates:` (no call) -- a bound method
    # is always truthy, so this branch was taken for every value that
    # reached it, misclassifying strings/symbols as coordinates.
    if kv.is_location_coordinates():
        if kv.fields.latitude is None:
            raise ValueError("latitude is missing")
        valuemap["latitude"] = kv.fields.latitude

        if kv.fields.longitude is None:
            raise ValueError("longitude is missing")
        valuemap["longitude"] = kv.fields.longitude

        # NOTE(review): "altitide" looks like a typo for "altitude" -- kept
        # as-is since it is an emitted JSON key; confirm before changing.
        valuemap["altitide"] = None  # deprecated

        # TODO: Validate that it's OK to have location coordinates without precision.
        precision: str = edge_row[self.edge_precision_idx]
        if len(precision) > 0:
            try:
                valuemap["precision"] = float(
                    edge_row[self.edge_precision_idx])
            except ValueError:
                print("Invalid precision '%s'" % precision,
                      file=self.error_file,
                      flush=True)

        valuemap["globe"] = "http://www.wikidata.org/entity/Q2"
        return datavalue

    # Default: treat as string.
    valuemap["type"] = "string"
    valuemap["value"] = KgtkFormat.unstringify(
        value)  # TODO: KgtkValue should do this to text
    return datavalue
def run(input_file: KGTKFiles, output_file: KGTKFiles):
    """Convert an ATOMIC-style CSV (event rows x relation columns holding
    JSON-encoded lists) into a KGTK edge file.

    input_file: input handle resolved via KGTKArgumentParser.
    output_file: output handle for the KGTK edge file.
    Raises KGTKException wrapping any error encountered.

    NOTE(review): KGTKArgumentParser is referenced below but not imported
    locally -- presumably imported at module level; confirm.
    """
    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler, KGTKException
    import csv
    import re
    import json
    from pathlib import Path
    from string import Template
    import pandas as pd
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkwriter import KgtkWriter

    def make_node(x):
        # Turn a label into a node symbol: spaces -> underscores, "at:" prefix.
        und_x = x.replace(' ', '_')
        pref_und_x = 'at:%s' % und_x
        return pref_und_x

    def remove_people_mentions(event):
        # Strip ATOMIC person placeholders ("personx", "___", etc.) from an
        # event string, then trim whitespace.
        e = event.replace('personx', '').strip()
        e = e.replace('persony', '').strip()
        e = e.replace('person x', '').strip()
        e = e.replace('person y', '').strip()
        e = e.replace('the ___', '')
        e = e.replace('___', '')
        e = e.replace("'s", '')
        e = e.replace('to y', '')
        return e.strip()

    def produce_node_labels(event):
        # Produce one stringified label, or "label1|label2" when the
        # placeholder-free variant differs from the original.
        if '\t' in event:
            event = event.split('\t')[0]
        e1 = event.lower()
        e1 = e1.rstrip('.').strip()
        e2 = remove_people_mentions(e1)
        # FIX: collapse runs of blanks left behind by the removals above.
        # As previously written (replace a single space with a single
        # space) this loop could never terminate when a space was present.
        while '  ' in e2:
            e2 = e2.replace('  ', ' ')
        if e1 != e2 and e2:
            return '|'.join(
                [KgtkFormat.stringify(e1),
                 KgtkFormat.stringify(e2)])
        else:
            return KgtkFormat.stringify(e1)

    def produce_rel_label(rel):
        # Map an ATOMIC relation column name to a human-readable label.
        mapping = {
            'xAttr': 'person x has attribute',
            'oAttr': 'others have attribute',
            'xReact': 'person x feels',
            'oReact': 'others feel',
            'xIntent': 'person x wants',
            'xWant': 'person x wants',
            'oWant': 'others want',
            'xNeed': 'person x needs',
            'xEffect': 'effect on person x',
            'oEffect': 'the effect on others'
        }
        return KgtkFormat.stringify(mapping[rel])

    try:
        filename: Path = KGTKArgumentParser.get_input_file(input_file)
        out_columns = [
            'node1', 'relation', 'node2', 'node1;label', 'node2;label',
            'relation;label', 'relation;dimension', 'source', 'sentence'
        ]

        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)
        ew: KgtkWriter = KgtkWriter.open(
            out_columns,
            output_kgtk_file,
            #mode=input_kr.mode,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            gzip_in_parallel=False,
            #verbose=self.verbose,
            #very_verbose=self.very_verbose
        )

        df = pd.read_csv(filename, index_col=0)
        # The first nine columns hold JSON-encoded lists; decode them.
        df.iloc[:, :9] = df.iloc[:, :9].apply(
            lambda col: col.apply(json.loads))
        # Drop the last two columns (presumably split/metadata -- confirm
        # against the input schema).
        df.drop(df.columns[len(df.columns) - 1], axis=1, inplace=True)
        df.drop(df.columns[len(df.columns) - 1], axis=1, inplace=True)

        for event, row in df.iterrows():
            event_label = produce_node_labels(event)
            # Only the first alternative label forms the node symbol.
            first_event_label = KgtkFormat.unstringify(
                event_label.split('|')[0] if '|' in
                event_label else event_label)
            n1 = make_node(first_event_label)
            for c in df.columns:
                for v in row[c]:
                    if v == 'none':
                        continue
                    value_label = produce_node_labels(v)
                    first_value_label = KgtkFormat.unstringify(
                        value_label.split('|')[0] if '|' in
                        value_label else value_label)
                    n2 = make_node(first_value_label)
                    rel_label = produce_rel_label(c)
                    sentence = ''
                    relation = make_node(c)
                    this_row = [
                        n1, relation, n2, event_label, value_label,
                        rel_label, '',
                        KgtkFormat.stringify('AT'), sentence
                    ]
                    ew.write(this_row)

        # Clean up.
        ew.close()

    except Exception as e:
        raise KGTKException('Error: ' + str(e))