def process_qnode(self, kw: KgtkWriter, current_process_node_id: str, each_node_attributes: EACH_NODE_ATTRIBUTES) -> bool:
    """Write the lexicalized sentence edge for one qnode.

    Returns True when an edge was written, False when the qnode had no
    non-empty attribute buckets and was skipped.
    """
    # A qnode is interesting only when at least one attribute bucket is non-empty.
    has_content: bool = bool(each_node_attributes) and any(
        each_node_attributes[key] for key in each_node_attributes)
    if not has_content:
        return False

    concat_sentence, explanation = self.attribute_to_sentence(
        each_node_attributes, current_process_node_id)

    row: typing.List[str] = [
        current_process_node_id,
        self.sentence_label,
        KgtkFormat.stringify(concat_sentence),
    ]
    if self.explain:
        # An extra column records how the sentence was built.
        row.append(KgtkFormat.stringify(explanation))
    kw.write(row)
    return True
def join_tsv(self,
             values: typing.List[str],
             unquoted: bool = False,
             unescape_pipe: bool = True,
             csvlike: bool = False,
             ) -> str:
    """Join KGTK field values into one tab-separated output line.

    Datetimes are reformatted; string and language-qualified-string values
    are unstringified when `unquoted` or `csvlike` is set (with CSV-style
    re-quoting only for `csvlike` without `unquoted`); all other values have
    escaped pipes unescaped.
    """
    line: str = ""
    value: str
    for value in values:
        # TODO: Complain if the value is a KGTK List.
        if value.startswith(KgtkFormat.DATE_AND_TIMES_SIGIL):
            value = self.reformat_datetime(value)
        elif value.startswith((KgtkFormat.STRING_SIGIL,
                               KgtkFormat.LANGUAGE_QUALIFIED_STRING_SIGIL)):
            if unquoted or csvlike:
                # These two branches previously duplicated the same
                # unstringify/try/except logic; it now lives in one helper.
                value = self._unstringify_for_join(value, unescape_pipe)
                if csvlike and not unquoted:
                    # CSV-style quoting: wrap in double quotes, doubling
                    # any internal double quotes.
                    value = '"' + value.replace('"', '""') + '"'
            else:
                value = value.replace("\\|", "|")
        else:
            value = value.replace("\\|", "|")
        if len(line) > 0:
            line += "\t"
        line += value
    return line

def _unstringify_for_join(self, value: str, unescape_pipe: bool) -> str:
    """Unstringify a KGTK string value, reporting file and value on failure.

    What if the value is a list?  unstringify(...) will be unhappy.  The
    following hack protects strings (but not language-qualified strings)
    against errors, introducing an ambiguity when exporting lists.
    """
    value = value.replace('"|"', '|')
    try:
        return KgtkFormat.unstringify(
            value, unescape_pipe=unescape_pipe)  # Lose the language code.
    except ValueError as e:
        print("KgtkWriter: File %s: Error unstringifying %s" %
              (repr(self.file_path), repr(value)),
              file=self.error_file,
              flush=True)
        raise e
def produce_node_labels(event):
    """Produce KGTK string label(s) for an event node.

    Returns the normalized event text as a KGTK string; when removing
    people mentions yields a different non-empty variant, both are joined
    as a KGTK list with '|'.
    """
    # Keep only the first tab-separated field.
    if '\t' in event:
        event = event.split('\t')[0]
    e1 = event.lower()
    e1 = e1.rstrip('.').strip()
    e2 = remove_people_mentions(e1)
    # Collapse runs of spaces left behind by mention removal.  Each pass
    # shrinks a double space to a single one, so the loop terminates; the
    # previous identity replace(' ', ' ') never made progress and looped
    # forever whenever any space was present.
    while '  ' in e2:
        e2 = e2.replace('  ', ' ')
    if e1 != e2 and e2:
        return '|'.join(
            [KgtkFormat.stringify(e1), KgtkFormat.stringify(e2)])
    else:
        return KgtkFormat.stringify(e1)
def add_entity_label(self, node_id: str, node_label: str):
    """Record a label for `node_id`, preferring plain-English labels.

    Keeps the last-read English ("en", no suffix) label; otherwise keeps
    the first-read non-English label and counts later ones as ignored.
    """
    if node_label.startswith(("'", '"')):
        text, language, language_suffix = KgtkFormat.destringify(node_label)
    else:
        # A bare symbol: no language information attached.
        text, language, language_suffix = node_label, "", ""

    if language == "en" and language_suffix == "":
        # English: last one read wins, track reload vs. first load.
        if node_id in self.node_labels:
            self.english_labels_reloaded += 1
        else:
            self.english_labels_loaded += 1
        self.node_labels[node_id] = text
    elif node_id in self.node_labels:
        self.non_english_labels_ignored += 1
    else:
        # Non-English: only the first one read is kept (still stringified).
        self.node_labels[node_id] = node_label
        self.non_english_labels_loaded += 1
def join_csv(self,
             values: typing.List[str],
             unquoted: bool = False,
             ) -> str:
    """Join KGTK field values into one comma-separated output line.

    Strings are unstringified and (unless `unquoted`) re-quoted CSV-style;
    symbols containing a double quote or comma are quoted as well.
    """
    parts: typing.List[str] = []
    field: str
    for field in values:
        # TODO: Complain if the value is a KGTK List.
        if field.startswith(KgtkFormat.DATE_AND_TIMES_SIGIL):
            field = self.reformat_datetime(field)
        elif field.startswith((KgtkFormat.STRING_SIGIL,
                               KgtkFormat.LANGUAGE_QUALIFIED_STRING_SIGIL)):
            # What if the value is a list? unstringify(...) will be
            # unhappy. The following hack protects strings (but not
            # language-qualified strings) against errors, introducing
            # an ambiguity when exporting lists:
            field = field.replace('"|"', '|')
            field = KgtkFormat.unstringify(field)  # Lose the language code.
            # TODO: Complain if internal newline or carriage return.
            if not unquoted:
                field = '"' + field.replace('"', '""') + '"'
        else:
            field = field.replace("\\|", "|")
            if '"' in field or ',' in field:
                # A symbol with an internal double quote or comma: turn it into a string.
                field = '"' + field.replace('"', '""') + '"'
        parts.append(field)
    return ",".join(parts)
def edge2KGTK(edge: Tuple[str, str, str]) -> pd.Series:
    """
    Gets the edge as triple of subject, object, predicate and converts the edge to the KGTK format
    Args:
        edge: Tuple[str, str, str]
            input edge
    Returns: pd.Series
        pandas Series with keys according to KGTK format at
        https://docs.google.com/document/d/1fbbqgyX0N2EdxLam6hatfke1R-nZWkoN6M1oB_f4aQo/edit#heading=h.a5nlqev5bmm4
    """
    s, p, o = edge

    def clean(e: str) -> str:
        """Strip the namespace prefix, de-underscore, split camelCase, lowercase, stringify."""
        out = e.split(':')[-1].replace('_', ' ')
        # Raw strings: the previous "\g<1> \g<2>" literals were invalid
        # escape sequences (DeprecationWarning now, SyntaxError in future
        # Python versions).
        return KgtkFormat.stringify(
            re.sub(r"([a-z])([A-Z])", r"\g<1> \g<2>", out).strip().lower())

    return pd.Series({
        'node1': s,
        'relation': p,
        'node2': o,
        'node1;label': clean(s),
        'node2;label': clean(o),
        'relation;label': clean(p),
        'relation;dimension': '',
        'source': KgtkFormat.stringify('FN'),
        'sentence': ''
    })
def process_qnode_edge_qualifier(
        self, statement: typing.MutableMapping[str, typing.Any],
        edge_id: str, qualifier_row: typing.List[str]):
    """Attach one qualifier row to a statement's "qualifiers" mapping.

    Qualifiers are grouped by property; each entry records the property,
    datatype, optional hash, and either a snaktype or a datavalue.
    """
    qualifiers = statement.setdefault("qualifiers", dict())

    prop: str = qualifier_row[self.qual_label_idx]
    proplist: typing.List[typing.Mapping[str, typing.Any]] = \
        qualifiers.setdefault(prop, list())

    qualifier: typing.MutableMapping[str, typing.Any] = {"property": prop}
    proplist.append(qualifier)

    datatype: str = qualifier_row[self.qual_wikidatatype_idx]
    qualifier["datatype"] = datatype

    datahash: str = qualifier_row[self.qual_datahash_idx]
    if len(datahash) > 0:
        qualifier["hash"] = KgtkFormat.unstringify(datahash)

    value: str = qualifier_row[self.qual_node2_idx]
    if value in ("somevalue", "novalue"):
        # Wikidata's "unknown value" / "no value" markers carry no datavalue.
        qualifier["snaktype"] = value
    else:
        qualifier["datavalue"] = self.process_qual_datavalue(
            value, qualifier_row, datatype)
def add_sitelink(self, result: typing.MutableMapping[str, typing.Any],
                 edge_id: str,
                 qualifier_rows: typing.List[typing.List[str]]):
    """Assemble one sitelink entry from its qualifier rows.

    Collects the site, title, and badges from the qualifiers and stores
    them under result["sitelinks"][site].  Raises ValueError when the
    site or title qualifier is missing.
    """
    if "sitelinks" not in result:
        result["sitelinks"] = dict()
    sitelinks: typing.MutableMapping[str, typing.Mapping[str, typing.Union[
        str, typing.List[str]]]] = result["sitelinks"]

    site: str = ""
    title: str = ""
    badges: typing.List[str] = []

    for qual_row in qualifier_rows:
        key: str = qual_row[self.qual_label_idx]
        val: str = qual_row[self.qual_node2_idx]
        if key == "site":
            site = val
        elif key == "title":
            title = KgtkFormat.unstringify(val)
        elif key == "badge":
            badges.append(val)

    if len(site) == 0:
        # TODO: give a better error message.
        raise ValueError("Missing sitelink site for %s" % edge_id)
    if len(title) == 0:
        # TODO: give a better error message.
        raise ValueError("Missing sitelink title for %s" % edge_id)

    sitelinks[site] = {"site": site, "title": title, "badges": badges}
def create_edge(node1, node1_lbl, node2, node2_lbl, rel, rel_lbl, image_id):
    """Build a 9-column KGTK edge row for a Visual Genome ('VG') edge.

    Label lists are joined into KGTK multi-value fields with '|'.
    """
    return [
        node1,
        rel,
        node2,
        '|'.join(node1_lbl),
        '|'.join(node2_lbl),
        rel_lbl,
        '',
        KgtkFormat.stringify('VG'),
        '',
    ]
def create_edges(data, labels, rel, rel_label):
    """Build KGTK edge rows for every (node1, node2) pair in `data`.

    `data` maps a source synset name to an iterable of target synset
    names; `labels` maps synset names to their '|'-joined label lists.
    Returns a list of 9-column rows with source 'WN' (WordNet).
    """
    all_rows = []
    source = KgtkFormat.stringify('WN')
    for node1, targets in data.items():
        for node2 in targets:
            # NOTE: the original computed unused node1/node2 preflabels
            # (labels[...].split('|')[0]); those dead locals are removed.
            all_rows.append([
                'wn:' + node1, rel, 'wn:' + node2,
                labels[node1], labels[node2],
                rel_label, "", source, ''
            ])
    return all_rows
def row_to_edge(row, cols):
    """Convert one ConceptNet assertion row into a tab-separated KGTK edge line.

    `row` holds [uri, relation, node1, node2, json_metadata]; `cols` selects
    and orders the output fields.  The sentence comes from the metadata's
    'surfaceText' when present.
    """
    metadata = json.loads(row[4])
    edge = {
        'node1': row[2],
        'relation': row[1],
        'node2': row[3],
        'node1_label': make_node_label(row[2]),
        'node2_label': make_node_label(row[3]),
        'relation_label': make_rel_label(row[1]),
        'relation_dimension': '',
        'source': KgtkFormat.stringify('CN'),
    }
    if 'surfaceText' in metadata.keys():
        edge['sentence'] = KgtkFormat.stringify(
            metadata['surfaceText'].replace('\\', ''))
    else:
        edge['sentence'] = ''
    return '\t'.join(edge[col] for col in cols) + '\n'
def extract(input_file, output_file, source):
    """Copy a TSV edge file into KGTK form, adding ids and a source column.

    Reads tab-separated rows (skipping the input header), normalizes the
    relation ('same' -> 'Same'), synthesizes an edge id from the first
    three fields, and writes 10-column rows via print_edge.
    """
    with open(output_file, 'w') as w:
        columns = ['id', 'node1', 'relation', 'node2', 'node1;label',
                   'node2;label', 'relation;label', 'relation;dimension',
                   'source', 'sentence']
        w.write(print_edge(columns))
        with open(input_file, 'r') as f:
            next(f)  # skip the input header row
            for line in f:
                data = line.split('\t')
                data[1] = data[1].replace('same', 'Same')
                # Renamed from `id`, which shadowed the builtin; the unused
                # `rows` accumulator was also removed.
                edge_id = '-'.join(data[:3])
                new_row = [edge_id, *data[:3], "", "", "", "",
                           KgtkFormat.stringify(source), ""]
                w.write(print_edge(new_row))
def produce_rel_label(rel):
    """Map an ATOMIC relation code to its stringified human-readable label.

    Raises KeyError when the relation code is unknown.
    """
    relation_labels = {
        'xAttr': 'person x has attribute',
        'oAttr': 'others have attribute',
        'xReact': 'person x feels',
        'oReact': 'others feel',
        'xIntent': 'person x wants',
        'xWant': 'person x wants',
        'oWant': 'others want',
        'xNeed': 'person x needs',
        'xEffect': 'effect on person x',
        'oEffect': 'the effect on others'
    }
    return KgtkFormat.stringify(relation_labels[rel])
def row_to_edge(node1, rel, node2, source, cols):
    """Build a KGTK edge row, prefixing nodes with the lowercased source.

    `cols` selects and orders the returned fields.
    """
    ns = source.lower()
    edge = {
        'node1': ns + ':' + node1,
        'relation': rel,
        'node2': ns + ':' + node2,
        'node1;label': make_node_label(node1),
        'node2;label': make_node_label(node2),
        'relation;label': make_rel_label(rel),
        'relation;dimension': '',
        'source': KgtkFormat.stringify(source),
        'sentence': '',
    }
    return [edge[col] for col in cols]
def add_attr_to_map(
    self,
    attr_map: typing.MutableMapping[str, typing.Mapping[str, str]],
    attr: str,
    who: str,
):
    """Parse a language-qualified string attribute into `attr_map` keyed by language.

    Raises ValueError when `attr` is not a valid language-qualified string
    or carries no language code.  `who` names the owner for error messages.
    """
    kv: KgtkValue = KgtkValue(attr,
                              options=self.value_options,
                              parse_fields=False,
                              error_file=self.error_file,
                              verbose=self.verbose)
    if not kv.is_language_qualified_string(validate=True):
        # Fixed typo in the error message ("Invald" -> "Invalid").
        raise ValueError("Invalid attr %s for %s" % (attr, who))

    text: str
    language: str
    language_suffix: str
    text, language, language_suffix = KgtkFormat.destringify(kv.value)
    if len(language) == 0:
        raise ValueError("No attr language in %s for %s" % (attr, who))

    # The suffix (e.g. "-gb") is folded into the language key.
    lang: str = language + language_suffix
    attr_map[lang] = {"language": lang, "value": text}
def reformat_value_for_json(self, value: str) -> typing.Union[str, int, float, bool]:
    """Convert a KGTK value into a JSON-friendly Python value.

    Strings are unstringified, boolean symbols become bools, integer-like
    values become ints; everything else passes through unchanged.
    """
    # TODO: Complain if the value is a KGTK List.
    if value.startswith((KgtkFormat.STRING_SIGIL,
                         KgtkFormat.LANGUAGE_QUALIFIED_STRING_SIGIL)):
        # What if the value is a list? unstringify(...) will be
        # unhappy. The following hack protects strings (but not
        # language-qualified strings) against errors, introducing
        # an ambiguity when exporting lists:
        return KgtkFormat.unstringify(
            value.replace('"|"', '|'))  # Lose the language code.
    if value == KgtkFormat.TRUE_SYMBOL:
        return True
    if value == KgtkFormat.FALSE_SYMBOL:
        return False
    if value.isdigit():
        return int(value)
    if value.startswith(("+", "-")) and value[1:].isdigit():
        return int(value)
    # TODO: process floating point numbers.
    # TODO: process datetimes
    # TODO: process geolocations
    return value
def run(input_file: KGTKFiles, attr_syn_file: KGTKFiles, output_file: KGTKFiles):
    """Convert a Visual Genome scene-graph JSON file into a KGTK edge file.

    Reads the scene graph (`input_file`) and an attribute->synset mapping
    (`attr_syn_file`), then writes object/attribute edges (mw:MayHaveProperty
    for adjectives, /r/CapableOf for verbs) and object-pair proximity edges
    (/r/LocatedNear) through a KgtkWriter.
    """
    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import csv
    import json
    import re
    from pathlib import Path
    from collections import defaultdict
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkwriter import KgtkWriter

    def create_edge(node1, node1_lbl, node2, node2_lbl, rel, rel_lbl, image_id):
        # Build one 9-column KGTK row; label lists become '|'-joined KGTK lists.
        my_row = [
            node1, rel, node2, '|'.join(node1_lbl), '|'.join(node2_lbl),
            rel_lbl, '', KgtkFormat.stringify('VG'), ''
        ]
        return my_row

    try:
        scene_graph_filename: Path = KGTKArgumentParser.get_input_file(
            input_file)
        attr_synsets_filename: Path = KGTKArgumentParser.get_input_file(
            attr_syn_file)
        out_columns = [
            'node1', 'relation', 'node2', 'node1;label', 'node2;label',
            'relation;label', 'relation;dimension', 'source', 'sentence'
        ]
        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)
        ew: KgtkWriter = KgtkWriter.open(
            out_columns,
            output_kgtk_file,
            #mode=input_kr.mode,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            gzip_in_parallel=False,
            #verbose=self.verbose,
            #very_verbose=self.very_verbose
        )

        # Relation constants used for the emitted edges.
        proximity_relation = '/r/LocatedNear'
        property_relation = 'mw:MayHaveProperty'
        property_relation_label = KgtkFormat.stringify('may have property')
        capableof_relation = '/r/CapableOf'
        capableof_relation_label = KgtkFormat.stringify('capable of')

        with open(scene_graph_filename, 'r') as f:
            images_data = json.load(f)
        with open(attr_synsets_filename, 'r') as f:
            attr_synsets = json.load(f)

        for counter, an_image in enumerate(images_data):
            image_id = str(an_image['image_id'])

            # OBJECTS: map each object id to its stringified names and synsets.
            objid2names = defaultdict(list)
            objid2syns = {}
            rows = []  # de-duplicated edge rows for this image
            for o in an_image['objects']:
                obj_id = o['object_id']
                o_synset = o['synsets']
                objid2syns[obj_id] = o_synset
                for name in o['names']:
                    name = name.strip().lower().rstrip('.')
                    if not name:
                        continue
                    objid2names[obj_id].append(KgtkFormat.stringify(name))

                # ATTRIBUTES: only non-noun attributes with a known synset
                # produce edges; verbs -> CapableOf, adjectives -> MayHaveProperty.
                if 'attributes' in o.keys():
                    for attr in o['attributes']:
                        attr = attr.lower()
                        if attr in attr_synsets:
                            asyn = attr_synsets[attr]
                            apos = asyn.split('.')[1]  # part of speech from e.g. "old.a.01"
                            if apos != 'n':
                                if apos == 'v':  # verb
                                    for osyn in o_synset:
                                        if osyn != asyn:
                                            edge_row = create_edge(
                                                'wn:' + osyn,
                                                objid2names[obj_id],
                                                'wn:' + asyn,
                                                [KgtkFormat.stringify(attr)],
                                                capableof_relation,
                                                capableof_relation_label,
                                                image_id)
                                            if edge_row not in rows:
                                                rows.append(edge_row)
                                else:  #adjective
                                    for osyn in o_synset:
                                        if osyn != asyn:
                                            edge_row = create_edge(
                                                'wn:' + osyn,
                                                objid2names[obj_id],
                                                'wn:' + asyn,
                                                [KgtkFormat.stringify(attr)],
                                                property_relation,
                                                property_relation_label,
                                                image_id)
                                            if edge_row not in rows:
                                                rows.append(edge_row)

            # RELATIONS: every subject/object synset pair becomes a
            # /r/LocatedNear edge labeled with the predicate text.
            for rel in an_image['relationships']:
                #synsets=rel['synsets']
                relation_label = KgtkFormat.stringify(
                    rel['predicate'].lower().strip().strip('.'))
                sub_id = rel['subject_id']
                sub_names = objid2names[sub_id]
                sub_syns = objid2syns[sub_id]
                obj_id = rel['object_id']
                obj_names = objid2names[obj_id]
                obj_syns = objid2syns[obj_id]
                for ssyn in sub_syns:
                    for osyn in obj_syns:
                        if osyn != ssyn:
                            edge_row = create_edge('wn:' + ssyn, sub_names,
                                                   'wn:' + osyn, obj_names,
                                                   proximity_relation,
                                                   relation_label, image_id)
                            if edge_row not in rows:
                                rows.append(edge_row)

            for a_row in rows:
                ew.write(a_row)

        # Clean up
        ew.close()

    except Exception as e:
        kgtk_exception_auto_handler(e)
def load_property_labels_file(
    input_files: typing.List[str],
    error_file: typing.TextIO,
    reader_options: KgtkReaderOptions,
    value_options: KgtkValueOptions,
    label_filter: typing.List[str],
    verbose: bool = False,
):
    """Load node1 -> label mappings from one or more KGTK label files.

    Rows may be filtered by their label column (`label_filter`).  English
    labels ("en", no suffix) override earlier entries (last read wins);
    non-English labels are kept only when no label was seen yet.  Raises
    KGTKException when a required column cannot be identified.
    """
    labels_dict: typing.MutableMapping[str, str] = {}

    for each_file in input_files:
        kr: KgtkReader = KgtkReader.open(
            Path(each_file),
            error_file=error_file,
            options=reader_options,
            value_options=value_options,
            verbose=verbose,
        )

        # Verify the required columns exist before reading any rows.
        fail: bool = False
        if kr.node1_column_idx < 0:
            fail = True
            print("Cannot determine which column is node1 in %s" % each_file,
                  file=error_file,
                  flush=True)
        if len(label_filter) > 0 and kr.label_column_idx < 0:
            fail = True
            print("Cannot determine which column is label in %s" % each_file,
                  file=error_file,
                  flush=True)
        if kr.node2_column_idx < 0:
            fail = True
            print("Cannot determine which column is node2 in %s" % each_file,
                  file=error_file,
                  flush=True)
        if fail:
            raise KGTKException("Cannot identify a required column in %s" %
                                each_file)

        row: typing.List[str]
        for row in kr:
            if label_filter and row[kr.label_column_idx] not in label_filter:
                continue

            node_id: str = row[kr.node1_column_idx]
            node_label: str = row[kr.node2_column_idx]

            text: str
            language: str
            language_suffix: str
            if node_label.startswith(("'", '"')):
                text, language, language_suffix = KgtkFormat.destringify(
                    node_label)
            else:
                text, language, language_suffix = node_label, "", ""

            # The following code will take the last-read English label,
            # otherwise, the first-read non-English label.
            if language == "en" and language_suffix == "":
                labels_dict[node_id] = text
            elif node_id not in labels_dict:
                labels_dict[node_id] = node_label

        kr.close()

    return labels_dict
# Collect lexical nodes that share a label across the selected sources and
# chain them together with mw:SameAs identity edges.
sources = ['AT', 'RG', 'CN']
identity_rel = 'mw:SameAs'

# label text -> set of node ids carrying that label
lbl2ids = defaultdict(set)
with open(input_file, 'r') as f:
    header = next(f)  # keep the header line so it can be copied to the output
    for line in f:
        data = line.split('\t')
        # data layout: [id, node1, relation, node2, node1;label, node2;label, ..., source, ...]
        if check_source(data[8], sources):
            node1 = data[1]
            node2 = data[3]
            if lexical_node(node1):
                node1_label = data[4]
                lbl2ids[node1_label].add(node1)
            if lexical_node(node2):
                node2_label = data[5]
                lbl2ids[node2_label].add(node2)

print(len(lbl2ids))

with open('tmp/lexical_mappings.tsv', 'w') as w:
    w.write(header)
    for label, ids in lbl2ids.items():
        # Only labels shared by at least two nodes produce identity edges.
        if len(ids) <= 1:
            continue
        list_ids = list(ids)
        # Chain consecutive ids pairwise rather than emitting the full
        # pairwise product; identity is assumed transitive downstream.
        for i in range(len(list_ids) - 1):
            edge_id = '%s-%s-%s-1' % (list_ids[i], identity_rel,
                                      list_ids[i + 1])
            row = [
                edge_id, list_ids[i], identity_rel, list_ids[i + 1], '', '',
                '', '', KgtkFormat.stringify('LEX'), ''
            ]
            w.write('\t'.join(row) + '\n')
def make_rel_label(rel):
    """Label a relation URI: last path segment, camelCase split, stringified."""
    last_segment = rel.split('/')[-1]
    return KgtkFormat.stringify(split_camel_case(last_segment))
def make_node_label(node):
    """Label a node URI: fourth '/'-segment with underscores as spaces, stringified."""
    term = node.strip().split('/')[3]
    return KgtkFormat.stringify(term.replace('_', ' '))
def run(input_file: KGTKFiles, attr_syn_file: KGTKFiles):
    """Convert a Visual Genome scene-graph JSON file to KGTK edges on stdout.

    Same pipeline as the writer-based variant, but emits tab-separated lines
    directly to sys.stdout: attribute edges (mw:MayHaveProperty for
    adjectives, /r/CapableOf for verbs) and /r/LocatedNear proximity edges.
    """
    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import csv
    import json
    import re
    from pathlib import Path
    from collections import defaultdict
    from kgtk.kgtkformat import KgtkFormat

    out_columns = [
        'node1', 'relation', 'node2', 'node1_label', 'node2_label',
        'relation_label', 'relation_dimension', 'source', 'sentence'
    ]

    # Relation constants used for the emitted edges.
    proximity_relation = '/r/LocatedNear'
    property_relation = 'mw:MayHaveProperty'
    property_relation_label = KgtkFormat.stringify('may have property')
    capableof_relation = '/r/CapableOf'
    capableof_relation_label = KgtkFormat.stringify('capable of')

    def create_edge(node1, node1_lbl, node2, node2_lbl, rel, rel_lbl,
                    image_id):
        # Build one 9-column row and serialize it as a tab-separated line.
        my_row = [
            node1, rel, node2, '|'.join(node1_lbl), '|'.join(node2_lbl),
            rel_lbl, '', KgtkFormat.stringify('VG'), ''
        ]
        return '\t'.join(my_row) + '\n'

    def header_to_edge(row):
        # Column names use '_' internally; KGTK headers use ';'.
        row = [r.replace('_', ';') for r in row]
        return '\t'.join(row) + '\n'

    def create_uri(ns, rel):
        # Join a namespace prefix and a local name into a CURIE.
        return '%s:%s' % (ns, rel)

    try:
        scene_graph_filename: Path = KGTKArgumentParser.get_input_file(
            input_file)
        attr_synsets_filename: Path = KGTKArgumentParser.get_input_file(
            attr_syn_file)

        with open(scene_graph_filename, 'r') as f:
            images_data = json.load(f)

        with open(attr_synsets_filename, 'r') as f:
            attr_synsets = json.load(f)

        sys.stdout.write(header_to_edge(out_columns))

        for counter, an_image in enumerate(images_data):
            image_id = str(an_image['image_id'])

            # OBJECTS: map each object id to its stringified names and synsets.
            objid2names = defaultdict(list)
            objid2syns = {}
            rows = []  # de-duplicated edge lines for this image
            for o in an_image['objects']:
                obj_id = o['object_id']
                o_synset = o['synsets']
                objid2syns[obj_id] = o_synset
                for name in o['names']:
                    name = name.strip().lower().rstrip('.')
                    if not name:
                        continue
                    objid2names[obj_id].append(KgtkFormat.stringify(name))

                # ATTRIBUTES: only non-noun attributes with a known synset
                # produce edges; verbs -> CapableOf, adjectives -> MayHaveProperty.
                if 'attributes' in o.keys():
                    for attr in o['attributes']:
                        attr = attr.lower()
                        if attr in attr_synsets:
                            asyn = attr_synsets[attr]
                            apos = asyn.split('.')[1]  # part of speech from e.g. "old.a.01"
                            if apos != 'n':
                                if apos == 'v':  # verb
                                    for osyn in o_synset:
                                        if osyn != asyn:
                                            edge_row = create_edge(
                                                'wn:' + osyn,
                                                objid2names[obj_id],
                                                'wn:' + asyn,
                                                [KgtkFormat.stringify(attr)],
                                                capableof_relation,
                                                capableof_relation_label,
                                                image_id)
                                            if edge_row not in rows:
                                                rows.append(edge_row)
                                else:  #adjective
                                    for osyn in o_synset:
                                        if osyn != asyn:
                                            edge_row = create_edge(
                                                'wn:' + osyn,
                                                objid2names[obj_id],
                                                'wn:' + asyn,
                                                [KgtkFormat.stringify(attr)],
                                                property_relation,
                                                property_relation_label,
                                                image_id)
                                            if edge_row not in rows:
                                                rows.append(edge_row)

            # RELATIONS: every subject/object synset pair becomes a
            # /r/LocatedNear edge labeled with the predicate text.
            for rel in an_image['relationships']:
                #synsets=rel['synsets']
                relation_label = KgtkFormat.stringify(
                    rel['predicate'].lower().strip().strip('.'))
                sub_id = rel['subject_id']
                sub_names = objid2names[sub_id]
                sub_syns = objid2syns[sub_id]
                obj_id = rel['object_id']
                obj_names = objid2names[obj_id]
                obj_syns = objid2syns[obj_id]
                for ssyn in sub_syns:
                    for osyn in obj_syns:
                        if osyn != ssyn:
                            edge_row = create_edge('wn:' + ssyn, sub_names,
                                                   'wn:' + osyn, obj_names,
                                                   proximity_relation,
                                                   relation_label, image_id)
                            if edge_row not in rows:
                                rows.append(edge_row)

            for a_row in rows:
                sys.stdout.write(a_row)

    except Exception as e:
        kgtk_exception_auto_handler(e)
def process_row(
    self,
    node_id: str,
    node_property: str,
    node_value: str,
    each_node_attributes: EACH_NODE_ATTRIBUTES,
):
    """Fold one (node, property, value) edge into the node's attribute buckets.

    The property is mapped through self.properties_reversed to a set of
    roles; the (label-resolved) value is appended/added to each role's
    bucket in `each_node_attributes`.  Property-value roles store the
    combined "{property} {value}" text.  Unmapped properties are recorded
    under has_properties when self.add_all_properties is set.  Mutates
    `each_node_attributes` in place; returns None.
    """
    if self.very_verbose:
        print("Processing row (%s, %s, %s)" %
              (repr(node_id), repr(node_property), repr(node_value)),
              file=self.error_file,
              flush=True)

    # CMR: the following code looks like it was intended to remove
    # any language code and language suffix.  It would have the
    # side effect of removing location coordinates entirely.
    #
    # remove @ mark
    # if "@" in node_value and node_value[0] != "@":
    #     node_value = node_value[:node_value.index("@")]

    # CMR: Better to use KgtkFormat.unstringify(node_value), as it will remove escapes from
    # internal double or single quotes.
    #
    # remove extra double quote " and single quote '
    # while len(node_value) >= 3 and node_value[0] == '"' and node_value[-1] == '"':
    #     node_value = node_value[1:-1]
    # while len(node_value) >= 3 and node_value[0] == "'" and node_value[-1] == "'":
    #     node_value = node_value[1:-1]

    if node_value.startswith(("'", '"')):
        node_value = KgtkFormat.unstringify(node_value)

    # in case we meet an empty value, skip it
    if node_value == "":
        self._logger.warning(
            """Skip line ({}, {}, {}) because of empty value.""".format(
                node_id, node_property, node_value))
        return

    if self.very_verbose:
        print("Revised node_value = %s" % repr(node_value),
              file=self.error_file,
              flush=True)

    if node_property in self.properties_reversed:
        if self.very_verbose:
            print("node_property %s is in self.properties_reversed" %
                  repr(node_property),
                  file=self.error_file,
                  flush=True)
        # Copy so role removal below doesn't mutate the shared mapping.
        roles = self.properties_reversed[node_property].copy()
        node_value = self.get_real_label_name(node_value)
        if self.very_verbose:
            print("node_value label = %s" % repr(node_value),
                  file=self.error_file,
                  flush=True)

        # if we get property_values, it should be saved to isa-properties part
        if self.PROPERTY_VALUES in roles:
            if self.very_verbose:
                print("property_values is in roles",
                      file=self.error_file,
                      flush=True)

            # for property values part, changed to be "{property} {value}"
            node_value_combine = self.get_real_label_name(
                node_property) + " " + self.get_real_label_name(node_value)
            if self.very_verbose:
                print("node_value_combine = %s" % repr(node_value_combine),
                      file=self.error_file,
                      flush=True)

            if each_node_attributes is None:
                raise ValueError("each_node_attributes is missing")
            property_values: typing.Optional[
                Lexicalize.ATTRIBUTE_TYPES] = each_node_attributes[
                    self.PROPERTY_VALUES]
            if isinstance(property_values, list):
                property_values.append(node_value_combine)
            else:
                raise ValueError(
                    'each_node_attributes["property_values"] is not a list.'
                )
            if self.very_verbose:
                print('each_node_attributes["property_values"] = %s' %
                      repr(property_values),
                      file=self.error_file,
                      flush=True)

            # remove those 2 roles in case we have duplicate using of this node later
            roles.discard(self.PROPERTY_VALUES)
            roles.discard(self.HAS_PROPERTIES)

        # Record the value under every remaining role; buckets may be
        # sets (de-duplicating) or lists (order-preserving).
        for each_role in roles:
            attrs: Lexicalize.ATTRIBUTE_TYPES = each_node_attributes[
                each_role]
            if isinstance(attrs, set):
                attrs.add(node_value)
            elif isinstance(attrs, list):
                attrs.append(node_value)
            else:
                raise ValueError(
                    'each_node_attributes[%s] is not a list or set.' %
                    repr(each_role))
            if self.very_verbose:
                print("%s: %s" % (each_role, repr(attrs)),
                      file=self.error_file,
                      flush=True)

    elif self.add_all_properties:  # add remained properties if need all properties
        if self.very_verbose:
            print("self.add_all_properties is True",
                  file=self.error_file,
                  flush=True)
        attrs2: Lexicalize.ATTRIBUTE_TYPES = each_node_attributes[
            self.HAS_PROPERTIES]
        if isinstance(attrs2, list):
            attrs2.append(self.get_real_label_name(node_property))
            if self.very_verbose:
                print("has_properties: %s" % repr(attrs2),
                      file=self.error_file,
                      flush=True)
        else:
            raise ValueError(
                'each_node_attributes["has_properties"] is not a list.')

    return
def implode_language_qualified_string(
        self,
        input_line_count: int,
        row: typing.List[str],
        implosion: typing.Mapping[str, int],
        type_name: str,
) -> typing.Tuple[str, bool]:
    """Reassemble a KGTK language-qualified string from its exploded fields.

    Looks up the text, language, and language-suffix fields in `row` via
    the `implosion` column map, validates them, and rebuilds the value
    with KgtkFormat.stringify.  Returns (value, valid); on failure the
    value is "" and diagnostics go to self.error_file when self.verbose.
    """
    valid: bool = True

    # The text field must be a non-trivial double-quoted string.
    text_idx: int = implosion[KgtkValueFields.TEXT_FIELD_NAME]
    text_val: str = row[text_idx]
    if len(text_val) == 0:
        valid = False
        if self.verbose:
            print("Input line %d: data type '%s': %s field is empty" %
                  (input_line_count, type_name,
                   KgtkValueFields.TEXT_FIELD_NAME),
                  file=self.error_file,
                  flush=True)
    elif len(text_val) == 1:
        valid = False
        if self.verbose:
            print("Input line %d: data type '%s': %s field is too short" %
                  (input_line_count, type_name,
                   KgtkValueFields.TEXT_FIELD_NAME),
                  file=self.error_file,
                  flush=True)
    else:
        if not text_val.startswith('"'):
            valid = False
            if self.verbose:
                print(
                    "Input line %d: data type '%s': %s field does not start with a double quote"
                    % (input_line_count, type_name,
                       KgtkValueFields.TEXT_FIELD_NAME),
                    file=self.error_file,
                    flush=True)
        if not text_val.endswith('"'):
            valid = False
            if self.verbose:
                print(
                    "Input line %d: data type '%s': %s field does not end with a double quote"
                    % (input_line_count, type_name,
                       KgtkValueFields.TEXT_FIELD_NAME),
                    file=self.error_file,
                    flush=True)

    # A language code is required for this data type.
    language_idx: int = implosion[KgtkValueFields.LANGUAGE_FIELD_NAME]
    language_val: str = self.unwrap(row[language_idx])
    if len(language_val) == 0:
        valid = False
        if self.verbose:
            print("Input line %d: data type '%s': %s field is empty" %
                  (input_line_count, type_name,
                   KgtkValueFields.LANGUAGE_FIELD_NAME),
                  file=self.error_file,
                  flush=True)

    # The language suffix is optional (its column may be absent entirely).
    suf_idx: int = implosion[KgtkValueFields.LANGUAGE_SUFFIX_FIELD_NAME]
    suf: str = self.unwrap(row[suf_idx]) if suf_idx >= 0 else ""
    if len(suf) > 0 and not suf.startswith("-"):
        # As a special favor, we'll accept language suffixes that do not
        # start with a dash.  We'll prepend the dash.
        suf = "-" + suf

    value: str = ""
    if valid:
        # This subterfuge uses Python's literal parser to parse the string.
        if not self.escape_pipes:
            # ast.literal_eval(...) doesn't treat backslash pipe (\|) as an escaped pipe (|).
            # (this is documented behavior) so we will remove escaped pipes manually.
            text_val = text_val.replace('\\|', '|')
        value = KgtkFormat.stringify(ast.literal_eval(text_val),
                                     language=language_val,
                                     language_suffix=suf)

    if valid and self.validate:
        # Optionally re-validate the reassembled value.
        kv: KgtkValue = KgtkValue(value, options=self.value_options)
        valid = kv.is_language_qualified_string(validate=True)
        if not valid:
            if self.verbose:
                print(
                    "Input line %d: data type '%s': imploded value '%s' is not a valid language qualified string."
                    % (input_line_count, type_name, value),
                    file=self.error_file,
                    flush=True)

    return value, valid
def implode_string(
        self,
        input_line_count: int,
        row: typing.List[str],
        implosion: typing.Mapping[str, int],
        type_name: str,
) -> typing.Tuple[str, bool]:
    """Reassemble a plain KGTK string from its exploded fields.

    When a non-empty language field is present, either delegates to
    implode_language_qualified_string (if self.general_strings) or marks
    the row invalid.  Returns (value, valid); on failure the value is ""
    and diagnostics go to self.error_file when self.verbose.
    """
    valid: bool = True

    # A plain string should not carry a language code.
    if KgtkValueFields.LANGUAGE_FIELD_NAME in implosion:
        language_idx: int = implosion[KgtkValueFields.LANGUAGE_FIELD_NAME]
        if language_idx >= 0:
            language_val: str = self.unwrap(row[language_idx])
            if len(language_val) > 0:
                if self.general_strings:
                    # Treat the row as a language-qualified string instead.
                    return self.implode_language_qualified_string(
                        input_line_count, row, implosion, type_name)
                else:
                    valid = False
                    if self.verbose:
                        print(
                            "Input line %d: data type '%s': %s field is not empty"
                            % (input_line_count, type_name,
                               KgtkValueFields.LANGUAGE_FIELD_NAME),
                            file=self.error_file,
                            flush=True)

    # The text field must be a non-trivial double-quoted string.
    text_idx: int = implosion[KgtkValueFields.TEXT_FIELD_NAME]
    text_val: str = row[text_idx]
    if len(text_val) == 0:
        valid = False
        if self.verbose:
            print("Input line %d: data type '%s': %s field is empty" %
                  (input_line_count, type_name,
                   KgtkValueFields.TEXT_FIELD_NAME),
                  file=self.error_file,
                  flush=True)
    elif len(text_val) == 1:
        valid = False
        if self.verbose:
            print("Input line %d: data type '%s': %s field is too short" %
                  (input_line_count, type_name,
                   KgtkValueFields.TEXT_FIELD_NAME),
                  file=self.error_file,
                  flush=True)
    else:
        if not text_val.startswith('"'):
            valid = False
            if self.verbose:
                print(
                    "Input line %d: data type '%s': %s field does not start with a double quote"
                    % (input_line_count, type_name,
                       KgtkValueFields.TEXT_FIELD_NAME),
                    file=self.error_file,
                    flush=True)
        if not text_val.endswith('"'):
            valid = False
            if self.verbose:
                print(
                    "Input line %d: data type '%s': %s field does not end with a double quote"
                    % (input_line_count, type_name,
                       KgtkValueFields.TEXT_FIELD_NAME),
                    file=self.error_file,
                    flush=True)

    value: str = ""
    if valid:
        # This subterfuge uses Python's literal parser to parse the string.
        if not self.escape_pipes:
            # ast.literal_eval(...) doesn't treat backslash pipe (\|) as an escaped pipe (|).
            # (this is documented behavior) so we will remove escaped pipes manually.
            text_val = text_val.replace('\\|', '|')
        value = KgtkFormat.stringify(ast.literal_eval(text_val))

    if valid and self.validate:
        # Optionally re-validate the reassembled value.
        kv: KgtkValue = KgtkValue(value, options=self.value_options)
        valid = kv.is_string(validate=True)
        if not valid:
            if self.verbose:
                print(
                    "Input line %d: data type '%s': imploded value '%s' is not a valid string."
                    % (input_line_count, type_name, value),
                    file=self.error_file,
                    flush=True)

    return value, valid
def clean(e: str) -> str:
    """Normalize an identifier into a stringified readable label.

    Strips the namespace prefix (text before the last ':'), turns
    underscores into spaces, splits camelCase at lower->upper boundaries,
    trims, and lowercases before stringifying.
    """
    out = e.split(':')[-1].replace('_', ' ')
    # Raw strings: the previous "\g<1> \g<2>" literals were invalid escape
    # sequences (DeprecationWarning now, SyntaxError in future Python).
    return KgtkFormat.stringify(
        re.sub(r"([a-z])([A-Z])", r"\g<1> \g<2>", out).strip().lower())
def run(output_file: KGTKFiles):
    """Extract WordNet relations into a KGTK edge file.

    Downloads WordNet via NLTK, walks every synset, and writes /r/IsA
    (hypernym), /r/PartOf (member and part holonym), and /r/MadeOf
    (substance meronym) edges through a KgtkWriter.
    """
    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import json
    import nltk
    nltk.download("wordnet")
    from nltk.corpus import wordnet as wn
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkwriter import KgtkWriter

    def obtain_wordnet_lemmas(syn):
        # Stringified lemma labels with underscores replaced by spaces.
        lemmas = []
        for lemma in syn.lemma_names():
            lemmas.append(KgtkFormat.stringify(lemma.replace('_', ' ')))
        return lemmas

    def obtain_hypernyms(syn):
        # Names of the synset's hypernyms (broader concepts).
        hyps = []
        for hypernym in syn.hypernyms():
            hyps.append(hypernym.name())
        return hyps

    def obtain_member_holonyms(syn):
        # Names of wholes this synset is a member of.
        hols = []
        for hol in syn.member_holonyms():
            hols.append(hol.name())
        return hols

    def obtain_part_holonyms(syn):
        # Names of wholes this synset is a part of.
        hols = []
        for hol in syn.part_holonyms():
            hols.append(hol.name())
        return hols

    def obtain_substance_meronyms(syn):
        # Names of substances this synset is made of.
        hols = []
        for hol in syn.substance_meronyms():
            hols.append(hol.name())
        return hols

    def get_wn_data():
        # Walk all synsets once, collecting labels and each relation map.
        syns = list(wn.all_synsets())
        all_labels = {}
        all_hyps = {}
        all_members = {}
        all_parts = {}
        all_subs = {}
        for syn in syns:
            syn_name = syn.name()
            lemmas = obtain_wordnet_lemmas(syn)
            all_labels[syn_name] = '|'.join(lemmas)

            hypernyms = obtain_hypernyms(syn)
            if len(hypernyms):
                all_hyps[syn_name] = hypernyms

            member_holonyms = obtain_member_holonyms(syn)
            if len(member_holonyms):
                all_members[syn_name] = member_holonyms

            part_holonyms = obtain_part_holonyms(syn)
            if len(part_holonyms):
                all_parts[syn_name] = part_holonyms

            substance_meronyms = obtain_substance_meronyms(syn)
            if len(substance_meronyms):
                all_subs[syn_name] = substance_meronyms

        return all_labels, all_hyps, all_members, all_parts, all_subs

    def create_edges(data, labels, rel, rel_label):
        # Expand each (node1 -> [node2...]) mapping into 9-column rows.
        all_rows = []
        source = KgtkFormat.stringify('WN')
        for node1, v in data.items():
            for node2 in v:
                node1_preflabel = labels[node1].split('|')[0]
                node2_preflabel = labels[node2].split('|')[0]
                a_row = [
                    'wn:' + node1, rel, 'wn:' + node2, labels[node1],
                    labels[node2], rel_label, "", source, ''
                ]
                all_rows.append(a_row)
        return all_rows

    try:
        out_columns = [
            'node1', 'relation', 'node2', 'node1;label', 'node2;label',
            'relation;label', 'relation;dimension', 'source', 'sentence'
        ]

        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)

        ew: KgtkWriter = KgtkWriter.open(
            out_columns,
            output_kgtk_file,
            #mode=input_kr.mode,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            gzip_in_parallel=False,
            #verbose=self.verbose,
            #very_verbose=self.very_verbose
        )

        all_labels, all_hyps, all_members, all_parts, all_subs = get_wn_data()

        # One edge batch per relation type.
        hyp_edges = create_edges(all_hyps, all_labels, '/r/IsA',
                                 KgtkFormat.stringify('is a'))
        member_edges = create_edges(all_members, all_labels, '/r/PartOf',
                                    KgtkFormat.stringify('is a part of'))
        part_edges = create_edges(all_parts, all_labels, '/r/PartOf',
                                  KgtkFormat.stringify('is a part of'))
        sub_edges = create_edges(all_subs, all_labels, '/r/MadeOf',
                                 KgtkFormat.stringify('is made of'))

        all_edges = hyp_edges + member_edges + part_edges + sub_edges

        for edge in all_edges:
            ew.write(edge)

        # Clean up.
        ew.close()

    except Exception as e:
        kgtk_exception_auto_handler(e)
def obtain_wordnet_lemmas(syn):
    """Return the lemma names of *syn* as KGTK string literals.

    Underscores in each lemma name are replaced with spaces before the
    name is wrapped via KgtkFormat.stringify().
    """
    return [KgtkFormat.stringify(name.replace('_', ' '))
            for name in syn.lemma_names()]
def process_qual_datavalue(self, value: str, qual_row: typing.List[str], datatype: str):
    """Build a Wikidata-style "datavalue" JSON structure for one qualifier value.

    value: the KGTK value string from the qualifier row.
    qual_row: the full qualifier row, indexed via the self.qual_*_idx columns.
    datatype: the qualifier's datatype (unused in this body).

    Returns a dict with "type" (taken from the row's value-type column, unless
    overridden below) and "value" (a map whose keys depend on the kind of
    KGTK value parsed from *value*).

    Raises ValueError when a required KGTK value field is missing.
    """
    datavalue: typing.MutableMapping[str, typing.Union[str, typing.Mapping[str, typing.Optional[typing.Union[str, int, float]]]]] = dict()
    datavalue["type"] = qual_row[self.qual_val_type_idx]

    valuemap: typing.MutableMapping[str, typing.Optional[typing.Union[str, int, float]]] = dict()
    datavalue["value"] = valuemap

    # Entity references (rows with a nonempty entity-type column) are handled
    # directly, without parsing the value as a KgtkValue.
    entity_type: str = qual_row[self.qual_entity_type_idx]
    if len(entity_type) > 0:
        valuemap["entity-type"] = entity_type
        valuemap["id"] = value

        # TODO: Is this the right thing to do for Q16097-F1?
        # Strip the leading letter (e.g. Q/P) and any "-" suffix to obtain
        # the numeric portion of the entity identifier.
        numeric_id: str = value[1:]
        if "-" in numeric_id:
            numeric_id = numeric_id[:numeric_id.index("-")]
        valuemap["numeric-id"] = int(numeric_id)
        return datavalue

    kv = KgtkValue(value, options=self.value_options, parse_fields=True,
                   error_file=self.error_file, verbose=self.verbose)
    if not kv.validate():
        # Invalid values are only warned about; processing continues below.
        # raise ValueError("Invalid KGTK value '%s'" % value)
        print("Warning: Invalid KGTK value '%s'" % value,
              file=self.error_file, flush=True)
    if kv.fields is None:
        raise ValueError("KGTK value %s is missing fields." % value)

    if kv.is_number():
        if kv.fields.numberstr is None:
            raise ValueError("number is missing numberstr for %s." % value)
        valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign
        # A bare number is emitted as a dimensionless quantity with unit "1".
        valuemap["unit"] = "1"
        return datavalue

    if kv.is_quantity():
        if kv.fields.numberstr is None:
            raise ValueError("quantity is missing numberstr for %s." % value)
        valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign

        if kv.fields.units_node is None:
            # TODO: Research this further. Why did we get here? Is it because
            # import_wikidata dropped the units?
            #
            # raise ValueError("quantity is missing units_node for %s in: %s" % (value, " ".join(qual_row)))
            valuemap["unit"] = "undefined"
        else:
            valuemap["unit"] = "http://www.wikidata.org/entity/" + kv.fields.units_node

        if kv.fields.low_tolerancestr is not None and len(kv.fields.low_tolerancestr) > 0:
            valuemap["lowerBound"] = kv.fields.low_tolerancestr  # TODO: add plus sign

        if kv.fields.high_tolerancestr is not None and len(kv.fields.high_tolerancestr) > 0:
            # NOTE(review): Wikidata's JSON model names this key "upperBound";
            # confirm whether downstream consumers expect "higherBound" before
            # renaming it.
            valuemap["higherBound"] = kv.fields.high_tolerancestr  # TODO: add plus sign
        return datavalue

    if kv.is_language_qualified_string():
        text: str
        language: str
        language_suffix: str
        text, language, language_suffix = KgtkFormat.destringify(value)  # TODO: KgtkValue should do this to text
        # Reattach the language suffix to the base language code.
        language += language_suffix
        valuemap["text"] = text
        valuemap["language"] = language
        return datavalue

    if kv.is_string():
        # Plain strings override the row's value-type column with "string".
        valuemap["type"] = "string"
        valuemap["value"] = KgtkFormat.unstringify(value)  # TODO: KgtkValue should do this to text
        return datavalue

    if kv.is_date_and_times():
        if kv.fields.zonestr is None:
            raise ValueError("timezone is missing from %s." % value)
        if kv.fields.zonestr != "Z":
            raise ValueError("Only Z-time is supported, error in %s." % value)
        if kv.fields.date_and_time is None:
            raise ValueError("date_and_time is missing from %s." % value)
        valuemap["time"] = kv.fields.date_and_time
        # Only UTC is supported (checked above), so timezone/before/after are 0.
        valuemap["timezone"] = 0
        valuemap["before"] = 0
        valuemap["after"] = 0
        if kv.fields.precision is None:
            raise ValueError("date_and_time precision is missing from %s." % value)
        valuemap["precision"] = kv.fields.precision
        valuemap["calendarmodel"] = "http://www.wikidata.org/entity/" + qual_row[self.qual_calendar_idx]
        return datavalue

    if kv.is_location_coordinates():
        if kv.fields.latitude is None:
            raise ValueError("latitude is missing from %s" % value)
        valuemap["latitude"] = kv.fields.latitude
        if kv.fields.longitude is None:
            raise ValueError("longitude is missing from %s" % value)
        valuemap["longitude"] = kv.fields.longitude
        # NOTE(review): "altitide" looks like a typo for Wikidata's deprecated
        # "altitude" field; confirm consumers before renaming the key.
        valuemap["altitide"] = None  # deprecated
        valuemap["precision"] = float(qual_row[self.qual_precision_idx])
        # Assumes the coordinate is on Earth (Q2) — TODO confirm for other globes.
        valuemap["globe"] = "http://www.wikidata.org/entity/Q2"
        return datavalue

    # Default: convert the symbol to a string.
    valuemap["type"] = "string"
    valuemap["value"] = KgtkFormat.unstringify('"' + value + '"')  # TODO: KgtkValue should do this to text
    return datavalue
def make_node_label(node):
    """Return a KGTK string literal for *node* with its 3-character prefix
    (e.g. "wn:") removed."""
    bare_name = node[3:]
    return KgtkFormat.stringify(bare_name)