Example #1
    def process_qnode(self, kw: KgtkWriter, current_process_node_id: str,
                      each_node_attributes: EACH_NODE_ATTRIBUTES) -> bool:
        interesting_qnode: bool = False
        if each_node_attributes:
            for k in each_node_attributes:
                if each_node_attributes[k]:
                    interesting_qnode = True
                    break
        if not interesting_qnode:
            return False

        concat_sentence: str
        explanation: str
        concat_sentence, explanation = self.attribute_to_sentence(
            each_node_attributes, current_process_node_id)
        if self.explain:
            kw.write([
                current_process_node_id, self.sentence_label,
                KgtkFormat.stringify(concat_sentence),
                KgtkFormat.stringify(explanation)
            ])
        else:
            kw.write([
                current_process_node_id, self.sentence_label,
                KgtkFormat.stringify(concat_sentence)
            ])
        return True

    def join_tsv(
        self,
        values: typing.List[str],
        unquoted: bool = False,
        unescape_pipe: bool = True,
        csvlike: bool = False,
    ) -> str:
        line: str = ""
        value: str
        for value in values:
            # TODO: Complain if the value is a KGTK List.
            if value.startswith(KgtkFormat.DATE_AND_TIMES_SIGIL):
                value = self.reformat_datetime(value)

            elif value.startswith(
                (KgtkFormat.STRING_SIGIL,
                 KgtkFormat.LANGUAGE_QUALIFIED_STRING_SIGIL)):
                if unquoted:
                    # What if the value is a list? unstringify(...) will be
                    # unhappy.  The following hack protects strings (but not
                    # language-qualified strings) against errors, introducing
                    # an ambiguity when exporting lists:
                    value = value.replace('"|"', '|')
                    try:
                        value = KgtkFormat.unstringify(
                            value, unescape_pipe=unescape_pipe
                        )  # Lose the language code.
                    except ValueError as e:
                        print("KgtkWriter: File %s: Error unstringifying %s" %
                              (repr(self.file_path), repr(value)),
                              file=self.error_file,
                              flush=True)
                        raise e
                elif csvlike:
                    # What if the value is a list? unstringify(...) will be
                    # unhappy.  The following hack protects strings (but not
                    # language-qualified strings) against errors, introducing
                    # an ambiguity when exporting lists:
                    value = value.replace('"|"', '|')
                    try:
                        value = KgtkFormat.unstringify(
                            value, unescape_pipe=unescape_pipe
                        )  # Lose the language code.
                    except ValueError as e:
                        print("KgtkWriter: File %s: Error unstringifying %s" %
                              (repr(self.file_path), repr(value)),
                              file=self.error_file,
                              flush=True)
                        raise e
                    value = '"' + value.replace('"', '""') + '"'

                else:
                    value = value.replace("\\|", "|")
            else:
                value = value.replace("\\|", "|")

            if len(line) > 0:
                line += "\t"
            line += value
        return line
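A note on the '"|"' replacement above: KGTK lists join their items with a pipe, and unstringify() only accepts a single value, so join_tsv() first collapses the list. A minimal sketch of the behavior and of the ambiguity the comments warn about, assuming only the KgtkFormat.stringify/unstringify calls used throughout these examples:

from kgtk.kgtkformat import KgtkFormat

# A KGTK list joins the stringified items with a pipe:
kgtk_list = '|'.join([KgtkFormat.stringify('cat'), KgtkFormat.stringify('dog')])
# kgtk_list == '"cat"|"dog"'

# Collapse the list with the same hack used in join_tsv() above, then unstringify:
print(KgtkFormat.unstringify(kgtk_list.replace('"|"', '|')))    # expected: cat|dog

# The ambiguity: a single string whose text contains a pipe exports to the
# same thing, so the two cases cannot be told apart downstream.
print(KgtkFormat.unstringify(KgtkFormat.stringify('cat|dog')))  # expected: cat|dog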
Example #3
 def produce_node_labels(event):
     if '\t' in event:
         event = event.split('\t')[0]
     e1 = event.lower()
     e1 = e1.rstrip('.').strip()
     e2 = remove_people_mentions(e1)
     while '  ' in e2:
         e2 = e2.replace('  ', ' ')
     if e1 != e2 and e2:
         return '|'.join(
             [KgtkFormat.stringify(e1),
              KgtkFormat.stringify(e2)])
     else:
         return KgtkFormat.stringify(e1)
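remove_people_mentions() is not shown here; assuming it strips ATOMIC-style "personx"/"persony" mentions, a hypothetical call would produce two pipe-joined KGTK strings:

# Hypothetical: assumes remove_people_mentions('personx bakes bread') == 'bakes bread'.
print(produce_node_labels('PersonX bakes bread.\textra column'))
# expected: "personx bakes bread"|"bakes bread"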
Example #4
    def add_entity_label(self, node_id: str, node_label: str):
        text: str
        language: str
        language_suffix: str
        if node_label.startswith(("'", '"')):
            text, language, language_suffix = KgtkFormat.destringify(
                node_label)
        else:
            text = node_label
            language = ""
            language_suffix = ""

        # The following code will take the last-read English label,
        # otherwise, the first-read non-English label.
        if language == "en" and language_suffix == "":
            if node_id in self.node_labels:
                self.english_labels_reloaded += 1
            else:
                self.english_labels_loaded += 1
            self.node_labels[node_id] = text
        else:
            if node_id not in self.node_labels:
                self.node_labels[node_id] = node_label
                self.non_english_labels_loaded += 1
            else:
                self.non_english_labels_ignored += 1
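The precedence rule above is order dependent: the last-read plain English label wins, and a non-English label is kept only until any English label arrives. A hypothetical call sequence ("loader" is an object exposing add_entity_label() and node_labels as above):

loader.add_entity_label('Q42', KgtkFormat.stringify('Douglas Adams', language='fr'))
loader.add_entity_label('Q42', KgtkFormat.stringify('Douglas Adams', language='en'))
loader.add_entity_label('Q42', KgtkFormat.stringify('Douglas Adams (writer)', language='en'))
# node_labels['Q42'] now holds the plain text of the last English label,
# 'Douglas Adams (writer)'; the earlier French label was overwritten, and any
# further non-English labels for Q42 would be counted as ignored.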
Example #5
    def join_csv(self, values: typing.List[str],
                 unquoted: bool = False,
                 ) -> str:
        line: str = ""
        value: str
        for value in values:
            # TODO: Complain if the value is a KGTK List.
            if value.startswith(KgtkFormat.DATE_AND_TIMES_SIGIL):
                value = self.reformat_datetime(value)

            elif value.startswith((KgtkFormat.STRING_SIGIL, KgtkFormat.LANGUAGE_QUALIFIED_STRING_SIGIL)):
                # What if the value is a list? unstringify(...) will be
                # unhappy.  The following hack protects strings (but not
                # language-qualified strings) against errors, introducing
                # an ambiguity when exporting lists:
                value = value.replace('"|"', '|')
                value = KgtkFormat.unstringify(value) # Lose the language code.
                # TODO: Complain if internal newline or carriage return.

                if not unquoted:
                    value = '"' + value.replace('"', '""') + '"'
                
            else:
                value = value.replace("\\|", "|")
                if '"' in value or ',' in value:
                    # A symbol with an internal double quote or comma: turn it into a string.
                    value = '"' + value.replace('"', '""') + '"'
            if len(line) > 0:
                line += ","
            line += value
        return line
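The final branch above protects symbols containing a comma or double quote by turning them into CSV-style quoted fields. A standalone illustration of just that escaping (plain Python, no KGTK involved):

def csv_quote(symbol: str) -> str:
    # Same escaping as the symbol branch of join_csv() above.
    return '"' + symbol.replace('"', '""') + '"'

print(csv_quote('Dublin, Ohio'))   # "Dublin, Ohio"
print(csv_quote('5" floppy'))      # "5"" floppy"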
Example #6
    def edge2KGTK(edge: Tuple[str, str, str]) -> pd.Series:
        """
        Gets the edge as triple of subject, object, predicate and converts the edge to the KGTK format
        Args:
            edge: Tuple[str, str, str]
                input edge
        Returns: pd.Series
            pandas Series with keys according to KGTK format at
            https://docs.google.com/document/d/1fbbqgyX0N2EdxLam6hatfke1R-nZWkoN6M1oB_f4aQo/edit#heading=h.a5nlqev5bmm4
        """
        s, p, o = edge

        def clean(e: str) -> str:
            out = e.split(':')[-1].replace('_', ' ')
            return KgtkFormat.stringify(
                re.sub("([a-z])([A-Z])", "\g<1> \g<2>", out).strip().lower())

        return pd.Series({
            'node1': s,
            'relation': p,
            'node2': o,
            'node1;label': clean(s),
            'node2;label': clean(o),
            'relation;label': clean(p),
            'relation;dimension': '',
            'source': KgtkFormat.stringify('FN'),
            'sentence': ''
        })
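A worked example with hypothetical FrameNet-style identifiers (the identifiers are illustrative, and edge2KGTK is assumed callable directly), tracing what clean() does to them:

# clean('fn:HasPart'): split(':')[-1] -> 'HasPart'; the regex inserts a space
# before each internal capital -> 'Has Part'; lower() -> 'has part';
# KgtkFormat.stringify -> '"has part"'.
row = edge2KGTK(('fn:Cooking_apparatus', 'fn:HasPart', 'fn:Heating_element'))
# expected: row['relation;label'] == '"has part"'
#           row['node1;label']    == '"cooking apparatus"'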
Example #7
    def process_qnode_edge_qualifier(
            self, statement: typing.MutableMapping[str, typing.Any],
            edge_id: str, qualifier_row: typing.List[str]):
        if "qualifiers" not in statement:
            statement["qualifiers"] = dict()
        qualifiers = statement["qualifiers"]

        prop: str = qualifier_row[self.qual_label_idx]
        if prop not in qualifiers:
            qualifiers[prop] = list()
        proplist: typing.List[typing.Mapping[str,
                                             typing.Any]] = qualifiers[prop]

        qualifier: typing.MutableMapping[str, typing.Any] = dict()
        proplist.append(qualifier)

        qualifier["property"] = prop

        datatype: str = qualifier_row[self.qual_wikidatatype_idx]
        qualifier["datatype"] = datatype

        datahash: str = qualifier_row[self.qual_datahash_idx]
        if len(datahash) > 0:
            qualifier["hash"] = KgtkFormat.unstringify(datahash)

        value: str = qualifier_row[self.qual_node2_idx]
        if value == "somevalue":
            qualifier["snaktype"] = "somevalue"
        elif value == "novalue":
            qualifier["snaktype"] = "novalue"
        else:
            qualifier["datavalue"] = self.process_qual_datavalue(
                value, qualifier_row, datatype)
Example #8
    def add_sitelink(self, result: typing.MutableMapping[str, typing.Any],
                     edge_id: str,
                     qualifier_rows: typing.List[typing.List[str]]):
        if "sitelinks" not in result:
            result["sitelinks"] = dict()
        sitelinks: typing.MutableMapping[str, typing.Mapping[str, typing.Union[
            str, typing.List[str]]]] = result["sitelinks"]

        site: str = ""
        title: str = ""
        badges: typing.List[str] = list()

        qualifier_row: typing.List[str]
        for qualifier_row in qualifier_rows:
            label: str = qualifier_row[self.qual_label_idx]

            if label == "site":
                site = qualifier_row[self.qual_node2_idx]

            elif label == "title":
                title = KgtkFormat.unstringify(
                    qualifier_row[self.qual_node2_idx])

            elif label == "badge":
                badges.append(qualifier_row[self.qual_node2_idx])

        if len(site) == 0:
            # TODO: give a better error message.
            raise ValueError("Missing sitelink site for %s" % edge_id)

        if len(title) == 0:
            # TODO: give a better error message.
            raise ValueError("Missing sitelink title for %s" % edge_id)

        sitelinks[site] = {"site": site, "title": title, "badges": badges}
Example #9
 def create_edge(node1, node1_lbl, node2, node2_lbl, rel, rel_lbl,
                 image_id):
     my_row = [
         node1, rel, node2, '|'.join(node1_lbl), '|'.join(node2_lbl),
         rel_lbl, '',
         KgtkFormat.stringify('VG'), ''
     ]
     return my_row
Example #10
 def create_edges(data, labels, rel, rel_label):
     all_rows = []
     source = KgtkFormat.stringify('WN')
     for node1, v in data.items():
         for node2 in v:
             node1_preflabel = labels[node1].split('|')[0]
             node2_preflabel = labels[node2].split('|')[0]
             a_row = ['wn:' + node1, rel, 'wn:' + node2, labels[node1],
                      labels[node2], rel_label, "", source, '']
             all_rows.append(a_row)
     return all_rows
Example #11
    def row_to_edge(row, cols):

        edge = {}
        edge['node1'] = row[2]
        edge['relation'] = row[1]
        edge['node2'] = row[3]
        edge['node1_label'] = make_node_label(row[2])
        edge['node2_label'] = make_node_label(row[3])
        edge['relation_label'] = make_rel_label(row[1])
        edge['relation_dimension'] = ''

        metadata = json.loads(row[4])
        edge['source'] = KgtkFormat.stringify('CN')
        if 'surfaceText' in metadata.keys():
            edge['sentence'] = KgtkFormat.stringify(
                metadata['surfaceText'].replace('\\', ''))
        else:
            edge['sentence'] = ''

        edge_list = [edge[col] for col in cols]
        return '\t'.join(edge_list) + '\n'
Example #12
def extract(input_file, output_file, source):
    rows = []
    with open(output_file, 'w') as w:
        columns = ['id', 'node1', 'relation', 'node2', 'node1;label',
                   'node2;label', 'relation;label', 'relation;dimension',
                   'source', 'sentence']
        w.write(print_edge(columns))
        with open(input_file, 'r') as f:
            header = next(f)
            for line in f:
                data = line.split('\t')
                data[1] = data[1].replace('same', 'Same')
                id = '-'.join(data[:3])
                new_row = [id, *data[:3], "", "", "", "",
                           KgtkFormat.stringify(source), ""]
                w.write(print_edge(new_row))
Example #13
 def produce_rel_label(rel):
     mapping = {
         'xAttr': 'person x has attribute',
         'oAttr': 'others have attribute',
         'xReact': 'person x feels',
         'oReact': 'others feel',
         'xIntent': 'person x wants',
         'xWant': 'person x wants',
         'oWant': 'others want',
         'xNeed': 'person x needs',
         'xEffect': 'effect on person x',
         'oEffect': 'the effect on others'
     }
     return KgtkFormat.stringify(mapping[rel])
    def row_to_edge(node1, rel, node2, source, cols):

        edge = {}
        prefix = source.lower()
        edge['node1'] = prefix + ':' + node1
        edge['relation'] = rel
        edge['node2'] = prefix + ':' + node2
        edge['node1;label'] = make_node_label(node1)
        edge['node2;label'] = make_node_label(node2)
        edge['relation;label'] = make_rel_label(rel)
        edge['relation;dimension'] = ''

        edge['source'] = KgtkFormat.stringify(source)
        edge['sentence'] = ''

        edge_list = [edge[col] for col in cols]
        return edge_list
Example #15
    def add_attr_to_map(
        self,
        attr_map: typing.MutableMapping[str, typing.Mapping[str, str]],
        attr: str,
        who: str,
    ):
        kv: KgtkValue = KgtkValue(attr,
                                  options=self.value_options,
                                  parse_fields=False,
                                  error_file=self.error_file,
                                  verbose=self.verbose)
        if not kv.is_language_qualified_string(validate=True):
            raise ValueError("Invald attr %s for %s" % (attr, who))

        text: str
        language: str
        language_suffix: str
        text, language, language_suffix = KgtkFormat.destringify(kv.value)
        if len(language) == 0:
            raise ValueError("No attr language in %s for %s" % (attr, who))
        lang: str = language + language_suffix
        attr_map[lang] = {"language": lang, "value": text}
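For reference, the destringify() split this method relies on, shown on an illustrative language-qualified string:

text, language, language_suffix = KgtkFormat.destringify("'Tour Eiffel'@fr")
# expected: text == 'Tour Eiffel', language == 'fr', language_suffix == ''
# so attr_map would gain {'fr': {'language': 'fr', 'value': 'Tour Eiffel'}}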
Example #16
 def reformat_value_for_json(self, value: str) -> typing.Union[str, int, float, bool]:
     # TODO: Complain if the value is a KGTK List.
     if value.startswith((KgtkFormat.STRING_SIGIL, KgtkFormat.LANGUAGE_QUALIFIED_STRING_SIGIL)):
         # What if the value is a list? unstringify(...) will be
         # unhappy.  The following hack protects strings (but not
         # language-qualified strings) against errors, introducing
         # an ambiguity when exporting lists:
         value = value.replace('"|"', '|')
         return KgtkFormat.unstringify(value) # Lose the language code.
     elif value == KgtkFormat.TRUE_SYMBOL:
         return True
     elif value == KgtkFormat.FALSE_SYMBOL:
         return False
     elif value.isdigit():
         return int(value)
     elif value.startswith(("+", "-")) and value[1:].isdigit():
         return int(value)
     else:
         # TODO: process floating point numbers.
         # TODO: process datetimes
         # TODO: process geolocations
         return value
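A sketch of the dispatch above on a few representative values; "kw" is a hypothetical open KgtkWriter, and the concrete spelling of TRUE_SYMBOL is whatever KgtkFormat defines:

assert kw.reformat_value_for_json('"peanut butter"') == 'peanut butter'
assert kw.reformat_value_for_json("'pain'@fr") == 'pain'        # language code is lost
assert kw.reformat_value_for_json(KgtkFormat.TRUE_SYMBOL) is True
assert kw.reformat_value_for_json('42') == 42
assert kw.reformat_value_for_json('-7') == -7
assert kw.reformat_value_for_json('3.14') == '3.14'             # floats fall through (see TODO)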
Example #17
def run(input_file: KGTKFiles, attr_syn_file: KGTKFiles,
        output_file: KGTKFiles):

    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import csv
    import json
    import re
    from pathlib import Path
    from collections import defaultdict
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkwriter import KgtkWriter

    def create_edge(node1, node1_lbl, node2, node2_lbl, rel, rel_lbl,
                    image_id):
        my_row = [
            node1, rel, node2, '|'.join(node1_lbl), '|'.join(node2_lbl),
            rel_lbl, '',
            KgtkFormat.stringify('VG'), ''
        ]
        return my_row

    try:
        scene_graph_filename: Path = KGTKArgumentParser.get_input_file(
            input_file)
        attr_synsets_filename: Path = KGTKArgumentParser.get_input_file(
            attr_syn_file)

        out_columns = [
            'node1', 'relation', 'node2', 'node1;label', 'node2;label',
            'relation;label', 'relation;dimension', 'source', 'sentence'
        ]

        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)
        ew: KgtkWriter = KgtkWriter.open(
            out_columns,
            output_kgtk_file,
            #mode=input_kr.mode,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            gzip_in_parallel=False,
            #verbose=self.verbose,
            #very_verbose=self.very_verbose
        )

        proximity_relation = '/r/LocatedNear'
        property_relation = 'mw:MayHaveProperty'
        property_relation_label = KgtkFormat.stringify('may have property')
        capableof_relation = '/r/CapableOf'
        capableof_relation_label = KgtkFormat.stringify('capable of')

        with open(scene_graph_filename, 'r') as f:
            images_data = json.load(f)

        with open(attr_synsets_filename, 'r') as f:
            attr_synsets = json.load(f)

        for counter, an_image in enumerate(images_data):

            image_id = str(an_image['image_id'])

            # OBJECTS
            objid2names = defaultdict(list)
            objid2syns = {}
            rows = []
            for o in an_image['objects']:
                obj_id = o['object_id']
                o_synset = o['synsets']
                objid2syns[obj_id] = o_synset
                for name in o['names']:
                    name = name.strip().lower().rstrip('.')
                    if not name: continue
                    objid2names[obj_id].append(KgtkFormat.stringify(name))

                # ATTRIBUTES
                if 'attributes' in o.keys():
                    for attr in o['attributes']:
                        attr = attr.lower()
                        if attr in attr_synsets:
                            asyn = attr_synsets[attr]
                            apos = asyn.split('.')[1]
                            if apos != 'n':
                                if apos == 'v':  # verb
                                    for osyn in o_synset:
                                        if osyn != asyn:
                                            edge_row = create_edge(
                                                'wn:' + osyn,
                                                objid2names[obj_id],
                                                'wn:' + asyn,
                                                [KgtkFormat.stringify(attr)],
                                                capableof_relation,
                                                capableof_relation_label,
                                                image_id)
                                            if edge_row not in rows:
                                                rows.append(edge_row)
                                else:  #adjective
                                    for osyn in o_synset:
                                        if osyn != asyn:
                                            edge_row = create_edge(
                                                'wn:' + osyn,
                                                objid2names[obj_id],
                                                'wn:' + asyn,
                                                [KgtkFormat.stringify(attr)],
                                                property_relation,
                                                property_relation_label,
                                                image_id)
                                            if edge_row not in rows:
                                                rows.append(edge_row)

            # RELATIONS
            for rel in an_image['relationships']:
                #synsets=rel['synsets']
                relation_label = KgtkFormat.stringify(
                    rel['predicate'].lower().strip().strip('.'))
                sub_id = rel['subject_id']
                sub_names = objid2names[sub_id]
                sub_syns = objid2syns[sub_id]
                obj_id = rel['object_id']
                obj_names = objid2names[obj_id]
                obj_syns = objid2syns[obj_id]

                for ssyn in sub_syns:
                    for osyn in obj_syns:
                        if osyn != ssyn:
                            edge_row = create_edge('wn:' + ssyn, sub_names,
                                                   'wn:' + osyn, obj_names,
                                                   proximity_relation,
                                                   relation_label, image_id)
                            if edge_row not in rows:
                                rows.append(edge_row)
            for a_row in rows:
                ew.write(a_row)

        # Clean up
        ew.close()

    except Exception as e:
        kgtk_exception_auto_handler(e)
Example #18
def load_property_labels_file(
    input_files: typing.List[str],
    error_file: typing.TextIO,
    reader_options: KgtkReaderOptions,
    value_options: KgtkValueOptions,
    label_filter: typing.List[str],
    verbose: bool = False,
):
    labels_dict: typing.MutableMapping[str, str] = {}
    for each_file in input_files:
        kr: KgtkReader = KgtkReader.open(
            Path(each_file),
            error_file=error_file,
            options=reader_options,
            value_options=value_options,
            verbose=verbose,
        )
        fail: bool = False
        if kr.node1_column_idx < 0:
            fail = True
            print("Cannot determine which column is node1 in %s" % each_file,
                  file=error_file,
                  flush=True)
        if len(label_filter) > 0 and kr.label_column_idx < 0:
            fail = True
            print("Cannot determine which column is label in %s" % each_file,
                  file=error_file,
                  flush=True)
        if kr.node2_column_idx < 0:
            fail = True
            print("Cannot determine which column is node2 in %s" % each_file,
                  file=error_file,
                  flush=True)
        if fail:
            raise KGTKException("Cannot identify a required column in %s" %
                                each_file)

        row: typing.List[str]
        for row in kr:
            if len(label_filter) > 0:
                if row[kr.label_column_idx] not in label_filter:
                    continue

            node_id: str = row[kr.node1_column_idx]
            node_label: str = row[kr.node2_column_idx]
            text: str
            language: str
            language_suffix: str
            if node_label.startswith(("'", '"')):
                text, language, language_suffix = KgtkFormat.destringify(
                    node_label)
            else:
                text = node_label
                language = ""
                language_suffix = ""

            # The following code will take the last-read English label,
            # otherwise, the first-read non-English label.
            if language == "en" and language_suffix == "":
                labels_dict[node_id] = text
            else:
                if node_id not in labels_dict:
                    labels_dict[node_id] = node_label

        kr.close()
    return labels_dict
sources = ['AT', 'RG', 'CN']
identity_rel = 'mw:SameAs'

lbl2ids = defaultdict(set)

with open(input_file, 'r') as f:
    header = next(f)
    for line in f:
        data = line.split('\t')
        if check_source(data[8], sources):
            node1 = data[1]
            node2 = data[3]
            if lexical_node(node1):
                node1_label = data[4]
                lbl2ids[node1_label].add(node1)
            if lexical_node(node2):
                node2_label = data[5]
                lbl2ids[node2_label].add(node2)
print(len(lbl2ids))

with open('tmp/lexical_mappings.tsv', 'w') as w:
    w.write(header)
    for label, ids in lbl2ids.items():
        if len(ids) <= 1:
            continue

        list_ids = list(ids)
        for i in range(len(list_ids) - 1):
            edge_id = '%s-%s-%s-1' % (list_ids[i], identity_rel, list_ids[i + 1])
            row = [edge_id, list_ids[i], identity_rel, list_ids[i + 1],
                   '', '', '', '', KgtkFormat.stringify('LEX'), '']
            w.write('\t'.join(row) + '\n')
Example #20
 def make_rel_label(rel):
     return KgtkFormat.stringify(split_camel_case(rel.split('/')[-1]))
Example #21
 def make_node_label(node):
     return KgtkFormat.stringify(node.strip().split('/')[3].replace(
         '_', ' '))
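A worked example, assuming ConceptNet-style node URIs of the form /c/<lang>/<term> (the input is illustrative):

# '/c/en/peanut_butter'.strip().split('/') -> ['', 'c', 'en', 'peanut_butter'];
# index 3 is the term, and underscores become spaces before stringifying.
print(make_node_label('/c/en/peanut_butter'))   # expected: "peanut butter"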
Example #22
def run(input_file: KGTKFiles, attr_syn_file: KGTKFiles):

    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import csv
    import json
    import re
    from pathlib import Path
    from collections import defaultdict
    from kgtk.kgtkformat import KgtkFormat

    out_columns = [
        'node1', 'relation', 'node2', 'node1_label', 'node2_label',
        'relation_label', 'relation_dimension', 'source', 'sentence'
    ]

    proximity_relation = '/r/LocatedNear'
    property_relation = 'mw:MayHaveProperty'
    property_relation_label = KgtkFormat.stringify('may have property')
    capableof_relation = '/r/CapableOf'
    capableof_relation_label = KgtkFormat.stringify('capable of')

    def create_edge(node1, node1_lbl, node2, node2_lbl, rel, rel_lbl,
                    image_id):
        my_row = [
            node1, rel, node2, '|'.join(node1_lbl), '|'.join(node2_lbl),
            rel_lbl, '',
            KgtkFormat.stringify('VG'), ''
        ]
        return '\t'.join(my_row) + '\n'

    def header_to_edge(row):
        row = [r.replace('_', ';') for r in row]
        return '\t'.join(row) + '\n'

    def create_uri(ns, rel):
        return '%s:%s' % (ns, rel)

    try:
        scene_graph_filename: Path = KGTKArgumentParser.get_input_file(
            input_file)
        attr_synsets_filename: Path = KGTKArgumentParser.get_input_file(
            attr_syn_file)

        with open(scene_graph_filename, 'r') as f:
            images_data = json.load(f)

        with open(attr_synsets_filename, 'r') as f:
            attr_synsets = json.load(f)

        sys.stdout.write(header_to_edge(out_columns))

        for counter, an_image in enumerate(images_data):

            image_id = str(an_image['image_id'])

            # OBJECTS
            objid2names = defaultdict(list)
            objid2syns = {}
            rows = []
            for o in an_image['objects']:
                obj_id = o['object_id']
                o_synset = o['synsets']
                objid2syns[obj_id] = o_synset
                for name in o['names']:
                    name = name.strip().lower().rstrip('.')
                    if not name: continue
                    objid2names[obj_id].append(KgtkFormat.stringify(name))

                # ATTRIBUTES
                if 'attributes' in o.keys():
                    for attr in o['attributes']:
                        attr = attr.lower()
                        if attr in attr_synsets:
                            asyn = attr_synsets[attr]
                            apos = asyn.split('.')[1]
                            if apos != 'n':
                                if apos == 'v':  # verb
                                    for osyn in o_synset:
                                        if osyn != asyn:
                                            edge_row = create_edge(
                                                'wn:' + osyn,
                                                objid2names[obj_id],
                                                'wn:' + asyn,
                                                [KgtkFormat.stringify(attr)],
                                                capableof_relation,
                                                capableof_relation_label,
                                                image_id)
                                            if edge_row not in rows:
                                                rows.append(edge_row)
                                else:  #adjective
                                    for osyn in o_synset:
                                        if osyn != asyn:
                                            edge_row = create_edge(
                                                'wn:' + osyn,
                                                objid2names[obj_id],
                                                'wn:' + asyn,
                                                [KgtkFormat.stringify(attr)],
                                                property_relation,
                                                property_relation_label,
                                                image_id)
                                            if edge_row not in rows:
                                                rows.append(edge_row)

            # RELATIONS
            for rel in an_image['relationships']:
                #synsets=rel['synsets']
                relation_label = KgtkFormat.stringify(
                    rel['predicate'].lower().strip().strip('.'))
                sub_id = rel['subject_id']
                sub_names = objid2names[sub_id]
                sub_syns = objid2syns[sub_id]
                obj_id = rel['object_id']
                obj_names = objid2names[obj_id]
                obj_syns = objid2syns[obj_id]

                for ssyn in sub_syns:
                    for osyn in obj_syns:
                        if osyn != ssyn:
                            edge_row = create_edge('wn:' + ssyn, sub_names,
                                                   'wn:' + osyn, obj_names,
                                                   proximity_relation,
                                                   relation_label, image_id)
                            if edge_row not in rows:
                                rows.append(edge_row)
            for a_row in rows:
                sys.stdout.write(a_row)

    except Exception as e:
        kgtk_exception_auto_handler(e)
Example #23
    def process_row(
        self,
        node_id: str,
        node_property: str,
        node_value: str,
        each_node_attributes: EACH_NODE_ATTRIBUTES,
    ):
        if self.very_verbose:
            print("Processing row (%s, %s, %s)" %
                  (repr(node_id), repr(node_property), repr(node_value)),
                  file=self.error_file,
                  flush=True)

        # CMR: the following code looks like it was intended to remove
        # any language code and language suffix.  It would have the
        # side effect of removing location coordinates entirely.
        #
        # remove @ mark
        # if "@" in node_value and node_value[0] != "@":
        #    node_value = node_value[:node_value.index("@")]

        # CMR: Better to use KgtkFormat.unstringify(node_value), as it will remove escapes from
        # internal double or single quotes.
        #
        # remove extra double quote " and single quote '
        # while len(node_value) >= 3 and node_value[0] == '"' and node_value[-1] == '"':
        #     node_value = node_value[1:-1]
        # while len(node_value) >= 3 and node_value[0] == "'" and node_value[-1] == "'":
        #     node_value = node_value[1:-1]
        if node_value.startswith(("'", '"')):
            node_value = KgtkFormat.unstringify(node_value)

        # in case we meet an empty value, skip it
        if node_value == "":
            self._logger.warning(
                """Skip line ({}, {}, {}) because of empty value.""".format(
                    node_id, node_property, node_value))
            return

        if self.very_verbose:
            print("Revised node_value = %s" % repr(node_value),
                  file=self.error_file,
                  flush=True)

        if node_property in self.properties_reversed:
            if self.very_verbose:
                print("node_property %s is in self.properties_reversed" %
                      repr(node_property),
                      file=self.error_file,
                      flush=True)
            roles = self.properties_reversed[node_property].copy()
            node_value = self.get_real_label_name(node_value)
            if self.very_verbose:
                print("node_value label = %s" % repr(node_value),
                      file=self.error_file,
                      flush=True)
            # if we get property_values, they should be saved to the isa-properties part
            if self.PROPERTY_VALUES in roles:
                if self.very_verbose:
                    print("property_values is in roles",
                          file=self.error_file,
                          flush=True)
                # for the property-values part, combine into "{property} {value}"
                node_value_combine = self.get_real_label_name(
                    node_property) + " " + self.get_real_label_name(node_value)
                if self.very_verbose:
                    print("node_value_combine = %s" % repr(node_value_combine),
                          file=self.error_file,
                          flush=True)
                if each_node_attributes is None:
                    raise ValueError("each_node_attributes is missing")

                property_values: typing.Optional[
                    Lexicalize.ATTRIBUTE_TYPES] = each_node_attributes[
                        self.PROPERTY_VALUES]
                if isinstance(property_values, list):
                    property_values.append(node_value_combine)
                else:
                    raise ValueError(
                        'each_node_attributes["property_values"] is not a list.'
                    )
                if self.very_verbose:
                    print('each_node_attributes["property_values"] = %s' %
                          repr(property_values),
                          file=self.error_file,
                          flush=True)

                # remove those 2 roles in case we have duplicate using of this node later
                roles.discard(self.PROPERTY_VALUES)
                roles.discard(self.HAS_PROPERTIES)
            for each_role in roles:
                attrs: Lexicalize.ATTRIBUTE_TYPES = each_node_attributes[
                    each_role]
                if isinstance(attrs, set):
                    attrs.add(node_value)
                elif isinstance(attrs, list):
                    attrs.append(node_value)
                else:
                    raise ValueError(
                        'each_node_attributes[%s] is not a list or set.' %
                        repr(each_role))
                if self.very_verbose:
                    print("%s: %s" % (each_role, repr(attrs)),
                          file=self.error_file,
                          flush=True)

        elif self.add_all_properties:  # add remaining properties if all properties are requested
            if self.very_verbose:
                print("self.add_all_properties is True",
                      file=self.error_file,
                      flush=True)
            attrs2: Lexicalize.ATTRIBUTE_TYPES = each_node_attributes[
                self.HAS_PROPERTIES]
            if isinstance(attrs2, list):
                attrs2.append(self.get_real_label_name(node_property))
                if self.very_verbose:
                    print("has_properties: %s" % repr(attrs2),
                          file=self.error_file,
                          flush=True)
            else:
                raise ValueError(
                    'each_node_attributes["has_properties"] is not a list.')

        return
Example #24
    def implode_language_qualified_string(
        self,
        input_line_count: int,
        row: typing.List[str],
        implosion: typing.Mapping[str, int],
        type_name: str,
    ) -> typing.Tuple[str, bool]:
        valid: bool = True
        text_idx: int = implosion[KgtkValueFields.TEXT_FIELD_NAME]
        text_val: str = row[text_idx]
        if len(text_val) == 0:
            valid = False
            if self.verbose:
                print("Input line %d: data type '%s': %s field is empty" %
                      (input_line_count, type_name,
                       KgtkValueFields.TEXT_FIELD_NAME),
                      file=self.error_file,
                      flush=True)

        elif len(text_val) == 1:
            valid = False
            if self.verbose:
                print("Input line %d: data type '%s': %s field is too short" %
                      (input_line_count, type_name,
                       KgtkValueFields.TEXT_FIELD_NAME),
                      file=self.error_file,
                      flush=True)
        else:
            if not text_val.startswith('"'):
                valid = False
                if self.verbose:
                    print(
                        "Input line %d: data type '%s': %s field does not start with a double quote"
                        % (input_line_count, type_name,
                           KgtkValueFields.TEXT_FIELD_NAME),
                        file=self.error_file,
                        flush=True)
            if not text_val.endswith('"'):
                valid = False
                if self.verbose:
                    print(
                        "Input line %d: data type '%s': %s field does not end with a double quote"
                        % (input_line_count, type_name,
                           KgtkValueFields.TEXT_FIELD_NAME),
                        file=self.error_file,
                        flush=True)

        language_idx: int = implosion[KgtkValueFields.LANGUAGE_FIELD_NAME]
        language_val: str = self.unwrap(row[language_idx])
        if len(language_val) == 0:
            valid = False
            if self.verbose:
                print("Input line %d: data type '%s': %s field is empty" %
                      (input_line_count, type_name,
                       KgtkValueFields.LANGUAGE_FIELD_NAME),
                      file=self.error_file,
                      flush=True)

        suf_idx: int = implosion[KgtkValueFields.LANGUAGE_SUFFIX_FIELD_NAME]
        suf: str = self.unwrap(row[suf_idx]) if suf_idx >= 0 else ""
        if len(suf) > 0 and not suf.startswith("-"):
            # As a special favor, we'll accept language suffixes that do not
            # start with a dash.  We'll prepend the dash.
            suf = "-" + suf

        value: str = ""
        if valid:
            # This subterfuge uses Python's literal parser to parse the string.
            if not self.escape_pipes:
                # ast.literal_eval(...) doesn't treat a backslash pipe (\|) as an escaped pipe (|);
                # this is documented behavior, so we remove escaped pipes manually.
                text_val = text_val.replace('\\|', '|')
            value = KgtkFormat.stringify(ast.literal_eval(text_val),
                                         language=language_val,
                                         language_suffix=suf)

        if valid and self.validate:
            kv: KgtkValue = KgtkValue(value, options=self.value_options)
            valid = kv.is_language_qualified_string(validate=True)
            if not valid:
                if self.verbose:
                    print(
                        "Input line %d: data type '%s': imploded value '%s' is not a valid language qualified string."
                        % (input_line_count, type_name, value),
                        file=self.error_file,
                        flush=True)
        return value, valid
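The "subterfuge" above leans on Python's literal parser to decode the escapes in the text field before re-stringifying with the language code. A minimal sketch of that step in isolation, with an illustrative text/language pair:

import ast
from kgtk.kgtkformat import KgtkFormat

text_val = '"rock \\"n\\" roll"'        # text field as read from the exploded columns
decoded = ast.literal_eval(text_val)    # -> rock "n" roll
value = KgtkFormat.stringify(decoded, language='en', language_suffix='')
# value is a KGTK language-qualified string of the form 'rock "n" roll'@en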
Example #25
    def implode_string(
        self,
        input_line_count: int,
        row: typing.List[str],
        implosion: typing.Mapping[str, int],
        type_name: str,
    ) -> typing.Tuple[str, bool]:
        valid: bool = True
        if KgtkValueFields.LANGUAGE_FIELD_NAME in implosion:
            language_idx: int = implosion[KgtkValueFields.LANGUAGE_FIELD_NAME]
            if language_idx >= 0:
                language_val: str = self.unwrap(row[language_idx])
                if len(language_val) > 0:
                    if self.general_strings:
                        return self.implode_language_qualified_string(
                            input_line_count, row, implosion, type_name)
                    else:
                        valid = False
                        if self.verbose:
                            print(
                                "Input line %d: data type '%s': %s field is not empty"
                                % (input_line_count, type_name,
                                   KgtkValueFields.LANGUAGE_FIELD_NAME),
                                file=self.error_file,
                                flush=True)

        text_idx: int = implosion[KgtkValueFields.TEXT_FIELD_NAME]
        text_val: str = row[text_idx]
        if len(text_val) == 0:
            valid = False
            if self.verbose:
                print("Input line %d: data type '%s': %s field is empty" %
                      (input_line_count, type_name,
                       KgtkValueFields.TEXT_FIELD_NAME),
                      file=self.error_file,
                      flush=True)
        elif len(text_val) == 1:
            valid = False
            if self.verbose:
                print("Input line %d: data type '%s': %s field is too short" %
                      (input_line_count, type_name,
                       KgtkValueFields.TEXT_FIELD_NAME),
                      file=self.error_file,
                      flush=True)
        else:
            if not text_val.startswith('"'):
                valid = False
                if self.verbose:
                    print(
                        "Input line %d: data type '%s': %s field does not start with a double quote"
                        % (input_line_count, type_name,
                           KgtkValueFields.TEXT_FIELD_NAME),
                        file=self.error_file,
                        flush=True)
            if not text_val.endswith('"'):
                valid = False
                if self.verbose:
                    print(
                        "Input line %d: data type '%s': %s field does not end with a double quote"
                        % (input_line_count, type_name,
                           KgtkValueFields.TEXT_FIELD_NAME),
                        file=self.error_file,
                        flush=True)

        value: str = ""
        if valid:
            # This subterfuge uses Python's literal parser to parse the string.
            if not self.escape_pipes:
                # ast.literal_eval(...) doesn't treat a backslash pipe (\|) as an escaped pipe (|);
                # this is documented behavior, so we remove escaped pipes manually.
                text_val = text_val.replace('\\|', '|')
            value = KgtkFormat.stringify(ast.literal_eval(text_val))

        if valid and self.validate:
            kv: KgtkValue = KgtkValue(value, options=self.value_options)
            valid = kv.is_string(validate=True)
            if not valid:
                if self.verbose:
                    print(
                        "Input line %d: data type '%s': imploded value '%s' is not a valid string."
                        % (input_line_count, type_name, value),
                        file=self.error_file,
                        flush=True)
        return value, valid
Example #26
 def clean(e: str) -> str:
     out = e.split(':')[-1].replace('_', ' ')
     return KgtkFormat.stringify(
         re.sub("([a-z])([A-Z])", "\g<1> \g<2>", out).strip().lower())
Example #27
def run(output_file: KGTKFiles):

    # import modules locally
    import sys  # type: ignore
    from kgtk.exceptions import kgtk_exception_auto_handler
    import json
    import nltk
    nltk.download("wordnet")
    from nltk.corpus import wordnet as wn
    from kgtk.kgtkformat import KgtkFormat
    from kgtk.io.kgtkwriter import KgtkWriter

    def obtain_wordnet_lemmas(syn):
        lemmas = []
        for lemma in syn.lemma_names():
            lemmas.append(KgtkFormat.stringify(lemma.replace('_', ' ')))
        return lemmas

    def obtain_hypernyms(syn):
        hyps = []
        for hypernym in syn.hypernyms():
            hyps.append(hypernym.name())
        return hyps

    def obtain_member_holonyms(syn):
        hols = []
        for hol in syn.member_holonyms():
            hols.append(hol.name())
        return hols

    def obtain_part_holonyms(syn):
        hols = []
        for hol in syn.part_holonyms():
            hols.append(hol.name())
        return hols

    def obtain_substance_meronyms(syn):
        hols = []
        for hol in syn.substance_meronyms():
            hols.append(hol.name())
        return hols

    def get_wn_data():
        syns = list(wn.all_synsets())
        all_labels = {}
        all_hyps = {}
        all_members = {}
        all_parts = {}
        all_subs = {}
        for syn in syns:
            syn_name = syn.name()

            lemmas = obtain_wordnet_lemmas(syn)
            all_labels[syn_name] = '|'.join(lemmas)

            hypernyms = obtain_hypernyms(syn)
            if len(hypernyms):
                all_hyps[syn_name] = hypernyms

            member_holonyms = obtain_member_holonyms(syn)
            if len(member_holonyms):
                all_members[syn_name] = member_holonyms

            part_holonyms = obtain_part_holonyms(syn)
            if len(part_holonyms):
                all_parts[syn_name] = part_holonyms

            substance_meronyms = obtain_substance_meronyms(syn)
            if len(substance_meronyms):
                all_subs[syn_name] = substance_meronyms

        return all_labels, all_hyps, all_members, all_parts, all_subs

    def create_edges(data, labels, rel, rel_label):
        all_rows = []
        source = KgtkFormat.stringify('WN')
        for node1, v in data.items():
            for node2 in v:
                node1_preflabel = labels[node1].split('|')[0]
                node2_preflabel = labels[node2].split('|')[0]
                a_row = [
                    'wn:' + node1, rel, 'wn:' + node2, labels[node1],
                    labels[node2], rel_label, "", source, ''
                ]
                all_rows.append(a_row)
        return all_rows

    try:
        out_columns = [
            'node1', 'relation', 'node2', 'node1;label', 'node2;label',
            'relation;label', 'relation;dimension', 'source', 'sentence'
        ]

        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)
        ew: KgtkWriter = KgtkWriter.open(
            out_columns,
            output_kgtk_file,
            #mode=input_kr.mode,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            gzip_in_parallel=False,
            #verbose=self.verbose,
            #very_verbose=self.very_verbose
        )

        all_labels, all_hyps, all_members, all_parts, all_subs = get_wn_data()
        hyp_edges = create_edges(all_hyps, all_labels, '/r/IsA',
                                 KgtkFormat.stringify('is a'))
        member_edges = create_edges(all_members, all_labels, '/r/PartOf',
                                    KgtkFormat.stringify('is a part of'))
        part_edges = create_edges(all_parts, all_labels, '/r/PartOf',
                                  KgtkFormat.stringify('is a part of'))
        sub_edges = create_edges(all_subs, all_labels, '/r/MadeOf',
                                 KgtkFormat.stringify('is made of'))
        all_edges = hyp_edges + member_edges + part_edges + sub_edges

        for edge in all_edges:
            ew.write(edge)

        # Clean up.
        ew.close()

    except Exception as e:
        kgtk_exception_auto_handler(e)
Example #28
 def obtain_wordnet_lemmas(syn):
     lemmas = []
     for lemma in syn.lemma_names():
         lemmas.append(KgtkFormat.stringify(lemma.replace('_', ' ')))
     return lemmas
Example #29
    def process_qual_datavalue(self, value: str, qual_row: typing.List[str],
                               datatype: str):
        datavalue: typing.MutableMapping[str, typing.Union[str, typing.Mapping[
            str, typing.Optional[typing.Union[str, int, float]]]]] = dict()
        datavalue["type"] = qual_row[self.qual_val_type_idx]

        valuemap: typing.MutableMapping[str, typing.Optional[typing.Union[
            str, int, float]]] = dict()
        datavalue["value"] = valuemap

        entity_type: str = qual_row[self.qual_entity_type_idx]
        if len(entity_type) > 0:
            valuemap["entity-type"] = entity_type
            valuemap["id"] = value

            # TODO: Is this the right thing to do for Q16097-F1?
            numeric_id: str = value[1:]
            if "-" in numeric_id:
                numeric_id = numeric_id[:numeric_id.index("-")]
            valuemap["numeric-id"] = int(numeric_id)
            return datavalue

        kv = KgtkValue(value,
                       options=self.value_options,
                       parse_fields=True,
                       error_file=self.error_file,
                       verbose=self.verbose)
        if not kv.validate():
            # raise ValueError("Invalid KGTK value '%s'" % value)
            print("Warning: Invalid KGTK value '%s'" % value,
                  file=self.error_file,
                  flush=True)
        if kv.fields is None:
            raise ValueError("KGTK value %s is missing fields." % value)

        if kv.is_number():
            if kv.fields.numberstr is None:
                raise ValueError("number is missing numberstr for %s." % value)

            valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign
            valuemap["unit"] = "1"
            return datavalue

        if kv.is_quantity():
            if kv.fields.numberstr is None:
                raise ValueError("quantity is missing numberstr for %s." %
                                 value)
            valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign

            if kv.fields.units_node is None:
                # TODO: Research this further.  Why did we get here?  Is it because import_wikidata
                # dropped the units?
                #
                # raise ValueError("quantity is missing units_node for %s in: %s" % (value, " ".join(qual_row)))
                valuemap["unit"] = "undefined"
            else:
                valuemap[
                    "unit"] = "http://www.wikidata.org/entity/" + kv.fields.units_node

            if kv.fields.low_tolerancestr is not None and len(
                    kv.fields.low_tolerancestr) > 0:
                valuemap[
                    "lowerBound"] = kv.fields.low_tolerancestr  # TODO: add plus sign

            if kv.fields.high_tolerancestr is not None and len(
                    kv.fields.high_tolerancestr) > 0:
                valuemap[
                    "higherBound"] = kv.fields.high_tolerancestr  # TODO: add plus sign
            return datavalue

        if kv.is_language_qualified_string():
            text: str
            language: str
            language_suffix: str
            text, language, language_suffix = KgtkFormat.destringify(
                value)  # TODO: KgtkValue should do this to text
            language += language_suffix
            valuemap["text"] = text
            valuemap["language"] = language
            return datavalue

        if kv.is_string():
            valuemap["type"] = "string"
            valuemap["value"] = KgtkFormat.unstringify(
                value)  # TODO: KgtkValue should do this to text
            return datavalue

        if kv.is_date_and_times():
            if kv.fields.zonestr is None:
                raise ValueError("timezone is missing from %s." % value)
            if kv.fields.zonestr != "Z":
                raise ValueError("Only Z-time is supported, error in %s." %
                                 value)

            if kv.fields.date_and_time is None:
                raise ValueError("date_and_time is missing from %s." % value)
            valuemap["time"] = kv.fields.date_and_time
            valuemap["timezone"] = 0
            valuemap["before"] = 0
            valuemap["after"] = 0

            if kv.fields.precision is None:
                raise ValueError(
                    "date_and_time precision is missing from %s." % value)
            valuemap["precision"] = kv.fields.precision

            valuemap[
                "calendarmodel"] = "http://www.wikidata.org/entity/" + qual_row[
                    self.qual_calendar_idx]
            return datavalue

        if kv.is_location_coordinates():
            if kv.fields.latitude is None:
                raise ValueError("latitude is missing from %s" % value)
            valuemap["latitude"] = kv.fields.latitude

            if kv.fields.longitude is None:
                raise ValueError("longitude is missing from %s" % value)
            valuemap["longitude"] = kv.fields.longitude

            valuemap["altitide"] = None  # deprecated

            valuemap["precision"] = float(qual_row[self.qual_precision_idx])

            valuemap["globe"] = "http://www.wikidata.org/entity/Q2"
            return datavalue

        # Default: convert the symbol to a string.
        valuemap["type"] = "string"
        valuemap["value"] = KgtkFormat.unstringify(
            '"' + value + '"')  # TODO: KgtkValue should do this to text
        return datavalue
 def make_node_label(node):
     return KgtkFormat.stringify(node[3:])
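A worked example of this final helper, assuming node ids carry a three-character prefix such as 'wn:' (as in the WordNet examples above):

print(make_node_label('wn:dog.n.01'))   # expected: "dog.n.01"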