def join_tsv(
        self,
        values: typing.List[str],
        unquoted: bool = False,
        unescape_pipe: bool = True,
        csvlike: bool = False,
    ) -> str:
        """Render a row of KGTK values as a single tab-separated line.

        Per-value handling:

        * Values starting with ``KgtkFormat.DATE_AND_TIMES_SIGIL`` are passed
          through ``self.reformat_datetime``.
        * Strings and language-qualified strings are unstringified (dropping
          any language code) when ``unquoted`` or ``csvlike`` is set.  With
          ``csvlike`` (and not ``unquoted``) the result is re-wrapped in
          double quotes with internal double quotes doubled.
        * Everything else only has escaped pipes (``\\|``) unescaped.

        Args:
            values: The column values of one row.
            unquoted: Emit strings without their KGTK quoting.
            unescape_pipe: Forwarded to ``KgtkFormat.unstringify``.
            csvlike: Emit strings with CSV-style quoting.  Ignored when
                ``unquoted`` is true.

        Returns:
            The values joined with tab characters.  Every value contributes a
            field, so empty values (including leading ones) keep their column.

        Raises:
            ValueError: Re-raised from ``KgtkFormat.unstringify`` after the
                failure has been reported on ``self.error_file``.
        """
        string_sigils = (KgtkFormat.STRING_SIGIL,
                         KgtkFormat.LANGUAGE_QUALIFIED_STRING_SIGIL)
        fields: typing.List[str] = []
        value: str
        for value in values:
            # TODO: Complain if the value is a KGTK List.
            if value.startswith(KgtkFormat.DATE_AND_TIMES_SIGIL):
                value = self.reformat_datetime(value)

            elif value.startswith(string_sigils) and (unquoted or csvlike):
                # What if the value is a list? unstringify(...) will be
                # unhappy.  The following hack protects strings (but not
                # language-qualified strings) against errors, introducing
                # an ambiguity when exporting lists:
                value = value.replace('"|"', '|')
                try:
                    value = KgtkFormat.unstringify(
                        value, unescape_pipe=unescape_pipe
                    )  # Lose the language code.
                except ValueError:
                    print("KgtkWriter: File %s: Error unstringifying %s" %
                          (repr(self.file_path), repr(value)),
                          file=self.error_file,
                          flush=True)
                    raise
                if not unquoted:
                    # csvlike: CSV-style quoting, doubling internal quotes.
                    value = '"' + value.replace('"', '""') + '"'

            else:
                value = value.replace("\\|", "|")

            fields.append(value)

        # Joining the collected fields (rather than testing whether the line
        # built so far is non-empty) keeps the separator after leading empty
        # values, so columns never shift left.
        return "\t".join(fields)
Exemple #2
0
    def join_csv(self, values: typing.List[str],
                 unquoted: bool = False,
                 )->str:
        """Render a row of KGTK values as a single comma-separated line.

        Per-value handling:

        * Values starting with ``KgtkFormat.DATE_AND_TIMES_SIGIL`` are passed
          through ``self.reformat_datetime``.
        * Strings and language-qualified strings are unstringified (dropping
          any language code) and, unless ``unquoted`` is set, wrapped in
          double quotes with internal double quotes doubled.
        * Other values have escaped pipes (``\\|``) unescaped and are quoted
          only when they contain a double quote or a comma.

        Args:
            values: The column values of one row.
            unquoted: Emit strings without surrounding double quotes.

        Returns:
            The values joined with commas.  Every value contributes a field,
            so empty values (including leading ones) keep their column.
        """
        string_sigils = (KgtkFormat.STRING_SIGIL,
                         KgtkFormat.LANGUAGE_QUALIFIED_STRING_SIGIL)
        fields: typing.List[str] = []
        value: str
        for value in values:
            # TODO: Complain if the value is a KGTK List.
            if value.startswith(KgtkFormat.DATE_AND_TIMES_SIGIL):
                value = self.reformat_datetime(value)

            elif value.startswith(string_sigils):
                # What if the value is a list? unstringify(...) will be
                # unhappy.  The following hack protects strings (but not
                # language-qualified strings) against errors, introducing
                # an ambiguity when exporting lists:
                value = value.replace('"|"', '|')
                value = KgtkFormat.unstringify(value) # Lose the language code.
                # TODO: Complain if internal newline or carriage return.

                if not unquoted:
                    value = '"' + value.replace('"', '""') + '"'

            else:
                value = value.replace("\\|", "|")
                if '"' in value or ',' in value:
                    # A symbol with an internal double quote or comma: turn it into a string.
                    value = '"' + value.replace('"', '""') + '"'

            fields.append(value)

        # Joining the collected fields (rather than testing whether the line
        # built so far is non-empty) keeps the separator after leading empty
        # values, so columns never shift left.
        return ",".join(fields)
Exemple #3
0
    def process_qnode_edge_qualifier(
            self, statement: typing.MutableMapping[str, typing.Any],
            edge_id: str, qualifier_row: typing.List[str]):
        """Append one qualifier, built from ``qualifier_row``, to ``statement``.

        The qualifier is stored under ``statement["qualifiers"][<property>]``,
        where the property is read from the row's label column.  Both the
        ``"qualifiers"`` mapping and the per-property list are created on
        first use.  The qualifier records its property, datatype, an optional
        unstringified hash, and either a ``"snaktype"`` (for the special values
        ``somevalue`` / ``novalue``) or a ``"datavalue"`` produced by
        ``self.process_qual_datavalue``.

        ``edge_id`` is accepted but not used here.
        """
        by_property = statement.setdefault("qualifiers", dict())

        prop: str = qualifier_row[self.qual_label_idx]
        entries: typing.List[typing.Mapping[str, typing.Any]] = \
            by_property.setdefault(prop, list())

        # Register the entry first, then fill it in; the list holds a
        # reference, so later assignments are visible through it.
        entry: typing.MutableMapping[str, typing.Any] = {"property": prop}
        entries.append(entry)

        datatype: str = qualifier_row[self.qual_wikidatatype_idx]
        entry["datatype"] = datatype

        datahash: str = qualifier_row[self.qual_datahash_idx]
        if datahash:
            entry["hash"] = KgtkFormat.unstringify(datahash)

        value: str = qualifier_row[self.qual_node2_idx]
        if value in ("somevalue", "novalue"):
            # Special values carry no datavalue, only a snak type.
            entry["snaktype"] = value
            return

        entry["datavalue"] = self.process_qual_datavalue(
            value, qualifier_row, datatype)
Exemple #4
0
    def add_sitelink(self, result: typing.MutableMapping[str, typing.Any],
                     edge_id: str,
                     qualifier_rows: typing.List[typing.List[str]]):
        """Build a sitelink entry from qualifier rows and store it in ``result``.

        Rows labelled ``site`` and ``title`` supply the site name and the
        (unstringified) title; every row labelled ``badge`` contributes one
        badge.  Rows with any other label are ignored, and a later ``site`` or
        ``title`` row overrides an earlier one.  The entry is stored as
        ``result["sitelinks"][site]``; the ``"sitelinks"`` mapping is created
        when absent.

        Raises:
            ValueError: If no non-empty site, or no non-empty title, was found.
        """
        sitelinks: typing.MutableMapping[str, typing.Mapping[str, typing.Union[
            str, typing.List[str]]]] = result.setdefault("sitelinks", dict())

        site: str = ""
        title: str = ""
        badges: typing.List[str] = []

        for row in qualifier_rows:
            label: str = row[self.qual_label_idx]
            if label == "site":
                site = row[self.qual_node2_idx]
            elif label == "title":
                title = KgtkFormat.unstringify(row[self.qual_node2_idx])
            elif label == "badge":
                badges.append(row[self.qual_node2_idx])

        if not site:
            # TODO: give a better error message.
            raise ValueError("Missing sitelink site for %s" % edge_id)

        if not title:
            # TODO: give a better error message.
            raise ValueError("Missing sitelink title for %s" % edge_id)

        sitelinks[site] = {"site": site, "title": title, "badges": badges}
Exemple #5
0
 def reformat_value_for_json(self, value: str)->typing.Union[str, int, float, bool]:
     """Convert a KGTK value into a Python value suitable for JSON output.

     * Strings and language-qualified strings are unstringified (the
       language code is lost).
     * ``KgtkFormat.TRUE_SYMBOL`` / ``KgtkFormat.FALSE_SYMBOL`` become
       ``True`` / ``False``.
     * Optionally signed runs of decimal digits become ``int``.
     * Anything else is returned unchanged as a string.
     """
     # TODO: Complain if the value is a KGTK List.
     if value.startswith((KgtkFormat.STRING_SIGIL, KgtkFormat.LANGUAGE_QUALIFIED_STRING_SIGIL)):
         # What if the value is a list? unstringify(...) will be
         # unhappy.  The following hack protects strings (but not
         # language-qualified strings) against errors, introducing
         # an ambiguity when exporting lists:
         value = value.replace('"|"', '|')
         return KgtkFormat.unstringify(value) # Lose the language code.
     elif value == KgtkFormat.TRUE_SYMBOL:
         return True
     elif value == KgtkFormat.FALSE_SYMBOL:
         return False
     # str.isdecimal() is used rather than str.isdigit(): isdigit() also
     # accepts characters such as superscript digits, which int() rejects
     # with a ValueError.
     elif value.isdecimal():
         return int(value)
     elif value.startswith(("+", "-")) and value[1:].isdecimal():
         return int(value)
     else:
         # TODO: process floating point numbers.
         # TODO: process datetimes
         # TODO: process geolocations
         return value
Exemple #6
0
    def process_row(
        self,
        node_id: str,
        node_property: str,
        node_value: str,
        each_node_attributes: EACH_NODE_ATTRIBUTES,
    ):
        """Fold one (node, property, value) row into ``each_node_attributes``.

        A value that starts with a single or double quote is unstringified
        first; a row whose value is then empty is logged and skipped.

        When ``node_property`` is a key of ``self.properties_reversed``, the
        value is mapped through ``self.get_real_label_name`` and added to the
        attribute collection of each role listed for that property.  The
        ``PROPERTY_VALUES`` role is special: it receives the text
        "<property label> <value label>", after which both ``PROPERTY_VALUES``
        and ``HAS_PROPERTIES`` are removed from the roles still to be handled.

        Otherwise, when ``self.add_all_properties`` is set, the property's
        label is appended to the ``HAS_PROPERTIES`` list.

        Raises:
            ValueError: If ``each_node_attributes`` is ``None`` when the
                ``PROPERTY_VALUES`` role is processed, or if a targeted
                attribute collection does not have the expected list/set type.
        """
        if self.very_verbose:
            print(f"Processing row ({node_id!r}, {node_property!r}, {node_value!r})",
                  file=self.error_file,
                  flush=True)

        # CMR: an earlier version truncated the value at the first "@" to
        # drop language codes, which would also have wiped out location
        # coordinates, and stripped outer quotes by hand.
        # KgtkFormat.unstringify(...) is preferred because it also removes
        # the escapes from internal double or single quotes.
        if node_value.startswith(("'", '"')):
            node_value = KgtkFormat.unstringify(node_value)

        # Nothing useful can be recorded for an empty value.
        if node_value == "":
            self._logger.warning(
                f"Skip line ({node_id}, {node_property}, {node_value}) because of empty value.")
            return

        if self.very_verbose:
            print(f"Revised node_value = {node_value!r}",
                  file=self.error_file,
                  flush=True)

        if node_property not in self.properties_reversed:
            # Properties without a role are only recorded on request.
            if self.add_all_properties:
                if self.very_verbose:
                    print("self.add_all_properties is True",
                          file=self.error_file,
                          flush=True)
                has_properties: Lexicalize.ATTRIBUTE_TYPES = each_node_attributes[
                    self.HAS_PROPERTIES]
                if not isinstance(has_properties, list):
                    raise ValueError(
                        'each_node_attributes["has_properties"] is not a list.')
                has_properties.append(self.get_real_label_name(node_property))
                if self.very_verbose:
                    print(f"has_properties: {has_properties!r}",
                          file=self.error_file,
                          flush=True)
            return

        if self.very_verbose:
            print(f"node_property {node_property!r} is in self.properties_reversed",
                  file=self.error_file,
                  flush=True)

        # Work on a copy so the discards below do not touch the shared table.
        roles = self.properties_reversed[node_property].copy()
        node_value = self.get_real_label_name(node_value)
        if self.very_verbose:
            print(f"node_value label = {node_value!r}",
                  file=self.error_file,
                  flush=True)

        # Property values are saved to the isa-properties part.
        if self.PROPERTY_VALUES in roles:
            if self.very_verbose:
                print("property_values is in roles",
                      file=self.error_file,
                      flush=True)

            # The property-values entry has the form "{property} {value}".
            property_label = self.get_real_label_name(node_property)
            value_label = self.get_real_label_name(node_value)
            node_value_combine = property_label + " " + value_label
            if self.very_verbose:
                print(f"node_value_combine = {node_value_combine!r}",
                      file=self.error_file,
                      flush=True)

            if each_node_attributes is None:
                raise ValueError("each_node_attributes is missing")

            property_values: typing.Optional[
                Lexicalize.ATTRIBUTE_TYPES] = each_node_attributes[
                    self.PROPERTY_VALUES]
            if not isinstance(property_values, list):
                raise ValueError(
                    'each_node_attributes["property_values"] is not a list.'
                )
            property_values.append(node_value_combine)
            if self.very_verbose:
                print(f'each_node_attributes["property_values"] = {property_values!r}',
                      file=self.error_file,
                      flush=True)

            # Drop both roles so this node is not recorded a second time below.
            roles.discard(self.PROPERTY_VALUES)
            roles.discard(self.HAS_PROPERTIES)

        for each_role in roles:
            attrs: Lexicalize.ATTRIBUTE_TYPES = each_node_attributes[each_role]
            if isinstance(attrs, set):
                attrs.add(node_value)
            elif isinstance(attrs, list):
                attrs.append(node_value)
            else:
                raise ValueError(
                    f'each_node_attributes[{each_role!r}] is not a list or set.')
            if self.very_verbose:
                print(f"{each_role}: {attrs!r}",
                      file=self.error_file,
                      flush=True)
Exemple #7
0
    def process_qual_datavalue(self, value: str, qual_row: typing.List[str],
                               datatype: str):
        """Build the ``datavalue`` mapping for one qualifier value.

        The result has a ``"type"`` entry (read from the row's value-type
        column) and a ``"value"`` mapping whose contents depend on the kind
        of ``value``:

        * a non-empty entity-type column: ``entity-type``, ``id`` and
          ``numeric-id``;
        * number / quantity: ``amount``, ``unit`` and optional bounds;
        * language-qualified string: ``text`` and ``language``;
        * string: ``type`` and ``value``;
        * date-and-times: ``time``, ``timezone``, ``before``, ``after``,
          ``precision`` and ``calendarmodel``;
        * location coordinates: ``latitude``, ``longitude``, ``altitude``,
          ``precision`` and ``globe``;
        * anything else: the symbol converted to a string.

        Args:
            value: The KGTK value from the qualifier's node2 column.
            qual_row: The full qualifier row, used for auxiliary columns.
            datatype: Accepted but not used by this method.

        Returns:
            The ``datavalue`` mapping described above.

        Raises:
            ValueError: If the parsed value lacks a field its kind requires,
                if a date-and-times value is not in Z-time, if the numeric
                part of an entity id is not an integer, or if the precision
                column of a location is not a float.
        """
        datavalue: typing.MutableMapping[str, typing.Union[str, typing.Mapping[
            str, typing.Optional[typing.Union[str, int, float]]]]] = dict()
        datavalue["type"] = qual_row[self.qual_val_type_idx]

        valuemap: typing.MutableMapping[str, typing.Optional[typing.Union[
            str, int, float]]] = dict()
        datavalue["value"] = valuemap

        entity_type: str = qual_row[self.qual_entity_type_idx]
        if len(entity_type) > 0:
            valuemap["entity-type"] = entity_type
            valuemap["id"] = value

            # Drop the leading letter and anything from the first "-" on.
            # TODO: Is this the right thing to do for Q16097-F1?
            numeric_id: str = value[1:]
            if "-" in numeric_id:
                numeric_id = numeric_id[:numeric_id.index("-")]
            valuemap["numeric-id"] = int(numeric_id)
            return datavalue

        kv = KgtkValue(value,
                       options=self.value_options,
                       parse_fields=True,
                       error_file=self.error_file,
                       verbose=self.verbose)
        if not kv.validate():
            # An invalid value is reported but processing continues.
            # raise ValueError("Invalid KGTK value '%s'" % value)
            print("Warning: Invalid KGTK value '%s'" % value,
                  file=self.error_file,
                  flush=True)
        if kv.fields is None:
            raise ValueError("KGTK value %s is missing fields." % value)

        if kv.is_number():
            if kv.fields.numberstr is None:
                raise ValueError("number is missing numberstr for %s." % value)

            valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign
            valuemap["unit"] = "1"
            return datavalue

        if kv.is_quantity():
            if kv.fields.numberstr is None:
                raise ValueError("quantity is missing numberstr for %s." %
                                 value)
            valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign

            if kv.fields.units_node is None:
                # TODO: Research this further.  Why did we get here?  Is it because import_wikidata
                # dropped the units?
                #
                # raise ValueError("quantity is missing units_node for %s in: %s" % (value, " ".join(qual_row)))
                valuemap["unit"] = "undefined"
            else:
                valuemap[
                    "unit"] = "http://www.wikidata.org/entity/" + kv.fields.units_node

            if kv.fields.low_tolerancestr is not None and len(
                    kv.fields.low_tolerancestr) > 0:
                valuemap[
                    "lowerBound"] = kv.fields.low_tolerancestr  # TODO: add plus sign

            if kv.fields.high_tolerancestr is not None and len(
                    kv.fields.high_tolerancestr) > 0:
                # NOTE(review): Wikidata's JSON appears to name this key
                # "upperBound" -- confirm before relying on "higherBound".
                valuemap[
                    "higherBound"] = kv.fields.high_tolerancestr  # TODO: add plus sign
            return datavalue

        if kv.is_language_qualified_string():
            text: str
            language: str
            language_suffix: str
            text, language, language_suffix = KgtkFormat.destringify(
                value)  # TODO: KgtkValue should do this to text
            language += language_suffix
            valuemap["text"] = text
            valuemap["language"] = language
            return datavalue

        if kv.is_string():
            valuemap["type"] = "string"
            valuemap["value"] = KgtkFormat.unstringify(
                value)  # TODO: KgtkValue should do this to text
            return datavalue

        if kv.is_date_and_times():
            if kv.fields.zonestr is None:
                raise ValueError("timezone is missing from %s." % value)
            if kv.fields.zonestr != "Z":
                raise ValueError("Only Z-time is supported, error in %s." %
                                 value)

            if kv.fields.date_and_time is None:
                raise ValueError("date_and_time is missing from %s." % value)
            valuemap["time"] = kv.fields.date_and_time
            valuemap["timezone"] = 0
            valuemap["before"] = 0
            valuemap["after"] = 0

            if kv.fields.precision is None:
                raise ValueError(
                    "date_and_time precision is missing from %s." % value)
            valuemap["precision"] = kv.fields.precision

            valuemap[
                "calendarmodel"] = "http://www.wikidata.org/entity/" + qual_row[
                    self.qual_calendar_idx]
            return datavalue

        if kv.is_location_coordinates():
            if kv.fields.latitude is None:
                raise ValueError("latitude is missing from %s" % value)
            valuemap["latitude"] = kv.fields.latitude

            if kv.fields.longitude is None:
                raise ValueError("longitude is missing from %s" % value)
            valuemap["longitude"] = kv.fields.longitude

            # The key was previously misspelled "altitide".
            valuemap["altitude"] = None  # deprecated

            valuemap["precision"] = float(qual_row[self.qual_precision_idx])

            valuemap["globe"] = "http://www.wikidata.org/entity/Q2"
            return datavalue

        # Default: convert the symbol to a string.
        valuemap["type"] = "string"
        valuemap["value"] = KgtkFormat.unstringify(
            '"' + value + '"')  # TODO: KgtkValue should do this to text
        return datavalue
Exemple #8
0
    def process_edge_datavalue(self, value: str, edge_row: typing.List[str],
                               datatype: str):
        """Build the ``datavalue`` mapping for one edge value.

        The result has a ``"type"`` entry (read from the row's value-type
        column) and a ``"value"`` mapping whose contents depend on the kind
        of ``value``:

        * a non-empty entity-type column: ``entity-type``, ``id`` and
          ``numeric-id``;
        * number / quantity: ``amount``, ``unit`` and optional bounds;
        * language-qualified string: ``text`` and ``language``;
        * string: ``type`` and ``value``;
        * date-and-times: ``time``, ``timezone``, ``before``, ``after``,
          ``precision`` and ``calendarmodel``;
        * location coordinates: ``latitude``, ``longitude``, ``altitude``,
          ``globe`` and, when the precision column parses as a float,
          ``precision``;
        * anything else: the symbol converted to a string.

        Args:
            value: The KGTK value from the edge's node2 column.
            edge_row: The full edge row, used for auxiliary columns.
            datatype: Accepted but not used by this method.

        Returns:
            The ``datavalue`` mapping described above.

        Raises:
            ValueError: If the parsed value lacks a field its kind requires,
                if a date-and-times value is not in Z-time, or if the numeric
                part of an entity id is not an integer.
        """
        datavalue: typing.MutableMapping[str, typing.Union[str, typing.Mapping[
            str, typing.Optional[typing.Union[str, int, float]]]]] = dict()
        datavalue["type"] = edge_row[self.edge_val_type_idx]

        valuemap: typing.MutableMapping[str, typing.Optional[typing.Union[
            str, int, float]]] = dict()
        datavalue["value"] = valuemap

        entity_type: str = edge_row[self.edge_entity_type_idx]
        if len(entity_type) > 0:
            valuemap["entity-type"] = entity_type
            valuemap["id"] = value

            # Drop the leading letter and anything from the first "-" on.
            # TODO: Is this the right thing to do?
            numeric_id: str = value[1:]
            if "-" in numeric_id:
                numeric_id = numeric_id[:numeric_id.index("-")]
            valuemap["numeric-id"] = int(numeric_id)
            return datavalue

        kv = KgtkValue(value,
                       options=self.value_options,
                       parse_fields=True,
                       error_file=self.error_file,
                       verbose=self.verbose)
        if not kv.validate():
            # An invalid value is reported but processing continues.
            # raise ValueError("Invalid KGTK value '%s'" % value)
            print("Warning: Invalid KGTK value '%s'" % value,
                  file=self.error_file,
                  flush=True)
        if kv.fields is None:
            raise ValueError("KGTK value '%s' is missing fields." % value)

        if kv.is_number():
            if kv.fields.numberstr is None:
                raise ValueError("number is missing numberstr.")

            valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign
            valuemap["unit"] = "1"
            return datavalue

        if kv.is_quantity():
            if kv.fields.numberstr is None:
                raise ValueError("quantity is missing numberstr.")
            valuemap["amount"] = kv.fields.numberstr  # TODO: add plus sign

            if kv.fields.units_node is None:
                # TODO: research this further.
                #
                # raise ValueError("quantity is missing units_node for %s." % value)
                # The key was previously misspelled "init", which left the
                # quantity without any "unit" entry.
                valuemap["unit"] = "undefined"
            else:
                valuemap[
                    "unit"] = "http://www.wikidata.org/entity/" + kv.fields.units_node

            if kv.fields.low_tolerancestr is not None and len(
                    kv.fields.low_tolerancestr) > 0:
                valuemap[
                    "lowerBound"] = kv.fields.low_tolerancestr  # TODO: add plus sign

            if kv.fields.high_tolerancestr is not None and len(
                    kv.fields.high_tolerancestr) > 0:
                # NOTE(review): Wikidata's JSON appears to name this key
                # "upperBound" -- confirm before relying on "higherBound".
                valuemap[
                    "higherBound"] = kv.fields.high_tolerancestr  # TODO: add plus sign
            return datavalue

        if kv.is_language_qualified_string():
            text: str
            language: str
            language_suffix: str
            text, language, language_suffix = KgtkFormat.destringify(
                value)  # TODO: KgtkValue should do this to text
            language += language_suffix
            valuemap["text"] = text
            valuemap["language"] = language
            return datavalue

        if kv.is_string():
            valuemap["type"] = "string"
            valuemap["value"] = KgtkFormat.unstringify(
                value)  # TODO: KgtkValue should do this to text
            return datavalue

        if kv.is_date_and_times():
            if kv.fields.zonestr is None:
                raise ValueError("timezone is missing.")
            if kv.fields.zonestr != "Z":
                raise ValueError("Only Z-time is supported.")

            if kv.fields.date_and_time is None:
                raise ValueError("date_and_time is missing.")
            valuemap["time"] = kv.fields.date_and_time
            valuemap["timezone"] = 0
            valuemap["before"] = 0
            valuemap["after"] = 0

            if kv.fields.precision is None:
                raise ValueError("date_and_time precision is missing.")
            valuemap["precision"] = kv.fields.precision

            valuemap[
                "calendarmodel"] = "http://www.wikidata.org/entity/" + edge_row[
                    self.edge_calendar_idx]
            return datavalue

        # The method must be *called*: the bare bound method is always truthy,
        # which sent every remaining value down this branch and made the
        # default case below unreachable.
        if kv.is_location_coordinates():
            if kv.fields.latitude is None:
                raise ValueError("latitude is missing")
            valuemap["latitude"] = kv.fields.latitude

            if kv.fields.longitude is None:
                raise ValueError("longitude is missing")
            valuemap["longitude"] = kv.fields.longitude

            # The key was previously misspelled "altitide".
            valuemap["altitude"] = None  # deprecated

            # TODO: Validate that it's OK to have location coordinates without precision.
            precision: str = edge_row[self.edge_precision_idx]
            if len(precision) > 0:
                try:
                    valuemap["precision"] = float(precision)
                except ValueError:
                    # Best effort: report and emit the location without it.
                    print("Invalid precision '%s'" % precision,
                          file=self.error_file,
                          flush=True)

            valuemap["globe"] = "http://www.wikidata.org/entity/Q2"
            return datavalue

        # Default: convert the symbol to a string.  The symbol is wrapped in
        # double quotes first, as process_qual_datavalue does, so that
        # unstringify receives a KGTK string rather than a bare symbol.
        valuemap["type"] = "string"
        valuemap["value"] = KgtkFormat.unstringify(
            '"' + value + '"')  # TODO: KgtkValue should do this to text
        return datavalue
Exemple #9
0
def run(input_file: KGTKFiles, output_file: KGTKFiles):
    """Convert a CSV file of events and their annotations into KGTK edges.

    The CSV is read with its first column as the index (the event text).  The
    first nine data columns are decoded as JSON, the last two columns are
    dropped, and every remaining cell is expected to hold a list of values.
    One edge is written per (event, column, value) triple, skipping values
    equal to ``'none'``.  Node and relation identifiers are prefixed with
    ``at:`` and the source column is the KGTK string ``AT``.

    Args:
        input_file: Resolved to the CSV path via ``KGTKArgumentParser``.
        output_file: Resolved to the KGTK output path via
            ``KGTKArgumentParser``.

    Raises:
        KGTKException: Wraps any exception raised while converting; the
            original exception is chained as the cause.
    """

    # import modules locally
    import json
    from pathlib import Path

    import pandas as pd

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.kgtkformat import KgtkFormat

    # Human-readable label for each relation column.
    relation_labels = {
        'xAttr': 'person x has attribute',
        'oAttr': 'others have attribute',
        'xReact': 'person x feels',
        'oReact': 'others feel',
        'xIntent': 'person x wants',
        'xWant': 'person x wants',
        'oWant': 'others want',
        'xNeed': 'person x needs',
        'xEffect': 'effect on person x',
        'oEffect': 'the effect on others'
    }

    def make_node(x):
        """Return the node identifier for a label: 'at:' plus underscored text."""
        und_x = x.replace(' ', '_')
        pref_und_x = 'at:%s' % und_x
        return pref_und_x

    def remove_people_mentions(event):
        """Strip person placeholders and blanks from a lower-cased event."""
        # The replacements are order-sensitive; keep them in this sequence.
        e = event.replace('personx', '').strip()
        e = e.replace('persony', '').strip()
        e = e.replace('person x', '').strip()
        e = e.replace('person y', '').strip()
        e = e.replace('the ___', '')
        e = e.replace('___', '')
        e = e.replace("'s", '')
        e = e.replace('to y', '')
        return e.strip()

    def produce_node_labels(event):
        """Return the KGTK label(s) for an event, '|'-joined when there are two.

        The first label is the lower-cased text without a trailing period;
        the second, when non-empty and different, is the same text with
        people mentions removed and runs of spaces collapsed.
        """
        if '\t' in event:
            event = event.split('\t')[0]
        e1 = event.lower()
        e1 = e1.rstrip('.').strip()
        e2 = remove_people_mentions(e1)
        while '  ' in e2:
            e2 = e2.replace('  ', ' ')
        if e1 != e2 and e2:
            return '|'.join(
                [KgtkFormat.stringify(e1),
                 KgtkFormat.stringify(e2)])
        else:
            return KgtkFormat.stringify(e1)

    def produce_rel_label(rel):
        """Return the KGTK string label for a relation column name."""
        return KgtkFormat.stringify(relation_labels[rel])

    def first_label_text(label):
        """Return the unstringified text of the first '|'-separated label."""
        return KgtkFormat.unstringify(label.split('|')[0])

    try:

        filename: Path = KGTKArgumentParser.get_input_file(input_file)

        out_columns = [
            'node1', 'relation', 'node2', 'node1;label', 'node2;label',
            'relation;label', 'relation;dimension', 'source', 'sentence'
        ]

        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(
            output_file)
        ew: KgtkWriter = KgtkWriter.open(
            out_columns,
            output_kgtk_file,
            #mode=input_kr.mode,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            gzip_in_parallel=False,
            #verbose=self.verbose,
            #very_verbose=self.very_verbose
        )

        # Close the writer even when reading or converting fails part-way.
        try:
            df = pd.read_csv(filename, index_col=0)
            df.iloc[:, :9] = df.iloc[:, :9].apply(
                lambda col: col.apply(json.loads))

            # Drop the last two columns, one at a time.
            df.drop(df.columns[len(df.columns) - 1], axis=1, inplace=True)
            df.drop(df.columns[len(df.columns) - 1], axis=1, inplace=True)

            for event, row in df.iterrows():
                event_label = produce_node_labels(event)
                n1 = make_node(first_label_text(event_label))

                for c in df.columns:
                    for v in row[c]:
                        if v == 'none':
                            continue
                        value_label = produce_node_labels(v)
                        n2 = make_node(first_label_text(value_label))

                        rel_label = produce_rel_label(c)

                        sentence = ''

                        relation = make_node(c)

                        this_row = [
                            n1, relation, n2, event_label, value_label,
                            rel_label, '',
                            KgtkFormat.stringify('AT'), sentence
                        ]
                        ew.write(this_row)
        finally:
            # Clean up.
            ew.close()

    except Exception as e:
        raise KGTKException('Error: ' + str(e)) from e