Exemple #1
0
    def set_tasks_to_datatable(self, dim, task):
        """
        Append filter transforms for ``task`` to every ``hconcat`` view.

        For each entry of ``self.vl_spec["hconcat"]`` and each attribute in
        ``task['attributes']``, one ``{'filter': ...}`` transform is appended
        to that entry's ``'transform'`` list. A task whose ``task["task"]`` is
        not ``'filter'`` leaves the spec untouched.

        :param dim: accepted for signature parity; not read by this method.
        :param task: dict with ``"task"``, ``"operator"``, ``"attributes"``
            and ``"values"`` keys (the latter three only read for filters).
        """

        # Builders for the operators that map onto a structured predicate.
        # Each call returns a brand-new dict so views never share transforms.
        def one_of(field_name):
            return {"field": field_name, "oneOf": task["values"]}

        def within(field_name):
            return {"field": field_name, "range": task["values"]}

        def outside(field_name):
            return {"not": {"field": field_name, "range": task["values"]}}

        structured = (('IN', one_of), ('RANGE', within), ('NOT RANGE', outside))

        for view in self.vl_spec["hconcat"]:
            if task["task"] != 'filter':
                continue

            operator = task["operator"]
            build = next(
                (maker for key, maker in structured if operator == key), None)

            for field_name in task['attributes']:
                if build is not None:
                    view['transform'].append({'filter': build(field_name)})
                    continue

                # Any other operator becomes an expression-string filter that
                # compares against the first value only.
                symbol = constants.operator_symbol_mapping[task["operator"]]
                first_value = task["values"][0]
                if helpers.isfloat(first_value) or helpers.isint(first_value):
                    expression = 'lower(datum["{}"]) {} {}'.format(
                        field_name, symbol, first_value)
                    view['transform'].append({'filter': expression})
                # NOTE(review): the result of helpers.isdate is used as a
                # plain truth value here; if it returns a tuple this branch
                # is always taken for non-numeric values -- confirm.
                elif helpers.isdate(first_value):
                    expression = 'lower(datum["{}"]) {} "{}"'.format(
                        field_name, symbol, first_value)
                    view['transform'].append({'filter': expression})
Exemple #2
0
    def is_datatype_ambiguous(self, attributes, task, values):
        """
        Tell whether any attribute's stored datatype clashes with the task.

        The expected datatype is looked up in ``constants.attribute_types``:
        ``"QUANTITATIVE"`` for a ``"filter"`` task whose first value passes
        ``helpers.isfloat``, and for ``"derived_value"`` / ``"find_extremum"``
        tasks; ``"TEMPORAL"`` for ``"trend"`` tasks. Any other task is never
        ambiguous.

        :param attributes: attribute names, used as keys of the data
            attribute map held by the data genie instance.
        :param task: task name string.
        :param values: task values; only ``values[0]`` is inspected, and only
            for ``"filter"`` tasks.
        :return: ``True`` if at least one attribute's ``"dataType"`` differs
            from the expected one, else ``False``.
        """

        def clashes_with(type_key):
            # Lookups stay inside the generator so nothing is evaluated when
            # ``attributes`` is empty.
            return any(
                self.nl4dv_instance.data_genie_instance.
                data_attribute_map[name]["dataType"] !=
                constants.attribute_types[type_key] for name in attributes)

        if task == "filter" and len(values) > 0 and helpers.isfloat(values[0]):
            return clashes_with("QUANTITATIVE")
        if task in ("derived_value", "find_extremum"):
            return clashes_with("QUANTITATIVE")
        if task in ("trend", ):
            return clashes_with("TEMPORAL")
        return False
Exemple #3
0
    def set_data(self, data_url=None):
        # type: (str) -> bool
        """
        Load a local CSV/TSV/JSON file and rebuild the dataset metadata.

        Stores ``data_url`` on ``self.nl4dv_instance``, resets ``self.data``,
        ``self.rows`` and ``self.data_attribute_map``, reads the file, labels
        every value of every attribute with a datatype and finally settles on
        one datatype per attribute by majority vote.

        :param data_url: path to a local ``.csv``, ``.tsv`` or ``.json`` file
            (the extension check is case-insensitive). JSON files are expected
            to hold a list of records.
        :return: ``True`` when the path is an existing file with a supported
            extension, ``False`` otherwise (state is left reset and empty).
        """
        self.nl4dv_instance.data_url = data_url

        # initialize values
        self.data_attribute_map = dict()
        self.data = list()
        self.rows = 0

        path = self.nl4dv_instance.data_url
        if path is None or not os.path.isfile(path):
            return False

        lowered = path.lower()
        if lowered.endswith('.csv'):
            delimiter = ','
        elif lowered.endswith('.tsv'):
            delimiter = '\t'
        elif lowered.endswith('.json'):
            delimiter = None
        else:
            # Unsupported extension: report failure instead of falling into
            # the JSON branch with nothing loaded.
            return False

        attributes = list()
        if delimiter is not None:
            # newline='' is what the csv module asks for so that quoted
            # fields containing line breaks are parsed correctly.
            with open(path, 'r', encoding='utf-8', newline='') as handle:
                reader = csv.reader(handle, delimiter=delimiter)
                # assumes headers are in the first line; empty file -> none
                attributes = next(reader, [])
                for line in reader:
                    # zip() ignores cells beyond the header; columns whose
                    # name is empty or just whitespace are not considered.
                    self.data.append({
                        name: cell
                        for name, cell in zip(attributes, line)
                        if name and name.strip()
                    })
        else:
            with open(path, 'r', encoding='utf-8') as handle:
                json_data = json.load(handle)
            if json_data:
                attributes = json_data[0].keys()
            self.data.extend(json_data)
        self.rows = len(self.data)

        # initialize properties in Attribute Map
        for attr in attributes:
            # Don't consider attribute names that are empty or just whitespaces
            if attr and attr.strip():
                self.data_attribute_map[attr] = {
                    'domain': set(),
                    'isLabelAttribute':
                    attr == self.nl4dv_instance.label_attribute,
                    'summary': dict(),
                    'dataTypeList': list(),  # temporary to determine datatype
                    'dataType': '',
                    'aliases': list(),
                }

        # infer attribute datatypes and compute summary (range, domain)
        for datum in self.data:
            for attr in self.data_attribute_map:
                attr_val = datum[attr]

                # Numeric is tested before Datetime on purpose: the original
                # authors flag swapping the two checks as VERY risky.
                # ToDo:- Epochs and Int-only Years (e.g. 2018) are therefore
                # captured as numeric rather than temporal.
                if helpers.isfloat(attr_val) or helpers.isint(attr_val):
                    attr_datatype = constants.attribute_types['QUANTITATIVE']
                elif helpers.isdate(attr_val)[0]:
                    attr_datatype = constants.attribute_types['TEMPORAL']
                else:
                    attr_datatype = constants.attribute_types['NOMINAL']
                self.populate_dataset_meta(attr, attr_val, attr_datatype)

                # Keep the per-row type so heterogeneous columns can be
                # resolved by majority below.
                self.data_attribute_map[attr]['dataTypeList'].append(
                    attr_datatype)

        # Determine the Datatype based on majority of values, then apply a
        # few overriding heuristics.
        for attr in self.data_attribute_map:
            observed = self.data_attribute_map[attr]['dataTypeList']
            if not observed:
                # No rows at all (e.g. header-only file): nothing to vote on,
                # so the attribute keeps its empty 'dataType'.
                continue

            # most common attribute type
            attr_datatype = Counter(observed).most_common(1)[0][0]
            domain_size = len(self.data_attribute_map[attr]['domain'])

            # if it's quantitative but with less than or equal to 12 unique values, then it's ordinal.
            # eg. 1, 2, 3, ..., 12 (months of a year)
            # eg. -3, -2, -1, 0, 1, 2, 3 (likert ratings)
            if attr_datatype == constants.attribute_types[
                    'QUANTITATIVE'] and domain_size <= 12:
                attr_datatype = constants.attribute_types['ORDINAL']
                self.populate_dataset_meta_for_attr(attr, attr_datatype)

            # If an attribute has (almost) no repeating value, then mark it as the label attribute.
            # eg. primary/unique key of the table? Car1 , Car2, Car3, ...
            # Almost == 90% heuristic-based
            if attr_datatype == constants.attribute_types['NOMINAL'] and len(
                    self.data_attribute_map[attr]['domain']) > 0.9 * self.rows:
                self.nl4dv_instance.label_attribute = attr
                self.data_attribute_map[attr]['isLabelAttribute'] = True

            # Set the final data type
            self.data_attribute_map[attr]['dataType'] = attr_datatype

            # Presentation
            self.prepare_output(attr, attr_datatype)

        return True
Exemple #4
0
    def set_task(self, dim, task):
        """
        Encode an analytic task into ``self.vl_spec``.

        * ``'find_extremum'``: sorts the channel opposite to ``dim`` by
          ``dim`` -- ascending for ``'MIN'``, descending (``'-'`` prefix) for
          ``'MAX'`` -- provided that opposite channel exists in
          ``self.vl_spec['encoding']``. ``dim`` defaults to ``'y'``.
        * ``'filter'``: appends one ``{'filter': ...}`` entry per attribute to
          ``self.vl_spec['transform']``.

        Any other task leaves the spec unchanged.

        :param dim: ``'x'``, ``'y'`` or ``None``; only read for
            ``'find_extremum'``.
        :param task: dict with ``"task"`` and ``"operator"`` keys, plus
            ``"attributes"`` / ``"values"`` for filters.
        """
        kind = task["task"]

        if kind == 'find_extremum':
            axis = 'y' if dim is None else dim

            operator = task["operator"]
            if operator == 'MIN':
                direction = ''
            elif operator == 'MAX':
                direction = '-'
            else:
                return

            if axis == 'x':
                target, order = 'y', direction + 'x'
            elif axis == 'y':
                target, order = 'x', direction + 'y'
            else:
                return

            if target in self.vl_spec['encoding']:
                self.vl_spec['encoding'][target]['sort'] = order

        elif kind == 'filter':
            operator = task["operator"]
            for field_name in task['attributes']:
                if operator == 'IN':
                    self.vl_spec['transform'].append({
                        'filter': {
                            "field": field_name,
                            "oneOf": task["values"],
                        }
                    })
                elif operator == 'RANGE':
                    self.vl_spec['transform'].append({
                        "filter": {
                            "field": field_name,
                            "range": task["values"],
                        }
                    })
                elif operator == 'NOT RANGE':
                    self.vl_spec['transform'].append({
                        "filter": {
                            "not": {
                                "field": field_name,
                                "range": task["values"],
                            }
                        }
                    })
                else:
                    # Remaining operators are written as an expression string
                    # comparing against the first value only.
                    symbol = constants.operator_symbol_mapping[
                        task["operator"]]
                    first_value = task["values"][0]
                    if helpers.isfloat(first_value) or helpers.isint(
                            first_value):
                        template = 'lower(datum["{}"]) {} {}'
                    # NOTE(review): helpers.isdate's result is used as a
                    # plain truth value; if it returns a tuple this is always
                    # truthy -- confirm.
                    elif helpers.isdate(first_value):
                        template = 'lower(datum["{}"]) {} "{}"'
                    else:
                        continue
                    self.vl_spec['transform'].append({
                        'filter':
                        template.format(field_name, symbol, first_value)
                    })
Exemple #5
0
    def set_data(self, data_url=None, data_value=None):
        # type: (str, object) -> None
        """
        (Re)load the dataset and rebuild per-attribute metadata.

        Arguments that are ``None`` fall back to whatever is already stored on
        ``self.nl4dv_instance``. A data URL, when available, takes precedence
        over a data value.

        NOTE(review): because of that precedence, a previously stored
        ``data_url`` shadows a ``data_value`` passed later -- confirm this is
        intended.

        :param data_url: local path or URL ending in ``.csv``, ``.tsv`` or
            ``.json`` (case-insensitive); read through pandas.
        :param data_value: a ``pandas.DataFrame``, a list of record dicts, or
            a dict accepted by the ``pandas.DataFrame`` constructor.
        """
        self.nl4dv_instance.data_url = data_url if data_url is not None else self.nl4dv_instance.data_url
        self.nl4dv_instance.data_value = data_value if data_value is not None else self.nl4dv_instance.data_value

        # initialize values
        self.data_attribute_map = dict()
        self.data = list()
        self.rows = 0

        source_url = self.nl4dv_instance.data_url
        # Dispatch on the *stored* value, not on the raw argument: the
        # argument is None whenever the caller relies on the fallback above.
        source_value = self.nl4dv_instance.data_value

        if source_url is not None:
            # Possible Local FILE or HTTP URL
            lowered = source_url.lower()
            if lowered.endswith('.csv'):
                self.data = pd.read_csv(source_url, sep=',').to_dict('records')
            elif lowered.endswith('.tsv'):
                self.data = pd.read_csv(source_url,
                                        sep='\t').to_dict('records')
            elif lowered.endswith('.json'):
                self.data = pd.read_json(source_url).to_dict('records')

        elif source_value is not None:
            if isinstance(source_value, pd.DataFrame):
                self.data = source_value.to_dict('records')
            elif isinstance(source_value, list):
                self.data = source_value
            elif isinstance(source_value, dict):
                self.data = pd.DataFrame(source_value).to_dict('records')

        # Set number of rows in the dataset
        self.rows = len(self.data)

        # initialize properties in Attribute Map (keys of the first record)
        if len(self.data) > 0:
            for attr in self.data[0].keys():
                # Don't consider attribute names that are empty or just whitespaces
                if attr and attr.strip():
                    self.data_attribute_map[attr] = {
                        'domain': set(),
                        'domainMeta': dict(),
                        'isLabelAttribute':
                        attr == self.nl4dv_instance.label_attribute,
                        'summary': dict(),
                        'dataTypeList':
                        list(),  # temporary to determine datatype
                        'dataType': '',
                        # Used e.g. for temporal attributes when they conform
                        # to a certain format
                        'dataTypeMeta': dict(),
                        'aliases': list(),
                    }

        # infer attribute datatypes, one vote per row and attribute
        for datum in self.data:
            for attr in self.data_attribute_map:
                value = datum[attr]

                # Check for Datetime first; the vote carries the id of the
                # date regex that matched (e.g. 'T-1', 'T-2').
                is_date, unformatted_date_obj = helpers.isdate(value)
                if is_date:
                    vote = constants.attribute_types['TEMPORAL'] + "-" + str(
                        unformatted_date_obj["regex_id"])
                # Check for Numeric (float, int)
                elif helpers.isfloat(value) or helpers.isint(value):
                    vote = constants.attribute_types['QUANTITATIVE']
                # Otherwise set as Nominal
                else:
                    vote = constants.attribute_types['NOMINAL']

                self.data_attribute_map[attr]['dataTypeList'].append(vote)

        # Determine the Datatype based on majority of values.
        plain_types = [
            constants.attribute_types['QUANTITATIVE'],
            constants.attribute_types['NOMINAL']
        ] if self.data_attribute_map else []
        for attr in self.data_attribute_map:
            meta = self.data_attribute_map[attr]

            # By default, set the attribute datatype to the most common vote
            attr_datatype = Counter(meta['dataTypeList']).most_common(1)[0][0]

            # Anything that is not Quantitative/Nominal is a Temporal vote of
            # the form 'T-n'; process it and strip the '-n'.
            if attr_datatype not in plain_types:

                # If there is at least one instance of 'T-2' (DD*MM*YY(YY)) in the `dataTypeList`,
                # set the regex_id to this, even if 'T-1' is the majority.
                if 'T-2' in meta['dataTypeList']:
                    attr_datatype = 'T-2'

                attr_datatype_split = attr_datatype.split("-")
                attr_datatype = attr_datatype_split[0]

                # Remember which date format the attribute conforms to
                meta['dataTypeMeta'] = {"regex_id": attr_datatype_split[1]}

                # Add a raw-data bucket to the domain's metadata. Only for Temporal Attributes.
                meta['domainMeta'].setdefault('raw', set())

            # Set the final data type
            meta['dataType'] = attr_datatype

            # Update the dataset metadata for each attribute
            self.populate_dataset_meta_for_attr(attr, attr_datatype)
Exemple #6
0
    def set_data(self, data_url=None, data_value=None):
        # type: (str, object) -> None
        """
        (Re)load the dataset and rebuild per-attribute metadata.

        Arguments that are ``None`` fall back to whatever is already stored on
        ``self.nl4dv_instance``. A data URL, when available, takes precedence
        over a data value.

        NOTE(review): because of that precedence, a previously stored
        ``data_url`` shadows a ``data_value`` passed later -- confirm this is
        intended.

        :param data_url: local path or URL ending in ``.csv``, ``.tsv`` or
            ``.json`` (case-insensitive); read through pandas.
        :param data_value: a ``pandas.DataFrame``, a list of record dicts, or
            a dict accepted by the ``pandas.DataFrame`` constructor.
        """
        self.nl4dv_instance.data_url = data_url if data_url is not None else self.nl4dv_instance.data_url
        self.nl4dv_instance.data_value = data_value if data_value is not None else self.nl4dv_instance.data_value

        # initialize values
        self.data_attribute_map = dict()
        self.data = list()
        self.rows = 0

        source_url = self.nl4dv_instance.data_url
        # Dispatch on the *stored* value, not on the raw argument: the
        # argument is None whenever the caller relies on the fallback above.
        source_value = self.nl4dv_instance.data_value

        if source_url is not None:
            # Possible Local FILE or HTTP URL
            lowered = source_url.lower()
            if lowered.endswith('.csv'):
                self.data = pd.read_csv(source_url, sep=',').to_dict('records')
            elif lowered.endswith('.tsv'):
                self.data = pd.read_csv(source_url,
                                        sep='\t').to_dict('records')
            elif lowered.endswith('.json'):
                self.data = pd.read_json(source_url).to_dict('records')

        elif source_value is not None:
            if isinstance(source_value, pd.DataFrame):
                self.data = source_value.to_dict('records')
            elif isinstance(source_value, list):
                self.data = source_value
            elif isinstance(source_value, dict):
                self.data = pd.DataFrame(source_value).to_dict('records')

        # Set number of rows in the dataset
        self.rows = len(self.data)

        # initialize properties in Attribute Map (keys of the first record)
        if len(self.data) > 0:
            for attr in self.data[0].keys():
                # Don't consider attribute names that are empty or just whitespaces
                if attr and attr.strip():
                    self.data_attribute_map[attr] = {
                        'domain': set(),
                        'isLabelAttribute':
                        attr == self.nl4dv_instance.label_attribute,
                        'summary': dict(),
                        'dataTypeList':
                        list(),  # temporary to determine datatype
                        'dataType': '',
                        'aliases': list(),
                    }

        # infer attribute datatypes and compute summary (range, domain)
        for datum in self.data:
            for attr in self.data_attribute_map:
                attr_val = datum[attr]

                # Numeric is tested before Datetime on purpose: the original
                # authors flag swapping the two checks as VERY risky.
                # ToDo:- Epochs and Int-only Years (e.g. 2018) are therefore
                # captured as numeric rather than temporal.
                if helpers.isfloat(attr_val) or helpers.isint(attr_val):
                    attr_datatype = constants.attribute_types['QUANTITATIVE']
                elif helpers.isdate(attr_val)[0]:
                    attr_datatype = constants.attribute_types['TEMPORAL']
                else:
                    attr_datatype = constants.attribute_types['NOMINAL']
                self.populate_dataset_meta(attr, attr_val, attr_datatype)

                # Keep the per-row type so heterogeneous columns can be
                # resolved by majority below.
                self.data_attribute_map[attr]['dataTypeList'].append(
                    attr_datatype)

        # Determine the Datatype based on majority of values.
        for attr in self.data_attribute_map:
            meta = self.data_attribute_map[attr]

            # most common attribute type
            attr_datatype = Counter(meta['dataTypeList']).most_common(1)[0][0]

            ## NOTE: For all practical purposes, let QUANTITATIVE be the determined Data Type.
            ## If an attribute is known to be ORDINAL, it can be set using the
            ## set_attribute_datatype() API (the former "<= 12 unique values
            ## means ordinal" heuristic is intentionally disabled).

            # If an attribute has (almost) no repeating value, then mark it as the label attribute.
            # eg. primary/unique key of the table? Car1 , Car2, Car3, ...
            # Almost == 90% heuristic-based
            if attr_datatype == constants.attribute_types['NOMINAL'] and len(
                    meta['domain']) > 0.9 * self.rows:
                self.nl4dv_instance.label_attribute = attr
                meta['isLabelAttribute'] = True

            # Set the final data type
            meta['dataType'] = attr_datatype

            # Presentation
            self.prepare_output(attr, attr_datatype)