def set_tasks_to_datatable(self, dim, task):
    for column in self.vl_spec["hconcat"]:
        if task["task"] == 'filter':
            if task["operator"] == 'IN':
                for attr in task['attributes']:
                    column['transform'].append({
                        'filter': {
                            "field": attr,
                            "oneOf": task["values"]
                        }
                    })
            elif task["operator"] == 'RANGE':
                for attr in task['attributes']:
                    column['transform'].append({
                        "filter": {
                            "field": attr,
                            "range": task["values"]
                        }
                    })
            elif task["operator"] == 'NOT RANGE':
                for attr in task['attributes']:
                    # self.vl_spec['transform'].append({"filter": {"field": attr, "gte": task["values"][1], "lte": task["values"][0]}})
                    column['transform'].append({
                        "filter": {
                            "not": {
                                "field": attr,
                                "range": task["values"]
                            }
                        }
                    })
            else:
                for attr in task['attributes']:
                    symbol = constants.operator_symbol_mapping[task["operator"]]
                    if helpers.isfloat(task["values"][0]) or helpers.isint(task["values"][0]):
                        column['transform'].append({
                            'filter': 'lower(datum["{}"]) {} {}'.format(attr, symbol, task["values"][0])
                        })
                    # helpers.isdate(...) returns a (bool, meta) tuple; test the flag, not the tuple
                    elif helpers.isdate(task["values"][0])[0]:
                        column['transform'].append({
                            'filter': 'lower(datum["{}"]) {} "{}"'.format(attr, symbol, task["values"][0])
                        })
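# Illustrative sketch (toy task dict, not from the library's test suite): for an
# IN-style filter task, every "hconcat" column picks up one Vega-Lite filter
# transform per attribute. The `task` shape below mirrors the keys this method reads.
#
#   task = {"task": "filter", "operator": "IN",
#           "attributes": ["Origin"], "values": ["USA", "Japan"]}
#   self.set_tasks_to_datatable(None, task)
#   # Each column's 'transform' list now ends with:
#   #   {"filter": {"field": "Origin", "oneOf": ["USA", "Japan"]}}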
def is_datatype_ambiguous(self, attributes, task, values):
    is_datatype_ambiguous = False
    if task == "filter" and len(values) > 0 and helpers.isfloat(values[0]):
        for a in attributes:
            if self.nl4dv_instance.data_genie_instance.data_attribute_map[a]["dataType"] != constants.attribute_types["QUANTITATIVE"]:
                is_datatype_ambiguous = True
                break
    elif task in ["derived_value", "find_extremum"]:
        for a in attributes:
            if self.nl4dv_instance.data_genie_instance.data_attribute_map[a]["dataType"] != constants.attribute_types["QUANTITATIVE"]:
                is_datatype_ambiguous = True
                break
    elif task in ["trend"]:
        for a in attributes:
            if self.nl4dv_instance.data_genie_instance.data_attribute_map[a]["dataType"] != constants.attribute_types["TEMPORAL"]:
                is_datatype_ambiguous = True
                break
    return is_datatype_ambiguous
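# Illustrative sketch (hypothetical attribute map and datatype codes, for
# intuition): the check flags a task whose attributes cannot support it, e.g. a
# numeric filter over a non-QUANTITATIVE attribute, or a trend over a
# non-TEMPORAL one.
#
#   # data_attribute_map == {"Model": {"dataType": "N"}, "MPG": {"dataType": "Q"}}
#   self.is_datatype_ambiguous(["Model"], "filter", ["25"])    # -> True
#   self.is_datatype_ambiguous(["MPG"], "find_extremum", [])   # -> False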
def set_data(self, data_url=None):
    # type: (str) -> bool
    """
    User can choose to manually initialize data
    """
    self.nl4dv_instance.data_url = data_url

    # initialize values
    self.data_attribute_map = dict()
    self.data = list()
    self.rows = 0

    if self.nl4dv_instance.data_url is not None and os.path.isfile(self.nl4dv_instance.data_url):
        # local variables
        reader = None
        json_data = None
        attributes = list()

        if self.nl4dv_instance.data_url.lower().endswith('.csv'):
            reader = csv.reader(open(self.nl4dv_instance.data_url, 'r', encoding='utf-8'), delimiter=',')
            attributes = next(reader)  # assumes headers are in the first line
        elif self.nl4dv_instance.data_url.lower().endswith('.tsv'):
            reader = csv.reader(open(self.nl4dv_instance.data_url, 'r', encoding='utf-8'), delimiter='\t')
            attributes = next(reader)  # assumes headers are in the first line
        elif self.nl4dv_instance.data_url.lower().endswith('.json'):
            json_data = json.load(open(self.nl4dv_instance.data_url, 'r', encoding='utf-8'))
            attributes = json_data[0].keys()

        # initialize properties in Attribute Map
        for attr in attributes:
            # Don't consider attribute names that are empty or just whitespaces
            if attr and attr.strip():
                self.data_attribute_map[attr] = {
                    'domain': set(),
                    'isLabelAttribute': attr == self.nl4dv_instance.label_attribute,
                    'summary': dict(),
                    'dataTypeList': list(),  # temporary to determine datatype
                    'dataType': '',
                    'aliases': list(),
                }

        # read the data rows
        # a non-None reader implies the file is either .csv or .tsv
        if reader is not None:
            for line in reader:
                data_obj = dict()
                for i in range(len(line)):
                    # Don't consider attribute names that are empty or just whitespaces
                    if attributes[i] and attributes[i].strip():
                        data_obj[attributes[i]] = line[i]
                self.data.append(data_obj)
                self.rows += 1
        else:
            # JSON file
            for data_obj in json_data:
                self.data.append(data_obj)
                self.rows += 1

        # infer attribute datatypes and compute summary (range, domain)
        for datum in self.data:
            for attr in self.data_attribute_map.keys():
                attr_val = datum[attr]

                # Check for Numeric (float, int)
                if helpers.isfloat(attr_val) or helpers.isint(attr_val):
                    attr_datatype = constants.attribute_types['QUANTITATIVE']
                    self.populate_dataset_meta(attr, attr_val, attr_datatype)

                # Check for Datetime
                # ToDo:- Works fine for datetime strings. Not for others like Epochs and Int-only Years (e.g. 2018) which get captured above.
                # ToDo:- It is VERY risky to switch this elif block with the if block above
                elif helpers.isdate(attr_val)[0]:
                    attr_datatype = constants.attribute_types['TEMPORAL']
                    self.populate_dataset_meta(attr, attr_val, attr_datatype)

                # Otherwise set as Nominal
                else:
                    attr_datatype = constants.attribute_types['NOMINAL']
                    self.populate_dataset_meta(attr, attr_val, attr_datatype)

                # Irrespective of the above assignment, keep a list of attribute types for each data row
                # to take the best decision on heterogeneous data with multiple datatypes
                self.data_attribute_map[attr]['dataTypeList'].append(attr_datatype)

        # Determine the datatype based on the majority of values.
        # Also override a few datatypes set above based on rules, such as NOMINAL to ORDINAL if all values are unique, e.g. Sr. 1, Sr. 2, ...
        for attr in self.data_attribute_map:
            # most common attribute type
            attr_datatype = Counter(self.data_attribute_map[attr]['dataTypeList']).most_common(1)[0][0]

            # If it's quantitative but has 12 or fewer unique values, then it's ordinal.
            # e.g. 1, 2, 3, ..., 12 (months of a year)
            # e.g. -3, -2, -1, 0, 1, 2, 3 (Likert ratings)
            if attr_datatype == constants.attribute_types['QUANTITATIVE'] and len(
                    self.data_attribute_map[attr]['domain']) <= 12:
                attr_datatype = constants.attribute_types['ORDINAL']
                self.populate_dataset_meta_for_attr(attr, attr_datatype)

            # If an attribute has (almost) no repeating values, then mark it as the label attribute.
            # e.g. primary/unique key of the table: Car1, Car2, Car3, ...
            # "Almost" == 90%, heuristic-based
            if attr_datatype == constants.attribute_types['NOMINAL'] and len(
                    self.data_attribute_map[attr]['domain']) > 0.9 * self.rows:
                self.nl4dv_instance.label_attribute = attr
                self.data_attribute_map[attr]['isLabelAttribute'] = True

            # Set the final data type
            self.data_attribute_map[attr]['dataType'] = attr_datatype

            # Presentation
            self.prepare_output(attr, attr_datatype)

        return True

    return False
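# Worked example (hypothetical per-row votes, for intuition): datatype inference
# is a per-cell vote followed by a majority decision via Counter.
#
#   from collections import Counter
#   data_type_list = ['Q', 'Q', 'Q', 'N']         # one messy cell in a numeric column
#   Counter(data_type_list).most_common(1)[0][0]  # -> 'Q'; the majority wins
#   # A 'Q' verdict with <= 12 unique values is then demoted to ORDINAL, and a
#   # near-unique NOMINAL column is promoted to the label attribute.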
def set_task(self, dim, task):
    if task["task"] == 'find_extremum':
        if dim is None:
            dim = 'y'
        if task["operator"] == 'MIN':
            if dim == 'x':
                if 'y' in self.vl_spec['encoding']:
                    self.vl_spec['encoding']['y']['sort'] = 'x'
            elif dim == 'y':
                if 'x' in self.vl_spec['encoding']:
                    self.vl_spec['encoding']['x']['sort'] = 'y'
        elif task["operator"] == 'MAX':
            if dim == 'x':
                if 'y' in self.vl_spec['encoding']:
                    self.vl_spec['encoding']['y']['sort'] = '-x'
            elif dim == 'y':
                if 'x' in self.vl_spec['encoding']:
                    self.vl_spec['encoding']['x']['sort'] = '-y'

    elif task["task"] == 'filter':
        if task["operator"] == 'IN':
            for attr in task['attributes']:
                self.vl_spec['transform'].append({
                    'filter': {
                        "field": attr,
                        "oneOf": task["values"]
                    }
                })
        elif task["operator"] == 'RANGE':
            for attr in task['attributes']:
                self.vl_spec['transform'].append({
                    "filter": {
                        "field": attr,
                        "range": task["values"]
                    }
                })
        elif task["operator"] == 'NOT RANGE':
            for attr in task['attributes']:
                # self.vl_spec['transform'].append({"filter": {"field": attr, "gte": task["values"][1], "lte": task["values"][0]}})
                self.vl_spec['transform'].append({
                    "filter": {
                        "not": {
                            "field": attr,
                            "range": task["values"]
                        }
                    }
                })
        else:
            for attr in task['attributes']:
                symbol = constants.operator_symbol_mapping[task["operator"]]
                if helpers.isfloat(task["values"][0]) or helpers.isint(task["values"][0]):
                    self.vl_spec['transform'].append({
                        'filter': 'lower(datum["{}"]) {} {}'.format(attr, symbol, task["values"][0])
                    })
                # helpers.isdate(...) returns a (bool, meta) tuple; test the flag, not the tuple
                elif helpers.isdate(task["values"][0])[0]:
                    self.vl_spec['transform'].append({
                        'filter': 'lower(datum["{}"]) {} "{}"'.format(attr, symbol, task["values"][0])
                    })
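# Illustrative sketch (toy spec, not from the library's test suite): a MAX
# find_extremum task on dim='y' sorts the x-axis by descending y, so the
# maximum is the first mark the reader sees.
#
#   task = {"task": "find_extremum", "operator": "MAX"}
#   # Before: self.vl_spec['encoding']['x'] == {"field": "Model", "type": "nominal"}
#   self.set_task('y', task)
#   # After:  self.vl_spec['encoding']['x']['sort'] == '-y'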
def set_data(self, data_url=None, data_value=None):
    # type: (str, any) -> None
    """
    User can choose to manually initialize data
    """
    self.nl4dv_instance.data_url = data_url if data_url is not None else self.nl4dv_instance.data_url
    self.nl4dv_instance.data_value = data_value if data_value is not None else self.nl4dv_instance.data_value

    # initialize values
    self.data_attribute_map = dict()
    self.data = list()
    self.rows = 0

    if self.nl4dv_instance.data_url is not None:
        # Possibly a local file or an HTTP URL
        if self.nl4dv_instance.data_url.lower().endswith('.csv'):
            self.data = pd.read_csv(self.nl4dv_instance.data_url, sep=',').to_dict('records')
        elif self.nl4dv_instance.data_url.lower().endswith('.tsv'):
            self.data = pd.read_csv(self.nl4dv_instance.data_url, sep='\t').to_dict('records')
        elif self.nl4dv_instance.data_url.lower().endswith('.json'):
            self.data = pd.read_json(self.nl4dv_instance.data_url).to_dict('records')

    elif self.nl4dv_instance.data_value is not None:
        if isinstance(data_value, pd.DataFrame):
            self.data = data_value.to_dict('records')
        elif isinstance(data_value, list):
            self.data = data_value
        elif isinstance(data_value, dict):
            self.data = pd.DataFrame(data_value).to_dict('records')

    # Set number of rows in the dataset
    self.rows = len(self.data)

    # initialize properties in Attribute Map
    if len(self.data) > 0:
        for attr in self.data[0].keys():
            # Don't consider attribute names that are empty or just whitespaces
            if attr and attr.strip():
                self.data_attribute_map[attr] = {
                    'domain': set(),
                    'domainMeta': dict(),
                    'isLabelAttribute': attr == self.nl4dv_instance.label_attribute,
                    'summary': dict(),
                    'dataTypeList': list(),  # temporary to determine datatype
                    'dataType': '',
                    'dataTypeMeta': dict(),  # used, e.g., for temporal attributes when they conform to a certain format
                    'aliases': list(),
                }

    # infer attribute datatypes and compute summary (range, domain)
    for datum in self.data:
        for attr in self.data_attribute_map.keys():
            # Check for Datetime
            is_date, unformatted_date_obj = helpers.isdate(datum[attr])
            if is_date:
                attr_datatype_for_majority = constants.attribute_types['TEMPORAL'] + "-" + str(unformatted_date_obj["regex_id"])
            # Check for Numeric (float, int)
            elif helpers.isfloat(datum[attr]) or helpers.isint(datum[attr]):
                attr_datatype_for_majority = constants.attribute_types['QUANTITATIVE']
            # Otherwise set as Nominal
            else:
                attr_datatype_for_majority = constants.attribute_types['NOMINAL']

            # Append to the list of attribute types for each data row to take the best decision on heterogeneous data with multiple datatypes
            self.data_attribute_map[attr]['dataTypeList'].append(attr_datatype_for_majority)

    # Determine the datatype based on the majority of values.
    # Also override a few datatypes set above based on rules, such as NOMINAL to ORDINAL if all values are unique, e.g. Sr. 1, Sr. 2, ...
    for attr in self.data_attribute_map:
        # By default, set the attribute datatype to the most common one
        attr_datatype = Counter(self.data_attribute_map[attr]['dataTypeList']).most_common(1)[0][0]

        # If attr_datatype is Temporal (e.g., T-1, T-2, ..., T-n where 'n' corresponds to the n'th index of the date_regex array),
        # then process it and eventually strip the '-n' from the datatype.
        if not (attr_datatype in [constants.attribute_types['QUANTITATIVE'], constants.attribute_types['NOMINAL']]):
            # If there is at least one instance of 'T-2' (DD*MM*YY(YY)) in the `dataTypeList`, set the regex_id to this, even if 'T-1' is the majority.
            if 'T-2' in self.data_attribute_map[attr]['dataTypeList']:
                attr_datatype = 'T-2'

            # Strip the '-n' from the datatype
            attr_datatype_split = attr_datatype.split("-")
            attr_datatype = attr_datatype_split[0]

            # Record the regex id that the temporal values conform to
            self.data_attribute_map[attr]['dataTypeMeta'] = {
                "regex_id": attr_datatype_split[1]
            }

            # Add raw data to the domain's metadata. Only for Temporal attributes.
            if 'raw' not in self.data_attribute_map[attr]['domainMeta']:
                self.data_attribute_map[attr]['domainMeta']['raw'] = set()

        # Set the final data type
        self.data_attribute_map[attr]['dataType'] = attr_datatype

        # Update the dataset metadata for each attribute
        self.populate_dataset_meta_for_attr(attr, attr_datatype)
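# Worked example (hypothetical votes, for intuition): temporal votes carry a
# regex id ('T-1', 'T-2', ...) so the date format survives the majority vote,
# and a single 'T-2' (DD*MM*YY(YY)) sighting overrides a 'T-1' majority.
#
#   data_type_list = ['T-1', 'T-1', 'T-2']
#   # Majority says 'T-1', but 'T-2' is present, so the attribute is stored as
#   #   dataType == 'T' with dataTypeMeta == {"regex_id": "2"}.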
def set_data(self, data_url=None, data_value=None):
    # type: (str, any) -> None
    """
    User can choose to manually initialize data
    """
    self.nl4dv_instance.data_url = data_url if data_url is not None else self.nl4dv_instance.data_url
    self.nl4dv_instance.data_value = data_value if data_value is not None else self.nl4dv_instance.data_value

    # initialize values
    self.data_attribute_map = dict()
    self.data = list()
    self.rows = 0

    if self.nl4dv_instance.data_url is not None:
        # Possibly a local file or an HTTP URL
        if self.nl4dv_instance.data_url.lower().endswith('.csv'):
            self.data = pd.read_csv(self.nl4dv_instance.data_url, sep=',').to_dict('records')
        elif self.nl4dv_instance.data_url.lower().endswith('.tsv'):
            self.data = pd.read_csv(self.nl4dv_instance.data_url, sep='\t').to_dict('records')
        elif self.nl4dv_instance.data_url.lower().endswith('.json'):
            self.data = pd.read_json(self.nl4dv_instance.data_url).to_dict('records')

    elif self.nl4dv_instance.data_value is not None:
        if isinstance(data_value, pd.DataFrame):
            self.data = data_value.to_dict('records')
        elif isinstance(data_value, list):
            self.data = data_value
        elif isinstance(data_value, dict):
            self.data = pd.DataFrame(data_value).to_dict('records')

    # Set number of rows in the dataset
    self.rows = len(self.data)

    # initialize properties in Attribute Map
    if len(self.data) > 0:
        for attr in self.data[0].keys():
            # Don't consider attribute names that are empty or just whitespaces
            if attr and attr.strip():
                self.data_attribute_map[attr] = {
                    'domain': set(),
                    'isLabelAttribute': attr == self.nl4dv_instance.label_attribute,
                    'summary': dict(),
                    'dataTypeList': list(),  # temporary to determine datatype
                    'dataType': '',
                    'aliases': list(),
                }

    # infer attribute datatypes and compute summary (range, domain)
    for datum in self.data:
        for attr in self.data_attribute_map.keys():
            attr_val = datum[attr]

            # Check for Numeric (float, int)
            if helpers.isfloat(attr_val) or helpers.isint(attr_val):
                attr_datatype = constants.attribute_types['QUANTITATIVE']
                self.populate_dataset_meta(attr, attr_val, attr_datatype)

            # Check for Datetime
            # ToDo:- Works fine for datetime strings. Not for others like Epochs and Int-only Years (e.g. 2018) which get captured above.
            # ToDo:- It is VERY risky to switch this elif block with the if block above
            elif helpers.isdate(attr_val)[0]:
                attr_datatype = constants.attribute_types['TEMPORAL']
                self.populate_dataset_meta(attr, attr_val, attr_datatype)

            # Otherwise set as Nominal
            else:
                attr_datatype = constants.attribute_types['NOMINAL']
                self.populate_dataset_meta(attr, attr_val, attr_datatype)

            # Irrespective of the above assignment, keep a list of attribute types for each data row
            # to take the best decision on heterogeneous data with multiple datatypes
            self.data_attribute_map[attr]['dataTypeList'].append(attr_datatype)

    # Determine the datatype based on the majority of values.
    # Also override a few datatypes set above based on rules, such as NOMINAL to ORDINAL if all values are unique, e.g. Sr. 1, Sr. 2, ...
    for attr in self.data_attribute_map:
        # most common attribute type
        attr_datatype = Counter(self.data_attribute_map[attr]['dataTypeList']).most_common(1)[0][0]

        # NOTE: For all practical purposes, let QUANTITATIVE be the determined data type.
        # If an attribute is known to be ORDINAL, it can be set using the set_attribute_datatype() API.
        # # If it's quantitative but has 12 or fewer unique values, then it's ordinal.
        # # e.g. 1, 2, 3, ..., 12 (months of a year)
        # # e.g. -3, -2, -1, 0, 1, 2, 3 (Likert ratings)
        # if attr_datatype == constants.attribute_types['QUANTITATIVE'] and len(
        #         self.data_attribute_map[attr]['domain']) <= 12:
        #     attr_datatype = constants.attribute_types['ORDINAL']
        #     self.populate_dataset_meta_for_attr(attr, attr_datatype)

        # If an attribute has (almost) no repeating values, then mark it as the label attribute.
        # e.g. primary/unique key of the table: Car1, Car2, Car3, ...
        # "Almost" == 90%, heuristic-based
        if attr_datatype == constants.attribute_types['NOMINAL'] and len(
                self.data_attribute_map[attr]['domain']) > 0.9 * self.rows:
            self.nl4dv_instance.label_attribute = attr
            self.data_attribute_map[attr]['isLabelAttribute'] = True

        # Set the final data type
        self.data_attribute_map[attr]['dataType'] = attr_datatype

        # Presentation
        self.prepare_output(attr, attr_datatype)
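# Worked example (hypothetical column, for intuition): the 90% heuristic marks
# a near-unique NOMINAL column as the label attribute.
#
#   self.rows = 100
#   # domain holds 95 distinct values, e.g. {"Car1", "Car2", ..., "Car95"}
#   len(domain) > 0.9 * self.rows   # -> True; the column becomes the label attribute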