Ejemplo n.º 1
0
def parse_json(attribute_in_json):
    """Rebuild an attribute object from its dictionary description.

    Args:
        attribute_in_json: Mapping that is read for the keys ``'name'``,
            ``'data_type'``, ``'is_candidate_key'``, ``'is_categorical'``,
            ``'distribution_bins'``, ``'distribution_probabilities'``,
            ``'missing_rate'``, ``'min'`` and ``'max'``.

    Returns:
        An instance of the attribute class matching the described data type,
        created with an empty ``Series`` of the corresponding dtype and with
        ``missing_rate``, ``min``, ``max``, ``distribution_bins`` and
        ``distribution_probabilities`` copied from the description.

    Raises:
        Exception: If the data type is not one of the handled ``DataType``
            members.
    """
    name = attribute_in_json['name']
    # NOTE(review): DataType(...) presumably rejects values it does not know
    # before the final ``else`` below is ever reached -- confirm.
    data_type = DataType(attribute_in_json['data_type'])
    is_candidate_key = attribute_in_json['is_candidate_key']
    is_categorical = attribute_in_json['is_categorical']
    # The histogram size is not stored separately; it is the number of bins.
    histogram_size = len(attribute_in_json['distribution_bins'])

    # Choose the attribute class and the dtype of its empty placeholder
    # Series; every comparison goes through the DataType class itself.
    if data_type is DataType.INTEGER:
        attribute_cls, series_dtype = IntegerAttribute, int
    elif data_type is DataType.FLOAT:
        attribute_cls, series_dtype = FloatAttribute, float
    elif data_type is DataType.DATETIME:
        attribute_cls, series_dtype = DateTimeAttribute, 'datetime64[ns]'
    elif data_type is DataType.STRING:
        attribute_cls, series_dtype = StringAttribute, str
    elif data_type is DataType.SOCIAL_SECURITY_NUMBER:
        attribute_cls, series_dtype = SocialSecurityNumberAttribute, int
    else:
        raise Exception('Data type {} is unknown.'.format(data_type.value))

    attribute = attribute_cls(name, is_candidate_key, is_categorical,
                              histogram_size, Series(dtype=series_dtype))

    # Restore the statistics recorded in the description.
    attribute.missing_rate = attribute_in_json['missing_rate']
    attribute.min = attribute_in_json['min']
    attribute.max = attribute_in_json['max']
    attribute.distribution_bins = attribute_in_json['distribution_bins']
    attribute.distribution_probabilities = attribute_in_json[
        'distribution_probabilities']

    return attribute
Ejemplo n.º 2
0
 def convert_input_dataset_into_a_dict_of_columns(self):
     """Fill ``self.input_dataset_as_column_dict`` with attribute objects.

     The dictionary is reset, then for every name obtained by iterating
     ``self.input_dataset`` an attribute object is created whose class is
     selected by ``self.attribute_to_datatype``. Each object receives the
     name, its candidate-key flag, its categorical flag and
     ``self.histogram_size``.

     Raises:
         Exception: If a name maps to a data type that is not handled.
     """
     self.input_dataset_as_column_dict = {}
     for attr in self.input_dataset:
         kind = self.attribute_to_datatype[attr]
         ctor_args = (
             attr,
             self.attribute_to_is_candidate_key[attr],
             self.attribute_to_is_categorical[attr],
             self.histogram_size,
         )
         # Select the class first so the column is constructed in one place.
         if kind is DataType.INTEGER:
             column_cls = IntegerAttribute
         elif kind is DataType.FLOAT:
             column_cls = FloatAttribute
         elif kind is DataType.DATETIME:
             column_cls = DateTimeAttribute
         elif kind is DataType.STRING:
             column_cls = StringAttribute
         elif kind is DataType.SOCIAL_SECURITY_NUMBER:
             column_cls = SocialSecurityNumberAttribute
         else:
             raise Exception(
                 'The data type of attribute {} is unknown.'.format(attr))
         self.input_dataset_as_column_dict[attr] = column_cls(*ctor_args)
Ejemplo n.º 3
0
 def represent_input_dataset_by_columns(self):
     """Fill ``self.attr_to_column`` with one attribute object per name.

     The mapping is reset, then for every name obtained by iterating
     ``self.df_input`` an attribute object is created whose class is
     selected by ``self.attr_to_datatype``. Each object receives the name,
     its candidate-key flag, its categorical flag, ``self.histogram_bins``
     and the data stored under that name in ``self.df_input``.

     Raises:
         Exception: If a name maps to a data type that is not handled.
     """
     self.attr_to_column = {}
     for attr in self.df_input:
         kind = self.attr_to_datatype[attr]
         ctor_args = (
             attr,
             self.attr_to_is_candidate_key[attr],
             self.attr_to_is_categorical[attr],
             self.histogram_bins,
             self.df_input[attr],
         )
         # Select the class first so the column is constructed in one place.
         if kind is DataType.INTEGER:
             column_cls = IntegerAttribute
         elif kind is DataType.FLOAT:
             column_cls = FloatAttribute
         elif kind is DataType.DATETIME:
             column_cls = DateTimeAttribute
         elif kind is DataType.STRING:
             column_cls = StringAttribute
         elif kind is DataType.SOCIAL_SECURITY_NUMBER:
             column_cls = SocialSecurityNumberAttribute
         else:
             raise Exception(f'The DataType of {attr} is unknown.')
         self.attr_to_column[attr] = column_cls(*ctor_args)