Esempio n. 1
0
    def detect_headers(self, data_matrix):
        """
        Decide whether the first row of data_matrix is a header row.

        The first row is classified on its own and compared, column by column,
        against the data types classified from all of the remaining rows.

        * If data_matrix has at most one row, no headers are reported.
        * If any entry of the first row is neither a string nor a boolean
          (booleans are allowed because "t", "true", etc. could potentially be
          a header), no headers are reported.
        * If every first-row type equals the type of its column in the
          remaining rows, no headers are reported.
        * Otherwise the first row is removed from data_matrix (in place) and
          returned as the headers.

        Returns a (data_matrix, headers) tuple; headers is None when no header
        row was detected.
        """
        if len(data_matrix) <= 1:
            return (data_matrix, None)

        header_data_types = DataTypeClassification.classify_row(data_matrix[0])
        row_data_types = DataTypeClassification.classify_data_matrix(data_matrix[1:])

        non_matching_types = 0
        for header_type, row_type in zip(header_data_types, row_data_types):
            if header_type.data_type not in ("boolean", "string"):
                # Header type is not a boolean or string -- we assume that
                # this data_matrix doesn't have any headers.
                return (data_matrix, None)

            # Compare the type *names* on both sides. Comparing the header's
            # name against the column's classification object itself would
            # not compare like with like.
            if header_type.data_type != row_type.data_type:
                non_matching_types += 1

        # If all header and column types are matching, then we assume we don't
        # have any headers. Otherwise, we return the first row as headers.
        if non_matching_types == 0:
            return (data_matrix, None)

        # pop(0) mutates the caller's list: the header row is removed in place.
        headers = data_matrix.pop(0)
        return (data_matrix, headers)
Esempio n. 2
0
    def clean_entry(self, entry, data_type):
        """
        Convert a raw entry into its cleaned value for the given data type.

        Missing data becomes None, dates become the number of seconds between
        1970-01-01 and the parsed date, integers and floats are parsed with
        int() and float(), and any other entry is returned unchanged.
        """
        if DataTypeClassification.is_missing_data(entry):
            return None

        if data_type.is_equal("date"):
            parsed_date = DataTypeClassification.get_date(entry)
            epoch = datetime(1970, 1, 1)
            return (parsed_date - epoch).total_seconds()

        if data_type.is_equal("integer"):
            return int(entry)

        if data_type.is_equal("float"):
            return float(entry)

        # Every other data type keeps the entry as it was given.
        return entry
Esempio n. 3
0
    def clean_entry(self, entry, data_type):
        """
        Return the cleaned form of entry according to data_type.

        * missing data -> None
        * "date"       -> seconds between 1970-01-01 and the parsed date
        * "integer"    -> int(entry)
        * "float"      -> float(entry)
        * anything else is passed through untouched
        """
        if DataTypeClassification.is_missing_data(entry):
            return None

        if data_type.is_equal("date"):
            elapsed = DataTypeClassification.get_date(entry) - datetime(1970, 1, 1)
            return elapsed.total_seconds()

        # Numeric types are checked in the same order as before: integer
        # first, then float.
        for type_name, convert in (("integer", int), ("float", float)):
            if data_type.is_equal(type_name):
                return convert(entry)

        return entry
Esempio n. 4
0
    def _initialize_data_types(self, data_types):
        """
        Build the list of column data types for this dataset.

        When data_types is None the types are classified from
        self.data_matrix. Otherwise data_types must contain one entry per
        column; entries given as str are wrapped in DataType and all other
        entries are used as they are.

        Every resulting "string" type that has no categories yet is replaced
        by a "string" DataType whose categories are the set of values returned
        by self.get_column for that column index.

        Raises ValueError if data_types is given and its length differs from
        self.num_cols.
        """
        # Identity checks against None: `== None` would dispatch to whatever
        # __eq__ the supplied object defines.
        if data_types is None:
            resulting_types = DataTypeClassification.classify_data_matrix(
                self.data_matrix)
        else:
            if len(data_types) != self.num_cols:
                raise ValueError(
                    "The data_types lists must be the same length as the number of columns in the dataset."
                )

            resulting_types = [
                DataType(data_type) if isinstance(data_type, str) else data_type
                for data_type in data_types
            ]

        # Get categories for the string data types. Only existing slots are
        # reassigned, so replacing entries while enumerating is safe.
        for i, resulting_type in enumerate(resulting_types):
            if resulting_type.is_equal("string") and resulting_type.categories is None:
                resulting_types[i] = DataType("string",
                                              categories=set(
                                                  self.get_column(i)))

        return resulting_types
Esempio n. 5
0
 def test_parsing_data_types_correctly_for_strings(self):
     # Each entry of the row is expected to be classified as "string".
     row = ["some string", "2string", "string24531234 34534 345", "2.345s"]
     data_types = DataTypeClassification.classify_row(row)
     for position in range(len(row)):
         self.assertEqual("string", data_types[position].data_type)
Esempio n. 6
0
 def __init__(self, settings, data_matrix, headers=None, data_types=None):
     """
     Store the settings, data matrix and headers, and resolve the data types.

     When data_types is None the types are classified from data_matrix;
     otherwise the supplied data_types are used as given.
     """
     self.settings = settings
     self.data_matrix = data_matrix
     self.headers = headers
     self.logger = logging.getLogger(__name__)
     # Identity check: `== None` would dispatch to whatever __eq__ the
     # supplied data_types object defines.
     if data_types is None:
         self.data_types = DataTypeClassification.classify_data_matrix(self.data_matrix)
     else:
         self.data_types = data_types
Esempio n. 7
0
 def __init__(self, settings, data_matrix, headers=None, data_types=None):
     """
     Keep references to the settings, data matrix and headers, create a
     module-named logger, and determine the column data types.

     The data types are classified from data_matrix when data_types is None;
     a supplied data_types value is stored unchanged.
     """
     self.settings = settings
     self.data_matrix = data_matrix
     self.headers = headers
     self.logger = logging.getLogger(__name__)
     # Compare against None by identity so that the check never relies on
     # the __eq__ implementation of the object passed as data_types.
     if data_types is None:
         self.data_types = DataTypeClassification.classify_data_matrix(
             self.data_matrix)
     else:
         self.data_types = data_types
Esempio n. 8
0
    def test_is_float(self):
        # Inputs for which is_float is expected to return True.
        accepted = (
            "1.",
            "1435.345",
            "143523452345234523452345234523452345234523.452345",
            "-.234",
            "5",
        )
        # Inputs for which is_float is expected to return False.
        rejected = ("-2.54shc", "-something", "5s3")

        for text in accepted:
            self.assertEqual(True, DataTypeClassification.is_float(text))

        for text in rejected:
            self.assertEqual(False, DataTypeClassification.is_float(text))
Esempio n. 9
0
 def test_parsing_data_types_correctly_for_bools(self):
     # All spellings of true/false in the row should classify as "boolean".
     row = ["t", "true", "True", "TRUE", "f", "false", "False", "FALSE"]
     data_types = DataTypeClassification.classify_row(row)
     for position, _ in enumerate(row):
         self.assertEqual("boolean", data_types[position].data_type)
Esempio n. 10
0
 def test_parsing_data_types_correctly_for_ints_and_floats(self):
     # Row entries paired by position with the data type each one is
     # expected to receive.
     row = [
         "1234", "  34.231 ", " 32. 43 ", " 5399999999999999 ",
         "23452345234523452345245.24"
     ]
     expected_types = ["integer", "float", "string", "integer", "float"]

     data_types = DataTypeClassification.classify_row(row)
     for position, expected in enumerate(expected_types):
         self.assertEqual(expected, data_types[position].data_type)
Esempio n. 11
0
    def test_classifying_data_matrix_with_missing_values(self):
        # Columns containing "nan" entries are still expected to receive the
        # data type listed for them below.
        data_matrix = [
            ["nan", "24", "true", "2.3"],
            ["nan", "23", "false", "3.4"],
            ["bag", "nan", "true", "3.2"],
        ]
        expected_types = ("string", "integer", "boolean", "float")

        classification = DataTypeClassification.classify_data_matrix(data_matrix)

        self.assertEqual(4, len(classification))
        for column, expected in enumerate(expected_types):
            self.assertEqual(expected, classification[column].data_type)
Esempio n. 12
0
    def test_classifying_data_matrix_correctly(self):
        # A single-row matrix: one expected data type per column.
        data_matrix = [["1", " 2", "hi", "true", "f", " 42.23  "]]
        expected_types = (
            "integer", "integer", "string", "boolean", "boolean", "float",
        )

        classification = DataTypeClassification.classify_data_matrix(data_matrix)

        self.assertEqual(6, len(classification))
        for column, expected in enumerate(expected_types):
            self.assertEqual(expected, classification[column].data_type)
Esempio n. 13
0
    def test_is_integer(self):
        # Inputs for which is_integer is expected to return True.
        accepted = (
            "1",
            "1435",
            "143523452345234523452345234523452345234523452345",
            "-234",
        )
        # Inputs for which is_integer is expected to return False.
        rejected = ("0.", "-2.54", "-something", "5s3")

        for text in accepted:
            self.assertEqual(True, DataTypeClassification.is_integer(text))

        for text in rejected:
            self.assertEqual(False, DataTypeClassification.is_integer(text))
Esempio n. 14
0
    def detect_headers(self, data_matrix):
        """
        Decide whether the first row of data_matrix is a header row.

        The first row is classified on its own and compared, column by column,
        against the data types classified from all of the remaining rows.

        * If data_matrix has at most one row, no headers are reported.
        * If any entry of the first row is neither a string nor a boolean
          (booleans are allowed because "t", "true", etc. could potentially be
          a header), no headers are reported.
        * If every first-row type equals the type of its column in the
          remaining rows, no headers are reported.
        * Otherwise the first row is removed from data_matrix (in place) and
          returned as the headers.

        Returns a (data_matrix, headers) tuple; headers is None when no header
        row was detected.
        """
        if len(data_matrix) <= 1:
            return (data_matrix, None)

        header_data_types = DataTypeClassification.classify_row(data_matrix[0])
        row_data_types = DataTypeClassification.classify_data_matrix(
            data_matrix[1:])

        non_matching_types = 0
        for header_type, row_type in zip(header_data_types, row_data_types):
            if header_type.data_type not in ("boolean", "string"):
                # Header type is not a boolean or string -- we assume that
                # this data_matrix doesn't have any headers.
                return (data_matrix, None)

            # Compare the type *names* on both sides. Comparing the header's
            # name against the column's classification object itself would
            # not compare like with like.
            if header_type.data_type != row_type.data_type:
                non_matching_types += 1

        # If all header and column types are matching, then we assume we don't
        # have any headers. Otherwise, we return the first row as headers.
        if non_matching_types == 0:
            return (data_matrix, None)

        # pop(0) mutates the caller's list: the header row is removed in place.
        headers = data_matrix.pop(0)
        return (data_matrix, headers)
Esempio n. 15
0
    def read(self, maximum_size=None, delimiter=",", quoting=csv.QUOTE_NONE):
        """
        Read the dataset file and return its contents as a Dataset.

        maximum_size -- size limit handed to the line readers; when None, the
            "dataset.maximum_dataset_size" setting is used instead.
        delimiter -- delimiter passed to csv.reader.
        quoting -- quoting mode passed to csv.reader.

        Rows are read with randomized_read_lines when the
        "dataset.randomize_file_reader" setting is truthy and with
        greedy_read_lines otherwise. Headers are then detected, data types are
        classified from the rows returned by detect_headers, and the rows are
        cleaned with DatasetCleaner before being wrapped in a Dataset.
        """
        if maximum_size is None:
            maximum_size = self.settings.get("dataset.maximum_dataset_size")

        with open(self.dataset_filename, 'rb') as f:
            reader = csv.reader(f, delimiter=delimiter, quoting=quoting)
            if self.settings.get("dataset.randomize_file_reader"):
                # Pass the reader created above; the previous code referenced
                # an undefined name here and raised NameError.
                data_matrix = self.randomized_read_lines(reader, maximum_size)
            else:
                data_matrix = self.greedy_read_lines(reader, maximum_size)

        data_matrix, headers = self.detect_headers(data_matrix)
        # Data types are classified from the raw rows, before cleaning.
        data_types = DataTypeClassification.classify_data_matrix(data_matrix)
        data_matrix = DatasetCleaner(self.settings, data_matrix, headers).clean()
        self.logger.info("Read dataset from file: '%s'", self.dataset_filename)
        self.logger.info("Headers: %s", headers)
        self.logger.info("Dataset Size: %s", len(data_matrix))
        return Dataset(data_matrix, headers=headers, data_types=data_types)
Esempio n. 16
0
 def test_classification(self):
     # (expected data type, raw value) pairs, checked in order.
     cases = [
         ("integer", "23452224"),
         ("float", "234.52224"),
         ("date", "3/20/1994"),
         ("boolean", "t"),
         ("boolean", "False"),
         ("string", "be"),
         ("string", "alfred"),
     ]
     for expected, raw in cases:
         classified = DataTypeClassification.classify(raw)
         self.assertEqual(expected, classified.data_type)
Esempio n. 17
0
    def _initialize_data_types(self, data_types):
        """
        Build the list of column data types for this dataset.

        When data_types is None the types are classified from
        self.data_matrix. Otherwise data_types must contain one entry per
        column; entries given as str are wrapped in DataType and all other
        entries are used as they are.

        Every resulting "string" type that has no categories yet is replaced
        by a "string" DataType whose categories are the set of values returned
        by self.get_column for that column index.

        Raises ValueError if data_types is given and its length differs from
        self.num_cols.
        """
        # Identity checks against None: `== None` would dispatch to whatever
        # __eq__ the supplied object defines.
        if data_types is None:
            resulting_types = DataTypeClassification.classify_data_matrix(self.data_matrix)
        else:
            if len(data_types) != self.num_cols:
                raise ValueError("The data_types lists must be the same length as the number of columns in the dataset.")

            resulting_types = [
                DataType(data_type) if isinstance(data_type, str) else data_type
                for data_type in data_types
            ]

        # Get categories for the string data types. Only existing slots are
        # reassigned, so replacing entries while enumerating is safe.
        for i, resulting_type in enumerate(resulting_types):
            if resulting_type.is_equal("string") and resulting_type.categories is None:
                resulting_types[i] = DataType("string", categories=set(self.get_column(i)))

        return resulting_types
Esempio n. 18
0
    def read(self, maximum_size=None, delimiter=",", quoting=csv.QUOTE_NONE):
        """
        Read the dataset file and return its contents as a Dataset.

        maximum_size -- size limit handed to the line readers; when None, the
            "dataset.maximum_dataset_size" setting is used instead.
        delimiter -- delimiter passed to csv.reader.
        quoting -- quoting mode passed to csv.reader.

        Rows are read with randomized_read_lines when the
        "dataset.randomize_file_reader" setting is truthy and with
        greedy_read_lines otherwise. Headers are then detected, data types are
        classified from the rows returned by detect_headers, and the rows are
        cleaned with DatasetCleaner before being wrapped in a Dataset.
        """
        if maximum_size is None:
            maximum_size = self.settings.get("dataset.maximum_dataset_size")

        with open(self.dataset_filename, 'rb') as f:
            reader = csv.reader(f, delimiter=delimiter, quoting=quoting)
            if self.settings.get("dataset.randomize_file_reader"):
                # Pass the reader created above; the previous code referenced
                # an undefined name here and raised NameError.
                data_matrix = self.randomized_read_lines(
                    reader, maximum_size)
            else:
                data_matrix = self.greedy_read_lines(reader, maximum_size)

        data_matrix, headers = self.detect_headers(data_matrix)
        # Data types are classified from the raw rows, before cleaning.
        data_types = DataTypeClassification.classify_data_matrix(data_matrix)
        data_matrix = DatasetCleaner(self.settings, data_matrix,
                                     headers).clean()
        self.logger.info("Read dataset from file: '%s'", self.dataset_filename)
        self.logger.info("Headers: %s", headers)
        self.logger.info("Dataset Size: %s", len(data_matrix))
        return Dataset(data_matrix, headers=headers, data_types=data_types)
Esempio n. 19
0
    def test_is_missing_data(self):
        # Markers that is_missing_data is expected to recognise.
        missing_markers = ("NA", "NAN", "NaN", "nan", "null", "NULL", "na")
        # Ordinary values that is_missing_data is expected to reject.
        regular_values = ("nope", "-", "5", "something")

        for marker in missing_markers:
            self.assertEqual(True, DataTypeClassification.is_missing_data(marker))

        for value in regular_values:
            self.assertEqual(False, DataTypeClassification.is_missing_data(value))
Esempio n. 20
0
    def test_date_classification(self):
        # Inputs for which is_date is expected to return True.
        valid_dates = (
            "25/3/1992",
            "3/14/1942",
            "10-2-2010",
            "29-3-2014",
            "1-1-11",
            "15-1-03",
            "4/23/95",
            "23/2/42",
        )
        # Inputs for which is_date is expected to return False.
        invalid_dates = (
            "52-3-3413",
            "1-3-1",
            "52-3-22",
            "2-43-53",
            "13-13-53",
            "13/13/53",
            "1/1/3",
        )

        for text in valid_dates:
            self.assertEqual(True, DataTypeClassification.is_date(text))

        for text in invalid_dates:
            self.assertEqual(False, DataTypeClassification.is_date(text))
Esempio n. 21
0
    def test_is_boolean(self):
        # Spellings that is_boolean is expected to accept.
        boolean_spellings = (
            "t", "T", "true", "True", "TRUE",
            "f", "F", "false", "False", "FALSE",
        )
        # Near misses and numeric strings that is_boolean is expected to
        # reject.
        non_booleans = ("tru", "fals", "0", "1.")

        for text in boolean_spellings:
            self.assertEqual(True, DataTypeClassification.is_boolean(text))

        for text in non_booleans:
            self.assertEqual(False, DataTypeClassification.is_boolean(text))