Example #1
def infer_types(f, sample_size=100):
    reader = CSVKitReader(f)
    headers = reader.next()

    sample = islice(reader, sample_size)
    normal_types, normal_values = normalize_table(sample)

    return zip(headers, [t.__name__ for t in normal_types])
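A minimal usage sketch for the function above, assuming Python 2 and a csvkit-era environment where infer_types is importable; the file name data.csv is hypothetical:

# Hypothetical usage: print each header with its inferred type name.
with open('data.csv', 'r') as f:
    for header, type_name in infer_types(f):
        print '%s: %s' % (header, type_name)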
Example #2
    def test_normalize_table_known_types_invalid(self):
        normal_types = [bool, int, int, NoneType]
        data = [[u'a', u'1', u'2.1', u''], [u'b', u'5', u'4.1'],
                [u'c', u'100', u'100.9999', u''], [u'd', u'2', u'5.3', u'']]

        try:
            typeinference.normalize_table(data,
                                          normal_types,
                                          accumulate_errors=True)
            self.fail('normalize_table should have raised InvalidValueForTypeListException')
        except InvalidValueForTypeListException as e:
            self.assertEqual(len(e.errors), 2)
            self.assertEqual(e.errors[0].index, 0)
            self.assertEqual(e.errors[0].value, 'a')
            self.assertEqual(e.errors[0].normal_type, bool)
            self.assertEqual(e.errors[2].index, 0)
            self.assertEqual(e.errors[2].value, '2.1')
            self.assertEqual(e.errors[2].normal_type, int)
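The test above depends on accumulate_errors=True collecting every failed coercion before raising. As a hedged sketch, assuming e.errors iterates over error objects, a caller might report the accumulated errors like this (the index, value, and normal_type attributes come from the assertions above; the reporting format is illustrative):

# Illustrative error reporting; data and normal_types as in the test above.
try:
    typeinference.normalize_table(data, normal_types, accumulate_errors=True)
except InvalidValueForTypeListException as e:
    for err in e.errors:
        print 'row %d: %r is not a valid %s' % (err.index, err.value, err.normal_type.__name__)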
Example #3
    def test_normalize_table_known_types_invalid(self):
        normal_types = [bool, int, int, NoneType]
        data = [
            [u'a', u'1', u'2.1', u''],
            [u'b', u'5', u'4.1'],
            [u'c', u'100', u'100.9999', u''],
            [u'd', u'2', u'5.3', u'']
        ]

        try:
            typeinference.normalize_table(data, normal_types, accumulate_errors=True)
            self.fail('normalize_table should have raised InvalidValueForTypeListException')
        except InvalidValueForTypeListException as e:
            self.assertEqual(len(e.errors), 2)
            self.assertEqual(e.errors[0].index, 0)
            self.assertEqual(e.errors[0].value, 'a')
            self.assertEqual(e.errors[0].normal_type, bool)
            self.assertEqual(e.errors[2].index, 0)
            self.assertEqual(e.errors[2].value, '2.1')
            self.assertEqual(e.errors[2].normal_type, int)
Example #4
def infer_schema(f, sample_size=100):
    reader = CSVKitReader(f)
    headers = reader.next()

    sample = islice(reader, sample_size)
    normal_types, normal_values = normalize_table(sample)
    type_names = [t.__name__ for t in normal_types]

    return [{
        'column': h,
        'simple_type': t,
        'meta_type': None,
        'indexed': False
    } for h, t in zip(headers, type_names)]
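For illustration, the schema dictionaries can be consumed directly; a hypothetical call on an open CSV file (data.csv and the sample_size value are illustrative):

# Hypothetical usage: print each column with its inferred simple_type.
with open('data.csv', 'r') as f:
    for column in infer_schema(f, sample_size=50):
        print '%(column)s -> %(simple_type)s' % column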
Example #5
    def test_normalize_table_known_types(self):
        normal_types = [six.text_type, int, float, NoneType]
        data = [[u'a', u'1', u'2.1', u''], [u'b', u'5', u'4.1'],
                [u'c', u'100', u'100.9999', u''], [u'd', u'2', u'5.3', u'']]
        types, columns = typeinference.normalize_table(data, normal_types)

        self.assertEqual(4, len(types))
        self.assertEqual(4, len(columns))

        for i, tup in enumerate(zip(columns, types, normal_types)):
            c, t, et = tup
            self.assertEqual(et, t)
            for row, normalized in zip(data, c):
                if t is NoneType:
                    self.assertTrue(normalized is None)
                else:
                    self.assertEqual(t(row[i]), normalized)
Example #6
    def test_normalize_table(self):
        expected_types = [unicode, int, float, None]
        data = [['a', '1', '2.1', ''], ['b', '5', '4.1', ''],
                ['c', '100', '100.9999', ''], ['d', '2', '5.3', '']]
        column_count = len(expected_types)
        types, columns = typeinference.normalize_table(data, column_count)

        self.assertEqual(column_count, len(types))
        self.assertEqual(column_count, len(columns))

        for i, tup in enumerate(zip(columns, types, expected_types)):
            c, t, et = tup
            self.assertEqual(et, t)
            for row, normalized in zip(data, c):
                if t is None:
                    self.assertTrue(normalized is None)
                    self.assertEqual('', row[i])
                else:
                    self.assertEqual(t(row[i]), normalized)
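Note that this variant passes a column count to normalize_table rather than a list of known types, and uses None (instead of NoneType) as the expected marker for an all-empty column; the assertion on row[i] confirms such columns contained only empty strings.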
Example #7
    def test_normalize_table_known_types(self):
        normal_types = [unicode, int, float, NoneType]
        data = [
            [u'a', u'1', u'2.1', u''],
            [u'b', u'5', u'4.1'],
            [u'c', u'100', u'100.9999', u''],
            [u'd', u'2', u'5.3', u'']
        ]
        types, columns = typeinference.normalize_table(data, normal_types)

        self.assertEqual(4, len(types))
        self.assertEqual(4, len(columns))

        for i, tup in enumerate(zip(columns, types, normal_types)):
            c, t, et = tup
            self.assertEqual(et, t)
            for row, normalized in zip(data, c):
                if t is NoneType:
                    self.assertTrue(normalized is None)
                else:
                    self.assertEqual(t(row[i]), normalized)
Example #8
def guess_column_types(path, dialect, sample_size, encoding='utf-8'):
    """
    Guess column types based on a sample of data.
    """
    with open(path, 'r') as f:
        reader = CSVKitReader(f, encoding=encoding, **dialect)
        headers = reader.next()

        sample = islice(reader, sample_size)
        normal_types, normal_values = normalize_table(sample)

        type_names = []

        for t in normal_types:
            # csvkit recognizes dates and times separately, but we lump them together
            if t in [datetime.date, datetime.time]:
                type_names.append('datetime')
            else:
                type_names.append(t.__name__)

        return type_names
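A hedged usage sketch; the dialect dictionary is a guess at typical csv-module format parameters, and the path is hypothetical:

# Hypothetical call: infer types from the first 100 rows of a UTF-8 CSV.
dialect = {'delimiter': ',', 'quotechar': '"'}
print guess_column_types('data.csv', dialect, 100, encoding='utf-8')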
Example #9
    def test_normalize_table(self):
        expected_types = [unicode, int, float, None]
        data = [
            ['a', '1', '2.1', ''],
            ['b', '5', '4.1', ''],
            ['c', '100', '100.9999', ''],
            ['d', '2', '5.3', '']
        ]
        column_count = len(expected_types)
        types, columns = typeinference.normalize_table(data, column_count)

        self.assertEqual(column_count, len(types))
        self.assertEqual(column_count, len(columns))

        for i, tup in enumerate(zip(columns, types, expected_types)):
            c, t, et = tup
            self.assertEqual(et, t)
            for row, normalized in zip(data, c):
                if t is None:
                    self.assertTrue(normalized is None)
                    self.assertEqual('', row[i])
                else:
                    self.assertEqual(t(row[i]), normalized)
Example #10
def guess_column_types(path, dialect, sample_size, encoding='utf-8'):
    """
    Guess column types based on a sample of data.
    """
    with open(path, 'r') as f:
        reader = CSVKitReader(f, encoding=encoding, **dialect)
        headers = reader.next()

        sample = islice(reader, sample_size)
        normal_types, normal_values = normalize_table(sample)

        type_names = []

        for t in normal_types:
            if t is NoneType:
                type_names.append(None)
            else:
                type_names.append(t.__name__)

        # If a final column had no values csvkit will have dropped it
        while len(type_names) < len(headers):
            type_names.append(None)

        return type_names
Example #11
def guess_column_types(path, dialect, sample_size, encoding='utf-8'):
    """
    Guess column types based on a sample of data.
    """
    with open(path, 'r') as f:
        reader = CSVKitReader(f, encoding=encoding, **dialect)
        headers = reader.next()

        sample = islice(reader, sample_size)
        normal_types, normal_values = normalize_table(sample)

        type_names = []

        for t in normal_types:
            if t is NoneType:
                type_names.append(None)
            else:
                type_names.append(t.__name__)

        # If a final column had no values csvkit will have dropped it
        while len(type_names) < len(headers):
            type_names.append(None)

        return type_names

    def handle(self, *args, **kwargs):
        solr = sunburnt.SolrInterface("http://localhost:8983/solr/")

        reader = csvkit.CSVKitReader(open('data/Building_Permits.csv', 'r'))
        headers = reader.next()

        sample = islice(reader, 200)
        normal_types, normal_values = normalize_table(sample, len(headers))

        solr_fields = []

        for h, t in zip(headers, normal_types):
            if t is NoneType:
                solr_fields.append(None)
            else:
                solr_fields.append('%s_%s' % (h, t.__name__))
            
        # Reset reader
        reader = csvkit.CSVKitReader(open('data/Building_Permits.csv', 'r'))
        reader.next()

        buffered = []
        normal_type_exceptions = []

        # TEMP
        reader = islice(reader, 1000)

        for i, row in enumerate(reader, start=1):
            data = {}

            for t, header, field, value in izip(normal_types, headers, solr_fields, row):
                try:
                    value = normalize_column_type([value], normal_type=t)[1][0]
                except InvalidValueForTypeException:
                    # Convert exception to row-specific error
                    normal_type_exceptions.append(InferredNormalFalsifiedException(i, header, value, t))
                    continue

                # No reason to send null fields to Solr (also sunburnt doesn't like them) 
                if value is None:
                    continue

                if t in [unicode, bool, int, float]:
                    data[field] = value
                elif t == datetime:
                    data[field] = value.isoformat()
                elif t == date:
                    pass
                elif t == time:
                    pass
                else:
                    # Note: NoneType should never fall through to here
                    raise TypeError('Unexpected normal type: %s' % t.__name__)

            # If we've had a normal type exception, don't bother doing the rest of this
            if not normal_type_exceptions:
                data['id'] = str(i)
                data['dataset_id'] = DATASET_ID
                data['full_text'] = '\n'.join(row)
                buffered.append(data)

                if i % 100 == 0:
                    solr.add(buffered)
                    buffered = []
        
        if not normal_type_exceptions:
            solr.commit()
        else:
            # Rollback pending changes
            solr.delete(queries=solr.query(dataset_id=DATASET_ID))
            
            for e in normal_type_exceptions:
                print e
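Two design choices in handle() above are worth noting: rows are buffered and sent to Solr in batches of 100 rather than one HTTP round trip per row, and if any type errors accumulated, the pending documents are deleted by dataset_id instead of committed, so a partial or inconsistent load never becomes visible in the index.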