Example #1
    def test_reload(self, Session):
        csv_filepath = get_sample_filepath("simple.csv")
        resource_id = "test1"
        factories.Resource(id=resource_id)
        loader.load_csv(
            csv_filepath,
            resource_id=resource_id,
            mimetype="text/csv",
            logger=PrintLogger(),
        )

        # Load it again unchanged
        loader.load_csv(
            csv_filepath,
            resource_id=resource_id,
            mimetype="text/csv",
            logger=PrintLogger(),
        )

        assert len(self._get_records(Session, "test1")) == 6
        assert self._get_column_names(Session, "test1") == [
            u"_id",
            u"_full_text",
            u"date",
            u"temperature",
            u"place",
        ]
        assert self._get_column_types(Session, "test1") == [
            u"int4",
            u"tsvector",
            u"text",
            u"text",
            u"text",
        ]
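
The pytest-style examples here (the ones that take a Session fixture) rely on helper methods _get_records, _get_column_names and _get_column_types defined on the test class. Their real implementations live in the xloader test suite; the sketch below is only an illustration of the kind of query they might run, assuming Session is a SQLAlchemy session bound to the datastore database (on SQLAlchemy 1.4+ the raw SQL strings would need wrapping in sqlalchemy.text()):

    def _get_records(self, Session, table_name, limit=None,
                     exclude_full_text_column=True):
        # Hypothetical sketch: fetch rows in _id order, optionally leaving
        # out the _full_text tsvector column.
        cols = self._get_column_names(Session, table_name)
        if exclude_full_text_column:
            cols = [c for c in cols if c != '_full_text']
        sql = 'SELECT "{}" FROM "{}" ORDER BY "_id"'.format(
            '", "'.join(cols), table_name)
        if limit is not None:
            sql += ' LIMIT {}'.format(limit)
        return Session.execute(sql).fetchall()

    def _get_column_names(self, Session, table_name):
        # Column names in table order, from the information schema
        return [r[0] for r in Session.execute(
            "SELECT column_name FROM information_schema.columns "
            "WHERE table_name = '{}' ORDER BY ordinal_position"
            .format(table_name))]

    def _get_column_types(self, Session, table_name):
        # udt_name is the internal PostgreSQL type name, e.g. int4, tsvector
        return [r[0] for r in Session.execute(
            "SELECT udt_name FROM information_schema.columns "
            "WHERE table_name = '{}' ORDER BY ordinal_position"
            .format(table_name))]
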
Example #2
    def test_simple(self):
        csv_filepath = get_sample_filepath('simple.csv')
        resource_id = 'test1'
        factories.Resource(id=resource_id)
        loader.load_csv(csv_filepath,
                        resource_id=resource_id,
                        mimetype='text/csv',
                        logger=PrintLogger())

        assert_equal(
            self._get_records('test1', limit=1,
                              exclude_full_text_column=False),
            [(1, "'-01':2,3 '1':4 '2011':1 'galway':5", u'2011-01-01', u'1',
              u'Galway')])
        assert_equal(self._get_records('test1'),
                     [(1, u'2011-01-01', u'1', u'Galway'),
                      (2, u'2011-01-02', u'-1', u'Galway'),
                      (3, u'2011-01-03', u'0', u'Galway'),
                      (4, u'2011-01-01', u'6', u'Berkeley'),
                      (5, None, None, u'Berkeley'),
                      (6, u'2011-01-03', u'5', None)])
        assert_equal(
            self._get_column_names('test1'),
            [u'_id', u'_full_text', u'date', u'temperature', u'place'])
        assert_equal(self._get_column_types('test1'),
                     [u'int4', u'tsvector', u'text', u'text', u'text'])
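
Every example passes logger=PrintLogger() so that the loader's progress messages show up in the test output. The real class is defined in the xloader test module; a minimal stand-in, assuming load_csv only calls the usual logging methods with %-style arguments, might look like this:

    class PrintLogger(object):
        # Hypothetical sketch: respond to any log level (info, warning,
        # error, ...) by printing the formatted message to stdout.
        def __getattr__(self, log_level):
            def print_func(msg, *args):
                print('{}: {}'.format(log_level.upper(),
                                      msg % args if args else msg))
            return print_func
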
Example #3
    def test_german(self):
        csv_filepath = get_sample_filepath('german_sample.csv')
        resource_id = 'test_german'
        factories.Resource(id=resource_id)
        loader.load_csv(csv_filepath,
                        resource_id=resource_id,
                        mimetype='text/csv',
                        logger=PrintLogger())

        records = self._get_records('test_german')
        print records
        assert_equal(records[0],
                     (1, u'Zürich', u'68260', u'65444', u'62646', u'6503',
                      u'28800', u'1173', u'6891', u'24221', u'672'))
        print self._get_column_names('test_german')
        assert_equal(self._get_column_names('test_german'), [
            u'_id',
            u'_full_text',
            u'Stadtname',
            u'Schuler_Total_2010/2011',
            u'Schuler_Total_2000/2001',
            u'Schuler_Total_1990/1991',
            u'Schuler_Vorschule_2010/2011',
            u'Schuler_Obligatorische Primar- und Sekundarstufe I_2010/2011',
            u'Schuler_Sekundarstufe II, Ubergangsausbildung Sek I. - Sek. II_',
            u'Schuler_Maturitatsschulen_2010/2011',
            u'Schuler_Berufsausbildung_2010/2011',
            u'Schuler_andere allgemeinbildende Schulen_2010/2011',
        ])
        print self._get_column_types('test_german')
        assert_equal(self._get_column_types('test_german'),
                     [u'int4', u'tsvector'] + [u'text'] *
                     (len(records[0]) - 1))
Example #4
    def test_boston_311(self):
        csv_filepath = get_sample_filepath('boston_311_sample.csv')
        resource_id = 'test1'
        factories.Resource(id=resource_id)
        loader.load_csv(csv_filepath,
                        resource_id=resource_id,
                        mimetype='text/csv',
                        logger=PrintLogger())

        records = self._get_records('test1')
        print records
        assert_equal(
            records,
            [
                (1, u'101002153891', u'2017-07-06 23:38:43',
                 u'2017-07-21 08:30:00', None, u'ONTIME', u'Open', u' ',
                 u'Street Light Outages', u'Public Works Department',
                 u'Street Lights', u'Street Light Outages',
                 u'PWDx_Street Light Outages', u'PWDx', None, None,
                 u'480 Harvard St  Dorchester  MA  02124', u'8', u'07', u'4',
                 u'B3', u'Greater Mattapan', u'9', u'Ward 14', u'1411',
                 u'480 Harvard St', u'02124', u'42.288', u'-71.0927',
                 u'Citizens Connect App'),  # noqa
                (2, u'101002153890', u'2017-07-06 23:29:13',
                 u'2017-09-11 08:30:00', None, u'ONTIME', u'Open', u' ',
                 u'Graffiti Removal', u'Property Management', u'Graffiti',
                 u'Graffiti Removal', u'PROP_GRAF_GraffitiRemoval', u'PROP',
                 u' https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg',
                 None, u'522 Saratoga St  East Boston  MA  02128', u'1', u'09',
                 u'1', u'A7', u'East Boston', u'1', u'Ward 1', u'0110',
                 u'522 Saratoga St', u'02128', u'42.3807', u'-71.0259',
                 u'Citizens Connect App'),  # noqa
                (3, u'101002153889', u'2017-07-06 23:24:20',
                 u'2017-09-11 08:30:00', None, u'ONTIME', u'Open', u' ',
                 u'Graffiti Removal', u'Property Management', u'Graffiti',
                 u'Graffiti Removal', u'PROP_GRAF_GraffitiRemoval', u'PROP',
                 u' https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg',
                 None, u'965 Bennington St  East Boston  MA  02128', u'1',
                 u'09', u'1', u'A7', u'East Boston', u'1', u'Ward 1', u'0112',
                 u'965 Bennington St', u'02128', u'42.386', u'-71.008',
                 u'Citizens Connect App')
            ]  # noqa
        )
        print self._get_column_names('test1')
        assert_equal(self._get_column_names('test1'), [
            u'_id', u'_full_text', u'CASE_ENQUIRY_ID', u'open_dt',
            u'target_dt', u'closed_dt', u'OnTime_Status', u'CASE_STATUS',
            u'CLOSURE_REASON', u'CASE_TITLE', u'SUBJECT', u'REASON', u'TYPE',
            u'QUEUE', u'Department', u'SubmittedPhoto', u'ClosedPhoto',
            u'Location', u'Fire_district', u'pwd_district',
            u'city_council_district', u'police_district', u'neighborhood',
            u'neighborhood_services_district', u'ward', u'precinct',
            u'LOCATION_STREET_NAME', u'LOCATION_ZIPCODE', u'Latitude',
            u'Longitude', u'Source'
        ])  # noqa
        print self._get_column_types('test1')
        assert_equal(self._get_column_types('test1'), [u'int4', u'tsvector'] +
                     [u'text'] * (len(records[0]) - 1))
Example #5
 def test_shapefile_zip(self):
     filepath = get_sample_filepath('polling_locations.shapefile.zip')
     resource_id = 'test1'
     factories.Resource(id=resource_id)
     with assert_raises(LoaderError) as exception:
         loader.load_csv(filepath, resource_id=resource_id,
                         mimetype='text/csv', logger=PrintLogger())
     assert_in('Error during the load into PostgreSQL: '
               'unquoted carriage return found in data',
               str(exception.exception))
Example #6
    def test_reload_with_overridden_types(self):
        if not p.toolkit.check_ckan_version(min_version='2.7'):
            raise SkipTest(
                'Requires CKAN 2.7 - see https://github.com/ckan/ckan/pull/3557'
            )
        csv_filepath = get_sample_filepath('simple.csv')
        resource_id = 'test1'
        factories.Resource(id=resource_id)
        loader.load_csv(csv_filepath,
                        resource_id=resource_id,
                        mimetype='text/csv',
                        logger=PrintLogger())
        # Change types, as it would be done by Data Dictionary
        rec = p.toolkit.get_action('datastore_search')(None, {
            'resource_id': resource_id,
            'limit': 0
        })
        fields = [f for f in rec['fields'] if not f['id'].startswith('_')]
        fields[0]['info'] = {'type_override': 'timestamp'}
        fields[1]['info'] = {'type_override': 'numeric'}
        p.toolkit.get_action('datastore_create')({
            'ignore_auth': True
        }, {
            'resource_id': resource_id,
            'force': True,
            'fields': fields
        })
        # [{
        #         'id': f['id'],
        #         'type': f['type'],
        #         'info': fi if isinstance(fi, dict) else {}
        #         } for f, fi in izip_longest(fields, info)]

        # Load it again with new types
        fields = loader.load_csv(csv_filepath,
                                 resource_id=resource_id,
                                 mimetype='text/csv',
                                 logger=PrintLogger())
        loader.create_column_indexes(fields=fields,
                                     resource_id=resource_id,
                                     logger=PrintLogger())

        assert_equal(len(self._get_records('test1')), 6)
        assert_equal(
            self._get_column_names('test1'),
            [u'_id', u'_full_text', u'date', u'temperature', u'place'])
        assert_equal(self._get_column_types('test1'),
                     [u'int4', u'tsvector', u'timestamp', u'numeric', u'text'])

        # check that rows with nulls are indexed correctly
        records = self._get_records('test1', exclude_full_text_column=False)
        print records
        assert_equal(records[4][1], "'berkeley':1")
        assert_equal(records[5][1],
                     "'-01':2 '-03':3 '00':4,5,6 '2011':1 '5':7")
Example #7
 def test_geojson(self):
     filepath = get_sample_filepath('polling_locations.geojson')
     resource_id = 'test1'
     factories.Resource(id=resource_id)
     with assert_raises(LoaderError) as exception:
         loader.load_csv(filepath, resource_id=resource_id,
                         mimetype='text/csv', logger=PrintLogger())
     assert_in('Error with field definition',
               str(exception.exception))
     assert_in('"{"type":"FeatureCollection"" is not a valid field name',
               str(exception.exception))
Example #8
 def test_kml(self):
     filepath = get_sample_filepath('polling_locations.kml')
     resource_id = 'test1'
     factories.Resource(id=resource_id)
     with assert_raises(LoaderError) as exception:
         loader.load_csv(filepath, resource_id=resource_id,
                         mimetype='text/csv', logger=PrintLogger())
     assert_in('Error with field definition',
               str(exception.exception))
     assert_in('"<?xml version="1.0" encoding="utf-8" ?>" is not a valid field name',
               str(exception.exception))
Example #9
 def test_shapefile_zip(self):
     filepath = get_sample_filepath("polling_locations.shapefile.zip")
     resource_id = "test1"
     factories.Resource(id=resource_id)
     with pytest.raises(LoaderError) as exception:
         loader.load_csv(
             filepath,
             resource_id=resource_id,
             mimetype="text/csv",
             logger=PrintLogger(),
         )
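
This pytest version only asserts that LoaderError is raised. If the error message should be checked as well, an assertion mirroring Example #5 could follow the with block (assuming the message text is unchanged):

     assert ('Error during the load into PostgreSQL: '
             'unquoted carriage return found in data'
             in str(exception.value))
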
Example #10
 def test_boston_311_sample5(self):
     # to create the test file:
     # head -n 100001 ckanext/xloader/tests/samples/boston_311.csv > ckanext/xloader/tests/samples/boston_311_sample5.csv
     csv_filepath = get_sample_filepath('boston_311_sample5.csv')
     resource_id = 'test1'
     factories.Resource(id=resource_id)
     import time
     t0 = time.time()
     print '{} Start load'.format(time.strftime('%H:%M:%S', time.localtime(t0)))
     loader.load_csv(csv_filepath, resource_id=resource_id,
                     mimetype='text/csv', logger=PrintLogger())
     print 'Load: {}s'.format(time.time() - t0)
Example #11
 def test_boston_311_complete(self):
     # to get the test file:
     # curl -o ckanext/xloader/tests/samples/boston_311.csv https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/2968e2c0-d479-49ba-a884-4ef523ada3c0/download/311.csv
     csv_filepath = get_sample_filepath('boston_311.csv')
     resource_id = 'test1'
     factories.Resource(id=resource_id)
     import time
     t0 = time.time()
     print '{} Start load'.format(time.strftime('%H:%M:%S', time.localtime(t0)))
     loader.load_csv(csv_filepath, resource_id=resource_id,
                     mimetype='text/csv', logger=PrintLogger())
     print 'Load: {}s'.format(time.time() - t0)
Example #12
 def test_geojson(self):
     filepath = get_sample_filepath("polling_locations.geojson")
     resource_id = "test1"
     factories.Resource(id=resource_id)
     with pytest.raises(LoaderError) as exception:
         loader.load_csv(
             filepath,
             resource_id=resource_id,
             mimetype="text/csv",
             logger=PrintLogger(),
         )
     assert "Error with field definition" in str(exception.value)
     assert ('"{"type":"FeatureCollection"" is not a valid field name'
             in str(exception.value))
Example #13
    def test_column_names(self):
        csv_filepath = get_sample_filepath('column_names.csv')
        resource_id = 'test1'
        factories.Resource(id=resource_id)
        loader.load_csv(csv_filepath,
                        resource_id=resource_id,
                        mimetype='text/csv',
                        logger=PrintLogger())

        assert_equal(
            self._get_column_names('test1')[2:],
            [u'd@t$e', u't^e&m*pe!r(a)t?u:r%%e', ur'p\l/a[c{e%'])
        assert_equal(
            self._get_records('test1')[0], (1, u'2011-01-01', u'1', u'Galway'))
Example #14
 def test_kml(self):
     filepath = get_sample_filepath("polling_locations.kml")
     resource_id = "test1"
     factories.Resource(id=resource_id)
     with pytest.raises(LoaderError) as exception:
         loader.load_csv(
             filepath,
             resource_id=resource_id,
             mimetype="text/csv",
             logger=PrintLogger(),
         )
     assert "Error with field definition" in str(exception.value)
     assert (
         '"<?xml version="1.0" encoding="utf-8" ?>" is not a valid field name'
         in str(exception.value))
Example #15
 def test_integer_header_xlsx(self):
     # this xlsx file's header is detected by messytables.headers_guess as
     # integers and we should cope with that
     csv_filepath = get_sample_filepath('go-realtime.xlsx')
     resource_id = factories.Resource()['id']
     try:
         loader.load_csv(csv_filepath,
                         resource_id=resource_id,
                         mimetype='CSV',
                         logger=PrintLogger())
     except LoaderError as e:
         # it should fail at the COPY stage
         assert 'Error during the load into PostgreSQL: invalid byte ' \
             'sequence for encoding' in str(e)
     else:
         assert 0, 'There should have been an exception'
Example #16
 def test_integer_header_xlsx(self):
     # this xlsx file's header is detected by messytables.headers_guess as
     # integers and we should cope with that
     csv_filepath = get_sample_filepath("go-realtime.xlsx")
     resource_id = factories.Resource()["id"]
     try:
         loader.load_csv(
             csv_filepath,
             resource_id=resource_id,
             mimetype="CSV",
             logger=PrintLogger(),
         )
     except (LoaderError, UnicodeDecodeError):
         pass
     else:
         assert 0, "There should have been an exception"
Example #17
    def test_german(self, Session):
        csv_filepath = get_sample_filepath("german_sample.csv")
        resource_id = "test_german"
        factories.Resource(id=resource_id)
        loader.load_csv(
            csv_filepath,
            resource_id=resource_id,
            mimetype="text/csv",
            logger=PrintLogger(),
        )

        records = self._get_records(Session, "test_german")
        print(records)
        assert records[0] == (
            1,
            u"Zürich",
            u"68260",
            u"65444",
            u"62646",
            u"6503",
            u"28800",
            u"1173",
            u"6891",
            u"24221",
            u"672",
        )
        print(self._get_column_names(Session, "test_german"))
        assert self._get_column_names(Session, "test_german") == [
            u"_id",
            u"_full_text",
            u"Stadtname",
            u"Schuler_Total_2010/2011",
            u"Schuler_Total_2000/2001",
            u"Schuler_Total_1990/1991",
            u"Schuler_Vorschule_2010/2011",
            u"Schuler_Obligatorische Primar- und Sekundarstufe I_2010/2011",
            u"Schuler_Sekundarstufe II, Ubergangsausbildung Sek I. - Sek. II_",
            u"Schuler_Maturitatsschulen_2010/2011",
            u"Schuler_Berufsausbildung_2010/2011",
            u"Schuler_andere allgemeinbildende Schulen_2010/2011",
        ]
        print(self._get_column_types(Session, "test_german"))
        assert self._get_column_types(Session, "test_german") == [
            u"int4",
            u"tsvector",
        ] + [u"text"] * (len(records[0]) - 1)
Example #18
    def test_boston_311_sample5(self):
        # to create the test file:
        # head -n 100001 ckanext/xloader/tests/samples/boston_311.csv > ckanext/xloader/tests/samples/boston_311_sample5.csv
        csv_filepath = get_sample_filepath("boston_311_sample5.csv")
        resource_id = "test1"
        factories.Resource(id=resource_id)
        import time

        t0 = time.time()
        print("{} Start load".format(
            time.strftime("%H:%M:%S", time.localtime(t0))))
        loader.load_csv(
            csv_filepath,
            resource_id=resource_id,
            mimetype="text/csv",
            logger=PrintLogger(),
        )
        print("Load: {}s".format(time.time() - t0))
Example #19
    def test_reload(self):
        csv_filepath = get_sample_filepath('simple.csv')
        resource_id = 'test1'
        factories.Resource(id=resource_id)
        loader.load_csv(csv_filepath, resource_id=resource_id,
                        mimetype='text/csv', logger=PrintLogger())

        # Load it again unchanged
        loader.load_csv(csv_filepath, resource_id=resource_id,
                        mimetype='text/csv', logger=PrintLogger())

        assert_equal(len(self._get_records('test1')), 6)
        assert_equal(
            self._get_column_names('test1'),
            [u'_id', u'_full_text', u'date', u'temperature', u'place'])
        assert_equal(
            self._get_column_types('test1'),
            [u'int4', u'tsvector', u'text', u'text', u'text'])
Example #20
    def test_simple(self, Session):
        csv_filepath = get_sample_filepath("simple.csv")
        resource_id = "test1"
        factories.Resource(id=resource_id)
        loader.load_csv(
            csv_filepath,
            resource_id=resource_id,
            mimetype="text/csv",
            logger=PrintLogger(),
        )

        assert self._get_records(Session,
                                 "test1",
                                 limit=1,
                                 exclude_full_text_column=False) == [(
                                     1,
                                     "'-01':2,3 '1':4 '2011':1 'galway':5",
                                     u"2011-01-01",
                                     u"1",
                                     u"Galway",
                                 )]
        assert self._get_records(Session, "test1") == [
            (1, u"2011-01-01", u"1", u"Galway"),
            (2, u"2011-01-02", u"-1", u"Galway"),
            (3, u"2011-01-03", u"0", u"Galway"),
            (4, u"2011-01-01", u"6", u"Berkeley"),
            (5, None, None, u"Berkeley"),
            (6, u"2011-01-03", u"5", None),
        ]
        assert self._get_column_names(Session, "test1") == [
            u"_id",
            u"_full_text",
            u"date",
            u"temperature",
            u"place",
        ]
        assert self._get_column_types(Session, "test1") == [
            u"int4",
            u"tsvector",
            u"text",
            u"text",
            u"text",
        ]
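
For reference, the asserted records imply that simple.csv is a small file with date, temperature and place columns: a header row plus six data rows, roughly as below (reconstructed from the assertions; the actual sample file may differ in quoting or trailing whitespace):

    date,temperature,place
    2011-01-01,1,Galway
    2011-01-02,-1,Galway
    2011-01-03,0,Galway
    2011-01-01,6,Berkeley
    ,,Berkeley
    2011-01-03,5,
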
Example #21
    def test_brazilian(self):
        csv_filepath = get_sample_filepath('brazilian_sample.csv')
        resource_id = 'test1'
        factories.Resource(id=resource_id)
        loader.load_csv(csv_filepath, resource_id=resource_id,
                        mimetype='text/csv', logger=PrintLogger())

        records = self._get_records('test1')
        print records
        assert_equal(
            records[0],
            (1, u'01/01/1996 12:00:00 AM', u'1100015', u"ALTA FLORESTA D'OESTE", u'RO', None, u'128', u'0', u'8', u'119', u'1', u'0', u'3613', u'3051', u'130', u'7', u'121', u'3716', u'3078', u'127', u'7', None, None, None, None, u'6794', u'5036', u'1758', None, None, None, None, None, None, u'337', u'0.26112759', u'0.17210683', u'0.43323442', u'0.13353115', u'24.833692447908199', None, None, u'22.704964', u'67.080006197818605', u'65.144188573097907', u'74.672390253375497', u'16.7913561569619', u'19.4894563570641', u'8.649237411458509', u'7.60165422117368', u'11.1540090366186', u'17.263407056738099', u'8.5269823', u'9.2213373', u'5.3085136', u'52.472769803217503', None, None, None, None, None, None, u'25.0011414302354', u'22.830887000000001', u'66.8150490097632', u'64.893674212235595', u'74.288246611754104', u'17.0725384713319', u'19.8404105332814', u'8.856561911292371', u'7.74275834336647', u'11.357671741889', u'17.9410577459881', u'8.3696527', u'8.9979973', u'5.0570836', u'53.286314230720798', None, None, None, None, None, u'122988', None, u'10.155015000000001', u'14.826086999999999', u'11.671533', u'9.072917', None, None, None, None, None, None, None, None))
        print self._get_column_names('test1')
        assert_equal(
            self._get_column_names('test1'),
            [u'_id', u'_full_text', u'NU_ANO_CENSO', u'CO_MUNICIPIO', u'MUNIC', u'SIGLA', u'CO_UF', u'SCHOOLS_NU', u'SCHOOLS_FED_NU', u'SCHOOLS_ESTADUAL_NU', u'SCHOOLS_MUN_NU', u'SCHOOLS_PRIV_NU', u'SCHOOLS_FED_STUD', u'SCHOOLS_ESTADUAL_STUD', u'SCHOOLS_MUN_STUD', u'SCHOOLS_PRIV_STUD', u'SCHOOLS_URBAN_NU', u'SCHOOLS_RURAL_NU', u'SCHOOLS_URBAN_STUD', u'SCHOOLS_RURAL_STUD', u'SCHOOLS_NIVFUND_1_NU', u'SCHOOLS_NIVFUND_2_NU', u'SCHOOLS_EIGHTYEARS_NU', u'SCHOOLS_NINEYEARS_NU', u'SCHOOLS_EIGHTYEARS_STUD', u'SCHOOLS_NINEYEARS_STUD', u'MATFUND_NU', u'MATFUND_I_NU', u'MATFUND_T_NU', u'SCHOOLS_INTERNET_AVG', u'SCHOOLS_WATER_PUBLIC_AVG', u'SCHOOLS_WATER_AVG', u'SCHOOLS_ELECTR_PUB_AVG', u'SCHOOLS_SEWAGE_PUB_AVG', u'SCHOOLS_SEWAGE_AVG', u'PROFFUNDTOT_NU', u'PROFFUNDINC_PC', u'PROFFUNDCOMP_PC', u'PROFMED_PC', u'PROFSUP_PC', u'CLASSSIZE', u'CLASSSIZE_I', u'CLASSSIZE_T', u'STUDTEACH', u'RATE_APROV', u'RATE_APROV_I', u'RATE_APROV_T', u'RATE_FAILURE', u'RATE_FAILURE_I', u'RATE_FAILURE_T', u'RATE_ABANDON', u'RATE_ABANDON_I', u'RATE_ABANDON_T', u'RATE_TRANSFER', u'RATE_TRANSFER_I', u'RATE_TRANSFER_T', u'RATE_OVERAGE', u'RATE_OVERAGE_I', u'RATE_OVERAGE_T', u'PROVA_MEAN_PORT_I', u'PROVA_MEAN_PORT_T', u'PROVA_MEAN_MAT_I', u'PROVA_MEAN_MAT_T', u'CLASSSIZE_PUB', u'STUDTEACH_PUB', u'RATE_APROV_PUB', u'RATE_APROV_I_PUB', u'RATE_APROV_T_PUB', u'RATE_FAILURE_PUB', u'RATE_FAILURE_I_PUB', u'RATE_FAILURE_T_PUB', u'RATE_ABANDON_PUB', u'RATE_ABANDON_I_PUB', u'RATE_ABANDON_T_PUB', u'RATE_TRANSFER_PUB', u'RATE_TRANSFER_I_PUB', u'RATE_TRANSFER_T_PUB', u'RATE_OVERAGE_PUB', u'RATE_OVERAGE_I_PUB', u'RATE_OVERAGE_T_PUB', u'PROVA_MEAN_PORT_I_PUB', u'PROVA_MEAN_PORT_T_PUB', u'PROVA_MEAN_MAT_I_PUB', u'PROFFUNDTOT_NU_PUB', u'PROVA_MEAN_MAT_T_PUB', u'EDUCTEACH_PUB', u'EDUCTEACH_FEDERAL', u'EDUCTEACH_STATE', u'EDUCTEACH_MUN', u'PROVA_MEAN_PORT_I_STATE', u'PROVA_MEAN_PORT_T_STATE', u'PROVA_MEAN_MAT_I_STATE', u'PROVA_MEAN_MAT_T_STATE', u'PROVA_MEAN_PORT_I_MUN', u'PROVA_MEAN_PORT_T_MUN', u'PROVA_MEAN_MAT_I_MUN', u'PROVA_MEAN_MAT_T_MUN'])
        print self._get_column_types('test1')
        assert_equal(self._get_column_types('test1'),
                     [u'int4', u'tsvector'] +
                     [u'text'] * (len(records[0]) - 1))
Example #22
    def test_simple_with_indexing(self):
        csv_filepath = get_sample_filepath('simple.csv')
        resource_id = 'test1'
        factories.Resource(id=resource_id)
        fields = loader.load_csv(csv_filepath, resource_id=resource_id,
                                 mimetype='text/csv', logger=PrintLogger())
        loader.create_column_indexes(fields=fields, resource_id=resource_id,
                                     logger=PrintLogger())

        assert_equal(self._get_records(
            'test1', limit=1, exclude_full_text_column=False)[0][1],
                     "'-01':2,3 '1':4 '2011':1 'galway':5")
Example #23
    def test_column_names(self, Session):
        csv_filepath = get_sample_filepath("column_names.csv")
        resource_id = "test1"
        factories.Resource(id=resource_id)
        loader.load_csv(
            csv_filepath,
            resource_id=resource_id,
            mimetype="text/csv",
            logger=PrintLogger(),
        )

        assert self._get_column_names(Session, "test1")[2:] == [
            u"d@t$e",
            u"t^e&m*pe!r(a)t?u:r%%e",
            r"p\l/a[c{e%",
        ]
        assert self._get_records(Session, "test1")[0] == (
            1,
            u"2011-01-01",
            u"1",
            u"Galway",
        )
Example #24
    def test_simple_with_indexing(self, Session):
        csv_filepath = get_sample_filepath("simple.csv")
        resource_id = "test1"
        factories.Resource(id=resource_id)
        fields = loader.load_csv(
            csv_filepath,
            resource_id=resource_id,
            mimetype="text/csv",
            logger=PrintLogger(),
        )
        loader.create_column_indexes(fields=fields,
                                     resource_id=resource_id,
                                     logger=PrintLogger())

        assert (self._get_records(Session,
                                  "test1",
                                  limit=1,
                                  exclude_full_text_column=False)[0][1] ==
                "'-01':2,3 '1':4 '2011':1 'galway':5")
Example #25
    def test_boston_311(self, Session):
        csv_filepath = get_sample_filepath("boston_311_sample.csv")
        resource_id = "test1"
        factories.Resource(id=resource_id)
        loader.load_csv(
            csv_filepath,
            resource_id=resource_id,
            mimetype="text/csv",
            logger=PrintLogger(),
        )

        records = self._get_records(Session, "test1")
        print(records)
        assert records == [
            (
                1,
                u"101002153891",
                u"2017-07-06 23:38:43",
                u"2017-07-21 08:30:00",
                None,
                u"ONTIME",
                u"Open",
                u" ",
                u"Street Light Outages",
                u"Public Works Department",
                u"Street Lights",
                u"Street Light Outages",
                u"PWDx_Street Light Outages",
                u"PWDx",
                None,
                None,
                u"480 Harvard St  Dorchester  MA  02124",
                u"8",
                u"07",
                u"4",
                u"B3",
                u"Greater Mattapan",
                u"9",
                u"Ward 14",
                u"1411",
                u"480 Harvard St",
                u"02124",
                u"42.288",
                u"-71.0927",
                u"Citizens Connect App",
            ),  # noqa
            (
                2,
                u"101002153890",
                u"2017-07-06 23:29:13",
                u"2017-09-11 08:30:00",
                None,
                u"ONTIME",
                u"Open",
                u" ",
                u"Graffiti Removal",
                u"Property Management",
                u"Graffiti",
                u"Graffiti Removal",
                u"PROP_GRAF_GraffitiRemoval",
                u"PROP",
                u" https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg",
                None,
                u"522 Saratoga St  East Boston  MA  02128",
                u"1",
                u"09",
                u"1",
                u"A7",
                u"East Boston",
                u"1",
                u"Ward 1",
                u"0110",
                u"522 Saratoga St",
                u"02128",
                u"42.3807",
                u"-71.0259",
                u"Citizens Connect App",
            ),  # noqa
            (
                3,
                u"101002153889",
                u"2017-07-06 23:24:20",
                u"2017-09-11 08:30:00",
                None,
                u"ONTIME",
                u"Open",
                u" ",
                u"Graffiti Removal",
                u"Property Management",
                u"Graffiti",
                u"Graffiti Removal",
                u"PROP_GRAF_GraffitiRemoval",
                u"PROP",
                u" https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg",
                None,
                u"965 Bennington St  East Boston  MA  02128",
                u"1",
                u"09",
                u"1",
                u"A7",
                u"East Boston",
                u"1",
                u"Ward 1",
                u"0112",
                u"965 Bennington St",
                u"02128",
                u"42.386",
                u"-71.008",
                u"Citizens Connect App",
            ),
        ]  # noqa
        print(self._get_column_names(Session, "test1"))
        assert self._get_column_names(Session, "test1") == [
            u"_id",
            u"_full_text",
            u"CASE_ENQUIRY_ID",
            u"open_dt",
            u"target_dt",
            u"closed_dt",
            u"OnTime_Status",
            u"CASE_STATUS",
            u"CLOSURE_REASON",
            u"CASE_TITLE",
            u"SUBJECT",
            u"REASON",
            u"TYPE",
            u"QUEUE",
            u"Department",
            u"SubmittedPhoto",
            u"ClosedPhoto",
            u"Location",
            u"Fire_district",
            u"pwd_district",
            u"city_council_district",
            u"police_district",
            u"neighborhood",
            u"neighborhood_services_district",
            u"ward",
            u"precinct",
            u"LOCATION_STREET_NAME",
            u"LOCATION_ZIPCODE",
            u"Latitude",
            u"Longitude",
            u"Source",
        ]  # noqa
        print(self._get_column_types(Session, "test1"))
        assert self._get_column_types(Session, "test1") == [
            u"int4",
            u"tsvector",
        ] + [u"text"] * (len(records[0]) - 1)
Example #26
    def test_reload_with_overridden_types(self, Session):
        csv_filepath = get_sample_filepath("simple.csv")
        resource_id = "test1"
        factories.Resource(id=resource_id)
        loader.load_csv(
            csv_filepath,
            resource_id=resource_id,
            mimetype="text/csv",
            logger=PrintLogger(),
        )
        # Change types, as it would be done by Data Dictionary
        rec = p.toolkit.get_action("datastore_search")(None, {
            "resource_id": resource_id,
            "limit": 0
        })
        fields = [f for f in rec["fields"] if not f["id"].startswith("_")]
        fields[0]["info"] = {"type_override": "timestamp"}
        fields[1]["info"] = {"type_override": "numeric"}
        p.toolkit.get_action("datastore_create")(
            {
                "ignore_auth": True
            },
            {
                "resource_id": resource_id,
                "force": True,
                "fields": fields
            },
        )

        # Load it again with new types
        fields = loader.load_csv(
            csv_filepath,
            resource_id=resource_id,
            mimetype="text/csv",
            logger=PrintLogger(),
        )
        loader.create_column_indexes(fields=fields,
                                     resource_id=resource_id,
                                     logger=PrintLogger())

        assert len(self._get_records(Session, "test1")) == 6
        assert self._get_column_names(Session, "test1") == [
            u"_id",
            u"_full_text",
            u"date",
            u"temperature",
            u"place",
        ]
        assert self._get_column_types(Session, "test1") == [
            u"int4",
            u"tsvector",
            u"timestamp",
            u"numeric",
            u"text",
        ]

        # check that rows with nulls are indexed correctly
        records = self._get_records(Session,
                                    "test1",
                                    exclude_full_text_column=False)
        print(records)
        assert records[4][1] == "'berkeley':1"
        assert records[5][1] == "'-01':2 '-03':3 '00':4,5,6 '2011':1 '5':7"
Example #27
    def test_brazilian(self, Session):
        csv_filepath = get_sample_filepath("brazilian_sample.csv")
        resource_id = "test1"
        factories.Resource(id=resource_id)
        loader.load_csv(
            csv_filepath,
            resource_id=resource_id,
            mimetype="text/csv",
            logger=PrintLogger(),
        )

        records = self._get_records(Session, "test1")
        print(records)
        assert records[0] == (
            1,
            u"01/01/1996 12:00:00 AM",
            u"1100015",
            u"ALTA FLORESTA D'OESTE",
            u"RO",
            None,
            u"128",
            u"0",
            u"8",
            u"119",
            u"1",
            u"0",
            u"3613",
            u"3051",
            u"130",
            u"7",
            u"121",
            u"3716",
            u"3078",
            u"127",
            u"7",
            None,
            None,
            None,
            None,
            u"6794",
            u"5036",
            u"1758",
            None,
            None,
            None,
            None,
            None,
            None,
            u"337",
            u"0.26112759",
            u"0.17210683",
            u"0.43323442",
            u"0.13353115",
            u"24.833692447908199",
            None,
            None,
            u"22.704964",
            u"67.080006197818605",
            u"65.144188573097907",
            u"74.672390253375497",
            u"16.7913561569619",
            u"19.4894563570641",
            u"8.649237411458509",
            u"7.60165422117368",
            u"11.1540090366186",
            u"17.263407056738099",
            u"8.5269823",
            u"9.2213373",
            u"5.3085136",
            u"52.472769803217503",
            None,
            None,
            None,
            None,
            None,
            None,
            u"25.0011414302354",
            u"22.830887000000001",
            u"66.8150490097632",
            u"64.893674212235595",
            u"74.288246611754104",
            u"17.0725384713319",
            u"19.8404105332814",
            u"8.856561911292371",
            u"7.74275834336647",
            u"11.357671741889",
            u"17.9410577459881",
            u"8.3696527",
            u"8.9979973",
            u"5.0570836",
            u"53.286314230720798",
            None,
            None,
            None,
            None,
            None,
            u"122988",
            None,
            u"10.155015000000001",
            u"14.826086999999999",
            u"11.671533",
            u"9.072917",
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )  # noqa
        print(self._get_column_names(Session, "test1"))
        assert self._get_column_names(Session, "test1") == [
            u"_id",
            u"_full_text",
            u"NU_ANO_CENSO",
            u"CO_MUNICIPIO",
            u"MUNIC",
            u"SIGLA",
            u"CO_UF",
            u"SCHOOLS_NU",
            u"SCHOOLS_FED_NU",
            u"SCHOOLS_ESTADUAL_NU",
            u"SCHOOLS_MUN_NU",
            u"SCHOOLS_PRIV_NU",
            u"SCHOOLS_FED_STUD",
            u"SCHOOLS_ESTADUAL_STUD",
            u"SCHOOLS_MUN_STUD",
            u"SCHOOLS_PRIV_STUD",
            u"SCHOOLS_URBAN_NU",
            u"SCHOOLS_RURAL_NU",
            u"SCHOOLS_URBAN_STUD",
            u"SCHOOLS_RURAL_STUD",
            u"SCHOOLS_NIVFUND_1_NU",
            u"SCHOOLS_NIVFUND_2_NU",
            u"SCHOOLS_EIGHTYEARS_NU",
            u"SCHOOLS_NINEYEARS_NU",
            u"SCHOOLS_EIGHTYEARS_STUD",
            u"SCHOOLS_NINEYEARS_STUD",
            u"MATFUND_NU",
            u"MATFUND_I_NU",
            u"MATFUND_T_NU",
            u"SCHOOLS_INTERNET_AVG",
            u"SCHOOLS_WATER_PUBLIC_AVG",
            u"SCHOOLS_WATER_AVG",
            u"SCHOOLS_ELECTR_PUB_AVG",
            u"SCHOOLS_SEWAGE_PUB_AVG",
            u"SCHOOLS_SEWAGE_AVG",
            u"PROFFUNDTOT_NU",
            u"PROFFUNDINC_PC",
            u"PROFFUNDCOMP_PC",
            u"PROFMED_PC",
            u"PROFSUP_PC",
            u"CLASSSIZE",
            u"CLASSSIZE_I",
            u"CLASSSIZE_T",
            u"STUDTEACH",
            u"RATE_APROV",
            u"RATE_APROV_I",
            u"RATE_APROV_T",
            u"RATE_FAILURE",
            u"RATE_FAILURE_I",
            u"RATE_FAILURE_T",
            u"RATE_ABANDON",
            u"RATE_ABANDON_I",
            u"RATE_ABANDON_T",
            u"RATE_TRANSFER",
            u"RATE_TRANSFER_I",
            u"RATE_TRANSFER_T",
            u"RATE_OVERAGE",
            u"RATE_OVERAGE_I",
            u"RATE_OVERAGE_T",
            u"PROVA_MEAN_PORT_I",
            u"PROVA_MEAN_PORT_T",
            u"PROVA_MEAN_MAT_I",
            u"PROVA_MEAN_MAT_T",
            u"CLASSSIZE_PUB",
            u"STUDTEACH_PUB",
            u"RATE_APROV_PUB",
            u"RATE_APROV_I_PUB",
            u"RATE_APROV_T_PUB",
            u"RATE_FAILURE_PUB",
            u"RATE_FAILURE_I_PUB",
            u"RATE_FAILURE_T_PUB",
            u"RATE_ABANDON_PUB",
            u"RATE_ABANDON_I_PUB",
            u"RATE_ABANDON_T_PUB",
            u"RATE_TRANSFER_PUB",
            u"RATE_TRANSFER_I_PUB",
            u"RATE_TRANSFER_T_PUB",
            u"RATE_OVERAGE_PUB",
            u"RATE_OVERAGE_I_PUB",
            u"RATE_OVERAGE_T_PUB",
            u"PROVA_MEAN_PORT_I_PUB",
            u"PROVA_MEAN_PORT_T_PUB",
            u"PROVA_MEAN_MAT_I_PUB",
            u"PROFFUNDTOT_NU_PUB",
            u"PROVA_MEAN_MAT_T_PUB",
            u"EDUCTEACH_PUB",
            u"EDUCTEACH_FEDERAL",
            u"EDUCTEACH_STATE",
            u"EDUCTEACH_MUN",
            u"PROVA_MEAN_PORT_I_STATE",
            u"PROVA_MEAN_PORT_T_STATE",
            u"PROVA_MEAN_MAT_I_STATE",
            u"PROVA_MEAN_MAT_T_STATE",
            u"PROVA_MEAN_PORT_I_MUN",
            u"PROVA_MEAN_PORT_T_MUN",
            u"PROVA_MEAN_MAT_I_MUN",
            u"PROVA_MEAN_MAT_T_MUN",
        ]  # noqa
        print(self._get_column_types(Session, "test1"))
        assert self._get_column_types(Session, "test1") == [
            u"int4",
            u"tsvector",
        ] + [u"text"] * (len(records[0]) - 1)