Example 1
    def read_schema(self):
        from solariat_bottle.utils.predictor_events import translate_column, get_type

        # sample up to MAX_ANALYSIS_LINES rows into a temporary file so pandas
        # can infer column types without reading the whole upload
        analysis_temp_file = tempfile.TemporaryFile('r+')
        headers = self.csv_file.readline()
        if not headers:
            raise CsvDataValidationError('Input file is empty')
        analysis_temp_file.write(headers)

        for idx, line_data in enumerate(self.csv_file.readlines(), start=1):
            analysis_temp_file.write(line_data)
            if idx == self.MAX_ANALYSIS_LINES:
                break

        analysis_temp_file.seek(0)
        schema_json = []
        try:
            dataframe = pandas.read_csv(analysis_temp_file, sep=self.sep)
        except Exception as ex:
            LOGGER.error('Cannot parse file:', exc_info=True)
            raise CsvDataValidationError('Cannot parse file %s' % str(ex))

        for col in dataframe.columns:
            schema_entry = dict(name=translate_column(col),
                                type=get_type(dataframe[col].dtype,
                                              dataframe[col].values))
            schema_json.append(schema_entry)

        return schema_json
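
A minimal usage sketch for the method above. CsvDataLoader and its TAB separator constant appear in Example 4; the import path and the file name used here are assumptions, not something the examples confirm.

    # Sketch only: the import path and file name are hypothetical.
    from solariat_bottle.schema_data_loaders.csv import CsvDataLoader

    with open('interactions.tsv') as csv_file:
        loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
        schema = loader.read_schema()
        # schema is a list of dicts like {'name': <translated column>, 'type': <inferred type>}
        for entry in schema:
            print('%s: %s' % (entry['name'], entry['type']))
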
Example 2
    def load_data(self):
        from solariat_bottle.utils.predictor_events import translate_column

        self.csv_file.seek(0)
        # TODO: commented out to keep csv & json data processing similar;
        # it also makes little sense: if someone loads ALL the data and then
        # applies the discovered schema, they can lose some data anyway.
        # dataframe = pandas.read_csv(self.csv_file, dtype=str, sep=self.sep)

        # TODO: add chunksize=self.LOAD_CHUNK_SIZE
        dataframe = pandas.read_csv(self.csv_file, sep=self.sep)

        for idx, (_, row_data) in enumerate(dataframe.iterrows(), start=1):
            mongo_data = {}
            for _col_name, col_value in row_data.iteritems():
                col_name = translate_column(_col_name)
                # keep string cells as-is; for numeric cells, skip NaN (missing values)
                if type(col_value) in (str,
                                       unicode) or not numpy.isnan(col_value):
                    mongo_data[col_name] = col_value
            yield mongo_data
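
A hedged sketch of consuming the generator above, reusing the hypothetical loader from the sketch under Example 1; load_data() yields one plain dict per CSV row, with NaN cells dropped.

    # Sketch only: `loader` is the hypothetical CsvDataLoader instance from the
    # earlier sketch; the keys of each dict are the translated column names.
    rows = list(loader.load_data())
    print('loaded %d rows' % len(rows))
    print(rows[0])  # e.g. {'interaction_id': 1234, ...} -- key names are assumptions
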
Example 3
        def enforce_schema(self, raw_data, status):
            '''If we ever decide to insert truly RAW data into the mongo
               collection without casting while OUT_OF_SYNC, use the
               :status field to make that decision known.
            '''
            from solariat_bottle.utils.predictor_events import translate_column, get_type

            field_types = self.schema_field_types
            mongo_data = {}

            # TODO: cache translate_column
            for _col_name, col_value in raw_data.iteritems():
                col_name = translate_column(_col_name)
                if col_name not in field_types:
                    continue
                mongo_data[col_name] = apply_shema_type(
                    col_value, field_types[col_name])

            return mongo_data
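
For illustration, a hedged stand-in for the per-field cast that apply_shema_type performs; the real helper in the codebase may support more types and handle errors differently. Of the type names used here, only 'integer', 'timestamp' and 'string' appear in the tests below; everything else is an assumption.

    # Hypothetical stand-in for apply_shema_type, for illustration only.
    def cast_value(value, field_type):
        if field_type == 'integer':
            return int(value)
        if field_type == 'timestamp':
            from dateutil.parser import parse  # assumes python-dateutil is available
            return parse(value)
        return unicode(value)  # default: keep as a string (Python 2 codebase)
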
Example 4
    def _load_customer_profile(self):
        from solariat_bottle.utils.predictor_events import translate_column
        manager = getattr(self.user.account, 'customer_profile')
        with open(CSV_SHORT_FILEPATH) as csv_file:
            data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
            profile = manager.create(self.user, data_loader)

        id_col_name = translate_column('INTERACTION_ID')
        for col in profile.discovered_schema:
            if col[KEY_NAME] == id_col_name:
                col[KEY_IS_ID] = True

        self._post('/customer_profile/update_schema',
                   {'schema': profile.discovered_schema},
                   expected_code=201)

        data = self._get('/customer_profile/get', {})
        self.assertEqual(data['data']['schema'], profile.discovered_schema)

        profile.reload()
        profile.apply_sync()
        profile.accept_sync()
        data = profile.get_data()[0]
        self.customer_profile = data
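
For reference, an assumed shape of one discovered_schema entry after the loop above marks the id column; the concrete key strings behind KEY_NAME / KEY_IS_ID and the translated column name are assumptions based on Example 1.

    # Illustrative only: assumed key strings and values.
    example_entry = {
        'name': 'interaction_id',  # translate_column('INTERACTION_ID'), assumed lower-cased
        'type': 'integer',         # whatever get_type() inferred from the sample rows
        'is_id': True,             # set by the loop above for the id column
    }
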
Example 5
    def test_cancel_edit_apply_flow(self):
        from solariat_bottle.utils.predictor_events import translate_column
        from solariat.db.mongo import get_connection
        from datetime import datetime

        name = 'TestCancelUpdateCancelLoop'
        ITX_COL_NAME = translate_column('INTERACTION_ID')

        dataset = self.create_and_load_dataset(name)
        new_schema = [dict(col) for col in dataset.schema]
        itx_col = [col for col in new_schema
                   if col[KEY_NAME] == ITX_COL_NAME][0]
        self.assertTrue(itx_col[KEY_TYPE] == TYPE_TIMESTAMP)
        raw_data = dataset.data_coll.find_one()
        self.assertTrue(isinstance(raw_data[ITX_COL_NAME], datetime))

        itx_col[KEY_TYPE] = TYPE_STRING
        dataset.update_schema(new_schema)
        dataset.reload()
        assert dataset.schema == new_schema

        dataset.apply_sync()
        itx_col = [
            col for col in dataset.schema if col[KEY_NAME] == ITX_COL_NAME
        ][0]
        self.assertTrue(
            itx_col[KEY_TYPE] == TYPE_STRING,
            'type:%s, but must be:%s' % (itx_col[KEY_TYPE], TYPE_STRING))
        raw_sync_data = dataset.data_sync_coll.find_one()
        self.assertTrue(isinstance(raw_sync_data[ITX_COL_NAME], basestring))

        self.assertEqual(dataset.sync_status, Dataset.SYNCED)
        dataset.cancel_sync()
        self.assertEqual(dataset.sync_status, Dataset.OUT_OF_SYNC)

        # check the sync collection no longer exists
        colls = get_connection().collection_names(
            include_system_collections=False)
        self.assertTrue(dataset.sync_collection not in colls)

        TEST_SCHEMA = [dict(col) for col in dataset.schema]
        itx_col = [
            col for col in TEST_SCHEMA if col[KEY_NAME] == ITX_COL_NAME
        ][0]
        itx_col[KEY_TYPE] = TYPE_INTEGER
        dataset.update_schema(TEST_SCHEMA)
        self.assertEqual(dataset.schema, TEST_SCHEMA)

        dataset.apply_sync()
        itx_col = [
            col for col in dataset.schema if col[KEY_NAME] == ITX_COL_NAME
        ][0]
        self.assertTrue(itx_col[KEY_TYPE] == TYPE_INTEGER)
        raw_sync_data = dataset.data_sync_coll.find_one()
        self.assertTrue(
            isinstance(raw_sync_data[ITX_COL_NAME], (int, long, float)))

        dataset.accept_sync()
        self.assertEqual(dataset.sync_status, Dataset.IN_SYNC)
        colls = get_connection().collection_names(
            include_system_collections=False)
        self.assertTrue(dataset.sync_collection not in colls)
        raw_data = dataset.data_coll.find_one()
        self.assertTrue(isinstance(raw_data[ITX_COL_NAME], (int, long, float)))
        dataset.drop_data()
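
The test above walks the whole sync lifecycle; here is a compact sketch of the flow it exercises. The method and status names come from the test itself; the one-line descriptions are inferred from its assertions, not from the Dataset implementation.

    # Inferred flow, not a definitive API description.
    def sync_roundtrip(dataset, new_schema):
        dataset.update_schema(new_schema)  # allowed while OUT_OF_SYNC
        dataset.apply_sync()               # re-casts rows into data_sync_coll -> SYNCED
        dataset.cancel_sync()              # drops the sync collection -> back to OUT_OF_SYNC
        dataset.apply_sync()               # run the cast again -> SYNCED
        dataset.accept_sync()              # promote synced rows into data_coll -> IN_SYNC
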
Example 6
    def test_dataset_workflow(self):
        from solariat_bottle.utils.predictor_events import translate_column

        acc = self.user.account

        # create
        with open(CSV_FILEPATH) as csv_file:
            post_data = self.get_post_data(csv_file)
            # test create
            resp = self.client.post('/dataset/create',
                                    buffered=True,
                                    content_type='multipart/form-data',
                                    data=post_data,
                                    base_url='https://localhost')

            self.assertEqual(resp.status_code, 201)
            data = json.loads(resp.data)
            self.assertTrue(data['ok'])
            self.assertEqual(data['data']['sync_status'], Dataset.OUT_OF_SYNC)
            self.assertTrue(data['data']['schema'])
            self.assertFalse(data['data']['is_locked'])
            dataset = acc.datasets.get_dataset(self.user,
                                               CREATE_UPDATE_DATASET_NAME)
            schema = dataset.schema
            DataClass = dataset.get_data_class()
            self.assertEqual(DataClass.objects.count(), 50)

        # test update schema
        # based on the test data, let's just change one column type
        itx_col_name = translate_column('INTERACTION_ID')
        itx_col = [s for s in schema if s['name'] == itx_col_name][0]
        assert itx_col['type'] in ('integer', 'timestamp'), (itx_col['type'],
                                                             itx_col_name)
        itx_col['type'] = 'string'
        data = self._post('/dataset/update_schema/%s' %
                          CREATE_UPDATE_DATASET_NAME, {'schema': schema},
                          expected_code=201)
        dataset = acc.datasets.get_dataset(self.user,
                                           CREATE_UPDATE_DATASET_NAME)
        self.assertTrue(any(col['name'] == itx_col_name and col['type'] == 'string'
                            for col in dataset.schema))

        # test invalid schema
        broken_schema = schema[1:]
        data = self._post('/dataset/update_schema/%s' %
                          CREATE_UPDATE_DATASET_NAME,
                          {'schema': broken_schema},
                          expected_result=False,
                          expected_code=500)

        # cannot accept a sync until one has actually happened
        data = self._post('/dataset/sync/accept/%s' %
                          CREATE_UPDATE_DATASET_NAME, {},
                          expected_result=False,
                          expected_code=500)

        # also cover the case where not all of the data can be synced
        FAIL_COL_NAME = 'STAT_INI_1'
        dataset.reload()
        col = [
            col for col in dataset.schema if col[KEY_NAME] == FAIL_COL_NAME
        ][0]
        self.assertEqual(col[KEY_TYPE], TYPE_INTEGER)
        raw_data = dataset.data_coll.find_one()
        dataset.data_coll.update({'_id': raw_data['_id']},
                                 {'$set': {
                                     FAIL_COL_NAME: 'fail'
                                 }})

        # test applying schema on dataset (synchronous mode for testing)
        data = self._post('/dataset/sync/apply/%s' %
                          CREATE_UPDATE_DATASET_NAME, {},
                          expected_code=201)

        self.assertEqual(data['data']['sync_status'], Dataset.SYNCED)
        self.assertTrue(data['data']['is_locked'])
        # we manually made 1 row fail the sync above
        self.assertEqual(data['data']['items_synced'], 49)

        # until we accept/discard the last sync,
        # the original collection keeps its original data
        dataset = acc.datasets.get_dataset(self.user,
                                           CREATE_UPDATE_DATASET_NAME)
        DataClass = dataset.get_data_class()
        self.assertEqual(DataClass.objects.count(), 50)

        data = self._post('/dataset/sync/apply/%s' %
                          CREATE_UPDATE_DATASET_NAME, {},
                          expected_result=False,
                          expected_code=500)

        data = self._post('/dataset/sync/accept/%s' %
                          CREATE_UPDATE_DATASET_NAME, {},
                          expected_code=201)
        dataset = acc.datasets.get_dataset(self.user,
                                           CREATE_UPDATE_DATASET_NAME)
        DataClass = dataset.get_data_class()
        self.assertEqual(DataClass.objects.count(), 49)

        # test update, append 50 items again
        with open(CSV_FILEPATH) as csv_file:
            post_data = self.get_post_data(csv_file)
            resp = self.client.post('/dataset/update/%s' %
                                    CREATE_UPDATE_DATASET_NAME,
                                    buffered=True,
                                    content_type='multipart/form-data',
                                    data=post_data,
                                    base_url='https://localhost')

            data = json.loads(resp.data)
            self.assertEqual(resp.status_code, 201)
            self.assertTrue(data['ok'])
            self.assertEqual(data['data']['rows'], 99)
            dataset = acc.datasets.get_dataset(self.user,
                                               CREATE_UPDATE_DATASET_NAME)
            DataClass = dataset.get_data_class()
            self.assertEqual(DataClass.objects.count(), 99)

        data = self._post('/dataset/update_schema/%s' %
                          CREATE_UPDATE_DATASET_NAME, {'schema': schema},
                          expected_result=False,
                          expected_code=500)

        # prepare a wrong schema for the data update
        from StringIO import StringIO
        stream = StringIO()
        with open(CSV_FILEPATH) as csv_file:
            for row in csv_file:
                cols = row.split(CSV_SEPARATOR)
                if len(cols) > 1:
                    row = CSV_SEPARATOR.join(cols[1:])
                stream.write(row)
        stream.seek(0)
        post_data = self.get_post_data(stream)
        resp = self.client.post('/dataset/update/%s' %
                                CREATE_UPDATE_DATASET_NAME,
                                buffered=True,
                                content_type='multipart/form-data',
                                data=post_data,
                                base_url='https://localhost')

        self.assertEqual(resp.status_code, 500)
        data = json.loads(resp.data)
        self.assertFalse(data['ok'])
        dataset.drop_data()
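
For reference, the endpoints this test touches, in the order it hits them; the paths are taken verbatim from the test, while the one-line descriptions are inferred from its assertions.

    # Endpoint map inferred from the test above.
    DATASET_ENDPOINTS = [
        ('/dataset/create',               'multipart CSV upload -> new dataset with a discovered schema'),
        ('/dataset/update_schema/<name>', 'replace the schema while OUT_OF_SYNC; rejected once locked'),
        ('/dataset/sync/apply/<name>',    'cast rows into the sync collection -> SYNCED'),
        ('/dataset/sync/accept/<name>',   'promote synced rows, dropping the ones that failed to cast'),
        ('/dataset/update/<name>',        'append more CSV rows; the columns must match the accepted schema'),
    ]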