def read_schema(self):
    from solariat_bottle.utils.predictor_events import translate_column, get_type

    analysis_temp_file = tempfile.TemporaryFile('r+')
    headers = self.csv_file.readline()
    if not headers:
        raise CsvDataValidationError('Input file is empty')

    # copy the header plus up to MAX_ANALYSIS_LINES rows into a temp file
    # so type inference runs on a bounded sample of the input
    analysis_temp_file.write(headers)
    for idx, line_data in enumerate(self.csv_file.readlines(), start=1):
        analysis_temp_file.write(line_data)
        if idx == self.MAX_ANALYSIS_LINES:
            break
    analysis_temp_file.seek(0)

    schema_json = []
    try:
        dataframe = pandas.read_csv(analysis_temp_file, sep=self.sep)
    except Exception as ex:
        LOGGER.error('Cannot parse file:', exc_info=True)
        raise CsvDataValidationError('Cannot parse file %s' % str(ex))

    for col in dataframe.columns:
        schema_entry = dict(name=translate_column(col),
                            type=get_type(dataframe[col].dtype,
                                          dataframe[col].values))
        schema_json.append(schema_entry)
    return schema_json
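# Usage sketch (hypothetical file path; the constructor call mirrors the
# tests below, which do CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)):
#
#     with open('/tmp/events.tsv') as fp:
#         loader = CsvDataLoader(fp, sep=CsvDataLoader.TAB)
#         schema = loader.read_schema()
#
# schema is a list of {'name': ..., 'type': ...} dicts, one per CSV column;
# the exact names and types depend on translate_column() and get_type(),
# e.g. [{'name': 'interaction_id', 'type': 'timestamp'}, ...].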
def load_data(self):
    from solariat_bottle.utils.predictor_events import translate_column

    self.csv_file.seek(0)
    # TODO: chunked reading was commented out to keep csv & json data
    # processing uniform, but that makes little sense: if someone loads
    # ALL the data and then applies the discovered schema, they can lose
    # some data anyway.
    # dataframe = pandas.read_csv(self.csv_file, dtype=str, sep=self.sep)
    # TODO: add chunksize=self.LOAD_CHUNK_SIZE
    dataframe = pandas.read_csv(self.csv_file, sep=self.sep)
    for _, row_data in dataframe.iterrows():
        mongo_data = {}
        for _col_name, col_value in row_data.iteritems():
            col_name = translate_column(_col_name)
            # keep string values as-is; for numeric cells skip NaN
            if type(col_value) in (str, unicode) or not numpy.isnan(col_value):
                mongo_data[col_name] = col_value
        yield mongo_data
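# Usage sketch (the collection name is illustrative, not the real one):
# load_data() is a generator yielding one dict per CSV row with NaN cells
# dropped, so it can feed a bulk insert directly, e.g.:
#
#     docs = list(loader.load_data())
#     get_connection()['dataset_raw_data'].insert(docs)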
def enforce_schema(self, raw_data, status):
    """Cast raw values to the types declared in the schema.

    If we ever decide to insert truly RAW data into the mongo collection
    without casting while OUT_OF_SYNC, the :status field is where that
    decision should be made and recorded.
    """
    from solariat_bottle.utils.predictor_events import translate_column

    field_types = self.schema_field_types
    mongo_data = {}
    # TODO: cache translate_column
    for _col_name, col_value in raw_data.iteritems():
        col_name = translate_column(_col_name)
        if col_name not in field_types:
            # drop columns that are not part of the schema
            continue
        mongo_data[col_name] = apply_shema_type(col_value,
                                                field_types[col_name])
    return mongo_data
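# For reference, a rough sketch of what apply_shema_type() is assumed to do
# (the real helper is defined elsewhere in the codebase; this body is purely
# illustrative):
#
#     def apply_shema_type(value, field_type):
#         if field_type == TYPE_INTEGER:
#             return int(value)
#         if field_type == TYPE_STRING:
#             return unicode(value)
#         if field_type == TYPE_TIMESTAMP:
#             return parse_datetime(value)  # hypothetical timestamp parser
#         return value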
def _load_customer_profile(self):
    from solariat_bottle.utils.predictor_events import translate_column

    manager = getattr(self.user.account, 'customer_profile')
    with open(CSV_SHORT_FILEPATH) as csv_file:
        data_loader = CsvDataLoader(csv_file, sep=CsvDataLoader.TAB)
        profile = manager.create(self.user, data_loader)

    id_col_name = translate_column('INTERACTION_ID')
    for col in profile.discovered_schema:
        if col[KEY_NAME] == id_col_name:
            col[KEY_IS_ID] = True

    self._post('/customer_profile/update_schema',
               {'schema': profile.discovered_schema},
               expected_code=201)
    data = self._get('/customer_profile/get', {})
    self.assertEqual(data['data']['schema'], profile.discovered_schema)

    profile.reload()
    profile.apply_sync()
    profile.accept_sync()
    data = profile.get_data()[0]
    self.customer_profile = data
def test_cancel_edit_apply_flow(self):
    from solariat_bottle.utils.predictor_events import translate_column
    from solariat.db.mongo import get_connection
    from datetime import datetime

    name = 'TestCancelUpdateCancelLoop'
    ITX_COL_NAME = translate_column('INTERACTION_ID')
    dataset = self.create_and_load_dataset(name)

    new_schema = [dict(col) for col in dataset.schema]
    itx_col = [col for col in new_schema if col[KEY_NAME] == ITX_COL_NAME][0]
    self.assertTrue(itx_col[KEY_TYPE] == TYPE_TIMESTAMP)
    raw_data = dataset.data_coll.find_one()
    self.assertTrue(isinstance(raw_data[ITX_COL_NAME], datetime))

    itx_col[KEY_TYPE] = TYPE_STRING
    dataset.update_schema(new_schema)
    dataset.reload()
    assert dataset.schema == new_schema

    dataset.apply_sync()
    itx_col = [col for col in dataset.schema
               if col[KEY_NAME] == ITX_COL_NAME][0]
    self.assertTrue(itx_col[KEY_TYPE] == TYPE_STRING,
                    'type:%s, but must be:%s' % (itx_col[KEY_TYPE], TYPE_STRING))
    raw_sync_data = dataset.data_sync_coll.find_one()
    self.assertTrue(isinstance(raw_sync_data[ITX_COL_NAME], basestring))
    self.assertEqual(dataset.sync_status, Dataset.SYNCED)

    dataset.cancel_sync()
    self.assertEqual(dataset.sync_status, Dataset.OUT_OF_SYNC)
    # check that the sync collection no longer exists
    colls = get_connection().collection_names(
        include_system_collections=False)
    self.assertTrue(dataset.sync_collection not in colls)

    TEST_SCHEMA = [dict(col) for col in dataset.schema]
    itx_col = [col for col in TEST_SCHEMA if col[KEY_NAME] == ITX_COL_NAME][0]
    itx_col[KEY_TYPE] = TYPE_INTEGER
    dataset.update_schema(TEST_SCHEMA)
    self.assertEqual(dataset.schema, TEST_SCHEMA)

    dataset.apply_sync()
    itx_col = [col for col in dataset.schema
               if col[KEY_NAME] == ITX_COL_NAME][0]
    self.assertTrue(itx_col[KEY_TYPE] == TYPE_INTEGER)
    raw_sync_data = dataset.data_sync_coll.find_one()
    self.assertTrue(isinstance(raw_sync_data[ITX_COL_NAME], (int, long, float)))

    dataset.accept_sync()
    # was assertTrue(dataset.sync_status, Dataset.IN_SYNC), which always
    # passes since the second argument is treated as the failure message
    self.assertEqual(dataset.sync_status, Dataset.IN_SYNC)
    colls = get_connection().collection_names(
        include_system_collections=False)
    self.assertTrue(dataset.sync_collection not in colls)
    raw_data = dataset.data_coll.find_one()
    self.assertTrue(isinstance(raw_data[ITX_COL_NAME], (int, long, float)))

    dataset.drop_data()
def test_dataset_workflow(self):
    from solariat_bottle.utils.predictor_events import translate_column

    acc = self.user.account
    # test create
    with open(CSV_FILEPATH) as csv_file:
        post_data = self.get_post_data(csv_file)
        resp = self.client.post('/dataset/create',
                                buffered=True,
                                content_type='multipart/form-data',
                                data=post_data,
                                base_url='https://localhost')
    self.assertEqual(resp.status_code, 201)
    data = json.loads(resp.data)
    self.assertTrue(data['ok'])
    self.assertEqual(data['data']['sync_status'], Dataset.OUT_OF_SYNC)
    self.assertTrue(data['data']['schema'])
    self.assertFalse(data['data']['is_locked'])

    dataset = acc.datasets.get_dataset(self.user, CREATE_UPDATE_DATASET_NAME)
    schema = dataset.schema
    DataClass = dataset.get_data_class()
    self.assertEqual(DataClass.objects.count(), 50)

    # test update schema:
    # based on the test data, just change one column's type
    itx_col_name = translate_column('INTERACTION_ID')
    itx_col = [s for s in schema if s['name'] == itx_col_name][0]
    assert itx_col['type'] in ('integer', 'timestamp'), (itx_col['type'], itx_col_name)
    itx_col['type'] = 'string'
    data = self._post('/dataset/update_schema/%s' % CREATE_UPDATE_DATASET_NAME,
                      {'schema': schema},
                      expected_code=201)
    dataset = acc.datasets.get_dataset(self.user, CREATE_UPDATE_DATASET_NAME)
    self.assertTrue(bool([1 for col in dataset.schema
                          if col['name'] == itx_col_name
                          and col['type'] == 'string']))

    # test invalid schema
    broken_schema = schema[1:]
    data = self._post('/dataset/update_schema/%s' % CREATE_UPDATE_DATASET_NAME,
                      {'schema': broken_schema},
                      expected_result=False,
                      expected_code=500)

    # cannot accept a sync until it happens
    data = self._post('/dataset/sync/accept/%s' % CREATE_UPDATE_DATASET_NAME,
                      {},
                      expected_result=False,
                      expected_code=500)

    # include the case when not all data can be synced
    FAIL_COL_NAME = 'STAT_INI_1'
    dataset.reload()
    col = [col for col in dataset.schema if col[KEY_NAME] == FAIL_COL_NAME][0]
    self.assertEqual(col[KEY_TYPE], TYPE_INTEGER)
    raw_data = dataset.data_coll.find_one()
    dataset.data_coll.update({'_id': raw_data['_id']},
                             {'$set': {FAIL_COL_NAME: 'fail'}})

    # test applying schema on dataset (synchronous mode for testing)
    data = self._post('/dataset/sync/apply/%s' % CREATE_UPDATE_DATASET_NAME,
                      {},
                      expected_code=201)
    self.assertEqual(data['data']['sync_status'], Dataset.SYNCED)
    self.assertTrue(data['data']['is_locked'])
    # we manually failed the sync of one row above
    self.assertEqual(data['data']['items_synced'], 49)

    # until we accept/discard the last sync,
    # the original collection keeps the original data
    dataset = acc.datasets.get_dataset(self.user, CREATE_UPDATE_DATASET_NAME)
    DataClass = dataset.get_data_class()
    self.assertEqual(DataClass.objects.count(), 50)

    data = self._post('/dataset/sync/apply/%s' % CREATE_UPDATE_DATASET_NAME,
                      {},
                      expected_result=False,
                      expected_code=500)
    data = self._post('/dataset/sync/accept/%s' % CREATE_UPDATE_DATASET_NAME,
                      {},
                      expected_code=201)
    dataset = acc.datasets.get_dataset(self.user, CREATE_UPDATE_DATASET_NAME)
    DataClass = dataset.get_data_class()
    self.assertEqual(DataClass.objects.count(), 49)

    # test update: append 50 items again
    with open(CSV_FILEPATH) as csv_file:
        post_data = self.get_post_data(csv_file)
        resp = self.client.post('/dataset/update/%s' % CREATE_UPDATE_DATASET_NAME,
                                buffered=True,
                                content_type='multipart/form-data',
                                data=post_data,
                                base_url='https://localhost')
    data = json.loads(resp.data)
    self.assertEqual(resp.status_code, 201)
    self.assertTrue(data['ok'])
    self.assertEqual(data['data']['rows'], 99)
    dataset = acc.datasets.get_dataset(self.user, CREATE_UPDATE_DATASET_NAME)
    DataClass = dataset.get_data_class()
    self.assertEqual(DataClass.objects.count(), 99)
    data = self._post('/dataset/update_schema/%s' % CREATE_UPDATE_DATASET_NAME,
                      {'schema': schema},
                      expected_result=False,
                      expected_code=500)

    # prepare a wrong schema for the data update:
    # drop the first column from every row of the source file
    from StringIO import StringIO
    stream = StringIO()
    with open(CSV_FILEPATH) as csv_file:
        for row in csv_file:
            cols = row.split(CSV_SEPARATOR)
            if len(cols) > 1:
                row = CSV_SEPARATOR.join(cols[1:])
            stream.write(row)
    stream.seek(0)

    post_data = self.get_post_data(stream)
    resp = self.client.post('/dataset/update/%s' % CREATE_UPDATE_DATASET_NAME,
                            buffered=True,
                            content_type='multipart/form-data',
                            data=post_data,
                            base_url='https://localhost')
    self.assertEqual(resp.status_code, 500)
    data = json.loads(resp.data)
    self.assertFalse(data['ok'])

    dataset.drop_data()