def guess_format(request):
  """Guess the parse format (CSV delimiters, parquet, ...) for the selected input.

  Reads a JSON ``fileFormat`` payload from the POST body and dispatches on its
  ``inputFormat`` key ('file', 'table', 'query' or 'rdbms').  Returns a
  JsonResponse describing the guessed format, with ``status`` 0 on success.

  Raises PopupException when the path is not a file or the Hive table storage
  format is unsupported.
  """
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    if not request.fs.isfile(file_format["path"]):
      raise PopupException(_('Path %(path)s is not a file') % file_format)
    # The stream is handed to the indexer, which samples it to infer the format.
    stream = request.fs.open(file_format["path"])
    format_ = indexer.guess_format({
      "file": {
        "stream": stream,
        "name": file_format['path']
      }
    })
    _convert_format(format_)
  elif file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    # storage_details rows either carry a plain (data_type, comment) pair or a
    # 'key=value' string packed into data_type; normalize both into one dict.
    # (Previously built with dict([(data_type, comment)]) which dropped the
    # packed 'key=value' entries such as 'field.delim=,'.)
    storage = {}
    for delim in table_metadata.storage_details:
      if delim['data_type']:
        if '=' in delim['data_type']:
          key, val = delim['data_type'].split('=', 1)
          storage[key] = val
        else:
          storage[delim['data_type']] = delim['comment']
    if table_metadata.details['properties']['format'] == 'text':
      # Use the SerDe's declared field delimiter; default to ',' when absent
      # instead of raising KeyError on 'serialization.format'.
      format_ = {"quoteChar": "\"", "recordSeparator": '\\n', "type": "csv", "hasHeader": False, "fieldSeparator": storage.get('field.delim', ',')}
    elif table_metadata.details['properties']['format'] == 'parquet':
      format_ = {"type": "parquet", "hasHeader": False,}
    else:
      raise PopupException('Hive table format %s is not supported.' % table_metadata.details['properties']['format'])
  elif file_format['inputFormat'] == 'query':
    # Query results are fetched as ctrl-A separated text (Hive's default).
    format_ = {"quoteChar": "\"", "recordSeparator": "\\n", "type": "csv", "hasHeader": False, "fieldSeparator": "\u0001"}
  elif file_format['inputFormat'] == 'rdbms':
    format_ = RdbmsIndexer(request.user, file_format['rdbmsType']).guess_format()

  format_['status'] = 0
  return JsonResponse(format_)
def setup_class(cls):
  """Prepare a logged-in client and a MySQL RdbmsIndexer for the test class.

  Skips the whole class when Sqoop is disabled or no MySQL query server is
  configured.
  """
  if not ENABLE_SQOOP.get() or not rdbms.get_query_server_config(server='mysql'):
    raise SkipTest

  cls.client = make_logged_in_client()
  cls.user = rewrite_user(User.objects.get(username='******'))
  cls.indexer = RdbmsIndexer(cls.user, db_conf_name='mysql')
def guess_field_types(request):
  """Guess column names/types plus a data sample for the selected input.

  Reads a JSON ``fileFormat`` payload from the POST body and dispatches on its
  ``inputFormat`` key ('file', 'table', 'query' or 'rdbms').  Returns a
  JsonResponse with a small row ``sample`` and a ``columns`` list of
  Field dicts (types mapped through HiveFormat.FIELD_TYPE_TRANSLATE,
  defaulting to 'string').
  """
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    stream = request.fs.open(file_format["path"])
    # inverse=True: the client sends the display form of the format; convert it
    # back to the internal representation before handing it to the indexer.
    _convert_format(file_format["format"], inverse=True)
    format_ = indexer.guess_field_types({
      "file": {
          "stream": stream,
          "name": file_format['path']
        },
      "format": file_format['format']
    })
  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    format_ = {
        "sample": sample['rows'][:4],
        "columns": [
            Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
            for col in table_metadata.cols
        ]
    }
  elif file_format['inputFormat'] == 'query':
    # Only support open query history
    # TODO get schema from explain query, which is not possible
    notebook = Notebook(document=Document2.objects.get(id=file_format['query'])).get_data()
    snippet = notebook['snippets'][0]
    sample = get_api(request, snippet).fetch_result(notebook, snippet, 4, start_over=True)
    # NOTE(review): ``sample`` is indexed as a dict ('rows') but also read via
    # the ``meta`` attribute — presumably fetch_result returns a dict-like
    # result object exposing both; verify against the notebook API.
    format_ = {
        "sample": sample['rows'][:4],
        "sample_cols": sample.meta,
        "columns": [
            Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
            for col in sample.meta
        ]
    }
  elif file_format['inputFormat'] == 'rdbms':
    query_server = rdbms.get_query_server_config(server=file_format['rdbmsType'])
    db = rdbms.get(request.user, query_server=query_server)
    sample = RdbmsIndexer(request.user, file_format['rdbmsType']).get_sample_data(mode=file_format['rdbmsMode'], database=file_format['rdbmsDatabaseName'], table=file_format['rdbmsTableName'])
    table_metadata = db.get_columns(file_format['rdbmsDatabaseName'], file_format['rdbmsTableName'], names_only=False)
    format_ = {
        # sample['rows'] may be a generator, hence the list() before slicing.
        "sample": list(sample['rows'])[:4],
        "columns": [
            Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
            for col in table_metadata
        ]
    }

  return JsonResponse(format_)
if delim['data_type']: if '=' in delim['data_type']: key, val = delim['data_type'].split('=', 1) storage[key] = val else: storage[delim['data_type']] = delim['comment'] if table_metadata.details['properties']['format'] == 'text': format_ = {"quoteChar": "\"", "recordSeparator": '\\n', "type": "csv", "hasHeader": False, "fieldSeparator": storage.get('field.delim', ',')} elif table_metadata.details['properties']['format'] == 'parquet': format_ = {"type": "parquet", "hasHeader": False,} else: raise PopupException('Hive table format %s is not supported.' % table_metadata.details['properties']['format']) elif file_format['inputFormat'] == 'query': format_ = {"quoteChar": "\"", "recordSeparator": "\\n", "type": "csv", "hasHeader": False, "fieldSeparator": "\u0001"} elif file_format['inputFormat'] == 'rdbms': format_ = RdbmsIndexer(request.user, file_format['rdbmsType']).guess_format() format_['status'] = 0 return JsonResponse(format_) def guess_field_types(request): file_format = json.loads(request.POST.get('fileFormat', '{}')) if file_format['inputFormat'] == 'file': indexer = MorphlineIndexer(request.user, request.fs) stream = request.fs.open(file_format["path"]) _convert_format(file_format["format"], inverse=True) format_ = indexer.guess_field_types({ "file": {
if delim['data_type']: if '=' in delim['data_type']: key, val = delim['data_type'].split('=', 1) storage[key] = val else: storage[delim['data_type']] = delim['comment'] if table_metadata.details['properties']['format'] == 'text': format_ = {"quoteChar": "\"", "recordSeparator": '\\n', "type": "csv", "hasHeader": False, "fieldSeparator": storage.get('field.delim', ',')} elif table_metadata.details['properties']['format'] == 'parquet': format_ = {"type": "parquet", "hasHeader": False,} else: raise PopupException('Hive table format %s is not supported.' % table_metadata.details['properties']['format']) elif file_format['inputFormat'] == 'query': format_ = {"quoteChar": "\"", "recordSeparator": "\\n", "type": "csv", "hasHeader": False, "fieldSeparator": "\u0001"} elif file_format['inputFormat'] == 'rdbms': format_ = RdbmsIndexer(request.user, file_format['rdbmsType']).guess_format() elif file_format['inputFormat'] == 'stream': if file_format['streamSelection'] == 'kafka': format_ = {"type": "csv", "fieldSeparator": ",", "hasHeader": True, "quoteChar": "\"", "recordSeparator": "\\n", 'topics': get_topics()} elif file_format['streamSelection'] == 'sfdc': sf = Salesforce( username=file_format['streamUsername'], password=file_format['streamPassword'], security_token=file_format['streamToken'] ) format_ = {"type": "csv", "fieldSeparator": ",", "hasHeader": True, "quoteChar": "\"", "recordSeparator": "\\n", 'objects': [sobject['name'] for sobject in sf.restful('sobjects/')['sobjects'] if sobject['queryable']]} format_['status'] = 0 return JsonResponse(format_)