def parse_header_value(header: str) -> Tuple[str, Dict[str, str]]:
    """
    Parse an HTTP header value.

    Parameter values will be unquoted.
    If the key ends with an asterisk (``*``), the asterisk is removed from the key name and the
    value is then decoded according to :rfc:`2231`.

    :param header:
    :return: a tuple of (main value, params dict)
    """
    assert check_argument_types()
    main_value, params_str = header.partition(';')[::2]
    params = {}
    for match in header_param_re.finditer(params_str):
        key, value = match.groups()
        value = unquote(value)
        if key.endswith('*'):
            key = key[:-1]
            encoding, value = decode_rfc2231(value)[::2]
            value = urllib_unquote(value, encoding)
        params[key] = value

    return main_value.rstrip(), params
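# A minimal usage sketch for parse_header_value above. It assumes header_param_re matches
# key=value parameter pairs and that unquote() strips surrounding double quotes; neither
# definition is shown in the snippet, so this is illustrative only.
main, params = parse_header_value('text/html; charset="utf-8"')
# main   -> 'text/html'
# params -> {'charset': 'utf-8'}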
def _ask_with_pinentry(self, prompt, description, error, validator):
    self._waitfor("OK")
    env = os.environ.get
    self._comm("OPTION lc-ctype=%s" % env("LC_CTYPE", env("LC_ALL", "en_US.UTF-8")))
    try:
        self._comm("OPTION ttyname=%s" % env("TTY", os.ttyname(sys.stdout.fileno())))
    except:
        pass
    if env('TERM'):
        self._comm("OPTION ttytype=%s" % env("TERM"))
    if prompt:
        self._comm("SETPROMPT %s" % self._esc(prompt))
    if description:
        self._comm("SETDESC %s" % self._esc(description))

    password = None
    while not validator(password):
        if password is not None:
            self._comm("SETERROR %s" % self._esc(error))
        password = self._comm_getpin()

    # Passphrase may contain percent-encoded entities
    # gpg/pinentry: pinentry/pinentry.c#L392 copy_and_escape
    # https://github.com/gpg/pinentry/blob/master/pinentry/pinentry.c#L392
    password = urllib_unquote(password)
    return password
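# The final urllib_unquote call above undoes pinentry's percent-escaping of the GETPIN data
# (the Assuan protocol percent-escapes characters such as '%', CR and LF). A small illustration
# of that decoding step, with a made-up value:
from urllib.parse import unquote as urllib_unquote
print(urllib_unquote('hunter2%25extra%0Aline'))  # -> "hunter2%extra" + newline + "line"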
def importer_submit(request):
    source = json.loads(request.POST.get('source', '{}'))
    outputFormat = json.loads(request.POST.get('destination', '{}'))['outputFormat']
    destination = json.loads(request.POST.get('destination', '{}'))
    destination['ouputFormat'] = outputFormat  # Workaround a very weird bug
    start_time = json.loads(request.POST.get('start_time', '-1'))

    if source['inputFormat'] == 'file':
        if source['path']:
            path = urllib_unquote(source['path'])
            source['path'] = request.fs.netnormpath(path)

    if destination['ouputFormat'] in ('database', 'table'):
        destination['nonDefaultLocation'] = request.fs.netnormpath(destination['nonDefaultLocation']) \
            if destination['nonDefaultLocation'] else destination['nonDefaultLocation']

    if destination['ouputFormat'] == 'index':
        source['columns'] = destination['columns']
        index_name = destination["name"]

        if destination['indexerRunJob'] or source['inputFormat'] == 'stream':
            _convert_format(source["format"], inverse=True)
            job_handle = _large_indexing(
                request, source, index_name,
                start_time=start_time,
                lib_path=destination['indexerJobLibPath'],
                destination=destination)
        else:
            client = SolrClient(request.user)
            job_handle = _small_indexing(request.user, request.fs, client, source, destination, index_name)
    elif source['inputFormat'] in ('stream', 'connector') or destination['ouputFormat'] == 'stream':
        job_handle = _envelope_job(request, source, destination, start_time=start_time, lib_path=destination['indexerJobLibPath'])
    elif source['inputFormat'] == 'altus':
        # BDR copy or DistCP + DDL + Sentry DDL copy
        pass
    elif source['inputFormat'] == 'rdbms':
        if destination['outputFormat'] in ('database', 'file', 'table', 'hbase'):
            job_handle = run_sqoop(request, source, destination, start_time)
    elif destination['ouputFormat'] == 'database':
        job_handle = _create_database(request, source, destination, start_time)
    else:
        job_handle = _create_table(request, source, destination, start_time)

    request.audit = {
        'operation': 'EXPORT',
        'operationText': 'User %(username)s exported %(inputFormat)s to %(ouputFormat)s: %(name)s' % {
            'username': request.user.username,
            'inputFormat': source['inputFormat'],
            'ouputFormat': destination['ouputFormat'],
            'name': destination['name'],
        },
        'allowed': True
    }

    return JsonResponse(job_handle)
def guess_format(request):
    file_format = json.loads(request.POST.get('fileFormat', '{}'))

    if file_format['inputFormat'] == 'file':
        path = urllib_unquote(file_format["path"])
        indexer = MorphlineIndexer(request.user, request.fs)
        if not request.fs.isfile(path):
            raise PopupException(_('Path %(path)s is not a file') % file_format)

        stream = request.fs.open(path)
        format_ = indexer.guess_format({"file": {"stream": stream, "name": path}})
        _convert_format(format_)
    elif file_format['inputFormat'] == 'table':
        db = dbms.get(request.user)
        try:
            table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
        except Exception as e:
            raise PopupException(e.message if hasattr(e, 'message') and e.message else e)
        storage = {}
        for delim in table_metadata.storage_details:
            if delim['data_type']:
                if '=' in delim['data_type']:
                    key, val = delim['data_type'].split('=', 1)
                    storage[key] = val
                else:
                    storage[delim['data_type']] = delim['comment']
        if table_metadata.details['properties']['format'] == 'text':
            format_ = {
                "quoteChar": "\"",
                "recordSeparator": '\\n',
                "type": "csv",
                "hasHeader": False,
                "fieldSeparator": storage.get('field.delim', ',')
            }
        elif table_metadata.details['properties']['format'] == 'parquet':
            format_ = {
                "type": "parquet",
                "hasHeader": False,
            }
        else:
            raise PopupException('Hive table format %s is not supported.' % table_metadata.details['properties']['format'])
    elif file_format['inputFormat'] == 'query':
        format_ = {
            "quoteChar": "\"",
            "recordSeparator": "\\n",
            "type": "csv",
            "hasHeader": False,
            "fieldSeparator": "\u0001"
        }
    elif file_format['inputFormat'] == 'rdbms':
        format_ = {"type": "csv"}
    elif file_format['inputFormat'] == 'stream':
        if file_format['streamSelection'] == 'kafka':
            format_ = {
                "type": "csv",
                "fieldSeparator": ",",
                "hasHeader": True,
                "quoteChar": "\"",
                "recordSeparator": "\\n",
                'topics': get_topics()
            }
        elif file_format['streamSelection'] == 'flume':
            format_ = {
                "type": "csv",
                "fieldSeparator": ",",
                "hasHeader": True,
                "quoteChar": "\"",
                "recordSeparator": "\\n"
            }
    elif file_format['inputFormat'] == 'connector':
        if file_format['connectorSelection'] == 'sfdc':
            sf = Salesforce(
                username=file_format['streamUsername'],
                password=file_format['streamPassword'],
                security_token=file_format['streamToken'])
            format_ = {
                "type": "csv",
                "fieldSeparator": ",",
                "hasHeader": True,
                "quoteChar": "\"",
                "recordSeparator": "\\n",
                'objects': [sobject['name'] for sobject in sf.restful('sobjects/')['sobjects'] if sobject['queryable']]
            }
        else:
            raise PopupException(_('Input format %(inputFormat)s connector not recognized: %(connectorSelection)s') % file_format)
    else:
        raise PopupException(_('Input format not recognized: %(inputFormat)s') % file_format)

    format_['status'] = 0
    return JsonResponse(format_)
def guess_field_types(request): file_format = json.loads(request.POST.get('fileFormat', '{}')) if file_format['inputFormat'] == 'file': indexer = MorphlineIndexer(request.user, request.fs) path = urllib_unquote(file_format["path"]) stream = request.fs.open(path) encoding = check_encoding(stream.read(10000)) stream.seek(0) _convert_format(file_format["format"], inverse=True) format_ = indexer.guess_field_types({ "file": { "stream": stream, "name": path }, "format": file_format['format'] }) # Note: Would also need to set charset to table (only supported in Hive) if 'sample' in format_ and format_['sample']: format_['sample'] = escape_rows(format_['sample'], nulls_only=True, encoding=encoding) for col in format_['columns']: col['name'] = smart_unicode(col['name'], errors='replace', encoding=encoding) elif file_format['inputFormat'] == 'table': sample = get_api(request, { 'type': 'hive' }).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName']) db = dbms.get(request.user) table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName']) format_ = { "sample": sample['rows'][:4], "columns": [ Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict() for col in table_metadata.cols ] } elif file_format['inputFormat'] == 'query': query_id = file_format['query']['id'] if file_format['query'].get( 'id') else file_format['query'] notebook = Notebook(document=Document2.objects.document( user=request.user, doc_id=query_id)).get_data() snippet = notebook['snippets'][0] db = get_api(request, snippet) if file_format.get('sampleCols'): columns = file_format.get('sampleCols') sample = file_format.get('sample') else: snippet['query'] = snippet['statement'] try: sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4] except Exception as e: LOG.warn( 'Skipping sample data as query handle might be expired: %s' % e) sample = [[], [], [], [], []] columns = db.autocomplete(snippet=snippet, database='', table='') columns = [ Field( col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict() for col in columns['extended_columns'] ] format_ = { "sample": sample, "columns": columns, } elif file_format['inputFormat'] == 'rdbms': api = _get_api(request) sample = api.get_sample_data(None, database=file_format['rdbmsDatabaseName'], table=file_format['tableName']) format_ = { "sample": list(sample['rows'])[:4], "columns": [ Field(col['name'], col['type']).to_dict() for col in sample['full_headers'] ] } elif file_format['inputFormat'] == 'stream': if file_format['streamSelection'] == 'kafka': if file_format.get( 'kafkaSelectedTopics') == 'NavigatorAuditEvents': kafkaFieldNames = [ 'id', 'additionalInfo', 'allowed', 'collectionName', 'databaseName', 'db', 'DELEGATION_TOKEN_ID', 'dst', 'entityId', 'family', 'impersonator', 'ip', 'name', 'objectType', 'objType', 'objUsageType', 'operationParams', 'operationText', 'op', 'opText', 'path', 'perms', 'privilege', 'qualifier', 'QUERY_ID', 'resourcePath', 'service', 'SESSION_ID', 'solrVersion', 'src', 'status', 'subOperation', 'tableName', 'table', 'time', 'type', 'url', 'user' ] kafkaFieldTypes = ['string'] * len(kafkaFieldNames) kafkaFieldNames.append('timeDate') kafkaFieldTypes.append('date') else: # Note: mocked here, should come from SFDC or Kafka API or sampling job kafkaFieldNames = file_format.get('kafkaFieldNames', '').split(',') kafkaFieldTypes = file_format.get('kafkaFieldTypes', '').split(',') data = 
"""%(kafkaFieldNames)s %(data)s""" % { 'kafkaFieldNames': ','.join(kafkaFieldNames), 'data': '\n'.join( [','.join(['...'] * len(kafkaFieldTypes))] * 5) } stream = string_io() stream.write(data) _convert_format(file_format["format"], inverse=True) indexer = MorphlineIndexer(request.user, request.fs) format_ = indexer.guess_field_types({ "file": { "stream": stream, "name": file_format['path'] }, "format": file_format['format'] }) type_mapping = dict(list(zip(kafkaFieldNames, kafkaFieldTypes))) for col in format_['columns']: col['keyType'] = type_mapping[col['name']] col['type'] = type_mapping[col['name']] elif file_format['streamSelection'] == 'flume': if 'hue-httpd/access_log' in file_format['channelSourcePath']: columns = [{ 'name': 'id', 'type': 'string', 'unique': True }, { 'name': 'client_ip', 'type': 'string' }, { 'name': 'time', 'type': 'date' }, { 'name': 'request', 'type': 'string' }, { 'name': 'code', 'type': 'plong' }, { 'name': 'bytes', 'type': 'plong' }, { 'name': 'method', 'type': 'string' }, { 'name': 'url', 'type': 'string' }, { 'name': 'protocol', 'type': 'string' }, { 'name': 'app', 'type': 'string' }, { 'name': 'subapp', 'type': 'string' }] else: columns = [{'name': 'message', 'type': 'string'}] format_ = { "sample": [['...'] * len(columns)] * 4, "columns": [ Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get( col['type'], 'string'), unique=col.get('unique')).to_dict() for col in columns ] } elif file_format['inputFormat'] == 'connector': if file_format['connectorSelection'] == 'sfdc': sf = Salesforce(username=file_format['streamUsername'], password=file_format['streamPassword'], security_token=file_format['streamToken']) table_metadata = [{ 'name': column['name'], 'type': column['type'] } for column in sf.restful('sobjects/%(streamObject)s/describe/' % file_format)['fields']] query = 'SELECT %s FROM %s LIMIT 4' % (', '.join( [col['name'] for col in table_metadata]), file_format['streamObject']) print(query) try: records = sf.query_all(query) except SalesforceRefusedRequest as e: raise PopupException(message=str(e)) format_ = { "sample": [list(row.values())[1:] for row in records['records']], "columns": [ Field( col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get( col['type'], 'string')).to_dict() for col in table_metadata ] } else: raise PopupException( _('Connector format not recognized: %(connectorSelection)s') % file_format) else: raise PopupException( _('Input format not recognized: %(inputFormat)s') % file_format) return JsonResponse(format_)
def export_result(request): response = {'status': -1, 'message': _('Success')} # Passed by check_document_access_permission but unused by APIs notebook = json.loads(request.POST.get('notebook', '{}')) snippet = json.loads(request.POST.get('snippet', '{}')) data_format = json.loads(request.POST.get('format', '"hdfs-file"')) destination = urllib_unquote( json.loads(request.POST.get('destination', '""'))) overwrite = json.loads(request.POST.get('overwrite', 'false')) is_embedded = json.loads(request.POST.get('is_embedded', 'false')) start_time = json.loads(request.POST.get('start_time', '-1')) api = get_api(request, snippet) if data_format == 'hdfs-file': # Blocking operation, like downloading if request.fs.isdir(destination): if notebook.get('name'): destination += '/%(name)s.csv' % notebook else: destination += '/%(type)s-%(id)s.csv' % notebook if overwrite and request.fs.exists(destination): request.fs.do_as_user(request.user.username, request.fs.rmtree, destination) response['watch_url'] = api.export_data_as_hdfs_file( snippet, destination, overwrite) response['status'] = 0 request.audit = { 'operation': 'EXPORT', 'operationText': 'User %s exported to HDFS destination: %s' % (request.user.username, destination), 'allowed': True } elif data_format == 'hive-table': if is_embedded: sql, success_url = api.export_data_as_table( notebook, snippet, destination) task = make_notebook(name=_('Export %s query to table %s') % (snippet['type'], destination), description=_('Query %s to %s') % (_get_snippet_name(notebook), success_url), editor_type=snippet['type'], statement=sql, status='ready', database=snippet['database'], on_success_url=success_url, last_executed=start_time, is_task=True) response = task.execute(request) else: notebook_id = notebook['id'] or request.GET.get( 'editor', request.GET.get('notebook')) response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=save_as_table¬ebook=' + str(notebook_id) + \ '&snippet=0&destination=' + destination response['status'] = 0 request.audit = { 'operation': 'EXPORT', 'operationText': 'User %s exported to Hive table: %s' % (request.user.username, destination), 'allowed': True } elif data_format == 'hdfs-directory': if destination.lower().startswith("abfs"): destination = abfspath(destination) if request.fs.exists(destination) and request.fs.listdir_stats( destination): raise PopupException( _('The destination is not an empty directory!')) if is_embedded: sql, success_url = api.export_large_data_to_hdfs( notebook, snippet, destination) task = make_notebook(name=_('Export %s query to directory') % snippet['type'], description=_('Query %s to %s') % (_get_snippet_name(notebook), success_url), editor_type=snippet['type'], statement=sql, status='ready-execute', database=snippet['database'], on_success_url=success_url, last_executed=start_time, is_task=True) response = task.execute(request) else: notebook_id = notebook['id'] or request.GET.get( 'editor', request.GET.get('notebook')) response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=insert_as_query¬ebook=' + str(notebook_id) + \ '&snippet=0&destination=' + destination response['status'] = 0 request.audit = { 'operation': 'EXPORT', 'operationText': 'User %s exported to HDFS directory: %s' % (request.user.username, destination), 'allowed': True } elif data_format in ('search-index', 'dashboard'): # Open the result in the Dashboard via a SQL sub-query or the Import wizard (quick vs scalable) if is_embedded: notebook_id = notebook['id'] or request.GET.get( 'editor', 
request.GET.get('notebook')) if data_format == 'dashboard': engine = notebook['type'].replace('query-', '') response['watch_url'] = reverse( 'dashboard:browse', kwargs={ 'name': notebook_id }) + '?source=query&engine=%(engine)s' % { 'engine': engine } response['status'] = 0 else: sample = get_api(request, snippet).fetch_result(notebook, snippet, rows=4, start_over=True) for col in sample['meta']: col['type'] = HiveFormat.FIELD_TYPE_TRANSLATE.get( col['type'], 'string') response['status'] = 0 response['id'] = notebook_id response['name'] = _get_snippet_name(notebook) response['source_type'] = 'query' response['target_type'] = 'index' response['target_path'] = destination response['sample'] = list(sample['data']) response['columns'] = [ Field(col['name'], col['type']).to_dict() for col in sample['meta'] ] else: notebook_id = notebook['id'] or request.GET.get( 'editor', request.GET.get('notebook')) response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=index_query¬ebook=' + str(notebook_id) + \ '&snippet=0&destination=' + destination response['status'] = 0 if response.get('status') != 0: response['message'] = _('Exporting result failed.') return JsonResponse(response)
def clean_path(self):
    return urllib_unquote(self.cleaned_data.get('path', ''))
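# A minimal sketch of the Django form such a clean_path method would typically live on.
# The form class and field definition here are assumptions for illustration, not taken
# from the snippet above.
from django import forms
from urllib.parse import unquote as urllib_unquote  # Python 3 equivalent of the urllib_unquote alias

class PathForm(forms.Form):
    path = forms.CharField(required=False)

    def clean_path(self):
        # Percent-decode the submitted path, e.g. "/user/hue/my%20dir" -> "/user/hue/my dir"
        return urllib_unquote(self.cleaned_data.get('path', ''))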
def __init__(self, uri, basepath=None): self.basepath = basepath self.mimetype = None self.file = None self.data = None self.uri = None self.local = None self.tmp_file = None uri = uri or str() if type(uri) != str: uri = uri.decode("utf-8") log.debug("FileObject %r, Basepath: %r", uri, basepath) # Data URI if uri.startswith("data:"): m = _rx_datauri.match(uri) self.mimetype = m.group("mime") b64 = urllib_unquote(m.group("data")).encode("utf-8") self.data = base64.b64decode(b64) else: # Check if we have an external scheme if basepath and not urlparse.urlparse(uri).scheme: urlParts = urlparse.urlparse(basepath) else: urlParts = urlparse.urlparse(uri) log.debug("URLParts: {}".format((urlParts, urlParts.scheme))) if urlParts.scheme == 'file': if basepath and uri.startswith('/'): uri = urlparse.urljoin(basepath, uri[1:]) urlResponse = urllib2.urlopen(uri) self.mimetype = urlResponse.info().get( "Content-Type", '').split(";")[0] self.uri = urlResponse.geturl() self.file = urlResponse # Drive letters have len==1 but we are looking # for things like http: elif urlParts.scheme in ('http', 'https'): log.debug("Sending request for {} with httplib".format(uri)) # External data if basepath: uri = urlparse.urljoin(basepath, uri) log.debug("Uri parsed: {}".format(uri)) #path = urlparse.urlsplit(url)[2] #mimetype = getMimeType(path) # Using HTTPLIB server, path = urllib2.splithost(uri[uri.find("//"):]) if uri.startswith("https://"): conn = httplib.HTTPSConnection(server, **httpConfig) else: conn = httplib.HTTPConnection(server) conn.request("GET", path) r1 = conn.getresponse() # log.debug("HTTP %r %r %r %r", server, path, uri, r1) if (r1.status, r1.reason) == (200, "OK"): self.mimetype = r1.getheader( "Content-Type", '').split(";")[0] self.uri = uri log.debug("here") if r1.getheader("content-encoding") == "gzip": import gzip self.file = gzip.GzipFile( mode="rb", fileobj=six.StringIO(r1.read())) else: self.file = pisaTempFile(r1.read()) else: log.debug( "Received non-200 status: {}".format((r1.status, r1.reason))) try: urlResponse = urllib2.urlopen(uri) except urllib2.HTTPError as e: log.error("Could not process uri: {}".format(e)) return self.mimetype = urlResponse.info().get( "Content-Type", '').split(";")[0] self.uri = urlResponse.geturl() self.file = urlResponse else: log.debug("Unrecognized scheme, assuming local file path") # Local data if basepath: if sys.platform == 'win32' and os.path.isfile(basepath): basepath = os.path.dirname(basepath) uri = os.path.normpath(os.path.join(basepath, uri)) if os.path.isfile(uri): self.uri = uri self.local = uri self.setMimeTypeByName(uri) if self.mimetype and self.mimetype.startswith('text'): self.file = open(uri, "r") #removed bytes... lets hope it goes ok :/ else: # removed bytes... lets hope it goes ok :/ self.file = open(uri, "rb")
def importer_submit(request):
    source = json.loads(request.POST.get('source', '{}'))
    outputFormat = json.loads(request.POST.get('destination', '{}'))['outputFormat']
    destination = json.loads(request.POST.get('destination', '{}'))
    destination['ouputFormat'] = outputFormat  # Workaround a very weird bug
    start_time = json.loads(request.POST.get('start_time', '-1'))

    if source['inputFormat'] == 'file':
        if source['path']:
            path = urllib_unquote(source['path'])
            source['path'] = request.fs.netnormpath(path)
            parent_path = request.fs.parent_path(path)
            stats = request.fs.stats(parent_path)
            split = urlparse(path)
            # Only for HDFS, import data and non-external table
            if split.scheme in ('', 'hdfs') and destination['importData'] and destination['useDefaultLocation'] and \
                    oct(stats["mode"])[-1] != '7' and not request.POST.get('show_command'):
                user_scratch_dir = request.fs.get_home_dir() + '/.scratchdir'
                request.fs.do_as_user(request.user, request.fs.mkdir, user_scratch_dir, 0o0777)
                request.fs.do_as_user(request.user, request.fs.rename, source['path'], user_scratch_dir)
                source['path'] = user_scratch_dir + '/' + source['path'].split('/')[-1]

    if destination['ouputFormat'] in ('database', 'table'):
        destination['nonDefaultLocation'] = request.fs.netnormpath(destination['nonDefaultLocation']) \
            if destination['nonDefaultLocation'] else destination['nonDefaultLocation']

    if destination['ouputFormat'] == 'index':
        source['columns'] = destination['columns']
        index_name = destination["name"]

        if destination['indexerRunJob'] or source['inputFormat'] == 'stream':
            _convert_format(source["format"], inverse=True)
            job_handle = _large_indexing(
                request, source, index_name,
                start_time=start_time,
                lib_path=destination['indexerJobLibPath'],
                destination=destination)
        else:
            client = SolrClient(request.user)
            job_handle = _small_indexing(request.user, request.fs, client, source, destination, index_name)
    elif source['inputFormat'] in ('stream', 'connector') or destination['ouputFormat'] == 'stream':
        job_handle = _envelope_job(request, source, destination, start_time=start_time, lib_path=destination['indexerJobLibPath'])
    elif source['inputFormat'] == 'altus':
        # BDR copy or DistCP + DDL + Sentry DDL copy
        pass
    elif source['inputFormat'] == 'rdbms':
        if destination['outputFormat'] in ('database', 'file', 'table', 'hbase'):
            job_handle = run_sqoop(request, source, destination, start_time)
    elif destination['ouputFormat'] == 'database':
        job_handle = _create_database(request, source, destination, start_time)
    else:
        job_handle = _create_table(request, source, destination, start_time)

    request.audit = {
        'operation': 'EXPORT',
        'operationText': 'User %(username)s exported %(inputFormat)s to %(ouputFormat)s: %(name)s' % {
            'username': request.user.username,
            'inputFormat': source['inputFormat'],
            'ouputFormat': destination['ouputFormat'],
            'name': destination['name'],
        },
        'allowed': True
    }

    return JsonResponse(job_handle)
def __init__(self, uri, basepath=None): self.basepath = basepath self.mimetype = None self.file = None self.data = None self.uri = None self.local = None self.tmp_file = None uri = uri or str() if type(uri) != str: uri = uri.decode("utf-8") log.debug("FileObject %r, Basepath: %r", uri, basepath) # Data URI if uri.startswith("data:"): m = _rx_datauri.match(uri) self.mimetype = m.group("mime") b64 = urllib_unquote(m.group("data")).encode("utf-8") self.data = base64.b64decode(b64) else: # Check if we have an external scheme if basepath and not urlparse.urlparse(uri).scheme: urlParts = urlparse.urlparse(basepath) else: urlParts = urlparse.urlparse(uri) log.debug("URLParts: {}".format((urlParts, urlParts.scheme))) if urlParts.scheme == 'file': if basepath and uri.startswith('/'): uri = urlparse.urljoin(basepath, uri[1:]) urlResponse = urllib2.urlopen(uri) self.mimetype = urlResponse.info().get("Content-Type", '').split(";")[0] self.uri = urlResponse.geturl() self.file = urlResponse # Drive letters have len==1 but we are looking # for things like http: elif urlParts.scheme in ('http', 'https'): log.debug("Sending request for {} with httplib".format(uri)) # External data if basepath: uri = urlparse.urljoin(basepath, uri) log.debug("Uri parsed: {}".format(uri)) #path = urlparse.urlsplit(url)[2] #mimetype = getMimeType(path) # Using HTTPLIB server, path = urllib2.splithost(uri[uri.find("//"):]) if uri.startswith("https://"): conn = httplib.HTTPSConnection(server, **httpConfig) else: conn = httplib.HTTPConnection(server) conn.request("GET", path) r1 = conn.getresponse() # log.debug("HTTP %r %r %r %r", server, path, uri, r1) if (r1.status, r1.reason) == (200, "OK"): self.mimetype = r1.getheader("Content-Type", '').split(";")[0] self.uri = uri log.debug("here") if r1.getheader("content-encoding") == "gzip": import gzip self.file = gzip.GzipFile(mode="rb", fileobj=six.BytesIO( r1.read())) else: self.file = pisaTempFile(r1.read()) else: log.debug("Received non-200 status: {}".format( (r1.status, r1.reason))) try: urlResponse = urllib2.urlopen(uri) except urllib2.HTTPError as e: log.error("Could not process uri: {}".format(e)) return self.mimetype = urlResponse.info().get("Content-Type", '').split(";")[0] self.uri = urlResponse.geturl() self.file = urlResponse else: log.debug("Unrecognized scheme, assuming local file path") # Local data if basepath: if sys.platform == 'win32' and os.path.isfile(basepath): basepath = os.path.dirname(basepath) uri = os.path.normpath(os.path.join(basepath, uri)) if os.path.isfile(uri): self.uri = uri self.local = uri self.setMimeTypeByName(uri) if self.mimetype and self.mimetype.startswith('text'): self.file = open( uri, "r") #removed bytes... lets hope it goes ok :/ else: # removed bytes... lets hope it goes ok :/ self.file = open(uri, "rb")
def guess_format(request): file_format = json.loads(request.POST.get('fileFormat', '{}')) file_type = file_format['file_type'] path = urllib_unquote(file_format["path"]) if sys.version_info[0] < 3 and (file_type == 'excel' or path[-3:] == 'xls' or path[-4:] == 'xlsx'): return JsonResponse({ 'status': -1, 'message': 'Python2 based Hue does not support Excel file importer' }) if file_format['inputFormat'] == 'localfile': if file_type == 'excel': format_ = {"type": "excel", "hasHeader": True} else: format_ = { "quoteChar": "\"", "recordSeparator": '\\n', "type": "csv", "hasHeader": True, "fieldSeparator": "," } elif file_format['inputFormat'] == 'file': if path[-3:] == 'xls' or path[-4:] == 'xlsx': file_obj = request.fs.open(path) if path[-3:] == 'xls': df = pd.read_excel(file_obj.read(1024 * 1024 * 1024), engine='xlrd') else: df = pd.read_excel(file_obj.read(1024 * 1024 * 1024), engine='openpyxl') _csv_data = df.to_csv(index=False) path = excel_to_csv_file_name_change(path) request.fs.create(path, overwrite=True, data=_csv_data) indexer = MorphlineIndexer(request.user, request.fs) if not request.fs.isfile(path): raise PopupException( _('Path %(path)s is not a file') % file_format) stream = request.fs.open(path) format_ = indexer.guess_format( {"file": { "stream": stream, "name": path }}) _convert_format(format_) if file_format["path"][-3:] == 'xls' or file_format["path"][ -4:] == 'xlsx': format_ = { "quoteChar": "\"", "recordSeparator": '\\n', "type": "excel", "hasHeader": True, "fieldSeparator": "," } elif file_format['inputFormat'] == 'table': db = dbms.get(request.user) try: table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName']) except Exception as e: raise PopupException( e.message if hasattr(e, 'message') and e.message else e) storage = {} for delim in table_metadata.storage_details: if delim['data_type']: if '=' in delim['data_type']: key, val = delim['data_type'].split('=', 1) storage[key] = val else: storage[delim['data_type']] = delim['comment'] if table_metadata.details['properties']['format'] == 'text': format_ = { "quoteChar": "\"", "recordSeparator": '\\n', "type": "csv", "hasHeader": False, "fieldSeparator": storage.get('field.delim', ',') } elif table_metadata.details['properties']['format'] == 'parquet': format_ = { "type": "parquet", "hasHeader": False, } else: raise PopupException( 'Hive table format %s is not supported.' 
% table_metadata.details['properties']['format']) elif file_format['inputFormat'] == 'query': format_ = { "quoteChar": "\"", "recordSeparator": "\\n", "type": "csv", "hasHeader": False, "fieldSeparator": "\u0001" } elif file_format['inputFormat'] == 'rdbms': format_ = {"type": "csv"} elif file_format['inputFormat'] == 'stream': if file_format['streamSelection'] == 'kafka': format_ = { "type": "json", # "fieldSeparator": ",", # "hasHeader": True, # "quoteChar": "\"", # "recordSeparator": "\\n", 'topics': get_topics(request.user) } elif file_format['streamSelection'] == 'flume': format_ = { "type": "csv", "fieldSeparator": ",", "hasHeader": True, "quoteChar": "\"", "recordSeparator": "\\n" } elif file_format['inputFormat'] == 'connector': if file_format['connectorSelection'] == 'sfdc': sf = Salesforce(username=file_format['streamUsername'], password=file_format['streamPassword'], security_token=file_format['streamToken']) format_ = { "type": "csv", "fieldSeparator": ",", "hasHeader": True, "quoteChar": "\"", "recordSeparator": "\\n", 'objects': [ sobject['name'] for sobject in sf.restful('sobjects/')['sobjects'] if sobject['queryable'] ] } else: raise PopupException( _('Input format %(inputFormat)s connector not recognized: $(connectorSelection)s' ) % file_format) else: raise PopupException( _('Input format not recognized: %(inputFormat)s') % file_format) format_['status'] = 0 return JsonResponse(format_)
def create_table_from_a_file(self, source, destination, start_time=-1, file_encoding=None): if '.' in destination['name']: database, table_name = destination['name'].split('.', 1) else: database = 'default' table_name = destination['name'] final_table_name = table_name table_format = destination['tableFormat'] source_type = source['sourceType'] columns = destination['columns'] partition_columns = destination['partitionColumns'] kudu_partition_columns = destination['kuduPartitionColumns'] comment = destination['description'] source_path = urllib_unquote(source['path']) load_data = destination['importData'] external = not destination['useDefaultLocation'] external_path = urllib_unquote(destination['nonDefaultLocation']) editor_type = destination['sourceType'] is_transactional = destination['isTransactional'] default_transactional_type = 'insert_only' if destination[ 'isInsertOnly'] else 'default' skip_header = destination['hasHeader'] primary_keys = destination['primaryKeys'] if destination['useCustomDelimiters']: field_delimiter = destination['customFieldDelimiter'] collection_delimiter = destination[ 'customCollectionDelimiter'] or None map_delimiter = destination['customMapDelimiter'] or None else: field_delimiter = ',' collection_delimiter = r'\002' map_delimiter = r'\003' regexp_delimiter = destination['customRegexp'] file_format = 'TextFile' row_format = 'Delimited' serde_name = '' serde_properties = '' extra_create_properties = '' sql = '' if source['inputFormat'] == 'manual': load_data = False source['format'] = {'quoteChar': '"', 'fieldSeparator': ','} if table_format == 'json': row_format = 'serde' serde_name = 'org.apache.hive.hcatalog.data.JsonSerDe' elif table_format == 'regexp': row_format = 'serde' serde_name = 'org.apache.hadoop.hive.serde2.RegexSerDe' serde_properties = '"input.regex" = "%s"' % regexp_delimiter elif table_format == 'csv': if source['format']['quoteChar'] == '"': source['format']['quoteChar'] = '\\"' row_format = 'serde' serde_name = 'org.apache.hadoop.hive.serde2.OpenCSVSerde' serde_properties = '''"separatorChar" = "%(fieldSeparator)s", "quoteChar" = "%(quoteChar)s", "escapeChar" = "\\\\" ''' % source['format'] use_temp_table = table_format in ('parquet', 'orc', 'kudu') or is_transactional if use_temp_table: # We'll be using a temp table to load data if load_data: table_name, final_table_name = 'hue__tmp_%s' % table_name, table_name sql += '\n\nDROP TABLE IF EXISTS `%(database)s`.`%(table_name)s`;\n' % { 'database': database, 'table_name': table_name } else: # Manual row_format = '' file_format = table_format skip_header = False if table_format == 'kudu': columns = [ col for col in columns if col['name'] in primary_keys ] + [ col for col in columns if col['name'] not in primary_keys ] if table_format == 'kudu': collection_delimiter = None map_delimiter = None if external or (load_data and table_format in ( 'parquet', 'orc', 'kudu')): # We'll use location to load data if not self.fs.isdir(external_path): # File selected external_path, external_file_name = Hdfs.split(external_path) if len(self.fs.listdir(external_path)) > 1: # If dir not just the file, create data dir and move file there. Make sure it's unique. 
external_path = external_path + '/%s%s_table' % ( external_file_name, str(uuid.uuid4())) self.fs.mkdir(external_path) self.fs.rename(source_path, external_path) elif load_data: # We'll use load data command parent_path = self.fs.parent_path(source_path) stats = self.fs.stats(parent_path) split = urlparse(source_path) # Only for HDFS, import data and non-external table if split.scheme in ('', 'hdfs') and oct(stats["mode"])[-1] != '7': user_scratch_dir = self.fs.get_home_dir( ) + '/.scratchdir/%s' % str( uuid.uuid4()) # Make sure it's unique. self.fs.do_as_user(self.user, self.fs.mkdir, user_scratch_dir, 0o0777) self.fs.do_as_user(self.user, self.fs.rename, source['path'], user_scratch_dir) if USER_SCRATCH_DIR_PERMISSION.get(): self.fs.do_as_user(self.user, self.fs.chmod, user_scratch_dir, 0o0777, True) source_path = user_scratch_dir + '/' + source['path'].split( '/')[-1] if external_path.lower().startswith( "abfs"): #this is to check if its using an ABFS path external_path = abfspath(external_path) tbl_properties = OrderedDict() if skip_header: tbl_properties['skip.header.line.count'] = '1' # The temp table is not transactional, but final table can be if is_transactional. # tbl_properties that don't exist in previous versions can safely be added without error. tbl_properties['transactional'] = 'false' sql += django_mako.render_to_string( "gen/create_table_statement.mako", { 'table': { 'name': table_name, 'comment': comment, 'row_format': row_format, 'field_terminator': field_delimiter, 'collection_terminator': collection_delimiter if source_type == 'hive' else None, 'map_key_terminator': map_delimiter if source_type == 'hive' else None, 'serde_name': serde_name, 'serde_properties': serde_properties, 'file_format': file_format, 'external': external or load_data and table_format in ('parquet', 'orc', 'kudu'), 'path': external_path, 'primary_keys': primary_keys if table_format == 'kudu' and not load_data else [], 'tbl_properties': tbl_properties }, 'columns': columns, 'partition_columns': partition_columns, 'kudu_partition_columns': kudu_partition_columns, 'database': database }) if file_encoding and file_encoding != 'ASCII' and file_encoding != 'utf-8' and not use_temp_table: sql += '\n\nALTER TABLE `%(database)s`.`%(final_table_name)s` ' \ 'SET serdeproperties ("serialization.encoding"="%(file_encoding)s");' % { 'database': database, 'final_table_name': final_table_name, 'file_encoding': file_encoding } if table_format in ('text', 'json', 'csv', 'regexp') and not external and load_data: form_data = { 'path': source_path, 'overwrite': False, 'partition_columns': [(partition['name'], partition['partitionValue']) for partition in partition_columns], } query_server_config = dbms.get_query_server_config( name=source_type) db = dbms.get(self.user, query_server=query_server_config) sql += "\n\n%s;" % db.load_data( database, table_name, form_data, None, generate_ddl_only=True) if load_data and use_temp_table: file_format = 'TextFile' if table_format == 'text' else table_format if table_format == 'kudu': columns_list = [ '`%s`' % col for col in primary_keys + [ col['name'] for col in destination['columns'] if col['name'] not in primary_keys and col['keep'] ] ] extra_create_properties = """PRIMARY KEY (%(primary_keys)s) PARTITION BY HASH PARTITIONS 16 STORED AS %(file_format)s TBLPROPERTIES( 'kudu.num_tablet_replicas'='1' )""" % { 'file_format': file_format, 'primary_keys': ', '.join(primary_keys) } else: columns_list = ['*'] extra_create_properties = 'STORED AS %(file_format)s' % { 'file_format': 
file_format } if is_transactional: extra_create_properties += "\nTBLPROPERTIES('transactional'='true', 'transactional_properties'='%s')" % \ default_transactional_type sql += '''\n\nCREATE TABLE `%(database)s`.`%(final_table_name)s`%(comment)s %(extra_create_properties)s AS SELECT %(columns_list)s FROM `%(database)s`.`%(table_name)s`;''' % { 'database': database, 'final_table_name': final_table_name, 'table_name': table_name, 'extra_create_properties': extra_create_properties, 'columns_list': ', '.join(columns_list), 'comment': ' COMMENT "%s"' % comment if comment else '' } sql += '\n\nDROP TABLE IF EXISTS `%(database)s`.`%(table_name)s`;\n' % { 'database': database, 'table_name': table_name } if file_encoding and file_encoding != 'ASCII' and file_encoding != 'utf-8': sql += '\n\nALTER TABLE `%(database)s`.`%(final_table_name)s` ' \ 'SET serdeproperties ("serialization.encoding"="%(file_encoding)s");' % { 'database': database, 'final_table_name': final_table_name, 'file_encoding': file_encoding } on_success_url = reverse('metastore:describe_table', kwargs={ 'database': database, 'table': final_table_name }) + '?source_type=' + source_type return make_notebook(name=_('Creating table %(database)s.%(table)s') % { 'database': database, 'table': final_table_name }, editor_type=editor_type, statement=sql.strip(), status='ready', database=database, on_success_url=on_success_url, last_executed=start_time, is_task=True)
def __init__(self, uri, basepath=None): self.basepath = basepath self.mimetype = None self.file_content = None self.data = None self.uri = None self.local = None self.tmp_file = None uri = uri or str() if not isinstance(uri, str): uri = uri.decode("utf-8") log.debug("FileObject %r, Basepath: %r", uri, basepath) # Data URI if uri.startswith("data:"): m = _rx_datauri.match(uri) self.mimetype = m.group("mime") b64 = urllib_unquote(m.group("data")) # The data may be incorrectly unescaped... repairs needed b64 = b64.strip("b'").strip("'").encode() b64 = re.sub(b"\\n", b'', b64) b64 = re.sub(b'[^A-Za-z0-9\\+\\/]+', b'', b64) # Add padding as needed, to make length into a multiple of 4 # b64 += b"=" * ((4 - len(b64) % 4) % 4) self.data = base64.b64decode(b64) else: # Check if we have an external scheme if basepath and not urlparse.urlparse(uri).scheme: urlParts = urlparse.urlparse(basepath) else: urlParts = urlparse.urlparse(uri) log.debug("URLParts: {}".format((urlParts, urlParts.scheme))) if urlParts.scheme == 'file': if basepath and uri.startswith('/'): uri = urlparse.urljoin(basepath, uri[1:]) urlResponse = urllib2.urlopen(uri) self.mimetype = urlResponse.info().get( "Content-Type", '').split(";")[0] self.uri = urlResponse.geturl() self.file_content = urlResponse.read() # Drive letters have len==1 but we are looking # for things like http: elif urlParts.scheme in ('http', 'https'): log.debug("Sending request for {} with httplib".format(uri)) # External data if basepath: uri = urlparse.urljoin(basepath, uri) log.debug("Uri parsed: {}".format(uri)) #path = urlparse.urlsplit(url)[2] #mimetype = getMimeType(path) # Using HTTPLIB url_splitted = urlparse.urlsplit(uri) server = url_splitted[1] path = url_splitted[2] path += "?" + url_splitted[3] if url_splitted[3] else "" if uri.startswith("https://"): conn = httplib.HTTPSConnection(server, **httpConfig) else: conn = httplib.HTTPConnection(server) conn.request("GET", path) r1 = conn.getresponse() # log.debug("HTTP %r %r %r %r", server, path, uri, r1) if (r1.status, r1.reason) == (200, "OK"): self.mimetype = r1.getheader( "Content-Type", '').split(";")[0] self.uri = uri log.debug("here") if r1.getheader("content-encoding") == "gzip": import gzip self.file_content = gzip.GzipFile( mode="rb", fileobj=six.BytesIO(r1.read())) else: self.file_content = pisaTempFile(r1.read()) else: log.debug( "Received non-200 status: {}".format((r1.status, r1.reason))) try: urlResponse = urllib2.urlopen(uri) except urllib2.HTTPError as e: log.error("Could not process uri: {}".format(e)) return self.mimetype = urlResponse.info().get( "Content-Type", '').split(";")[0] self.uri = urlResponse.geturl() self.file_content = urlResponse.read() conn.close() else: log.debug("Unrecognized scheme, assuming local file path") # Local data if basepath: if sys.platform == 'win32' and os.path.isfile(basepath): basepath = os.path.dirname(basepath) uri = os.path.normpath(os.path.join(basepath, uri)) if os.path.isfile(uri): self.uri = uri self.local = uri self.setMimeTypeByName(uri) if self.mimetype and self.mimetype.startswith('text'): with open(uri, "r") as file_handler: # removed bytes... lets hope it goes ok :/ self.file_content = file_handler.read() else: with open(uri, "rb") as file_handler: # removed bytes... lets hope it goes ok :/ self.file_content = file_handler.read()
def create_table_from_local_file(self, source, destination, start_time=-1):
    if '.' in destination['name']:
        database, table_name = destination['name'].split('.', 1)
    else:
        database = 'default'
        table_name = destination['name']
    final_table_name = table_name

    source_type = source['sourceType']
    editor_type = destination['sourceType']
    columns = destination['columns']

    dialect = get_interpreter(source_type, self.user)['dialect']

    if dialect in ('hive', 'mysql'):
        if dialect == 'mysql':
            for col in columns:
                if col['type'] == 'string':
                    col['type'] = 'VARCHAR(255)'

        sql = '''CREATE TABLE IF NOT EXISTS %(database)s.%(table_name)s (
%(columns)s);\n''' % {
            'database': database,
            'table_name': table_name,
            'columns': ',\n'.join(['  `%(name)s` %(type)s' % col for col in columns]),
        }
    elif dialect == 'phoenix':
        for col in columns:
            if col['type'] == 'string':
                col['type'] = 'CHAR(255)'

        sql = '''CREATE TABLE IF NOT EXISTS %(database)s.%(table_name)s (
%(columns)s
CONSTRAINT my_pk PRIMARY KEY (%(primary_keys)s));\n''' % {
            'database': database,
            'table_name': table_name,
            'columns': ',\n'.join(['  %(name)s %(type)s' % col for col in columns]),
            'primary_keys': ', '.join(destination.get('primaryKeys'))
        }
    elif dialect == 'impala':
        sql = '''CREATE TABLE IF NOT EXISTS %(database)s.%(table_name)s_tmp (
%(columns)s);\n''' % {
            'database': database,
            'table_name': table_name,
            'columns': ',\n'.join(['  `%(name)s` string' % col for col in columns]),
        }  # Impala does not implicitly cast between string and numeric or Boolean types.

    path = urllib_unquote(source['path'])
    if path:  # data insertion
        with open(path, 'r') as local_file:
            reader = csv.reader(local_file)
            _csv_rows = []

            for count, row in enumerate(reader):
                if (source['format']['hasHeader'] and count == 0) or not row:
                    continue
                if dialect == 'impala':
                    # for the boolean col updating csv_val to (1,0)
                    row = self.nomalize_booleans(row, columns)
                _csv_rows.append(tuple(row))

            if _csv_rows:
                csv_rows = str(_csv_rows)[1:-1]

                if dialect in ('hive', 'mysql'):
                    sql += '''\nINSERT INTO %(database)s.%(table_name)s VALUES %(csv_rows)s;\n''' % {
                        'database': database,
                        'table_name': table_name,
                        'csv_rows': csv_rows
                    }
                elif dialect == 'phoenix':
                    for csv_row in _csv_rows:
                        _sql = ', '.join([
                            "'{0}'".format(col_val) if columns[count]['type'] in ('CHAR(255)', 'timestamp') \
                            else '{0}'.format(col_val) for count, col_val in enumerate(csv_row)])

                        sql += '''\nUPSERT INTO %(database)s.%(table_name)s VALUES (%(csv_row)s);\n''' % {
                            'database': database,
                            'table_name': table_name,
                            'csv_row': _sql
                        }
                elif dialect == 'impala':
                    # casting from string to boolean is not allowed in impala so string -> int -> bool
                    sql_ = ',\n'.join([
                        '  CAST ( `%(name)s` AS %(type)s ) `%(name)s`' % col if col['type'] != 'boolean' \
                        else '  CAST ( CAST ( `%(name)s` AS TINYINT ) AS boolean ) `%(name)s`' % col
                        for col in columns
                    ])

                    sql += '''\nINSERT INTO %(database)s.%(table_name)s_tmp VALUES %(csv_rows)s;\n\nCREATE TABLE IF NOT EXISTS %(database)s.%(table_name)s AS SELECT\n%(sql_)s\nFROM %(database)s.%(table_name)s_tmp;\n\nDROP TABLE IF EXISTS %(database)s.%(table_name)s_tmp;''' % {
                        'database': database,
                        'table_name': table_name,
                        'csv_rows': csv_rows,
                        'sql_': sql_
                    }

    on_success_url = reverse('metastore:describe_table', kwargs={'database': database, 'table': final_table_name}) + \
        '?source_type=' + source_type

    return make_notebook(
        name=_('Creating table %(database)s.%(table)s') % {'database': database, 'table': final_table_name},
        editor_type=editor_type,
        statement=sql.strip(),
        status='ready',
        database=database,
        on_success_url=on_success_url,
        last_executed=start_time,
        is_task=True)
def create_table_from_a_file(self, source, destination, start_time=-1): if '.' in destination['name']: database, table_name = destination['name'].split('.', 1) else: database = 'default' table_name = destination['name'] final_table_name = table_name table_format = destination['tableFormat'] source_type = source['sourceType'] columns = destination['columns'] partition_columns = destination['partitionColumns'] kudu_partition_columns = destination['kuduPartitionColumns'] comment = destination['description'] source_path = urllib_unquote(source['path']) external = not destination['useDefaultLocation'] external_path = urllib_unquote(destination['nonDefaultLocation']) load_data = destination['importData'] skip_header = destination['hasHeader'] primary_keys = destination['primaryKeys'] if destination['useCustomDelimiters']: field_delimiter = destination['customFieldDelimiter'] collection_delimiter = destination['customCollectionDelimiter'] map_delimiter = destination['customMapDelimiter'] else: field_delimiter = ',' collection_delimiter = r'\002' map_delimiter = r'\003' regexp_delimiter = destination['customRegexp'] file_format = 'TextFile' row_format = 'Delimited' serde_name = '' serde_properties = '' extra_create_properties = '' sql = '' if source['inputFormat'] == 'manual': load_data = False source['format'] = {'quoteChar': '"', 'fieldSeparator': ','} if table_format == 'json': row_format = 'serde' serde_name = 'org.apache.hive.hcatalog.data.JsonSerDe' elif table_format == 'regexp': row_format = 'serde' serde_name = 'org.apache.hadoop.hive.serde2.RegexSerDe' serde_properties = '"input.regex" = "%s"' % regexp_delimiter elif table_format == 'csv': if source['format']['quoteChar'] == '"': source['format']['quoteChar'] = '\\"' row_format = 'serde' serde_name = 'org.apache.hadoop.hive.serde2.OpenCSVSerde' serde_properties = '''"separatorChar" = "%(fieldSeparator)s", "quoteChar" = "%(quoteChar)s", "escapeChar" = "\\\\" ''' % source['format'] if table_format in ('parquet', 'kudu'): if load_data: table_name, final_table_name = 'hue__tmp_%s' % table_name, table_name sql += '\n\nDROP TABLE IF EXISTS `%(database)s`.`%(table_name)s`;\n' % { 'database': database, 'table_name': table_name } else: # Manual row_format = '' file_format = table_format skip_header = False if table_format == 'kudu': columns = [ col for col in columns if col['name'] in primary_keys ] + [ col for col in columns if col['name'] not in primary_keys ] if table_format == 'kudu': collection_delimiter = None map_delimiter = None if external or (load_data and table_format in ('parquet', 'kudu')): if not self.fs.isdir(external_path): # File selected external_path, external_file_name = self.fs.split( external_path) if len(self.fs.listdir(external_path)) > 1: external_path = external_path + '/%s_table' % external_file_name # If dir not just the file, create data dir and move file there. 
self.fs.mkdir(external_path) self.fs.rename(source_path, external_path) if external_path.lower().startswith( "abfs"): #this is to check if its using an ABFS path external_path = abfspath(external_path) sql += django_mako.render_to_string( "gen/create_table_statement.mako", { 'table': { 'name': table_name, 'comment': comment, 'row_format': row_format, 'field_terminator': field_delimiter, 'collection_terminator': collection_delimiter if source_type == 'hive' else None, 'map_key_terminator': map_delimiter if source_type == 'hive' else None, 'serde_name': serde_name, 'serde_properties': serde_properties, 'file_format': file_format, 'external': external or load_data and table_format in ('parquet', 'kudu'), 'path': external_path, 'skip_header': skip_header, 'primary_keys': primary_keys if table_format == 'kudu' and not load_data else [], }, 'columns': columns, 'partition_columns': partition_columns, 'kudu_partition_columns': kudu_partition_columns, 'database': database }) if table_format in ('text', 'json', 'csv', 'regexp') and not external and load_data: form_data = { 'path': source_path, 'overwrite': False, 'partition_columns': [(partition['name'], partition['partitionValue']) for partition in partition_columns], } query_server_config = dbms.get_query_server_config( name=source_type) db = dbms.get(self.user, query_server=query_server_config) sql += "\n\n%s;" % db.load_data( database, table_name, form_data, None, generate_ddl_only=True) if load_data and table_format in ('parquet', 'kudu'): file_format = table_format if table_format == 'kudu': columns_list = [ '`%s`' % col for col in primary_keys + [ col['name'] for col in destination['columns'] if col['name'] not in primary_keys and col['keep'] ] ] extra_create_properties = """PRIMARY KEY (%(primary_keys)s) PARTITION BY HASH PARTITIONS 16 STORED AS %(file_format)s TBLPROPERTIES( 'kudu.num_tablet_replicas' = '1' )""" % { 'file_format': file_format, 'primary_keys': ', '.join(primary_keys) } else: columns_list = ['*'] extra_create_properties = 'STORED AS %(file_format)s' % { 'file_format': file_format } sql += '''\n\nCREATE TABLE `%(database)s`.`%(final_table_name)s`%(comment)s %(extra_create_properties)s AS SELECT %(columns_list)s FROM `%(database)s`.`%(table_name)s`;''' % { 'database': database, 'final_table_name': final_table_name, 'table_name': table_name, 'extra_create_properties': extra_create_properties, 'columns_list': ', '.join(columns_list), 'comment': ' COMMENT "%s"' % comment if comment else '' } sql += '\n\nDROP TABLE IF EXISTS `%(database)s`.`%(table_name)s`;\n' % { 'database': database, 'table_name': table_name } editor_type = 'impala' if table_format == 'kudu' else destination[ 'sourceType'] on_success_url = reverse('metastore:describe_table', kwargs={ 'database': database, 'table': final_table_name }) + '?source_type=' + source_type return make_notebook(name=_('Creating table %(database)s.%(table)s') % { 'database': database, 'table': final_table_name }, editor_type=editor_type, statement=sql.strip(), status='ready', database=database, on_success_url=on_success_url, last_executed=start_time, is_task=True)
def query(self, collection, query): solr_query = {} json_facets = {} solr_query['collection'] = collection['name'] if query.get('download'): solr_query['rows'] = 1000 solr_query['start'] = 0 else: solr_query['rows'] = int(collection['template']['rows'] or 10) solr_query['start'] = int(query['start']) solr_query['rows'] = min(solr_query['rows'], 1000) solr_query['start'] = min(solr_query['start'], 10000) params = self._get_params() + ( ('q', self._get_q(query)), ('wt', 'json'), ('rows', solr_query['rows']), ('start', solr_query['start']), ) if any(collection['facets']): params += ( ('facet', 'true'), ('facet.mincount', 0), ('facet.limit', 10), ) timeFilter = self._get_range_borders(collection, query) for facet in collection['facets']: if facet['type'] == 'query': params += (('facet.query', '%s' % facet['field']),) elif facet['type'] == 'range' or facet['type'] == 'range-up': keys = { 'id': '%(id)s' % facet, 'field': facet['field'], 'key': '%(field)s-%(id)s' % facet, 'start': facet['properties']['start'], 'end': facet['properties']['end'], 'gap': facet['properties']['gap'], 'mincount': int(facet['properties']['mincount']) } if facet['properties']['canRange'] or timeFilter and timeFilter['time_field'] == facet['field'] and (facet['id'] not in timeFilter['time_filter_overrides'] or facet['widgetType'] != 'histogram-widget'): keys.update(self._get_time_filter_query(timeFilter, facet, collection)) params += ( ('facet.range', '{!key=%(key)s ex=%(id)s f.%(field)s.facet.range.start=%(start)s f.%(field)s.facet.range.end=%(end)s f.%(field)s.facet.range.gap=%(gap)s f.%(field)s.facet.mincount=%(mincount)s}%(field)s' % keys), ) elif facet['type'] == 'field': keys = { 'id': '%(id)s' % facet, 'field': facet['field'], 'key': '%(field)s-%(id)s' % facet, 'limit': int(facet['properties'].get('limit', 10)) + (1 if facet['widgetType'] == 'facet-widget' else 0), 'mincount': int(facet['properties']['mincount']) } params += ( ('facet.field', '{!key=%(key)s ex=%(id)s f.%(field)s.facet.limit=%(limit)s f.%(field)s.facet.mincount=%(mincount)s}%(field)s' % keys), ) elif facet['type'] == 'nested': _f = {} if facet['properties']['facets']: self._n_facet_dimension(facet, _f, facet['properties']['facets'], 1, timeFilter, collection, can_range = facet['properties']['canRange']) if facet['properties'].get('domain'): if facet['properties']['domain'].get('blockParent') or facet['properties']['domain'].get('blockChildren'): _f['domain'] = {} if facet['properties']['domain'].get('blockParent'): _f['domain']['blockParent'] = ' OR '.join(facet['properties']['domain']['blockParent']) if facet['properties']['domain'].get('blockChildren'): _f['domain']['blockChildren'] = ' OR '.join(facet['properties']['domain']['blockChildren']) if _f: sort = {'count': facet['properties']['facets'][0]['sort']} for i, agg in enumerate(self._get_dimension_aggregates(facet['properties']['facets'][1:])): if agg['sort'] != 'default': agg_function = self._get_aggregate_function(agg) sort = {'agg_%02d_%02d:%s' % (1, i, agg_function): agg['sort']} if sort.get('count') == 'default': sort['count'] = 'desc' dim_key = [key for key in list(_f['facet'].keys()) if 'dim' in key][0] _f['facet'][dim_key].update({ 'excludeTags': facet['id'], 'offset': 0, 'numBuckets': True, 'allBuckets': True, 'sort': sort #'prefix': '' # Forbidden on numeric fields }) json_facets[facet['id']] = _f['facet'][dim_key] elif facet['type'] == 'function': if facet['properties']['facets']: json_facets[facet['id']] = self._get_aggregate_function(facet['properties']['facets'][0]) if 
facet['properties']['compare']['is_enabled']: # TODO: global compare override unit = re.split('\d+', facet['properties']['compare']['gap'])[1] json_facets[facet['id']] = { 'type': 'range', 'field': collection['timeFilter'].get('field'), 'start': 'NOW/%s-%s-%s' % (unit, facet['properties']['compare']['gap'], facet['properties']['compare']['gap']), 'end': 'NOW/%s' % unit, 'gap': '+%(gap)s' % facet['properties']['compare'], 'facet': {facet['id']: json_facets[facet['id']]} } if facet['properties']['filter']['is_enabled']: json_facets[facet['id']] = { 'type': 'query', 'q': facet['properties']['filter']['query'] or EMPTY_QUERY.get(), 'facet': {facet['id']: json_facets[facet['id']]} } json_facets['processEmpty'] = True elif facet['type'] == 'pivot': if facet['properties']['facets'] or facet['widgetType'] == 'map-widget': fields = facet['field'] fields_limits = [] for f in facet['properties']['facets']: fields_limits.append('f.%s.facet.limit=%s' % (f['field'], f['limit'])) fields_limits.append('f.%s.facet.mincount=%s' % (f['field'], f['mincount'])) fields += ',' + f['field'] keys = { 'id': '%(id)s' % facet, 'key': '%(field)s-%(id)s' % facet, 'field': facet['field'], 'fields': fields, 'limit': int(facet['properties'].get('limit', 10)), 'mincount': int(facet['properties']['mincount']), 'fields_limits': ' '.join(fields_limits) } params += ( ('facet.pivot', '{!key=%(key)s ex=%(id)s f.%(field)s.facet.limit=%(limit)s f.%(field)s.facet.mincount=%(mincount)s %(fields_limits)s}%(fields)s' % keys), ) params += self._get_fq(collection, query) fl = urllib_unquote(utf_quoter(','.join(Collection2.get_field_list(collection)))) nested_fields = self._get_nested_fields(collection) if nested_fields: fl += urllib_unquote(utf_quoter(',[child parentFilter="%s"]' % ' OR '.join(nested_fields))) if collection['template']['moreLikeThis'] and fl != ['*']: # Potential conflict with nested documents id_field = collection.get('idField', 'id') params += ( ('mlt', 'true'), ('mlt.fl', fl.replace(',%s' % id_field, '')), ('mlt.mintf', 1), ('mlt.mindf', 1), ('mlt.maxdf', 50), ('mlt.maxntp', 1000), ('mlt.count', 10), #('mlt.minwl', 1), #('mlt.maxwl', 1), ) fl = '*' params += (('fl', fl),) params += ( ('hl', 'true'), ('hl.fl', '*'), ('hl.snippets', 5), ('hl.fragsize', 1000), ) #if query.get('timezone'): # params += (('TZ', query.get('timezone')),) if collection['template']['fieldsSelected']: fields = [] for field in collection['template']['fieldsSelected']: attribute_field = [attribute for attribute in collection['template']['fieldsAttributes'] if field == attribute['name']] if attribute_field: if attribute_field[0]['sort']['direction']: fields.append('%s %s' % (field, attribute_field[0]['sort']['direction'])) if fields: params += ( ('sort', ','.join(fields)), ) if json_facets: response = self._root.post( '%(collection)s/select' % solr_query, params, data=json.dumps({'facet': json_facets}), contenttype='application/json') else: response = self._root.get('%(collection)s/select' % solr_query, params) return self._get_json(response)
def _small_indexing(user, fs, client, source, destination, index_name):
  kwargs = {}
  errors = []

  if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
    path = urllib_unquote(source["path"])
    stats = fs.stats(path)
    if stats.size > MAX_UPLOAD_SIZE:
      raise PopupException(_('File size is too large to handle!'))

  indexer = MorphlineIndexer(user, fs)

  fields = indexer.get_field_list(destination['columns'])
  _create_solr_collection(user, fs, client, destination, index_name, kwargs)

  if source['inputFormat'] == 'file':
    path = urllib_unquote(source["path"])
    data = fs.read(path, 0, MAX_UPLOAD_SIZE)

  if client.is_solr_six_or_more():
    kwargs['processor'] = 'tolerant'
    kwargs['map'] = 'NULL:'

  try:
    if source['inputFormat'] == 'query':
      query_id = source['query']['id'] if source['query'].get('id') else source['query']
      notebook = Notebook(document=Document2.objects.document(user=user, doc_id=query_id)).get_data()
      request = MockedDjangoRequest(user=user)
      snippet = notebook['snippets'][0]

      searcher = CollectionManagerController(user)
      columns = [field['name'] for field in fields if field['name'] != 'hue_id']
      # Assumes handle still live
      fetch_handle = lambda rows, start_over: get_api(request, snippet).fetch_result(
        notebook, snippet, rows=rows, start_over=start_over)
      rows = searcher.update_data_from_hive(index_name, columns, fetch_handle=fetch_handle, indexing_options=kwargs)
      # TODO if rows == MAX_ROWS truncation warning
    elif source['inputFormat'] == 'manual':
      pass  # No need to do anything
    else:
      response = client.index(name=index_name, data=data, **kwargs)
      errors = [error.get('message', '') for error in response['responseHeader'].get('errors', [])]
  except Exception as e:
    # Clean up the partially created collection before re-raising
    try:
      client.delete_index(index_name, keep_config=False)
    except Exception as e2:
      LOG.warning('Error while cleaning-up config of failed collection creation %s: %s' % (index_name, e2))
    raise e

  return {
    'status': 0,
    'on_success_url': reverse('indexer:indexes', kwargs={'index': index_name}),
    'pub_sub_url': 'assist.collections.refresh',
    'errors': errors
  }
def create_table_from_local_file(self, source, destination, start_time=-1):
  if '.' in destination['name']:
    database, table_name = destination['name'].split('.', 1)
  else:
    database = 'default'
    table_name = destination['name']
  final_table_name = table_name

  source_type = source['sourceType']
  editor_type = destination['sourceType']

  columns = destination['columns']

  if editor_type in ('hive', 'mysql'):
    if editor_type == 'mysql':
      for col in columns:
        if col['type'] == 'string':
          col['type'] = 'VARCHAR(255)'

    sql = '''CREATE TABLE IF NOT EXISTS %(database)s.%(table_name)s (
%(columns)s);
''' % {
      'database': database,
      'table_name': table_name,
      'columns': ',\n'.join(['  `%(name)s` %(type)s' % col for col in columns]),
    }

  path = urllib_unquote(source['path'])

  if path:  # data insertion
    with open(BASE_DIR + path, 'r') as local_file:
      reader = csv.reader(local_file)
      list_of_tuples = list(map(tuple, reader))

      if source['format']['hasHeader']:
        list_of_tuples = list_of_tuples[1:]

      csv_rows = str(list_of_tuples)[1:-1]

      if editor_type in ('hive', 'mysql'):
        sql += '''\nINSERT INTO %(database)s.%(table_name)s VALUES %(csv_rows)s;
''' % {
          'database': database,
          'table_name': table_name,
          'csv_rows': csv_rows
        }

  on_success_url = reverse('metastore:describe_table', kwargs={'database': database, 'table': final_table_name}) + \
      '?source_type=' + source_type

  return make_notebook(
    name=_('Creating table %(database)s.%(table)s') % {'database': database, 'table': final_table_name},
    editor_type=editor_type,
    statement=sql.strip(),
    status='ready',
    database=database,
    on_success_url=on_success_url,
    last_executed=start_time,
    is_task=True)
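# Editor's illustrative sketch (hypothetical table and rows, not part of the class above):
# the statement shape create_table_from_local_file() produces for a two-column CSV targeting Hive.
_example_columns = [{'name': 'name', 'type': 'string'}, {'name': 'amount', 'type': 'int'}]
_example_rows = [('alice', '10'), ('bob', '20')]  # header row already stripped
_example_sql = 'CREATE TABLE IF NOT EXISTS default.demo (\n%s);' % ',\n'.join(
  '  `%(name)s` %(type)s' % col for col in _example_columns)
_example_sql += '\nINSERT INTO default.demo VALUES %s;' % str(_example_rows)[1:-1]
# CREATE TABLE IF NOT EXISTS default.demo (
#   `name` string,
#   `amount` int);
# INSERT INTO default.demo VALUES ('alice', '10'), ('bob', '20');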
def _large_indexing(request, file_format, collection_name, query=None, start_time=None, lib_path=None, destination=None):
  indexer = MorphlineIndexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    schema_fields += [{"name": unique_field, "type": "string"}]

  client = SolrClient(user=request.user)

  if not client.exists(collection_name) and not request.POST.get('show_command'):  # if destination['isTargetExisting']:
    client.create_index(
      name=collection_name,
      fields=request.POST.get('fields', schema_fields),
      unique_key_field=unique_field
      # No df currently
    )
  else:
    # TODO: check if format matches
    pass

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'stream' and file_format['streamSelection'] == 'flume':
    indexer = FlumeIndexer(user=request.user)
    if request.POST.get('show_command'):
      configs = indexer.generate_config(file_format, destination)
      return {'status': 0, 'commands': configs[-1]}
    else:
      return indexer.start(collection_name, file_format, destination)
  elif file_format['inputFormat'] == 'stream':
    return _envelope_job(request, file_format, destination, start_time=start_time, lib_path=lib_path)
  elif file_format['inputFormat'] == 'file':
    input_path = '${nameNode}%s' % urllib_unquote(file_format["path"])
  else:
    input_path = None

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field, lib_path=lib_path)

  return indexer.run_morphline(
    request,
    collection_name,
    morphline,
    input_path,
    query,
    start_time=start_time,
    lib_path=lib_path
  )
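# Editor's illustrative sketch (hypothetical values): the two small transformations
# _large_indexing() applies before launching the Morphline job -- appending the generated
# unique-key field to the kept schema fields, and prefixing file paths so the workflow can
# resolve them against the configured nameNode.
_example_schema_fields = [{'name': 'message', 'type': 'text_general'}]
_example_unique_field = 'hue_id'
_example_schema_fields += [{'name': _example_unique_field, 'type': 'string'}]
_example_input_path = '${nameNode}%s' % '/user/demo/logs.csv'  # '/user/demo/logs.csv' is a made-up HDFS path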
def importer_submit(request):
  source = json.loads(request.POST.get('source', '{}'))
  outputFormat = json.loads(request.POST.get('destination', '{}'))['outputFormat']
  destination = json.loads(request.POST.get('destination', '{}'))
  destination['ouputFormat'] = outputFormat  # Workaround a very weird bug
  start_time = json.loads(request.POST.get('start_time', '-1'))

  file_encoding = None
  if source['inputFormat'] == 'file':
    if source['path']:
      path = urllib_unquote(source['path'])
      if path[-3:] == 'xls' or path[-4:] == 'xlsx':
        path = excel_to_csv_file_name_change(path)
      source['path'] = request.fs.netnormpath(path)
      stream = request.fs.open(path)
      file_encoding = check_encoding(stream.read(10000))

  if destination['ouputFormat'] in ('database', 'table') and request.fs is not None:
    destination['nonDefaultLocation'] = request.fs.netnormpath(destination['nonDefaultLocation']) \
      if destination['nonDefaultLocation'] else destination['nonDefaultLocation']

  if destination['ouputFormat'] == 'index':
    source['columns'] = destination['columns']
    index_name = destination["name"]

    if destination['indexerRunJob'] or source['inputFormat'] == 'stream':
      _convert_format(source["format"], inverse=True)
      job_handle = _large_indexing(
        request, source, index_name, start_time=start_time, lib_path=destination['indexerJobLibPath'], destination=destination)
    else:
      client = SolrClient(request.user)
      job_handle = _small_indexing(request.user, request.fs, client, source, destination, index_name)
  elif destination['ouputFormat'] == 'stream-table':
    args = {
      'source': source,
      'destination': destination,
      'start_time': start_time,
      'dry_run': request.POST.get('show_command')
    }
    api = FlinkIndexer(request.user, request.fs)

    job_nb = api.create_table_from_kafka(**args)

    if request.POST.get('show_command'):
      job_handle = {'status': 0, 'commands': job_nb}
    else:
      job_handle = job_nb.execute(request, batch=False)
  elif source['inputFormat'] == 'altus':
    # BDR copy or DistCP + DDL + Sentry DDL copy
    pass
  elif source['inputFormat'] == 'rdbms':
    if destination['outputFormat'] in ('database', 'file', 'table', 'hbase'):
      job_handle = run_sqoop(request, source, destination, start_time)
  elif destination['ouputFormat'] == 'database':
    job_handle = _create_database(request, source, destination, start_time)
  elif destination['ouputFormat'] == 'big-table':
    args = {
      'request': request,
      'source': source,
      'destination': destination,
      'start_time': start_time,
      'dry_run': request.POST.get('show_command')
    }
    api = PhoenixIndexer(request.user, request.fs)

    job_nb = api.create_table_from_file(**args)

    if request.POST.get('show_command'):
      job_handle = {'status': 0, 'commands': job_nb}
    else:
      job_handle = job_nb.execute(request, batch=False)
  else:
    if source['inputFormat'] == 'localfile':
      job_handle = _create_table_from_local(request, source, destination, start_time)
    else:
      # TODO: if inputFormat is 'stream' and tableFormat is 'kudu' --> create Table only
      job_handle = _create_table(request, source, destination, start_time, file_encoding)

  request.audit = {
    'operation': 'EXPORT',
    'operationText': 'User %(username)s exported %(inputFormat)s to %(ouputFormat)s: %(name)s' % {
      'username': request.user.username,
      'inputFormat': source['inputFormat'],
      'ouputFormat': destination['ouputFormat'],
      'name': destination['name'],
    },
    'allowed': True
  }

  return JsonResponse(job_handle)
def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'localfile':
    path = urllib_unquote(file_format['path'])

    with open(path, 'r') as local_file:
      reader = csv.reader(local_file)
      csv_data = list(reader)

      if file_format['format']['hasHeader']:
        sample = csv_data[1:5]
        column_row = [re.sub('[^0-9a-zA-Z]+', '_', col) for col in csv_data[0]]
      else:
        sample = csv_data[:4]
        column_row = ['field_' + str(count + 1) for count, col in enumerate(sample[0])]

      field_type_guesses = []
      for count, col in enumerate(column_row):
        column_samples = [sample_row[count] for sample_row in sample if len(sample_row) > count]
        field_type_guess = guess_field_type_from_samples(column_samples)
        field_type_guesses.append(field_type_guess)

      columns = [
        Field(column_row[count], field_type_guesses[count]).to_dict()
        for count, col in enumerate(column_row)
      ]

      format_ = {'columns': columns, 'sample': sample}
  elif file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    path = urllib_unquote(file_format["path"])
    if path[-3:] == 'xls' or path[-4:] == 'xlsx':
      path = excel_to_csv_file_name_change(path)
    stream = request.fs.open(path)
    encoding = check_encoding(stream.read(10000))
    LOG.debug('File %s encoding is %s' % (path, encoding))
    stream.seek(0)
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
        "stream": stream,
        "name": path
      },
      "format": file_format['format']
    })

    # Note: Would also need to set charset to table (only supported in Hive)
    if 'sample' in format_ and format_['sample']:
      format_['sample'] = escape_rows(format_['sample'], nulls_only=True, encoding=encoding)
    for col in format_['columns']:
      col['name'] = smart_unicode(col['name'], errors='replace', encoding=encoding)
  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data(
      {'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
      "sample": sample['rows'][:4],
      "columns": [
        Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
        for col in table_metadata.cols
      ]
    }
  elif file_format['inputFormat'] == 'query':
    query_id = file_format['query']['id'] if file_format['query'].get('id') else file_format['query']

    notebook = Notebook(document=Document2.objects.document(user=request.user, doc_id=query_id)).get_data()
    snippet = notebook['snippets'][0]
    db = get_api(request, snippet)

    if file_format.get('sampleCols'):
      columns = file_format.get('sampleCols')
      sample = file_format.get('sample')
    else:
      snippet['query'] = snippet['statement']
      try:
        sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4]
      except Exception as e:
        LOG.warning('Skipping sample data as query handle might be expired: %s' % e)
        sample = [[], [], [], [], []]
      columns = db.autocomplete(snippet=snippet, database='', table='')
      columns = [
        Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
        for col in columns['extended_columns']
      ]

    format_ = {
      "sample": sample,
      "columns": columns,
    }
  elif file_format['inputFormat'] == 'rdbms':
    api = _get_api(request)
    sample = api.get_sample_data(None, database=file_format['rdbmsDatabaseName'], table=file_format['tableName'])

    format_ = {
      "sample": list(sample['rows'])[:4],
      "columns": [
        Field(col['name'], col['type']).to_dict()
        for col in sample['full_headers']
      ]
    }
  elif file_format['inputFormat'] == 'stream':
    if file_format['streamSelection'] == 'kafka':
      data = get_topic_data(request.user, file_format.get('kafkaSelectedTopics'))

      kafkaFieldNames = [col['name'] for col in data['full_headers']]
      kafkaFieldTypes = [col['type'] for col in data['full_headers']]
      topics_data = data['rows']

      format_ = {
        "sample": topics_data,
        "columns": [
          Field(col, 'string', unique=False).to_dict()
          for col in kafkaFieldNames
        ]
      }
    elif file_format['streamSelection'] == 'flume':
      if 'hue-httpd/access_log' in file_format['channelSourcePath']:
        columns = [
          {'name': 'id', 'type': 'string', 'unique': True},
          {'name': 'client_ip', 'type': 'string'},
          {'name': 'time', 'type': 'date'},
          {'name': 'request', 'type': 'string'},
          {'name': 'code', 'type': 'plong'},
          {'name': 'bytes', 'type': 'plong'},
          {'name': 'method', 'type': 'string'},
          {'name': 'url', 'type': 'string'},
          {'name': 'protocol', 'type': 'string'},
          {'name': 'app', 'type': 'string'},
          {'name': 'subapp', 'type': 'string'}
        ]
      else:
        columns = [{'name': 'message', 'type': 'string'}]

      format_ = {
        "sample": [['...'] * len(columns)] * 4,
        "columns": [
          Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string'), unique=col.get('unique')).to_dict()
          for col in columns
        ]
      }
  elif file_format['inputFormat'] == 'connector':
    if file_format['connectorSelection'] == 'sfdc':
      sf = Salesforce(
        username=file_format['streamUsername'],
        password=file_format['streamPassword'],
        security_token=file_format['streamToken']
      )
      table_metadata = [{
          'name': column['name'],
          'type': column['type']
        } for column in sf.restful('sobjects/%(streamObject)s/describe/' % file_format)['fields']
      ]
      query = 'SELECT %s FROM %s LIMIT 4' % (
        ', '.join([col['name'] for col in table_metadata]), file_format['streamObject'])
      LOG.debug(query)

      try:
        records = sf.query_all(query)
      except SalesforceRefusedRequest as e:
        raise PopupException(message=str(e))

      format_ = {
        "sample": [list(row.values())[1:] for row in records['records']],
        "columns": [
          Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
          for col in table_metadata
        ]
      }
    else:
      raise PopupException(_('Connector format not recognized: %(connectorSelection)s') % file_format)
  else:
    raise PopupException(_('Input format not recognized: %(inputFormat)s') % file_format)

  return JsonResponse(format_)
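# Editor's illustrative sketch (all values hypothetical): the kind of 'fileFormat' JSON payload
# the 'localfile' branch of guess_field_types() expects, posted as the form field 'fileFormat'.
_example_file_format = {
  'inputFormat': 'localfile',
  'path': '/tmp/sales.csv',        # read straight from the web server's local disk
  'format': {'hasHeader': True},   # first row holds the column names
}
# With a header, rows 2-5 become the sample and column names are sanitized via
# re.sub('[^0-9a-zA-Z]+', '_', name); without one, columns are named field_1, field_2, ...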
def create_table_from_file(self, request, source, destination, start_time=-1, dry_run=False):
  if '.' in destination['name']:
    database, table_name = destination['name'].split('.', 1)
  else:
    database = 'default'
    table_name = destination['name']
  final_table_name = table_name

  source_type = [
    interpreter['type'] for interpreter in get_ordered_interpreters(self.user) if interpreter['dialect'] == 'phoenix'
  ][0]
  editor_type = source_type

  columns = destination['columns']

  # Until we have proper type conversion
  for col in columns:
    if col['type'] == 'string':
      col['type'] = 'varchar'

  sql = '''CREATE TABLE IF NOT EXISTS %(table_name)s (
%(columns)s
CONSTRAINT my_pk PRIMARY KEY (%(primary_keys)s)
);
''' % {
    'database': database,
    'table_name': table_name,
    'columns': ',\n'.join(['  %(name)s %(type)s' % col for col in columns]),
    'primary_keys': ', '.join(destination.get('indexerPrimaryKey'))
  }

  source_path = urllib_unquote(source['path'])
  if source['inputFormat'] == 'file':
    file_obj = request.fs.open(source_path)
    content = file_obj.read().decode("utf-8")
    csvfile = string_io(content)
    reader = csv.reader(csvfile)
  else:
    local_file = open(source_path, 'r')
    reader = csv.reader(local_file)

  if destination['indexerRunJob']:
    for count, csv_row in enumerate(reader):
      if (source['format']['hasHeader'] and count == 0) or not csv_row:
        continue
      else:
        _sql = ', '.join([
          "'{0}'".format(col_val) if columns[count]['type'] in ('varchar', 'timestamp') else '{0}'.format(col_val)
          for count, col_val in enumerate(csv_row)
        ])

        sql += '''\nUPSERT INTO %(table_name)s VALUES (%(csv_row)s);\n''' % {
          'database': database,
          'table_name': table_name,
          'csv_row': _sql
        }

  if dry_run:
    return sql
  else:
    on_success_url = reverse('metastore:describe_table', kwargs={'database': database, 'table': final_table_name}) + \
        '?source_type=' + source_type

    return make_notebook(
      name=_('Creating table %(database)s.%(table)s') % {'database': database, 'table': final_table_name},
      editor_type=editor_type,
      statement=sql.strip(),
      status='ready',
      database=database,
      on_success_url=on_success_url,
      last_executed=start_time,
      is_task=True
    )
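# Editor's illustrative sketch (hypothetical columns and row, not part of the class above):
# the quoting rule used when create_table_from_file() builds its Phoenix UPSERT statements --
# varchar and timestamp values are single-quoted, everything else is left bare.
_example_columns = [{'type': 'varchar'}, {'type': 'integer'}]
_example_row = ['alice', '10']
_example_values = ', '.join(
  "'{0}'".format(val) if _example_columns[i]['type'] in ('varchar', 'timestamp') else '{0}'.format(val)
  for i, val in enumerate(_example_row))
_example_upsert = 'UPSERT INTO demo VALUES (%s);' % _example_values
# -> "UPSERT INTO demo VALUES ('alice', 10);"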