Example #1
def parse_header_value(header: str) -> Tuple[str, Dict[str, str]]:
    """
    Parse an HTTP header value.

    Parameter values will be unquoted.
    If the key ends with an asterisk (``*``), the asterisk is removed from the key name and the
    value is then decoded according to :rfc:`2231`.

    :param header: the header value to parse
    :return: a tuple of (main value, params dict)

    """
    assert check_argument_types()
    main_value, params_str = header.partition(';')[::2]
    params = {}
    for match in header_param_re.finditer(params_str):
        key, value = match.groups()
        value = unquote(value)
        if key.endswith('*'):
            key = key[:-1]
            encoding, value = decode_rfc2231(value)[::2]
            value = urllib_unquote(value, encoding)

        params[key] = value

    return main_value.rstrip(), params
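A minimal usage sketch, not from the original project: it assumes a typical header_param_re that matches key=value pairs and that urllib_unquote maps to urllib.parse.unquote. The asterisk branch handles RFC 2231 encoded parameters.

# Hypothetical call; the exact params dict depends on how header_param_re is defined.
value, params = parse_header_value("form-data; name=\"file\"; filename*=UTF-8''na%C3%AFve.txt")
# value  -> 'form-data'
# params -> likely {'name': 'file', 'filename': 'naïve.txt'}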
Example #2
    def _ask_with_pinentry(self, prompt, description, error, validator):
        self._waitfor("OK")
        env = os.environ.get
        self._comm("OPTION lc-ctype=%s" % env("LC_CTYPE", env("LC_ALL", "en_US.UTF-8")))
        try:
            self._comm("OPTION ttyname=%s" % env("TTY", os.ttyname(sys.stdout.fileno())))
        except Exception:
            pass
        if env('TERM'):
            self._comm("OPTION ttytype=%s" % env("TERM"))
        if prompt:
            self._comm("SETPROMPT %s" % self._esc(prompt))
        if description:
            self._comm("SETDESC %s" % self._esc(description))
        password = None
        while not validator(password):
            if password is not None:
                self._comm("SETERROR %s" % self._esc(error))
            password = self._comm_getpin()

        # Passphrase may contain percent-encoded entities
        # gpg/pinentry: pinentry/pinentry.c#L392 copy_and_escape
        # https://github.com/gpg/pinentry/blob/master/pinentry/pinentry.c#L392
        password = urllib_unquote(password)

        return password
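A quick sketch of the decoding step above, assuming urllib_unquote is urllib.parse.unquote: pinentry's copy_and_escape percent-encodes characters such as '%', CR and LF before the passphrase crosses the Assuan wire, so the raw GETPIN reply has to be unquoted once.

from urllib.parse import unquote as urllib_unquote

raw = 'pa%25ss%0Aword'        # hypothetical value as received from pinentry
print(urllib_unquote(raw))    # -> 'pa%ss' + newline + 'word'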
Example #3
def importer_submit(request):
    source = json.loads(request.POST.get('source', '{}'))
    outputFormat = json.loads(request.POST.get('destination',
                                               '{}'))['outputFormat']
    destination = json.loads(request.POST.get('destination', '{}'))
    destination['ouputFormat'] = outputFormat  # Workaround a very weird bug
    start_time = json.loads(request.POST.get('start_time', '-1'))

    if source['inputFormat'] == 'file':
        if source['path']:
            path = urllib_unquote(source['path'])
            source['path'] = request.fs.netnormpath(path)

    if destination['ouputFormat'] in ('database', 'table'):
        destination['nonDefaultLocation'] = request.fs.netnormpath(destination['nonDefaultLocation']) \
                                            if destination['nonDefaultLocation'] else destination['nonDefaultLocation']

    if destination['ouputFormat'] == 'index':
        source['columns'] = destination['columns']
        index_name = destination["name"]

        if destination['indexerRunJob'] or source['inputFormat'] == 'stream':
            _convert_format(source["format"], inverse=True)
            job_handle = _large_indexing(
                request,
                source,
                index_name,
                start_time=start_time,
                lib_path=destination['indexerJobLibPath'],
                destination=destination)
        else:
            client = SolrClient(request.user)
            job_handle = _small_indexing(request.user, request.fs, client,
                                         source, destination, index_name)
    elif source['inputFormat'] in (
            'stream', 'connector') or destination['ouputFormat'] == 'stream':
        job_handle = _envelope_job(request,
                                   source,
                                   destination,
                                   start_time=start_time,
                                   lib_path=destination['indexerJobLibPath'])
    elif source['inputFormat'] == 'altus':
        # BDR copy or DistCP + DDL + Sentry DDL copy
        pass
    elif source['inputFormat'] == 'rdbms':
        if destination['outputFormat'] in ('database', 'file', 'table',
                                           'hbase'):
            job_handle = run_sqoop(request, source, destination, start_time)
    elif destination['ouputFormat'] == 'database':
        job_handle = _create_database(request, source, destination, start_time)
    else:
        job_handle = _create_table(request, source, destination, start_time)

    request.audit = {
        'operation': 'EXPORT',
        'operationText':
        'User %(username)s exported %(inputFormat)s to %(ouputFormat)s: %(name)s'
        % {
            'username': request.user.username,
            'inputFormat': source['inputFormat'],
            'ouputFormat': destination['ouputFormat'],
            'name': destination['name'],
        },
        'allowed': True
    }

    return JsonResponse(job_handle)
Example #4
def guess_format(request):
    file_format = json.loads(request.POST.get('fileFormat', '{}'))

    if file_format['inputFormat'] == 'file':
        path = urllib_unquote(file_format["path"])
        indexer = MorphlineIndexer(request.user, request.fs)
        if not request.fs.isfile(path):
            raise PopupException(
                _('Path %(path)s is not a file') % file_format)

        stream = request.fs.open(path)
        format_ = indexer.guess_format(
            {"file": {
                "stream": stream,
                "name": path
            }})
        _convert_format(format_)
    elif file_format['inputFormat'] == 'table':
        db = dbms.get(request.user)
        try:
            table_metadata = db.get_table(database=file_format['databaseName'],
                                          table_name=file_format['tableName'])
        except Exception as e:
            raise PopupException(
                e.message if hasattr(e, 'message') and e.message else e)
        storage = {}
        for delim in table_metadata.storage_details:
            if delim['data_type']:
                if '=' in delim['data_type']:
                    key, val = delim['data_type'].split('=', 1)
                    storage[key] = val
                else:
                    storage[delim['data_type']] = delim['comment']
        if table_metadata.details['properties']['format'] == 'text':
            format_ = {
                "quoteChar": "\"",
                "recordSeparator": '\\n',
                "type": "csv",
                "hasHeader": False,
                "fieldSeparator": storage.get('field.delim', ',')
            }
        elif table_metadata.details['properties']['format'] == 'parquet':
            format_ = {
                "type": "parquet",
                "hasHeader": False,
            }
        else:
            raise PopupException(
                'Hive table format %s is not supported.' %
                table_metadata.details['properties']['format'])
    elif file_format['inputFormat'] == 'query':
        format_ = {
            "quoteChar": "\"",
            "recordSeparator": "\\n",
            "type": "csv",
            "hasHeader": False,
            "fieldSeparator": "\u0001"
        }
    elif file_format['inputFormat'] == 'rdbms':
        format_ = {"type": "csv"}
    elif file_format['inputFormat'] == 'stream':
        if file_format['streamSelection'] == 'kafka':
            format_ = {
                "type": "csv",
                "fieldSeparator": ",",
                "hasHeader": True,
                "quoteChar": "\"",
                "recordSeparator": "\\n",
                'topics': get_topics()
            }
        elif file_format['streamSelection'] == 'flume':
            format_ = {
                "type": "csv",
                "fieldSeparator": ",",
                "hasHeader": True,
                "quoteChar": "\"",
                "recordSeparator": "\\n"
            }
    elif file_format['inputFormat'] == 'connector':
        if file_format['connectorSelection'] == 'sfdc':
            sf = Salesforce(username=file_format['streamUsername'],
                            password=file_format['streamPassword'],
                            security_token=file_format['streamToken'])
            format_ = {
                "type": "csv",
                "fieldSeparator": ",",
                "hasHeader": True,
                "quoteChar": "\"",
                "recordSeparator": "\\n",
                'objects': [
                    sobject['name']
                    for sobject in sf.restful('sobjects/')['sobjects']
                    if sobject['queryable']
                ]
            }
        else:
            raise PopupException(
                _('Input format %(inputFormat)s connector not recognized: %(connectorSelection)s')
                % file_format)
    else:
        raise PopupException(
            _('Input format not recognized: %(inputFormat)s') % file_format)

    format_['status'] = 0
    return JsonResponse(format_)
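For orientation, a sketch of what this view could return for a text-backed Hive table, based on the 'table' branch above; the field names are the ones the code sets, and the concrete fieldSeparator comes from the table's field.delim, defaulting to a comma:

# Hypothetical response body, before JsonResponse serializes it:
expected = {
    "quoteChar": "\"",
    "recordSeparator": "\\n",
    "type": "csv",
    "hasHeader": False,
    "fieldSeparator": ",",
    "status": 0,
}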
Example #5
def guess_field_types(request):
    file_format = json.loads(request.POST.get('fileFormat', '{}'))

    if file_format['inputFormat'] == 'file':
        indexer = MorphlineIndexer(request.user, request.fs)
        path = urllib_unquote(file_format["path"])
        stream = request.fs.open(path)
        encoding = check_encoding(stream.read(10000))
        stream.seek(0)
        _convert_format(file_format["format"], inverse=True)

        format_ = indexer.guess_field_types({
            "file": {
                "stream": stream,
                "name": path
            },
            "format": file_format['format']
        })

        # Note: Would also need to set charset to table (only supported in Hive)
        if 'sample' in format_ and format_['sample']:
            format_['sample'] = escape_rows(format_['sample'],
                                            nulls_only=True,
                                            encoding=encoding)
        for col in format_['columns']:
            col['name'] = smart_unicode(col['name'],
                                        errors='replace',
                                        encoding=encoding)

    elif file_format['inputFormat'] == 'table':
        sample = get_api(request, {
            'type': 'hive'
        }).get_sample_data({'type': 'hive'},
                           database=file_format['databaseName'],
                           table=file_format['tableName'])
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'],
                                      table_name=file_format['tableName'])

        format_ = {
            "sample": sample['rows'][:4],
            "columns": [
                Field(col.name,
                      HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
                for col in table_metadata.cols
            ]
        }
    elif file_format['inputFormat'] == 'query':
        query_id = file_format['query']['id'] if file_format['query'].get(
            'id') else file_format['query']

        notebook = Notebook(document=Document2.objects.document(
            user=request.user, doc_id=query_id)).get_data()
        snippet = notebook['snippets'][0]
        db = get_api(request, snippet)

        if file_format.get('sampleCols'):
            columns = file_format.get('sampleCols')
            sample = file_format.get('sample')
        else:
            snippet['query'] = snippet['statement']
            try:
                sample = db.fetch_result(notebook, snippet, 4,
                                         start_over=True)['rows'][:4]
            except Exception as e:
                LOG.warning(
                    'Skipping sample data as query handle might be expired: %s'
                    % e)
                sample = [[], [], [], [], []]
            columns = db.autocomplete(snippet=snippet, database='', table='')
            columns = [
                Field(
                    col['name'],
                    HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'],
                                                        'string')).to_dict()
                for col in columns['extended_columns']
            ]
        format_ = {
            "sample": sample,
            "columns": columns,
        }
    elif file_format['inputFormat'] == 'rdbms':
        api = _get_api(request)
        sample = api.get_sample_data(None,
                                     database=file_format['rdbmsDatabaseName'],
                                     table=file_format['tableName'])

        format_ = {
            "sample": list(sample['rows'])[:4],
            "columns": [
                Field(col['name'], col['type']).to_dict()
                for col in sample['full_headers']
            ]
        }
    elif file_format['inputFormat'] == 'stream':
        if file_format['streamSelection'] == 'kafka':
            if file_format.get(
                    'kafkaSelectedTopics') == 'NavigatorAuditEvents':
                kafkaFieldNames = [
                    'id', 'additionalInfo', 'allowed', 'collectionName',
                    'databaseName', 'db', 'DELEGATION_TOKEN_ID', 'dst',
                    'entityId', 'family', 'impersonator', 'ip', 'name',
                    'objectType', 'objType', 'objUsageType', 'operationParams',
                    'operationText', 'op', 'opText', 'path', 'perms',
                    'privilege', 'qualifier', 'QUERY_ID', 'resourcePath',
                    'service', 'SESSION_ID', 'solrVersion', 'src', 'status',
                    'subOperation', 'tableName', 'table', 'time', 'type',
                    'url', 'user'
                ]
                kafkaFieldTypes = ['string'] * len(kafkaFieldNames)
                kafkaFieldNames.append('timeDate')
                kafkaFieldTypes.append('date')
            else:
                # Note: mocked here, should come from SFDC or Kafka API or sampling job
                kafkaFieldNames = file_format.get('kafkaFieldNames',
                                                  '').split(',')
                kafkaFieldTypes = file_format.get('kafkaFieldTypes',
                                                  '').split(',')

            data = """%(kafkaFieldNames)s
%(data)s""" % {
                'kafkaFieldNames': ','.join(kafkaFieldNames),
                'data': '\n'.join(
                    [','.join(['...'] * len(kafkaFieldTypes))] * 5)
            }
            stream = string_io()
            stream.write(data)

            _convert_format(file_format["format"], inverse=True)

            indexer = MorphlineIndexer(request.user, request.fs)
            format_ = indexer.guess_field_types({
                "file": {
                    "stream": stream,
                    "name": file_format['path']
                },
                "format": file_format['format']
            })
            type_mapping = dict(list(zip(kafkaFieldNames, kafkaFieldTypes)))

            for col in format_['columns']:
                col['keyType'] = type_mapping[col['name']]
                col['type'] = type_mapping[col['name']]
        elif file_format['streamSelection'] == 'flume':
            if 'hue-httpd/access_log' in file_format['channelSourcePath']:
                columns = [{
                    'name': 'id',
                    'type': 'string',
                    'unique': True
                }, {
                    'name': 'client_ip',
                    'type': 'string'
                }, {
                    'name': 'time',
                    'type': 'date'
                }, {
                    'name': 'request',
                    'type': 'string'
                }, {
                    'name': 'code',
                    'type': 'plong'
                }, {
                    'name': 'bytes',
                    'type': 'plong'
                }, {
                    'name': 'method',
                    'type': 'string'
                }, {
                    'name': 'url',
                    'type': 'string'
                }, {
                    'name': 'protocol',
                    'type': 'string'
                }, {
                    'name': 'app',
                    'type': 'string'
                }, {
                    'name': 'subapp',
                    'type': 'string'
                }]
            else:
                columns = [{'name': 'message', 'type': 'string'}]

            format_ = {
                "sample": [['...'] * len(columns)] * 4,
                "columns": [
                    Field(col['name'],
                          HiveFormat.FIELD_TYPE_TRANSLATE.get(
                              col['type'], 'string'),
                          unique=col.get('unique')).to_dict()
                    for col in columns
                ]
            }
    elif file_format['inputFormat'] == 'connector':
        if file_format['connectorSelection'] == 'sfdc':
            sf = Salesforce(username=file_format['streamUsername'],
                            password=file_format['streamPassword'],
                            security_token=file_format['streamToken'])
            table_metadata = [{
                'name': column['name'],
                'type': column['type']
            } for column in sf.restful('sobjects/%(streamObject)s/describe/' %
                                       file_format)['fields']]
            query = 'SELECT %s FROM %s LIMIT 4' % (', '.join(
                [col['name']
                 for col in table_metadata]), file_format['streamObject'])
            LOG.debug(query)

            try:
                records = sf.query_all(query)
            except SalesforceRefusedRequest as e:
                raise PopupException(message=str(e))

            format_ = {
                "sample":
                [list(row.values())[1:] for row in records['records']],
                "columns": [
                    Field(
                        col['name'],
                        HiveFormat.FIELD_TYPE_TRANSLATE.get(
                            col['type'], 'string')).to_dict()
                    for col in table_metadata
                ]
            }
        else:
            raise PopupException(
                _('Connector format not recognized: %(connectorSelection)s') %
                file_format)
    else:
        raise PopupException(
            _('Input format not recognized: %(inputFormat)s') % file_format)

    return JsonResponse(format_)
Example #6
def export_result(request):
    response = {'status': -1, 'message': _('Success')}

    # Passed by check_document_access_permission but unused by APIs
    notebook = json.loads(request.POST.get('notebook', '{}'))
    snippet = json.loads(request.POST.get('snippet', '{}'))
    data_format = json.loads(request.POST.get('format', '"hdfs-file"'))
    destination = urllib_unquote(
        json.loads(request.POST.get('destination', '""')))
    overwrite = json.loads(request.POST.get('overwrite', 'false'))
    is_embedded = json.loads(request.POST.get('is_embedded', 'false'))
    start_time = json.loads(request.POST.get('start_time', '-1'))

    api = get_api(request, snippet)

    if data_format == 'hdfs-file':  # Blocking operation, like downloading
        if request.fs.isdir(destination):
            if notebook.get('name'):
                destination += '/%(name)s.csv' % notebook
            else:
                destination += '/%(type)s-%(id)s.csv' % notebook
        if overwrite and request.fs.exists(destination):
            request.fs.do_as_user(request.user.username, request.fs.rmtree,
                                  destination)
        response['watch_url'] = api.export_data_as_hdfs_file(
            snippet, destination, overwrite)
        response['status'] = 0
        request.audit = {
            'operation': 'EXPORT',
            'operationText': 'User %s exported to HDFS destination: %s' %
                             (request.user.username, destination),
            'allowed': True
        }
    elif data_format == 'hive-table':
        if is_embedded:
            sql, success_url = api.export_data_as_table(
                notebook, snippet, destination)

            task = make_notebook(name=_('Export %s query to table %s') %
                                 (snippet['type'], destination),
                                 description=_('Query %s to %s') %
                                 (_get_snippet_name(notebook), success_url),
                                 editor_type=snippet['type'],
                                 statement=sql,
                                 status='ready',
                                 database=snippet['database'],
                                 on_success_url=success_url,
                                 last_executed=start_time,
                                 is_task=True)
            response = task.execute(request)
        else:
            notebook_id = notebook['id'] or request.GET.get(
                'editor', request.GET.get('notebook'))
            response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=save_as_table&notebook=' + str(notebook_id) + \
                '&snippet=0&destination=' + destination
            response['status'] = 0
        request.audit = {
            'operation': 'EXPORT',
            'operationText': 'User %s exported to Hive table: %s' %
                             (request.user.username, destination),
            'allowed': True
        }
    elif data_format == 'hdfs-directory':
        if destination.lower().startswith("abfs"):
            destination = abfspath(destination)
        if request.fs.exists(destination) and request.fs.listdir_stats(
                destination):
            raise PopupException(
                _('The destination is not an empty directory!'))
        if is_embedded:
            sql, success_url = api.export_large_data_to_hdfs(
                notebook, snippet, destination)

            task = make_notebook(name=_('Export %s query to directory') %
                                 snippet['type'],
                                 description=_('Query %s to %s') %
                                 (_get_snippet_name(notebook), success_url),
                                 editor_type=snippet['type'],
                                 statement=sql,
                                 status='ready-execute',
                                 database=snippet['database'],
                                 on_success_url=success_url,
                                 last_executed=start_time,
                                 is_task=True)
            response = task.execute(request)
        else:
            notebook_id = notebook['id'] or request.GET.get(
                'editor', request.GET.get('notebook'))
            response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=insert_as_query&notebook=' + str(notebook_id) + \
                '&snippet=0&destination=' + destination
            response['status'] = 0
        request.audit = {
            'operation': 'EXPORT',
            'operationText': 'User %s exported to HDFS directory: %s' %
                             (request.user.username, destination),
            'allowed': True
        }
    elif data_format in ('search-index', 'dashboard'):
        # Open the result in the Dashboard via a SQL sub-query or the Import wizard (quick vs scalable)
        if is_embedded:
            notebook_id = notebook['id'] or request.GET.get(
                'editor', request.GET.get('notebook'))

            if data_format == 'dashboard':
                engine = notebook['type'].replace('query-', '')
                response['watch_url'] = reverse(
                    'dashboard:browse', kwargs={
                        'name': notebook_id
                    }) + '?source=query&engine=%(engine)s' % {
                        'engine': engine
                    }
                response['status'] = 0
            else:
                sample = get_api(request,
                                 snippet).fetch_result(notebook,
                                                       snippet,
                                                       rows=4,
                                                       start_over=True)
                for col in sample['meta']:
                    col['type'] = HiveFormat.FIELD_TYPE_TRANSLATE.get(
                        col['type'], 'string')

                response['status'] = 0
                response['id'] = notebook_id
                response['name'] = _get_snippet_name(notebook)
                response['source_type'] = 'query'
                response['target_type'] = 'index'
                response['target_path'] = destination
                response['sample'] = list(sample['data'])
                response['columns'] = [
                    Field(col['name'], col['type']).to_dict()
                    for col in sample['meta']
                ]
        else:
            notebook_id = notebook['id'] or request.GET.get(
                'editor', request.GET.get('notebook'))
            response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=index_query&notebook=' + str(notebook_id) + \
                '&snippet=0&destination=' + destination
            response['status'] = 0

        if response.get('status') != 0:
            response['message'] = _('Exporting result failed.')

    return JsonResponse(response)
Example #7
0
    def clean_path(self):
        return urllib_unquote(self.cleaned_data.get('path', ''))
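For context, a minimal sketch of the kind of Django form this clean_path method would sit on; the form and field names here are illustrative, not taken from the original project:

from django import forms
from urllib.parse import unquote as urllib_unquote


class PathForm(forms.Form):
    path = forms.CharField(required=False)

    def clean_path(self):
        # Undo percent-escaping (e.g. '%20') applied to the submitted path
        return urllib_unquote(self.cleaned_data.get('path', ''))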
Example #8
    def __init__(self, uri, basepath=None):
        self.basepath = basepath
        self.mimetype = None
        self.file = None
        self.data = None
        self.uri = None
        self.local = None
        self.tmp_file = None
        uri = uri or str()
        if not isinstance(uri, str):
            uri = uri.decode("utf-8")
        log.debug("FileObject %r, Basepath: %r", uri, basepath)

        # Data URI
        if uri.startswith("data:"):
            m = _rx_datauri.match(uri)
            self.mimetype = m.group("mime")
            b64 = urllib_unquote(m.group("data")).encode("utf-8")
            self.data = base64.b64decode(b64)

        else:
            # Check if we have an external scheme
            if basepath and not urlparse.urlparse(uri).scheme:
                urlParts = urlparse.urlparse(basepath)
            else:
                urlParts = urlparse.urlparse(uri)

            log.debug("URLParts: {}".format((urlParts, urlParts.scheme)))

            if urlParts.scheme == 'file':
                if basepath and uri.startswith('/'):
                    uri = urlparse.urljoin(basepath, uri[1:])
                urlResponse = urllib2.urlopen(uri)
                self.mimetype = urlResponse.info().get(
                    "Content-Type", '').split(";")[0]
                self.uri = urlResponse.geturl()
                self.file = urlResponse

            # Drive letters have len==1 but we are looking
            # for things like http:
            elif urlParts.scheme in ('http', 'https'):

                log.debug("Sending request for {} with httplib".format(uri))

                # External data
                if basepath:
                    uri = urlparse.urljoin(basepath, uri)

                log.debug("Uri parsed: {}".format(uri))

                #path = urlparse.urlsplit(url)[2]
                #mimetype = getMimeType(path)

                # Using HTTPLIB
                server, path = urllib2.splithost(uri[uri.find("//"):])
                if uri.startswith("https://"):
                    conn = httplib.HTTPSConnection(server,  **httpConfig)
                else:
                    conn = httplib.HTTPConnection(server)
                conn.request("GET", path)
                r1 = conn.getresponse()
                # log.debug("HTTP %r %r %r %r", server, path, uri, r1)
                if (r1.status, r1.reason) == (200, "OK"):
                    self.mimetype = r1.getheader(
                        "Content-Type", '').split(";")[0]
                    self.uri = uri
                    log.debug("here")
                    if r1.getheader("content-encoding") == "gzip":
                        import gzip

                        # GzipFile needs a bytes buffer, not a text StringIO
                        self.file = gzip.GzipFile(
                            mode="rb", fileobj=six.BytesIO(r1.read()))
                    else:
                        self.file = pisaTempFile(r1.read())
                else:
                    log.debug(
                        "Received non-200 status: {}".format((r1.status, r1.reason)))
                    try:
                        urlResponse = urllib2.urlopen(uri)
                    except urllib2.HTTPError as e:
                        log.error("Could not process uri: {}".format(e))
                        return
                    self.mimetype = urlResponse.info().get(
                        "Content-Type", '').split(";")[0]
                    self.uri = urlResponse.geturl()
                    self.file = urlResponse

            else:

                log.debug("Unrecognized scheme, assuming local file path")

                # Local data
                if basepath:
                    if sys.platform == 'win32' and os.path.isfile(basepath):
                        basepath = os.path.dirname(basepath)
                    uri = os.path.normpath(os.path.join(basepath, uri))

                if os.path.isfile(uri):
                    self.uri = uri
                    self.local = uri

                    self.setMimeTypeByName(uri)
                    if self.mimetype and self.mimetype.startswith('text'):
                        self.file = open(uri, "r")  # removed bytes... lets hope it goes ok :/
                    else:
                        # removed bytes... lets hope it goes ok :/
                        self.file = open(uri, "rb")
Example #9
def importer_submit(request):
    source = json.loads(request.POST.get('source', '{}'))
    outputFormat = json.loads(request.POST.get('destination',
                                               '{}'))['outputFormat']
    destination = json.loads(request.POST.get('destination', '{}'))
    destination['ouputFormat'] = outputFormat  # Workaround a very weird bug
    start_time = json.loads(request.POST.get('start_time', '-1'))

    if source['inputFormat'] == 'file':
        if source['path']:
            path = urllib_unquote(source['path'])
            source['path'] = request.fs.netnormpath(path)
            parent_path = request.fs.parent_path(path)
            stats = request.fs.stats(parent_path)
            split = urlparse(path)
            # Only for HDFS, import data and non-external table
            if (split.scheme in ('', 'hdfs')
                    and destination['importData']
                    and destination['useDefaultLocation']
                    and oct(stats["mode"])[-1] != '7'
                    and not request.POST.get('show_command')):
                user_scratch_dir = request.fs.get_home_dir() + '/.scratchdir'
                request.fs.do_as_user(request.user, request.fs.mkdir,
                                      user_scratch_dir, 0o0777)
                request.fs.do_as_user(request.user, request.fs.rename,
                                      source['path'], user_scratch_dir)
                source['path'] = user_scratch_dir + '/' + source['path'].split(
                    '/')[-1]

    if destination['ouputFormat'] in ('database', 'table'):
        destination['nonDefaultLocation'] = request.fs.netnormpath(
            destination['nonDefaultLocation']) if destination[
                'nonDefaultLocation'] else destination['nonDefaultLocation']

    if destination['ouputFormat'] == 'index':
        source['columns'] = destination['columns']
        index_name = destination["name"]

        if destination['indexerRunJob'] or source['inputFormat'] == 'stream':
            _convert_format(source["format"], inverse=True)
            job_handle = _large_indexing(
                request,
                source,
                index_name,
                start_time=start_time,
                lib_path=destination['indexerJobLibPath'],
                destination=destination)
        else:
            client = SolrClient(request.user)
            job_handle = _small_indexing(request.user, request.fs, client,
                                         source, destination, index_name)
    elif source['inputFormat'] in (
            'stream', 'connector') or destination['ouputFormat'] == 'stream':
        job_handle = _envelope_job(request,
                                   source,
                                   destination,
                                   start_time=start_time,
                                   lib_path=destination['indexerJobLibPath'])
    elif source['inputFormat'] == 'altus':
        # BDR copy or DistCP + DDL + Sentry DDL copy
        pass
    elif source['inputFormat'] == 'rdbms':
        if destination['outputFormat'] in ('database', 'file', 'table',
                                           'hbase'):
            job_handle = run_sqoop(request, source, destination, start_time)
    elif destination['ouputFormat'] == 'database':
        job_handle = _create_database(request, source, destination, start_time)
    else:
        job_handle = _create_table(request, source, destination, start_time)

    request.audit = {
        'operation': 'EXPORT',
        'operationText':
        'User %(username)s exported %(inputFormat)s to %(ouputFormat)s: %(name)s'
        % {
            'username': request.user.username,
            'inputFormat': source['inputFormat'],
            'ouputFormat': destination['ouputFormat'],
            'name': destination['name'],
        },
        'allowed': True
    }

    return JsonResponse(job_handle)
Example #10
    def __init__(self, uri, basepath=None):
        self.basepath = basepath
        self.mimetype = None
        self.file = None
        self.data = None
        self.uri = None
        self.local = None
        self.tmp_file = None
        uri = uri or str()
        if type(uri) != str:
            uri = uri.decode("utf-8")
        log.debug("FileObject %r, Basepath: %r", uri, basepath)

        # Data URI
        if uri.startswith("data:"):
            m = _rx_datauri.match(uri)
            self.mimetype = m.group("mime")
            b64 = urllib_unquote(m.group("data")).encode("utf-8")
            self.data = base64.b64decode(b64)

        else:
            # Check if we have an external scheme
            if basepath and not urlparse.urlparse(uri).scheme:
                urlParts = urlparse.urlparse(basepath)
            else:
                urlParts = urlparse.urlparse(uri)

            log.debug("URLParts: {}".format((urlParts, urlParts.scheme)))

            if urlParts.scheme == 'file':
                if basepath and uri.startswith('/'):
                    uri = urlparse.urljoin(basepath, uri[1:])
                urlResponse = urllib2.urlopen(uri)
                self.mimetype = urlResponse.info().get("Content-Type",
                                                       '').split(";")[0]
                self.uri = urlResponse.geturl()
                self.file = urlResponse

            # Drive letters have len==1 but we are looking
            # for things like http:
            elif urlParts.scheme in ('http', 'https'):

                log.debug("Sending request for {} with httplib".format(uri))

                # External data
                if basepath:
                    uri = urlparse.urljoin(basepath, uri)

                log.debug("Uri parsed: {}".format(uri))

                #path = urlparse.urlsplit(url)[2]
                #mimetype = getMimeType(path)

                # Using HTTPLIB
                server, path = urllib2.splithost(uri[uri.find("//"):])
                if uri.startswith("https://"):
                    conn = httplib.HTTPSConnection(server, **httpConfig)
                else:
                    conn = httplib.HTTPConnection(server)
                conn.request("GET", path)
                r1 = conn.getresponse()
                # log.debug("HTTP %r %r %r %r", server, path, uri, r1)
                if (r1.status, r1.reason) == (200, "OK"):
                    self.mimetype = r1.getheader("Content-Type",
                                                 '').split(";")[0]
                    self.uri = uri
                    log.debug("here")
                    if r1.getheader("content-encoding") == "gzip":
                        import gzip

                        self.file = gzip.GzipFile(mode="rb",
                                                  fileobj=six.BytesIO(
                                                      r1.read()))
                    else:
                        self.file = pisaTempFile(r1.read())
                else:
                    log.debug("Received non-200 status: {}".format(
                        (r1.status, r1.reason)))
                    try:
                        urlResponse = urllib2.urlopen(uri)
                    except urllib2.HTTPError as e:
                        log.error("Could not process uri: {}".format(e))
                        return
                    self.mimetype = urlResponse.info().get("Content-Type",
                                                           '').split(";")[0]
                    self.uri = urlResponse.geturl()
                    self.file = urlResponse

            else:

                log.debug("Unrecognized scheme, assuming local file path")

                # Local data
                if basepath:
                    if sys.platform == 'win32' and os.path.isfile(basepath):
                        basepath = os.path.dirname(basepath)
                    uri = os.path.normpath(os.path.join(basepath, uri))

                if os.path.isfile(uri):
                    self.uri = uri
                    self.local = uri

                    self.setMimeTypeByName(uri)
                    if self.mimetype and self.mimetype.startswith('text'):
                        self.file = open(
                            uri,
                            "r")  #removed bytes... lets hope it goes ok :/
                    else:
                        # removed bytes... lets hope it goes ok :/
                        self.file = open(uri, "rb")
Example #11
File: api3.py Project: mapr/hue
def guess_format(request):
    file_format = json.loads(request.POST.get('fileFormat', '{}'))
    file_type = file_format['file_type']
    path = urllib_unquote(file_format["path"])

    if sys.version_info[0] < 3 and (file_type == 'excel' or path[-3:] == 'xls'
                                    or path[-4:] == 'xlsx'):
        return JsonResponse({
            'status': -1,
            'message': 'Python2 based Hue does not support Excel file importer'
        })

    if file_format['inputFormat'] == 'localfile':
        if file_type == 'excel':
            format_ = {"type": "excel", "hasHeader": True}
        else:
            format_ = {
                "quoteChar": "\"",
                "recordSeparator": '\\n',
                "type": "csv",
                "hasHeader": True,
                "fieldSeparator": ","
            }

    elif file_format['inputFormat'] == 'file':
        if path[-3:] == 'xls' or path[-4:] == 'xlsx':
            file_obj = request.fs.open(path)
            if path[-3:] == 'xls':
                df = pd.read_excel(file_obj.read(1024 * 1024 * 1024),
                                   engine='xlrd')
            else:
                df = pd.read_excel(file_obj.read(1024 * 1024 * 1024),
                                   engine='openpyxl')
            _csv_data = df.to_csv(index=False)

            path = excel_to_csv_file_name_change(path)
            request.fs.create(path, overwrite=True, data=_csv_data)

        indexer = MorphlineIndexer(request.user, request.fs)
        if not request.fs.isfile(path):
            raise PopupException(
                _('Path %(path)s is not a file') % file_format)

        stream = request.fs.open(path)
        format_ = indexer.guess_format(
            {"file": {
                "stream": stream,
                "name": path
            }})
        _convert_format(format_)

        if file_format["path"][-3:] == 'xls' or file_format["path"][
                -4:] == 'xlsx':
            format_ = {
                "quoteChar": "\"",
                "recordSeparator": '\\n',
                "type": "excel",
                "hasHeader": True,
                "fieldSeparator": ","
            }

    elif file_format['inputFormat'] == 'table':
        db = dbms.get(request.user)
        try:
            table_metadata = db.get_table(database=file_format['databaseName'],
                                          table_name=file_format['tableName'])
        except Exception as e:
            raise PopupException(
                e.message if hasattr(e, 'message') and e.message else e)
        storage = {}
        for delim in table_metadata.storage_details:
            if delim['data_type']:
                if '=' in delim['data_type']:
                    key, val = delim['data_type'].split('=', 1)
                    storage[key] = val
                else:
                    storage[delim['data_type']] = delim['comment']
        if table_metadata.details['properties']['format'] == 'text':
            format_ = {
                "quoteChar": "\"",
                "recordSeparator": '\\n',
                "type": "csv",
                "hasHeader": False,
                "fieldSeparator": storage.get('field.delim', ',')
            }
        elif table_metadata.details['properties']['format'] == 'parquet':
            format_ = {
                "type": "parquet",
                "hasHeader": False,
            }
        else:
            raise PopupException(
                'Hive table format %s is not supported.' %
                table_metadata.details['properties']['format'])
    elif file_format['inputFormat'] == 'query':
        format_ = {
            "quoteChar": "\"",
            "recordSeparator": "\\n",
            "type": "csv",
            "hasHeader": False,
            "fieldSeparator": "\u0001"
        }
    elif file_format['inputFormat'] == 'rdbms':
        format_ = {"type": "csv"}
    elif file_format['inputFormat'] == 'stream':
        if file_format['streamSelection'] == 'kafka':
            format_ = {
                "type": "json",
                # "fieldSeparator": ",",
                # "hasHeader": True,
                # "quoteChar": "\"",
                # "recordSeparator": "\\n",
                'topics': get_topics(request.user)
            }
        elif file_format['streamSelection'] == 'flume':
            format_ = {
                "type": "csv",
                "fieldSeparator": ",",
                "hasHeader": True,
                "quoteChar": "\"",
                "recordSeparator": "\\n"
            }
    elif file_format['inputFormat'] == 'connector':
        if file_format['connectorSelection'] == 'sfdc':
            sf = Salesforce(username=file_format['streamUsername'],
                            password=file_format['streamPassword'],
                            security_token=file_format['streamToken'])
            format_ = {
                "type": "csv",
                "fieldSeparator": ",",
                "hasHeader": True,
                "quoteChar": "\"",
                "recordSeparator": "\\n",
                'objects': [
                    sobject['name']
                    for sobject in sf.restful('sobjects/')['sobjects']
                    if sobject['queryable']
                ]
            }
        else:
            raise PopupException(
                _('Input format %(inputFormat)s connector not recognized: %(connectorSelection)s')
                % file_format)
    else:
        raise PopupException(
            _('Input format not recognized: %(inputFormat)s') % file_format)

    format_['status'] = 0
    return JsonResponse(format_)
Example #12
    def create_table_from_a_file(self,
                                 source,
                                 destination,
                                 start_time=-1,
                                 file_encoding=None):
        if '.' in destination['name']:
            database, table_name = destination['name'].split('.', 1)
        else:
            database = 'default'
            table_name = destination['name']
        final_table_name = table_name

        table_format = destination['tableFormat']
        source_type = source['sourceType']

        columns = destination['columns']
        partition_columns = destination['partitionColumns']
        kudu_partition_columns = destination['kuduPartitionColumns']
        comment = destination['description']

        source_path = urllib_unquote(source['path'])
        load_data = destination['importData']
        external = not destination['useDefaultLocation']
        external_path = urllib_unquote(destination['nonDefaultLocation'])

        editor_type = destination['sourceType']
        is_transactional = destination['isTransactional']
        default_transactional_type = 'insert_only' if destination[
            'isInsertOnly'] else 'default'

        skip_header = destination['hasHeader']

        primary_keys = destination['primaryKeys']

        if destination['useCustomDelimiters']:
            field_delimiter = destination['customFieldDelimiter']
            collection_delimiter = destination[
                'customCollectionDelimiter'] or None
            map_delimiter = destination['customMapDelimiter'] or None
        else:
            field_delimiter = ','
            collection_delimiter = r'\002'
            map_delimiter = r'\003'
        regexp_delimiter = destination['customRegexp']

        file_format = 'TextFile'
        row_format = 'Delimited'
        serde_name = ''
        serde_properties = ''
        extra_create_properties = ''
        sql = ''

        if source['inputFormat'] == 'manual':
            load_data = False
            source['format'] = {'quoteChar': '"', 'fieldSeparator': ','}

        if table_format == 'json':
            row_format = 'serde'
            serde_name = 'org.apache.hive.hcatalog.data.JsonSerDe'
        elif table_format == 'regexp':
            row_format = 'serde'
            serde_name = 'org.apache.hadoop.hive.serde2.RegexSerDe'
            serde_properties = '"input.regex" = "%s"' % regexp_delimiter
        elif table_format == 'csv':
            if source['format']['quoteChar'] == '"':
                source['format']['quoteChar'] = '\\"'
            row_format = 'serde'
            serde_name = 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
            serde_properties = '''"separatorChar" = "%(fieldSeparator)s",
    "quoteChar"     = "%(quoteChar)s",
    "escapeChar"    = "\\\\"
    ''' % source['format']

        use_temp_table = table_format in ('parquet', 'orc',
                                          'kudu') or is_transactional
        if use_temp_table:  # We'll be using a temp table to load data
            if load_data:
                table_name, final_table_name = 'hue__tmp_%s' % table_name, table_name

                sql += '\n\nDROP TABLE IF EXISTS `%(database)s`.`%(table_name)s`;\n' % {
                    'database': database,
                    'table_name': table_name
                }
            else:  # Manual
                row_format = ''
                file_format = table_format
                skip_header = False
                if table_format == 'kudu':
                    columns = [
                        col for col in columns if col['name'] in primary_keys
                    ] + [
                        col
                        for col in columns if col['name'] not in primary_keys
                    ]

        if table_format == 'kudu':
            collection_delimiter = None
            map_delimiter = None

        if external or (load_data and table_format in (
                'parquet', 'orc', 'kudu')):  # We'll use location to load data
            if not self.fs.isdir(external_path):  # File selected
                external_path, external_file_name = Hdfs.split(external_path)

                if len(self.fs.listdir(external_path)) > 1:
                    # If dir not just the file, create data dir and move file there. Make sure it's unique.
                    external_path = external_path + '/%s%s_table' % (
                        external_file_name, str(uuid.uuid4()))
                    self.fs.mkdir(external_path)
                    self.fs.rename(source_path, external_path)
        elif load_data:  # We'll use load data command
            parent_path = self.fs.parent_path(source_path)
            stats = self.fs.stats(parent_path)
            split = urlparse(source_path)
            # Only for HDFS, import data and non-external table
            if split.scheme in ('', 'hdfs') and oct(stats["mode"])[-1] != '7':
                # Make sure it's unique.
                user_scratch_dir = self.fs.get_home_dir() + '/.scratchdir/%s' % str(uuid.uuid4())
                self.fs.do_as_user(self.user, self.fs.mkdir, user_scratch_dir,
                                   0o0777)
                self.fs.do_as_user(self.user, self.fs.rename, source['path'],
                                   user_scratch_dir)
                if USER_SCRATCH_DIR_PERMISSION.get():
                    self.fs.do_as_user(self.user, self.fs.chmod,
                                       user_scratch_dir, 0o0777, True)
                source_path = user_scratch_dir + '/' + source['path'].split(
                    '/')[-1]

        if external_path.lower().startswith("abfs"):  # check whether this is an ABFS path
            external_path = abfspath(external_path)

        tbl_properties = OrderedDict()
        if skip_header:
            tbl_properties['skip.header.line.count'] = '1'
        # The temp table is not transactional, but final table can be if is_transactional.
        # tbl_properties that don't exist in previous versions can safely be added without error.
        tbl_properties['transactional'] = 'false'

        sql += django_mako.render_to_string(
            "gen/create_table_statement.mako", {
                'table': {
                    'name': table_name,
                    'comment': comment,
                    'row_format': row_format,
                    'field_terminator': field_delimiter,
                    'collection_terminator': collection_delimiter if source_type == 'hive' else None,
                    'map_key_terminator': map_delimiter if source_type == 'hive' else None,
                    'serde_name': serde_name,
                    'serde_properties': serde_properties,
                    'file_format': file_format,
                    'external': external or (load_data and table_format in ('parquet', 'orc', 'kudu')),
                    'path': external_path,
                    'primary_keys': primary_keys if table_format == 'kudu' and not load_data else [],
                    'tbl_properties': tbl_properties
                },
                'columns': columns,
                'partition_columns': partition_columns,
                'kudu_partition_columns': kudu_partition_columns,
                'database': database
            })
        if file_encoding and file_encoding != 'ASCII' and file_encoding != 'utf-8' and not use_temp_table:
            sql += '\n\nALTER TABLE `%(database)s`.`%(final_table_name)s` ' \
                   'SET serdeproperties ("serialization.encoding"="%(file_encoding)s");' % {
                       'database': database,
                       'final_table_name': final_table_name,
                       'file_encoding': file_encoding
                   }

        if table_format in ('text', 'json', 'csv',
                            'regexp') and not external and load_data:
            form_data = {
                'path': source_path,
                'overwrite': False,
                'partition_columns': [(partition['name'], partition['partitionValue'])
                                      for partition in partition_columns],
            }
            query_server_config = dbms.get_query_server_config(
                name=source_type)
            db = dbms.get(self.user, query_server=query_server_config)
            sql += "\n\n%s;" % db.load_data(
                database, table_name, form_data, None, generate_ddl_only=True)

        if load_data and use_temp_table:
            file_format = 'TextFile' if table_format == 'text' else table_format
            if table_format == 'kudu':
                columns_list = [
                    '`%s`' % col for col in primary_keys + [
                        col['name'] for col in destination['columns']
                        if col['name'] not in primary_keys and col['keep']
                    ]
                ]
                extra_create_properties = """PRIMARY KEY (%(primary_keys)s)
        PARTITION BY HASH PARTITIONS 16
        STORED AS %(file_format)s
        TBLPROPERTIES(
        'kudu.num_tablet_replicas'='1'
        )""" % {
                    'file_format': file_format,
                    'primary_keys': ', '.join(primary_keys)
                }
            else:
                columns_list = ['*']
                extra_create_properties = 'STORED AS %(file_format)s' % {
                    'file_format': file_format
                }
                if is_transactional:
                    extra_create_properties += "\nTBLPROPERTIES('transactional'='true', 'transactional_properties'='%s')" % \
                        default_transactional_type

            sql += '''\n\nCREATE TABLE `%(database)s`.`%(final_table_name)s`%(comment)s
        %(extra_create_properties)s
        AS SELECT %(columns_list)s
        FROM `%(database)s`.`%(table_name)s`;''' % {
                'database': database,
                'final_table_name': final_table_name,
                'table_name': table_name,
                'extra_create_properties': extra_create_properties,
                'columns_list': ', '.join(columns_list),
                'comment': ' COMMENT "%s"' % comment if comment else ''
            }
            sql += '\n\nDROP TABLE IF EXISTS `%(database)s`.`%(table_name)s`;\n' % {
                'database': database,
                'table_name': table_name
            }
            if file_encoding and file_encoding != 'ASCII' and file_encoding != 'utf-8':
                sql += '\n\nALTER TABLE `%(database)s`.`%(final_table_name)s` ' \
                       'SET serdeproperties ("serialization.encoding"="%(file_encoding)s");' % {
                    'database': database,
                    'final_table_name': final_table_name,
                    'file_encoding': file_encoding
                }

        on_success_url = reverse('metastore:describe_table',
                                 kwargs={
                                     'database': database,
                                     'table': final_table_name
                                 }) + '?source_type=' + source_type

        return make_notebook(name=_('Creating table %(database)s.%(table)s') %
                             {
                                 'database': database,
                                 'table': final_table_name
                             },
                             editor_type=editor_type,
                             statement=sql.strip(),
                             status='ready',
                             database=database,
                             on_success_url=on_success_url,
                             last_executed=start_time,
                             is_task=True)
Example #13
    def __init__(self, uri, basepath=None):

        self.basepath = basepath
        self.mimetype = None
        self.file_content = None
        self.data = None
        self.uri = None
        self.local = None
        self.tmp_file = None
        uri = uri or str()
        if not isinstance(uri, str):
            uri = uri.decode("utf-8")
        log.debug("FileObject %r, Basepath: %r", uri, basepath)

        # Data URI
        if uri.startswith("data:"):
            m = _rx_datauri.match(uri)
            self.mimetype = m.group("mime")

            b64 = urllib_unquote(m.group("data"))

            # The data may be incorrectly unescaped... repairs needed
            b64 = b64.strip("b'").strip("'").encode()
            b64 = re.sub(b"\\n", b'', b64)
            b64 = re.sub(b'[^A-Za-z0-9\\+\\/]+', b'', b64)

            # Add padding as needed, to make length into a multiple of 4
            b64 += b"=" * ((4 - len(b64) % 4) % 4)

            self.data = base64.b64decode(b64)

        else:
            # Check if we have an external scheme
            if basepath and not urlparse.urlparse(uri).scheme:
                urlParts = urlparse.urlparse(basepath)
            else:
                urlParts = urlparse.urlparse(uri)

            log.debug("URLParts: {}".format((urlParts, urlParts.scheme)))

            if urlParts.scheme == 'file':
                if basepath and uri.startswith('/'):
                    uri = urlparse.urljoin(basepath, uri[1:])
                urlResponse = urllib2.urlopen(uri)
                self.mimetype = urlResponse.info().get(
                    "Content-Type", '').split(";")[0]
                self.uri = urlResponse.geturl()
                self.file_content = urlResponse.read()

            # Note: on Windows a bare drive letter parses as a one-character
            # scheme, so only explicit http/https schemes are treated as remote
            elif urlParts.scheme in ('http', 'https'):

                log.debug("Sending request for {} with httplib".format(uri))

                # External data
                if basepath:
                    uri = urlparse.urljoin(basepath, uri)

                log.debug("Uri parsed: {}".format(uri))

                #path = urlparse.urlsplit(url)[2]
                #mimetype = getMimeType(path)

                # Using HTTPLIB
                url_splitted = urlparse.urlsplit(uri)
                server = url_splitted[1]
                path = url_splitted[2]
                path += "?" + url_splitted[3] if url_splitted[3] else ""
                if uri.startswith("https://"):
                    conn = httplib.HTTPSConnection(server,  **httpConfig)
                else:
                    conn = httplib.HTTPConnection(server)
                conn.request("GET", path)
                r1 = conn.getresponse()
                # log.debug("HTTP %r %r %r %r", server, path, uri, r1)
                if (r1.status, r1.reason) == (200, "OK"):
                    self.mimetype = r1.getheader(
                        "Content-Type", '').split(";")[0]
                    self.uri = uri
                    log.debug("here")
                    if r1.getheader("content-encoding") == "gzip":
                        import gzip

                        self.file_content = gzip.GzipFile(
                            mode="rb", fileobj=six.BytesIO(r1.read()))
                    else:
                        self.file_content = pisaTempFile(r1.read())
                else:
                    log.debug(
                        "Received non-200 status: {}".format((r1.status, r1.reason)))
                    try:
                        urlResponse = urllib2.urlopen(uri)
                    except urllib2.HTTPError as e:
                        log.error("Could not process uri: {}".format(e))
                        return
                    self.mimetype = urlResponse.info().get(
                        "Content-Type", '').split(";")[0]
                    self.uri = urlResponse.geturl()
                    self.file_content = urlResponse.read()
                conn.close()

            else:

                log.debug("Unrecognized scheme, assuming local file path")

                # Local data
                if basepath:
                    if sys.platform == 'win32' and os.path.isfile(basepath):
                        basepath = os.path.dirname(basepath)
                    uri = os.path.normpath(os.path.join(basepath, uri))

                if os.path.isfile(uri):
                    self.uri = uri
                    self.local = uri

                    self.setMimeTypeByName(uri)
                    if self.mimetype and self.mimetype.startswith('text'):
                        with open(uri, "r") as file_handler:
                            # Text mimetypes are read as str
                            self.file_content = file_handler.read()
                    else:
                        with open(uri, "rb") as file_handler:
                            # Non-text mimetypes are read as bytes
                            self.file_content = file_handler.read()
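
A standalone sketch of the data-URI repair above, assuming the payload arrives percent-encoded and possibly wrapped in a stray bytes-literal: strip the wrapper, drop anything that is not a base64 character, re-pad to a multiple of four, then decode (payload below is made up).

import base64
import re
from urllib.parse import unquote

raw = unquote("aGVsbG8gd29ybGQ%3D")            # hypothetical percent-encoded payload
b64 = raw.strip("b'").strip("'").encode()      # remove a stray bytes-literal wrapper, if any
b64 = re.sub(b'[^A-Za-z0-9\\+\\/]+', b'', b64) # keep only base64 characters
b64 += b"=" * ((4 - len(b64) % 4) % 4)         # re-pad the length to a multiple of 4
print(base64.b64decode(b64))                   # -> b'hello world'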
Example #14
0
    def create_table_from_local_file(self, source, destination, start_time=-1):
        if '.' in destination['name']:
            database, table_name = destination['name'].split('.', 1)
        else:
            database = 'default'
            table_name = destination['name']
        final_table_name = table_name

        source_type = source['sourceType']
        editor_type = destination['sourceType']

        columns = destination['columns']

        dialect = get_interpreter(source_type, self.user)['dialect']

        if dialect in ('hive', 'mysql'):

            if dialect == 'mysql':
                for col in columns:
                    if col['type'] == 'string':
                        col['type'] = 'VARCHAR(255)'

            sql = '''CREATE TABLE IF NOT EXISTS %(database)s.%(table_name)s (
%(columns)s);\n''' % {
                'database': database,
                'table_name': table_name,
                'columns': ',\n'.join(['  `%(name)s` %(type)s' % col for col in columns]),
            }

        elif dialect == 'phoenix':

            for col in columns:
                if col['type'] == 'string':
                    col['type'] = 'CHAR(255)'

            sql = '''CREATE TABLE IF NOT EXISTS %(database)s.%(table_name)s (
%(columns)s
CONSTRAINT my_pk PRIMARY KEY (%(primary_keys)s));\n''' % {
                'database': database,
                'table_name': table_name,
                'columns': ',\n'.join(['  %(name)s %(type)s' % col for col in columns]),
                'primary_keys': ', '.join(destination.get('primaryKeys'))
            }

        elif dialect == 'impala':
            sql = '''CREATE TABLE IF NOT EXISTS %(database)s.%(table_name)s_tmp (
%(columns)s);\n''' % {
                'database': database,
                'table_name': table_name,
                'columns': ',\n'.join(['  `%(name)s` string' % col for col in columns]),
            }  # Impala does not implicitly cast between string and numeric or Boolean types.

        path = urllib_unquote(source['path'])

        if path:  # data insertion
            with open(path, 'r') as local_file:
                reader = csv.reader(local_file)
                _csv_rows = []

                for count, row in enumerate(reader):
                    if (source['format']['hasHeader']
                            and count == 0) or not row:
                        continue
                    if dialect == 'impala':  # map boolean column values to 1/0 for Impala
                        row = self.nomalize_booleans(row, columns)
                    _csv_rows.append(tuple(row))

                if _csv_rows:
                    csv_rows = str(_csv_rows)[1:-1]

                    if dialect in ('hive', 'mysql'):
                        sql += '''\nINSERT INTO %(database)s.%(table_name)s VALUES %(csv_rows)s;\n''' % {
                            'database': database,
                            'table_name': table_name,
                            'csv_rows': csv_rows
                        }
                    elif dialect == 'phoenix':
                        for csv_row in _csv_rows:
                            _sql = ', '.join([ "'{0}'".format(col_val) if columns[count]['type'] in ('CHAR(255)', 'timestamp') \
                              else '{0}'.format(col_val) for count, col_val in enumerate(csv_row)])

                            sql += '''\nUPSERT INTO %(database)s.%(table_name)s VALUES (%(csv_row)s);\n''' % {
                                'database': database,
                                'table_name': table_name,
                                'csv_row': _sql
                            }
                    elif dialect == 'impala':
                        # Impala cannot cast string directly to boolean, so go string -> TINYINT -> boolean
                        sql_ = ',\n'.join([
                          '  CAST ( `%(name)s` AS %(type)s ) `%(name)s`' % col if col['type'] != 'boolean' \
                          else '  CAST ( CAST ( `%(name)s` AS TINYINT ) AS boolean ) `%(name)s`' % col for col in columns
                        ])

                        sql += '''\nINSERT INTO %(database)s.%(table_name)s_tmp VALUES %(csv_rows)s;\n\nCREATE TABLE IF NOT EXISTS %(database)s.%(table_name)s
AS SELECT\n%(sql_)s\nFROM  %(database)s.%(table_name)s_tmp;\n\nDROP TABLE IF EXISTS %(database)s.%(table_name)s_tmp;''' % {
                            'database': database,
                            'table_name': table_name,
                            'csv_rows': csv_rows,
                            'sql_': sql_
                        }

        on_success_url = reverse('metastore:describe_table', kwargs={'database': database, 'table': final_table_name}) + \
            '?source_type=' + source_type

        return make_notebook(name=_('Creating table %(database)s.%(table)s') %
                             {
                                 'database': database,
                                 'table': final_table_name
                             },
                             editor_type=editor_type,
                             statement=sql.strip(),
                             status='ready',
                             database=database,
                             on_success_url=on_success_url,
                             last_executed=start_time,
                             is_task=True)
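
The INSERT built above leans on Python's repr of a list of tuples: stripping the outer brackets leaves something that reads as SQL row constructors. A small sketch with made-up rows (note that values are interpolated verbatim, so quotes inside the data would break the statement):

rows = [('1', 'Alice'), ('2', 'Bob')]          # hypothetical parsed CSV rows
csv_rows = str(rows)[1:-1]                     # "('1', 'Alice'), ('2', 'Bob')"
print('INSERT INTO db.tbl VALUES %s;' % csv_rows)
# INSERT INTO db.tbl VALUES ('1', 'Alice'), ('2', 'Bob');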
Example #15
0
    def create_table_from_a_file(self, source, destination, start_time=-1):
        if '.' in destination['name']:
            database, table_name = destination['name'].split('.', 1)
        else:
            database = 'default'
            table_name = destination['name']
        final_table_name = table_name

        table_format = destination['tableFormat']
        source_type = source['sourceType']

        columns = destination['columns']
        partition_columns = destination['partitionColumns']
        kudu_partition_columns = destination['kuduPartitionColumns']
        comment = destination['description']

        source_path = urllib_unquote(source['path'])
        external = not destination['useDefaultLocation']
        external_path = urllib_unquote(destination['nonDefaultLocation'])

        load_data = destination['importData']
        skip_header = destination['hasHeader']

        primary_keys = destination['primaryKeys']

        if destination['useCustomDelimiters']:
            field_delimiter = destination['customFieldDelimiter']
            collection_delimiter = destination['customCollectionDelimiter']
            map_delimiter = destination['customMapDelimiter']
        else:
            field_delimiter = ','
            collection_delimiter = r'\002'
            map_delimiter = r'\003'
        regexp_delimiter = destination['customRegexp']

        file_format = 'TextFile'
        row_format = 'Delimited'
        serde_name = ''
        serde_properties = ''
        extra_create_properties = ''
        sql = ''

        if source['inputFormat'] == 'manual':
            load_data = False
            source['format'] = {'quoteChar': '"', 'fieldSeparator': ','}

        if table_format == 'json':
            row_format = 'serde'
            serde_name = 'org.apache.hive.hcatalog.data.JsonSerDe'
        elif table_format == 'regexp':
            row_format = 'serde'
            serde_name = 'org.apache.hadoop.hive.serde2.RegexSerDe'
            serde_properties = '"input.regex" = "%s"' % regexp_delimiter
        elif table_format == 'csv':
            if source['format']['quoteChar'] == '"':
                source['format']['quoteChar'] = '\\"'
            row_format = 'serde'
            serde_name = 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
            serde_properties = '''"separatorChar" = "%(fieldSeparator)s",
    "quoteChar"     = "%(quoteChar)s",
    "escapeChar"    = "\\\\"
    ''' % source['format']

        if table_format in ('parquet', 'kudu'):
            if load_data:
                table_name, final_table_name = 'hue__tmp_%s' % table_name, table_name

                sql += '\n\nDROP TABLE IF EXISTS `%(database)s`.`%(table_name)s`;\n' % {
                    'database': database,
                    'table_name': table_name
                }
            else:  # Manual
                row_format = ''
                file_format = table_format
                skip_header = False
                if table_format == 'kudu':
                    columns = [
                        col for col in columns if col['name'] in primary_keys
                    ] + [
                        col
                        for col in columns if col['name'] not in primary_keys
                    ]

        if table_format == 'kudu':
            collection_delimiter = None
            map_delimiter = None

        if external or (load_data and table_format in ('parquet', 'kudu')):
            if not self.fs.isdir(external_path):  # File selected
                external_path, external_file_name = self.fs.split(
                    external_path)

                if len(self.fs.listdir(external_path)) > 1:
                    # If the directory holds more than just this file, create a dedicated data dir and move the file into it
                    external_path = external_path + '/%s_table' % external_file_name
                    self.fs.mkdir(external_path)
                    self.fs.rename(source_path, external_path)

        if external_path.lower().startswith("abfs"):  # check whether it is an ABFS path
            external_path = abfspath(external_path)

        sql += django_mako.render_to_string(
            "gen/create_table_statement.mako", {
                'table': {
                    'name': table_name,
                    'comment': comment,
                    'row_format': row_format,
                    'field_terminator': field_delimiter,
                    'collection_terminator': collection_delimiter if source_type == 'hive' else None,
                    'map_key_terminator': map_delimiter if source_type == 'hive' else None,
                    'serde_name': serde_name,
                    'serde_properties': serde_properties,
                    'file_format': file_format,
                    'external': external or (load_data and table_format in ('parquet', 'kudu')),
                    'path': external_path,
                    'skip_header': skip_header,
                    'primary_keys': primary_keys if table_format == 'kudu' and not load_data else [],
                },
                'columns': columns,
                'partition_columns': partition_columns,
                'kudu_partition_columns': kudu_partition_columns,
                'database': database
            })

        if table_format in ('text', 'json', 'csv', 'regexp') and not external and load_data:
            form_data = {
                'path': source_path,
                'overwrite': False,
                'partition_columns': [(partition['name'], partition['partitionValue'])
                                      for partition in partition_columns],
            }
            query_server_config = dbms.get_query_server_config(
                name=source_type)
            db = dbms.get(self.user, query_server=query_server_config)
            sql += "\n\n%s;" % db.load_data(
                database, table_name, form_data, None, generate_ddl_only=True)

        if load_data and table_format in ('parquet', 'kudu'):
            file_format = table_format
            if table_format == 'kudu':
                columns_list = [
                    '`%s`' % col for col in primary_keys + [
                        col['name'] for col in destination['columns']
                        if col['name'] not in primary_keys and col['keep']
                    ]
                ]
                extra_create_properties = """PRIMARY KEY (%(primary_keys)s)
        PARTITION BY HASH PARTITIONS 16
        STORED AS %(file_format)s
        TBLPROPERTIES(
        'kudu.num_tablet_replicas' = '1'
        )""" % {
                    'file_format': file_format,
                    'primary_keys': ', '.join(primary_keys)
                }
            else:
                columns_list = ['*']
                extra_create_properties = 'STORED AS %(file_format)s' % {
                    'file_format': file_format
                }
            sql += '''\n\nCREATE TABLE `%(database)s`.`%(final_table_name)s`%(comment)s
        %(extra_create_properties)s
        AS SELECT %(columns_list)s
        FROM `%(database)s`.`%(table_name)s`;''' % {
                'database': database,
                'final_table_name': final_table_name,
                'table_name': table_name,
                'extra_create_properties': extra_create_properties,
                'columns_list': ', '.join(columns_list),
                'comment': ' COMMENT "%s"' % comment if comment else ''
            }
            sql += '\n\nDROP TABLE IF EXISTS `%(database)s`.`%(table_name)s`;\n' % {
                'database': database,
                'table_name': table_name
            }

        editor_type = 'impala' if table_format == 'kudu' else destination[
            'sourceType']

        on_success_url = reverse('metastore:describe_table',
                                 kwargs={
                                     'database': database,
                                     'table': final_table_name
                                 }) + '?source_type=' + source_type

        return make_notebook(name=_('Creating table %(database)s.%(table)s') %
                             {
                                 'database': database,
                                 'table': final_table_name
                             },
                             editor_type=editor_type,
                             statement=sql.strip(),
                             status='ready',
                             database=database,
                             on_success_url=on_success_url,
                             last_executed=start_time,
                             is_task=True)
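
For Parquet and Kudu loads the example above stages the data first: the incoming table name is swapped to a `hue__tmp_` name, a CTAS builds the final table, and the staging table is dropped. The rename idiom in isolation (names are illustrative):

table_name = 'sales'                                               # hypothetical target table
table_name, final_table_name = 'hue__tmp_%s' % table_name, table_name
print(table_name, final_table_name)                                # -> hue__tmp_sales sales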
Example #16
0
  def query(self, collection, query):
    solr_query = {}
    json_facets = {}

    solr_query['collection'] = collection['name']

    if query.get('download'):
      solr_query['rows'] = 1000
      solr_query['start'] = 0
    else:
      solr_query['rows'] = int(collection['template']['rows'] or 10)
      solr_query['start'] = int(query['start'])

    solr_query['rows'] = min(solr_query['rows'], 1000)
    solr_query['start'] = min(solr_query['start'], 10000)

    params = self._get_params() + (
        ('q', self._get_q(query)),
        ('wt', 'json'),
        ('rows', solr_query['rows']),
        ('start', solr_query['start']),
    )

    if any(collection['facets']):
      params += (
        ('facet', 'true'),
        ('facet.mincount', 0),
        ('facet.limit', 10),
      )

      timeFilter = self._get_range_borders(collection, query)

      for facet in collection['facets']:
        if facet['type'] == 'query':
          params += (('facet.query', '%s' % facet['field']),)
        elif facet['type'] == 'range' or facet['type'] == 'range-up':
          keys = {
              'id': '%(id)s' % facet,
              'field': facet['field'],
              'key': '%(field)s-%(id)s' % facet,
              'start': facet['properties']['start'],
              'end': facet['properties']['end'],
              'gap': facet['properties']['gap'],
              'mincount': int(facet['properties']['mincount'])
          }

          if facet['properties']['canRange'] or timeFilter and timeFilter['time_field'] == facet['field'] and (facet['id'] not in timeFilter['time_filter_overrides'] or facet['widgetType'] != 'histogram-widget'):
            keys.update(self._get_time_filter_query(timeFilter, facet, collection))

          params += (
             ('facet.range', '{!key=%(key)s ex=%(id)s f.%(field)s.facet.range.start=%(start)s f.%(field)s.facet.range.end=%(end)s f.%(field)s.facet.range.gap=%(gap)s f.%(field)s.facet.mincount=%(mincount)s}%(field)s' % keys),
          )
        elif facet['type'] == 'field':
          keys = {
              'id': '%(id)s' % facet,
              'field': facet['field'],
              'key': '%(field)s-%(id)s' % facet,
              'limit': int(facet['properties'].get('limit', 10)) + (1 if facet['widgetType'] == 'facet-widget' else 0),
              'mincount': int(facet['properties']['mincount'])
          }

          params += (
              ('facet.field', '{!key=%(key)s ex=%(id)s f.%(field)s.facet.limit=%(limit)s f.%(field)s.facet.mincount=%(mincount)s}%(field)s' % keys),
          )
        elif facet['type'] == 'nested':
          _f = {}
          if facet['properties']['facets']:
            self._n_facet_dimension(facet, _f, facet['properties']['facets'], 1, timeFilter, collection, can_range = facet['properties']['canRange'])

          if facet['properties'].get('domain'):
            if facet['properties']['domain'].get('blockParent') or facet['properties']['domain'].get('blockChildren'):
              _f['domain'] = {}
              if facet['properties']['domain'].get('blockParent'):
                _f['domain']['blockParent'] = ' OR '.join(facet['properties']['domain']['blockParent'])
              if facet['properties']['domain'].get('blockChildren'):
                _f['domain']['blockChildren'] = ' OR '.join(facet['properties']['domain']['blockChildren'])

          if _f:
            sort = {'count': facet['properties']['facets'][0]['sort']}
            for i, agg in enumerate(self._get_dimension_aggregates(facet['properties']['facets'][1:])):
              if agg['sort'] != 'default':
                agg_function = self._get_aggregate_function(agg)
                sort = {'agg_%02d_%02d:%s' % (1, i, agg_function): agg['sort']}

            if sort.get('count') == 'default':
              sort['count'] = 'desc'

            dim_key = [key for key in list(_f['facet'].keys()) if 'dim' in key][0]
            _f['facet'][dim_key].update({
                  'excludeTags': facet['id'],
                  'offset': 0,
                  'numBuckets': True,
                  'allBuckets': True,
                  'sort': sort
                  #'prefix': '' # Forbidden on numeric fields
              })
            json_facets[facet['id']] = _f['facet'][dim_key]
        elif facet['type'] == 'function':
          if facet['properties']['facets']:
            json_facets[facet['id']] = self._get_aggregate_function(facet['properties']['facets'][0])
            if facet['properties']['compare']['is_enabled']:
              # TODO: global compare override
              unit = re.split('\d+', facet['properties']['compare']['gap'])[1]
              json_facets[facet['id']] = {
                'type': 'range',
                'field': collection['timeFilter'].get('field'),
                'start': 'NOW/%s-%s-%s' % (unit, facet['properties']['compare']['gap'], facet['properties']['compare']['gap']),
                'end': 'NOW/%s' % unit,
                'gap': '+%(gap)s' % facet['properties']['compare'],
                'facet': {facet['id']: json_facets[facet['id']]}
              }
            if facet['properties']['filter']['is_enabled']:
              json_facets[facet['id']] = {
                'type': 'query',
                'q': facet['properties']['filter']['query'] or EMPTY_QUERY.get(),
                'facet': {facet['id']: json_facets[facet['id']]}
              }
            json_facets['processEmpty'] = True
        elif facet['type'] == 'pivot':
          if facet['properties']['facets'] or facet['widgetType'] == 'map-widget':
            fields = facet['field']
            fields_limits = []
            for f in facet['properties']['facets']:
              fields_limits.append('f.%s.facet.limit=%s' % (f['field'], f['limit']))
              fields_limits.append('f.%s.facet.mincount=%s' % (f['field'], f['mincount']))
              fields += ',' + f['field']
            keys = {
                'id': '%(id)s' % facet,
                'key': '%(field)s-%(id)s' % facet,
                'field': facet['field'],
                'fields': fields,
                'limit': int(facet['properties'].get('limit', 10)),
                'mincount': int(facet['properties']['mincount']),
                'fields_limits': ' '.join(fields_limits)
            }
            params += (
                ('facet.pivot', '{!key=%(key)s ex=%(id)s f.%(field)s.facet.limit=%(limit)s f.%(field)s.facet.mincount=%(mincount)s %(fields_limits)s}%(fields)s' % keys),
            )

    params += self._get_fq(collection, query)

    fl = urllib_unquote(utf_quoter(','.join(Collection2.get_field_list(collection))))

    nested_fields = self._get_nested_fields(collection)
    if nested_fields:
      fl += urllib_unquote(utf_quoter(',[child parentFilter="%s"]' % ' OR '.join(nested_fields)))

    if collection['template']['moreLikeThis'] and fl != ['*']: # Potential conflict with nested documents
      id_field = collection.get('idField', 'id')
      params += (
        ('mlt', 'true'),
        ('mlt.fl', fl.replace(',%s' % id_field, '')),
        ('mlt.mintf', 1),
        ('mlt.mindf', 1),
        ('mlt.maxdf', 50),
        ('mlt.maxntp', 1000),
        ('mlt.count', 10),
        #('mlt.minwl', 1),
        #('mlt.maxwl', 1),
      )
      fl = '*'

    params += (('fl', fl),)

    params += (
      ('hl', 'true'),
      ('hl.fl', '*'),
      ('hl.snippets', 5),
      ('hl.fragsize', 1000),
    )

    #if query.get('timezone'):
    #  params += (('TZ', query.get('timezone')),)

    if collection['template']['fieldsSelected']:
      fields = []
      for field in collection['template']['fieldsSelected']:
        attribute_field = [attribute for attribute in collection['template']['fieldsAttributes'] if field == attribute['name']]
        if attribute_field:
          if attribute_field[0]['sort']['direction']:
            fields.append('%s %s' % (field, attribute_field[0]['sort']['direction']))
      if fields:
        params += (
          ('sort', ','.join(fields)),
        )

    if json_facets:
      response = self._root.post(
          '%(collection)s/select' % solr_query,
          params,
          data=json.dumps({'facet': json_facets}),
          contenttype='application/json')
    else:
      response = self._root.get('%(collection)s/select' % solr_query, params)

    return self._get_json(response)
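
The facet parameters above are plain %-formatted strings over a keys dict, producing Solr local-params of the form '{!key=... ex=...}field'. A sketch with a made-up facet definition:

keys = {                                       # hypothetical facet definition
    'id': '1234',
    'field': 'region',
    'key': 'region-1234',
    'limit': 10,
    'mincount': 0,
}
param = ('facet.field',
         '{!key=%(key)s ex=%(id)s f.%(field)s.facet.limit=%(limit)s '
         'f.%(field)s.facet.mincount=%(mincount)s}%(field)s' % keys)
print(param[1])
# {!key=region-1234 ex=1234 f.region.facet.limit=10 f.region.facet.mincount=0}region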
Example #17
0
def _small_indexing(user, fs, client, source, destination, index_name):
    kwargs = {}
    errors = []

    if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
        path = urllib_unquote(source["path"])
        stats = fs.stats(path)
        if stats.size > MAX_UPLOAD_SIZE:
            raise PopupException(_('File size is too large to handle!'))

    indexer = MorphlineIndexer(user, fs)

    fields = indexer.get_field_list(destination['columns'])
    _create_solr_collection(user, fs, client, destination, index_name, kwargs)

    if source['inputFormat'] == 'file':
        path = urllib_unquote(source["path"])
        data = fs.read(path, 0, MAX_UPLOAD_SIZE)

    if client.is_solr_six_or_more():
        kwargs['processor'] = 'tolerant'
        kwargs['map'] = 'NULL:'

    try:
        if source['inputFormat'] == 'query':
            query_id = source['query']['id'] if source['query'].get(
                'id') else source['query']

            notebook = Notebook(document=Document2.objects.document(
                user=user, doc_id=query_id)).get_data()
            request = MockedDjangoRequest(user=user)
            snippet = notebook['snippets'][0]

            searcher = CollectionManagerController(user)
            columns = [
                field['name'] for field in fields if field['name'] != 'hue_id'
            ]
            # Assumes handle still live
            fetch_handle = lambda rows, start_over: get_api(
                request, snippet).fetch_result(
                    notebook, snippet, rows=rows, start_over=start_over)
            rows = searcher.update_data_from_hive(index_name,
                                                  columns,
                                                  fetch_handle=fetch_handle,
                                                  indexing_options=kwargs)
            # TODO if rows == MAX_ROWS truncation warning
        elif source['inputFormat'] == 'manual':
            pass  # No need to do anything
        else:
            response = client.index(name=index_name, data=data, **kwargs)
            errors = [
                error.get('message', '')
                for error in response['responseHeader'].get('errors', [])
            ]
    except Exception as e:
        try:
            client.delete_index(index_name, keep_config=False)
        except Exception as e2:
            LOG.warn(
                'Error while cleaning-up config of failed collection creation %s: %s'
                % (index_name, e2))
        raise e

    return {
        'status': 0,
        'on_success_url': reverse('indexer:indexes',
                                  kwargs={'index': index_name}),
        'pub_sub_url': 'assist.collections.refresh',
        'errors': errors
    }
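
The query branch above does not read results itself; it hands the indexer a callable that pages through an already-open result set. The closure pattern in isolation (the `api`, `notebook` and `snippet` names below are placeholders, not part of the original code):

def make_fetch_handle(api, notebook, snippet):
    # Returns a callable the indexer can use to page through an existing result set
    return lambda rows, start_over: api.fetch_result(
        notebook, snippet, rows=rows, start_over=start_over)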
Example #18
0
File: sql.py Project: quyuehui/hue
    def create_table_from_local_file(self, source, destination, start_time=-1):
        if '.' in destination['name']:
            database, table_name = destination['name'].split('.', 1)
        else:
            database = 'default'
            table_name = destination['name']
        final_table_name = table_name

        source_type = source['sourceType']
        editor_type = destination['sourceType']

        columns = destination['columns']

        if editor_type in ('hive', 'mysql'):

            if editor_type == 'mysql':
                for col in columns:
                    if col['type'] == 'string':
                        col['type'] = 'VARCHAR(255)'

            sql = '''CREATE TABLE IF NOT EXISTS %(database)s.%(table_name)s (
%(columns)s);
      ''' % {
                'database': database,
                'table_name': table_name,
                'columns': ',\n'.join(['  `%(name)s` %(type)s' % col for col in columns]),
            }

        path = urllib_unquote(source['path'])

        if path:  # data insertion
            with open(BASE_DIR + path, 'r') as local_file:
                reader = csv.reader(local_file)
                list_of_tuples = list(map(tuple, reader))

                if source['format']['hasHeader']:
                    list_of_tuples = list_of_tuples[1:]

                csv_rows = str(list_of_tuples)[1:-1]

                if editor_type in ('hive', 'mysql'):
                    sql += '''\nINSERT INTO %(database)s.%(table_name)s VALUES %(csv_rows)s;
          ''' % {
                        'database': database,
                        'table_name': table_name,
                        'csv_rows': csv_rows
                    }

        on_success_url = reverse('metastore:describe_table', kwargs={'database': database, 'table': final_table_name}) + \
            '?source_type=' + source_type

        return make_notebook(name=_('Creating table %(database)s.%(table)s') %
                             {
                                 'database': database,
                                 'table': final_table_name
                             },
                             editor_type=editor_type,
                             statement=sql.strip(),
                             status='ready',
                             database=database,
                             on_success_url=on_success_url,
                             last_executed=start_time,
                             is_task=True)
Example #19
0
def _large_indexing(request,
                    file_format,
                    collection_name,
                    query=None,
                    start_time=None,
                    lib_path=None,
                    destination=None):
    indexer = MorphlineIndexer(request.user, request.fs)

    unique_field = indexer.get_unique_field(file_format)
    is_unique_generated = indexer.is_unique_generated(file_format)

    schema_fields = indexer.get_kept_field_list(file_format['columns'])
    if is_unique_generated:
        schema_fields += [{"name": unique_field, "type": "string"}]

    client = SolrClient(user=request.user)

    if not client.exists(collection_name) and not request.POST.get(
            'show_command'):  # if destination['isTargetExisting']:
        client.create_index(name=collection_name,
                            fields=request.POST.get('fields', schema_fields),
                            unique_key_field=unique_field
                            # No df currently
                            )
    else:
        # TODO: check if format matches
        pass

    if file_format['inputFormat'] == 'table':
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'],
                                      table_name=file_format['tableName'])
        input_path = table_metadata.path_location
    elif file_format['inputFormat'] == 'stream' and file_format[
            'streamSelection'] == 'flume':
        indexer = FlumeIndexer(user=request.user)
        if request.POST.get('show_command'):
            configs = indexer.generate_config(file_format, destination)
            return {'status': 0, 'commands': configs[-1]}
        else:
            return indexer.start(collection_name, file_format, destination)
    elif file_format['inputFormat'] == 'stream':
        return _envelope_job(request,
                             file_format,
                             destination,
                             start_time=start_time,
                             lib_path=lib_path)
    elif file_format['inputFormat'] == 'file':
        input_path = '${nameNode}%s' % urllib_unquote(file_format["path"])
    else:
        input_path = None

    morphline = indexer.generate_morphline_config(collection_name,
                                                  file_format,
                                                  unique_field,
                                                  lib_path=lib_path)

    return indexer.run_morphline(request,
                                 collection_name,
                                 morphline,
                                 input_path,
                                 query,
                                 start_time=start_time,
                                 lib_path=lib_path)
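
For plain file input the example above percent-decodes the path and prefixes it with `${nameNode}`, presumably left for the workflow engine to resolve later. The composition in isolation (the path is made up):

from urllib.parse import unquote as urllib_unquote  # assumed alias of the stdlib percent-decoder

file_format = {"path": "/user/demo/web%20logs"}      # hypothetical input
input_path = '${nameNode}%s' % urllib_unquote(file_format["path"])
print(input_path)                                    # -> ${nameNode}/user/demo/web logs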
Example #20
0
File: api3.py Project: mapr/hue
def importer_submit(request):
    source = json.loads(request.POST.get('source', '{}'))
    outputFormat = json.loads(request.POST.get('destination',
                                               '{}'))['outputFormat']
    destination = json.loads(request.POST.get('destination', '{}'))
    destination['ouputFormat'] = outputFormat  # Workaround a very weird bug
    start_time = json.loads(request.POST.get('start_time', '-1'))

    file_encoding = None
    if source['inputFormat'] == 'file':
        if source['path']:
            path = urllib_unquote(source['path'])
            if path[-3:] == 'xls' or path[-4:] == 'xlsx':
                path = excel_to_csv_file_name_change(path)
            source['path'] = request.fs.netnormpath(path)
            stream = request.fs.open(path)
            file_encoding = check_encoding(stream.read(10000))

    if destination['ouputFormat'] in ('database',
                                      'table') and request.fs is not None:
        destination['nonDefaultLocation'] = request.fs.netnormpath(destination['nonDefaultLocation']) \
            if destination['nonDefaultLocation'] else destination['nonDefaultLocation']

    if destination['ouputFormat'] == 'index':
        source['columns'] = destination['columns']
        index_name = destination["name"]

        if destination['indexerRunJob'] or source['inputFormat'] == 'stream':
            _convert_format(source["format"], inverse=True)
            job_handle = _large_indexing(
                request,
                source,
                index_name,
                start_time=start_time,
                lib_path=destination['indexerJobLibPath'],
                destination=destination)
        else:
            client = SolrClient(request.user)
            job_handle = _small_indexing(request.user, request.fs, client,
                                         source, destination, index_name)
    elif destination['ouputFormat'] == 'stream-table':
        args = {
            'source': source,
            'destination': destination,
            'start_time': start_time,
            'dry_run': request.POST.get('show_command')
        }
        api = FlinkIndexer(request.user, request.fs)

        job_nb = api.create_table_from_kafka(**args)

        if request.POST.get('show_command'):
            job_handle = {'status': 0, 'commands': job_nb}
        else:
            job_handle = job_nb.execute(request, batch=False)
    elif source['inputFormat'] == 'altus':
        # BDR copy or DistCP + DDL + Sentry DDL copy
        pass
    elif source['inputFormat'] == 'rdbms':
        if destination['outputFormat'] in ('database', 'file', 'table',
                                           'hbase'):
            job_handle = run_sqoop(request, source, destination, start_time)
    elif destination['ouputFormat'] == 'database':
        job_handle = _create_database(request, source, destination, start_time)
    elif destination['ouputFormat'] == 'big-table':
        args = {
            'request': request,
            'source': source,
            'destination': destination,
            'start_time': start_time,
            'dry_run': request.POST.get('show_command')
        }
        api = PhoenixIndexer(request.user, request.fs)

        job_nb = api.create_table_from_file(**args)

        if request.POST.get('show_command'):
            job_handle = {'status': 0, 'commands': job_nb}
        else:
            job_handle = job_nb.execute(request, batch=False)
    else:
        if source['inputFormat'] == 'localfile':
            job_handle = _create_table_from_local(request, source, destination,
                                                  start_time)
        else:
            # TODO: if inputFormat is 'stream' and tableFormat is 'kudu' --> create Table only
            job_handle = _create_table(request, source, destination,
                                       start_time, file_encoding)

    request.audit = {
        'operation': 'EXPORT',
        'operationText':
        'User %(username)s exported %(inputFormat)s to %(ouputFormat)s: %(name)s'
        % {
            'username': request.user.username,
            'inputFormat': source['inputFormat'],
            'ouputFormat': destination['ouputFormat'],
            'name': destination['name'],
        },
        'allowed': True
    }

    return JsonResponse(job_handle)
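
The Excel detection above slices the last characters of the path. A sketch comparing it with str.endswith, which is arguably clearer and, unlike the slice, does not match names that merely end in 'xls' without a dot:

path = 'user_uploads/report.xlsx'                    # hypothetical upload path
is_excel = path[-3:] == 'xls' or path[-4:] == 'xlsx'
is_excel_alt = path.endswith(('.xls', '.xlsx'))
print(is_excel, is_excel_alt)                        # -> True True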
Example #21
0
File: api3.py Project: mapr/hue
def guess_field_types(request):
    file_format = json.loads(request.POST.get('fileFormat', '{}'))

    if file_format['inputFormat'] == 'localfile':
        path = urllib_unquote(file_format['path'])

        with open(path, 'r') as local_file:

            reader = csv.reader(local_file)
            csv_data = list(reader)

            if file_format['format']['hasHeader']:
                sample = csv_data[1:5]
                column_row = [
                    re.sub('[^0-9a-zA-Z]+', '_', col) for col in csv_data[0]
                ]
            else:
                sample = csv_data[:4]
                column_row = [
                    'field_' + str(count + 1)
                    for count, col in enumerate(sample[0])
                ]

            field_type_guesses = []
            for count, col in enumerate(column_row):
                column_samples = [
                    sample_row[count] for sample_row in sample
                    if len(sample_row) > count
                ]
                field_type_guess = guess_field_type_from_samples(
                    column_samples)
                field_type_guesses.append(field_type_guess)

            columns = [
                Field(column_row[count], field_type_guesses[count]).to_dict()
                for count, col in enumerate(column_row)
            ]

            format_ = {'columns': columns, 'sample': sample}

    elif file_format['inputFormat'] == 'file':
        indexer = MorphlineIndexer(request.user, request.fs)
        path = urllib_unquote(file_format["path"])
        if path[-3:] == 'xls' or path[-4:] == 'xlsx':
            path = excel_to_csv_file_name_change(path)
        stream = request.fs.open(path)
        encoding = check_encoding(stream.read(10000))
        LOG.debug('File %s encoding is %s' % (path, encoding))
        stream.seek(0)
        _convert_format(file_format["format"], inverse=True)

        format_ = indexer.guess_field_types({
            "file": {
                "stream": stream,
                "name": path
            },
            "format": file_format['format']
        })

        # Note: Would also need to set charset to table (only supported in Hive)
        if 'sample' in format_ and format_['sample']:
            format_['sample'] = escape_rows(format_['sample'],
                                            nulls_only=True,
                                            encoding=encoding)
        for col in format_['columns']:
            col['name'] = smart_unicode(col['name'],
                                        errors='replace',
                                        encoding=encoding)

    elif file_format['inputFormat'] == 'table':
        sample = get_api(request, {
            'type': 'hive'
        }).get_sample_data({'type': 'hive'},
                           database=file_format['databaseName'],
                           table=file_format['tableName'])
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'],
                                      table_name=file_format['tableName'])

        format_ = {
            "sample": sample['rows'][:4],
            "columns": [
                Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
                for col in table_metadata.cols
            ]
        }
    elif file_format['inputFormat'] == 'query':
        query_id = file_format['query']['id'] if file_format['query'].get(
            'id') else file_format['query']

        notebook = Notebook(document=Document2.objects.document(
            user=request.user, doc_id=query_id)).get_data()
        snippet = notebook['snippets'][0]
        db = get_api(request, snippet)

        if file_format.get('sampleCols'):
            columns = file_format.get('sampleCols')
            sample = file_format.get('sample')
        else:
            snippet['query'] = snippet['statement']
            try:
                sample = db.fetch_result(notebook, snippet, 4,
                                         start_over=True)['rows'][:4]
            except Exception as e:
                LOG.warning(
                    'Skipping sample data as query handle might be expired: %s'
                    % e)
                sample = [[], [], [], [], []]
            columns = db.autocomplete(snippet=snippet, database='', table='')
            columns = [
                Field(
                    col['name'],
                    HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'],
                                                        'string')).to_dict()
                for col in columns['extended_columns']
            ]
        format_ = {
            "sample": sample,
            "columns": columns,
        }
    elif file_format['inputFormat'] == 'rdbms':
        api = _get_api(request)
        sample = api.get_sample_data(None,
                                     database=file_format['rdbmsDatabaseName'],
                                     table=file_format['tableName'])

        format_ = {
            "sample": list(sample['rows'])[:4],
            "columns": [
                Field(col['name'], col['type']).to_dict()
                for col in sample['full_headers']
            ]
        }
    elif file_format['inputFormat'] == 'stream':
        if file_format['streamSelection'] == 'kafka':
            data = get_topic_data(request.user,
                                  file_format.get('kafkaSelectedTopics'))

            kafkaFieldNames = [col['name'] for col in data['full_headers']]
            kafkaFieldTypes = [col['type'] for col in data['full_headers']]
            topics_data = data['rows']

            format_ = {
                "sample": topics_data,
                "columns": [
                    Field(col, 'string', unique=False).to_dict()
                    for col in kafkaFieldNames
                ]
            }
        elif file_format['streamSelection'] == 'flume':
            if 'hue-httpd/access_log' in file_format['channelSourcePath']:
                columns = [
                    {'name': 'id', 'type': 'string', 'unique': True},
                    {'name': 'client_ip', 'type': 'string'},
                    {'name': 'time', 'type': 'date'},
                    {'name': 'request', 'type': 'string'},
                    {'name': 'code', 'type': 'plong'},
                    {'name': 'bytes', 'type': 'plong'},
                    {'name': 'method', 'type': 'string'},
                    {'name': 'url', 'type': 'string'},
                    {'name': 'protocol', 'type': 'string'},
                    {'name': 'app', 'type': 'string'},
                    {'name': 'subapp', 'type': 'string'},
                ]
            else:
                columns = [{'name': 'message', 'type': 'string'}]

            format_ = {
                "sample": [['...'] * len(columns)] * 4,
                "columns": [
                    Field(col['name'],
                          HiveFormat.FIELD_TYPE_TRANSLATE.get(
                              col['type'], 'string'),
                          unique=col.get('unique')).to_dict()
                    for col in columns
                ]
            }
    elif file_format['inputFormat'] == 'connector':
        if file_format['connectorSelection'] == 'sfdc':
            sf = Salesforce(username=file_format['streamUsername'],
                            password=file_format['streamPassword'],
                            security_token=file_format['streamToken'])
            table_metadata = [{
                'name': column['name'],
                'type': column['type']
            } for column in sf.restful('sobjects/%(streamObject)s/describe/' %
                                       file_format)['fields']]
            query = 'SELECT %s FROM %s LIMIT 4' % (', '.join(
                [col['name']
                 for col in table_metadata]), file_format['streamObject'])
            print(query)

            try:
                records = sf.query_all(query)
            except SalesforceRefusedRequest as e:
                raise PopupException(message=str(e))

            format_ = {
                "sample": [list(row.values())[1:] for row in records['records']],
                "columns": [
                    Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
                    for col in table_metadata
                ]
            }
        else:
            raise PopupException(
                _('Connector format not recognized: %(connectorSelection)s') %
                file_format)
    else:
        raise PopupException(
            _('Input format not recognized: %(inputFormat)s') % file_format)

    return JsonResponse(format_)
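
For local files the column names above are sanitized with a single regex: every run of non-alphanumeric characters in a header becomes one underscore. In isolation (headers are made up):

import re

headers = ['Order ID', 'unit-price ($)', 'région']   # hypothetical CSV header row
print([re.sub('[^0-9a-zA-Z]+', '_', col) for col in headers])
# -> ['Order_ID', 'unit_price_', 'r_gion']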
Example #22
0
  def create_table_from_file(self, request, source, destination, start_time=-1, dry_run=False):
    if '.' in destination['name']:
      database, table_name = destination['name'].split('.', 1)
    else:
      database = 'default'
      table_name = destination['name']
    final_table_name = table_name

    source_type = [interpreter['type'] for interpreter in get_ordered_interpreters(self.user) if interpreter['dialect'] == 'phoenix'][0]
    editor_type = source_type

    columns = destination['columns']

    # Until we have proper type conversion
    for col in columns:
      if col['type'] == 'string':
        col['type'] = 'varchar'

    sql = '''CREATE TABLE IF NOT EXISTS %(table_name)s (
%(columns)s
CONSTRAINT my_pk PRIMARY KEY (%(primary_keys)s)
);
''' % {
          'database': database,
          'table_name': table_name,
          'columns': ',\n'.join(['  %(name)s %(type)s' % col for col in columns]),
          'primary_keys': ', '.join(destination.get('indexerPrimaryKey'))
      }

    source_path = urllib_unquote(source['path'])
    if source['inputFormat'] == 'file':
      file_obj = request.fs.open(source_path)
      content = file_obj.read().decode("utf-8")
      csvfile = string_io(content)
      reader = csv.reader(csvfile)
    else:
      local_file = open(source_path, 'r')
      reader = csv.reader(local_file)

    if destination['indexerRunJob']:
      for count, csv_row in enumerate(reader):
        if (source['format']['hasHeader'] and count == 0) or not csv_row:
            continue
        else:
          _sql = ', '.join([ "'{0}'".format(col_val) if columns[count]['type'] in ('varchar', 'timestamp') \
            else '{0}'.format(col_val) for count, col_val in enumerate(csv_row)])

          sql += '''\nUPSERT INTO %(table_name)s VALUES (%(csv_row)s);\n''' % {
            'database': database,
            'table_name': table_name,
            'csv_row': _sql
          }
   
    if dry_run:
      return sql
    else:
      on_success_url = reverse('metastore:describe_table', kwargs={'database': database, 'table': final_table_name}) + \
          '?source_type=' + source_type

      return make_notebook(
          name=_('Creating table %(database)s.%(table)s') % {'database': database, 'table': final_table_name},
          editor_type=editor_type,
          statement=sql.strip(),
          status='ready',
          database=database,
          on_success_url=on_success_url,
          last_executed=start_time,
          is_task=True
      )
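
The UPSERT rows above quote a value only when its column type is character-like or a timestamp; everything else is interpolated bare, and no escaping of embedded quotes is attempted. A sketch with made-up metadata:

columns = [{'type': 'varchar'}, {'type': 'integer'}]  # hypothetical column metadata
csv_row = ['Alice', '42']
values = ', '.join(["'{0}'".format(v) if columns[i]['type'] in ('varchar', 'timestamp')
                    else '{0}'.format(v) for i, v in enumerate(csv_row)])
print('UPSERT INTO t VALUES (%s);' % values)          # -> UPSERT INTO t VALUES ('Alice', 42);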