def execute_and_watch(request):
  notebook_id = request.GET.get('editor', request.GET.get('notebook'))
  snippet_id = int(request.GET['snippet'])
  action = request.GET['action']
  destination = request.GET['destination']

  notebook = Notebook(document=Document2.objects.get(id=notebook_id)).get_data()
  snippet = notebook['snippets'][snippet_id]
  editor_type = snippet['type']

  api = get_api(request, snippet)

  if action == 'save_as_table':
    sql, success_url = api.export_data_as_table(notebook, snippet, destination)
    editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute')
  elif action == 'insert_as_query':
    sql, success_url = api.export_large_data_to_hdfs(notebook, snippet, destination)
    editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute')
  else:
    raise PopupException(_('Action %s is unknown') % action)

  return render('editor.mako', request, {
      'notebooks_json': json.dumps([editor.get_data()]),
      'options_json': json.dumps({
          'languages': [{"name": "%s SQL" % editor_type.title(), "type": editor_type}],
          'mode': 'editor',
          'success_url': success_url
      }),
      'editor_type': editor_type,
  })
def create_database(request, source, destination):
  database = destination['name']
  comment = destination['description']
  use_default_location = destination['useDefaultLocation']
  external_path = destination['nonDefaultLocation']

  sql = django_mako.render_to_string("gen/create_database_statement.mako", {
      'database': {
          'name': database,
          'comment': comment,
          'use_default_location': use_default_location,
          'external_location': external_path,
          'properties': [],
      }
  })

  editor_type = 'hive'
  on_success_url = reverse('metastore:show_tables', kwargs={'database': database})

  try:
    notebook = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready', on_success_url=on_success_url)
    return notebook.execute(request, batch=False)
  except Exception as e:
    raise PopupException(_('The table could not be created.'), detail=e.message)
def drop_partition(request, database, table):
  source_type = request.POST.get('source_type', 'hive')
  cluster = json.loads(request.POST.get('cluster', '{}'))

  db = _get_db(user=request.user, source_type=source_type, cluster=cluster)

  if request.method == 'POST':
    partition_specs = request.POST.getlist('partition_selection')
    partition_specs = [spec for spec in partition_specs]
    try:
      if request.GET.get("format", "html") == "json":
        last_executed = json.loads(request.POST.get('start_time'), '-1')
        sql = db.drop_partitions(database, table, partition_specs, design=None, generate_ddl_only=True)
        job = make_notebook(
            name=_('Drop partition %s') % ', '.join(partition_specs)[:100],
            editor_type=source_type,
            statement=sql.strip(),
            status='ready',
            database=None,
            on_success_url='assist.db.refresh',
            is_task=True,
            last_executed=last_executed
        )
        return JsonResponse(job.execute(request))
      else:
        design = SavedQuery.create_empty(app_name='beeswax', owner=request.user, data=hql_query('').dumps())
        query_history = db.drop_partitions(database, table, partition_specs, design)
        url = reverse('beeswax:watch_query_history', kwargs={'query_history_id': query_history.id}) + '?on_success_url=' + \
            reverse('metastore:describe_partitions', kwargs={'database': database, 'table': table})
        return redirect(url)
    except Exception as ex:
      error_message, log = dbms.expand_exception(ex, db)
      error = _("Failed to remove %(partition)s. Error: %(error)s") % {'partition': '\n'.join(partition_specs), 'error': error_message}
      raise PopupException(error, title=_("DB Error"), detail=log)
def drop_table(request, database):
  db = dbms.get(request.user)

  if request.method == 'POST':
    try:
      tables = request.POST.getlist('table_selection')
      tables_objects = [db.get_table(database, table) for table in tables]
      skip_trash = request.POST.get('skip_trash') == 'on'

      if request.POST.get('is_embeddable'):
        sql = db.drop_tables(database, tables_objects, design=None, skip_trash=skip_trash, generate_ddl_only=True)
        job = make_notebook(
            name='Execute and watch',
            editor_type='hive',
            statement=sql.strip(),
            status='ready',
            database=database,
            on_success_url='assist.db.refresh',
            is_task=True
        )
        return JsonResponse(job.execute(request))
      else:
        # Can't be simpler without an important refactoring
        design = SavedQuery.create_empty(app_name='beeswax', owner=request.user, data=hql_query('').dumps())
        query_history = db.drop_tables(database, tables_objects, design, skip_trash=skip_trash)
        url = reverse('beeswax:watch_query_history', kwargs={'query_history_id': query_history.id}) + '?on_success_url=' + reverse('metastore:show_tables', kwargs={'database': database})
        return redirect(url)
    except Exception as ex:
      error_message, log = dbms.expand_exception(ex, db)
      error = _("Failed to remove %(tables)s. Error: %(error)s") % {'tables': ','.join(tables), 'error': error_message}
      raise PopupException(error, title=_("Hive Error"), detail=log)
def drop_database(request):
  db = dbms.get(request.user)

  if request.method == 'POST':
    databases = request.POST.getlist('database_selection')

    try:
      design = SavedQuery.create_empty(app_name='beeswax', owner=request.user, data=hql_query('').dumps())

      if request.POST.get('is_embeddable'):
        sql = db.drop_databases(databases, design, generate_ddl_only=True)
        job = make_notebook(
            name='Execute and watch',
            editor_type='hive',
            statement=sql.strip(),
            status='ready',
            database=None,
            on_success_url='assist.db.refresh',
            is_task=True
        )
        return JsonResponse(job.execute(request))
      else:
        query_history = db.drop_databases(databases, design)
        url = reverse('beeswax:watch_query_history', kwargs={'query_history_id': query_history.id}) + '?on_success_url=' + reverse('metastore:databases')
        return redirect(url)
    except Exception as ex:
      error_message, log = dbms.expand_exception(ex, db)
      error = _("Failed to remove %(databases)s. Error: %(error)s") % {'databases': ','.join(databases), 'error': error_message}
      raise PopupException(error, title=_("Hive Error"), detail=log)
def install_pig_script(self, sample_user):
  doc2 = None
  name = _('UpperText')

  if Document2.objects.filter(owner=sample_user, name=name, type='query-pig', is_history=False).exists():
    LOG.info("Sample pig editor script already installed.")
    doc2 = Document2.objects.get(owner=sample_user, name=name, type='query-pig', is_history=False)
  else:
    statement = """REGISTER hdfs://{}/piggybank.jar;

data = LOAD '{}/data/midsummer.txt' as (text:CHARARRAY);

upper_case = FOREACH data GENERATE org.apache.pig.piggybank.evaluation.string.UPPER(text);

STORE upper_case INTO '$output';
""".format(REMOTE_SAMPLE_DIR.get(), REMOTE_SAMPLE_DIR.get())
    snippet_properties = {
        'hadoopProperties': [],
        'parameters': [],
        'resources': []
    }

    notebook = make_notebook(
        name=name,
        description=_('UpperText: Example Pig script'),
        editor_type='pig',
        statement=statement,
        status='ready',
        snippet_properties=snippet_properties,
        is_saved=True
    )

    # Remove files, functions, settings from snippet properties
    data = notebook.get_data()
    data['snippets'][0]['properties'].pop('files')
    data['snippets'][0]['properties'].pop('functions')
    data['snippets'][0]['properties'].pop('settings')

    try:
      with transaction.atomic():
        doc2 = Document2.objects.create(
            owner=sample_user,
            name=data['name'],
            type='query-pig',
            description=data['description'],
            data=json.dumps(data)
        )
    except Exception as e:
      LOG.exception("Failed to create sample pig script document: %s" % e)
      # Just to be sure we delete the Doc2 object in case of exception.
      # Possible when there are mixed InnoDB and MyISAM tables
      if doc2 and Document2.objects.filter(id=doc2.id).exists():
        doc2.delete()
def kill_query(self, query_id, request):
  kill_sql = 'KILL QUERY "%s";' % query_id

  job = make_notebook(
      name=_('Kill query %s') % query_id,
      editor_type='hive',
      statement=kill_sql,
      status='ready',
      on_success_url='assist.db.refresh',
      is_task=False,
  )
  job.execute_and_wait(request)
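# A minimal usage sketch, not taken from the original sources: it only illustrates the two
# execution styles visible throughout this file. kill_query() above runs its statement
# synchronously via execute_and_wait() with is_task=False; the DDL helpers below instead
# submit with is_task=True and job.execute(request). The helper name and the notebook name
# below are hypothetical; make_notebook() and its keyword arguments are taken from the
# surrounding code.
def _run_statement_blocking(statement, request, editor_type='hive'):
  # Synchronous, fire-and-forget style (same pattern as kill_query above).
  job = make_notebook(
      name='Ad hoc statement',  # hypothetical name
      editor_type=editor_type,
      statement=statement,
      status='ready',
      is_task=False,
  )
  return job.execute_and_wait(request)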
def browse(request, database, table, partition_spec=None):
  snippet = {'type': request.POST.get('sourceType', 'hive')}

  statement = get_api(request, snippet).get_browse_query(snippet, database, table, partition_spec)
  editor_type = snippet['type']

  if request.method == 'POST':
    notebook = make_notebook(name='Execute and watch', editor_type=editor_type, statement=statement, status='ready-execute', is_task=True)
    return JsonResponse(notebook.execute(request, batch=False))
  else:
    editor = make_notebook(name='Browse', editor_type=editor_type, statement=statement, status='ready-execute')
    return render('editor.mako', request, {
        'notebooks_json': json.dumps([editor.get_data()]),
        'options_json': json.dumps({
            'languages': get_ordered_interpreters(request.user),
            'mode': 'editor',
            'editor_type': editor_type
        }),
        'editor_type': editor_type,
    })
def drop_table(request, database):
  source_type = request.POST.get('source_type', 'hive')
  db = _get_db(user=request.user, source_type=source_type)

  if request.method == 'POST':
    try:
      tables = request.POST.getlist('table_selection')
      tables_objects = [db.get_table(database, table) for table in tables]
      skip_trash = request.POST.get('skip_trash') == 'on'

      if request.POST.get('is_embeddable'):
        last_executed = json.loads(request.POST.get('start_time'), '-1')
        sql = db.drop_tables(database, tables_objects, design=None, skip_trash=skip_trash, generate_ddl_only=True)
        job = make_notebook(
            name=_('Drop table %s') % ', '.join([table.name for table in tables_objects])[:100],
            editor_type=source_type,
            statement=sql.strip(),
            status='ready',
            database=database,
            on_success_url='assist.db.refresh',
            is_task=True,
            last_executed=last_executed
        )
        return JsonResponse(job.execute(request))
      else:
        # Can't be simpler without an important refactoring
        design = SavedQuery.create_empty(app_name='beeswax', owner=request.user, data=hql_query('').dumps())
        query_history = db.drop_tables(database, tables_objects, design, skip_trash=skip_trash)
        url = reverse('beeswax:watch_query_history', kwargs={'query_history_id': query_history.id}) + '?on_success_url=' + reverse(
            'metastore:show_tables', kwargs={'database': database})
        return redirect(url)
    except Exception as ex:
      error_message, log = dbms.expand_exception(ex, db)
      error = _("Failed to remove %(tables)s. Error: %(error)s") % {'tables': ','.join(tables), 'error': error_message}
      raise PopupException(error, title=_("DB Error"), detail=log)
def create_table_from_kafka(self, source, destination, start_time=-1, dry_run=False):
  if '.' in destination['name']:
    database, table_name = destination['name'].split('.', 1)
  else:
    database = 'default'
    table_name = destination['name']
  final_table_name = table_name

  source_type = source['sourceType']

  interpreter = _get_interpreter_from_dialect('flink', self.user)
  editor_type = interpreter['type']  # destination['sourceType']

  columns = destination['columns']

  sql = '''CREATE TABLE %(table_name)s (
%(columns)s
) WITH (
  'connector' = 'kafka',
  'topic' = '%(topic)s',
  'scan.startup.mode' = 'earliest-offset',
  'properties.bootstrap.servers' = 'kafka:9094',
  'format' = 'json'
);''' % {
      'database': database,
      'table_name': table_name,
      'columns': ',\n'.join([' %(name)s %(type)s' % col for col in columns]),
      'topic': source.get('kafkaSelectedTopics')
  }

  if dry_run:
    return sql
  else:
    on_success_url = reverse('metastore:describe_table', kwargs={'database': database, 'table': final_table_name}) + \
        '?source_type=' + source_type

    return make_notebook(
        name=_('Creating table %(database)s.%(table)s') % {'database': database, 'table': final_table_name},
        editor_type=editor_type,
        statement=sql.strip(),
        status='ready',
        database=database,
        on_success_url=on_success_url,
        last_executed=start_time,
        is_task=True
    )
def _install_mapreduce_example(self):
  doc2 = None
  name = _('MapReduce Sleep Job')

  if Document2.objects.filter(owner=self.user, name=name, type='query-mapreduce', is_history=False).exists():
    LOG.info("Sample mapreduce editor job already installed.")
    doc2 = Document2.objects.get(owner=self.user, name=name, type='query-mapreduce', is_history=False)
  else:
    snippet_properties = {
        'app_jar': '/user/hue/oozie/workspaces/lib/hadoop-examples.jar',
        'hadoopProperties': [
            'mapred.mapper.class=org.apache.hadoop.examples.SleepJob',
            'mapred.reducer.class=org.apache.hadoop.examples.SleepJob',
            'mapred.mapoutput.key.class=org.apache.hadoop.io.IntWritable',
            'mapred.mapoutput.value.class=org.apache.hadoop.io.NullWritable',
            'mapred.output.format.class=org.apache.hadoop.mapred.lib.NullOutputFormat',
            'mapred.input.format.class=org.apache.hadoop.examples.SleepJob$SleepInputFormat',
            'mapred.partitioner.class=org.apache.hadoop.examples.SleepJob',
            'sleep.job.map.sleep.time=5',
            'sleep.job.reduce.sleep.time=10'
        ],
        'archives': [],
        'jars': []
    }

    notebook = make_notebook(
        name=name,
        description=_('Sleep: Example MapReduce job'),
        editor_type='mapreduce',
        statement='',
        status='ready',
        snippet_properties=snippet_properties,
        is_saved=True
    )

    # Remove files, functions, settings from snippet properties
    data = notebook.get_data()
    data['snippets'][0]['properties'].pop('functions')
    data['snippets'][0]['properties'].pop('settings')

    try:
      with transaction.atomic():
        doc2 = Document2.objects.create(
            owner=self.user,
            name=data['name'],
            type='query-mapreduce',
            description=data['description'],
            data=json.dumps(data)
        )
    except Exception as e:
      LOG.exception("Failed to create sample mapreduce job document: %s" % e)
      # Just to be sure we delete the Doc2 object in case of exception.
      # Possible when there are mixed InnoDB and MyISAM tables
      if doc2 and Document2.objects.filter(id=doc2.id).exists():
        doc2.delete()
def _install_pyspark_example(self):
  doc2 = None
  name = _('PySpark Pi Estimator Job')

  if Document2.objects.filter(owner=self.user, name=name, type='query-spark2', is_history=False).exists():
    LOG.info("Sample pyspark editor job already installed.")
    doc2 = Document2.objects.get(owner=self.user, name=name, type='query-spark2', is_history=False)
  else:
    snippet_properties = {
        'jars': ['/user/hue/oozie/workspaces/lib/pi.py'],
        'class': '',
        'app_name': '',
        'spark_opts': [],
        'spark_arguments': [],
        'files': []
    }

    notebook = make_notebook(
        name=name,
        description=_('Pi Estimator: Example PySpark job'),
        editor_type='spark2',
        statement='',
        status='ready',
        snippet_properties=snippet_properties,
        is_saved=True
    )

    # Remove files, functions, settings from snippet properties
    data = notebook.get_data()
    data['snippets'][0]['properties'].pop('functions')
    data['snippets'][0]['properties'].pop('settings')

    try:
      with transaction.atomic():
        doc2 = Document2.objects.create(
            owner=self.user,
            name=data['name'],
            type='query-spark2',
            description=data['description'],
            data=json.dumps(data)
        )
    except Exception as e:
      LOG.exception("Failed to create sample PySpark job document: %s" % e)
      # Just to be sure we delete the Doc2 object in case of exception.
      # Possible when there are mixed InnoDB and MyISAM tables
      if doc2 and Document2.objects.filter(id=doc2.id).exists():
        doc2.delete()
def run_morphline(self, collection_name, morphline, input_path):
  workspace_path = self._upload_workspace(morphline)

  snippet_properties = {
      u'files': [
          {u'path': u'%s/log4j.properties' % workspace_path, u'type': u'file'},
          {u'path': u'%s/morphline.conf' % workspace_path, u'type': u'file'}
      ],
      u'class': u'org.apache.solr.hadoop.MapReduceIndexerTool',
      u'app_jar': CONFIG_INDEXER_LIBS_PATH.get(),
      u'arguments': [
          u'--morphline-file',
          u'morphline.conf',
          u'--output-dir',
          u'${nameNode}/user/%s/indexer' % self.username,
          u'--log4j',
          u'log4j.properties',
          u'--go-live',
          u'--zk-host',
          zkensemble(),
          u'--collection',
          collection_name,
          u'${nameNode}%s' % input_path,
      ],
      u'archives': [],
  }

  notebook = make_notebook(
      name='Indexer',
      editor_type='java',
      snippet_properties=snippet_properties
  ).get_data()
  notebook_doc, created = _save_notebook(notebook, self.user)

  workflow_doc = WorkflowBuilder().create_workflow(
      document=notebook_doc,
      user=self.user,
      managed=True,
      name=_("Batch job for %s") % notebook_doc.name
  )
  workflow = Workflow(document=workflow_doc, user=self.user)

  job_id = _submit_workflow(user=self.user, fs=self.fs, jt=self.jt, workflow=workflow, mapping=None)

  return job_id
def import_saved_beeswax_query(bquery):
  design = bquery.get_design()

  return make_notebook(
      name=bquery.name,
      description=bquery.desc,
      editor_type=_convert_type(bquery.type),
      statement=design.hql_query,
      status='ready',
      files=design.file_resources,
      functions=design.functions,
      settings=design.settings
  )
def import_saved_beeswax_query(bquery):
  design = bquery.get_design()

  return make_notebook(
      name=bquery.name,
      description=bquery.desc,
      editor_type=_convert_type(bquery.type, bquery.data),
      statement=design.hql_query,
      status='ready',
      files=design.file_resources,
      functions=design.functions,
      settings=design.settings
  )
def _load_data_to_table(self, django_user, hql):
  LOG.info('Loading data into table "%s"' % (self.name,))

  job = make_notebook(
      name=_('Insert data in sample table %s') % self.name,
      editor_type=self.interpreter['type'] if self.interpreter else self.dialect,
      statement=hql,
      status='ready',
      database=self.db_name,
      on_success_url='assist.db.refresh',
      is_task=False,
  )
  job.execute_and_wait(self.request)
def install_pig_script(self, sample_user):
  doc2 = None
  name = _('UpperText')

  if Document2.objects.filter(owner=sample_user, name=name, type='query-pig').exists():
    LOG.info("Sample pig editor script already installed.")
    doc2 = Document2.objects.get(owner=sample_user, name=name, type='query-pig')
  else:
    statement = """data = LOAD '/user/hue/pig/examples/data/midsummer.txt' as (text:CHARARRAY);

upper_case = FOREACH data GENERATE org.apache.pig.piggybank.evaluation.string.UPPER(text);

STORE upper_case INTO '${output}';
"""
    snippet_properties = {
        'hadoopProperties': [],
        'parameters': [],
        'resources': []
    }

    notebook = make_notebook(
        name=name,
        description=_('UpperText: Example Pig script'),
        editor_type='pig',
        statement=statement,
        status='ready',
        snippet_properties=snippet_properties,
        is_saved=True
    )

    # Remove files, functions, settings from snippet properties
    data = notebook.get_data()
    data['snippets'][0]['properties'].pop('files')
    data['snippets'][0]['properties'].pop('functions')
    data['snippets'][0]['properties'].pop('settings')

    try:
      with transaction.atomic():
        doc2 = Document2.objects.create(
            owner=sample_user,
            name=data['name'],
            type='query-pig',
            description=data['description'],
            data=json.dumps(data)
        )
    except Exception as e:
      LOG.exception("Failed to create sample pig script document: %s" % e)
      # Just to be sure we delete the Doc2 object in case of exception.
      # Possible when there are mixed InnoDB and MyISAM tables
      if doc2 and Document2.objects.filter(id=doc2.id).exists():
        doc2.delete()
def drop_database(request):
  source_type = request.POST.get('source_type', 'hive')
  db = _get_db(user=request.user, source_type=source_type)

  if request.method == 'POST':
    databases = request.POST.getlist('database_selection')

    try:
      if request.POST.get('is_embeddable'):
        design = SavedQuery.create_empty(
            app_name=source_type if source_type != 'hive' else 'beeswax',
            owner=request.user,
            data=hql_query('').dumps()
        )
        last_executed = json.loads(request.POST.get('start_time'), '-1')
        cluster = json.loads(request.POST.get('cluster', '{}'))
        namespace = request.POST.get('namespace')
        sql = db.drop_databases(databases, design, generate_ddl_only=True)
        job = make_notebook(
            name=_('Drop database %s') % ', '.join(databases)[:100],
            editor_type=source_type,
            statement=sql.strip(),
            status='ready',
            database=None,
            namespace=namespace,
            compute=cluster,
            on_success_url='assist.db.refresh',
            is_task=True,
            last_executed=last_executed
        )
        return JsonResponse(job.execute(request))
      else:
        design = SavedQuery.create_empty(app_name='beeswax', owner=request.user, data=hql_query('').dumps())
        query_history = db.drop_databases(databases, design)
        url = reverse(
            'beeswax:watch_query_history',
            kwargs={'query_history_id': query_history.id}
        ) + '?on_success_url=' + reverse('metastore:databases')
        return redirect(url)
    except Exception as ex:
      error_message, log = dbms.expand_exception(ex, db)
      error = _("Failed to remove %(databases)s. Error: %(error)s") % {
          'databases': ','.join(databases),
          'error': error_message
      }
      raise PopupException(error, title=_("DB Error"), detail=log)
def export_result(request):
  response = {'status': -1, 'message': _('Exporting result failed.')}

  # Passed by check_document_access_permission but unused by APIs
  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))
  data_format = json.loads(request.POST.get('format', 'hdfs-file'))
  destination = json.loads(request.POST.get('destination', ''))
  overwrite = json.loads(request.POST.get('overwrite', 'false'))
  is_embedded = json.loads(request.POST.get('is_embedded', 'false'))

  api = get_api(request, snippet)

  if data_format == 'hdfs-file':  # Blocking operation, like downloading
    if request.fs.isdir(destination):
      if notebook.get('name'):
        destination += '/%(name)s.csv' % notebook
      else:
        destination += '/%(type)s-%(id)s.csv' % notebook
    if overwrite and request.fs.exists(destination):
      request.fs.do_as_user(request.user.username, request.fs.rmtree, destination)
    response['watch_url'] = api.export_data_as_hdfs_file(snippet, destination, overwrite)
    response['status'] = 0
  elif data_format == 'hive-table':
    notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))
    response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=save_as_table&notebook=' + str(notebook_id) + '&snippet=0&destination=' + destination
    response['status'] = 0
  elif data_format == 'hdfs-directory':
    if is_embedded:
      sql, success_url = api.export_large_data_to_hdfs(notebook, snippet, destination)

      task = make_notebook(
          name='Execute and watch',
          editor_type=snippet['type'],
          statement=sql,
          status='ready-execute',
          database=snippet['database'],
          on_success_url=success_url,
          is_task=True
      )
      response = task.execute(request)
    else:
      notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))
      response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=insert_as_query&notebook=' + str(notebook_id) + '&snippet=0&destination=' + destination
      response['status'] = 0
  elif data_format == 'search-index':
    notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))
    response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=index_query&notebook=' + str(notebook_id) + '&snippet=0&destination=' + destination
    response['status'] = 0

  return JsonResponse(response)
def _install_spark_example(self):
  doc2 = None
  name = _('Spark File Copy Job')

  if Document2.objects.filter(owner=self.user, name=name, type='query-spark2', is_history=False).exists():
    LOG.info("Sample Spark editor job already installed.")
    doc2 = Document2.objects.get(owner=self.user, name=name, type='query-spark2', is_history=False)
  else:
    snippet_properties = {
        'jars': ['/user/hue/oozie/workspaces/workflows/spark-scala/lib/oozie-examples.jar'],
        'class': 'org.apache.oozie.example.SparkFileCopy',
        'app_name': '',
        'spark_opts': [],
        'spark_arguments': [
            "/user/hue/oozie/workspaces/data/sonnets.txt",
            "sonnets"
        ],
        'files': []
    }

    notebook = make_notebook(
        name=name,
        description=_('File Copy: Example Spark job'),
        editor_type='spark2',
        statement='',
        status='ready',
        snippet_properties=snippet_properties,
        is_saved=True
    )

    # Remove files, functions, settings from snippet properties
    data = notebook.get_data()
    data['snippets'][0]['properties'].pop('functions')
    data['snippets'][0]['properties'].pop('settings')

    try:
      with transaction.atomic():
        doc2 = Document2.objects.create(
            owner=self.user,
            name=data['name'],
            type='query-spark2',
            description=data['description'],
            data=json.dumps(data)
        )
    except Exception as e:
      LOG.exception("Failed to create sample Spark job document: %s" % e)
      # Just to be sure we delete the Doc2 object in case of exception.
      # Possible when there are mixed InnoDB and MyISAM tables
      if doc2 and Document2.objects.filter(id=doc2.id).exists():
        doc2.delete()
def drop_partition(request, database, table):
  source_type = request.POST.get('source_type', 'hive')
  cluster = json.loads(request.POST.get('cluster', '{}'))

  db = _get_db(user=request.user, source_type=source_type, cluster=cluster)

  if request.method == 'POST':
    partition_specs = request.POST.getlist('partition_selection')
    partition_specs = [spec for spec in partition_specs]
    try:
      if request.GET.get("format", "html") == "json":
        last_executed = json.loads(request.POST.get('start_time'), '-1')
        sql = db.drop_partitions(database, table, partition_specs, design=None, generate_ddl_only=True)
        job = make_notebook(
            name=_('Drop partition %s') % ', '.join(partition_specs)[:100],
            editor_type=source_type,
            statement=sql.strip(),
            status='ready',
            database=None,
            on_success_url='assist.db.refresh',
            is_task=True,
            last_executed=last_executed
        )
        return JsonResponse(job.execute(request))
      else:
        design = SavedQuery.create_empty(app_name='beeswax', owner=request.user, data=hql_query('').dumps())
        query_history = db.drop_partitions(database, table, partition_specs, design)
        url = reverse('beeswax:watch_query_history', kwargs={'query_history_id': query_history.id}) + '?on_success_url=' + \
            reverse('metastore:describe_partitions', kwargs={'database': database, 'table': table})
        return redirect(url)
    except Exception as ex:
      error_message, log = dbms.expand_exception(ex, db)
      error = _("Failed to remove %(partition)s. Error: %(error)s") % {
          'partition': '\n'.join(partition_specs),
          'error': error_message
      }
      raise PopupException(error, title=_("DB Error"), detail=log)
  else:
    title = _("Do you really want to delete the partition(s)?")
    return render('confirm.mako', request, {'url': request.path, 'title': title})
def _install_java_example(self):
  doc2 = None
  name = _('Java Terasort Job')

  if Document2.objects.filter(owner=self.user, name=name, type='query-java', is_history=False).exists():
    LOG.info("Sample Java editor job already installed.")
    doc2 = Document2.objects.get(owner=self.user, name=name, type='query-java', is_history=False)
  else:
    snippet_properties = {
        'app_jar': '/user/hue/oozie/workspaces/lib/hadoop-examples.jar',
        'class': 'org.apache.hadoop.examples.terasort.TeraSort',
        'java_opts': '',
        'hadoopProperties': [],
        'archives': [],
        'files': [],
        'arguments': ['output_dir/teragen', 'output_dir/terasort'],
        'capture_output': False
    }

    notebook = make_notebook(
        name=name,
        description=_('Terasort: Example Java job'),
        editor_type='java',
        statement='',
        status='ready',
        snippet_properties=snippet_properties,
        is_saved=True
    )

    # Remove files, functions, settings from snippet properties
    data = notebook.get_data()
    data['snippets'][0]['properties'].pop('functions')
    data['snippets'][0]['properties'].pop('settings')

    try:
      with transaction.atomic():
        doc2 = Document2.objects.create(
            owner=self.user,
            name=data['name'],
            type='query-java',
            description=data['description'],
            data=json.dumps(data)
        )
    except Exception as e:
      LOG.exception("Failed to create sample Java job document: %s" % e)
      # Just to be sure we delete the Doc2 object in case of exception.
      # Possible when there are mixed InnoDB and MyISAM tables
      if doc2 and Document2.objects.filter(id=doc2.id).exists():
        doc2.delete()
def _install_java_example(self): doc2 = None name = _('Java TeraGen Job') if Document2.objects.filter(owner=self.user, name=name, type='query-java', is_history=False).exists(): LOG.info("Sample Java editor job already installed.") doc2 = Document2.objects.get(owner=self.user, name=name, type='query-java', is_history=False) else: snippet_properties = { 'app_jar': '/user/hue/oozie/workspaces/lib/hadoop-examples.jar', 'class': 'org.apache.hadoop.examples.terasort.TeraGen', 'java_opts': '', 'hadoopProperties': [], 'archives': [], 'files': [], 'arguments': ['10000', 'output_dir/teragen'], 'capture_output': False } notebook = make_notebook( name=name, description=_('TeraGen: Generates N rows of random data to a directory.'), editor_type='java', statement='', status='ready', snippet_properties=snippet_properties, is_saved=True ) # Remove files, functions, settings from snippet properties data = notebook.get_data() data['snippets'][0]['properties'].pop('functions') data['snippets'][0]['properties'].pop('settings') try: with transaction.atomic(): doc2 = Document2.objects.create( owner=self.user, name=data['name'], type='query-java', description=data['description'], data=json.dumps(data) ) except Exception, e: LOG.exception("Failed to create sample Java job document: %s" % e) # Just to be sure we delete Doc2 object incase of exception. # Possible when there are mixed InnoDB and MyISAM tables if doc2 and Document2.objects.filter(id=doc2.id).exists(): doc2.delete()
def run(self, request, collection_name, envelope, input_path, start_time=None, lib_path=None):
  workspace_path = self._upload_workspace(envelope)

  task = make_notebook(
      name=_('Indexing into %s') % collection_name,
      editor_type='notebook',
      #on_success_url=reverse('search:browse', kwargs={'name': collection_name}),
      #pub_sub_url='assist.collections.refresh',
      is_task=True,
      is_notebook=True,
      last_executed=start_time
  )

  if not DISABLE_HUE_3.get():  # CDH5
    shell_command_name = "pipeline.sh"
    shell_command = """#!/bin/bash
SPARK_KAFKA_VERSION=0.10 spark2-submit envelope.jar envelope.conf"""
    hdfs_shell_cmd_path = os.path.join(workspace_path, shell_command_name)
    self.fs.do_as_user(self.username, self.fs.create, hdfs_shell_cmd_path, data=shell_command)
    task.add_shell_snippet(
        shell_command=shell_command_name,
        files=[
            {u'value': u'%s/envelope.conf' % workspace_path},
            {u'value': hdfs_shell_cmd_path},
            {u'value': lib_path},
        ]
    )
  else:
    task.add_spark_snippet(
        clazz=None,
        jars=lib_path,
        arguments=[u'envelope.conf'],
        files=[
            {u'path': u'%s/envelope.conf' % workspace_path, u'type': u'file'}
        ]
    )

  return task.execute(request, batch=True)
def _sync_execute(self, sql, database):
  editor = make_notebook(
      name='Execute and watch',
      editor_type=self.engine,
      statement=sql,
      database=database,
      status='ready-execute',
      skip_historify=True
      # async=False
  )
  request = MockRequest(self.user, self.cluster)
  mock_notebook = {}
  snippet = {'type': self.engine}

  response = editor.execute(request)

  if 'handle' in response:
    snippet['result'] = response

    if response['handle'].get('sync'):
      result = response['result']
    else:
      timeout_sec = 20  # To move to Notebook API
      sleep_interval = 0.5
      curr = time.time()
      end = curr + timeout_sec

      api = get_api(request, snippet)

      while curr <= end:
        status = api.check_status(mock_notebook, snippet)
        if status['status'] == 'available':
          result = api.fetch_result(mock_notebook, snippet, rows=10, start_over=True)
          api.close_statement(mock_notebook, snippet)
          break
        time.sleep(sleep_interval)
        curr = time.time()

      if curr > end:
        try:
          api.cancel_operation(snippet)
        except Exception as e:
          LOG.warning("Failed to cancel query: %s" % e)
          api.close_statement(mock_notebook, snippet)
          raise OperationTimeout(e)

  return result
def alanize_fix(request):
  response = {'status': -1}
  fix = json.loads(request.POST.get('fix'))
  start_time = json.loads(request.POST.get('start_time'), '-1')

  if fix['id'] == 0:
    notebook = make_notebook(
        name=_('compute stats %(data)s') % fix,
        editor_type='impala',
        statement='compute stats %(data)s' % fix,
        status='ready',
        last_executed=start_time,
        is_task=True
    )
    response['details'] = {'task': notebook.execute(request, batch=True)}
    response['status'] = 0

  return JsonResponse(response)
def load_table(request, database, table): response = {'status': -1, 'data': 'None'} source_type = request.POST.get('source_type', request.GET.get('source_type', 'hive')) cluster = json.loads(request.POST.get('cluster', '{}')) db = _get_db(user=request.user, source_type=source_type, cluster=cluster) table = db.get_table(database, table) if request.method == "POST": load_form = LoadDataForm(table, request.POST) if load_form.is_valid(): on_success_url = reverse('metastore:describe_table', kwargs={'database': database, 'table': table.name}) generate_ddl_only = request.POST.get('is_embeddable', 'false') == 'true' try: design = SavedQuery.create_empty(app_name=source_type if source_type != 'hive' else 'beeswax', owner=request.user, data=hql_query('').dumps()) form_data = { 'path': load_form.cleaned_data['path'], 'overwrite': load_form.cleaned_data['overwrite'], 'partition_columns': [(column_name, load_form.cleaned_data[key]) for key, column_name in load_form.partition_columns.iteritems()], } query_history = db.load_data(database, table.name, form_data, design, generate_ddl_only=generate_ddl_only) if generate_ddl_only: last_executed = json.loads(request.POST.get('start_time'), '-1') job = make_notebook( name=_('Load data in %s.%s') % (database, table.name), editor_type=source_type, statement=query_history.strip(), status='ready', database=database, on_success_url='assist.db.refresh', is_task=True, last_executed=last_executed ) response = job.execute(request) else: url = reverse('beeswax:watch_query_history', kwargs={'query_history_id': query_history.id}) + '?on_success_url=' + on_success_url response['status'] = 0 response['data'] = url response['query_history_id'] = query_history.id except QueryError, ex: response['status'] = 1 response['data'] = _("Can't load the data: ") + ex.message except Exception, e: response['status'] = 1 response['data'] = _("Can't load the data: ") + str(e)
def load_table(request, database, table):
  response = {'status': -1, 'data': 'None'}

  source_type = request.POST.get('source_type', 'hive')
  cluster = json.loads(request.POST.get('cluster', '{}'))

  db = _get_db(user=request.user, source_type=source_type, cluster=cluster)

  table = db.get_table(database, table)

  if request.method == "POST":
    load_form = LoadDataForm(table, request.POST)

    if load_form.is_valid():
      on_success_url = reverse('metastore:describe_table', kwargs={'database': database, 'table': table.name})
      generate_ddl_only = request.POST.get('is_embeddable', 'false') == 'true'
      try:
        design = SavedQuery.create_empty(app_name=source_type if source_type != 'hive' else 'beeswax', owner=request.user, data=hql_query('').dumps())
        form_data = {
            'path': load_form.cleaned_data['path'],
            'overwrite': load_form.cleaned_data['overwrite'],
            'partition_columns': [(column_name, load_form.cleaned_data[key]) for key, column_name in load_form.partition_columns.iteritems()],
        }
        query_history = db.load_data(database, table.name, form_data, design, generate_ddl_only=generate_ddl_only)
        if generate_ddl_only:
          last_executed = json.loads(request.POST.get('start_time'), '-1')
          job = make_notebook(
              name=_('Load data in %s.%s') % (database, table.name),
              editor_type=source_type,
              statement=query_history.strip(),
              status='ready',
              database=database,
              on_success_url='assist.db.refresh',
              is_task=True,
              last_executed=last_executed
          )
          response = job.execute(request)
        else:
          url = reverse('beeswax:watch_query_history', kwargs={'query_history_id': query_history.id}) + '?on_success_url=' + on_success_url
          response['status'] = 0
          response['data'] = url
          response['query_history_id'] = query_history.id
      except QueryError as ex:
        response['status'] = 1
        response['data'] = _("Can't load the data: ") + ex.message
      except Exception as e:
        response['status'] = 1
        response['data'] = _("Can't load the data: ") + str(e)
def browse(request, database, table):
  editor_type = request.GET.get('type', 'hive')

  snippet = {'type': editor_type}
  sql_select = get_api(request.user, snippet, request.fs, request.jt).get_select_star_query(snippet, database, table)

  editor = make_notebook(name='Browse', editor_type=editor_type, statement=sql_select, status='ready-execute')

  return render('editor.mako', request, {
      'notebooks_json': json.dumps([editor.get_data()]),
      'options_json': json.dumps({
          'languages': [{"name": "%s SQL" % editor_type.title(), "type": editor_type}],
          'mode': 'editor',
      }),
      'editor_type': editor_type,
  })
def list_tasks(self, user):
  sql_query = "SELECT * FROM information_schema.scheduled_queries"

  job = make_notebook(
      name='List Hive schedules',
      editor_type='hive',
      statement=sql_query,
      status='ready',
      database='default',
      is_task=False,
  )

  request = MockRequest(user)
  handle = job.execute_and_wait(request, include_results=True)

  return [self._get_task(row) for row in handle['result']['data']]
def browse(request, database, table):
  editor_type = request.GET.get('type', 'hive')

  snippet = {'type': editor_type}
  sql_select = get_api(request, snippet).get_select_star_query(snippet, database, table)

  editor = make_notebook(name='Browse', editor_type=editor_type, statement=sql_select, status='ready-execute')

  return render('editor.mako', request, {
      'notebooks_json': json.dumps([editor.get_data()]),
      'options_json': json.dumps({
          'languages': [{"name": "%s SQL" % editor_type.title(), "type": editor_type}],
          'mode': 'editor',
      }),
      'editor_type': editor_type,
  })
def _sync_execute(self, sql, database):
  editor = make_notebook(
      name='Execute and watch',
      editor_type=self.engine,
      statement=sql,
      database=database,
      status='ready-execute',
      skip_historify=True
      # async=False
  )
  request = MockRequest(self.user)
  mock_notebook = {}
  snippet = {'type': self.engine}

  response = editor.execute(request)

  if 'handle' in response:
    snippet['result'] = response

    if response['handle'].get('sync'):
      result = response['result']
    else:
      timeout_sec = 20  # To move to Notebook API
      sleep_interval = 0.5
      curr = time.time()
      end = curr + timeout_sec

      api = get_api(request, snippet)

      while curr <= end:
        status = api.check_status(mock_notebook, snippet)
        if status['status'] == 'available':
          result = api.fetch_result(mock_notebook, snippet, rows=10, start_over=True)
          api.close_statement(snippet)
          break
        time.sleep(sleep_interval)
        curr = time.time()

      if curr > end:
        try:
          api.cancel_operation(snippet)
        except Exception as e:
          LOG.warning("Failed to cancel query: %s" % e)
          api.close_statement(snippet)
          raise OperationTimeout(e)
def run(self, request, collection_name, configs, input_path, start_time=None, lib_path=None):
  workspace_path = self._upload_workspace(configs)

  if lib_path is None:
    lib_path = CONFIG_JARS_LIBS_PATH.get()

  task = make_notebook(
      name=_('Indexing into %s') % collection_name,
      editor_type='notebook',
      #on_success_url=reverse('search:browse', kwargs={'name': collection_name}),
      #pub_sub_url='assist.collections.refresh',
      is_task=True,
      is_notebook=True,
      last_executed=start_time
  )

  shell_command_name = "pipeline.sh"
  shell_command = """#!/bin/bash
export SPARK_DIST_CLASSPATH=`hadoop classpath`
export SPARK_DIST_CLASSPATH=/etc/hive/conf:`hadoop classpath`
export JAVA_HOME=/usr/java/jdk1.8.0_162
SPARK_KAFKA_VERSION=0.10 spark2-submit envelope.jar envelope.conf"""
  hdfs_shell_cmd_path = os.path.join(workspace_path, shell_command_name)
  self.fs.do_as_user(self.username, self.fs.create, hdfs_shell_cmd_path, data=shell_command)

  task.add_shell_snippet(
      shell_command=shell_command_name,
      files=[
          {u'value': u'%s/envelope.conf' % workspace_path},
          {u'value': hdfs_shell_cmd_path},
          {u'value': lib_path}
      ]
  )

  return task.execute(request, batch=True)
def browse(request, database, table): snippet = {"type": "hive"} sql_select = get_api(request, snippet).get_select_star_query(snippet, database, table) editor_type = snippet["type"] editor = make_notebook(name="Browse", editor_type=editor_type, statement=sql_select, status="ready-execute") return render( "editor.mako", request, { "notebooks_json": json.dumps([editor.get_data()]), "options_json": json.dumps( {"languages": get_ordered_interpreters(request.user), "mode": "editor", "editor_type": editor_type} ), "editor_type": editor_type, }, )
def alanize_fix(request):
  response = {'status': -1}
  cluster = json.loads(request.POST.get('cluster', '{}'))
  fix = json.loads(request.POST.get('fix'))
  start_time = json.loads(request.POST.get('start_time'), '-1')

  if fix['id'] == 0:
    notebook = make_notebook(
        name=_('compute stats %(data)s') % fix,
        editor_type='impala',
        statement='compute stats %(data)s' % fix,
        status='ready',
        last_executed=start_time,
        is_task=True,
        compute=cluster
    )
    response['details'] = {'task': notebook.execute(request, batch=True)}
    response['status'] = 0

  return JsonResponse(response)
def _get_sample_data(db, database, table, column, is_async=False, cluster=None, operation=None):
  if operation == 'hello':
    table_obj = None
  else:
    table_obj = db.get_table(database, table)
    if table_obj.is_impala_only and db.client.query_server['server_name'] != 'impala':
      # Kudu table, now Hive should support it though
      query_server = get_query_server_config('impala', connector=cluster)
      db = dbms.get(db.client.user, query_server, cluster=cluster)

  sample_data = db.get_sample(database, table_obj, column, generate_sql_only=is_async, operation=operation)
  response = {'status': -1}

  if sample_data:
    response['status'] = 0
    if is_async:
      notebook = make_notebook(
          name=_('Table sample for `%(database)s`.`%(table)s`.`%(column)s`') % {'database': database, 'table': table, 'column': column},
          editor_type=_get_servername(db),
          statement=sample_data,
          status='ready-execute',
          skip_historify=True,
          is_task=False,
          compute=cluster if cluster else None
      )
      response['result'] = notebook.execute(request=MockedDjangoRequest(user=db.client.user), batch=False)
      if table_obj.is_impala_only:
        response['result']['type'] = 'impala'
    else:
      sample = escape_rows(sample_data.rows(), nulls_only=True)
      if column:
        sample = set([row[0] for row in sample])
        sample = [[item] for item in sorted(list(sample))]

      response['headers'] = sample_data.cols()
      response['full_headers'] = sample_data.full_cols()
      response['rows'] = sample
  else:
    response['message'] = _('Failed to get sample data.')

  return response
def submit_schedule(self, request, coordinator, mapping):
  """
  coordinator
    Document2.objects.get(uuid=coordinator.get_data_for_json()['properties']['document'])
  mapping
    {u'oozie.use.system.libpath': u'True', 'dryrun': False, u'start_date': u'2019-08-10T17:02', u'end_date': u'2019-08-17T17:02'}
  """
  document = Document2.objects.get(uuid=coordinator.get_data_for_json()['properties']['document'])

  # Assumes Hive SQL queries
  # (schedule_name, cluster_namespace) is unique
  # _get_snippet_name(notebook) --> name
  properties = {
      'name': 'query-%(uuid)s' % {'uuid': document.uuid},
      'username': request.user.username
  }

  sql_query = """
  CREATE SCHEDULED QUERY %(name)s
  CRON '1 1 * * *' AS
  SELECT 1
  """ % properties

  job = make_notebook(
      name=properties['name'],
      editor_type='hive',
      statement=sql_query,
      status='ready',
      database='default',
      is_task=False,
  )

  handle = job.execute_and_wait(request)

  return handle['history_uuid']
def create_notebook(request):
  response = {'status': -1}

  editor_type = request.POST.get('type', 'notebook')
  gist_id = request.POST.get('gist')
  directory_uuid = request.POST.get('directory_uuid')
  is_blank = request.POST.get('blank', 'false') == 'true'

  if gist_id:
    gist_doc = _get_gist_document(uuid=gist_id)
    statement = json.loads(gist_doc.data)['statement']
    editor = make_notebook(
        name='',
        description='',
        editor_type=editor_type,
        statement=statement,
        is_presentation_mode=True
    )
  else:
    editor = Notebook()
    if EXAMPLES.AUTO_OPEN.get() and not is_blank:
      document = _get_dialect_example(dialect=editor_type)
      if document:
        editor = Notebook(document=document)
        editor = upgrade_session_properties(request, editor)

  data = editor.get_data()

  if editor_type != 'notebook':
    data['name'] = ''
    data['type'] = 'query-%s' % editor_type  # TODO: Add handling for non-SQL types

  data['directoryUuid'] = directory_uuid
  editor.data = json.dumps(data)

  response['notebook'] = editor.get_data()
  response['status'] = 0

  return JsonResponse(response)
def drop_table(request, database):
  source_type = request.POST.get('source_type', request.GET.get('source_type', 'hive'))
  cluster = json.loads(request.POST.get('cluster', '{}'))

  db = _get_db(user=request.user, source_type=source_type, cluster=cluster)

  if request.method == 'POST':
    try:
      tables = request.POST.getlist('table_selection')
      tables_objects = [db.get_table(database, table) for table in tables]
      skip_trash = request.POST.get('skip_trash') == 'on'
      cluster = json.loads(request.POST.get('cluster', '{}'))
      namespace = request.POST.get('namespace')

      if request.POST.get('is_embeddable'):
        last_executed = json.loads(request.POST.get('start_time'), '-1')
        sql = db.drop_tables(database, tables_objects, design=None, skip_trash=skip_trash, generate_ddl_only=True)
        job = make_notebook(
            name=_('Drop table %s') % ', '.join([table.name for table in tables_objects])[:100],
            editor_type=source_type,
            statement=sql.strip(),
            status='ready',
            database=database,
            namespace=namespace,
            compute=cluster,
            on_success_url='assist.db.refresh',
            is_task=True,
            last_executed=last_executed
        )
        return JsonResponse(job.execute(request))
      else:
        # Can't be simpler without an important refactoring
        design = SavedQuery.create_empty(app_name='beeswax', owner=request.user, data=hql_query('').dumps())
        query_history = db.drop_tables(database, tables_objects, design, skip_trash=skip_trash)
        url = reverse('beeswax:watch_query_history', kwargs={'query_history_id': query_history.id}) + '?on_success_url=' + reverse('metastore:show_tables', kwargs={'database': database})
        return redirect(url)
    except Exception as ex:
      error_message, log = dbms.expand_exception(ex, db)
      error = _("Failed to remove %(tables)s. Error: %(error)s") % {'tables': ','.join(tables), 'error': error_message}
      raise PopupException(error, title=_("DB Error"), detail=log)
def drop_database(request):
  source_type = request.POST.get('source_type', request.GET.get('source_type', 'hive'))
  cluster = json.loads(request.POST.get('cluster', '{}'))

  db = _get_db(user=request.user, source_type=source_type, cluster=cluster)

  if request.method == 'POST':
    databases = request.POST.getlist('database_selection')

    try:
      if request.POST.get('is_embeddable'):
        design = SavedQuery.create_empty(app_name=source_type if source_type != 'hive' else 'beeswax', owner=request.user, data=hql_query('').dumps())
        last_executed = json.loads(request.POST.get('start_time'), '-1')
        cluster = json.loads(request.POST.get('cluster', '{}'))
        namespace = request.POST.get('namespace')
        sql = db.drop_databases(databases, design, generate_ddl_only=True)
        job = make_notebook(
            name=_('Drop database %s') % ', '.join(databases)[:100],
            editor_type=source_type,
            statement=sql.strip(),
            status='ready',
            database=None,
            namespace=namespace,
            compute=cluster,
            on_success_url='assist.db.refresh',
            is_task=True,
            last_executed=last_executed
        )
        return JsonResponse(job.execute(request))
      else:
        design = SavedQuery.create_empty(app_name='beeswax', owner=request.user, data=hql_query('').dumps())
        query_history = db.drop_databases(databases, design)
        url = reverse('beeswax:watch_query_history', kwargs={'query_history_id': query_history.id}) + '?on_success_url=' + reverse('metastore:databases')
        return redirect(url)
    except Exception as ex:
      error_message, log = dbms.expand_exception(ex, db)
      error = _("Failed to remove %(databases)s. Error: %(error)s") % {'databases': ','.join(databases), 'error': error_message}
      raise PopupException(error, title=_("DB Error"), detail=log)
def drop_database(request):
  db = dbms.get(request.user)

  if request.method == 'POST':
    databases = request.POST.getlist('database_selection')

    try:
      design = SavedQuery.create_empty(app_name='beeswax', owner=request.user, data=hql_query('').dumps())

      if request.POST.get('is_embeddable'):
        sql = db.drop_databases(databases, design, generate_ddl_only=True)
        job = make_notebook(name='Execute and watch', editor_type='hive', statement=sql.strip(), status='ready',
                            database=None, on_success_url='assist.db.refresh', is_task=True)
        return JsonResponse(job.execute(request))
      else:
        query_history = db.drop_databases(databases, design)
        url = reverse(
            'beeswax:watch_query_history',
            kwargs={'query_history_id': query_history.id}
        ) + '?on_success_url=' + reverse('metastore:databases')
        return redirect(url)
    except Exception as ex:
      error_message, log = dbms.expand_exception(ex, db)
      error = _("Failed to remove %(databases)s. Error: %(error)s") % {
          'databases': ','.join(databases),
          'error': error_message
      }
      raise PopupException(error, title=_("Hive Error"), detail=log)
def create_database(request, source, destination):
  database = destination['name']
  comment = destination['description']
  use_default_location = destination['useDefaultLocation']
  external_path = destination['nonDefaultLocation']

  sql = django_mako.render_to_string("gen/create_database_statement.mako", {
      'database': {
          'name': database,
          'comment': comment,
          'use_default_location': use_default_location,
          'external_location': external_path,
          'properties': [],
      }
  })

  editor_type = 'hive'
  on_success_url = reverse('metastore:show_tables', kwargs={'database': database})

  notebook = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready', on_success_url=on_success_url)
  return notebook.execute(request, batch=False)
def create_query_document(self, owner, query_type='hive', database='default',
                          name='Test Query', description='Test Query', statement='',
                          files=None, functions=None, settings=None):
  """
  Creates and returns a query Document2 object
  :param owner: owner of doc
  :param query_type: hive, impala or spark
  :param database: database name
  :param name: name of document
  :param description: description of document
  :param statement: SQL statement (can be multi-query statement)
  :param files: list of dicts representing files
  :param functions: list of dicts representing functions
  :param settings: list of dicts representing settings
  :return: Document2 object representing query
  """
  if query_type not in ('hive', 'impala', 'spark'):
    raise ValueError("Invalid query_type: %s" % query_type)

  notebook = make_notebook(
      name=name,
      description=description,
      editor_type=query_type,
      statement=statement,
      status='ready',
      database=database,
      files=files,
      functions=functions,
      settings=settings
  )
  notebook_doc, save_as = _save_notebook(notebook.get_data(), owner)
  return notebook_doc
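# A hedged usage sketch for create_query_document above; the helper object and function
# name below are assumptions and not part of the original code. It only shows that the
# returned Document2 can be re-opened with the Notebook(document=...).get_data() pattern
# already used elsewhere in this file (e.g. execute_and_watch and run_sync_query).
def _example_reopen_saved_query(helper, owner):
  doc = helper.create_query_document(
      owner=owner,
      query_type='hive',
      database='default',
      name='Example query',        # hypothetical document name
      statement='SELECT 42'        # hypothetical statement
  )
  return Notebook(document=doc).get_data()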
def list_task(self, task_id):
  task_id = task_id.replace('schedule-hive-', '')

  sql_query = """
  SELECT *
  FROM information_schema.scheduled_queries
  WHERE scheduled_query_id = %(scheduled_query_id)s
  """ % {
      'scheduled_query_id': task_id
  }

  job = make_notebook(
      name='List Hive schedule id',
      editor_type='hive',
      statement=sql_query,
      status='ready',
      database='default',
      is_task=False,
  )

  request = MockRequest(self.user)
  handle = job.execute_and_wait(request, include_results=True)

  return self._get_task(handle['result']['data'][0])
def run_sync_query(doc_id, user):
  '''Independently run a query as a user and insert the result into another table.'''
  # get SQL
  # Add INSERT INTO table
  # Add variables?
  # execute query
  # return when done. send email notification. get taskid.
  # see in Flower API for listing runs?
  from django.contrib.auth.models import User
  from notebook.models import make_notebook, MockedDjangoRequest
  from desktop.auth.backend import rewrite_user

  editor_type = 'impala'
  sql = 'INSERT into customer_scheduled SELECT * FROM default.customers LIMIT 100;'
  request = MockedDjangoRequest(user=rewrite_user(User.objects.get(username='******')))

  notebook = make_notebook(
      name='Scheduler query N',
      editor_type=editor_type,
      statement=sql,
      status='ready',
      #on_success_url=on_success_url,
      last_executed=time.mktime(datetime.datetime.now().timetuple()) * 1000,
      is_task=True
  )

  task = notebook.execute(request, batch=True)

  task['uuid'] = task['history_uuid']
  status = check_status(task)

  while status['status'] in ('waiting', 'running'):
    status = check_status(task)
    time.sleep(3)

  return task
def run_sync_query(doc_id, user):
  '''Independently run a query as a user.'''
  # Add INSERT INTO table if persist result
  # Add variable substitution
  # Send notifications: done/on failure
  if type(user) is str:
    lookup = {orm_user_lookup(): user}
    user = User.objects.get(**lookup)
    user = rewrite_user(user)

  query_document = Document2.objects.get_by_uuid(user=user, uuid=doc_id)
  notebook = Notebook(document=query_document).get_data()
  snippet = notebook['snippets'][0]

  editor_type = snippet['type']
  sql = _get_statement(notebook)
  request = MockedDjangoRequest(user=user)
  last_executed = time.mktime(datetime.datetime.now().timetuple()) * 1000

  notebook = make_notebook(
      name='Scheduled query %s at %s' % (query_document.name, last_executed),
      editor_type=editor_type,
      statement=sql,
      status='ready',
      last_executed=last_executed,
      is_task=True
  )

  task = notebook.execute(request, batch=True)

  task['uuid'] = task['history_uuid']
  status = check_status(task)

  while status['status'] in ('waiting', 'running'):
    status = check_status(task)
    time.sleep(3)

  return task
def run_morphline(self, collection_name, morphline, input_path):
  workspace_path = self._upload_workspace(morphline)

  snippet_properties = {
      u'files': [
          {u'path': u'%s/log4j.properties' % workspace_path, u'type': u'file'},
          {u'path': u'%s/morphline.conf' % workspace_path, u'type': u'file'}
      ],
      u'class': u'org.apache.solr.hadoop.MapReduceIndexerTool',
      u'app_jar': CONFIG_INDEXER_LIBS_PATH.get(),
      u'arguments': [
          u'--morphline-file',
          u'morphline.conf',
          u'--output-dir',
          u'${nameNode}/user/%s/indexer' % self.username,
          u'--log4j',
          u'log4j.properties',
          u'--go-live',
          u'--zk-host',
          zkensemble(),
          u'--collection',
          collection_name,
          u'${nameNode}%s' % input_path,
      ],
      u'archives': [],
  }

  notebook = make_notebook(name='Indexer', editor_type='java', snippet_properties=snippet_properties).get_data()
  notebook_doc, created = _save_notebook(notebook, self.user)

  workflow_doc = WorkflowBuilder().create_workflow(document=notebook_doc, user=self.user, managed=True, name=_("Batch job for %s") % notebook_doc.name)
  workflow = Workflow(document=workflow_doc, user=self.user)

  job_id = _submit_workflow(user=self.user, fs=self.fs, jt=self.jt, workflow=workflow, mapping=None)

  return job_id
def run_morphline(self, request, collection_name, morphline, input_path): workspace_path = self._upload_workspace(morphline) # snippets = [ # { # u'type': u'java', # u'files': [ # {u'path': u'%s/log4j.properties' % workspace_path, u'type': u'file'}, # {u'path': u'%s/morphline.conf' % workspace_path, u'type': u'file'} # ], # u'class': u'org.apache.solr.hadoop.MapReduceIndexerTool', # u'app_jar': CONFIG_INDEXER_LIBS_PATH.get(), # u'arguments': [ # u'--morphline-file', # u'morphline.conf', # u'--output-dir', # u'${nameNode}/user/%s/indexer' % self.username, # u'--log4j', # u'log4j.properties', # u'--go-live', # u'--zk-host', # zkensemble(), # u'--collection', # collection_name, # input_path, # ], # u'archives': [], # } # ] # # # managed notebook # notebook = make_notebook2(name='Indexer job for %s' % collection_name, snippets=snippets).get_data() # notebook_doc, created = _save_notebook(notebook, self.user) # # snippet = {'wasBatchExecuted': True} snippet_properties = { u'files': [ {u'path': u'%s/log4j.properties' % workspace_path, u'type': u'file'}, {u'path': u'%s/morphline.conf' % workspace_path, u'type': u'file'} ], u'class': u'org.apache.solr.hadoop.MapReduceIndexerTool', u'app_jar': CONFIG_INDEXER_LIBS_PATH.get(), u'arguments': [ u'--morphline-file', u'morphline.conf', u'--output-dir', u'${nameNode}/user/%s/indexer' % self.username, u'--log4j', u'log4j.properties', u'--go-live', u'--zk-host', zkensemble(), u'--collection', collection_name, input_path, ], u'archives': [], } notebook = make_notebook(name='Indexer', editor_type='java', snippet_properties=snippet_properties, status='running').get_data() notebook_doc, created = _save_notebook(notebook, self.user) snippet = {'wasBatchExecuted': True, 'id': notebook['snippets'][0]['id'], 'statement': ''} job_handle = _execute_notebook(request, notebook, snippet) return job_handle
table_obj = db.get_table(database, table)
if table_obj.is_impala_only and db.client.query_server['server_name'] != 'impala':
  query_server = get_query_server_config('impala', cluster=cluster)
  db = dbms.get(db.client.user, query_server, cluster=cluster)

sample_data = db.get_sample(database, table_obj, column, generate_sql_only=async, operation=operation)
response = {'status': -1}

if sample_data:
  response['status'] = 0
  if async:
    notebook = make_notebook(
        name=_('Table sample for `%(database)s`.`%(table)s`.`%(column)s`') % {'database': database, 'table': table, 'column': column},
        editor_type=_get_servername(db),
        statement=sample_data,
        status='ready-execute',
        skip_historify=True,
        is_task=False,
        compute=cluster if cluster else None
    )
    response['result'] = notebook.execute(request=MockedDjangoRequest(user=db.client.user), batch=False)
    if table_obj.is_impala_only:
      response['result']['type'] = 'impala'
  else:
    sample = escape_rows(sample_data.rows(), nulls_only=True)
    if column:
      sample = set([row[0] for row in sample])
      sample = [[item] for item in sorted(list(sample))]

    response['headers'] = sample_data.cols()
    response['full_headers'] = sample_data.full_cols()
def query(self, dashboard, query, facet=None):
  database, table = self._get_database_table_names(dashboard['name'])

  if query['qs'] == [{'q': '_root_:*'}]:
    return {'response': {'numFound': 0}}

  filters = [q['q'] for q in query['qs'] if q['q']]
  filters.extend(self._get_fq(dashboard, query, facet))

  timeFilter = self._get_time_filter_query(dashboard, query)
  if timeFilter:
    filters.append(timeFilter)

  if facet:
    if facet['type'] == 'nested':
      fields_dimensions = [self._get_dimension_field(f)['name'] for f in self._get_dimension_fields(facet)]
      last_dimension_seen = False
      fields = []

      for f in reversed(facet['properties']['facets']):
        if f['aggregate']['function'] == 'count':
          if not last_dimension_seen:
            fields.insert(0, 'COUNT(*) AS Count')
            last_dimension_seen = True
          fields.insert(0, self._get_dimension_field(f)['select'])
        else:
          if not last_dimension_seen:
            fields.insert(0, self._get_aggregate_function(f))

      if not last_dimension_seen:
        fields.insert(0, 'COUNT(*) as Count')
      fields.insert(0, self._get_dimension_field(facet)['select'])

      sql = '''SELECT %(fields)s
      FROM %(database)s.%(table)s
      %(filters)s
      GROUP BY %(fields_dimensions)s
      ORDER BY %(order_by)s
      LIMIT %(limit)s''' % {
          'database': database,
          'table': table,
          'fields': ', '.join(fields),
          'fields_dimensions': ', '.join(fields_dimensions),
          'order_by': ', '.join([self._get_dimension_field(f)['order_by'] for f in self._get_dimension_fields(facet)]),
          'filters': self._convert_filters_to_where(filters),
          'limit': LIMIT
      }
    elif facet['type'] == 'function':  # 1 dim only now
      sql = '''SELECT %(fields)s
      FROM %(database)s.%(table)s
      %(filters)s''' % {
          'database': database,
          'table': table,
          'fields': self._get_aggregate_function(facet),
          'filters': self._convert_filters_to_where(filters),
      }
  else:
    fields = Collection2.get_field_list(dashboard)
    sql = "SELECT %(fields)s FROM `%(database)s`.`%(table)s`" % {
        'database': database,
        'table': table,
        'fields': ', '.join(['`%s`' % f if f != '*' else '*' for f in fields])
    }
    if filters:
      sql += ' ' + self._convert_filters_to_where(filters)
    sql += ' LIMIT %s' % LIMIT

  editor = make_notebook(
      name='Execute and watch',
      editor_type=dashboard['engine'],
      statement=sql,
      database=database,
      status='ready-execute',
      skip_historify=True
  )

  response = editor.execute(MockRequest(self.user))

  if 'handle' in response and response['handle'].get('sync'):
    response['result'] = self._convert_result(response['result'], dashboard, facet, query)

  return response
def execute_and_watch(request):
    notebook_id = request.GET.get("editor", request.GET.get("notebook"))
    snippet_id = int(request.GET["snippet"])
    action = request.GET["action"]
    destination = request.GET["destination"]

    notebook = Notebook(document=Document2.objects.get(id=notebook_id)).get_data()
    snippet = notebook["snippets"][snippet_id]
    editor_type = snippet["type"]

    api = get_api(request, snippet)

    if action == "save_as_table":
        sql, success_url = api.export_data_as_table(notebook, snippet, destination)
        editor = make_notebook(
            name="Execute and watch",
            editor_type=editor_type,
            statement=sql,
            status="ready-execute",
            database=snippet["database"],
        )
    elif action == "insert_as_query":
        sql, success_url = api.export_large_data_to_hdfs(notebook, snippet, destination)
        editor = make_notebook(
            name="Execute and watch",
            editor_type=editor_type,
            statement=sql,
            status="ready-execute",
            database=snippet["database"],
        )
    elif action == "index_query":
        sql, success_url = api.export_data_as_table(notebook, snippet, destination, is_temporary=True, location="")
        editor = make_notebook(name="Execute and watch", editor_type=editor_type, statement=sql, status="ready-execute")

        sample = get_api(request, snippet).fetch_result(notebook, snippet, 0, start_over=True)

        from indexer.api3 import _index  # Will be moved to the lib in next commit
        from indexer.fields import Field
        from indexer.file_format import HiveFormat

        file_format = {
            "name": "col",
            "inputFormat": "query",
            "format": {
                "quoteChar": '"',
                "recordSeparator": "\n",
                "type": "csv",
                "hasHeader": False,
                "fieldSeparator": "\u0001",
            },
            "sample": "",
            "columns": [
                Field(col["name"], HiveFormat.FIELD_TYPE_TRANSLATE.get(col["type"], "string")).to_dict()
                for col in sample["meta"]
            ],
        }

        job_handle = _index(request, file_format, destination, query=notebook["uuid"])
        return redirect(reverse("oozie:list_oozie_workflow", kwargs={"job_id": job_handle["handle"]["id"]}))
    else:
        raise PopupException(_("Action %s is unknown") % action)

    return render(
        "editor.mako",
        request,
        {
            "notebooks_json": json.dumps([editor.get_data()]),
            "options_json": json.dumps(
                {
                    "languages": [{"name": "%s SQL" % editor_type.title(), "type": editor_type}],
                    "mode": "editor",
                    "editor_type": editor_type,
                    "success_url": success_url,
                }
            ),
            "editor_type": editor_type,
        },
    )
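To make the index_query branch a bit more concrete, the sketch below mimics how the columns entry of file_format is derived from the result-set metadata. sample_meta and the type-translation table are hypothetical simplifications of sample["meta"] and HiveFormat.FIELD_TYPE_TRANSLATE, and the output dicts stand in for Field(...).to_dict().

# Hypothetical result-set metadata, shaped like sample["meta"] above.
sample_meta = [
    {"name": "id", "type": "int"},
    {"name": "ts", "type": "timestamp"},
    {"name": "payload", "type": "struct<a:int>"},  # type unknown to the translate table
]

# Hypothetical subset of a Hive-to-indexer type translation table.
field_type_translate = {"int": "long", "timestamp": "date"}

# Unknown types fall back to "string", exactly as in the dict.get() call above.
columns = [
    {"name": col["name"], "type": field_type_translate.get(col["type"], "string")}
    for col in sample_meta
]

assert columns == [
    {"name": "id", "type": "long"},
    {"name": "ts", "type": "date"},
    {"name": "payload", "type": "string"},
]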
def _create_table_from_a_file(request, source, destination):
    if '.' in destination['name']:
        database, table_name = destination['name'].split('.', 1)
    else:
        database = 'default'
        table_name = destination['name']
    final_table_name = table_name

    table_format = destination['tableFormat']

    columns = destination['columns']
    partition_columns = destination['partitionColumns']
    kudu_partition_columns = destination['kuduPartitionColumns']
    print kudu_partition_columns
    comment = destination['description']

    source_path = source['path']
    external = not destination['useDefaultLocation']
    external_path = destination['nonDefaultLocation']

    load_data = destination['importData']
    skip_header = destination['hasHeader']

    primary_keys = destination['primaryKeys']

    if destination['useCustomDelimiters']:
        field_delimiter = destination['customFieldDelimiter']
        collection_delimiter = destination['customCollectionDelimiter']
        map_delimiter = destination['customMapDelimiter']
        regexp_delimiter = destination['customRegexp']
    else:
        field_delimiter = ','
        collection_delimiter = r'\\002'
        map_delimiter = r'\\003'
        regexp_delimiter = '.*'

    file_format = 'TextFile'
    row_format = 'Delimited'
    serde_name = ''
    serde_properties = ''
    extra_create_properties = ''
    sql = ''

    if source['inputFormat'] == 'manual':
        load_data = False

    if table_format == 'json':
        row_format = 'serde'
        serde_name = 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
        serde_properties = '''"separatorChar" = "\\t", "quoteChar" = "'", "escapeChar" = "\\\\" '''

    if table_format in ('parquet', 'kudu'):
        if load_data:
            table_name, final_table_name = 'hue__tmp_%s' % table_name, table_name

            sql += '\n\nDROP TABLE IF EXISTS `%(database)s`.`%(table_name)s`;\n' % {
                'database': database,
                'table_name': table_name
            }
        else:
            row_format = ''
            file_format = table_format
            skip_header = False
            if table_format == 'kudu':
                columns = [col for col in columns if col['name'] in primary_keys] + [col for col in columns if col['name'] not in primary_keys]

    if external or (load_data and table_format in ('parquet', 'kudu')):
        if not request.fs.isdir(external_path):  # File selected
            external_path, external_file_name = request.fs.split(external_path)

            if len(request.fs.listdir(external_path)) > 1:
                # Dir contains more than just the file: create a data dir and move the file there.
                external_path = external_path + '/%s_table' % external_file_name
                request.fs.mkdir(external_path)
                request.fs.rename(source_path, external_path)

    sql += django_mako.render_to_string("gen/create_table_statement.mako", {
        'table': {
            'name': table_name,
            'comment': comment,
            'row_format': row_format,
            'field_terminator': field_delimiter,
            'collection_terminator': collection_delimiter,
            'map_key_terminator': map_delimiter,
            'serde_name': serde_name,
            'serde_properties': serde_properties,
            'file_format': file_format,
            'external': external or load_data and table_format in ('parquet', 'kudu'),
            'path': external_path,
            'skip_header': skip_header,
            'primary_keys': primary_keys if table_format == 'kudu' and not load_data else [],
        },
        'columns': columns,
        'partition_columns': partition_columns,
        'kudu_partition_columns': kudu_partition_columns,
        'database': database
    })

    if table_format == 'text' and not external and load_data:
        sql += "\n\nLOAD DATA INPATH '%s' INTO TABLE `%s`.`%s`;" % (source_path, database, table_name)

    if load_data and table_format in ('parquet', 'kudu'):
        file_format = table_format
        if table_format == 'kudu':
            columns_list = ['`%s`' % col for col in primary_keys] + [col['name'] for col in destination['columns'] if col['name'] not in primary_keys]
            extra_create_properties = """PRIMARY KEY (%(primary_keys)s)
DISTRIBUTE BY HASH INTO 16 BUCKETS
STORED AS %(file_format)s
TBLPROPERTIES(
'kudu.num_tablet_replicas' = '1'
)""" % {
                'file_format': file_format,
                'primary_keys': ', '.join(primary_keys)
            }
        else:
            columns_list = ['*']

        sql += '''\n\nCREATE TABLE `%(database)s`.`%(final_table_name)s` %(extra_create_properties)s
AS SELECT %(columns_list)s
FROM `%(database)s`.`%(table_name)s`;''' % {
            'database': database,
            'final_table_name': final_table_name,
            'table_name': table_name,
            'extra_create_properties': extra_create_properties,
            'columns_list': ', '.join(columns_list),
        }

        sql += '\n\nDROP TABLE IF EXISTS `%(database)s`.`%(table_name)s`;\n' % {
            'database': database,
            'table_name': table_name
        }

    editor_type = 'impala' if table_format == 'kudu' else 'hive'

    on_success_url = reverse('metastore:describe_table', kwargs={'database': database, 'table': final_table_name})

    return make_notebook(
        name='Execute and watch',
        editor_type=editor_type,
        statement=sql,
        status='ready',
        database=database,
        on_success_url=on_success_url
    )
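For orientation, here is a hypothetical source/destination pair of the shape this function consumes. The keys mirror the lookups above; the values are made up.

# Hypothetical wizard payload for importing a CSV file into a managed text table.
source = {
    'path': '/user/demo/customers.csv',
    'inputFormat': 'file',
}

destination = {
    'name': 'default.customers',
    'tableFormat': 'text',
    'description': 'Imported from customers.csv',
    'columns': [{'name': 'id', 'type': 'int'}, {'name': 'name', 'type': 'string'}],
    'partitionColumns': [],
    'kuduPartitionColumns': [],
    'useDefaultLocation': True,
    'nonDefaultLocation': '',
    'importData': True,
    'hasHeader': True,
    'primaryKeys': [],
    'useCustomDelimiters': False,
    'customFieldDelimiter': ',',
    'customCollectionDelimiter': r'\002',
    'customMapDelimiter': r'\003',
    'customRegexp': '',
}

# With these values the function renders a CREATE TABLE statement for a delimited TextFile table,
# appends a LOAD DATA INPATH '/user/demo/customers.csv' ... clause, and returns a Hive notebook
# whose on_success_url points at the metastore page of the new table.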