Ejemplo n.º 1
0
    def run_morphline(self, collection_name, morphline, input_path):
        """Submit a managed batch workflow that runs MapReduceIndexerTool.

        Uploads the morphline workspace, builds a Java-action snippet around
        org.apache.solr.hadoop.MapReduceIndexerTool, wraps it in a managed
        workflow document and submits it.

        :param collection_name: target Solr collection to index into
        :param morphline: morphline configuration to upload to the workspace
        :param input_path: HDFS path of the input data (prefixed with ${nameNode})
        :return: id of the submitted workflow job
        """
        workspace_path = self._upload_workspace(morphline)

        # Side files shipped with the job: logging config and the morphline itself.
        job_files = [
            {u'path': u'%s/log4j.properties' % workspace_path, u'type': u'file'},
            {u'path': u'%s/morphline.conf' % workspace_path, u'type': u'file'},
        ]

        # Command line handed to MapReduceIndexerTool (order matters; the
        # trailing positional argument is the input path).
        tool_arguments = [
            u'--morphline-file', u'morphline.conf',
            u'--output-dir', u'${nameNode}/user/%s/indexer' % self.username,
            u'--log4j', u'log4j.properties',
            u'--go-live',
            u'--zk-host', zkensemble(),
            u'--collection', collection_name,
            u'${nameNode}%s' % input_path,
        ]

        snippet_properties = {
            u'files': job_files,
            u'class': u'org.apache.solr.hadoop.MapReduceIndexerTool',
            u'app_jar': CONFIG_INDEXER_LIBS_PATH.get(),
            u'arguments': tool_arguments,
            u'archives': [],
        }

        notebook = make_notebook(
            name='Indexer',
            editor_type='java',
            snippet_properties=snippet_properties).get_data()
        notebook_doc, created = _save_notebook(notebook, self.user)

        workflow_doc = WorkflowBuilder().create_workflow(
            document=notebook_doc,
            user=self.user,
            managed=True,
            name=_("Batch job for %s") % notebook_doc.name)
        workflow = Workflow(document=workflow_doc, user=self.user)

        return _submit_workflow(
            user=self.user,
            fs=self.fs,
            jt=self.jt,
            workflow=workflow,
            mapping=None)
Ejemplo n.º 2
0
  def create_query_document(self, owner, query_type='hive', database='default',
                            name='Test Query', description='Test Query', statement='',
                            files=None, functions=None, settings=None):
      """
      Create and return a query Document2 object.

      :param owner: owner of doc
      :param query_type: hive, impala or spark
      :param database: database name
      :param name: name of document
      :param description: description of document
      :param statement: SQL statement (can be multi-query statement)
      :param files: list of dicts representing files
      :param functions: list of dicts representing functions
      :param settings: list of dicts representing settings
      :return: Document2 object representing query
      """
      supported_types = ('hive', 'impala', 'spark')
      if query_type not in supported_types:
          raise ValueError("Invalid query_type: %s" % query_type)

      notebook = make_notebook(
          name=name,
          description=description,
          editor_type=query_type,
          statement=statement,
          status='ready',
          database=database,
          files=files,
          functions=functions,
          settings=settings)
      notebook_doc, save_as = _save_notebook(notebook.get_data(), owner)
      return notebook_doc
Ejemplo n.º 3
0
    def create_query_document(self,
                              owner,
                              query_type='hive',
                              database='default',
                              name='Test Query',
                              description='Test Query',
                              statement='',
                              files=None,
                              functions=None,
                              settings=None):
        """Create and return a query Document2 object.

        :param owner: owner of doc
        :param query_type: hive, impala or spark
        :param database: database name
        :param name: name of document
        :param description: description of document
        :param statement: SQL statement (can be multi-query statement)
        :param files: list of dicts representing files
        :param functions: list of dicts representing functions
        :param settings: list of dicts representing settings
        :return: Document2 object representing query
        """
        # Only these editor types are valid query documents.
        if query_type not in ('hive', 'impala', 'spark'):
            raise ValueError("Invalid query_type: %s" % query_type)

        notebook_data = make_notebook(name=name, description=description,
                                      editor_type=query_type, statement=statement,
                                      status='ready', database=database, files=files,
                                      functions=functions, settings=settings).get_data()
        notebook_doc, save_as = _save_notebook(notebook_data, owner)
        return notebook_doc
Ejemplo n.º 4
0
  def run_morphline(self, collection_name, morphline, input_path):
    """Submit a managed workflow running MapReduceIndexerTool over *input_path*.

    The morphline config is uploaded to a workspace, then a Java snippet for
    org.apache.solr.hadoop.MapReduceIndexerTool is saved as a notebook,
    wrapped into a managed workflow and submitted.

    :param collection_name: Solr collection to go live against
    :param morphline: morphline configuration contents
    :param input_path: HDFS input path (appended to ${nameNode})
    :return: id of the submitted workflow job
    """
    workspace_path = self._upload_workspace(morphline)

    snippet_properties = {
      u'class': u'org.apache.solr.hadoop.MapReduceIndexerTool',
      u'app_jar': CONFIG_INDEXER_LIBS_PATH.get(),
      # Ship the logging and morphline configs next to the job.
      u'files': [
          {u'path': u'%s/log4j.properties' % workspace_path, u'type': u'file'},
          {u'path': u'%s/morphline.conf' % workspace_path, u'type': u'file'}
      ],
      u'arguments': [
          u'--morphline-file', u'morphline.conf',
          u'--output-dir', u'${nameNode}/user/%s/indexer' % self.username,
          u'--log4j', u'log4j.properties',
          u'--go-live',
          u'--zk-host', zkensemble(),
          u'--collection', collection_name,
          u'${nameNode}%s' % input_path,
      ],
      u'archives': [],
    }

    notebook_data = make_notebook(
        name='Indexer',
        editor_type='java',
        snippet_properties=snippet_properties).get_data()
    notebook_doc, created = _save_notebook(notebook_data, self.user)

    workflow_doc = WorkflowBuilder().create_workflow(
        document=notebook_doc,
        user=self.user,
        managed=True,
        name=_("Batch job for %s") % notebook_doc.name)
    workflow = Workflow(document=workflow_doc, user=self.user)

    return _submit_workflow(
        user=self.user, fs=self.fs, jt=self.jt, workflow=workflow, mapping=None)
Ejemplo n.º 5
0
  def run_morphline(self, request, collection_name, morphline, input_path):
    """Run a morphline batch indexing job via direct notebook execution.

    Unlike the workflow-submitting variant, this builds a Java snippet for
    org.apache.solr.hadoop.MapReduceIndexerTool, saves it as a notebook in
    'running' status and executes it immediately through the notebook API.

    :param request: HTTP request forwarded to the notebook executor
    :param collection_name: Solr collection to go live against
    :param morphline: morphline configuration contents to upload
    :param input_path: HDFS path of the data to index
    :return: job handle returned by the notebook execution
    """
    workspace_path = self._upload_workspace(morphline)

    snippet_properties = {
       u'files': [
           {u'path': u'%s/log4j.properties' % workspace_path, u'type': u'file'},
           {u'path': u'%s/morphline.conf' % workspace_path, u'type': u'file'}
       ],
       u'class': u'org.apache.solr.hadoop.MapReduceIndexerTool',
       u'app_jar': CONFIG_INDEXER_LIBS_PATH.get(),
       u'arguments': [
           u'--morphline-file',
           u'morphline.conf',
           u'--output-dir',
           u'${nameNode}/user/%s/indexer' % self.username,
           u'--log4j',
           u'log4j.properties',
           u'--go-live',
           u'--zk-host',
           zkensemble(),
           u'--collection',
           collection_name,
           input_path,
       ],
       u'archives': [],
    }

    notebook = make_notebook(name='Indexer', editor_type='java', snippet_properties=snippet_properties, status='running').get_data()
    notebook_doc, created = _save_notebook(notebook, self.user)

    # Execute the single Java snippet of the saved notebook as a batch job.
    snippet = {'wasBatchExecuted': True, 'id': notebook['snippets'][0]['id'], 'statement': ''}

    job_handle = _execute_notebook(request, notebook, snippet)

    return job_handle