Exemple #1
0
  def generate_morphline_config(self, collection_name, data, uuid_name):
    """
    Render "morphline_template.conf" for the given collection.

    Input:
    collection_name: passed to the template as 'collection_name'
    data: mapping; only its 'columns' and 'format' entries are read here
    uuid_name: passed to the template as 'uuid_name'

    Output:
    The text produced by rendering the template.
    """
    # Values handed to the template; keys must match what the template expects.
    template_context = dict(
      collection_name=collection_name,
      fields=self.get_field_list(data['columns']),
      num_base_fields=len(data['columns']),
      format_character=Indexer._format_character,
      uuid_name=uuid_name,
      get_regex=Indexer._get_regex_for_type,
      format=data['format'],
      grok_dictionaries_location=os.path.join(CONFIG_INDEXER_LIBS_PATH.get(), "grok_dictionaries"),
      zk_host=zkensemble()
    )

    # The template is looked up in the configured indexing templates directory.
    templates_dir = CONFIG_INDEXING_TEMPLATES_PATH.get()
    template = TemplateLookup(directories=[templates_dir]).get_template("morphline_template.conf")

    return template.render(**template_context)
Exemple #2
0
    def run_morphline(self, request, collection_name, morphline, input_path, query=None):
        """
        Assemble a managed notebook that runs MapReduceIndexerTool and execute it.

        The morphline is handed to self._upload_workspace(); the path it returns
        is where log4j.properties and morphline.conf are referenced from.

        When `query` is truthy it is used as the uuid of a saved document: the
        first snippet of that document is exported as a temporary table, the
        resulting SQL is added as a hive snippet, and `input_path` is replaced
        by the export location.

        Returns the result of notebook.execute(request, batch=True).
        """
        upload_dir = self._upload_workspace(morphline)

        job_notebook = Notebook(name='Indexer job for %s' % collection_name, isManaged=True)

        if query:
            saved_doc = Document2.objects.get_by_uuid(user=self.user, uuid=query)
            saved_data = Notebook(document=saved_doc).get_data()
            first_snippet = saved_data['snippets'][0]

            connector = get_api(request, first_snippet)

            # Only the first four characters of the uuid are used in the names.
            uuid_prefix = saved_data['uuid'][:4]
            table_name = '__hue_%s' % uuid_prefix
            export_dir = '/user/%s/__hue-%s' % (request.user, uuid_prefix)

            # The second element of the returned pair is not used here.
            export_sql, _unused_url = connector.export_data_as_table(
                saved_data,
                first_snippet,
                table_name,
                is_temporary=True,
                location=export_dir)

            # From here on the indexer reads the exported data, not the caller's path.
            input_path = '${nameNode}%s' % export_dir

            job_notebook.add_hive_snippet(first_snippet['database'], export_sql)

        libs_path = CONFIG_INDEXER_LIBS_PATH.get()
        tool_arguments = [
            u'--morphline-file', u'morphline.conf',
            u'--output-dir', u'${nameNode}/user/%s/indexer' % self.username,
            u'--log4j', u'log4j.properties',
            u'--go-live',
            u'--zk-host', zkensemble(),
            u'--collection', collection_name,
            input_path,
        ]
        attached_files = [
            {u'path': u'%s/log4j.properties' % upload_dir, u'type': u'file'},
            {u'path': u'%s/morphline.conf' % upload_dir, u'type': u'file'},
        ]

        job_notebook.add_java_snippet(
            clazz='org.apache.solr.hadoop.MapReduceIndexerTool',
            app_jar=libs_path,
            arguments=tool_arguments,
            files=attached_files)

        return job_notebook.execute(request, batch=True)
Exemple #3
0
    def generate_morphline_config(self, collection_name, data, uuid_name=None):
        """
        Render "morphline_template.conf" for the given collection.

        Input:
        collection_name: passed to the template as 'collection_name'
        data: mapping; 'columns', 'format' and 'format'['type'] are read here
        uuid_name: passed to the template as 'uuid_name' (defaults to None)

        Output:
        The text produced by rendering the template.
        """

        geolite_path = os.path.join(CONFIG_INDEXER_LIBS_PATH.get(), "GeoLite2-City.mmdb")
        grok_path = os.path.join(CONFIG_INDEXER_LIBS_PATH.get(), "grok_dictionaries")

        def existing_or_none(path):
            # Keep the path only when a filesystem is set and reports it exists.
            if self.fs and self.fs.exists(path):
                return path
            return None

        template_context = {
            "collection_name": collection_name,
            "fields": self.get_field_list(data['columns']),
            "num_base_fields": len(data['columns']),
            "uuid_name": uuid_name,
            "get_regex": Indexer._get_regex_for_type,
            "format_settings": data['format'],
            "format_class": get_file_format_class(data['format']['type']),
            "get_kept_args": get_checked_args,
            "grok_dictionaries_location": existing_or_none(grok_path),
            "geolite_db_location": existing_or_none(geolite_path),
            "zk_host": zkensemble()
        }

        # The template is looked up in the configured indexing templates directory.
        templates_dir = CONFIG_INDEXING_TEMPLATES_PATH.get()
        template = TemplateLookup(directories=[templates_dir]).get_template("morphline_template.conf")

        return template.render(**template_context)
Exemple #4
0
    def run_morphline(self, collection_name, morphline, input_path):
        """
        Save a java notebook for MapReduceIndexerTool and submit it as a workflow.

        The morphline is handed to self._upload_workspace(); the path it returns
        is where log4j.properties and morphline.conf are referenced from.
        `input_path` is prefixed with '${nameNode}' before being passed to the tool.

        Returns the value returned by _submit_workflow().
        """
        upload_dir = self._upload_workspace(morphline)

        log4j_file = {u'path': u'%s/log4j.properties' % upload_dir, u'type': u'file'}
        morphline_file = {u'path': u'%s/morphline.conf' % upload_dir, u'type': u'file'}

        java_properties = {
            u'files': [log4j_file, morphline_file],
            u'class': u'org.apache.solr.hadoop.MapReduceIndexerTool',
            u'app_jar': CONFIG_INDEXER_LIBS_PATH.get(),
            u'arguments': [
                u'--morphline-file', u'morphline.conf',
                u'--output-dir', u'${nameNode}/user/%s/indexer' % self.username,
                u'--log4j', u'log4j.properties',
                u'--go-live',
                u'--zk-host', zkensemble(),
                u'--collection', collection_name,
                u'${nameNode}%s' % input_path,
            ],
            u'archives': [],
        }

        indexer_notebook = make_notebook(
            name='Indexer',
            editor_type='java',
            snippet_properties=java_properties)
        notebook_data = indexer_notebook.get_data()

        # The second element of the returned pair is not used here.
        notebook_doc, _created = _save_notebook(notebook_data, self.user)

        # Wrap the saved notebook document into a managed workflow.
        workflow_doc = WorkflowBuilder().create_workflow(
            document=notebook_doc,
            user=self.user,
            managed=True,
            name=_("Batch job for %s") % notebook_doc.name)
        batch_workflow = Workflow(document=workflow_doc, user=self.user)

        return _submit_workflow(
            user=self.user,
            fs=self.fs,
            jt=self.jt,
            workflow=batch_workflow,
            mapping=None)
Exemple #5
0
  def run_morphline(self, request, collection_name, morphline, input_path, query=None):
    """
    Assemble a managed notebook that runs MapReduceIndexerTool and execute it.

    The path returned by self._upload_workspace(morphline) is where
    log4j.properties and morphline.conf are referenced from.

    When `query` is truthy it is used as the uuid of a saved document whose
    first snippet is exported as a temporary table; the export SQL is added as
    a hive snippet and `input_path` is replaced by the export location.

    Returns the result of notebook.execute(request, batch=True).
    """
    workspace = self._upload_workspace(morphline)
    indexer_job = Notebook(name='Indexer job for %s' % collection_name, isManaged=True)

    if query:
      query_doc = Document2.objects.get_by_uuid(user=self.user, uuid=query)
      query_data = Notebook(document=query_doc).get_data()
      source_snippet = query_data['snippets'][0]
      source_api = get_api(request, source_snippet)

      # Table name and export directory both embed the first 4 uuid characters.
      short_uuid = query_data['uuid'][:4]
      temp_table = '__hue_%s' % short_uuid
      temp_location = '/user/%s/__hue-%s' % (request.user, short_uuid)

      # The second element of the returned pair is not used here.
      statement, _ignored_url = source_api.export_data_as_table(
          query_data, source_snippet, temp_table, is_temporary=True, location=temp_location
      )

      # The indexer now reads the exported data instead of the caller's path.
      input_path = '${nameNode}%s' % temp_location
      indexer_job.add_hive_snippet(source_snippet['database'], statement)

    jar_path = CONFIG_INDEXER_LIBS_PATH.get()
    cli_arguments = [
        u'--morphline-file', u'morphline.conf',
        u'--output-dir', u'${nameNode}/user/%s/indexer' % self.username,
        u'--log4j', u'log4j.properties',
        u'--go-live',
        u'--zk-host', zkensemble(),
        u'--collection', collection_name,
        input_path,
    ]
    # Both attached files live directly under the uploaded workspace.
    workspace_files = [
        {u'path': path_pattern % workspace, u'type': u'file'}
        for path_pattern in (u'%s/log4j.properties', u'%s/morphline.conf')
    ]

    indexer_job.add_java_snippet(
      clazz='org.apache.solr.hadoop.MapReduceIndexerTool',
      app_jar=jar_path,
      arguments=cli_arguments,
      files=workspace_files
    )

    return indexer_job.execute(request, batch=True)
Exemple #6
0
  def _schedule_oozie_job(self, workspace_path, collection_name, input_path):
    """
    Build the job properties and run a Submission against workspace_path.

    Returns whatever Submission.run(workspace_path) returns (the job id,
    going by how the result has been named in this code).
    """
    # The returned object is not used below; the call itself is kept unchanged.
    # NOTE(review): confirm whether get_oozie() is still needed here.
    get_oozie(self.username)

    job_properties = dict([
      ("dryrun", "False"),
      ("zkHost", zkensemble()),
      # these libs can be installed from here:
      # https://drive.google.com/a/cloudera.com/folderview?id=0B1gZoK8Ae1xXc0sxSkpENWJ3WUU&usp=sharing
      ("oozie.libpath", CONFIG_INDEXER_LIBS_PATH.get()),
      ("security_enabled", "False"),
      ("collectionName", collection_name),
      ("filePath", input_path),
      ("outputDir", "/user/%s/indexer" % self.username),
      ("workspacePath", workspace_path),
      ('oozie.wf.application.path', "${nameNode}%s" % workspace_path),
      ('user.name', self.username),
    ])

    return Submission(self.username, fs=self.fs, properties=job_properties).run(workspace_path)
Exemple #7
0
    def generate_morphline_config(self, collection_name, data, uuid_name=None):
        """
        Render "morphline_template.conf" for the given collection.

        `data` is read through its 'columns' and 'format' entries (including
        'format'['type']). The grok dictionaries and GeoLite database paths are
        passed to the template only when self.fs is set and reports that the
        path exists; otherwise None is passed.

        Returns the rendered template text.
        """
        geo_db_path = os.path.join(CONFIG_INDEXER_LIBS_PATH.get(), "GeoLite2-City.mmdb")
        grok_dir_path = os.path.join(CONFIG_INDEXER_LIBS_PATH.get(), "grok_dictionaries")

        render_args = {
            "collection_name": collection_name,
            "fields": self.get_field_list(data['columns']),
            "num_base_fields": len(data['columns']),
            "uuid_name": uuid_name,
            "get_regex": Indexer._get_regex_for_type,
            "format_settings": data['format'],
            "format_class": get_file_format_class(data['format']['type']),
            "get_kept_args": get_checked_args,
            # Optional resources default to None and are filled in below.
            "grok_dictionaries_location": None,
            "geolite_db_location": None,
        }

        if self.fs and self.fs.exists(grok_dir_path):
            render_args["grok_dictionaries_location"] = grok_dir_path
        if self.fs and self.fs.exists(geo_db_path):
            render_args["geolite_db_location"] = geo_db_path

        render_args["zk_host"] = zkensemble()

        # The template is looked up in the configured indexing templates directory.
        templates_dir = CONFIG_INDEXING_TEMPLATES_PATH.get()
        template_lookup = TemplateLookup(directories=[templates_dir])
        template = template_lookup.get_template("morphline_template.conf")

        return template.render(**render_args)
Exemple #8
0
    def _schedule_oozie_job(self, workspace_path, collection_name, input_path):
        """
        Build the job properties and run a Submission against workspace_path.

        Returns whatever Submission.run(workspace_path) returns (the job id,
        going by how the result has been named in this code).
        """
        # Result is not used below; the call is kept as in the original code.
        # NOTE(review): confirm whether get_oozie() is still needed here.
        _oozie_client = get_oozie(self.username)

        zk_hosts = zkensemble()
        # these libs can be installed from here:
        # https://drive.google.com/a/cloudera.com/folderview?id=0B1gZoK8Ae1xXc0sxSkpENWJ3WUU&usp=sharing
        libs_path = CONFIG_INDEXER_LIBS_PATH.get()
        output_dir = "/user/%s/indexer" % self.username
        application_path = "${nameNode}%s" % workspace_path

        job_properties = {
            "dryrun": "False",
            "zkHost": zk_hosts,
            "oozie.libpath": libs_path,
            "security_enabled": "False",
            "collectionName": collection_name,
            "filePath": input_path,
            "outputDir": output_dir,
            "workspacePath": workspace_path,
            'oozie.wf.application.path': application_path,
            'user.name': self.username
        }

        job_submission = Submission(self.username, fs=self.fs, properties=job_properties)
        return job_submission.run(workspace_path)
Exemple #9
0
  def run_morphline(self, collection_name, morphline, input_path):
    """
    Save a java notebook for MapReduceIndexerTool and submit it as a workflow.

    The path returned by self._upload_workspace(morphline) is where
    log4j.properties and morphline.conf are referenced from. `input_path` is
    prefixed with '${nameNode}' before being passed to the tool.

    Returns the value returned by _submit_workflow().
    """
    upload_dir = self._upload_workspace(morphline)

    # Both attached files live directly under the uploaded workspace.
    attached_files = [
        {u'path': path_pattern % upload_dir, u'type': u'file'}
        for path_pattern in (u'%s/log4j.properties', u'%s/morphline.conf')
    ]
    jar_location = CONFIG_INDEXER_LIBS_PATH.get()
    tool_arguments = [
        u'--morphline-file', u'morphline.conf',
        u'--output-dir', u'${nameNode}/user/%s/indexer' % self.username,
        u'--log4j', u'log4j.properties',
        u'--go-live',
        u'--zk-host', zkensemble(),
        u'--collection', collection_name,
        u'${nameNode}%s' % input_path,
    ]

    java_snippet = {
      u'files': attached_files,
      u'class': u'org.apache.solr.hadoop.MapReduceIndexerTool',
      u'app_jar': jar_location,
      u'arguments': tool_arguments,
      u'archives': [],
    }

    notebook_data = make_notebook(
        name='Indexer',
        editor_type='java',
        snippet_properties=java_snippet
    ).get_data()
    # The second element of the returned pair is not used here.
    saved_doc, _was_created = _save_notebook(notebook_data, self.user)

    # Wrap the saved notebook document into a managed workflow.
    builder = WorkflowBuilder()
    workflow_doc = builder.create_workflow(
        document=saved_doc,
        user=self.user,
        managed=True,
        name=_("Batch job for %s") % saved_doc.name
    )
    batch_workflow = Workflow(document=workflow_doc, user=self.user)

    return _submit_workflow(
        user=self.user,
        fs=self.fs,
        jt=self.jt,
        workflow=batch_workflow,
        mapping=None
    )
Exemple #10
0
  def generate_morphline_config(self, collection_name, data, uuid_name=None):
    """
    Render "morphline_template.conf" for the given collection.

    `data` is read through its 'columns' and 'format' entries (including
    'format'['type']). Each optional resource path (grok dictionaries, GeoLite
    database) is passed to the template only when self.fs is set and reports
    that the path exists; otherwise None is passed.

    Returns the rendered template text.
    """
    geolite_path = os.path.join(CONFIG_INDEXER_LIBS_PATH.get(), "GeoLite2-City.mmdb")
    grok_path = os.path.join(CONFIG_INDEXER_LIBS_PATH.get(), "grok_dictionaries")

    template_vars = {
      "collection_name": collection_name,
      "fields": self.get_field_list(data['columns']),
      "num_base_fields": len(data['columns']),
      "uuid_name": uuid_name,
      "get_regex": Indexer._get_regex_for_type,
      "format_settings": data['format'],
      "format_class": get_file_format_class(data['format']['type']),
      "get_kept_args": get_checked_args,
    }

    # Optional resources, checked in this order: grok dictionaries, then GeoLite.
    optional_resources = (
      ("grok_dictionaries_location", grok_path),
      ("geolite_db_location", geolite_path),
    )
    for template_key, candidate in optional_resources:
      if self.fs and self.fs.exists(candidate):
        template_vars[template_key] = candidate
      else:
        template_vars[template_key] = None

    template_vars["zk_host"] = zkensemble()

    # The template is looked up in the configured indexing templates directory.
    templates_dir = CONFIG_INDEXING_TEMPLATES_PATH.get()
    template = TemplateLookup(directories=[templates_dir]).get_template("morphline_template.conf")

    return template.render(**template_vars)
Exemple #11
0
  def run_morphline(self, request, collection_name, morphline, input_path):
    """
    Save a java notebook for MapReduceIndexerTool and execute it as a batch.

    The path returned by self._upload_workspace(morphline) is where
    log4j.properties and morphline.conf are referenced from. `input_path` is
    passed to the tool unchanged.

    Returns the value returned by _execute_notebook().
    """
    upload_dir = self._upload_workspace(morphline)

    log4j_file = {u'path': u'%s/log4j.properties' % upload_dir, u'type': u'file'}
    morphline_file = {u'path': u'%s/morphline.conf' % upload_dir, u'type': u'file'}

    java_properties = {
       u'files': [log4j_file, morphline_file],
       u'class': u'org.apache.solr.hadoop.MapReduceIndexerTool',
       u'app_jar': CONFIG_INDEXER_LIBS_PATH.get(),
       u'arguments': [
           u'--morphline-file', u'morphline.conf',
           u'--output-dir', u'${nameNode}/user/%s/indexer' % self.username,
           u'--log4j', u'log4j.properties',
           u'--go-live',
           u'--zk-host', zkensemble(),
           u'--collection', collection_name,
           input_path,
       ],
       u'archives': [],
    }

    indexer_notebook = make_notebook(
        name='Indexer',
        editor_type='java',
        snippet_properties=java_properties,
        status='running'
    )
    notebook_data = indexer_notebook.get_data()

    # Neither element of the returned pair is used afterwards.
    _saved_doc, _was_created = _save_notebook(notebook_data, self.user)

    # Minimal snippet descriptor: reuses the id of the notebook's first snippet
    # and marks it as batch-executed, with an empty statement.
    batch_snippet = {
        'wasBatchExecuted': True,
        'id': notebook_data['snippets'][0]['id'],
        'statement': ''
    }

    return _execute_notebook(request, notebook_data, batch_snippet)