Ejemplo n.º 1
0
    def _upload_workspace(self, morphline):
        """
        Create the job workspace and upload the indexing files into it.

        Three files are written into the workspace returned by
        Job.get_workspace(self.username):
          - "morphline.conf": the given morphline content
          - "workflow.xml": copied from the local templates directory
          - "log4j.properties": copied from the local templates directory

        The templates directory is CONFIG_INDEXING_TEMPLATES_PATH.get().
        Every filesystem call is issued through self.fs.do_as_user with
        self.username.

        Input:
        morphline: content to store as "morphline.conf"
        Output:
        Path of the workspace that was populated
        """
        hdfs_workspace_path = Job.get_workspace(self.username)
        hdfs_morphline_path = os.path.join(hdfs_workspace_path,
                                           "morphline.conf")
        hdfs_workflow_path = os.path.join(hdfs_workspace_path, "workflow.xml")
        hdfs_log4j_properties_path = os.path.join(hdfs_workspace_path,
                                                  "log4j.properties")

        workflow_template_path = os.path.join(
            CONFIG_INDEXING_TEMPLATES_PATH.get(), "workflow.xml")
        log4j_template_path = os.path.join(
            CONFIG_INDEXING_TEMPLATES_PATH.get(), "log4j.properties")

        # create workspace on hdfs
        self.fs.do_as_user(self.username, self.fs.mkdir, hdfs_workspace_path)

        self.fs.do_as_user(self.username,
                           self.fs.create,
                           hdfs_morphline_path,
                           data=morphline)

        # Read each local template inside a context manager so the file
        # handle is closed deterministically instead of being left to the
        # garbage collector.
        with open(workflow_template_path) as workflow_template:
            workflow_content = workflow_template.read()
        self.fs.do_as_user(self.username,
                           self.fs.create,
                           hdfs_workflow_path,
                           data=workflow_content)

        with open(log4j_template_path) as log4j_template:
            log4j_content = log4j_template.read()
        self.fs.do_as_user(self.username,
                           self.fs.create,
                           hdfs_log4j_properties_path,
                           data=log4j_content)

        return hdfs_workspace_path
Ejemplo n.º 2
0
  def generate_morphline_config(self, collection_name, data, uuid_name=None, lib_path=None):
    """
    Render "morphline_template.conf" for the given collection.

    Input:
    collection_name: passed to the template as "collection_name"
    data: mapping read for its 'columns' entry and its 'format' entry
          (the latter must itself provide a 'type' entry)
    uuid_name: passed to the template as "uuid_name"
    lib_path: directory holding "GeoLite2-City.mmdb" and "grok_dictionaries";
              falls back to CONFIG_INDEXER_LIBS_PATH.get() when None

    Output:
    The rendered template
    """
    libs_dir = CONFIG_INDEXER_LIBS_PATH.get() if lib_path is None else lib_path

    geolite_db = os.path.join(libs_dir, "GeoLite2-City.mmdb")
    grok_dictionaries = os.path.join(libs_dir, "grok_dictionaries")

    field_list = self.get_field_list(data['columns'], is_converting_types=True)
    base_field_count = len(data['columns'])
    format_settings = data['format']
    format_class = get_file_format_class(data['format']['type'])

    # Each optional resource is only handed to the template when a filesystem
    # is available and reports that the path exists; otherwise None is passed.
    if not (self.fs and self.fs.exists(grok_dictionaries)):
      grok_dictionaries = None
    if not (self.fs and self.fs.exists(geolite_db)):
      geolite_db = None

    zookeeper_host = self.solr_client.get_zookeeper_host() ## offline test?

    template_args = {
      "collection_name": collection_name,
      "fields": field_list,
      "num_base_fields": base_field_count,
      "uuid_name": uuid_name,
      "get_regex": MorphlineIndexer._get_regex_for_type,
      "format_settings": format_settings,
      "format_class": format_class,
      "get_kept_args": get_checked_args,
      "grok_dictionaries_location": grok_dictionaries,
      "geolite_db_location": geolite_db,
      "zk_host": zookeeper_host
    }

    templates_dir = CONFIG_INDEXING_TEMPLATES_PATH.get()
    template = TemplateLookup(directories=[templates_dir]).get_template("morphline_template.conf")

    return template.render(**template_args)
Ejemplo n.º 3
0
  def generate_morphline_config(self, collection_name, data, uuid_name):
    """
    Render "morphline_template.conf" for the given collection.

    Input:
    collection_name: passed to the template as "collection_name"
    data: mapping; this method reads data['columns'] and data['format'].
          NOTE(review): the original notes described a richer shape, roughly
            {
              'type': {'name': 'My New Collection!', 'format': 'csv',
                       'columns': [{'name': 'business_id', 'included': True,
                                    'type': 'string'}, ...],
                       'fieldSeparator': ',', 'recordSeparator': <newline>,
                       'quoteChar': '"'},
              'transformation': [
                {'country_code': {'replace': {'FRA': 'FR', 'CAN': 'CA', ...}}},
                {'ip': {'geoIP': ...}}
              ]
            }
          which does not match the keys read here; confirm against callers.
    uuid_name: passed to the template as "uuid_name"

    Output:
    The rendered morphline content (described in the original notes as
    'SOLR_LOCATOR : { ...}')
    """
    field_list = self.get_field_list(data['columns'])
    base_field_count = len(data['columns'])
    file_format = data['format']
    grok_dictionaries = os.path.join(CONFIG_INDEXER_LIBS_PATH.get(), "grok_dictionaries")
    zookeeper_ensemble = zkensemble()

    template_args = {
      "collection_name": collection_name,
      "fields": field_list,
      "num_base_fields": base_field_count,
      "format_character": Indexer._format_character,
      "uuid_name": uuid_name,
      "get_regex": Indexer._get_regex_for_type,
      "format": file_format,
      "grok_dictionaries_location": grok_dictionaries,
      "zk_host": zookeeper_ensemble
    }

    templates_dir = CONFIG_INDEXING_TEMPLATES_PATH.get()
    template = TemplateLookup(directories=[templates_dir]).get_template("morphline_template.conf")

    return template.render(**template_args)
Ejemplo n.º 4
0
  def _upload_workspace(self, morphline):
    """
    Create the job workspace and upload the indexing files into it.

    Three files are written into the workspace returned by
    Job.get_workspace(self.username):
      - "morphline.conf": the given morphline content
      - "workflow.xml": copied from the local templates directory
      - "log4j.properties": copied from the local templates directory

    The templates directory is CONFIG_INDEXING_TEMPLATES_PATH.get(). Every
    filesystem call is issued through self.fs.do_as_user with self.username.

    Input:
    morphline: content to store as "morphline.conf"
    Output:
    Path of the workspace that was populated
    """
    hdfs_workspace_path = Job.get_workspace(self.username)
    hdfs_morphline_path = os.path.join(hdfs_workspace_path, "morphline.conf")
    hdfs_workflow_path = os.path.join(hdfs_workspace_path, "workflow.xml")
    hdfs_log4j_properties_path = os.path.join(hdfs_workspace_path, "log4j.properties")

    workflow_template_path = os.path.join(CONFIG_INDEXING_TEMPLATES_PATH.get(), "workflow.xml")
    log4j_template_path = os.path.join(CONFIG_INDEXING_TEMPLATES_PATH.get(), "log4j.properties")

    # create workspace on hdfs
    self.fs.do_as_user(self.username, self.fs.mkdir, hdfs_workspace_path)

    self.fs.do_as_user(self.username, self.fs.create, hdfs_morphline_path, data=morphline)

    # Read each local template inside a context manager so the file handle is
    # closed deterministically instead of being left to the garbage collector.
    with open(workflow_template_path) as workflow_template:
      workflow_content = workflow_template.read()
    self.fs.do_as_user(self.username, self.fs.create, hdfs_workflow_path, data=workflow_content)

    with open(log4j_template_path) as log4j_template:
      log4j_content = log4j_template.read()
    self.fs.do_as_user(self.username, self.fs.create, hdfs_log4j_properties_path, data=log4j_content)

    return hdfs_workspace_path
Ejemplo n.º 5
0
    def generate_morphline_config(self, collection_name, data, uuid_name=None):
        """
        Render "morphline_template.conf" for the given collection.

        Input:
        collection_name: passed to the template as "collection_name"
        data: mapping; this method reads data['columns'] and data['format']
              (the latter must itself provide a 'type' entry).
              NOTE(review): the original notes described a shape with 'type'
              and 'transformation' entries (e.g. a 'replace' mapping for
              'country_code' and a 'geoIP' step for 'ip'); that does not match
              the keys read here, so confirm against callers.
        uuid_name: passed to the template as "uuid_name"

        Output:
        The rendered morphline content (described in the original notes as
        'SOLR_LOCATOR : { ...}')
        """
        # The libs path is looked up once per resource, in this order.
        geolite_db, grok_dictionaries = [
            os.path.join(CONFIG_INDEXER_LIBS_PATH.get(), resource_name)
            for resource_name in ("GeoLite2-City.mmdb", "grok_dictionaries")
        ]

        # Optional resources are passed as None unless a filesystem is
        # available and reports that the path exists.
        template_args = dict(
            collection_name=collection_name,
            fields=self.get_field_list(data['columns']),
            num_base_fields=len(data['columns']),
            uuid_name=uuid_name,
            get_regex=Indexer._get_regex_for_type,
            format_settings=data['format'],
            format_class=get_file_format_class(data['format']['type']),
            get_kept_args=get_checked_args,
            grok_dictionaries_location=(
                grok_dictionaries
                if self.fs and self.fs.exists(grok_dictionaries) else None),
            geolite_db_location=(
                geolite_db
                if self.fs and self.fs.exists(geolite_db) else None),
            zk_host=zkensemble(),
        )

        templates_dir = CONFIG_INDEXING_TEMPLATES_PATH.get()
        template_lookup = TemplateLookup(directories=[templates_dir])
        template = template_lookup.get_template("morphline_template.conf")

        return template.render(**template_args)
Ejemplo n.º 6
0
  def generate_morphline_config(self, collection_name, data, uuid_name=None):
    """
    Render "morphline_template.conf" for the given collection.

    Input:
    collection_name: passed to the template as "collection_name"
    data: mapping read for its 'columns' entry and its 'format' entry
          (the latter must itself provide a 'type' entry)
    uuid_name: passed to the template as "uuid_name"

    Output:
    The rendered template
    """
    geolite_db = os.path.join(CONFIG_INDEXER_LIBS_PATH.get(), "GeoLite2-City.mmdb")
    grok_dictionaries = os.path.join(CONFIG_INDEXER_LIBS_PATH.get(), "grok_dictionaries")

    field_list = self.get_field_list(data['columns'])
    base_field_count = len(data['columns'])
    format_settings = data['format']
    format_class = get_file_format_class(data['format']['type'])

    # Each optional resource is only handed to the template when a filesystem
    # is available and reports that the path exists; otherwise None is passed.
    if not (self.fs and self.fs.exists(grok_dictionaries)):
      grok_dictionaries = None
    if not (self.fs and self.fs.exists(geolite_db)):
      geolite_db = None

    zookeeper_ensemble = zkensemble()

    template_args = {
      "collection_name": collection_name,
      "fields": field_list,
      "num_base_fields": base_field_count,
      "uuid_name": uuid_name,
      "get_regex": Indexer._get_regex_for_type,
      "format_settings": format_settings,
      "format_class": format_class,
      "get_kept_args": get_checked_args,
      "grok_dictionaries_location": grok_dictionaries,
      "geolite_db_location": geolite_db,
      "zk_host": zookeeper_ensemble
    }

    templates_dir = CONFIG_INDEXING_TEMPLATES_PATH.get()
    template = TemplateLookup(directories=[templates_dir]).get_template("morphline_template.conf")

    return template.render(**template_args)