def _upload_workspace(self, morphline):
    """
    Create the job workspace and upload the indexing files into it.

    The workspace path comes from Job.get_workspace(self.username). Every
    filesystem call goes through self.fs.do_as_user with self.username.

    Input:
      morphline: content written to "morphline.conf" in the workspace.

    Output:
      The workspace path that was created.
    """
    hdfs_workspace_path = Job.get_workspace(self.username)
    templates_dir = CONFIG_INDEXING_TEMPLATES_PATH.get()

    # create workspace on hdfs
    self.fs.do_as_user(self.username, self.fs.mkdir, hdfs_workspace_path)
    self.fs.do_as_user(
        self.username,
        self.fs.create,
        os.path.join(hdfs_workspace_path, "morphline.conf"),
        data=morphline)

    # Copy each local template into the workspace under the same file name.
    # The template is read inside a `with` block so the local file handle is
    # closed before the upload instead of being left for the garbage collector.
    for template_name in ("workflow.xml", "log4j.properties"):
        with open(os.path.join(templates_dir, template_name)) as template_file:
            template_content = template_file.read()
        self.fs.do_as_user(
            self.username,
            self.fs.create,
            os.path.join(hdfs_workspace_path, template_name),
            data=template_content)

    return hdfs_workspace_path
def generate_morphline_config(self, collection_name, data, uuid_name=None, lib_path=None):
    """
    Render "morphline_template.conf" for the given collection.

    Input:
      collection_name: passed to the template as 'collection_name'.
      data: mapping read for 'columns' (given to self.get_field_list with
        is_converting_types=True, and counted for 'num_base_fields') and
        'format' (passed through as 'format_settings'; its 'type' entry
        selects the format class).
      uuid_name: passed to the template as 'uuid_name'.
      lib_path: directory holding "GeoLite2-City.mmdb" and
        "grok_dictionaries"; defaults to CONFIG_INDEXER_LIBS_PATH.get().

    Output:
      The rendered template text.
    """
    libs_dir = CONFIG_INDEXER_LIBS_PATH.get() if lib_path is None else lib_path

    geolite_db = os.path.join(libs_dir, "GeoLite2-City.mmdb")
    grok_dictionaries = os.path.join(libs_dir, "grok_dictionaries")

    # Intermediates are computed in the same order the template arguments
    # are listed below.
    fields = self.get_field_list(data['columns'], is_converting_types=True)
    base_field_count = len(data['columns'])
    format_settings = data['format']
    format_class = get_file_format_class(data['format']['type'])

    # A location is only handed to the template when a filesystem is set and
    # the path exists on it; otherwise the template receives None.
    grok_location = None
    if self.fs and self.fs.exists(grok_dictionaries):
        grok_location = grok_dictionaries

    geolite_location = None
    if self.fs and self.fs.exists(geolite_db):
        geolite_location = geolite_db

    zk_host = self.solr_client.get_zookeeper_host()  ## offline test?

    template_args = {
        "collection_name": collection_name,
        "fields": fields,
        "num_base_fields": base_field_count,
        "uuid_name": uuid_name,
        "get_regex": MorphlineIndexer._get_regex_for_type,
        "format_settings": format_settings,
        "format_class": format_class,
        "get_kept_args": get_checked_args,
        "grok_dictionaries_location": grok_location,
        "geolite_db_location": geolite_location,
        "zk_host": zk_host,
    }

    templates_dir = CONFIG_INDEXING_TEMPLATES_PATH.get()
    template = TemplateLookup(directories=[templates_dir]).get_template("morphline_template.conf")
    return template.render(**template_args)
def generate_morphline_config(self, collection_name, data, uuid_name):
    """
    Render "morphline_template.conf" for the given collection.

    Input:
      collection_name: passed to the template as 'collection_name'.
      data: mapping read for two keys:
        'columns': given to self.get_field_list(); its length becomes
          'num_base_fields'. Presumably a list of column descriptors such
          as {'name': ..., 'type': ...} -- confirm against get_field_list.
        'format': passed to the template unchanged as 'format'.
      uuid_name: passed to the template as 'uuid_name'.

    Output:
      Morphline content: the text rendered from the template found in the
      directory returned by CONFIG_INDEXING_TEMPLATES_PATH.get().
    """
    columns = data['columns']

    template_args = dict(
        collection_name=collection_name,
        fields=self.get_field_list(columns),
        num_base_fields=len(columns),
        format_character=Indexer._format_character,
        uuid_name=uuid_name,
        get_regex=Indexer._get_regex_for_type,
        format=data['format'],
        grok_dictionaries_location=os.path.join(
            CONFIG_INDEXER_LIBS_PATH.get(), "grok_dictionaries"),
        zk_host=zkensemble(),
    )

    templates_dir = CONFIG_INDEXING_TEMPLATES_PATH.get()
    template = TemplateLookup(directories=[templates_dir]).get_template("morphline_template.conf")
    return template.render(**template_args)
def _upload_workspace(self, morphline):
    """
    Create the job workspace and upload the indexing files into it.

    The workspace path comes from Job.get_workspace(self.username). Every
    filesystem call goes through self.fs.do_as_user with self.username.

    Input:
      morphline: content written to "morphline.conf" in the workspace.

    Output:
      The workspace path that was created.
    """
    hdfs_workspace_path = Job.get_workspace(self.username)
    templates_dir = CONFIG_INDEXING_TEMPLATES_PATH.get()

    # create workspace on hdfs
    self.fs.do_as_user(self.username, self.fs.mkdir, hdfs_workspace_path)
    self.fs.do_as_user(
        self.username,
        self.fs.create,
        os.path.join(hdfs_workspace_path, "morphline.conf"),
        data=morphline)

    # Copy each local template into the workspace under the same file name.
    # The template is read inside a `with` block so the local file handle is
    # closed before the upload instead of being left for the garbage collector.
    for template_name in ("workflow.xml", "log4j.properties"):
        with open(os.path.join(templates_dir, template_name)) as template_file:
            template_content = template_file.read()
        self.fs.do_as_user(
            self.username,
            self.fs.create,
            os.path.join(hdfs_workspace_path, template_name),
            data=template_content)

    return hdfs_workspace_path
def generate_morphline_config(self, collection_name, data, uuid_name=None):
    """
    Render "morphline_template.conf" for the given collection.

    Input:
      collection_name: passed to the template as 'collection_name'.
      data: mapping read for two keys:
        'columns': given to self.get_field_list(); its length becomes
          'num_base_fields'. Presumably a list of column descriptors such
          as {'name': ..., 'type': ...} -- confirm against get_field_list.
        'format': passed through as 'format_settings'; its 'type' entry is
          given to get_file_format_class().
      uuid_name: passed to the template as 'uuid_name'.

    Output:
      Morphline content: the text rendered from the template found in the
      directory returned by CONFIG_INDEXING_TEMPLATES_PATH.get().
    """
    geolite_db = os.path.join(CONFIG_INDEXER_LIBS_PATH.get(), "GeoLite2-City.mmdb")
    grok_dictionaries = os.path.join(CONFIG_INDEXER_LIBS_PATH.get(), "grok_dictionaries")

    def existing_or_none(path):
        # Hand a location to the template only when a filesystem is set and
        # the path exists on it.
        if self.fs and self.fs.exists(path):
            return path
        return None

    template_args = {}
    template_args["collection_name"] = collection_name
    template_args["fields"] = self.get_field_list(data['columns'])
    template_args["num_base_fields"] = len(data['columns'])
    template_args["uuid_name"] = uuid_name
    template_args["get_regex"] = Indexer._get_regex_for_type
    template_args["format_settings"] = data['format']
    template_args["format_class"] = get_file_format_class(data['format']['type'])
    template_args["get_kept_args"] = get_checked_args
    template_args["grok_dictionaries_location"] = existing_or_none(grok_dictionaries)
    template_args["geolite_db_location"] = existing_or_none(geolite_db)
    template_args["zk_host"] = zkensemble()

    templates_dir = CONFIG_INDEXING_TEMPLATES_PATH.get()
    template_lookup = TemplateLookup(directories=[templates_dir])
    template = template_lookup.get_template("morphline_template.conf")
    return template.render(**template_args)
def generate_morphline_config(self, collection_name, data, uuid_name=None):
    """
    Render "morphline_template.conf" for the given collection.

    Input:
      collection_name: passed to the template as 'collection_name'.
      data: mapping read for 'columns' (given to self.get_field_list() and
        counted for 'num_base_fields') and 'format' (passed through as
        'format_settings'; its 'type' entry selects the format class).
      uuid_name: passed to the template as 'uuid_name'.

    Output:
      The rendered template text.
    """
    geolite_db = os.path.join(CONFIG_INDEXER_LIBS_PATH.get(), "GeoLite2-City.mmdb")
    grok_dictionaries = os.path.join(CONFIG_INDEXER_LIBS_PATH.get(), "grok_dictionaries")

    # Intermediates are computed in the same order the template arguments
    # are listed below.
    fields = self.get_field_list(data['columns'])
    base_field_count = len(data['columns'])
    format_settings = data['format']
    format_class = get_file_format_class(data['format']['type'])

    # Optional resources: None unless a filesystem is set and the path
    # exists on it.
    grok_location = None
    if self.fs and self.fs.exists(grok_dictionaries):
        grok_location = grok_dictionaries

    geolite_location = None
    if self.fs and self.fs.exists(geolite_db):
        geolite_location = geolite_db

    template_args = {
        "collection_name": collection_name,
        "fields": fields,
        "num_base_fields": base_field_count,
        "uuid_name": uuid_name,
        "get_regex": Indexer._get_regex_for_type,
        "format_settings": format_settings,
        "format_class": format_class,
        "get_kept_args": get_checked_args,
        "grok_dictionaries_location": grok_location,
        "geolite_db_location": geolite_location,
        "zk_host": zkensemble(),
    }

    templates_dir = CONFIG_INDEXING_TEMPLATES_PATH.get()
    template = TemplateLookup(directories=[templates_dir]).get_template("morphline_template.conf")
    return template.render(**template_args)