Example #1
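Unit tests for EnvelopeIndexer.generate_config(): each test builds a properties dict for one source/sink pairing (Kafka to a CSV file, Kafka to a Solr index, SFDC to a Hive table, and a file back out to Kafka) and asserts that the rendered Envelope configuration contains the expected steps.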
from nose.tools import assert_true

from indexer.indexers.envelope import EnvelopeIndexer  # imports as in Hue's test suite (assumed paths)


def test_generate_from_kafka_to_file_csv():
    properties = {
        'app_name': 'Ingest',
        'inputFormat': 'stream',
        'streamSelection': 'kafka',
        'brokers': 'broker:9092',
        'topics': 'kafkaTopic',
        'kafkaFieldType': 'delimited',
        'kafkaFieldDelimiter': ',',
        'kafkaFieldNames': 'id,name',
        'kafkaFieldTypes': 'int,string',
        'ouputFormat': 'file',
        'input_path': '/tmp/output',
        'format': 'csv'
    }

    config = EnvelopeIndexer(username='******').generate_config(properties)

    assert_true(
        '''steps {
    inputdata {
        input {
            type = kafka
                brokers = "broker:9092"
                topics = kafkaTopic
                encoding = string
                translator {
                    type = delimited
                    delimiter = ","
                    field.names = [id,name]
                    field.types = [int,string]
                }
                window {
                    enabled = true
                    milliseconds = 60000
                }

        }
    }

    outputdata {
        dependencies = [inputdata]

        deriver {
          type = sql
          query.literal = """SELECT * from inputdata"""
        }

        planner = {
          type = overwrite
        }
        output = {
          type = filesystem
          path = /tmp/output
          format = csv
          header = true
        }
    }
}
''' in config, config)


def test_generate_from_stream_kafka_to_solr_index():
    properties = {
        'app_name': 'Ingest',

        'inputFormat': 'stream',
        'streamSelection': 'kafka',
        'brokers': 'broker:9092',
        'topics': 'kafkaTopic',
        'kafkaFieldType': 'delimited',
        'kafkaFieldDelimiter': ',',
        'kafkaFieldNames': 'id,name',
        'kafkaFieldTypes': 'int,string',

        'ouputFormat': 'index',
        'connection': 'http://hue.com:8983/solr/',
        'collectionName': 'traffic'
    }

    config = EnvelopeIndexer(username='******').generate_config(properties)

    assert_true('''steps {
    inputdata {
        input {
            type = kafka
                brokers = "broker:9092"
                topics = kafkaTopic
                encoding = string
                translator {
                    type = delimited
                    delimiter = ","
                    field.names = [id,name]
                    field.types = [int,string]
                }
                window {
                    enabled = true
                    milliseconds = 60000
                }

        }
    }

    outputdata {
        dependencies = [inputdata]

        deriver {
          type = sql
          query.literal = """SELECT * from inputdata"""
        }

        planner {
            type = upstert
        }
        output {
            type = solr
            connection = "http://hue.com:8983/solr/"
            collection.name = "traffic"
        }
    }
}''' in config, config)


def test_generate_from_stream_sfdc_to_hive_table():
    properties = {
        'app_name': 'Ingest',

        'inputFormat': 'stream',
        'streamSelection': 'sfdc',
        'streamUsername': '******',
        'streamPassword': '******',
        'streamToken': 'token',
        'streamEndpointUrl': 'http://sfdc/api',
        'streamObject': 'Opportunities',

        'ouputFormat': 'table',
        'output_table': 'sfdc',
        'format': 'text'
    }

    config = EnvelopeIndexer(username='******').generate_config(properties)

    assert_true('''steps {
    inputdata {
        input {
            type = sfdc
            mode = fetch-all
            sobject = Opportunities
            sfdc: {
              partner: {
                username = "******"
                password = "******"
                token = "token"
                auth-endpoint = "http://sfdc/api"
              }
            }

        }
    }

    outputdata {
        dependencies = [inputdata]

        deriver {
          type = sql
          query.literal = """SELECT * from inputdata"""
        }

          planner {
              type = append
          }
          output {
              type = hive
              table.name = "sfdc"
          }
    }
}''' in config, config)


def test_generate_from_file_to_kafka():
    properties = {
        'app_name': 'Ingest',

        'inputFormat': 'file',
        'input_path': '/tmp/output',
        'format': 'csv',

        'ouputFormat': 'stream',
        'streamSelection': 'kafka',
        'brokers': 'broker:9092',
        'topics': 'kafkaTopic',
        'kafkaFieldType': 'delimited',
    }

    config = EnvelopeIndexer(username='******').generate_config(properties)

    assert_true('''steps {
    inputdata {
        input {
            type = filesystem
        path = /tmp/output
        format = csv

        }
    }

    outputdata {
        dependencies = [inputdata]

        deriver {
          type = sql
          query.literal = """SELECT * from inputdata"""
        }


        planner {
            type = append
        }
        output {
            type = kafka
            brokers = "broker:9092"
            topic = kafkaTopic
            serializer.type = delimited
            serializer.field.delimiter = ","
        }
    }
}
''' in config, config)
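For reference outside the test suite, here is a minimal sketch of driving the same API by hand. The import path is an assumption based on Hue's indexer app layout, and in this version generate_config() returns the rendered configuration as a string; note that the misspelled 'ouputFormat' key is the one the indexer actually reads.

# A sketch, not from the Hue sources: render an Envelope config for a
# Kafka -> Solr pipeline and save it for inspection.
from indexer.indexers.envelope import EnvelopeIndexer  # assumed import path

properties = {
    'app_name': 'Ingest',
    'inputFormat': 'stream',
    'streamSelection': 'kafka',
    'brokers': 'broker:9092',
    'topics': 'kafkaTopic',
    'kafkaFieldType': 'delimited',
    'kafkaFieldDelimiter': ',',
    'kafkaFieldNames': 'id,name',
    'kafkaFieldTypes': 'int,string',
    'ouputFormat': 'index',  # sic: this misspelled key is what EnvelopeIndexer expects
    'connection': 'http://hue.com:8983/solr/',
    'collectionName': 'traffic',
}

config = EnvelopeIndexer(username='hue').generate_config(properties)

with open('/tmp/envelope.conf', 'w') as f:
    f.write(config)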
Example #5
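The _envelope_job() helper from Hue's indexer API: it maps the submitted file_format and destination onto the properties dict that EnvelopeIndexer.generate_config() consumes, creating the target table or Solr collection when needed, then submits the job via indexer.run().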
def _envelope_job(request, file_format, destination, start_time=None, lib_path=None):
  collection_name = destination['name']
  indexer = EnvelopeIndexer(request.user, request.fs)

  lib_path = '/tmp/envelope-0.5.0.jar'
  input_path = None

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'file':
    input_path = '${nameNode}%s' % file_format["path"]
    properties = {
      'format': 'json'
    }
  elif file_format['inputFormat'] == 'stream':
    if file_format['streamSelection'] == 'sfdc':
      properties = {
        'streamSelection': file_format['streamSelection'],
        'streamUsername': file_format['streamUsername'],
        'streamPassword': file_format['streamPassword'],
        'streamToken': file_format['streamToken'],
        'streamEndpointUrl': file_format['streamEndpointUrl'],
        'streamObject': file_format['streamObject'],
      }
    elif file_format['streamSelection'] == 'kafka':
      manager = ManagerApi()
      properties = {
        "brokers": manager.get_kafka_brokers(),
        "output_table": "impala::%s" % collection_name,
        "topics": file_format['kafkaSelectedTopics'],
        "kafkaFieldType": file_format['kafkaFieldType'],
        "kafkaFieldDelimiter": file_format['kafkaFieldDelimiter'],
        "kafkaFieldNames": file_format['kafkaFieldNames'],
        "kafkaFieldTypes": file_format['kafkaFieldTypes']
      }

    if destination['outputFormat'] == 'table':
      if destination['isTargetExisting']:
        # Todo: check if format matches
        pass
      else:
        sql = SQLIndexer(user=request.user, fs=request.fs).create_table_from_a_file(file_format, destination).get_str()
        print(sql)
      if destination['tableFormat'] == 'kudu':
        manager = ManagerApi()
        properties["output_table"] = "impala::%s" % collection_name
        properties["kudu_master"] = manager.get_kudu_master()
      else:
        properties['output_table'] = collection_name
    elif destination['outputFormat'] == 'file':
      properties['path'] = file_format["path"]
      properties['format'] = file_format['tableFormat'] # or csv
    elif destination['outputFormat'] == 'index':
      properties['collectionName'] = collection_name
      properties['connection'] = SOLR_URL.get()
      if destination['isTargetExisting']:
        # Todo: check if format matches
        pass
      else:
        client = SolrClient(request.user)
        kwargs = {}
        _create_solr_collection(request.user, request.fs, client, destination, collection_name, kwargs)

  properties["app_name"] = 'Data Ingest'
  properties["inputFormat"] = file_format['inputFormat']
  properties["ouputFormat"] = destination['ouputFormat']
  properties["streamSelection"] = file_format["streamSelection"]

  envelope = indexer.generate_config(properties)

  return indexer.run(request, collection_name, envelope, input_path, start_time=start_time, lib_path=lib_path)
Example #6
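A later revision of _envelope_job(). New here: a Flume connector input, a Kafka stream output, field-name inference for the NavigatorAuditEvents topic, Kudu lowercasing of column names, and a show_command mode that returns the rendered envelope.conf instead of submitting the job. generate_config() now returns a dict keyed by file name rather than a bare string.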
def _envelope_job(request,
                  file_format,
                  destination,
                  start_time=None,
                  lib_path=None):
    collection_name = destination['name']
    indexer = EnvelopeIndexer(request.user, request.fs)

    lib_path = None  # TODO: optional input field
    input_path = None

    if file_format['inputFormat'] == 'table':
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'],
                                      table_name=file_format['tableName'])
        input_path = table_metadata.path_location
    elif file_format['inputFormat'] == 'file':
        input_path = file_format["path"]
        properties = {'input_path': input_path, 'format': 'csv'}
    elif file_format['inputFormat'] == 'stream' and file_format['streamSelection'] == 'flume':
        pass
    elif file_format['inputFormat'] == 'stream':
        if file_format['streamSelection'] == 'kafka':
            manager = ManagerApi()
            properties = {
                "brokers": manager.get_kafka_brokers(),
                "topics": file_format['kafkaSelectedTopics'],
                "kafkaFieldType": file_format['kafkaFieldType'],
                "kafkaFieldDelimiter": file_format['kafkaFieldDelimiter'],
            }

            if file_format.get('kafkaSelectedTopics') == 'NavigatorAuditEvents':
                schema_fields = MorphlineIndexer.get_kept_field_list(file_format['sampleCols'])
                properties.update({
                    "kafkaFieldNames": ', '.join([_field['name'] for _field in schema_fields]),
                    "kafkaFieldTypes": ', '.join([_field['type'] for _field in schema_fields])
                })
            else:
                properties.update({
                    "kafkaFieldNames": file_format['kafkaFieldNames'],
                    "kafkaFieldTypes": file_format['kafkaFieldTypes']
                })

            if True:
                properties['window'] = ''
            else:  # For "KafkaSQL"
                properties['window'] = '''
            window {
                enabled = true
                milliseconds = 60000
            }'''
    elif file_format['inputFormat'] == 'connector':
        if file_format['streamSelection'] == 'flume':
            properties = {
                'streamSelection': file_format['streamSelection'],
                'channelSourceHosts': file_format['channelSourceHosts'],
                'channelSourceSelectedHosts': file_format['channelSourceSelectedHosts'],
                'channelSourcePath': file_format['channelSourcePath'],
            }
        else:
            # sfdc
            properties = {
                'streamSelection': file_format['streamSelection'],
                'streamUsername': file_format['streamUsername'],
                'streamPassword': file_format['streamPassword'],
                'streamToken': file_format['streamToken'],
                'streamEndpointUrl': file_format['streamEndpointUrl'],
                'streamObject': file_format['streamObject'],
            }

    if destination['outputFormat'] == 'table':
        if destination['isTargetExisting']:  # Todo: check if format matches
            pass
        else:
            destination['importData'] = False  # Avoid LOAD DATA
            if destination['tableFormat'] == 'kudu':
                # Kudu column names should be all lowercase
                properties['kafkaFieldNames'] = properties['kafkaFieldNames'].lower()
            # Create table
            if not request.POST.get('show_command'):
                SQLIndexer(user=request.user, fs=request.fs).create_table_from_a_file(
                    file_format, destination).execute(request)

        if destination['tableFormat'] == 'kudu':
            manager = ManagerApi()
            properties["output_table"] = "impala::%s" % collection_name
            properties["kudu_master"] = manager.get_kudu_master()
        else:
            properties['output_table'] = collection_name
    elif destination['outputFormat'] == 'stream':
        manager = ManagerApi()
        properties['brokers'] = manager.get_kafka_brokers()
        properties['topics'] = file_format['kafkaSelectedTopics']
        properties['kafkaFieldDelimiter'] = file_format['kafkaFieldDelimiter']
    elif destination['outputFormat'] == 'file':
        properties['path'] = file_format["path"]
        if file_format['inputFormat'] == 'stream':
            properties['format'] = 'csv'
        else:
            properties['format'] = file_format['tableFormat']  # or csv
    elif destination['outputFormat'] == 'index':
        properties['collectionName'] = collection_name
        properties['connection'] = SOLR_URL.get()

    properties["app_name"] = 'Data Ingest'
    properties["inputFormat"] = file_format['inputFormat']
    properties["ouputFormat"] = destination['ouputFormat']
    properties["streamSelection"] = file_format["streamSelection"]

    configs = indexer.generate_config(properties)

    if request.POST.get('show_command'):
        return {'status': 0, 'commands': configs['envelope.conf']}
    else:
        return indexer.run(request,
                           collection_name,
                           configs,
                           input_path,
                           start_time=start_time,
                           lib_path=lib_path)
Example #7
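Another revision of _envelope_job(), closest to Example #5: the SFDC branch is folded into the stream-input handling, a Kafka stream output is supported, and the explicit Solr collection creation is commented out.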
def _envelope_job(request,
                  file_format,
                  destination,
                  start_time=None,
                  lib_path=None):
    collection_name = destination['name']
    indexer = EnvelopeIndexer(request.user, request.fs)

    lib_path = None  # TODO: optional input field
    input_path = None

    if file_format['inputFormat'] == 'table':
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'],
                                      table_name=file_format['tableName'])
        input_path = table_metadata.path_location
    elif file_format['inputFormat'] == 'file':
        input_path = file_format["path"]
        properties = {'input_path': input_path, 'format': 'csv'}
    elif file_format['inputFormat'] == 'stream' and file_format['streamSelection'] == 'flume':
        pass
    elif file_format['inputFormat'] in ('stream', 'sfdc'):
        if file_format['inputFormat'] == 'sfdc':
            properties = {
                'streamSelection': file_format['streamSelection'],
                'streamUsername': file_format['streamUsername'],
                'streamPassword': file_format['streamPassword'],
                'streamToken': file_format['streamToken'],
                'streamEndpointUrl': file_format['streamEndpointUrl'],
                'streamObject': file_format['streamObject'],
            }
        elif file_format['streamSelection'] == 'kafka':
            manager = ManagerApi()
            properties = {
                "brokers": manager.get_kafka_brokers(),
                "topics": file_format['kafkaSelectedTopics'],
                "kafkaFieldType": file_format['kafkaFieldType'],
                "kafkaFieldDelimiter": file_format['kafkaFieldDelimiter'],
                "kafkaFieldNames": file_format['kafkaFieldNames'],
                "kafkaFieldTypes": file_format['kafkaFieldTypes']
            }

            if True:
                properties['window'] = ''
            else:  # For "KafkaSQL"
                properties['window'] = '''
            window {
                enabled = true
                milliseconds = 60000
            }'''

        if destination['outputFormat'] == 'table':
            if destination['isTargetExisting']:
                # Todo: check if format matches
                pass
            else:
                sql = SQLIndexer(user=request.user, fs=request.fs).create_table_from_a_file(
                    file_format, destination).get_str()
                print(sql)
            if destination['tableFormat'] == 'kudu':
                manager = ManagerApi()
                properties["output_table"] = "impala::%s" % collection_name
                properties["kudu_master"] = manager.get_kudu_master()
            else:
                properties['output_table'] = collection_name
        elif destination['outputFormat'] == 'file':
            properties['path'] = file_format["path"]
            if file_format['inputFormat'] == 'stream':
                properties['format'] = 'csv'
            else:
                properties['format'] = file_format['tableFormat']  # or csv
        elif destination['outputFormat'] == 'index':
            properties['collectionName'] = collection_name
            properties['connection'] = SOLR_URL.get()


        # Not needed anymore:
        #   if destination['isTargetExisting']:
        #     # Todo: check if format matches
        #     pass
        #   else:
        #     client = SolrClient(request.user)
        #     kwargs = {}
        #     _create_solr_collection(request.user, request.fs, client, destination, collection_name, kwargs)

    if destination['outputFormat'] == 'stream':
        manager = ManagerApi()
        properties['brokers'] = manager.get_kafka_brokers()
        properties['topics'] = file_format['kafkaSelectedTopics']
        properties['kafkaFieldDelimiter'] = file_format['kafkaFieldDelimiter']

    properties["app_name"] = 'Data Ingest'
    properties["inputFormat"] = file_format['inputFormat']
    properties["ouputFormat"] = destination['ouputFormat']
    properties["streamSelection"] = file_format["streamSelection"]

    envelope = indexer.generate_config(properties)

    return indexer.run(request,
                       collection_name,
                       envelope,
                       input_path,
                       start_time=start_time,
                       lib_path=lib_path)
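All three variants end by handing the rendered config to indexer.run(). As a rough sketch of what that submission amounts to (an assumption based on the usual way Cloudera Envelope pipelines are launched, reusing the hardcoded lib_path from Example #5; indexer and properties are the objects from the snippets above):

import subprocess

envelope = indexer.generate_config(properties)  # as in the snippets above

# Envelope pipelines are normally launched as: spark-submit <envelope jar> <conf>
conf_path = '/tmp/envelope.conf'
with open(conf_path, 'w') as f:
    f.write(envelope)

subprocess.check_call(['spark-submit', '/tmp/envelope-0.5.0.jar', conf_path])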