def _large_indexing(request, file_format, collection_name, query=None, start_time=None, lib_path=None, destination=None):
  indexer = MorphlineIndexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    schema_fields += [{"name": unique_field, "type": "string"}]

  client = SolrClient(user=request.user)

  # Create the target index unless it already exists or we are only asked to show the command.
  if not client.exists(collection_name) and not request.POST.get('show_command'):  # if destination['isTargetExisting']:
    client.create_index(
      name=collection_name,
      fields=request.POST.get('fields', schema_fields),
      unique_key_field=unique_field
      # No df currently
    )
  else:
    # TODO: check if format matches
    pass

  # Resolve the input path, or hand off to Flume / Envelope for streaming sources.
  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'stream' and file_format['streamSelection'] == 'flume':
    indexer = FlumeIndexer(user=request.user)
    if request.POST.get('show_command'):
      configs = indexer.generate_config(file_format, destination)
      return {'status': 0, 'commands': configs[-1]}
    else:
      return indexer.start(collection_name, file_format, destination)
  elif file_format['inputFormat'] == 'stream':
    return _envelope_job(request, file_format, destination, start_time=start_time, lib_path=lib_path)
  elif file_format['inputFormat'] == 'file':
    input_path = '${nameNode}%s' % urllib_unquote(file_format["path"])
  else:
    input_path = None

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field, lib_path=lib_path)

  return indexer.run_morphline(
    request,
    collection_name,
    morphline,
    input_path,
    query,
    start_time=start_time,
    lib_path=lib_path
  )
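
# A minimal sketch (not from the original source) of the payload shape _large_indexing()
# reads above for a plain HDFS file ingest. The concrete values, the collection name and
# the 'keep' flag are illustrative assumptions.
EXAMPLE_FILE_FORMAT = {
  'inputFormat': 'file',
  'path': '/user/demo/web_logs.csv',  # unquoted and prefixed with ${nameNode} by _large_indexing()
  'columns': [
    {'name': 'client_ip', 'type': 'string', 'keep': True},
    {'name': 'code', 'type': 'int', 'keep': True},
  ],
}
# Example call (request must be an authenticated Django request with an HDFS client attached):
#   _large_indexing(request, EXAMPLE_FILE_FORMAT, 'web_logs_demo')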
def test_generate_from_directory_to_solr_index():
  raise SkipTest

  source = {
    'channelSourceType': 'directory',
  }
  destination = {
    'ouputFormat': 'index',
  }

  configs = FlumeIndexer(user=None).generate_config(source=source, destination=destination)

  assert_equal(
    '''SOLR_LOCATOR : {
  # Name of solr collection
  collection : log_analytics_demo

  # ZooKeeper ensemble
  zkHost : "spark2-envelope515-1.gce.cloudera.com:2181/solr"
}

morphlines : [
{
  id : hue_accesslogs_no_geo

  importCommands : ["org.kitesdk.**", "org.apache.solr.**"]

  commands : [
    {
      ## Read the email stream and break it up into individual messages.
      ## The beginning of a message is marked by regex clause below
      ## The reason we use this command is that one event can have multiple
      ## messages
      readCSV {
        ## Hue HTTPD load balancer
        ## 172.18.18.3 - - [27/Aug/2018:05:47:12 -0700] "GET /static/desktop/js/jquery.rowselector.a04240f7cc48.js HTTP/1.1" 200 2321
        separator: " "
        columns: [client_ip,C1,C2,time,dummy1,request,code,bytes]
        ignoreFirstLine : false
        quoteChar : "\""
        commentPrefix : ""
        trim : true
        charset : UTF-8
      }
    }
    {
      split {
        inputField : request
        outputFields : [method, url, protocol]
        separator : " "
        isRegex : false
        #separator : """\s*,\s*"""
        #  #isRegex : true
        addEmptyStrings : false
        trim : true
      }
    }
    {
      split {
        inputField : url
        outputFields : ["", app, subapp]
        separator : "\/"
        isRegex : false
        #separator : """\s*,\s*"""
        #  #isRegex : true
        addEmptyStrings : false
        trim : true
      }
    }
    {
      userAgent {
        inputField : user_agent
        outputFields : {
          user_agent_family : "@{ua_family}"
          user_agent_major : "@{ua_major}"
          device_family : "@{device_family}"
          os_family : "@{os_family}"
          os_major : "@{os_major}"
        }
      }
    }

    #{logInfo { format : "BODY : {}", args : ["@{}"] } }

    # add Unique ID, in case our message_id field from above is not present
    {
      generateUUID {
        field:id
      }
    }

    # convert the timestamp field to "yyyy-MM-dd'T'HH:mm:ss.SSSZ" format
    {
      # 21/Nov/2014:22:08:27
      convertTimestamp {
        field : time
        inputFormats : ["[dd/MMM/yyyy:HH:mm:ss", "EEE, d MMM yyyy HH:mm:ss Z", "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'", "yyyy-MM-dd'T'HH:mm:ss", "yyyy-MM-dd"]
        #inputTimezone : America/Los_Angeles
        inputTimezone : UTC
        outputFormat : "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"
        outputTimezone : UTC
      }
    }

    # Consume the output record of the previous command and pipe another
    # record downstream.
    #
    # This command sanitizes record fields that are unknown to Solr schema.xml
    # by deleting them. Recall that Solr throws an exception on any attempt to
    # load a document that contains a field that isn't specified in schema.xml
    {
      sanitizeUnknownSolrFields {
        # Location from which to fetch Solr schema
        solrLocator : ${SOLR_LOCATOR}
      }
    }

    # load the record into a SolrServer or MapReduce SolrOutputFormat.
    {
      loadSolr {
        solrLocator : ${SOLR_LOCATOR}
      }
    }
  ]
}
]
'''.strip(),
    configs[0][1].strip()  # 'agent_morphlines_conf_file'
  )

  assert_equal(
    (
      'agent_config_file',
      'tier1.sources = source1\n tier1.channels = channel1\n tier1.sinks = sink1\n\n\n tier1.channels.channel1.type = memory\n tier1.channels.channel1.capacity = 10000\n tier1.channels.channel1.transactionCapacity = 1000\n\n \n tier1.sinks.sink1.type = org.apache.flume.sink.solr.morphline.MorphlineSolrSink\n tier1.sinks.sink1.morphlineFile = morphlines.conf\n tier1.sinks.sink1.morphlineId = hue_accesslogs_no_geo\n tier1.sinks.sink1.channel = channel1'
    ),
    configs[1]  # ('agent_config_file', <flume agent config>); configs is a list of (name, content) tuples
  )
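
# A minimal helper sketch (not part of the original test; it assumes generate_config() returns a
# list of (name, content) tuples, as the assertions above index it): materialise the generated
# Flume artifacts as files. The output directory default is hypothetical.
def _write_flume_configs(configs, out_dir='/tmp/flume_demo'):
  import os  # local import keeps the sketch self-contained
  os.makedirs(out_dir, exist_ok=True)  # Python 3 only
  for name, content in configs:
    with open(os.path.join(out_dir, name), 'w') as fh:
      fh.write(content)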