Example #1
0
def guess_format(request):
  """Guess the on-disk format of the data described by the POSTed fileFormat.

  Dispatches on the payload's 'inputFormat' key ('file', 'table' or 'query')
  and returns a JsonResponse describing the guessed format with status 0.

  Raises:
    PopupException: if the path is not a file, the Hive table storage format
      is unsupported, or the inputFormat itself is unknown (previously an
      unknown inputFormat raised NameError on the unbound format_).
  """
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = Indexer(request.user, request.fs)
    if not request.fs.isfile(file_format["path"]):
      raise PopupException(_('Path %(path)s is not a file') % file_format)

    stream = request.fs.open(file_format["path"])
    format_ = indexer.guess_format({
      "file": {
        "stream": stream,
        "name": file_format['path']
      }
    })
    # Translate to the client-side format representation.
    _convert_format(format_)
  elif file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    # Map each storage detail's data_type to its comment (e.g. the delimiter).
    storage = dict((delim['data_type'], delim['comment']) for delim in table_metadata.storage_details)
    if table_metadata.details['properties']['format'] == 'text':
      format_ = {"quoteChar": "\"", "recordSeparator": '\\n', "type": "csv", "hasHeader": False, "fieldSeparator": storage['serialization.format']}
    elif table_metadata.details['properties']['format'] == 'parquet':
      format_ = {"type": "parquet", "hasHeader": False}
    else:
      raise PopupException('Hive table format %s is not supported.' % table_metadata.details['properties']['format'])
  elif file_format['inputFormat'] == 'query':
    # Query results are fetched as CTRL+A (\u0001) separated CSV.
    format_ = {"quoteChar": "\"", "recordSeparator": "\\n", "type": "csv", "hasHeader": False, "fieldSeparator": "\u0001"}
  else:
    # Fail explicitly instead of hitting NameError on format_ below.
    raise PopupException('Input format %s is not supported.' % file_format['inputFormat'])

  format_['status'] = 0
  return JsonResponse(format_)
Example #2
0
    def test_guess_csv_format(self):
        """A simple CSV sample must yield the expected format and fields."""
        indexer = Indexer("test", None)
        stream = StringIO.StringIO(IndexerTest.simpleCSVString)

        file_ref = {"stream": stream, "name": "test.csv"}
        guessed_format = indexer.guess_format({'file': file_ref})

        field_data = indexer.guess_field_types({
            "file": file_ref,
            "format": guessed_format
        })
        fields = field_data['columns']

        # The guessed format must match the canonical CSV format exactly.
        assert_equal(self.simpleCSVFormat, guessed_format)

        # Each guessed column must agree with expectations on name and type.
        for expected, actual in zip(self.simpleCSVFields, fields):
            for key in ("name", "type"):
                assert_equal(expected[key], actual[key])
Example #3
0
def guess_field_types(request):
  """Guess the column names/types for a file or Hive table.

  Dispatches on the POSTed fileFormat's 'inputFormat' ('file', 'table' or
  'query') and returns a JsonResponse with a small row sample and the
  guessed columns.
  """
  file_format = json.loads(request.POST.get('fileFormat', '{}'))
  # Default so unimplemented/unknown branches return an empty payload instead
  # of raising NameError on the final return (the 'query' branch previously
  # left format_ unbound).
  format_ = {}

  if file_format['inputFormat'] == 'file':
    indexer = Indexer(request.user, request.fs)
    stream = request.fs.open(file_format["path"])
    # Translate the client-side format representation back before guessing.
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
        "stream": stream,
        "name": file_format['path']
        },
      "format": file_format['format']
    })
  elif file_format['inputFormat'] == 'table':
    # Combine a small sample of rows with the Hive column metadata.
    sample = get_api(request, {'type': 'hive'}).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
        "sample": sample['rows'][:4],
        "columns": [
            Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
            for col in table_metadata.cols
        ]
    }
  elif file_format['inputFormat'] == 'query':
    # TODO get schema from explain query
    pass

  return JsonResponse(format_)
Example #4
0
def guess_format(request):
  """Guess the data format for a file, Hive table, or query result.

  Raises:
    PopupException: for unsupported Hive table storage formats or an unknown
      inputFormat (previously the latter raised NameError on the unbound
      format_ at the return).
  """
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = Indexer(request.user, request.fs)
    stream = request.fs.open(file_format["path"])
    format_ = indexer.guess_format({
      "file": {
        "stream": stream,
        "name": file_format['path']
        }
      })
    # Translate to the client-side format representation.
    _convert_format(format_)
  elif file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    # Map each storage detail's data_type to its comment (e.g. the delimiter).
    storage = dict((delim['data_type'], delim['comment']) for delim in table_metadata.storage_details)
    if table_metadata.details['properties']['format'] == 'text':
      format_ = {"quoteChar": "\"", "recordSeparator": '\\n', "type": "csv", "hasHeader": False, "fieldSeparator": storage['serialization.format']}
    elif table_metadata.details['properties']['format'] == 'parquet':
      format_ = {"type": "parquet", "hasHeader": False}
    else:
      raise PopupException('Hive table format %s is not supported.' % table_metadata.details['properties']['format'])
  elif file_format['inputFormat'] == 'query':
    # NOTE(review): the original comment said CTRL+A but '\t' is used as the
    # separator; keeping '\t' unchanged — confirm the intended separator.
    format_ = {"quoteChar": "\"", "recordSeparator": "\\n", "type": "csv", "hasHeader": False, "fieldSeparator": "\t"}
  else:
    # Fail explicitly instead of hitting NameError on format_ below.
    raise PopupException('Input format %s is not supported.' % file_format['inputFormat'])

  return JsonResponse(format_)
Example #5
0
def guess_field_types(request):
    """Guess column names/types for a file, Hive table, or saved query result.

    Reads the POSTed 'fileFormat' JSON payload and dispatches on its
    'inputFormat' key ('file', 'table' or 'query'); returns a JsonResponse
    with a small row sample and the guessed columns.

    NOTE(review): if 'inputFormat' matches none of the branches, format_ is
    unbound and the final return raises NameError — confirm callers only send
    the three supported values.
    """
    file_format = json.loads(request.POST.get('fileFormat', '{}'))

    if file_format['inputFormat'] == 'file':
        indexer = Indexer(request.user, request.fs)
        stream = request.fs.open(file_format["path"])
        # Translate the client-side format representation back before guessing.
        _convert_format(file_format["format"], inverse=True)

        format_ = indexer.guess_field_types({
            "file": {
                "stream": stream,
                "name": file_format['path']
            },
            "format": file_format['format']
        })
    elif file_format['inputFormat'] == 'table':
        # Combine a small sample of table rows with the Hive column metadata.
        sample = get_api(request, {
            'type': 'hive'
        }).get_sample_data({'type': 'hive'},
                           database=file_format['databaseName'],
                           table=file_format['tableName'])
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'],
                                      table_name=file_format['tableName'])

        format_ = {
            "sample":
            sample['rows'][:4],
            "columns": [
                Field(col.name,
                      HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
                for col in table_metadata.cols
            ]
        }
    elif file_format[
            'inputFormat'] == 'query':  # Only support open query history
        # TODO get schema from explain query, which is not possible
        notebook = Notebook(document=Document2.objects.get(
            id=file_format['query'])).get_data()
        snippet = notebook['snippets'][0]
        # Fetch the first 4 result rows of the saved query for the sample.
        sample = get_api(request, snippet).fetch_result(notebook,
                                                        snippet,
                                                        4,
                                                        start_over=True)

        format_ = {
            "sample":
            sample['rows'][:4],
            "columns": [
                Field(
                    col['name'],
                    HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'],
                                                        'string')).to_dict()
                for col in sample.meta
            ]
        }

    return JsonResponse(format_)
Example #6
0
def guess_field_types(request):
  """Guess the field types of the file described by the POSTed fileFormat."""
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  indexer = Indexer(request.user, request.fs)
  stream = request.fs.open(file_format["path"])
  # Translate the client-side format representation back before guessing.
  _convert_format(file_format["format"], inverse=True)
  format_ = indexer.guess_field_types({
    "file": stream,
    "format": file_format['format']
  })

  return JsonResponse(format_)
Example #7
0
  def test_generate_csv_morphline(self):
    """A morphline config is generated for the simple CSV schema."""
    indexer = Indexer("test")
    schema = {
      "columns": self.simpleCSVFields,
      "format": self.simpleCSVFormat
    }

    morphline = indexer.generate_morphline_config("test_collection", schema)

    assert_true(isinstance(morphline, basestring))
Example #8
0
def guess_format(request):
  """Guess the format of the HDFS file named in the POSTed fileFormat."""
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  path = file_format["path"]
  indexer = Indexer(request.user, request.fs)
  format_ = indexer.guess_format({"file": request.fs.open(path)})
  # Translate to the client-side format representation before returning.
  _convert_format(format_)

  return JsonResponse(format_)
Example #9
0
    def test_generate_csv_morphline(self):
        """A morphline config is generated for the simple CSV schema."""
        indexer = Indexer("test", None)
        schema = {
            "columns": self.simpleCSVFields,
            "format": self.simpleCSVFormat
        }

        morphline = indexer.generate_morphline_config("test_collection", schema)

        assert_true(isinstance(morphline, basestring))
Example #10
0
    def test_end_to_end(self):
        """End-to-end: upload a CSV to HDFS, guess its format/fields, build a
        morphline, (re)create the Solr collection and index the file.

        Requires a live cluster; skipped otherwise.
        """
        if not is_live_cluster():
            raise SkipTest()

        cluster = shared_cluster()
        fs = cluster.fs
        collection_name = "test_collection"
        indexer = Indexer("test", fs=fs, jt=cluster.jt)
        input_loc = "/tmp/test.csv"

        # upload the test file to hdfs
        fs.create(input_loc, data=TestIndexer.simpleCSVString, overwrite=True)

        # open a filestream for the file on hdfs
        stream = fs.open(input_loc)

        # guess the format of the file
        file_type_format = indexer.guess_format(
            {'file': {
                "stream": stream,
                "name": "test.csv"
            }})

        field_types = indexer.guess_field_types({
            "file": {
                "stream": stream,
                "name": "test.csv"
            },
            "format": file_type_format
        })

        # The full format is the guessed fields plus the guessed file format.
        format_ = field_types.copy()
        format_['format'] = file_type_format

        # find a field name available to use for the record's uuid
        unique_field = indexer.get_unique_field(format_)
        is_unique_generated = indexer.is_unique_generated(format_)

        # generate morphline
        morphline = indexer.generate_morphline_config(collection_name, format_,
                                                      unique_field)

        schema_fields = indexer.get_kept_field_list(format_['columns'])
        if is_unique_generated:
            # The generated uuid field must also be declared in the schema.
            schema_fields += [{"name": unique_field, "type": "string"}]

        # create the collection from the specified fields
        collection_manager = CollectionManagerController("test")
        if collection_manager.collection_exists(collection_name):
            collection_manager.delete_collection(collection_name, None)
        collection_manager.create_collection(collection_name,
                                             schema_fields,
                                             unique_key_field=unique_field)

        # index the file
        indexer.run_morphline(collection_name, morphline, input_loc)
Example #11
0
def _test_fixed_type_format_generate_morphline(format_):
  """Shared check: morphline generation succeeds for a fixed-type format class."""
  indexer = Indexer("test")
  instance = format_()

  schema = {
    "columns": [field.to_dict() for field in instance.fields],
    "format": instance.get_format()
  }
  morphline = indexer.generate_morphline_config("test_collection", schema)

  assert_true(isinstance(morphline, basestring))
Example #12
0
def _test_generate_field_operation_morphline(operation_format):
  """Shared check: a field operation can be appended and a morphline generated.

  Uses a deep copy of the shared fixture: the previous `[:]` copy was shallow,
  so appending to the nested 'operations' list mutated
  TestIndexer.simpleCSVFields in place and leaked into other tests.
  """
  import copy

  fields = copy.deepcopy(TestIndexer.simpleCSVFields)
  fields[0]['operations'].append(operation_format)

  indexer = Indexer("test")
  morphline = indexer.generate_morphline_config("test_collection", {
      "columns": fields,
      "format": TestIndexer.simpleCSVFormat
    })

  assert_true(isinstance(morphline, basestring))
Example #13
0
def _test_fixed_type_format_generate_morphline(format_):
    """Shared check: morphline generation succeeds for a fixed-type format class."""
    indexer = Indexer("test", None)
    instance = format_()

    schema = {
        "columns": [field.to_dict() for field in instance.fields],
        "format": instance.get_format()
    }
    morphline = indexer.generate_morphline_config("test_collection", schema)

    assert_true(isinstance(morphline, basestring))
Example #14
0
def _test_generate_field_operation_morphline(operation_format):
    """Shared check: a field operation can be appended and a morphline generated.

    Uses a deep copy of the shared fixture: the previous `[:]` copy was
    shallow, so appending to the nested 'operations' list mutated
    IndexerTest.simpleCSVFields in place and leaked into other tests.
    """
    import copy

    fields = copy.deepcopy(IndexerTest.simpleCSVFields)
    fields[0]['operations'].append(operation_format)

    indexer = Indexer("test", None)
    morphline = indexer.generate_morphline_config(
        "test_collection", {
            "columns": fields,
            "format": IndexerTest.simpleCSVFormat
        })

    assert_true(isinstance(morphline, basestring))
Example #15
0
def guess_format(request):
    """Guess the format of the file referenced by the POSTed fileFormat."""
    file_format = json.loads(request.POST.get('fileFormat', '{}'))
    path = file_format['path']

    indexer = Indexer(request.user, request.fs)
    file_ref = {"stream": request.fs.open(path), "name": path}
    format_ = indexer.guess_format({"file": file_ref})
    # Translate to the client-side format representation before returning.
    _convert_format(format_)

    return JsonResponse(format_)
Example #16
0
def guess_field_types(request):
    """Guess the column types of the file referenced by the POSTed fileFormat."""
    file_format = json.loads(request.POST.get('fileFormat', '{}'))
    path = file_format['path']

    indexer = Indexer(request.user, request.fs)
    stream = request.fs.open(path)
    # Translate the client-side format representation back before guessing.
    _convert_format(file_format["format"], inverse=True)

    file_ref = {"stream": stream, "name": path}
    format_ = indexer.guess_field_types({"file": file_ref, "format": file_format['format']})

    return JsonResponse(format_)
Example #17
0
    def test_guess_format_invalid_csv_format(self):
        """Corrupting any CSV format parameter must yield zero guessed columns.

        The three near-identical guess/corrupt/re-guess sections are folded
        into one local helper; behavior (including stream positioning) is
        unchanged.
        """
        indexer = Indexer("test", None)
        stream = StringIO.StringIO(IndexerTest.simpleCSVString)

        def columns_with_corrupted(key, bad_value):
            # Guess the format from the stream, corrupt one parameter, and
            # return the columns guessed under the corrupted format.
            guessed_format = indexer.guess_format(
                {'file': {
                    "stream": stream,
                    "name": "test.csv"
                }})
            guessed_format[key] = bad_value
            return indexer.guess_field_types({
                "file": {
                    "stream": stream,
                    "name": "test.csv"
                },
                "format": guessed_format
            })['columns']

        assert_equal(columns_with_corrupted("fieldSeparator", "invalid separator"), [])

        stream.seek(0)
        assert_equal(columns_with_corrupted("recordSeparator", "invalid separator"), [])

        stream.seek(0)
        assert_equal(columns_with_corrupted("quoteChar", "invalid quoteChar"), [])
Example #18
0
  def test_guess_format_invalid_csv_format(self):
    """Each corrupted CSV format parameter must produce no guessed columns."""
    indexer = Indexer("test", None)
    stream = StringIO.StringIO(IndexerTest.simpleCSVString)

    corruptions = (
      ("fieldSeparator", "invalid separator"),
      ("recordSeparator", "invalid separator"),
      ("quoteChar", "invalid quoteChar"),
    )

    for index, (key, bad_value) in enumerate(corruptions):
      if index:
        # Rewind between cases so each starts from the top of the sample.
        stream.seek(0)

      guessed_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})
      guessed_format[key] = bad_value

      fields = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": guessed_format})['columns']
      assert_equal(fields, [])
Example #19
0
def index_file(request):
    """Index the POSTed file into Solr, creating the collection if needed."""
    file_format = json.loads(request.POST.get('fileFormat', '{}'))
    # Translate the client-side format representation back to the server one.
    _convert_format(file_format["format"], inverse=True)
    collection_name = file_format["name"]

    indexer = Indexer(request.user, request.fs)
    unique_field = indexer.get_unique_field(file_format)
    is_unique_generated = indexer.is_unique_generated(file_format)

    schema_fields = indexer.get_kept_field_list(file_format['columns'])
    if is_unique_generated:
        # The generated uuid field must also be declared in the schema.
        schema_fields.append({"name": unique_field, "type": "string"})

    morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field)

    collection_manager = CollectionManagerController(request.user)
    if not collection_manager.collection_exists(collection_name):
        collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

    job_id = indexer.run_morphline(collection_name, morphline, file_format["path"])

    return JsonResponse({"jobId": job_id})
Example #20
0
def _index(request, file_format, collection_name, query=None):
  """Create the Solr collection (if missing) and run a morphline indexing job.

  Resolves the input path from the file_format's 'inputFormat'; for
  'hs2_handle' the data is pushed directly from the Hive fetch handle and no
  morphline job is run. Returns the run_morphline result (or the
  update_data_from_hive result for 'hs2_handle').
  """
  indexer = Indexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    # The generated uuid field must also be declared in the schema.
    schema_fields += [{"name": unique_field, "type": "string"}]

  collection_manager = CollectionManagerController(request.user)
  if not collection_manager.collection_exists(collection_name):
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'file':
    input_path = '${nameNode}%s' % file_format["path"]
  elif file_format['inputFormat'] == 'hs2_handle':
    # Hive result sets are indexed straight from the fetch handle.
    searcher = CollectionManagerController(request.user)
    columns = ['_uuid'] + [field['name'] for field in file_format['columns']]
    return searcher.update_data_from_hive(collection_name, columns, fetch_handle=file_format['fetch_handle'])
  else:
    input_path = None

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field)

  return indexer.run_morphline(request, collection_name, morphline, input_path, query)
Example #21
0
  def test_guess_csv_format(self):
    """The simple CSV sample must yield the expected format and field list."""
    indexer = Indexer("test")
    stream = StringIO.StringIO(TestIndexer.simpleCSVString)

    file_ref = {"stream": stream, "name": "test.csv"}
    guessed_format = indexer.guess_format({'file': file_ref})
    fields = indexer.guess_field_types({"file": file_ref, "format": guessed_format})['columns']

    # The guessed format must match the canonical CSV format exactly.
    assert_equal(self.simpleCSVFormat, guessed_format)

    # Each guessed column must agree with expectations on name and type.
    for expected, actual in zip(self.simpleCSVFields, fields):
      for key in ("name", "type"):
        assert_equal(expected[key], actual[key])
Example #22
0
def index_file(request):
  """Index the POSTed file into Solr, creating the collection if needed."""
  file_format = json.loads(request.POST.get('fileFormat', '{}'))
  # Translate the client-side format representation back to the server one.
  _convert_format(file_format["format"], inverse=True)
  collection_name = file_format["name"]

  indexer = Indexer(request.user, request.fs)
  unique_field = indexer.get_uuid_name(file_format)
  # The uuid field is prepended to the kept fields to form the schema.
  schema_fields = [{"name": unique_field, "type": "string"}]
  schema_fields += indexer.get_kept_field_list(file_format['columns'])

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field)

  collection_manager = CollectionManagerController(request.user)
  if not collection_manager.collection_exists(collection_name):
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

  job_id = indexer.run_morphline(collection_name, morphline, file_format["path"])

  return JsonResponse({"jobId": job_id})
Example #23
0
def guess_field_types(request):
  """Guess column names/types for a file, Hive table, or saved query result.

  Reads the POSTed 'fileFormat' JSON payload and dispatches on its
  'inputFormat' key ('file', 'table' or 'query'); returns a JsonResponse with
  a small row sample and the guessed columns.

  NOTE(review): if 'inputFormat' matches none of the branches, format_ is
  unbound and the final return raises NameError — confirm callers only send
  the three supported values.
  """
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = Indexer(request.user, request.fs)
    stream = request.fs.open(file_format["path"])
    # Translate the client-side format representation back before guessing.
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
          "stream": stream,
          "name": file_format['path']
        },
      "format": file_format['format']
    })
  elif file_format['inputFormat'] == 'table':
    # Combine a small sample of table rows with the Hive column metadata.
    sample = get_api(request, {'type': 'hive'}).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
        "sample": sample['rows'][:4],
        "columns": [
            Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
            for col in table_metadata.cols
        ]
    }
  elif file_format['inputFormat'] == 'query': # Only support open query history
    # TODO get schema from explain query, which is not possible
    notebook = Notebook(document=Document2.objects.get(id=file_format['query'])).get_data()
    snippet = notebook['snippets'][0]
    # Fetch the first 4 result rows of the saved query for the sample.
    sample = get_api(request, snippet).fetch_result(notebook, snippet, 4, start_over=True)

    format_ = {
        "sample": sample['rows'][:4],
        "sample_cols": sample.meta,
        "columns": [
            Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
            for col in sample.meta
        ]
    }

  return JsonResponse(format_)
Example #24
0
  def test_end_to_end(self):
    """End-to-end: upload a CSV to HDFS, guess its format/fields, build a
    morphline, (re)create the Solr collection and index the file.

    Currently always skipped (note the `or True`) because the morphline
    libraries are not set up in the test environment.
    """
    if not is_live_cluster() or True: # Skipping as requires morplines libs to be setup
      raise SkipTest()

    cluster = shared_cluster()
    fs = cluster.fs
    make_logged_in_client(username="******", groupname="default", recreate=True, is_superuser=False)
    user = User.objects.get(username="******")
    collection_name = "test_collection"
    indexer = Indexer("test", fs=fs, jt=cluster.jt)
    input_loc = "/tmp/test.csv"

    # upload the test file to hdfs
    fs.create(input_loc, data=TestIndexer.simpleCSVString, overwrite=True)

    # open a filestream for the file on hdfs
    stream = fs.open(input_loc)

    # guess the format of the file
    file_type_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})

    field_types = indexer.guess_field_types({"file":{"stream": stream, "name": "test.csv"}, "format": file_type_format})

    # The full format is the guessed fields plus the guessed file format.
    format_ = field_types.copy()
    format_['format'] = file_type_format

    # find a field name available to use for the record's uuid
    unique_field = indexer.get_unique_field(format_)
    is_unique_generated = indexer.is_unique_generated(format_)

    # generate morphline
    morphline = indexer.generate_morphline_config(collection_name, format_, unique_field)

    schema_fields = indexer.get_kept_field_list(format_['columns'])
    if is_unique_generated:
      # The generated uuid field must also be declared in the schema.
      schema_fields += [{"name": unique_field, "type": "string"}]


    # create the collection from the specified fields
    collection_manager = CollectionManagerController("test")
    if collection_manager.collection_exists(collection_name):
      collection_manager.delete_collection(collection_name, None)
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

    # index the file
    indexer.run_morphline(MockedRequest(user=user, fs=cluster.fs, jt=cluster.jt), collection_name, morphline, input_loc)
Example #25
0
def _index(request, file_format, collection_name, query=None):
  """Create the collection if missing and run a morphline indexing job."""
  indexer = Indexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    # The generated uuid field must also be declared in the schema.
    schema_fields.append({"name": unique_field, "type": "string"})

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field)

  collection_manager = CollectionManagerController(request.user)
  if not collection_manager.collection_exists(collection_name):
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

  # Resolve the input path according to where the data lives.
  input_format = file_format['inputFormat']
  if input_format == 'table':
    table_metadata = dbms.get(request.user).get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif input_format == 'file':
    input_path = '${nameNode}%s' % file_format["path"]
  else:
    input_path = None

  return indexer.run_morphline(request, collection_name, morphline, input_path, query)
Example #26
0
def index_file(request):
  """Index a file or Hive table into Solr and return the job handle."""
  file_format = json.loads(request.POST.get('fileFormat', '{}'))
  # Translate the client-side format representation back to the server one.
  _convert_format(file_format["format"], inverse=True)
  collection_name = file_format["name"]

  indexer = Indexer(request.user, request.fs)
  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    # The generated uuid field must also be declared in the schema.
    schema_fields.append({"name": unique_field, "type": "string"})

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field)

  collection_manager = CollectionManagerController(request.user)
  if not collection_manager.collection_exists(collection_name):
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

  if file_format['inputFormat'] == 'table':
    table_metadata = dbms.get(request.user).get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  else:
    input_path = file_format["path"]

  job_handle = indexer.run_morphline(request, collection_name, morphline, input_path) #TODO if query generate insert
  return JsonResponse(job_handle)
Example #27
0
  def test_guess_format(self):
    """Format and field guessing on the simple CSV sample."""
    indexer = Indexer("test", None)
    stream = StringIO.StringIO(IndexerTest.simpleCSVString)

    file_ref = {"stream": stream, "name": "test.csv"}
    guessed_format = indexer.guess_format({'file': file_ref})
    fields = indexer.guess_field_types({"file": file_ref, "format": guessed_format})['columns']

    # test format
    assert_equal('csv', guessed_format['type'])
    assert_equal(',', guessed_format['fieldSeparator'])
    assert_equal('\n', guessed_format['recordSeparator'])

    # test fields: expected (name, type) pairs from the sample.
    expected_fields = [
      {"name": "id", "type": "long"},
      {"name": "Rating", "type": "long"},
      {"name": "Location", "type": "string"},
      {"name": "Name", "type": "string"},
      {"name": "Time", "type": "string"},
    ]

    for expected, actual in zip(expected_fields, fields):
      for key in ("name", "type"):
        assert_equal(expected[key], actual[key])
Example #28
0
  def test_end_to_end(self):
    """End-to-end: upload a CSV to HDFS, guess its format/fields, build a
    morphline, (re)create the Solr collection and index the file.

    NOTE(review): assumes `cluster.get_hdfs()` reaches a live HDFS — there is
    no skip guard here; confirm this test only runs against a cluster.
    """
    fs = cluster.get_hdfs()
    collection_name = "test_collection"
    indexer = Indexer("test", fs)
    input_loc = "/tmp/test.csv"

    # upload the test file to hdfs
    fs.create(input_loc, data=IndexerTest.simpleCSVString, overwrite=True)

    # open a filestream for the file on hdfs
    stream = fs.open(input_loc)

    # guess the format of the file
    file_type_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})

    field_types = indexer.guess_field_types({"file":{"stream": stream, "name": "test.csv"}, "format": file_type_format})

    # The full format is the guessed fields plus the guessed file format.
    format_ = field_types.copy()
    format_['format'] = file_type_format

    # find a field name available to use for the record's uuid
    unique_field = indexer.get_uuid_name(format_)

    # generate morphline
    morphline = indexer.generate_morphline_config(collection_name, format_, unique_field)

    # The uuid field is prepended to the kept fields to form the schema.
    schema_fields = [{"name": unique_field, "type": "string"}] + indexer.get_kept_field_list(format_['columns'])

    # create the collection from the specified fields
    collection_manager = CollectionManagerController("test")
    if collection_manager.collection_exists(collection_name):
      collection_manager.delete_collection(collection_name, None)
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

    # index the file
    indexer.run_morphline(collection_name, morphline, input_loc)