Example #1
    def initialize(self):

        # open mongo client
        self.mongo_client = MongoClient(self.mongo_uri)

        # open schema collection
        mongo_schema_db = self.mongo_client[self.schema_db_name]
        self.mongo_schema_collection = mongo_schema_db[
            self.schema_collection_name]

        # if overwrite, delete schema collection
        if self.write_disposition == 'overwrite':
            self.mongo_schema_collection.delete_many({})

        # create data warehouse object
        if self.infra_type == 'hadoop':
            self.dw = Hive(self.hiveserveer_host, self.hiveserver_port,
                           ONEFOLD_HIVESERDES_JAR)
            self.cs = HDFSStorage()
        elif self.infra_type == 'gcloud':
            self.dw = GBigQuery(self.gcloud_project_id,
                                self.gcloud_storage_bucket_id)
            self.cs = GCloudStorage(self.gcloud_project_id,
                                    self.gcloud_storage_bucket_id)

        # turn policies into better data structure for use later (required_fields)
        if self.policies != None:
            for policy in self.policies:
                if 'key' in policy:
                    if 'required' in policy:
                        if policy['key'] not in self.required_fields:
                            self.required_fields[policy['key']] = {}
                        self.required_fields[policy['key']] = policy

                    if 'data_type' in policy:
                        datatype_overwrite = policy['data_type']

                        if 'mode' in policy:
                            mode_overwrite = policy['mode']
                        else:
                            mode_overwrite = 'nullable'

                        self.mongo_schema_collection.update_one(
                            {
                                "key": policy['key'].replace(".", "_"),
                                "type": "field"
                            }, {
                                "$set": {
                                    "data_type": datatype_overwrite,
                                    "mode": mode_overwrite,
                                    "forced": True
                                }
                            },
                            upsert=True)
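
For context, initialize() above assumes self.policies is a list of per-field dicts. A minimal sketch of such a list follows; the field names are purely illustrative and not taken from the original source.

# Hypothetical policies list consumed by initialize() above.
# 'required' entries are indexed into required_fields by key;
# 'data_type'/'mode' entries force an override in the schema collection
# (mode defaults to 'nullable' when omitted).
policies = [
    {"key": "user.email", "required": True},
    {"key": "user.age", "data_type": "integer", "mode": "nullable"},
]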
Example #2
  def initialize(self):

    # open mongo client
    self.mongo_client = MongoClient(self.mongo_uri)

    # open schema collection
    mongo_schema_db = self.mongo_client[self.schema_db_name]
    self.mongo_schema_collection = mongo_schema_db[self.schema_collection_name]

    # if overwrite, delete schema collection
    if self.write_disposition == 'overwrite':
      self.mongo_schema_collection.delete_many({})

    # create data warehouse object
    if self.infra_type == 'hadoop':
      self.dw = Hive(self.hiveserveer_host, self.hiveserver_port, ONEFOLD_HIVESERDES_JAR)
      self.cs = HDFSStorage()
    elif self.infra_type == 'gcloud':
      self.dw = GBigQuery(self.gcloud_project_id, self.gcloud_storage_bucket_id)
      self.cs = GCloudStorage(self.gcloud_project_id, self.gcloud_storage_bucket_id)

    # turn policies into better data structure for use later (required_fields)
    if self.policies != None:
      for policy in self.policies:
        if 'key' in policy:
          if 'required' in policy:
            if policy['key'] not in self.required_fields:
              self.required_fields[policy['key']] = {}
            self.required_fields[policy['key']] = policy

          if 'data_type' in policy:
            datatype_overwrite = policy['data_type']

            if 'mode' in policy:
              mode_overwrite = policy['mode']
            else:
              mode_overwrite = 'nullable'

            self.mongo_schema_collection.update_one(
              {"key": policy['key'].replace(".", "_"), "type": "field"},
              {"$set": {"data_type": datatype_overwrite,
                        "mode": mode_overwrite,
                        "forced": True}},
              upsert=True)
Example #3
class Loader:

  # control params
  infra_type = None
  mongo_uri = None
  db_name = None
  collection_name = None
  collection_sort_by_field = None
  extract_query = None
  tmp_path = None
  schema_db_name = None
  schema_collection_name = None
  use_mr = False

  hiveserveer_host = None
  hiveserver_port = None

  gcloud_project_id = None
  gcloud_storage_bucket_id = None

  write_disposition = None
  process_array = "child_table"
  dw_database_name = None
  dw_table_name = None

  policies = None

  # mongo client and schema collection
  mongo_client = None
  mongo_schema_collection = None

  # runtime variables
  extract_file_names = []
  reject_file_names = []
  sort_by_field_min = None
  sort_by_field_max = None
  dw_table_names = []
  dw = None
  cs = None
  num_records_extracted = 0
  num_records_rejected = 0

  # policy related variables
  required_fields = {}


  def initialize(self):

    # open mongo client
    self.mongo_client = MongoClient(self.mongo_uri)

    # open schema collection
    mongo_schema_db = self.mongo_client[self.schema_db_name]
    self.mongo_schema_collection = mongo_schema_db[self.schema_collection_name]

    # if overwrite, delete schema collection
    if self.write_disposition == 'overwrite':
      self.mongo_schema_collection.delete_many({})

    # create data warehouse object
    if self.infra_type == 'hadoop':
      self.dw = Hive(self.hiveserveer_host, self.hiveserver_port, ONEFOLD_HIVESERDES_JAR)
      self.cs = HDFSStorage()
    elif self.infra_type == 'gcloud':
      self.dw = GBigQuery(self.gcloud_project_id, self.gcloud_storage_bucket_id)
      self.cs = GCloudStorage(self.gcloud_project_id, self.gcloud_storage_bucket_id)

    # turn policies into better data structure for use later (required_fields)
    if self.policies != None:
      for policy in self.policies:
        if 'key' in policy:
          if 'required' in policy:
            if policy['key'] not in self.required_fields:
              self.required_fields[policy['key']] = {}
            self.required_fields[policy['key']] = policy

          if 'data_type' in policy:
            datatype_overwrite = policy['data_type']

            if 'mode' in policy:
              mode_overwrite = policy['mode']
            else:
              mode_overwrite = 'nullable'

            self.mongo_schema_collection.update_one(
              {"key": policy['key'].replace(".", "_"), "type": "field"},
              {"$set": {"data_type": datatype_overwrite,
                        "mode": mode_overwrite,
                        "forced": True}},
              upsert=True)


  def extract_data(self):

    # create tmp_path folder if necessary
    if not os.path.exists(os.path.join(self.tmp_path, self.collection_name, 'data')):
      os.makedirs(os.path.join(self.tmp_path, self.collection_name, 'data'))

    if not os.path.exists(os.path.join(self.tmp_path, self.collection_name, 'rejected')):
      os.makedirs(os.path.join(self.tmp_path, self.collection_name, 'rejected'))

    # delete old tmp files if exists
    for old_file in glob.glob(os.path.join(self.tmp_path, self.collection_name, 'data', '*')):
      print "Deleting old file %s" % (old_file)
      os.remove(old_file)

    for old_file in glob.glob(os.path.join(self.tmp_path, self.collection_name, 'rejected', '*')):
      print "Deleting old file %s" % (old_file)
      os.remove(old_file)

    # some state variables
    part_num = 0
    extract_file = None

    reject_part_num = 0
    reject_file = None

    # start mongo client
    db = self.mongo_client[self.db_name]
    collection = db[self.collection_name]

    # turn query string into json
    if self.extract_query is not None:
      if 'ObjectId' in self.extract_query:
        # kinda hacky.. and dangerous! This is to evaluate an expression
        # like {"_id": {$gt:ObjectId("55401a60151a4b1a4f000001")}}
        from bson.objectid import ObjectId
        extract_query_json = eval(self.extract_query)
      else:
        extract_query_json = json.loads(self.extract_query)
    else:
      extract_query_json = None

    # query collection, sort by collection_sort_by_field
    for data in collection.find(extract_query_json).sort(self.collection_sort_by_field, 1):

      # track min and max id for auditing..
      if self.sort_by_field_min == None:
        self.sort_by_field_min = data[self.collection_sort_by_field]
      self.sort_by_field_max = data[self.collection_sort_by_field]

      # open a new file if necessary
      if self.num_records_extracted % NUM_RECORDS_PER_PART == 0:

        if extract_file != None:
          extract_file.close()

        part_num += 1
        extract_file_name = os.path.join(self.tmp_path, self.collection_name, 'data', str(part_num))
        extract_file = open(extract_file_name, "w")
        extract_file_codec = codecs.getwriter("utf-8")(extract_file)
        self.extract_file_names.append(extract_file_name)
        print "Creating file %s" % extract_file_name

      # validate policies
      rejected = False
      for required_field_name, policy in self.required_fields.iteritems():
        if policy['required'] and jsonpath_get(data, required_field_name) is None:

          # --------------------------------------------------------
          # document found that doesn't contain required fields.
          # --------------------------------------------------------

          # open a new file if necessary
          if self.num_records_rejected % NUM_RECORDS_PER_PART == 0:

            if reject_file != None:
              reject_file.close()

            reject_part_num += 1
            reject_file_name = os.path.join(self.tmp_path, self.collection_name, 'rejected', str(reject_part_num))
            reject_file = open(reject_file_name, "w")
            reject_file_codec = codecs.getwriter("utf-8")(reject_file)
            self.reject_file_names.append(reject_file_name)
            print "Creating reject file %s" % reject_file_name

          self.num_records_rejected += 1
          reject_file_codec.write("Rejected. Missing %s. Data: %s" % (required_field_name, dumps(data)))
          reject_file_codec.write('\n')

          rejected = True
          break

      if not rejected:
        self.num_records_extracted += 1
        extract_file_codec.write(dumps(data))
        extract_file_codec.write('\n')

    if extract_file != None:
      extract_file.close()

    if reject_file != None:
      reject_file.close()

  def simple_schema_gen(self):
    command = "cat %s | json/generate-schema-mapper.py | sort | json/generate-schema-reducer.py %s/%s/%s > /dev/null" \
              % (' '.join(self.extract_file_names), self.mongo_uri, self.schema_db_name, self.schema_collection_name)
    execute(command)


  def mr_schema_gen(self):

    hdfs_data_folder = "%s/%s/data" % (CLOUD_STORAGE_PATH, self.collection_name)
    hdfs_mr_output_folder = "%s/%s/schema_gen/output" % (CLOUD_STORAGE_PATH, self.collection_name)

    # delete folders
    self.cs.rmdir(hdfs_data_folder)
    self.cs.rmdir(hdfs_mr_output_folder)
    

    # copy extracted files to hdfs data folder
    self.cs.mkdir(hdfs_data_folder)

    for extract_file_name in self.extract_file_names:
      self.cs.copy_from_local(extract_file_name, hdfs_data_folder)

    hadoop_command = """hadoop jar %s \
                              -D mapred.job.name="onefold-mongo-generate-schema" \
                              %s \
                              -input %s -output %s \
                              -mapper 'json/generate-schema-mapper.py' \
                              -reducer 'json/generate-schema-reducer.py %s/%s/%s' \
                              -file json/generate-schema-mapper.py \
                              -file json/generate-schema-reducer.py
    """ % (HADOOP_MAPREDUCE_STREAMING_LIB, MAPREDUCE_PARAMS_STR, hdfs_data_folder,
           hdfs_mr_output_folder, self.mongo_uri,
           self.schema_db_name, self.schema_collection_name)
    execute(hadoop_command)


  def simple_data_transform(self):

    hdfs_mr_output_folder = "%s/%s/data_transform/output" % (CLOUD_STORAGE_PATH, self.collection_name)
    transform_data_tmp_path = "%s/%s/data_transform/output" % (self.tmp_path, self.collection_name)

    command = "cat %s | json/transform-data-mapper.py %s/%s/%s,%s > /dev/null" \
              % (' '.join(self.extract_file_names), self.mongo_uri, self.schema_db_name,
                 self.schema_collection_name, transform_data_tmp_path)
    execute(command)

    # delete folders
    self.cs.rmdir(hdfs_mr_output_folder)

    # manually copy files into hdfs
    fragment_values = self.get_fragments()
    for fragment_value in fragment_values:
      self.cs.mkdir("%s/%s" % (hdfs_mr_output_folder, fragment_value))
      self.cs.copy_from_local("%s/%s/part-00000" % (transform_data_tmp_path, fragment_value),
                              "%s/%s/" % (hdfs_mr_output_folder, fragment_value))
      

  def mr_data_transform(self):

    hdfs_data_folder = "%s/%s/data" % (CLOUD_STORAGE_PATH, self.collection_name)
    hdfs_mr_output_folder = "%s/%s/data_transform/output" % (CLOUD_STORAGE_PATH, self.collection_name)

    # delete folders
    self.cs.rmdir(hdfs_mr_output_folder)

    hadoop_command = """hadoop jar %s \
                              -libjars %s \
                              -D mapred.job.name="onefold-mongo-transform-data" \
                              -D mapred.reduce.tasks=0 \
                              %s \
                              -input %s -output %s \
                              -mapper 'json/transform-data-mapper.py %s/%s/%s' \
                              -file json/transform-data-mapper.py \
                              -outputformat com.onefold.hadoop.MapReduce.TransformDataMultiOutputFormat
    """ % (HADOOP_MAPREDUCE_STREAMING_LIB, ONEFOLD_MAPREDUCE_JAR, MAPREDUCE_PARAMS_STR, hdfs_data_folder, hdfs_mr_output_folder, self.mongo_uri,
           self.schema_db_name, self.schema_collection_name)
    execute(hadoop_command)


  # retrieve schema tree from schema collection
  def retrieve_schema_fields(self):

    # read schema from mongodb schema collection
    schema_fields = []

    mongo_schema_fields = self.mongo_schema_collection.find({"type": "field"})
    for mongo_schema_field in mongo_schema_fields:
      schema_fields.append(mongo_schema_field)

    # add hash code to field
    field = {}
    field['key'] = "hash_code"
    field['mode'] = "nullable"
    field['data_type'] = "string"
    schema_fields.append(field)

    return schema_fields


  def get_fragments(self):
    fragment_record = self.mongo_schema_collection.find_one({"type": "fragments"})
    if fragment_record != None:
      return fragment_record['fragments']
    else:
      return []


  def load_table_hive(self, shard_value=None, table_name=None, different_table_per_shard=False, data_import_id=None):

    # if shard_value is None:
    #   gcs_uri = "%s/data/*" % (self.mr4_output_folder_uri)
    # else:
    #   gcs_uri = "%s/data/%s/*" % (self.mr4_output_folder_uri, shard_value)

    if different_table_per_shard:
      full_table_name = "%s_%s" % (table_name, shard_value)
    else:
      full_table_name = "%s" % (table_name)

    cloud_storage_path = "%s/%s/data_transform/output/%s/" % (CLOUD_STORAGE_PATH, self.collection_name, shard_value)
    self.dw.load_table(self.dw_database_name, full_table_name, cloud_storage_path)

    # extract bq_job_id and save to db
    return "%s/%s" % (data_import_id, shard_value)


  def load_dw(self):

    # retrieve schema fields from mongodb schema collection
    schema_fields = self.retrieve_schema_fields()

    # create tables
    if self.write_disposition == 'overwrite':
      if self.dw.table_exists(self.dw_database_name, self.dw_table_name):
        self.dw.delete_table(self.dw_database_name, self.dw_table_name)
      self.dw_table_names = self.dw.create_table(self.dw_database_name, self.dw_table_name, schema_fields, self.process_array)
    else:
      # if append, update table.
      if self.dw.table_exists(self.dw_database_name, self.dw_table_name):
        self.dw_table_names = self.dw.update_table(self.dw_database_name, self.dw_table_name, schema_fields)
      else:
        self.dw_table_names = self.dw.create_table(self.dw_database_name, self.dw_table_name, schema_fields, self.process_array)

    # load data
    fragment_values = self.get_fragments()

    if fragment_values == None or len(fragment_values) == 0:
      table_name = self.dw_table_name
      self.load_table_hive(shard_value=None, table_name=table_name, different_table_per_shard=False, data_import_id=None)

    else:
      for fragment_value in fragment_values:
        print "Loading fragment: " + fragment_value
        if fragment_value == 'root':
          table_name = self.dw_table_name
        else:
          table_name = self.dw_table_name + "_" + fragment_value

        self.load_table_hive(shard_value=fragment_value, table_name=table_name, different_table_per_shard=False, data_import_id=None)


  def run(self):
    # init (start mongo client)
    self.initialize()

    # extract data from Mongo
    self.extract_data()

    if self.num_records_extracted > 0:
      # generate schema and transform data
      if self.use_mr:
        self.mr_schema_gen()
        self.mr_data_transform()
      else:
        self.simple_schema_gen()
        self.simple_data_transform()

      # Create data warehouse tables and load data into them
      self.load_dw()

    print '-------------------'
    print '    RUN SUMMARY'
    print '-------------------'
    print 'Num records extracted %s' % self.num_records_extracted
    print 'Num records rejected %s' % self.num_records_rejected
    print 'Extracted data with %s from %s to %s' % (self.collection_sort_by_field, self.sort_by_field_min, self.sort_by_field_max)
    print 'Extracted files are located at: %s' % (' '.join(self.extract_file_names))
    print 'Destination Tables: %s' % (' '.join(self.dw_table_names))
    print 'Schema is stored in Mongo %s.%s' % (self.schema_db_name, self.schema_collection_name)
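
For reference, a minimal driver for the Loader class above could look like the following; the attribute values are placeholders for illustration only and are not taken from the original source.

# Hypothetical usage: configure a Loader and run the full
# extract -> schema-gen -> transform -> load pipeline.
loader = Loader()
loader.infra_type = 'hadoop'                 # or 'gcloud'
loader.mongo_uri = 'mongodb://localhost:27017'
loader.db_name = 'mydb'
loader.collection_name = 'events'
loader.collection_sort_by_field = '_id'
loader.tmp_path = '/tmp/onefold'
loader.schema_db_name = 'mydb'
loader.schema_collection_name = 'events_schema'
loader.write_disposition = 'overwrite'
loader.dw_database_name = 'default'
loader.dw_table_name = 'events'
loader.run()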
Example #4
class Loader:

    # control params
    infra_type = None
    mongo_uri = None
    db_name = None
    collection_name = None
    collection_sort_by_field = None
    extract_query = None
    tmp_path = None
    schema_db_name = None
    schema_collection_name = None
    use_mr = False

    hiveserveer_host = None
    hiveserver_port = None

    gcloud_project_id = None
    gcloud_storage_bucket_id = None

    write_disposition = None
    process_array = "child_table"
    dw_database_name = None
    dw_table_name = None

    policies = None

    # mongo client and schema collection
    mongo_client = None
    mongo_schema_collection = None

    # runtime variables
    extract_file_names = []
    reject_file_names = []
    sort_by_field_min = None
    sort_by_field_max = None
    dw_table_names = []
    dw = None
    cs = None
    num_records_extracted = 0
    num_records_rejected = 0

    # policy related variables
    required_fields = {}

    def initialize(self):

        # open mongo client
        self.mongo_client = MongoClient(self.mongo_uri)

        # open schema collection
        mongo_schema_db = self.mongo_client[self.schema_db_name]
        self.mongo_schema_collection = mongo_schema_db[
            self.schema_collection_name]

        # if overwrite, delete schema collection
        if self.write_disposition == 'overwrite':
            self.mongo_schema_collection.delete_many({})

        # create data warehouse object
        if self.infra_type == 'hadoop':
            self.dw = Hive(self.hiveserveer_host, self.hiveserver_port,
                           ONEFOLD_HIVESERDES_JAR)
            self.cs = HDFSStorage()
        elif self.infra_type == 'gcloud':
            self.dw = GBigQuery(self.gcloud_project_id,
                                self.gcloud_storage_bucket_id)
            self.cs = GCloudStorage(self.gcloud_project_id,
                                    self.gcloud_storage_bucket_id)

        # turn policies into better data structure for use later (required_fields)
        if self.policies != None:
            for policy in self.policies:
                if 'key' in policy:
                    if 'required' in policy:
                        if policy['key'] not in self.required_fields:
                            self.required_fields[policy['key']] = {}
                        self.required_fields[policy['key']] = policy

                    if 'data_type' in policy:
                        datatype_overwrite = policy['data_type']

                        if 'mode' in policy:
                            mode_overwrite = policy['mode']
                        else:
                            mode_overwrite = 'nullable'

                        self.mongo_schema_collection.update_one(
                            {
                                "key": policy['key'].replace(".", "_"),
                                "type": "field"
                            }, {
                                "$set": {
                                    "data_type": datatype_overwrite,
                                    "mode": mode_overwrite,
                                    "forced": True
                                }
                            },
                            upsert=True)

    def extract_data(self):

        # create tmp_path folder if necessary
        if not os.path.exists(
                os.path.join(self.tmp_path, self.collection_name, 'data')):
            os.makedirs(
                os.path.join(self.tmp_path, self.collection_name, 'data'))

        if not os.path.exists(
                os.path.join(self.tmp_path, self.collection_name, 'rejected')):
            os.makedirs(
                os.path.join(self.tmp_path, self.collection_name, 'rejected'))

        # delete old tmp files if exists
        for old_file in glob.glob(
                os.path.join(self.tmp_path, self.collection_name, 'data',
                             '*')):
            print "Deleting old file %s" % (old_file)
            os.remove(old_file)

        for old_file in glob.glob(
                os.path.join(self.tmp_path, self.collection_name, 'rejected',
                             '*')):
            print "Deleting old file %s" % (old_file)
            os.remove(old_file)

        # some state variables
        part_num = 0
        extract_file = None

        reject_part_num = 0
        reject_file = None

        # start mongo client
        db = self.mongo_client[self.db_name]
        collection = db[self.collection_name]

        # turn query string into json
        if self.extract_query is not None:
            if 'ObjectId' in self.extract_query:
                # kinda hacky.. and dangerous! This is to evaluate an expression
                # like {"_id": {$gt:ObjectId("55401a60151a4b1a4f000001")}}
                from bson.objectid import ObjectId
                extract_query_json = eval(self.extract_query)
            else:
                extract_query_json = json.loads(self.extract_query)
        else:
            extract_query_json = None

        # query collection, sort by collection_sort_by_field
        for data in collection.find(extract_query_json).sort(
                self.collection_sort_by_field, 1):

            # track min and max id for auditing..
            if self.sort_by_field_min == None:
                self.sort_by_field_min = data[self.collection_sort_by_field]
            self.sort_by_field_max = data[self.collection_sort_by_field]

            # open a new file if necessary
            if self.num_records_extracted % NUM_RECORDS_PER_PART == 0:

                if extract_file != None:
                    extract_file.close()

                part_num += 1
                extract_file_name = os.path.join(self.tmp_path,
                                                 self.collection_name, 'data',
                                                 str(part_num))
                extract_file = open(extract_file_name, "w")
                extract_file_codec = codecs.getwriter("utf-8")(extract_file)
                self.extract_file_names.append(extract_file_name)
                print "Creating file %s" % extract_file_name

            # validate policies
            rejected = False
            for required_field_name, policy in self.required_fields.iteritems():
                if policy['required'] and jsonpath_get(
                        data, required_field_name) is None:

                    # --------------------------------------------------------
                    # document found that doesn't contain required fields.
                    # --------------------------------------------------------

                    # open a new file if necessary
                    if self.num_records_rejected % NUM_RECORDS_PER_PART == 0:

                        if reject_file != None:
                            reject_file.close()

                        reject_part_num += 1
                        reject_file_name = os.path.join(
                            self.tmp_path, self.collection_name, 'rejected',
                            str(reject_part_num))
                        reject_file = open(reject_file_name, "w")
                        reject_file_codec = codecs.getwriter("utf-8")(
                            reject_file)
                        self.reject_file_names.append(reject_file_name)
                        print "Creating reject file %s" % reject_file_name

                    self.num_records_rejected += 1
                    reject_file_codec.write("Rejected. Missing %s. Data: %s" %
                                            (required_field_name, dumps(data)))
                    reject_file_codec.write('\n')

                    rejected = True
                    break

            if not rejected:
                self.num_records_extracted += 1
                extract_file_codec.write(dumps(data))
                extract_file_codec.write('\n')

        if extract_file != None:
            extract_file.close()

        if reject_file != None:
            reject_file.close()

    def simple_schema_gen(self):
        command = "cat %s | json/generate-schema-mapper.py | sort | json/generate-schema-reducer.py %s/%s/%s > /dev/null" \
                  % (' '.join(self.extract_file_names), self.mongo_uri, self.schema_db_name, self.schema_collection_name)
        execute(command)

    def mr_schema_gen(self):

        hdfs_data_folder = "%s/%s/data" % (CLOUD_STORAGE_PATH,
                                           self.collection_name)
        hdfs_mr_output_folder = "%s/%s/schema_gen/output" % (
            CLOUD_STORAGE_PATH, self.collection_name)

        # delete folders
        self.cs.rmdir(hdfs_data_folder)
        self.cs.rmdir(hdfs_mr_output_folder)

        # copy extracted files to hdfs data folder
        self.cs.mkdir(hdfs_data_folder)

        for extract_file_name in self.extract_file_names:
            self.cs.copy_from_local(extract_file_name, hdfs_data_folder)

        hadoop_command = """hadoop jar %s \
                              -D mapred.job.name="onefold-mongo-generate-schema" \
                              %s \
                              -input %s -output %s \
                              -mapper 'json/generate-schema-mapper.py' \
                              -reducer 'json/generate-schema-reducer.py %s/%s/%s' \
                              -file json/generate-schema-mapper.py \
                              -file json/generate-schema-reducer.py
    """ % (HADOOP_MAPREDUCE_STREAMING_LIB, MAPREDUCE_PARAMS_STR,
           hdfs_data_folder, hdfs_mr_output_folder, self.mongo_uri,
           self.schema_db_name, self.schema_collection_name)
        execute(hadoop_command)

    def simple_data_transform(self):

        hdfs_mr_output_folder = "%s/%s/data_transform/output" % (
            CLOUD_STORAGE_PATH, self.collection_name)
        transform_data_tmp_path = "%s/%s/data_transform/output" % (
            self.tmp_path, self.collection_name)

        command = "cat %s | json/transform-data-mapper.py %s/%s/%s,%s > /dev/null" \
                  % (' '.join(self.extract_file_names), self.mongo_uri, self.schema_db_name,
                     self.schema_collection_name, transform_data_tmp_path)
        execute(command)

        # delete folders
        self.cs.rmdir(hdfs_mr_output_folder)

        # manually copy files into hdfs
        fragment_values = self.get_fragments()
        for fragment_value in fragment_values:
            self.cs.mkdir("%s/%s" % (hdfs_mr_output_folder, fragment_value))
            self.cs.copy_from_local(
                "%s/%s/part-00000" % (transform_data_tmp_path, fragment_value),
                "%s/%s/" % (hdfs_mr_output_folder, fragment_value))

    def mr_data_transform(self):

        hdfs_data_folder = "%s/%s/data" % (CLOUD_STORAGE_PATH,
                                           self.collection_name)
        hdfs_mr_output_folder = "%s/%s/data_transform/output" % (
            CLOUD_STORAGE_PATH, self.collection_name)

        # delete folders
        self.cs.rmdir(hdfs_mr_output_folder)

        hadoop_command = """hadoop jar %s \
                              -libjars %s \
                              -D mapred.job.name="onefold-mongo-transform-data" \
                              -D mapred.reduce.tasks=0 \
                              %s \
                              -input %s -output %s \
                              -mapper 'json/transform-data-mapper.py %s/%s/%s' \
                              -file json/transform-data-mapper.py \
                              -outputformat com.onefold.hadoop.MapReduce.TransformDataMultiOutputFormat
    """ % (HADOOP_MAPREDUCE_STREAMING_LIB, ONEFOLD_MAPREDUCE_JAR,
           MAPREDUCE_PARAMS_STR, hdfs_data_folder, hdfs_mr_output_folder,
           self.mongo_uri, self.schema_db_name, self.schema_collection_name)
        execute(hadoop_command)

    # retrieve schema tree from schema collection
    def retrieve_schema_fields(self):

        # read schema from mongodb schema collection
        schema_fields = []

        mongo_schema_fields = self.mongo_schema_collection.find(
            {"type": "field"})
        for mongo_schema_field in mongo_schema_fields:
            schema_fields.append(mongo_schema_field)

        # add hash code to field
        field = {}
        field['key'] = "hash_code"
        field['mode'] = "nullable"
        field['data_type'] = "string"
        schema_fields.append(field)

        return schema_fields

    def get_fragments(self):
        fragment_record = self.mongo_schema_collection.find_one(
            {"type": "fragments"})
        if fragment_record != None:
            return fragment_record['fragments']
        else:
            return []

    def load_table_hive(self,
                        shard_value=None,
                        table_name=None,
                        different_table_per_shard=False,
                        data_import_id=None):

        # if shard_value is None:
        #   gcs_uri = "%s/data/*" % (self.mr4_output_folder_uri)
        # else:
        #   gcs_uri = "%s/data/%s/*" % (self.mr4_output_folder_uri, shard_value)

        if different_table_per_shard:
            full_table_name = "%s_%s" % (table_name, shard_value)
        else:
            full_table_name = "%s" % (table_name)

        cloud_storage_path = "%s/%s/data_transform/output/%s/" % (
            CLOUD_STORAGE_PATH, self.collection_name, shard_value)
        self.dw.load_table(self.dw_database_name, full_table_name,
                           cloud_storage_path)

        # extract bq_job_id and save to db
        return "%s/%s" % (data_import_id, shard_value)

    def load_dw(self):

        # retrieve schema fields from mongodb schema collection
        schema_fields = self.retrieve_schema_fields()

        # create tables
        if self.write_disposition == 'overwrite':
            if self.dw.table_exists(self.dw_database_name, self.dw_table_name):
                self.dw.delete_table(self.dw_database_name, self.dw_table_name)
            self.dw_table_names = self.dw.create_table(self.dw_database_name,
                                                       self.dw_table_name,
                                                       schema_fields,
                                                       self.process_array)
        else:
            # if append, update table.
            if self.dw.table_exists(self.dw_database_name, self.dw_table_name):
                self.dw_table_names = self.dw.update_table(
                    self.dw_database_name, self.dw_table_name, schema_fields)
            else:
                self.dw_table_names = self.dw.create_table(
                    self.dw_database_name, self.dw_table_name, schema_fields,
                    self.process_array)

        # load data
        fragment_values = self.get_fragments()

        if fragment_values == None or len(fragment_values) == 0:
            table_name = self.dw_table_name
            self.load_table_hive(shard_value=None,
                                 table_name=table_name,
                                 different_table_per_shard=False,
                                 data_import_id=None)

        else:
            for fragment_value in fragment_values:
                print "Loading fragment: " + fragment_value
                if fragment_value == 'root':
                    table_name = self.dw_table_name
                else:
                    table_name = self.dw_table_name + "_" + fragment_value

                self.load_table_hive(shard_value=fragment_value,
                                     table_name=table_name,
                                     different_table_per_shard=False,
                                     data_import_id=None)

    def run(self):
        # init (start mongo client)
        self.initialize()

        # extract data from Mongo
        self.extract_data()

        if self.num_records_extracted > 0:
            # generate schema and transform data
            if self.use_mr:
                self.mr_schema_gen()
                self.mr_data_transform()
            else:
                self.simple_schema_gen()
                self.simple_data_transform()

            # Create data warehouse tables and load data into them
            self.load_dw()

        print '-------------------'
        print '    RUN SUMMARY'
        print '-------------------'
        print 'Num records extracted %s' % self.num_records_extracted
        print 'Num records rejected %s' % self.num_records_rejected
        print 'Extracted data with %s from %s to %s' % (
            self.collection_sort_by_field, self.sort_by_field_min,
            self.sort_by_field_max)
        print 'Extracted files are located at: %s' % (' '.join(
            self.extract_file_names))
        print 'Destination Tables: %s' % (' '.join(self.dw_table_names))
        print 'Schema is stored in Mongo %s.%s' % (self.schema_db_name,
                                                   self.schema_collection_name)
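
Note that extract_data() relies on a jsonpath_get helper that is not shown in these examples. A plausible stand-in, assuming it simply resolves a dotted key path against a nested document and returns None when any segment is missing (an assumption, not the original implementation):

# Assumed stand-in for the jsonpath_get helper used in extract_data():
# walks a dotted path such as "user.email" through nested dicts and
# returns None when any segment is absent.
def jsonpath_get(data, path):
    current = data
    for segment in path.split('.'):
        if isinstance(current, dict) and segment in current:
            current = current[segment]
        else:
            return None
    return current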