def initialize(self):
    """Open the schema store, build the warehouse objects and apply policies.

    Steps, in order:

    1. Create the Mongo client from ``self.mongo_uri`` and select the schema
       collection ``self.schema_db_name`` / ``self.schema_collection_name``.
    2. When ``self.write_disposition`` is ``'overwrite'``, delete every
       document in the schema collection.
    3. Create ``self.dw`` and ``self.cs`` according to ``self.infra_type``
       (``'hadoop'`` or ``'gcloud'``).  Any other value leaves both
       attributes untouched.
    4. For every policy in ``self.policies`` that has a ``'key'``:
       policies with a ``'required'`` entry are stored in
       ``self.required_fields`` under that key, and policies with a
       ``'data_type'`` entry are upserted into the schema collection as
       forced field definitions.
    """
    # open mongo client
    self.mongo_client = MongoClient(self.mongo_uri)

    # open schema collection
    mongo_schema_db = self.mongo_client[self.schema_db_name]
    self.mongo_schema_collection = mongo_schema_db[self.schema_collection_name]

    # if overwrite, delete schema collection
    if self.write_disposition == 'overwrite':
        # delete_many({}) replaces the legacy Collection.remove({}) call.
        self.mongo_schema_collection.delete_many({})

    # create data warehouse object
    if self.infra_type == 'hadoop':
        # NOTE(review): "hiveserveer_host" looks like a misspelling of
        # "hiveserver_host" (compare hiveserver_port) -- confirm against the
        # attribute declaration before renaming.
        self.dw = Hive(self.hiveserveer_host, self.hiveserver_port,
                       ONEFOLD_HIVESERDES_JAR)
        self.cs = HDFSStorage()
    elif self.infra_type == 'gcloud':
        self.dw = GBigQuery(self.gcloud_project_id,
                            self.gcloud_storage_bucket_id)
        self.cs = GCloudStorage(self.gcloud_project_id,
                                self.gcloud_storage_bucket_id)

    # turn policies into better data structure for use later (required_fields)
    if self.policies is not None:
        for policy in self.policies:
            if 'key' not in policy:
                continue
            key = policy['key']

            if 'required' in policy:
                # The previous check chained "not in" with "== None", which
                # could never succeed on a dict and raised TypeError on None.
                # Make sure the mapping exists, then record the policy.
                if self.required_fields is None:
                    self.required_fields = {}
                self.required_fields[key] = policy

            if 'data_type' in policy:
                datatype_overwrite = policy['data_type']
                mode_overwrite = policy.get('mode', 'nullable')

                # Schema keys are stored with dots replaced by underscores.
                self.mongo_schema_collection.update_one(
                    {"key": key.replace(".", "_"), "type": "field"},
                    {"$set": {"data_type": datatype_overwrite,
                              "mode": mode_overwrite,
                              "forced": True}},
                    upsert=True)