Ejemplo n.º 1
0
    def initialize(self):
        """Open the backing connections and apply the configured policies.

        Steps performed, in order:

        1. Create a ``MongoClient`` from ``self.mongo_uri`` and bind
           ``self.mongo_schema_collection`` to the collection named by
           ``self.schema_db_name`` / ``self.schema_collection_name``.
        2. When ``self.write_disposition`` is ``'overwrite'``, delete every
           document in the schema collection.
        3. Create ``self.dw`` and ``self.cs`` according to
           ``self.infra_type`` (``'hadoop'`` or ``'gcloud'``). For any other
           value this method leaves both attributes untouched.
        4. For every policy in ``self.policies`` that has a ``'key'``:
           record it in ``self.required_fields`` when it has a ``'required'``
           entry, and upsert a forced ``data_type`` / ``mode`` override into
           the schema collection when it has a ``'data_type'`` entry.
        """

        # open mongo client
        self.mongo_client = MongoClient(self.mongo_uri)

        # open schema collection
        mongo_schema_db = self.mongo_client[self.schema_db_name]
        self.mongo_schema_collection = mongo_schema_db[
            self.schema_collection_name]

        # if overwrite, delete schema collection
        if self.write_disposition == 'overwrite':
            # delete_many({}) is the supported replacement for the legacy
            # Collection.remove({}) and matches the update_one() call below.
            self.mongo_schema_collection.delete_many({})

        # create data warehouse object
        if self.infra_type == 'hadoop':
            # NOTE(review): "hiveserveer_host" looks like a typo of
            # "hiveserver_host" - confirm against where the attribute is set
            # before renaming; it is kept as-is here.
            self.dw = Hive(self.hiveserveer_host, self.hiveserver_port,
                           ONEFOLD_HIVESERDES_JAR)
            self.cs = HDFSStorage()
        elif self.infra_type == 'gcloud':
            self.dw = GBigQuery(self.gcloud_project_id,
                                self.gcloud_storage_bucket_id)
            self.cs = GCloudStorage(self.gcloud_project_id,
                                    self.gcloud_storage_bucket_id)

        # turn policies into better data structure for use later (required_fields)
        if self.policies is not None:
            for policy in self.policies:
                # policies without a key cannot be mapped to a field
                if 'key' not in policy:
                    continue

                if 'required' in policy:
                    # The previous check
                    # `key not in self.required_fields == None` was a chained
                    # comparison that could never be true for a dict and
                    # raised TypeError for None; initialise explicitly.
                    if self.required_fields is None:
                        self.required_fields = {}
                    self.required_fields[policy['key']] = policy

                if 'data_type' in policy:
                    # Schema keys are stored with "." replaced by "_".
                    self.mongo_schema_collection.update_one(
                        {
                            "key": policy['key'].replace(".", "_"),
                            "type": "field"
                        }, {
                            "$set": {
                                "data_type": policy['data_type'],
                                # mode falls back to 'nullable' when absent
                                "mode": policy.get('mode', 'nullable'),
                                "forced": True
                            }
                        },
                        upsert=True)