def make_delta(self, df, mapping, send_update=True):
    """Process ``df`` partition by partition and publish a timestamped delta.

    Steps, in order:

    1. Take the current wall-clock time in milliseconds as the delta
       timestamp.
    2. Store the timestamp and ``mapping`` on ``self`` and run
       ``self.process_partition`` on every partition of ``df.rdd``.
    3. Write an empty ``_SUCCESS`` file into the ``ts=<timestamp>``
       directory on HDFS and set that directory's owner to ``dmpkit``.
    4. If ``send_update`` is true, write a JSON update descriptor under
       ``profiles/.updates/`` and set its owner to ``dmpkit`` as well.

    Args:
        df: DataFrame whose ``rdd`` is handed to ``foreachPartition``.
        mapping: Stored as ``self.mapping`` before the partitions are
            processed. NOTE(review): presumably read by
            ``process_partition`` -- confirm against that method.
        send_update: When false, the update descriptor is not written.

    Returns:
        None.
    """
    now_ms = int(time.time() * 1e3)

    # TODO: very bad -- these values need to be passed into
    # foreachPartition some other way than via instance attributes.
    self.timestamp = now_ms
    self.mapping = mapping
    df.rdd.foreachPartition(self.process_partition)

    client = InsecureClient(self.host, user=self.user)

    delta_dir = '/data/{}/.dmpkit/profiles/{}/cdm/ts={}'.format(
        self.cid, self.source, now_ms)
    success_marker = '/data/{}/.dmpkit/profiles/{}/cdm/ts={}/_SUCCESS'.format(
        self.cid, self.source, now_ms)

    # The marker is an empty file; the owner is set on the ts= directory.
    client.write(success_marker, data="")
    client.set_owner(delta_dir, owner='dmpkit')

    if not send_update:
        return

    # Each update descriptor gets its own random id, used both as the file
    # name and as the "id" field of the payload.
    descriptor_id = str(uuid4())
    descriptor_path = '/data/{}/.dmpkit/profiles/.updates/{}.json'.format(
        self.cid, descriptor_id)
    descriptor = {
        "id": descriptor_id,
        "owner": self.cid,
        "dataset": "profiles",
        "source": self.source,
        "created": now_ms,
        "path": delta_dir,
        "mergeStrategy": "merge",
    }

    client.write(descriptor_path,
                 data=json.dumps(descriptor),
                 encoding='utf-8')
    client.set_owner(descriptor_path, owner='dmpkit')