def get_stream_samples(self, stream_id, day, start_time=None, end_time=None) -> List:
    """
    Returns the sample values of a stream for a given day, optionally limited to a time window.

    :param stream_id:
    :param day:
    :param start_time:
    :param end_time:
    :return: list of sample values
    """
    return Data(self).get_stream_samples(stream_id, day, start_time, end_time)
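# Usage sketch (illustrative, not part of the original code): fetching one day of
# sample values. `cc` is assumed to be an initialized CerebralCortex instance and
# `stream_uuid` an existing stream identifier; both are hypothetical placeholders.
def _example_print_stream_samples(cc, stream_uuid, day: str = "20171001"):
    samples = cc.get_stream_samples(stream_uuid, day)
    for sample in samples:
        print(sample)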
def get_cassandra_raw_data(self, stream_id: uuid, day: str) -> List:
    """
    Returns the raw Cassandra rows of a stream for a given day.

    :param stream_id:
    :param day:
    :return:
    """
    return Data(self).load_cassandra_data(stream_id, day)
def get_stream_dataframe(self, stream_identifier: uuid, start_time: datetime = None,
                         end_time: datetime = None,
                         data_type: enumerate = DataSet.COMPLETE) -> dict:
    """
    :param stream_identifier:
    :param start_time:
    :param end_time:
    :param data_type: accepts one of three values (i.e., all, data, or metadata)
    :return: {"metadata": dict, "data": DataFrame}
    """
    return Data(self).get_stream_dataframe(stream_identifier, start_time, end_time, data_type)
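# Usage sketch (illustrative only): consuming the {"metadata", "data"} dict that
# get_stream_dataframe returns. `cc` and `stream_uuid` are hypothetical placeholders,
# and the "name" metadata key is an assumption about the metadata layout.
def _example_inspect_stream_dataframe(cc, stream_uuid):
    result = cc.get_stream_dataframe(stream_uuid, data_type=DataSet.COMPLETE)
    stream_metadata = result["metadata"]   # stream metadata as a dict
    stream_df = result["data"]             # stream rows as a DataFrame
    print(stream_metadata.get("name"))
    return stream_df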
def get_datastream(self, stream_identifier: uuid, day=None, start_time: datetime = None,
                   end_time: datetime = None,
                   data_type: enumerate = DataSet.COMPLETE) -> DataStream:
    """
    Returns a data stream with data and metadata.

    :param stream_identifier:
    :param day:
    :param start_time:
    :param end_time:
    :param data_type:
    :return:
    """
    return Data(self).get_stream(stream_identifier, day, start_time, end_time, data_type)
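# Usage sketch (illustrative only): loading one day of a stream with both data and
# metadata. `cc` and `stream_uuid` are hypothetical; the DataPoint attributes used
# below (start_time, sample) are assumptions about the data model.
def _example_load_day(cc, stream_uuid, day: str = "20171001") -> DataStream:
    ds = cc.get_datastream(stream_uuid, day=day, data_type=DataSet.COMPLETE)
    for dp in ds.data:
        print(dp.start_time, dp.sample)
    return ds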
def filter_stream(self, data_stream_id: uuid, annotation_stream_name: str, annotation: str,
                  start_time: datetime = None, end_time: datetime = None) -> List[DataPoint]:
    """
    Maps a derived annotation stream onto a data stream and returns the list of mapped DataPoints.

    :param data_stream_id:
    :param annotation_stream_name:
    :param annotation:
    :param start_time:
    :param end_time:
    :return:
    """
    annotation_stream_id = Metadata(self).get_annotation_id(data_stream_id, annotation_stream_name)
    return Data(self).get_annotation_stream(data_stream_id, annotation_stream_id, annotation,
                                            start_time, end_time)
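# Usage sketch (illustrative only): keeping only the data points that fall under a
# given annotation value. The annotation stream name and annotation value below are
# made-up examples; `cc` and `data_stream_uuid` are hypothetical placeholders.
def _example_filter_by_annotation(cc, data_stream_uuid) -> List[DataPoint]:
    return cc.filter_stream(data_stream_uuid,
                            annotation_stream_name="STUDY_MOTIONLESS_MARKER",
                            annotation="motionless")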
def save_datastream(self, datastream: DataStream):
    """
    Saves a DataStream to the appropriate data stores.

    :param datastream:
    """
    Data(self).store_stream(datastream)
def save_datastream_to_influxdb(self, datastream: DataStream):
    """
    Saves a DataStream to InfluxDB.

    :param datastream:
    """
    Data(self).store_data_to_influxdb(datastream)
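# Usage sketch (illustrative only): persisting a newly computed DataStream to the
# primary stores and to InfluxDB. `cc` and `new_datastream` are hypothetical
# placeholders; whether both calls are needed depends on the deployment.
def _example_persist_datastream(cc, new_datastream: DataStream):
    cc.save_datastream(new_datastream)
    cc.save_datastream_to_influxdb(new_datastream)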
import bz2
import json
import os
import shutil
import uuid
from datetime import datetime, timedelta
from typing import List

# CerebralCortex, Data, Metadata, and the calculate_time decorator are assumed to be
# importable from the surrounding package.


class DataExporter():
    def __init__(self, CC_obj: CerebralCortex, export_dir_path: str, owner_ids: List = None,
                 owner_user_names: List = None, owner_name_regex: str = None,
                 start_time: str = None, end_time: str = None):
        """
        :param CC_obj:
        :param export_dir_path:
        :param owner_ids: owner_user_names and owner_name_regex must be None if using owner_ids
        :param owner_user_names: owner_ids and owner_name_regex must be None if using owner_user_names
        :param owner_name_regex: owner_ids and owner_user_names must be None if using owner_name_regex
        :param start_time:
        :param end_time:
        """
        self.streamData = Data(CC_obj)
        self.export_dir_path = export_dir_path
        self.metadata = Metadata(CC_obj)
        self.owner_ids = owner_ids
        self.owner_user_names = owner_user_names
        self.owner_name_regex = str(owner_name_regex)
        self.start_time = start_time
        self.end_time = end_time

    def start(self):
        if self.owner_ids and self.owner_ids != 'None':
            for owner_id in self.owner_ids:
                owner_name = self.metadata.owner_id_to_name(owner_id)
                self.export_data(owner_id=owner_id, owner_name=owner_name)
        elif self.owner_user_names and self.owner_user_names != 'None':
            for owner_user_name in self.owner_user_names:
                owner_id = self.metadata.owner_name_to_id(owner_user_name)
                self.export_data(owner_id=owner_id, owner_name=owner_user_name)
        elif self.owner_name_regex and self.owner_name_regex != 'None':
            owner_idz = self.metadata.get_owner_ids_by_owner_name_regex(self.owner_name_regex)
            for owner_id in owner_idz:
                owner_name = self.metadata.owner_id_to_name(owner_id["identifier"])
                self.export_data(owner_id=owner_id["identifier"], owner_name=owner_name)

    @calculate_time
    def export_data(self, owner_id=None, owner_name=None):
        rows = self.metadata.get_stream_metadata_by_owner_id(owner_id)
        if rows == "NULL":
            print("No data found for => owner-id: " + owner_id + " - owner-name: " + owner_name)
            return

        for row in rows:
            stream_id = row["identifier"]
            data_start_time = row["start_time"]
            data_end_time = row["end_time"]
            stream_metadata = {
                "identifier": stream_id,
                "owner_id": row["owner"],
                "name": row["name"],
                "data_available": {
                    "start_time": str(data_start_time),
                    "end_time": str(data_end_time)
                }
            }
            data_descriptor = json.loads(row["data_descriptor"])
            execution_context = json.loads(row["execution_context"])
            annotations = json.loads(row["annotations"])
            stream_metadata.update({"data_descriptor": data_descriptor})
            stream_metadata.update({"execution_context": execution_context})
            stream_metadata.update({"annotations": annotations})

            file_path = self.export_dir_path + owner_name
            if not os.path.exists(file_path):
                os.mkdir(file_path)

            # write metadata to a json file
            self.write_to_file(file_path + "/" + stream_id + ".json", json.dumps(stream_metadata))

            # load raw stream data and write one gzip file per day
            delta = data_end_time - data_start_time
            for i in range(delta.days + 1):
                day = data_start_time + timedelta(days=i)
                day = datetime.strftime(day, "%Y%m%d")
                self.writeStreamDataToZipFile(stream_id, day, file_path)

    def writeStreamDataToZipFile(self, stream_id: uuid, day, file_path: str):
        """
        Loads one day of raw stream data from Cassandra and writes it as a gzip file under file_path.

        :param stream_id:
        :param day:
        :param file_path:
        """
        if stream_id:
            where_clause = "identifier='" + stream_id + "' and day='" + str(day) + "'"
        else:
            raise ValueError("Missing stream ID.")

        if self.start_time and self.end_time:
            where_clause += " and start_time>=cast('" + str(self.start_time) + \
                            "' as timestamp) and start_time<=cast('" + str(self.end_time) + "' as timestamp)"
        elif self.start_time and not self.end_time:
            where_clause += " and start_time>=cast('" + str(self.start_time) + "' as timestamp)"
        elif not self.start_time and self.end_time:
            where_clause += " and start_time<=cast('" + str(self.end_time) + "' as timestamp)"

        df = self.streamData.load_data_from_cassandra(self.streamData.datapointTable, where_clause, 1)

        df.write \
            .format("csv") \
            .option("codec", "org.apache.hadoop.io.compress.GzipCodec") \
            .save(file_path + "/" + stream_id)

        # merge the partition files into a single .gz file, then remove the partition directory
        os.system("cat " + file_path + "/" + stream_id + "/p* > " + file_path + "/" + stream_id + ".gz")
        if os.path.exists(file_path + "/" + stream_id + "/"):
            shutil.rmtree(file_path + "/" + stream_id + "/", ignore_errors=True)

    def write_to_bz2(self, file_name, data):
        """
        :param file_name:
        :param data: bytes to compress and write
        """
        with open(file_name, 'wb+') as outfile:
            compressed_data = bz2.compress(data, 9)
            outfile.write(compressed_data)

    def write_to_file(self, file_name: str, data: str):
        """
        :param file_name:
        :param data:
        """
        with open(file_name, 'w+') as outfile:
            outfile.write(data)
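# Usage sketch (illustrative only): running a full export for a list of owner ids.
# The CerebralCortex instance, export directory, owner UUID, and time window below
# are hypothetical placeholders; per the constructor docstring, only one of
# owner_ids / owner_user_names / owner_name_regex should be supplied.
def _example_run_export(cc_instance: CerebralCortex):
    exporter = DataExporter(cc_instance,
                            export_dir_path="/tmp/cc_export/",
                            owner_ids=["00000000-0000-0000-0000-000000000000"],
                            start_time="2017-10-01 00:00:00",
                            end_time="2017-10-02 00:00:00")
    exporter.start()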
def save_datastream(self, datastream):
    """Stores a DataStream via the Data layer built from the Spark context, SQL context, and configuration."""
    Data(self.sc, self.sqlContext, self.configuration).store_datastream(datastream)


def get_datastream(self, stream_identifier):
    """Loads a DataStream via the Data layer built from the Spark context, SQL context, and configuration."""
    return Data(self.sc, self.sqlContext, self.configuration).get_datastream(stream_identifier)