Example no. 1
0
def store_streams(data):
    """Store one decoded stream message via CC, timing the save."""
    try:
        st = datetime.datetime.now()
        CC.save_datastream_to_influxdb(data)
        CC.save_datastream(data, "json")
        print("Stream Saved: ", data['filename'],
              (datetime.datetime.now() - st))
    except:
        cc_log()
def storeOffsetRanges(rdd):
    """Persist the Kafka offset ranges consumed in one Spark batch."""
    offsetRanges = rdd.offsetRanges()
    for offsets in offsetRanges:
        try:
            CC.store_or_update_Kafka_offset(offsets.topic, offsets.partition,
                                            offsets.fromOffset,
                                            offsets.untilOffset)
        except:
            cc_log()
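# A minimal sketch (not from the original code) of how the two handlers above
# might be wired into a Spark Streaming job. The topic name, broker address,
# batch interval, and the use of the legacy pyspark.streaming.kafka direct API
# are all assumptions for illustration.
import json

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

sc = SparkContext(appName="kafka-to-cerebralcortex")
ssc = StreamingContext(sc, 5)  # 5-second micro-batches (assumed)

kafka_stream = KafkaUtils.createDirectStream(
    ssc, ["filequeue"], {"metadata.broker.list": "localhost:9092"})

# Record the Kafka offsets consumed in each batch, then store every message.
kafka_stream.foreachRDD(storeOffsetRanges)
kafka_stream.map(lambda kv: json.loads(kv[1])).foreachRDD(
    lambda rdd: rdd.foreach(store_streams))

ssc.start()
ssc.awaitTermination()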
def file_processor(msg: dict, data_path: str) -> DataStream:
    """
    :param msg:
    :param data_path:
    :return:
    """
    print("in file Processor")
    if not isinstance(msg["metadata"], dict):
        metadata_header = json.loads(msg["metadata"])
    else:
        metadata_header = msg["metadata"]

    identifier = metadata_header["identifier"]
    owner = metadata_header["owner"]
    name = metadata_header["name"]
    data_descriptor = metadata_header["data_descriptor"]
    execution_context = metadata_header["execution_context"]
    if "annotations" in metadata_header:
        annotations = metadata_header["annotations"]
    else:
        annotations = {}
    if "stream_type" in metadata_header:
        stream_type = metadata_header["stream_type"]
    else:
        stream_type = "ds"

    try:
        gzip_file_content = get_gzip_file_contents(data_path + msg["filename"])
        datapoints = list(
            map(row_to_datapoint, gzip_file_content.splitlines()))

        print(datapoints)

        rename_file(data_path + msg["filename"])

        start_time = datapoints[0].start_time
        end_time = datapoints[-1].end_time

        return DataStream(identifier, owner, name, data_descriptor,
                          execution_context, annotations, stream_type,
                          start_time, end_time, datapoints)
    except Exception as e:
        error_log = ("In Kafka preprocessor - Error in processing file: "
                     + str(msg["filename"]) + " Owner-ID: " + owner
                     + " Stream Name: " + name + " - " + str(e))
        cc_log(error_log, "MISSING_DATA")
        return None
def store_stream(data: DataStream):
    """
    Store data into Cassandra, MySQL, and influxDB
    :param data:
    """
    if data:
        try:
            c1 = datetime.now()
            CC.save_datastream(data, "datastream")
            e1 = datetime.now()
            CC.save_datastream_to_influxdb(data)
            i1 = datetime.now()
            print("Cassandra Time: ", e1 - c1, " Influx Time: ", i1 - e1,
                  " Batch size: ", len(data.data))
        except:
            cc_log()
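# A minimal usage sketch (assumed, not from the original code) tying
# file_processor and store_stream together for a single Kafka message; the
# UUIDs, stream name, data descriptor, and data path are made-up placeholders.
sample_msg = {
    "filename": "sample_stream_2017-01-01.gz",
    "metadata": json.dumps({
        "identifier": "00000000-0000-0000-0000-000000000001",
        "owner": "00000000-0000-0000-0000-000000000002",
        "name": "SAMPLE--STREAM--NAME",
        "data_descriptor": [{"NAME": "x"}, {"NAME": "y"}, {"NAME": "z"}],
        "execution_context": {},
    }),
}

sample_ds = file_processor(sample_msg, "/data/raw/")
if sample_ds is not None:
    store_stream(sample_ds)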
def extract_info(msg: dict, data_path: str):
    """
    Return msg["filename"] if the last datapoint in the file falls at or after
    the module-level cur_time threshold; otherwise return None.
    """
    global cur_time
    global interval

    try:
        metadata_header = msg["metadata"]
        filename = msg["filename"]
        #owner = "fbf8d50c-7f1d-47aa-b958-9caeadc676bd"#metadata_header["owner"]
        #name = metadata_header["name"]
        #data_descriptor = metadata_header["data_descriptor"]
        #execution_context = metadata_header["execution_context"]
        gzip_file_content = get_gzip_file_contents(data_path + msg["filename"])

        datapoints = list(
            map(row_to_datapoint_cus, gzip_file_content.splitlines()))

        start_time = datapoints[0]["time"]
        end_time = datapoints[-1]["time"]

        # Keep only files whose last datapoint falls inside the current window.
        end_time = datetime.strptime(end_time, "%Y-%m-%d %H:%M:%S").timestamp()

        if end_time >= cur_time:
            return filename

        return None

    except Exception as e:
        error_log = ("In Kafka preprocessor - Error in processing file: "
                     + str(msg["filename"]) + " - " + str(e))
        cc_log(error_log, "MISSING_DATA")
        print(e)
        return None
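# A minimal driver sketch (assumed, not from the original code) for
# extract_info: `messages` and the data path are placeholders, and the
# module-level globals cur_time/interval are initialised with assumed values.
interval = 3600  # keep files whose last datapoint is at most an hour old
cur_time = datetime.now().timestamp() - interval
messages = []  # placeholder: decoded Kafka messages, each shaped like `msg` above

recent_files = [
    f for f in (extract_info(m, "/data/raw/") for m in messages)
    if f is not None
]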
    def store_data_to_influxdb(self, datastream: DataStream):
        """
        Convert the datapoints of a DataStream into InfluxDB points and write
        them to the configured database in a single batch.

        :param datastream: DataStream to write to InfluxDB
        """
        st = datetime.now()
        client = InfluxDBClient(host=self.influxdbIP, port=self.influxdbPort,
                                username=self.influxdbUser,
                                password=self.influxdbPassword,
                                database=self.influxdbDatabase)
        datapoints = datastream.data
        stream_identifier = datastream.identifier
        stream_owner_id = datastream.owner
        stream_owner_name = Metadata(self.CC_obj).owner_id_to_name(stream_owner_id)
        stream_name = datastream.name

        if datastream.data_descriptor:
            total_dd_columns = len(datastream.data_descriptor)
            data_descriptor = datastream.data_descriptor
        else:
            data_descriptor = []
            total_dd_columns = 0

        influx_data = []
        for datapoint in datapoints:
            point = {}
            point['measurement'] = stream_name
            point['tags'] = {'stream_id': stream_identifier,
                             'owner_id': stream_owner_id,
                             'owner_name': stream_owner_name}
            point['time'] = datapoint.start_time
            values = datapoint.sample

            # Normalise the sample into a list of floats where possible.
            if isinstance(values, tuple):
                values = list(values)
            else:
                try:
                    values = [float(values)]
                except:
                    try:
                        values = list(map(float, values.split(',')))
                    except:
                        values = values

            # Map sample values onto field names from the data descriptor,
            # falling back to value_<i> when no name is available.
            try:
                point['fields'] = {}
                if isinstance(values, list):
                    for i, sample_val in enumerate(values):
                        if len(values) == total_dd_columns:
                            dd = data_descriptor[i]
                            if "NAME" in dd:
                                point['fields'][dd["NAME"]] = sample_val
                            else:
                                point['fields']['value_' + str(i)] = sample_val
                        else:
                            point['fields']['value_' + str(i)] = sample_val
                else:
                    dd = data_descriptor[0]
                    if not values:
                        values = "NULL"
                    try:
                        values = float(values)
                    except:
                        values = values
                    if "NAME" in dd:
                        point['fields'][dd["NAME"]] = values
                    else:
                        point['fields']['value_0'] = values
            except:
                try:
                    values = json.dumps(values)
                    point['fields']['value_0'] = values
                except:
                    cc_log("Datapoint sample values conversion: " + str(values),
                           "WARNING")
                    point['fields']['value_0'] = str(values)

            influx_data.append(point)
        et = datetime.now()

        #print('InfluxDB - Yielding:', stream_owner_id, len(influx_data), stream_identifier)

        try:
            client.write_points(influx_data)
            et2 = datetime.now()
            #print("Influx Time BreakDown: Processing: ", et-st, " Inserting: ",et2-et, " Size: ",len(influx_data))
        except:
            cc_log()
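# For reference, each element of `influx_data` built above follows the dict
# format accepted by InfluxDBClient.write_points(); the concrete values below
# are made-up placeholders.
example_point = {
    "measurement": "SAMPLE--STREAM--NAME",
    "tags": {
        "stream_id": "00000000-0000-0000-0000-000000000001",
        "owner_id": "00000000-0000-0000-0000-000000000002",
        "owner_name": "participant_01",
    },
    "time": "2017-01-01T00:00:00Z",
    "fields": {"x": 0.12, "y": 9.78, "z": 0.03},
}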