class DataFetcher:
    """Class for fetching data from multiple-level preprocessing."""
    def __init__(self, file_path, root_dir, preprocess_bucket=None):
        self._rawfile = file_path
        self._preprocess_bucket = preprocess_bucket

        original_file_name = utils.get_file_name(file_path)
        self._preprocess_dir = '/'.join([root_dir, original_file_name])

    def is_preprocessed(self):
        """Returns whether the raw data has been preprocessed.

        Returns:
            A boolean indicating whether the raw file has been preprocessed.
        """
        metadata = Metadata(self._preprocess_dir,
                            bucket=self._preprocess_bucket)
        return metadata.load()

    def fetch(self, strategy, number_records, timespan_start, timespan_end):
        """Gets the records in the given timespan, downsampling the fetched
        data with the given strategy if needed.

        Reads the records and downsamples them so that at most number_records
        are returned. First we search for the level whose frequency is the
        lowest one still above the required frequency. Then we find the first
        and last slices for the given timespan. Since records are sorted, the
        first and last slices are found by binary search; all slices in
        between are then selected and downsampled.

        Args:
            strategy: A string representing a downsampling strategy.
            number_records: An integer representing the number of records to
                return.
            timespan_start: An integer representing the timestamp in
                microseconds of the start of the timespan.
            timespan_end: An integer representing the timestamp in
                microseconds of the end of the timespan.

        Returns:
            A tuple of (data, precision): a list of downsampled data for each
            channel in the given file, and the precision of this result.
            Example:
                [
                    {
                        'name':'sys',
                        'data':[
                            [time,power],
                            [time,power]
                        ]},
                    {
                        'name': 'channel2',
                        'data': [
                            [time,power]
                        ]
                    }
                ]
        """
        self._metadata = Metadata(self._preprocess_dir,
                                  bucket=self._preprocess_bucket)
        self._metadata.load()

        if timespan_start is None:
            timespan_start = self._metadata['start']
        if timespan_end is None:
            timespan_end = self._metadata['end']

        if (timespan_start > self._metadata['end']
                or timespan_end < self._metadata['start']):
            # The timespan does not overlap the preprocessed range, so there
            # is nothing to fetch.
            return [], 0

        # Timestamps are in microseconds, so this is the required frequency
        # in records per microsecond.
        required_frequency = number_records / (timespan_end - timespan_start)

        # Finds Downsample Level.
        target_level_index = self._binary_search([
            self._metadata['levels'][level_name]['frequency']
            for level_name in self._metadata['levels']['names']
        ], required_frequency, True)
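        # E.g. (illustrative numbers) with level frequencies [1.0, 0.1, 0.01]
        # and required_frequency 0.05, the reverse search returns index 1:
        # the lowest level frequency that still meets the requirement.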

        level_names = self._metadata['levels']['names']
        target_level = self._metadata['levels'][level_names[target_level_index]]

        level_metadata = Metadata(self._preprocess_dir,
                                  self._preprocess_bucket, strategy,
                                  utils.get_level_name(target_level_index))
        level_metadata.load()
        # level_metadata maps each slice name to a timestamp (assumed to be
        # the slice's start time), so these searches pick out the slices that
        # bracket the requested timespan.
        slice_start_times = [
            level_metadata[single_slice]
            for single_slice in target_level['names']
        ]
        first_slice = self._binary_search(slice_start_times, timespan_start)
        last_slice = self._binary_search(slice_start_times, timespan_end)
        target_slices_names = target_level['names'][first_slice:last_slice + 1]
        target_slice_paths = [
            utils.get_slice_path(self._preprocess_dir,
                                 utils.get_level_name(target_level_index),
                                 single_slice, strategy)
            for single_slice in target_slices_names
        ]

        # Reads records and downsamples.
        target_slices = LevelSlicesReader(target_slice_paths,
                                          self._preprocess_bucket)
        target_slices.read(timespan_start, timespan_end)
        number_target_records = target_slices.get_records_count()

        target_slices.downsample(strategy, max_records=number_records)
        downsampled_data = target_slices.format_response()
        number_result_records = target_slices.get_records_count()

        if number_target_records == 0:
            precision = 0
        else:
            # Precision is the share of the in-range records that survive
            # this downsample, scaled by the level's own thinning ratio. E.g.
            # returning 600 of 6000 in-range records from a level that kept
            # 1/10 of the raw records gives 600/6000 * 1/10 = 0.01.
            precision = (number_result_records / number_target_records *
                         target_level['number'] / self._metadata['raw_number'])
        return downsampled_data, precision
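
    # Usage sketch (hypothetical path and strategy name; Metadata and
    # LevelSlicesReader are project-local classes):
    #
    #   fetcher = DataFetcher('logs/run1.csv', 'preprocess')
    #   if fetcher.is_preprocessed():
    #       data, precision = fetcher.fetch('max', 600, None, None)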

    def _binary_search(self, data_list, value, reverse=False):
        """Searches for the index of the rightmost element that is strictly
        less than the given value. When reverse is True, the list is
        decreasing and the rightmost element greater than or equal to the
        value is found instead. The result is clamped to index 0 when no
        element qualifies.

        Args:
            data_list: A list of numbers sorted in increasing order
                (decreasing order when reverse is True).
            value: The value to search for.
            reverse: True if data_list is decreasing.

        Returns:
            An integer index of the result, or -1 if data_list is empty.
        """
        if not data_list:
            return -1

        left = 0
        right = len(data_list) - 1
        pivot = (left + right + 1) // 2

        while left < right:
            if reverse:
                if data_list[pivot] >= value:
                    left = pivot
                else:
                    right = pivot - 1
            else:
                if data_list[pivot] < value:
                    left = pivot
                else:
                    right = pivot - 1
            pivot = (left + right + 1) // 2
        return pivot
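
    # Behavior sketch (illustrative values):
    #   _binary_search([10, 20, 30, 40], 25)        -> 1  (rightmost element < 25)
    #   _binary_search([40, 30, 20, 10], 25, True)  -> 1  (rightmost element >= 25)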

# The Topics example below additionally needs numpy, joblib (assumed here to
# be the standalone package) and scikit-learn; Metadata and DataProvider are
# project-local classes that are not shown.
import numpy as np
import joblib
from sklearn.cluster import KMeans


class Topics:
    """Builds topic vectors for clips and clusters them with KMeans."""
    def __init__(self):
        self.metadata = Metadata(True, 'lda')
        self.dataProvider = DataProvider()
        self.topicFile = 'tmp/topics.npy'
        self.kMeansFile = 'tmp/kmeans.pkl.npy'

    def gatherData(self, t):
        # Fetches all clips and builds the topic model over them.
        self.clips = self.dataProvider.getClips()
        self.model = self.metadata.createModel(self.clips, t)

    def getTopic(self, clip):
        # Assigns the clip's topic vector to its nearest KMeans cluster.
        vector = self.metadata.getVectors(clip)
        result = self.kmeans.predict(vector.reshape(1, -1))
        return result
        # return np.argmax(vector)

    def createTopics(self, clips, t, k):
        self.gatherData(t)
        self.createClusters(clips, k)

        topics = []
        for clip in clips:
            topics.append(self.getTopic(clip))

        self.topics = np.hstack(topics)

        return self.topics

    def save(self):
        self.metadata.save()
        np.save(self.topicFile, self.topics)
        joblib.dump(self.kmeans, self.kMeansFile)

    def load(self):
        if not self.metadata.load():
            return False

        try:
            self.topics = np.load(self.topicFile)
            self.kmeans = joblib.load(self.kMeansFile)
            return True
        except IOError:
            return False

    # ------------------------------------------------
    # K Means on topic data...

    def createClusters(self, clips, k):
        vectors = []
        for clip in clips:
            vectors.append(self.metadata.getVectors(clip))

        self.vectors = np.vstack(vectors)

        print(self.clusterKMeans(self.vectors, k))

    def clusterKMeans(self, data, k):
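        # Note: the precompute_distances and n_jobs arguments were removed in
        # newer scikit-learn releases; this call assumes an older version.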
        self.kmeans = KMeans(n_clusters=k,
                             init='k-means++',
                             n_init=10,
                             max_iter=300,
                             tol=0.0001,
                             precompute_distances='auto',
                             verbose=1,
                             random_state=None,
                             copy_x=True,
                             n_jobs=1)

        self.kmeans.fit(data)

        return self.kmeans.inertia_
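
# Usage sketch (hypothetical argument values; t is assumed to be the LDA
# topic count and k the number of KMeans clusters):
#
#   topics = Topics()
#   if not topics.load():
#       clips = DataProvider().getClips()
#       topics.createTopics(clips, t=20, k=8)
#       topics.save()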