Esempio n. 1
0
def get_most_k_sim(query_sequence, query_range, cluster, k, time_series_dict):
    """
    Return the k members of the best-matching cluster that are most similar to the query.

    The cluster is chosen through its representative: among the keys of ``cluster`` whose span
    (``end_point - start_point``) lies in ``query_range`` (both ends inclusive), the one with the
    smallest ``sim_between_seq`` value against ``query_sequence`` wins. The members of that cluster
    are then ranked by the same measure.

    The lists stored in ``cluster`` are not modified; ranking is done on a sorted copy.

    :param query_sequence: the sequence to be queried
    :param query_range: pair (lowest span, highest span) of accepted representative spans, inclusive
    :param cluster: dict[key = representative, value = list of sequences in that cluster]
    :param k: number of matches to return
    :param time_series_dict: handed to get_data_for_timeSeriesObj to resolve a sequence's data
    :return: list of at most k cluster members ordered by ascending sim_between_seq value,
             or None when no representative has a span inside query_range
    """
    accepted_spans = range(query_range[0], query_range[1] + 1)

    min_rprs = None  # the representative that is closest to the query sequence
    min_dist = math.inf
    for cur_rprs in cluster:
        # TODO do we want to get raw data here, or set the raw in timeSeriesObj before calling query (no parsing)
        if (cur_rprs.end_point - cur_rprs.start_point) not in accepted_spans:
            continue

        cur_dist = sim_between_seq(
            query_sequence,
            get_data_for_timeSeriesObj(cur_rprs, time_series_dict))

        if cur_dist < min_dist:
            min_rprs = cur_rprs
            min_dist = cur_dist

    # Compare against None explicitly: a representative object that happens to evaluate as
    # falsy must still be accepted as a valid match.
    if min_rprs is None:
        return None

    print('min representative is ' + min_rprs.id)

    print("Querying Cluster of length: " +
          str(len(get_data_for_timeSeriesObj(min_rprs, time_series_dict))))
    target_cluster = cluster[min_rprs]
    print('len of cluster is ' + str(len(target_cluster)))
    print("sorting")

    # sorted() builds a new list, so the caller's cluster is not reordered as a side effect.
    ranked = sorted(
        target_cluster,
        key=lambda cluster_sequence: sim_between_seq(
            query_sequence,
            get_data_for_timeSeriesObj(cluster_sequence, time_series_dict)))

    return ranked[0:k]  # return the k most similar sequences
Esempio n. 2
0
def clusterer_legacy(groups, st):
    """
    construct similarity clusters
    Look at clusters of all length, not using Distributed system

    This is a Legacy function, not used anymore
    TODO give the option of only creating similarity clusters of given length or length range

    Every sequence is compared (via sim_between_seq) against all representatives collected so far.
    It joins the cluster of the closest representative when that distance is at most
    ``sqrt(group_len) * st / 2``; otherwise it becomes the representative of a new cluster.
    Note that one ``clusters`` dict is shared by all groups, so representatives from previously
    processed lengths take part in the comparison as well.

    :param dict groups: [key = length, value = array of subsequences of the length]
        For example:
        [[1,4,2],[6,1,4],[1,2,3],[3,2,1]] is a valid 'subsequences'
        The sequences are used as dict keys, so they must be hashable.
    :param float st: similarity threshold
    :return: dict clusters: [key = representatives, value = similarity cluster: array of subsequences]
    :raises Exception: if a sequence that should start a new cluster is already a representative
    """
    clusters = {}  # must be a dict: representatives are the keys
    for group_len in groups.keys():

        # randomize the sequence in the group to remove data-related bias
        processing_groups = randomize(groups[group_len])

        # the threshold only depends on the group length, compute it once per group
        threshold = math.sqrt(group_len) * st / 2

        for sequence in processing_groups:  # the subsequence that is going to form or be put in a similarity cluster
            if not clusters:  # if there is no item in the similarity clusters
                # put the first sequence as the representative of the first cluster
                clusters[sequence] = [sequence]
                continue

            minSim = math.inf
            minRprst = None

            for rprst in clusters:  # iterate though all the similarity groups, rprst = representative
                dist = sim_between_seq(sequence, rprst)
                if dist < minSim:
                    minSim = dist
                    minRprst = rprst

            # minRprst stays None when no distance compared smaller than infinity (e.g. every
            # distance was inf or NaN); such a sequence must start its own cluster rather than
            # being filed under a None key.
            if minRprst is not None and minSim <= threshold:
                # the min similarity is within the similarity threshold: put the subsequence in
                # the similarity cluster keyed by the min representative
                clusters[minRprst].append(sequence)
            else:
                # the minSim is greater than the similarity threshold, we create a new similarity
                # group with this sequence being its representative
                if sequence in clusters:
                    raise Exception(
                        'cluster_operations: clusterer_legacy: Trying to create new similarity cluster '
                        'due to exceeding similarity threshold, target sequence is already a '
                        'representative(key) in clusters. The sequence isz: '
                        + str(sequence))
                clusters[sequence] = [sequence]

    return clusters
Esempio n. 3
0
def query(query_sequence,
          query_range,
          cluster,
          k,
          time_series_dict,
          exclude_overlap,
          percentage=1):
    """
    Find the best k matches for a query sequence inside the closest similarity cluster.

    The representative whose span (``end_point - start_point``) lies in ``query_range`` and whose
    sim_between_seq value against the query is smallest selects the cluster; the members of that
    cluster are then ranked by the same measure. The lists stored in ``cluster`` are not modified;
    ranking is done on a sorted copy.

    :param query_sequence: list of data: the sequence to be queried
    :param query_range: pair (lowest span, highest span) of accepted representative spans, inclusive
    :param cluster: dict[key = representative, value = list of timeSeriesObj] -> representative is timeSeriesObj
                    the sequences in the cluster are all of the SAME length
    :param k: int
    :param time_series_dict: handed to get_data_for_timeSeriesObj to resolve a sequence's data
    :param exclude_overlap: when truthy, the ranked cluster is passed through exclude_overlapping
                            before the first k entries are taken
    :param percentage: forwarded unchanged to exclude_overlapping (presumably the overlap ratio
                       that is tolerated -- confirm against exclude_overlapping)
    :return list of time series objects: best k matches. Again note they are all of the SAME length.
            None is returned when no representative has a span inside query_range or when the
            selected cluster is empty.
    """
    accepted_spans = range(query_range[0], query_range[1] + 1)

    # iterate through all the representatives to find which cluster to look at
    min_rprs = None  # the representative that is closest to the query sequence
    min_dist = math.inf
    for cur_rprs in cluster:
        # TODO do we want to get raw data here, or set the raw in timeSeriesObj before calling query (no parsing)
        if (cur_rprs.end_point - cur_rprs.start_point) not in accepted_spans:
            continue

        cur_dist = sim_between_seq(
            query_sequence,
            get_data_for_timeSeriesObj(cur_rprs, time_series_dict))

        if cur_dist < min_dist:
            min_rprs = cur_rprs
            min_dist = cur_dist

    # Compare against None explicitly: a representative object that happens to evaluate as
    # falsy must still be accepted as a valid match.
    if min_rprs is None:
        return None

    print('min representative is ' + min_rprs.id)

    print("Querying Cluster of length: " +
          str(len(get_data_for_timeSeriesObj(min_rprs, time_series_dict))))
    members = cluster[min_rprs]
    print('len of cluster is ' + str(len(members)))
    print("sorting")

    # This full sort dominates the running time. heapq.nsmallest would do for the plain top-k
    # case, but exclude_overlapping receives the whole ranked list, so everything is ranked.
    # sorted() builds a new list, so the caller's cluster is not reordered as a side effect.
    target_cluster = sorted(
        members,
        key=lambda cluster_sequence: sim_between_seq(
            query_sequence,
            get_data_for_timeSeriesObj(cluster_sequence, time_series_dict)))

    if len(target_cluster) == 0:
        return None

    if exclude_overlap:
        target_cluster = exclude_overlapping(target_cluster, percentage, k)
        print("k is" + str(k))
    return target_cluster[0:k]  # return the k most similar sequences