def _min_subsequence_distance(values):
    """
    Computes the minimum distance for a given subsequence. The values consist
    of the iteration, batch size, subsequence and query. It is used for both
    batch processing in single threaded or multi-processing mode.

    Parameters
    ----------
    values : tuple(iteration, batch_size, subsequence, query)
        Tuple packed values for parallelization.

    Returns
    -------
    A tuple of the minimum index and distance for this particular subsequence.
    """
    iteration, batch_size, subsequence, query = values

    distances = mts.mass2(subsequence, query)

    # find minimum index of this batch which will be between 0 and batch_size
    min_idx = np.argmin(distances)

    # add this distance to best distances
    dist = distances[min_idx]

    # compute the actual index and store it
    index = min_idx + (batch_size * iteration)

    return (index, dist)
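# Illustrative driver, not part of the original module: a minimal sketch of how
# the (iteration, batch_size, subsequence, query) tuples could be packed and fed
# to _min_subsequence_distance, either single threaded or via multiprocessing.
# The overlapping batch slicing below is an assumption for illustration (each
# batch overlaps the next by len(query) - 1 samples so no subsequence is missed
# at a boundary); np and mts are assumed to be the module-level imports used above.
import multiprocessing


def _batch_min_distances(ts, query, batch_size, n_jobs=1):
    m = len(query)
    n_batches = int(np.ceil((len(ts) - m + 1) / batch_size))
    values = [
        (i, batch_size, ts[i * batch_size:i * batch_size + batch_size + m - 1], query)
        for i in range(n_batches)
    ]

    if n_jobs == 1:
        # single threaded: process each batch in turn
        return [_min_subsequence_distance(v) for v in values]

    # multi-processing: negative n_jobs means "use all cpu cores"
    n_procs = n_jobs if n_jobs > 0 else multiprocessing.cpu_count()
    with multiprocessing.Pool(processes=n_procs) as pool:
        return pool.map(_min_subsequence_distance, values)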
def test_top_k_discords():
    """Sanity check that compares results from UCR use case."""
    robot_dog = np.loadtxt(
        os.path.join(MODULE_PATH, '..', 'tests', 'robot_dog.txt'))
    carpet_walk = np.loadtxt(
        os.path.join(MODULE_PATH, '..', 'tests', 'carpet_query.txt'))

    distances = mts.mass2(robot_dog, carpet_walk)
    found = mts.top_k_discords(distances, 2, 25)
    found = np.array(found)
    expected = np.array([12900, 2])

    assert np.array_equal(found, expected)
    return start_point, end_point


if __name__ == '__main__':
    ####### main file & dy main
    main_file = ut_mdf.getDataFromFile(
        fileName='light_curve_Gaia-DR2_49407521363733632_date20191129')
    main_period = 6
    start_point, end_point = getSublenght(period=main_period, mdfData=main_file)
    subInstance = main_file['instances'][start_point:end_point]
    subTimestamp = main_file["timestamp"][start_point:end_point]
    sub_len = len(subTimestamp)

    distances = mts.mass2(target_file['instances'], subInstance)
    min_idx = np.argmin(distances)
    min_dis = distances.item(min_idx).real
    # print(min_idx)
    print("distance = {}".format(min_dis))

    # plot TS
    plt.figure(figsize=(25, 5))
    plt.plot(target_file['timestamp'], target_file['instances'])
    plt.plot(target_file['timestamp'][min_idx:min_idx + sub_len], subInstance, c='r')
    plt.ylabel('Flux')
    plt.title('TS data : {}'.format(target_file['fileName']))
    plt.show()
    plt.clf()
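    # Illustrative follow-up, not part of the original script: overlay the
    # matched window taken from the target series itself (rather than the
    # query) so the two light curves can be compared directly. Uses only the
    # variables already defined above.
    plt.figure(figsize=(25, 5))
    plt.plot(subTimestamp, subInstance, label='query (main file)')
    plt.plot(subTimestamp, target_file['instances'][min_idx:min_idx + sub_len],
             label='best match (target file)')
    plt.ylabel('Flux')
    plt.legend()
    plt.show()
    plt.clf()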
target_file = ut_mdf.getDataFromFile(
    fileName='light_curve_Gaia-DR2_49406353132632832_date20191129')
main_file = ut_mdf.getDataFromFile(
    fileName='light_curve_Gaia-DR2_49407521363733632_date20191129')

ts = target_file['instances']
query = main_file['instances'][765:2570]
# ts = np.loadtxt('ts.txt')
# query = np.loadtxt('query.txt')

# mass
distances = mts.mass(ts, query)

# mass2
distances = mts.mass2(ts, query)

# mass3
# distances = mts.mass3(ts, query, 256)

# mass2_batch
# start a multi-threaded batch job with all cpu cores and give me the top 5 matches.
# note that batch_size partitions your time series into a subsequence similarity search.
# even for large time series in single threaded mode, this is much more memory efficient
# than MASS2 on its own.
batch_size = 10000
top_matches = 5
n_jobs = -1
indices, distances = mts.mass2_batch(ts, query, batch_size,
                                     top_matches=top_matches, n_jobs=n_jobs)
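# Illustrative follow-up, not part of the original snippet: indices and
# distances are returned as parallel arrays, so the top matches can be listed
# directly (np.abs guards against a residual imaginary component).
for idx, dist in zip(indices, distances):
    print('match at index {} with distance {}'.format(idx, np.abs(dist)))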
    if i_ch == 2:
        axes[i_ax].set_xlabel('Time steps')

#%% ============== manual motifs
# data0 = data.copy()
i_chh = 1
k = 10
exclude_zone = 300
t_s = 57000
t_step = 1000
t = range(t_s, t_s + t_step)
quary = data0[i_chh - 1, t]
target = data0[i_chh - 1, :]

distances = mts.mass2(target, quary)
# distances = np.array([abs(i) for i in distances])
# distances.sort()
found = mts.top_k_motifs(distances, k, exclude_zone)
indices = np.array(found)
distances = distances[found]
# indices, distances = mts.mass2_batch(target, quary, 1000, top_matches=k)
i_sort = np.argsort(distances)
distances, indices = [arr[i_sort] for arr in (distances, indices)]
distances = [abs(i) for i in distances]

plt.figure()
plt.subplot(2, 1, 1)
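# Illustrative continuation, not part of the original snippet: overlay the k
# motif windows (same length as the query, t_step samples) on the target
# channel in the first subplot.
plt.plot(target)
for idx in indices:
    plt.plot(range(idx, idx + t_step), target[idx:idx + t_step], c='r')
plt.xlabel('Time steps')
plt.title('top {} motifs (mass2 + top_k_motifs)'.format(k))
plt.show()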