def find_similarity_of_points_in_radius(closest_vantage_pt, ts1, radius,
                                        dbtype='bstree'):
    """
    Given a vantage point and a radius, find the points that fall within the
    circle around the vantage point, then calculate the distance from each of
    these points to the timeseries of interest.

    closest_vantage_pt: number of the vantage point being considered
    ts1: timeseries of interest
    radius: radius of circle to consider
    dbtype: database backend identifier, accepted for compatibility with
        callers that pass it (currently unused here -- only the
        BinarySearchDatabase backend is consulted)

    Returns: list of [distance, timeseries id] pairs, UNSORTED -- the caller
    is responsible for sorting (previous docstring claimed sorted order,
    which the code never did).
    """
    # open the database for that vantage point; try/finally guarantees the
    # handle is released even if the range query raises
    db = BinarySearchDatabase.connect("VantagePointDatabases/" +
                                      str(closest_vantage_pt) + ".dbdb")
    try:
        # find all light curves within 2d of the vantage point
        light_curves_in_radius = db.get_nodes_less_than(radius)
        # the vantage point itself is a candidate too
        light_curves_in_radius.append(str(closest_vantage_pt))
    finally:
        db.close()

    # find similiarity between these light curves and the given light curve
    distance = []
    for l in light_curves_in_radius:
        with open("GeneratedTimeseries/Timeseries" + str(l), "rb") as f:
            ts2 = pickle.load(f)
        dist = distances.distance(
            distances.stand(ts1, ts1.mean(), ts1.std()),
            distances.stand(ts2, ts2.mean(), ts2.std()), mult=1)
        distance.append([dist, "Timeseries" + str(l)])
    return distance
def test_ccor(self):
    """Cross-correlation of two known standardized series matches the
    precomputed reference values (compared via string form to sidestep
    float round-off)."""
    first = ts(times=[0, 1, 2, 3], values=[1, 2, 3, 4])
    second = ts(times=[0, 1, 2, 3], values=[-1, 2, 1, -1])
    first_std = distances.stand(first, first.mean(), first.std())
    second_std = distances.stand(second, second.mean(), second.std())
    result = distances.ccor(first_std, second_std)
    expected = np.array([0.25819889, -0.94672926, -0.0860663, 0.77459667])
    assert str(result) == str(expected)
def sanity_check(filename, n, num_timeseries=1000):
    """
    Function that manually finds the n most similiar timeseries to the given
    timeseries by brute force over every stored timeseries. Serves as a
    check of the vantage point method.

    filename: path to the pickled timeseries of interest
    n: number of most-similar filenames to return
    num_timeseries: how many "GeneratedTimeseries/Timeseries<i>" files to
        scan (default 1000, matching the original hard-coded dataset size)

    Returns: list of n most similiar filenames
    """
    ans = []
    d = []
    with open(filename, "rb") as f:
        ts1 = pickle.load(f)
    # standardizing ts1 is loop-invariant: do it once, not num_timeseries times
    ts1_stand = distances.stand(ts1, ts1.mean(), ts1.std())
    for i in range(num_timeseries):
        with open("GeneratedTimeseries/Timeseries" + str(i), "rb") as f:
            ts2 = pickle.load(f)
        dist = distances.distance(
            ts1_stand,
            distances.stand(ts2, ts2.mean(), ts2.std()), mult=1)
        d.append([dist, "Timeseries" + str(i)])
    d.sort(key=lambda x: x[0])
    # skip index 0: the closest match is the query timeseries itself
    for i in range(1, n + 1):
        ans.append(d[i][1])
    return ans
def test_kernelcorr(self):
    """tests that the kernelized cross correlation is 1 when the two
    timeseries are identical, and strictly below 1 when they differ"""
    base = ts(times=[0, 1, 2, 4, 5, 6], values=[3, 4, 5, 6, 7, 8])
    twin = ts(times=[0, 1, 2, 4, 5, 6], values=[3, 4, 5, 6, 7, 8])
    base_std = distances.stand(base, base.mean(), base.std())
    twin_std = distances.stand(twin, twin.mean(), twin.std())
    # identical series -> perfect correlation
    assert distances.kernel_corr(twin_std, base_std) == 1
    other = ts(times=[0, 1, 2, 4, 5, 6], values=[3, 7, 9, 10, 16, 20])
    other_std = distances.stand(other, other.mean(), other.std())
    # different series -> correlation below 1
    assert distances.kernel_corr(twin_std, other_std) < 1
def test_standardize(self):
    """Mean, std, and the standardized values of a known series."""
    series = ts(times=[0, 1, 2, 4, 5, 6], values=[3, 4, 5, 6, 7, 8])
    assert series.mean() == 5.5  # check mean
    assert series.std() == np.sqrt(17.5 / 6.0)  # check std
    got = distances.stand(series, series.mean(), series.std()).values()
    expected = np.array([-1.46385011, -0.87831007, -0.29277002,
                         0.29277002, 0.87831007, 1.46385011])
    # string comparison sidesteps float round-off in the printed values
    assert str(got) == str(expected)
def find_most_similiar(filename, n, vantage_pts):
    """
    Finds n most similiar time series to the time series of interest
    (filename) by using the supplied vantage points.

    filename: timeseries of interest
    n: number of similiar timeseries to return (n must be between 1 and 20)
    vantage_pts: a list of the vantage point numbers

    Returns: list of n most similiar filenames
    """
    # load the query timeseries
    with open(filename, "rb") as f:
        ts1 = pickle.load(f)

    # rank every vantage point by its distance to the query
    ranked = []
    for vp in vantage_pts:
        with open("GeneratedTimeseries/Timeseries" + str(vp), "rb") as f:
            ts2 = pickle.load(f)
        dist = distances.distance(
            distances.stand(ts1, ts1.mean(), ts1.std()),
            distances.stand(ts2, ts2.mean(), ts2.std()), mult=1)
        ranked.append([dist, vp])
    ranked.sort(key=lambda pair: pair[0])

    # gather candidate matches from the n closest vantage points; any
    # series within radius 2*d of a vantage point at distance d from the
    # query could be among the nearest neighbours
    candidates = []
    for rank in range(n):
        dist_to_vp, vp = ranked[rank]
        within = find_similarity_of_points_in_radius(vp, ts1, 2 * dist_to_vp)
        for candidate in within:
            if candidate not in candidates:
                candidates.append(candidate)
    candidates.sort(key=lambda pair: pair[0])

    # entry 0 is the query timeseries itself, so report entries 1..n
    return [candidates[i][1] for i in range(1, n + 1)]
def pick_vantage_points(arg):
    """
    Code which picks vantage points and produces a database for each one.

    The database stores (key, value) pairs where:
    key = distance from timeseries to vantage point (kernel coefficient)
    value = id of timeseries (0-999)

    arg: either an argv-style list (parsed for --n, default 20) or a plain
         integer giving the number of vantage points directly

    returns: list of vantage points (integers from 0-999)
    """
    # accept either command-line style arguments or a bare integer
    try:
        parser = argparse.ArgumentParser(description="vantage points")
        parser.add_argument('--n', help='number of vantage points',
                            type=int, default=20)
        args = parser.parse_args(arg)
        num = args.n
    except:  # arg was not an argv list; treat it as the count itself
        num = arg

    # start from an empty database directory
    try:
        shutil.rmtree('VantagePointDatabases')
        os.mkdir('VantagePointDatabases')
    except:  # directory did not exist yet
        os.mkdir('VantagePointDatabases')

    vantage_pts = random.sample(range(0, 1000), num)
    for vantage_point in vantage_pts:
        # remove any stale database file, then connect exactly once
        # (the original connected in both the try and except branches)
        try:
            os.remove("VantagePointDatabases/" + str(vantage_point) + ".dbdb")
        except OSError:
            pass
        db1 = BinarySearchDatabase.connect("VantagePointDatabases/" +
                                           str(vantage_point) + ".dbdb")

        with open("GeneratedTimeseries/Timeseries" +
                  str(vantage_point), "rb") as f:
            ts2 = pickle.load(f)
        # standardizing the vantage point is loop-invariant: do it once,
        # not once per comparison
        ts2_stand = distances.stand(ts2, ts2.mean(), ts2.std())

        for i in range(1000):
            if i != vantage_point:
                with open("GeneratedTimeseries/Timeseries" +
                          str(i), "rb") as f:
                    ts1 = pickle.load(f)
                dist = distances.distance(
                    distances.stand(ts1, ts1.mean(), ts1.std()),
                    ts2_stand, mult=1)
                db1.set(dist, str(i))
        db1.commit()
        db1.close()

    # record the chosen vantage points; 'with' closes the file even on error
    with open('VantagePointDatabases/vp', 'w') as f:
        for i in vantage_pts:
            f.write(str(i) + "\n")
    return vantage_pts
def test_distance(self):
    """Distance between two identical standardized timeseries is zero."""
    series_a = ts(times=[0, 1, 2, 4, 5, 6], values=[3, 4, 5, 6, 7, 8])
    series_b = ts(times=[0, 1, 2, 4, 5, 6], values=[3, 4, 5, 6, 7, 8])
    a_std = distances.stand(series_a, series_a.mean(), series_a.std())
    b_std = distances.stand(series_b, series_b.mean(), series_b.std())
    assert distances.distance(a_std, b_std) == 0
def test_maxcorratphase(self):
    """Max correlation and the phase shift at which it occurs for a
    known pair of standardized series."""
    left = ts(times=[0, 1, 2, 3], values=[1, 2, 3, 4])
    right = ts(times=[0, 1, 2, 3], values=[-1, 2, 1, -1])
    left_std = distances.stand(left, left.mean(), left.std())
    right_std = distances.stand(right, right.mean(), right.std())
    got = distances.max_corr_at_phase(left_std, right_std)
    assert got == (3, 0.77459666924148329)
def test_standardizeConstant(self):
    """Standardizing a constant series returns a series of zeros.

    Fixes the original expected array, which contained a double comma
    (a syntax error) and only five elements for a six-value series.
    """
    t0 = ts(times=[0, 1, 2, 4, 5, 6], values=[3, 3, 3, 3, 3, 3])
    standardized_values = distances.stand(t0, t0.mean(), t0.std()).values()
    # check that standardizing a constant series returns all zeros
    assert (str(standardized_values) ==
            str(np.array([0., 0., 0., 0., 0., 0.])))
def find_most_similiar(filename, n, vantage_pts, isfile=True, dbtype='bstree'):
    """
    Finds n most similiar time series to the time series of interest
    (filename) by using the supplied vantage points

    filename: timeseries of interest -- either a pickle file path
        (isfile=True) or an already-loaded timeseries object (isfile=False)
    n: number of similiar timeseries to return (n must be between 1 and 20)
    vantage_pts: a list of the vantage point numbers
    isfile: when False, `filename` is treated as the timeseries itself
    dbtype: forwarded to find_similarity_of_points_in_radius
        (presumably selects the database backend -- TODO confirm)

    Returns: list of n most similiar filenames
        NOTE(review): the final loop appends the whole [distance, id] pair,
        not just the id, and starts at index 0 despite the "ignore given
        timeseries" comment -- when the query is not itself stored in the
        database, entry 0 is a genuine match, so starting at 0 looks
        intentional; confirm against callers.
        On failure returns one of the sentinel strings 'ERROR INDEX',
        'ERROR TYPE', or 'ERROR NUMBER | <count>' instead of a list.
    """
    file_names = []
    #load the given file
    if isfile:
        try:
            with open(filename, "rb") as f:
                ts1 = pickle.load(f)
        except:
            # file missing or unreadable -> sentinel string, not an exception
            print(
                'Requested %s cannot be found in database, returning ERROR INDEX'
                % filename)
            return 'ERROR INDEX'
    else:
        ts1 = filename
    ## check data type
    if not isinstance(ts1, ts):
        print(
            'Requested %s is not a TimeSeries instance, returning ERROR TYPE'
            % filename)
        return 'ERROR TYPE'
    #find the most similiar vantage point = d
    vantage_pts_dist = []  # [distance, vantage point id] pairs
    for i in vantage_pts:
        with open("GeneratedTimeseries/Timeseries" + str(i), "rb") as f:
            ts2 = pickle.load(f)
        ## interpolate the timeseries in the database to have the same times
        ## as the client input timeseries
        ts2 = interpolate_to_match_input(ts2, ts1)
        dist = distances.distance(distances.stand(ts1, ts1.mean(), ts1.std()),
                                  distances.stand(ts2, ts2.mean(), ts2.std()),
                                  mult=1)
        vantage_pts_dist.append([dist, i])
    # validate n against the number of available vantage points
    if n > len(vantage_pts_dist) or n < 1:
        print('More neighbours than vantage requested.')
        return 'ERROR NUMBER | {}'.format(len(vantage_pts_dist))
    vantage_pts_dist.sort(key=lambda x: x[0])
    all_pts_to_check = []
    for i in range(n):
        closest_vantage_pt = vantage_pts_dist[i][1]
        # any series within 2d of a vantage point at distance d from the
        # query may be among the nearest neighbours
        radius = 2 * vantage_pts_dist[i][0]
        pts_in_radius = find_similarity_of_points_in_radius(
            closest_vantage_pt, ts1, radius, dbtype)
        # dedup on the whole [distance, id] pair; the same series always
        # yields the same distance, so equal pairs identify duplicates
        for j in pts_in_radius:
            if j not in all_pts_to_check:
                all_pts_to_check.append(j)
    all_pts_to_check.sort(key=lambda x: x[0])
    for i in range(0, n):  #ignore given timeseries
        file_names.append(all_pts_to_check[i])
    return file_names