def test_n_closest():
    """
    Get ts 100; run a search by id on it; confirm we get back 3 close ts
    and that the distances in the returned dict match the actual distances.
    """
    # Attempt to search on / get a non-existent time series
    with raises(ValueError):
        n_closest = simsearch_by_id(500, 3)
    with raises(ValueError):
        _ = get_by_id(500)

    # Get ts 100
    ats_100 = get_by_id(100)
    n_closest = simsearch_by_id(100, 3)
    assert len(n_closest) <= 3

    # Confirm that the distance measures are accurate
    for dist in n_closest:
        tsid = n_closest[dist]
        other_ts = get_by_id(tsid)
        assert abs(dist - kernel_dist(standardize(ats_100),
                                      standardize(other_ts))) < .0001
def calc_distances(vp_k, timeseries_dict):
    """Calculates kernel distance between a vantage point and all loaded light curves"""
    distances = []
    vp = standardize(timeseries_dict[vp_k])
    for k in timeseries_dict:
        if k != vp_k:
            k_dist = kernel_dist(vp, standardize(timeseries_dict[k]))
            distances.append((k_dist, k))
    return distances
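# A minimal usage sketch for calc_distances (a hedged example, assuming the
# module-level tsmaker and random used by the tests below; the integer keys
# are arbitrary ids chosen for illustration):
def _example_calc_distances():
    ts_dict = {i: tsmaker(0.5, 0.1, random.uniform(0, 10)) for i in range(5)}
    # Distances from the curve under key 0 to every other curve
    dists = calc_distances(0, ts_dict)  # [(dist, key), ...], 4 entries
    closest_dist, closest_key = min(dists)
    return closest_key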
def find_closest_vp(vps_dict, ts):
    """
    Calculates distances from a time series to all vantage points.
    Returns a tuple with the filename of the closest vantage point
    and the distance to that vantage point.
    """
    s_ts = standardize(ts)
    vp_distances = sorted([(kernel_dist(s_ts, standardize(vps_dict[vp])), vp)
                           for vp in vps_dict])
    dist_to_vp, vp_fn = vp_distances[0]
    return (vp_fn, dist_to_vp)
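# Hedged usage sketch for find_closest_vp: given a dict mapping vantage point
# filenames to their time series, find which vantage point a new curve falls
# closest to. The filenames below are hypothetical placeholders:
def _example_find_closest_vp():
    vps = {"ts_datafile_1": tsmaker(0.5, 0.1, 2.0),
           "ts_datafile_2": tsmaker(0.5, 0.1, 8.0)}
    vp_fn, dist_to_vp = find_closest_vp(vps, random_ts(0.5))
    return vp_fn, dist_to_vp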
def plot_two_ts(ts1, ts1_name, ts2, ts2_name, stand=True):
    """Plots two time series with matplotlib"""
    import matplotlib.pyplot as plt
    if stand:
        ts1 = standardize(ts1)
        ts2 = standardize(ts2)
    plt.plot(ts1, label=ts1_name)
    plt.plot(ts2, label=ts2_name)
    plt.legend()
    plt.show()
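# Example use (a sketch; the labels are arbitrary): compare a generated curve
# against a random one, letting plot_two_ts standardize both before plotting:
#
#     plot_two_ts(tsmaker(0.5, 0.1, 3.0), "generated curve",
#                 random_ts(0.5), "random curve")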
def test_add_ts():
    """Create a ts, add it to the db, retrieve it, and assert that it's the same ts"""
    new_ts = standardize(tsmaker(0.5, 0.1, random.uniform(0, 10)))
    new_tsid = add_ts(new_ts)
    ts_as_saved = get_by_id(new_tsid)
    assert kernel_dist(standardize(ts_as_saved), standardize(new_ts)) < .00001

    # Confirm that we get the same id back when we attempt to add it a second time
    assert add_ts(new_ts) == new_tsid
def test_save_ts_to_db_two():
    # Save a ts with a fixed, non-uniform time grid, request it by id, and
    # compare to the original interpolated onto the standard grid
    new_ts = ArrayTimeSeries(values=[0, 1, 2, 3, 10],
                             times=[0., .2, .3, .5, 1])
    new_tsid = s_client.save_ts_to_db(new_ts)
    echo_ts = s_client.get_ts_with_id(new_tsid)
    interpolated_ats = new_ts.interpolate(
        np.arange(0.0, 1.0, (1.0 / TS_LENGTH)))
    assert kernel_dist(standardize(echo_ts),
                       standardize(interpolated_ats)) < .00001
def test_crosscorr():
    t1 = standardize(tsmaker(0.5, 0.1, random.uniform(0, 10)))

    # First confirm that the kernel correlation and distance methods
    # return 1 and 0 when comparing a ts with itself
    assert kernel_corr(t1, t1) == 1
    assert kernel_dist(t1, t1) == 0

    t2 = standardize(tsmaker(0.5, 0.1, random.uniform(0, 10)))
    t3 = standardize(random_ts(0.5))

    # Now let's do the opposite -- ensure that we see some distance between different curves
    assert kernel_dist(t1, t2) > 0
    assert kernel_dist(t1, t3) > 0
    assert kernel_corr(t1, t2) < 1
    assert kernel_corr(t1, t3) < 1
def test_simsearch_by_ts():
    ats_75 = get_by_id(75)
    n_closest_dict, tsid, is_new = simsearch_by_ts(ats_75, 5)
    assert tsid == 75
    assert not is_new
    assert n_closest_dict == simsearch_by_id(75, 5)

    new_ts = standardize(tsmaker(0.5, 0.1, random.uniform(0, 10)))
    n_closest_dict, tsid, is_new = simsearch_by_ts(new_ts, 5)
    assert is_new
    assert tsid > 250
    assert len(n_closest_dict) == 5
def add_ts_to_vpdb(data_tuple):
    """
    Worker function called by add_ts_to_vpdbs below. This process is
    repeated on each vantage point.
    """
    file, fsm, s_ts, ts_fn, db_dir = data_tuple
    vp_ts = load_ts(file[:-5], fsm)  # strip the ".dbdb" extension
    dist_to_vp = kernel_dist(standardize(vp_ts), s_ts)
    # print("Adding " + ts_fn + " to " + (db_dir + file))
    db = connect(db_dir + file)
    db.set(dist_to_vp, ts_fn)
    db.commit()
    db.close()
def test_crosscorr_errors():
    """Test that we have checks for various error conditions"""
    t1 = standardize(tsmaker(0.5, 0.1, random.uniform(0, 10)))
    t4 = standardize(random_ts(0.5, 200))

    # Confirm that we raise a ValueError if we attempt to compare time series
    # that are not the same length
    with raises(ValueError):
        ccor(t1, t4)
    with raises(ValueError):
        kernel_dist(t1, t4)
    with raises(ValueError):
        kernel_corr(t1, t4)

    # Confirm that we raise a ValueError if we attempt to compare time series
    # that have not been standardized first
    t5 = tsmaker(0.5, 0.1, random.uniform(0, 10))
    with raises(ValueError):
        kernel_dist(t4, t5)
def search_vpdb_for_n(vp_t, ts, db_dir, lc_dir, n):
    """
    Searches for the n most similar light curves based on pre-computed
    distances in the vpdb.

    Args:
        vp_t: tuple containing the vantage point filename and the distance
            of the time series to that vantage point
        ts: time series to search on
        db_dir: directory containing the vantage point db files
        lc_dir: directory containing the light curve files
        n: number of similar time series to return

    Returns:
        Tuple: a dict of the n closest time series (distances as the keys,
        ts ids as the values), and the id of the time series if it already
        exists in the db (-1 otherwise)

    Note:
        Uses a process pool to calculate distances in parallel, and a heap
        queue to minimize the time spent sorting the final distance list
        down to the n smallest distances.
    """
    # 1. Set up the data to be processed in parallel
    vp_fn, dist_to_vp = vp_t
    lc_candidates, fsm = find_lc_candidates(vp_t, db_dir, lc_dir)
    lc_candidates.append((dist_to_vp, vp_fn))
    existing_ts_id = -1
    s_ts = standardize(ts)
    lc_candidate_data = [(ts_fn, fsm, s_ts)
                         for d_to_vp, ts_fn in lc_candidates]

    # 2. Calculate distances in parallel
    with ProcessPoolExecutor() as pool:
        dist_list = pool.map(calc_distance, lc_candidate_data)

    # 3. Sort distances for the n+1 smallest
    n_smallest = heapq.nsmallest(n + 1, dist_list)

    # 4. Look through the sublist of closest time series to see if any of them
    #    have a distance of zero. If so, mark that one as an existing time
    #    series; otherwise, trim the list by 1.
    for dist_to_ts, tsid in n_smallest:
        if dist_to_ts < .00001:
            existing_ts_id = tsid
    if existing_ts_id == -1:
        n_smallest = n_smallest[:-1]
    else:
        n_smallest = [(d, tsid) for d, tsid in n_smallest
                      if tsid != existing_ts_id]

    # 5. Return the n_smallest dict and the existing id (or -1 if not in db)
    return (dict(n_smallest), existing_ts_id)
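# Why heapq.nsmallest in step 3 of search_vpdb_for_n: the (distance, id)
# tuples compare lexicographically, so the smallest distances surface without
# sorting the whole candidate list. A tiny illustration with made-up values:
def _example_nsmallest():
    dist_list = [(0.42, 7), (0.03, 2), (0.17, 9), (0.0000001, 5)]
    return heapq.nsmallest(3, dist_list)
    # -> [(1e-07, 5), (0.03, 2), (0.17, 9)]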
def add_ts_to_vpdbs(ts, ts_fn, db_dir, lc_dir):
    """
    Based on the names of the vantage point db files, adds a single new
    time series to the vp indexes. (Does not re-pick vantage points.)
    Uses a ProcessPoolExecutor to run the work in parallel.
    """
    fsm = FileStorageManager(lc_dir)
    s_ts = standardize(ts)

    # Set up data for process pool execution
    vp_fns = [
        file for file in os.listdir(db_dir)
        if file.startswith("ts_datafile_") and file.endswith(".dbdb")
    ]
    vp_tuples = [(vp_fn, fsm, s_ts, ts_fn, db_dir) for vp_fn in vp_fns]

    # Create the processes
    with ProcessPoolExecutor() as pool:
        _ = pool.map(add_ts_to_vpdb, vp_tuples)
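# Hedged usage sketch for add_ts_to_vpdbs: index a freshly generated curve
# against every existing vantage point db. The directory paths and the ts
# filename here are hypothetical placeholders, not paths from this project:
def _example_add_ts_to_vpdbs():
    new_ts = tsmaker(0.5, 0.1, random.uniform(0, 10))
    add_ts_to_vpdbs(new_ts, "ts_datafile_999", "vpdb/", "lightcurves/")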
def test_save_ts_to_db():
    # Save a ts, request it by id, compare to the original
    new_ts = tsmaker(0.5, 0.1, random.uniform(0, 10))
    new_tsid = s_client.save_ts_to_db(new_ts)
    echo_ts = s_client.get_ts_with_id(new_tsid)
    assert kernel_dist(standardize(echo_ts), standardize(new_ts)) < .00001
def calc_distance(lc_candidate_data):
    """Worker function called by search_vpdb_for_n above"""
    ts_fn, fsm, s_ts = lc_candidate_data
    candidate_ts = load_ts(ts_fn, fsm)
    dist_to_ts = kernel_dist(standardize(candidate_ts), s_ts)
    return (dist_to_ts, tsfn_to_id(ts_fn))