def find_similarity_of_points_in_radius(closest_vantage_pt, ts1, radius):
    """
    Collect every timeseries whose stored distance to the given vantage
    point is below ``radius``, then measure how far each of those
    candidates is from the timeseries of interest.

    closest_vantage_pt: number of the vantage point being considered
    ts1: timeseries of interest
    radius: radius of circle to consider

    Returns: list of [distance, timeseries id] pairs (unsorted)
    """
    # query the per-vantage-point database for candidate ids within radius
    db = BinarySearchDatabase.connect("VantagePointDatabases/" +
                                      str(closest_vantage_pt) + ".dbdb")
    candidate_ids = db.get_nodes_less_than(radius)
    candidate_ids.append(str(closest_vantage_pt))  # add in the vantage pt
    db.close()

    # the query series never changes, so standardize it a single time
    ts1_stand = distances.stand(ts1, ts1.mean(), ts1.std())

    similarity = []
    for cid in candidate_ids:
        with open("GeneratedTimeseries/Timeseries" + str(cid), "rb") as f:
            candidate = pickle.load(f)
        cand_stand = distances.stand(candidate, candidate.mean(),
                                     candidate.std())
        dist = distances.distance(ts1_stand, cand_stand, mult=1)
        similarity.append([dist, "Timeseries" + str(cid)])
    return similarity
# Example #2
 def test_ccor(self):
     """Cross-correlation of two standardized series matches precomputed values."""
     series_a = ts(times=[0,1,2,3],values=[1,2,3,4])
     series_b = ts(times=[0,1,2,3],values=[-1,2,1,-1])
     a_stand = distances.stand(series_a,series_a.mean(),series_a.std())
     b_stand = distances.stand(series_b,series_b.mean(), series_b.std())
     expected = np.array([0.25819889,-0.94672926,-0.0860663,0.77459667])
     # compare via str() to sidestep float representation noise
     assert str(distances.ccor(a_stand,b_stand)) == str(expected)
def sanity_check(filename, n, num_timeseries=1000):
    """
    Function that manually finds the n most similiar timeseries to the given
    timeseries by brute force over every stored timeseries. Serves as a
    check of the vantage point method.

    filename: path to the pickled timeseries of interest
    n: number of most similar timeseries to return
    num_timeseries: how many "GeneratedTimeseries/Timeseries<i>" files exist
        on disk (default 1000, the value previously hard-coded)

    Returns: list of n most similiar filenames
    """
    with open(filename, "rb") as f:
        ts1 = pickle.load(f)

    # standardize the query series once instead of on every comparison
    ts1_stand = distances.stand(ts1, ts1.mean(), ts1.std())

    d = []
    for i in range(num_timeseries):
        with open("GeneratedTimeseries/Timeseries" + str(i), "rb") as f:
            ts2 = pickle.load(f)
        dist = distances.distance(ts1_stand,
                                  distances.stand(ts2, ts2.mean(), ts2.std()),
                                  mult=1)
        d.append([dist, "Timeseries" + str(i)])

    d.sort(key=lambda x: x[0])
    # d[0] is the query series itself (distance 0), so start at index 1
    return [d[i][1] for i in range(1, n + 1)]
# Example #4
 def test_kernelcorr(self):
     """The kernelized cross correlation is exactly 1 for two identical
     timeseries, and strictly below 1 for two differing ones."""
     base = ts(times=[0,1,2,4,5,6],values=[3,4,5,6,7,8])
     base_stand = distances.stand(base,base.mean(),base.std())
     twin = ts(times=[0,1,2,4,5,6],values=[3,4,5,6,7,8])
     twin_stand = distances.stand(twin,twin.mean(), twin.std())
     assert distances.kernel_corr(twin_stand, base_stand) == 1

     other = ts(times=[0,1,2,4,5,6],values=[3,7,9,10,16,20])
     other_stand = distances.stand(other,other.mean(), other.std())
     assert distances.kernel_corr(twin_stand, other_stand) < 1
# Example #5
 def test_standardize(self):
     """Verify the mean, std, and standardized values of a known series."""
     series = ts(times=[0,1,2,4,5,6],values=[3,4,5,6,7,8])
     assert series.mean() == 5.5  # check mean
     assert series.std() == np.sqrt(17.5/6.0)  # check std

     expected = np.array([-1.46385011,-0.87831007,-0.29277002,0.29277002,0.87831007,1.46385011])
     got = distances.stand(series,series.mean(),series.std()).values()
     # str() comparison avoids having to match float precision exactly
     assert str(got) == str(expected)
def find_most_similiar(filename, n, vantage_pts):
    """
    Finds n most similiar time series to the time series of interest (filename)
    by using the supplied vantage points.

    filename: timeseries of interest
    n: number of similiar timeseries to return (n must be between 1 and 20)
    vantage_pts: a list of the vantage point numbers

    Returns: list of n most similiar filenames
    """
    # load the query timeseries and standardize it once
    with open(filename, "rb") as f:
        ts1 = pickle.load(f)
    ts1_stand = distances.stand(ts1, ts1.mean(), ts1.std())

    # rank every vantage point by its distance to the query series
    vp_distances = []
    for vp in vantage_pts:
        with open("GeneratedTimeseries/Timeseries" + str(vp), "rb") as f:
            ts2 = pickle.load(f)
        vp_dist = distances.distance(ts1_stand,
                                     distances.stand(ts2, ts2.mean(),
                                                     ts2.std()),
                                     mult=1)
        vp_distances.append([vp_dist, vp])
    vp_distances.sort(key=lambda x: x[0])

    # gather candidate series around each of the n closest vantage points;
    # a radius of twice the vantage distance guarantees nothing is missed
    candidates = []
    for dist_to_vp, vp in vp_distances[:n]:
        for pair in find_similarity_of_points_in_radius(vp, ts1,
                                                        2 * dist_to_vp):
            if pair not in candidates:
                candidates.append(pair)
    candidates.sort(key=lambda x: x[0])

    # index 0 is the query timeseries itself, so skip it
    return [candidates[i][1] for i in range(1, n + 1)]
# Example #7
def pick_vantage_points(arg):
    """
    Pick vantage points at random and produce a database for each one.
    The database stores (key,value) pairs where:
    key = distance from timeseries to vantage point (kernel coefficient)
    value = id of timeseries (0-999)

    arg: either an argv-style list understood by argparse (``--n <count>``)
         or the number of vantage points itself

    returns: list of vantage points (integers from 0-999)
    """
    try:
        parser = argparse.ArgumentParser(description="vantage points")
        parser.add_argument('--n',
                            help='number of vantage points',
                            type=int,
                            default=20)
        args = parser.parse_args(arg)
        num = args.n
    except (SystemExit, TypeError):
        # argparse raises SystemExit on bad argv and TypeError on a
        # non-list arg; in either case treat arg as the count directly
        num = arg

    # start from a clean database directory
    shutil.rmtree('VantagePointDatabases', ignore_errors=True)
    os.mkdir('VantagePointDatabases')

    vantage_pts = random.sample(range(0, 1000), num)

    for vantage_point in vantage_pts:
        # the directory was just emptied, so connect() creates a new file
        db1 = BinarySearchDatabase.connect("VantagePointDatabases/" +
                                           str(vantage_point) + ".dbdb")

        with open("GeneratedTimeseries/Timeseries" + str(vantage_point),
                  "rb") as f:
            ts2 = pickle.load(f)
        # standardize the vantage series once, not in every inner iteration
        ts2_stand = distances.stand(ts2, ts2.mean(), ts2.std())

        for i in range(1000):
            if i != vantage_point:
                with open("GeneratedTimeseries/Timeseries" + str(i),
                          "rb") as f:
                    ts1 = pickle.load(f)
                dist = distances.distance(
                    distances.stand(ts1, ts1.mean(), ts1.std()),
                    ts2_stand,
                    mult=1)
                db1.set(dist, str(i))

        db1.commit()
        db1.close()

    # record the chosen vantage points once, after all databases are built
    # (previously this file was rewritten on every loop iteration)
    with open('VantagePointDatabases/vp', 'w') as f:
        for i in vantage_pts:
            f.write(str(i) + "\n")

    return vantage_pts
# Example #8
 def test_distance(self):
     """A timeseries is at distance 0 from an identical copy of itself."""
     first = ts(times=[0,1,2,4,5,6],values=[3,4,5,6,7,8])
     second = ts(times=[0,1,2,4,5,6],values=[3,4,5,6,7,8])
     first_stand = distances.stand(first,first.mean(),first.std())
     second_stand = distances.stand(second,second.mean(), second.std())
     assert distances.distance(first_stand, second_stand) == 0
# Example #9
 def test_maxcorratphase(self):
     """Check the (phase shift, correlation) pair against known values."""
     series_a = ts(times=[0,1,2,3],values=[1,2,3,4])
     series_b = ts(times=[0,1,2,3],values=[-1,2,1,-1])
     a_stand = distances.stand(series_a,series_a.mean(),series_a.std())
     b_stand = distances.stand(series_b,series_b.mean(), series_b.std())
     assert distances.max_corr_at_phase(a_stand,b_stand) == (3, 0.77459666924148329)
# Example #10
 def test_standardizeConstant(self):
     """Standardizing a constant series must return a series of zeros."""
     t0 = ts(times=[0,1,2,4,5,6],values=[3,3, 3, 3, 3, 3])
     standardized_values = distances.stand(t0,t0.mean(),t0.std()).values()
     # six input points -> six zeros; the original expected array had a
     # stray double comma (syntax error) and only five entries
     assert (str(standardized_values) == str(np.array([0.,0.,0.,0.,0.,0.]))) #check that standardize a series of constant return a series of zeros
def find_most_similiar(filename, n, vantage_pts, isfile=True, dbtype='bstree'):
    """
    Finds n most similiar time series to the time series of interest (filename)
    by using the supplied vantage points.

    filename: timeseries of interest (a path when isfile is True, otherwise
        the timeseries object itself)
    n: number of similiar timeseries to return (n must be between 1 and 20)
    vantage_pts: a list of the vantage point numbers

    Returns: list of n most similiar filenames
    """
    # obtain the query timeseries, either from disk or passed in directly
    if isfile:
        try:
            with open(filename, "rb") as f:
                ts1 = pickle.load(f)
        except:
            print(
                'Requested %s cannot be found in database, returning ERROR INDEX'
                % filename)
            return 'ERROR INDEX'
    else:
        ts1 = filename

    ## check data type
    if not isinstance(ts1, ts):
        print(
            'Requested %s is not a TimeSeries instance, returning ERROR TYPE' %
            filename)
        return 'ERROR TYPE'

    # rank every vantage point by its distance to the query series
    vantage_pts_dist = []
    for vp in vantage_pts:
        with open("GeneratedTimeseries/Timeseries" + str(vp), "rb") as f:
            ts2 = pickle.load(f)

        ## interpolate the timeseries in the database to have the same times
        ## as the client input timeseries
        ts2 = interpolate_to_match_input(ts2, ts1)

        d = distances.distance(distances.stand(ts1, ts1.mean(), ts1.std()),
                               distances.stand(ts2, ts2.mean(), ts2.std()),
                               mult=1)
        vantage_pts_dist.append([d, vp])

    if n > len(vantage_pts_dist) or n < 1:
        print('More neighbours than vantage requested.')
        return 'ERROR NUMBER | {}'.format(len(vantage_pts_dist))
    vantage_pts_dist.sort(key=lambda x: x[0])

    # collect candidate series around each of the n closest vantage points;
    # twice the vantage distance is a radius guaranteed to cover them
    all_pts_to_check = []
    for vp_dist, vp in vantage_pts_dist[:n]:
        pts_in_radius = find_similarity_of_points_in_radius(vp, ts1,
                                                            2 * vp_dist,
                                                            dbtype)
        for candidate in pts_in_radius:
            if candidate not in all_pts_to_check:
                all_pts_to_check.append(candidate)

    all_pts_to_check.sort(key=lambda x: x[0])

    # return the n closest [distance, id] pairs
    file_names = [all_pts_to_check[i] for i in range(0, n)]
    return file_names