def get_trans_matrix_from_scratch(self, lag):
     """
     extract the transition matrix for velocity and angle for the given lag
     :param lag: the lag used to derive the transition matrix
     :return: v_trans_matrix, theta_trans_matrix for the given lag
     """
     # get the size of the transition matrices
     n_v_class, n_theta_class = self.mapping.n_abs_v_classes, self.mapping.n_theta_classes
     # initialize the sparse transition matrices
     i_list_v, j_list_v, val_list_v = [[] for _ in range(3)]
     i_list_theta, j_list_theta, val_list_theta = [[] for _ in range(3)]
     ij_set_v, ij_set_theta = [set([]) for _ in range(2)]
     time_step = self.time_step
     print 'extracting trans matrix for the velocity and angle process...'
     for j in range(self.n_total_realz):
         if not j % 5:
             print 'reading realization number: ', j
         file_name = "real_" + str(j) + ".pkl"
         input_file = os.path.join(self.input_folder, file_name)
         with open(input_file, 'rb') as input:
             dataHolder = pickle.load(input)
         dx = np.diff(dataHolder.x_array)
         dy = np.diff(dataHolder.y_array)
         dt = np.diff(dataHolder.t_array) + 1e-15
         if not (dx.shape[0] and dy.shape[0] and dt.shape[0]):
             print 'some array was empty, skipping this file...'
             continue
         lastIdx = dataHolder.last_idx_array
         vxMatrix = np.divide(dx, dt)
         vyMatrix = np.divide(dy, dt)
         m = dx.shape[0]
         for i in range(m):
             x_start = dataHolder.x_array[i, 0]
             y_start = dataHolder.y_array[i, 0]
             # get the time process for each velocity
             cutOff = lastIdx[i]
             dxTime, dyTime, freq = get_time_dx_dy_array_with_freq(
                 dt[i, :cutOff], vxMatrix[i, :cutOff], vyMatrix[i, :cutOff],
                 x_start, y_start, time_step)
             v_temp = np.sqrt(np.power(dxTime, 2) +
                              np.power(dyTime, 2)) / time_step
             theta_temp = np.arctan2(dyTime, dxTime)
             if len(v_temp) > lag:
                 new_v, new_theta, new_f = remove_duplicate_xy(
                     v_temp, theta_temp, freq)
                 v_process_idx = self.mapping.find_1d_class_idx(
                     np.log(new_v), self.mapping.v_log_edges)
                 # fill the transition matrix for this velocity series
                 fill_one_trajectory_sparse_with_freq_cython(
                     lag, v_process_idx, new_f, i_list_v, j_list_v,
                     ij_set_v, val_list_v)
                 # fill the transition matrix for this angle series
                 theta_process_idx = self.mapping.find_1d_class_idx(
                     new_theta, self.mapping.theta_edges)
                 fill_one_trajectory_sparse_with_freq_cython(
                     lag, theta_process_idx, new_f, i_list_theta,
                     j_list_theta, ij_set_theta, val_list_theta)
     print 'done'
     return csc_matrix((val_list_v, (i_list_v, j_list_v)), shape = (n_v_class, n_v_class)), \
            csc_matrix((val_list_theta, (i_list_theta, j_list_theta)), shape = (n_theta_class, n_theta_class))
 def get_trans_matrix(self, lag):
     """
     Build the sparse transition count matrix of the 2d (velocity, angle)
     classes for the given lag.

     Every pickled realization ``real_<j>.pkl`` in ``self.input_folder`` is
     read and each trajectory is converted to a process with the uniform time
     step ``self.time_step``. When ``self.filter_length`` and/or
     ``self.filter_time`` are set, each trajectory is truncated at the first
     point of *that trajectory* whose x (respectively t) value is not below
     the limit; a trajectory that never reaches the limit is left untouched.
     :param lag: the lag used to derive the transition matrix
     :return: ``csc_matrix`` of shape (n_2d_classes, n_2d_classes)
     """

     def first_index_not_below(row, limit, default):
         """Index of the first entry of row that is not < limit, else default."""
         # np.argmin on an all-True mask returns 0, which would wipe the whole
         # trajectory, so the "limit never reached" case is handled explicitly
         hits = np.flatnonzero(~(row < limit))
         return hits[0] if hits.size else default

     # single-argument print(...) produces the same text under the Python 2
     # print statement and also parses under Python 3
     filter_length = self.filter_length
     if filter_length:
         print('using only points with x values less than ' + str(
             filter_length))
     filter_time = self.filter_time
     if filter_time:
         print('using only points with time less than ' + str(filter_time))
     n_2d_class = self.mapping.n_2d_classes
     # (row, column, value) containers filled by the cython routine
     i_list = []
     j_list = []
     ij_list = set()
     val_list = []
     time_step = self.time_step
     print('extracting trans matrix...')
     for j in range(self.n_total_realz):
         print('%s %s' % ('realization number: ', j))
         file_name = "real_" + str(j) + ".pkl"
         input_file = os.path.join(self.input_folder, file_name)
         # NOTE: pickle.load can execute arbitrary code; read trusted files only
         with open(input_file, 'rb') as pkl_file:
             dataHolder = pickle.load(pkl_file)
         dx = np.diff(dataHolder.x_array)
         dy = np.diff(dataHolder.y_array)
         # tiny offset avoids dividing by a zero time increment
         dt = np.diff(dataHolder.t_array) + 1e-15
         if not (dx.shape[0] and dy.shape[0] and dt.shape[0]):
             print('some array was empty, skipping this file...')
             continue
         lastIdx = dataHolder.last_idx_array
         vxMatrix = np.divide(dx, dt)
         vyMatrix = np.divide(dy, dt)
         m = dx.shape[0]
         for i in range(m):
             x_start = dataHolder.x_array[i, 0]
             y_start = dataHolder.y_array[i, 0]
             # get the time process for each velocity
             cutOff = lastIdx[i]
             # the limits are evaluated on row i only, so one trajectory's
             # crossing point never truncates another trajectory
             if filter_length:
                 cutOff = min(cutOff, first_index_not_below(
                     dataHolder.x_array[i], filter_length, cutOff))
             if filter_time:
                 cutOff = min(cutOff, first_index_not_below(
                     dataHolder.t_array[i], filter_time, cutOff))
             dxTime, dyTime, freq = get_time_dx_dy_array_with_freq(
                 dt[i, :cutOff], vxMatrix[i, :cutOff], vyMatrix[i, :cutOff],
                 x_start, y_start, time_step)
             v_temp = np.sqrt(np.power(dxTime, 2) +
                              np.power(dyTime, 2)) / time_step
             theta_temp = np.arctan2(dyTime, dxTime)
             # series not longer than the lag are skipped
             if len(v_temp) > lag:
                 new_v, new_theta, new_f = remove_duplicate_xy(
                     v_temp, theta_temp, freq)
                 class_2d = self.mapping.class_index_2d_vtheta(
                     new_v, new_theta)
                 # frequencies are passed on as C int
                 new_f = np.array(new_f, dtype=np.dtype("i"))
                 fill_one_trajectory_sparse_with_freq_cython(
                     lag, class_2d, new_f, i_list, j_list, ij_list,
                     val_list)
     print('done')
     return csc_matrix((val_list, (i_list, j_list)),
                       shape=(n_2d_class, n_2d_class))
def get_trans_matrix_single_attrib(lag_array, n_realz, input_folder, mapping, time_step, prefix='real_',
                                   numbered=True, verbose=False):
    """
    Count velocity-class and angle-class transitions separately, once per lag.

    For every lag all realization files are read again, each trajectory is
    converted to a process with uniform time step ``time_step`` and the
    counts are accumulated into dense matrices via
    ``count_matrix_with_freq_one_trajectory``.
    :param lag_array: iterable of lags; one pair of matrices is produced per lag
    :param n_realz: number of realization files to read
    :param input_folder: folder containing the pickled realizations
    :param mapping: object providing ``v_log_edges``, ``theta_edges``,
                    ``n_abs_v_classes``, ``n_theta_classes`` and
                    ``find_1d_class_idx``
    :param time_step: time step size of the uniform-time process
    :param prefix: file name prefix of the input files
    :param numbered: if True the files are ``prefix + str(j) + ".pkl"``,
                     otherwise the single file ``prefix + ".pkl"`` is read
    :param verbose: print a progress message every 20 realizations
    :return: v_output_list, theta_output_list -- lists with one dense count
             matrix per lag
    :raises ValueError: if ``numbered`` is False and ``n_realz`` > 1
    """
    if (not numbered) and n_realz > 1:
        # a bare string is not a valid exception object
        raise ValueError('Expecting only one file when no numbers are used for the input data')
    v_log_edges = mapping.v_log_edges
    n_v_class = mapping.n_abs_v_classes
    n_theta_class = mapping.n_theta_classes
    theta_edges = mapping.theta_edges
    v_output_list = []
    theta_output_list = []
    # single-argument print(...) produces the same text under the Python 2
    # print statement and also parses under Python 3
    for lag in lag_array:
        print("%s %s" % (" extracting matrices for lag = ", lag))
        v_count_matrix = np.zeros((n_v_class, n_v_class))
        t_count_matrix = np.zeros((n_theta_class, n_theta_class))
        for j in range(n_realz):
            if verbose and not j % 20:
                print("%s %s" % ('realziation ', j))
            if numbered:
                file_name = prefix + str(j) + ".pkl"
            else:
                file_name = prefix + ".pkl"
            input_file = os.path.join(input_folder, file_name)
            # NOTE: pickle.load can execute arbitrary code; read trusted files only
            with open(input_file, 'rb') as pkl_file:
                dataHolder = pickle.load(pkl_file)
            dx = np.diff(dataHolder.x_array)
            dy = np.diff(dataHolder.y_array)
            # NOTE(review): the sibling routines in this file add a tiny epsilon
            # to dt before dividing; here a zero time increment yields inf/nan
            # velocities -- confirm whether that is intended
            dt = np.diff(dataHolder.t_array)
            if not (dx.shape[0] and dy.shape[0] and dt.shape[0]):
                print('some array was empty, skipping this file...')
                continue
            lastIdx = dataHolder.last_idx_array
            vxMatrix = np.divide(dx, dt)
            vyMatrix = np.divide(dy, dt)
            m = dx.shape[0]
            for i in range(m):
                x_start = dataHolder.x_array[i, 0]
                y_start = dataHolder.y_array[i, 0]
                # get the time process for each velocity
                cutOff = lastIdx[i]
                dxTime, dyTime, freq = get_time_dx_dy_array_with_freq(dt[i, :cutOff], vxMatrix[i, :cutOff],
                                                                      vyMatrix[i, :cutOff], x_start, y_start,
                                                                      time_step)
                v_temp = np.sqrt(np.power(dxTime, 2) + np.power(dyTime, 2)) / time_step
                theta_temp = np.arctan2(dyTime, dxTime)
                # series not longer than the lag are skipped
                if len(v_temp) > lag:
                    new_v, new_theta, new_f = remove_duplicate_xy(v_temp, theta_temp, freq)
                    # velocity classes are looked up on the log of the velocity
                    class_v = np.array(mapping.find_1d_class_idx(np.log(new_v), v_log_edges), dtype=int)
                    class_theta = np.array(mapping.find_1d_class_idx(new_theta, theta_edges), dtype=int)
                    count_matrix_with_freq_one_trajectory(v_count_matrix, lag, class_v, new_f)
                    count_matrix_with_freq_one_trajectory(t_count_matrix, lag, class_theta, new_f)
        v_output_list.append(v_count_matrix)
        theta_output_list.append(t_count_matrix)
    return v_output_list, theta_output_list
 def get_trans_matrix_from_scratch(self, lag, print_every=50, verbose=True):
     n_3d_class = self.mapping.n_3d_classes
     i_list = []
     j_list = []
     ij_list = set([])
     val_list = []
     time_step = self.time_step
     print 'extracting trans matrix...'
     for j in range(self.n_total_realz):
         if verbose and not j % print_every:
             print 'reading realization number: ', j
         file_name = "real_" + str(j) + ".pkl"
         input_file = os.path.join(self.input_folder, file_name)
         with open(input_file, 'rb') as input:
             dataHolder = pickle.load(input)
         dx = np.diff(dataHolder.x_array)
         dy = np.diff(dataHolder.y_array)
         dt = np.diff(dataHolder.t_array) + 1e-15
         if not (dx.shape[0] and dy.shape[0] and dt.shape[0]):
             print 'some array was empty, skipping this file...'
             continue
         lastIdx = dataHolder.last_idx_array
         vxMatrix = np.divide(dx, dt)
         vyMatrix = np.divide(dy, dt)
         m = dx.shape[0]
         for i in range(m):
             x_start = dataHolder.x_array[i, 0]
             y_start = dataHolder.y_array[i, 0]
             # get the time process for each velocity
             cutOff = lastIdx[i]
             dxTime, dyTime, freq = get_time_dx_dy_array_with_freq(
                 dt[i, :cutOff], vxMatrix[i, :cutOff], vyMatrix[i, :cutOff],
                 x_start, y_start, time_step)
             v_temp = np.sqrt(np.power(dxTime, 2) +
                              np.power(dyTime, 2)) / time_step
             theta_temp = np.arctan2(dyTime, dxTime)
             if len(v_temp) > lag:
                 new_v, new_theta, new_f = remove_duplicate_xy(
                     v_temp, theta_temp, freq)
                 class_2d = self.mapping.class_index_2d_vtheta(
                     new_v, new_theta)
                 class_3d_array = self.mapping.find_3d_class_number(
                     class_2d, new_f)
                 fill_one_trajectory_sparse_cython(lag, class_3d_array,
                                                   i_list, j_list, ij_list,
                                                   val_list)
     print 'done'
     return csc_matrix((val_list, (i_list, j_list)),
                       shape=(n_3d_class, n_3d_class))
def test_convert_to_time_process_xyf_3():
    x_start = 0.0
    y_start = 0.0
    dt_array = np.array([0.6, 0.6, 4.0], dtype=np.float)
    vx_array = np.array([1.0, 1.0, 1.0], dtype=np.float)
    vy_array = 3 * np.array([1.0, 1.0, 1.0], dtype=np.float)
    deltaT = 0.4
    expected_dx = np.array([0.4, 0.4, 0.4, 0.4, 0.4])
    expected_freq = np.array([1, 1, 1, 9, 1])
    dx_array, dy_array, freq_array = get_time_dx_dy_array_with_freq(
        dt_array, vx_array, vy_array, x_start, y_start, deltaT)
    diff_dx_norm = np.linalg.norm(dx_array - expected_dx)
    diff_dy_norm = np.linalg.norm(dy_array - 3 * expected_dx)
    print "norm(diff_dx): ", diff_dx_norm
    tol = 1e-12
    assert (diff_dx_norm < tol)
    assert (diff_dy_norm < tol)
    assert (np.all(freq_array == expected_freq))
def test_convert_to_time_process_xyf_2():
    x_start = 0.0
    y_start = 0.0
    dt_array = np.array([1.0, 0.1, 2.0, 0.9], dtype=np.float)
    vx_array = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float)
    vy_array = 3 * vx_array
    deltaT = 1.0
    expected_dx = np.array([1.0, 2.9, 3.0, 3.9])
    expected_freq = np.array([1., 1., 1., 1.])
    dx_array, dy_array, freq_array = get_time_dx_dy_array_with_freq(
        dt_array, vx_array, vy_array, x_start, y_start, deltaT)
    diff_dx_norm = np.linalg.norm(dx_array - expected_dx)
    diff_dy_norm = np.linalg.norm(dy_array - 3 * expected_dx)
    print "norm(diff_dx): ", diff_dx_norm
    tol = 1e-12
    assert (diff_dx_norm < tol)
    assert (diff_dy_norm < tol)
    assert (np.all(freq_array == expected_freq))
def test_convert_to_time_process_xyf_1():
    x_start = 0.0
    y_start = 0.0
    dt_array = np.array([10.0, 0.5, 1.6, 0.9], dtype=np.float)
    vx_array = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float)
    vy_array = np.array([4.0, 3.0, 2.0, 1.0], dtype=np.float)
    deltaT = 1.0
    dx_array, dy_array, freq_array = get_time_dx_dy_array_with_freq(
        dt_array, vx_array, vy_array, x_start, y_start, deltaT)
    dx2, dy2, freq2 = remove_duplicate_xy(dx_array, dy_array, freq_array)
    expected_dx = np.array([1.0, 2.5, 3.0, 3.9])
    expected_dy = np.array([4.0, 2.5, 2.0, 1.1])
    expected_freq = np.array([10., 1., 1., 1.])
    diff_dx_norm = np.linalg.norm(dx2 - expected_dx)
    diff_dy_norm = np.linalg.norm(dy2 - expected_dy)
    print "norm(diff_dx): ", diff_dx_norm
    tol = 1e-12
    assert (diff_dx_norm < tol)
    assert (diff_dy_norm < tol)
    assert (np.all(freq2 == expected_freq))
def _flatten_chunks(chunks, dtype):
    """Concatenate a list of 1d sequences into one array of the given dtype."""
    return np.array(list(itertools.chain.from_iterable(chunks)), dtype=dtype)


def average_all_realizations(input_folder,
                             n_realizations,
                             time_step,
                             save_folder,
                             n_combine=None,
                             prefix='real',
                             verbose=True,
                             print_every=20):
    """
    Average the realizations in time and save them in batches.

    Every ``<prefix>_<j>.pkl`` realization is converted, trajectory by
    trajectory, to a process with the uniform time step ``time_step``.
    After every ``n_combine`` realizations (and after the last one) two files
    are written to ``save_folder``:
      * ``avg_cartesian_<k>.npz`` with DX, DY, F, Y, ptr, dt, n_realz
      * ``avg_polar_<k>.npz`` with V, Theta, F, ptr, dt, n_realz
    At the end ``initial_arrays.npz`` (first v, theta, f of every kept
    trajectory) and ``case_info.npz`` (n_out, n_input, dt) are written.
    Only trajectories with more than one averaged step are kept.
    ``save_folder`` is created when it does not exist yet.
    :param input_folder: folder containing the input realizations
    :param n_realizations: total number of realizations
    :param time_step: time step size
    :param save_folder: full path to folder to save the averaged realizations
    :param n_combine: number of realizations to combine in each output file;
                      a falsy value puts all realizations in one file
    :param prefix: prefix for input files
    :param verbose: whether to write output messages or not
    :param print_every: output messages print frequency
    :return: None
    """
    # single-argument print(...) produces the same text under the Python 2
    # print statement and also parses under Python 3
    if verbose:
        print("averaging realizations...")
    if not n_combine:
        print("n_combine == None --> saving all trajectories in one file...")
        n_combine = n_realizations
    # make folder for saving averaged realizations
    if not os.path.isdir(save_folder):
        os.makedirs(save_folder)
    # running number of samples in the current batch
    total_length = 0
    # count number of output files
    counter = 0
    # count realizations per output file
    realz_count = 0
    # cumulative end position of each trajectory inside the current batch
    pointer_list = []
    # first sample of every kept trajectory, collected over all batches
    initial_v = []
    initial_f = []
    initial_theta = []
    big_dx_list, big_dy_list, big_freq_list = [], [], []
    big_v_list, big_theta_list, big_y_list = [], [], []
    # for each realization
    for j in range(n_realizations):
        if verbose and not j % print_every:
            print("%s %s" % ("reading realization nr ", j))
        case_name = prefix + "_" + str(j) + ".pkl"
        input_file = os.path.join(input_folder, case_name)
        # NOTE: pickle.load can execute arbitrary code; read trusted files only
        with open(input_file, 'rb') as pkl_file:
            dataHolder = pickle.load(pkl_file)
        dx = np.diff(dataHolder.x_array)
        dy = np.diff(dataHolder.y_array)
        # tiny offset avoids dividing by a zero time increment
        dt = np.diff(dataHolder.t_array) + 1e-15
        lastIdx = dataHolder.last_idx_array
        vxMatrix = np.divide(dx, dt)
        vyMatrix = np.divide(dy, dt)
        # read all the trajectories in this realization
        for i in range(dx.shape[0]):
            x_start = dataHolder.x_array[i, 0]
            y_start = dataHolder.y_array[i, 0]
            # get the time process for each velocity
            cutOff = lastIdx[i]
            dx_time, dy_time, freq = get_time_dx_dy_array_with_freq(
                dt[i, :cutOff], vxMatrix[i, :cutOff], vyMatrix[i, :cutOff],
                x_start, y_start, time_step)
            if len(dx_time) < 1:
                continue
            dx_time, dy_time, freq = remove_duplicate_xy(
                dx_time, dy_time, freq)
            current_length = len(dx_time)
            if current_length <= 1:
                continue
            # displacement magnitude per step; divided by time_step on output
            current_v = np.sqrt(np.power(dx_time, 2) + np.power(dy_time, 2))
            current_theta = np.arctan2(dy_time, dx_time)
            total_length += current_length
            big_dx_list.append(dx_time)
            big_dy_list.append(dy_time)
            big_v_list.append(current_v)
            big_theta_list.append(current_theta)
            big_y_list.append(np.cumsum(dy_time))
            big_freq_list.append(freq)
            pointer_list.append(total_length)
            # save the first velocity for initialization
            initial_v.append(current_v[0] / time_step)
            initial_theta.append(current_theta[0])
            initial_f.append(freq[0])
        realz_count += 1
        if n_combine == 1 or (j > 0 and (j + 1) % n_combine
                              == 0) or j + 1 == n_realizations:
            if verbose:
                print('     -saving combined realizations')
            # the frequencies go into both output files
            big_freq_array = _flatten_chunks(big_freq_list, int)
            # save these n_combine averaged realizations in cartesian frame
            save_path = os.path.join(save_folder,
                                     'avg_cartesian_' + str(counter) + '.npz')
            np.savez(save_path,
                     DX=_flatten_chunks(big_dx_list, float),
                     DY=_flatten_chunks(big_dy_list, float),
                     F=big_freq_array,
                     Y=_flatten_chunks(big_y_list, float),
                     ptr=pointer_list,
                     dt=time_step,
                     n_realz=realz_count)
            # save these n_combine averaged realizations in polar coordinates
            save_path = os.path.join(save_folder,
                                     'avg_polar_' + str(counter) + '.npz')
            np.savez(save_path,
                     V=_flatten_chunks(big_v_list, float) / time_step,
                     Theta=_flatten_chunks(big_theta_list, float),
                     F=big_freq_array,
                     ptr=pointer_list,
                     dt=time_step,
                     n_realz=realz_count)
            # start the next batch with empty containers
            big_dx_list, big_dy_list, big_freq_list = [], [], []
            big_v_list, big_theta_list, big_y_list = [], [], []
            pointer_list = []
            total_length = 0
            del big_freq_array
            counter += 1
            realz_count = 0
    # save the initial values for v, theta, f
    save_path = os.path.join(save_folder, 'initial_arrays.npz')
    np.savez(save_path,
             v=np.array(initial_v),
             theta=np.array(initial_theta),
             f=np.array(initial_f, dtype=int),
             dt=time_step)
    # save number of averaged realization files
    save_path = os.path.join(save_folder, 'case_info.npz')
    np.savez(save_path, n_out=counter, n_input=n_realizations, dt=time_step)
def get_trans_matrix_single_attrib_both_methods_from_scratch(lag_array, n_realz, input_folder, mapping, time_step,
                                                             prefix='real_', numbered=True, verbose=False):
    """
    Get the aggregate transition matrix both considering the frequency and not considering the frequency
    corresponding to the stencil method and the extended stencil method

    Each realization file is read once; every trajectory is converted to a
    process with uniform time step ``time_step`` and, for each lag it is long
    enough for, counted into the matrices belonging to that lag.
    :param lag_array: sequence of lags; matrix ``k`` of every returned list
                      belongs to ``lag_array[k]``
    :param n_realz: number of realization files to read
    :param input_folder: folder containing the pickled realizations
    :param mapping: object providing ``v_log_edges``, ``theta_edges``,
                    ``n_abs_v_classes``, ``n_theta_classes`` and
                    ``find_1d_class_idx``
    :param time_step: time step size of the uniform-time process
    :param prefix: file name prefix of the input files
    :param numbered: if True the files are ``prefix + str(j) + ".pkl"``,
                     otherwise the single file ``prefix + ".pkl"`` is read
    :param verbose: print a progress message every 20 realizations
    :return: v_output_list, theta_output_list, v_output_list_nofreq,
             theta_output_list_nofreq -- four lists of dense count matrices,
             each holding ``max(len(lag_array), 2)`` matrices
    :raises ValueError: if ``numbered`` is False and ``n_realz`` > 1
    """
    if (not numbered) and n_realz > 1:
        # a bare string is not a valid exception object
        raise ValueError('Expecting only one file when no numbers are used for the input data')
    v_log_edges = mapping.v_log_edges
    n_v_class = mapping.n_abs_v_classes
    n_theta_class = mapping.n_theta_classes
    theta_edges = mapping.theta_edges
    # one matrix per lag; never fewer than two so that the historical output
    # shape (exactly two matrices per list) is preserved for one or two lags
    n_out = max(len(lag_array), 2)
    v_output_list = [np.zeros((n_v_class, n_v_class)) for _ in range(n_out)]
    theta_output_list = [np.zeros((n_theta_class, n_theta_class)) for _ in range(n_out)]
    v_output_list_nofreq = [np.zeros((n_v_class, n_v_class)) for _ in range(n_out)]
    theta_output_list_nofreq = [np.zeros((n_theta_class, n_theta_class)) for _ in range(n_out)]
    # single-argument print(...) produces the same text under the Python 2
    # print statement and also parses under Python 3
    for j in range(n_realz):
        if verbose and not j % 20:
            print("%s %s" % ('realziation ', j))
        if numbered:
            file_name = prefix + str(j) + ".pkl"
        else:
            file_name = prefix + ".pkl"
        input_file = os.path.join(input_folder, file_name)
        # NOTE: pickle.load can execute arbitrary code; read trusted files only
        with open(input_file, 'rb') as pkl_file:
            dataHolder = pickle.load(pkl_file)
        dx = np.diff(dataHolder.x_array)
        dy = np.diff(dataHolder.y_array)
        # tiny offset avoids dividing by a zero time increment
        dt = np.diff(dataHolder.t_array) + 1e-12
        if not (dx.shape[0] and dy.shape[0] and dt.shape[0]):
            print('some array was empty, skipping this file...')
            continue
        lastIdx = dataHolder.last_idx_array
        vxMatrix = np.divide(dx, dt)
        vyMatrix = np.divide(dy, dt)
        m = dx.shape[0]
        for i in range(m):
            x_start = dataHolder.x_array[i, 0]
            y_start = dataHolder.y_array[i, 0]
            # get the time process for each velocity
            cutOff = lastIdx[i]
            dxTime, dyTime, freq = get_time_dx_dy_array_with_freq(dt[i, :cutOff], vxMatrix[i, :cutOff],
                                                                  vyMatrix[i, :cutOff], x_start, y_start,
                                                                  time_step)
            v_temp = np.sqrt(np.power(dxTime, 2) + np.power(dyTime, 2)) / time_step
            theta_temp = np.arctan2(dyTime, dxTime)
            new_v, new_theta, new_f = remove_duplicate_xy(v_temp, theta_temp, freq)
            # the class indices do not depend on the lag: compute them at most
            # once per trajectory, and only if some lag actually needs them
            class_v = class_theta = None
            for idx_lag, lag in enumerate(lag_array):
                if len(new_v) <= lag:
                    continue
                if class_v is None:
                    # velocity classes are looked up on the log of the velocity
                    class_v = np.array(mapping.find_1d_class_idx(np.log(new_v), v_log_edges), dtype=int)
                    class_theta = np.array(mapping.find_1d_class_idx(new_theta, theta_edges), dtype=int)
                count_matrix_with_freq_one_trajectory(v_output_list[idx_lag], lag, class_v, new_f)
                count_matrix_with_freq_one_trajectory(theta_output_list[idx_lag], lag, class_theta, new_f)
                # get the transition matrices for the extended method (v, theta, f) ->
                # input (v,theta)
                count_matrix_one_trajectory(v_output_list_nofreq[idx_lag], lag, class_v)
                count_matrix_one_trajectory(theta_output_list_nofreq[idx_lag], lag, class_theta)
    return v_output_list, theta_output_list, v_output_list_nofreq, theta_output_list_nofreq
def binning_input_v_theta_freq_y(input_folder,
                                 n_realizations,
                                 time_step,
                                 prefix='real',
                                 verbose=True):
    """
    generate sample processes for v, theta, freq, y to be used for creating classes

    Every ``<prefix>_<j>.pkl`` realization is converted, trajectory by
    trajectory, to a process with the uniform time step ``time_step``; only
    trajectories with more than one averaged step are kept. The per-trajectory
    pieces are collected in lists and joined once at the end.
    :param input_folder: folder containing the input realizations
    :param n_realizations: number of realizations to consider
    :param time_step: time step size
    :param prefix: prefix for input files
    :param verbose: whether to write output messages or not
    :return: tuple ``(big_v_array, big_theta_array, big_y_array,
             big_freq_array, pointer_list, initial_v, initial_f,
             initial_theta)`` where the ``big_*`` arrays are the concatenated
             samples of all kept trajectories (v already divided by
             ``time_step``), ``pointer_list`` holds the cumulative number of
             v samples after each kept trajectory and the ``initial_*`` arrays
             hold the first v, f, theta of each kept trajectory
    """
    # single-argument print(...) produces the same text under the Python 2
    # print statement and also parses under Python 3
    if verbose:
        print("making long array for generating v, theta, frequency bins...")
    total_length = 0
    pointer_list = []
    initial_v = []
    initial_f = []
    initial_theta = []
    # per-trajectory pieces; joining them once avoids re-copying the growing
    # arrays for every trajectory
    v_chunks, theta_chunks, freq_chunks, y_chunks = [], [], [], []
    for j in range(n_realizations):
        if verbose:
            print("%s %s" % ("reading realization nr ", j))
        case_name = prefix + "_" + str(j) + ".pkl"
        input_file = os.path.join(input_folder, case_name)
        # NOTE: pickle.load can execute arbitrary code; read trusted files only
        with open(input_file, 'rb') as pkl_file:
            dataHolder = pickle.load(pkl_file)
        dx = np.diff(dataHolder.x_array)
        dy = np.diff(dataHolder.y_array)
        # tiny offset avoids dividing by a zero time increment
        dt = np.diff(dataHolder.t_array) + 1e-15
        lastIdx = dataHolder.last_idx_array
        vxMatrix = np.divide(dx, dt)
        vyMatrix = np.divide(dy, dt)
        for i in range(dx.shape[0]):
            x_start = dataHolder.x_array[i, 0]
            y_start = dataHolder.y_array[i, 0]
            # get the time process for each velocity (averaging/integrating arrays in time)
            cutOff = lastIdx[i]
            dxTime, dyTime, freq = get_time_dx_dy_array_with_freq(
                dt[i, :cutOff], vxMatrix[i, :cutOff], vyMatrix[i, :cutOff],
                x_start, y_start, time_step)
            if len(dxTime) < 1:
                continue
            dxTime, dyTime, freq = remove_duplicate_xy(dxTime, dyTime, freq)
            current_length = len(dxTime)
            if current_length <= 1:
                continue
            total_length += current_length
            # displacement magnitude per step; divided by time_step at the end
            current_v = np.sqrt(np.power(dxTime, 2) + np.power(dyTime, 2))
            current_theta = np.arctan2(dyTime, dxTime)
            v_chunks.append(current_v)
            theta_chunks.append(current_theta)
            freq_chunks.append(freq)
            # NOTE(review): the leading 0.0 makes each y piece one entry longer
            # than the v/theta/freq pieces, so pointer_list does not index
            # big_y_array -- confirm that consumers expect this
            y_chunks.append(np.hstack((0.0, np.cumsum(dyTime))))
            pointer_list.append(total_length)
            # save the first velocity for initialization
            initial_v.append(current_v[0] / time_step)
            initial_theta.append(current_theta[0])
            initial_f.append(freq[0])
    # the empty float seed keeps the float dtype (and a valid result) even when
    # no trajectory was kept
    seed = np.array([], dtype=float)
    big_v_array = np.hstack([seed] + v_chunks)
    big_theta_array = np.hstack([seed] + theta_chunks)
    big_freq_array = np.hstack([seed] + freq_chunks)
    big_y_array = np.hstack([seed] + y_chunks)
    assert (len(big_v_array) == len(big_freq_array))
    initial_v = np.array(initial_v)
    initial_f = np.array(initial_f)
    initial_theta = np.array(initial_theta)
    # divide by time to get velocity
    big_v_array /= time_step
    return big_v_array, big_theta_array, big_y_array, big_freq_array, \
           pointer_list, initial_v, initial_f, initial_theta
def make_input_for_binning_v_theta_freq(input_folder,
                                        n_realizations,
                                        time_step,
                                        prefix='real',
                                        verbose=True,
                                        print_every=20):
    """
    :param input_folder: folder containing the input realizations
    :param n_realizations: number of realizations to consider
    :param time_step: time step size
    :param prefix: prefix for input files
    :param verbose: whether to write output messages or not
    :return big_v_array:
    :return big_freq_array:
    :return big_theta_array:
    :return pointer_list:
    :return initial_v0:
    :return initial f_0:
    :return initial_theta0:
    """
    if verbose:
        print "making long array for generating v, theta, frequency bins..."
    total_length = 0
    # each realization has 1000 particles
    pointer_list = []
    initial_v = []
    initial_f = []
    initial_theta = []
    big_v_list, big_theta_list, big_freq_list = [[] for i in range(3)]
    for j in range(n_realizations):
        if verbose and not j % print_every:
            print "reading realization nr ", j
        case_name = prefix + "_" + str(j) + ".pkl"
        input_file = os.path.join(input_folder, case_name)
        with open(input_file, 'rb') as input:
            dataHolder = pickle.load(input)
        dx = np.diff(dataHolder.x_array)
        dy = np.diff(dataHolder.y_array)
        dt = np.diff(dataHolder.t_array) + 1e-15
        lastIdx = dataHolder.last_idx_array
        vxMatrix = np.divide(dx, dt)
        vyMatrix = np.divide(dy, dt)
        m = dx.shape[0]
        for i in range(m):
            x_start = dataHolder.x_array[i, 0]
            y_start = dataHolder.y_array[i, 0]
            # get the time process for each velocity
            cutOff = lastIdx[i]
            dxTime, dyTime, freq = get_time_dx_dy_array_with_freq(
                dt[i, :cutOff], vxMatrix[i, :cutOff], vyMatrix[i, :cutOff],
                x_start, y_start, time_step)
            if len(dxTime) < 1:
                continue
            dxTime, dyTime, freq = remove_duplicate_xy(dxTime, dyTime, freq)
            current_length = len(dxTime)
            if current_length > 1:
                total_length += current_length
                current_v = np.sqrt(np.power(dxTime, 2) + np.power(dyTime, 2))
                current_theta = np.arctan2(dyTime, dxTime)
                big_v_list.append(current_v)
                big_theta_list.append(current_theta)
                big_freq_list.append(freq)
                pointer_list.append(total_length)
                # save the first velocity for initialization
                initial_v.append(current_v[0] / time_step)
                initial_theta.append(current_theta[0])
                initial_f.append(freq[0])
    # flatten the big lists
    chain = itertools.chain(*big_v_list)
    big_v_array = np.array(list(chain), dtype=np.float)
    chain = itertools.chain(*big_theta_list)
    big_theta_array = np.array(list(chain), dtype=np.float)
    chain = itertools.chain(*big_freq_list)
    big_freq_array = np.array(list(chain), dtype=np.float)
    assert (len(big_v_array) == len(big_freq_array))
    initial_v = np.array(initial_v)
    initial_f = np.array(initial_f)
    initial_theta = np.array(initial_theta)
    # divide by time to get velocit
    big_v_array /= time_step
    return big_v_array, big_theta_array, big_freq_array, pointer_list, initial_v, initial_f, initial_theta