def index_data():
    """Simple example with equity index return data.

    Segment the daily rates of return of a pair of equity indices between April
    23rd, 1993 and July 14th, 2003. The indices are the Cotation Assistee en
    Continu (CAC) and the Deutscher Aktienindex (DAX). The rates of return are
    computed based on the daily closing price of each index.

    The data are read from 'data/equity-index-data.csv' under the current
    working directory. The first row is treated as a header of column names;
    in every following row, fields that convert to float are kept as values
    and all other fields are parsed as dates.

    Two Bcdm objects are fed every observation: one created with
    alg='sumprod', whose infer() result is drawn with plot_probability(), and
    one created with alg='maxprod', whose infer() result is drawn with
    plot_segment_span(). The function builds a two-panel matplotlib figure
    and returns nothing.
    """

    # Store the absolute path to the file containing the data.
    abspath = path.realpath(path.join(os.getcwd(), 'data'))
    abspath = path.join(abspath, 'equity-index-data.csv')

    time = []
    val = []

    # Read the data.
    with open(abspath, 'r') as fileobj:
        reader = csv.reader(fileobj, delimiter=',')

        # The header row names the columns. The built-in next() is used
        # because it works on both Python 2 and Python 3, whereas the
        # reader.next() method only exists on Python 2.
        header = next(reader)
        name = [field.upper() for field in header if field != 'date']

        for row in reader:
            rec = []
            for field in row:
                try:
                    rec.append(float(field))
                except ValueError:
                    # Fields that are not numbers are parsed as dates. The
                    # parsed dates are collected but not used further below.
                    time.append(dp.parse(field))
            val.append(rec)

    # Format the data. The predictor is a constant 1 for every observation.
    X = np.ones([len(val), 1])
    Y = np.array(val).reshape([len(val), len(name)])

    # Select daily returns from CAC and DAX.
    ind = ['CAC', 'DAX']
    ind = [name.index(i) for i in ind]
    name = [name[i] for i in ind]
    Y = Y[:, ind]

    kwargs = {'ratefun': 1.0e-2,                   # 1% expected hazard rate
              'mu': np.zeros([1, len(name)]),      # 0% expected rate of return
              'sigma': 1.0e-4 * np.eye(len(name)), # 1% expected volatility
              'maxhypot': 50,
              'minprob': 1.0e-16}

    # Compute the posterior probabilities over segment length hypotheses. Then,
    # find the most likely sequence segmentation.
    bcdm_probabilities = Bcdm(alg='sumprod', **kwargs)
    bcdm_segments = Bcdm(alg='maxprod', **kwargs)

    # Update the segment length hypotheses given the data.
    for x, y in zip(X, Y):
        bcdm_probabilities.update(x, y)
        bcdm_segments.update(x, y)

    # Recover the hypothesis probabilities and back-trace to find the most
    # likely segmentation of the sequence.
    hypotheses_probability = bcdm_probabilities.infer()
    segments = bcdm_segments.infer()

    # Create subplots with shared X-axis.
    fig, (upperaxes, loweraxes) = plt.subplots(2, sharex=True)
    fig.subplots_adjust(hspace=0)

    # Plot the response data against a 1-based observation index.
    t = np.arange(1, len(val) + 1)
    upperaxes.plot(t, Y)

    # Plot the posterior probabilities over segment length hypotheses.
    plot_probability(loweraxes, hypotheses_probability, cmap=plt.cm.gray)

    # Plot the changes detected by the segmentation algorithm as alternating
    # coloured spans on both axes.
    for ax in (upperaxes, loweraxes):
        plt.sca(ax)
        plot_segment_span(t, segments, facecolor='y', alpha=0.2, edgecolor='none')
        ax.set_xlim([0, len(val)])

    # The window title is set through the figure manager: the canvas-level
    # set_window_title() was deprecated in Matplotlib 3.4 and later removed.
    fig.canvas.manager.set_window_title('Equity index data')
    upperaxes.set_title('Equity index data')
    upperaxes.set_ylabel('Rate of return')
    loweraxes.set_xlabel('Trading day')
    loweraxes.set_ylabel('Hypothesis probability')

    # Label the curves with the names of the selected columns so the legend
    # cannot drift from the selection made above.
    upperaxes.legend(name, loc='upper left')
def non_sinusoidal():
    """Simple example with noisy square, triangle and sawtooth wave data.

    Three non-sinusoidal waves are generated over three periods of 2*pi,
    perturbed with Gaussian noise and stacked as the three response columns.
    Two Bcdm objects using a linear basis are fed every observation: one
    created with alg='sumprod', whose infer() result is drawn with
    plot_probability(), and one created with alg='maxprod', whose infer()
    result is drawn with plot_segment_span(). The known boundary locations
    are overlaid with plot_segment_boundaries(). The function builds a
    two-panel matplotlib figure and returns nothing.
    """

    rate = 0.001
    omega = 1.0e-3 * np.eye(2)
    sigma = 1.0e-6 * np.eye(3)
    samples = 1000

    def basis(x):
        # Linear basis: a constant term followed by the input.
        return np.array([[1.0, x]])

    # Create the wave functions.
    def square_wave(x):
        return np.sign(np.sin(x))

    def sawtooth_wave(a, x):
        # Sawtooth with period a and values in [-1, 1).
        return 2 * ((x / a) - np.floor(0.5 + (x / a)))

    def triangle_wave(a, x):
        # Triangle wave built by folding the sawtooth.
        return 2 * np.abs(sawtooth_wave(a, x)) - 1

    # Create input and outputs.
    X = np.linspace(0, 3 * 2 * np.pi, samples).reshape(samples, 1)
    Y = np.hstack([square_wave(X),
                   triangle_wave(2 * np.pi, X - np.pi / 2),
                   sawtooth_wave(2 * np.pi, X + np.pi / 3)])

    # Create Gaussian noise, with a different scale for each wave.
    Y += np.vstack([0.025 * np.random.randn(samples),
                    0.1 * np.random.randn(samples),
                    0.05 * np.random.randn(samples)]).T

    # Largest input value as a scalar. The builtin max() applied to the 2-D
    # array X would return a one-element array instead.
    x_max = X.max()

    # Determine location of true boundaries.
    true_boundaries = np.hstack((np.pi * np.arange(0, 7),
                                 np.pi * np.arange(0, 6) + np.pi / 2,
                                 2 * np.pi * np.arange(0, 4) + np.pi - np.pi / 3))

    true_boundaries = np.sort(true_boundaries[true_boundaries <= x_max])

    # Compute the posterior probabilities over segment length hypotheses. Then,
    # find the most likely segmentation of the sequence.
    bcdm_probabilities = Bcdm(alg='sumprod',
                              ratefun=rate,
                              basisfunc=basis,
                              omega=omega,
                              sigma=sigma)

    bcdm_segments = Bcdm(alg='maxprod',
                         ratefun=rate,
                         basisfunc=basis,
                         omega=omega,
                         sigma=sigma)

    # Update the segment length hypotheses given the data.
    for x, y in zip(X, Y):
        y = np.array([y])

        def basis_t(xt):
            # Basis evaluated relative to the current input location. The
            # closure reads x when it is called, not when it is defined.
            return basis(xt - x)

        bcdm_probabilities.update(x, y, basisfunc=basis_t)
        bcdm_segments.update(x, y, basisfunc=basis_t)

    # Recover the hypothesis probabilities and back-trace to find the most
    # likely segmentation of the sequence.
    hypotheses_probability = bcdm_probabilities.infer()
    segments = bcdm_segments.infer()

    # Create two stacked subplots. The X-axes are not shared: the upper axes
    # span the input values while the lower axes span the observation count.
    fig, (upperaxes, loweraxes) = plt.subplots(2, sharex=False)

    # Plot the response data.
    for i in range(Y.shape[1]):
        upperaxes.plot(X, Y[:, i])

    # Plot the posterior probabilities over segment length hypotheses.
    plot_probability(loweraxes, hypotheses_probability, cmap=plt.cm.gray)

    # Plot the changes detected by the segmentation algorithm as alternating
    # coloured spans. Plot the true segment boundaries as vertical lines.
    plt.sca(upperaxes)
    plot_segment_span(X, segments, facecolor='y', alpha=0.2, edgecolor='none')
    plot_segment_boundaries(true_boundaries, color='k', linestyle=':')

    # On the lower axes the true boundaries are rescaled from input units to
    # observation counts.
    plt.sca(loweraxes)
    plot_segment_span(segments, facecolor='y', alpha=0.2, edgecolor='none')
    plot_segment_boundaries(samples * true_boundaries / x_max,
                            color='k', linestyle=':')

    upperaxes.set_xlim([0, x_max])
    loweraxes.set_xlim([0, len(X)])

    # The window title is set through the figure manager: the canvas-level
    # set_window_title() was deprecated in Matplotlib 3.4 and later removed.
    fig.canvas.manager.set_window_title('Triangular wave data')
    upperaxes.set_title('Triangular wave data')
    upperaxes.set_ylabel('Signal values')
    loweraxes.set_xlabel('Observation')
    loweraxes.set_ylabel('Hypothesis probability')
def well_data():
    """Simple example with nuclear response data collected during well drilling.

    Segment the well log data used in Fearnhead and Clifford (2003). This data
    consist of measurements of the nuclear magnetic response of underground
    rocks, collected during the drilling of a well bore. The data are composed
    of piecewise constant segments, each segment relating to a stratum with a
    single type of rock. The jump discontinuities between segments occur at the
    boundaries between rock strata.

    The data are read from 'data/well-data.txt' under the current working
    directory, one value per line; lines that do not convert to float are
    skipped. The function builds a two-panel matplotlib figure and returns
    nothing.

    P. Fearnhead and P. Clifford, "Online Inference for Hidden Markov Models
    via Particle Filters," Journal of the Royal Statistical Society: Series B
    (Statistical Methodology), Vol. 65, Issue 4, pp. 887-889, November 2003.
    """

    loc = 1.0e5
    scale = 1.0e4
    rate = 1.0e-2

    val = []

    # Store the absolute path to the file containing the data.
    abspath = path.realpath(path.join(os.getcwd(), 'data'))
    abspath = path.join(abspath, 'well-data.txt')

    # Read the data.
    with open(abspath, 'r') as fileobj:
        for line in fileobj:
            try:
                val.append(float(line))
            except ValueError:
                # Deliberately skip lines that are not numbers. Only
                # ValueError is caught so that unrelated failures (for
                # example KeyboardInterrupt) are not silently swallowed.
                continue

    # Format the data. The predictor is a constant 1 for every observation.
    X = np.ones([len(val), 1])
    Y = np.array(val).reshape([len(val), 1])

    # Wrap the scalar parameters as 1x1 arrays.
    loc = np.array([(loc, )])
    scale = np.array([(scale, )])

    kwargs = {'ratefun': rate,
              'mu': loc,
              'sigma': scale}

    # Compute the posterior probabilities over segment length hypotheses. Then,
    # find the most likely sequence segmentation.
    bcdm_probabilities = Bcdm(alg='sumprod', **kwargs)
    bcdm_segments = Bcdm(alg='maxprod', **kwargs)

    # Update the segment length hypotheses given the data.
    for x, y in zip(X, Y):
        bcdm_probabilities.update(x, y)
        bcdm_segments.update(x, y)

    # Recover the hypothesis probabilities and back-trace to find the most
    # likely segmentation of the sequence.
    hypotheses_probability = bcdm_probabilities.infer()
    segments = bcdm_segments.infer()

    # Create subplots with shared X-axis.
    fig, (upperaxes, loweraxes) = plt.subplots(2, sharex=True)
    fig.subplots_adjust(hspace=0)

    # Plot the response data against a 1-based measurement index.
    t = np.arange(1, len(val) + 1)
    upperaxes.plot(t, Y)

    # Plot the posterior probabilities over segment length hypotheses.
    plot_probability(loweraxes, hypotheses_probability, cmap=plt.cm.gray)

    # Plot the changes detected by the segmentation algorithm as alternating
    # coloured spans on both axes.
    for ax in (upperaxes, loweraxes):
        plt.sca(ax)
        plot_segment_span(t, segments, facecolor='y', alpha=0.2, edgecolor='none')
        ax.set_xlim([0, len(val)])

    # The window title is set through the figure manager: the canvas-level
    # set_window_title() was deprecated in Matplotlib 3.4 and later removed.
    fig.canvas.manager.set_window_title('Well log data')
    upperaxes.set_title('Well log data')
    upperaxes.set_ylabel('Nuclear magnetic response')
    loweraxes.set_xlabel('Measurement number')
    loweraxes.set_ylabel('Hypothesis probability')
def random_data():
    """Simple example with synthetic data.

    A random sequence of segments with predictor-response data is produced by
    gen_random_data(). Two Bcdm objects are fed every observation: one created
    with alg='sumprod', whose infer() result is drawn with plot_probability(),
    and one created with alg='maxprod', whose infer() result is drawn with
    plot_segment_span(). The segment boundaries returned by the generator are
    overlaid with plot_segment_boundaries(). The function builds a two-panel
    matplotlib figure and returns nothing.
    """

    # Set the size of the problem.
    numpred = 2
    numresp = 2
    numpoint = 200
    numseg = 5

    # Set parameters for generating the data.
    coeffparam = 0.5
    noiseparam = 5.0

    # Generate a sequence of segments and, for each segment, generate a set of
    # predictor-response data.
    segbound, X, Y = gen_random_data(numpred, numresp, numpoint, numseg,
                                     omega=coeffparam * np.eye(numpred),
                                     eta=noiseparam)

    # Hazard rate derived from the number of segments and points generated.
    rate = float(numseg) / float(numpoint - numseg)

    # Compute the posterior probabilities over segment length hypotheses. Then,
    # find the most likely segmentation of the sequence.
    bcdm_probabilities = Bcdm(alg='sumprod', ratefun=rate)
    bcdm_segments = Bcdm(alg='maxprod', ratefun=rate)

    # Update the segment length hypotheses given the data.
    for x, y in zip(X, Y):
        bcdm_probabilities.update(x, y)
        bcdm_segments.update(x, y)

    # Recover the hypothesis probabilities and back-trace to find the most
    # likely segmentation of the sequence.
    hypotheses_probability = bcdm_probabilities.infer()
    segments = bcdm_segments.infer()

    # Create subplots with shared X-axis.
    fig, (upperaxes, loweraxes) = plt.subplots(2, sharex=True)
    fig.subplots_adjust(hspace=0)

    # Plot each response column against a 1-based observation index.
    t = np.arange(1, numpoint + 1)
    for i in range(numresp):
        upperaxes.plot(t, Y[:, i])

    # Plot the posterior probabilities over segment length hypotheses.
    plot_probability(loweraxes, hypotheses_probability, cmap=plt.cm.gray)

    # Plot the changes detected by the segmentation algorithm as alternating
    # coloured spans. Plot the true segment boundaries as vertical lines.
    for ax in (upperaxes, loweraxes):
        plt.sca(ax)
        plot_segment_span(t, segments, facecolor='y', alpha=0.2, edgecolor='none')
        plot_segment_boundaries(t, segbound, color='k', linestyle=':')
        ax.set_xlim([0, numpoint])

    # The window title is set through the figure manager: the canvas-level
    # set_window_title() was deprecated in Matplotlib 3.4 and later removed.
    fig.canvas.manager.set_window_title('Randomly generated data')
    upperaxes.set_title('Randomly generated data')
    upperaxes.set_ylabel('Output values')
    loweraxes.set_xlabel('Observation')
    loweraxes.set_ylabel('Hypothesis probability')
Example #5
0
def index_data():
    """Simple example with equity index return data.

    Segment the daily rates of return of a pair of equity indices between April
    23rd, 1993 and July 14th, 2003. The indices are the Cotation Assistee en
    Continu (CAC) and the Deutscher Aktienindex (DAX). The rates of return are
    computed based on the daily closing price of each index.

    The data are read from 'data/equity-index-data.csv' under the current
    working directory. The header row supplies the column names; in the
    remaining rows, fields that convert to float are stored as values and
    every other field is parsed as a date. The function draws a two-panel
    matplotlib figure (responses above, hypothesis probabilities below) and
    returns nothing.
    """

    # Store the absolute path to the file containing the data.
    abspath = path.realpath(path.join(os.getcwd(), 'data'))
    abspath = path.join(abspath, 'equity-index-data.csv')

    time = []
    val = []

    # Read the data.
    with open(abspath, 'r') as fileobj:
        reader = csv.reader(fileobj, delimiter=',')

        # Consume the header with the built-in next(), which is available on
        # both Python 2 and Python 3 (reader.next() is Python 2 only).
        header = next(reader)
        name = [field.upper() for field in header if field != 'date']

        for row in reader:
            rec = []
            for field in row:
                try:
                    rec.append(float(field))
                except ValueError:
                    # Anything that is not a number is parsed as a date. The
                    # dates are collected but not used further below.
                    time.append(dp.parse(field))
            val.append(rec)

    # Format the data. The predictor is a constant 1 for every observation.
    X = np.ones([len(val), 1])
    Y = np.array(val).reshape([len(val), len(name)])

    # Select daily returns from CAC and DAX. An empty selection keeps every
    # column.
    ind = ['CAC', 'DAX']
    if ind:
        ind = [name.index(i) for i in ind]
        name = [name[i] for i in ind]
        Y = Y[:, ind]

    kwargs = {
        'ratefun': 1.0e-2,  # 1% expected hazard rate
        'mu': np.zeros([1, len(name)]),  # 0% expected rate of return
        'sigma': 1.0e-4 * np.eye(len(name)),  # 1% expected volatility
        'maxhypot': 50,
        'minprob': 1.0e-16
    }

    # Compute the posterior probabilities over segment length hypotheses. Then,
    # find the most likely sequence segmentation.
    bcdm_probabilities = Bcdm(alg='sumprod', **kwargs)
    bcdm_segments = Bcdm(alg='maxprod', **kwargs)

    # Update the segment length hypotheses given the data.
    for x, y in zip(X, Y):
        bcdm_probabilities.update(x, y)
        bcdm_segments.update(x, y)

    # Recover the hypothesis probabilities and back-trace to find the most
    # likely segmentation of the sequence.
    hypotheses_probability = bcdm_probabilities.infer()
    segments = bcdm_segments.infer()

    # Create subplots with shared X-axis.
    fig, (upperaxes, loweraxes) = plt.subplots(2, sharex=True)
    fig.subplots_adjust(hspace=0)

    # Plot the response data against a 1-based observation index.
    t = np.arange(1, len(val) + 1)
    upperaxes.plot(t, Y)

    # Plot the posterior probabilities over segment length hypotheses.
    plot_probability(loweraxes, hypotheses_probability, cmap=plt.cm.gray)

    # Plot the changes detected by the segmentation algorithm as alternating
    # coloured spans on both axes.
    for ax in (upperaxes, loweraxes):
        plt.sca(ax)
        plot_segment_span(t,
                          segments,
                          facecolor='y',
                          alpha=0.2,
                          edgecolor='none')
        ax.set_xlim([0, len(val)])

    # Go through the figure manager for the window title: the canvas-level
    # set_window_title() was deprecated in Matplotlib 3.4 and later removed.
    fig.canvas.manager.set_window_title('Equity index data')
    upperaxes.set_title('Equity index data')
    upperaxes.set_ylabel('Rate of return')
    loweraxes.set_xlabel('Trading day')
    loweraxes.set_ylabel('Hypothesis probability')

    upperaxes.legend(name, loc='upper left')
Example #6
0
def well_data():
    """Simple example with nuclear response data collected during well drilling.

    Segment the well log data used in Fearnhead and Clifford (2003). This data
    consist of measurements of the nuclear magnetic response of underground
    rocks, collected during the drilling of a well bore. The data are composed
    of piecewise constant segments, each segment relating to a stratum with a
    single type of rock. The jump discontinuities between segments occur at the
    boundaries between rock strata.

    Values are read one per line from 'data/well-data.txt' under the current
    working directory; lines that cannot be converted to float are ignored.
    The function draws a two-panel matplotlib figure (responses above,
    hypothesis probabilities below) and returns nothing.

    P. Fearnhead and P. Clifford, "Online Inference for Hidden Markov Models
    via Particle Filters," Journal of the Royal Statistical Society: Series B
    (Statistical Methodology), Vol. 65, Issue 4, pp. 887-889, November 2003.
    """

    loc = 1.0e5
    scale = 1.0e4
    rate = 1.0e-2

    val = []

    # Store the absolute path to the file containing the data.
    abspath = path.realpath(path.join(os.getcwd(), 'data'))
    abspath = path.join(abspath, 'well-data.txt')

    # Read the data.
    with open(abspath, 'r') as fileobj:
        for line in fileobj:
            try:
                val.append(float(line))
            except ValueError:
                # Non-numeric lines are skipped on purpose. Catching only
                # ValueError keeps unrelated errors visible.
                continue

    # Format the data. The predictor is a constant 1 for every observation.
    X = np.ones([len(val), 1])
    Y = np.array(val).reshape([len(val), 1])

    # Wrap the scalar parameters as 1x1 arrays.
    loc = np.array([(loc, )])
    scale = np.array([(scale, )])

    kwargs = {'ratefun': rate, 'mu': loc, 'sigma': scale}

    # Compute the posterior probabilities over segment length hypotheses. Then,
    # find the most likely sequence segmentation.
    bcdm_probabilities = Bcdm(alg='sumprod', **kwargs)
    bcdm_segments = Bcdm(alg='maxprod', **kwargs)

    # Update the segment length hypotheses given the data.
    for x, y in zip(X, Y):
        bcdm_probabilities.update(x, y)
        bcdm_segments.update(x, y)

    # Recover the hypothesis probabilities and back-trace to find the most
    # likely segmentation of the sequence.
    hypotheses_probability = bcdm_probabilities.infer()
    segments = bcdm_segments.infer()

    # Create subplots with shared X-axis.
    fig, (upperaxes, loweraxes) = plt.subplots(2, sharex=True)
    fig.subplots_adjust(hspace=0)

    # Plot the response data against a 1-based measurement index.
    t = np.arange(1, len(val) + 1)
    upperaxes.plot(t, Y)

    # Plot the posterior probabilities over segment length hypotheses.
    plot_probability(loweraxes, hypotheses_probability, cmap=plt.cm.gray)

    # Plot the changes detected by the segmentation algorithm as alternating
    # coloured spans on both axes.
    for ax in (upperaxes, loweraxes):
        plt.sca(ax)
        plot_segment_span(t,
                          segments,
                          facecolor='y',
                          alpha=0.2,
                          edgecolor='none')
        ax.set_xlim([0, len(val)])

    # Go through the figure manager for the window title: the canvas-level
    # set_window_title() was deprecated in Matplotlib 3.4 and later removed.
    fig.canvas.manager.set_window_title('Well log data')
    upperaxes.set_title('Well log data')
    upperaxes.set_ylabel('Nuclear magnetic response')
    loweraxes.set_xlabel('Measurement number')
    loweraxes.set_ylabel('Hypothesis probability')
Example #7
0
def non_sinusoidal():
    """Simple example with noisy square, triangle and sawtooth wave data.

    Square, triangle and sawtooth waves are sampled over three periods of
    2*pi, perturbed with Gaussian noise and used as three response columns.
    The observations are fed to two Bcdm objects that share a linear basis:
    one created with alg='sumprod' (its infer() result is drawn with
    plot_probability()) and one created with alg='maxprod' (its infer()
    result is drawn with plot_segment_span()). The known boundary locations
    are drawn with plot_segment_boundaries(). The function draws a two-panel
    matplotlib figure and returns nothing.
    """

    rate = 0.001
    omega = 1.0e-3 * np.eye(2)
    sigma = 1.0e-6 * np.eye(3)
    samples = 1000

    def basis(x):
        # Linear basis: a constant term followed by the input.
        return np.array([[1.0, x]])

    # Create the wave functions.
    def square_wave(x):
        return np.sign(np.sin(x))

    def sawtooth_wave(a, x):
        # Sawtooth with period a and values in [-1, 1).
        return 2 * ((x / a) - np.floor(0.5 + (x / a)))

    def triangle_wave(a, x):
        # Triangle wave obtained by folding the sawtooth.
        return 2 * np.abs(sawtooth_wave(a, x)) - 1

    # Create input and outputs.
    X = np.linspace(0, 3 * 2 * np.pi, samples).reshape(samples, 1)
    Y = np.hstack([
        square_wave(X),
        triangle_wave(2 * np.pi, X - np.pi / 2),
        sawtooth_wave(2 * np.pi, X + np.pi / 3)
    ])

    # Create Gaussian noise, with a different scale for each wave.
    Y += np.vstack([
        0.025 * np.random.randn(samples), 0.1 * np.random.randn(samples),
        0.05 * np.random.randn(samples)
    ]).T

    # Largest input value as a scalar. Calling the builtin max() on the 2-D
    # array X would yield a one-element array instead.
    x_max = X.max()

    # Determine location of true boundaries.
    true_boundaries = np.hstack(
        (np.pi * np.arange(0, 7), np.pi * np.arange(0, 6) + np.pi / 2,
         2 * np.pi * np.arange(0, 4) + np.pi - np.pi / 3))

    true_boundaries = np.sort(true_boundaries[true_boundaries <= x_max])

    # Compute the posterior probabilities over segment length hypotheses. Then,
    # find the most likely segmentation of the sequence.
    bcdm_probabilities = Bcdm(alg='sumprod',
                              ratefun=rate,
                              basisfunc=basis,
                              omega=omega,
                              sigma=sigma)

    bcdm_segments = Bcdm(alg='maxprod',
                         ratefun=rate,
                         basisfunc=basis,
                         omega=omega,
                         sigma=sigma)

    # Update the segment length hypotheses given the data.
    for x, y in zip(X, Y):
        y = np.array([y])

        def basis_t(xt):
            # Basis evaluated relative to the current input location. The
            # closure reads x when it is called, not when it is defined.
            return basis(xt - x)

        bcdm_probabilities.update(x, y, basisfunc=basis_t)
        bcdm_segments.update(x, y, basisfunc=basis_t)

    # Recover the hypothesis probabilities and back-trace to find the most
    # likely segmentation of the sequence.
    hypotheses_probability = bcdm_probabilities.infer()
    segments = bcdm_segments.infer()

    # Create two stacked subplots. The X-axes are not shared: the upper axes
    # span the input values while the lower axes span the observation count.
    fig, (upperaxes, loweraxes) = plt.subplots(2, sharex=False)

    # Plot the response data.
    for i in range(Y.shape[1]):
        upperaxes.plot(X, Y[:, i])

    # Plot the posterior probabilities over segment length hypotheses.
    plot_probability(loweraxes, hypotheses_probability, cmap=plt.cm.gray)

    # Plot the changes detected by the segmentation algorithm as alternating
    # coloured spans. Plot the true segment boundaries as vertical lines.
    plt.sca(upperaxes)
    plot_segment_span(X, segments, facecolor='y', alpha=0.2, edgecolor='none')
    plot_segment_boundaries(true_boundaries, color='k', linestyle=':')

    # On the lower axes the true boundaries are rescaled from input units to
    # observation counts.
    plt.sca(loweraxes)
    plot_segment_span(segments, facecolor='y', alpha=0.2, edgecolor='none')
    plot_segment_boundaries(samples * true_boundaries / x_max,
                            color='k',
                            linestyle=':')

    upperaxes.set_xlim([0, x_max])
    loweraxes.set_xlim([0, len(X)])

    # Go through the figure manager for the window title: the canvas-level
    # set_window_title() was deprecated in Matplotlib 3.4 and later removed.
    fig.canvas.manager.set_window_title('Triangular wave data')
    upperaxes.set_title('Triangular wave data')
    upperaxes.set_ylabel('Signal values')
    loweraxes.set_xlabel('Observation')
    loweraxes.set_ylabel('Hypothesis probability')
Example #8
0
def random_data():
    """Simple example with synthetic data.

    gen_random_data() supplies a random sequence of segments together with
    predictor-response data. The observations are fed to two Bcdm objects:
    one created with alg='sumprod' (its infer() result is drawn with
    plot_probability()) and one created with alg='maxprod' (its infer()
    result is drawn with plot_segment_span()). The segment boundaries
    returned by the generator are drawn with plot_segment_boundaries(). The
    function draws a two-panel matplotlib figure and returns nothing.
    """

    # Set the size of the problem.
    numpred = 2
    numresp = 2
    numpoint = 200
    numseg = 5

    # Set parameters for generating the data.
    coeffparam = 0.5
    noiseparam = 5.0

    # Generate a sequence of segments and, for each segment, generate a set of
    # predictor-response data.
    segbound, X, Y = gen_random_data(numpred,
                                     numresp,
                                     numpoint,
                                     numseg,
                                     omega=coeffparam * np.eye(numpred),
                                     eta=noiseparam)

    # Hazard rate derived from the number of segments and points generated.
    rate = float(numseg) / float(numpoint - numseg)

    # Compute the posterior probabilities over segment length hypotheses. Then,
    # find the most likely segmentation of the sequence.
    bcdm_probabilities = Bcdm(alg='sumprod', ratefun=rate)
    bcdm_segments = Bcdm(alg='maxprod', ratefun=rate)

    # Update the segment length hypotheses given the data.
    for x, y in zip(X, Y):
        bcdm_probabilities.update(x, y)
        bcdm_segments.update(x, y)

    # Recover the hypothesis probabilities and back-trace to find the most
    # likely segmentation of the sequence.
    hypotheses_probability = bcdm_probabilities.infer()
    segments = bcdm_segments.infer()

    # Create subplots with shared X-axis.
    fig, (upperaxes, loweraxes) = plt.subplots(2, sharex=True)
    fig.subplots_adjust(hspace=0)

    # Plot each response column against a 1-based observation index.
    t = np.arange(1, numpoint + 1)
    for i in range(numresp):
        upperaxes.plot(t, Y[:, i])

    # Plot the posterior probabilities over segment length hypotheses.
    plot_probability(loweraxes, hypotheses_probability, cmap=plt.cm.gray)

    # Plot the changes detected by the segmentation algorithm as alternating
    # coloured spans. Plot the true segment boundaries as vertical lines.
    for ax in (upperaxes, loweraxes):
        plt.sca(ax)
        plot_segment_span(t,
                          segments,
                          facecolor='y',
                          alpha=0.2,
                          edgecolor='none')
        plot_segment_boundaries(t, segbound, color='k', linestyle=':')
        ax.set_xlim([0, numpoint])

    # Go through the figure manager for the window title: the canvas-level
    # set_window_title() was deprecated in Matplotlib 3.4 and later removed.
    fig.canvas.manager.set_window_title('Randomly generated data')
    upperaxes.set_title('Randomly generated data')
    upperaxes.set_ylabel('Output values')
    loweraxes.set_xlabel('Observation')
    loweraxes.set_ylabel('Hypothesis probability')