Example #1
    def predict_with_edit_dist(self,
                               test_embeddings,
                               test_descriptions,
                               official_descriptions,
                               pbar=False):
        chunk_size = 2000
        ntest_docs = test_embeddings.shape[0]
        best_neighbors = np.empty(ntest_docs, dtype=int)
        for start_index in xrange(0, ntest_docs, chunk_size):
            if pbar: progress_bar(start_index, ntest_docs)
            stop_index = start_index + chunk_size
            if stop_index > ntest_docs:
                stop_index = ntest_docs

            NN_dists, NN_indices = self._classifier.kneighbors(
                test_embeddings[start_index:stop_index], return_distance=True)
            nNN = NN_indices.shape[1]
            for i1, i2 in enumerate(xrange(start_index, stop_index)):
                test_string = test_descriptions[i2]
                edit_dists = [
                    self.edit_distance(
                        test_string, official_descriptions[NN_indices[i1, j]])
                    for j in xrange(nNN)
                ]
                best_neighbors[i2] = NN_indices[i1, np.argmin(edit_dists)]

        predicted_codes = np.array([
            self._official_labels[best_neighbors[i]]
            for i in xrange(ntest_docs)
        ])
        return predicted_codes
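
A note on the helper used above: self.edit_distance is not shown in this example. A standard Levenshtein distance is one common choice; the sketch below only illustrates that choice and is not necessarily the author's implementation.

    def edit_distance(self, s1, s2):
        # Levenshtein-distance sketch: number of single-character insertions,
        # deletions and substitutions needed to turn s1 into s2.
        previous = range(len(s2) + 1)
        for i, c1 in enumerate(s1):
            current = [i + 1]
            for j, c2 in enumerate(s2):
                current.append(min(previous[j + 1] + 1,        # deletion
                                   current[j] + 1,             # insertion
                                   previous[j] + (c1 != c2)))  # substitution
            previous = current
        return previous[len(s2)]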
Example #2
def find_min():
    x0 = np.array((1, 4))
    sigma = 100
    npts = 50
    times = np.linspace(0, 1, npts)
    data = integrate(f_test, x0, sigma, times)

    # generate initial points uniformly distributed +/- "max_offset" around true value
    ntrials = 2
    max_offset = 0.5
    init_offsets = max_offset * np.random.uniform(
        low=-1, high=1, size=(ntrials, 3))
    true_val = np.hstack((x0, sigma))
    opt_vals = np.empty((ntrials, 3))
    errs = np.empty(ntrials)
    for i in range(ntrials):
        init_guess = true_val + init_offsets[i]
        out = opt.leastsq(lsq_error,
                          init_guess,
                          args=(times, data),
                          full_output=True)
        opt_vals[i] = out[0]
        # 'fvec' is the residual vector at the optimum; record the sum of squares
        errs[i] = np.sum(out[2]['fvec']**2)
        uf.progress_bar(i + 1, ntrials)
    np.savetxt('./data/optvals.csv', opt_vals, delimiter=',')
    np.savetxt('./data/errs.csv', errs, delimiter=',')
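
opt.leastsq calls lsq_error(init_guess, times, data) and expects a flat vector of residuals back. That helper (like f_test and integrate) is defined elsewhere in the module; purely as a sketch of the expected contract, with the parameter packing (x1_0, x2_0, sigma) inferred from true_val = np.hstack((x0, sigma)) above, it could look like:

def lsq_error(param_guess, times, data):
    # hypothetical sketch -- regenerate the trajectory from the guessed
    # initial condition and sigma, and return the flattened misfit
    x0_guess = param_guess[:2]
    sigma_guess = param_guess[2]
    model = integrate(f_test, x0_guess, sigma_guess, times)
    return (model - data).ravel()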
Example #3
    def predict_combined(self,
                         test_embeddings1,
                         test_embeddings2,
                         alpha=0.2,
                         pbar=False):
        chunk_size = 2000
        ntest_docs = test_embeddings1.shape[0]

        predicted_codes = np.empty((ntest_docs, self._nNN), dtype=int)
        prediction_weights = np.empty((ntest_docs, self._nNN))
        potential_full_codes = []
        for start_index in xrange(0, ntest_docs, chunk_size):
            if pbar: progress_bar(start_index, ntest_docs)
            stop_index = start_index + chunk_size
            if stop_index > ntest_docs:
                stop_index = ntest_docs

            NN_dists1, NN_indices1 = self._classifier1.kneighbors(
                test_embeddings1[start_index:stop_index], return_distance=True)
            NN_dists2, NN_indices2 = self._classifier2.kneighbors(
                test_embeddings2[start_index:stop_index], return_distance=True)
            probs1, class1 = self.get_assignment_probs(NN_dists1, NN_indices1)
            probs2, class2 = self.get_assignment_probs(NN_dists2, NN_indices2)
            (predicted_codes[start_index:stop_index],
             prediction_weights[start_index:stop_index]) = self.predict_from_weights(
                 probs1, probs2, class1, class2, alpha)

            potential_full_codes += self.get_recurring_indices(
                NN_indices1, NN_indices2)

        return [predicted_codes, prediction_weights, potential_full_codes]
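
Hypothetical usage, assuming the enclosing object holds two fitted k-NN classifiers over two different embeddings of the official descriptions (the variable names below are illustrative):

codes, weights, full_code_candidates = model.predict_combined(
    test_embeddings1, test_embeddings2, alpha=0.2, pbar=True)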
Example #4
def parse_csv():
    """Parses huge csv of exported Enigma dataset"""
    data_filebase = './data/ams-summary-2016'

    word_description_regex = re.compile('[a-zA-Z]+')

    description_index = 26  # index of column which contains description text of lading
    code_index = 27  # index of column which contains tariff code (probably missing)

    nrows_per_page = np.power(10, 6)
    lading_tariff_codes = []
    lading_descriptions = []

    powers_of_ten = np.power(10, range(10))

    with open(data_filebase + '.csv') as file:
        print file.readline()  # discard header
        # for i in range((current_page-1)*nrows_per_page):
        #     file.readline()
        csv_reader = csv.reader(file)
        for current_page in range(1, 20):
            print current_page
            for i in xrange(nrows_per_page):
                progress_bar(i, nrows_per_page)
                line = csv_reader.next()

                # find code, ensure it has ten digits, then add it to list as int (may remove leading zeros)
                # code = line[code_index]
                # if len(code) > 0:
                #     # throw out those few codes that have
                #     try:
                #         nmissing_digits = 10 - len(code)
                #         code = int(code)*powers_of_ten[nmissing_digits]
                #     except ValueError:
                #         code = 0
                #         continue
                # else:
                #     code = np.nan
                # lading_tariff_codes.append(code)

                # # find all words (defined as consecutive letters in any case) and append to list
                # description = line[description_index]
                # word_descriptions =  ' '.join(re.findall(word_description_regex, description))
                # lading_descriptions.append(word_descriptions)

                lading_tariff_codes.append(line[code_index])
                lading_descriptions.append(line[description_index])

            pickle.dump(
                lading_tariff_codes,
                open('./data/data-codes-2016-' + str(current_page) + '.pkl',
                     'w'))
            pickle.dump(
                lading_descriptions,
                open(
                    './data/data-descriptions-2016-' + str(current_page) +
                    '.pkl', 'w'))
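
The per-page pickles written above are consumed again in Example #6. A quick sanity check could reload one page (paths as written above, Python 2 pickle/text mode as in the original):

codes = pickle.load(open('./data/data-codes-2016-1.pkl', 'r'))
descriptions = pickle.load(open('./data/data-descriptions-2016-1.pkl', 'r'))
print len(codes), len(descriptions)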
Example #5
    def predict(self, test_embeddings, pbar=False):
        chunk_size = 2000
        ntest_docs = test_embeddings.shape[0]
        predicted_codes = np.empty(ntest_docs, dtype=int)
        for start_index in xrange(0, ntest_docs, chunk_size):
            if pbar: progress_bar(start_index, ntest_docs)
            stop_index = start_index + chunk_size
            if stop_index > ntest_docs:
                stop_index = ntest_docs
            predicted_codes[start_index:stop_index] = self._classifier.predict(
                test_embeddings[start_index:stop_index])
        return predicted_codes
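
For context, self._classifier is assumed to be a fitted scikit-learn k-nearest-neighbors classifier over the official description embeddings (Example #1 also calls its kneighbors method); the chunking above simply keeps the per-call distance computations small. A hypothetical setup inside the class's fit step:

# hypothetical -- names and n_neighbors are illustrative
from sklearn.neighbors import KNeighborsClassifier
self._classifier = KNeighborsClassifier(n_neighbors=10)
self._classifier.fit(official_embeddings, self._official_labels)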
Example #6
def separate_data():
    base_code_name = './data/data-codes-2016-'
    base_desc_name = './data/data-descriptions-2016-'
    for i in range(5, 10):
        progress_bar(i, 10)
        code_filename = base_code_name + str(i) + '.pkl'
        desc_filename = base_desc_name + str(i) + '.pkl'
        data_codes = np.array(pickle.load(open(code_filename, 'r')))
        ncodes = data_codes.shape[0]
        data_codes_array = np.empty(ncodes, dtype=int)
        powers_of_ten = np.power(10, range(10))
        for j in xrange(ncodes):

            code = data_codes[j]
            if len(code) > 0:
                try:
                    # pad short codes back out to ten digits
                    nmissing_digits = 10 - len(code)
                    code = int(code) * powers_of_ten[nmissing_digits]
                except ValueError:
                    # throw out the few codes that cannot be parsed as integers
                    code = -1
            else:
                code = -1
            data_codes_array[j] = code

        data_codes = data_codes_array
        print data_codes[:5]

        data_descriptions = [
            ' '.join(words) for words in pickle.load(open(desc_filename, 'r'))
        ]

        kept_indices = data_codes != -1
        data_descriptions = list(compress(data_descriptions, kept_indices))
        data_codes = data_codes[kept_indices]

        pickle.dump(data_codes,
                    open('./data/train/codes' + str(i) + '.pkl', 'w'))
        pickle.dump(data_descriptions,
                    open('./data/train/descriptions' + str(i) + '.pkl', 'w'))
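
A worked example of the padding arithmetic: a code exported as the 8-character string '12345678' gives nmissing_digits = 2, so two trailing zeros are appended to restore a ten-digit code:

>>> int('12345678') * np.power(10, 10 - len('12345678'))
1234567800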
Example #7
def mm_contour_grid():
    nks = 1000
    nvs = 1000
    Ks = 2*np.logspace(-1, 3, nks)
    Vs = np.logspace(-1, 3, nvs)
    if os.path.isfile('./of_evals.csv'):
        kept_pts = np.genfromtxt('./of_evals.csv', delimiter=',')
        count = kept_pts.shape[0]
    else:
        # set up base system
        params = OrderedDict((('K',2.0), ('V',1.0), ('St',2.0), ('epsilon',1e-3), ('kappa',10.0))) # from Antonios' writeup
        true_params = np.array(params.values())
        nparams = true_params.shape[0]
        transform_id = 't2'
        state_params = ['K']
        continuation_param = 'V'
        # set init concentrations
        S0 = params['St']; C0 = 0.0; P0 = 0.0 # init concentrations
        Cs0 = np.array((S0, C0, P0))
        # set times at which to collect data
        tscale = (params['St'] + params['K'])/params['V'] # timescale of slow evolution
        npts = 20
        times = tscale*np.linspace(1,npts,npts)/5.0
        # use these params, concentrations and times to define the MM system
        MM_system = MM.MM_System(Cs0, times, true_params, transform_id)
        print 'ofeval', MM_system.of(params.values())
        of_evals = np.empty((nks, nvs))
        test_params = true_params
        ndiscarded = 0
        kept_pts = np.empty((nks*nvs,3))
        tol = 0.1
        count = 0
        for i, K in enumerate(Ks):
            uf.progress_bar(i+1, nks)
            for j, V in enumerate(Vs):
                test_params[0] = K
                test_params[1] = V
                try:
                    # of_evals[i,j] = MM_system.of(test_params)
                    of_eval = MM_system.of(test_params)
                    if of_eval < tol:
                        kept_pts[count,0] = K
                        kept_pts[count,1] = V
                        kept_pts[count,2] = of_eval
                        count = count + 1
                except CustomErrors.EvalError:
                    ndiscarded = ndiscarded + 1
                    continue
        np.savetxt('./of_evals.csv', kept_pts, delimiter=',')
        print 'threw away', ndiscarded, 'pts'

    vgrid, kgrid = np.meshgrid(Vs, Ks)
    solarize('light')
    # plt.imshow(of_evals)
    # plt.show()
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_xscale('log')
    ax.set_yscale('log')
    plot = ax.scatter(kept_pts[:count,0], kept_pts[:count,1], c=kept_pts[:count,2], s=5, lw=0)
    ax.set_xlabel('K')
    ax.set_ylabel('V')
    plt.colorbar(plot)
    plt.show()
Example #8
def sing_pert_fig():
    """Generates plots showing how both eps and init conds can be sloppy in sing. pert. system (aiche 2015)"""
    params = np.array((2.0, 4.0, 1, 0.01)) # (a, b, lambda, epsilon)
    (a, b, lam, eps) = params
    # create system with given params
    z_system = ZM.Z_Model(params)

    # set up integration times
    t0 = 0.0
    tfinal = 1
    dt = 0.1
    times = np.arange(t0, tfinal, dt)
    ntimes = times.shape[0]

    # get true trajectory based on true initial conditions
    x0_true = np.array((1., 4.))
    x_true_traj = z_system.get_trajectory(x0_true, times)
    # plt.plot(times, )
    # plt.show()

    # plot showing increasing sloppiness in epsilon, singular pert param
    epsilons = np.logspace(-1, -5, 5)
    colors = ['k', 'g', 'r', 'b', 'c']
    for i, e in enumerate(epsilons):
        z_system.change_parameters(np.array((a,b,lam,e)))
        traj = z_system.get_trajectory(x0_true, times)
        print np.any(np.isnan(traj))
        plt.scatter(traj[:,0], traj[:,1], lw=0, label=r'$\epsilon$=%1.0e' % e, color=colors[i], s=60)
        # plt.plot(traj[:,0], traj[:,1], label=r'$\epsilon$=' + str(e), c=colors[i])
    plt.xlabel('x')
    plt.ylabel('y')
    plt.legend(loc=4)
    plt.show()

    # plot showing sloppiness in initial condition, so long as they lie on the same fast manifold
    # ensure we're in a singularly perturbed regime
    eps = 1e-3
    z_system.change_parameters(np.array((a,b,lam,eps)))
    # find multiple points on a single fast manifold
    x0 = np.array((1., 2.))
    traj = z_system.get_trajectory(x0, times)
    xts = traj[:4]
    # find corresponding x0, y0 that will be mapped to initial points on this fast manifold
    x0s = xts[:,0] - b*xts[:,1]*xts[:,1]
    y0s = -a*b*((np.power(2*xts[:,1]-1/(a*b), 2) - 1/np.power(a*b, 2))/4 + x0s/b)
    x0s = np.array((x0s, y0s)).T
    # increase epsilon to ensure initial points are on slow manifold by the time the second point is recorded
    eps = 1e-4
    z_system.change_parameters(np.array((a,b,lam,eps)))
    # decrease total time
    t0 = 0.0
    tfinal = 0.2
    dt = 0.001
    times = np.arange(t0, tfinal, dt)
    ntimes = times.shape[0]
    # x0s = np.array((1 + 1.0*np.linspace(1,3,5), np.linspace(1,3,5))).T # initial conditions to loop over
    for i, x0 in enumerate(x0s):
        traj = z_system.get_trajectory(x0, times)
        print np.any(np.isnan(traj))
        plt.scatter(traj[:,0], traj[:,1], label=r'$x_0$=%1.1f' % x0[1], color=colors[i], s=100)
        # plt.plot(traj[:,0], traj[:,1], label=r'$\epsilon$=' + str(e), c=colors[i])
    plt.xlabel('x')
    plt.ylabel('y')
    # plt.legend(loc=4)
    plt.show()

    # set up sampling grid and storage space for obj. fn. evals
    nsamples_per_axis = 100
    nsamples = nsamples_per_axis**2
    x10s, x20s = np.meshgrid(np.linspace(1, 2, nsamples_per_axis), np.linspace(1, 2, nsamples_per_axis))
    x10s.shape = (nsamples,)
    x20s.shape = (nsamples,)
    x0_samples = np.array((x10s, x20s)).T # all samples of initial conditions in two columns
    of_evals = np.empty(nsamples) # space for obj. fn. evals
    x0s = np.empty((nsamples, 2))

    # loop through different initial conditions and record obj. fn. value
    count = 0
    for i, x0 in enumerate(x0_samples):
        uf.progress_bar(i, nsamples) # optional progress bar
        x_sample_traj = z_system.get_trajectory(x0, times)
        temp_eval = get_of(x_sample_traj, x_true_traj)
        if not np.isnan(temp_eval):
            of_evals[count] = temp_eval
            x0s[count] = x0
            count = count + 1
        

    print count
    x0s = x0s[:count]
    of_evals = of_evals[:count]
    # plot grid of sampled points colored by obj. fn. value
    print np.any(np.isnan(of_evals)), np.any(np.isinf(of_evals))
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(x0s[:,0], x0s[:,1])
    plt.show()
Example #9
    def find_branch(self, x0, k0, ds, nsteps, progress_bar=False, **kwargs):
        """Continues along the branch of values for which :math:`f(x,k)=0` via pseudo-arclength continuation

        Args:
            x0 (array): the initial x vector
            k0 (float): the initial parameter value
            ds (float): the arclength step size
            nsteps (int): the total number of arclength steps to take
            progress_bar (bool): whether or not to show a progress bar as iterations proceed

        Returns:
            Numpy array of shape (m, n+1) with m <= nsteps + 1; each row contains the length-n value of x followed by the scalar parameter value of a point found on the branch (m may be smaller if continuation stops early)

        .. note::
            If :math:`f(x_0,k_0) \\neq 0`, this method automatically searches for an appropriate starting point via a Newton iteration at :math:`k=k_0`
        """

        # fig = plt.figure()
        # ax = fig.add_subplot(111)
        # ax.hold(True)

        # TODO: faster method than defining lambda fn?
        n = x0.shape[0]
        f_init = lambda x: self._f(x, k0)[:n]
        Df_init = lambda x: self._Df(x, k0)[:n, :n]
        # find initial point on branch
        newton_solver = Newton.Newton(f_init, Df_init)
        try:
            xstart = newton_solver.find_zero(x0, **kwargs)
        except CustomErrors.EvalError as e:
            print e.msg
            raise CustomErrors.PSAError(
                'Initial newton encountered an EvalError')
        except CustomErrors.ConvergenceError as e:
            raise CustomErrors.PSAError('Initial newton failed to converge')
        # append parameter value
        xstart = np.hstack((xstart, k0))
        # find initial slopes
        # pretend initial slopes are 0 in x, 1 in k
        tempslopes = np.zeros(n + 1)
        tempslopes[n] = 1
        # note that tempslopes is also rhs of initial slope calc (see Auto notes)
        try:
            xprime = spla.gmres(self._Df_arclength(xstart, tempslopes),
                                tempslopes)[0]
        except (CustomErrors.EvalError, CustomErrors.ConvergenceError):
            raise CustomErrors.PSAError('Initial slope not found')
        # normalize
        xprime_start = xprime / np.linalg.norm(xprime)
        # update newton to new functions
        newton_solver = Newton.Newton(self._f_arclength, self._Df_arclength)
        halfnsteps = nsteps / 2
        branch_pts = np.empty((2 * halfnsteps + 1, n + 1))
        branch_pts[-1] = np.copy(xstart)
        # take nsteps/2 forward and backward from the initial point
        # in case the inner loop over 'i' exits prematurely, keep track of how many were successfully obtained
        ncompleted_pts = 0

        # TESTING
        total_pts = 0
        # TESTING

        # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
        # The error handling in this double loop is as follows:
        # The inner loop might raise an EvalError from the 'find_zero' function
        # which will be caught in the outer loop. If this happens, we must adjust
        # 'ncompleted_pts' accordingly, to reflect how many iterations were successful
        # before the error. Then, we continue in the opposite direction (k==1). Again,
        # an error could be raised, in which case we return whatever was
        # found to that point. Otherwise, return the partial branch from (k==0)
        # and the full branch from (k==1).
        # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
        ds0 = ds
        max_ds_divisions = 4
        count = 0
        for k in range(2):
            # flip ds to continue in both directions
            ds0 = -ds0
            ds = ds0
            # move x back to "center" of branch
            x = np.copy(xstart)
            xprime = np.copy(xprime_start)
            count = 0
            ds_divisions = 0
            try:
                for i in range(halfnsteps):
                    if progress_bar:
                        uf.progress_bar(k * halfnsteps + i + 1, nsteps)
                    # initial guess for next point on branch
                    x0 = x + xprime * ds
                    # save previous value for arclength eqn
                    xprev = np.copy(x)
                    # update parameter values for f and Df in newton solver
                    newton_solver.change_parameters([xprev, xprime, ds],
                                                    [xprime])
                    try:
                        x = newton_solver.find_zero(x0, **kwargs)
                    except CustomErrors.EvalError as e:
                        print e.msg
                        raise
                    except CustomErrors.ConvergenceError:
                        # raise
                        # do some ad-hoc stepsize reduction
                        if ds_divisions > max_ds_divisions:
                            raise
                        else:
                            x = xprev
                            ds = ds / 10.0
                            ds_divisions = ds_divisions + 1
                            continue
                    else:

                        # ax.plot([branch_pts[k*ncompleted_pts + count - 1][0], x0[0]], [branch_pts[k*ncompleted_pts + count - 1][1], x0[1]], color='r')

                        # use finite diff approx for xprime
                        xprime = (x - xprev) / ds
                        # normalize
                        xprime = xprime / np.linalg.norm(xprime)
                        branch_pts[total_pts] = np.copy(x)
                        # branch_pts[total_pts] = np.copy(x)
                        count = count + 1
                        total_pts = total_pts + 1
            except (CustomErrors.EvalError, CustomErrors.ConvergenceError):
                # continue from ncompleted_pts
                if k == 0:
                    ncompleted_pts = count
                    continue
                # k == 1, copy initial point from end of 'branch' and return whatever was successfully found
                else:
                    branch_pts[:ncompleted_pts] = branch_pts[ncompleted_pts -
                                                             1::-1]
                    branch_pts[ncompleted_pts + 1:ncompleted_pts + count +
                               1] = branch_pts[ncompleted_pts:ncompleted_pts +
                                               count]
                    branch_pts[ncompleted_pts] = np.copy(xstart)
                    return branch_pts[:ncompleted_pts + count + 1]
            else:
                if k == 0:
                    ncompleted_pts = count

        # could have encountered error when k==0, no error when k==1: adjust accordingly
        branch_pts[:ncompleted_pts] = branch_pts[ncompleted_pts - 1::-1]
        branch_pts[ncompleted_pts + 1:ncompleted_pts + count +
                   1] = branch_pts[ncompleted_pts:ncompleted_pts + count]
        branch_pts[ncompleted_pts] = np.copy(xstart)
        return branch_pts[:ncompleted_pts + count + 1]  #[:total_pts + 1]
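
Hypothetical usage sketch. Only the find_branch signature is taken from the docstring above; the class name and constructor arguments below are assumptions, since the rest of the class is not shown. The scalar system f(x, k) = x**2 + k - 1 has a fold at (x, k) = (0, 1), which pseudo-arclength continuation can trace through while naive parameter stepping cannot.

# hypothetical -- constructor name and arguments are illustrative
f = lambda x, k: np.array([x[0]**2 + k - 1.0])
Df = lambda x, k: np.array([[2.0*x[0]]])
continuer = PSA(f, Df)
branch = continuer.find_branch(np.array([1.0]), k0=0.0, ds=0.05, nsteps=200,
                               progress_bar=True)
# each row of 'branch' is (x, k); the curve k = 1 - x**2 is traced through the fold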