import numpy as np
import numpy.linalg as la
import scipy.optimize as opt
from math import factorial
from scipy.io import loadmat
from scipy.misc import logsumexp

# permute, unlog, fast_average, log_weighted_sample, and theta_viz are
# project-specific helpers that this listing does not define; sketches of the
# behavior the code assumes appear later in this file.

def inference(params):
    # Read data from file
    input_data = loadmat(params['input_file'])
    x_sparse = np.asarray(input_data[params['data_field']], dtype=np.uint32)
    if 'theta' in input_data:
        theta_true = input_data['theta']
    else:
        theta_true = None
    params['N'], params['T'] = np.max(x_sparse[:,1]), np.max(x_sparse[:,0])
    params['T'] = min(params['T'], params['max_T'])
    params['N'] = min(params['N'], params['max_N'])
    x_sparse -= 1  # convert 1-based (t, i) event pairs to 0-based indices
    theta_dim = (params['N'],params['N'],params['L'])

    # Push sparse data into a dictionary
    print 'Preprocessing sparse data'
    x_dict = {}
    for t in range(params['T']):
        x_dict[t] = []
    for t, i in x_sparse:
        if not (t < params['T'] and i < params['N']): continue
        x_dict[t].append(i)
    for t in x_dict:
        x_dict[t] = tuple(sorted(x_dict[t]))

    # Define function for building window as needed
    def make_window(cols):
        w = np.zeros((params['N'], len(cols)), dtype=np.bool)
        for o, col in enumerate(cols):
            for i in col:
                w[i, o] = 1
        return w
   
    # Generate S (calling it "windows" in code)
    print 'Counting window permutations'
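    # n_perms() counts the distinct orderings of a window's columns: with
    # column multiplicities c_1, ..., c_m summing to n, the count is the
    # multinomial coefficient n! / (c_1! * ... * c_m!)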
    def n_perms(cols):
        n = 0
        denom = 1
        for col in cols:
            n += cols[col]
            denom *= factorial(cols[col])
        return factorial(n) / denom
    windows, n_w, l_w = [], [], []
    t_start = 0
    while t_start < params['T']:
        cols_seen = {}
        t_end = t_start
        while t_end < params['T']:
            new_col = x_dict[t_end]
            t_end += 1
            if new_col not in cols_seen:
                cols_seen[new_col] = 0
            cols_seen[new_col] += 1
            if t_end - t_start <= params['L']:
                n_perm = n_perms(cols_seen)
                continue
            n_perm_new = n_perms(cols_seen)
            if n_perm_new > params['perm_max']:
                t_end -= 1
                break
            n_perm = n_perm_new
        windows.append((t_start, t_end))
        n_w.append(n_perm)
        l_w.append(t_end - t_start)
        t_start = t_end
    params['M'] = len(n_w)

    # Initialize theta
    theta_init = np.zeros(theta_dim)

    # Precompute statistics
    print 'Precomputing statistics'
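    # hits[k][..., i, j, l] counts how often unit i is active exactly l+1
    # steps before unit j, for events of j inside window k (reaching back into
    # window k-1 when k >= 1); theta enters the model only through these counts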
    hits = [np.empty((n_w[0],)+theta_dim)]
    hits_observed = np.zeros(theta_dim)
    # uint32 rather than bool: tensordot must accumulate counts, not OR bits
    s_padded = np.zeros((params['N'],params['L']+l_w[0]), dtype=np.uint32)
    w_start, w_end = windows[0]
    window = [x_dict[t] for t in range(w_start, w_end)]
    for w, z in enumerate(permute(window)):
        s = make_window(z)
        s_padded[:,params['L']:(params['L']+l_w[0])] = s
        for l in range(params['L']):
            tmin, tmax = params['L']-(l+1), (params['L']+l_w[0])-(l+1)
            s_lagged = s_padded[:,tmin:tmax]
            hit = np.tensordot(s_lagged, s, axes = (1,1))
            hits[0][w,:,:,l] = hit
        if z == window:
            hits_observed += hits[0][w]
    for k in range(1, params['M']):
        hits.append(np.empty((n_w[k-1],n_w[k])+theta_dim))
        s_padded = np.empty((params['N'],l_w[k-1]+l_w[k]), dtype=np.uint32)
        w_prev_start, w_prev_end = windows[k-1]
        w_start, w_end = windows[k]
        window_prev = [x_dict[t] for t in range(w_prev_start, w_prev_end)]
        window = [x_dict[t] for t in range(w_start, w_end)]
        for w_prev, z_prev in enumerate(permute(window_prev)):
            s_prev = make_window(z_prev)
            s_padded[:,0:l_w[k-1]] = s_prev
            for w, z in enumerate(permute(window)):
                s = make_window(z)
                s_padded[:,l_w[k-1]:(l_w[k-1]+l_w[k])] = s
                for l in range(params['L']):
                    tmin, tmax = l_w[k-1]-(l+1), (l_w[k-1]+l_w[k])-(l+1)
                    s_lagged = s_padded[:,tmin:tmax]
                    hit = np.tensordot(s_lagged, s, axes = (1,1))
                    hits[k][w_prev,w,:,:,l] = hit
                if z_prev == window_prev and z == window:
                    hits_observed += hits[k][w_prev,w]
    del x_dict

    # Common DP code used for likelihood and gradient calculations
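    # dp() builds the chain's log-potentials and backward messages:
    #   h[0][w]     = <theta, hits[0][w]> for each ordering w of window 0
    #   h[k][w', w] = <theta, hits[k][w', w]> for each transition w' -> w
    #   b[k][w']    = log-sum-exp over all continuations after window k-1,
    # so h[0] + b[1] gives unnormalized log-marginals over window 0's orderings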
    def dp(theta):
        h = [None] * params['M']
        h[0] = np.empty(n_w[0])
        for w in range(n_w[0]):
            h[0][w] = np.sum(theta * hits[0][w])
        for k in range(1, params['M']):
            h[k] = np.empty((n_w[k-1], n_w[k]))
            for w_prev in range(n_w[k-1]):
                for w in range(n_w[k]):
                    h[k][w_prev,w] = np.sum(theta * hits[k][w_prev,w])

        b = [None] * (params['M']+1)
        b[params['M']] = np.zeros(n_w[params['M']-1])
        for k in range(params['M']-1, 0, -1):
            b[k] = np.empty(n_w[k-1])
            for w_prev in range(n_w[k-1]):
                b[k][w_prev] = logsumexp(h[k][w_prev] + b[k+1])

        return h, b

    # Define objective function, in this case, the negative log-likelihood
    def neg_log_likelihood(theta_vec):
        theta = np.reshape(theta_vec, theta_dim)

        h, b = dp(theta)
        
        log_kappa = logsumexp(h[0] + b[1])

        # the code assumes permute() yields the observed ordering first, so
        # index 0 selects the observed path through the chain
        nll = log_kappa
        nll -= h[0][0]
        for k in range(1, params['M']):
            nll -= h[k][0,0]
        nll += params['lambda'] * np.sum(np.abs(theta))
        return nll
    
    # Define gradient of the objective function
    def grad_neg_log_likelihood(theta_vec):
        theta = np.reshape(theta_vec, theta_dim)

        h, b = dp(theta)

        # Compute expected statistics
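        # Forward pass: w_prob holds the marginal distribution over the
        # current window's orderings; hits_expected accumulates E[hits]
        # under the model, to be compared against hits_observed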
        w_prob = unlog(h[0] + b[1])
        hits_expected = fast_average(hits[0], w_prob)
        for k in range(1, params['M']):
            w_prob_new = np.zeros(n_w[k])
            for w_prev in range(n_w[k-1]):
                w_weight = unlog(h[k][w_prev,:] + b[k+1])
                w_prob_new += w_weight * w_prob[w_prev]
                hits_expected += (w_prob[w_prev] *
                                  fast_average(hits[k][w_prev], w_weight))
            w_prob = w_prob_new

        # Adjust gradient for L1 regularization
        reg = params['lambda'] * np.sign(theta)
        
        return np.reshape(hits_expected - hits_observed + reg, theta_vec.shape)

    # Callback for displaying state during optimization
    def show_theta(theta_vec):
        theta = np.reshape(theta_vec, (params['N'], params['N'], params['L']))
        if theta_true is None:
            print np.round(theta, decimals = 2)
        else:
            diff = np.reshape(theta - theta_true, theta_vec.shape)
            print np.sqrt(np.dot(diff, diff))
        if params['intermediate_viz']:
            theta_viz(theta)

    # Do optimization
    print 'Starting optimization'
    theta_opt = opt.fmin_bfgs(f = neg_log_likelihood,
                              fprime = grad_neg_log_likelihood,
                              x0 = np.ravel(theta_init),
                              callback = show_theta,
                              **(params['opt_params']))

    # Output
    print 'x'
    print x_sparse
    print

    print 'Parameters'
    for param in params:
        print '%s: %s' % (param, str(params[param]))
    print

    print 'Inferred theta'
    print np.reshape(theta_opt, (params['N'], params['N'], params['L']))
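
# The helpers below are not part of the original listing; they are minimal
# sketches reconstructed from how the inference code calls them. Treat the
# implementations (and their efficiency) as assumptions, not the originals.

def permute(cols):
    # Yield every distinct ordering of a window's columns exactly once,
    # starting with the observed ordering itself (the likelihood code relies
    # on index 0 being the observed path). The dedup makes the number of
    # yielded orderings match n_perms(). Exponential in window length; a
    # real implementation would enumerate multiset permutations directly.
    from itertools import permutations
    seen = set()
    for z in permutations(cols):
        if z in seen:
            continue
        seen.add(z)
        yield list(z)

def unlog(log_w):
    # Exponentiate log-weights into a normalized probability vector,
    # shifting by the max for numerical stability.
    w = np.exp(log_w - np.max(log_w))
    return w / np.sum(w)

def fast_average(a, weights):
    # Weighted average over the first axis of a, i.e. sum_w weights[w] * a[w],
    # computed as a single tensordot.
    return np.tensordot(weights, a, axes = (0,0))


# Second variant of inference(): fixed-length windows of size Delta, a sparse
# theta grown one component at a time, and a sampling-based stopping rule.
# (When both definitions share a module, this one shadows the one above.)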
def inference(params):
    # Read data from file
    input_data = loadmat(params['input_file'])
    x_sparse = np.asarray(input_data[params['data_field']], dtype=np.uint32)
    labels, theta_true = None, None
    if params['label_field'] in input_data:
        labels = input_data[params['label_field']][:,0]
    if params['theta_field'] in input_data:
        theta_true = input_data[params['theta_field']]
    params['N'], params['T'] = np.max(x_sparse[:,1]), np.max(x_sparse[:,0])
    params['T'] = min(params['T'], params['max_T'])
    params['N'] = min(params['N'], params['max_N'])
    params['M'] = int(np.ceil(1.0 * params['T'] / params['Delta']))
    # round T up to a whole number of Delta-length windows; the trailing
    # padding columns beyond the data are empty
    params['T'] = params['Delta'] * params['M']
    x_sparse -= 1  # convert 1-based (t, i) event pairs to 0-based indices
    theta_dim = (params['N'],params['N'],params['L'])

    # Push sparse data into a dictionary
    print 'Preprocessing sparse data'
    x_dict = {}
    for t in range(params['T']):
        x_dict[t] = []
    for t, i in x_sparse:
        if not (t < params['T'] and i < params['N']): continue
        x_dict[t].append(i)
    for t in x_dict:
        x_dict[t] = tuple(sorted(x_dict[t]))

    # Define function for building window as needed
    def make_window(cols):
        w = np.zeros((params['N'], len(cols)), dtype=np.bool)
        for o, col in enumerate(cols):
            for i in col:
                w[i, o] = 1
        return w
   
    # Generate S (calling it "windows" in code)
    print 'Counting window permutations'
    n_perms_memo = {}
    fact_memo = {}
    def fact(n):
        if n in fact_memo:
            return fact_memo[n]
        else:
            val = factorial(n)
            fact_memo[n] = val
            return val
    def n_perms(cols):
        n = 0
        denoms = []
        for col in cols:
            n += cols[col]
            denoms.append(cols[col])
        key = (n, tuple(sorted(denoms)))
        if key in n_perms_memo:
            return n_perms_memo[key]
        else:
            val = fact(n)
            for denom in denoms:
                val /= fact(denom)
            n_perms_memo[key] = val
            return val
    windows, n_w, l_w = [], [], []
    for k in range(params['M']):
        t_start = k * params['Delta']
        t_end = min(params['T'], (k+1) * params['Delta'])
        cols_seen = {}
        for t in range(t_start, t_end):
            new_col = x_dict[t]
            if new_col not in cols_seen:
                cols_seen[new_col] = 0
            cols_seen[new_col] += 1
            n_perm = n_perms(cols_seen)
        windows.append((t_start, t_end))
        n_w.append(n_perm)
        l_w.append(t_end - t_start)

    # Initialize theta (using sparse representation)
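    # theta is stored as {(i, j, l): value}, holding only the components
    # admitted so far; theta_dense() expands it to the full array on demand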
    theta = {}
    def theta_dense(theta_sparse):
        theta = np.zeros(theta_dim)
        for ind in theta_sparse:
            theta[ind] = theta_sparse[ind]
        return theta
    def arrays_from_theta(theta_sparse):
        inds = []
        theta = []
        for ind in theta_sparse:
            inds.append(ind)
            theta.append(theta_sparse[ind])
        return inds, np.array(theta)
    def theta_from_arrays(inds, vec):
        theta = {}
        for ind, v in zip(inds, vec):
            theta[ind] = v
        return theta

    # Precompute statistics
    print 'Precomputing statistics'
    hits_pre = [np.empty((n_w[0],)+theta_dim)]
    hits_observed = np.zeros(theta_dim)
    s_padded = np.zeros((params['N'],params['L']+l_w[0]), dtype=np.uint32)
    w_start, w_end = windows[0]
    window = [x_dict[t] for t in range(w_start, w_end)]
    for w, z in enumerate(permute(window)):
        s = make_window(z)
        s_padded[:,params['L']:(params['L']+l_w[0])] = s
        for l in range(params['L']):
            tmin, tmax = params['L']-(l+1), (params['L']+l_w[0])-(l+1)
            s_lagged = s_padded[:,tmin:tmax]
            hit = np.tensordot(s_lagged, s, axes = (1,1))
            hits_pre[0][w,:,:,l] = hit
        if z == window:
            hits_observed += hits_pre[0][w]
    for k in range(1, params['M']):
        hits_pre.append(np.empty((n_w[k-1],n_w[k])+theta_dim))
        s_padded = np.empty((params['N'],l_w[k-1]+l_w[k]), dtype=np.uint32)
        w_prev_start, w_prev_end = windows[k-1]
        w_start, w_end = windows[k]
        window_prev = [x_dict[t] for t in range(w_prev_start, w_prev_end)]
        window = [x_dict[t] for t in range(w_start, w_end)]
        for w_prev, z_prev in enumerate(permute(window_prev)):
            s_prev = make_window(z_prev)
            s_padded[:,0:l_w[k-1]] = s_prev
            for w, z in enumerate(permute(window)):
                s = make_window(z)
                s_padded[:,l_w[k-1]:(l_w[k-1]+l_w[k])] = s
                for l in range(params['L']):
                    tmin, tmax = l_w[k-1]-(l+1), (l_w[k-1]+l_w[k])-(l+1)
                    s_lagged = s_padded[:,tmin:tmax]
                    hit = np.tensordot(s_lagged, s, axes = (1,1))
                    hits_pre[k][w_prev,w,:,:,l] = hit
                if z_prev == window_prev and z == window:
                    hits_observed += hits_pre[k][w_prev,w]
    del x_dict

    # Common DP code used for likelihood and gradient calculations
    def dp(theta_sparse):
        theta = theta_dense(theta_sparse)

        h = [None] * params['M']
        h[0] = np.empty(n_w[0])
        for w in range(n_w[0]):
            h[0][w] = np.sum(theta * hits_pre[0][w])
        for k in range(1, params['M']):
            h[k] = np.empty((n_w[k-1], n_w[k]))
            for w_prev in range(n_w[k-1]):
                for w in range(n_w[k]):
                    h[k][w_prev,w] = np.sum(theta * hits_pre[k][w_prev,w])

        b = [None] * (params['M']+1)
        b[params['M']] = np.zeros(n_w[params['M']-1])
        for k in range(params['M']-1, 0, -1):
            b[k] = np.empty(n_w[k-1])
            for w_prev in range(n_w[k-1]):
                b[k][w_prev] = logsumexp(h[k][w_prev] + b[k+1])

        return h, b

    # Define objective function, in this case, the negative log-likelihood
    def neg_log_likelihood(theta_sparse, hb = None):
        if hb is not None:
            h, b = hb
        else:
            h, b = dp(theta_sparse)
        
        log_kappa = logsumexp(h[0] + b[1])

        nll = log_kappa
        nll -= h[0][0]
        for k in range(1, params['M']):
            nll -= h[k][0,0]
        for ind in theta_sparse:
            nll += params['lambda'] * np.abs(theta_sparse[ind])
        return nll

    # Compute expected statistics
    def expected_statistics(h, b):
        w_prob = unlog(h[0] + b[1])
        hits_expected = fast_average(hits_pre[0], w_prob)
        for k in range(1, params['M']):
            w_prob_new = np.zeros(n_w[k])
            for w_prev in range(n_w[k-1]):
                w_weight = unlog(h[k][w_prev,:] + b[k+1])
                w_prob_new += w_weight * w_prob[w_prev]
                hits_expected += (w_prob[w_prev] *
                                  fast_average(hits_pre[k][w_prev], w_weight))
            w_prob = w_prob_new
        return hits_expected
            
    # Define gradient of the objective function
    def grad_neg_log_likelihood(theta_sparse, hb = None):
        if hb is not None:
            h, b = hb
        else:
            h, b = dp(theta_sparse)
        
        hits_expected = expected_statistics(h, b)
        grad_full = hits_expected - hits_observed
        
        grad_sparse = {}
        for ind in theta_sparse:
            grad_sparse[ind] = grad_full[ind]

        # Adjust gradient for L1 regularization
        for ind in theta_sparse:
            grad_sparse[ind] += params['lambda'] * np.sign(theta_sparse[ind])
        
        return grad_sparse

    # Do optimization
    print 'Starting stepwise optimization'
    h, b = dp(theta)
    nll = neg_log_likelihood(theta, (h, b))
    print 'Initial negative log-likelihood: %.2f' % nll
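    # Stepwise scheme: sample statistics at the current fit; stop once the
    # observed statistics are no longer extreme relative to the samples;
    # otherwise admit the component with the largest |z-score|, push it in a
    # line search, then refit all active components by gradient descent.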
    while True:
        # Assess model at current theta
        if params['intermediate_viz']:
            theta_viz(theta_dense(theta), labels = labels)

        # Sample num_samples window trajectories at the current theta by
        # ancestral sampling (h plus the backward messages b), accumulating
        # each trajectory's sufficient statistics
        hits_sample = np.zeros((params['num_samples'],)+theta_dim)
        w_samps = log_weighted_sample(h[0] + b[1], params['num_samples'])
        for rep in range(params['num_samples']):
            hits_sample[rep] += hits_pre[0][w_samps[rep]]
        for k in range(1, params['M']):
            w_samps_next = []
            w_samps_next_uniques = []
            w_samps_uniques, inds = np.unique(w_samps, return_inverse=True)
            for i, w in enumerate(w_samps_uniques):
                n = int(np.sum(inds == i))
                w_samps_next_uniques.append(log_weighted_sample(h[k][w]+b[k+1],n))
            for rep in range(params['num_samples']):
                w_samps_next.append(w_samps_next_uniques[inds[rep]].pop())
                hits_sample[rep] += hits_pre[k][w_samps[rep]][w_samps_next[rep]]
            w_samps = w_samps_next

        # Global goodness-of-fit: score is the fraction of sampled statistics
        # lying closer to their expectation than the observed statistics do
        # (a posterior-predictive-style check)
        hits_expected = expected_statistics(h, b)
        hits_norms = np.array([la.norm(hits - hits_expected)
                                for hits in hits_sample])
        ext = np.where(la.norm(hits_observed - hits_expected) > hits_norms)[0]
        score = 1.0 * len(ext) / params['num_samples']
        print 'Global score: %.2f' % score
        if score < params['stopping_global']:
            print 'Global goodness-of-fit criterion achieved'
            break

        # Find component with largest z-score
        hits_sd = np.sqrt(np.mean((hits_sample-hits_expected)**2, axis=0))
        z_scores = (hits_observed - hits_expected) / hits_sd
        z_scores[hits_sd == 0] = 0
        for ind in theta:
            z_scores[ind] = 0
        argmax_z = np.unravel_index(np.argmax(np.abs(z_scores)), theta_dim)
        if abs(z_scores[argmax_z]) < params['stopping_z']:
            print 'Largest z-score below stopping threshold'
            break
        print 'New component: %s (z = %.2f)' % (str(argmax_z), z_scores[argmax_z])
        theta[argmax_z] = 0.0

        # Make big steps in direction of new theta component
        grad = grad_neg_log_likelihood(theta, (h, b))
        dir_new = -np.sign(grad[argmax_z])
        while True:
            print 'Making big step'
            old_nll = nll
            theta[argmax_z] += dir_new * params['step_size']
            h, b = dp(theta)
            nll = neg_log_likelihood(theta, (h, b))
            print 'Negative log-likelihood: %.2f' % nll
            if nll > old_nll or abs(nll - old_nll) < params['opt_tol']: break

        # Refit theta with new non-zero component
        print 'Optimization by gradient descent'

        while True:
            old_nll = nll
            grad = grad_neg_log_likelihood(theta, (h, b))
            grad_norm = max(la.norm(np.array(grad.values())), 1.0)
            for ind in grad:
                theta[ind] -= (params['step_size'] / grad_norm) * grad[ind]
            h, b = dp(theta)
            nll = neg_log_likelihood(theta, (h, b))
            print 'Negative log-likelihood: %.2f' % nll
            if nll > old_nll: break

    # Output
    print 'x'
    print x_sparse
    print

    print 'Parameters'
    for param in params:
        print '%s: %s' % (param, str(params[param]))
    print

    print 'Inferred theta'
    print theta_dense(theta)
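
# As above, hedged sketches of helpers the listing assumes but does not
# define; the real implementations may differ.

def log_weighted_sample(log_w, n):
    # Draw n indices i.i.d. with probability proportional to exp(log_w);
    # returned as a list because callers pop() elements from it.
    p = unlog(log_w)
    return list(np.random.choice(len(p), size = n, p = p))

def theta_viz(theta, labels = None):
    # Placeholder visualization: one heatmap per lag of theta; labels (node
    # names) are accepted but unused in this sketch.
    import matplotlib.pyplot as plt
    L = theta.shape[2]
    fig, axes = plt.subplots(1, L)
    for l in range(L):
        ax = axes[l] if L > 1 else axes
        ax.imshow(theta[:,:,l], interpolation = 'nearest')
        ax.set_title('lag %d' % (l + 1))
    plt.show()

if __name__ == '__main__':
    # Hypothetical driver; every field name and value here is illustrative
    # only, chosen to exercise the stepwise variant above.
    inference({'input_file': 'data.mat', 'data_field': 'x',
               'label_field': 'labels', 'theta_field': 'theta',
               'max_T': 100, 'max_N': 10, 'L': 2, 'Delta': 4,
               'lambda': 0.1, 'num_samples': 50,
               'stopping_global': 0.9, 'stopping_z': 2.0,
               'step_size': 0.5, 'opt_tol': 1e-4,
               'intermediate_viz': False})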