Beispiel #1
0
def test_multiple_measurements(par):
    """Check weighted and unweighted MMV recovery of a row-sparse matrix.

    Multiple measurement vector (MMV) problem:

        minimize ||Y||_1,2 subject to AW^{-1}Y = B

    A random ``m``-by-``n`` encoding matrix ``A`` and an ``n``-by-6 matrix
    ``X`` with ``k`` non-zero rows are drawn. ``B = A W X`` is then inverted
    twice: once with ``A W`` as the operator and once with ``A`` plus the
    ``weights`` argument of ``spg_mmv``. Both answers must match ``X`` to two
    decimals.

    Parameters
    ----------
    par : dict
        Problem sizes; the keys ``'m'``, ``'n'`` and ``'k'`` are read.
    """
    # Seed the generator so that a failing run can be reproduced
    np.random.seed(0)

    # Create random m-by-n encoding matrix
    m = par['m']
    n = par['n']
    k = par['k']
    ncols = 6  # number of measurement vectors (columns of X and B)

    A = np.random.randn(m, n)
    p = np.random.permutation(n)[:k]
    X = np.zeros((n, ncols))
    X[p, :] = np.random.randn(k, ncols)

    weights = 0.1 * np.random.rand(n) + 0.1
    # Diagonal matrix holding the reciprocal weights
    W = 1 / weights * np.eye(n)

    B = A.dot(W).dot(X)

    # Solve unweighted version
    X_uw, _, _, _ = spg_mmv(A.dot(W), B, 0, verbosity=0)

    # Solve weighted version
    X_w, _, _, _ = spg_mmv(A, B, 0, weights=weights, verbosity=0)
    # Multiply each row by its weight so the result is comparable with X
    X_w = spdiags(weights, 0, n, n).dot(X_w)

    assert_array_almost_equal(X, X_uw, decimal=2)
    assert_array_almost_equal(X, X_w, decimal=2)
Beispiel #2
0
def test_multiple_measurements_nonnegative(par):
    """Check the MMV solver when the ``norm_l12nn_*`` functions are plugged in.

    A random ``m``-by-``n`` matrix ``A`` and a non-negative ``n``-by-6 matrix
    ``X`` with ``k`` non-zero rows are created with dtype ``par['dtype']``.
    The solution returned by ``spg_mmv`` for ``B = A X`` must keep that dtype
    and must not contain any negative entry.

    Parameters
    ----------
    par : dict
        Test parameters; the keys ``'m'``, ``'n'``, ``'k'`` and ``'dtype'``
        are read.
    """
    np.random.seed(0)

    # Create random m-by-n encoding matrix
    m = par['m']
    n = par['n']
    k = par['k']
    l = 6

    A = np.random.randn(m, n)
    A = A.astype(par['dtype'])

    # Row-sparse, non-negative model: only the k rows in p are non-zero
    p = np.random.permutation(n)[:k]
    X = np.zeros((n, l), dtype=par['dtype'])
    X[p, :] = np.abs(np.random.randn(k, l))

    B = A.dot(X)

    Xnn, _, _, _ = spg_mmv(A,
                           B,
                           0,
                           project=norm_l12nn_project,
                           primal_norm=norm_l12nn_primal,
                           dual_norm=norm_l12nn_dual,
                           iter_lim=20,
                           verbosity=0)
    assert Xnn.dtype == par['dtype']
    # Truthiness check instead of comparing against the literal False
    assert not np.any(Xnn < 0)
    # NOTE(review): from here on the code looks like a separate, older example
    # (weighted vs. unweighted MMV with plotting) whose header is not visible
    # here — confirm that it really belongs to the enclosing function.
    # `m` is not assigned in this part and is taken from the code above;
    # `n`, `k`, `l`, `A`, `p` and `B` are rebound.
    n = 150
    k = 12
    l = 6;
    A = np.random.randn(m, n)
    p = np.random.permutation(n)[:k]
    # Model matrix with non-zero entries only in the k rows listed in p
    X0 = np.zeros((n, l))
    X0[p, :] = np.random.randn(k, l)

    weights = 3 * np.random.rand(n) + 0.1
    # Diagonal matrix holding the reciprocal weights
    W = 1/weights * np.eye(n)

    B = A.dot(W).dot(X0)

    # Solve unweighted version
    # NOTE(review): spgSetParms is not defined in the visible code — presumably
    # an options helper from an earlier spgl1 interface; verify.
    opts = spgSetParms({'verbosity': 1})
    x_uw, _, _, _ = spg_mmv(A.dot(W), B, 0, opts)

    # Solve weighted version
    opts = spgSetParms({'verbosity': 1,
                        'weights': weights})
    x_w, _, _, _ = spg_mmv(A, B, 0, opts)
    # Multiply each row of the weighted solution by its weight
    x_w = spdiags(weights, 0, n, n).dot(x_w)

    # Plot results
    # First column of both solutions, then all columns of X0, then the
    # remaining columns of both solutions, all on one figure
    figure()
    plot(x_uw[:, 0], 'b-')
    plot(x_w[:, 0], 'b.')
    plot(X0, 'ro');
    plot(x_uw[:, 1:], '-')
    plot(x_w[:, 1:], 'b.')
    #legend('Coefficients (1)','Coefficients (2)','Original coefficients');
Beispiel #4
0
def strainSolver(
        dataPath,
        refStrains,
        outputPath,
        objectiveOption,
        globalILP_option='all',
        timelimit=600,
        gap=8,
        loci=["clpA", "clpX", "nifS", "pepX", "pyrG", "recG", "rplB", "uvrA"],
        pathToDistMat=None,
        eps=0.05):
    # ------------------------- Data handling ----------------------------
    # Parameters
    propFormat = 1  # proportion in percentage or fraction
    numLoci = len(loci)

    # read data for samples and reference
    data, numSamples = readData(dataPath, loci, globalILP_option)
    newNameToOriName = dict()
    namingIndex = 1
    for i in sorted(data.keys()):
        newNameToOriName["s{}".format(namingIndex)] = i
        data["s{}".format(namingIndex)] = data.pop(i)
        namingIndex += 1
    reference = pd.read_csv(refStrains,
                            sep="\t",
                            usecols=range(1, numLoci + 1))
    lociNames = list(reference.columns.values)
    numReference = reference.shape[0]
    allSamples = data.keys()

    # check proportions sum to 100
    checkProp(data, propFormat)

    # round the proportions to 3 decimal places
    data = roundProp(data)

    # As reference only contains numbers as entries, add gene name to the variants for better identification
    for name in lociNames:
        reference["%s" %
                  name] = name + "_" + reference["%s" % name].astype(str)

    # Get proportions of variants at different locus for each sample
    varAndProp = returnVarAndProportions(data)

    # Get the combinations at all loci across all samples
    strains, numOfComb = returnCombinationsAndNumComb(data, numLoci, loci)
    uniqueStrains = strains.drop_duplicates(loci)
    uniqueStrains = (uniqueStrains[loci]).reset_index(drop=True)
    uniqueStrains[
        "ST"] = uniqueStrains.index.values + 1  # assign indices for strains or each unique combinations
    strains = strains.merge(
        uniqueStrains, indicator=True, how="left"
    )  # assign the index to the combinations(as strain data frame contains duplicated rows)
    strains = strains.drop("_merge", 1)

    # For each variants, get a mapping of which strains it maps to
    varSampToST = mapVarAndSampleToStrain(strains, loci, allSamples)

    print "\n-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n"

    # some data processing
    vpdf = varAndProp.drop_duplicates(['Variant'])
    loci = vpdf['Locus'].tolist()
    from collections import OrderedDict
    loci = list(OrderedDict.fromkeys(loci))
    sampAlleles = []  # List for having each samples data
    nsamp = 0  # Number of columns of X and columns of B
    sampNames = []
    for samp, numOfC in numOfComb.iteritems():
        sampNames.append(int(samp[1:]))
        nsamp += 1
    sampNames = sorted(sampNames)
    for samp in sampNames:
        sampAlleles.append(varAndProp.loc[varAndProp['Sample'] == 's' +
                                          str(samp)])

    # for sa in sampAlleles:
    #     print sa
    # print '\n'

    # ~~~~~~~~~~~~  Solving for strains present in several samples  ~~~~~~~~~~~~~~

    # Below we detect the shared strains between all samples
    strSet = set(strains.drop_duplicates(loci)['ST'].tolist())
    chosenStrs = list()
    sampsets = list()
    for s in strSet:
        # those strains which are present in more than two samples are detected by
        # counting the number of samples they appear in
        sampSet = set(strains.loc[strains['ST'] == s].drop_duplicates('Sample')
                      ['Sample'].tolist())
        if len(sampSet) > 1:
            chosenStrs.append(s - 1)
            sampsets.append(sampSet)
    sharedStrs = strains.iloc[chosenStrs].reset_index(drop=True)
    sharedStrs['Sample'] = sampsets
    sharedStrs = sharedStrs.drop_duplicates(loci)
    print '**SHARED STRAINS'
    print sharedStrs, '\n'

    # detecting the samples which have a shared strain
    samplesSharing = []
    for ss in sampsets:
        for s in ss:
            samplesSharing.append(int(s[1:]))
    samplesSharing = sorted(list(set(samplesSharing)))
    nsampS = len(samplesSharing)

    # dictionary to hold output dataframes for each sample
    outputdict = dict()
    # matrix of residues of shared calculation allele proportions
    BR = np.array([[0]])
    variantsMixed = []

    # In some iterations it's possible not to have any shared strains
    if len(chosenStrs) > 0:
        chosenVars = list()
        dims = []
        variants = [[], []]
        for l in loci:
            dims.append(len(set(sharedStrs[l].tolist())))
            locusvars = list(set(sharedStrs[l].tolist()))
            variants[0].append(locusvars)
            variants[1].append((l, locusvars))
            variantsMixed.extend(locusvars)

        # A: Measurement matrix
        A = createMeasureMatrix(sharedStrs, loci, variants[0], dims)

        # B: Observation Matrix
        nrows = len(variantsMixed)  # Number of rows of A and rows of B
        B = [[0 for x in range(nsampS)] for x in range(nrows)]
        for ind in range(nsampS):
            for indV in range(nrows):
                if variantsMixed[indV] in sampAlleles[samplesSharing[ind] -
                                                      1]['Variant'].tolist():
                    B[indV][ind] = sampAlleles[samplesSharing[ind] - 1].loc[
                        sampAlleles[samplesSharing[ind] - 1]['Variant'] ==
                        variantsMixed[indV]]['Proportion'].values.tolist()[0]

        # Now we solve the shared strains with mmv, then the remaining strains for each sample with single sample solution
        # For solving the shared strains problem,  we choose a range  of sigmas, and use each of them as  input to get the
        # best result.

        # calculating weights for shared strains
        weights = []
        for st in sharedStrs[loci].values.tolist():
            weights.append(
                min([
                    hamming(st, rst)
                    for rst in reference[loci].values.tolist()
                ]) * 8 + 1)

        print "----------------------------------------------------------------------\n           Solving the Joint Sparsity " \
        "using SPGL1\n---------------------------------------------------------------------- "
        print

        sigmas = []
        errs = []
        min_err = 100.0
        best_sig = 0
        X1 = np.array([[0]])
        # in order to find the best solution, we solve the join sparsity problem with different values of sigma
        # to find the result with least error and then report that result.
        for i in range(11):
            sigmas.append(i * 0.05)
        for sig in sigmas:
            X, _, _, _ = spg_mmv(A,
                                 np.array(B),
                                 sig,
                                 weights=np.array(weights))
            errmat = np.dot(A, X) - np.array(B)
            err = np.sqrt(sum([np.linalg.norm(row) for row in errmat]))
            errs.append(err)
        if min_err > err:
            min_err = err
            best_sig = sig
            X1 = X
        print '**BEST SIG AND ITS ERR ', best_sig, min_err
        print '**RESULT column=samp row=strain:'
        print pd.DataFrame(X1)

        # setting the residue matrix for samples that were included in this part
        BR = np.array(B) - np.dot(A, X1)

        # creating the output of shared strains. Latter output for individual samples will be appended to these.
        output = sharedStrs.merge(reference, indicator=True, how="left")
        output["_merge"].replace(to_replace="both",
                                 value="Existing",
                                 inplace=True)
        output["_merge"].replace(to_replace="left_only",
                                 value="New",
                                 inplace=True)
        output = output.rename(columns={"_merge": "New/Existing"})
        out = output.drop_duplicates(loci)

        # more output formatting and adding the results to proper dataframes
        retainCol = ["ST", "New/Existing"]
        out = out[retainCol].reset_index(drop=True)
        samplecnt = 0
        for sampStrains in X1.transpose():
            persampout = out.copy(deep=True)
            strainNums = np.nonzero(sampStrains)[0]
            persampout = pd.merge(persampout,
                                  sharedStrs.iloc[strainNums],
                                  on='ST')
            persampout['Proportions'] = [sampStrains[i] for i in strainNums]
            persampout['Sample'] = 's' + str(samplesSharing[samplecnt])
            columns = persampout.columns.tolist()
            columns = columns[:-2] + columns[-1:] + columns[-2:-1]
            persampout = persampout[columns]
            #print persampout
            outputdict['s' + str(samplesSharing[samplecnt])] = persampout
            samplecnt += 1

    # ------------------ Per-sample solution using single sample solver -------------------
    # after subtraction of the results of shared strains, the remaining proportions are pas
    # -sed to the function which solves the single sample problem.
    for i in range(nsamp):
        sampName = 's' + str(i + 1)
        if i + 1 in samplesSharing:
            persampout = single_sample_solver(
                loci, sampAlleles[i], reference,
                strains.loc[strains['Sample'] == sampName], eps, True,
                BR.transpose().tolist()[samplesSharing.index(i + 1)],
                variantsMixed)
            outputdict[sampName] = outputdict[sampName].append(persampout)
        else:
            persampout = single_sample_solver(
                loci, sampAlleles[i], reference,
                strains.loc[strains['Sample'] == sampName], eps)
            outputdict[sampName] = persampout
        print outputdict[sampName]
        outputdict[sampName].to_csv("{0}/{1}_strainsAndProportions.csv".format(
            outputPath, newNameToOriName[sampName]))
    return outputdict
Beispiel #5
0
from spgl1 import spg_mmv
from spgl1 import norm_l12nn_primal, norm_l12nn_dual, norm_l12nn_project

###############################################################################
# Load the example problem: the operator ``A``, the data ``B`` and the
# weights are all stored in a single ``.npz`` archive
data = np.load('../testdata/mmvnn.npz')

A, B, weights = data['A'], data['B'], data['weights']

###############################################################################
# Run two unweighted inversions: one with the solver's own norms and one
# with the non-negative norm functions passed in.
X, _, _, info = spg_mmv(A, B, 0.5, verbosity=0, iter_lim=100)

XNN, _, _, infoNN = spg_mmv(
    A,
    B,
    0.5,
    primal_norm=norm_l12nn_primal,
    dual_norm=norm_l12nn_dual,
    project=norm_l12nn_project,
    verbosity=0,
    iter_lim=100,
)

# Report whether either solution has negative entries, then both residuals
print('Negative X - MMV:', np.any(X < 0))
print('Negative X - MMVNN:', np.any(XNN < 0))
print('Residual norm - MMV:', info['rnorm'])
print('Residual norm - MMVNN:', infoNN['rnorm'])

plt.figure()
Beispiel #6
0
# Build the problem: A is m-by-n, X0 is n-by-l with k randomly chosen
# non-zero rows
m, n, k, l = 100, 150, 12, 6
A = np.random.randn(m, n)
p = np.random.permutation(n)[:k]
X0 = np.zeros((n, l))
X0[p, :] = np.random.randn(k, l)

# Weights and the diagonal matrix of their reciprocals
weights = 0.1 + 3 * np.random.rand(n)
W = np.diag(1 / weights)
B = np.dot(np.dot(A, W), X0)

# Unweighted solve: the reciprocal weights are folded into the operator
x_uw, _, _, _ = spgl1.spg_mmv(np.dot(A, W), B, 0, verbosity=1)

# Weighted solve: the weights go to the solver, and the rows of the result
# are multiplied by them afterwards
x_w, _, _, _ = spgl1.spg_mmv(A, B, 0, verbosity=2, weights=weights)
x_w = spdiags(weights, 0, n, n).dot(x_w)

# First column of both solutions against the original coefficients
plt.figure()
plt.plot(x_uw[:, 0], 'b-', label='Coefficients (1)')
plt.plot(x_w[:, 0], 'b.', label='Coefficients (2)')
plt.plot(X0[:, 0], 'ro', label='Original coefficients')
plt.legend()
plt.title('Weighted Basis Pursuit with Multiple Measurement Vectors')

# Second column, on a new figure
plt.figure()
plt.plot(x_uw[:, 1], 'b-', label='Coefficients (1)')