def test_multiple_measurements(par):
    """Multiple measurement vector (MMV) problem:
    minimize ||Y||_1,2 subject to AW^{-1}Y = B

    A random row-sparse matrix ``X`` is generated and observed through
    ``B = A W X`` with ``W = diag(1 / weights)``. The problem is then solved
    twice -- once with the weights folded into the operator (``A W``) and
    once by handing ``weights`` to ``spg_mmv`` -- and both solutions are
    compared against ``X`` to two decimals.

    Parameters
    ----------
    par : dict
        Test parameters; the keys ``'m'``, ``'n'`` and ``'k'`` are read
        (rows of A, columns of A, number of non-zero rows of X).
    """
    # Fixed seed so the random problem, and therefore the test outcome, is
    # reproducible (same convention as the non-negative MMV test).
    np.random.seed(0)

    # Create random m-by-n encoding matrix
    m = par['m']
    n = par['n']
    k = par['k']
    l = 6
    A = np.random.randn(m, n)

    # Row-sparse ground truth: k randomly chosen non-zero rows
    p = np.random.permutation(n)[:k]
    X = np.zeros((n, l))
    X[p, :] = np.random.randn(k, l)

    # W is diagonal with entries 1 / weights
    weights = 0.1 * np.random.rand(n) + 0.1
    W = 1 / weights * np.eye(n)
    B = A.dot(W).dot(X)

    # Solve unweighted version
    X_uw, _, _, _ = spg_mmv(A.dot(W), B, 0, verbosity=0)

    # Solve weighted version
    X_w, _, _, _ = spg_mmv(A, B, 0, weights=weights, verbosity=0)
    # Undo the weighting so the result is comparable with X
    X_w = spdiags(weights, 0, n, n).dot(X_w)

    assert_array_almost_equal(X, X_uw, decimal=2)
    assert_array_almost_equal(X, X_w, decimal=2)
def test_multiple_measurements_nonnegative(par):
    """Multiple measurement vector (MMV) problem with non-negative norm.

    A non-negative, row-sparse ``X`` is observed through ``B = A X`` and
    recovered with ``spg_mmv`` using the non-negative L1,2 projection and
    primal/dual norms. The test checks that the solution keeps the requested
    dtype and contains no negative entries.

    Parameters
    ----------
    par : dict
        Test parameters; the keys ``'m'``, ``'n'``, ``'k'`` and ``'dtype'``
        are read.
    """
    np.random.seed(0)

    # Create random m-by-n encoding matrix
    m = par['m']
    n = par['n']
    k = par['k']
    l = 6
    A = np.random.randn(m, n)
    A = A.astype(par['dtype'])

    # Row-sparse ground truth with k non-zero, non-negative rows
    p = np.random.permutation(n)[:k]
    X = np.zeros((n, l), dtype=par['dtype'])
    X[p, :] = np.abs(np.random.randn(k, l))

    B = A.dot(X)

    Xnn, _, _, _ = spg_mmv(A, B, 0,
                           project=norm_l12nn_project,
                           primal_norm=norm_l12nn_primal,
                           dual_norm=norm_l12nn_dual,
                           iter_lim=20, verbosity=0)

    assert Xnn.dtype == par['dtype']
    # No entry of the solution may be negative
    assert not np.any(Xnn < 0)
# Problem dimensions. `m` is not assigned in this section and must already be
# defined by the time it runs.
n = 150
k = 12
l = 6

# Random encoding matrix and a row-sparse ground truth with k non-zero rows
A = np.random.randn(m, n)
p = np.random.permutation(n)[:k]
X0 = np.zeros((n, l))
X0[p, :] = np.random.randn(k, l)

# Random positive weights; W is the diagonal matrix of their reciprocals
weights = 3 * np.random.rand(n) + 0.1
W = (1 / weights) * np.eye(n)
B = A.dot(W).dot(X0)

# Solve unweighted version (weights folded into the operator)
opts = spgSetParms(dict(verbosity=1))
x_uw, _, _, _ = spg_mmv(A.dot(W), B, 0, opts)

# Solve weighted version (weights passed through the options)
opts = spgSetParms(dict(verbosity=1, weights=weights))
x_w, _, _, _ = spg_mmv(A, B, 0, opts)
# Rescale by the weights so both solutions are comparable
x_w = spdiags(weights, 0, n, n).dot(x_w)

# Plot results: first column of each solution, the original coefficients,
# then the remaining columns of both solutions
figure()
plot(x_uw[:, 0], 'b-')
plot(x_w[:, 0], 'b.')
plot(X0, 'ro')
plot(x_uw[:, 1:], '-')
plot(x_w[:, 1:], 'b.')
# legend('Coefficients (1)','Coefficients (2)','Original coefficients');
def _relabelSamples(data):
    """Return (`data` re-keyed as "s1", "s2", ..., map from new to original name)."""
    relabelled = dict()
    newNameToOriName = dict()
    for namingIndex, oriName in enumerate(sorted(data.keys()), 1):
        newName = "s{}".format(namingIndex)
        newNameToOriName[newName] = oriName
        relabelled[newName] = data[oriName]
    return relabelled, newNameToOriName


def _firstProportionByVariant(alleles):
    """Map each value of alleles['Variant'] to the 'Proportion' of its first row."""
    propOf = dict()
    for variant, prop in zip(alleles['Variant'].tolist(),
                             alleles['Proportion'].values.tolist()):
        propOf.setdefault(variant, prop)
    return propOf


def strainSolver(
        dataPath,
        refStrains,
        outputPath,
        objectiveOption,
        globalILP_option='all',
        timelimit=600,
        gap=8,
        loci=None,
        pathToDistMat=None,
        eps=0.05):
    """Solve for strains and their proportions in every sample.

    Strains that occur in more than one sample are solved jointly with
    ``spg_mmv``; what remains of each sample's allele proportions is then
    handed to ``single_sample_solver``. One CSV per sample is written to
    ``outputPath`` and the per-sample results are returned.

    Parameters
    ----------
    dataPath :
        Passed to ``readData`` to load the sample data.
    refStrains :
        Tab-separated reference file read with ``pd.read_csv``; columns
        ``1 .. len(loci)`` are used.
    outputPath :
        Directory prefix for the ``<sample>_strainsAndProportions.csv`` files.
    objectiveOption, timelimit, gap, pathToDistMat :
        Accepted for interface compatibility; not used in this function.
    globalILP_option :
        Passed to ``readData``.
    loci : list of str, optional
        Locus names. Defaults to
        ``["clpA", "clpX", "nifS", "pepX", "pyrG", "recG", "rplB", "uvrA"]``.
    eps :
        Passed to ``single_sample_solver``.

    Returns
    -------
    dict
        Maps the internal sample name (``"s1"``, ``"s2"``, ...) to that
        sample's output table.
    """
    from collections import OrderedDict

    # None sentinel instead of a mutable list default; same effective default.
    if loci is None:
        loci = ["clpA", "clpX", "nifS", "pepX", "pyrG", "recG", "rplB", "uvrA"]

    # ------------------------- Data handling ----------------------------
    # Parameters
    propFormat = 1  # proportion in percentage or fraction
    numLoci = len(loci)

    # read data for samples and reference
    data, _ = readData(dataPath, loci, globalILP_option)

    # Rename samples to s1..sN. A fresh dict is built on purpose: renaming in
    # place clobbers entries when an original name already looks like "s<k>"
    # (e.g. with s1, s10, s2 the write to "s2" overwrote the unprocessed s2).
    data, newNameToOriName = _relabelSamples(data)

    reference = pd.read_csv(
        refStrains, sep="\t", usecols=list(range(1, numLoci + 1)))
    lociNames = list(reference.columns.values)
    allSamples = list(data.keys())

    # check proportions sum to 100
    checkProp(data, propFormat)

    # round the proportions to 3 decimal places
    data = roundProp(data)

    # As reference only contains numbers as entries, add gene name to the
    # variants for better identification
    for name in lociNames:
        reference[name] = name + "_" + reference[name].astype(str)

    # Get proportions of variants at different locus for each sample
    varAndProp = returnVarAndProportions(data)

    # Get the combinations at all loci across all samples
    strains, numOfComb = returnCombinationsAndNumComb(data, numLoci, loci)
    uniqueStrains = strains.drop_duplicates(loci)
    uniqueStrains = (uniqueStrains[loci]).reset_index(drop=True)
    # assign indices for strains or each unique combinations
    uniqueStrains["ST"] = uniqueStrains.index.values + 1
    # assign the index to the combinations (as strain data frame contains
    # duplicated rows)
    strains = strains.merge(uniqueStrains, indicator=True, how="left")
    strains = strains.drop("_merge", axis=1)

    # For each variants, get a mapping of which strains it maps to
    # (the result is not used further down in this function)
    varSampToST = mapVarAndSampleToStrain(strains, loci, allSamples)
    print("\n-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n")

    # some data processing
    # From here on `loci` is the list of loci seen in the data, in order of
    # first appearance.
    vpdf = varAndProp.drop_duplicates(['Variant'])
    loci = list(OrderedDict.fromkeys(vpdf['Locus'].tolist()))

    # Sample numbers are parsed from the "s<N>" keys of numOfComb.
    # presumably numOfComb is a dict (or Series) keyed by sample name -- verify
    sampNames = sorted(int(samp[1:]) for samp in numOfComb.keys())
    nsamp = len(sampNames)  # Number of columns of X and columns of B
    # List for having each samples data
    sampAlleles = [
        varAndProp.loc[varAndProp['Sample'] == 's' + str(samp)]
        for samp in sampNames
    ]

    # ~~~~~~~~~~~~ Solving for strains present in several samples ~~~~~~~~~~~~~~
    # Below we detect the shared strains between all samples
    strSet = set(strains.drop_duplicates(loci)['ST'].tolist())
    chosenStrs = list()
    sampsets = list()
    for s in strSet:
        # strains present in more than one sample are detected by counting
        # the number of samples they appear in
        sampSet = set(strains.loc[strains['ST'] == s, 'Sample'].tolist())
        if len(sampSet) > 1:
            chosenStrs.append(s - 1)
            sampsets.append(sampSet)

    # NOTE(review): chosenStrs holds ST - 1, i.e. row positions in
    # uniqueStrains, but is applied positionally to `strains`, which contains
    # duplicated rows -- confirm this selects the intended strains.
    sharedStrs = strains.iloc[chosenStrs].reset_index(drop=True)
    sharedStrs['Sample'] = sampsets
    sharedStrs = sharedStrs.drop_duplicates(loci)
    print('**SHARED STRAINS')
    print(" ".join([str(sharedStrs), '\n']))

    # detecting the samples which have a shared strain
    samplesSharing = sorted({int(s[1:]) for ss in sampsets for s in ss})
    nsampS = len(samplesSharing)

    # dictionary to hold output dataframes for each sample
    outputdict = dict()

    # matrix of residues of shared calculation allele proportions
    BR = np.array([[0]])
    variantsMixed = []

    # In some iterations it's possible not to have any shared strains
    if chosenStrs:
        dims = []
        locusVariants = []
        for locus in loci:
            locusvars = list(set(sharedStrs[locus].tolist()))
            dims.append(len(locusvars))
            locusVariants.append(locusvars)
            variantsMixed.extend(locusvars)

        # A: Measurement matrix
        A = createMeasureMatrix(sharedStrs, loci, locusVariants, dims)

        # B: Observation Matrix
        nrows = len(variantsMixed)  # Number of rows of A and rows of B
        B = [[0] * nsampS for _ in range(nrows)]
        for ind, sampNum in enumerate(samplesSharing):
            # One lookup table per sample instead of rescanning the frame for
            # every variant.
            propOf = _firstProportionByVariant(sampAlleles[sampNum - 1])
            for indV, variant in enumerate(variantsMixed):
                if variant in propOf:
                    B[indV][ind] = propOf[variant]

        # Now we solve the shared strains with mmv, then the remaining strains
        # for each sample with single sample solution.
        # For solving the shared strains problem, we choose a range of sigmas,
        # and use each of them as input to get the best result.

        # calculating weights for shared strains
        refProfiles = reference[loci].values.tolist()
        weights = [
            min(hamming(st, rst) for rst in refProfiles) * 8 + 1
            for st in sharedStrs[loci].values.tolist()
        ]
        print("----------------------------------------------------------------------\n Solving the Joint Sparsity "
              "using SPGL1\n---------------------------------------------------------------------- ")
        print("")

        # in order to find the best solution, we solve the joint sparsity
        # problem with different values of sigma to find the result with least
        # error and then report that result.
        sigmas = [i * 0.05 for i in range(11)]
        # Start from infinity so the best candidate is always kept; a finite
        # cap could leave X1 as the 1x1 placeholder below.
        min_err = float("inf")
        best_sig = 0
        X1 = np.array([[0]])
        for sig in sigmas:
            X, _, _, _ = spg_mmv(
                A, np.array(B), sig, weights=np.array(weights))
            errmat = np.dot(A, X) - np.array(B)
            err = np.sqrt(sum([np.linalg.norm(row) for row in errmat]))
            if min_err > err:
                min_err = err
                best_sig = sig
                X1 = X
        print(" ".join(['**BEST SIG AND ITS ERR ', str(best_sig), str(min_err)]))
        print('**RESULT column=samp row=strain:')
        print(pd.DataFrame(X1))

        # setting the residue matrix for samples that were included in this part
        BR = np.array(B) - np.dot(A, X1)

        # creating the output of shared strains. Latter output for individual
        # samples will be appended to these.
        output = sharedStrs.merge(reference, indicator=True, how="left")
        # Assign the relabelled column back instead of a chained in-place
        # replace, which is not guaranteed to update `output`.
        output["_merge"] = output["_merge"].astype(str).replace(
            {"both": "Existing", "left_only": "New"})
        output = output.rename(columns={"_merge": "New/Existing"})
        out = output.drop_duplicates(loci)

        # more output formatting and adding the results to proper dataframes
        retainCol = ["ST", "New/Existing"]
        out = out[retainCol].reset_index(drop=True)
        # Each column of X1 is one sharing sample, in samplesSharing order.
        for samplecnt, sampStrains in enumerate(X1.transpose()):
            sampLabel = 's' + str(samplesSharing[samplecnt])
            persampout = out.copy(deep=True)
            strainNums = np.nonzero(sampStrains)[0]
            persampout = pd.merge(
                persampout, sharedStrs.iloc[strainNums], on='ST')
            persampout['Proportions'] = [sampStrains[i] for i in strainNums]
            persampout['Sample'] = sampLabel
            # swap the last two columns
            columns = persampout.columns.tolist()
            columns = columns[:-2] + columns[-1:] + columns[-2:-1]
            persampout = persampout[columns]
            outputdict[sampLabel] = persampout

    # ------------------ Per-sample solution using single sample solver -------------------
    # after subtraction of the results of shared strains, the remaining
    # proportions are passed to the function which solves the single sample
    # problem.
    for i in range(nsamp):
        sampNum = i + 1
        sampName = 's' + str(sampNum)
        sampStrainRows = strains.loc[strains['Sample'] == sampName]
        if sampNum in samplesSharing:
            # residual column of this sample from the shared-strain solution
            residual = BR.transpose().tolist()[samplesSharing.index(sampNum)]
            persampout = single_sample_solver(
                loci, sampAlleles[i], reference, sampStrainRows, eps, True,
                residual, variantsMixed)
            # pd.concat replaces DataFrame.append (removed in pandas 2.0);
            # assumes single_sample_solver returns a DataFrame -- TODO confirm
            outputdict[sampName] = pd.concat(
                [outputdict[sampName], persampout])
        else:
            persampout = single_sample_solver(
                loci, sampAlleles[i], reference, sampStrainRows, eps)
            outputdict[sampName] = persampout
        print(outputdict[sampName])
        outputdict[sampName].to_csv("{0}/{1}_strainsAndProportions.csv".format(
            outputPath, newNameToOriName[sampName]))
    return outputdict
from spgl1 import spg_mmv
from spgl1 import (
    norm_l12nn_primal,
    norm_l12nn_dual,
    norm_l12nn_project,
)

###############################################################################
# Let's first import our data, matrix operator and weights
data = np.load('../testdata/mmvnn.npz')
A, B, weights = data['A'], data['B'], data['weights']

###############################################################################
# We apply unweighted inversions with and without non-negative
# norms.

# Settings shared by both solver runs
solver_kwargs = dict(iter_lim=100, verbosity=0)

# Standard MMV
X, _, _, info = spg_mmv(A, B, 0.5, **solver_kwargs)

# MMV with the non-negative L1,2 projection and norms
XNN, _, _, infoNN = spg_mmv(A, B, 0.5,
                            project=norm_l12nn_project,
                            primal_norm=norm_l12nn_primal,
                            dual_norm=norm_l12nn_dual,
                            **solver_kwargs)

# Report whether either solution has negative entries, then the residual norms
for label, value in (
        ('Negative X - MMV:', np.any(X < 0)),
        ('Negative X - MMVNN:', np.any(XNN < 0)),
        ('Residual norm - MMV:', info['rnorm']),
        ('Residual norm - MMVNN:', infoNN['rnorm']),
):
    print(label, value)

plt.figure()
# Create problem
m, n, k, l = 100, 150, 12, 6

# Random encoding matrix and a row-sparse ground truth with k non-zero rows
A = np.random.randn(m, n)
p = np.random.permutation(n)[:k]
X0 = np.zeros((n, l))
X0[p, :] = np.random.randn(k, l)

# Random positive weights; W is the diagonal matrix of their reciprocals
weights = 3 * np.random.rand(n) + 0.1
W = (1 / weights) * np.eye(n)
AW = A.dot(W)
B = AW.dot(X0)

# Solve unweighted version (weights folded into the operator)
x_uw, _, _, _ = spgl1.spg_mmv(A.dot(W), B, 0, verbosity=1)

# Solve weighted version (weights handed to the solver)
x_w, _, _, _ = spgl1.spg_mmv(A, B, 0, weights=weights, verbosity=2)
# Rescale by the weights so both solutions are comparable
x_w = spdiags(weights, 0, n, n).dot(x_w)

# Plot results: first column of both solutions against the original
plt.figure()
for series, fmt, label in (
        (x_uw[:, 0], 'b-', 'Coefficients (1)'),
        (x_w[:, 0], 'b.', 'Coefficients (2)'),
        (X0[:, 0], 'ro', 'Original coefficients'),
):
    plt.plot(series, fmt, label=label)
plt.legend()
plt.title('Weighted Basis Pursuit with Multiple Measurement Vectors')

# Second column of the unweighted solution in a new figure
plt.figure()
plt.plot(x_uw[:, 1], 'b-', label='Coefficients (1)')