def run_sklearn_mds(n_components, n_clusters, seqfos, seed, reco_info=None, region=None, aligned=False, n_init=4, max_iter=300, eps=1e-3, n_jobs=-1, plotdir=None, debug=False):
    """Cluster <seqfos> with sklearn k-means, optionally after an MDS embedding.

    Pairwise distances are utils.hamming_fraction() for every pair of sequences.
    If <n_components> is not None, the distance matrix is embedded with
    sklearn.manifold.MDS and k-means runs on the embedded coordinates; if it is
    None, k-means runs directly on the rows of the distance matrix.
    NOTE set <n_components> to None to run plain kmeans, without mds TODO clean this up

    Arguments:
      n_components: number of MDS dimensions, or None to skip MDS
      n_clusters: number of k-means clusters (must not be None)
      seqfos: list of dicts, each with a 'name' and a 'seq' key
      seed: seed for the numpy RandomState shared by MDS and k-means
      reco_info, region: if <plotdir> and <reco_info> are both set, a second plot is
        labeled with reco_info[uid][region + '_gene']
      aligned: if False, <seqfos> is first passed through utils.align_many_seqs()
      n_init, max_iter, eps, n_jobs: passed straight through to sklearn's MDS
      plotdir: if set, plots are written there with plot_mds()
      debug: if True, print progress and total time

    Returns the result of utils.group_seqs_by_value() on the sequence names, grouped
    by k-means label.
    Raises AssertionError if <n_clusters> is None, and Exception if two entries in
    <seqfos> share a name.
    """
    start = time.time()
    assert n_clusters is not None
    # validate before importing sklearn, so bad input fails fast instead of after the slow imports
    if len(set(sfo['name'] for sfo in seqfos)) != len(seqfos):
        raise Exception('duplicate sequence ids in <seqfos>')

    # These are slow to import the first time (even on local ssd), hence the function-level imports.
    # They are imported unconditionally: python caches modules in sys.modules, so repeat imports are
    # cheap, and this guarantees the submodules are actually loaded (looking them up as attributes of
    # sys.modules['sklearn'] fails if something else imported only the top-level package).
    from sklearn.cluster import KMeans

    if not aligned:  # NOTE the local name <seqfos> is rebound to the aligned version from here on
        if debug:
            print('align')
        seqfos = utils.align_many_seqs(seqfos)

    if debug:
        print(' distances')
    # alternative that was tried: convert each sequence to a list of ints and use scipy's own hamming metric
    # translations = string.maketrans('ACGT-', '01234')
    # def convert(seq):
    #     return [int(c) for c in seq.translate(translations)]
    # converted_seqs = [convert(x['seq']) for x in seqfos]
    # similarities = scipy.spatial.distance.pdist(converted_seqs, 'hamming')
    # similarities = scipy.spatial.distance.squareform(similarities)
    n_seqs = len(seqfos)
    # condensed upper triangle (i < j, row by row), which is the order squareform() expects
    condensed = [utils.hamming_fraction(seqfos[i]['seq'], seqfos[j]['seq']) for i in range(n_seqs) for j in range(i + 1, n_seqs)]
    similarities = scipy.spatial.distance.squareform(condensed)

    random_state = numpy.random.RandomState(seed=seed)  # same RandomState object is handed to both mds and kmeans

    pos = None
    if n_components is not None:
        from sklearn import manifold  # only needed (and only imported) when actually running mds
        if debug:
            print(' mds')
        mds = manifold.MDS(n_components=n_components, n_init=n_init, max_iter=max_iter, eps=eps, random_state=random_state, dissimilarity="precomputed", n_jobs=n_jobs)
        pos = mds.fit_transform(similarities)
        # pos = mds.fit(similarities).embedding_

    if debug:
        print(' kmeans clustering with %d clusters' % n_clusters)
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(pos if pos is not None else similarities)
    # per-sequence mds coordinates (None for every sequence if mds was skipped) and kmeans cluster labels
    pcvals = {sfo['name'] : pos[iseq] if pos is not None else None for iseq, sfo in enumerate(seqfos)}
    labels = {sfo['name'] : kmeans.labels_[iseq] for iseq, sfo in enumerate(seqfos)}
    partition = utils.group_seqs_by_value(pcvals.keys(), lambda q: labels[q])

    if plotdir is not None:
        utils.prep_dir(plotdir, wildlings=['*.svg'])
        if debug:
            print(' plot')
        plot_mds(n_components, pcvals, plotdir, 'mds', partition=partition)
        if reco_info is not None:
            # second plot, labeled by the gene recorded in <reco_info> rather than by kmeans cluster
            true_labels = {uid : reco_info[uid][region + '_gene'] for uid in pcvals}
            plot_mds(n_components, pcvals, plotdir, 'true-genes', labels=true_labels)

    if debug:
        print(' kmeans time %.1f' % (time.time() - start))

    return partition
def run_sklearn_mds(n_components, n_clusters, seqfos, seed, reco_info=None, region=None, aligned=False, n_init=4, max_iter=300, eps=1e-3, n_jobs=-1, plotdir=None):
    """Embed <seqfos> with sklearn MDS on pairwise hamming fractions, then k-means cluster the embedding.

    <seqfos> is a list of dicts with 'name' and 'seq' keys. If <aligned> is False the
    sequences first go through utils.align_many_seqs(). <n_init>, <max_iter>, <eps> and
    <n_jobs> are passed to sklearn's MDS; <seed> seeds the numpy RandomState that both MDS
    and KMeans use. If <plotdir> is set, plot_mds() is called for the clusters and, if
    <reco_info> is also set, again with labels taken from reco_info[uid][region + '_gene'].

    Returns a list of lists of sequence names, one inner list per k-means label, in
    ascending label order. Raises Exception if two entries in <seqfos> share a name.
    """
    print('%s not testing this after moving these imports down here' % utils.color('red', 'hey'))
    from sklearn import manifold  # these are both slow af to import, even on local ssd
    from sklearn.cluster import KMeans

    n_unique_names = len(set(sfo['name'] for sfo in seqfos))
    if n_unique_names != len(seqfos):
        raise Exception('duplicate sequence ids in <seqfos>')

    print('align')  # printed whether or not any aligning actually happens
    if not aligned:  # NOTE the local name <seqfos> is rebound to the aligned version from here on
        seqfos = utils.align_many_seqs(seqfos)

    print(' distances')
    # alternative that was tried: convert each sequence to a list of ints and use scipy's own hamming metric
    # translations = string.maketrans('ACGT-', '01234')
    # def convert(seq):
    #     return [int(c) for c in seq.translate(translations)]
    # converted_seqs = [convert(x['seq']) for x in seqfos]
    # similarities = scipy.spatial.distance.pdist(converted_seqs, 'hamming')
    # similarities = scipy.spatial.distance.squareform(similarities)
    n_seqs = len(seqfos)
    condensed = []  # upper triangle (i < j), row by row
    for i in range(n_seqs):
        for j in range(i + 1, n_seqs):
            condensed.append(utils.hamming_fraction(seqfos[i]['seq'], seqfos[j]['seq']))
    similarities = scipy.spatial.distance.squareform(condensed)

    print(' mds')
    random_state = numpy.random.RandomState(seed=seed)
    mds = manifold.MDS(n_components=n_components, n_init=n_init, max_iter=max_iter, eps=eps, random_state=random_state, dissimilarity="precomputed", n_jobs=n_jobs)
    pos = mds.fit_transform(similarities)
    # pos = mds.fit(similarities).embedding_

    print(' kmeans')
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(pos)
    pcvals, cluster_ids = {}, {}  # name : mds coordinates, and name : kmeans label
    for iseq, sfo in enumerate(seqfos):
        pcvals[sfo['name']] = pos[iseq]
        cluster_ids[sfo['name']] = kmeans.labels_[iseq]

    # group names by kmeans label: clusters in ascending label order, names within a cluster in <pcvals> iteration order
    # (should really integrate this with utils.collapse_naive_seqs()/utils.split_partition_with_criterion())
    members = {}
    for uid in pcvals:
        members.setdefault(cluster_ids[uid], []).append(uid)
    partition = [members[cid] for cid in sorted(members)]

    if plotdir is not None:
        utils.prep_dir(plotdir, wildlings=['*.svg'])
        print(' plot')
        plot_mds(n_components, pcvals, plotdir, 'mds', partition=partition)
        if reco_info is not None:
            # second plot, labeled by the gene recorded in <reco_info> rather than by kmeans cluster
            gene_labels = {}
            for uid in pcvals:
                gene_labels[uid] = reco_info[uid][region + '_gene']
            plot_mds(n_components, pcvals, plotdir, 'true-genes', labels=gene_labels)

    return partition
def run_bios2mds(n_components, n_clusters, seqfos, base_workdir, seed, aligned=False, reco_info=None, region=None, max_runs=100, max_iterations=1000, method='euclidean', plotdir=None, plotname='mds', queries_to_include=None, color_scale_vals=None, labels=None, title=None, debug=False):
    """Run MDS (and optionally k-means) on <seqfos> with the R package bios2mds.

    Writes a multiple sequence alignment to <base_workdir>/mds/msa.fa, builds a list of R
    commands, runs them with utils.run_r(), then reads the results back with
    read_component_file() and (if <n_clusters> is not None) read_kmeans_clusterfile().
    If <plotdir> is set the result is plotted with plot_mds(), plus a second 'true-genes'
    plot if <reco_info> is set. Prints the R and plotting times without a trailing newline.

    Returns the partition from read_kmeans_clusterfile(), or None if <n_clusters> is None.
    Raises Exception if any two sequences in <seqfos> are identical, or if <n_components>
    is None (not implemented). <debug> is not used in this function.
    """
    workdir = base_workdir + '/mds'
    msafname = workdir + '/msa.fa'
    mdsfname = workdir + '/components.txt'
    clusterfname = workdir + '/clusters.txt'
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    # it'll just crash when it's running mds later, but this is faster
    n_unique_seqs = len(set([sfo['seq'] for sfo in seqfos]))
    if n_unique_seqs < len(seqfos):
        raise Exception('duplicate sequences in seqfos')

    # NOTE <seqfos> isn't rebound here: the msa only goes to <msafname>
    if not aligned:
        utils.align_many_seqs(seqfos, outfname=msafname)
    else:  # already aligned, so just write them out as fasta
        with open(msafname, 'w') as fastafile:
            fastafile.writelines('>%s\n%s\n' % (sfo['name'], sfo['seq']) for sfo in seqfos)

    # build the R cmd file
    cmdlines = [
        'options(rgl.useNULL=TRUE)',
        'require(bios2mds, quietly=TRUE)',
        'set.seed(%d)' % seed,
        'human <- import.fasta("%s")' % msafname,
        'active <- mat.dif(human, human)',  # mat.dif or mat.dis?
    ]

    if n_components is None:
        raise Exception('need to implement')
    cmdlines.append('mmds_active <- mmds(active, pc=%d)' % n_components)
    cmdlines.append('capture.output(mmds_active$coord, file="%s")' % mdsfname)

    if n_clusters is not None:
        kmeans_cmd = 'kmeans.run1 <- kmeans.run(mmds_active$coord, nb.clus=%d, nb.run=%d, iter.max=%d, method="%s")' % (n_clusters, max_runs, max_iterations, method)
        cmdlines.extend([
            kmeans_cmd,
            # 'kmeans.run1$clusters',
            # 'kmeans.run1$elements',
            'options(width=10000)',
            'capture.output(kmeans.run1$clusters, file="%s")' % clusterfname,
            # other R calls noted here for reference:
            #   sil.score(mat, nb.clus = c(2:13), nb.run = 100, iter.max = 1000, method = "euclidean")  # run for every possible number of clusters (?)
            #   random.msa  # builds a random [...]
        ])

    rstart = time.time()
    try:
        utils.run_r(cmdlines, workdir)  #, print_time='kmeans')
    except subprocess.CalledProcessError as err:  # complex eigenvalues
        print(err)
        print(' mds failed on cluster')  # NOTE will still crash in read_kmeans_clusterfile(), but I'm not using that a.t.m.
        title = ('' if title is None else title) + ' mds failed'
    pcvals = read_component_file(mdsfname, n_components, seqfos)
    partition = None
    if n_clusters is not None:
        partition = read_kmeans_clusterfile(clusterfname, seqfos)
    rstop = time.time()

    # NOTE(review): os.rmdir() needs <workdir> to be empty, so presumably the read_*() helpers remove their own files -- confirm
    os.remove(msafname)
    os.rmdir(workdir)

    plotstart = time.time()
    if plotdir is not None:
        # utils.prep_dir(plotdir, wildlings=['*.svg'])
        shared_kwargs = {'queries_to_include' : queries_to_include, 'color_scale_vals' : color_scale_vals, 'title' : title}
        # <partition> is already None if <n_clusters> is None
        plot_mds(n_components, pcvals, plotdir, plotname, partition=partition, labels=labels, **shared_kwargs)
        if reco_info is not None:
            # second plot, labeled by the gene recorded in <reco_info> (replaces the <labels> argument)
            gene_labels = {uid : reco_info[uid][region + '_gene'] for uid in pcvals}
            plot_mds(n_components, pcvals, plotdir, 'true-genes', labels=gene_labels, **shared_kwargs)

    # R time and plotting time; the trailing comma is the python 2 print statement's "no newline" form
    print(' %5.1f %5.1f' % (rstop - rstart, time.time() - plotstart)),

    return partition