def plot_profiles(prots, eluts, sp='Hs', plot_sums=True, shape=None,
        min_count=1):
    """
    Plot the profiles of the given proteins, one subplot per elution, and
    draw a legend listing every protein in the last cell of the subplot grid.

    prots: protein names. They are translated with seqs.names2ids to select
        the elutions, and with seqs.GTrans().name2id to get the ids plotted.
    eluts: [el.NormElut(f, sp, norm_rows=False, norm_cols=False) for f in fs]
    sp, min_count: passed through to elutions_containing_prots, which picks
        the elutions that get a subplot.
    plot_sums: when true, overlay on each subplot the log2 of the column
        sums of e.normarr, rescaled against the largest protein value.
    shape: (m,n) = m rows, n columns. When omitted it is taken from
        ut.sqrt_shape for the number of elutions plus one legend cell.
    """
    import plotting as pl
    gt = seqs.GTrans()
    use_eluts = elutions_containing_prots(eluts, sp, seqs.names2ids(prots),
            min_count)
    # The +1 reserves a grid cell for the legend drawn after the loop.
    shape = shape if shape else ut.sqrt_shape(len(use_eluts)+1)
    pl.figure()
    # The name -> id translation does not depend on the elution, so do it once.
    pids = [gt.name2id[p] for p in prots]
    for i, e in enumerate(use_eluts):
        pl.subplot(shape[0], shape[1], i+1)
        pl.title(ut.shortname(e.filename))
        # Largest value over every row belonging to one of the proteins that
        # is present in this elution.
        protsmax = max([np.max(e.normarr[r]) for p in pids
            if p in e.baseid2inds for r in e.baseid2inds[p]])
        plot_prots(e, pids, e.baseid2inds, protsmax)
        if plot_sums:
            # plot total spectral counts normalized to match biggest peak
            sums = np.sum(e.normarr, axis=0)
            fmax = np.max(sums)
            pl.plot(range(sums.shape[1]),
                    np.log2(sums[0,:]).T*np.log2(protsmax)*len(pids)/np.log2(fmax),
                    color='k', linestyle='-', linewidth=.5)
    # make legend with all prots
    # Subplot indices run from 1 to m*n. The original index 0 only worked on
    # old matplotlib releases, where it resolved to the last cell; name that
    # cell explicitly so newer releases do not raise.
    # NOTE(review): assumes plotting.subplot is matplotlib's subplot -- confirm.
    pl.subplot(shape[0], shape[1], shape[0]*shape[1])
    for p in prots:
        pl.plot(0, label=p)
    pl.legend()
def plot_sums(fs, shape=None):
    """
    Draw one subplot per elution file showing the column sums of its matrix.

    fs: elution filenames; each one is loaded with el.load_elution and its
        short name (ut.shortname) is used as the subplot title.
    shape: (m,n) = m rows, n columns of the subplot grid. When omitted or
        empty it is taken from ut.sqrt_shape(len(fs)).
    """
    import plotting as pl
    if not shape:
        shape = ut.sqrt_shape(len(fs))
    # Subplot positions are 1-based, so count from 1.
    for position, fname in enumerate(fs, 1):
        elution = el.load_elution(fname)
        pl.subplot(shape[0], shape[1], position)
        pl.title(ut.shortname(fname))
        # Summing over axis 0 leaves one total per column.
        # NOTE(review): the result is indexed as 2-D below, so e.mat is
        # presumably a numpy matrix -- confirm.
        col_totals = np.sum(elution.mat, axis=0)
        xs = range(col_totals.shape[1])
        pl.plot(xs, col_totals[0,:].T)
def plot_bigprofiles(prots, pids, unnorm_eluts, sp='Hs', min_count=1,
        remove_multi_base=False, gt=None, eluts_per_plot=10,
        do_cluster=True, label_trans=None, do_plot_tree=False,
        rename_fracs=None, colors=None, **kwargs):
    """
    Plot the profiles of a set of proteins across many elutions, laying the
    elutions side by side and splitting them over several subplot rows.

    Supply EITHER prots OR pids, and set the other to None.

    prots: protein names; translated to ids through gt.name2id, so gt must
        be given whenever prots is.
    pids: protein ids, used as given when prots is None.
    unnorm_eluts: [el.NormElut(f, sp=sp, norm_cols=False, norm_rows=False)
                    for f in fs]
    sp, min_count: passed to elutions_containing_prots, which picks the
        elutions that get plotted.
    remove_multi_base: accepted but not used by this function.
    gt: object exposing name2id / id2name. When None, rows are labeled with
        the ids themselves.
    eluts_per_plot: maximum number of elutions drawn in one subplot row.
    do_cluster: when true, the ids are reordered by cluster_ids first.
    label_trans: optional dict mapping a row label to the label displayed.
    do_plot_tree: passed to cluster_ids as do_plot.
    rename_fracs: optional dict mapping an elution's short name to the
        label displayed for it.
    colors: optional sequence passed to plot_big_single; it is reversed
        together with the ids. The caller's list is left untouched.
    kwargs: forwarded to cluster_ids.

    Returns the number of subplot rows drawn.
    """
    import plotting as pl
    if prots is not None:
        pids = [gt.name2id[p] for p in prots]
    if do_cluster:
        print("clustering")
        pids = cluster_ids(pids, unnorm_eluts, sp, gt=gt, do_plot=do_plot_tree,
                **kwargs)
    if gt is not None:
        # Re-derive the labels so they follow the (possibly clustered) id
        # order. An id without a known name is labeled with the id itself
        # instead of being dropped, so there is exactly one label per id.
        prots = [gt.id2name[pid] if pid in gt.id2name else pid
                 for pid in pids]
    else:
        prots = pids
        print("No gene names provided--labeling with ids.")
    if label_trans:
        print("Translating names for display.")
        # Translate displayed names from base ids according to provided dict
        prots = [label_trans.get(p,p) for p in prots]
    # Put them top to bottom. Reversed copies are used for two reasons:
    # prots and pids can be the very same list (in-place reversal of both
    # would cancel out), and the caller's pids/colors must not be modified.
    prots = list(reversed(prots))
    pids = list(reversed(pids))
    if colors is not None:
        colors = list(reversed(colors))
    print("%s proteins" % len(pids))
    use_eluts = elutions_containing_prots(unnorm_eluts, sp, pids, min_count)
    # Integer ceiling division: correct whether or not the module enables
    # true division, so a final partial group of elutions is never dropped.
    nplots = (len(use_eluts) + eluts_per_plot - 1) // eluts_per_plot
    maxfracs = 0
    for iplot in range(nplots):
        pl.subplot(nplots, 1, iplot+1)
        plot_eluts = use_eluts[iplot*eluts_per_plot: (iplot+1)*eluts_per_plot]
        frac_names = [ut.shortname(e.filename) for e in plot_eluts]
        if rename_fracs:
            frac_names = [rename_fracs.get(n,n) for n in frac_names]
        # startcols[k] is the x offset at which the k-th elution of this row
        # starts; the final entry is the total width of the row.
        startcols = [0]
        for e in plot_eluts:
            freqarr = ut.normalize_fracs(e.normarr, norm_rows=False)
            protsmax = max([np.max(freqarr[r]) for p in pids
                if p in e.baseid2inds for r in e.baseid2inds[p]])
            plot_big_single(freqarr, pids, e.baseid2inds, protsmax,
                    startcols[-1], colors=colors)
            startcols.append(startcols[-1]+freqarr.shape[1])
        label_ys(prots)
        label_xs(startcols, frac_names)
        pl.grid(False)
        maxfracs = max(maxfracs, startcols[-1])
    # Give every row the same x range so columns line up between rows.
    for iplot in range(nplots):
        pl.subplot(nplots, 1, iplot+1)
        pl.xlim(0, maxfracs)
    if prots:
        # 5.0 forces a fractional spacing regardless of division semantics;
        # the guard avoids dividing by zero when there is nothing to label.
        pl.subplots_adjust(hspace=5.0/len(prots))
    return nplots
def main(argv): # defaults window_length = 50 overlap = window_length / 2 featdim = 10 #data_115818,sgmdata_115818 = load_dataset(window_length,overlap) training_data, training_sgmdata = load_dataset(window_length, overlap) training_featdata, header = build_dataset_features(training_sgmdata) cl.rnn_test(training_featdata) return data_120250, sgmdata_120250 = load_dataset( window_length, overlap, median_filter=True, alldatafile= '../../acquisizione20062014/acquisizione_20062014/Data_120250.txt') # questi dati son completamente diversi dagli altri tre # data_120611,sgmdata_120611 = load_dataset(window_length,overlap,median_filter=True,alldatafile='../../acquisizione20062014/acquisizione_20062014/Data_120611.txt') """ data_120922,sgmdata_120922 = load_dataset(window_length,overlap,median_filter=True,alldatafile='../../acquisizione20062014/acquisizione_20062014/Data_120922.txt') all_data = [(data_115818,"115818"),(data_120250,"120250"),(data_120611,"120611"),(data_120922,"120922")] sgm_data = [sgmdata_115818,sgmdata_120250,sgmdata_120611,sgmdata_120922] cols = ['b','r','g','m'] for (data,title),c in zip(all_data,cols): print "Acquisizione", title plt.plot_in_subplots(data,0,1,c) return """ return training_data, training_sgmdata = load_dataset(window_length, overlap) training_featdata, header = build_dataset_features(training_sgmdata) training_targets = fm.assign_target(training_featdata) """ data1,sgmdata1 = load_dataset(window_length,overlap,alldatafile='/home/ilaria/Scrivania/marsupio/acquisizione20062014/acquisizione_20062014/Data_120250.txt') featdata1,_ = build_dataset_features(sgmdata1) targets1 = fm.assign_target(featdata1) """ #write_feature_data_to_file(featdata,header) #print featdata[0,idxs] #plt.plot_in_subplots(featdata,idxs) #plt.plot_all(featdata1[:,idxs]) #X_r=preprocessing.scale(featdata) #pca = PCA(n_components=featdim) #kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=0.1) #X_r = kpca.fit_transform(X_r) #X_r = 
pca.fit(X_r).transform(X_r) X_r = training_featdata targets = training_targets pca = PCA(n_components=2) X_r = preprocessing.scale(X_r) X_r = pca.fit(X_r).transform(X_r) kmeans = KMeans(n_clusters=10) kmeans.fit(X_r) plt.plot_clustering_and_targets(X_r, kmeans, 0, 1, targets) return pars = [{ 'clf__kernel': ['rbf'], 'clf__gamma': [1e-3, 1e-5, 1e-2, 1e-1, 1e-4], 'clf__C': [0.001, 0.01, 0.1, 1, 10, 100], 'pca__n_components': [5, 10, 20, 50, 80] }, { 'clf__kernel': ['linear'], 'clf__C': [0.001, 0.01, 0.1, 0.5, 1, 10, 100], 'pca__n_components': [5, 10, 20, 50, 80] }] #evaluation set cl.cross_model_selection(X_r, targets, pars, save=True) c = cl.load_model('model.pkl') print c return #print X_train.shape, X_test.shape clf = svm.SVC(kernel='rbf', gamma=0.7, C=0.8) pca = PCA(n_components=featdim) pca_svm = Pipeline([ ('pca', pca), ('svm', clf), ]) scores = cross_validation.cross_val_score(clf, X_r, targets, cv=5, scoring='acc') print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) #pca_svm.fit(X_train, y_train) #print pca_svm.score(X_test,y_test) return #X_r = pca.fit(sint).transform(sint) #X_r = preprocessing pca = PCA(n_components=featdim) #kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=0.1) #X_r = kpca.fit_transform(X_r) X_r = pca.fit(X_r).transform(X_r) ncluster = 10 """ from sklearn.cluster import DBSCAN dbscan = DBSCAN() plt.plot_DBSCAN_clustering_result(X_r,dbscan,0,1) return """ #X_r = preprocessing.scale(X_r) kmeans = KMeans(n_clusters=ncluster) #print X_r kmeans.fit(X_r) plt.plot_clustering_and_targets(X_r, kmeans, 0, 1, target) return """ test = open('./test.csv','w') for dt in sint: for ft in dt: test.write(str(ft)+',') test.write('\n') """ #colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk']) #colors = np.hstack([colors] * 20) featdim = 10 Y = randomtargets(sint) clf = svm.SVC(kernel='rbf', gamma=0.7) pca = PCA(n_components=featdim) pca_svm = Pipeline([ ('pca', pca), ('svm', clf), ]) pca_svm.fit(sint, Y) X_r = 
pca.fit(sint).transform(sint) cX_r = pca.fit(sint).transform(cint) #th1 = [l[1] for l in sint] #accx1 = [l[2] for l in sint] #print(th1) #plt.scatter(th1, accx1, 50,c=Y) #plt.show() features = [] for i in range(0, featdim): features.append([l[i] for l in cX_r]) Yp = [int(i) for i in pca_svm.predict(cint)] print Yp s = 411 for f in features[1:5]: # plt.subplot(s) # plt.scatter(features[0], f, 50,c=Yp) i += 1 s += 1 #plt.show() s = 511 for f in features[5:10]: # plt.subplot(s) # plt.scatter(features[0], f, color=colors[Yp].tolist()) i += 1 s += 1 #plt.show() print clf.support_vectors_ # plt.scatter(clf.support_vectors_,range(0,3), color=colors[range(0,3)].tolist()) # create a mesh to plot in sint = np.array(sint) Y = (np.array(Y)) x_min, x_max = sint[:, 2].min() - 1, sint[:, 2].max() + 1 y_min, y_max = Y.min() - 1, Y.max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, .02), np.arange(y_min, y_max, .02)) #print len(Y), yy.shape #Z = Y.reshape(yy.shape) pl.contourf(xx, yy, Y, cmap=pl.cm.Paired) pl.axis('off') # Plot also the training points pl.scatter(X[:, 1], X[:, 2], c=Y, cmap=pl.cm.Paired) pl.show() return #intervalslist=scale(intervalslist) #print intervalslist featdim = 5 ncluster = 8 clusters = range(1, ncluster + 1) pca = PCA(n_components=featdim) X_r = pca.fit(intervalslist).transform(intervalslist) features = [] for i in range(0, featdim): features.append([l[i] for l in X_r]) #return kmeans = KMeans() #print X_r pca_clustering = Pipeline([('pca', pca), ('minmaxnorm', preprocessing.Normalizer()), ('kmeans', kmeans)]) clustering = Pipeline([('kmeans', kmeans)]) print pca_clustering.fit(intervalslist) #return pca_clusters = pca_clustering.predict(intervalslist) clustering.fit(intervalslist) nopca_clusters = clustering.predict(intervalslist) clustered = [] i = 0 s = 411 for f in features[1:]: plt.subplot(s) plt.scatter(features[0], f, color=colors[pca_clusters].tolist()) i += 1 s += 1 plt.show() """