def fullAnalysis(location, file, mt):
    """Run the whole analysis pipeline on one CSV file for one mutation type.

    The file at ``location + '/' + file`` is read with pandas, passed through
    ``f.getDummyVarPositionForK`` (with ``1``) and ``f.cleanDataMatrix``
    (``withNull=True``), and then handed to
    ``f.createCovarsAndProbVectorForMutationType`` together with ``mt``.
    ``featureSelectionPairs`` is run on every column label except the first,
    and finally ``analyze`` is called on the full covariate table.

    Args:
        location: Directory holding the input file (joined with ``'/'``).
        file: Name of the CSV file inside ``location``.
        mt: Mutation-type key forwarded to the covariate builder.

    Returns:
        None. Nothing computed here is returned to the caller.
    """
    csv_path = '/'.join((location, file))
    table = f.cleanDataMatrix(
        f.getDummyVarPositionForK(pd.read_csv(csv_path), 1),
        withNull=True)
    X, y_, initNum, _design = f.createCovarsAndProbVectorForMutationType(
        table, mt)

    # The first column label is skipped (presumably an intercept/constant
    # column -- TODO confirm against the covariate builder).
    candidate_features = X.columns[1:]

    # The selection result is not consumed below; the call is kept so the
    # function does exactly the same work as before.
    sigList = featureSelectionPairs(X, y_, initNum, candidate_features)

    analyze(X, y_, initNum, stepNum=400000, plot=False, v=0.1)
def fullAnalysis(location, file, mt):
    """Run the whole analysis pipeline on one CSV file for one mutation type.

    Steps, in order:

    1. Read ``location + '/' + file`` with ``pd.read_csv``.
    2. Pre-process with ``f.getDummyVarPositionForK(data, 1)`` and
       ``f.cleanDataMatrix(data, withNull=True)``.
    3. Build the covariates, response and initial counts with
       ``f.createCovarsAndProbVectorForMutationType(data, mt)``.
    4. Print the number of candidate features (all column labels except
       the first).
    5. Run ``featureSelectionPairs`` and then ``analyze`` with
       ``stepNum=400000, plot=False, v=0.1``.

    Args:
        location: Directory holding the input file (joined with ``'/'``).
        file: Name of the CSV file inside ``location``.
        mt: Mutation-type key forwarded to the covariate builder.

    Returns:
        None.
    """
    data = pd.read_csv(location + '/' + file)
    data = f.getDummyVarPositionForK(data, 1)
    data = f.cleanDataMatrix(data, withNull=True)
    # The fourth value returned by the builder is not needed here.
    X, y_, initNum, _ = f.createCovarsAndProbVectorForMutationType(data, mt)

    # First column label is skipped (presumably an intercept/constant
    # column -- TODO confirm against the covariate builder).
    features = X.columns[1:]
    # Single-argument call form prints the same on Python 2 and Python 3.
    print(len(features))

    # The result is not used below; the call is retained because it ran
    # here before and may report or persist results -- TODO confirm
    # whether it is still required.
    featureSelectionPairs(X, y_, initNum, features)

    # Other follow-ups (featureSelection, analyzeSigFeatuesByCorrelation,
    # featureListToGraphMatrix) were disabled here and remain off.
    analyze(X, y_, initNum, stepNum=400000, plot=False, v=0.1)
def loadData(dataPath, mutType):
    """Load a CSV and return model-ready arrays for one mutation type.

    The file is read with ``pd.read_csv`` and passed through
    ``f.getDummyVarPositionForK(data, 2)`` and
    ``f.cleanDataMatrix(data, withNull=True)`` before
    ``f.createCovarsAndProbVectorForMutationType`` builds the covariates.

    Args:
        dataPath: Path of the CSV file; components are separated by ``"/"``.
        mutType: Mutation-type key forwarded to the covariate builder and
            appended to the returned ``name``.

    Returns:
        tuple: ``(X, y, initNum, features, name, folder)`` where

        * ``X`` is an ``np.matrix`` of the covariates without their first
          column,
        * ``y`` is an ``(n, 1)`` integer array holding ``y * initNum``
          rounded to the nearest integer,
        * ``initNum`` is an ``(n, 1)`` array,
        * ``features`` are the column labels matching the columns of ``X``,
        * ``name`` is the file name without its last extension, followed by
          ``"_" + mutType``,
        * ``folder`` is everything before the last ``"/"`` of ``dataPath``
          (empty string when there is no ``"/"``).
    """
    data = pd.read_csv(dataPath)

    folder, _, base = dataPath.rpartition("/")
    stem, dot, _ = base.rpartition(".")
    if not dot:
        # No extension at all: keep the whole file name instead of ending
        # up with an empty stem (which produced names like "_CT").
        stem = base
    name = stem + "_" + mutType

    data = f.getDummyVarPositionForK(data, 2)
    data = f.cleanDataMatrix(data, withNull=True)
    X, y, initNum, _ = f.createCovarsAndProbVectorForMutationType(
        data, mutType)

    # Labels and matrix both skip the first column so they stay aligned.
    features = X.columns[1:]
    X = np.matrix(X)[:, 1:]
    n = X.shape[0]

    # Assumes y holds proportions (count / initNum), so y * initNum should
    # be a whole number -- TODO confirm against the covariate builder.
    # Round before the integer cast: a plain cast truncates, turning e.g.
    # 0.29 * 100 == 28.999999999999996 into 28 instead of 29.
    y = np.array(np.rint(y * initNum), dtype=int).reshape(n, 1)
    initNum = np.array(initNum).reshape(n, 1)
    return X, y, initNum, features, name, folder
def analyseSample(location, file, oldNuc=1, newNuc=3):
    """Select significant features for one sample file and write them out.

    The CSV at ``location + file`` is pre-processed with
    ``f.getDummyVarPositionForK(data, 2)`` and ``f.cleanDataMatrix``, and the
    covariates are built for the mutation ``NUCS[oldNuc] + NUCS[newNuc]``.
    ``mcmc.featureSelectionPairs`` provides an initial list; when it is
    non-empty, ``mcmc.featureSelection`` refines it.

    Output files (all prefixed with
    ``location + file + "_old:<old>_new:<new>_"``):

    * ``sigFeatures.out`` -- one selected feature label per line, or a single
      ``#no sig features in the initial stage`` line when the initial list
      is empty.
    * ``resultSummary.csv`` -- written only when the initial list is
      non-empty.

    Args:
        location: Directory prefix; concatenated directly with ``file``, so
            it must already end with a path separator.
        file: Name of the input CSV; also used as the output prefix.
        oldNuc: Index into ``NUCS`` of the original nucleotide.
        newNuc: Index into ``NUCS`` of the mutated nucleotide.

    Returns:
        None.
    """
    data = pd.read_csv(location + file)
    data = f.getDummyVarPositionForK(data, 2)
    data = f.cleanDataMatrix(data, withNull=True)

    oldBase, newBase = NUCS[oldNuc], NUCS[newNuc]
    X, y_, initNum, _ = f.createCovarsAndProbVectorForMutationType(
        data, oldBase + newBase)
    features = X.columns[1:]

    # Shared tag for the run name and every output file name.
    tag = "_old:{}_new:{}_".format(oldBase, newBase)
    outPrefix = location + file + tag

    sigList = mcmc.featureSelectionPairs(X, y_, initNum, features)
    if len(sigList) == 0:
        # `with` guarantees the marker file is flushed and closed; the
        # handle used to be left open on this path.
        with open(outPrefix + "sigFeatures.out", 'w') as out:
            out.write("#no sig features in the initial stage")
        return

    sigFeatures, resultSummary = mcmc.featureSelection(
        X, y_, initNum, features, sigList, name=file + tag)
    # The first element of every returned entry is used to index `features`.
    sigFeatures = features[[s[0] for s in sigFeatures]]

    with open(outPrefix + "sigFeatures.out", 'w') as out:
        out.writelines(str(s) + "\n" for s in sigFeatures)
    resultSummary.to_csv(outPrefix + "resultSummary.csv")
def analyseSample(file, oldNuc=1, newNuc=3,
                  location='/sternadi/home/volume1/guyling/MCMC/dataSimulations/'):
    """Select significant features for one simulated sample and write them out.

    The CSV at ``location + file`` is pre-processed with
    ``f.getDummyVarPositionForK(data, 2)`` and ``f.cleanDataMatrix``, and the
    covariates are built for the mutation ``NUCS[oldNuc] + NUCS[newNuc]``.
    ``mcmc.featureSelectionPairs`` provides an initial list (printed); when
    it is non-empty, ``mcmc.featureSelection`` refines it and the refined
    result is printed as well.

    Output files (all prefixed with
    ``location + file + "_old:<old>_new:<new>_"``):

    * ``sigFeatures.out`` -- one selected feature label per line, or a single
      ``#no sig features in the initial stage`` line when the initial list
      is empty.
    * ``resultSummary.csv`` -- written only when the initial list is
      non-empty.

    Args:
        file: Name of the input CSV; also used as the output prefix.
        oldNuc: Index into ``NUCS`` of the original nucleotide.
        newNuc: Index into ``NUCS`` of the mutated nucleotide.
        location: Directory prefix, concatenated directly with ``file`` (so
            it must end with a path separator). Defaults to the directory
            that used to be hard-coded in this function.

    Returns:
        None.
    """
    data = pd.read_csv(location + file)
    data = f.getDummyVarPositionForK(data, 2)
    data = f.cleanDataMatrix(data, withNull=True)

    oldBase, newBase = NUCS[oldNuc], NUCS[newNuc]
    X, y_, initNum, _ = f.createCovarsAndProbVectorForMutationType(
        data, oldBase + newBase)
    features = X.columns[1:]

    # Shared tag for the run name and every output file name.
    tag = "_old:{}_new:{}_".format(oldBase, newBase)
    outPrefix = location + file + tag

    sigList = mcmc.featureSelectionPairs(X, y_, initNum, features)
    # Single-argument call form prints the same on Python 2 and Python 3.
    print(sigList)
    if len(sigList) == 0:
        # `with` guarantees the marker file is flushed and closed; the
        # handle used to be left open on this path.
        with open(outPrefix + "sigFeatures.out", 'w') as out:
            out.write("#no sig features in the initial stage")
        return

    sigFeatures, resultSummary = mcmc.featureSelection(
        X, y_, initNum, features, sigList, name=file + tag)
    print(sigFeatures)
    # The first element of every returned entry is used to index `features`.
    sigFeatures = features[[s[0] for s in sigFeatures]]

    with open(outPrefix + "sigFeatures.out", 'w') as out:
        out.writelines(str(s) + "\n" for s in sigFeatures)
    resultSummary.to_csv(outPrefix + "resultSummary.csv")