Beispiel #1
0
def fullAnalysis(location,file,mt):
    """Load one CSV table and run pair selection followed by ``analyze``.

    The file at ``location + '/' + file`` is read with pandas, passed through
    ``f.getDummyVarPositionForK(..., 1)`` and
    ``f.cleanDataMatrix(..., withNull=True)``, and then converted by
    ``f.createCovarsAndProbVectorForMutationType`` into the ``X``, ``y_`` and
    ``initNum`` values handed to the two analysis steps.

    Args:
        location: Directory of the input file (joined to ``file`` with '/').
        file: Name of the CSV file inside ``location``.
        mt: Mutation type forwarded to
            ``f.createCovarsAndProbVectorForMutationType``.

    Returns:
        None.
    """
    csvPath = '/'.join((location, file))
    rawTable = pd.read_csv(csvPath)
    dummyTable = f.getDummyVarPositionForK(rawTable, 1)
    cleanTable = f.cleanDataMatrix(dummyTable, withNull=True)

    # The fourth returned value is not needed by this driver.
    prepared = f.createCovarsAndProbVectorForMutationType(cleanTable, mt)
    X, y_, initNum, _unused = prepared

    # The first column of X is not treated as a feature
    # (presumably an intercept/constant column -- TODO confirm).
    featureNames = X.columns[1:]

    # The returned list is not consumed here; the call is kept as in the
    # original flow.
    featureSelectionPairs(X, y_, initNum, featureNames)

    analyze(X, y_, initNum, stepNum=400000, plot=False, v=0.1)
Beispiel #2
0
def fullAnalysis(location,file,mt):
    """Load one CSV table and run pair selection followed by ``analyze``.

    The file at ``location + '/' + file`` is read with pandas, passed through
    ``f.getDummyVarPositionForK(..., 1)`` and
    ``f.cleanDataMatrix(..., withNull=True)``, and then converted by
    ``f.createCovarsAndProbVectorForMutationType`` into the ``X``, ``y_`` and
    ``initNum`` values handed to the two analysis steps.

    Args:
        location: Directory of the input file (joined to ``file`` with '/').
        file: Name of the CSV file inside ``location``.
        mt: Mutation type forwarded to
            ``f.createCovarsAndProbVectorForMutationType`` (for example
            ``'CT'``).

    Returns:
        None. The number of candidate features is printed as a side effect.
    """
    data = pd.read_csv(location + '/' + file)
    data = f.getDummyVarPositionForK(data, 1)
    data = f.cleanDataMatrix(data, withNull=True)

    # The fourth returned value is not needed by this driver.
    X, y_, initNum, _unused = f.createCovarsAndProbVectorForMutationType(data, mt)

    # The first column of X is not treated as a feature
    # (presumably an intercept/constant column -- TODO confirm).
    features = X.columns[1:]
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print(len(features))

    # The returned list is not consumed here; the call is kept as in the
    # original flow.
    featureSelectionPairs(X, y_, initNum, features)

    analyze(X, y_, initNum, stepNum=400000, plot=False, v=0.1)
#fullAnalysis()
Beispiel #3
0
def loadData(dataPath, mutType):
    """Read a CSV table and return model-ready arrays for one mutation type.

    Args:
        dataPath: '/'-separated path of the CSV file to read.
        mutType: Mutation type forwarded to
            ``f.createCovarsAndProbVectorForMutationType``; it is also
            appended to the returned ``name``.

    Returns:
        Tuple ``(X, y, initNum, features, name, folder)``:

        * ``X`` -- ``np.matrix`` of the covariates without their first column.
        * ``y`` -- integer array of shape ``(n, 1)`` holding ``y * initNum``
          rounded to the nearest integer.
        * ``initNum`` -- array of shape ``(n, 1)``.
        * ``features`` -- column labels of the covariates, first one skipped.
        * ``name`` -- file name with its last extension removed, followed by
          ``"_" + mutType``.
        * ``folder`` -- the part of ``dataPath`` before the last '/'
          (empty string when there is none).
    """
    data = pd.read_csv(dataPath)

    # Split "<folder>/<file>.<ext>" on the last '/' and the last '.'.
    folder, _, fileName = dataPath.rpartition("/")
    name = fileName.rpartition(".")[0] + "_" + mutType

    data = f.getDummyVarPositionForK(data, 2)
    data = f.cleanDataMatrix(data, withNull=True)
    X, y, initNum, D = f.createCovarsAndProbVectorForMutationType(
        data, mutType)

    # The first column is not treated as a feature
    # (presumably an intercept/constant column -- TODO confirm).
    features = X.columns[1:]
    X = np.matrix(X)[:, 1:]
    n = X.shape[0]

    # y * initNum is meant to be a whole count (assumes y is a proportion of
    # initNum -- TODO confirm), but float products such as (1/49.)*49 land
    # just below the integer. Round to nearest before the integer cast so
    # those cases are not truncated down by one.
    y = np.array(np.rint(y * initNum), dtype=int).reshape(n, 1)
    initNum = np.array(initNum).reshape(n, 1)
    return X, y, initNum, features, name, folder
Beispiel #4
0
def analyseSample(location, file,oldNuc=1,newNuc=3):
    """Run two-stage feature selection for one sample and write the results.

    The CSV at ``location + file`` is prepared with the ``f`` helpers and
    screened with ``mcmc.featureSelectionPairs``. If the screen returns
    anything, ``mcmc.featureSelection`` is run and two files are written next
    to the input: ``..._sigFeatures.out`` (one selected feature label per
    line) and ``..._resultSummary.csv``. Otherwise only the ``.out`` file is
    written, containing a single marker line.

    Args:
        location: Path prefix of the input file; it is concatenated with
            ``file`` directly, so it must already end with a separator.
        file: Name of the CSV file.
        oldNuc: Index into ``NUCS`` of the original nucleotide.
        newNuc: Index into ``NUCS`` of the mutated nucleotide.

    Returns:
        None.
    """
    data = pd.read_csv(location + file)
    data = f.getDummyVarPositionForK(data, 2)
    data = f.cleanDataMatrix(data, withNull=True)
    X, y_, initNum, D = f.createCovarsAndProbVectorForMutationType(
        data, NUCS[oldNuc] + NUCS[newNuc])
    # The first column is not treated as a feature
    # (presumably an intercept/constant column -- TODO confirm).
    features = X.columns[1:]

    sigList = mcmc.featureSelectionPairs(X, y_, initNum, features)

    # Shared by the run name and both output file names.
    tag = "_old:{}_new:{}_".format(NUCS[oldNuc], NUCS[newNuc])
    sigFeaturesPath = location + file + tag + "sigFeatures.out"

    if len(sigList) > 0:
        sigFeatures, resultSummary = mcmc.featureSelection(
            X, y_, initNum, features, sigList, name=file + tag)

        # Each selected entry carries the feature's position in its first
        # element; map those positions back to labels.
        sigFeatures = features[[s[0] for s in sigFeatures]]
        # 'with' guarantees the handle is closed even if a write fails.
        with open(sigFeaturesPath, 'w') as out:
            for s in sigFeatures:
                out.write(str(s) + "\n")
        resultSummary.to_csv(location + file + tag + "resultSummary.csv")
    else:
        # This handle used to be left open (never closed or flushed
        # explicitly).
        with open(sigFeaturesPath, 'w') as out:
            out.write("#no sig features in the initial stage")
def analyseSample(file, oldNuc=1, newNuc=3,
                  location='/sternadi/home/volume1/guyling/MCMC/dataSimulations/'):
    """Run two-stage feature selection for one sample and write the results.

    The CSV at ``location + file`` is prepared with the ``f`` helpers and
    screened with ``mcmc.featureSelectionPairs``. If the screen returns
    anything, ``mcmc.featureSelection`` is run and two files are written next
    to the input: ``..._sigFeatures.out`` (one selected feature label per
    line) and ``..._resultSummary.csv``. Otherwise only the ``.out`` file is
    written, containing a single marker line. The intermediate selections
    are printed.

    Args:
        file: Name of the CSV file inside ``location``.
        oldNuc: Index into ``NUCS`` of the original nucleotide.
        newNuc: Index into ``NUCS`` of the mutated nucleotide.
        location: Path prefix of the input and output files; it is
            concatenated with ``file`` directly, so it must end with a
            separator. Defaults to the previously hard-coded directory.

    Returns:
        None.
    """
    data = pd.read_csv(location + file)
    data = f.getDummyVarPositionForK(data, 2)
    data = f.cleanDataMatrix(data, withNull=True)
    X, y_, initNum, D = f.createCovarsAndProbVectorForMutationType(
        data, NUCS[oldNuc] + NUCS[newNuc])
    # The first column is not treated as a feature
    # (presumably an intercept/constant column -- TODO confirm).
    features = X.columns[1:]

    sigList = mcmc.featureSelectionPairs(X, y_, initNum, features)
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print(sigList)

    # Shared by the run name and both output file names.
    tag = "_old:{}_new:{}_".format(NUCS[oldNuc], NUCS[newNuc])
    sigFeaturesPath = location + file + tag + "sigFeatures.out"

    if len(sigList) > 0:
        sigFeatures, resultSummary = mcmc.featureSelection(
            X, y_, initNum, features, sigList, name=file + tag)
        print(sigFeatures)

        # Each selected entry carries the feature's position in its first
        # element; map those positions back to labels.
        sigFeatures = features[[s[0] for s in sigFeatures]]
        # 'with' guarantees the handle is closed even if a write fails.
        with open(sigFeaturesPath, 'w') as out:
            for s in sigFeatures:
                out.write(str(s) + "\n")
        resultSummary.to_csv(location + file + tag + "resultSummary.csv")
    else:
        # This handle used to be left open (never closed or flushed
        # explicitly).
        with open(sigFeaturesPath, 'w') as out:
            out.write("#no sig features in the initial stage")