#if (not UPDATE) and (isdir(folder)):
#    i=1
#    oldfolder = folder
#    while isdir(folder):
#        i+=1
#        folder  = oldfolder[:-1]+"_"+str(i)+'/'
#        print folder
#    mkdir(folder)

if update:
    stop = raw_input("Loading from folder " + folder +
                     " : Hit enter to proceed or ctrl+C to cancel")
else:
    print "Initializing in folder " + folder
"""Load the train/test split information if update, else split and write out which images are in which dataset"""
trainFs, testFs = getTrainTestSplit(update, folder)
trainL = len(trainFs)
testL = len(testFs)
#if not UPDATE:
#    trainFs = ld[:int(numEx*trainTestSplit)]
#    testFs  = ld[int(numEx*trainTestSplit):]
#    with open(folder+"traindata.csv",'wb') as f:
#        f.write('\n'.join(trainFs))
#    with open(folder+"testdata.csv",'wb') as f:
#        f.write('\n'.join(testFs))
#else:
#    with open(folder+"traindata.csv",'rb') as f:
#        trainFs = f.read().split("\n")
#    with open(folder+"testdata.csv",'rb') as f:
#        testFs  = f.read().split("\n")
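
# For reference, a minimal sketch of getTrainTestSplit based on the commented-out
# logic above (the real helperFuncs implementation may differ):
def getTrainTestSplit_sketch(update, folder, numEx=None, trainTestSplit=None, ld=None):
    if not update:
        trainFs = ld[:int(numEx*trainTestSplit)]    # first part of the shuffled list trains
        testFs  = ld[int(numEx*trainTestSplit):]    # the remainder tests
        with open(folder+"traindata.csv",'wb') as f:
            f.write('\n'.join(trainFs))             # record the split so later runs can reuse it
        with open(folder+"testdata.csv",'wb') as f:
            f.write('\n'.join(testFs))
    else:
        with open(folder+"traindata.csv",'rb') as f:
            trainFs = f.read().split("\n")          # reload the split written by the first run
        with open(folder+"testdata.csv",'rb') as f:
            testFs  = f.read().split("\n")
    return trainFs, testFs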
# Example #2
batch_size = 32  #how many training examples per batch
chunkSize = 50000  #how much data to ever load at once
testChunkSize = 6000  #how many examples to evaluate per iteration
run = "1"
"""Define the folder where the model will be stored based on the input arguments"""
folder = helperFuncs.defineFolder(False, outType, size, run)
print folder
trainDirect = folder + "tempTrain/"
testDirect = folder + "tempTest/"
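
# A hypothetical sketch of helperFuncs.defineFolder: it appears to build the model
# folder name from the run arguments and create it (plus the temp subfolders) when
# starting fresh. The naming scheme is an assumption, not taken from helperFuncs;
# isdir/mkdir are assumed imported from os.path/os as in the snippet at the top.
def defineFolder_sketch(update, outType, size, run):
    folder = "../models/" + outType + str(size) + "_" + str(run) + "/"  # assumed naming scheme
    if not update and not isdir(folder):
        mkdir(folder)                  # fresh run: create the folder tree
        mkdir(folder + "tempTrain/")
        mkdir(folder + "tempTest/")
    return folder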

#if update:
#    stop = raw_input("Loading from folder "+folder+" : Hit enter to proceed or ctrl+C to cancel")
#else:
#    print "Initializing in folder "+folder
"""Load the train/test split information if update, else split and write out which images are in which dataset"""
trainFs, testFs = helperFuncs.getTrainTestSplit(False, folder, numEx,
                                                trainTestSplit, ld)
trainL = len(trainFs)
testL = len(testFs)

print "number of examples: ", numEx
print "training examples : ", trainL
print "test examples : ", testL

features, labels = helperFuncs.getTargets("justbonds")  #get the target vector for each CID
outsize = helperFuncs.getOutSize(features)
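
# getOutSize presumably reduces to the pattern the other snippets below spell out
# explicitly; a one-line sketch (an assumption, not the helperFuncs source):
def getOutSize_sketch(features):
    return len(features[features.keys()[0]])  # length of any one target vector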
"""DEFINE THE MODEL HERE"""

model = Sequential()

model.add(Convolution2D(8, 8, 8, input_shape=(1, size, size)))
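
# The snippet stops after the first layer; a minimal sketch of how such a model is
# typically finished under the same old-Keras API, ending in a vector of size
# `outsize`. The layer choices are assumptions, not the author's architecture, and
# Activation, MaxPooling2D, Flatten, Dense are assumed imported alongside Convolution2D.
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(outsize))
model.add(Activation('sigmoid'))   # elementwise output for a binary target vector
model.compile(loss='binary_crossentropy', optimizer='adam')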
print folder
trainDirect     = folder+"tempTrain/"
testDirect      = folder+"tempTest/"

#if update:     
#    stop = raw_input("Loading from folder "+folder+" : Hit enter to proceed or ctrl+C to cancel")
#else:
#    print "Initializing in folder "+folder





"""Load the train/test split information if update, else split and write out which images are in which dataset"""
if update:
    trainFs, testFs     = getTrainTestSplit(update,folder)
else:
    trainFs, testFs     = getTrainTestSplit(update,folder,numEx,trainTestSplit,ld)
trainL  = len(trainFs)
testL   = len(testFs)


print "number of examples: ", numEx
print "training examples : ", trainL
print "test examples : ", testL

OCRfeatures,labels  = getOCRTargets() #get the ECFP vector for each CID
#testAverages(direct,OCRfeatures)   # determine the RMSE of guessing the mean
outsize             = len(OCRfeatures[OCRfeatures.keys()[0]]) #this is the size of the target (# of OCRfeatures)
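
# A hypothetical sketch of getOCRTargets (file name and CSV format are assumptions,
# not from the source): it returns {CID: target vector} plus the feature labels
# taken from the header row.
def getOCRTargets_sketch(csvPath="../data/OCRfeatures.csv"):
    feats = {}
    with open(csvPath, 'rb') as f:
        labels = f.readline().strip().split(",")[1:]   # header: CID, then feature names
        for line in f:
            parts = line.strip().split(",")
            feats[parts[0]] = [float(x) for x in parts[1:]]
    return feats, labels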

"""If we are training a new model, define it"""   
# Example #4

"""Define parameters of the run"""
batch_size      = 32                        #how many training examples per batch


"""Define the folder where the model will be stored based on the input arguments"""
folder          = helperFuncs.defineFolder(True,outType,size,run)
print folder
trainDirect     = folder+"tempTrain/"
trainNP         = folder+"tempTrainNP/"
testDirect      = folder+"tempTest/"
testNP          = folder+"tempTestNP/"

"""Load the train/test split information"""
trainFs, testFs     = helperFuncs.getTrainTestSplit(True,folder)

trainL  = len(trainFs)
testL   = len(testFs)


features,labels     = helperFuncs.getTargets(outType) #get the OCR vector for each CID
outsize             = len(features[features.keys()[0]]) #this is the size of the target (# of features)
means,stds          = helperFuncs.getMeansStds(features)
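
# A plausible sketch of getMeansStds for per-dimension normalization (hypothetical;
# assumes numpy is available as np):
def getMeansStds_sketch(features):
    mat = np.array(features.values())        # stack all target vectors row-wise
    return mat.mean(axis=0), mat.std(axis=0)
# Predictions/targets can then be normalized elementwise as (x - means) / stds.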


"""load model"""
model   = helperFuncs.loadModel(folder+"wholeModel")
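
# A hypothetical sketch of loadModel using the standard old-Keras save pattern
# (architecture as JSON next to an HDF5 weight file; the file names are assumptions):
def loadModel_sketch(path):
    from keras.models import model_from_json
    with open(path + ".json", 'rb') as f:
        model = model_from_json(f.read())   # rebuild the architecture
    model.load_weights(path + ".h5")        # restore the trained weights
    return model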


while not isfile(testNP+"Xtest.h5"):
    sleep(10)  # wait for the test chunk to be written out (assumes `from time import sleep`)
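
# Once the file appears, the arrays can be read back; a minimal sketch assuming
# h5py and a dataset key of "data" (both are assumptions, not from the source):
import h5py
with h5py.File(testNP + "Xtest.h5", "r") as h5f:
    Xtest = h5f["data"][:]   # load the whole test array into memory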
batch_size = 32  #how many training examples per batch
chunkSize = 50000  #how much data to ever load at once
testChunkSize = 6000  #how many examples to evaluate per iteration
"""Define the folder where the model will be stored based on the input arguments"""
folder = defineFolder(outType, size, lay1size, run, update)
print folder
trainDirect = folder + "tempTrain/"
testDirect = folder + "tempTest/"

#if update:
#    stop = raw_input("Loading from folder "+folder+" : Hit enter to proceed or ctrl+C to cancel")
#else:
#    print "Initializing in folder "+folder
"""Load the train/test split information if update, else split and write out which images are in which dataset"""
if update:
    trainFs, testFs = getTrainTestSplit(update, folder)
else:
    trainFs, testFs = getTrainTestSplit(update, folder, numEx, trainTestSplit,
                                        ld)
trainL = len(trainFs)
testL = len(testFs)

print "number of examples: ", numEx
print "training examples : ", trainL
print "test examples : ", testL

OCRfeatures, labels = getOCRTargets()  #get the ECFP vector for each CID
#testAverages(direct,OCRfeatures)   # determine the RMSE of guessing the mean
outsize = len(OCRfeatures[OCRfeatures.keys()[0]])  #this is the size of the target (# of OCRfeatures)
"""If we are training a new model, define it"""
folder          = helperFuncs.defineFolder(False,outType,size,run)
print folder
trainDirect     = folder+"tempTrain/"
testDirect      = folder+"tempTest/"

#if update:     
#    stop = raw_input("Loading from folder "+folder+" : Hit enter to proceed or ctrl+C to cancel")
#else:
#    print "Initializing in folder "+folder





"""Load the train/test split information if update, else split and write out which images are in which dataset"""
trainFs, testFs     = helperFuncs.getTrainTestSplit(False,folder,numEx,trainTestSplit,ld)
trainL  = len(trainFs)
testL   = len(testFs)


print "number of examples: ", numEx
print "training examples : ", trainL
print "test examples : ", testL

features,labels  = helperFuncs.getTargets("ocr")            #get the target vector for each CID
outsize             = len(features[features.keys()[0]])  #this is the size of the target (# of features)

"""DEFINE THE MODEL HERE"""  

model = Sequential()
# Example #7
"""Define parameters of the run"""
imdim = size - 20  #strip a 10-pixel buffer from each side
direct = "../data/images" + str(size) + "/"  #directory containing the images
ld = listdir(direct)  #contents of that directory
numEx = len(ld)  #number of images in the directory
shuffle(ld)  #shuffle the image list for randomness
outType = "solubility"  #what the CNN is predicting
DUMP_WEIGHTS = True  #will we dump the weights of conv layers for visualization
trainTestSplit = 0.90  #percentage of data to use as training data
batch_size = 32  #how many training examples per batch
chunkSize = 50000  #how much data to ever load at once
testChunkSize = 5000  #how many examples to evaluate per iteration
"""Define the folder where the model will be stored based on the input arguments"""
folder = helperFuncs.defineFolder(outType, size, lay1size, run)
"""Load the train/test split information if update, else split and write out which images are in which dataset"""
trainFs, testFs = helperFuncs.getTrainTestSplit(update, folder, numEx,
                                                trainTestSplit)
trainL = len(trainFs)
testL = len(testFs)

print "number of examples: ", numEx
print "training examples : ", trainL
print "test examples : ", testL

#batch_size      = 32            #how many training examples per batch
#chunkSize       = 5000          #how much data to ever load at once
#testChunkSize   = 600           #how many examples to evaluate per iteration
numTrainEx = min(trainL, chunkSize)
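
# How these sizes typically interact, as a sketch: stream at most chunkSize
# examples into memory at a time, fit in mini-batches of batch_size, then score
# on a testChunkSize-sized sample. The loop structure and the loadChunk helper
# are hypothetical, not the author's training loop.
def trainInChunks_sketch(model, trainFs, testFs, nb_passes=10):
    for _ in range(nb_passes):
        for start in range(0, len(trainFs), chunkSize):
            X, y = loadChunk(trainFs[start:start+chunkSize])   # hypothetical loader
            model.fit(X, y, batch_size=batch_size, nb_epoch=1) # old-Keras fit signature
        Xt, yt = loadChunk(testFs[:testChunkSize])
        print "test loss:", model.evaluate(Xt, yt, batch_size=batch_size)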

targets = helperFuncs.getSolubilityTargets()  #get the solubility value for each CID
outsize = len(targets[targets.keys()[0]])  #size of the target, following the pattern of the other examples

# Example #8

"""Define parameters of the run"""
batch_size      = 32                        #how many training examples per batch


"""Define the folder where the model will be stored based on the input arguments"""
folder          = helperFuncs.defineFolder(True,outType,size,run)
print folder
trainDirect     = folder+"tempTrain/"
trainNP         = folder+"tempTrainNP/"
testDirect      = folder+"tempTest/"
testNP          = folder+"tempTestNP/"

"""Load the train/test split information"""
trainFs, testFs     = helperFuncs.getTrainTestSplit(True,folder)

trainL  = len(trainFs)
testL   = len(testFs)


features,labels     = helperFuncs.getTargets(outType) #get the OCR vector for each CID
outsize             = len(features[features.keys()[0]]) #this is the size of the target (# of features)
means,stds          = helperFuncs.getMeansStds()


"""load model"""
#with open(folder+"bestModel.pickle",'rb') as f:
#    model     = cPickle.load(f)

model   = helperFuncs.loadModel(folder+"wholeModel")
ld      = listdir(direct)                   #contents of that directory
numEx   = len(ld)                           #number of images in the directory
shuffle(ld)                                 #shuffle the image list for randomness
outType = "solubility"                      #what the CNN is predicting
DUMP_WEIGHTS = True                         #will we dump the weights of conv layers for visualization
trainTestSplit   = 0.90                     #percentage of data to use as training data
batch_size      = 32                        #how many training examples per batch
chunkSize       = 50000                     #how much data to ever load at once      
testChunkSize   = 5000                      #how many examples to evaluate per iteration

"""Define the folder where the model will be stored based on the input arguments"""
folder     = helperFuncs.defineFolder(outType,size,lay1size,run)


"""Load the train/test split information if update, else split and write out which images are in which dataset"""
trainFs, testFs     = helperFuncs.getTrainTestSplit(update,folder,numEx,trainTestSplit)
trainL  = len(trainFs)
testL   = len(testFs)
   

print "number of examples: ", numEx
print "training examples : ", trainL
print "test examples : ", testL


#batch_size      = 32            #how many training examples per batch
#chunkSize       = 5000          #how much data to ever load at once      
#testChunkSize   = 600           #how many examples to evaluate per iteration
numTrainEx      = min(trainL,chunkSize)

targets           = helperFuncs.getSolubilityTargets()     #get the solubility value for each CID