#if (not UPDATE) and (isdir(folder)): # i=1 # oldfolder = folder # while isdir(folder): # i+=1 # folder = oldfolder[:-1]+"_"+str(i)+'/' # print folder # mkdir(folder) if update: stop = raw_input("Loading from folder " + folder + " : Hit enter to proceed or ctrl+C to cancel") else: print "Initializing in folder " + folder """Load the train/test split information if update, else split and write out which images are in which dataset""" trainFs, testFs = getTrainTestSplit(update, folder) trainL = len(trainFs) testL = len(testFs) #if not UPDATE: # trainFs = ld[:int(numEx*trainTestSplit)] # testFs = ld[int(numEx*trainTestSplit):] # with open(folder+"traindata.csv",'wb') as f: # f.write('\n'.join(trainFs)) # with open(folder+"testdata.csv",'wb') as f: # f.write('\n'.join(testFs)) #else: # with open(folder+"traindata.csv",'rb') as f: # trainFs = f.read().split("\n") # with open(folder+"testdata.csv",'rb') as f: # testFs = f.read().split("\n")
batch_size = 32 #how many training examples per batch chunkSize = 50000 #how much data to ever load at once testChunkSize = 6000 #how many examples to evaluate per iteration run = "1" """Define the folder where the model will be stored based on the input arguments""" folder = helperFuncs.defineFolder(False, outType, size, run) print folder trainDirect = folder + "tempTrain/" testDirect = folder + "tempTest/" #if update: # stop = raw_input("Loading from folder "+folder+" : Hit enter to proceed or ctrl+C to cancel") #else: # print "Initializing in folder "+folder """Load the train/test split information if update, else split and write out which images are in which dataset""" trainFs, testFs = helperFuncs.getTrainTestSplit(False, folder, numEx, trainTestSplit, ld) trainL = len(trainFs) testL = len(testFs) print "number of examples: ", numEx print "training examples : ", trainL print "test examples : ", testL features, labels = helperFuncs.getTargets( "justbonds") #get the target vector for each CID outsize = helperFuncs.getOutSize(features) """DEFINE THE MODEL HERE""" model = Sequential() model.add(Convolution2D(8, 8, 8, input_shape=(1, size, size)))
# --- Python 2 script fragment (line structure reconstructed from a collapsed paste) ---
# Resolves staging directories, loads (or creates) the train/test split
# depending on `update`, and loads the OCR target vectors.
# NOTE(review): `folder`, `update`, `getTrainTestSplit`, `getOCRTargets`,
# `numEx`, `trainTestSplit`, `ld` are defined outside this visible chunk.

print folder
trainDirect = folder+"tempTrain/"   # staging dir for training images
testDirect = folder+"tempTest/"     # staging dir for test images

# Dead code kept for history: resume prompt, superseded by the caller's flow.
#if update:
#    stop = raw_input("Loading from folder "+folder+" : Hit enter to proceed or ctrl+C to cancel")
#else:
#    print "Initializing in folder "+folder

"""Load the train/test split information if update, else split and write out which images are in which dataset"""
if update:
    # Resuming: re-read the split that was written out on the first run.
    trainFs, testFs = getTrainTestSplit(update,folder)
else:
    # Fresh run: create the split from the shuffled listing and persist it.
    trainFs, testFs = getTrainTestSplit(update,folder,numEx,trainTestSplit,ld)
trainL = len(trainFs)
testL = len(testFs)

print "number of examples: ", numEx
print "training examples : ", trainL
print "test examples : ", testL

OCRfeatures,labels = getOCRTargets()   #get the ECFP vector for each CID
#testAverages(direct,OCRfeatures)      # determine the RMSE of guessing the mean
# Target width taken from an arbitrary entry; assumes all vectors are equal
# length — TODO confirm. (Py2-only idiom: dict.keys() indexing.)
outsize = len(OCRfeatures[OCRfeatures.keys()[0]])   #this is the size of the target (# of OCRfeatures)

"""If we are training a new model, define it"""
# --- Python 2 script fragment (line structure reconstructed from a collapsed paste) ---
# Resume-training setup: reopens an existing run folder (True flag), reloads the
# persisted split, targets, and normalization stats, then loads the saved model.
# NOTE(review): `helperFuncs`, `outType`, `size`, `run`, `isfile` come from
# outside this visible chunk; the final `while` body is also outside it.

"""Define parameters of the run"""
batch_size = 32 #how many training examples per batch

"""Define the folder where the model will be stored based on the input arguments"""
folder = helperFuncs.defineFolder(True,outType,size,run)
print folder
trainDirect = folder+"tempTrain/"     # staging dir for training images
trainNP = folder+"tempTrainNP/"       # staging dir for training numpy arrays
testDirect = folder+"tempTest/"       # staging dir for test images
testNP = folder+"tempTestNP/"         # staging dir for test numpy arrays

"""Load the train/test split information"""
trainFs, testFs = helperFuncs.getTrainTestSplit(True,folder)
trainL = len(trainFs)
testL = len(testFs)

features,labels = helperFuncs.getTargets(outType)   #get the OCR vector for each CID
# Target width taken from an arbitrary entry (Py2-only dict.keys() indexing);
# assumes all vectors are equal length — TODO confirm.
outsize = len(features[features.keys()[0]])         #this is the size of the target (# of OCRfeatures)
means,stds = helperFuncs.getMeansStds(features)     # per-dimension normalization stats

"""load model"""
model = helperFuncs.loadModel(folder+"wholeModel")

# Wait for the test tensor to appear on disk (producer runs elsewhere).
# NOTE(review): loop body is truncated in this chunk — continues past the
# visible source.
while not isfile(testNP+"Xtest.h5"):
batch_size = 32 #how many training examples per batch chunkSize = 50000 #how much data to ever load at once testChunkSize = 6000 #how many examples to evaluate per iteration """Define the folder where the model will be stored based on the input arguments""" folder = defineFolder(outType, size, lay1size, run, update) print folder trainDirect = folder + "tempTrain/" testDirect = folder + "tempTest/" #if update: # stop = raw_input("Loading from folder "+folder+" : Hit enter to proceed or ctrl+C to cancel") #else: # print "Initializing in folder "+folder """Load the train/test split information if update, else split and write out which images are in which dataset""" if update: trainFs, testFs = getTrainTestSplit(update, folder) else: trainFs, testFs = getTrainTestSplit(update, folder, numEx, trainTestSplit, ld) trainL = len(trainFs) testL = len(testFs) print "number of examples: ", numEx print "training examples : ", trainL print "test examples : ", testL OCRfeatures, labels = getOCRTargets() #get the ECFP vector for each CID #testAverages(direct,OCRfeatures) # determind the RMSE of guessing the mean outsize = len(OCRfeatures[ OCRfeatures.keys()[0]]) #this it the size of the target (# of OCRfeatures) """If we are training a new model, define it"""
# --- Python 2 script fragment (line structure reconstructed from a collapsed paste) ---
# Fresh-run setup for the "ocr" target type: creates the run folder, builds the
# train/test split, loads targets, and begins the Keras model definition.
# NOTE(review): `helperFuncs`, `outType`, `size`, `run`, `numEx`,
# `trainTestSplit`, `ld`, `Sequential` come from outside this visible chunk.

folder = helperFuncs.defineFolder(False,outType,size,run)
print folder
trainDirect = folder+"tempTrain/"   # staging dir for training images
testDirect = folder+"tempTest/"     # staging dir for test images

# Dead code kept for history: resume prompt from the update-capable variant.
#if update:
#    stop = raw_input("Loading from folder "+folder+" : Hit enter to proceed or ctrl+C to cancel")
#else:
#    print "Initializing in folder "+folder

"""Load the train/test split information if update, else split and write out which images are in which dataset"""
trainFs, testFs = helperFuncs.getTrainTestSplit(False,folder,numEx,trainTestSplit,ld)
trainL = len(trainFs)
testL = len(testFs)

print "number of examples: ", numEx
print "training examples : ", trainL
print "test examples : ", testL

features,labels = helperFuncs.getTargets("ocr")   #get the target vector for each CID
# Target width taken from an arbitrary entry (Py2-only dict.keys() indexing);
# assumes all vectors are equal length — TODO confirm.
outsize = len(features[features.keys()[0]])       #this is the size of the target (# of OCRfeatures)

"""DEFINE THE MODEL HERE"""
model = Sequential()
# --- Python 2 script fragment (line structure reconstructed from a collapsed paste) ---
# Full run-parameter setup for the solubility-prediction variant: scans the
# image directory, shuffles it, defines run constants, builds the split, and
# loads solubility targets. NOTE(review): `size`, `lay1size`, `run`, `update`,
# `listdir`, `shuffle`, `helperFuncs` come from outside this visible chunk.

"""Define parameters of the run"""
imdim = size - 20                                 #strip 10 pixels buffer from each size
direct = "../data/images" + str(size) + "/"       #directory containing the images
ld = listdir(direct)                              #contents of that directory
numEx = len(ld)                                   #number of images in the directory
shuffle(ld)                                       #shuffle the image list for randomness
outType = "solubility"                            #what the CNN is predicting
DUMP_WEIGHTS = True                               #will we dump the weights of conv layers for visualization
trainTestSplit = 0.90                             #percentage of data to use as training data
batch_size = 32                                   #how many training examples per batch
chunkSize = 50000                                 #how much data to ever load at once
testChunkSize = 5000                              #how many examples to evaluate per iteration

"""Define the folder where the model will be stored based on the input arguments"""
folder = helperFuncs.defineFolder(outType, size, lay1size, run)

"""Load the train/test split information if update, else split and write out which images are in which dataset"""
trainFs, testFs = helperFuncs.getTrainTestSplit(update, folder, numEx, trainTestSplit)
trainL = len(trainFs)
testL = len(testFs)

print "number of examples: ", numEx
print "training examples : ", trainL
print "test examples : ", testL

# Dead code kept for history: smaller debug-sized chunk parameters.
#batch_size = 32      #how many training examples per batch
#chunkSize = 5000     #how much data to ever load at once
#testChunkSize = 600  #how many examples to evaluate per iteration

numTrainEx = min(trainL, chunkSize)   # cap how many training examples are held in memory
targets = helperFuncs.getSolubilityTargets()   #get the solubility value for each CID
# NOTE(review): statement truncated in this chunk — the argument to len() and
# everything after continue past the visible source.
outsize = len(
# --- Python 2 script fragment (line structure reconstructed from a collapsed paste) ---
# Resume-training setup (variant of the chunk above): reopens an existing run
# folder, reloads split/targets/normalization stats, then loads the saved model.
# NOTE(review): `helperFuncs`, `outType`, `size`, `run` come from outside this
# visible chunk; getMeansStds() is called here with NO arguments, unlike the
# sibling chunk that passes `features` — confirm which signature is current.

"""Define parameters of the run"""
batch_size = 32 #how many training examples per batch

"""Define the folder where the model will be stored based on the input arguments"""
folder = helperFuncs.defineFolder(True,outType,size,run)
print folder
trainDirect = folder+"tempTrain/"     # staging dir for training images
trainNP = folder+"tempTrainNP/"       # staging dir for training numpy arrays
testDirect = folder+"tempTest/"       # staging dir for test images
testNP = folder+"tempTestNP/"         # staging dir for test numpy arrays

"""Load the train/test split information"""
trainFs, testFs = helperFuncs.getTrainTestSplit(True,folder)
trainL = len(trainFs)
testL = len(testFs)

features,labels = helperFuncs.getTargets(outType)   #get the OCR vector for each CID
# Target width taken from an arbitrary entry (Py2-only dict.keys() indexing);
# assumes all vectors are equal length — TODO confirm.
outsize = len(features[features.keys()[0]])         #this is the size of the target (# of OCRfeatures)
means,stds = helperFuncs.getMeansStds()             # per-dimension normalization stats

"""load model"""
# Dead code kept for history: the model used to be unpickled directly.
#with open(folder+"bestModel.pickle",'rb') as f:
#    model = cPickle.load(f)
model = helperFuncs.loadModel(folder+"wholeModel")
ld = listdir(direct) #contents of that directory numEx = len(ld) #number of images in the directory shuffle(ld) #shuffle the image list for randomness outType = "solubility" #what the CNN is predicting DUMP_WEIGHTS = True #will we dump the weights of conv layers for visualization trainTestSplit = 0.90 #percentage of data to use as training data batch_size = 32 #how many training examples per batch chunkSize = 50000 #how much data to ever load at once testChunkSize = 5000 #how many examples to evaluate per iteration """Define the folder where the model will be stored based on the input arguments""" folder = helperFuncs.defineFolder(outType,size,lay1size,run) """Load the train/test split information if update, else split and write out which images are in which dataset""" trainFs, testFs = helperFuncs.getTrainTestSplit(update,folder,numEx,trainTestSplit) trainL = len(trainFs) testL = len(testFs) print "number of examples: ", numEx print "training examples : ", trainL print "test examples : ", testL #batch_size = 32 #how many training examples per batch #chunkSize = 5000 #how much data to ever load at once #testChunkSize = 600 #how many examples to evaluate per iteration numTrainEx = min(trainL,chunkSize) targets = helperFuncs.getSolubilityTargets() #get the solubility value for each CID