def getEncoded(dataLocation="fakefasta.fasta"):
    """Read a FASTA file and return one-hot-encoded records with labels.

    Parameters
    ----------
    dataLocation : str
        Path to the FASTA file to read (default "fakefasta.fasta").

    Returns
    -------
    list
        One entry per sequence, shaped ``[id, onehot, labels]`` where
        ``id`` is the record's index, ``onehot`` is the result of
        ``encode1()`` on the sequence, and ``labels`` is the list of
        values from the record's label dict (in dict iteration order).
    """
    dataobj = readdata.readData(dataLocation, ["host", "fasta"])
    dataobj.cleanHost()  # normalize host labels before splitting
    xfull, yfull = dataobj.sepDataLabels()

    encoded = []
    # enumerate() replaces the range(len(...)) index loop
    for i, seq in enumerate(xfull):
        labels = list(yfull[i].values())  # label values in dict order
        encoded.append([i, encode1(seq), labels])
    return encoded
[0, 0, 1, 0, 0, 1] ]) # The output of the exclusive OR function follows. # In[4]: #output data #bpm of the track y = np.array([[0], [1], [0.5], [0.33], [0.66], [0.66], [0.33]]) print("ici") import readdata X, y = readdata.readData("who cares") print("ici", X.shape, y.shape) # The seed for the random generator is set so that it will return the same random numbers each time, which is sometimes useful for debugging. # In[5]: #Number_Repete = 1 Number_Sample = len(X[1, :]) #Number_Sample_Boucle = int(Number_Sample/Number_Repete) Number_Example = len(y) Number_Neurons = 2 #print("Nb sample", Number_Repete, Number_Sample, Number_Sample_Boucle)
import numpy as np
from keras.utils import to_categorical

import readdata


def oneHot(c):
    """Return a 26-element one-hot list for an uppercase letter.

    Assumes ``c`` is a single character in 'A'..'Z' (ord 65-90);
    anything outside that range raises IndexError or sets a wrong slot.
    """
    onehot = [0] * 26
    onehot[ord(c) - 65] = 1
    return onehot


plvlmax = 15
inp = "HA_t_complete_some.fasta"
outp = "phocs/phoc"

dataobj = readdata.readData(inp)
x, y = dataobj.sepDataLabels()

# Write one PHOC file per level (11..plvlmax): each output line is the
# record's PHOC vector followed by its label values, space-separated.
for lvl in range(11, plvlmax + 1):
    print("level = " + str(lvl))
    phocs = dataobj.dataToPhoc(x, lvl)
    # with-statement guarantees the file is closed even if a write fails
    # (the original goal.close() would be skipped on an exception)
    with open(outp + str(lvl) + ".txt", "w") as goal:
        # zip() pairs each PHOC with its label dict, replacing index math
        for phoc, labels in zip(phocs, y):
            # str(list) rendered as "a b c": drop commas and brackets
            fields = [str(phoc).replace(",", "")[1:-1]]
            fields.extend(str(labels[key]) for key in labels)
            goal.write(" ".join(fields) + "\n")
# 0/1 classification is hard to solve with linear regression, so we use
# logistic regression (via the sigmoid function) instead.
import numpy as np
import readdata
import plotdata
from computeCost import computeCost
from gradientDescent import logisticDerivative
from gradientDescent import gradientDescent
from scipy.optimize import fmin_bfgs

if __name__ == "__main__":
    (x, y, num) = readdata.readData()
    plotdata.plotPoints(x, y)

    # Prepend a row of ones so theta[0] acts as the intercept term.
    shape = x.shape
    numOfFeatures = shape[0]
    X = np.ones([numOfFeatures + 1, num])
    X[1:, :] = x[:, :]
    theta = np.zeros(numOfFeatures + 1)
    # Parenthesized print works under both Python 2 and Python 3;
    # the original bare "print expr" statement is Python-2-only syntax.
    print(computeCost(X, y, theta))

    # flag selects the solver: 0 -> hand-rolled gradient descent,
    # 1 -> scipy's BFGS optimizer.
    flag = 1
    if flag == 0:
        iterations = 100000
        alpha = 0.001
        plotdata.plotTheta(
            x, y, gradientDescent(X, y, theta, alpha, iterations))
    else:
        theta = fmin_bfgs(computeCost, np.transpose(theta), args=(X, y))
        plotdata.plotTheta(x, y, theta)
if np.random.uniform( ) < split: #chance of adding it to training or testing set (randomly) x.append(data[i]) e = dict(host=labelts[i]) y.append(e) else: xt.append(data[i]) e = dict(host=labelts[i]) yt.append(e) return (x, y, xt, yt) dataLocation = "HA_t_complete_human.fasta" print("reading data") dataobj = readdata.readData(dataLocation, ["host", "fasta"]) print("Cleaning labels") dataobj.cleanHost() print("separating data and labels") xfull, yfull = dataobj.sepDataLabels() print("translating data to phoc") xfull = dataobj.dataToPhoc(xfull, phoclevel, False) ''' #x and y are training data and training labels, xt and yt are test data and test labels, respectively x = [] y = [] xt = [] yt = [] #keep track of the indexes for debugging purposes trainIndexes = [] testIndexes = []