def generateorientedmatforonespecies(repository,speciesname,endofthefile):
    """Build, or load from cache, the orientation-aware TE adjacency matrix.

    The matrix has shape ``(2*L, 2*L)`` with ``L = len(TElist)``.  Index ``k``
    stands for ``TElist[k]``; index ``k + L`` is used for the same element
    when field 6 of its annotation line starts with ``"C"``.

    For every pair of consecutive annotation lines of a chromosome file whose
    gap (begin of the current element minus end of the previous one) is below
    10000, ``mat[previous, current]`` is incremented.  When the two elements
    overlap (current begin < previous end), ``mat[current, previous]`` is
    incremented too.

    Parameters:
        repository: path prefix prepended to every file name (plain string
            concatenation, so it must end with a separator if it is a
            directory).
        speciesname: species key; used to build the cache file names and to
            look up the chromosome names in ``utils.dictchr``.
        endofthefile: suffix appended to ``repository + chromosome`` to get
            the annotation file of one chromosome.

    Returns:
        ``(mat, TElist, TEdict, TEfamilydict)``.

    NOTE(review): the cache file ``<species>.TElearnmatrix.mat`` is the same
    path used by ``generatebasicmatforonespecies``, whose matrix is
    ``(L, L)``.  Whichever function runs first wins the cache; confirm
    whether the two are meant to share it.
    """
    Matpath = repository + speciesname + ".TElearnmatrix.mat"
    TEpath = repository + speciesname + ".TElist"
    DEpath = repository + speciesname + ".TEdict"
    DEfamilypath = repository + speciesname + ".TEfamilydict"

    # List of TE names (gives the matrix indices) and TE -> family mapping.
    if op.exists(TEpath):
        TElist = utils.loadfilelist(TEpath)
        TEfamilydict = utils.loadstrfiledict(DEfamilypath)
    else:
        TElist, TEfamilydict = utils.generatelistofTEforaspecies(
            repository, speciesname, endofthefile)

    if op.exists(Matpath):
        # loadmat returns a dict; np.asarray guarantees a plain ndarray.
        mat = np.asarray(scpio.loadmat(Matpath)['mat'])
        TEdict = utils.loadfiledict(DEpath)
        return mat, TElist, TEdict, TEfamilydict

    L = len(TElist)
    mat = np.zeros((2 * L, 2 * L))
    chrlist = utils.dictchr[speciesname]  # only the chromosomes of interest
    print("==========>generation de la matrice d apprentisage")
    for chrname in chrlist:
        filename = repository + chrname + endofthefile
        print("=====>nom du fichier de chr: ", filename)
        with open(filename, "r") as filein:
            filein.readline()  # first line is skipped (header)
            previdx = None     # matrix index of the previous element
            prevend = None     # end coordinate of the previous element
            for line in filein:
                ls = line.split()
                if not ls:
                    continue  # tolerate blank lines
                curidx = TElist.index(ls[1])
                if ls[6][0] == "C":
                    # orientation decision: use the second half of the index
                    # range for elements flagged "C"
                    curidx += L
                curend = float(ls[5])
                if previdx is not None:
                    curbegin = float(ls[4])
                    if (curbegin - prevend) < 10000:  # segmentation limit
                        mat[previdx, curidx] += 1
                        if curbegin < prevend:
                            # inclusion of one TE in another: count both ways
                            mat[curidx, previdx] += 1
                previdx, prevend = curidx, curend

    # NOTE(review): the sibling function calls this helper with five
    # arguments (mat, TElist, TEdict, flag, L) and unpacks two results; the
    # helper is not visible here, so the original four-argument call is kept.
    mat = filteramatrixfromrichrepeatelement(mat, TElist, True, L)

    # savemat needs a dict; the cache branch above reads the 'mat' key.
    toto = {}
    toto['mat'] = mat
    scpio.savemat(Matpath, toto)

    # This pass does not compute per-TE counts.  Return the counts file when
    # it exists (same source as the cache branch), otherwise an empty dict,
    # so the return statement never hits an unbound name.
    TEdict = utils.loadfiledict(DEpath) if op.exists(DEpath) else {}
    return mat, TElist, TEdict, TEfamilydict
def generateTEmatrixforaspecies(repository,speciesname,endofthefile,chrsizepath,resolution):
    """Build the binary TE-by-genomic-bin presence matrix and save it.

    Row ``r`` corresponds to ``TElist[r]``.  Columns are bins of width
    ``resolution``; the bin of a coordinate is ``coordinate / resolution``
    plus the chromosome offset ``sizedict["HicChrBegin" + <chromosome>]``.
    Every bin touched by an annotated element is set to 1.  The result is
    written as a sparse CSR matrix under the key ``'mat'`` to
    ``<repository><speciesname>.TEmatrix`` with ``scipy.io.savemat``.

    Parameters:
        repository: path prefix prepended to every file name.
        speciesname: species key; used for file names and ``utils.dictchr``.
        endofthefile: suffix of the per-chromosome annotation files.
        chrsizepath: path handed to ``utils.loadchrsizedict``.
        resolution: bin width; cast with ``int`` because it may arrive as a
            command-line string.

    Returns:
        None.  The matrix is only written to disk.

    Raises:
        IndexError: if an element falls outside the
            ``sizedict["HicUsedTotalSize"]`` columns.
    """
    resolution = int(resolution)  # cast of argv
    TEpath = repository + speciesname + ".TElist"
    Matpath = repository + speciesname + ".TEmatrix"

    # Only the TE list is needed here; the family mapping is not used.
    if op.exists(TEpath):
        TElist = utils.loadfilelist(TEpath)
    else:
        TElist, _ = utils.generatelistofTEforaspecies(
            repository, speciesname, endofthefile)

    sizedict = utils.loadchrsizedict(chrsizepath, resolution)
    # TODO (from original): choose between ["HicUsedTotalSize"] and
    # ["HicChrBeginchrX"] for the matrix width.
    matcolsize = sizedict["HicUsedTotalSize"]
    mat = np.zeros((len(TElist), matcolsize))
    chrlist = utils.dictchr[speciesname]  # only the chromosomes of interest
    print(matcolsize)
    print(chrlist)

    for chrname in chrlist:
        filename = repository + chrname + endofthefile
        with open(filename, "r") as filein:
            filein.readline()  # first line is skipped (header)
            chrbegin = None
            for line in filein:
                ls = line.split()
                if not ls:
                    continue  # tolerate blank lines
                if chrbegin is None:
                    # The offset is taken once, from the chromosome name of
                    # the first data line of the file.
                    chrbegin = float(sizedict["HicChrBegin" + ls[0]])
                begin = (float(ls[4]) / resolution) + chrbegin
                end = (float(ls[5]) / resolution) + chrbegin
                row = TElist.index(ls[1])
                # NumPy rejects float indices, so truncate the starting bin
                # explicitly (same bin the old implicit truncation picked).
                firstbin = int(begin)
                nbins = int(np.ceil(end - begin))
                for offset in range(nbins):
                    mat[row, firstbin + offset] = 1

    # savemat needs a dict; store the matrix sparse to keep the file small.
    toto = {}
    toto['mat'] = sparse.csr_matrix(mat)
    scpio.savemat(Matpath, toto)
def makeTEPieChart(repository, speciesname, endofthefile, chrsizepath):
    """Write the total annotated length per TE family to a tab-separated file.

    For every annotation line, ``end - begin`` (fields 5 and 4) is added to
    the entry of the element's family, looked up through ``TEfamilydict``
    with field 1 as key.  The remainder of the genome (sum of
    ``sizedict[chromosome]`` minus the annotated total) is stored under the
    key ``"Autre"``.  The table is written to
    ``<repository><speciesname>TEproportion`` with the header
    ``TEtype<TAB>Quantity``.

    Parameters:
        repository: path prefix prepended to every file name.
        speciesname: species key; used for file names and ``utils.dictchr``.
        endofthefile: suffix of the per-chromosome annotation files.
        chrsizepath: path handed to ``utils.loadchrsizedict`` (resolution 1,
            i.e. no binning).

    Returns:
        None.
    """
    outname = repository + speciesname + "TEproportion"
    sizedict = utils.loadchrsizedict(chrsizepath, 1)  # resolution=1: no binning

    # Only the TE -> family mapping is needed; use the cached file when the
    # TE list file exists, otherwise generate it.
    TEpath = repository + speciesname + ".TElist"
    DEfamilypath = repository + speciesname + ".TEfamilydict"
    if op.exists(TEpath):
        TEfamilydict = utils.loadstrfiledict(DEfamilypath)
    else:
        _, TEfamilydict = utils.generatelistofTEforaspecies(
            repository, speciesname, endofthefile)
    # The second value returned by the helper is not used here.
    ProportionDict, _ = reservefamilydict(TEfamilydict)

    sumchrtot = 0  # total chromosome length
    sumTE = 0      # total annotated length, to derive the unannotated part
    # Chromosome names are expected to match the keys of sizedict.
    chrlist = utils.dictchr[speciesname]
    for z in chrlist:
        filename = repository + z + endofthefile
        print("=====>nom du fichier de chr: ", filename)
        with open(filename, "r") as filein:
            filein.readline()  # first line: annotation header
            for line in filein:
                ls = line.split()
                if not ls:
                    continue  # tolerate blank lines
                val = float(ls[5]) - float(ls[4])
                if val < 0:
                    # Reported but still accumulated, as before.
                    print("Danger")
                ProportionDict[TEfamilydict[ls[1]]] += val
                sumTE += val
        sumchrtot += sizedict[z]

    ProportionDict["Autre"] = sumchrtot - sumTE

    with open(outname, "w") as fout:
        print("nom du repertoire de sortie", outname)
        fout.write("TEtype\tQuantity\n")
        for family in ProportionDict:
            fout.write(f"{family}\t{ProportionDict[family]}\n")
def generatebasicmatforonespecies(repository,speciesname,endofthefile):
    """Build, or load from cache, the TE adjacency matrix and per-TE counts.

    The matrix has shape ``(L, L)`` with ``L = len(TElist)``.  For every pair
    of consecutive annotation lines of a chromosome file whose gap (begin of
    the current element minus end of the previous one) is below 5000:

    * if the two overlap (current begin < previous end),
      ``mat[previous, current] += 2`` and ``mat[current, previous] += 1``,
      and both lines are appended, tab-joined, to
      ``<repository><speciesname>insertionTElist``;
    * otherwise ``mat[previous, current] += 1``.

    ``TEdict`` is updated through ``updateTEdict`` with the same weights, and
    ``updateTEcomplexity`` is fed field 3 of every line.  Both helpers are
    defined elsewhere in the file.  After filtering, the counts are written
    to ``<species>.TEdict`` and the matrix to ``<species>.TElearnmatrix.mat``.

    When the matrix file already exists it is loaded together with the counts
    file, and nothing is regenerated or rewritten.  In particular the
    insertion list from the earlier run is left untouched.

    Parameters:
        repository: path prefix prepended to every file name.
        speciesname: species key; used for file names and ``utils.dictchr``.
        endofthefile: suffix of the per-chromosome annotation files.

    Returns:
        ``(mat, TElist, TEdict, TEfamilydict)``.
    """
    TEpath = repository + speciesname + ".TElist"
    DEpath = repository + speciesname + ".TEdict"
    DEfamilypath = repository + speciesname + ".TEfamilydict"
    Matpath = repository + speciesname + ".TElearnmatrix.mat"
    insertionpath = repository + speciesname + "insertionTElist"

    # List of TE names (gives the matrix indices) and TE -> family mapping.
    if op.exists(TEpath):
        TElist = utils.loadfilelist(TEpath)
        TEfamilydict = utils.loadstrfiledict(DEfamilypath)
    else:
        TElist, TEfamilydict = utils.generatelistofTEforaspecies(
            repository, speciesname, endofthefile)

    if op.exists(Matpath):
        # loadmat returns a dict; np.asarray guarantees a plain ndarray.
        mat = np.asarray(scpio.loadmat(Matpath)['mat'])
        TEdict = utils.loadfiledict(DEpath)
        return mat, TElist, TEdict, TEfamilydict

    L = len(TElist)
    mat = np.zeros((L, L))
    TEdict = dict()
    TEdict["sum"] = 0
    chrlist = utils.dictchr[speciesname]  # only the chromosomes of interest
    print("==========>generation de la matrice d apprentisage")
    TEf = asum = 0  # accumulators for the vocabulary complexity

    # The insertion list is opened only when the matrix is regenerated, so a
    # cache hit does not truncate the list written by the earlier run.
    with open(insertionpath, "w") as insertionouput:
        for z in chrlist:
            filename = repository + z + endofthefile
            print("=====>nom du fichier de chr: ", filename)
            with open(filename, "r") as filein:
                filein.readline()  # first line is skipped (header)
                previdx = None     # index in TElist of the previous element
                prevend = None     # end coordinate of the previous element
                prevline = None    # raw text of the previous line
                for line in filein:
                    ls = line.split()
                    if not ls:
                        continue  # tolerate blank lines
                    TEf, asum = updateTEcomplexity(ls[3], TEf, asum)
                    curidx = TElist.index(ls[1])
                    curend = float(ls[5])
                    if previdx is None:
                        # First element of the chromosome: counted once.
                        updateTEdict(TEdict, TElist[curidx], 1)
                    else:
                        curbegin = float(ls[4])
                        if (curbegin - prevend) < 5000:  # segmentation limit
                            if curbegin < prevend:
                                # Inclusion of one TE in another: weight 2
                                # forward, 1 backward, mirrored in TEdict.
                                mat[previdx, curidx] += 2
                                mat[curidx, previdx] += 1
                                updateTEdict(TEdict, TElist[previdx], 2)
                                updateTEdict(TEdict, TElist[curidx], 1)
                                insertionouput.write(
                                    prevline.strip("\n") + "\t" + line)
                            else:
                                mat[previdx, curidx] += 1
                                updateTEdict(TEdict, TElist[curidx], 1)
                    previdx, prevend, prevline = curidx, curend, line

    # Filter the matrix and the counts; per the original note, after this
    # step the "N nodes, N-1 edges" relation is no longer conserved.
    mat, TEdict = filteramatrixfromrichrepeatelement(
        mat, TElist, TEdict, False, L)

    # Save the counts: one line per TE of the list, then the "sum" entry.
    with open(DEpath, "w") as dout:
        for name in TElist:
            if name not in TEdict:
                # Present in the dataset but absent from the learning graph.
                TEdict[name] = 0
            dout.write(name + "\t" + str(TEdict[name]) + "\n")
        dout.write("sum\t" + str(TEdict["sum"]) + "\n")

    # savemat needs a dict; the cache branch above reads the 'mat' key.
    toto = {}
    toto['mat'] = mat
    scpio.savemat(Matpath, toto)

    # asum stays 0 when no annotation line was parsed; avoid dividing by it.
    complexity = TEf / asum if asum else float("nan")
    print("====>Complexite du vocabulaire de l'espece: ", str(complexity), " <=======")
    print("====> somme apres generation de la matrice", np.sum(mat))
    return mat, TElist, TEdict, TEfamilydict