Ejemplo n.º 1
0
def generatecouplemat(pathTEcouple,repository,speciesname,endofthefile,chrsizepath,resolution):
    """Build the TE-couple matrix of a species and save it in MATLAB format.

    Every non-blank line of ``pathTEcouple`` is split on whitespace and its
    first two fields are taken as the names of two TEs. For each line a zeroed
    ``(1, matcolsize)`` vector is passed through ``completetheline`` once per
    TE, and the resulting rows are stacked (one row per line, file order).
    The matrix is written under the key ``'mat'`` with ``scipy.io.savemat``
    to ``repository + speciesname + ".TEcouplematrix"``.

    NOTE(review): the second TE of the *first* couple is written with weight
    -1 while every later couple uses +1. This asymmetry is kept exactly as it
    was; confirm which sign is intended.

    Args:
        pathTEcouple: path of the text file listing the TE couples.
        repository: prefix concatenated directly with the file names.
        speciesname: species identifier used in the file names.
        endofthefile: suffix forwarded to ``generateTElistedposdict``.
        chrsizepath: path forwarded to ``utils.loadchrsizedict``.
        resolution: bin size; converted with ``int()`` (may come from argv).

    Raises:
        ValueError: if the couple file contains no usable line.
        IndexError: if a non-blank line has fewer than two fields.
    """
    resolution = int(resolution)  # cast of argv
    Matpath = repository + speciesname + ".TEcouplematrix"
    TEposdict = generateTElistedposdict(repository, speciesname, endofthefile, resolution)
    print(TEposdict.keys())
    sizedict = utils.loadchrsizedict(chrsizepath, resolution)
    # TODO: width of the matrix, either ["HicChrBeginchrX"] or ["HicUsedTotalSize"]
    matcolsize = sizedict["HicChrBeginchrX"]

    # Rows are collected and concatenated once at the end instead of
    # re-copying the whole matrix for every input line.
    rows = []
    with open(pathTEcouple, "r") as filecouple:
        for l in filecouple:
            ls = l.split()
            if not ls:
                # blank line (typically a trailing newline): nothing to encode
                continue
            if rows:
                # progress counter starts at 0 on the second couple
                print("===>line", len(rows) - 1)
                second_weight = 1
            else:
                second_weight = -1  # see NOTE(review) in the docstring
            vi = np.zeros((1, matcolsize))
            vi = completetheline(ls[0], TEposdict[ls[0]], sizedict, vi, 1)
            vi = completetheline(ls[1], TEposdict[ls[1]], sizedict, vi, second_weight)
            rows.append(vi)

    if not rows:
        raise ValueError("no TE couple found in " + str(pathTEcouple))
    mat = np.concatenate(rows, axis=0)

    toto = {}  # savemat expects a dict of named variables
    toto['mat'] = mat
    scpio.savemat(Matpath, toto)
Ejemplo n.º 2
0
def gccontentforonechr(fastarep,chrsizepath,species,GCoutrep,resolution):
    """Compute a per-bin GC vector over all chromosomes of a species and save it.

    For each chromosome name in ``utils.dictchr[species]`` the file
    ``fastarep + name + ".fa"`` is read, its first line (header) is skipped
    and the remaining lines are joined into one sequence string. The sequence
    is cut into consecutive windows of ``resolution`` characters and
    ``gcpercent`` of each window is stored in the next free slot of a vector
    of length ``sizedict["HicUsedTotalSize"]``. Chromosomes are written one
    after another in the same vector.

    The vector is saved twice: as ``'mat'`` in
    ``GCoutrep + str(resolution) + "pbGCvec.mat"`` and through
    ``utils.savematrixasfilelist3`` in
    ``GCoutrep + str(resolution) + "pbGCvec.csv"``.

    NOTE(review): a window is only scored when it ends strictly before the
    end of the sequence, so the last (full or partial) window of a chromosome
    is skipped unless the whole chromosome is shorter than ``resolution``.
    This is unchanged from the previous implementation; confirm it matches
    the bin count used to compute ``HicUsedTotalSize``.

    Args:
        fastarep: prefix concatenated directly with the chromosome file names.
        chrsizepath: path forwarded to ``utils.loadchrsizedict``.
        species: key into ``utils.dictchr``.
        GCoutrep: prefix of the two output files.
        resolution: window size; converted with ``int()``.
    """
    resolution = int(resolution)
    sizedict = utils.loadchrsizedict(chrsizepath, resolution)
    matcolsize = sizedict["HicUsedTotalSize"]  # alternative key: HicChrBeginchr2
    mat = np.zeros(matcolsize)
    k = 0  # next free slot in mat, shared by all chromosomes
    for chrname in utils.dictchr[species]:
        print(chrname)
        with open(fastarep + chrname + ".fa", "r") as fasta:
            next(fasta, "")  # skip the header line
            s = "".join(line.replace("\n", "") for line in fasta)
        if resolution > len(s):
            # sequence shorter than one window: score it as a single bin
            mat[k] = gcpercent(s)
            k += 1
        # windows [start, start + resolution) that end strictly before len(s)
        for start in range(0, len(s) - resolution, resolution):
            mat[k] = gcpercent(s[start:start + resolution])
            k += 1
    toto = {}  # savemat expects a dict of named variables
    toto['mat'] = mat
    scpio.savemat(GCoutrep + str(resolution) + "pbGCvec.mat", toto)
    utils.savematrixasfilelist3(mat, GCoutrep + str(resolution) + "pbGCvec.csv")
Ejemplo n.º 3
0
def generateTErecurencematrixHicComparableForOneChr(repository,endofthefile,achr,resolution,chrsizepath,species):
    """Return the TE recurrence matrix of one chromosome, computing it if needed.

    The matrix is cached as
    ``repository + achr + endofthefile + "dumpmatHiCbin.dump.npy"``. When
    that file exists it is loaded; otherwise the matrix is built:

    1. ``findsizeforHicListofList`` gives the number of bins ``L``.
    2. ``generateHicListofpattern`` builds one entry per bin from the file
       ``repository + achr + endofthefile``.
    3. Four density files are written by
       ``transformHiClistofpatterninDensity`` (Gypsy, CR1, DNA/P, Helitron).
    4. An ``L x L`` matrix is filled symmetrically with
       ``distancebetweenunequallist3(setlist[i], setlist[j])``.
    5. The matrix is saved with ``np.save``.

    In both cases the matrix is then rendered with
    ``utils.showamatwithcolorcode`` to
    ``repository + achr + endofthefile + "recurenceplot.png"`` and returned.

    Args:
        repository: prefix concatenated directly with the file names.
        endofthefile: suffix of the per-chromosome input file.
        achr: chromosome name.
        resolution: bin size, forwarded unchanged to the helpers.
        chrsizepath: path forwarded to ``utils.loadchrsizedict``.
        species: key into ``utils.dictchr``.

    Returns:
        The ``L x L`` NumPy array.
    """
    dumpmatname = repository + achr + endofthefile + "dumpmatHiCbin.dump"
    densityname = repository + achr + endofthefile
    cachepath = dumpmatname + ".npy"  # np.save appends ".npy" to dumpmatname
    if op.exists(cachepath):
        print("===> Matrice deja existante")
        # Passing the path lets NumPy open and close the file itself.
        thematrix = np.load(cachepath)
    else:
        print("===>Constitution de la matrice")
        filename = repository + achr + endofthefile
        print("resolution:", resolution)
        sizedict = utils.loadchrsizedict(chrsizepath, resolution)
        # number of bins of this chromosome
        sizeList = findsizeforHicListofList(sizedict, utils.dictchr[species], achr)
        print("taille de la liste:", sizeList)
        setlist = generateHicListofpattern(filename, sizeList, resolution)
        transformHiClistofpatterninDensity(setlist, "LTR/Gypsy", densityname + "Gypsy")
        transformHiClistofpatterninDensity(setlist, "LINE/CR1", densityname + "CR1")
        transformHiClistofpatterninDensity(setlist, "DNA/P", densityname + "DNAP")
        transformHiClistofpatterninDensity(setlist, "RC/Helitron", densityname + "helitron")
        print("taille de la setlist", len(setlist))
        L = sizeList  # in theory len(setlist) == sizeList
        print("==>Taille de la matrice:", L)
        thematrix = np.zeros((L, L))
        for i in range(L):
            print(i)
            # Only the upper triangle is computed; the value is mirrored.
            for j in range(i, L):
                val = distancebetweenunequallist3(setlist[i], setlist[j])
                thematrix[i, j] = val
                thematrix[j, i] = val
        print("====>Dumping de la matrice")
        # Alternatives that were tried: pickle.dump (bigger file) and
        # scpio.savemat to "...TErecurencematrix.mat".
        np.save(dumpmatname, thematrix)
    # Disabled: export of the first four eigenvectors of the correlation matrix.
    # cormat=np.corrcoef(thematrix)
    # v,e=giveeigenvalueandvectorvalue(cormat,10)
    # utils.savematrixasfilelist3(e[:,0],repository+achr+"Corrv1")
    # utils.savematrixasfilelist3(e[:,1],repository+achr+"Corrv2")
    # utils.savematrixasfilelist3(e[:,2],repository+achr+"Corrv3")
    # utils.savematrixasfilelist3(e[:,3],repository+achr+"Corrv4")
    print("===> Afichage de la matrice")
    manipname = repository + achr + endofthefile + "recurenceplot.png"
    print(thematrix.shape)
    utils.showamatwithcolorcode(thematrix, manipname, "bwr")  # "jet" was also tried
    return thematrix
Ejemplo n.º 4
0
def generateTEmatrixforaspecies(repository,speciesname,endofthefile,chrsizepath,resolution):
    """Build the TE-by-bin presence matrix of a species and save it as sparse.

    The matrix has one row per entry of the species' TE list and
    ``sizedict["HicUsedTotalSize"]`` columns. For every chromosome in
    ``utils.dictchr[speciesname]`` the file ``repository + chr + endofthefile``
    is read (first line skipped as header). On each data line, field 1 is the
    TE name and fields 4 and 5 are its begin/end positions; the positions are
    divided by ``resolution`` and shifted by the chromosome offset
    ``sizedict["HicChrBegin" + <field 0 of the first data line>]``. The cells
    of ``ceil(end - begin)`` consecutive bins starting at the (truncated)
    begin bin are set to 1.

    The result is stored as a CSR matrix under ``'mat'`` in
    ``repository + speciesname + ".TEmatrix"``.

    NOTE(review): because the bin count is ``ceil(end - begin)`` counted from
    the truncated begin bin, a TE that straddles a bin boundary may not mark
    the bin containing its end. Kept as it was; confirm this is intended.

    Args:
        repository: prefix concatenated directly with the file names.
        speciesname: species identifier (file names and ``utils.dictchr`` key).
        endofthefile: suffix of the per-chromosome annotation files.
        chrsizepath: path forwarded to ``utils.loadchrsizedict``.
        resolution: bin size; converted with ``int()`` (may come from argv).

    Raises:
        ValueError: if a TE name read from a file is not in the TE list.
    """
    resolution = int(resolution)  # cast of argv
    TEpath = repository + speciesname + ".TElist"
    Matpath = repository + speciesname + ".TEmatrix"
    DEfamilypath = repository + speciesname + ".TEfamilydict"
    if op.exists(TEpath):
        TElist = utils.loadfilelist(TEpath)
        TEfamilydict = utils.loadstrfiledict(DEfamilypath)  # not used below
    else:
        TElist, TEfamilydict = utils.generatelistofTEforaspecies(repository, speciesname, endofthefile)
    sizedict = utils.loadchrsizedict(chrsizepath, resolution)
    # TODO: width of the matrix, either ["HicUsedTotalSize"] or ["HicChrBeginchrX"]
    matcolsize = sizedict["HicUsedTotalSize"]
    mat = np.zeros((len(TElist), matcolsize))
    chrlist = utils.dictchr[speciesname]  # restricts the loop to the wanted chromosomes
    print(matcolsize)
    print(chrlist)

    # Row of each TE name (first occurrence, like list.index) for O(1) lookup.
    row_of = {}
    for row, tename in enumerate(TElist):
        row_of.setdefault(tename, row)

    for chrname in chrlist:
        filename = repository + chrname + endofthefile
        with open(filename, "r") as filein:
            filein.readline()  # header line
            chrbegin = None
            for l in filein:
                ls = l.split()
                if not ls:
                    continue  # blank line
                if chrbegin is None:
                    # The offset is taken from the first data line of the
                    # file. Builtin float replaces the np.float alias that
                    # was removed from NumPy.
                    chrbegin = float(sizedict["HicChrBegin" + ls[0]])
                begin = (float(ls[4]) / resolution) + chrbegin
                end = (float(ls[5]) / resolution) + chrbegin
                nbins = int(np.ceil(end - begin))
                if nbins <= 0:
                    continue
                try:
                    row = row_of[ls[1]]
                except KeyError:
                    raise ValueError(repr(ls[1]) + " is not in the TE list")
                # NumPy rejects float indices; truncate explicitly.
                firstbin = int(begin)
                for offset in range(nbins):
                    mat[row, firstbin + offset] = 1

    toto = {}  # savemat expects a dict of named variables
    toto['mat'] = sparse.csr_matrix(mat)
    scpio.savemat(Matpath, toto)
def makeTEPieChart(repository, speciesname, endofthefile, chrsizepath):
    """Write a table of cumulated TE length per family for a species.

    For every chromosome in ``utils.dictchr[speciesname]`` the file
    ``repository + chr + endofthefile`` is read (first line skipped as an
    annotation header). On each data line the length ``field 5 - field 4`` is
    added to the family of the TE named in field 1, as given by the TE family
    dictionary. The remainder ``sum of chromosome sizes - sum of TE lengths``
    is stored under the key ``"Autre"``.

    The result is written to ``repository + speciesname + "TEproportion"`` as
    a tab-separated table with the header ``TEtype<TAB>Quantity``.

    Args:
        repository: prefix concatenated directly with the file names.
        speciesname: species identifier (file names and ``utils.dictchr`` key).
        endofthefile: suffix of the per-chromosome annotation files.
        chrsizepath: path forwarded to ``utils.loadchrsizedict``.
    """
    outname = repository + speciesname + "TEproportion"
    # resolution=1: sizes are wanted in base pairs, with no binning
    sizedict = utils.loadchrsizedict(chrsizepath, 1)
    # Reuse the cached TE list / family dictionary when they exist.
    TEpath = repository + speciesname + ".TElist"
    DEfamilypath = repository + speciesname + ".TEfamilydict"
    if op.exists(TEpath):
        TElist = utils.loadfilelist(TEpath)
        TEfamilydict = utils.loadstrfiledict(DEfamilypath)
    else:
        TElist, TEfamilydict = utils.generatelistofTEforaspecies(repository, speciesname, endofthefile)
    # The second returned value is not needed here.
    ProportionDict, _unused_reverse = reservefamilydict(TEfamilydict)

    sumchrtot = 0
    sumTE = 0  # total TE length, used to derive the non-TE remainder
    chrlist = utils.dictchr[speciesname]  # expected to match the names in chrsizepath
    for z in chrlist:
        filename = repository + z + endofthefile
        print("=====>nom du fichier de chr: ", filename)
        with open(filename, "r") as filein:
            filein.readline()  # first line: annotation
            for l in filein:
                ls = l.split()
                if not ls:
                    continue  # blank line
                val = float(ls[5]) - float(ls[4])
                if val < 0:
                    # end before begin: reported but still accumulated
                    print("Danger")
                ProportionDict[TEfamilydict[ls[1]]] += val
                sumTE += val
        sumchrtot += sizedict[z]
    ProportionDict["Autre"] = sumchrtot - sumTE

    with open(outname, "w") as fout:
        print("nom du repertoire de sortie", outname)
        fout.write("TEtype\tQuantity\n")
        for family, quantity in ProportionDict.items():
            fout.write(family + "\t" + str(quantity) + "\n")