def buildMatrixGenesVitis(listFilter, listFiles):
    """Load Vitis gene lists from .zip archives or plain files and filter them.

    Every entry of ``listFiles`` must end with the same four characters as
    the first entry. A ``.zip`` entry is opened and each of its members is
    loaded; any other entry is handed directly to ``ut.readFilesVitis``.

    Side effects:
        * The module-level ``list_Genes`` is rebound to element ``[1]`` of
          the most recent ``ut.readFilesVitis`` result.
        * For zip members, a temporary file named after the member is
          written in the current working directory and removed afterwards.

    Args:
        listFilter: iterable of filter descriptors; each one is passed, in
            order, to ``applyFilter`` together with the gene list.
        listFiles: non-empty sequence of file paths.

    Returns:
        A list holding one filtered gene list per file (or zip member) read.

    Exits:
        Prints an error and calls ``sys.exit(-1)`` when an entry does not
        share the suffix of the first entry.
    """
    global list_Genes
    matrixGenes = []
    extensionFiles = listFiles[0][-4:]
    for f in listFiles:
        # NOTE(review): the original additionally tested
        # ``f[-4:] != '.csv' or f[-4:] != '.zip'``, which is always true.
        # It is dropped here without changing behaviour: suffixes other
        # than .zip are still treated as plain gene files.
        if f[-4:] != extensionFiles:
            print(
                'ERROR: FILES HAVE DIFFERENT EXTENSION. File need to have the same extension. All .csv or all .zip'
            )
            sys.exit(-1)

        if f[-4:] == '.zip':
            with zipfile.ZipFile(f, 'r') as archive:
                for namefilezip in archive.namelist():
                    # str() of a bytes object yields its repr ("b'...'" or
                    # 'b"..."'); the payload is recovered by splitting on
                    # the quote character the repr used. Newlines appear as
                    # the two characters backslash + n, hence the raw split.
                    # NOTE(review): kept as-is because the downstream parser
                    # is fed this escaped text; decoding properly could
                    # change what it sees.
                    csvText = str(archive.read(namefilezip))
                    if csvText[1] == '"':
                        csvText = csvText.split(r'"')
                    else:
                        csvText = csvText.split('\'')
                    csvListText = csvText[1].split(r'\n')

                    with open(namefilezip, 'w') as csvTemp:
                        csvTemp.writelines(
                            line + '\n' for line in csvListText)
                    try:
                        readResult = ut.readFilesVitis(namefilezip, True)
                    finally:
                        # Never leave the scratch file behind, even when
                        # the reader raises.
                        os.remove(namefilezip)

                    list_Genes = readResult[1]
                    listGenes = readResult[0]
                    #Filter lists
                    for geneFilter in listFilter:
                        listGenes = applyFilter(listGenes, geneFilter)
                    matrixGenes.append(listGenes)
        else:
            #Read gene files .csv
            # The original read ``namefilezip`` here, a name that is only
            # bound inside the zip branch, so this path raised NameError.
            readResult = ut.readFilesVitis(f, True)
            list_Genes = readResult[1]
            listGenes = readResult[0]
            #Filter lists
            for geneFilter in listFilter:
                listGenes = applyFilter(listGenes, geneFilter)
            matrixGenes.append(listGenes)
    #return lists of gene lists filtered
    return matrixGenes
def buildMatrixGenesVitis(listFilter, listFiles):
    """Load Vitis gene lists from zips, csv files or gene codes and filter them.

    The last four characters of each ``listFiles`` entry select the mode
    (all entries must share the suffix of the first one):

    * ``.zip``: every member of the archive is loaded.
    * ``.csv``: the file itself is loaded.
    * anything else: ``listFiles[0]`` is taken as a comma-separated list of
      codes; for each code the first path matching
      ``../annotated/<code>_*`` is loaded.

    Side effects:
        * The module-level ``list_Genes`` is rebound to element ``[1]`` of
          the most recent ``ut.readFilesVitis`` result.
        * For zip members, a temporary file named after the member is
          written in the current working directory and removed afterwards.

    Args:
        listFilter: iterable of filter descriptors, each passed in order to
            ``applyFilter`` together with the gene list.
        listFiles: non-empty sequence of paths (or a single codes string).

    Returns:
        A list holding one filtered gene list per file read.

    Exits:
        Prints an error and calls ``sys.exit(-1)`` when an entry does not
        share the suffix of the first entry.
    """
    global list_Genes
    matrixGenes = []

    def keepResult(readResult):
        """Publish the names, filter the gene list and store it."""
        global list_Genes
        list_Genes = readResult[1]
        listGenes = readResult[0]
        #Filter lists
        for geneFilter in listFilter:
            listGenes = applyFilter(listGenes, geneFilter)
        matrixGenes.append(listGenes)

    extensionFiles = listFiles[0][-4:]
    for f in listFiles:
        # The original also tested ``!= '.csv' or != '.zip'`` (always true);
        # the codes mode below relies on other suffixes being accepted.
        if f[-4:] != extensionFiles:
            print(
                'ERROR: FILES HAVE DIFFERENT EXTENSION. File need to have the same extension. All .csv or all .zip'
            )
            sys.exit(-1)

        if f[-4:] == '.zip':
            with zipfile.ZipFile(f, 'r') as archive:
                for namefilezip in archive.namelist():
                    # str(bytes) gives the repr ("b'...'" / 'b"..."'); the
                    # payload sits between the first pair of the quote
                    # character used, with newlines escaped as backslash+n.
                    # NOTE(review): hack preserved because the downstream
                    # parser receives this escaped text.
                    csvText = str(archive.read(namefilezip))
                    if csvText[1] == '"':
                        csvText = csvText.split(r'"')
                    else:
                        csvText = csvText.split('\'')
                    csvListText = csvText[1].split(r'\n')

                    with open(namefilezip, 'w') as csvTemp:
                        csvTemp.writelines(
                            line + '\n' for line in csvListText)
                    try:
                        readResult = ut.readFilesVitis(namefilezip)
                    finally:
                        os.remove(namefilezip)
                    keepResult(readResult)
        elif f[-4:] == '.csv':
            #Read gene files .csv
            # The original read ``namefilezip`` here, which is unbound in
            # this branch and raised NameError.
            keepResult(ut.readFilesVitis(f))
        else:
            # Codes mode. NOTE(review): the codes always come from
            # listFiles[0], so several entries would reload the same codes
            # once per entry (behaviour kept from the original).
            for code in listFiles[0].split(','):
                matches = glob.glob('../annotated/' + str(code) + '_*')
                if not matches:
                    # The original compared the first match with 0, which
                    # never filtered anything and raised IndexError here.
                    print('WARNING: no file found in ../annotated/ for code '
                          + str(code), file=sys.stderr)
                    continue
                keepResult(ut.readFilesVitis(matches[0]))
    #return lists of gene lists filtered
    return matrixGenes
def readFilesGenes(listFiles, coupleGenes, listfilter, vitis, TCGAdb,
                   listBioNameUpdate):
    """Load and filter only the gene lists named in ``coupleGenes``.

    Every entry of ``listFiles`` must end with the same four characters as
    the first entry. ``.zip`` entries are scanned member by member:

    * members whose name contains ``.zip`` are opened as nested archives
      and their ``expansion`` members are read with ``readFilesHuman``;
    * members whose name contains ``expansion`` are read with
      ``readFilesHuman``;
    * members whose name contains ``csv`` are read with
      ``ut.readFilesVitis`` when ``vitis`` is true, otherwise with
      ``readFilesHuman``.

    Any non-zip entry is read as a plain file with ``readFilesHuman``.
    A list is loaded only if its gene name is one of the names found in
    ``coupleGenes``.

    Side effects:
        * ``listBioNameUpdate`` gains an identity entry for every Vitis gene
          key it did not already contain.
        * Zip members are written to a temporary file named after the
          member in the current working directory, then removed.

    Args:
        listFiles: non-empty sequence of file paths.
        coupleGenes: iterable of iterables of gene names to load.
        listfilter: iterable of filter descriptors for ``applyFilter``.
        vitis: truthy to treat csv zip members as Vitis lists.
        TCGAdb: forwarded unchanged to ``readFilesHuman``.
        listBioNameUpdate: mapping from gene key to display name
            (assumed to be a dict - TODO confirm).

    Returns:
        A list with one filtered gene list per loaded file.

    Raises:
        AttributeError: when a header line does not match ``-\\w*\\s``
            (``re.search`` returns ``None``), as in the original.

    Exits:
        Prints an error and calls ``sys.exit(-1)`` when an entry does not
        share the suffix of the first entry.
    """
    #Read only file of a gene to analyze
    genesToAnalyze = {elem for couple in coupleGenes for elem in couple}
    matrixGenes = []

    def storeFiltered(listGenes):
        """Run every filter over the list and append the result."""
        #Filter lists
        for geneFilter in listfilter:
            listGenes = applyFilter(listGenes, geneFilter)
        matrixGenes.append(listGenes)

    def readViaTempFile(tmpName, csvListText, reader):
        """Dump the lines to ``tmpName``, parse it, always delete it."""
        with open(tmpName, 'w') as csvTemp:
            csvTemp.writelines(line + '\n' for line in csvListText)
        try:
            return reader(tmpName)
        finally:
            os.remove(tmpName)

    def headerGeneName(headerLine):
        """Return the text between the first '-' and the next whitespace."""
        return re.search(r'-\w*\s', headerLine).group()[1:-1]

    def readHuman(path):
        return readFilesHuman(path, TCGAdb)

    def loadHumanExpansion(zipObj, member):
        """Load one 'expansion' member if its gene was requested."""
        # str(bytes) is the repr of the payload; everything before the
        # first double quote is kept and split on the escaped newline
        # (backslash + n) that the repr contains.
        csvListText = str(zipObj.read(member)).split(r'"')[0].split(r'\n')
        if headerGeneName(csvListText[0]) in genesToAnalyze:
            storeFiltered(readViaTempFile(member, csvListText, readHuman))

    def loadCsvMember(zipObj, member):
        """Load one csv member (Vitis or human) if its gene was requested."""
        csvText = str(zipObj.read(member))
        if csvText[1] == '"':
            pieces = csvText.split(r'"')
        else:
            pieces = csvText.split('\'')

        #different split if is vitis or human
        if vitis:
            csvListText = pieces[1].split(r'\n')
            geneKey = csvListText[0].split(r',')[3].upper()
            # Unknown keys map to themselves and are remembered. This
            # replaces a bare ``except:`` around the lookup.
            nameGene = listBioNameUpdate.setdefault(geneKey, geneKey)
        else:
            # NOTE(review): after splitting on the repr's own quote
            # character, pieces[0] is just the leading "b", so the header
            # search below cannot match and this path raises
            # AttributeError. Kept as in the original; the intended slice
            # needs confirming against real data.
            csvListText = pieces[0].split(r'\n')
            nameGene = headerGeneName(csvListText[0]).upper()

        #if is a gene to analyze read it
        if nameGene not in genesToAnalyze:
            return
        if vitis:
            listGenes = readViaTempFile(
                member, csvListText,
                lambda path: ut.readFilesVitis(path, True)[0])
            if nameGene == 'GT-001':
                listGenes[0] = nameGene  #TODEL_GT-001
        else:
            listGenes = readViaTempFile(member, csvListText, readHuman)
        storeFiltered(listGenes)

    extensionFiles = listFiles[0][-4:]
    for f in listFiles:
        # The original also tested ``!= '.csv' or != '.zip'``, which is
        # always true; dropped without changing behaviour.
        if f[-4:] != extensionFiles:
            print(
                'ERROR: FILES HAVE DIFFERENT EXTENSION. File need to have the same extension. All .csv or all .zip'
            )
            sys.exit(-1)

        if f[-4:] == '.zip':
            with zipfile.ZipFile(f, 'r') as archive:
                for namefilezip in archive.namelist():
                    if '.zip' in namefilezip:
                        #Read if is a subzip
                        nested = BytesIO(archive.read(namefilezip))
                        with zipfile.ZipFile(nested, 'r') as subarchive:
                            for namefilesubzip in subarchive.namelist():
                                if 'expansion' in namefilesubzip:
                                    loadHumanExpansion(subarchive,
                                                       namefilesubzip)
                    elif 'expansion' in namefilezip:
                        #Read if is a list of human
                        loadHumanExpansion(archive, namefilezip)
                    elif 'csv' in namefilezip:
                        #Read file csv inside an archive zip
                        loadCsvMember(archive, namefilezip)
        else:
            #Read gene files .csv
            with open(f, 'r') as fileRead:
                csvText = fileRead.read()
            # Only the text before the first double quote is inspected.
            # The split on backslash + n is kept from the original.
            csvListText = csvText.split(r'"')[0].split(r'\n')
            if headerGeneName(csvListText[0]) in genesToAnalyze:
                storeFiltered(readFilesHuman(f, TCGAdb))
    return matrixGenes
def buildMatrixGenesHuman(listFilter, listFiles, fantom):
    """Load human gene lists, filter them and optionally merge isoforms.

    Phase 1 reads every entry of ``listFiles`` (all must share the last
    four characters of the first entry). ``.zip`` entries are scanned for
    nested zips, ``expansion`` members and ``csv`` members; any other entry
    is read directly. Each list is parsed by ``ut.readFilesVitis`` and run
    through the filters.

    Phase 2 reads ``import_doc/couple_name_gene.csv`` and records, for each
    row with a non-empty first column, ``<col0>_<col1 minus its last
    character>`` in the module-level ``listBioNameUpdate``; matching
    entries of ``list_Genes`` are renamed (or dropped) accordingly.

    Phase 3 runs only when the module-level ``comprimeNode`` is truthy:
    names are reduced to their ``@...`` part, lists with the same head name
    are concatenated, entries with the same name inside a list are averaged
    and entries equal to the head name are removed.

    Side effects:
        Rebinds the globals ``typeDB`` and ``list_Genes``, mutates
        ``list_Genes`` and ``listBioNameUpdate``, and writes/removes
        temporary files named after zip members in the working directory.

    Args:
        listFilter: iterable of filter descriptors for ``applyFilter``.
        listFiles: non-empty sequence of file paths.
        fantom: stored in ``typeDB`` and forwarded to ``ut.readFilesVitis``.

    Returns:
        ``(matrixGenes, matrixGenesOld, comprimeNode)`` where
        ``matrixGenesOld`` is a deep copy taken before compression (or an
        empty list when compression is off).

    Exits:
        Prints an error and calls ``sys.exit(-1)`` when an entry does not
        share the suffix of the first entry.
    """
    global list_Genes
    global typeDB
    typeDB = fantom

    #BUILD MATRIX OF GENES
    matrixGenes = []

    def keepResult(readResult):
        """Publish the names, filter the gene list and store it."""
        global list_Genes
        list_Genes = readResult[1]
        listGenes = readResult[0]
        #Filter lists
        for geneFilter in listFilter:
            listGenes = applyFilter(listGenes, geneFilter)
        matrixGenes.append(listGenes)

    def readViaTempFile(tmpName, csvListText, dbFlag):
        """Dump the lines to ``tmpName``, parse it, always delete it."""
        with open(tmpName, 'w') as csvTemp:
            csvTemp.writelines(line + '\n' for line in csvListText)
        try:
            return ut.readFilesVitis(tmpName, dbFlag)
        finally:
            os.remove(tmpName)

    def quotedPayloadLines(rawBytes):
        """Split the repr of ``rawBytes`` into its escaped-newline lines."""
        # str(bytes) is "b'...'" or 'b"..."'; take what lies between the
        # first pair of the quote character the repr used.
        csvText = str(rawBytes)
        if csvText[1] == '"':
            pieces = csvText.split(r'"')
        else:
            pieces = csvText.split('\'')
        return pieces[1].split(r'\n')

    extensionFiles = listFiles[0][-4:]
    for f in listFiles:
        # The original also tested ``!= '.csv' or != '.zip'``, which is
        # always true; dropped without changing behaviour.
        if f[-4:] != extensionFiles:
            print(
                'ERROR: FILES HAVE DIFFERENT EXTENSION. File need to have the same extension. All .csv or all .zip'
            )
            sys.exit(-1)

        if f[-4:] == '.zip':
            with zipfile.ZipFile(f, 'r') as archive:
                for namefilezip in archive.namelist():
                    if '.zip' in namefilezip:
                        nested = BytesIO(archive.read(namefilezip))
                        with zipfile.ZipFile(nested, 'r') as subarchive:
                            for namefilesubzip in subarchive.namelist():
                                if 'expansion' not in namefilesubzip:
                                    continue
                                # Nested members keep the text before the
                                # first double quote and are read with a
                                # hard-coded False flag, as originally.
                                csvListText = str(
                                    subarchive.read(namefilesubzip)
                                ).split(r'"')[0].split(r'\n')
                                keepResult(readViaTempFile(
                                    namefilesubzip, csvListText, False))
                    elif 'expansion' in namefilezip or 'csv' in namefilezip:
                        # The original had two identical branches for
                        # 'expansion' and 'csv' members.
                        csvListText = quotedPayloadLines(
                            archive.read(namefilezip))
                        keepResult(readViaTempFile(
                            namefilezip, csvListText, typeDB))
        else:
            #Read gene files .csv
            keepResult(ut.readFilesVitis(f, typeDB))

    #READ UPDATE NAME GENE
    with open('import_doc/couple_name_gene.csv', 'r') as nameFile:
        listLineName = [line.split(',') for line in nameFile.readlines()]
    for fields in listLineName:
        if fields[0] != '':
            # fields[1][:-1] drops the last character of the second
            # column - presumably the trailing newline; TODO confirm the
            # file always has exactly two columns.
            listBioNameUpdate[fields[0]] = fields[0] + '_' + fields[1][:-1]
        # NOTE(review): list_Genes is modified while being iterated,
        # exactly as in the original; a deletion skips the next element.
        for n in list_Genes:
            if n == fields[0]:
                if fields[0] != '':
                    if comprimeNode:
                        shortName = re.search(r'@\w*', fields[1]).group()
                        if shortName not in list_Genes:
                            list_Genes[list_Genes.index(n)] = shortName
                        else:
                            del list_Genes[list_Genes.index(n)]
                    else:
                        list_Genes[list_Genes.index(n)] = (
                            fields[0] + '_' + fields[1][:-1])

    matrixGenesOld = []
    #comprime node if request
    # NOTE(review): the source reached review with its indentation lost.
    # The merge and de-duplication passes are placed inside this ``if``
    # because they only make sense once names are compressed - confirm
    # against the original file.
    if comprimeNode:
        #deep copy used to check edges in pearson correlation in comprime node version
        matrixGenesOld = cp.deepcopy(matrixGenes)

        #update name with name of only gene
        for geneList in matrixGenes:
            geneList[0] = re.search(
                r'@\w*', listBioNameUpdate[geneList[0]]).group()
            i = 1
            while i < len(geneList):
                entry = geneList[i]
                try:
                    geneList[i] = (entry[0],
                                   re.search(
                                       r'@\w*',
                                       listBioNameUpdate[entry[1]]).group(),
                                   entry[2])
                    i += 1
                except (KeyError, AttributeError, IndexError, TypeError):
                    # Entries whose name cannot be mapped are dropped.
                    # This narrows the original bare ``except:``.
                    del geneList[i]

        #unify list of same gene but different isoform
        i = 0
        while i < len(matrixGenes):
            j = i + 1
            while j < len(matrixGenes):
                if matrixGenes[i][0] == matrixGenes[j][0]:
                    matrixGenes[i] = matrixGenes[i] + matrixGenes[j][1:]
                    del matrixGenes[j]
                else:
                    j += 1
            i += 1

        #remove duplicate
        for geneList in matrixGenes:
            i = 1
            while i < len(geneList):
                j = i + 1
                while j < len(geneList):
                    if geneList[i][1] == geneList[j][1]:
                        # Same name twice: average the first field
                        # (truncated to int) and the third (4 decimals).
                        geneList[i] = (
                            int((geneList[i][0] + geneList[j][0]) / 2),
                            geneList[i][1],
                            round((geneList[i][2] + geneList[j][2]) / 2, 4))
                        del geneList[j]
                    else:
                        j += 1
                i += 1
            # Drop entries that point back at the head gene itself.
            geneList[1:] = [
                entry for entry in geneList[1:] if entry[1] != geneList[0]
            ]
    #return lists of gene lists filtered
    return (matrixGenes, matrixGenesOld, comprimeNode)
def readFilesGenes(listFiles, coupleGenes, listfilter):
    """Load and filter the Vitis gene lists selected by ``coupleGenes``.

    The last four characters of each ``listFiles`` entry select the mode
    (all entries must share the suffix of the first one):

    * ``.zip``: every member is inspected; its gene name is the upper-cased
      fourth comma-separated field of its first line.
    * ``.csv``: the file itself is inspected the same way.
    * anything else: ``listFiles[0]`` is taken as a comma-separated list of
      codes; for each code the first path matching
      ``../annotated/<code>_*`` is loaded unconditionally.

    In the zip and csv modes a list is loaded only when its gene name
    appears in ``coupleGenes``, unless some filter's first element is
    ``'-u'``, in which case every list is loaded.

    Side effects:
        For zip members, a temporary file named after the member is written
        in the current working directory and removed afterwards.

    Args:
        listFiles: non-empty sequence of paths (or a single codes string).
        coupleGenes: iterable of iterables of gene names to load.
        listfilter: iterable of filter descriptors; each is indexable and
            is passed to ``applyFilter``.

    Returns:
        A list with one filtered gene list per loaded file.

    Exits:
        Prints an error and calls ``sys.exit(-1)`` when an entry does not
        share the suffix of the first entry.
    """
    # Hoisted: the original rebuilt this list for every file it inspected.
    readAll = any(a[0] == '-u' for a in listfilter)

    #Read only file of a gene to analyze
    genesToAnalyze = set()
    if not readAll:
        genesToAnalyze = {elem for couple in coupleGenes for elem in couple}

    matrixGenes = []

    def storeFiltered(listGenes):
        """Run every filter over the list and append the result."""
        #Filter lists
        for geneFilter in listfilter:
            listGenes = applyFilter(listGenes, geneFilter)
        matrixGenes.append(listGenes)

    extensionFiles = listFiles[0][-4:]
    for f in listFiles:
        # The original also tested ``!= '.csv' or != '.zip'`` (always true);
        # the codes mode below relies on other suffixes being accepted.
        if f[-4:] != extensionFiles:
            print(
                'ERROR: FILES HAVE DIFFERENT EXTENSION. File need to have the same extension. All .csv or all .zip'
            )
            sys.exit(-1)

        if f[-4:] == '.zip':
            with zipfile.ZipFile(f, 'r') as archive:
                for namefilezip in archive.namelist():
                    #Read file csv inside an archive zip
                    # str(bytes) gives the repr ("b'...'" / 'b"..."'); the
                    # payload sits between the first pair of the quote
                    # character used, with newlines escaped as backslash+n.
                    # NOTE(review): hack preserved because the downstream
                    # parser receives this escaped text.
                    csvText = str(archive.read(namefilezip))
                    if csvText[1] == '"':
                        csvText = csvText.split(r'"')
                    else:
                        csvText = csvText.split('\'')
                    csvListText = csvText[1].split(r'\n')
                    nameGene = csvListText[0].split(r',')[3].upper()

                    #if is a gene to analyze read it
                    if readAll or nameGene in genesToAnalyze:
                        with open(namefilezip, 'w') as csvTemp:
                            csvTemp.writelines(
                                line + '\n' for line in csvListText)
                        try:
                            listGenes = ut.readFilesVitis(namefilezip)[0]
                        finally:
                            os.remove(namefilezip)
                        storeFiltered(listGenes)
        elif f[-4:] == '.csv':
            #Read gene files .csv
            # The original applied the bytes-repr splitting to plain text,
            # overwrote the input file with the result, then read and
            # deleted the unbound ``namefilezip``. The file is now only
            # read and is parsed in place.
            with open(f, 'r') as fileRead:
                headerLine = fileRead.readline().rstrip('\r\n')
            nameGene = headerLine.split(',')[3].upper()

            #if is a gene to analyze read it
            if readAll or nameGene in genesToAnalyze:
                storeFiltered(ut.readFilesVitis(f)[0])
        else:
            # Codes mode. NOTE(review): the codes always come from
            # listFiles[0], so several entries would reload the same codes
            # once per entry (behaviour kept from the original).
            for code in listFiles[0].split(','):
                matches = glob.glob('../annotated/' + str(code) + '_*')
                if not matches:
                    # The original compared the first match with 0, which
                    # never filtered anything and raised IndexError here.
                    print('WARNING: no file found in ../annotated/ for code '
                          + str(code), file=sys.stderr)
                    continue
                storeFiltered(ut.readFilesVitis(matches[0])[0])
    return matrixGenes