def buildMatrixGenesVitis(listFilter, listFiles):
    """Read gene files (plain or zipped) and return their filtered gene lists.

    Every entry of ``listFiles`` must end with the same four characters as
    the first entry.  A ``.zip`` entry is opened and each member is dumped to
    a temporary file (named like the member, relative to the working
    directory), parsed with ``ut.readFilesVitis(path, True)`` and removed
    again.  Any other entry is parsed directly from its own path.

    Args:
        listFilter: filters applied, in order, to every parsed gene list
            through ``applyFilter``.
        listFiles: paths of the files to read.

    Returns:
        list: one filtered gene list per parsed file, in reading order; an
        empty list when ``listFiles`` is empty.

    Side effects:
        Rebinds the module-level ``list_Genes`` to item 1 of the most recent
        ``ut.readFilesVitis`` result.  Prints an error and calls
        ``sys.exit(-1)`` on the first entry whose extension differs from the
        first entry's.
    """
    matrixGenes = []
    if not listFiles:
        # Nothing to read; indexing listFiles[0] below would raise IndexError.
        return matrixGenes

    def _parseAndStore(path):
        """Parse one gene file, publish its gene names, store the filtered list."""
        global list_Genes
        parsed = ut.readFilesVitis(path, True)
        list_Genes = parsed[1]
        listGenes = parsed[0]
        for geneFilter in listFilter:
            listGenes = applyFilter(listGenes, geneFilter)
        matrixGenes.append(listGenes)

    extensionFiles = listFiles[0][-4:]
    for f in listFiles:
        # The original also tested "(ext != '.csv' or ext != '.zip')", which
        # is true for every string, so only the equality check matters.
        if f[-4:] != extensionFiles:
            print(
                'ERROR: FILES HAVE DIFFERENT EXTENSION. File need to have the same extension. All .csv or all .zip'
            )
            sys.exit(-1)

        if f[-4:] == '.zip':
            with zipfile.ZipFile(f, 'r') as archive:
                for namefilezip in archive.namelist():
                    # Under Python 3, str() of a bytes object is its literal
                    # form (b'...' or b"..."): the character at index 1 is
                    # the quote in use and newlines appear as the two
                    # characters backslash + n.
                    rawText = str(archive.read(namefilezip))
                    quote = '"' if rawText[1] == '"' else '\''
                    csvListText = rawText.split(quote)[1].split(r'\n')
                    with open(namefilezip, 'w') as csvTemp:
                        csvTemp.writelines(
                            line + '\n' for line in csvListText)
                    try:
                        _parseAndStore(namefilezip)
                    finally:
                        # Remove the temporary copy even if parsing fails.
                        os.remove(namefilezip)
        else:
            # Plain gene file: parse it from its own path.  The original
            # passed `namefilezip`, which is never bound on this branch.
            _parseAndStore(f)

    # Lists of gene lists, filtered
    return matrixGenes
# Example no. 2
# 0
def buildMatrixGenesVitis(listFilter, listFiles):
    """Read gene files, zip archives or annotated codes into filtered lists.

    Every entry of ``listFiles`` must end with the same four characters as
    the first entry.  Three kinds of entry are handled:

    * ``.zip``: each member is dumped to a temporary file (named like the
      member, relative to the working directory), parsed with
      ``ut.readFilesVitis`` and removed again;
    * ``.csv``: the file is parsed directly;
    * anything else: ``listFiles[0]`` is split on commas and, for every
      resulting code, the first path matching ``../annotated/<code>_*`` is
      parsed.  Codes without a match are skipped.

    Args:
        listFilter: filters applied, in order, to every parsed gene list
            through ``applyFilter``.
        listFiles: file paths, or a comma-separated string of codes.

    Returns:
        list: one filtered gene list per parsed file, in reading order; an
        empty list when ``listFiles`` is empty.

    Side effects:
        Rebinds the module-level ``list_Genes`` to item 1 of the most recent
        ``ut.readFilesVitis`` result.  Prints an error and calls
        ``sys.exit(-1)`` on the first entry whose extension differs from the
        first entry's.
    """
    matrixGenes = []
    if not listFiles:
        # Nothing to read; indexing listFiles[0] below would raise IndexError.
        return matrixGenes

    def _parseAndStore(path):
        """Parse one gene file, publish its gene names, store the filtered list."""
        global list_Genes
        parsed = ut.readFilesVitis(path)
        list_Genes = parsed[1]
        listGenes = parsed[0]
        for geneFilter in listFilter:
            listGenes = applyFilter(listGenes, geneFilter)
        matrixGenes.append(listGenes)

    extensionFiles = listFiles[0][-4:]
    for f in listFiles:
        # The original also tested "(ext != '.csv' or ext != '.zip')", which
        # is true for every string, so only the equality check matters.
        if f[-4:] != extensionFiles:
            print(
                'ERROR: FILES HAVE DIFFERENT EXTENSION. File need to have the same extension. All .csv or all .zip'
            )
            sys.exit(-1)

        if f[-4:] == '.zip':
            with zipfile.ZipFile(f, 'r') as archive:
                for namefilezip in archive.namelist():
                    # Under Python 3, str() of a bytes object is its literal
                    # form (b'...' or b"..."): the character at index 1 is
                    # the quote in use and newlines appear as the two
                    # characters backslash + n.
                    rawText = str(archive.read(namefilezip))
                    quote = '"' if rawText[1] == '"' else '\''
                    csvListText = rawText.split(quote)[1].split(r'\n')
                    with open(namefilezip, 'w') as csvTemp:
                        csvTemp.writelines(
                            line + '\n' for line in csvListText)
                    try:
                        _parseAndStore(namefilezip)
                    finally:
                        # Remove the temporary copy even if parsing fails.
                        os.remove(namefilezip)
        elif f[-4:] == '.csv':
            # Plain gene file: parse it from its own path.  The original
            # passed `namefilezip`, which is never bound on this branch.
            _parseAndStore(f)
        else:
            # Not a file path: treat the first entry as a comma-separated
            # list of codes and look each one up among the annotated files.
            # NOTE(review): the codes are always taken from listFiles[0],
            # not from `f`, as in the original - confirm that only a single
            # entry is expected in this mode.
            annotatedFiles = []
            for code in listFiles[0].split(','):
                matches = glob.glob('../annotated/' + str(code) + '_*')
                # glob returns an empty list when nothing matches; the
                # original compared matches[0] with 0, which raised
                # IndexError here instead of skipping the code.
                if matches:
                    annotatedFiles.append(matches[0])
            # Annotated files are read in place and are not removed.
            for annotatedPath in annotatedFiles:
                _parseAndStore(annotatedPath)

    # Lists of gene lists, filtered
    return matrixGenes
def readFilesGenes(listFiles, coupleGenes, listfilter, vitis, TCGAdb,
                   listBioNameUpdate):
    """Read only the gene files whose gene appears in ``coupleGenes``.

    Every entry of ``listFiles`` must end with the same four characters as
    the first entry.  ``.zip`` entries are scanned member by member (nested
    archives, 'expansion' members and 'csv' members are recognised by
    substring of the member name); any other entry is read as a plain file.
    A file is parsed, filtered and stored only when its gene name is one of
    the genes to analyze.

    Args:
        listFiles: paths of the files to read.
        coupleGenes: iterable of iterables of gene names; every name found
            in it is a gene to analyze.
        listfilter: filters applied, in order, to every parsed gene list
            through ``applyFilter``.
        vitis: truthy to parse 'csv' archive members with
            ``ut.readFilesVitis``, falsy to use ``readFilesHuman``.
        TCGAdb: forwarded unchanged to ``readFilesHuman``.
        listBioNameUpdate: mapping from upper-cased gene id to gene name;
            ids missing from it are added, mapped to themselves.

    Returns:
        list: one filtered gene list per accepted file, in reading order; an
        empty list when ``listFiles`` is empty.

    Side effects:
        Writes and removes temporary files named like the archive members.
        Prints an error and calls ``sys.exit(-1)`` on the first entry whose
        extension differs from the first entry's.
    """
    # Only files of these genes are read.
    genesToAnalyze = set()
    for couple in coupleGenes:
        genesToAnalyze.update(couple)

    matrixGenes = []
    if not listFiles:
        # Nothing to read; indexing listFiles[0] below would raise IndexError.
        return matrixGenes

    def _appendFiltered(listGenes):
        """Run every filter over the gene list and store the result."""
        for geneFilter in listfilter:
            listGenes = applyFilter(listGenes, geneFilter)
        matrixGenes.append(listGenes)

    def _dumpLines(path, lines):
        """Write the lines to a temporary file, one per row."""
        with open(path, 'w') as csvTemp:
            csvTemp.writelines(line + '\n' for line in lines)

    def _humanGeneName(text):
        """Return the text between the first '-' run and the whitespace after it."""
        return re.search(r'-\w*\s', text).group()[1:-1]

    def _readHumanMember(container, member):
        """Parse an 'expansion' archive member if its gene is requested."""
        # Under Python 3, str() of bytes is the literal form b'...', so the
        # newlines appear as the two characters backslash + n.  Only the
        # text before the first double quote is kept.
        lines = str(container.read(member)).split(r'"')[0].split(r'\n')
        if _humanGeneName(lines[0]) not in genesToAnalyze:
            return
        _dumpLines(member, lines)
        try:
            listGenes = readFilesHuman(member, TCGAdb)
        finally:
            # Remove the temporary copy even if parsing fails.
            os.remove(member)
        _appendFiltered(listGenes)

    extensionFiles = listFiles[0][-4:]
    for f in listFiles:
        # The original also tested "(ext != '.csv' or ext != '.zip')", which
        # is true for every string, so only the equality check matters.
        if f[-4:] != extensionFiles:
            print(
                'ERROR: FILES HAVE DIFFERENT EXTENSION. File need to have the same extension. All .csv or all .zip'
            )
            sys.exit(-1)

        if f[-4:] == '.zip':
            with zipfile.ZipFile(f, 'r') as archive:
                for namefilezip in archive.namelist():
                    if '.zip' in namefilezip:
                        # Nested archive: open it from memory.
                        nested = zipfile.ZipFile(
                            BytesIO(archive.read(namefilezip)), 'r')
                        with nested as subarchive:
                            for namefilesubzip in subarchive.namelist():
                                if 'expansion' in namefilesubzip:
                                    _readHumanMember(subarchive,
                                                     namefilesubzip)
                    elif 'expansion' in namefilezip:
                        _readHumanMember(archive, namefilezip)
                    elif 'csv' in namefilezip:
                        rawText = str(archive.read(namefilezip))
                        if vitis:
                            # rawText[1] is the quote used by the bytes
                            # literal; the payload sits between the quotes.
                            quote = '"' if rawText[1] == '"' else '\''
                            csvListText = rawText.split(quote)[1].split(
                                r'\n')
                            geneId = csvListText[0].split(r',')[3].upper()
                            try:
                                nameGene = listBioNameUpdate[geneId]
                            except KeyError:
                                # Unknown id: register it under its own name.
                                listBioNameUpdate[geneId] = geneId
                                nameGene = geneId
                        else:
                            # Same slicing as the 'expansion' members.  The
                            # original split on the literal's own quote and
                            # kept piece 0, which is always just "b", so the
                            # regex never matched and this branch crashed.
                            csvListText = rawText.split(r'"')[0].split(
                                r'\n')
                            nameGene = _humanGeneName(csvListText[0]).upper()

                        # Read the member only if its gene is requested.
                        if nameGene in genesToAnalyze:
                            _dumpLines(namefilezip, csvListText)
                            try:
                                if vitis:
                                    listGenes = ut.readFilesVitis(
                                        namefilezip, True)[0]
                                    if nameGene == 'GT-001':
                                        listGenes[0] = nameGene  #TODEL_GT-001
                                else:
                                    listGenes = readFilesHuman(
                                        namefilezip, TCGAdb)
                            finally:
                                os.remove(namefilezip)
                            _appendFiltered(listGenes)
        else:
            # Plain gene file.
            with open(f, 'r') as fileRead:
                csvText = fileRead.read()
            # NOTE(review): r'\n' is backslash + n, not a newline, so on
            # text read from disk this split normally leaves everything
            # before the first double quote in one piece; kept as in the
            # original because the regex still finds the first match.
            headText = csvText.split(r'"')[0].split(r'\n')[0]
            if _humanGeneName(headText) in genesToAnalyze:
                _appendFiltered(readFilesHuman(f, TCGAdb))

    return matrixGenes
# Example no. 4
# 0
def buildMatrixGenesHuman(listFilter, listFiles, fantom):
    """Build the matrix of filtered gene lists and rename the genes.

    The function works in three steps:

    1. every entry of ``listFiles`` (all sharing the extension of the first
       one) is parsed with ``ut.readFilesVitis``; ``.zip`` entries are
       scanned member by member, including nested archives;
    2. ``import_doc/couple_name_gene.csv`` is read to fill the module-level
       ``listBioNameUpdate`` and to rename the entries of ``list_Genes``;
    3. when the module-level ``comprimeNode`` is truthy, names are reduced
       to their ``@...`` part, lists with the same first element are merged
       and entries with the same name are averaged.

    Args:
        listFilter: filters applied, in order, to every parsed gene list
            through ``applyFilter``.
        listFiles: paths of the files to read.
        fantom: stored in the module-level ``typeDB`` and forwarded to
            ``ut.readFilesVitis`` as its second argument.

    Returns:
        tuple: ``(matrixGenes, matrixGenesOld, comprimeNode)`` where
        ``matrixGenesOld`` is a deep copy taken before compression (an empty
        list when ``comprimeNode`` is falsy).

    Side effects:
        Rebinds the module-level ``typeDB`` and ``list_Genes``, mutates
        ``listBioNameUpdate``, writes and removes temporary files named like
        the archive members, and calls ``sys.exit(-1)`` on the first entry
        whose extension differs from the first entry's.
    """
    global list_Genes
    global typeDB
    typeDB = fantom

    #BUILD MATRIX OF GENES
    matrixGenes = []

    def _parseAndStore(path, dbFlag):
        """Parse one gene file, publish its gene names, store the filtered list."""
        global list_Genes
        parsed = ut.readFilesVitis(path, dbFlag)
        list_Genes = parsed[1]
        listGenes = parsed[0]
        for geneFilter in listFilter:
            listGenes = applyFilter(listGenes, geneFilter)
        matrixGenes.append(listGenes)

    def _parseTempCopy(member, lines, dbFlag):
        """Dump the lines to a temporary file, parse it and always remove it."""
        with open(member, 'w') as csvTemp:
            csvTemp.writelines(line + '\n' for line in lines)
        try:
            _parseAndStore(member, dbFlag)
        finally:
            os.remove(member)

    extensionFiles = listFiles[0][-4:]
    for f in listFiles:
        # The original also tested "(ext != '.csv' or ext != '.zip')", which
        # is true for every string, so only the equality check matters.
        if f[-4:] != extensionFiles:
            print(
                'ERROR: FILES HAVE DIFFERENT EXTENSION. File need to have the same extension. All .csv or all .zip'
            )
            sys.exit(-1)

        if f[-4:] == '.zip':
            with zipfile.ZipFile(f, 'r') as archive:
                for namefilezip in archive.namelist():
                    if '.zip' in namefilezip:
                        # Nested archive: open it from memory.
                        nested = zipfile.ZipFile(
                            BytesIO(archive.read(namefilezip)), 'r')
                        with nested as subarchive:
                            for namefilesubzip in subarchive.namelist():
                                if 'expansion' in namefilesubzip:
                                    # Under Python 3, str() of bytes is the
                                    # literal form b'...'; only the text
                                    # before the first double quote is kept.
                                    rawText = str(
                                        subarchive.read(namefilesubzip))
                                    lines = rawText.split(r'"')[0].split(
                                        r'\n')
                                    _parseTempCopy(namefilesubzip, lines,
                                                   False)
                    elif 'expansion' in namefilezip or 'csv' in namefilezip:
                        # Both kinds of member were handled by identical
                        # code.  rawText[1] is the quote used by the bytes
                        # literal; the payload sits between the quotes.
                        rawText = str(archive.read(namefilezip))
                        quote = '"' if rawText[1] == '"' else '\''
                        lines = rawText.split(quote)[1].split(r'\n')
                        _parseTempCopy(namefilezip, lines, typeDB)
        else:
            #Read gene files .csv
            _parseAndStore(f, typeDB)

    #READ UPDATE NAME GENE
    with open('import_doc/couple_name_gene.csv', 'r') as nameFile:
        listLineName = [line.split(',') for line in nameFile.readlines()]
    for l in listLineName:
        if l[0] == '':
            # Rows without an id neither update names nor rename genes.
            continue
        # l[1][:-1] drops the last character of the second field
        # (presumably the trailing newline - TODO confirm).
        listBioNameUpdate[l[0]] = l[0] + '_' + l[1][:-1]
        # Iterate over a snapshot: the loop replaces and deletes entries of
        # list_Genes, and deleting from the list being iterated made the
        # original skip the element following each deletion.
        for n in list(list_Genes):
            if n != l[0]:
                continue
            if comprimeNode:
                shortName = (re.search(r'@\w*', l[1])).group()
                if shortName not in list_Genes:
                    list_Genes[list_Genes.index(n)] = shortName
                else:
                    del (list_Genes[list_Genes.index(n)])
            else:
                list_Genes[list_Genes.index(n)] = l[0] + '_' + l[1][:-1]

    matrixGenesOld = []
    #comprime node if request
    if comprimeNode:
        #deep copy used to check edges in pearson correlation in comprime node version
        matrixGenesOld = cp.deepcopy(matrixGenes)
        #update name with name of only gene
        for l in matrixGenes:
            l[0] = (re.search(r'@\w*', listBioNameUpdate[l[0]])).group()
            i = 1
            while i < len(l):
                tmp = l[i]
                try:
                    l[i] = (tmp[0], (re.search(
                        r'@\w*', listBioNameUpdate[tmp[1]])).group(), tmp[2])
                    i += 1
                except Exception:
                    # Entries whose name cannot be resolved are dropped on
                    # purpose.  "except Exception" keeps that best-effort
                    # behaviour without swallowing SystemExit or
                    # KeyboardInterrupt as the bare "except:" did.
                    del (l[i])
        #unify list of same gene but different isoform
        i = 0
        j = 1
        while i < len(matrixGenes):
            while j < len(matrixGenes):
                if matrixGenes[i][0] == matrixGenes[j][0]:
                    matrixGenes[i] = matrixGenes[i] + matrixGenes[j][1:]
                    del (matrixGenes[j])
                else:
                    j += 1
            i += 1
            j = i + 1
        #remove duplicate
        for l in matrixGenes:
            i = 1
            j = 2
            while i < len(l):
                while j < len(l):
                    if l[i][1] == l[j][1]:
                        # Same name twice: keep one entry holding the
                        # truncated mean of item 0 and the mean of item 2
                        # rounded to 4 decimals.
                        l[i] = (int((l[i][0] + l[j][0]) / 2), l[i][1],
                                round(((l[i][2] + l[j][2]) / 2), 4))
                        del (l[j])
                    else:
                        j += 1
                i += 1
                j = i + 1
            # Drop the entries named like the list's own first element.
            i = 1
            while i < len(l):
                if l[i][1] == l[0]:
                    del (l[i])
                else:
                    i += 1

    #return lists of gene lists filtered
    return (matrixGenes, matrixGenesOld, comprimeNode)
def readFilesGenes(listFiles, coupleGenes, listfilter):
    """Read the gene files of the genes to analyze and filter their lists.

    Every entry of ``listFiles`` must end with the same four characters as
    the first entry.  Three kinds of entry are handled:

    * ``.zip``: each member whose gene is requested is dumped to a temporary
      file (named like the member), parsed with ``ut.readFilesVitis`` and
      removed again;
    * ``.csv``: the file is parsed in place when its gene is requested;
    * anything else: ``listFiles[0]`` is split on commas and, for every
      resulting code, the first path matching ``../annotated/<code>_*`` is
      parsed.  Codes without a match are skipped.

    The gene of a file is the upper-cased fourth comma-separated field of
    its first line.  When a filter whose first item is ``'-u'`` is present,
    every file is read regardless of ``coupleGenes``.

    Args:
        listFiles: file paths, or a comma-separated string of codes.
        coupleGenes: iterable of iterables of gene names to analyze.
        listfilter: filters applied, in order, to every parsed gene list
            through ``applyFilter``; each filter is indexable and its first
            item is its flag.

    Returns:
        list: one filtered gene list per accepted file, in reading order; an
        empty list when ``listFiles`` is empty.

    Side effects:
        Prints an error and calls ``sys.exit(-1)`` on the first entry whose
        extension differs from the first entry's.
    """
    # Computed once: the original rebuilt this list for every file.
    readEverything = '-u' in [a[0] for a in listfilter]

    # Only files of these genes are read (left empty when '-u' is given).
    genesToAnalyze = set()
    if not readEverything:
        for couple in coupleGenes:
            genesToAnalyze.update(couple)

    matrixGenes = []
    if not listFiles:
        # Nothing to read; indexing listFiles[0] below would raise IndexError.
        return matrixGenes

    def _appendFiltered(listGenes):
        """Run every filter over the gene list and store the result."""
        for geneFilter in listfilter:
            listGenes = applyFilter(listGenes, geneFilter)
        matrixGenes.append(listGenes)

    extensionFiles = listFiles[0][-4:]
    for f in listFiles:
        # The original also tested "(ext != '.csv' or ext != '.zip')", which
        # is true for every string, so only the equality check matters.
        if f[-4:] != extensionFiles:
            print(
                'ERROR: FILES HAVE DIFFERENT EXTENSION. File need to have the same extension. All .csv or all .zip'
            )
            sys.exit(-1)

        if f[-4:] == '.zip':
            with zipfile.ZipFile(f, 'r') as archive:
                for namefilezip in archive.namelist():
                    # Under Python 3, str() of a bytes object is its literal
                    # form (b'...' or b"..."): the character at index 1 is
                    # the quote in use and newlines appear as the two
                    # characters backslash + n.
                    rawText = str(archive.read(namefilezip))
                    quote = '"' if rawText[1] == '"' else '\''
                    csvListText = rawText.split(quote)[1].split(r'\n')
                    nameGene = csvListText[0].split(r',')[3].upper()
                    # Read the member only if its gene is requested.
                    if readEverything or nameGene in genesToAnalyze:
                        with open(namefilezip, 'w') as csvTemp:
                            csvTemp.writelines(
                                line + '\n' for line in csvListText)
                        try:
                            listGenes = ut.readFilesVitis(namefilezip)[0]
                        finally:
                            # Remove the temporary copy even on failure.
                            os.remove(namefilezip)
                        _appendFiltered(listGenes)
        elif f[-4:] == '.csv':
            # Plain gene file.  The original branch (marked TODO) reused the
            # archive code: it overwrote the input file with mangled text
            # and then failed on the unbound name `namefilezip`, which it
            # would also have deleted.  The gene name is now taken from the
            # first line and the file is parsed in place, untouched.
            with open(f, 'r') as fileRead:
                firstLine = fileRead.readline()
            nameGene = firstLine.rstrip('\r\n').split(',')[3].upper()
            if readEverything or nameGene in genesToAnalyze:
                _appendFiltered(ut.readFilesVitis(f)[0])
        else:
            # Not a file path: treat the first entry as a comma-separated
            # list of codes and look each one up among the annotated files.
            # NOTE(review): the codes are always taken from listFiles[0],
            # not from `f`, as in the original - confirm that only a single
            # entry is expected in this mode.
            annotatedFiles = []
            for code in listFiles[0].split(','):
                matches = glob.glob('../annotated/' + str(code) + '_*')
                # glob returns an empty list when nothing matches; the
                # original compared matches[0] with 0, which raised
                # IndexError here instead of skipping the code.
                if matches:
                    annotatedFiles.append(matches[0])
            # Annotated files are read in place, are not removed, and are
            # not checked against the genes to analyze.
            for annotatedPath in annotatedFiles:
                _appendFiltered(ut.readFilesVitis(annotatedPath)[0])

    return matrixGenes