Beispiel #1
0
def fromCancerList(n, value):
    """Return proteins above the cutoff that are also in the ``n`` cancer list.

    Every ``epexchosen_<disease>.txt`` file in ``epex_outputs`` is scanned.
    Rows are whitespace-separated; column 0 is the protein and column 2 is
    its DA score. A protein is a candidate if its score is strictly greater
    than ``value`` in at least one file.

    Args:
        n: Passed through to ``numlist.cancerlist``.
        value: Cutoff; only scores strictly greater than it qualify.

    Returns:
        A list of candidate proteins, without duplicates and in order of
        first qualifying appearance, that are contained in element ``[1]``
        of ``numlist.cancerlist(n)``.

    Raises:
        OSError: If one of the input files cannot be opened.
        IndexError: If a non-blank row has fewer than three columns.
        ValueError: If column 2 of a row is not a number.
    """
    cutoff = value
    # Only the keys (disease codes) are used here; the values are not read
    # by this function.
    casedict = {
        'ACC': 92,
        'BRCA': 1044,
        'ESCA': 184,
        'HNSC': 510,
        'LAML': 149,
        'MESO': 83,
        'SKCM': 470,
        'THCA': 496,
        'COAD': 433
    }

    # Proteins from all files with DA > cutoff (not filtered for idiosync).
    # The list keeps first-seen order; the set makes the duplicate check O(1).
    candidates = []
    seen = set()
    for disease in casedict:
        # Raw string: keeps the literal backslash of the original path
        # without relying on the invalid escape sequence "\e".
        filename = r"epex_outputs\epexchosen_{}.txt".format(disease)
        with open(filename) as fin:  # closed even if parsing fails
            for line in fin:
                fields = line.split()
                if not fields:
                    # Blank lines carry no data; skip instead of crashing.
                    continue
                protein = fields[0]
                if float(fields[2]) > cutoff and protein not in seen:
                    seen.add(protein)
                    candidates.append(protein)

    # NOTE(review): element [1] of numlist.cancerlist(n) is used as a
    # container of protein names -- confirm against numlist.
    cancer_proteins = numlist.cancerlist(n)[1]
    return [protein for protein in candidates if protein in cancer_proteins]
Beispiel #2
0
def idiosync(num):
    """Group the genes of ``cancerlist(num)[1]`` by the keys they occur under.

    ``countproteins()`` is iterated as a sequence of mappings; each value of
    such a mapping is itself used as a mapping keyed by gene. For every gene,
    the keys whose inner mapping contains that gene are collected into a
    tuple (in iteration order), and genes sharing the same tuple are grouped.

    Args:
        num: Passed through to ``cancerlist``.

    Returns:
        A dict mapping each tuple of keys to the list of genes found under
        exactly those keys. Genes found under no key are grouped under the
        empty tuple ``()``.
    """
    cancerdictlist = countproteins()

    genelistnum = cancerlist(num)

    returndict = {}
    for genel in genelistnum[1]:
        # Membership test replaces the original linear scan over the inner
        # keys. NOTE(review): assumes the inner values are dict-like keyed
        # by hashable genes -- confirm against countproteins().
        tupl = tuple(
            key
            for cancerdict in cancerdictlist
            for key in cancerdict
            if genel in cancerdict[key]
        )
        returndict.setdefault(tupl, []).append(genel)

    return returndict
def numCompareTcga(nin, value):
    """Report how often each selected protein exceeds the cutoff.

    Every ``epexchosen_<disease>.txt`` file in ``epex_outputs`` is scanned
    once. Rows are whitespace-separated; column 0 is the protein and column 2
    is its DA score. A protein is selected when its score is strictly greater
    than ``value`` in at least one row and it is contained in element ``[1]``
    of ``numlist.cancerlist(nin)``.

    Args:
        nin: Passed through to ``numlist.cancerlist``.
        value: Cutoff; only scores strictly greater than it count as DA.

    Returns:
        A dict mapping each selected protein to a tuple
        ``(percent, da_rows, total_rows)``: the number of rows whose score
        exceeds the cutoff, the total number of rows for that protein across
        all files, and their ratio as a percentage rounded to one decimal.

    Raises:
        OSError: If one of the input files cannot be opened.
        IndexError: If a non-blank row has fewer than three columns.
        ValueError: If column 2 of a row is not a number.
    """
    cutoff = value

    # Only the keys (disease codes) are used here; the values are not read
    # by this function.
    casedict = {
        'ACC': 92,
        'BRCA': 1044,
        'ESCA': 184,
        'HNSC': 510,
        'LAML': 149,
        'MESO': 83,
        'SKCM': 470,
        'THCA': 496,
        'COAD': 433
    }

    # One pass over the files gathers both tallies. Counter keeps insertion
    # order, so da_counts lists proteins by first above-cutoff appearance.
    da_counts = Counter()     # rows per protein with DA > cutoff
    total_counts = Counter()  # all rows per protein
    for disease in casedict:
        # Raw string: keeps the literal backslash of the original path
        # without relying on the invalid escape sequence "\e".
        filename = r"epex_outputs\epexchosen_{}.txt".format(disease)
        with open(filename) as fin:  # closed even if parsing fails
            for line in fin:
                fields = line.split()
                if not fields:
                    # Blank lines carry no data; skip instead of crashing.
                    continue
                protein = fields[0]
                total_counts[protein] += 1
                if float(fields[2]) > cutoff:
                    da_counts[protein] += 1

    # NOTE(review): element [1] of numlist.cancerlist(nin) is used as a
    # container of protein names -- confirm against numlist.
    cancer_proteins = numlist.cancerlist(nin)[1]

    returnDict = {}
    for protein, da_rows in da_counts.items():
        if protein in cancer_proteins:
            total_rows = total_counts[protein]
            returnDict[protein] = (round(da_rows / total_rows * 100, 1),
                                   da_rows, total_rows)
    return returnDict
Beispiel #4
0
        listCandidateProteins = [
        ]  #list of all proteins in all files with DA > cutoff (not filtered for idiosync)
        for disease in diseaseList:
            filename = "epex_outputs\epexchosen_{}.txt".format(disease)
            fin = open(filename)
            readfile = fin.readlines()
            for i in range(len(readfile)):
                readfile[i] = readfile[i].split()
                if float(readfile[i][2]) > cutoff and readfile[i][
                        0] not in listCandidateProteins:
                    # scorelist.append(round(float(readfile[i][2]),5))
                    # listCandidateProteins.append((readfile[i][0],readfile[i][2]))  #[(protein, DAscore), (protein, DAscore),(protein, DAscore)...]
                    listCandidateProteins.append(readfile[i][0])

        fromCancerList = []
        nProteins = numlist.cancerlist(n)
        for protein in listCandidateProteins:
            if protein in nProteins[1]:
                fromCancerList.append(protein)

        freq_dict = {}
        for disease in diseaseList:
            filename = "epex_outputs\epexchosen_{}.txt".format(disease)
            fin = open(filename)
            readfile = fin.readlines()
            for i in range(len(readfile)):
                readfile[i] = readfile[i].split()
                if readfile[i][0] in fromCancerList:
                    if readfile[i][0] not in freq_dict:
                        freq_dict[readfile[i][0]] = 1
                    else:
def relFreqDict(num, value):
    """Return a z-score of the row frequency for each selected protein.

    Every ``epexchosen_<disease>.txt`` file in ``epex_outputs`` is scanned
    once. Rows are whitespace-separated; column 0 is the protein and column 2
    is its DA score. A protein is selected when its score is strictly greater
    than ``value`` in at least one row and it is contained in element ``[1]``
    of ``numlist.cancerlist(num)``. Its frequency is the total number of rows
    it has across all files, regardless of score.

    Args:
        num: Passed through to ``numlist.cancerlist``.
        value: Cutoff; only scores strictly greater than it qualify.

    Returns:
        A dict ``{protein: z-score}`` where the z-score is
        ``(frequency - mean) / sample standard deviation`` rounded to two
        decimals. The dict is empty when no protein is selected. When the
        frequencies have no spread (a single protein, or all frequencies
        equal) every z-score is ``0.0`` instead of raising.

    Raises:
        OSError: If one of the input files cannot be opened.
        IndexError: If a non-blank row has fewer than three columns.
        ValueError: If column 2 of a row is not a number.
    """
    import statistics

    cutoff = value

    # Only the keys (disease codes) are used here; the values are not read
    # by this function.
    casedict = {
        'ACC': 92,
        'BRCA': 1044,
        'ESCA': 184,
        'HNSC': 510,
        'LAML': 149,
        'MESO': 83,
        'SKCM': 470,
        'THCA': 496,
        'COAD': 433
    }

    # 1. One pass over the files: count rows per protein and remember which
    #    proteins exceed the cutoff at least once.
    row_counts = {}       # insertion-ordered: first appearance in the files
    above_cutoff = set()
    for disease in casedict:
        # Raw string: keeps the literal backslash of the original path
        # without relying on the invalid escape sequence "\e".
        filename = r"epex_outputs\epexchosen_{}.txt".format(disease)
        with open(filename) as fin:  # closed even if parsing fails
            for line in fin:
                fields = line.split()
                if not fields:
                    # Blank lines carry no data; skip instead of crashing.
                    continue
                protein = fields[0]
                row_counts[protein] = row_counts.get(protein, 0) + 1
                if float(fields[2]) > cutoff:
                    above_cutoff.add(protein)

    # NOTE(review): element [1] of numlist.cancerlist(num) is used as a
    # container of protein names -- confirm against numlist.
    cancer_proteins = numlist.cancerlist(num)[1]
    freq_dict = {
        protein: count
        for protein, count in row_counts.items()
        if protein in above_cutoff and protein in cancer_proteins
    }

    # 2. Compute mean, standard deviation and the z-score of each protein.
    if not freq_dict:
        return {}

    freq_list = list(freq_dict.values())
    mean = sum(freq_list) / len(freq_list)
    # statistics.stdev needs at least two data points.
    standardDev = statistics.stdev(freq_list) if len(freq_list) > 1 else 0.0
    if standardDev == 0:
        # No spread: every frequency equals the mean, so avoid dividing by 0.
        return {protein: 0.0 for protein in freq_dict}

    return {
        protein: round((count - mean) / standardDev, 2)
        for protein, count in freq_dict.items()
    }