Example #1
0
class System(object):
    """Glue object between keyboard input, the network connection and the UI.

    It owns an ``InputParser`` (typed keys -> finished lines), a
    ``Connection`` (packets to/from the server) and a ``NetworkParser``
    (packet -> list of actions). ``step()`` runs one input/output cycle and
    returns the actions the caller should handle.
    """

    def __init__(self):
        self.inputParser = InputParser()
        self.networkParser = NetworkParser()
        self.connection = Connection()
        self.output = ""
        # Created here as well as in step() so that parsePacket() can be
        # called before the first step() without an AttributeError.
        self.actions = []

    def connect(self, host, port):
        """Open the underlying connection to ``host``:``port``."""
        self.connection.connect(host, port)

    def send(self, data):
        """Send ``data`` to the server unchanged."""
        self.connection.sendPacket(data)

    def step(self):
        """Run one cycle and return the list of actions produced by it.

        Every finished input line is forwarded to the server:

        - a line starting with ``.`` is sent, without the dot, wrapped in
          ``\\xFF\\xFA\\xC9 ... \\xFF\\xF0`` (telnet IAC SB / IAC SE framing with
          option byte 0xC9; the "gmcp" action name below suggests this is the
          GMCP option);
        - any other line (including an empty one) is echoed as a ``print``
          action and sent followed by a newline.

        Afterwards all packets currently available from the connection are
        parsed and their actions collected.
        """
        self.actions = []
        for line in self.inputParser.getOutput():
            # startswith() instead of line[0]: an empty line must not raise.
            if line.startswith('.'):
                self.connection.sendPacket(
                    "\xFF\xFA\xC9" + line[1:] + "\xFF\xF0")
            else:
                self.actions.append(['print', line + "\n"])
                self.send(line + "\n")
        self.connection.receive()
        while True:
            packet = self.connection.getPacket()
            if not packet:
                break
            self.parsePacket(packet)
        return self.actions

    def parsePacket(self, packet):
        """Translate one packet into actions.

        - ``["send", data]`` is answered on the wire immediately and is NOT
          passed on to the caller;
        - ``["gmcp", "<name> <json>"]`` becomes ``["gmcp", name, decoded]``;
          a message without payload, or with a payload that is not valid
          JSON, becomes the ``"no hablo gmcp"`` print action;
        - every other action is passed through unchanged.
        """
        for action in self.networkParser.parse(packet):
            kind = action[0]
            if kind == "send":
                self.send(action[1])
            elif kind == "gmcp":
                # elif (not a second `if`): otherwise "send" actions fall
                # into the else branch below and leak into self.actions.
                parts = action[1].split(' ', 1)
                payload = None
                understood = False
                if len(parts) > 1:
                    try:
                        payload = json.loads(parts[1])
                        understood = True
                    except ValueError:
                        # Server data is untrusted; bad JSON must not kill
                        # the read loop.
                        understood = False
                if understood:
                    self.actions.append(["gmcp", parts[0], payload])
                else:
                    self.actions.append(["print", "no hablo gmcp\n"])
            else:
                self.actions.append(action)

    def input(self, key):
        """Feed one key press to the input parser."""
        self.inputParser.parse(key)

    def getInput(self):
        """Return the input parser's current command (``getCommand()``)."""
        return self.inputParser.getCommand()
Example #2
0
    def getCosmicGenes(self):
        """
        Load the causal (COSMIC) gene file configured in
        settings.files['causalGenesFile'].

        Returns a tuple (names, typesByName):
        - names: list with the .name of the object in column 3 of every row,
          in file order
        - typesByName: dictionary mapping that name to column 4 of the row
        """
        causalGeneRows = InputParser().readCausalGeneFile(
            settings.files['causalGenesFile'])

        names = []
        typesByName = {}
        for row in causalGeneRows:
            geneName = row[3].name
            names.append(geneName)
            typesByName[geneName] = row[4]

        return names, typesByName
Example #3
0
    def getCosmicGenes(self):
        """
        Read the COSMIC genes from the causal gene file.

        Returns two values: the list of gene names (in file order), and a
        dictionary with the gene name as key and the associated cancer type
        (column 4 of the gene row) as value.
        """
        geneRows = InputParser().readCausalGeneFile(
            settings.files['causalGenesFile'])

        #one pass over the rows; dict() keeps the last entry for a repeated name
        nameTypePairs = [(row[3].name, row[4]) for row in geneRows]
        geneNames = [name for name, _ in nameTypePairs]
        cancerTypeByGene = dict(nameTypePairs)

        return geneNames, cancerTypeByGene
import random
from scipy import stats
#The 2nd command line argument is put on the import path before 'settings' is
#imported below (presumably the directory that holds settings.py - confirm).
path = sys.argv[2]
sys.path.insert(1, path)
sys.path.insert(1, 'linkSVsGenes/')

#these two imports rely on the sys.path entries added above, keep them here
import settings
from inputParser import InputParser

#1st command line argument
outDir = sys.argv[1]

svTypes = ['DEL', 'DUP', 'INV', 'ITX']
#svTypes = ['DEL']

#get the cosmic genes
cosmicGenes = InputParser().readCausalGeneFile(
    settings.files['causalGenesFile'])
#the gene object is in column 3 of each row, only its name is kept here
cosmicGeneNames = []
for gene in cosmicGenes:
    cosmicGeneNames.append(gene[3].name)

nonCausalGenes = InputParser().readNonCausalGeneFile(
    settings.files['nonCausalGenesFile'],
    cosmicGenes)  #In the same format as the causal genes.

#Combine the genes into one set (causal rows first, then non-causal rows).
allGenes = np.concatenate((cosmicGenes, nonCausalGenes), axis=0)

bcGeneNames = []
#hard-coded relative path, resolved against the current working directory
bcGenesFile = '../data/genes/breastCancerCausalGenes.txt'  #make setting
with open(bcGenesFile, 'r') as inF:
Example #5
0
    def generateFrequencyScatterPlot(self, cancerTypes, pathogenicSNVCounts):
        """
        Create figures S3A+B (box plots), table S3 and figure 2A (scatter plot).

        Parameters:
        - cancerTypes: cancer types to include in the plot. Each value is split
          on '_' (e.g. 'HMF_Breast') and is used as key into self.cancerTypeNames.
        - pathogenicSNVCounts: dictionary with the cancer type as key, and as value
          a dictionary with each gene as key, and as value the count of high-impact
          SNVs affecting that gene in this cancer type.

        The SV-gene pairs returned by self.getCorrectlyPredictedCosmicPairs are
        '_'-separated strings; field 0 is used as gene, field 1 as patient and
        field 2 as SV type.

        Files written (paths relative to the working directory):
        - output/figures/figureS3A.svg, output/figures/figureS3B.svg
        - output/significantGenes.txt
        - output/figures/figure2A.svg
        File read: ../data/genes/Compendium_Cancer_Genes.tsv

        Returns nothing.
        """

        #Get the predicted positive SV-gene pairs (no longer cosmic-specific)
        #NOTE(review): getCosmicGenes() does not depend on cancerType but is called
        #on every iteration; with an empty cancerTypes, cosmicGeneCancerTypes is
        #never bound and the later uses fail.
        allCosmicPairs = dict()
        for cancerType in cancerTypes:
            cosmicGeneNames, cosmicGeneCancerTypes = self.getCosmicGenes()
            correctCosmicPairs = self.getCorrectlyPredictedCosmicPairs(
                cancerType, cosmicGeneNames)
            allCosmicPairs[cancerType] = correctCosmicPairs

        #Create an order for the genes and cancer types
        #NOTE(review): cosmicGenesIndex is not read again in this method, and
        #geneFrequencies is re-initialised before figure 2A is computed, so only
        #cancerTypesIndex and cancerTypePlotNames from this loop are used later.
        cancerTypesIndex = dict()
        cosmicGenesIndex = dict()
        geneFrequencies = dict()
        geneInd = 0
        cancerTypePlotNames = []
        for cancerTypeInd in range(0, len(allCosmicPairs)):
            cancerType = list(allCosmicPairs.keys())[cancerTypeInd]
            cancerTypesIndex[cancerType] = cancerTypeInd

            #the plot label is the 2nd '_'-separated field of the cancer type
            splitCancerType = cancerType.split('_')
            cancerType2 = '_'.join(splitCancerType[1:2])
            cancerTypePlotNames.append(cancerType2)

            if cancerType not in geneFrequencies:
                geneFrequencies[cancerType] = dict()

            for pair in allCosmicPairs[cancerType]:
                splitPair = pair.split('_')
                gene = splitPair[0]
                if gene not in cosmicGenesIndex:
                    cosmicGenesIndex[gene] = geneInd
                    geneInd += 1
                if gene not in geneFrequencies[cancerType]:
                    geneFrequencies[cancerType][gene] = 0
                geneFrequencies[cancerType][gene] += 1

        #check distribution of genes/cosmic etc
        #the dictionaries below are used as sets (the value is always 0):
        #these three collect across all cancer types, the ...C variants per cancer type
        uniqueGenes = dict()
        uniqueCosmicGenes = dict()
        uniqueSpecificGenes = dict()

        #rows: [cancer type, gene type label, gene count, patient]
        plotData = []
        plotDataAllGenes = []
        for cancerTypeInd in range(0, len(allCosmicPairs)):
            cancerType = list(allCosmicPairs.keys())[cancerTypeInd]
            #keywords that are searched (as regex, case-insensitive) in the
            #cancer type annotation of a COSMIC gene
            cancerTypeNames = self.cancerTypeNames[cancerType]

            uniqueGenesC = dict()
            uniqueCosmicGenesC = dict()
            uniqueSpecificGenesC = dict()

            uniquePatients = dict()
            genesPerPatient = dict()

            for pair in allCosmicPairs[cancerType]:
                splitPair = pair.split('_')
                gene = splitPair[0]
                uniquePatients[splitPair[1]] = 0

                uniqueGenes[gene] = 0
                uniqueGenesC[gene] = 0
                #NOTE(review): geneType is assigned but not read in this method
                geneType = 'Predicted driver gene'
                if gene in cosmicGeneCancerTypes:
                    geneType = 'CGC gene'
                    uniqueCosmicGenes[gene] = 0
                    uniqueCosmicGenesC[gene] = 0
                    for keyword in cancerTypeNames:
                        if re.search(keyword, cosmicGeneCancerTypes[gene],
                                     re.IGNORECASE):
                            geneType = 'Cancer-type specific CGC gene'
                            uniqueSpecificGenes[gene] = 0
                            uniqueSpecificGenesC[gene] = 0

                #a gene is appended once per pair, so a gene with several pairs
                #for the same patient is listed (and counted below) several times
                if splitPair[1] not in genesPerPatient:
                    genesPerPatient[splitPair[1]] = []
                genesPerPatient[splitPair[1]].append(gene)

            print('cancer type: ', cancerType)
            print('genes: ', len(uniqueGenesC))
            print('cosmic genes: ', len(uniqueCosmicGenesC))
            print('specific genes: ', len(uniqueSpecificGenesC))
            print(uniqueSpecificGenesC)
            print('number of patients: ', len(uniquePatients))
            #NOTE(review): raises ZeroDivisionError when this cancer type has no pairs
            print('genes per patient: ',
                  len(uniqueGenesC) / len(uniquePatients))

            #NOTE(review): these three lists are filled but not read afterwards
            perPatientGeneDistribution = []
            perPatientCosmicGeneDistribution = []
            perPatientSCosmicGeneDistribution = []
            for patient in genesPerPatient:

                geneCount = 0
                cosmicGeneCount = 0
                sCosmicGeneCount = 0
                for gene in genesPerPatient[patient]:
                    geneCount += 1
                    if gene in cosmicGeneCancerTypes:
                        cosmicGeneCount += 1
                        #no break: a gene matching several keywords is counted
                        #once per matching keyword
                        for keyword in cancerTypeNames:
                            if re.search(keyword, cosmicGeneCancerTypes[gene],
                                         re.IGNORECASE):
                                sCosmicGeneCount += 1

                perPatientGeneDistribution.append(geneCount)
                perPatientCosmicGeneDistribution.append(cosmicGeneCount)
                perPatientSCosmicGeneDistribution.append(sCosmicGeneCount)

                plotDataAllGenes.append(
                    [cancerType, 'Predicted driver genes', geneCount, patient])
                plotData.append(
                    [cancerType, 'CGC genes', cosmicGeneCount, patient])
                plotData.append([
                    cancerType, 'Cancer type-specific CGC genes',
                    sCosmicGeneCount, patient
                ])

        print('total drivers: ', len(uniqueGenes))
        print('total known drivers: ', len(uniqueCosmicGenes))
        print('total specific drivers: ', len(uniqueSpecificGenes))

        #plot Fig S3A and S3B
        #S3A: CGC and cancer type-specific CGC gene counts per patient
        data = pd.DataFrame(plotData)
        data.columns = [
            'Cancer type', 'Gene type', 'Gene count per patient', 'Patient'
        ]

        v = sns.boxplot(y='Gene count per patient',
                        x='Cancer type',
                        data=data,
                        hue='Gene type',
                        palette=['#57db5f', '#5f57db'])

        #tick labels are positional: this assumes the box order matches the
        #order of cancerTypePlotNames - TODO confirm
        plt.xticks(np.arange(0, len(cancerTypes)),
                   cancerTypePlotNames,
                   rotation='vertical')
        plt.tight_layout()

        plt.savefig('output/figures/figureS3A.svg')

        #S3B: all predicted driver genes per patient
        #NOTE(review): there is no plt.clf()/plt.figure() between the two
        #savefig calls, so this box plot is presumably drawn into the same axes
        #as S3A - confirm that this is intended.
        data = pd.DataFrame(plotDataAllGenes)
        data.columns = [
            'Cancer type', 'Gene type', 'Gene count per patient', 'Patient'
        ]

        v = sns.boxplot(y='Gene count per patient',
                        x='Cancer type',
                        data=data,
                        hue='Gene type',
                        palette=['#db5f57'])

        plt.xticks(np.arange(0, len(cancerTypes)),
                   cancerTypePlotNames,
                   rotation='vertical')
        plt.tight_layout()

        plt.savefig('output/figures/figureS3B.svg')

        ####Then use the same information to output figure 2A

        #instead of frequency by non-coding SVs, use number of coding events as size
        print('Calculating coding events...')
        #NOTE(review): codingFrequency, normalizedCodingFrequency, patientCounts
        #and iterationCount are not read anywhere below
        codingFrequency = dict()
        normalizedCodingFrequency = dict()
        patientCounts = dict()

        #aside from normal coding events, also sample random genes to compare to
        iterationCount = 1
        #get all genes to sample from
        causalGenes = InputParser().readCausalGeneFile(
            settings.files['causalGenesFile'])
        nonCausalGenes = InputParser().readNonCausalGeneFile(
            settings.files['nonCausalGenesFile'],
            causalGenes)  #In the same format as the causal genes.

        #Combine the genes into one set.
        allGenes = np.concatenate((causalGenes, nonCausalGenes), axis=0)

        allGeneNames = []
        for gene in allGenes:
            allGeneNames.append(gene[3].name)
        #NOTE(review): this overwrites the cosmicGeneNames from getCosmicGenes()
        #and is not read afterwards
        cosmicGeneNames = []
        for gene in causalGenes:
            cosmicGeneNames.append(gene[3].name)

        #first column of the compendium file, the first line (header) is skipped
        intogenDrivers = []
        intogenFile = '../data/genes/Compendium_Cancer_Genes.tsv'
        with open(intogenFile, 'r') as inF:
            lineCount = 0
            for line in inF:
                if lineCount < 1:
                    lineCount = 1
                    continue
                splitLine = line.split('\t')
                intogenDrivers.append(splitLine[0])

        #instead, sample X genes of the same set size a number of times
        #(randomSampleIterations, currently 100) and pool the SNV counts of all
        #samples; mean and std of that pool form the background.

        #fixed seed so that the random background is reproducible
        np.random.seed(1)
        randomSampleIterations = 100

        #geneFrequencies is restarted here: from now on it holds the z-score of
        #the significant genes per cancer type
        geneFrequencies = dict()
        nonCodingOnlyGenes = dict()
        #rows: [gene, cancer type, z, p-value, score, patient]
        allPValues = []
        for cancerType in cancerTypes:

            #if checking results for CTCF, make sure that we can find the results
            #in the pathogenic SNV pairs data.
            #cancerType2 is the key used to look up pathogenicSNVCounts
            if cancerType == 'HMF_Breast_CTCF':
                cancerType2 = 'HMF_Breast'
            elif cancerType == 'HMF_Colorectal_CTCF':
                cancerType2 = 'HMF_Colorectal'
            elif cancerType == 'HMF_Lung_CTCF':
                cancerType2 = 'HMF_Lung'
            else:
                splitCancerType = cancerType.split('_')
                cancerType2 = '_'.join(splitCancerType[0:2])

            nonCodingOnlyGenes[cancerType] = dict()
            geneFrequencies[cancerType] = dict()

            #unique genes with a predicted pair, only the size of this set is used
            trueGenes = dict()
            for pair in allCosmicPairs[cancerType]:

                splitPair = pair.split('_')
                gene = splitPair[0]
                trueGenes[gene] = 0

            randomDistribution = []
            for iteration in range(0, randomSampleIterations):

                #sample random genes of the same size.
                #(np.random.choice samples with replacement by default)
                randomGenes = np.random.choice(allGeneNames, len(trueGenes))
                for gene in randomGenes:
                    if gene in pathogenicSNVCounts[cancerType2]:
                        #sampled genes that have SNV counts but are not in the
                        #compendium list are left out of the background
                        if gene not in intogenDrivers:
                            continue
                        randomDistribution.append(
                            pathogenicSNVCounts[cancerType2][gene])
                    else:
                        randomDistribution.append(0)

            randomMean = np.mean(randomDistribution)
            randomStd = np.std(randomDistribution)
            #NOTE(review): pValues is filled but only allPValues is used afterwards
            pValues = []
            #allPValues = []
            seenGenes = []
            for pair in allCosmicPairs[cancerType]:

                splitPair = pair.split('_')
                patient = splitPair[1]
                svType = splitPair[2]
                gene = splitPair[0]

                #each gene is scored once per cancer type, so the patient stored
                #below is the one of the first pair seen for this gene
                if gene in seenGenes:
                    continue
                seenGenes.append(gene)

                score = 0
                if gene in pathogenicSNVCounts[cancerType2]:
                    score = pathogenicSNVCounts[cancerType2][gene]
                else:
                    #don't count duplicates, that would be more than 1 per patient
                    nonCodingOnlyGenes[cancerType][gene] = 0

                #NOTE(review): randomStd is 0 when all background values are equal
                #(e.g. all 0), z is then not finite - confirm this cannot happen
                z = (score - randomMean) / randomStd

                #upper tail probability of |z| under a standard normal
                pValue = stats.norm.sf(abs(z))
                pValues.append([gene, z, pValue])
                allPValues.append(
                    [gene, cancerType, z, pValue, score, patient])

            #NOTE(review): last statement of the loop body, so this has no effect
            if len(allPValues) < 1:
                continue

        #NOTE(review): with an empty allPValues the 2D indexing below fails
        uncorrectedPValues = np.array(allPValues, dtype='object')

        #sort by most significant first
        uncorrectedPValues = uncorrectedPValues[np.argsort(
            uncorrectedPValues[:, 3])]

        #reject, pAdjusted, _, _ = multipletests(uncorrectedPValues[:,3], method='fdr_bh', alpha=0.1) #fdr_bh or bonferroni
        #Bonferroni correction over all genes of all cancer types together,
        #alpha is left at the library default
        reject, pAdjusted, _, _ = multipletests(uncorrectedPValues[:, 3],
                                                method='bonferroni')
        #rows: [gene, z, adjusted p-value, uncorrected p-value, score, patient]
        signPatients = []
        for pValueInd in range(0, len(uncorrectedPValues[:, 3])):

            gene = uncorrectedPValues[pValueInd, 0]
            cancerType = uncorrectedPValues[pValueInd, 1]

            #keep significant genes with a positive z only, so with more SNVs
            #than the random background
            if reject[pValueInd] == True and uncorrectedPValues[pValueInd,
                                                                2] > 0:

                geneFrequencies[cancerType][gene] = uncorrectedPValues[
                    pValueInd, 2]

                signPatients.append([
                    uncorrectedPValues[pValueInd][0],
                    uncorrectedPValues[pValueInd][2], pAdjusted[pValueInd],
                    uncorrectedPValues[pValueInd][3],
                    uncorrectedPValues[pValueInd][4],
                    uncorrectedPValues[pValueInd][5]
                ])

        #NOTE(review): without any significant gene this array is empty and
        #1-dimensional, and signPatients[:, 0] below raises an IndexError
        signPatients = np.array(signPatients, dtype='object')
        print(signPatients)
        print(signPatients.shape)

        cosmicCountSignificantGenes = 0
        for gene in signPatients[:, 0]:
            if gene in cosmicGeneCancerTypes:
                cosmicCountSignificantGenes += 1

        print('Number of Cosmic genes in significant genes: ',
              cosmicCountSignificantGenes)

        #sort on adjusted p-value; only the top 50 are plotted in figure 2A,
        #while table S3 below gets all significant genes
        sortedPatients = signPatients[np.argsort(signPatients[:, 2])]
        signPatients = sortedPatients[0:50]

        print(signPatients)

        #save the significant genes to a file for table S3
        tableS3Data = []
        for row in sortedPatients:

            #find which cancer type had this gene.
            #NOTE(review): matching is on gene name only, so a row is written
            #for every pair of this gene in every cancer type, also cancer
            #types in which the gene was not significant.
            for cancerType in cancerTypes:
                for pair in allCosmicPairs[cancerType]:

                    splitPair = pair.split('_')
                    patient = splitPair[1]
                    svType = splitPair[2]
                    gene = splitPair[0]

                    if gene == row[0]:
                        #gene, corrected, uncorrected, patient, sv type, cancer type
                        #(row[2] is the adjusted p-value, row[3] the uncorrected one)
                        splitCancerType = cancerType.split('_')
                        tableS3Data.append([
                            gene, row[2], row[3], patient, svType,
                            splitCancerType[1]
                        ])

        tableS3Data = np.array(tableS3Data)
        np.savetxt('output/significantGenes.txt',
                   tableS3Data,
                   fmt='%s',
                   delimiter='\t')

        #create the scatter plot in this order, use the frequency as point size
        #genePlotIndices: x position per gene, in order of first appearance
        genePlotIndices = dict()
        currentGenePlotIndex = 0
        #rows: [gene x index, cancer type y index, marker code, z-score * 500]
        plotData = []
        #NOTE(review): plotFrequencies and pointColors are not used below
        plotFrequencies = []
        pointColors = []
        cancerTypePlotNames = []
        for cancerType in allCosmicPairs:
            splitCancerType = cancerType.split('_')
            cancerType2 = '_'.join(splitCancerType[1:2])
            cancerTypePlotNames.append(cancerType2)
            cancerTypeIndex = cancerTypesIndex[cancerType]
            cancerTypeNames = self.cancerTypeNames[cancerType]
            for pair in allCosmicPairs[cancerType]:
                splitPair = pair.split('_')
                gene = splitPair[0]

                #get frequency of this gene
                #only genes that are significant in this cancer type and are
                #in the top 50 are plotted
                if gene in geneFrequencies[
                        cancerType] and gene in signPatients[:, 0]:
                    geneFrequency = geneFrequencies[cancerType][gene]

                    if gene not in genePlotIndices:
                        genePlotIndices[gene] = currentGenePlotIndex
                        currentGenePlotIndex += 1

                    #determine the color based on if this gene is cancer-type specific
                    #edgecolors is a numeric code that ends up as marker style:
                    #1 = not in COSMIC, 3 = COSMIC, 2 = COSMIC and matching this
                    #cancer type. facecolors is assigned but not used.
                    edgecolors = 1
                    facecolors = 'black'

                    if gene in cosmicGeneCancerTypes:
                        facecolors = 'green'
                        edgecolors = 3
                        for keyword in cancerTypeNames:
                            if re.search(keyword, cosmicGeneCancerTypes[gene],
                                         re.IGNORECASE):
                                print('match', cancerType, gene)
                                edgecolors = 2
                                facecolors = 'red'

                    plotData.append([
                        genePlotIndices[gene], cancerTypeIndex, edgecolors,
                        geneFrequency * 500
                    ])

        plotData = np.array(plotData)
        print(plotData)
        print(plotData.shape)
        data = pd.DataFrame(plotData)
        data.columns = ['Gene', 'Cancer type', 'color', 'frequency']
        #a gene with several pairs in one cancer type gives identical rows
        data = data.drop_duplicates()

        #make sure to use the same colors as in the other plots, and not skip colors because
        #not all cancer types have significant genes.
        customPalette = sns.color_palette("hls", len(cancerTypes))
        finalPalette = []
        for colorInd in range(0, len(customPalette)):
            if colorInd not in plotData[:, 1]:
                continue
            else:
                finalPalette.append(customPalette[colorInd])

        plt.figure(figsize=(10, 6))
        sns.scatterplot(data=data,
                        x='Gene',
                        y='Cancer type',
                        size=data.frequency,
                        hue=data['Cancer type'],
                        legend=False,
                        style=data.color,
                        edgecolor='k',
                        sizes=(20, 300),
                        palette=finalPalette)

        plt.xticks(np.arange(0, len(genePlotIndices)),
                   list(genePlotIndices.keys()),
                   rotation='vertical')
        plt.yticks(np.arange(0, len(cancerTypesIndex)), cancerTypePlotNames)

        #NOTE(review): depending on the matplotlib version plt.axes() may create
        #a new axes instead of returning the current one - confirm (plt.gca() is
        #the explicit way to get the current axes)
        ax = plt.axes()
        ax.grid(which='minor', axis='y', linestyle='-')

        plt.tight_layout()
        plt.savefig('output/figures/figure2A.svg')
        plt.clf()
###parameters
#input files come from the settings module, the output location and the
#randomize flag from the command line (arguments 2 and 3)
geneNameConversionFile = settings.files['geneNameConversionFile']
expressionFile = settings.files['normalizedExpressionFile']
outDir = sys.argv[2]
#NOTE: this is the raw command line string, not a boolean
randomize = sys.argv[3]  #shuffle expression to get random z-scores?

specificOutDir = outDir + '/tadDisruptionsZScores/'

#create the output folder on first use
if not os.path.exists(specificOutDir):
    os.makedirs(specificOutDir)

#For each TAD, determine which genes are there

#first get all genes and their positions
causalGenes = InputParser().readCausalGeneFile(
    settings.files['causalGenesFile'])
nonCausalGenes = InputParser().readNonCausalGeneFile(
    settings.files['nonCausalGenesFile'],
    causalGenes)  #In the same format as the causal genes.

#Combine the genes into one set.
allGenes = np.concatenate((causalGenes, nonCausalGenes), axis=0)

#keep the first three columns as they are and replace the gene object in
#column 3 by its name (columns 0-2 are presumably chr, start, end - confirm)
genes = []
for gene in allGenes:
    genes.append([gene[0], gene[1], gene[2], gene[3].name])

genes = np.array(genes, dtype='object')

#also use a map for the gene names
geneNameConversionMap = dict()
    def __init__(self, genes, svData):
        """
        Define the neighborhood of the genes: read every data type listed in
        the settings, map it to the TADs/genes, and finally map the SVs to
        that neighborhood.

        genes: (numpy array) array with the genes and their information. chr, start, end, geneObject
        svData: (numpy array) array with the SVs and their information. chr1, s1, e1, chr2, s2, e2, cancerType, sampleName, svObject.

        The order of the steps below is kept exactly as it was: every
        map...ToTads call returns the TAD data that the next step works on.
        """

        #1. TADs: read them and link the genes to them
        tadPath = settings.files['tadFile']

        print("Getting TADs")
        tads = InputParser().getTADsFromFile(tadPath)

        print("original number of svs:", svData.shape)

        if settings.general['shuffleTads'] == True:
            #Random genomic positions for the TADs, lengths stay the same.
            tads = GenomicShuffler().shuffleTADs(tads)

        print("mapping TADs to genes")
        geneObjects = genes[:, 3]
        self.mapTADsToGenes(geneObjects, tads)

        #2. eQTLs, followed by the genes themselves
        eQTLPath = settings.files['eQTLFile']
        print("getting eQTLs")
        eQTLs = InputParser().getEQTLsFromFile(eQTLPath, geneObjects, self)
        #elements are stored on the TADs so that they can be found quickly
        #for the TADs that turn out to be disrupted
        tads = self.mapElementsToTads(eQTLs, tads)
        tads = self.mapGenesToTads(genes, tads)

        #3+4. enhancers and promoters: these readers also get the gene
        #objects and this neighborhood definer
        geneAwareSources = (
            ("getting enhancers", 'getEnhancersFromFile', 'enhancerFile'),
            ("getting promoters", 'getPromotersFromFile', 'promoterFile'),
        )
        for message, readerName, fileKey in geneAwareSources:
            print(message)
            elements = getattr(InputParser(), readerName)(
                settings.files[fileKey], geneObjects, self)
            tads = self.mapElementsToTads(elements, tads)

        #5+6. CpG islands and transcription factors: readers that only
        #need the file
        for message, readerName, fileKey in (
            ("Getting cpg islands", 'getCpgIslandsFromFile', 'cpgFile'),
            ("Getting transcription factors",
             'getTranscriptionFactorsFromFile', 'tfFile'),
        ):
            print(message)
            elements = getattr(InputParser(), readerName)(
                settings.files[fileKey])
            tads = self.mapElementsToTads(elements, tads)

        #7. Hi-C interactions are not used (this step was already disabled).

        #8. histone marks, only the types that matter. The key in
        #settings.files is the same as the name of the mark.
        print("Getting histone marks")
        histoneTypes = ['h3k4me3', 'h3k27ac', 'h3k27me3', 'h3k4me1']
        histoneFiles = [settings.files[markType] for markType in histoneTypes]
        for histoneFile, markType in zip(histoneFiles, histoneTypes):
            marks = InputParser().getHistoneMarksFromFile(
                histoneFile, markType)
            tads = self.mapElementsToTads(marks, tads)

        #9-12. DNAse I sites, chromHMM states, RNA pol II peaks and super
        #enhancers
        for message, readerName, fileKey in (
            ("Getting DNAse I hypersensitivity sites", 'getDNAseIFromFile',
             'dnaseIFile'),
            ("Getting chromHMM states", 'getChromHmmFromFile',
             'chromHmmFile'),
            ("Getting rnaPol binding sites", 'getRnaPolFromFile',
             'rnaPolFile'),
            ("Getting super enhancers", 'getSuperEnhancersFromFile',
             'superEnhancerFile'),
        ):
            print(message)
            elements = getattr(InputParser(), readerName)(
                settings.files[fileKey])
            tads = self.mapElementsToTads(elements, tads)

        #13. CTCF sites: mapped as elements, and their strength separately
        print("Getting ctcf sites")
        ctcfSites = InputParser().getCTCFSitesFromFile(
            settings.files['ctcfFile'])
        tads = self.mapElementsToTads(ctcfSites, tads)
        tads = self.mapCTCFStrengthToTads(ctcfSites, tads)

        #14. With everything mapped, determine what the SVs do to the
        #neighborhood
        print("Mapping SVs to the neighborhood")
        self.mapSVsToNeighborhood(genes, svData, tads)
Example #8
0
 def __init__(self):
     """Create the collaborators this object delegates to.

     One InputParser, one NetworkParser and one Connection are constructed
     (in that order) and the output buffer starts as an empty string.
     """
     self.inputParser = InputParser()
     self.networkParser = NetworkParser()
     self.connection = Connection()
     # text buffer, initially empty
     self.output = ""
Example #9
0
def _binEdges(regionStart, regionEnd, bins):
	"""
		Split a region into equally sized bins.

		regionStart (str or number): start coordinate of the region
		regionEnd (str or number): end coordinate of the region
		bins (int): number of bins

		return (list): bins+1 float boundaries; bin i spans edges[i] to edges[i+1].
	"""
	binSize = (float(regionEnd) - float(regionStart)) / bins
	edge = float(regionStart)
	edges = [edge]
	for _ in range(bins):
		#accumulate rather than multiply, so the boundaries are exactly those used historically.
		edge += binSize
		edges.append(edge)
	return edges


def _overlappingTads(tadData, chrom, start, end):
	"""
		Get the TADs on a chromosome that overlap a region.

		tadData (numpy array): TADs with chromosome, start and end in columns 0, 1 and 2
		chrom (str): chromosome to search on
		start, end (int): region to overlap

		return (numpy array): the matching rows of tadData, in their original order.
	"""
	tadChrSubset = tadData[chrom == tadData[:,0]]
	#If the region starts before the end of the TAD, and ends after the start of the TAD, the TAD is overlapped.
	return tadChrSubset[(start <= tadChrSubset[:,2]) * (end >= tadChrSubset[:,1])]


def _genesInBin(geneChrSubset, binStart, binEnd):
	"""Return the genes (start in column 1, end in column 2) that overlap the bin."""
	return geneChrSubset[(geneChrSubset[:,2] >= binStart) * (geneChrSubset[:,1] <= binEnd)]


def _tadLabel(tad):
	"""Format a TAD row as 'chr_start_end'."""
	return tad[0] + '_' + str(tad[1]) + '_' + str(tad[2])


def _pairSVsWithTads(svs, tadData):
	"""
		Determine for each SV which TAD it starts and ends in.

		svs (iterable): SVs with chr1, s1, e1, chr2, s2, e2 in positions 0-5 and the patient in position 7
		tadData (numpy array): TADs, see _overlappingTads

		return (dict): 'chr_start_end_chr_start_end' of the TAD pair as key, list of patients as value.
	"""
	tadPairs = dict()
	for sv in svs:

		if sv[0] == sv[3]: #intrachromosomal: use the outermost TADs overlapped by the whole SV.
			tadMatches = _overlappingTads(tadData, sv[0], sv[1], sv[5])

			if tadMatches.shape[0] < 2: #no matches, or overlapping just 1 TAD.
				continue

			farLeftTad = tadMatches[0]
			farRightTad = tadMatches[-1]

		else: #interchromosomal: determine the TAD from the breakpoint on either chromosome.
			#The second breakpoint has to be looked up on the second chromosome with its own
			#coordinates. Previously chr1 was searched twice, using a chr1 start and a chr2 end.
			leftMatches = _overlappingTads(tadData, sv[0], sv[1], sv[2])
			rightMatches = _overlappingTads(tadData, sv[3], sv[4], sv[5])

			if leftMatches.shape[0] < 1 or rightMatches.shape[0] < 1: #no matches
				continue

			farLeftTad = leftMatches[0]
			farRightTad = rightMatches[0]

		tadPair = _tadLabel(farLeftTad) + '_' + _tadLabel(farRightTad)
		tadPairs.setdefault(tadPair, []).append(sv[7])

	return tadPairs


def _loadRuleBasedPairs(path, elementType):
	"""
		Read the rule-based SV-gene pairs.

		path (str): file with the pair name in column 0, followed by the feature columns
		elementType (str): which gained/lost element a pair must have to be kept, see getBinScores

		return (tuple): set of 'gene_patient' for pairs passing the element filter, and set of
		'gene_patient_svType' for all pairs.
	"""
	#feature columns (shifted by 1, column 0 is the pair name). eQTLs: 1 and 27, enhancers: 2 and 28,
	#promoters: 3 and 29, SEs: 25 and 51. Any other element type keeps every pair.
	featureColumns = {
		'eQTL_se_enh': (1, 27, 2, 28, 25, 51),
		'enh': (2, 28),
		'se': (25, 51),
		'promoter': (3, 29),
	}.get(elementType)

	#ndmin=2 keeps a file with a single pair 2-dimensional.
	ruleBasedCombinations = np.loadtxt(path, dtype='object', ndmin=2)

	ruleBasedPairs = set()
	ruleBasedPairsSVs = set()
	for combination in ruleBasedCombinations:
		splitPair = combination[0].split('_')

		#use for the CNV amp check
		ruleBasedPairsSVs.add(splitPair[0] + '_' + splitPair[7] + '_' + splitPair[12])

		#use to exclude based on rules
		if featureColumns is None or any(combination[column] == '1.0' for column in featureColumns):
			ruleBasedPairs.add(splitPair[0] + '_' + splitPair[7])

	return ruleBasedPairs, ruleBasedPairsSVs


def _loadPatientGenes(mutDir, name):
	"""Load one of the per-patient mutated gene dictionaries stored as a pickled .npy file."""
	#NOTE: allow_pickle=True unpickles arbitrary objects; only load files written by this pipeline.
	return np.load(mutDir + name + '.npy', allow_pickle=True, encoding='latin1').item()


def _geneIsMutated(geneName, sample, mutationSets):
	"""Return True if the gene is listed for this sample in any of the mutation dictionaries."""
	return any(geneName in mutations[sample] for mutations in mutationSets)


def getBinScores(zScores, rules, cosmic, expressionCutoff, randomExpression, svType, elementType):
	"""
		Get the z-scores in each bin in the TADs.

		Every pair of TADs connected by an SV is divided into 10 bins per TAD, and so are the TAD
		directly before the left TAD and the TAD directly after the right TAD. The z-scores of the
		genes in each bin are collected across all TAD pairs. For the adjacent TADs the z-scores are
		computed here from the expression data.

		Reads settings.files and the output folder given in sys.argv[1], and prints every collected
		z-score.

		zScores (numpy array): z-scores as calculated in computeZScoresDisruptedTads.py
		rules (str): True or False, do we only plot genes that have an SV-gene pair identified by the rules?
		cosmic (str): True or False, do we only focus on COSMIC genes?
		expressionCutoff (str): obsolete
		randomExpression (str): True or False, do we use randomized z-scores (across & between patients/genes) to plot?
		svType (str): which SV type are we plotting for, or ALL
		elementType (str): eQTL_se_enh: run with only SV-gene pairs that gain/lose an enhancer, eQTL or super enhancer, enh: only enhancers, promoter: only promoters, se: only super enhancers.

		return (dict): bin index as key, list of z-scores as value. Bins 0-9: left adjacent TAD,
		10-19: left TAD, 20-29: right TAD, 30-39: right adjacent TAD.
	"""

	bins = 10 #have 10 in each TAD.

	#Split the patient_gene identifiers, and group the z-scores per gene so that we do not
	#need to search through all z-scores for every gene.
	splitZScores = []
	zScoresByGene = dict()
	for zScore in zScores:
		splitScore = zScore[0].split("_")
		entry = [splitScore[0], splitScore[1], float(zScore[5])]
		splitZScores.append(entry)
		zScoresByGene.setdefault(splitScore[1], []).append(entry)

	causalGenes = InputParser().readCausalGeneFile(settings.files['causalGenesFile'])
	nonCausalGenes = InputParser().readNonCausalGeneFile(settings.files['nonCausalGenesFile'], causalGenes) #In the same format as the causal genes.

	if cosmic == 'True':
		allGenes = causalGenes
	else:
		#Combine the genes into one set.
		allGenes = np.concatenate((causalGenes, nonCausalGenes), axis=0)

	#Get all SVs, and filter for the right SV type that we make this plot for.
	svData = InputParser().getSVsFromFile_hmf(settings.files['svDir'])

	filteredSVs = []
	types = []
	for sv in svData:
		if svType != 'ALL' and sv[8].svType != svType:
			continue

		filteredSVs.append(sv)
		if sv[8].svType not in types:
			types.append(sv[8].svType)

	print(types)

	#For each SV, determine which TAD it starts and ends in.
	tadData = InputParser().getTADsFromFile(settings.files['tadFile'])
	tadPairs = _pairSVsWithTads(filteredSVs, tadData)

	#NOTE(review): all TAD pairs are used. Pairs of which a boundary is disrupted again in the
	#same patient are not excluded; confirm whether such a filter should be applied here.

	#also use a map for the gene names, because these are different in the expression data.
	geneNameConversionMap = dict()
	with open(settings.files['geneNameConversionFile'], 'r') as inF:
		for lineCount, line in enumerate(inF):

			if lineCount < 1: #skip the header
				continue
			splitLine = line.strip().split("\t")
			#we only keep everything before the dot of the ENSG ID
			geneNameConversionMap[splitLine[3].split('.')[0]] = splitLine[4]

	#always get the rules so that we can filter out genes overlapped by CNV amplifications that are not affected by non-coding duplications.
	ruleBasedPairs, ruleBasedPairsSVs = _loadRuleBasedPairs(sys.argv[1] + '/linkedSVGenePairs/nonCoding_geneSVPairs.txt_', elementType)

	#Collect all patients with mutations, easier in the adjacent TADs to just filter all patients with ANY mutation without having to go through all types individually.
	mutDir = sys.argv[1] + '/patientGeneMutationPairs/'
	snvPatients = _loadPatientGenes(mutDir, 'snvPatients')
	svPatientsDel = _loadPatientGenes(mutDir, 'svPatientsDel')
	svPatientsDup = _loadPatientGenes(mutDir, 'svPatientsDup')
	svPatientsInv = _loadPatientGenes(mutDir, 'svPatientsInv')
	svPatientsItx = _loadPatientGenes(mutDir, 'svPatientsItx')
	cnvPatientsDel = _loadPatientGenes(mutDir, 'cnvPatientsDel')
	cnvPatientsAmp = _loadPatientGenes(mutDir, 'cnvPatientsAmp')
	anyMutation = (svPatientsDel, svPatientsDup, svPatientsInv, svPatientsItx, snvPatients, cnvPatientsAmp, cnvPatientsDel)

	#one list of z-scores per bin: left adjacent TAD, left TAD, right TAD, right adjacent TAD.
	binZScoresOffset = dict()
	for binInd in range(0, bins*4):
		binZScoresOffset[binInd] = []

	perTadPositivePatients = dict()

	def collectTadScores(label, tad, chrom, start, end, firstBin):
		"""Add the precomputed z-scores of the genes in one TAD of a pair to its bins."""
		binStarts = _binEdges(start, end, bins)
		geneChrSubset = allGenes[allGenes[:,0] == chrom]

		for binInd in range(0, bins):

			allGeneZScores = []
			for gene in _genesInBin(geneChrSubset, binStarts[binInd], binStarts[binInd+1]):
				geneName = gene[3].name

				#only genes with a z-score have entries here.
				for sample, _, geneZScore in zScoresByGene.get(geneName, ()):

					if sample not in tadPairs[tad]:
						continue

					#the positive set is registered before any of the filters below.
					if sample not in perTadPositivePatients[tad]:
						perTadPositivePatients[tad].append(sample)

					#if rules, do not include this gene if there was no SV linked to it in this patient.
					if rules == 'True' and geneName + '_' + sample not in ruleBasedPairs:
						continue

					#remove incorrect CNV amplification effects.
					if geneName in cnvPatientsAmp[sample] and geneName + '_' + sample + '_DUP' not in ruleBasedPairsSVs:
						continue

					#if a gene is deleted, the deletion will never result in the gain effect.
					#this is only true for deletions.
					if svType == 'DEL' and (geneName in svPatientsDel[sample] or geneName in cnvPatientsDel[sample]):
						continue

					if np.isnan(geneZScore):
						continue

					if randomExpression == 'True':
						randInd = random.sample(range(0, len(splitZScores)), 1)[0]
						finalScore = float(splitZScores[randInd][2])
					else:
						finalScore = geneZScore

					allGeneZScores.append(finalScore)
					print(label, binInd, geneName, sample, finalScore)

			if len(allGeneZScores) > 0:
				binZScoresOffset[firstBin + binInd] += allGeneZScores

	#now for each TAD, get the z-scores
	for tad in tadPairs:

		perTadPositivePatients[tad] = []
		splitTad = tad.split('_')

		collectTadScores('LT: ', tad, splitTad[0], splitTad[1], splitTad[2], bins)
		collectTadScores('RT: ', tad, splitTad[3], splitTad[4], splitTad[5], bins*2)

	#get the expression data
	expressionData = []
	expressedGenes = set()
	samples = []
	with open(settings.files['expressionFile'], 'r') as inF:
		for lineCount, line in enumerate(inF):
			line = line.strip()
			if lineCount == 0:
				#offset by 1 so that the sample index matches the column in the data rows.
				samples = [''] + line.split("\t")
				continue
			splitLine = line.split("\t")
			if splitLine[0] not in geneNameConversionMap:
				continue
			geneName = geneNameConversionMap[splitLine[0]] #get the gene name rather than the ENSG ID

			expressionData.append([geneName] + splitLine[1:])
			expressedGenes.add(geneName)

	expressionData = np.array(expressionData, dtype="object")

	#generate the randomized expression for the adjacent TADs, where we cannot use the z-scores anymore since these were not computed for the adjacent TADs.
	if randomExpression == 'True':
		from copy import deepcopy
		expression = deepcopy(expressionData[:,1:])
		expressionT = expression.T
		np.random.shuffle(expressionT) #shuffles the sample columns, in place.
		shuffledExpressionData = np.empty(expressionData.shape, dtype='object')
		shuffledExpressionData[:,0] = expressionData[:,0]
		shuffledExpressionData[:,1:] = expressionT.T

		expressionData = shuffledExpressionData

	#pre-filter expression data, for the positive and negative set in the adjacent TADs.
	#this makes it quicker to search through
	filteredExpressionData = dict()
	for sampleInd, sample in enumerate(samples):

		if sample == '':
			continue

		sampleExpression = filteredExpressionData.setdefault(sample, dict())
		for row in expressionData:
			sampleExpression[row[0]] = float(row[sampleInd])

	#Get all TADs that are affected by SVs (positive) and that are not (negative)
	affectedCount = 0
	tadPositiveAndNegativeSet = []
	tadPositions = dict() #position of the first occurrence of each TAD in the file.
	with open(sys.argv[1] + '/tadDisruptionsZScores/tadPositiveAndNegativeSet.txt', 'r') as inF:
		for line in inF:

			splitLine = line.split('\t')
			positiveSet = ast.literal_eval(splitLine[1])
			negativeSet = ast.literal_eval(splitLine[2])

			if len(positiveSet) > 0:
				affectedCount += 1

			tadPositions.setdefault(splitLine[0], len(tadPositiveAndNegativeSet))
			tadPositiveAndNegativeSet.append([splitLine[0], positiveSet, negativeSet])

	print('affected tads: ', affectedCount)

	def collectAdjacentScores(label, tad, adjacentTad, pairChrom, firstBin):
		"""Compute z-scores for the genes in a TAD next to a pair, and add these to its bins."""
		splitAdjacentTad = adjacentTad[0].split('_')
		adjacentPositiveSet = adjacentTad[1]
		adjacentNegativeSet = adjacentTad[2]

		if splitAdjacentTad[0] != pairChrom: #the neighbouring TAD is on another chromosome
			return

		binStarts = _binEdges(splitAdjacentTad[1], splitAdjacentTad[2], bins)
		geneChrSubset = allGenes[allGenes[:,0] == splitAdjacentTad[0]]

		for binInd in range(0, bins):

			allGeneZScores = []
			for gene in _genesInBin(geneChrSubset, binStarts[binInd], binStarts[binInd+1]):
				geneName = gene[3].name

				if geneName not in expressedGenes:
					continue

				positiveExpr = []
				positivePatients = []
				negativeExpr = []
				for sample in samples:

					if sample == '':
						continue

					#we use the tad pair itself to define the positive set.
					#based on the adjacent tad, we define the negative set.
					if sample in perTadPositivePatients[tad]:

						if sample in adjacentPositiveSet: #skip if this patient has a disruption of the adjacent TAD.
							continue

						#exclude this gene if it overlaps a mutation
						if _geneIsMutated(geneName, sample, anyMutation):
							continue

						positiveExpr.append(filteredExpressionData[sample][geneName])
						positivePatients.append(sample)
					elif sample in adjacentNegativeSet:

						#exclude this gene if it overlaps a mutation
						if _geneIsMutated(geneName, sample, anyMutation):
							continue

						negativeExpr.append(filteredExpressionData[sample][geneName])

				#without a negative set the z-score is undefined, so there is nothing to add.
				if len(positiveExpr) == 0 or len(negativeExpr) == 0:
					continue

				negativeStd = float(np.std(negativeExpr))
				if negativeStd == 0:
					continue
				negativeMean = np.mean(negativeExpr)

				for sample, expression in zip(positivePatients, positiveExpr):

					z = (float(expression) - negativeMean) / negativeStd

					if np.isnan(z):
						continue

					print(label, binInd, geneName, sample, z)
					allGeneZScores.append(z)

			if len(allGeneZScores) > 0:
				binZScoresOffset[firstBin + binInd] += allGeneZScores

	#so instead of looking at a region around the TADs, use the TADs that are not affected.
	#per pair, find where it is in the positive/negative set file and get the previous and next TAD.
	#The left and right side are handled independently, so a missing left neighbour no longer
	#prevents the right neighbour from being used.
	for tad in tadPairs:

		splitTad = tad.split('_')

		#get the TAD to the left of this tad pair
		leftTadPosition = tadPositions.get(splitTad[0] + '_' + splitTad[1] + '_' + splitTad[2])
		#the first TAD in the file has no left neighbour (do not wrap around to the last TAD).
		if leftTadPosition is not None and leftTadPosition > 0:
			collectAdjacentScores('LAT: ', tad, tadPositiveAndNegativeSet[leftTadPosition-1], splitTad[0], 0)

		#get the TAD to the right of this tad pair
		rightTadPosition = tadPositions.get(splitTad[3] + '_' + splitTad[4] + '_' + splitTad[5])
		#the last TAD in the file has no right neighbour.
		if rightTadPosition is not None and rightTadPosition + 1 < len(tadPositiveAndNegativeSet):
			collectAdjacentScores('RAT: ', tad, tadPositiveAndNegativeSet[rightTadPosition+1], splitTad[3], bins*3)

	return binZScoresOffset
Example #10
0
import sys
sys.path.insert(0, './jeopardy_helpers')
import kmeans
import graphGenerator
from inputParser import InputParser
import utils
import nodeInserter
import datetime
from random import randint

# Script entry point: parse the command line, build the graph, then repeatedly
# cluster the nodes and insert nodes into the resulting paths.
if __name__ == "__main__":
    # Wall-clock start of the run; not read again within this excerpt.
    start = datetime.datetime.now()
    # The raw argument vector is handed to the parser, which produces the
    # `info` object that every later stage takes and returns.
    parsedInfo = InputParser(sys.argv)
    info = parsedInfo.getInfo()
    info = graphGenerator.graphGen(info)

    # Another name for the `info` returned by graphGen (no copy is made here),
    # plus placeholders for a best result. None of the three is read again
    # within this excerpt.
    mainInfo = info
    bestPath = None
    bestValue = 0

    # NOTE(review): no exit from this loop is visible in this excerpt; the
    # snippet appears to be cut off before the loop ends.
    while True:
        # Each stage returns the `info` passed to the next one.
        info = kmeans.getClusters(info)
        info = utils.associateInsertedMap(info)
        info = utils.RouteInitPhase(info)

        # Pick one of the four candidate values uniformly at random
        # (randint includes both end points).
        info['clusterParameter'] = [1, 1.1, 1.2, 1.3][randint(0, 3)]
        # At most 10 insertion rounds; stop early as soon as the inserter
        # returns a falsy second value.
        for i in range(0, 10):
            info, shouldGoAhead = nodeInserter.insertNodesInPaths(info)
            if not shouldGoAhead:
                break
Example #11
0
# Implementing: https://trunk.tufts.edu/access/content/attachment/d975cd4d-a3cb-4383-9807-0bfc295b7d20/Assignments/3e271133-863e-4468-8436-8f94184262e0/26nt12QMproject.rtf

import QuineMcCluskey
import SolutionPrinter
import sys
from inputParser import InputParser
import time

# Script entry point (Python 2 syntax: print statements below).
# Reads boolean functions from an optional input path and prints a minimised
# solution for each one.
if __name__ == "__main__":

    #Get and parse input
    # The input path is optional: when no argument is given, None is passed
    # on to InputParser.extractFunctions.
    try:
        inputPath = sys.argv[1]
    except IndexError:
        inputPath = None
    functions = InputParser.extractFunctions(inputPath)
    parsedFuncs = InputParser.getMintermsAndDCs(functions)

    #Solve each function
    for f in parsedFuncs:
        # Start time of this function's solve (not read in the visible part).
        start_time = time.time()
        # The POS form is obtained by running the same solver on the function
        # with its minterms flipped; the flipped copy is told to use the
        # original's number of inputs.
        fMaxterms = f.flipMinterms()  #Kinda hacky, not proud of this solution
        fMaxterms.overrideNumInputs(f.numInputs())
        #print "Calculating SOP"
        solutionSOP = QuineMcCluskey.solve(f)
        #print "Calculating POS"
        solutionPOS = QuineMcCluskey.solve(fMaxterms)
        print f
        # Trailing comma: Python 2 print without a newline, so the solution
        # is printed on the same line as the "=".
        print "=",
        SolutionPrinter.printSolution(f, solutionSOP, "SOP")
        print "=",
Example #12
0
    def generateFrequencyScatterPlot(self, allCosmicPairs,
                                     cosmicGeneCancerTypes,
                                     pathogenicSNVCounts):
        """
        Score the genes of each cancer type against a random gene sample and
        (in the currently unreachable second half) draw a gene x cancer type
        scatter plot of the significant genes.

        For every pair in allCosmicPairs a z-score of the gene's pathogenic
        SNV score is computed relative to the scores of 100 randomly sampled
        genes of the same cancer type. The resulting p-values are corrected
        across all cancer types (Bonferroni) and the significant genes with a
        positive z-score are printed.

        Parameters:
        - allCosmicPairs: dict keyed by cancer type. Each value is iterated as
          pair identifiers; the part before the first '_' is the gene name.
        - cosmicGeneCancerTypes: dict from gene name to a text that is
          regex-searched for the keywords in self.cancerTypeNames.
        - pathogenicSNVCounts: dict of cancer type -> gene name -> numeric
          score (presumably a count); genes that are absent score 0.

        Also reads self.cancerTypes and self.cancerTypeNames.

        NOTE(review): exit() is called right after the significant genes are
        printed, so the plotting code and the final `return 0` are never
        reached as written. Confirm whether that exit() is a debugging
        leftover.
        """

        #Create an order for the genes and cancer types
        cancerTypesIndex = dict()
        cosmicGenesIndex = dict()
        geneFrequencies = dict()
        geneInd = 0
        for cancerTypeInd in range(0, len(allCosmicPairs)):
            cancerType = list(allCosmicPairs.keys())[cancerTypeInd]
            cancerTypesIndex[cancerType] = cancerTypeInd

            if cancerType not in geneFrequencies:
                geneFrequencies[cancerType] = dict()

            for pair in allCosmicPairs[cancerType]:
                splitPair = pair.split('_')
                gene = splitPair[0]
                #genes are numbered in order of first appearance
                #NOTE(review): cosmicGenesIndex is filled here but not read again in this method.
                if gene not in cosmicGenesIndex:
                    cosmicGenesIndex[gene] = geneInd
                    geneInd += 1
                #count how often the gene occurs in the pairs of this cancer type
                #NOTE(review): these counts are discarded when geneFrequencies is re-created further down.
                if gene not in geneFrequencies[cancerType]:
                    geneFrequencies[cancerType][gene] = 0
                geneFrequencies[cancerType][gene] += 1

        #check distribution of genes/cosmic etc
        #The dicts below are used as sets (the value is always 0); the *C
        #variants are per cancer type, the others are across cancer types.
        uniqueGenes = dict()
        uniqueCosmicGenes = dict()
        uniqueSpecificGenes = dict()
        for cancerTypeInd in range(0, len(allCosmicPairs)):
            cancerType = list(allCosmicPairs.keys())[cancerTypeInd]
            cancerTypeNames = self.cancerTypeNames[cancerType]

            uniqueGenesC = dict()
            uniqueCosmicGenesC = dict()
            uniqueSpecificGenesC = dict()

            for pair in allCosmicPairs[cancerType]:
                splitPair = pair.split('_')
                gene = splitPair[0]

                uniqueGenes[gene] = 0
                uniqueGenesC[gene] = 0
                if gene in cosmicGeneCancerTypes:
                    uniqueCosmicGenes[gene] = 0
                    uniqueCosmicGenesC[gene] = 0
                    #a gene is 'specific' if any keyword of this cancer type
                    #matches (case-insensitively) the gene's cancer type text
                    for keyword in cancerTypeNames:
                        if re.search(keyword, cosmicGeneCancerTypes[gene],
                                     re.IGNORECASE):
                            uniqueSpecificGenes[gene] = 0
                            uniqueSpecificGenesC[gene] = 0

            print('cancer type: ', cancerType)
            print('genes: ', len(uniqueGenesC))
            print('cosmic genes: ', len(uniqueCosmicGenesC))
            print('specific genes: ', len(uniqueSpecificGenesC))
            print(uniqueSpecificGenesC)

        print('total drivers: ', len(uniqueGenes))
        print('total known drivers: ', len(uniqueCosmicGenes))
        print('total specific drivers: ', len(uniqueSpecificGenes))

        #instead of frequency by non-coding SVs, use number of coding events as size
        print('Calculating coding events...')
        #NOTE(review): codingFrequency, normalizedCodingFrequency, patientCounts
        #and iterationCount are assigned but not used anywhere below.
        codingFrequency = dict()
        normalizedCodingFrequency = dict()
        patientCounts = dict()

        #aside from normal coding events, also sample random genes to compare to
        iterationCount = 1
        #get all genes to sample from
        causalGenes = InputParser().readCausalGeneFile(
            settings.files['causalGenesFile'])
        nonCausalGenes = InputParser().readNonCausalGeneFile(
            settings.files['nonCausalGenesFile'],
            causalGenes)  #In the same format as the causal genes.

        #Combine the genes into one set.
        allGenes = np.concatenate((causalGenes, nonCausalGenes), axis=0)

        #element 3 of every gene row is an object whose .name is the gene name
        allGeneNames = []
        for gene in allGenes:
            allGeneNames.append(gene[3].name)
        #NOTE(review): cosmicGeneNames is built but not used below.
        cosmicGeneNames = []
        for gene in causalGenes:
            cosmicGeneNames.append(gene[3].name)
        #allGenes = nonCausalGenes

        #fixed seed so the same 100 genes are drawn on every run; np.random.choice
        #samples with replacement by default, so names may repeat
        np.random.seed(42)
        randomGenes = np.random.choice(allGeneNames, 100)

        #from here on geneFrequencies holds z-scores of significant genes,
        #not the occurrence counts collected above
        geneFrequencies = dict()
        nonCodingOnlyGenes = dict()
        allPValues = []
        #every entry of self.cancerTypes must be a key of both
        #pathogenicSNVCounts and allCosmicPairs, otherwise a KeyError is raised
        for cancerType in self.cancerTypes:
            nonCodingOnlyGenes[cancerType] = dict()
            geneFrequencies[cancerType] = dict()

            #background: the scores of the random genes in this cancer type,
            #with 0 for genes that have no entry
            randomDistribution = []
            for gene in randomGenes:

                if gene in pathogenicSNVCounts[cancerType]:
                    randomDistribution.append(
                        pathogenicSNVCounts[cancerType][gene])
                else:
                    randomDistribution.append(0)
            # print(cancerType)
            # print(randomDistribution)
            # print(np.mean(randomDistribution))

            #NOTE(review): pValues is filled per cancer type but only allPValues is used afterwards.
            pValues = []
            for pair in allCosmicPairs[cancerType]:

                splitPair = pair.split('_')
                gene = splitPair[0]

                score = 0
                if gene in pathogenicSNVCounts[cancerType]:
                    #print(gene, ': ', pathogenicSNVCounts[cancerType][gene])
                    score = pathogenicSNVCounts[cancerType][gene]
                else:
                    #print(gene, ' not pathogenic')
                    #don't count duplicates, that would be more than 1 per patient
                    nonCodingOnlyGenes[cancerType][gene] = 0

                #z-score of this gene relative to the random background
                #NOTE(review): no guard for a zero standard deviation (e.g. all
                #random genes scoring 0); z then becomes inf or nan.
                z = (score -
                     np.mean(randomDistribution)) / np.std(randomDistribution)

                #p-value from the survival function of |z|
                #(presumably scipy.stats.norm; the import is not visible here)
                pValue = stats.norm.sf(abs(z))
                pValues.append([gene, z, pValue])
                allPValues.append([gene, cancerType, z, pValue])

            #uncorrectedPValues = np.array(pValues, dtype = 'object')

        #adjust across cancer types

        #print(uncorrectedPValues)

        #columns: 0 = gene, 1 = cancer type, 2 = z-score, 3 = p-value
        #NOTE(review): if allPValues is empty this array is 1-D and the
        #[:, 3] indexing below raises an IndexError.
        uncorrectedPValues = np.array(allPValues, dtype='object')

        #multiple testing correction over all genes of all cancer types at once
        #(presumably statsmodels' multipletests; the import is not visible here)
        reject, pAdjusted, _, _ = multipletests(
            uncorrectedPValues[:,
                               3], method='bonferroni')  #fdr_bh or bonferroni

        signPatients = []
        for pValueInd in range(0, len(uncorrectedPValues[:, 3])):

            gene = uncorrectedPValues[pValueInd, 0]
            cancerType = uncorrectedPValues[pValueInd, 1]

            #keep only genes that are significant after correction AND score
            #higher than the random background (positive z)
            if reject[pValueInd] == True and uncorrectedPValues[pValueInd,
                                                                2] > 0:

                #the z-score is what is later used as the 'frequency' (point size)
                geneFrequencies[cancerType][gene] = uncorrectedPValues[
                    pValueInd, 2]

                #gene, z-score, adjusted p-value
                signPatients.append([
                    uncorrectedPValues[pValueInd][0],
                    uncorrectedPValues[pValueInd][2], pAdjusted[pValueInd]
                ])

        signPatients = np.array(signPatients, dtype='object')

        print(signPatients)
        #NOTE(review): this terminates the interpreter; everything below this
        #line, including the plot and the return, is unreachable as written.
        exit()

        #create the scatter plot in this order, use the frequency as point size
        genePlotIndices = dict()
        currentGenePlotIndex = 0
        plotData = []
        #NOTE(review): plotFrequencies and pointColors are never filled or read.
        plotFrequencies = []
        pointColors = []
        for cancerType in allCosmicPairs:
            cancerTypeIndex = cancerTypesIndex[cancerType]
            cancerTypeNames = self.cancerTypeNames[cancerType]
            for pair in allCosmicPairs[cancerType]:
                splitPair = pair.split('_')
                gene = splitPair[0]

                #get frequency of this gene
                #(only genes that passed the significance filter above are plotted)
                if gene in geneFrequencies[cancerType]:
                    geneFrequency = geneFrequencies[cancerType][gene]
                    #use frequency of coding events
                    #geneFrequency = normalizedCodingFrequency[cancerType][gene]
                    #3.5

                    #x position: genes are numbered in order of first appearance
                    if gene not in genePlotIndices:
                        genePlotIndices[gene] = currentGenePlotIndex
                        currentGenePlotIndex += 1

                    #determine the color based on if this gene is cancer-type specific
                    #edgecolors is a numeric category that ends up as the marker style:
                    #1 = not in cosmicGeneCancerTypes, 3 = in it, 2 = in it and
                    #matching a keyword of this cancer type.
                    #facecolors is only used by the commented-out plt.scatter call.
                    edgecolors = 1
                    facecolors = 'black'

                    if gene in cosmicGeneCancerTypes:
                        facecolors = 'green'
                        edgecolors = 3
                        for keyword in cancerTypeNames:
                            if re.search(keyword, cosmicGeneCancerTypes[gene],
                                         re.IGNORECASE):
                                print('match', cancerType, gene)
                                edgecolors = 2
                                facecolors = 'red'

                    plotData.append([
                        genePlotIndices[gene], cancerTypeIndex, edgecolors,
                        geneFrequency * 500
                    ])

                    #plt.scatter(genePlotIndices[gene], cancerTypeIndex, color=facecolors, s = geneFrequency*5)
        plotData = np.array(plotData)
        data = pd.DataFrame(plotData)
        data.columns = ['gene', 'cancerType', 'color', 'frequency']
        #a gene can occur in several pairs of one cancer type; plot it once
        data = data.drop_duplicates()
        print(data)
        #exit()
        #one point per (gene, cancer type): size from 'frequency', hue from the
        #cancer type index, marker style from the 'color' category
        sns.scatterplot(data=data,
                        x='gene',
                        y='cancerType',
                        size=data.frequency,
                        hue=data.cancerType,
                        legend=False,
                        style=data.color,
                        edgecolor='k',
                        sizes=(20, 300),
                        palette=sns.color_palette("hls",
                                                  data.cancerType.nunique()))

        #plt.yticks(np.arange(0, len(genePlotIndices)), list(genePlotIndices.keys()))
        #plt.xticks(np.arange(0, len(cancerTypesIndex)), list(cancerTypesIndex.keys()), rotation = 'vertical')

        #label the numeric axes with the gene names (x) and cancer types (y)
        plt.xticks(np.arange(0, len(genePlotIndices)),
                   list(genePlotIndices.keys()),
                   rotation='vertical')
        plt.yticks(np.arange(0, len(cancerTypesIndex)),
                   list(cancerTypesIndex.keys()))

        plt.tight_layout()
        plt.savefig('frequency_scatter.svg')
        plt.show()

        return 0
Example #13
0
    def plotPathogenicSVFrequency(self):
        """
        Plot SV statistics per cancer type in self.cancerTypes.

        The method consists of three consecutive sections:
        1. a bar plot of the number of lines in each cancer type's pathogenic
           SV-gene pair file,
        2. bar plots of the total number of SVs and of the number of unique
           values in column 7 of the SV data (plotted as 'sampleCount'),
        3. a violin plot of pathogenic pairs per sample and a stacked bar
           chart of the SV type percentages (DEL/DUP/INV/ITX).

        NOTE(review): section 1 ends with exit(), which terminates the
        interpreter, so sections 2 and 3 are unreachable as written (section 2
        ends with a second exit()). Confirm whether these are debugging
        leftovers.

        Reads self.cancerTypes and (in section 2) self.cancerTypeMetadataNames.
        Nothing is returned; plots are shown with plt.show().
        """

        # plotData = dict()
        # plotData['pathogenicSVs'] = []
        # plotData['totalSVs'] = []
        plotData = []
        for cancerType in self.cancerTypes:

            #count how many pathogenic SVs we have
            #(= number of lines in the pathogenic pair file of this cancer type)
            pathogenicSVFile = 'output/' + cancerType + '/linkedSVGenePairs/nonCoding_geneSVPairs.txt_pathogenicPairsFeatures.txt'

            pathogenicSVCount = 0
            with open(pathogenicSVFile, 'r') as inF:
                for line in inF:
                    pathogenicSVCount += 1

            plotData.append([cancerType, pathogenicSVCount])

            #plotData['pathogenicSVs'].append(pathogenicSVCount)

            #count the total number of SVs
            # svDir = settings.files['svDir']
            # svData = InputParser().getSVs_hmf(svDir, self.cancerTypeMetadataNames[cancerType])
            # #plotData['totalSVs'].append(svData.shape[0])
            #
            #
            # plotData.append([cancerType, svData.shape[0], 'SV'])

        data = pd.DataFrame(plotData)
        data.columns = ['cancerType', 'svCount']

        #make bar plot
        ax = sns.barplot(x="cancerType",
                         y="svCount",
                         data=data,
                         color='#a2d5f2')
        plt.xticks(np.arange(0, len(self.cancerTypes)),
                   self.cancerTypes,
                   rotation='vertical')

        plt.tight_layout()
        # Show graphic
        plt.show()
        #NOTE(review): terminates the interpreter; nothing below runs as written.
        exit()

        #section 2: total SV count and sample count per cancer type
        plotData = []
        samplePlotData = []
        for cancerType in self.cancerTypes:

            #count the total number of SVs
            svDir = settings.files['svDir']
            svData = InputParser().getSVs_hmf(
                svDir, self.cancerTypeMetadataNames[cancerType])
            #plotData['totalSVs'].append(svData.shape[0])

            #one row of svData per SV; the number of unique values in column 7
            #is used as the sample count (presumably column 7 is the sample name)
            plotData.append([cancerType, svData.shape[0]])
            samplePlotData.append([cancerType, len(np.unique(svData[:, 7]))])

        data = pd.DataFrame(plotData)
        data.columns = ['cancerType', 'svCount']

        #make bar plot
        ax = sns.barplot(x="cancerType",
                         y="svCount",
                         data=data,
                         color='#07689f')
        plt.xticks(np.arange(0, len(self.cancerTypes)),
                   self.cancerTypes,
                   rotation='vertical')

        plt.tight_layout()
        # Show graphic
        plt.show()

        data = pd.DataFrame(samplePlotData)
        data.columns = ['cancerType', 'sampleCount']

        #make bar plot
        ax = sns.barplot(x="cancerType",
                         y="sampleCount",
                         data=data,
                         color='#ff7e67')
        plt.xticks(np.arange(0, len(self.cancerTypes)),
                   self.cancerTypes,
                   rotation='vertical')

        plt.tight_layout()
        # Show graphic
        plt.show()
        #NOTE(review): second exit(); section 3 below would not run even if the
        #first exit() were removed.
        exit()

        #section 3
        #read the line count of the pathogenic SV files.
        pathogenicSVCounts = dict()
        svTypeDistribution = dict()
        plotData = []
        for cancerType in self.cancerTypes:
            pathogenicSVCounts[cancerType] = 0
            #only these four SV types are expected; any other type in the file
            #raises a KeyError in the loop below
            svTypeDistribution[cancerType] = dict()
            svTypeDistribution[cancerType]['DEL'] = 0
            svTypeDistribution[cancerType]['DUP'] = 0
            svTypeDistribution[cancerType]['INV'] = 0
            svTypeDistribution[cancerType]['ITX'] = 0
            countsPerSample = dict()

            pathogenicSVFile = 'output/' + cancerType + '/linkedSVGenePairs/nonCoding_geneSVPairs.txt_pathogenicPairsFeatures.txt'

            with open(pathogenicSVFile, 'r') as inF:
                for line in inF:
                    pathogenicSVCounts[cancerType] += 1

                    #the first tab-separated column is the pair identifier; its
                    #'_'-separated field 7 is read as the sample and field 12
                    #as the SV type
                    splitLine = line.split('\t')
                    pair = splitLine[0]
                    splitPair = pair.split('_')
                    sample = splitPair[7]
                    if sample not in countsPerSample:
                        countsPerSample[sample] = 0
                    countsPerSample[sample] += 1

                    svType = splitPair[12]
                    svTypeDistribution[cancerType][svType] += 1

            print(cancerType)
            print(len(countsPerSample))
            #one plot row per sample: its own pair count plus the cancer type total
            for sample in countsPerSample:
                plotData.append([
                    cancerType, sample, countsPerSample[sample],
                    pathogenicSVCounts[cancerType]
                ])

            #plotData.append([cancerType, pathogenicSVCounts[cancerType]])
        #exit()
        #plotData = np.array(plotData)
        data = pd.DataFrame(plotData)
        data.columns = ['cancerType', 'sample', 'sampleCount', 'svCount']

        #distribution of the per-sample pair counts for each cancer type
        #sns.scatterplot(data=data, x='cancerType', y='svCount', legend=False)
        v = sns.violinplot(data=data,
                           x='cancerType',
                           y='sampleCount',
                           legend=False)

        # add n = X to show total count.

        cancerTypesWithCounts = []
        for cancerType in self.cancerTypes:
            cancerTypesWithCounts.append(cancerType + ' (N = ' +
                                         str(pathogenicSVCounts[cancerType]) +
                                         ')')

        plt.xticks(np.arange(0, len(self.cancerTypes)),
                   cancerTypesWithCounts,
                   rotation='vertical')
        #the y axis is capped at 200; larger per-sample counts fall outside the view
        plt.ylim([0, 200])
        plt.tight_layout()
        plt.show()

        ###make a plot showing how many pathogenic SVs vs total SVs

        #make the SV type bar chart

        #make a dictionary per SV type, where each array is then the cancer type.
        plotData = dict()
        for svType in ['DEL', 'DUP', 'INV', 'ITX']:
            plotData[svType] = []
            for cancerType in svTypeDistribution:
                plotData[svType].append(svTypeDistribution[cancerType][svType])

        #one row per cancer type, one column per SV type
        df = pd.DataFrame(plotData)

        # From raw value to percentage
        totals = [
            i + j + k + l
            for i, j, k, l in zip(df['DEL'], df['DUP'], df['INV'], df['ITX'])
        ]
        delBars = [i / j * 100 for i, j in zip(df['DEL'], totals)]
        dupBars = [i / j * 100 for i, j in zip(df['DUP'], totals)]
        invBars = [i / j * 100 for i, j in zip(df['INV'], totals)]
        itxBars = [i / j * 100 for i, j in zip(df['ITX'], totals)]

        # plot
        barWidth = 0.85
        r = np.arange(0, len(self.cancerTypes))
        names = self.cancerTypes
        # DEL segment: bottom of the stack
        plt.bar(r,
                delBars,
                color='#e41a1c',
                edgecolor='white',
                width=barWidth,
                label='DEL')
        # DUP segment: stacked on top of DEL
        plt.bar(r,
                dupBars,
                bottom=delBars,
                color='#377eb8',
                edgecolor='white',
                width=barWidth,
                label='DUP')
        # INV segment: stacked on top of DEL + DUP
        plt.bar(r,
                invBars,
                bottom=[i + j for i, j in zip(delBars, dupBars)],
                color='#4daf4a',
                edgecolor='white',
                width=barWidth,
                label='INV')
        # ITX segment: stacked on top of DEL + DUP + INV
        plt.bar(
            r,
            itxBars,
            bottom=[i + j + k for i, j, k in zip(delBars, dupBars, invBars)],
            color='#984ea3',
            edgecolor='white',
            width=barWidth,
            label='ITX')

        # Custom x axis
        plt.xticks(r, names, rotation='vertical')
        #plt.xlabel("Cancer type")

        # legend is placed outside the axes, to the right of the upper corner
        plt.legend(loc='upper left', bbox_to_anchor=(1, 1), ncol=1)
        plt.tight_layout()
        # Show graphic
        plt.show()
Example #14
0
#Prefix for the output files of this run, built from the gain/loss and cosmic settings.
#TODO: maybe output the .npy files to a tmp dir to make them easier to find
outFilePrefix = gainLossName + '_' + cosmicName + '_'

#For every setting except 'loss' all four SV types are used; for 'loss' only
#inversions and translocations.
if gainLossName != 'loss':
    svTypes = ['DEL', 'DUP', 'INV', 'ITX']

else:
    svTypes = ['INV', 'ITX']

usedSVTypes = [
]  #use this to later determine which colors need to be used in the plot in case we skip an sv type, e.g.
#for cosmic when translocations are not linked to any cosmic gene.

#cosmicGenes / cosmicGeneNames only exist when cosmicName is not 'all'.
if cosmicName != 'all':
    #read the cosmic files to split instances into cosmic/non-cosmic.
    cosmicGenes = InputParser().readCausalGeneFile(
        settings.files['causalGenesFile'])
    #element 3 of every gene row is an object whose .name is the gene name
    cosmicGeneNames = []
    for gene in cosmicGenes:
        cosmicGeneNames.append(gene[3].name)

if generatePlottingData == "True":
    adjustedPValues = dict()
    allFeatureZScores = dict()

    svTypeInd = 0
    for svType in svTypes:
        #define the classifiers to use (from optimization)
        #would be nicer if these are in 1 file somewhere, since they are also used in another script

        if svType == 'DEL':
            clf = RandomForestClassifier(random_state=785,
Example #15
0
# Implementing: https://trunk.tufts.edu/access/content/attachment/d975cd4d-a3cb-4383-9807-0bfc295b7d20/Assignments/3e271133-863e-4468-8436-8f94184262e0/26nt12QMproject.rtf

import QuineMcCluskey
import SolutionPrinter
import sys
from inputParser import InputParser
import time

# Script entry point (Python 2 syntax: print statements below).
# Reads boolean functions from an optional input path and prints a minimised
# solution for each one.
if __name__ == "__main__":

    #Get and parse input
    # The input path is optional: when no argument is given, None is passed
    # on to InputParser.extractFunctions.
    try:
        inputPath = sys.argv[1]
    except IndexError:
        inputPath = None
    functions = InputParser.extractFunctions(inputPath)
    parsedFuncs = InputParser.getMintermsAndDCs(functions)

    #Solve each function
    for f in parsedFuncs:
        # Start time of this function's solve (not read in the visible part).
        start_time = time.time()
        # The POS form is obtained by running the same solver on the function
        # with its minterms flipped; the flipped copy is told to use the
        # original's number of inputs.
        fMaxterms = f.flipMinterms() #Kinda hacky, not proud of this solution
        fMaxterms.overrideNumInputs(f.numInputs())
        #print "Calculating SOP"
        solutionSOP = QuineMcCluskey.solve(f)
        #print "Calculating POS"
        solutionPOS = QuineMcCluskey.solve(fMaxterms)
        print f
        # Trailing comma: Python 2 print without a newline, so the solution
        # is printed on the same line as the "=".
        print "=",
        SolutionPrinter.printSolution(f,solutionSOP,"SOP")
        print "=",
Example #16
0
def plotSVStatsPanels(cancerTypes, loopType):
    """
    Plot the four SV statistics bar charts (sample count, total SV count,
    pathogenic SV count and relative pathogenic SV frequency per cancer type).

    Parameters:
    - cancerTypes: cancer types to plot. Each must equal an output folder name
      under 'output/' and be a key of the cancerTypeMetadataNames mapping,
      which is not defined in this function. The second '_'-separated field
      of each name is used as the tick label.
    - loopType: selects the output figure names. 'TAD' writes
      output/figures/figureS2A.svg to figureS2D.svg. Any other value writes
      only figure4A_A.svg (pathogenic SV count) and figure4A_B.svg (relative
      frequency), and appends '_CTCF' to the tick labels.

    Nothing is returned. The current matplotlib figure is cleared after every
    panel.
    """

    def countLines(path):
        #number of lines in the text file at path
        with open(path, 'r') as inF:
            return sum(1 for _ in inF)

    def drawBars(data, yColumn, color):
        #one bar per cancer type, relabelled with the short plot names
        sns.barplot(x="Cancer type", y=yColumn, data=data, color=color)
        plt.xticks(np.arange(0, len(cancerTypes)),
                   cancerTypePlotNames,
                   rotation='vertical')
        plt.tight_layout()

    isTAD = loopType == 'TAD'

    #1. plot pathogenic SV count
    plotData = []
    cancerTypePlotNames = []
    for cancerType in cancerTypes:
        #the tick label is the second '_'-separated field (empty if there is none)
        shortName = '_'.join(cancerType.split('_')[1:2])
        cancerTypePlotNames.append(shortName if isTAD else shortName + '_CTCF')

        #count how many pathogenic SVs we have (one line per pathogenic pair)
        pathogenicSVFile = 'output/' + cancerType + '/linkedSVGenePairs/nonCoding_geneSVPairs.txt_pathogenicPairsFeatures.txt'
        plotData.append([cancerType, countLines(pathogenicSVFile)])

    pathogenicSVCounts = pd.DataFrame(
        plotData, columns=['Cancer type', 'Number of pathogenic SVs'])

    drawBars(pathogenicSVCounts, "Number of pathogenic SVs", '#a2d5f2')
    #Plot and save based on CTCF/TAD input
    if isTAD:
        plt.savefig('output/figures/figureS2C.svg')
    else:
        plt.savefig('output/figures/figure4A_A.svg')
    plt.clf()

    #2. Make plot of the total SV counts
    plotData = []
    samplePlotData = []
    svDir = settings.files['svDir']
    for cancerType in cancerTypes:
        svData = InputParser().getSVs_hmf(svDir,
                                          cancerTypeMetadataNames[cancerType])

        #one row of svData per SV; the number of unique values in column 7 is
        #used as the sample count
        plotData.append([cancerType, svData.shape[0]])
        samplePlotData.append([cancerType, len(np.unique(svData[:, 7]))])

    totalSVCounts = pd.DataFrame(plotData,
                                 columns=['Cancer type', 'Number of SVs'])

    drawBars(totalSVCounts, "Number of SVs", '#07689f')
    #this panel is only saved for the TAD run
    if isTAD:
        plt.savefig('output/figures/figureS2B.svg')
    plt.clf()

    #3. Make plot of the sample counts
    sampleCounts = pd.DataFrame(samplePlotData,
                                columns=['Cancer type', 'Number of samples'])

    drawBars(sampleCounts, "Number of samples", '#ff7e67')
    #this panel is only saved for the TAD run
    if isTAD:
        plt.savefig('output/figures/figureS2A.svg')
    plt.clf()

    #4. Show the relative % of pathogenic compared to total SVs.
    plotData = []
    for cancerType in totalSVCounts['Cancer type']:
        svGenePairFile = 'output/' + cancerType + '/linkedSVGenePairs/nonCoding_geneSVPairs.txt_'
        svGenePairCount = countLines(svGenePairFile)

        #Take the scalar count. The boolean-mask selection alone returns a
        #one-element Series, which would end up as an object inside the plot
        #table instead of a number.
        pathogenicSVCount = pathogenicSVCounts.loc[
            pathogenicSVCounts['Cancer type'] == cancerType,
            'Number of pathogenic SVs'].iloc[0]
        relativeFrequency = (pathogenicSVCount / float(svGenePairCount)) * 100

        plotData.append([cancerType, relativeFrequency])

    relativeFrequencies = pd.DataFrame(
        plotData, columns=['Cancer type', 'Relative pathogenic SV frequency'])

    drawBars(relativeFrequencies, "Relative pathogenic SV frequency", 'black')
    if isTAD:
        plt.savefig('output/figures/figureS2D.svg')
    else:
        plt.savefig('output/figures/figure4A_B.svg')

    plt.clf()