コード例 #1
0
    def setUpForMSStratification(self) -> MSIIdentifier:
        """
        Enable stratification of the output by microsatellite stability.

        Creates the "microsatellite_analysis" directory (with "MSS" and "MSI"
        subdirectories) under the root metadata directory, generates metadata
        for both aggregate groups, opens the two aggregate mutation bed files
        for writing, and builds the MSIIdentifier, which is stored on this
        object and returned.
        """
        self.stratifyByMS = True
        self.MSICohorts = {}  # Lookup table of the individual cohorts with MSI

        # Lay out the aggregate MSS/MSI directories beneath the root data directory.
        rootMetadata = self.rootMetadata
        msAnalysisDir = os.path.join(rootMetadata.directory,
                                     "microsatellite_analysis")
        mssDir = os.path.join(msAnalysisDir, "MSS")
        msiDir = os.path.join(msAnalysisDir, "MSI")
        checkDirs(mssDir, msiDir)

        # Generate metadata for each aggregate group (MSS first, then MSI).
        mssDataGroup = "MSS_" + rootMetadata.dataGroupName
        msiDataGroup = "MSI_" + rootMetadata.dataGroupName
        for dataGroup, directory, label in ((mssDataGroup, mssDir, "MSS"),
                                            (msiDataGroup, msiDir, "MSI")):
            generateMetadata(
                dataGroup, rootMetadata.genomeName,
                os.path.join('..', '..', rootMetadata.localParentDataPath),
                rootMetadata.inputFormat, directory, label)

        # Both aggregate mutation counters start at zero.
        self.aggregateMSSMutCounts = self.aggregateMSIMutCounts = 0

        # Open the aggregate output files; they stay open for later writing.
        self.aggregateMSSFilePath = generateFilePath(
            directory=mssDir,
            dataGroup=mssDataGroup,
            context=self.context,
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        self.aggregateMSSFile = open(self.aggregateMSSFilePath, 'w')

        self.aggregateMSIFilePath = generateFilePath(
            directory=msiDir,
            dataGroup=msiDataGroup,
            context=self.context,
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        self.aggregateMSIFile = open(self.aggregateMSIFilePath, 'w')

        # Prepare the file paths handed to the MSIIdentifier.
        intermediateDir = os.path.join(self.rootDataDir, "intermediate_files")
        checkDirs(intermediateDir)
        msiSeqInputPath = generateFilePath(
            directory=intermediateDir,
            dataGroup=rootMetadata.dataGroupName,
            dataType="MSISeq_data",
            fileExtension=".tsv")
        self.MSICohortsFilePath = generateFilePath(
            directory=msAnalysisDir,
            dataGroup=rootMetadata.dataGroupName,
            dataType="MSI_cohorts",
            fileExtension=".txt")

        identifier = MSIIdentifier(msiSeqInputPath, self.MSICohortsFilePath)
        self.myMSIIdentifier = identifier
        return identifier
            def generateBackgroundBasedOnRadius(usesNucGroup):
                """
                Generate one nucleosome mutation background file and record its path.

                Reads linkerOffset, metadata, contextText, contextNum and
                mutationBackgroundFilePath from the enclosing scope, and appends
                the output path to nucleosomeMutationBackgroundFilePaths.

                usesNucGroup: when truthy, a dyad radius of 1000 with no linker
                offset is used; otherwise a radius of 73 with the enclosing
                scope's linkerOffset.
                """

                # Pick the dyad radius and the linker offset that goes with it.
                dyadRadius, currentLinkerOffset = (
                    1000, 0) if usesNucGroup else (73, linkerOffset)

                # The tsv file of context counts at each dyad position lives
                # next to the base nucleosome positions file.
                countsFilePath = generateFilePath(
                    directory=os.path.dirname(metadata.baseNucPosFilePath),
                    dataGroup=metadata.nucPosName,
                    context=contextText,
                    linkerOffset=currentLinkerOffset,
                    usesNucGroup=usesNucGroup,
                    dataType="dyad_pos_counts",
                    fileExtension=".tsv")

                # Build the counts file (via a nucleosome fasta file) if it is missing.
                countsFileMissing = not os.path.exists(countsFilePath)
                if countsFileMissing:
                    print(
                        "Dyad position " + contextText +
                        " counts file not found at", countsFilePath)
                    print("Generating genome wide dyad position " +
                          contextText + " counts file...")
                    fastaFilePath = generateNucleosomeFasta(
                        metadata.baseNucPosFilePath, metadata.genomeFilePath,
                        dyadRadius, currentLinkerOffset)
                    generateDyadPosContextCounts(fastaFilePath, countsFilePath,
                                                 contextNum, dyadRadius,
                                                 currentLinkerOffset)

                # Where the finished background file will be written.
                outputFilePath = generateFilePath(
                    directory=metadata.directory,
                    dataGroup=metadata.dataGroupName,
                    context=contextText,
                    linkerOffset=currentLinkerOffset,
                    usesNucGroup=usesNucGroup,
                    dataType=DataTypeStr.nucMutBackground,
                    fileExtension=".tsv")

                # Produce the background file and remember where it went.
                generateNucleosomeMutationBackgroundFile(
                    countsFilePath, mutationBackgroundFilePath, outputFilePath,
                    dyadRadius, currentLinkerOffset)
                nucleosomeMutationBackgroundFilePaths.append(outputFilePath)
コード例 #3
0
def parseAlexandrov(bedInputFilePaths, genomeFilePath, nucPosFilePath):
    """
    Convert each given input file into custom bed format and pass the results on.

    For every input file, metadata is generated in the file's directory and a
    converted bed file is written to an "intermediate_files" subdirectory.
    Rows are kept only when "chr" + column 3 is an acceptable chromosome and
    column 6 contains no '/'.  All converted files are then handed to
    parseCustomBed.
    """

    convertedFilePaths = []

    for bedInputFilePath in bedInputFilePaths:

        inputBasename = os.path.basename(bedInputFilePath)
        print("\nWorking in:", inputBasename)

        # Metadata is generated in the directory that holds the input file.
        dataDirectory = os.path.dirname(bedInputFilePath)
        generateMetadata(os.path.basename(dataDirectory),
                         getIsolatedParentDir(genomeFilePath),
                         getIsolatedParentDir(nucPosFilePath), inputBasename,
                         InputFormat.customBed, dataDirectory)

        intermediateDir = os.path.join(dataDirectory, "intermediate_files")
        checkDirs(intermediateDir)

        # Chromosomes that rows are allowed to reference.
        acceptableChromosomes = getAcceptableChromosomes(genomeFilePath)

        # Destination for the converted rows.
        convertedFilePath = generateFilePath(
            directory=intermediateDir,
            dataGroup=getIsolatedParentDir(bedInputFilePath),
            dataType=DataTypeStr.customInput,
            fileExtension=".bed")

        with open(bedInputFilePath, 'r') as source:
            with open(convertedFilePath, 'w') as destination:

                for row in source:

                    fields = row.strip().split('\t')
                    chromosome = "chr" + fields[2]

                    # Skip rows on unusable chromosomes or with a '/' in column 6.
                    if chromosome not in acceptableChromosomes or '/' in fields[5]:
                        continue

                    # A lone '-' in column 6 or 7 is written out as '*'.
                    sixth = '*' if fields[5] == '-' else fields[5]
                    seventh = '*' if fields[6] == '-' else fields[6]

                    # The value in column 4 is decremented by one on output.
                    convertedRow = (chromosome, str(int(fields[3]) - 1),
                                    fields[4], sixth, seventh, '.', fields[0])
                    destination.write('\t'.join(convertedRow) + '\n')

        convertedFilePaths.append(convertedFilePath)

    # Hand every converted file to the custom bed parser.
    print("\nPassing data to custom bed parser.\n")
    parseCustomBed(convertedFilePaths, genomeFilePath, nucPosFilePath, False,
                   False, False)
コード例 #4
0
    def setUpFileSystem(self):
        """
        Derive the working file paths from the input data path and generate metadata.

        Creates an "intermediate_files" directory next to the input data file
        (if absent), sets the bedGraph, trimmed reads, fasta and lesion bed
        file path attributes, and generates metadata in the input file's directory.
        """

        # The input file's directory is the root; the data group name is derived from the input path.
        rootDirectory = os.path.dirname(self.inputDataFilePath)
        groupName = getIsolatedParentDir(self.inputDataFilePath)

        # Make sure the intermediate files directory exists.
        intermediateDir = os.path.join(rootDirectory, "intermediate_files")
        if not os.path.exists(intermediateDir):
            os.mkdir(intermediateDir)

        # For bigWig input, each bigWig file gets an intermediate bedGraph
        # path with the same basename (final extension swapped for ".bedGraph").
        if self.bigWigReadsFilePathPair is not None:
            self.bedGraphReadsFilePathPair = [
                os.path.join(
                    intermediateDir,
                    os.path.basename(bigWigPath).rsplit('.', 1)[0] + ".bedGraph")
                for bigWigPath in self.bigWigReadsFilePathPair
            ]

        # Paths for the trimmed reads (bed and fasta) and the lesions bed output.
        trimmedReadsStem = os.path.join(intermediateDir,
                                        groupName + "_trimmed_reads")
        self.trimmedReadsFilePath = trimmedReadsStem + ".bed"
        self.fastaReadsFilePath = trimmedReadsStem + ".fa"
        self.lesionsBedFilePath = generateFilePath(
            directory=intermediateDir,
            dataGroup=groupName,
            dataType=DataTypeStr.customInput,
            fileExtension=".bed")

        # Record metadata for this data group.
        generateMetadata(groupName,
                         getIsolatedParentDir(self.genomeFilePath),
                         os.path.basename(self.inputDataFilePath),
                         InputFormat.xRSeq,
                         rootDirectory,
                         callParamsFilePath=self.callParamsFilePath)
コード例 #5
0
def getBackgroundRawPairs(backgroundCountsFilePaths):
    """
    Pair each background counts file with its expected raw counts file.

    backgroundCountsFilePaths: an iterable of paths to background counts
    files.  Each basename must contain DataTypeStr.nucMutBackground.

    Returns a dictionary mapping each background counts file path to a list
    of the raw counts file paths paired with it.

    Raises InvalidPathError if a given path lacks the expected data type
    string, and ValueError if the expected raw counts file does not exist.
    """

    # Match each background file path to its respective raw counts file path.
    backgroundRawPairs: Dict[str, List[str]] = dict()
    for backgroundCountsFilePath in backgroundCountsFilePaths:

        if DataTypeStr.nucMutBackground not in os.path.basename(
                backgroundCountsFilePath):
            # Pass the offending path along (as the other InvalidPathError
            # call sites do) so that the "Given:" in the message is actually
            # followed by something.
            raise InvalidPathError(
                backgroundCountsFilePath,
                "Background counts file should have \"" +
                DataTypeStr.nucMutBackground + "\" in the name.  Given:")

        # Generate the expected raw counts file path
        metadata = Metadata(backgroundCountsFilePath)
        rawCountsFilePath = generateFilePath(
            directory=metadata.directory,
            dataGroup=metadata.dataGroupName,
            linkerOffset=getLinkerOffset(backgroundCountsFilePath),
            usesNucGroup=checkForNucGroup(backgroundCountsFilePath),
            dataType=DataTypeStr.rawNucCounts,
            fileExtension=".tsv")

        # Make sure it exists
        if not os.path.exists(rawCountsFilePath):
            raise ValueError("No raw counts file found to pair with " +
                             backgroundCountsFilePath +
                             "\nExpected file with path: " + rawCountsFilePath)

        backgroundRawPairs.setdefault(backgroundCountsFilePath,
                                      list()).append(rawCountsFilePath)

    return backgroundRawPairs
コード例 #6
0
    def setUpNewIndividualCohort(self, cohortID):
        """
        Finish the currently open individual cohort file (if any) and start a new one.

        The previous cohort file is closed, sorted in place with the external
        "sort" command, and has its mutation count recorded in its metadata.
        A directory, metadata and an open output file are then created for
        cohortID, and the running mutation count is reset to zero.

        Raises UserInputError if cohortID is already among the completed cohorts.
        """

        # Wrap up the previously opened cohort file, if there was one.
        previousCohortFile = self.currentIndividualCohortFile
        if previousCohortFile is not None:
            previousCohortFile.close()
            sortCommand = ("sort", "-k1,1", "-k2,2n",
                           self.individualCohortFilePath, "-s", "-o",
                           self.individualCohortFilePath)
            subprocess.run(sortCommand, check=True)
            previousMetadata = Metadata(self.individualCohortFilePath)
            previousMetadata.addMetadata(Metadata.AddableKeys.mutCounts,
                                         self.currentIndividualCohortMutCounts)

        # A cohort may only appear in one contiguous block of data.
        if cohortID in self.completedIndividualCohorts:
            raise UserInputError(
                "The cohort " + cohortID +
                " was encountered in more than one distinct block of data.")
        self.currentIndividualCohortID = cohortID
        currentID = self.currentIndividualCohortID

        cohortDirectory = os.path.join(self.rootIndividualCohortsDirectory,
                                       currentID)
        cohortDataGroup = currentID + "_" + self.rootMetadata.dataGroupName

        checkDirs(cohortDirectory)

        # Work out which "umbrella" cohorts this cohort also belongs to.
        cohortMembership = [currentID]
        if self.stratifyByMS:
            cohortMembership.append(
                "MSI" if currentID in self.MSICohorts else "MSS")
        if self.stratifyByMutSig and currentID in self.mutSigDesignations:
            cohortMembership.extend(
                "mut_sig_" + mutSig
                for mutSig in self.mutSigDesignations[currentID])

        # Open the cohort's output file, then write its metadata.
        self.individualCohortFilePath = generateFilePath(
            directory=cohortDirectory,
            dataGroup=cohortDataGroup,
            context=self.context,
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        self.currentIndividualCohortFile = open(self.individualCohortFilePath,
                                                'w')
        generateMetadata(
            cohortDataGroup, self.rootMetadata.genomeName,
            os.path.join("..", self.rootMetadata.localParentDataPath),
            self.rootMetadata.inputFormat, cohortDirectory, *cohortMembership)
        self.currentIndividualCohortMutCounts = 0
コード例 #7
0
def parseStandardBed(standardBedFilePaths: List[str], genomeFilePath):
    """
    Convert standard bed files to custom bed input and pass them to parseCustomBed.

    For each given file, metadata is generated in the file's directory and a
    converted copy is written to an "intermediate_files" subdirectory.  Rows
    whose first column is not an acceptable chromosome are dropped; in kept
    rows the 4th column becomes '.', the 5th becomes "OTHER", and only the
    first six columns are written.

    Raises InvalidPathError if a given file name does not end in ".bed".
    """

    # Paths of the converted files, to be passed to the custom bed parser.
    convertedFilePaths = []

    for standardBedFilePath in standardBedFilePaths:

        inputBasename = os.path.basename(standardBedFilePath)
        print("\nWorking in:", inputBasename)
        if not inputBasename.endswith(".bed"):
            raise InvalidPathError(
                standardBedFilePath,
                'Given file does not appear to be in bed format. (missing ".bed" extension)'
            )

        # Useful paths and names for this data set.
        rootDirectory = os.path.dirname(standardBedFilePath)
        intermediateDir = os.path.join(rootDirectory, "intermediate_files")
        checkDirs(intermediateDir)
        groupName = getIsolatedParentDir(standardBedFilePath)

        # Output file path and metadata.
        convertedFilePath = generateFilePath(
            directory=intermediateDir,
            dataGroup=groupName,
            dataType=DataTypeStr.customInput,
            fileExtension=".bed")
        convertedFilePaths.append(convertedFilePath)
        generateMetadata(groupName, getIsolatedParentDir(genomeFilePath),
                         inputBasename, InputFormat.standardBed,
                         rootDirectory)

        # Chromosomes that entries are allowed to be in.
        acceptableChromosomes = getAcceptableChromosomes(genomeFilePath)

        # Rewrite each acceptable entry in the form the custom bed parser expects.
        print("Converting entries for custom bed input...")
        with open(standardBedFilePath, 'r') as source:
            with open(convertedFilePath, 'w') as destination:

                for row in source:

                    fields = row.strip().split("\t")

                    # Only entries in a valid chromosome are carried over.
                    if fields[0] in acceptableChromosomes:
                        fields[3] = '.'
                        fields[4] = "OTHER"
                        destination.write('\t'.join(fields[:6]) + '\n')

    # Pass the generated files to the custom bed parser.
    parseCustomBed(convertedFilePaths, genomeFilePath, False, False, False,
                   False)
def countInBindingMotifs(mutationFilePaths, bindingMotifsFilePaths):
    """
    Count mutations against binding motifs for every (mutation file, motifs file) pair.

    One ".tsv" counts file is produced per pair, in the directory given by the
    mutation file's metadata.  Returns the list of generated file paths, in
    the order the pairs were processed.

    Raises ValueError if a mutation file's basename lacks DataTypeStr.mutations.
    Warns if a binding motifs file's basename lacks "binding_motifs".
    """

    # Paths of every output file produced by this function.
    outputFilePaths = []

    for mutationFilePath in mutationFilePaths:
        for bindingMotifsFilePath in bindingMotifsFilePaths:

            mutationBasename = os.path.basename(mutationFilePath)
            motifsBasename = os.path.basename(bindingMotifsFilePath)
            print("\nWorking with", mutationBasename, "and", motifsBasename)

            # The mutation file must be of the expected type.
            if DataTypeStr.mutations not in mutationBasename:
                raise ValueError("Mutation file should have \"" +
                                 DataTypeStr.mutations + "\" in the name.")

            # Metadata supplies the output directory, data group and genome path.
            metadata = Metadata(mutationFilePath)

            # The text preceding "binding_motifs" in the motifs file name
            # prefixes the output's data type.
            binder = motifsBasename.rsplit("binding_motifs", 1)[0]
            if "binding_motifs" not in motifsBasename:
                warnings.warn(
                    "\"binding_motifs\" not found in basename of binding motifs file.  The output file's name is probably a garbled mess."
                )

            countsFilePath = generateFilePath(
                directory=metadata.directory,
                dataGroup=metadata.dataGroupName,
                fileExtension=".tsv",
                dataType=binder + "binding_motif_mutation_counts")
            outputFilePaths.append(countsFilePath)

            # Run the count and write out the results.
            counter = CountsFileGenerator(
                mutationFilePath, bindingMotifsFilePath, countsFilePath,
                getAcceptableChromosomes(metadata.genomeFilePath))
            counter.count()
            counter.writeResults()

    return outputFilePaths
コード例 #9
0
def getCustomBackgroundRawPairs(customRawCountsFilePaths,
                                customBackgroundCountsDir):
    """
    Pair each given raw counts file with a raw counts file to use as its custom background.

    The background is looked up in the subdirectory of
    customBackgroundCountsDir named after the raw file's nucleosome positions
    name (from its metadata).

    Returns a dictionary mapping each background file path to the list of raw
    counts file paths that use it.

    Raises UserInputError if the background directory or file does not exist.
    """

    pairs: Dict[str, List[str]] = {}

    for rawCountsFilePath in customRawCountsFilePaths:

        # Locate the directory that should hold this file's background.
        nucPosName = Metadata(rawCountsFilePath).nucPosName
        backgroundDir = os.path.join(customBackgroundCountsDir, nucPosName)
        if not os.path.exists(backgroundDir):
            raise UserInputError(
                "Expected a directory at " + backgroundDir +
                " to contain the background for " + rawCountsFilePath +
                " but the directory does not exist.  Have you forgotten to run "
                "the analysis for the related nucleosome map?")
        backgroundMetadata = Metadata(backgroundDir)

        # The background is itself a raw counts file, generated with the same
        # linker offset and nucleosome group settings as the given raw file.
        backgroundFilePath = generateFilePath(
            directory=backgroundMetadata.directory,
            dataGroup=backgroundMetadata.dataGroupName,
            linkerOffset=getLinkerOffset(rawCountsFilePath),
            usesNucGroup=checkForNucGroup(rawCountsFilePath),
            dataType=DataTypeStr.rawNucCounts,
            fileExtension=".tsv")
        if not os.path.exists(backgroundFilePath):
            raise UserInputError(
                "Expected file at " + backgroundFilePath +
                " to use as custom background for " + rawCountsFilePath +
                " but this file does not exist.  Have you forgotten to "
                "run the relevant analysis to generate it?")

        pairs.setdefault(backgroundFilePath, []).append(rawCountsFilePath)

    return pairs
コード例 #10
0
    def __init__(self, rootDataDir, context):
        """
        Set up the root output file for the data in rootDataDir.

        rootDataDir: the directory whose metadata describes the data; the root
        output bed file is created (and opened for writing) in this directory.
        context: the data's context, used when generating output file paths.
        """

        # Remember where the data lives, its metadata, and its context.
        self.rootDataDir = rootDataDir
        self.rootMetadata = Metadata(rootDataDir)
        self.context = context

        # The root output file is opened right away and kept open on the object.
        rootOutputFilePath = generateFilePath(
            directory=rootDataDir,
            dataGroup=self.rootMetadata.dataGroupName,
            context=context,
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        self.rootOutputFilePath = rootOutputFilePath
        self.rootOutputFile = open(rootOutputFilePath, 'w')
        self.rootMutCounts = 0

        # Every stratification option starts switched off.
        self.stratifyByIndividualCohorts = False
        self.stratifyByMS = False
        self.stratifyByMutSig = False
        self.stratifyBySignature = False
コード例 #11
0
    def setUpForMutSigStratification(self) -> MutSigIdentifier:
        """
        Enable stratification of the output by mutation signature.

        For each mutation signature, a directory under "mut_sig_analysis" is
        created, metadata is generated, a mutation counter is initialized and
        an output bed file is opened for writing.  A MutSigIdentifier is then
        built, stored on this object, and returned.
        """

        self.stratifyByMutSig = True
        # The mutation signatures assigned to each cohort.
        self.mutSigDesignations = {}

        # Signature names: "1A", "1B", "2" through "21", then the R and U signatures.
        mutSigs = [
            "1A", "1B", *map(str, range(2, 22)), "R1", "R2", "R3", "U1", "U2"
        ]

        rootMetadata = self.rootMetadata
        parentMutSigDirectory = os.path.join(rootMetadata.directory,
                                             "mut_sig_analysis")
        self.mutSigFilePaths = {}
        self.mutSigFiles = {}
        self.mutSigMutCounts = {}

        # Per-signature directory, metadata, counter and open output file.
        for mutSig in mutSigs:

            dataGroup = "mut_sig_" + mutSig + '_' + rootMetadata.dataGroupName

            signatureDirectory = os.path.join(parentMutSigDirectory, mutSig)
            checkDirs(signatureDirectory)

            generateMetadata(
                dataGroup, rootMetadata.genomeName,
                os.path.join('..', '..', rootMetadata.localParentDataPath),
                rootMetadata.inputFormat, signatureDirectory,
                "mutSig" + mutSig)

            self.mutSigMutCounts[mutSig] = 0

            signatureFilePath = generateFilePath(
                directory=signatureDirectory,
                dataGroup=dataGroup,
                context=self.context,
                dataType=DataTypeStr.mutations,
                fileExtension=".bed")
            self.mutSigFilePaths[mutSig] = signatureFilePath
            self.mutSigFiles[mutSig] = open(signatureFilePath, 'w')

        # File paths handed to the MutSigIdentifier.
        intermediateDir = os.path.join(self.rootDataDir, "intermediate_files")
        checkDirs(intermediateDir)

        deconstructSigsInputPath = generateFilePath(
            directory=intermediateDir,
            dataGroup=rootMetadata.dataGroupName,
            dataType="deconstructSigs_data",
            fileExtension=".tsv")
        self.mutSigDesignationsFilePath = generateFilePath(
            directory=parentMutSigDirectory,
            dataGroup=rootMetadata.dataGroupName,
            dataType="mut_sig_assignments",
            fileExtension=".tsv")

        identifier = MutSigIdentifier(deconstructSigsInputPath,
                                      self.mutSigDesignationsFilePath)
        self.mutSigIdentifier = identifier
        return identifier
コード例 #12
0
def generateMutationBackground(mutationFilePaths, backgroundContextNum):
    """
    Generate a mutation background file for each given mutation file.

    mutationFilePaths: paths to mutation files; each basename must contain
    DataTypeStr.mutations.
    backgroundContextNum: the context size (1-6) to use for the background.
    It is bumped by one for files whose own context number is even.

    Returns the list of generated mutation background file paths.

    Raises InvalidPathError if a file name lacks the expected data type string.
    """

    # Output paths produced by this function.
    backgroundFilePaths = []

    # Names for each supported context size.
    contextNames = dict(
        enumerate(("singlenuc", "dinuc", "trinuc", "quadrunuc", "pentanuc",
                   "hexanuc"),
                  start=1))

    for mutationFilePath in mutationFilePaths:

        metadata = Metadata(mutationFilePath)
        intermediateDir = os.path.join(metadata.directory,
                                       "intermediate_files")

        # Files with an even context number use the next context size up.
        hasEvenContext = getContext(mutationFilePath, asInt=True) % 2 == 0
        thisContextNum = (backgroundContextNum +
                          1 if hasEvenContext else backgroundContextNum)

        # Translate the context size into its name.
        assert thisContextNum in contextNames, "Unexpected background context number: " + str(
            thisContextNum)
        contextText = contextNames[thisContextNum]

        acceptableChromosomes = getAcceptableChromosomes(
            metadata.genomeFilePath)

        mutationBasename = os.path.basename(mutationFilePath)
        print("\nWorking in:", mutationBasename)
        if DataTypeStr.mutations not in mutationBasename:
            raise InvalidPathError(
                mutationFilePath,
                "Given mutation file does not have \"" +
                DataTypeStr.mutations + "\" in the name.",
                postPathMessage=
                "Are you sure you inputted a file from the mutperiod pipeline?"
            )

        # Genome context frequencies are stored alongside the genome file.
        genomeFrequencyFilePath = generateFilePath(
            directory=os.path.dirname(metadata.genomeFilePath),
            dataGroup=metadata.genomeName,
            context=contextText,
            dataType="frequency",
            fileExtension=".tsv")

        # Mutation context frequencies go in the intermediate files directory.
        mutationFrequencyFilePath = generateFilePath(
            directory=intermediateDir,
            dataGroup=metadata.dataGroupName,
            context=contextText,
            dataType="mutation_frequencies",
            fileExtension=".tsv")

        # The background itself goes in the data group's directory.
        backgroundFilePath = generateFilePath(
            directory=metadata.directory,
            dataGroup=metadata.dataGroupName,
            context=contextText,
            dataType=DataTypeStr.mutBackground,
            fileExtension=".tsv")

        # The genome context frequency file is only generated when missing.
        genomeFrequenciesMissing = not os.path.exists(genomeFrequencyFilePath)
        if genomeFrequenciesMissing:
            print("Genome", contextText,
                  "context frequency file not found at path:",
                  genomeFrequencyFilePath)
            print("Generating genome " + contextText +
                  " context frequency file...")
            generateGenomeContextFrequencyFile(metadata.genomeFilePath,
                                               genomeFrequencyFilePath,
                                               thisContextNum, contextText,
                                               acceptableChromosomes)

        # The intermediate files directory must exist before writing into it.
        if not os.path.exists(intermediateDir):
            os.mkdir(intermediateDir)

        # The mutation context frequency file is always regenerated.
        print("Generating mutation context frequency file...")
        generateMutationContextFrequencyFile(mutationFilePath,
                                             mutationFrequencyFilePath,
                                             thisContextNum, contextText,
                                             acceptableChromosomes)

        # Combine both frequency files into the background file.
        generateMutationBackgroundFile(genomeFrequencyFilePath,
                                       mutationFrequencyFilePath,
                                       backgroundFilePath, contextText)

        backgroundFilePaths.append(backgroundFilePath)

    return backgroundFilePaths
コード例 #13
0
def normalizeCounts(backgroundCountsFilePaths: List[str],
                    customRawCountsFilePaths: List[str] = list(),
                    customBackgroundCountsDir=None,
                    includeAlternativeScaling=False):
    """
    Normalize raw counts files by their paired background counts using an R script.

    backgroundCountsFilePaths: background counts files; each is paired with a
    raw counts file through getBackgroundRawPairs.
    customRawCountsFilePaths: raw counts files to be normalized by a custom
    background.  Only used when customBackgroundCountsDir is given.  (The
    default list is never mutated here.)
    customBackgroundCountsDir: the directory searched for custom backgrounds
    (through getCustomBackgroundRawPairs), or None for no custom backgrounds.
    includeAlternativeScaling: if True, a scaling factor is passed to the R
    script as an extra argument: "1" when no custom background directory is
    given, otherwise the background's parent data feature count divided by
    the raw data's.

    Returns the list of normalized counts file paths that were generated.

    Raises subprocess.CalledProcessError if the R script exits with a
    non-zero status.
    """

    normalizedCountsFilePaths = list()

    backgroundRawPairs = getBackgroundRawPairs(backgroundCountsFilePaths)

    # Get the background-raw pairs from the custom directories, if they were given.
    if customBackgroundCountsDir is not None:
        customBackgroundRawPairs = getCustomBackgroundRawPairs(
            customRawCountsFilePaths, customBackgroundCountsDir)
        for customBackgroundCountsFilePath in customBackgroundRawPairs:
            assert customBackgroundCountsFilePath not in backgroundRawPairs, "Unexpected intersection!"
            backgroundRawPairs[
                customBackgroundCountsFilePath] = customBackgroundRawPairs[
                    customBackgroundCountsFilePath]

    # Iterate through each background + raw counts pair
    for backgroundCountsFilePath in backgroundRawPairs:
        for rawCountsFilePath in backgroundRawPairs[backgroundCountsFilePath]:

            print("\nWorking with", os.path.basename(rawCountsFilePath), "and",
                  os.path.basename(backgroundCountsFilePath))

            metadata = Metadata(rawCountsFilePath)

            # Generate the path to the normalized file.
            # A background that is itself a raw counts file is a custom background.
            if DataTypeStr.rawNucCounts in backgroundCountsFilePath:
                context = "custom_context"
            else:
                context = getContext(backgroundCountsFilePath)
            normalizedCountsFilePath = generateFilePath(
                directory=metadata.directory,
                dataGroup=metadata.dataGroupName,
                context=context,
                linkerOffset=getLinkerOffset(backgroundCountsFilePath),
                usesNucGroup=checkForNucGroup(backgroundCountsFilePath),
                dataType=DataTypeStr.normNucCounts,
                fileExtension=".tsv")

            # Prepare the arguments to the subprocess call.
            args = [
                "Rscript",
                os.path.join(rScriptsDirectory,
                             "NormalizeNucleosomeMutationCounts.R"),
                rawCountsFilePath, backgroundCountsFilePath,
                normalizedCountsFilePath
            ]

            # If alternative scaling is requested, determine the appropriate scaling factor and add it to the arguments
            if includeAlternativeScaling:

                # If we are normalizing by sequence context, just revert the automatic scaling.
                if customBackgroundCountsDir is None:
                    # Every subprocess argument must be a string (or bytes/path-like);
                    # a bare int here makes subprocess.run raise a TypeError.
                    args.append("1")

                # If we are normalizing by a custom context, scale based on the relative sizes of the parent background and raw data sets.
                else:
                    args.append(
                        str(
                            getParentDataFeatureCounts(
                                backgroundCountsFilePath) /
                            getParentDataFeatureCounts(rawCountsFilePath)))

            # Pass the file paths to the R script to generate the normalized counts file.
            print("Calling R script to generate normalized counts...")
            subprocess.run(args, check=True)

            normalizedCountsFilePaths.append(normalizedCountsFilePath)

    # Document where the custom background counts came from in each relevant directory.
    if customBackgroundCountsDir is not None:
        customRawCountsDirs = {
            os.path.dirname(customRawCountsFilePath)
            for customRawCountsFilePath in customRawCountsFilePaths
        }
        for customRawCountsDir in customRawCountsDirs:
            metadata = Metadata(customRawCountsDir)
            customBackgroundInfoFilePath = generateFilePath(
                directory=metadata.directory,
                dataGroup=metadata.dataGroupName,
                dataType=DataTypeStr.customBackgroundInfo,
                fileExtension=".txt")
            with open(customBackgroundInfoFilePath,
                      'w') as customBackgroundInfoFile:
                customBackgroundInfoFile.write(
                    "Custom background directory: " +
                    customBackgroundCountsDir + '\n')
                # The timestamp is truncated to the minute (seconds dropped).
                customBackgroundInfoFile.write(
                    "Last date used: " +
                    str(datetime.datetime.now()).rsplit(':', 1)[0] + '\n')

    return normalizedCountsFilePaths
コード例 #14
0
def parseCustomBed(bedInputFilePaths,
                   genomeFilePath,
                   stratifyByMS,
                   stratifyByMutSig,
                   separateIndividualCohorts,
                   onlySingleBaseSubs=False,
                   includeIndels=False):
    """Prepare each custom bed file and pass it on to convertToStandardInput.

    Every input file is run through autoAcquireAndQACheck, sorted in place
    with the external "sort" command, and then converted while a WriteManager
    (configured for any requested stratification) handles the output.

    Args:
        bedInputFilePaths: Paths to the custom bed files to parse.
        genomeFilePath: Path to the genome file.  It is forwarded to
            autoAcquireAndQACheck and the name of its parent directory is
            recorded in any metadata generated here.
        stratifyByMS: If truthy, setUpForMSStratification is called.
        stratifyByMutSig: If truthy, setUpForMutSigStratification is called.
        separateIndividualCohorts: If truthy (and a cohort column is
            present), the write manager is set up for individual cohorts.
        onlySingleBaseSubs: Forwarded to the QA check and to the converter.
        includeIndels: Forwarded to the QA check and to the converter.

    Raises:
        UserInputError: If onlySingleBaseSubs and includeIndels are both set,
            if no input paths were given, or if stratification / cohort
            separation is requested for a file whose first line does not
            have 7 tab-separated columns.
        subprocess.CalledProcessError: If the sort command exits non-zero.
    """

    if onlySingleBaseSubs and includeIndels:
        raise UserInputError(
            "Indels are incompatible with single nucleotide substitutions.")
    if len(bedInputFilePaths) == 0:
        raise UserInputError("No bed files were found to parse.")

    for bedInputFilePath in bedInputFilePaths:

        print("\nWorking in:", os.path.basename(bedInputFilePath))

        # Files sitting in an "intermediate_files" directory belong to the
        # data group one level up, and their metadata is expected to have
        # been generated elsewhere.  Anything else gets fresh metadata.
        containingDir = os.path.dirname(bedInputFilePath)
        if getIsolatedParentDir(bedInputFilePath) != "intermediate_files":
            dataDirectory = containingDir
            generateMetadata(os.path.basename(dataDirectory),
                             getIsolatedParentDir(genomeFilePath),
                             os.path.basename(bedInputFilePath),
                             InputFormat.customBed, containingDir)
        else:
            dataDirectory = os.path.dirname(containingDir)

        intermediateFilesDir = os.path.join(dataDirectory,
                                            "intermediate_files")
        checkDirs(intermediateFilesDir)

        context = autoAcquireAndQACheck(
            bedInputFilePath, genomeFilePath,
            os.path.join(intermediateFilesDir, "auto_acquire.fa"),
            onlySingleBaseSubs, includeIndels)

        # If the input would be clobbered by the output file, read from a
        # copy placed in the intermediate files directory instead.
        collidesWithOutput = bedInputFilePath == generateFilePath(
            directory=dataDirectory,
            dataGroup=os.path.basename(dataDirectory),
            context=context,
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        if collidesWithOutput:
            readableCopyPath = os.path.join(
                intermediateFilesDir, os.path.basename(bedInputFilePath))
            print(
                "Input file path is identical to generated output file path and will be overwritten. ",
                "Creating a copy of the input file at:", readableCopyPath,
                "to use for reading.")
            shutil.copy2(bedInputFilePath, readableCopyPath)
            bedInputFilePath = readableCopyPath

        with WriteManager(dataDirectory, context) as writeManager:

            # Peek at the first line: a 7th column means cohort designations
            # are present.
            with open(bedInputFilePath, 'r') as bedInputFile:
                firstLineColumns = bedInputFile.readline().strip().split('\t')

                if len(firstLineColumns) != 7:
                    # Without cohorts, none of the cohort-dependent options
                    # can be honored.
                    if stratifyByMS or stratifyByMutSig:
                        raise UserInputError(
                            "Additional stratification given, but no cohort designation given."
                        )
                    if separateIndividualCohorts:
                        raise UserInputError(
                            "Separation by individual cohorts requested, but no cohort designation given."
                        )
                    cohortSortKey = ()
                else:
                    # The cohort column becomes the primary sort key.
                    cohortSortKey = ("-k7,7", )
                    if separateIndividualCohorts:
                        writeManager.setUpForIndividualCohorts()

            # Sort the file being read, in place, so that the output ends up
            # sorted as well.
            sortCommand = ("sort", *cohortSortKey, "-k1,1", "-k2,2n",
                           "-k3,3n", bedInputFilePath, "-s", "-o",
                           bedInputFilePath)
            subprocess.run(sortCommand, check=True)

            # Optional extra stratification.
            if stratifyByMS:
                setUpForMSStratification(writeManager, bedInputFilePath)
            if stratifyByMutSig:
                setUpForMutSigStratification(writeManager, bedInputFilePath)

            # All set: do the actual conversion.
            convertToStandardInput(bedInputFilePath, writeManager,
                                   onlySingleBaseSubs, includeIndels)
コード例 #15
0
def expandContext(inputBedFilePaths, expansionContextNum):
    """Rewrite each mutation bed file with a larger nucleotide context.

    For every input file, an expanded-context bed file is generated next to
    it (going through an intermediate expanded bed file and a fasta file),
    and the original input file is then deleted.

    Args:
        inputBedFilePaths: Paths to mutation bed files; each file name must
            contain DataTypeStr.mutations.
        expansionContextNum: The desired context, 3 or 5.  One is added for
            files whose current context (per getContext) is even.

    Returns:
        A list with the path of every expanded-context file generated.

    Raises:
        AssertionError: If expansionContextNum is not 3 or 5.
        InvalidPathError: If a file name lacks DataTypeStr.mutations, or the
            file's context is not lower than the context to expand to.
    """

    assert expansionContextNum in (3, 5), (
        "Unexpected expansion context: " + str(expansionContextNum))

    # Paths to the output files generated by this function.
    generatedFilePaths = []

    for inputBedFilePath in inputBedFilePaths:

        metadata = Metadata(inputBedFilePath)

        # Files with even-length features get a context one larger than
        # requested.
        hasEvenContext = getContext(inputBedFilePath, asInt=True) % 2 == 0
        thisExpansionContextNum = (expansionContextNum + 1
                                   if hasEvenContext else expansionContextNum)

        # Sanity check the file name.
        inputFileName = os.path.basename(inputBedFilePath)
        print("\nWorking in:", inputFileName)
        if DataTypeStr.mutations not in inputFileName:
            raise InvalidPathError(
                inputBedFilePath,
                'Given mutation file does not have "' +
                DataTypeStr.mutations + '" in the name.',
                postPathMessage=
                "Are you sure you inputted a file from the mutperiod pipeline?"
            )

        # Expanding only makes sense towards a strictly larger context.
        if thisExpansionContextNum <= getContext(inputBedFilePath,
                                                 asInt=True):
            raise InvalidPathError(
                inputBedFilePath,
                "The given mutation file at does not have a lower context "
                "than the desired output context.",
                postPathMessage="There is nothing to expand.")

        # Paths for the intermediate files...
        intermediateFilesDirectory = os.path.join(metadata.directory,
                                                  "intermediate_files")
        bedExpansionFilePath = generateFilePath(
            directory=intermediateFilesDirectory,
            dataGroup=metadata.dataGroupName,
            dataType="intermediate_expansion",
            fileExtension=".bed")
        fastaReadsFilePath = generateFilePath(
            directory=intermediateFilesDirectory,
            dataGroup=metadata.dataGroupName,
            dataType="expanded_reads",
            fileExtension=".fa")

        # ...and for the final output file.
        expandedContextFilePath = generateFilePath(
            directory=metadata.directory,
            dataGroup=metadata.dataGroupName,
            context=thisExpansionContextNum,
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")

        # The intermediate files directory may not exist yet.
        if not os.path.exists(intermediateFilesDirectory):
            os.mkdir(intermediateFilesDirectory)

        # Step 1: widen the bed coordinates to the new context.
        expandBedPositions(inputBedFilePath, bedExpansionFilePath,
                           thisExpansionContextNum)

        # Step 2: look up the nucleotides at the widened coordinates.
        bedToFasta(bedExpansionFilePath, metadata.genomeFilePath,
                   fastaReadsFilePath)

        # Step 3: combine the original bed file with the fasta reads into
        # the expanded-context bed file.
        generateExpandedContext(inputBedFilePath, fastaReadsFilePath,
                                expandedContextFilePath,
                                thisExpansionContextNum)
        generatedFilePaths.append(expandedContextFilePath)

        # The input file holds the same mutations in a smaller context, so
        # it is removed.
        print("Deleting old mutation context file...")
        os.remove(inputBedFilePath)

    return generatedFilePaths
コード例 #16
0
def countNucleosomePositionMutations(mutationFilePaths, nucleosomeMapNames,
                                     countSingleNuc, countNucGroup,
                                     linkerOffset):
    """Count mutations at nucleosome positions for each file/map pairing.

    For every mutation file and every nucleosome map name, a map-specific
    sub-directory is used (with metadata generated on first use) and counts
    files are produced for the single nucleosome radius (73 bp plus
    linkerOffset), the nucleosome group radius (1000 bp), or both.

    Special case: when exactly one file and one map name are given and the
    file's base name (minus extension) equals the map name, the nucleosome
    map is counted against itself in a 1000 bp radius instead.

    Args:
        mutationFilePaths: Paths to the mutation files to count.
        nucleosomeMapNames: Names of the nucleosome maps to count against.
        countSingleNuc: If truthy, count in the single nucleosome radius.
        countNucGroup: If truthy, count in the nucleosome group radius.
        linkerOffset: Extra bp of linker DNA added to the 73 bp radius.

    Returns:
        A list of the paths to the counts files that were generated.

    Raises:
        UserInputError: If neither counting radius was requested.
        InvalidPathError: If a mutation file name lacks
            DataTypeStr.mutations.
    """

    # Special case: a nucleosome map counted against itself (used to
    # determine the nucleosome repeat length).
    countingMapAgainstItself = (
        len(mutationFilePaths) == 1 and len(nucleosomeMapNames) == 1
        and os.path.basename(mutationFilePaths[0]).rsplit(
            '.', 1)[0] == nucleosomeMapNames[0])
    if countingMapAgainstItself:

        selfMapFilePath = mutationFilePaths[0]
        selfMapName = nucleosomeMapNames[0]

        print("Counting nucleosome map", selfMapName,
              "against itself in a 1000 bp radius.")

        selfCountsFilePath = generateFilePath(
            directory=os.path.dirname(selfMapFilePath),
            dataGroup=selfMapName,
            usesNucGroup=True,
            fileExtension=".tsv",
            dataType="self_" + DataTypeStr.rawNucCounts)

        NucleosomesInNucleosomesCounter(
            selfMapFilePath,
            selfMapFilePath,
            selfCountsFilePath,
            encompassingFeatureExtraRadius=1000,
            acceptableChromosomes=getAcceptableChromosomes(
                os.path.dirname(
                    os.path.dirname(selfMapFilePath)))).count()

        return [selfCountsFilePath]

    if not countSingleNuc and not countNucGroup:
        raise UserInputError(
            "Must count in either a single nucleosome or group nucleosome radius."
        )

    # Paths to the output files generated by this function.
    countsFilePaths = []
    # Flipped once the first mutation file is done so that the nucleosome
    # map files are not checked for sorting again on later files.
    mapSortingVerified = False

    def runCounter(mutationFilePath, metadata, acceptableChromosomes,
                   countsFilePath, extraRadius, checkMapSorting):
        # Count the mutations into countsFilePath and record that path.
        MutationsInNucleosomesCounter(
            mutationFilePath,
            metadata.baseNucPosFilePath,
            countsFilePath,
            encompassingFeatureExtraRadius=extraRadius,
            acceptableChromosomes=acceptableChromosomes,
            checkForSortedFiles=(True, checkMapSorting)).count()
        countsFilePaths.append(countsFilePath)

    # One counts file (or two) per mutation file / nucleosome map pairing.
    for mutationFilePath in mutationFilePaths:

        mutationFileName = os.path.basename(mutationFilePath)
        print("\nWorking with", mutationFileName)

        # Only mutation files are accepted.
        if DataTypeStr.mutations not in mutationFileName:
            raise InvalidPathError(
                mutationFilePath,
                'Given mutation file does not have "' +
                DataTypeStr.mutations + '" in the name.',
                postPathMessage=
                "Are you sure you inputted a file from the mutperiod pipeline?"
            )

        for nucleosomeMapName in nucleosomeMapNames:

            print("Counting with nucleosome map:", nucleosomeMapName)

            # Each nucleosome map gets its own directory beside the
            # mutation file.
            mapDataDirectory = os.path.join(os.path.dirname(mutationFilePath),
                                            nucleosomeMapName)
            checkDirs(mapDataDirectory)

            # Generate metadata for that directory the first time around.
            metadataFilePath = os.path.join(mapDataDirectory, ".metadata")
            if not os.path.exists(metadataFilePath):

                print("No metadata found.  Generating...")

                parentMetadata = Metadata(mutationFilePath)

                # A nucleosome map may supply a suffix for the data group
                # name through an "append_to_data_name.txt" file.
                dataGroupName = parentMetadata.dataGroupName
                suffixFilePath = os.path.join(
                    os.path.dirname(parentMetadata.genomeFilePath),
                    nucleosomeMapName, "append_to_data_name.txt")
                if os.path.exists(suffixFilePath):
                    with open(suffixFilePath) as suffixFile:
                        dataGroupName += suffixFile.readline().strip()

                generateMetadata(
                    dataGroupName,
                    parentMetadata.genomeName,
                    os.path.join("..", parentMetadata.localParentDataPath),
                    parentMetadata.inputFormat,
                    mapDataDirectory,
                    *parentMetadata.cohorts,
                    callParamsFilePath=parentMetadata.callParamsFilePath,
                    associatedNucleosomePositions=nucleosomeMapName)

            # The metadata supplies the nucleosome positions file and the
            # genome used to determine acceptable chromosomes.
            metadata = Metadata(mapDataDirectory)
            acceptableChromosomes = getAcceptableChromosomes(
                metadata.genomeFilePath)

            # Single nucleosome region, if requested.
            if countSingleNuc:
                singleNucCountsFilePath = generateFilePath(
                    directory=metadata.directory,
                    dataGroup=metadata.dataGroupName,
                    linkerOffset=linkerOffset,
                    fileExtension=".tsv",
                    dataType=DataTypeStr.rawNucCounts)
                print(
                    "Counting mutations at each nucleosome position in a 73 bp radius +",
                    str(linkerOffset), "bp linker DNA.")
                runCounter(mutationFilePath, metadata, acceptableChromosomes,
                           singleNucCountsFilePath, 73 + linkerOffset,
                           not mapSortingVerified)

            # Nucleosome group region, if requested.
            if countNucGroup:
                nucGroupCountsFilePath = generateFilePath(
                    directory=metadata.directory,
                    dataGroup=metadata.dataGroupName,
                    usesNucGroup=True,
                    fileExtension=".tsv",
                    dataType=DataTypeStr.rawNucCounts)
                print(
                    "Counting mutations at each nucleosome position in a 1000 bp radius."
                )
                runCounter(mutationFilePath, metadata, acceptableChromosomes,
                           nucGroupCountsFilePath, 1000,
                           not mapSortingVerified)

        mapSortingVerified = True

    return countsFilePaths
def generateNucleosomeFasta(baseNucPosFilePath, genomeFilePath, dyadRadius,
                            linkerOffset):
    """Return the path to a fasta file of (expanded) nucleosome sequences.

    The fasta file lives in an "intermediate_files" directory next to the
    nucleosome positions file.  If it already exists it is reused;
    otherwise the bed coordinates are expanded by
    dyadRadius + linkerOffset + 2 on each side and converted with bedToFasta.

    Args:
        baseNucPosFilePath: Path to the tab-separated nucleosome positions
            bed file (columns 2 and 3 hold the start and end positions).
        genomeFilePath: Path to the genome file handed to bedToFasta.
        dyadRadius: Either 73 or 1000.
        linkerOffset: Extra bp of linker DNA to include on each side.

    Returns:
        The path to the nucleosome fasta file.

    Raises:
        ValueError: If dyadRadius is neither 73 nor 1000.
    """

    # The current nucleosome map needs an intermediate files directory.
    intermediateFilesDir = os.path.join(os.path.dirname(baseNucPosFilePath),
                                        "intermediate_files")
    checkDirs(intermediateFilesDir)

    # The fasta file name depends on which radius is in use.
    if dyadRadius == 73:
        radiusSpecificArgs = {"linkerOffset": linkerOffset}
    elif dyadRadius == 1000:
        radiusSpecificArgs = {"usesNucGroup": True}
    else:
        raise ValueError("Invalid dyad radius: " + str(dyadRadius) +
                         ".  Expected 73 or 1000.")

    # The nucleosome map's name is its file name without the extension.
    nucleosomeMapName = os.path.basename(baseNucPosFilePath).rsplit('.', 1)[0]
    nucPosFastaFilePath = generateFilePath(directory=intermediateFilesDir,
                                           dataGroup=nucleosomeMapName,
                                           **radiusSpecificArgs,
                                           fileExtension=".fa")

    # Nothing to do if the fasta file was generated on an earlier run.
    if os.path.exists(nucPosFastaFilePath):
        print("Found relevant nucleosome fasta file:",
              os.path.basename(nucPosFastaFilePath))
        return nucPosFastaFilePath

    print("Nucleosome fasta file not found at: ",
          nucPosFastaFilePath,
          "\nGenerating...",
          sep='')

    # The expanded coordinates go to a (temporary) bed file.
    expandedNucPosBedFilePath = generateFilePath(
        directory=intermediateFilesDir,
        dataGroup=nucleosomeMapName,
        dataType="expanded",
        fileExtension=".bed")

    print("Expanding nucleosome coordinates...")
    with open(baseNucPosFilePath, 'r') as baseNucPosFile, \
            open(expandedNucPosBedFilePath, 'w') as expandedNucPosBedFile:

        for nucleosomeLine in baseNucPosFile:

            # Widen the start and end positions in place.
            fields = nucleosomeLine.strip().split('\t')
            fields[1] = str(int(fields[1]) - dyadRadius - linkerOffset - 2)
            fields[2] = str(int(fields[2]) + dyadRadius + linkerOffset + 2)

            # Nucleosomes that would start before the beginning of the
            # chromosome are reported and left out.
            if int(fields[1]) <= -1:
                print("Nucleosome at chromosome", fields[0],
                      "with expanded start pos", fields[1],
                      "extends into invalid positions.  Skipping.")
                continue

            expandedNucPosBedFile.write('\t'.join(fields) + '\n')

    # Turn the expanded coordinates into sequences.
    print("Converting expanded coordinates to fasta file...")
    bedToFasta(expandedNucPosBedFilePath,
               genomeFilePath,
               nucPosFastaFilePath,
               includeStrand=False)

    return nucPosFastaFilePath
コード例 #18
0
def parseICGC(ICGCFilePaths: List[str], genomeFilePath, separateDonors,
              stratifyByMS, stratifyByMutSig):
    """Convert gzipped ICGC files to custom bed files and parse those.

    Each ICGC file is rewritten as a 7-column, tab-separated custom bed file
    in an "intermediate_files" directory next to it; the resulting files are
    then handed to parseCustomBed.

    Args:
        ICGCFilePaths: Paths to the ICGC files.  Each must end in ".gz" and
            have "simple_somatic_mutation" in its file name.
        genomeFilePath: Path to the genome file, passed to ICGCIterator and
            parseCustomBed.
        separateDonors: Passed to parseCustomBed as its
            separateIndividualCohorts argument.
        stratifyByMS: Passed through to parseCustomBed.
        stratifyByMutSig: Passed through to parseCustomBed.

    Raises:
        UserInputError: If no ICGC file paths were given.
        InvalidPathError: If a path fails either of the naming checks.
    """

    if len(ICGCFilePaths) == 0:
        raise UserInputError("No ICGC files were found to parse.")

    customBedFilePaths = []

    # One custom bed file per ICGC file.
    for ICGCFilePath in ICGCFilePaths:

        ICGCFileName = os.path.basename(ICGCFilePath)
        print("\nWorking in:", ICGCFileName)

        # Validate the file name before touching the file system.
        if not ICGCFilePath.endswith(".gz"):
            raise InvalidPathError(
                ICGCFilePath,
                "Given ICGC file is not gzipped (.gz file format):")
        if "simple_somatic_mutation" not in ICGCFileName:
            raise InvalidPathError(
                ICGCFilePath,
                'Given ICGC file path does not have "simple_somatic_mutation" in the name:',
                "Note: if a directory was specified to search for ICGC input files, "
                "all files ending in .tsv.gz are selected.")

        # Set up the directories and metadata for this data group.
        dataDirectory = os.path.dirname(ICGCFilePath)
        intermediateFilesDir = os.path.join(dataDirectory,
                                            "intermediate_files")
        checkDirs(intermediateFilesDir)

        generateMetadata(getIsolatedParentDir(ICGCFilePath),
                         getIsolatedParentDir(genomeFilePath), ICGCFileName,
                         InputFormat.ICGC, dataDirectory)

        # The custom bed file goes with the other intermediate files.
        customBedFilePath = generateFilePath(
            directory=intermediateFilesDir,
            dataGroup=getIsolatedParentDir(ICGCFilePath),
            dataType=DataTypeStr.customInput,
            fileExtension=".bed")

        print("Writing data to custom bed format.")
        with gzip.open(ICGCFilePath, 'r') as ICGCFile, \
                open(customBedFilePath, 'w') as customBedFile:

            for mutation in ICGCIterator(ICGCFile, genomeFilePath):

                # Insertions and deletions use '*' in place of '-'.
                if mutation.mutatedFrom == '-':
                    mutation.mutatedFrom = '*'
                    # NOTE: This assumes that the given base pos (1-based)
                    # is after the insertion, not before.
                    mutation.startPos = str(int(mutation.startPos) - 1)
                elif mutation.mutatedTo == '-':
                    mutation.mutatedTo = '*'

                bedFields = (mutation.chromosome, mutation.startPos,
                             mutation.endPos, mutation.mutatedFrom,
                             mutation.mutatedTo, mutation.strand,
                             mutation.donorID)
                customBedFile.write('\t'.join(bedFields) + '\n')

        customBedFilePaths.append(customBedFilePath)

    # The custom bed parser takes it from here.
    print("\nPassing data to custom bed parser...")
    parseCustomBed(customBedFilePaths, genomeFilePath, stratifyByMS,
                   stratifyByMutSig, separateDonors, True)
コード例 #19
0
# Mutagen designations for the PAH mutation signatures, the ones related to
# tobacco smoke that we want to study.
KUCAB_PAH_DESIGNATIONS = ("MSM0.54", "MSM0.26", "MSM0.92", "MSM0.2",
                          "MSM0.42", "MSM0.74", "MSM0.103", "MSM0.14",
                          "MSM0.82", "MSM0.130", "MSM0.12", "MSM0.132",
                          "MSM0.13", "MSM0.96")
# The designations that specifically mimic the indel signature in smokers'
# lung cancer tumors.
KUCAB_LUNG_CANCER_DESIGNATIONS = ("MSM0.26", "MSM0.92", "MSM0.2", "MSM0.103",
                                  "MSM0.14")


def parseKucabCompendium(kucabSubstitutionsFilePaths: List[str],
                         genomeFilePath, nucPosFilePath, includeAllPAHs):
    """Extract the relevant Kucab substitutions into sorted trinuc bed files.

    For each input file, the mutations whose mutagen designation belongs to
    the selected group are written to a trinucleotide-context bed file in an
    "all_PAHs" or "smoker_lung" sub-directory (next to the input file), and
    that bed file is then sorted with the external "sort" command.

    Args:
        kucabSubstitutionsFilePaths: Paths to the tab-separated Kucab
            substitution files; each must end in "final.txt".
        genomeFilePath: Path to the genome file, used for metadata and to
            determine the acceptable chromosomes.
        nucPosFilePath: Path to the nucleosome positions file, used for
            metadata.
        includeAllPAHs: If truthy, keep every designation in
            KUCAB_PAH_DESIGNATIONS; otherwise keep only those in
            KUCAB_LUNG_CANCER_DESIGNATIONS.

    Raises:
        InvalidPathError: If an input path does not end in "final.txt".
        subprocess.CalledProcessError: If the sort command exits non-zero.
    """

    # The designation group and output sub-directory depend only on the
    # function's arguments, so they are chosen once up front.
    if includeAllPAHs:
        relevantDesignations = KUCAB_PAH_DESIGNATIONS
        subgroupName = "all_PAHs"
    else:
        relevantDesignations = KUCAB_LUNG_CANCER_DESIGNATIONS
        subgroupName = "smoker_lung"

    for kucabSubstitutionsFilePath in kucabSubstitutionsFilePaths:

        print("\nWorking in:", os.path.basename(kucabSubstitutionsFilePath))

        if not kucabSubstitutionsFilePath.endswith("final.txt"):
            raise InvalidPathError(
                kucabSubstitutionsFilePath,
                "Given kucab input file does not end in \"final.txt\":")

        # Prepare the output directory and data group name.
        localRootDirectory = os.path.dirname(kucabSubstitutionsFilePath)
        dataGroupName = (getIsolatedParentDir(kucabSubstitutionsFilePath) +
                         "_" + subgroupName)
        outputDirectory = os.path.join(localRootDirectory, subgroupName)

        # Make sure the data directory exists.
        os.makedirs(outputDirectory, exist_ok=True)

        # Generate the output file path and metadata.
        outputTrinucBedFilePath = generateFilePath(
            directory=outputDirectory,
            dataGroup=dataGroupName,
            context="trinuc",
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        generateMetadata(
            dataGroupName, getIsolatedParentDir(genomeFilePath),
            getIsolatedParentDir(nucPosFilePath),
            os.path.join("..", os.path.basename(kucabSubstitutionsFilePath)),
            outputDirectory)

        # Get the list of acceptable chromosomes.
        acceptableChromosomes = getAcceptableChromosomes(genomeFilePath)

        print("Reading data and writing to trinuc bed file...")
        with open(kucabSubstitutionsFilePath, 'r') as kucabSubstitutionsFile, \
                open(outputTrinucBedFilePath, 'w') as outputTrinucBedFile:

            # Skip the first line with headers.
            next(kucabSubstitutionsFile, None)

            for line in kucabSubstitutionsFile:

                # The lines are separated by tabs.  The relevant data have
                # the following indices in a tab-separated list:
                # 15: mutagen designation
                # 4: Chromosome
                # 5: Start Pos (1 base)
                # 6: Reference base
                # 7: Mutated base
                # 13: pre-base context
                # 14: post-base context
                choppedUpLine = line.strip().split('\t')

                # Skip the mutation if it does not belong to the relevant
                # group.
                if choppedUpLine[15] not in relevantDesignations:
                    continue

                # Handle the numeric X/Y chromosome formatting and then
                # check for invalid chromosomes.
                chromosome = "chr" + choppedUpLine[4]
                if chromosome == "chr23":
                    chromosome = "chrX"
                if chromosome == "chr24":
                    chromosome = "chrY"
                if chromosome not in acceptableChromosomes:
                    continue

                # Bed coordinates are 0-based at the start, 1-based at the
                # end.
                startPos1Base = choppedUpLine[5]
                startPos0Base = str(int(startPos1Base) - 1)

                mutatedFrom = choppedUpLine[6]
                mutatedTo = choppedUpLine[7]
                trinucContext = ''.join(
                    (choppedUpLine[13], mutatedFrom, choppedUpLine[14]))

                # If the mutated base is listed as arising from a purine,
                # flip the mutation and the strand.
                if isPurine(mutatedFrom):
                    mutation = reverseCompliment(
                        mutatedFrom) + '>' + reverseCompliment(mutatedTo)
                    strand = '-'
                    trinucContext = reverseCompliment(trinucContext)
                else:
                    mutation = mutatedFrom + '>' + mutatedTo
                    strand = '+'

                # Write the information to the trinuc bed file.
                outputTrinucBedFile.write('\t'.join(
                    (chromosome, startPos0Base, startPos1Base,
                     trinucContext, mutation, strand)) + '\n')

        # Sort the output file.
        print("Sorting output file...")
        subprocess.run(("sort", "-k1,1", "-k2,2n", outputTrinucBedFilePath,
                        "-o", outputTrinucBedFilePath),
                       check=True)