Beispiel #1
0
def removeUnacceptableChromosomes(bedFilePaths: List[str], genomeFastaFilepath):
    """
    Remove, in place, every entry on an unacceptable chromosome from the given bed files.

    Each file is filtered into a temporary "<path minus extension>_acceptable_only.bed" file,
    which then replaces the original.  The first occurrence of each rejected chromosome and
    the number of removed entries are printed.  Blank lines carry no entry, so they are
    dropped without being counted as removed.

    bedFilePaths: Paths to the bed files to filter.  Every file is overwritten.
    genomeFastaFilepath: Forwarded to getAcceptableChromosomes to obtain the chromosomes to keep.
    """

    acceptableChromosomes = getAcceptableChromosomes(genomeFastaFilepath)

    for bedFilePath in bedFilePaths:

        print("\nWorking in",os.path.basename(bedFilePath))

        # Create an intermediate file to write acceptable entries to.
        # os.path.splitext only strips an extension from the final path component, so a dot
        # in a directory name can no longer truncate the path of an extension-less file.
        acceptableChromBedFilePath = os.path.splitext(bedFilePath)[0] + "_acceptable_only.bed"

        removedEntries = 0
        # A set gives constant-time "already reported?" checks; each chromosome is still
        # printed at its first occurrence, so the output order is unchanged.
        unacceptableChromosomes = set()

        # Read through the bed file, writing only entries with acceptable chromosomes to the output file.
        with open(bedFilePath, 'r') as bedFile, \
             open(acceptableChromBedFilePath, 'w') as acceptableChromBedFile:

            for line in bedFile:

                fields = line.split()

                # A blank (or whitespace-only) line has no chromosome column; indexing it
                # would raise IndexError, so skip it instead.
                if not fields: continue

                chromosome = fields[0]
                if chromosome in acceptableChromosomes:
                    acceptableChromBedFile.write(line)
                else:
                    removedEntries += 1
                    if chromosome not in unacceptableChromosomes:
                        unacceptableChromosomes.add(chromosome)
                        print("Unallowed chromosome found:",chromosome)

        # Rewrite the original file.
        print("Removed",removedEntries,"Entries.  Rewriting original file...")
        os.replace(acceptableChromBedFilePath, bedFilePath)
def parseAlexandrov(bedInputFilePaths, genomeFilePath, nucPosFilePath):
    """
    Convert each tab-separated input file to custom bed format, then hand all converted
    files to parseCustomBed.

    For every input file, metadata is generated in the file's directory and the converted
    output is written into an "intermediate_files" sub-directory.  A row is kept only if
    "chr" + column 2 is an acceptable chromosome and column 5 contains no '/'.

    bedInputFilePaths: Paths of the input files to convert.
    genomeFilePath: Genome file path; used for metadata and to look up acceptable chromosomes.
    nucPosFilePath: Nucleosome positions file path; used for metadata and forwarded to parseCustomBed.
    """

    convertedFilePaths = []

    for inputPath in bedInputFilePaths:

        inputBasename = os.path.basename(inputPath)
        print("\nWorking in:", inputBasename)

        # Record metadata alongside the input file.
        dataDirectory = os.path.dirname(inputPath)
        generateMetadata(os.path.basename(dataDirectory),
                         getIsolatedParentDir(genomeFilePath),
                         getIsolatedParentDir(nucPosFilePath),
                         inputBasename,
                         InputFormat.customBed,
                         dataDirectory)

        # Converted output lives in the intermediate files directory.
        intermediateDir = os.path.join(dataDirectory, "intermediate_files")
        checkDirs(intermediateDir)

        validChromosomes = getAcceptableChromosomes(genomeFilePath)

        convertedPath = generateFilePath(directory=intermediateDir,
                                         dataGroup=getIsolatedParentDir(inputPath),
                                         dataType=DataTypeStr.customInput,
                                         fileExtension=".bed")

        with open(inputPath, 'r') as source, open(convertedPath, 'w') as destination:

            for rawLine in source:

                columns = rawLine.strip().split('\t')

                # Drop rows on unknown chromosomes and rows with a '/' in column 5.
                chromosome = "chr" + columns[2]
                if chromosome not in validChromosomes: continue
                if '/' in columns[5]: continue

                # A lone '-' in column 5 or 6 becomes '*' in custom bed format.
                col5 = '*' if columns[5] == '-' else columns[5]
                col6 = '*' if columns[6] == '-' else columns[6]

                # The start position (column 3) is shifted down by one for the bed output.
                bedFields = (chromosome, str(int(columns[3]) - 1), columns[4],
                             col5, col6, '.', columns[0])
                destination.write('\t'.join(bedFields) + '\n')

        convertedFilePaths.append(convertedPath)

    # Everything is converted; let the custom bed parser take over.
    print("\nPassing data to custom bed parser.\n")
    parseCustomBed(convertedFilePaths, genomeFilePath, nucPosFilePath,
                   False, False, False)
def parsePreparedInput(inputFilePaths: List[str], genomeFilePath, checkEachLine = True):
    """
    Validate already-prepared mutation files and generate their metadata.

    For each file, the name is checked against the data group name (taken from the parent
    directory) and the expected mutation-data suffix, the line formatting is checked with
    checkForErrors, and finally metadata (including a line count from "wc -l") is written.

    inputFilePaths: Paths of the prepared input files.
    genomeFilePath: Genome file path; used to look up acceptable chromosomes and for metadata.
    checkEachLine: When False, only the first line of each file is passed to checkForErrors.

    Raises UserInputError if no context can be determined from a file path, and
    InvalidPathError if a file is not named as expected.
    """

    for preparedPath in inputFilePaths:

        baseName = os.path.basename(preparedPath)
        print("\nWorking in",baseName)

        # --- File naming checks ---
        groupName = getIsolatedParentDir(preparedPath)
        context = getContext(preparedPath)

        if context is None:
            raise UserInputError("No context is apparent from the given prepared input file.")

        namePrecedingContext = baseName.split('_'+context)[0]
        if namePrecedingContext != groupName:
            raise InvalidPathError(preparedPath,
                                   "Prepared input file is not named as expected given the data group name generated from the "
                                   "parent directory.  Expected: \"" + groupName + "\" immediately preceding the context definition "
                                   "but given file path is:")

        expectedSuffix = DataTypeStr.mutations + ".bed"
        if not baseName.endswith(expectedSuffix):
            raise InvalidPathError(preparedPath,
                                   "Prepared input file is not named properly to indicate the presence of mutation data.  "
                                   "Expected a file ending in \"" + expectedSuffix + "\" but given path is:")

        # --- Line formatting checks ---
        acceptableChromosomes = getAcceptableChromosomes(genomeFilePath)
        acceptableChromosomesFilePath = getAcceptableChromosomes(genomeFilePath, True)

        print("Checking for errors in line formatting...")
        with open(preparedPath, 'r') as preparedFile:

            # The first line decides whether a cohort designation (a 7th column) is expected throughout.
            firstFields = preparedFile.readline().strip().split('\t')
            hasCohortColumn = len(firstFields) == 7
            checkForErrors(firstFields, hasCohortColumn, acceptableChromosomes,
                           acceptableChromosomesFilePath)

            if checkEachLine:
                for remainingLine in preparedFile:
                    checkForErrors(remainingLine.strip().split('\t'), hasCohortColumn,
                                   acceptableChromosomes, acceptableChromosomesFilePath)

        # --- Metadata ---
        print("Checks passed.  Generating metadata, including mutation counts using a call to wc -l")
        generateMetadata(groupName, getIsolatedParentDir(genomeFilePath),
                         baseName, InputFormat.prepared, os.path.dirname(preparedPath))

        wcOutput = subprocess.check_output(("wc", "-l", preparedPath), encoding = "UTF-8")
        featureCounts = int(wcOutput.split()[0])
        Metadata(preparedPath).addMetadata(Metadata.AddableKeys.mutCounts, featureCounts)
    def __init__(self, inputDataFilePath: str, callParamsFilePath: str, genomeFilePath: str):
        """
        Store the given paths, load the call parameters, and classify the input data file.

        inputDataFilePath: Must end in ".bigWig", ".bedGraph", or ".bed"; anything else raises InvalidPathError.
        callParamsFilePath: Whitespace-delimited file.  Each line holds a sequence length, a comma-separated
                            list of expected locations (single integers or "a:b" ranges), and a
                            comma-separated list of acceptable bases.
        genomeFilePath: Forwarded to getAcceptableChromosomes.
        """

        def toLocationRange(locationText):
            # Express one expected location as an ascending (low, high) tuple of ints.
            if ':' in locationText:
                firstBound, secondBound = locationText.split(':')
                low, high = sorted((int(firstBound), int(secondBound)))
                return (low, high)
            return (int(locationText), int(locationText))

        self.inputDataFilePath = inputDataFilePath
        self.genomeFilePath = genomeFilePath

        # The chromosomes that entries are allowed to be on.
        self.acceptableChromosomes = getAcceptableChromosomes(self.genomeFilePath)

        # Call parameters, keyed by sequence length.
        self.callParamsFilePath = callParamsFilePath
        self.expectedLocationsByLength = dict()
        self.acceptableBasesByLength = dict()

        with open(self.callParamsFilePath, 'r') as callParamsFile:
            for paramsLine in callParamsFile:

                fields: List[str] = paramsLine.split()

                # Each sequence length may only be defined once.
                sequenceLength = int(fields[0])
                assert sequenceLength not in self.expectedLocationsByLength

                locationRanges = list()
                self.expectedLocationsByLength[sequenceLength] = locationRanges
                for locationText in fields[1].split(','):
                    locationRanges.append(toLocationRange(locationText))

                self.acceptableBasesByLength[sequenceLength] = fields[2].split(',')

        # Nothing is known about the form of the input data yet.
        self.bigWigReadsFilePathPair = None
        self.bedGraphReadsFilePathPair = None
        self.bedInputFilePath = None

        # Tracks whether or not the input reads have been trimmed.
        self.readsHaveBeenTrimmed = False

        # Assign the input file to the slot matching its extension.
        inputPath = self.inputDataFilePath
        if inputPath.endswith(".bigWig"):
            self.bigWigReadsFilePathPair = getFilePathPair(self.inputDataFilePath)
        elif inputPath.endswith(".bedGraph"):
            self.bedGraphReadsFilePathPair = getFilePathPair(self.inputDataFilePath)
        elif inputPath.endswith(".bed"):
            self.bedInputFilePath = self.inputDataFilePath
        else:
            raise InvalidPathError(self.inputDataFilePath, "Input data file path is not in an acceptable format.  "
                                   "Expected a bigWig, bedGraph, or bed file type but received:")

        self.setUpFileSystem()
Beispiel #5
0
def parseStandardBed(standardBedFilePaths: List[str], genomeFilePath):
    """
    Convert standard bed files into custom bed input and pass them to parseCustomBed.

    For every input file, an output path is generated inside an "intermediate_files"
    sub-directory and metadata is generated in the input file's directory.  Entries on
    unacceptable chromosomes are skipped; for the rest, column 3 is replaced with '.',
    column 4 with "OTHER", and only the first six columns are written.

    standardBedFilePaths: Paths to the input files; each must end in ".bed".
    genomeFilePath: Genome file path; used for metadata and to look up acceptable chromosomes.

    Raises InvalidPathError if an input file lacks the ".bed" extension.
    """

    # Collects the converted files for the custom bed parser.
    convertedPaths = []

    for bedPath in standardBedFilePaths:

        bedBasename = os.path.basename(bedPath)
        print("\nWorking in:", bedBasename)
        if not bedBasename.endswith(".bed"):
            raise InvalidPathError(
                bedPath,
                "Given file does not appear to be in bed format. (missing \".bed\" extension)")

        # Directories and names derived from the input path.
        rootDirectory = os.path.dirname(bedPath)
        intermediateDir = os.path.join(rootDirectory, "intermediate_files")
        checkDirs(intermediateDir)
        groupName = getIsolatedParentDir(bedPath)

        # Output path first, then metadata.
        convertedPath = generateFilePath(directory=intermediateDir,
                                         dataGroup=groupName,
                                         dataType=DataTypeStr.customInput,
                                         fileExtension=".bed")
        convertedPaths.append(convertedPath)
        generateMetadata(groupName, getIsolatedParentDir(genomeFilePath),
                         bedBasename, InputFormat.standardBed, rootDirectory)

        validChromosomes = getAcceptableChromosomes(genomeFilePath)

        print("Converting entries for custom bed input...")
        with open(bedPath, 'r') as bedFile, open(convertedPath, 'w') as convertedFile:

            for entry in bedFile:

                columns = entry.strip().split("\t")

                # Entries outside the acceptable chromosomes are silently dropped.
                if columns[0] not in validChromosomes:
                    continue

                columns[3] = '.'
                columns[4] = "OTHER"

                convertedFile.write('\t'.join(columns[:6]) + '\n')

    # Hand the converted files over to the custom bed parser.
    parseCustomBed(convertedPaths, genomeFilePath, False, False, False, False)
    def __init__(self, ICGCFile: IO, genomeFilePath):
        """
        Set up the reader's state for the given ICGC data file.

        ICGCFile: The file containing the ICGC data.
        genomeFilePath: Forwarded to getAcceptableChromosomes.
        """

        # The file containing the ICGC data.
        self.ICGCFile = ICGCFile

        self.acceptableChromosomes = getAcceptableChromosomes(genomeFilePath)

        # Donors already seen, to make sure none is encountered more than once.
        self.finishedDonors = []

        # The donorID currently being read for mutation data.
        self.currentDonor = ''

        # Mutations unique to the current donor, to avoid writing duplicate mutations.
        self.currentDonorMutations = {}

        # The last donor's mutations, kept so data on individual donors is written all at once.
        self.previousDonorMutations: List[ICGCMutation] = []
def countInBindingMotifs(mutationFilePaths, bindingMotifsFilePaths):
    """
    Count mutations in binding motifs for every (mutation file, binding motifs file) pair.

    One ".tsv" counts file is generated per pair, in the directory given by the mutation
    file's metadata.  The output name is prefixed with whatever precedes "binding_motifs"
    in the binding motifs file's basename; a warning is issued if that marker is absent.

    mutationFilePaths: Paths to mutation files; each basename must contain DataTypeStr.mutations.
    bindingMotifsFilePaths: Paths to the binding motifs files.

    Returns the list of generated counts file paths.
    Raises ValueError if a mutation file is not named as expected.
    """

    countsFilePaths = []

    for mutationFilePath in mutationFilePaths:
        for bindingMotifsFilePath in bindingMotifsFilePaths:

            mutationBasename = os.path.basename(mutationFilePath)
            motifsBasename = os.path.basename(bindingMotifsFilePath)
            print("\nWorking with", mutationBasename, "and", motifsBasename)

            # Refuse anything that isn't named like a mutation file.
            if DataTypeStr.mutations not in mutationBasename:
                raise ValueError("Mutation file should have \"" +
                                 DataTypeStr.mutations + "\" in the name.")

            # Metadata supplies the output directory, data group name and genome file path.
            metadata = Metadata(mutationFilePath)

            # The text preceding "binding_motifs" in the basename names the binder.
            binder = motifsBasename.rsplit("binding_motifs", 1)[0]
            if "binding_motifs" not in motifsBasename:
                warnings.warn(
                    "\"binding_motifs\" not found in basename of binding motifs file.  The output file's name is probably a garbled mess."
                )

            countsFilePath = generateFilePath(
                directory=metadata.directory,
                dataGroup=metadata.dataGroupName,
                fileExtension=".tsv",
                dataType=binder + "binding_motif_mutation_counts")
            countsFilePaths.append(countsFilePath)

            # Run the count and write out its results.
            counter = CountsFileGenerator(
                mutationFilePath, bindingMotifsFilePath, countsFilePath,
                getAcceptableChromosomes(metadata.genomeFilePath))
            counter.count()
            counter.writeResults()

    return countsFilePaths
Beispiel #8
0
def generateMutationBackground(mutationFilePaths, backgroundContextNum):
    """
    Generate a mutation background file for each of the given mutation files.

    For each file, a genome context frequency file (created only if missing) and a mutation
    context frequency file are produced and then combined into the background file.

    mutationFilePaths: Paths to mutation files; each name must contain DataTypeStr.mutations.
    backgroundContextNum: Size of the sequence context (1-6).  It is increased by one for
                          files whose own context, as reported by getContext, is even.

    Returns the list of generated mutation background file paths.
    Raises InvalidPathError if a mutation file is not named as expected.
    """

    backgroundFilePaths = []

    # Maps a context size to the text used for it in file names.
    contextNumToText = {
        1: "singlenuc",
        2: "dinuc",
        3: "trinuc",
        4: "quadrunuc",
        5: "pentanuc",
        6: "hexanuc"
    }

    for mutationFilePath in mutationFilePaths:

        metadata = Metadata(mutationFilePath)
        intermediateDir = os.path.join(metadata.directory, "intermediate_files")

        # Files with even-length features get a context that is one larger.
        featuresAreEvenLength = getContext(mutationFilePath, asInt=True) % 2 == 0
        thisBackgroundContextNum = (backgroundContextNum + 1 if featuresAreEvenLength
                                    else backgroundContextNum)

        assert thisBackgroundContextNum in contextNumToText, "Unexpected background context number: " + str(
            thisBackgroundContextNum)
        contextText = contextNumToText[thisBackgroundContextNum]

        acceptableChromosomes = getAcceptableChromosomes(metadata.genomeFilePath)

        mutationFileName = os.path.basename(mutationFilePath)
        print("\nWorking in:", mutationFileName)
        if DataTypeStr.mutations not in mutationFileName:
            raise InvalidPathError(
                mutationFilePath,
                "Given mutation file does not have \"" +
                DataTypeStr.mutations + "\" in the name.",
                postPathMessage=
                "Are you sure you inputted a file from the mutperiod pipeline?")

        # Paths, in order: genome context frequencies, mutation context frequencies,
        # and the resulting background file.
        genomeFrequencyPath = generateFilePath(
            directory=os.path.dirname(metadata.genomeFilePath),
            dataGroup=metadata.genomeName,
            context=contextText,
            dataType="frequency",
            fileExtension=".tsv")

        mutationFrequencyPath = generateFilePath(
            directory=intermediateDir,
            dataGroup=metadata.dataGroupName,
            context=contextText,
            dataType="mutation_frequencies",
            fileExtension=".tsv")

        backgroundFilePath = generateFilePath(
            directory=metadata.directory,
            dataGroup=metadata.dataGroupName,
            context=contextText,
            dataType=DataTypeStr.mutBackground,
            fileExtension=".tsv")

        # The genome context frequency file is only generated when it is missing.
        if not os.path.exists(genomeFrequencyPath):
            print("Genome", contextText,
                  "context frequency file not found at path:",
                  genomeFrequencyPath)
            print("Generating genome " + contextText +
                  " context frequency file...")
            generateGenomeContextFrequencyFile(metadata.genomeFilePath,
                                               genomeFrequencyPath,
                                               thisBackgroundContextNum,
                                               contextText,
                                               acceptableChromosomes)

        # Make sure the intermediate files directory is there before writing into it.
        if not os.path.exists(intermediateDir):
            os.mkdir(intermediateDir)

        print("Generating mutation context frequency file...")
        generateMutationContextFrequencyFile(mutationFilePath,
                                             mutationFrequencyPath,
                                             thisBackgroundContextNum,
                                             contextText,
                                             acceptableChromosomes)

        # Combine both frequency files into the background file.
        generateMutationBackgroundFile(genomeFrequencyPath,
                                       mutationFrequencyPath,
                                       backgroundFilePath, contextText)

        backgroundFilePaths.append(backgroundFilePath)

    return backgroundFilePaths
def parseDeaminationData(cPDFilePaths: List[str],
                         deaminationFilePaths: List[str], genomeFastaFilePath):
    """
    Parse CPD and deamination data files into bed files.  (See also the script header.)

    For every path in cPDFilePaths, two files are written next to the input:
      - "<path minus extension>_parsed.bed": one six-column entry
        (chromosome, start, end, CPD sequence, '.', strand) per accepted input row.
      - "<path minus extension>_cytosines.bed": one single-base entry for every 'C'
        found in the sequence column of each parsed entry.

    For every path in deaminationFilePaths, two files are written next to the input:
      - "<path minus extension>_parsed.bed": one six-column entry
        (chromosome, start, end, trinucleotide, '.', strand) per accepted input row.
      - "<path minus extension>_dipy_cytosines.bed": the parsed entries narrowed by one
        position on each side, excluding those where isPurine is true for both the first
        and the third base of the trinucleotide.

    In every input file the first line is skipped as a header, rows are split on
    whitespace, and rows whose chromosome is not in the collection returned by
    getAcceptableChromosomes are skipped with a printed notice.

    cPDFilePaths: Paths to the CPD data files.
    deaminationFilePaths: Paths to the deamination data files.
    genomeFastaFilePath: Forwarded to getAcceptableChromosomes and addSequenceToBed.

    Raises AssertionError (carrying the offending line) when the sequence columns of a
    row disagree with each other.
    """

    acceptableChromosomes = getAcceptableChromosomes(genomeFastaFilePath)

    for cPDFilePath in cPDFilePaths:

        print("\nWorking in", os.path.basename(cPDFilePath))

        # Create a name for the parsed output file.
        cPDParsedFilePath = cPDFilePath.rsplit('.', 1)[0] + "_parsed.bed"

        # Create a path to the output file with only cytosine positions.
        cPDCytosinePositionsFilePath = cPDFilePath.rsplit(
            '.', 1)[0] + "_cytosines.bed"

        with open(cPDFilePath, 'r') as cPDFile:
            with open(cPDParsedFilePath, 'w') as cPDParsedFile:

                print("Parsing original file...")

                cPDFile.readline()  # Skip the header line.

                for line in cPDFile:

                    choppedUpLine = line.split()

                    # Record position information, inferring strand from the given gap sequencing value
                    # and extending positions to encompass the full CPD sequence.
                    chromosome = choppedUpLine[0]

                    # Check for invalid chromosomes
                    if chromosome not in acceptableChromosomes:
                        print("Skipping invalid chromosome:", chromosome)
                        continue

                    # A purine in column 3 is treated as the minus strand and the range is extended
                    # by one position at its start; otherwise it is extended by one at its end.
                    if isPurine(choppedUpLine[3]):
                        strand = '-'
                        position0 = str(int(choppedUpLine[1]) - 1)
                        position1 = choppedUpLine[2]
                    else:
                        strand = '+'
                        position0 = choppedUpLine[1]
                        position1 = str(int(choppedUpLine[2]) + 1)

                    # Make sure there is agreement among the weird sequence columns as to the CPD sequence:
                    # the last two characters of columns 5 and 6 must equal the first two of columns 7 and 8.
                    # NOTE(review): this validation is an assert, so it disappears under "python -O".
                    assert choppedUpLine[5][-2:] == choppedUpLine[6][
                        -2:] == choppedUpLine[7][:2] == choppedUpLine[
                            8][:2], line
                    cPD = choppedUpLine[5][-2:]

                    cPDParsedFile.write('\t'.join(
                        (chromosome, position0, position1, cPD, '.', strand)) +
                                        '\n')

        # Derive the sequences directly from the positions and make sure it matches the cPD value obtained previously.
        # At the same time, create the file with only the cytosine positions in CPDs.
        print("Deriving sequence context from given fasta file...")
        # NOTE(review): addSequenceToBed is defined elsewhere; the assert below implies it rewrites the
        # parsed file with the fasta-derived sequence in column 4 (the former '.') - confirm.
        addSequenceToBed(cPDParsedFilePath, genomeFastaFilePath, 4)
        with open(cPDParsedFilePath, 'r') as cPDParsedFile:
            with open(cPDCytosinePositionsFilePath,
                      'w') as cPDCytosinePositionsFile:

                print(
                    "Validating original file and trimming to single base cytosine positions..."
                )

                for line in cPDParsedFile:

                    choppedUpLine = line.split()
                    # The sequence from the input (column 3) must equal the one in column 4.
                    assert choppedUpLine[3] == choppedUpLine[4], line

                    # Emit one single-base entry per cytosine in the sequence.
                    for i, base in enumerate(choppedUpLine[3]):
                        if base == 'C':

                            # On '+' the i-th base sits i positions after the start; otherwise the
                            # offset is counted back from start + 1.
                            if choppedUpLine[5] == '+':
                                position0 = str(int(choppedUpLine[1]) + i)
                            else:
                                position0 = str(int(choppedUpLine[1]) + 1 - i)
                            position1 = str(int(position0) + 1)

                            cPDCytosinePositionsFile.write('\t'.join(
                                (choppedUpLine[0], position0, position1, '.',
                                 '.', choppedUpLine[5])) + '\n')

    for deaminationFilePath in deaminationFilePaths:

        print("\nWorking in", os.path.basename(deaminationFilePath))

        # Create a name for the parsed output file.
        deaminationParsedFilePath = deaminationFilePath.rsplit(
            '.', 1)[0] + "_parsed.bed"

        # Create a path to the output file with only cytosine positions in dipy contexts.
        dipyDeaminationPositionsFilePath = deaminationFilePath.rsplit(
            '.', 1)[0] + "_dipy_cytosines.bed"

        with open(deaminationFilePath, 'r') as deaminationFile:
            with open(deaminationParsedFilePath, 'w') as deaminationParsedFile:

                print("Parsing original file...")

                deaminationFile.readline()  # Skip the header line.

                for line in deaminationFile:

                    choppedUpLine = line.split()

                    # Check for non-cytosine positions, and ignore them if found.
                    if choppedUpLine[3] in ('A', 'T'): continue

                    # Record position information for all other rows.  Expand to trinucleotide context.
                    chromosome = choppedUpLine[0]

                    # Check for invalid chromosomes
                    if chromosome not in acceptableChromosomes:
                        print("Skipping invalid chromosome:", chromosome)
                        continue

                    position0 = str(int(choppedUpLine[1]) - 1)
                    position1 = str(int(choppedUpLine[2]) + 1)
                    # 'C' in column 3 means the plus strand; any other remaining value means minus.
                    if choppedUpLine[3] == 'C': strand = '+'
                    else: strand = '-'
                    trinuc = choppedUpLine[5]

                    deaminationParsedFile.write('\t'.join(
                        (chromosome, position0, position1, trinuc, '.',
                         strand)) + '\n')

        # Derive the trinucleotide context sequences directly from the positions and make sure it matches the sequence obtained from the file.
        # At the same time, create the file with only the cytosine positions with an adjacent pyrimidine.
        print("Deriving sequence context from given fasta file...")
        addSequenceToBed(deaminationParsedFilePath,
                         genomeFastaFilePath,
                         substitutionPosition=4)
        with open(deaminationParsedFilePath, 'r') as deaminationParsedFile:
            with open(dipyDeaminationPositionsFilePath,
                      'w') as dipyDeaminationPositionsFile:

                print(
                    "Validating original file and trimming cytosines without adjacent pyrimidines..."
                )

                for line in deaminationParsedFile:

                    choppedUpLine = line.split()

                    # The trinucleotide from the input (column 3) must equal the one in column 4.
                    assert choppedUpLine[3] == choppedUpLine[4], line

                    # Skip the entry when both flanking bases are purines.
                    if isPurine(choppedUpLine[3][0]) and isPurine(
                            choppedUpLine[3][2]):
                        continue
                    else:

                        # Undo the one-position expansion applied on each side during parsing.
                        position0 = str(int(choppedUpLine[1]) + 1)
                        position1 = str(int(choppedUpLine[2]) - 1)

                        dipyDeaminationPositionsFile.write('\t'.join(
                            (choppedUpLine[0], position0, position1, '.', '.',
                             choppedUpLine[5])) + '\n')
Beispiel #10
0
def countNucleosomePositionMutations(mutationFilePaths, nucleosomeMapNames,
                                     countSingleNuc, countNucGroup,
                                     linkerOffset):
    """
    Count mutations at nucleosome positions for every (mutation file, nucleosome map) pair.

    Special case: when exactly one mutation file and one nucleosome map name are given and
    the file's basename (minus extension) equals the map name, the map is counted against
    itself in a 1000 bp radius and a single-element list with that counts file is returned.

    Otherwise, each pair gets its own "<mutation file directory>/<nucleosome map name>"
    directory (with metadata generated on first use), and counts files are written there.

    mutationFilePaths: Paths to mutation files; each basename must contain DataTypeStr.mutations.
    nucleosomeMapNames: Names of the nucleosome maps to count against.
    countSingleNuc: If truthy, count in a radius of 73 bp plus linkerOffset.
    countNucGroup: If truthy, count in a radius of 1000 bp.
    linkerOffset: Extra bp added to the 73 bp radius; also passed to generateFilePath.

    Returns the list of generated counts file paths.
    Raises UserInputError if neither countSingleNuc nor countNucGroup is set, and
    InvalidPathError if a mutation file is not named as expected.
    """

    # Check for the special case where a nucleosome map is being counted against itself to determine the nucleosome repeat length.
    if (len(mutationFilePaths) == 1 and len(nucleosomeMapNames) == 1
            and os.path.basename(mutationFilePaths[0]).rsplit(
                '.', 1)[0] == nucleosomeMapNames[0]):

        nucleosomeMapFilePath = mutationFilePaths[0]
        nucleosomeMapName = nucleosomeMapNames[0]

        print("Counting nucleosome map", nucleosomeMapName,
              "against itself in a 1000 bp radius.")

        countsFilePath = generateFilePath(
            directory=os.path.dirname(nucleosomeMapFilePath),
            dataGroup=nucleosomeMapName,
            usesNucGroup=True,
            fileExtension=".tsv",
            dataType="self_" + DataTypeStr.rawNucCounts)
        # NOTE(review): this passes the directory two levels above the nucleosome map file, whereas
        # the call further down passes metadata.genomeFilePath - confirm getAcceptableChromosomes
        # accepts a directory here.
        acceptableChromosomes = getAcceptableChromosomes(
            os.path.dirname(os.path.dirname(nucleosomeMapFilePath)))

        # The same file is given as both the first and the second input of the counter.
        counter = NucleosomesInNucleosomesCounter(
            nucleosomeMapFilePath,
            nucleosomeMapFilePath,
            countsFilePath,
            encompassingFeatureExtraRadius=1000,
            acceptableChromosomes=acceptableChromosomes)
        counter.count()

        return [countsFilePath]

    if not (countSingleNuc or countNucGroup):
        raise UserInputError(
            "Must count in either a single nucleosome or group nucleosome radius."
        )

    nucleosomeMutationCountsFilePaths = list(
    )  # A list of paths to the output files generated by the function
    nucleosomeMapSortingChecked = False  # Use this to make sure files are checked for sorting only once.

    # Loop through each given mutation file path, creating a corresponding nucleosome mutation count file for each.
    for mutationFilePath in mutationFilePaths:

        print("\nWorking with", os.path.split(mutationFilePath)[1])

        # Make sure we have the expected file type.
        if not DataTypeStr.mutations in os.path.basename(mutationFilePath):
            raise InvalidPathError(
                mutationFilePath,
                "Given mutation file does not have \"" +
                DataTypeStr.mutations + "\" in the name.",
                postPathMessage=
                "Are you sure you inputted a file from the mutperiod pipeline?"
            )

        for nucleosomeMapName in nucleosomeMapNames:

            print("Counting with nucleosome map:", nucleosomeMapName)

            # Generate the path to the nucleosome-map-specific directory.
            nucleosomeMapDataDirectory = os.path.join(
                os.path.dirname(mutationFilePath), nucleosomeMapName)
            checkDirs(nucleosomeMapDataDirectory)

            # Check to see if the metadata for this directory has been generated before, and if not, set it up!
            if not os.path.exists(
                    os.path.join(nucleosomeMapDataDirectory, ".metadata")):

                print("No metadata found.  Generating...")

                parentMetadata = Metadata(mutationFilePath)

                # Check to see if the data name should be altered by this nucleosome map.
                dataGroupName = parentMetadata.dataGroupName

                # If the map's directory (next to the genome file) holds "append_to_data_name.txt",
                # the stripped first line of that file is appended to the data group name.
                dataGroupNameSuffixFilePath = os.path.join(
                    os.path.dirname(parentMetadata.genomeFilePath),
                    nucleosomeMapName, "append_to_data_name.txt")
                if os.path.exists(dataGroupNameSuffixFilePath):

                    with open(dataGroupNameSuffixFilePath
                              ) as dataGroupNameSuffixFile:
                        dataGroupName += dataGroupNameSuffixFile.readline(
                        ).strip()

                # The new directory sits one level below the mutation file's directory, so the
                # parent's local data path is prefixed with "..".
                generateMetadata(
                    dataGroupName,
                    parentMetadata.genomeName,
                    os.path.join("..", parentMetadata.localParentDataPath),
                    parentMetadata.inputFormat,
                    nucleosomeMapDataDirectory,
                    *parentMetadata.cohorts,
                    callParamsFilePath=parentMetadata.callParamsFilePath,
                    associatedNucleosomePositions=nucleosomeMapName)

            # Get metadata and use it to generate a path to the nucleosome positions file.
            metadata = Metadata(nucleosomeMapDataDirectory)

            # Get the list of acceptable chromosomes
            acceptableChromosomes = getAcceptableChromosomes(
                metadata.genomeFilePath)

            # Generate the counts file for a single nucleosome region if requested.
            if countSingleNuc:

                # Generate the output file path
                nucleosomeMutationCountsFilePath = generateFilePath(
                    directory=metadata.directory,
                    dataGroup=metadata.dataGroupName,
                    linkerOffset=linkerOffset,
                    fileExtension=".tsv",
                    dataType=DataTypeStr.rawNucCounts)

                # Ready, set, go!
                print(
                    "Counting mutations at each nucleosome position in a 73 bp radius +",
                    str(linkerOffset), "bp linker DNA.")
                # checkForSortedFiles: the first flag is always True; the second is True only
                # until the first mutation file has been fully processed.
                counter = MutationsInNucleosomesCounter(
                    mutationFilePath,
                    metadata.baseNucPosFilePath,
                    nucleosomeMutationCountsFilePath,
                    encompassingFeatureExtraRadius=73 + linkerOffset,
                    acceptableChromosomes=acceptableChromosomes,
                    checkForSortedFiles=(True,
                                         not nucleosomeMapSortingChecked))
                counter.count()

                nucleosomeMutationCountsFilePaths.append(
                    nucleosomeMutationCountsFilePath)

            # Generate the counts file for a nucleosome group region if requested.
            if countNucGroup:

                # Generate the output file path
                nucleosomeMutationCountsFilePath = generateFilePath(
                    directory=metadata.directory,
                    dataGroup=metadata.dataGroupName,
                    usesNucGroup=True,
                    fileExtension=".tsv",
                    dataType=DataTypeStr.rawNucCounts)

                # Ready, set, go!
                print(
                    "Counting mutations at each nucleosome position in a 1000 bp radius."
                )
                counter = MutationsInNucleosomesCounter(
                    mutationFilePath,
                    metadata.baseNucPosFilePath,
                    nucleosomeMutationCountsFilePath,
                    encompassingFeatureExtraRadius=1000,
                    acceptableChromosomes=acceptableChromosomes,
                    checkForSortedFiles=(True,
                                         not nucleosomeMapSortingChecked))
                counter.count()

                nucleosomeMutationCountsFilePaths.append(
                    nucleosomeMutationCountsFilePath)

        # Every nucleosome map has now been through the counter with the first mutation file,
        # so the second checkForSortedFiles flag is False for all later mutation files.
        nucleosomeMapSortingChecked = True

    return nucleosomeMutationCountsFilePaths
Beispiel #11
0
def autoAcquireAndQACheck(bedInputFilePath: str, genomeFilePath,
                          autoAcquiredFilePath, onlySingleBaseSubs,
                          includeIndels):
    """
    Check the formatting of a custom bed file and resolve its auto-acquire requests.

    Every line is split on tabs and passed to checkForErrors.  Afterwards:
      - A '.' in the base column (index 3) is replaced with the sequence of the
        equivalent entry in the fasta file generated by bedToFasta.
      - A '.' in the strand column (index 5) is replaced with '+' or '-' by comparing
        the given sequence to that fasta entry.  (Skipped when the base column is '*'.)
      - A '.' in the "altered to" column (index 4) is replaced with "OTHER".
    The processed lines are written to bedInputFilePath + ".tmp", which replaces the
    input file if anything was auto-acquired and is deleted otherwise.

    Parameters:
        bedInputFilePath: Path to the tab-separated custom bed file to check.
        genomeFilePath: Path to the genome fasta file; used to get the acceptable
            chromosomes and to generate the auto-acquire fasta file.
        autoAcquiredFilePath: Path where bedToFasta writes the auto-acquire fasta file.
        onlySingleBaseSubs: If truthy, only lines accepted by isSingleBaseSubstitution
            contribute to the sequence context.
        includeIndels: If truthy, the context is fixed at 0 and never determined from the file.

    Returns:
        The sequence context of the file: the shared length of the base column across
        all contributing lines, 0 if those lengths differ (or includeIndels is set),
        float("inf") if the shared length is greater than 6, or None if no line
        contributed to the context (e.g. an empty file).

    Raises:
        AssertionError: If no equivalent fasta entry can be found for a line, or if the
            given sequence matches neither the genome sequence nor its reverse compliment.
    """

    print(
        "Checking custom bed file for formatting and auto-acquire requests...")

    # To start, assume that no sequences need to be acquired, and do it on the fly if need be.
    autoAcquiring = False
    autoAcquiredFile = None
    autoAcquireFastaIterator = None
    fastaEntry = None
    cohortDesignationPresent = None

    # Unless indels are included, determine the context of the features in the file.
    if includeIndels: context = 0
    else: context = None

    # Get the list of acceptable chromosomes
    acceptableChromosomes = getAcceptableChromosomes(genomeFilePath)
    acceptableChromosomesFilePath = getAcceptableChromosomes(
        genomeFilePath, True)

    # Create a temporary file to write the data to (potentially after auto-acquiring).
    # Will replace original file at the end if auto-acquiring occurred.
    temporaryBedFilePath = bedInputFilePath + ".tmp"

    # The try/finally guarantees that the fasta file opened on the fly is closed again,
    # whether the pass over the input finishes normally or raises.
    try:

        # Iterate through the input file one line at a time, checking the format of each entry and looking for auto-acquire requests.
        with open(bedInputFilePath, 'r') as bedInputFile:
            with open(temporaryBedFilePath, 'w') as temporaryBedFile:
                for line in bedInputFile:

                    choppedUpLine = line.strip().split('\t')

                    # If it isn't already, initialize the cohortDesignationPresent variable.
                    # (Decided by the first line only: 7 columns means a cohort designation is present.)
                    if cohortDesignationPresent is None:
                        cohortDesignationPresent = len(choppedUpLine) == 7

                    # Check for possible error states.
                    checkForErrors(choppedUpLine, cohortDesignationPresent,
                                   acceptableChromosomes,
                                   acceptableChromosomesFilePath)

                    needsStrand = (choppedUpLine[5] == '.'
                                   and choppedUpLine[3] != '*')

                    # If this is the first entry requiring auto-acquiring, generate the required fasta file.
                    if (not autoAcquiring and
                        (choppedUpLine[3] == '.' or choppedUpLine[4] == '.'
                         or needsStrand)):
                        print(
                            "Found line with auto-acquire requested.  Generating fasta..."
                        )
                        autoAcquiring = True
                        bedToFasta(bedInputFilePath, genomeFilePath,
                                   autoAcquiredFilePath)
                        autoAcquiredFile = open(autoAcquiredFilePath, 'r')
                        autoAcquireFastaIterator = FastaFileIterator(
                            autoAcquiredFile)
                        fastaEntry = autoAcquireFastaIterator.readEntry()
                        print("Continuing...")

                    # Check for any base identities that need to be auto-acquired.
                    if choppedUpLine[3] == '.':
                        fastaEntry = _findEquivalentFastaEntry(
                            autoAcquireFastaIterator, fastaEntry,
                            choppedUpLine)
                        choppedUpLine[3] = fastaEntry.sequence

                    # Check for any strand designations that need to be auto-acquired.
                    # Also, make sure this isn't an insertion, in which case the strand designation cannot be determined.
                    if needsStrand:

                        fastaEntry = _findEquivalentFastaEntry(
                            autoAcquireFastaIterator, fastaEntry,
                            choppedUpLine)

                        # Determine which strand is represented.
                        if fastaEntry.sequence == choppedUpLine[3]:
                            choppedUpLine[5] = '+'
                        elif fastaEntry.sequence == reverseCompliment(
                                choppedUpLine[3]):
                            choppedUpLine[5] = '-'
                        else:
                            # Raised explicitly (rather than asserted) so the check survives "python -O".
                            raise AssertionError(
                                "The given sequence " + choppedUpLine[3] +
                                " for location " + fastaEntry.sequenceName +
                                ' ' +
                                "does not match the corresponding sequence in the given genome, or its reverse compliment."
                            )

                    # Change any '.' characters in the "altered to" column to "OTHER"
                    if choppedUpLine[4] == '.': choppedUpLine[4] = "OTHER"

                    # Determine the sequence context of the line and whether or not it matches the sequence context for other.
                    # Skip this if the file is "mixed", this line is an indel, or only single base substitutions are allowed and this line isn't one.
                    if (context != 0 and choppedUpLine[3] != '*'
                            and choppedUpLine[4] != '*'
                            and (not onlySingleBaseSubs
                                 or isSingleBaseSubstitution(choppedUpLine))):

                        thisContext = len(choppedUpLine[3])
                        if context is None: context = thisContext
                        elif thisContext != context: context = 0

                    # Write the current line to the temporary bed file.
                    temporaryBedFile.write('\t'.join(choppedUpLine) + '\n')

    finally:
        if autoAcquiredFile is not None: autoAcquiredFile.close()

    # If any lines were auto-acquired, replace the input bed file with the temporary bed file. (Which has auto-acquires)
    if autoAcquiring:
        print(
            "Overwriting custom bed input with auto-acquired bases/strand designations."
        )
        os.replace(temporaryBedFilePath, bedInputFilePath)
    # Otherwise, just delete the temporary file.
    else:
        os.remove(temporaryBedFilePath)

    # The context is still None if no line contributed to it; guard the comparison
    # so that case returns None instead of raising a TypeError.
    if context is not None and context > 6: context = float("inf")
    return context


def _findEquivalentFastaEntry(fastaIterator, fastaEntry, choppedUpLine):
    # Advance through the fasta file, starting from the given entry, until reaching the
    # entry that equivalentEntries accepts for the given bed line, and return that entry.
    while not equivalentEntries(fastaEntry, choppedUpLine):
        if fastaIterator.eof:
            raise AssertionError(
                "Reached end of fasta file without finding a match for: " +
                ' '.join(choppedUpLine))
        fastaEntry = fastaIterator.readEntry()
    return fastaEntry
def parseKucabCompendium(kucabSubstitutionsFilePaths: List[str],
                         genomeFilePath, nucPosFilePath, includeAllPAHs):
    """
    Convert Kucab compendium substitution files into sorted trinucleotide-context bed files.

    For each input file, the mutations whose mutagen designation belongs to the
    relevant group are written (one bed line each) to an output file in a
    subdirectory next to the input: "all_PAHs" if includeAllPAHs is truthy,
    "smoker_lung" otherwise.  Metadata is generated for the output directory and
    the output file is sorted by chromosome and start position.

    Parameters:
        kucabSubstitutionsFilePaths: Paths to the input files.  Each must end in "final.txt".
        genomeFilePath: Path to the genome fasta file; used to get the acceptable
            chromosomes and for the metadata.
        nucPosFilePath: Path to the nucleosome positions file; used for the metadata.
        includeAllPAHs: If truthy, keep every PAH designation.  Otherwise, keep only
            the designations specific to smokers' lung cancer.

    Raises:
        InvalidPathError: If an input path does not end in "final.txt".
        subprocess.CalledProcessError: If sorting an output file fails.
    """

    # These are the designations for PAH mutation signatures, the ones related to tobacco smoke that we want to study.
    # NOTE: Every entry needs its own trailing comma.  Adjacent string literals are
    # silently concatenated, which would merge two designations into one that never matches.
    PAHDesignations = ("MSM0.54", "MSM0.26", "MSM0.92", "MSM0.2",
                       "MSM0.42", "MSM0.74", "MSM0.103",
                       "MSM0.14", "MSM0.82", "MSM0.130", "MSM0.12",
                       "MSM0.132", "MSM0.13", "MSM0.96")
    # These designations specifically mimic the indel signature in smokers' lung cancer tumors.
    LungCancerSpecificDesignations = ("MSM0.26", "MSM0.92", "MSM0.2",
                                      "MSM0.103", "MSM0.14")

    # Set the designations that will be used to collect data based on the input to the function.
    # (Stored as a frozenset, as it is only used for membership tests on every input line.)
    if includeAllPAHs:
        relevantDesignations = frozenset(PAHDesignations)
    else:
        relevantDesignations = frozenset(LungCancerSpecificDesignations)

    for kucabSubstitutionsFilePath in kucabSubstitutionsFilePaths:

        print("\nWorking in:", os.path.basename(kucabSubstitutionsFilePath))

        if not kucabSubstitutionsFilePath.endswith("final.txt"):
            raise InvalidPathError(
                kucabSubstitutionsFilePath,
                "Given kucab input file does not end in \"final.txt\":")

        # Prepare the output file path.
        localRootDirectory = os.path.dirname(kucabSubstitutionsFilePath)
        dataGroupName = getIsolatedParentDir(kucabSubstitutionsFilePath)
        if includeAllPAHs:
            outputDirectory = os.path.join(localRootDirectory, "all_PAHs")
            dataGroupName += "_all_PAHs"
        else:
            outputDirectory = os.path.join(localRootDirectory, "smoker_lung")
            dataGroupName += "_smoker_lung"

        # Make sure the data directory exists.
        # (exist_ok avoids the race between checking for the directory and creating it.)
        os.makedirs(outputDirectory, exist_ok=True)

        # Generate the output file path and metadata
        outputTrinucBedFilePath = generateFilePath(
            directory=outputDirectory,
            dataGroup=dataGroupName,
            context="trinuc",
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        generateMetadata(
            dataGroupName, getIsolatedParentDir(genomeFilePath),
            getIsolatedParentDir(nucPosFilePath),
            os.path.join("..", os.path.basename(kucabSubstitutionsFilePath)),
            outputDirectory)

        # Get the list of acceptable chromosomes
        acceptableChromosomes = getAcceptableChromosomes(genomeFilePath)

        print("Reading data and writing to trinuc bed file...")
        with open(kucabSubstitutionsFilePath, 'r') as kucabSubstitutionsFile:
            with open(outputTrinucBedFilePath, 'w') as outputTrinucBedFile:

                # Skip the first line with headers.  (The default keeps an empty file from raising StopIteration.)
                next(kucabSubstitutionsFile, None)

                for line in kucabSubstitutionsFile:

                    # Skip blank lines (e.g. trailing newlines at the end of the file), which have no columns to index.
                    if not line.strip(): continue

                    # The lines are separated by tabs.  The relevant data have the following indices in a tab-separated list:
                    # 15: mutagen designation
                    # 4: Chromosome
                    # 5: Start Pos (1 base)
                    # 6: Reference base
                    # 7: Mutated base
                    # 13: pre-base context
                    # 14: post-base context
                    choppedUpLine = line.strip().split('\t')

                    # Skip the mutation if it does not belong to the relevant group.
                    if choppedUpLine[15] not in relevantDesignations: continue

                    # Compile the necessary information for the bed file.
                    chromosome = "chr" + choppedUpLine[4]

                    # Handle the weird chromosome formatting and then check for invalid chromosomes.
                    if chromosome == "chr23": chromosome = "chrX"
                    if chromosome == "chr24": chromosome = "chrY"
                    if chromosome not in acceptableChromosomes: continue

                    # The input position is 1-based; the bed start position is 0-based.
                    startPos1Base = choppedUpLine[5]
                    startPos0Base = str(int(startPos1Base) - 1)

                    mutatedFrom = choppedUpLine[6]
                    mutatedTo = choppedUpLine[7]
                    trinucContext = ''.join(
                        (choppedUpLine[13], mutatedFrom, choppedUpLine[14]))

                    # If the mutated base is listed as arising from a purine, flip the mutation and the strand.
                    if isPurine(mutatedFrom):
                        mutation = reverseCompliment(
                            mutatedFrom) + '>' + reverseCompliment(mutatedTo)
                        strand = '-'
                        trinucContext = reverseCompliment(trinucContext)
                    else:
                        mutation = mutatedFrom + '>' + mutatedTo
                        strand = '+'

                    # Write the information to the trinuc bed file.
                    outputTrinucBedFile.write('\t'.join(
                        (chromosome, startPos0Base, startPos1Base,
                         trinucContext, mutation, strand)) + '\n')

        # Sort the output file.
        print("Sorting output file...")
        subprocess.run(("sort", "-k1,1", "-k2,2n", outputTrinucBedFilePath,
                        "-o", outputTrinucBedFilePath),
                       check=True)