def parseAlexandrov(bedInputFilePaths, genomeFilePath, nucPosFilePath):
    """Convert Alexandrov-style mutation files to custom bed format, then parse them.

    For each input file, metadata is generated for its directory, an
    "intermediate_files" directory is ensured beside it, and every usable row
    is rewritten into a custom bed file inside that directory.  The written
    files are finally handed to parseCustomBed.

    Input rows are tab-separated and read positionally (0-based columns):
    column 2 is prefixed with "chr" to form the chromosome, column 3 is
    decremented by one to form the output start, column 4 is the output end,
    columns 5 and 6 are copied through (a '-' in either becomes '*'), and
    column 0 is written as the last output column.  Rows whose chromosome is
    not acceptable for the genome, or whose column 5 contains '/', are
    dropped.  Blank lines are skipped.

    Args:
        bedInputFilePaths: Paths of the tab-separated input files.
        genomeFilePath: Path of the genome file; used to look up acceptable
            chromosomes and forwarded to parseCustomBed.
        nucPosFilePath: Path of the nucleosome positions file; used for
            metadata and forwarded to parseCustomBed.

    Raises:
        IndexError: If a non-blank row that reaches the column checks has
            fewer columns than are indexed.
    """

    outputBedFilePaths = list()

    for bedInputFilePath in bedInputFilePaths:

        print("\nWorking in:", os.path.basename(bedInputFilePath))

        # Get some important file system paths for the rest of the function and generate metadata.
        dataDirectory = os.path.dirname(bedInputFilePath)
        generateMetadata(os.path.basename(dataDirectory),
                         getIsolatedParentDir(genomeFilePath),
                         getIsolatedParentDir(nucPosFilePath),
                         os.path.basename(bedInputFilePath),
                         InputFormat.customBed,
                         os.path.dirname(bedInputFilePath))

        intermediateFilesDir = os.path.join(dataDirectory,
                                            "intermediate_files")
        checkDirs(intermediateFilesDir)

        # Get the list of acceptable chromosomes
        acceptableChromosomes = getAcceptableChromosomes(genomeFilePath)

        # Generate the output file.
        outputBedFilePath = generateFilePath(
            directory=intermediateFilesDir,
            dataGroup=getIsolatedParentDir(bedInputFilePath),
            dataType=DataTypeStr.customInput,
            fileExtension=".bed")

        # Write data to the output file.
        with open(bedInputFilePath, 'r') as bedInputFile:
            with open(outputBedFilePath, 'w') as outputBedFile:

                for line in bedInputFile:

                    strippedLine = line.strip()

                    # A blank line (e.g. a trailing newline at the end of the file) has no
                    # columns to index, so skip it rather than failing with an IndexError.
                    if not strippedLine:
                        continue

                    choppedUpLine = strippedLine.split('\t')
                    chromosome = "chr" + choppedUpLine[2]

                    # Make sure we have a valid chromosome and no '/' in column 5.
                    if (chromosome not in acceptableChromosomes
                            or '/' in choppedUpLine[5]):
                        continue

                    # Convert the line to custom bed format.
                    if choppedUpLine[5] == '-': choppedUpLine[5] = '*'
                    if choppedUpLine[6] == '-': choppedUpLine[6] = '*'
                    outputBedFile.write('\t'.join(
                        (chromosome, str(int(choppedUpLine[3]) - 1),
                         choppedUpLine[4], choppedUpLine[5],
                         choppedUpLine[6], '.', choppedUpLine[0])) + '\n')

        # Add the output file to the list.
        outputBedFilePaths.append(outputBedFilePath)

    # Pass the data to the custom bed parser.
    print("\nPassing data to custom bed parser.\n")
    parseCustomBed(outputBedFilePaths, genomeFilePath, nucPosFilePath, False,
                   False, False)
Example #2
0
    def setUpForMSStratification(self) -> MSIIdentifier:
        """Prepare this object to stratify mutations by microsatellite status.

        Creates the "microsatellite_analysis/MSS" and
        "microsatellite_analysis/MSI" directories under the root metadata
        directory, generates metadata in each, zeroes the aggregate mutation
        counters, and opens one aggregate mutations bed file per group for
        writing.  The opened files are stored on the instance and are not
        closed here.

        Returns:
            The MSIIdentifier built from the MSISeq input data path (inside
            the root data directory's "intermediate_files") and the MSI
            cohorts path.  It is also stored as self.myMSIIdentifier.
        """

        self.stratifyByMS = True

        # A hashtable of individual cohorts with MSI
        self.MSICohorts = dict()

        rootMetadata = self.rootMetadata

        # Directories for the aggregate MSS and MSI output.
        msAnalysisDir = os.path.join(rootMetadata.directory,
                                     "microsatellite_analysis")
        mssDir = os.path.join(msAnalysisDir, "MSS")
        msiDir = os.path.join(msAnalysisDir, "MSI")
        checkDirs(mssDir, msiDir)

        # Metadata for each group: MSS first, then MSI.
        for cohortLabel, cohortDir in (("MSS", mssDir), ("MSI", msiDir)):
            generateMetadata(
                cohortLabel + "_" + rootMetadata.dataGroupName,
                rootMetadata.genomeName,
                os.path.join('..', '..', rootMetadata.localParentDataPath),
                rootMetadata.inputFormat, cohortDir, cohortLabel)

        self.aggregateMSSMutCounts = 0
        self.aggregateMSIMutCounts = 0

        # Aggregate output files, left open for later writing.
        self.aggregateMSSFilePath = generateFilePath(
            directory=mssDir,
            dataGroup="MSS_" + rootMetadata.dataGroupName,
            context=self.context,
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        self.aggregateMSSFile = open(self.aggregateMSSFilePath, 'w')

        self.aggregateMSIFilePath = generateFilePath(
            directory=msiDir,
            dataGroup="MSI_" + rootMetadata.dataGroupName,
            context=self.context,
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        self.aggregateMSIFile = open(self.aggregateMSIFilePath, 'w')

        # Paths needed by the MSIIdentifier that gets returned.
        intermediateDir = os.path.join(self.rootDataDir, "intermediate_files")
        checkDirs(intermediateDir)
        msiSeqInputPath = generateFilePath(
            directory=intermediateDir,
            dataGroup=rootMetadata.dataGroupName,
            dataType="MSISeq_data",
            fileExtension=".tsv")
        self.MSICohortsFilePath = generateFilePath(
            directory=msAnalysisDir,
            dataGroup=rootMetadata.dataGroupName,
            dataType="MSI_cohorts",
            fileExtension=".txt")

        self.myMSIIdentifier = MSIIdentifier(msiSeqInputPath,
                                             self.MSICohortsFilePath)
        return self.myMSIIdentifier
Example #3
0
    def setUpNewIndividualCohort(self, cohortID):
        """Close out the previous cohort file (if any) and open one for cohortID.

        The previous cohort file is closed, sorted in place with the external
        "sort" command, and its mutation count is recorded in its metadata.
        A directory, mutations bed file and metadata are then created for the
        new cohort, and the per-cohort mutation counter is reset to zero.

        Args:
            cohortID: Identifier of the cohort about to be written.

        Raises:
            UserInputError: If cohortID is already in
                self.completedIndividualCohorts.
            subprocess.CalledProcessError: If the sort command fails.
        """

        # Finish the previously opened cohort file, if there was one.
        previousCohortFile = self.currentIndividualCohortFile
        if previousCohortFile is not None:
            previousCohortFile.close()
            sortCommand = ("sort", "-k1,1", "-k2,2n",
                           self.individualCohortFilePath, "-s", "-o",
                           self.individualCohortFilePath)
            subprocess.run(sortCommand, check=True)
            Metadata(self.individualCohortFilePath).addMetadata(
                Metadata.AddableKeys.mutCounts,
                self.currentIndividualCohortMutCounts)

        # A cohort that was already completed must not show up again.
        if cohortID in self.completedIndividualCohorts:
            raise UserInputError(
                "The cohort " + cohortID +
                " was encountered in more than one distinct block of data.")
        self.currentIndividualCohortID = cohortID

        cohortDirectory = os.path.join(self.rootIndividualCohortsDirectory,
                                       cohortID)
        cohortDataGroup = cohortID + "_" + self.rootMetadata.dataGroupName

        checkDirs(cohortDirectory)

        # Work out which "umbrella" cohorts this cohort also belongs to.
        cohortMembership = [cohortID]
        if self.stratifyByMS:
            cohortMembership.append(
                "MSI" if cohortID in self.MSICohorts else "MSS")
        if self.stratifyByMutSig and cohortID in self.mutSigDesignations:
            cohortMembership.extend(
                "mut_sig_" + mutSig
                for mutSig in self.mutSigDesignations[cohortID])

        # Create the output path, open it for writing, and generate its metadata.
        self.individualCohortFilePath = generateFilePath(
            directory=cohortDirectory,
            dataGroup=cohortDataGroup,
            context=self.context,
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        self.currentIndividualCohortFile = open(self.individualCohortFilePath,
                                                'w')
        generateMetadata(
            cohortDataGroup, self.rootMetadata.genomeName,
            os.path.join("..", self.rootMetadata.localParentDataPath),
            self.rootMetadata.inputFormat, cohortDirectory,
            *cohortMembership)
        self.currentIndividualCohortMutCounts = 0
def getQuartileNucleosomePositions(quartileFilePaths: List[str],
                                   nucPosDir: str, stratificationType,
                                   sloppyCopy):
    """Write a nucleosome positions bed file for each quartile file.

    Each output is named "<nucPosDir name>_<stratificationType>_<quartile>"
    and is placed in a sibling directory of nucPosDir with that same name.
    After writing, the output is sorted in place with the external "sort"
    command.

    Args:
        quartileFilePaths: Quartile files whose base names contain either
            "lower_quartile" or "upper_quartile".  The first line of each is
            treated as a header and discarded; the first whitespace-separated
            field of every other line is the location ID.
        nucPosDir: Directory of the parent nucleosome map; it must contain
            "<directory name>.bed" when sloppyCopy is false.
        stratificationType: Text inserted into the output data name.
        sloppyCopy: If true, bed lines are built directly from the location
            ID through parseFastaDescription.  Otherwise the matching line of
            the parent nucleosome bed file is copied verbatim.

    Raises:
        ValueError: If a quartile file name contains neither designation.
        KeyError: If sloppyCopy is false and a location ID has no matching
            line in the parent nucleosome file.
    """

    parentNucPosName = os.path.basename(nucPosDir)

    # Unless a sloppy copy was requested, index every line of the parent nucleosome file by its location ID.
    # The whole file is held in memory, which is expected to be acceptable for typical nucleosome maps.
    if not sloppyCopy:
        nucPosLines = dict()
        parentBedFilePath = os.path.join(nucPosDir, parentNucPosName + ".bed")
        with open(parentBedFilePath) as nucPosFile:
            for nucPosLine in nucPosFile:
                chromosome, startPos, endPos = nucPosLine.split()[:3]
                locationID = f"{chromosome}:{float(startPos)}-{float(endPos) - 1}(+)"
                nucPosLines[locationID] = nucPosLine

    for quartileFilePath in quartileFilePaths:

        quartileFileName = os.path.basename(quartileFilePath)
        print("Working in", quartileFileName)

        # Determine which quartile this file represents from its name.
        for candidate in ("lower_quartile", "upper_quartile"):
            if candidate in quartileFileName:
                quartile = candidate
                break
        else:
            raise ValueError("Quartile Designation not found in file name: " +
                             quartileFileName)

        # Name the new nucleosome data after the parent data, the stratification type and the quartile.
        nucleosomeDataName = '_'.join(
            (parentNucPosName, stratificationType, quartile))
        outputDir = os.path.join(os.path.dirname(nucPosDir),
                                 nucleosomeDataName)
        outputNucPosFilePath = os.path.join(outputDir,
                                            nucleosomeDataName + ".bed")
        checkDirs(os.path.dirname(outputNucPosFilePath))

        with open(quartileFilePath, 'r') as quartileFile:
            quartileFile.readline()  # Discard the header line.
            with open(outputNucPosFilePath, 'w') as outputNucPosFile:

                for quartileLine in quartileFile:
                    locationID = quartileLine.split()[0]

                    if not sloppyCopy:
                        outputNucPosFile.write(nucPosLines[locationID])
                        continue

                    # Sloppy copy: rebuild a bed line from the location ID itself.
                    chromosome, startPos, endPos, strand = parseFastaDescription(
                        locationID)
                    bedFields = (chromosome, startPos,
                                 str(float(endPos) + 1), '.', '.', strand)
                    outputNucPosFile.write('\t'.join(bedFields) + '\n')

        # Sort the output in place.
        sortCommand = ("sort", "-k1,1", "-k2,3n", outputNucPosFilePath, "-o",
                       outputNucPosFilePath)
        subprocess.run(sortCommand, check=True)
Example #5
0
def parseStandardBed(standardBedFilePaths: List[str], genomeFilePath):
    """Convert standard bed files to custom bed input and pass them on.

    For every input file, an "intermediate_files" directory is ensured next
    to it, metadata is generated in the file's directory, and each row on an
    acceptable chromosome is written to a custom bed file with column 3 set
    to '.' and column 4 set to "OTHER" (0-based), truncated to the first six
    columns.  All generated files are then passed to parseCustomBed.

    Args:
        standardBedFilePaths: Paths of the bed files to convert; each base
            name must end in ".bed".
        genomeFilePath: Path of the genome file; used for the acceptable
            chromosomes and metadata, and forwarded to parseCustomBed.

    Raises:
        InvalidPathError: If an input file name does not end in ".bed".
    """

    # Paths that will be handed to the custom bed parser.
    customBedOutputFilePaths = []

    for standardBedFilePath in standardBedFilePaths:

        standardBedFileName = os.path.basename(standardBedFilePath)

        print("\nWorking in:", standardBedFileName)
        if not standardBedFileName.endswith(".bed"):
            raise InvalidPathError(
                standardBedFilePath,
                "Given file does not appear to be in bed format. (missing \".bed\" extension)"
            )

        # Useful paths and names.
        localRootDirectory = os.path.dirname(standardBedFilePath)
        intermediateFilesDir = os.path.join(localRootDirectory,
                                            "intermediate_files")
        checkDirs(intermediateFilesDir)
        dataGroupName = getIsolatedParentDir(standardBedFilePath)

        # Output file path and metadata.
        customBedOutputFilePath = generateFilePath(
            directory=intermediateFilesDir,
            dataGroup=dataGroupName,
            dataType=DataTypeStr.customInput,
            fileExtension=".bed")
        customBedOutputFilePaths.append(customBedOutputFilePath)
        generateMetadata(dataGroupName, getIsolatedParentDir(genomeFilePath),
                         standardBedFileName, InputFormat.standardBed,
                         localRootDirectory)

        # Chromosomes that entries are allowed to be on.
        acceptableChromosomes = getAcceptableChromosomes(genomeFilePath)

        # Rewrite each acceptable entry for custom-bed input.
        print("Converting entries for custom bed input...")
        with open(standardBedFilePath, 'r') as standardBedFile, \
                open(customBedOutputFilePath, 'w') as customBedOutputFile:

            for bedLine in standardBedFile:

                bedFields = bedLine.strip().split("\t")

                # Entries on chromosomes that are not acceptable are skipped.
                if bedFields[0] not in acceptableChromosomes:
                    continue

                bedFields[3] = '.'
                bedFields[4] = "OTHER"

                customBedOutputFile.write('\t'.join(bedFields[:6]) + '\n')

    # Hand the generated files to the custom bed parser.
    parseCustomBed(customBedOutputFilePaths, genomeFilePath, False, False,
                   False, False)
Example #6
0
    def setUpForIndividualCohorts(self):
        """Enable per-cohort output and initialise the state it relies on.

        Resets the current cohort ID and file to None, starts an empty record
        of completed cohorts, and ensures the "individual_cohorts" directory
        exists under the root metadata directory.
        """

        self.stratifyByIndividualCohorts = True

        # Nothing is being written yet: no current cohort and no open cohort file.
        self.currentIndividualCohortID = None
        self.currentIndividualCohortFile: IO = None

        # Cohorts that have been seen before and should NOT be revisited/rewritten.
        self.completedIndividualCohorts = {}

        # Directory that holds every individual cohort's output.
        individualCohortsDirectory = os.path.join(self.rootMetadata.directory,
                                                  "individual_cohorts")
        self.rootIndividualCohortsDirectory = individualCohortsDirectory
        checkDirs(individualCohortsDirectory)
    def __init__(self, mutationFilePath, domainRangesFilePath):
        """Open the input files and prepare the per-domain output location.

        The output folder is created next to the mutation file and is named
        after the domain ranges file (base name with its last extension
        removed).  Both input files are opened for reading and stay open on
        the instance; they are not closed here.

        Args:
            mutationFilePath: Path of the mutation file to read.
            domainRangesFilePath: Path of the domain ranges file to read.

        Raises:
            OSError: If either input file cannot be opened.  When the domain
                ranges file fails to open, the already opened mutation file
                is closed before the error propagates.
        """

        # Open the mutation and domain ranges files to compare against one another.
        self.mutationFile = open(mutationFilePath, 'r')
        try:
            self.domainRangesFile = open(domainRangesFilePath, 'r')
        except OSError:
            # Don't leak the first handle if the second file can't be opened.
            self.mutationFile.close()
            raise

        # Set up the file system for outputting files for different domains.
        self.domainOutputFiles = dict()
        self.domainOutputFolder = os.path.join(
            os.path.dirname(mutationFilePath),
            os.path.basename(domainRangesFilePath).rsplit('.', 1)[0])
        checkDirs(self.domainOutputFolder)
        self.domainOutputFilePathBasename = os.path.basename(
            mutationFilePath).rsplit('.', 1)[0]

        # Keeps track of mutations that matched to a domain to check for overlap.
        self.mutationsInPotentialOverlap: List[MutationData] = list()

        # The mutation and domain currently being investigated.
        self.currentMutation: MutationData = None
        self.currentDomain: DomainData = None
def parseICGC(ICGCFilePaths: List[str], genomeFilePath, separateDonors,
              stratifyByMS, stratifyByMutSig):
    """Convert gzipped ICGC mutation files to custom bed format and parse them.

    For each input file, an "intermediate_files" directory is ensured next
    to it, metadata is generated in the file's directory, and every mutation
    yielded by ICGCIterator is written as one tab-separated custom bed line
    (chromosome, start, end, mutated-from, mutated-to, strand, donor ID).
    The written files are then passed to parseCustomBed.

    Args:
        ICGCFilePaths: Paths of the ICGC files.  Each must end in ".gz" and
            have "simple_somatic_mutation" in its base name.
        genomeFilePath: Path of the genome file; given to ICGCIterator and
            parseCustomBed and used for metadata.
        separateDonors: Forwarded to parseCustomBed (fifth positional
            argument).
        stratifyByMS: Forwarded to parseCustomBed.
        stratifyByMutSig: Forwarded to parseCustomBed.

    Raises:
        UserInputError: If ICGCFilePaths is empty.
        InvalidPathError: If a file fails one of the naming checks.
    """

    if len(ICGCFilePaths) == 0:
        raise UserInputError("No ICGC files were found to parse.")

    outputBedFilePaths = []

    for ICGCFilePath in ICGCFilePaths:

        ICGCFileName = os.path.basename(ICGCFilePath)
        print("\nWorking in:", ICGCFileName)

        # Validate the file name before doing any work.
        if not ICGCFilePath.endswith(".gz"):
            raise InvalidPathError(
                ICGCFilePath,
                "Given ICGC file is not gzipped (.gz file format):")
        if "simple_somatic_mutation" not in ICGCFileName:
            raise InvalidPathError(
                ICGCFilePath,
                "Given ICGC file path does not have \"simple_somatic_mutation\" in the name:",
                "Note: if a directory was specified to search for ICGC input files, "
                "all files ending in .tsv.gz are selected.")

        # File system locations used below.
        dataDirectory = os.path.dirname(ICGCFilePath)
        intermediateFilesDir = os.path.join(dataDirectory,
                                            "intermediate_files")
        checkDirs(intermediateFilesDir)

        generateMetadata(getIsolatedParentDir(ICGCFilePath),
                         getIsolatedParentDir(genomeFilePath), ICGCFileName,
                         InputFormat.ICGC, dataDirectory)

        # Where the converted data is written.
        outputBedFilePath = generateFilePath(
            directory=intermediateFilesDir,
            dataGroup=getIsolatedParentDir(ICGCFilePath),
            dataType=DataTypeStr.customInput,
            fileExtension=".bed")

        # Copy the relevant fields of every mutation into the output file.
        print("Writing data to custom bed format.")
        with gzip.open(ICGCFilePath, 'r') as ICGCFile, \
                open(outputBedFilePath, 'w') as outputBedFile:

            for mutation in ICGCIterator(ICGCFile, genomeFilePath):

                # Insertions and deletions mark the missing side with '-'; rewrite it as '*'.
                if mutation.mutatedFrom == '-':
                    mutation.mutatedFrom = '*'
                    # NOTE: We are making the assumption that the given base pos (1-based) is after the insertion, not before.
                    mutation.startPos = str(int(mutation.startPos) - 1)
                elif mutation.mutatedTo == '-':
                    mutation.mutatedTo = '*'

                bedFields = (mutation.chromosome, mutation.startPos,
                             mutation.endPos, mutation.mutatedFrom,
                             mutation.mutatedTo, mutation.strand,
                             mutation.donorID)
                outputBedFile.write('\t'.join(bedFields) + '\n')

        outputBedFilePaths.append(outputBedFilePath)

    # Pass the converted files on to the custom bed parser for further parsing.
    print("\nPassing data to custom bed parser...")
    parseCustomBed(outputBedFilePaths, genomeFilePath, stratifyByMS,
                   stratifyByMutSig, separateDonors, True)
Example #9
0
    def setUpForMutSigStratification(self) -> MutSigIdentifier:
        """Prepare this object to stratify mutations by mutation signature.

        For each signature name ("1A", "1B", "2" through "21", "R1", "R2",
        "R3", "U1", "U2"), a directory is ensured under "mut_sig_analysis" in
        the root metadata directory, metadata is generated there, a mutation
        counter is zeroed, and a mutations bed file is opened for writing.
        The opened files are kept in self.mutSigFiles and are not closed
        here.

        Returns:
            The MutSigIdentifier built from the deconstructSigs input data
            path (inside the root data directory's "intermediate_files") and
            the signature assignments path.  It is also stored as
            self.mutSigIdentifier.
        """

        self.stratifyByMutSig = True

        # A dictionary of the mutation signatures assigned to each cohort.
        self.mutSigDesignations = dict()

        signatureNames = ["1A", "1B"]
        signatureNames += [str(number) for number in range(2, 22)]
        signatureNames += ["R1", "R2", "R3", "U1", "U2"]

        rootMetadata = self.rootMetadata
        parentMutSigDirectory = os.path.join(rootMetadata.directory,
                                             "mut_sig_analysis")

        # Per-signature bookkeeping, all keyed by signature name.
        self.mutSigFilePaths = dict()
        self.mutSigFiles = dict()
        self.mutSigMutCounts = dict()

        for signature in signatureNames:

            signatureDataGroup = "mut_sig_" + signature + '_' + rootMetadata.dataGroupName

            # Directory
            signatureDirectory = os.path.join(parentMutSigDirectory,
                                              signature)
            checkDirs(signatureDirectory)

            # Metadata
            generateMetadata(
                signatureDataGroup, rootMetadata.genomeName,
                os.path.join('..', '..', rootMetadata.localParentDataPath),
                rootMetadata.inputFormat, signatureDirectory,
                "mutSig" + signature)

            # Mutation counter
            self.mutSigMutCounts[signature] = 0

            # Output file, left open for later writing.
            signatureFilePath = generateFilePath(
                directory=signatureDirectory,
                dataGroup=signatureDataGroup,
                context=self.context,
                dataType=DataTypeStr.mutations,
                fileExtension=".bed")
            self.mutSigFilePaths[signature] = signatureFilePath
            self.mutSigFiles[signature] = open(signatureFilePath, 'w')

        # Paths needed by the MutSigIdentifier that gets returned.
        intermediateDir = os.path.join(self.rootDataDir, "intermediate_files")
        checkDirs(intermediateDir)

        deconstructSigsInputPath = generateFilePath(
            directory=intermediateDir,
            dataGroup=rootMetadata.dataGroupName,
            dataType="deconstructSigs_data",
            fileExtension=".tsv")
        self.mutSigDesignationsFilePath = generateFilePath(
            directory=parentMutSigDirectory,
            dataGroup=rootMetadata.dataGroupName,
            dataType="mut_sig_assignments",
            fileExtension=".tsv")

        self.mutSigIdentifier = MutSigIdentifier(
            deconstructSigsInputPath, self.mutSigDesignationsFilePath)
        return self.mutSigIdentifier
def generateNucleosomeFasta(baseNucPosFilePath, genomeFilePath, dyadRadius,
                            linkerOffset):
    """Return the path of the nucleosome fasta file, generating it if needed.

    The fasta file lives in an "intermediate_files" directory next to the
    nucleosome positions file.  If it already exists it is reused.  Otherwise
    an "expanded" bed file is written (each start reduced and each end
    increased by dyadRadius + linkerOffset + 2) and converted with
    bedToFasta.  Rows whose expanded start would be negative are reported
    and left out.  The expanded bed file is not removed by this function.

    Args:
        baseNucPosFilePath: Path of the tab-separated nucleosome positions
            bed file.
        genomeFilePath: Path of the genome file passed to bedToFasta.
        dyadRadius: Either 73 or 1000; selects how the fasta file is named.
        linkerOffset: Extra bases added on each side; also used in the file
            name when dyadRadius is 73.

    Returns:
        The path of the nucleosome fasta file.

    Raises:
        ValueError: If dyadRadius is neither 73 nor 1000.
    """

    # Make sure the nucleosome map has an intermediate files directory.
    intermediateFilesDir = os.path.join(os.path.dirname(baseNucPosFilePath),
                                        "intermediate_files")
    checkDirs(intermediateFilesDir)

    nucPosDataGroup = os.path.basename(baseNucPosFilePath).rsplit('.', 1)[0]

    # The fasta file's name depends on which radius is in use.
    if dyadRadius == 73:
        radiusNamingArgs = {"linkerOffset": linkerOffset}
    elif dyadRadius == 1000:
        radiusNamingArgs = {"usesNucGroup": True}
    else:
        raise ValueError("Invalid dyad radius: " + str(dyadRadius) +
                         ".  Expected 73 or 1000.")
    nucPosFastaFilePath = generateFilePath(directory=intermediateFilesDir,
                                           dataGroup=nucPosDataGroup,
                                           fileExtension=".fa",
                                           **radiusNamingArgs)

    # If the fasta file already exists, there is nothing left to do.
    if os.path.exists(nucPosFastaFilePath):
        print("Found relevant nucleosome fasta file:",
              os.path.basename(nucPosFastaFilePath))
        return nucPosFastaFilePath

    print("Nucleosome fasta file not found at: ",
          nucPosFastaFilePath,
          "\nGenerating...",
          sep='')

    # Path of the (temporary) expanded bed file.
    expandedNucPosBedFilePath = generateFilePath(
        directory=intermediateFilesDir,
        dataGroup=nucPosDataGroup,
        dataType="expanded",
        fileExtension=".bed")

    # Expand the bed coordinates, one line at a time.
    print("Expanding nucleosome coordinates...")
    with open(baseNucPosFilePath, 'r') as baseNucPosFile, \
            open(expandedNucPosBedFilePath, 'w') as expandedNucPosBedFile:

        for nucPosLine in baseNucPosFile:
            bedFields = nucPosLine.strip().split('\t')
            bedFields[1] = str(
                int(bedFields[1]) - dyadRadius - linkerOffset - 2)
            bedFields[2] = str(
                int(bedFields[2]) + dyadRadius + linkerOffset + 2)

            # Positions before the start of the chromosome can't be used; report and skip them.
            if int(bedFields[1]) < 0:
                print("Nucleosome at chromosome", bedFields[0],
                      "with expanded start pos", bedFields[1],
                      "extends into invalid positions.  Skipping.")
                continue

            expandedNucPosBedFile.write('\t'.join(bedFields) + '\n')

    # Convert the expanded bed file to fasta format.
    print("Converting expanded coordinates to fasta file...")
    bedToFasta(expandedNucPosBedFilePath,
               genomeFilePath,
               nucPosFastaFilePath,
               includeStrand=False)

    return nucPosFastaFilePath
Example #11
0
def countNucleosomePositionMutations(mutationFilePaths, nucleosomeMapNames,
                                     countSingleNuc, countNucGroup,
                                     linkerOffset):
    """Count mutations at nucleosome positions for each file / map pairing.

    For every mutation file and nucleosome map name, a map-specific directory
    is ensured next to the mutation file (with metadata generated if no
    ".metadata" entry is present in it), and counts files are produced for
    the requested radii.

    Special case: when exactly one file and one map name are given and the
    file's base name (minus its last extension) equals the map name, the map
    is counted against itself in a 1000 bp radius and only that counts file
    is returned.

    Args:
        mutationFilePaths: Paths of the mutation files; each base name must
            contain DataTypeStr.mutations (except in the special case).
        nucleosomeMapNames: Names of the nucleosome maps to count against.
        countSingleNuc: If true, count in a 73 bp radius plus linkerOffset.
        countNucGroup: If true, count in a 1000 bp radius.
        linkerOffset: Extra radius added to the single nucleosome count; also
            used in that counts file's name.

    Returns:
        A list of the counts file paths that were generated.

    Raises:
        UserInputError: If neither countSingleNuc nor countNucGroup is set
            (outside the special case).
        InvalidPathError: If a mutation file name lacks
            DataTypeStr.mutations.
    """

    # Special case: a nucleosome map counted against itself, used to determine the nucleosome repeat length.
    isSelfCount = (len(mutationFilePaths) == 1
                   and len(nucleosomeMapNames) == 1
                   and os.path.basename(mutationFilePaths[0]).rsplit(
                       '.', 1)[0] == nucleosomeMapNames[0])
    if isSelfCount:

        selfMapFilePath = mutationFilePaths[0]
        selfMapName = nucleosomeMapNames[0]

        print("Counting nucleosome map", selfMapName,
              "against itself in a 1000 bp radius.")

        selfCountsFilePath = generateFilePath(
            directory=os.path.dirname(selfMapFilePath),
            dataGroup=selfMapName,
            usesNucGroup=True,
            fileExtension=".tsv",
            dataType="self_" + DataTypeStr.rawNucCounts)
        selfAcceptableChromosomes = getAcceptableChromosomes(
            os.path.dirname(os.path.dirname(selfMapFilePath)))

        NucleosomesInNucleosomesCounter(
            selfMapFilePath,
            selfMapFilePath,
            selfCountsFilePath,
            encompassingFeatureExtraRadius=1000,
            acceptableChromosomes=selfAcceptableChromosomes).count()

        return [selfCountsFilePath]

    if not (countSingleNuc or countNucGroup):
        raise UserInputError(
            "Must count in either a single nucleosome or group nucleosome radius."
        )

    # Paths of the output files generated by this function.
    countsFilePaths = []

    # Becomes True after the first mutation file so the nucleosome maps are not checked for sorting again.
    nucleosomeMapSortingChecked = False

    def runMutationCounter(mutationFilePath, metadata, acceptableChromosomes,
                           countsFilePath, radius, checkNucleosomeMapSorting):
        """Count one mutation file against one nucleosome map and record the output path."""
        MutationsInNucleosomesCounter(
            mutationFilePath,
            metadata.baseNucPosFilePath,
            countsFilePath,
            encompassingFeatureExtraRadius=radius,
            acceptableChromosomes=acceptableChromosomes,
            checkForSortedFiles=(True, checkNucleosomeMapSorting)).count()
        countsFilePaths.append(countsFilePath)

    for mutationFilePath in mutationFilePaths:

        mutationFileName = os.path.basename(mutationFilePath)
        print("\nWorking with", mutationFileName)

        # Make sure we have the expected file type.
        if DataTypeStr.mutations not in mutationFileName:
            raise InvalidPathError(
                mutationFilePath,
                "Given mutation file does not have \"" +
                DataTypeStr.mutations + "\" in the name.",
                postPathMessage=
                "Are you sure you inputted a file from the mutperiod pipeline?"
            )

        for nucleosomeMapName in nucleosomeMapNames:

            print("Counting with nucleosome map:", nucleosomeMapName)

            # The directory specific to this nucleosome map.
            nucleosomeMapDataDirectory = os.path.join(
                os.path.dirname(mutationFilePath), nucleosomeMapName)
            checkDirs(nucleosomeMapDataDirectory)

            # Generate metadata for the directory if it has not been set up before.
            metadataPath = os.path.join(nucleosomeMapDataDirectory,
                                        ".metadata")
            if not os.path.exists(metadataPath):

                print("No metadata found.  Generating...")

                parentMetadata = Metadata(mutationFilePath)

                # The nucleosome map may supply a suffix for the data group name.
                dataGroupName = parentMetadata.dataGroupName
                suffixFilePath = os.path.join(
                    os.path.dirname(parentMetadata.genomeFilePath),
                    nucleosomeMapName, "append_to_data_name.txt")
                if os.path.exists(suffixFilePath):
                    with open(suffixFilePath) as suffixFile:
                        dataGroupName += suffixFile.readline().strip()

                generateMetadata(
                    dataGroupName,
                    parentMetadata.genomeName,
                    os.path.join("..", parentMetadata.localParentDataPath),
                    parentMetadata.inputFormat,
                    nucleosomeMapDataDirectory,
                    *parentMetadata.cohorts,
                    callParamsFilePath=parentMetadata.callParamsFilePath,
                    associatedNucleosomePositions=nucleosomeMapName)

            # The metadata supplies the nucleosome positions file and output naming.
            metadata = Metadata(nucleosomeMapDataDirectory)

            # Chromosomes that may be counted.
            acceptableChromosomes = getAcceptableChromosomes(
                metadata.genomeFilePath)

            # Single nucleosome region, if requested.
            if countSingleNuc:

                singleNucCountsFilePath = generateFilePath(
                    directory=metadata.directory,
                    dataGroup=metadata.dataGroupName,
                    linkerOffset=linkerOffset,
                    fileExtension=".tsv",
                    dataType=DataTypeStr.rawNucCounts)

                print(
                    "Counting mutations at each nucleosome position in a 73 bp radius +",
                    str(linkerOffset), "bp linker DNA.")
                runMutationCounter(mutationFilePath, metadata,
                                   acceptableChromosomes,
                                   singleNucCountsFilePath,
                                   73 + linkerOffset,
                                   not nucleosomeMapSortingChecked)

            # Nucleosome group region, if requested.
            if countNucGroup:

                nucGroupCountsFilePath = generateFilePath(
                    directory=metadata.directory,
                    dataGroup=metadata.dataGroupName,
                    usesNucGroup=True,
                    fileExtension=".tsv",
                    dataType=DataTypeStr.rawNucCounts)

                print(
                    "Counting mutations at each nucleosome position in a 1000 bp radius."
                )
                runMutationCounter(mutationFilePath, metadata,
                                   acceptableChromosomes,
                                   nucGroupCountsFilePath, 1000,
                                   not nucleosomeMapSortingChecked)

        nucleosomeMapSortingChecked = True

    return countsFilePaths
Example #12
0
def parseCustomBed(bedInputFilePaths,
                   genomeFilePath,
                   stratifyByMS,
                   stratifyByMutSig,
                   separateIndividualCohorts,
                   onlySingleBaseSubs=False,
                   includeIndels=False):
    """
    Run every given custom bed file through the standard-input conversion.

    For each path in bedInputFilePaths this function:
      1. Works out the data directory (the grandparent directory when the
         file sits in "intermediate_files", otherwise the parent directory,
         in which case metadata is generated as well).
      2. Calls autoAcquireAndQACheck to obtain the context used for naming
         and writing output.
      3. Copies the input into "intermediate_files" and reads from that copy
         when the input path equals the generated output path.
      4. Looks at the first line: 7 tab-separated fields are treated as
         "cohort designation present".
      5. Sorts the input file in place with the external `sort` command.
      6. Sets up the requested stratifications on the WriteManager and hands
         everything to convertToStandardInput.

    Parameters:
        bedInputFilePaths: Paths of the bed files to parse; must be non-empty.
        genomeFilePath: Path to the genome file, forwarded to metadata
            generation and to autoAcquireAndQACheck.
        stratifyByMS: When truthy, setUpForMSStratification is called.
        stratifyByMutSig: When truthy, setUpForMutSigStratification is called.
        separateIndividualCohorts: When truthy (and a cohort column exists),
            the WriteManager is set up for individual cohorts.
        onlySingleBaseSubs, includeIndels: Forwarded to autoAcquireAndQACheck
            and convertToStandardInput; they may not both be truthy.

    Raises:
        UserInputError: If onlySingleBaseSubs and includeIndels are both set,
            if no input paths are given, or if stratification / cohort
            separation is requested for a file without a cohort column.
        subprocess.CalledProcessError: If the external sort fails
            (the sort is run with check=True).
    """

    # Reject contradictory or empty requests before touching any file.
    if onlySingleBaseSubs and includeIndels:
        raise UserInputError(
            "Indels are incompatible with single nucleotide substitutions.")
    if len(bedInputFilePaths) == 0:
        raise UserInputError("No bed files were found to parse.")

    for bedInputFilePath in bedInputFilePaths:

        print("\nWorking in:", os.path.basename(bedInputFilePath))

        # Files living in "intermediate_files" belong to the data group one level up,
        # and their metadata is expected to have been generated already.
        # Anything else sits in the data group's root, so metadata is generated here.
        parentDirectory = os.path.dirname(bedInputFilePath)
        if getIsolatedParentDir(bedInputFilePath) != "intermediate_files":
            dataDirectory = parentDirectory
            generateMetadata(os.path.basename(dataDirectory),
                             getIsolatedParentDir(genomeFilePath),
                             os.path.basename(bedInputFilePath),
                             InputFormat.customBed,
                             parentDirectory)
        else:
            dataDirectory = os.path.dirname(parentDirectory)

        intermediateFilesDir = os.path.join(dataDirectory,
                                            "intermediate_files")
        checkDirs(intermediateFilesDir)
        autoAcquiredFilePath = os.path.join(intermediateFilesDir,
                                            "auto_acquire.fa")

        context = autoAcquireAndQACheck(bedInputFilePath, genomeFilePath,
                                        autoAcquiredFilePath,
                                        onlySingleBaseSubs, includeIndels)

        # The output must never be written over the file being read.  When the two paths
        # coincide, read from a copy placed in the intermediate_files directory instead.
        plannedOutputFilePath = generateFilePath(
            directory=dataDirectory,
            dataGroup=os.path.basename(dataDirectory),
            context=context,
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        if bedInputFilePath == plannedOutputFilePath:
            readableCopyPath = os.path.join(
                intermediateFilesDir, os.path.basename(bedInputFilePath))
            print(
                "Input file path is identical to generated output file path and will be overwritten. ",
                "Creating a copy of the input file at:", readableCopyPath,
                "to use for reading.")
            shutil.copy2(bedInputFilePath, readableCopyPath)
            bedInputFilePath = readableCopyPath

        # The WriteManager takes care of all writing from here on.
        with WriteManager(dataDirectory, context) as writeManager:

            # Peek at the first line: a seventh tab-separated column means
            # cohort designations are present.
            with open(bedInputFilePath, 'r') as bedInputFile:
                firstLine = bedInputFile.readline()
            hasCohortColumn = len(firstLine.strip().split('\t')) == 7

            if not hasCohortColumn:
                # Every cohort-dependent option is invalid without cohort designations.
                if stratifyByMS or stratifyByMutSig:
                    raise UserInputError(
                        "Additional stratification given, but no cohort designation given."
                    )
                if separateIndividualCohorts:
                    raise UserInputError(
                        "Separation by individual cohorts requested, but no cohort designation given."
                    )
                cohortSortKey = tuple()
            else:
                # The cohort column becomes the leading sort key.
                cohortSortKey = ("-k7,7", )
                if separateIndividualCohorts:
                    writeManager.setUpForIndividualCohorts()

            # Sort the input in place (which should also leave the output data sorted).
            sortCommand = ("sort", *cohortSortKey, "-k1,1", "-k2,2n",
                           "-k3,3n", bedInputFilePath, "-s", "-o",
                           bedInputFilePath)
            subprocess.run(sortCommand, check=True)

            # Optional stratification by microsatellite stability.
            if stratifyByMS:
                setUpForMSStratification(writeManager, bedInputFilePath)

            # Optional stratification by mutation signature.
            if stratifyByMutSig:
                setUpForMutSigStratification(writeManager, bedInputFilePath)

            # Everything is prepared; perform the actual conversion.
            convertToStandardInput(bedInputFilePath, writeManager,
                                   onlySingleBaseSubs, includeIndels)