def parseAlexandrov(bedInputFilePaths, genomeFilePath, nucPosFilePath):
    """Convert the given input files to custom bed format and parse them.

    For every input file, metadata is generated in the file's directory and a
    converted copy is written into an "intermediate_files" subdirectory.  Once
    all files are converted, they are handed to parseCustomBed together.

    Args:
        bedInputFilePaths: Paths to the tab-separated input files.  Every line
            that passes the chromosome check must have at least 7 columns.
        genomeFilePath: Path to the genome file.  Used to look up the
            acceptable chromosomes and forwarded to parseCustomBed.
        nucPosFilePath: Path to the nucleosome positions file.  Forwarded to
            parseCustomBed.
    """

    convertedFilePaths = []

    for bedInputFilePath in bedInputFilePaths:

        inputFileName = os.path.basename(bedInputFilePath)
        print("\nWorking in:", inputFileName)

        # Generate metadata in the directory that holds the input file.
        dataDirectory = os.path.dirname(bedInputFilePath)
        generateMetadata(os.path.basename(dataDirectory),
                         getIsolatedParentDir(genomeFilePath),
                         getIsolatedParentDir(nucPosFilePath), inputFileName,
                         InputFormat.customBed, dataDirectory)

        # Converted output lives in the intermediate files directory.
        intermediateFilesDir = os.path.join(dataDirectory,
                                            "intermediate_files")
        checkDirs(intermediateFilesDir)

        # Only entries on these chromosomes are kept.
        acceptableChromosomes = getAcceptableChromosomes(genomeFilePath)

        outputBedFilePath = generateFilePath(
            directory=intermediateFilesDir,
            dataGroup=getIsolatedParentDir(bedInputFilePath),
            dataType=DataTypeStr.customInput,
            fileExtension=".bed")

        with open(bedInputFilePath, 'r') as bedInputFile, \
                open(outputBedFilePath, 'w') as outputBedFile:

            for line in bedInputFile:

                fields = line.strip().split('\t')
                chromosome = "chr" + fields[2]

                # Skip entries on unknown chromosomes, as well as entries
                # whose column 5 contains a '/'.
                if chromosome not in acceptableChromosomes or '/' in fields[5]:
                    continue

                # Custom bed format uses '*' where the input uses '-'.
                mutatedFrom, mutatedTo = fields[5], fields[6]
                if mutatedFrom == '-':
                    mutatedFrom = '*'
                if mutatedTo == '-':
                    mutatedTo = '*'

                # The start position is shifted down by one
                # (presumably 1-based -> 0-based; confirm against the input spec).
                convertedFields = (chromosome, str(int(fields[3]) - 1),
                                   fields[4], mutatedFrom, mutatedTo, '.',
                                   fields[0])
                outputBedFile.write('\t'.join(convertedFields) + '\n')

        convertedFilePaths.append(outputBedFilePath)

    # Pass the converted data to the custom bed parser.
    print("\nPassing data to custom bed parser.\n")
    parseCustomBed(convertedFilePaths, genomeFilePath, nucPosFilePath, False,
                   False, False)
Ejemplo n.º 2
0
    def setUpForMSStratification(self) -> MSIIdentifier:
        """Prepare this object to stratify output by microsatellite status.

        Creates a "microsatellite_analysis" directory (with "MSS" and "MSI"
        subdirectories) under the root metadata directory, generates metadata
        for both aggregate groups, opens their output bed files for writing and
        builds the MSIIdentifier.

        Returns:
            The new MSIIdentifier, which is also stored in self.myMSIIdentifier.
        """

        self.stratifyByMS = True

        # A hashtable of individual cohorts with MSI
        self.MSICohorts = {}

        rootMetadata = self.rootMetadata

        # Directories for the aggregate MSS and MSI output.
        msDirectory = os.path.join(rootMetadata.directory,
                                   "microsatellite_analysis")
        mssDirectory = os.path.join(msDirectory, "MSS")
        msiDirectory = os.path.join(msDirectory, "MSI")
        checkDirs(mssDirectory, msiDirectory)

        # Metadata for each aggregate group.  The parent data path is given
        # relative to the new directories, which sit two levels below the root.
        mssDataGroup = "MSS_" + rootMetadata.dataGroupName
        generateMetadata(
            mssDataGroup, rootMetadata.genomeName,
            os.path.join('..', '..', rootMetadata.localParentDataPath),
            rootMetadata.inputFormat, mssDirectory, "MSS")
        msiDataGroup = "MSI_" + rootMetadata.dataGroupName
        generateMetadata(
            msiDataGroup, rootMetadata.genomeName,
            os.path.join('..', '..', rootMetadata.localParentDataPath),
            rootMetadata.inputFormat, msiDirectory, "MSI")

        # Running mutation totals for the two aggregate files.
        self.aggregateMSSMutCounts = 0
        self.aggregateMSIMutCounts = 0

        # The aggregate files are left open here; nothing in this method
        # closes them.
        self.aggregateMSSFilePath = generateFilePath(
            directory=mssDirectory,
            dataGroup=mssDataGroup,
            context=self.context,
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        self.aggregateMSSFile = open(self.aggregateMSSFilePath, 'w')

        self.aggregateMSIFilePath = generateFilePath(
            directory=msiDirectory,
            dataGroup=msiDataGroup,
            context=self.context,
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        self.aggregateMSIFile = open(self.aggregateMSIFilePath, 'w')

        # File paths required by the MSIIdentifier.
        intermediateFilesDir = os.path.join(self.rootDataDir,
                                            "intermediate_files")
        checkDirs(intermediateFilesDir)

        msiSeqInputFilePath = generateFilePath(
            directory=intermediateFilesDir,
            dataGroup=rootMetadata.dataGroupName,
            dataType="MSISeq_data",
            fileExtension=".tsv")
        self.MSICohortsFilePath = generateFilePath(
            directory=msDirectory,
            dataGroup=rootMetadata.dataGroupName,
            dataType="MSI_cohorts",
            fileExtension=".txt")

        self.myMSIIdentifier = MSIIdentifier(msiSeqInputFilePath,
                                             self.MSICohortsFilePath)
        return self.myMSIIdentifier
Ejemplo n.º 3
0
    def setUpNewIndividualCohort(self, cohortID):
        """Finish the previous individual cohort file and start one for cohortID.

        If a cohort file is currently open, it is closed, sorted in place and
        its mutation count is recorded in its metadata.  A directory, metadata
        and output bed file are then created for the new cohort.

        Args:
            cohortID: The ID of the cohort to start writing to.

        Raises:
            UserInputError: If cohortID is already in
                self.completedIndividualCohorts.
        """

        # Finalize the previously opened cohort file, if there is one.
        previousCohortFile = self.currentIndividualCohortFile
        if previousCohortFile is not None:
            previousCohortFile.close()
            sortCommand = ("sort", "-k1,1", "-k2,2n",
                           self.individualCohortFilePath, "-s", "-o",
                           self.individualCohortFilePath)
            subprocess.run(sortCommand, check=True)
            Metadata(self.individualCohortFilePath).addMetadata(
                Metadata.AddableKeys.mutCounts,
                self.currentIndividualCohortMutCounts)

        # A cohort must not show up in more than one block of data.
        if cohortID in self.completedIndividualCohorts:
            raise UserInputError(
                "The cohort " + cohortID +
                " was encountered in more than one distinct block of data.")
        self.currentIndividualCohortID = cohortID

        cohortDirectory = os.path.join(self.rootIndividualCohortsDirectory,
                                       cohortID)
        cohortDataGroup = cohortID + "_" + self.rootMetadata.dataGroupName
        checkDirs(cohortDirectory)

        # Work out which "umbrella" cohorts this cohort also belongs to.
        cohortMembership = [cohortID]
        if self.stratifyByMS:
            cohortMembership.append(
                "MSI" if cohortID in self.MSICohorts else "MSS")
        if self.stratifyByMutSig and cohortID in self.mutSigDesignations:
            cohortMembership.extend(
                "mut_sig_" + mutSig
                for mutSig in self.mutSigDesignations[cohortID])

        # Open the new cohort's output file, then generate its metadata.
        self.individualCohortFilePath = generateFilePath(
            directory=cohortDirectory,
            dataGroup=cohortDataGroup,
            context=self.context,
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        self.currentIndividualCohortFile = open(self.individualCohortFilePath,
                                                'w')
        generateMetadata(
            cohortDataGroup, self.rootMetadata.genomeName,
            os.path.join("..", self.rootMetadata.localParentDataPath),
            self.rootMetadata.inputFormat, cohortDirectory, *cohortMembership)

        # No mutations have been counted for the new cohort yet.
        self.currentIndividualCohortMutCounts = 0
def getQuartileNucleosomePositions(quartileFilePaths: List[str],
                                   nucPosDir: str, stratificationType,
                                   sloppyCopy):
    """Write a sorted nucleosome positions bed file for each quartile file.

    Each output file is written to a sibling directory of nucPosDir named
    "<nucPosDir basename>_<stratificationType>_<quartile>".

    Args:
        quartileFilePaths: Paths to the quartile files.  Each file name must
            contain "lower_quartile" or "upper_quartile".  The first line of
            each file is discarded as a header, and the first
            whitespace-separated token of every other line is used as a
            location ID.
        nucPosDir: Directory of the parent nucleosome map.  When sloppyCopy is
            false, it must contain a bed file named after the directory.
        stratificationType: String used in the name of the output data.
        sloppyCopy: If true, each output line is rebuilt from the location ID.
            Otherwise the matching line of the parent nucleosome positions
            file is copied verbatim.

    Raises:
        ValueError: If a quartile file name has no quartile designation.
    """

    # When copying exact lines, index every line of the parent nucleosome
    # positions file by its location ID.  This holds the whole file in memory,
    # which should be fine because nucleosome maps are usually not too big.
    if not sloppyCopy:
        nucPosLines = {}
        parentNucPosFilePath = os.path.join(
            nucPosDir,
            os.path.basename(nucPosDir) + ".bed")
        with open(parentNucPosFilePath) as nucPosFile:
            for nucPosLine in nucPosFile:
                chromosome, startPos, endPos = nucPosLine.split()[:3]
                locationID = (f"{chromosome}:{float(startPos)}-"
                              f"{float(endPos) - 1}(+)")
                nucPosLines[locationID] = nucPosLine

    for quartileFilePath in quartileFilePaths:

        quartileFileName = os.path.basename(quartileFilePath)
        print("Working in", quartileFileName)

        # Determine the quartile from the file name ("lower" is checked first).
        for designation in ("lower_quartile", "upper_quartile"):
            if designation in quartileFileName:
                quartile = designation
                break
        else:
            raise ValueError("Quartile Designation not found in file name: " +
                             quartileFileName)

        # Name the new nucleosome data after the parent map, the
        # stratification type and the quartile.
        nucleosomeDataName = '_'.join(
            (os.path.basename(nucPosDir), stratificationType, quartile))
        outputDirectory = os.path.join(os.path.dirname(nucPosDir),
                                       nucleosomeDataName)
        outputNucPosFilePath = os.path.join(outputDirectory,
                                            nucleosomeDataName + ".bed")
        checkDirs(outputDirectory)

        with open(quartileFilePath, 'r') as quartileFile:

            # Discard the header line.
            quartileFile.readline()

            with open(outputNucPosFilePath, 'w') as outputNucPosFile:
                for line in quartileFile:

                    locationID = line.split()[0]

                    if not sloppyCopy:
                        outputNucPosFile.write(nucPosLines[locationID])
                        continue

                    # Rebuild a bed line directly from the location ID.
                    chromosome, startPos, endPos, strand = (
                        parseFastaDescription(locationID))
                    bedFields = (chromosome, startPos, str(float(endPos) + 1),
                                 '.', '.', strand)
                    outputNucPosFile.write('\t'.join(bedFields) + '\n')

        # Sort the output file in place.
        sortCommand = ("sort", "-k1,1", "-k2,3n", outputNucPosFilePath, "-o",
                       outputNucPosFilePath)
        subprocess.run(sortCommand, check=True)
Ejemplo n.º 5
0
def parseStandardBed(standardBedFilePaths: List[str], genomeFilePath):
    """Convert standard bed files to custom bed input and parse them.

    For each input file, a converted copy is written to an
    "intermediate_files" subdirectory of the file's directory, and metadata is
    generated in the file's directory.  All converted files are then passed to
    parseCustomBed together.

    Args:
        standardBedFilePaths: Paths to the bed files to convert.
        genomeFilePath: Path to the genome file.  Used to look up the
            acceptable chromosomes and forwarded to parseCustomBed.

    Raises:
        InvalidPathError: If a given file name does not end in ".bed".
    """

    # The converted files that will be passed to the custom bed parser.
    convertedFilePaths = []

    for standardBedFilePath in standardBedFilePaths:

        bedFileName = os.path.basename(standardBedFilePath)
        print("\nWorking in:", bedFileName)
        if not bedFileName.endswith(".bed"):
            raise InvalidPathError(
                standardBedFilePath,
                'Given file does not appear to be in bed format. (missing ".bed" extension)'
            )

        # Useful paths and names.
        localRootDirectory = os.path.dirname(standardBedFilePath)
        intermediateFilesDir = os.path.join(localRootDirectory,
                                            "intermediate_files")
        checkDirs(intermediateFilesDir)
        dataGroupName = getIsolatedParentDir(standardBedFilePath)

        # The output path is recorded before the file is actually written.
        convertedFilePath = generateFilePath(
            directory=intermediateFilesDir,
            dataGroup=dataGroupName,
            dataType=DataTypeStr.customInput,
            fileExtension=".bed")
        convertedFilePaths.append(convertedFilePath)
        generateMetadata(dataGroupName, getIsolatedParentDir(genomeFilePath),
                         bedFileName, InputFormat.standardBed,
                         localRootDirectory)

        # Only entries on these chromosomes are kept.
        acceptableChromosomes = getAcceptableChromosomes(genomeFilePath)

        print("Converting entries for custom bed input...")
        with open(standardBedFilePath, 'r') as standardBedFile, \
                open(convertedFilePath, 'w') as convertedFile:

            for line in standardBedFile:

                fields = line.strip().split("\t")

                # Entries outside the acceptable chromosomes are dropped.
                if fields[0] in acceptableChromosomes:

                    # Columns 3 and 4 are overwritten with placeholder values,
                    # and only the first six columns are written out.
                    fields[3] = '.'
                    fields[4] = "OTHER"
                    convertedFile.write('\t'.join(fields[:6]) + '\n')

    # Pass the generated files to the custom bed parser.
    parseCustomBed(convertedFilePaths, genomeFilePath, False, False, False,
                   False)
Ejemplo n.º 6
0
    def setUpForIndividualCohorts(self):
        """Enable individual cohort stratification and create its root directory.

        Resets the cohort-tracking state and makes sure an
        "individual_cohorts" directory exists under the root metadata directory.
        """

        self.stratifyByIndividualCohorts = True

        # Nothing is being written to yet: there is no current cohort and no
        # open cohort file.
        self.currentIndividualCohortID = None
        self.currentIndividualCohortFile: IO = None

        # A hashtable of cohorts that have been seen before and should NOT be
        # revisited/rewritten.
        self.completedIndividualCohorts = {}

        # Create the directory that holds the individual cohorts.
        cohortsDirectory = os.path.join(self.rootMetadata.directory,
                                        "individual_cohorts")
        self.rootIndividualCohortsDirectory = cohortsDirectory
        checkDirs(cohortsDirectory)
    def __init__(self, mutationFilePath, domainRangesFilePath):
        """Open the input files and set up the per-domain output location.

        The output folder is created next to the mutation file and is named
        after the domain ranges file (without its extension).

        Args:
            mutationFilePath: Path to the mutation file, opened for reading.
            domainRangesFilePath: Path to the domain ranges file, opened for
                reading.

        Raises:
            Any exception raised while opening the domain ranges file or
            creating the output folder is re-raised, after closing the files
            that were already opened.
        """

        # Open the mutation and domain ranges files to compare against one another.
        self.mutationFile = open(mutationFilePath, 'r')
        self.domainRangesFile = None
        try:
            self.domainRangesFile = open(domainRangesFilePath, 'r')

            # Set up the file system for outputting files for different domains.
            self.domainOutputFiles = dict()
            self.domainOutputFolder = os.path.join(
                os.path.dirname(mutationFilePath),
                os.path.basename(domainRangesFilePath).rsplit('.', 1)[0])
            checkDirs(self.domainOutputFolder)
            self.domainOutputFilePathBasename = os.path.basename(
                mutationFilePath).rsplit('.', 1)[0]
        except BaseException:
            # Construction failed part-way through, so no caller will ever get
            # the chance to close these handles.  Close them here to avoid
            # leaking them, then let the original error propagate.
            self.mutationFile.close()
            if self.domainRangesFile is not None:
                self.domainRangesFile.close()
            raise

        # Keeps track of mutations that matched to a domain to check for overlap.
        self.mutationsInPotentialOverlap: List[MutationData] = list()

        # The mutation and domain currently being investigated.
        self.currentMutation: MutationData = None
        self.currentDomain: DomainData = None
Ejemplo n.º 8
0
def parseICGC(ICGCFilePaths: List[str], genomeFilePath, separateDonors,
              stratifyByMS, stratifyByMutSig):
    """Convert gzipped ICGC mutation files to custom bed format and parse them.

    For each input file, metadata is generated in the file's directory and a
    converted copy is written to an "intermediate_files" subdirectory.  All
    converted files are then passed to parseCustomBed together.

    Args:
        ICGCFilePaths: Paths to the ICGC files.  Each must end in ".gz" and
            have "simple_somatic_mutation" in its file name.
        genomeFilePath: Path to the genome file, forwarded to ICGCIterator and
            parseCustomBed.
        separateDonors: Forwarded to parseCustomBed.
        stratifyByMS: Forwarded to parseCustomBed.
        stratifyByMutSig: Forwarded to parseCustomBed.

    Raises:
        UserInputError: If no ICGC file paths are given.
        InvalidPathError: If a file path fails one of the naming checks.
    """

    convertedFilePaths = []

    if len(ICGCFilePaths) == 0:
        raise UserInputError("No ICGC files were found to parse.")

    for ICGCFilePath in ICGCFilePaths:

        ICGCFileName = os.path.basename(ICGCFilePath)
        print("\nWorking in:", ICGCFileName)

        # Validate the file name before touching the file system.
        if not ICGCFilePath.endswith(".gz"):
            raise InvalidPathError(
                ICGCFilePath,
                "Given ICGC file is not gzipped (.gz file format):")
        if "simple_somatic_mutation" not in ICGCFileName:
            raise InvalidPathError(
                ICGCFilePath,
                'Given ICGC file path does not have "simple_somatic_mutation" in the name:',
                "Note: if a directory was specified to search for ICGC input files, "
                "all files ending in .tsv.gz are selected.")

        # Important file system paths, followed by the metadata.
        dataDirectory = os.path.dirname(ICGCFilePath)
        intermediateFilesDir = os.path.join(dataDirectory,
                                            "intermediate_files")
        checkDirs(intermediateFilesDir)

        dataGroupName = getIsolatedParentDir(ICGCFilePath)
        generateMetadata(dataGroupName, getIsolatedParentDir(genomeFilePath),
                         ICGCFileName, InputFormat.ICGC, dataDirectory)

        outputBedFilePath = generateFilePath(
            directory=intermediateFilesDir,
            dataGroup=dataGroupName,
            dataType=DataTypeStr.customInput,
            fileExtension=".bed")

        # Write the relevant information from the ICGC file to the output file.
        print("Writing data to custom bed format.")
        with gzip.open(ICGCFilePath, 'r') as ICGCFile, \
                open(outputBedFilePath, 'w') as outputBedFile:

            for mutation in ICGCIterator(ICGCFile, genomeFilePath):

                # Insertions and deletions use '*' in place of '-'.
                if mutation.mutatedFrom == '-':
                    mutation.mutatedFrom = '*'
                    # NOTE: This assumes that the given base pos (1-based) is
                    # after the insertion, not before.
                    mutation.startPos = str(int(mutation.startPos) - 1)
                elif mutation.mutatedTo == '-':
                    mutation.mutatedTo = '*'

                bedFields = (mutation.chromosome, mutation.startPos,
                             mutation.endPos, mutation.mutatedFrom,
                             mutation.mutatedTo, mutation.strand,
                             mutation.donorID)
                outputBedFile.write('\t'.join(bedFields) + '\n')

        convertedFilePaths.append(outputBedFilePath)

    # Hand the converted bed files to the custom bed parser for further parsing.
    print("\nPassing data to custom bed parser...")
    parseCustomBed(convertedFilePaths, genomeFilePath, stratifyByMS,
                   stratifyByMutSig, separateDonors, True)
Ejemplo n.º 9
0
    def setUpForMutSigStratification(self) -> MutSigIdentifier:
        """Prepare this object to stratify output by mutation signature.

        For every supported signature, a directory is created under
        "mut_sig_analysis" in the root metadata directory, metadata is
        generated, a mutation counter is initialized and an output bed file is
        opened for writing.  The MutSigIdentifier is then built.

        Returns:
            The new MutSigIdentifier, also stored in self.mutSigIdentifier.
        """

        self.stratifyByMutSig = True

        # A dictionary of the mutation signatures assigned to each cohort.
        self.mutSigDesignations = {}

        # The signature names: 1A, 1B, 2 through 21, R1-R3, U1 and U2.
        mutSigs = [
            "1A", "1B", *map(str, range(2, 22)), "R1", "R2", "R3", "U1", "U2"
        ]

        rootMetadata = self.rootMetadata
        parentMutSigDirectory = os.path.join(rootMetadata.directory,
                                             "mut_sig_analysis")

        # Per-signature bookkeeping, all keyed by signature name.
        self.mutSigFilePaths = {}
        self.mutSigFiles = {}
        self.mutSigMutCounts = {}

        for mutSig in mutSigs:

            mutSigDataGroup = ("mut_sig_" + mutSig + '_' +
                               rootMetadata.dataGroupName)

            # Directory
            mutSigDirectory = os.path.join(parentMutSigDirectory, mutSig)
            checkDirs(mutSigDirectory)

            # Metadata (the parent data path is relative to the new directory,
            # which sits two levels below the root).
            generateMetadata(
                mutSigDataGroup, rootMetadata.genomeName,
                os.path.join('..', '..', rootMetadata.localParentDataPath),
                rootMetadata.inputFormat, mutSigDirectory, "mutSig" + mutSig)

            # Mutation counter
            self.mutSigMutCounts[mutSig] = 0

            # Output file, left open for writing.
            mutSigFilePath = generateFilePath(
                directory=mutSigDirectory,
                dataGroup=mutSigDataGroup,
                context=self.context,
                dataType=DataTypeStr.mutations,
                fileExtension=".bed")
            self.mutSigFilePaths[mutSig] = mutSigFilePath
            self.mutSigFiles[mutSig] = open(mutSigFilePath, 'w')

        # File paths required by the MutSigIdentifier.
        intermediateFilesDir = os.path.join(self.rootDataDir,
                                            "intermediate_files")
        checkDirs(intermediateFilesDir)

        deconstructSigsInputFilePath = generateFilePath(
            directory=intermediateFilesDir,
            dataGroup=rootMetadata.dataGroupName,
            dataType="deconstructSigs_data",
            fileExtension=".tsv")
        self.mutSigDesignationsFilePath = generateFilePath(
            directory=parentMutSigDirectory,
            dataGroup=rootMetadata.dataGroupName,
            dataType="mut_sig_assignments",
            fileExtension=".tsv")

        self.mutSigIdentifier = MutSigIdentifier(
            deconstructSigsInputFilePath, self.mutSigDesignationsFilePath)
        return self.mutSigIdentifier
def generateNucleosomeFasta(baseNucPosFilePath, genomeFilePath, dyadRadius,
                            linkerOffset):
    """Return the path to a fasta file of sequences around each nucleosome.

    If the fasta file already exists it is reused.  Otherwise the coordinates
    in the nucleosome positions file are expanded, written to an "expanded"
    bed file in the "intermediate_files" directory, and converted to fasta.

    Args:
        baseNucPosFilePath: Path to the tab-separated nucleosome positions bed
            file.
        genomeFilePath: Path to the genome file passed to bedToFasta.
        dyadRadius: Either 73 or 1000.
        linkerOffset: Added to the expansion on both sides.  It is only part
            of the fasta file name when dyadRadius is 73.

    Returns:
        The path to the nucleosome fasta file.

    Raises:
        ValueError: If dyadRadius is neither 73 nor 1000.
    """

    # Make sure an intermediate files directory exists for this nucleosome map.
    intermediateFilesDir = os.path.join(os.path.dirname(baseNucPosFilePath),
                                        "intermediate_files")
    checkDirs(intermediateFilesDir)

    # The data group is the positions file name without its extension.
    dataGroup = os.path.basename(baseNucPosFilePath).rsplit('.', 1)[0]

    # The fasta file name depends on which radius is being used.
    if dyadRadius == 73:
        radiusSpecificOptions = {"linkerOffset": linkerOffset}
    elif dyadRadius == 1000:
        radiusSpecificOptions = {"usesNucGroup": True}
    else:
        raise ValueError("Invalid dyad radius: " + str(dyadRadius) +
                         ".  Expected 73 or 1000.")
    nucPosFastaFilePath = generateFilePath(directory=intermediateFilesDir,
                                           dataGroup=dataGroup,
                                           fileExtension=".fa",
                                           **radiusSpecificOptions)

    # If the fasta file already exists, we're done!
    if os.path.exists(nucPosFastaFilePath):
        print("Found relevant nucleosome fasta file:",
              os.path.basename(nucPosFastaFilePath))
        return nucPosFastaFilePath

    print("Nucleosome fasta file not found at: ",
          nucPosFastaFilePath,
          "\nGenerating...",
          sep='')

    # The (temporary) file holding the expanded coordinates.
    expandedNucPosBedFilePath = generateFilePath(
        directory=intermediateFilesDir,
        dataGroup=dataGroup,
        dataType="expanded",
        fileExtension=".bed")

    print("Expanding nucleosome coordinates...")
    with open(baseNucPosFilePath, 'r') as baseNucPosFile, \
            open(expandedNucPosBedFilePath, 'w') as expandedNucPosBedFile:

        for line in baseNucPosFile:

            fields = line.strip().split('\t')

            # Expand both ends by the dyad radius, the linker offset and 2
            # extra bases.
            expandedStart = int(fields[1]) - dyadRadius - linkerOffset - 2
            expandedEnd = int(fields[2]) + dyadRadius + linkerOffset + 2
            fields[1] = str(expandedStart)
            fields[2] = str(expandedEnd)

            # Entries that would start before the beginning of the chromosome
            # are reported and left out.
            if expandedStart < 0:
                print("Nucleosome at chromosome", fields[0],
                      "with expanded start pos", fields[1],
                      "extends into invalid positions.  Skipping.")
            else:
                expandedNucPosBedFile.write('\t'.join(fields) + '\n')

    # Convert the expanded bed file to fasta format.
    print("Converting expanded coordinates to fasta file...")
    bedToFasta(expandedNucPosBedFilePath,
               genomeFilePath,
               nucPosFastaFilePath,
               includeStrand=False)

    return nucPosFastaFilePath
Ejemplo n.º 11
0
def countNucleosomePositionMutations(mutationFilePaths, nucleosomeMapNames,
                                     countSingleNuc, countNucGroup,
                                     linkerOffset):
    """Count mutations at nucleosome positions for each file/map combination.

    Special case: when exactly one file and one nucleosome map name are given
    and the file's name (without extension) equals the map name, the map is
    counted against itself in a 1000 bp radius and only that counts file is
    returned.

    Args:
        mutationFilePaths: Paths to the mutation files.  Each file name must
            contain DataTypeStr.mutations.
        nucleosomeMapNames: Names of the nucleosome maps to count against.
        countSingleNuc: Whether to count in a 73 bp radius plus linkerOffset.
        countNucGroup: Whether to count in a 1000 bp radius.
        linkerOffset: Extra radius added in the single-nucleosome case.

    Returns:
        A list of paths to the counts files that were generated.

    Raises:
        UserInputError: If neither countSingleNuc nor countNucGroup is set.
        InvalidPathError: If a mutation file name lacks DataTypeStr.mutations.
    """

    # Check for the special case where a nucleosome map is being counted
    # against itself to determine the nucleosome repeat length.
    isSelfCount = (len(mutationFilePaths) == 1
                   and len(nucleosomeMapNames) == 1
                   and os.path.basename(mutationFilePaths[0]).rsplit(
                       '.', 1)[0] == nucleosomeMapNames[0])
    if isSelfCount:

        nucleosomeMapFilePath = mutationFilePaths[0]
        nucleosomeMapName = nucleosomeMapNames[0]

        print("Counting nucleosome map", nucleosomeMapName,
              "against itself in a 1000 bp radius.")

        countsFilePath = generateFilePath(
            directory=os.path.dirname(nucleosomeMapFilePath),
            dataGroup=nucleosomeMapName,
            usesNucGroup=True,
            fileExtension=".tsv",
            dataType="self_" + DataTypeStr.rawNucCounts)

        # The chromosomes are looked up from the directory two levels above
        # the nucleosome map file.
        genomeDirectory = os.path.dirname(
            os.path.dirname(nucleosomeMapFilePath))
        acceptableChromosomes = getAcceptableChromosomes(genomeDirectory)

        NucleosomesInNucleosomesCounter(
            nucleosomeMapFilePath,
            nucleosomeMapFilePath,
            countsFilePath,
            encompassingFeatureExtraRadius=1000,
            acceptableChromosomes=acceptableChromosomes).count()

        return [countsFilePath]

    if not countSingleNuc and not countNucGroup:
        raise UserInputError(
            "Must count in either a single nucleosome or group nucleosome radius."
        )

    # Paths to the output files generated by this function.
    nucleosomeMutationCountsFilePaths = []

    # Becomes True after the first mutation file has been processed, so the
    # counters are only asked to check the nucleosome maps' sorting during
    # that first pass.
    nucleosomeMapSortingChecked = False

    for mutationFilePath in mutationFilePaths:

        mutationFileName = os.path.basename(mutationFilePath)
        print("\nWorking with", mutationFileName)

        # Make sure we have the expected file type.
        if DataTypeStr.mutations not in mutationFileName:
            raise InvalidPathError(
                mutationFilePath,
                "Given mutation file does not have \"" +
                DataTypeStr.mutations + "\" in the name.",
                postPathMessage=
                "Are you sure you inputted a file from the mutperiod pipeline?"
            )

        for nucleosomeMapName in nucleosomeMapNames:

            print("Counting with nucleosome map:", nucleosomeMapName)

            # Each nucleosome map gets its own directory next to the mutation
            # file.
            nucleosomeMapDataDirectory = os.path.join(
                os.path.dirname(mutationFilePath), nucleosomeMapName)
            checkDirs(nucleosomeMapDataDirectory)

            # Generate the metadata for this directory if it isn't there yet.
            metadataFilePath = os.path.join(nucleosomeMapDataDirectory,
                                            ".metadata")
            if not os.path.exists(metadataFilePath):

                print("No metadata found.  Generating...")

                parentMetadata = Metadata(mutationFilePath)

                # The nucleosome map may supply a suffix for the data name.
                dataGroupName = parentMetadata.dataGroupName
                dataGroupNameSuffixFilePath = os.path.join(
                    os.path.dirname(parentMetadata.genomeFilePath),
                    nucleosomeMapName, "append_to_data_name.txt")
                if os.path.exists(dataGroupNameSuffixFilePath):
                    with open(dataGroupNameSuffixFilePath) as suffixFile:
                        dataGroupName += suffixFile.readline().strip()

                generateMetadata(
                    dataGroupName,
                    parentMetadata.genomeName,
                    os.path.join("..", parentMetadata.localParentDataPath),
                    parentMetadata.inputFormat,
                    nucleosomeMapDataDirectory,
                    *parentMetadata.cohorts,
                    callParamsFilePath=parentMetadata.callParamsFilePath,
                    associatedNucleosomePositions=nucleosomeMapName)

            # The metadata provides the nucleosome positions file path.
            metadata = Metadata(nucleosomeMapDataDirectory)

            # Get the list of acceptable chromosomes
            acceptableChromosomes = getAcceptableChromosomes(
                metadata.genomeFilePath)

            # Collect the requested counting modes as
            # (file path options, counting radius, progress message) tuples.
            # The single-nucleosome count always runs before the group count.
            countingModes = []
            if countSingleNuc:
                countingModes.append((
                    {"linkerOffset": linkerOffset},
                    73 + linkerOffset,
                    ("Counting mutations at each nucleosome position in a 73 bp radius +",
                     str(linkerOffset), "bp linker DNA."),
                ))
            if countNucGroup:
                countingModes.append((
                    {"usesNucGroup": True},
                    1000,
                    ("Counting mutations at each nucleosome position in a 1000 bp radius.",
                     ),
                ))

            for filePathOptions, extraRadius, progressMessage in countingModes:

                # Generate the output file path
                countsFilePath = generateFilePath(
                    directory=metadata.directory,
                    dataGroup=metadata.dataGroupName,
                    fileExtension=".tsv",
                    dataType=DataTypeStr.rawNucCounts,
                    **filePathOptions)

                # Ready, set, go!
                print(*progressMessage)
                MutationsInNucleosomesCounter(
                    mutationFilePath,
                    metadata.baseNucPosFilePath,
                    countsFilePath,
                    encompassingFeatureExtraRadius=extraRadius,
                    acceptableChromosomes=acceptableChromosomes,
                    checkForSortedFiles=(
                        True, not nucleosomeMapSortingChecked)).count()

                nucleosomeMutationCountsFilePaths.append(countsFilePath)

        nucleosomeMapSortingChecked = True

    return nucleosomeMutationCountsFilePaths
Ejemplo n.º 12
0
def parseCustomBed(bedInputFilePaths,
                   genomeFilePath,
                   stratifyByMS,
                   stratifyByMutSig,
                   separateIndividualCohorts,
                   onlySingleBaseSubs=False,
                   includeIndels=False):
    """
    Sort each custom bed input file in place and pass it to convertToStandardInput.

    For every path in bedInputFilePaths this function:
      - generates metadata, unless the file sits in an "intermediate_files" directory;
      - runs autoAcquireAndQACheck to obtain the context used for output naming;
      - copies the input into "intermediate_files" if it would collide with the
        generated output path;
      - sorts the file with the external "sort" command (by the 7th column first
        when the first line has exactly 7 tab-separated columns);
      - optionally sets up the WriteManager for stratification / individual cohorts;
      - calls convertToStandardInput.

    Parameters:
        bedInputFilePaths: Sized iterable of paths to the bed files to parse.
        genomeFilePath: Path to the genome file; forwarded to the metadata and
            auto-acquire helpers.
        stratifyByMS: If truthy, setUpForMSStratification is invoked (requires the
            cohort column).
        stratifyByMutSig: If truthy, setUpForMutSigStratification is invoked
            (requires the cohort column).
        separateIndividualCohorts: If truthy, the WriteManager is set up for
            individual cohorts (requires the cohort column).
        onlySingleBaseSubs: Forwarded to autoAcquireAndQACheck and
            convertToStandardInput.  Mutually exclusive with includeIndels.
        includeIndels: Forwarded to autoAcquireAndQACheck and convertToStandardInput.

    Raises:
        UserInputError: If onlySingleBaseSubs and includeIndels are both set, if no
            input paths were given, or if stratification / cohort separation is
            requested for a file whose first line lacks the 7th (cohort) column.
        subprocess.CalledProcessError: If the external sort exits with a non-zero status.
    """

    # Validate the request before doing any work.
    if onlySingleBaseSubs and includeIndels:
        raise UserInputError(
            "Indels are incompatible with single nucleotide substitutions.")
    if len(bedInputFilePaths) == 0:
        raise UserInputError("No bed files were found to parse.")

    for bedInputFilePath in bedInputFilePaths:

        inputFileName = os.path.basename(bedInputFilePath)
        inputParentDir = os.path.dirname(bedInputFilePath)

        print("\nWorking in:", inputFileName)

        # Locate the data group's root directory.  Files that do not live in
        # "intermediate_files" sit directly in the root and still need metadata;
        # intermediate files are one level deeper, and no metadata is generated for them here.
        if getIsolatedParentDir(bedInputFilePath) != "intermediate_files":
            dataDirectory = inputParentDir
            generateMetadata(os.path.basename(dataDirectory),
                             getIsolatedParentDir(genomeFilePath),
                             inputFileName,
                             InputFormat.customBed,
                             dataDirectory)
        else:
            dataDirectory = os.path.dirname(inputParentDir)

        intermediateFilesDir = os.path.join(dataDirectory,
                                            "intermediate_files")
        checkDirs(intermediateFilesDir)
        autoAcquiredFilePath = os.path.join(intermediateFilesDir,
                                            "auto_acquire.fa")

        context = autoAcquireAndQACheck(bedInputFilePath, genomeFilePath,
                                        autoAcquiredFilePath,
                                        onlySingleBaseSubs, includeIndels)

        # If the input file occupies the path the output will be written to, read from a copy
        # placed in intermediate_files so the input is still available while the output is written.
        plannedOutputFilePath = generateFilePath(
            directory=dataDirectory,
            dataGroup=os.path.basename(dataDirectory),
            context=context,
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        if plannedOutputFilePath == bedInputFilePath:
            readableCopyPath = os.path.join(intermediateFilesDir,
                                            os.path.basename(bedInputFilePath))
            print(
                "Input file path is identical to generated output file path and will be overwritten. ",
                "Creating a copy of the input file at:", readableCopyPath,
                "to use for reading.")
            shutil.copy2(bedInputFilePath, readableCopyPath)
            bedInputFilePath = readableCopyPath

        # All writing goes through a WriteManager instance.
        with WriteManager(dataDirectory, context) as writeManager:

            # Peek at the first line: exactly 7 tab-separated columns means a cohort
            # designation is present.
            cohortSortKey = tuple()
            with open(bedInputFilePath, 'r') as bedInputFile:
                firstLineColumns = bedInputFile.readline().strip().split('\t')
                cohortColumnPresent = len(firstLineColumns) == 7

                if not cohortColumnPresent:
                    # Every cohort-dependent option is invalid without the column.
                    if stratifyByMS or stratifyByMutSig:
                        raise UserInputError(
                            "Additional stratification given, but no cohort designation given."
                        )
                    if separateIndividualCohorts:
                        raise UserInputError(
                            "Separation by individual cohorts requested, but no cohort designation given."
                        )
                else:
                    # Cohort becomes the primary sort key.
                    cohortSortKey = ("-k7,7", )
                    if separateIndividualCohorts:
                        writeManager.setUpForIndividualCohorts()

            # Sort the input file in place (which should keep the output sorted as well).
            sortCommand = ("sort", *cohortSortKey, "-k1,1", "-k2,2n", "-k3,3n",
                           bedInputFilePath, "-s", "-o", bedInputFilePath)
            subprocess.run(sortCommand, check=True)

            # Optional stratification set-up: microsatellite stability, then mutation signature.
            if stratifyByMS:
                setUpForMSStratification(writeManager, bedInputFilePath)

            if stratifyByMutSig:
                setUpForMutSigStratification(writeManager, bedInputFilePath)

            # Everything is prepared; run the conversion.
            convertToStandardInput(bedInputFilePath, writeManager,
                                   onlySingleBaseSubs, includeIndels)