Beispiel #1
0
def getDataDirectory():
    """
    Return the path to the mutperiod data directory, creating it if needed.

    The path is cached in the text file "$HOME/.mutperiod/data_dir.txt".  If that
    file exists and the directory named on its first line still exists, that
    directory is returned.  Otherwise (the text file is missing, or the recorded
    directory has disappeared) a dialog asks the user where to create a new
    "mutperiod_data" directory.  The new directory and its "__external_data"
    subdirectory are created, the path is written to the text file, and the
    path is returned.

    If the dialog is closed without a selection, the interpreter exits via quit().

    Raises:
        UserInputError: The location chosen in the dialog does not exist.
        InvalidPathError: Creating the data directory raised an IOError.
    """

    # The text file which should contain the path to the data directory.
    dataDirectoryTextFilePath = os.path.join(os.getenv("HOME"), ".mutperiod",
                                             "data_dir.txt")

    # If it exists, try the directory path recorded within.
    if os.path.exists(dataDirectoryTextFilePath):
        with open(dataDirectoryTextFilePath, 'r') as dataDirectoryTextFile:
            dataDirectory = dataDirectoryTextFile.readline().strip()

        # Double check to make sure the data directory is still intact.
        if os.path.exists(dataDirectory):
            return dataDirectory

        # The recorded directory is gone.  Inform the user and fall through to
        # the dialog below so that the directory actually gets recreated
        # (previously the function silently returned None at this point).
        print("Data directory not found at expected location: {}".format(
            dataDirectory))
        print("Please select a new location to create a data directory.")

    # Create a simple dialog to select a new data directory location.
    # Imported here so the UI toolkit is only loaded when a dialog is needed.
    from benbiohelpers.TkWrappers.TkinterDialog import TkinterDialog, Selections
    checkDirs(os.path.dirname(dataDirectoryTextFilePath))
    dialog = TkinterDialog(
        workingDirectory=os.path.dirname(dataDirectoryTextFilePath))
    dialog.createFileSelector("Location to create new data directory:",
                              0, ("Fasta Files", ".fa"),
                              directory=True)

    # Run the UI
    dialog.mainloop()

    # If no input was received (i.e. the UI was terminated prematurely), then quit!
    if dialog.selections is None: quit()

    selections: Selections = dialog.selections
    dataDirectoryDirectory = selections.getIndividualFilePaths()[0]

    # Make sure a valid, writeable directory was given.  Then create the new directory (if it doesn't exist already),
    # write it to the text file, and return it!  (Also create the __external_data directory.)
    if not os.path.exists(dataDirectoryDirectory):
        raise UserInputError("Given directory: " + dataDirectoryDirectory +
                             " does not exist.")

    dataDirectory = os.path.join(dataDirectoryDirectory, "mutperiod_data")
    try:
        checkDirs(dataDirectory)
        checkDirs(os.path.join(dataDirectory, "__external_data"))
    except IOError:
        raise InvalidPathError(
            dataDirectoryDirectory,
            "Given location for data directory is not writeable:")
    with open(dataDirectoryTextFilePath, 'w') as dataDirectoryTextFile:
        dataDirectoryTextFile.write(dataDirectory + '\n')
    return dataDirectory
Beispiel #2
0
def parseArgs(args):
    """
    Turn parsed command line arguments into a call to generateFigures.

    When only the subcommand was given on the command line, the UI (main) is
    run instead.  Otherwise the tsv and rda inputs are resolved to absolute file
    paths (directories are searched for matching files), the export path is
    taken from the output directory or, failing that, the output file, and
    everything is handed to generateFigures.

    Raises:
        UserInputError: Neither an output directory nor an output file was given.
    """

    # If only the subcommand was given, run the UI.
    if len(sys.argv) == 2:
        main()
        return

    # Shared logic for both input types: expand directories into the matching
    # files they contain and make every path absolute.
    def resolveInputPaths(givenPaths, directorySearchEnding):
        resolvedPaths = list()
        if givenPaths is None:
            return resolvedPaths
        for givenPath in givenPaths:
            checkIfPathExists(givenPath)
            if not os.path.isdir(givenPath):
                resolvedPaths.append(os.path.abspath(givenPath))
                continue
            for foundPath in getFilesInDirectory(givenPath, directorySearchEnding):
                resolvedPaths.append(os.path.abspath(foundPath))
        return resolvedPaths

    # Get valid tsv paths, then valid rda paths, from the given input.
    tsvFilePaths = resolveInputPaths(args.tsv_paths, DataTypeStr.generalNucCounts + ".tsv")
    rdaFilePaths = resolveInputPaths(args.rda_paths, ".rda")

    # The output directory takes precedence over the output file.
    requestedOutput = args.output_directory
    if requestedOutput is None:
        requestedOutput = args.output_file
    if requestedOutput is None:
        raise UserInputError("No output path given.")
    exportPath = os.path.abspath(requestedOutput)

    # Pass the given commands to the generateFigures function (duplicates removed).
    uniqueTsvFilePaths = list(set(tsvFilePaths))
    uniqueRdaFilePaths = list(set(rdaFilePaths))
    generateFigures(uniqueTsvFilePaths, uniqueRdaFilePaths, exportPath,
                    args.remove_outliers, args.smooth_nuc_group, args.align_strands)
Beispiel #3
0
def generateFigures(tsvFilePaths: List[str], rdaFilePaths: List[str], exportPath: str,
                    omitOutliers, smoothNucGroup, strandAlign):
    """
    Validate the inputs and call the GenerateFigures.R script on them.

    The export path must be an existing directory or end with ".pdf", and it
    must be writeable (this is probed by actually opening a file for writing).
    The file paths, expected periods, and export location are written to
    "$HOME/.mutperiod/R_inputs.txt", one '$'-joined list or value per line, and
    that file's path is passed to the R script along with the three remaining
    options converted to strings.

    Raises:
        UserInputError: No tsv or rda files were given.
        InvalidPathError: The export path is invalid or not writeable, or an
            input file does not have the expected ending.
    """

    # Check for invalid arguments.
    if len(rdaFilePaths) == 0 and len(tsvFilePaths) == 0:
        raise UserInputError("No input files were found to generate graphs from.")

    exportIsDirectory = os.path.isdir(exportPath)
    if not exportIsDirectory and not exportPath.endswith(".pdf"):
        raise InvalidPathError(exportPath, "The given export path is neither an existing directory nor a pdf file.")

    # Probe for write access.  For a directory, a throwaway file is created and removed again.
    # NOTE: No one in their right mind would name a file this, so I think it's safe to overwrite and delete.
    if exportIsDirectory:
        probeFilePath = os.path.join(exportPath,"_-_TeSt_fIlEeeeee.TXET")
    else:
        probeFilePath = exportPath
    try:
        probeFile = open(probeFilePath, 'w')
        probeFile.close()
        if exportIsDirectory:
            os.remove(probeFilePath)
    except IOError:
        raise InvalidPathError(exportPath, "Given export path is not writeable:")

    # Every input file must carry the ending expected for its type.
    expectedTsvEnding = DataTypeStr.generalNucCounts + ".tsv"
    for tsvFilePath in tsvFilePaths:
        if tsvFilePath.endswith(expectedTsvEnding):
            continue
        raise InvalidPathError(tsvFilePath, "The given nucleosome counts tsv file does not end with \"" +
                               DataTypeStr.generalNucCounts + ".tsv\" as expected.")

    for rdaFilePath in rdaFilePaths:
        if rdaFilePath.endswith(".rda"):
            continue
        raise InvalidPathError(rdaFilePath, "The given nucleosome counts rda file does not end with \".rda\" as expected.")

    # Retrieve the expected periods for each of the given tsv counts files.
    tsvExpectedPeriods = list()
    for tsvFilePath in tsvFilePaths:
        tsvExpectedPeriods.append(str(getExpectedPeriod(tsvFilePath)))

    # Split the export path into a directory and (possibly empty) file name.
    if exportIsDirectory:
        exportDir, exportFileName = exportPath, ''
    else:
        exportDir, exportFileName = os.path.split(exportPath)

    # Create the temporary inputs file to pass to the R script, one input per line.
    inputsFilePath = os.path.join(os.getenv("HOME"), ".mutperiod","R_inputs.txt")
    with open(inputsFilePath, 'w') as inputsFile:
        for inputLine in ('$'.join(tsvFilePaths), '$'.join(tsvExpectedPeriods),
                          '$'.join(rdaFilePaths), exportDir, exportFileName):
            inputsFile.write(inputLine + '\n')

    # Call the R script to generate the figures.
    print("Calling R script...")
    rCommand = ("Rscript", os.path.join(rScriptsDirectory,"GenerateFigures.R"), inputsFilePath,
                str(omitOutliers), str(smoothNucGroup), str(strandAlign))
    subprocess.run(rCommand, check = True)
def getCustomBackgroundRawPairs(customRawCountsFilePaths,
                                customBackgroundCountsDir):
    """
    Group the given raw counts files by the background counts file they match.

    For each raw counts file, the matching background file is looked up in the
    subdirectory of customBackgroundCountsDir named after the raw file's
    nucPosName metadata value.

    Returns:
        A dictionary mapping each background raw counts file path to the list
        of given raw counts file paths that use it as their background.

    Raises:
        UserInputError: The expected background directory or background counts
            file does not exist.
    """

    pairsByBackground = dict()

    # For every raw counts file given, try to match it to a raw counts file in the customBackgroundCountsDir.
    for rawCountsFilePath in customRawCountsFilePaths:

        nucPosName = Metadata(rawCountsFilePath).nucPosName
        backgroundDir = os.path.join(customBackgroundCountsDir, nucPosName)
        if not os.path.exists(backgroundDir):
            raise UserInputError(
                "Expected a directory at " + backgroundDir +
                " to contain the background for " + rawCountsFilePath +
                " but the directory does not exist.  Have you forgotten to run "
                "the analysis for the related nucleosome map?")

        # The background file shares the raw file's linker offset and radius type,
        # but uses the background directory's own data group.
        backgroundMetadata = Metadata(backgroundDir)
        backgroundCountsFilePath = generateFilePath(
            directory=backgroundMetadata.directory,
            dataGroup=backgroundMetadata.dataGroupName,
            linkerOffset=getLinkerOffset(rawCountsFilePath),
            usesNucGroup=checkForNucGroup(rawCountsFilePath),
            dataType=DataTypeStr.rawNucCounts,
            fileExtension=".tsv")
        if not os.path.exists(backgroundCountsFilePath):
            raise UserInputError(
                "Expected file at " + backgroundCountsFilePath +
                " to use as custom background for " + rawCountsFilePath +
                " but this file does not exist.  Have you forgotten to "
                "run the relevant analysis to generate it?")

        pairsByBackground.setdefault(backgroundCountsFilePath,
                                     list()).append(rawCountsFilePath)

    return pairsByBackground
def parseArgs(args):
    """
    Turn parsed command line arguments into a call to runNucleosomeMutationAnalysis.

    When only the subcommand was given on the command line, the UI (main) is
    run instead.  Otherwise the three path arguments (the general file paths,
    group 1, and group 2) are each resolved to a set of absolute file paths,
    with directories searched for nucleosome counts tsv files.  The general set
    is extended to include everything in the two groups before the analysis is
    started.

    Raises:
        UserInputError: No output file path was given.
    """

    # If only the subcommand was given, run the UI.
    if len(sys.argv) == 2:
        main()
        return

    # Make sure an output file path was given.
    if args.output_file_path is None:
        raise UserInputError("No output file path was given.")

    # Pulls all the relevant file paths out of the three groups that could have been passed as arguments.
    filePathGroups = list()
    for givenPaths in (args.nucleosomeMutationFilePaths, args.group_1, args.group_2):

        resolvedPaths = list()
        if givenPaths is not None:
            for givenPath in givenPaths:

                checkIfPathExists(givenPath)
                if not os.path.isdir(givenPath):
                    resolvedPaths.append(os.path.abspath(givenPath))
                    continue

                # Directories contribute every counts file found within them.
                for foundPath in getFilesInDirectory(
                        givenPath, DataTypeStr.generalNucCounts + ".tsv"):
                    resolvedPaths.append(os.path.abspath(foundPath))

        filePathGroups.append(set(resolvedPaths))

    # Make sure that any file paths passed to group 1 or group 2 are present in the default group.
    allFilePaths, groupOneFilePaths, groupTwoFilePaths = filePathGroups
    allFilePaths = allFilePaths | groupOneFilePaths | groupTwoFilePaths

    runNucleosomeMutationAnalysis(list(allFilePaths),
                                  args.output_file_path,
                                  args.use_expected_periodicity,
                                  args.align_strands,
                                  list(groupOneFilePaths),
                                  list(groupTwoFilePaths))
def generateCustomBackground(customBackgroundDir, nucleosomeMapNames,
                             useSingleNucRadius, includeLinker,
                             useNucGroupRadius):
    """
    Run the analysis suite, without normalization, on the custom background's mutation file.

    The background directory itself (not its subdirectories) is searched for a
    parsed mutations bed file, which is then passed to runAnalysisSuite together
    with the given nucleosome maps and radius/linker options.

    Raises:
        UserInputError: The search of the background directory returned None.
    """

    print("Generating background counts...")

    # NOTE(review): the None check below assumes getFilesInDirectory returns None
    # when nothing is found with searchRecursively=False -- confirm against its definition.
    mutationsFileEnding = DataTypeStr.mutations + ".bed"
    backgroundMutationFilePath = getFilesInDirectory(customBackgroundDir,
                                                     mutationsFileEnding,
                                                     searchRecursively=False)
    if backgroundMutationFilePath is None:
        raise UserInputError(
            "No parsed mutation files found in custom background directory: " +
            customBackgroundDir)

    # The background is counted raw: no normalization and no custom background of its own.
    backgroundInputs = (backgroundMutationFilePath, )
    runAnalysisSuite(backgroundInputs, nucleosomeMapNames, "No Normalization",
                     None, useSingleNucRadius, includeLinker,
                     useNucGroupRadius)

    print("Finished generating background!\n")
def runAnalysisSuite(mutationFilePaths: List[str],
                     nucleosomeMapNames: List[str],
                     normalizationMethod,
                     customBackgroundDir,
                     useSingleNucRadius,
                     includeLinker,
                     useNucGroupRadius,
                     includeAlternativeScaling=False):
    """
    Count mutations around nucleosome dyads and normalize the counts as requested.

    Args:
        mutationFilePaths: Paths to the mutation files to analyze.
        nucleosomeMapNames: Names of the nucleosome maps to count against.
        normalizationMethod: One of "Singlenuc/Dinuc", "Trinuc/Quadrunuc",
            "Pentanuc/Hexanuc", "No Normalization", or "Custom Background".
        customBackgroundDir: Passed on to normalizeCounts when the
            normalization method is "Custom Background".
        useSingleNucRadius: Whether to count in the single nucleosome radius.
        includeLinker: Whether to use a linker offset of 30 instead of 0.
        useNucGroupRadius: Whether to count in the nucleosome group radius.
        includeAlternativeScaling: Passed on to normalizeCounts when the
            normalization method is "Custom Background".

    Raises:
        UserInputError: No radius was selected, no mutation files or nucleosome
            maps were given, or a mixed context file was given for
            normalization by sequence context.
        ValueError: The normalization method is not one of the expected strings.
        InvalidPathError: The context of a mutation file could not be determined
            from its name.
    """

    # Make sure at least one radius was selected.
    if not useNucGroupRadius and not useSingleNucRadius:
        raise UserInputError("Must select at least one radius.")

    # Make sure at least one mutation and one nucleosome file was found.
    if len(mutationFilePaths) == 0:
        raise UserInputError("No valid input files given.")
    if len(nucleosomeMapNames) == 0:
        raise UserInputError("No valid nucleosome map files given")

    # Convert background context to int (None means no sequence context normalization).
    if normalizationMethod == "Singlenuc/Dinuc":
        normalizationMethodNum = 1
    elif normalizationMethod == "Trinuc/Quadrunuc":
        normalizationMethodNum = 3
    elif normalizationMethod == "Pentanuc/Hexanuc":
        normalizationMethodNum = 5
    elif normalizationMethod in ("No Normalization", "Custom Background"):
        normalizationMethodNum = None
    else:
        raise ValueError("Matching strings is hard.")

    # Set the linker offset
    if includeLinker: linkerOffset = 30
    else: linkerOffset = 0

    ### Ensure that every mutation file has a context sufficient for the requested background.

    # create a new list of mutation file paths, replacing any with contexts that are too low.
    if normalizationMethodNum is not None:
        print("\nExpanding file context where necessary...\n")
        updatedMutationFilePaths = list()
        for mutationFilePath in mutationFilePaths:
            mutationFileContext = getContext(mutationFilePath, True)

            # Some error checking...
            if mutationFileContext is None:
                raise InvalidPathError(
                    os.path.basename(mutationFilePath),
                    "Malformed file name.  Context is not clear for:",
                    "Are you sure the file was generated by mutperiod?")
            if mutationFileContext == 0:
                raise UserInputError(
                    "Mixed context files cannot be normalized by sequence context."
                )
            # The message must name the offending file.  (It used to concatenate the
            # os.path.basename function itself, which raised a TypeError instead.)
            assert mutationFileContext != -1, (
                "Wait, what?  How did you even get this context for this input file? "
                + os.path.basename(mutationFilePath))

            if mutationFileContext < normalizationMethodNum:
                updatedMutationFilePaths += expandContext(
                    (mutationFilePath, ), normalizationMethodNum)
            else:
                updatedMutationFilePaths.append(mutationFilePath)
    else:
        updatedMutationFilePaths = mutationFilePaths

    ### Run the rest of the analysis.

    print("\nCounting mutations at each dyad position...")
    nucleosomeMutationCountsFilePaths = countNucleosomePositionMutations(
        updatedMutationFilePaths, nucleosomeMapNames, useSingleNucRadius,
        useNucGroupRadius, linkerOffset)

    if normalizationMethodNum is not None:

        print("\nGenerating genome-wide mutation background...")
        mutationBackgroundFilePaths = generateMutationBackground(
            updatedMutationFilePaths, normalizationMethodNum)

        print("\nGenerating nucleosome mutation background...")
        nucleosomeMutationBackgroundFilePaths = generateNucleosomeMutationBackground(
            mutationBackgroundFilePaths, nucleosomeMapNames,
            useSingleNucRadius, useNucGroupRadius, linkerOffset)

        print("\nNormalizing counts with nucleosome background data...")
        normalizeCounts(nucleosomeMutationBackgroundFilePaths)

    elif normalizationMethod == "Custom Background":
        print("\nNormalizing counts using custom background data...")
        normalizeCounts(list(), nucleosomeMutationCountsFilePaths,
                        customBackgroundDir, includeAlternativeScaling)
def parseArgs(args):
    """
    Turn parsed command line arguments into a call to runAnalysisSuite.

    When only the subcommand was given on the command line, the UI (main) is
    run instead.  Otherwise the mutation file paths are resolved (directories
    are searched for mutation bed files), the nucleosome map names are derived
    from the given map paths, and the normalization method is chosen from the
    context normalization value or, failing that, the custom background.  If
    requested, the custom background is generated before the analysis runs.

    Raises:
        UserInputError: Mutation files or nucleosome maps are missing, or
            background generation was requested without a background.
    """

    # If only the subcommand was given, run the UI.
    if len(sys.argv) == 2:
        main()
        return

    # Get the bed mutation files from the given paths, searching directories if necessary.
    if args.mutation_file_paths is None:
        raise UserInputError("No mutation file paths were given.")
    finalBedMutationPaths = list()
    for givenMutationPath in args.mutation_file_paths:
        checkIfPathExists(givenMutationPath)
        if not os.path.isdir(givenMutationPath):
            finalBedMutationPaths.append(os.path.abspath(givenMutationPath))
            continue
        for foundPath in getFilesInDirectory(givenMutationPath,
                                             DataTypeStr.mutations + ".bed"):
            finalBedMutationPaths.append(os.path.abspath(foundPath))

    if len(finalBedMutationPaths) == 0:
        raise UserInputError("No bed mutation files were found.")

    # A map given as a directory is named after that directory; a map given as a
    # file is named by getIsolatedParentDir.
    if args.nucleosome_maps is None:
        raise UserInputError("No nucleosome maps were given.")
    nucleosomeMapNames = list()
    for givenMapPath in args.nucleosome_maps:
        checkIfPathExists(givenMapPath)
        if os.path.isdir(givenMapPath):
            mapName = os.path.basename(givenMapPath)
        else:
            mapName = getIsolatedParentDir(os.path.abspath(givenMapPath))
        nucleosomeMapNames.append(mapName)

    if len(nucleosomeMapNames) == 0:
        raise UserInputError("No nucleosome maps were found.")

    # Determine what normalization method was selected.
    # Sequence context normalization takes precedence over a custom background.
    contextNormalization = args.context_normalization
    normalizationMethod = "No Normalization"
    customBackgroundDir = None
    if contextNormalization in (1, 2):
        normalizationMethod = "Singlenuc/Dinuc"
    elif contextNormalization in (3, 4):
        normalizationMethod = "Trinuc/Quadrunuc"
    elif contextNormalization in (5, 6):
        normalizationMethod = "Pentanuc/Hexanuc"
    elif args.background is not None:
        normalizationMethod = "Custom Background"
        # A background given as a file is represented by its parent directory.
        absoluteBackgroundPath = os.path.abspath(args.background)
        if os.path.isdir(args.background):
            customBackgroundDir = absoluteBackgroundPath
        else:
            customBackgroundDir = os.path.dirname(absoluteBackgroundPath)
        if args.generate_background_immediately:
            generateCustomBackground(customBackgroundDir, nucleosomeMapNames,
                                     args.singlenuc_radius, args.add_linker,
                                     args.nuc_group_radius)
    elif args.generate_background_immediately:
        raise UserInputError(
            "Background generation requested, but no background given.")

    # Duplicated inputs are removed before the analysis is run.
    uniqueMutationPaths = list(set(finalBedMutationPaths))
    uniqueMapNames = list(set(nucleosomeMapNames))
    runAnalysisSuite(uniqueMutationPaths, uniqueMapNames, normalizationMethod,
                     customBackgroundDir, args.singlenuc_radius,
                     args.add_linker, args.nuc_group_radius)
Beispiel #9
0
def generateMutationContextFrequencyFile(mutationFilePath,
                                         mutationContextFrequencyFilePath,
                                         contextNum, contextText,
                                         acceptableChromosomes):
    """
    Count the sequence contexts in a mutation file and write their frequencies.

    Each line of the tab-separated mutation file supplies a chromosome (first
    column) and the bases surrounding the mutation (fourth column).  A window
    of contextNum bases is cut from the middle of the surrounding bases; the
    window position is determined from the first line and reused for every
    other line.

    The output file starts with a "Total Mutations" line and a header line,
    followed by one line per context (in sorted order) giving the context, its
    number of occurrences, and its share of the total.

    Raises:
        UserInputError: The first line has fewer surrounding bases than
            contextNum, or a line's chromosome is not in acceptableChromosomes.
    """

    occurrencesByContext = dict()  # Every context encountered and how often it occurred.

    # Slice bounds of the context window; set once the first line has been read.
    windowStart = None
    windowEnd = None

    # Read through the mutation bed file and count contexts.
    with open(mutationFilePath, 'r') as mutationFile:
        for mutationLine in mutationFile:

            fields = mutationLine.strip().split('\t')
            flankingBases = fields[3]

            if windowStart is None:

                # Make sure the file has sufficient information to generate the requested context
                if len(flankingBases) < contextNum:
                    raise UserInputError(
                        "The given mutation file does not have enough information to produce a "
                        + contextText + " context.")

                # Center the window on the middle of the surrounding bases.
                centerIndex = len(flankingBases) / 2 - 0.5
                halfWidth = contextNum / 2 - 0.5
                windowStart = int(centerIndex - halfWidth)
                windowEnd = int(centerIndex + halfWidth + 1)

            # Make sure we didn't encounter an invalid chromosome.
            chromosome = fields[0]
            if chromosome not in acceptableChromosomes:
                raise UserInputError(
                    "Encountered " + chromosome +
                    " which is not a valid chromosome for this genome.")

            context = flankingBases[windowStart:windowEnd]
            occurrencesByContext[context] = occurrencesByContext.get(context, 0) + 1

    # Every counted context corresponds to exactly one mutation.
    totalMutations = sum(occurrencesByContext.values())

    with open(mutationContextFrequencyFilePath, 'w') as frequencyFile:

        frequencyFile.write("Total Mutations: " + str(totalMutations) + '\n')
        # Output headers for the data.
        frequencyFile.write('\t'.join((contextText, "Occurrences", "Frequency")) + '\n')

        # One line per context: the sequence, its count, and its frequency
        # with respect to the total number of mutations.
        for context in sorted(occurrencesByContext):
            occurrences = occurrencesByContext[context]
            frequency = int(occurrences) / totalMutations
            frequencyFile.write(
                '\t'.join((context, str(occurrences), str(frequency))) + '\n')
def runNucleosomeMutationAnalysis(nucleosomeMutationCountsFilePaths: List[str],
                                  outputFilePath: str,
                                  overridePeakPeriodicityWithExpected,
                                  alignStrands,
                                  filePathGroup1: List[str] = list(),
                                  filePathGroup2: List[str] = list()):
    """
    Validate the inputs and call the RunNucleosomeMutationAnalysis.R script on them.

    The inputs are written, one per line, to "$HOME/.mutperiod/R_inputs.txt":
    the '$'-joined counts file paths, the output file path, the two '$'-joined
    file path groups (only when a grouped comparison is requested), the
    periodicity override flag, the '$'-joined expected periods, and the strand
    alignment flag.  The path of that file is passed to the R script.

    The output file is opened for writing during validation, so it is created
    (or emptied) before the R script runs.  The default file path groups are
    only read, never modified.

    Raises:
        UserInputError: Exactly one of the two file path groups is empty, or no
            counts files were given.
        InvalidPathError: The output file path does not end with ".rda" or
            ".tsv", or it is not writeable.
    """

    # Check for valid input.  The groups must be both empty or both populated.
    groupOneIsEmpty = len(filePathGroup1) == 0
    groupTwoIsEmpty = len(filePathGroup2) == 0
    if groupOneIsEmpty != groupTwoIsEmpty:
        raise UserInputError(
            "One file path group contains file paths, but the other is empty.")
    if len(nucleosomeMutationCountsFilePaths) == 0:
        raise UserInputError("No nucleosome counts files given.")
    if not outputFilePath.endswith((".rda", ".tsv")):
        raise InvalidPathError(
            outputFilePath,
            "Given output file does not end with \".rda\" or \".tsv\":")
    try:
        with open(outputFilePath, 'w'):
            pass
    except IOError:
        raise InvalidPathError(outputFilePath,
                               "Given output file path is not writeable: ")

    # Retrieve the expected periods for each of the given counts files.
    expectedPeriods = list()
    for countsFilePath in nucleosomeMutationCountsFilePaths:
        expectedPeriods.append(str(getExpectedPeriod(countsFilePath)))

    # Write the inputs to a temporary file to be read by the R script
    inputsFilePath = os.path.join(os.getenv("HOME"), ".mutperiod",
                                  "R_inputs.txt")

    with open(inputsFilePath, 'w') as inputsFile:
        runGroupedComparison = not (groupOneIsEmpty and groupTwoIsEmpty)
        if runGroupedComparison:
            print(
                "Generating inputs to run analysis with grouped comparison...")
        else:
            print(
                "Generating inputs to run analysis without grouped comparison..."
            )

        # The two group lines are only present for a grouped comparison.
        inputLines = ['$'.join(nucleosomeMutationCountsFilePaths), outputFilePath]
        if runGroupedComparison:
            inputLines.append('$'.join(filePathGroup1))
            inputLines.append('$'.join(filePathGroup2))
        inputLines.append(str(overridePeakPeriodicityWithExpected))
        inputLines.append('$'.join(expectedPeriods))
        inputLines.append(str(alignStrands))

        inputsFile.write('\n'.join(inputLines) + '\n')

    # Call the R script
    print("Calling R script...")
    rScriptPath = os.path.join(rScriptsDirectory,
                               "RunNucleosomeMutationAnalysis.R")
    subprocess.run(("Rscript", rScriptPath, inputsFilePath), check=True)

    print("Results can be found at", outputFilePath)
def generateNucleosomeMutationBackground(mutationBackgroundFilePaths,
                                         nucleosomeMapNames,
                                         useSingleNucRadius, useNucGroupRadius,
                                         linkerOffset):
    """
    Create nucleosome mutation background files from mutation background files.

    For every combination of mutation background file, nucleosome map, and
    requested radius (single nucleosome first, then nucleosome group), the dyad
    position context counts file is generated if it does not exist yet, and the
    nucleosome mutation background file is generated from it.

    The single nucleosome radius uses a dyad radius of 73 and the given linker
    offset; the nucleosome group radius uses a dyad radius of 1000 and no
    linker offset.

    Returns:
        A list of the paths to the generated nucleosome mutation background files.

    Raises:
        UserInputError: Neither radius was requested.
        InvalidPathError: A given file's name does not contain the mutation
            background data type string.
    """

    if not useSingleNucRadius and not useNucGroupRadius:
        raise UserInputError(
            "Must generate background in either a single nucleosome or group nucleosome radius."
        )

    # The radius types to generate, as "usesNucGroup" flags, in the order they are processed.
    radiusModes = list()
    if useSingleNucRadius:
        radiusModes.append(False)
    if useNucGroupRadius:
        radiusModes.append(True)

    generatedFilePaths = list()  # The paths to the output files generated by the function

    # Loop through each given mutation background file path, creating the corresponding nucleosome mutation background(s) for each.
    for backgroundFilePath in mutationBackgroundFilePaths:

        backgroundFileName = os.path.basename(backgroundFilePath)
        print("\nWorking with", backgroundFileName)
        if DataTypeStr.mutBackground not in backgroundFileName:
            raise InvalidPathError(
                backgroundFilePath,
                "Given file path does not have \"" +
                DataTypeStr.mutBackground + "\" in the file name.")

        for nucleosomeMapName in nucleosomeMapNames:

            print("Counting with nucleosome map:", nucleosomeMapName)

            # Get metadata (Assumes that the metadata has already been generated from a call to countNucleosomePositionMutations)
            nucleosomeMapDataDir = os.path.join(
                os.path.dirname(backgroundFilePath), nucleosomeMapName)
            metadata = Metadata(nucleosomeMapDataDir)

            # Determine the context of the mutation background file
            contextNum = getContext(backgroundFilePath, asInt=True)
            contextText = getContext(backgroundFilePath)
            print("Given mutation background is in", contextText, "context.")

            for usesNucGroup in radiusModes:

                # Set the dyad radius (And linker offset)
                if usesNucGroup:
                    dyadRadius, currentLinkerOffset = 1000, 0
                else:
                    dyadRadius, currentLinkerOffset = 73, linkerOffset

                # Generate the path to the tsv file of dyad position context counts
                dyadPosContextCountsFilePath = generateFilePath(
                    directory=os.path.dirname(metadata.baseNucPosFilePath),
                    dataGroup=metadata.nucPosName,
                    context=contextText,
                    linkerOffset=currentLinkerOffset,
                    usesNucGroup=usesNucGroup,
                    dataType="dyad_pos_counts",
                    fileExtension=".tsv")

                # Make sure we have a tsv file with the appropriate context counts at each dyad position.
                if not os.path.exists(dyadPosContextCountsFilePath):
                    print(
                        "Dyad position " + contextText +
                        " counts file not found at",
                        dyadPosContextCountsFilePath)
                    print("Generating genome wide dyad position " +
                          contextText + " counts file...")
                    # Make sure we have a fasta file for strongly positioned nucleosome coordinates
                    nucPosFastaFilePath = generateNucleosomeFasta(
                        metadata.baseNucPosFilePath, metadata.genomeFilePath,
                        dyadRadius, currentLinkerOffset)
                    generateDyadPosContextCounts(nucPosFastaFilePath,
                                                 dyadPosContextCountsFilePath,
                                                 contextNum, dyadRadius,
                                                 currentLinkerOffset)

                # A path to the final output file.
                outputFilePath = generateFilePath(
                    directory=metadata.directory,
                    dataGroup=metadata.dataGroupName,
                    context=contextText,
                    linkerOffset=currentLinkerOffset,
                    usesNucGroup=usesNucGroup,
                    dataType=DataTypeStr.nucMutBackground,
                    fileExtension=".tsv")

                # Generate the nucleosome mutation background file!
                generateNucleosomeMutationBackgroundFile(
                    dyadPosContextCountsFilePath, backgroundFilePath,
                    outputFilePath, dyadRadius, currentLinkerOffset)

                generatedFilePaths.append(outputFilePath)

    return generatedFilePaths
Beispiel #12
0
def countNucleosomePositionMutations(mutationFilePaths, nucleosomeMapNames,
                                     countSingleNuc, countNucGroup,
                                     linkerOffset):
    """
    Count mutations at nucleosome positions for every pairing of the given
    mutation files and nucleosome maps.

    Special case: when exactly one mutation file and one nucleosome map are
    given, and the file's base name (minus its final extension) equals the
    map name, the map is counted against itself in a 1000 bp radius.  Only
    that single counts file path is returned, and the count flags are not
    consulted.

    Parameters:
        mutationFilePaths: Sequence of mutation file paths.  Each file name
            must contain DataTypeStr.mutations.
        nucleosomeMapNames: Sequence of nucleosome map names.  Each one names
            a sub-directory (next to the mutation file) that receives output.
        countSingleNuc: If truthy, count in a (73 + linkerOffset) bp radius.
        countNucGroup: If truthy, count in a 1000 bp radius.
        linkerOffset: Amount of linker DNA added to the 73 bp radius.

    Returns:
        A list of the counts file paths that were generated, in the order
        they were generated.

    Raises:
        UserInputError: If neither countSingleNuc nor countNucGroup is set.
        InvalidPathError: If a mutation file's name lacks DataTypeStr.mutations.
    """

    # Special case: a lone nucleosome map paired with itself is counted
    # against itself (used to determine the nucleosome repeat length).
    isSelfCount = (
        len(mutationFilePaths) == 1 and len(nucleosomeMapNames) == 1
        and os.path.basename(mutationFilePaths[0]).rsplit('.', 1)[0]
        == nucleosomeMapNames[0])

    if isSelfCount:

        selfMapFilePath = mutationFilePaths[0]
        selfMapName = nucleosomeMapNames[0]
        selfMapDirectory = os.path.dirname(selfMapFilePath)

        print("Counting nucleosome map", selfMapName,
              "against itself in a 1000 bp radius.")

        selfCountsFilePath = generateFilePath(
            directory=selfMapDirectory,
            dataGroup=selfMapName,
            usesNucGroup=True,
            fileExtension=".tsv",
            dataType="self_" + DataTypeStr.rawNucCounts)

        # The acceptable chromosomes are looked up from the directory one
        # level above the one holding the nucleosome map.
        selfAcceptableChromosomes = getAcceptableChromosomes(
            os.path.dirname(selfMapDirectory))

        selfCounter = NucleosomesInNucleosomesCounter(
            selfMapFilePath,
            selfMapFilePath,
            selfCountsFilePath,
            encompassingFeatureExtraRadius=1000,
            acceptableChromosomes=selfAcceptableChromosomes)
        selfCounter.count()

        return [selfCountsFilePath]

    # At least one counting radius has to be requested.
    if not countSingleNuc and not countNucGroup:
        raise UserInputError(
            "Must count in either a single nucleosome or group nucleosome radius."
        )

    outputFilePaths = []  # Paths to every counts file this call produces.
    nucleosomeMapSortingChecked = False  # Flipped once the first mutation file is done.

    for mutationFilePath in mutationFilePaths:

        mutationFileName = os.path.basename(mutationFilePath)
        mutationFileDirectory = os.path.dirname(mutationFilePath)

        print("\nWorking with", mutationFileName)

        # Reject anything that doesn't look like a pipeline mutation file.
        if DataTypeStr.mutations not in mutationFileName:
            raise InvalidPathError(
                mutationFilePath,
                "Given mutation file does not have \"" +
                DataTypeStr.mutations + "\" in the name.",
                postPathMessage=
                "Are you sure you inputted a file from the mutperiod pipeline?"
            )

        for nucleosomeMapName in nucleosomeMapNames:

            print("Counting with nucleosome map:", nucleosomeMapName)

            # Each nucleosome map gets its own directory beside the mutation file.
            mapDataDirectory = os.path.join(mutationFileDirectory,
                                            nucleosomeMapName)
            checkDirs(mapDataDirectory)

            # First visit to this directory: derive its metadata from the parent's.
            metadataFilePath = os.path.join(mapDataDirectory, ".metadata")
            if not os.path.exists(metadataFilePath):

                print("No metadata found.  Generating...")

                parentMetadata = Metadata(mutationFilePath)

                # The nucleosome map may supply a suffix for the data group name.
                dataGroupName = parentMetadata.dataGroupName
                suffixFilePath = os.path.join(
                    os.path.dirname(parentMetadata.genomeFilePath),
                    nucleosomeMapName, "append_to_data_name.txt")
                if os.path.exists(suffixFilePath):
                    with open(suffixFilePath) as suffixFile:
                        dataGroupName += suffixFile.readline().strip()

                generateMetadata(
                    dataGroupName,
                    parentMetadata.genomeName,
                    os.path.join("..", parentMetadata.localParentDataPath),
                    parentMetadata.inputFormat,
                    mapDataDirectory,
                    *parentMetadata.cohorts,
                    callParamsFilePath=parentMetadata.callParamsFilePath,
                    associatedNucleosomePositions=nucleosomeMapName)

            # The metadata supplies the nucleosome positions file and genome.
            metadata = Metadata(mapDataDirectory)
            acceptableChromosomes = getAcceptableChromosomes(
                metadata.genomeFilePath)

            # The mutation file is always checked for sorting; the nucleosome
            # map only while working through the first mutation file.
            sortingChecks = (True, not nucleosomeMapSortingChecked)

            # Single nucleosome radius, if requested.
            if countSingleNuc:

                singleNucCountsFilePath = generateFilePath(
                    directory=metadata.directory,
                    dataGroup=metadata.dataGroupName,
                    linkerOffset=linkerOffset,
                    fileExtension=".tsv",
                    dataType=DataTypeStr.rawNucCounts)

                print(
                    "Counting mutations at each nucleosome position in a 73 bp radius +",
                    str(linkerOffset), "bp linker DNA.")
                singleNucCounter = MutationsInNucleosomesCounter(
                    mutationFilePath,
                    metadata.baseNucPosFilePath,
                    singleNucCountsFilePath,
                    encompassingFeatureExtraRadius=73 + linkerOffset,
                    acceptableChromosomes=acceptableChromosomes,
                    checkForSortedFiles=sortingChecks)
                singleNucCounter.count()

                outputFilePaths.append(singleNucCountsFilePath)

            # Nucleosome group radius, if requested.
            if countNucGroup:

                nucGroupCountsFilePath = generateFilePath(
                    directory=metadata.directory,
                    dataGroup=metadata.dataGroupName,
                    usesNucGroup=True,
                    fileExtension=".tsv",
                    dataType=DataTypeStr.rawNucCounts)

                print(
                    "Counting mutations at each nucleosome position in a 1000 bp radius."
                )
                nucGroupCounter = MutationsInNucleosomesCounter(
                    mutationFilePath,
                    metadata.baseNucPosFilePath,
                    nucGroupCountsFilePath,
                    encompassingFeatureExtraRadius=1000,
                    acceptableChromosomes=acceptableChromosomes,
                    checkForSortedFiles=sortingChecks)
                nucGroupCounter.count()

                outputFilePaths.append(nucGroupCountsFilePath)

        nucleosomeMapSortingChecked = True

    return outputFilePaths
Beispiel #13
0
# Get the user's input from the dialog.
selections: Selections = dialog.selections

# The path to the original bed mutation file.
mutationFilePath = list(selections.getFilePaths())[0]

# Read the toggle states once and index into that single list, rather than
# re-fetching and re-copying them for every value.
# NOTE(review): assumes getToggleStates() is a plain getter without side effects - confirm.
toggleStates = list(selections.getToggleStates())

# Toggle layout, by index:
#   0-5: one bool per mutation type, telling which mutations to filter.
#   6:   should the selected mutations be omitted?
#   7:   should the selected mutations be kept, and all others omitted?
#   8:   should the mutations be filtered one at a time (one file each),
#        or all together in one file?
shouldMutationsBeFiltered = toggleStates[0:6]
omit = toggleStates[6]
keep = toggleStates[7]
createManyFiles = toggleStates[8]

# If mutations need to be filtered all at once, we need to keep track of them.
mutationsToFilter = []

# Exactly one of omit/keep must be selected; both or neither is an error.
if omit == keep:
    raise UserInputError(
        "Error: You must select only one option, omit OR keep.")

print("Working in file", os.path.split(mutationFilePath)[1])

# Send the selected mutations to the filterMutations function to be kicked to the curb.
# In "many files" mode each selected mutation is filtered immediately on its own;
# otherwise the selected mutations are collected for the single call below.
for i, shouldMutationBeFiltered in enumerate(shouldMutationsBeFiltered):
    if not shouldMutationBeFiltered:
        continue
    if createManyFiles:
        filterMutations(mutationFilePath, omit, mutations[i])
    else:
        mutationsToFilter.append(mutations[i])

# Single-file mode: filter every collected mutation in one call.
if not createManyFiles:
    filterMutations(mutationFilePath, omit, *mutationsToFilter)