def parsePreparedInput(inputFilePaths: List[str], genomeFilePath, checkEachLine = True):
    """
    Validate prepared mutation input files and generate metadata for each one.

    For every path in inputFilePaths, the file name is checked against the expected naming scheme
    (the data group name derived from the parent directory, immediately followed by the context,
    and ending in the mutations data type plus ".bed"). The first line of the file is then passed
    to checkForErrors, as is every following line when checkEachLine is true. If all checks pass,
    the metadata is generated and the mutation count reported by "wc -l" is added to it.

    Raises UserInputError if no context can be determined from the file name, and InvalidPathError
    if the file name does not follow the expected naming scheme.
    """
    for inputFilePath in inputFilePaths:

        inputFileBasename = os.path.basename(inputFilePath)
        print("\nWorking in", inputFileBasename)

        # Check the file name first; the contents are only inspected once the name is valid.
        dataGroupName = getIsolatedParentDir(inputFilePath)
        inputFileContext = getContext(inputFilePath)
        if inputFileContext is None:
            raise UserInputError("No context is apparent from the given prepared input file.")

        # Everything before "_<context>" must be exactly the data group name.
        nameBeforeContext = inputFileBasename.split('_' + inputFileContext)[0]
        if nameBeforeContext != dataGroupName:
            unexpectedNameMessage = (
                "Prepared input file is not named as expected given the data group name generated from the "
                "parent directory. Expected: \"" + dataGroupName + "\" immediately preceding the context definition "
                "but given file path is:")
            raise InvalidPathError(inputFilePath, unexpectedNameMessage)

        requiredSuffix = DataTypeStr.mutations + ".bed"
        if not inputFileBasename.endswith(requiredSuffix):
            missingSuffixMessage = (
                "Prepared input file is not named properly to indicate the presence of mutation data. "
                "Expected a file ending in \"" + requiredSuffix + "\" but given path is:")
            raise InvalidPathError(inputFilePath, missingSuffixMessage)

        acceptableChromosomes = getAcceptableChromosomes(genomeFilePath)
        acceptableChromosomesFilePath = getAcceptableChromosomes(genomeFilePath, True)

        # Perform QA with the checkForErrors function.
        print("Checking for errors in line formatting...")
        with open(inputFilePath, 'r') as inputFile:

            # The first line decides whether a cohort designation (a 7th column) is expected in the file.
            firstLineFields = inputFile.readline().strip().split('\t')
            cohortDesignationPresent = (len(firstLineFields) == 7)
            checkForErrors(firstLineFields, cohortDesignationPresent,
                           acceptableChromosomes, acceptableChromosomesFilePath)

            if checkEachLine:
                for line in inputFile:
                    checkForErrors(line.strip().split('\t'), cohortDesignationPresent,
                                   acceptableChromosomes, acceptableChromosomesFilePath)

        # If everything else looks good, generate the metadata. This directory is now ready to go!
        print("Checks passed. Generating metadata, including mutation counts using a call to wc -l")
        generateMetadata(dataGroupName, getIsolatedParentDir(genomeFilePath), inputFileBasename,
                         InputFormat.prepared, os.path.dirname(inputFilePath))

        # The first whitespace-separated token of the "wc -l" output is the line count.
        wcOutput = subprocess.check_output(("wc", "-l", inputFilePath), encoding = "UTF-8")
        featureCounts = int(wcOutput.split()[0])
        Metadata(inputFilePath).addMetadata(Metadata.AddableKeys.mutCounts, featureCounts)
def runAnalysisSuite(mutationFilePaths: List[str], nucleosomeMapNames: List[str], normalizationMethod,
                     customBackgroundDir, useSingleNucRadius, includeLinker, useNucGroupRadius,
                     includeAlternativeScaling=False):
    """
    Run the nucleosome mutation analysis on the given mutation files.

    Mutations are counted at each dyad position for the given mutation files and nucleosome maps.
    Depending on normalizationMethod, the counts are then normalized by sequence context, normalized
    with a custom background directory, or left as raw counts.

    normalizationMethod must be one of "Singlenuc/Dinuc", "Trinuc/Quadrunuc", "Pentanuc/Hexanuc",
    "No Normalization" or "Custom Background". A linker offset of 30 is used when includeLinker is
    true, 0 otherwise. customBackgroundDir and includeAlternativeScaling are only forwarded for the
    "Custom Background" method.

    Raises:
        UserInputError: if no radius is selected, if no mutation files or nucleosome maps are given,
            or if a mixed context file is to be normalized by sequence context.
        InvalidPathError: if the context of a mutation file cannot be determined from its name.
        ValueError: if normalizationMethod is not one of the recognized strings.
    """

    # Make sure at least one radius was selected.
    if not useNucGroupRadius and not useSingleNucRadius:
        raise UserInputError("Must select at least one radius.")

    # Make sure at least one mutation and one nucleosome file was found.
    if len(mutationFilePaths) == 0:
        raise UserInputError("No valid input files given.")
    if len(nucleosomeMapNames) == 0:
        raise UserInputError("No valid nucleosome map files given")

    # Convert background context to int (None means no sequence context normalization).
    if normalizationMethod == "Singlenuc/Dinuc":
        normalizationMethodNum = 1
    elif normalizationMethod == "Trinuc/Quadrunuc":
        normalizationMethodNum = 3
    elif normalizationMethod == "Pentanuc/Hexanuc":
        normalizationMethodNum = 5
    elif normalizationMethod in ("No Normalization", "Custom Background"):
        normalizationMethodNum = None
    else:
        raise ValueError("Matching strings is hard.")

    # Set the linker offset
    linkerOffset = 30 if includeLinker else 0

    ### Ensure that every mutation file has a context sufficient for the requested background.
    # Create a new list of mutation file paths, replacing any with contexts that are too low.
    if normalizationMethodNum is not None:

        print("\nExpanding file context where necessary...\n")
        updatedMutationFilePaths = list()

        for mutationFilePath in mutationFilePaths:

            mutationFileContext = getContext(mutationFilePath, True)

            # Some error checking...
            if mutationFileContext is None:
                raise InvalidPathError(os.path.basename(mutationFilePath),
                                       "Malformed file name. Context is not clear for:",
                                       "Are you sure the file was generated by mutperiod?")
            if mutationFileContext == 0:
                raise UserInputError("Mixed context files cannot be normalized by sequence context.")

            # The message must be built from the file's base name; concatenating the bare
            # os.path.basename function would raise a TypeError instead of reporting the file.
            assert mutationFileContext != -1, (
                "Wait, what? How did you even get this context for this input file? "
                + os.path.basename(mutationFilePath))

            if mutationFileContext < normalizationMethodNum:
                updatedMutationFilePaths += expandContext((mutationFilePath, ), normalizationMethodNum)
            else:
                updatedMutationFilePaths.append(mutationFilePath)

    else:
        updatedMutationFilePaths = mutationFilePaths

    ### Run the rest of the analysis.

    print("\nCounting mutations at each dyad position...")
    nucleosomeMutationCountsFilePaths = countNucleosomePositionMutations(
        updatedMutationFilePaths, nucleosomeMapNames, useSingleNucRadius, useNucGroupRadius, linkerOffset)

    if normalizationMethodNum is not None:

        print("\nGenerating genome-wide mutation background...")
        mutationBackgroundFilePaths = generateMutationBackground(updatedMutationFilePaths,
                                                                 normalizationMethodNum)

        print("\nGenerating nucleosome mutation background...")
        nucleosomeMutationBackgroundFilePaths = generateNucleosomeMutationBackground(
            mutationBackgroundFilePaths, nucleosomeMapNames, useSingleNucRadius, useNucGroupRadius,
            linkerOffset)

        print("\nNormalizing counts with nucleosome background data...")
        normalizeCounts(nucleosomeMutationBackgroundFilePaths)

    elif normalizationMethod == "Custom Background":
        print("\nNormalizing counts using custom background data...")
        normalizeCounts(list(), nucleosomeMutationCountsFilePaths, customBackgroundDir,
                        includeAlternativeScaling)
def generateMutationBackground(mutationFilePaths, backgroundContextNum):
    """
    Generate a background mutation rate file for each of the given mutation files.

    For every mutation file, the genome context frequency file (created if it does not exist yet)
    and a freshly generated mutation context frequency file are combined into a mutation background
    file. Files whose context number is even use a background context one larger than
    backgroundContextNum.

    Returns the list of paths to the generated mutation background files, in input order.
    Raises InvalidPathError if a file name does not contain the mutations data type string.
    """

    # Text used in file names for each supported context number.
    contextNumToText = {1: "singlenuc", 2: "dinuc", 3: "trinuc",
                        4: "quadrunuc", 5: "pentanuc", 6: "hexanuc"}

    # The paths to the output files generated by the function.
    mutationBackgroundFilePaths = list()

    for mutationFilePath in mutationFilePaths:

        # Retrieve metadata
        metadata = Metadata(mutationFilePath)
        intermediateFilesDirectory = os.path.join(metadata.directory, "intermediate_files")

        # If necessary, adjust the context for files with even-length features.
        hasEvenContext = getContext(mutationFilePath, asInt=True) % 2 == 0
        thisBackgroundContextNum = backgroundContextNum + 1 if hasEvenContext else backgroundContextNum

        # Set the name of the type of context being used.
        assert thisBackgroundContextNum in contextNumToText, (
            "Unexpected background context number: " + str(thisBackgroundContextNum))
        contextText = contextNumToText[thisBackgroundContextNum]

        # Get the list of acceptable chromosomes
        acceptableChromosomes = getAcceptableChromosomes(metadata.genomeFilePath)

        mutationFileName = os.path.basename(mutationFilePath)
        print("\nWorking in:", mutationFileName)
        if DataTypeStr.mutations not in mutationFileName:
            raise InvalidPathError(
                mutationFilePath,
                "Given mutation file does not have \"" + DataTypeStr.mutations + "\" in the name.",
                postPathMessage="Are you sure you inputted a file from the mutperiod pipeline?")

        # Genome context frequencies live next to the genome file.
        genomeContextFrequencyFilePath = generateFilePath(
            directory=os.path.dirname(metadata.genomeFilePath), dataGroup=metadata.genomeName,
            context=contextText, dataType="frequency", fileExtension=".tsv")

        # Mutation context frequencies are an intermediate product.
        mutationContextFrequencyFilePath = generateFilePath(
            directory=intermediateFilesDirectory, dataGroup=metadata.dataGroupName,
            context=contextText, dataType="mutation_frequencies", fileExtension=".tsv")

        # The background mutation rate file is the final output for this mutation file.
        mutationBackgroundFilePath = generateFilePath(
            directory=metadata.directory, dataGroup=metadata.dataGroupName,
            context=contextText, dataType=DataTypeStr.mutBackground, fileExtension=".tsv")

        # If the genome context frequency file doesn't exist, create it.
        if not os.path.exists(genomeContextFrequencyFilePath):
            print("Genome", contextText, "context frequency file not found at path:",
                  genomeContextFrequencyFilePath)
            print("Generating genome " + contextText + " context frequency file...")
            generateGenomeContextFrequencyFile(metadata.genomeFilePath, genomeContextFrequencyFilePath,
                                               thisBackgroundContextNum, contextText,
                                               acceptableChromosomes)

        # Create a directory for intermediate files if it does not already exist...
        if not os.path.exists(intermediateFilesDirectory):
            os.mkdir(intermediateFilesDirectory)

        # Create the mutation context frequency file.
        print("Generating mutation context frequency file...")
        generateMutationContextFrequencyFile(mutationFilePath, mutationContextFrequencyFilePath,
                                             thisBackgroundContextNum, contextText,
                                             acceptableChromosomes)

        # Generate the mutation background file.
        generateMutationBackgroundFile(genomeContextFrequencyFilePath, mutationContextFrequencyFilePath,
                                       mutationBackgroundFilePath, contextText)

        mutationBackgroundFilePaths.append(mutationBackgroundFilePath)

    return mutationBackgroundFilePaths
def getFilePathGroup(potentialFilePaths, normalizationMethods: List[int], singleNuc, nucGroup,
                     acceptableMSCohorts: List[str], acceptableMutSigCohorts: List[str],
                     acceptableCustomCohorts: List[str], acceptableNucleosomeMaps: List[str]):
    """
    Return the file paths from potentialFilePaths that satisfy every active filter.

    A filter is only active when its argument is non-empty (or, for the radius filter, when at
    least one of singleNuc / nucGroup is true):

    - normalizationMethods: the file must be a raw nucleosome counts file (accepted when 0 is in
      normalizationMethods) or a normalized nucleosome counts file whose context number is in
      normalizationMethods.
    - singleNuc / nucGroup: the file's radius type (as reported by checkForNucGroup) must have
      been requested.
    - acceptableMSCohorts / acceptableMutSigCohorts / acceptableCustomCohorts: for every non-empty
      category, at least one of the file's cohorts must be in that category.
    - acceptableNucleosomeMaps: the file's nucleosome map name must be in the list.

    The relative order of potentialFilePaths is preserved in the returned list.
    """

    # Only cohort categories with at least one entry restrict the result.
    activeCohortGroups = [cohortGroup for cohortGroup in
                          (acceptableMSCohorts, acceptableMutSigCohorts, acceptableCustomCohorts)
                          if len(cohortGroup) != 0]
    filterByNucleosomeMap = len(acceptableNucleosomeMaps) != 0

    filePathGroup = list()  # The file paths to be returned.

    for potentialFilePath in potentialFilePaths:

        potentialFileName = os.path.basename(potentialFilePath)

        # Does it satisfy the normalization methods qualification?
        # (Also ensure that we have nucleosome counts, whether raw or normalized.)
        if len(normalizationMethods) != 0:
            isAcceptedRaw = (DataTypeStr.rawNucCounts in potentialFileName
                             and 0 in normalizationMethods)
            # The context is only looked up for normalized files that were not accepted as raw.
            if not isAcceptedRaw and not (
                    DataTypeStr.normNucCounts in potentialFileName
                    and getContext(potentialFilePath, True) in normalizationMethods):
                continue

        # Does it satisfy the nucleosome radius qualification?
        if singleNuc or nucGroup:
            radiusRequested = nucGroup if checkForNucGroup(potentialFilePath) else singleNuc
            if not radiusRequested:
                continue

        # The metadata is only needed (and only built, once) for the cohort and nucleosome map filters.
        if activeCohortGroups or filterByNucleosomeMap:
            metadata = Metadata(potentialFilePath)

            # Does it belong to one of the acceptable cohorts in each category?
            if activeCohortGroups:
                filePathCohortDesignations = tuple(metadata.cohorts)
                if not all(any(cohort in cohortGroup for cohort in filePathCohortDesignations)
                           for cohortGroup in activeCohortGroups):
                    continue

            # Does it belong to one of the acceptable nucleosome maps given?
            if filterByNucleosomeMap and metadata.nucPosName not in acceptableNucleosomeMaps:
                continue

        # If we've made it this far, add the file path to the return group!
        filePathGroup.append(potentialFilePath)

    return filePathGroup
def normalizeCounts(backgroundCountsFilePaths: List[str], customRawCountsFilePaths: List[str] = list(),
                    customBackgroundCountsDir=None, includeAlternativeScaling=False):
    """
    Normalize raw nucleosome mutation counts against their background counts using an R script.

    The background/raw pairs are obtained from getBackgroundRawPairs(backgroundCountsFilePaths) and,
    when customBackgroundCountsDir is given, extended with the pairs from
    getCustomBackgroundRawPairs(customRawCountsFilePaths, customBackgroundCountsDir). For each pair,
    "NormalizeNucleosomeMutationCounts.R" is run through Rscript to write the normalized counts file.

    When includeAlternativeScaling is true, an extra scaling factor argument is passed to the
    R script: "1" for sequence context normalization (no custom background directory), otherwise
    the ratio of the parent feature counts of the background to those of the raw data.

    When a custom background directory is used, a small text file documenting that directory and
    the current date is written for every directory containing custom raw counts files.

    Returns the list of paths to the normalized counts files.
    Raises subprocess.CalledProcessError if the R script exits with a non-zero status.
    """
    # NOTE: the default for customRawCountsFilePaths is a shared list object; it is only read in
    # this function and must never be mutated here.

    normalizedCountsFilePaths = list()

    backgroundRawPairs = getBackgroundRawPairs(backgroundCountsFilePaths)

    # Get the background-raw pairs from the custom directories, if they were given.
    if customBackgroundCountsDir is not None:
        customBackgroundRawPairs = getCustomBackgroundRawPairs(customRawCountsFilePaths,
                                                               customBackgroundCountsDir)
        for customBackgroundCountsFilePath in customBackgroundRawPairs:
            assert customBackgroundCountsFilePath not in backgroundRawPairs, "Unexpected intersection!"
            backgroundRawPairs[customBackgroundCountsFilePath] = \
                customBackgroundRawPairs[customBackgroundCountsFilePath]

    # Iterate through each background + raw counts pair
    for backgroundCountsFilePath in backgroundRawPairs:
        for rawCountsFilePath in backgroundRawPairs[backgroundCountsFilePath]:

            print("\nWorking with", os.path.basename(rawCountsFilePath), "and",
                  os.path.basename(backgroundCountsFilePath))

            metadata = Metadata(rawCountsFilePath)

            # Generate the path to the normalized file.
            # A background that is itself a raw counts file is labeled as a custom context.
            if DataTypeStr.rawNucCounts in backgroundCountsFilePath:
                context = "custom_context"
            else:
                context = getContext(backgroundCountsFilePath)
            normalizedCountsFilePath = generateFilePath(
                directory=metadata.directory, dataGroup=metadata.dataGroupName, context=context,
                linkerOffset=getLinkerOffset(backgroundCountsFilePath),
                usesNucGroup=checkForNucGroup(backgroundCountsFilePath),
                dataType=DataTypeStr.normNucCounts, fileExtension=".tsv")

            # Prepare the arguments to the subprocess call.
            args = ["Rscript",
                    os.path.join(rScriptsDirectory, "NormalizeNucleosomeMutationCounts.R"),
                    rawCountsFilePath, backgroundCountsFilePath, normalizedCountsFilePath]

            # If alternative scaling is requested, determine the appropriate scaling factor and
            # add it to the arguments.
            if includeAlternativeScaling:

                # If we are normalizing by sequence context, just revert the automatic scaling.
                # Every element of the argument list must be a string; an int here would make
                # subprocess.run raise a TypeError.
                if customBackgroundCountsDir is None:
                    args.append("1")

                # If we are normalizing by a custom context, scale based on the relative sizes of
                # the parent background and raw data sets.
                else:
                    args.append(str(getParentDataFeatureCounts(backgroundCountsFilePath) /
                                    getParentDataFeatureCounts(rawCountsFilePath)))

            # Pass the file paths to the R script to generate the normalized counts file.
            print("Calling R script to generate normalized counts...")
            subprocess.run(args, check=True)

            normalizedCountsFilePaths.append(normalizedCountsFilePath)

    # Document where the custom background counts came from in each relevant directory.
    if customBackgroundCountsDir is not None:
        customRawCountsDirs = {os.path.dirname(customRawCountsFilePath)
                               for customRawCountsFilePath in customRawCountsFilePaths}
        for customRawCountsDir in customRawCountsDirs:
            metadata = Metadata(customRawCountsDir)
            customBackgroundInfoFilePath = generateFilePath(
                directory=metadata.directory, dataGroup=metadata.dataGroupName,
                dataType=DataTypeStr.customBackgroundInfo, fileExtension=".txt")
            with open(customBackgroundInfoFilePath, 'w') as customBackgroundInfoFile:
                customBackgroundInfoFile.write("Custom background directory: " +
                                               customBackgroundCountsDir + '\n')
                # Dropping everything after the last ':' removes the seconds from the timestamp.
                customBackgroundInfoFile.write("Last date used: " +
                                               str(datetime.datetime.now()).rsplit(':', 1)[0] + '\n')

    return normalizedCountsFilePaths
def generateNucleosomeMutationBackground(mutationBackgroundFilePaths, nucleosomeMapNames,
                                         useSingleNucRadius, useNucGroupRadius, linkerOffset):
    """
    Generate nucleosome mutation background files for each mutation background / nucleosome map pair.

    For every mutation background file and every nucleosome map name, one background file is
    generated per requested radius: the single nucleosome radius (dyad radius 73, using the given
    linkerOffset) and/or the nucleosome group radius (dyad radius 1000, linker offset 0). The dyad
    position context counts file needed for this is generated first if it does not exist yet.

    Returns the list of paths to the generated nucleosome mutation background files.
    Raises UserInputError if neither radius is requested and InvalidPathError if a given file name
    does not contain the mutation background data type string.
    """

    if not useSingleNucRadius and not useNucGroupRadius:
        raise UserInputError("Must generate background in either a single nucleosome or group nucleosome radius.")

    # Each entry is the "usesNucGroup" flag for one requested radius.
    # The single nucleosome radius (False) is always handled before the group radius (True).
    requestedRadiusModes = list()
    if useSingleNucRadius:
        requestedRadiusModes.append(False)
    if useNucGroupRadius:
        requestedRadiusModes.append(True)

    # The paths to the output files generated by the function.
    nucleosomeMutationBackgroundFilePaths = list()

    # Loop through each given mutation background file path, creating the corresponding
    # nucleosome mutation background(s) for each.
    for mutationBackgroundFilePath in mutationBackgroundFilePaths:

        mutationBackgroundFileName = os.path.basename(mutationBackgroundFilePath)
        print("\nWorking with", mutationBackgroundFileName)
        if DataTypeStr.mutBackground not in mutationBackgroundFileName:
            raise InvalidPathError(mutationBackgroundFilePath,
                                   "Given file path does not have \"" + DataTypeStr.mutBackground +
                                   "\" in the file name.")

        for nucleosomeMapName in nucleosomeMapNames:

            print("Counting with nucleosome map:", nucleosomeMapName)

            # Get metadata (Assumes that the metadata has already been generated from a call to
            # countNucleosomePositionMutations)
            metadata = Metadata(os.path.join(os.path.dirname(mutationBackgroundFilePath),
                                             nucleosomeMapName))

            # Determine the context of the mutation background file
            contextNum = getContext(mutationBackgroundFilePath, asInt=True)
            contextText = getContext(mutationBackgroundFilePath)
            print("Given mutation background is in", contextText, "context.")

            for usesNucGroup in requestedRadiusModes:

                # Set the dyad radius (And linker offset)
                if usesNucGroup:
                    dyadRadius, currentLinkerOffset = 1000, 0
                else:
                    dyadRadius, currentLinkerOffset = 73, linkerOffset

                # Generate the path to the tsv file of dyad position context counts
                dyadPosContextCountsFilePath = generateFilePath(
                    directory=os.path.dirname(metadata.baseNucPosFilePath),
                    dataGroup=metadata.nucPosName, context=contextText,
                    linkerOffset=currentLinkerOffset, usesNucGroup=usesNucGroup,
                    dataType="dyad_pos_counts", fileExtension=".tsv")

                # Make sure we have a tsv file with the appropriate context counts at each dyad position.
                if not os.path.exists(dyadPosContextCountsFilePath):
                    print("Dyad position " + contextText + " counts file not found at",
                          dyadPosContextCountsFilePath)
                    print("Generating genome wide dyad position " + contextText + " counts file...")
                    # Make sure we have a fasta file for strongly positioned nucleosome coordinates
                    nucPosFastaFilePath = generateNucleosomeFasta(
                        metadata.baseNucPosFilePath, metadata.genomeFilePath,
                        dyadRadius, currentLinkerOffset)
                    generateDyadPosContextCounts(nucPosFastaFilePath, dyadPosContextCountsFilePath,
                                                 contextNum, dyadRadius, currentLinkerOffset)

                # A path to the final output file.
                nucleosomeMutationBackgroundFilePath = generateFilePath(
                    directory=metadata.directory, dataGroup=metadata.dataGroupName,
                    context=contextText, linkerOffset=currentLinkerOffset,
                    usesNucGroup=usesNucGroup, dataType=DataTypeStr.nucMutBackground,
                    fileExtension=".tsv")

                # Generate the nucleosome mutation background file!
                generateNucleosomeMutationBackgroundFile(
                    dyadPosContextCountsFilePath, mutationBackgroundFilePath,
                    nucleosomeMutationBackgroundFilePath, dyadRadius, currentLinkerOffset)

                nucleosomeMutationBackgroundFilePaths.append(nucleosomeMutationBackgroundFilePath)

    return nucleosomeMutationBackgroundFilePaths
def generateNucleosomeMutationBackgroundFile(dyadPosContextCountsFilePath, mutationBackgroundFilePath,
                                             nucleosomeMutationBackgroundFilePath, dyadRadius,
                                             linkerOffset):
    """
    Write the expected mutations at each dyad position to a tab-separated background file.

    For every dyad position, the expected mutations are the sum over all contexts of the context's
    background mutation rate times the number of times that context occurs at the position. The
    minus strand uses the rate of the reverse complement of each context. The output has one header
    row and one row per dyad position, with columns for the plus strand, the minus strand, both
    strands, and the aligned strands (plus strand at the position added to the minus strand at the
    mirrored position).
    """

    # This is a bit weird. If the context number is even, we need to account for half positions,
    # but if the context number is odd, we need to keep in mind that there's one extra valid
    # position in the dyad range.
    usesHalfPositions = getContext(mutationBackgroundFilePath, asInt=True) % 2 == 0
    if usesHalfPositions:
        halfBaseOffset, extraDyadPos = 0.5, 0
    else:
        halfBaseOffset, extraDyadPos = 0, 1

    # Every dyad position included in the analysis, in ascending order.
    dyadPositions = [i + halfBaseOffset
                     for i in range(-dyadRadius - linkerOffset,
                                    dyadRadius + linkerOffset + extraDyadPos)]

    # Dictionaries of expected mutations for every dyad position, one for each strand.
    plusStrandNucleosomeMutationBackground = dict.fromkeys(dyadPositions, 0)
    minusStrandNucleosomeMutationBackground = dict.fromkeys(dyadPositions, 0)

    # Get the corresponding mutation background and context counts dictionaries.
    backgroundMutationRate = getGenomeBackgroundMutationRates(mutationBackgroundFilePath)
    dyadPosContextCounts = getDyadPosContextCounts(dyadPosContextCountsFilePath)

    # Calculate the expected mutation rates for each dyad position based on the context counts
    # at that position and that context's mutation rate.
    for dyadPos in dyadPosContextCounts:
        contextCounts = dyadPosContextCounts[dyadPos]
        for context in contextCounts:
            reverseContext = reverseCompliment(context)
            plusStrandNucleosomeMutationBackground[dyadPos] += (
                backgroundMutationRate[context] * contextCounts[context])
            minusStrandNucleosomeMutationBackground[dyadPos] += (
                backgroundMutationRate[reverseContext] * contextCounts[context])

    # Write the results of the dictionary to the nucleosome mutation background file.
    with open(nucleosomeMutationBackgroundFilePath, 'w') as nucleosomeMutationBackgroundFile:

        # Write the headers for the data.
        columnNames = ("Dyad_Position", "Expected_Mutations_Plus_Strand",
                       "Expected_Mutations_Minus_Strand", "Expected_Mutations_Both_Strands",
                       "Expected_Mutations_Aligned_Strands")
        nucleosomeMutationBackgroundFile.write('\t'.join(columnNames) + '\n')

        # Write the data for each dyad position.
        for dyadPos in dyadPositions:
            plusExpected = plusStrandNucleosomeMutationBackground[dyadPos]
            minusExpected = minusStrandNucleosomeMutationBackground[dyadPos]
            # The aligned value pairs the plus strand with the minus strand at the mirrored position.
            mirroredMinusExpected = minusStrandNucleosomeMutationBackground[-dyadPos]
            rowValues = (dyadPos, plusExpected, minusExpected,
                         plusExpected + minusExpected, plusExpected + mirroredMinusExpected)
            nucleosomeMutationBackgroundFile.write('\t'.join(str(value) for value in rowValues) + '\n')
def expandContext(inputBedFilePaths, expansionContextNum):
    """
    Expand the sequence context of each given mutation bed file and delete the original file.

    For every input file, the bed positions are expanded, the expanded coordinates are converted to
    fasta sequences using the genome file from the file's metadata, and a new mutation bed file
    with the expanded context is written to the metadata's directory. Files whose context number is
    even are expanded to expansionContextNum + 1. The input file is removed afterwards.

    expansionContextNum must be 3 or 5.

    Returns the list of paths to the expanded context files, in input order.
    Raises InvalidPathError if a file name does not contain the mutations data type string or if
    the file's context is not lower than the requested expansion context.
    """

    assert expansionContextNum in (3, 5), "Unexpected expansion context: " + str(expansionContextNum)

    # The paths to the output files generated by the function.
    expandedContextFilePaths = list()

    for inputBedFilePath in inputBedFilePaths:

        # Retrieve metadata
        metadata = Metadata(inputBedFilePath)

        # The context of the input file is needed twice below, so look it up only once.
        inputContextNum = getContext(inputBedFilePath, asInt=True)

        # If necessary, adjust the context for files with even-length features.
        if inputContextNum % 2 == 0:
            thisExpansionContextNum = expansionContextNum + 1
        else:
            thisExpansionContextNum = expansionContextNum

        # Make sure file names look valid.
        inputBedFileName = os.path.basename(inputBedFilePath)
        print("\nWorking in:", inputBedFileName)
        if DataTypeStr.mutations not in inputBedFileName:
            raise InvalidPathError(
                inputBedFilePath,
                "Given mutation file does not have \"" + DataTypeStr.mutations + "\" in the name.",
                postPathMessage="Are you sure you inputted a file from the mutperiod pipeline?")

        # Make sure the context of the input bed file is less than the expansion context.
        if inputContextNum >= thisExpansionContextNum:
            raise InvalidPathError(
                inputBedFilePath,
                "The given mutation file does not have a lower context "
                "than the desired output context.",
                postPathMessage="There is nothing to expand.")

        # Generate paths to intermediate data files.
        intermediateFilesDirectory = os.path.join(metadata.directory, "intermediate_files")

        bedExpansionFilePath = generateFilePath(
            directory=intermediateFilesDirectory, dataGroup=metadata.dataGroupName,
            dataType="intermediate_expansion", fileExtension=".bed")

        fastaReadsFilePath = generateFilePath(
            directory=intermediateFilesDirectory, dataGroup=metadata.dataGroupName,
            dataType="expanded_reads", fileExtension=".fa")

        # Generate a path to the final output file.
        expandedContextFilePath = generateFilePath(
            directory=metadata.directory, dataGroup=metadata.dataGroupName,
            context=thisExpansionContextNum, dataType=DataTypeStr.mutations, fileExtension=".bed")

        # Create a directory for intermediate files if it does not already exist...
        if not os.path.exists(intermediateFilesDirectory):
            os.mkdir(intermediateFilesDirectory)

        # Expand the nucleotide coordinates in the singlenuc context bed file as requested.
        expandBedPositions(inputBedFilePath, bedExpansionFilePath, thisExpansionContextNum)

        # Convert the expanded coordinates in the bed file to the referenced nucleotides in fasta format.
        bedToFasta(bedExpansionFilePath, metadata.genomeFilePath, fastaReadsFilePath)

        # Using the newly generated fasta file, create a new bed file with the expanded context.
        generateExpandedContext(inputBedFilePath, fastaReadsFilePath, expandedContextFilePath,
                                thisExpansionContextNum)

        expandedContextFilePaths.append(expandedContextFilePath)

        # Delete the input file, which has the same mutation information, but a smaller context.
        print("Deleting old mutation context file...")
        os.remove(inputBedFilePath)

    return expandedContextFilePaths