# NOTE: each runAll() below is the entry point of a separate pipeline module;
# the imports their bodies rely on are collected here for readability.
import os
import shutil

import numpy as np

import common
import countfile
import trimfile
import mapfile
import normalizefile
import segmentfile
import qcfile
import funcfile
import analyzefiles


def runAll(args):
    print('\n\n\nYou have requested to count unique sam files')
    print('\tWARNING:')
    print('\t\tIF USING ANY REFERENCES OTHER THAN THOSE I PROVIDE I CANNOT GUARANTEE RESULT ACCURACY')
    print('\n')

    # Set up environment #
    args.SamDirectory = common.fixDirName(args.SamDirectory)
    countDir = os.path.dirname(args.SamDirectory[:-1]) + '/BinCounts/'
    if args.output:
        countDir = common.fixDirName(args.output)
    statsDir = os.path.dirname(args.SamDirectory[:-1]) + '/PipelineStats/'
    if args.statdir:
        statsDir = common.fixDirName(args.statdir)
    for i in [countDir, statsDir]:
        common.makeDir(i)
    samFiles = common.getSampleList(args.SamDirectory, args.samples, 'sam')

    # Run multiprocessing of all bin counting commands #
    argList = [(x, countDir, statsDir, args.species) for x in samFiles]
    common.daemon(countfile.runOne, argList, 'count sam files')
    print('\nBin counts complete\n\n\n')
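# --- Hedged sketch: what the `common` helper module might provide. ---
# These implementations are inferred from the call sites above (fixDirName,
# makeDir, getSampleList, daemon); they are illustrative guesses, not the
# pipeline's actual code. multiprocessing.Pool stands in for the daemon here.
import glob
import multiprocessing


def fixDirName(path):
    """Return the directory path with exactly one trailing slash."""
    return path.rstrip('/') + '/'


def makeDir(path):
    """Create the directory if it does not already exist."""
    if not os.path.exists(path):
        os.makedirs(path)


def getSampleList(directory, samples, extension):
    """List input files of one type, optionally restricted to named samples."""
    files = sorted(glob.glob(directory + '*.' + extension + '*'))
    if samples:
        files = [f for f in files
                 if any(s in os.path.basename(f) for s in samples)]
    return files


def daemon(target, argList, description, cpuPerProcess=1):
    """Run `target` over a list of argument tuples in parallel."""
    workers = max(1, multiprocessing.cpu_count() // cpuPerProcess)
    print('Starting to ' + description + ' with ' + str(workers) + ' workers')
    with multiprocessing.Pool(workers) as pool:
        return pool.starmap(target, argList)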
def runAll(args):
    print('\n\n\nYou have requested to preprocess (trim) fastq files')
    print('\tWARNING:')
    print('\t\tIF USING ANY LENGTH OTHER THAN 36 BP, REFERENCE FILES ARE NOT SUPPORTED FOR DOWNSTREAM PROCESSING')
    print('\n')

    # Make sure the environment is properly prepared #
    args.FastqDirectory = common.fixDirName(args.FastqDirectory)
    if not args.remove:
        common.makeDir(args.FastqDirectory + 'FullLength/')

    # Get the list of fastq files to process (depending on args.samples) #
    fastqFiles = common.getSampleList(args.FastqDirectory, args.samples, 'fastq')

    # Trim all fastq files in parallel. Keyword arguments are not valid inside
    # a tuple literal, so the remove flag is passed positionally instead.
    argList = [(x, args.trim5, args.length, args.remove) for x in fastqFiles]
    common.daemon(trimfile.preprocessOne, argList,
                  'trim sequencing reads to desired length', cpuPerProcess=1)
    print('\nPre-processing complete\n\n\n')
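# --- Hedged usage sketch: wiring the trim step to a CLI. ---
# Option names mirror the attributes read above (FastqDirectory, samples,
# trim5, length, remove); the pipeline's real argument parser may differ.
import argparse


def exampleTrimCLI():
    parser = argparse.ArgumentParser(description='Trim fastq reads to a fixed length')
    parser.add_argument('FastqDirectory', help='directory containing the input fastq files')
    parser.add_argument('--samples', nargs='*', default=None, help='optional subset of samples')
    parser.add_argument('--trim5', type=int, default=0, help="bases trimmed from the 5' end")
    parser.add_argument('--length', type=int, default=36, help='final read length (36 bp expected downstream)')
    parser.add_argument('--remove', action='store_true', help='discard rather than archive full-length reads')
    runAll(parser.parse_args())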
def runAll(args):
    print('\n\n\nYou have requested to map fastq files')
    print('\tWARNING:')
    print('\t\tIF USING ANY REFERENCES OTHER THAN THOSE I PROVIDE I CANNOT GUARANTEE RESULT ACCURACY')
    print('\n')

    # Set up environment #
    args.FastqDirectory = common.fixDirName(args.FastqDirectory)
    samDir = os.path.dirname(args.FastqDirectory[:-1]) + '/Sam/'
    if args.output:
        samDir = common.fixDirName(args.output)
    statsDir = os.path.dirname(args.FastqDirectory[:-1]) + '/PipelineStats/'
    if args.statdir:
        statsDir = common.fixDirName(args.statdir)
    tempDir = args.FastqDirectory + 'Temp/'
    for i in [samDir, tempDir, statsDir]:
        common.makeDir(i)
    fastqFiles = common.getSampleList(args.FastqDirectory, args.samples, 'fastq')

    # Run multiprocessing of all mapping commands #
    argList = [(x, args.species, args.trim, statsDir, tempDir, samDir) for x in fastqFiles]
    common.daemon(mapfile.runOne, argList, 'map fastq files', cpuPerProcess=8)

    # Remove all temporary files #
    shutil.rmtree(tempDir[:-1])
    print('\nMapping complete\n\n\n')
def runAll(args):
    print('\n\n\nYou have requested to map fastq files')
    print('\tWARNING:')
    print('\t\tPLEASE MAKE SURE YOU ARE USING')
    print('\t\t\tBowtie v1 and Samtools v0.1.19')
    print('\t\t\tBowtie v1 mapping indexes for either mm10 or hg38')
    print('\n')

    # Set up environment #
    args.FastqDirectory = common.fixDirName(args.FastqDirectory)
    samDir = os.path.dirname(args.FastqDirectory[:-1]) + '/Sam/'
    if args.output:
        samDir = common.fixDirName(args.output)
    statsDir = os.path.dirname(args.FastqDirectory[:-1]) + '/PipelineStats/'
    if args.statdir:
        statsDir = common.fixDirName(args.statdir)
    tempDir = args.FastqDirectory + 'Temp/'
    for i in [samDir, tempDir, statsDir]:
        common.makeDir(i)
    fastqFiles = common.getSampleList(args.FastqDirectory, args.samples, 'fastq')

    # Run multiprocessing of all mapping commands #
    argList = [(x, args.MapIndex, args.trim, statsDir, tempDir, samDir,
                args.bowtie, args.samtools) for x in fastqFiles]
    common.daemon(mapfile.runOne, argList, 'map fastq files', cpuPerProcess=8)

    # Remove all temporary files #
    shutil.rmtree(tempDir[:-1])
    print('\nMapping complete\n\n\n')
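# --- Hedged sketch: the kind of Bowtie v1 call mapfile.runOne presumably builds. ---
# Only the tool versions (Bowtie v1, Samtools v0.1.19) come from the warning
# above; the specific flags below are assumptions for illustration.
import subprocess


def exampleMapOne(fastq, mapIndex, samOut, bowtie='bowtie', threads=8):
    # -S: emit SAM; -p: threads; -m 1: discard multi-mapping reads;
    # --best --strata: keep only the best alignment stratum (assumed settings)
    subprocess.check_call([bowtie, '-S', '-p', str(threads), '-m', '1',
                           '--best', '--strata', mapIndex, fastq, samOut])
    # Downstream, samtools v0.1.19 (e.g. `samtools view -bS`) would typically
    # convert and sort the SAM output; that step is omitted here.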
def runAll(args):
    print('\n\n\nYou have requested to normalize and segment bincounts files')
    print('\tWARNING:')
    print('\t\tIF USING ANY REFERENCES OTHER THAN THOSE I PROVIDE I CANNOT GUARANTEE RESULT ACCURACY')
    print('\n')

    # Set up environment #
    args.AnalysisDirectory = common.fixDirName(args.AnalysisDirectory)
    CountDir = args.AnalysisDirectory + 'BinCounts/'
    if args.bincountdir:
        CountDir = common.fixDirName(args.bincountdir)
    lowessDir = args.AnalysisDirectory + 'LowessBinCounts/'
    segmentDir = args.AnalysisDirectory + 'Segments/'
    tempDir = args.AnalysisDirectory + 'Temp/'
    common.makeDir(lowessDir)
    if not args.normalizeonly:
        common.makeDir(segmentDir)
        common.makeDir(tempDir)
    sampleFiles = common.getSampleList(CountDir, args.samples, 'bincounts')

    info = common.importInfoFile(args.infofile, args.columns, 'normalize')
    if args.infofile:
        refArray = info
    else:
        thisDtype = info
        refArray = np.array(
            [(os.path.basename(x)[:-14], 'unk', 1) for x in sampleFiles],
            dtype=thisDtype)
    sampleDict = {x: [y for y in sampleFiles if x == os.path.basename(y)[:len(x)]][0]
                  for x in refArray['name']}

    # Run normalization for all samples #
    methodDict = {x: [False] for x in np.unique(refArray['method'])}
    methodDict['NA'] = [False]
    sampleNormMethodDict = {x: 'NA' for x in refArray['name']}
    if not args.gconly:
        for i in methodDict:
            refSlice = refArray[(refArray['method'] == i) & (refArray['cells'] == 1)]
            methodSamples = [sampleDict[x] for x in refSlice['name']]
            methodDict[i] = normalizefile.runMakeMethodRef(
                args.species, methodSamples, i, lowessDir)
            # identity check avoids ambiguity if a reference array is returned
            if methodDict[i][0] is not False:
                for j in refSlice['name']:
                    sampleNormMethodDict[j] = i

    # Run multiprocessing for GC (+ method) correction #
    normArgs = [(args.species, sampleDict[x], methodDict[sampleNormMethodDict[x]],
                 lowessDir + x + '.lowess.txt') for x in sampleDict]
    common.daemon(normalizefile.runNormalizeOne, normArgs, 'normalize bincount files')
    print('\nNormalization complete\n\n\n')

    # Run CBS for all samples #
    if not args.normalizeonly:
        segArgs = [(x, args.species, tempDir, lowessDir, segmentDir)
                   for x in refArray['name']]
        common.daemon(segmentfile.segmentOne, segArgs, 'segment bincount data')
        shutil.rmtree(tempDir[:-1])
        print('\nSegmentation complete\n\n\n')
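# --- Hedged sketch: the sample info table consumed by the normalize step. ---
# Field names are taken from the refArray accesses above ('name', 'method',
# 'cells'); string widths and any extra columns are assumptions. The fallback
# built above when no info file is given is equivalent to this:
exampleDtype = np.dtype([('name', 'U100'), ('method', 'U20'), ('cells', int)])
exampleRef = np.array([('sampleA', 'unk', 1),
                       ('sampleB', 'unk', 1)], dtype=exampleDtype)
# With an info file, 'method' groups single-cell references (cells == 1) that
# runMakeMethodRef uses to build a per-method normalization reference.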
def runAll(args):
    print('\n\n\nYou have requested to analyze CNV call data')
    print('\tWARNING:')
    print('\t\tIF USING ANY REFERENCES OTHER THAN THOSE I PROVIDE I CANNOT GUARANTEE RESULT ACCURACY')
    print('\n')

    # Set up environment #
    args.AnalysisDirectory = common.fixDirName(args.AnalysisDirectory)
    folderDict = {'LowessBinCounts': args.lowess,
                  'Segments': args.segments,
                  'PipelineStats': args.countstats}
    for i in list(folderDict.keys()):
        if not folderDict[i]:
            folderDict[i] = args.AnalysisDirectory + i + '/'
        else:
            folderDict[i] = common.fixDirName(folderDict[i])
    QCdir = args.AnalysisDirectory + 'QC/'
    CNVdir = args.AnalysisDirectory + 'CNVlists/'
    summaryDir = args.AnalysisDirectory + 'SummaryFiles/'
    PloidyPlotDir = args.AnalysisDirectory + 'PloidyDeterminationPlots/'
    CNplotDir = args.AnalysisDirectory + 'CopyNumberProfilePlots/'
    ChromPlotDir = args.AnalysisDirectory + 'ChromosomeCopyNumberPlots/'
    for i in [args.AnalysisDirectory, QCdir, CNVdir, summaryDir,
              PloidyPlotDir, CNplotDir, ChromPlotDir]:
        common.makeDir(i)

    # Get the list of samples to process; requires the segments input files #
    # (infofile-based sample annotation is currently disabled for this step) #
    sampleFiles = common.getSampleList(folderDict['Segments'], args.samples, 'segments')
    sampleNames = [x.split('/')[-1].split('.')[0] for x in sampleFiles]

    # QC assessment #
    argList = [(x, args.species, folderDict['PipelineStats'],
                folderDict['LowessBinCounts'], folderDict['Segments'],
                QCdir, PloidyPlotDir) for x in sampleNames]
    common.daemon(qcfile.runQCone, argList, 'assess sample quality')

    # Merge the per-sample QC records and keep only samples that passed QC #
    analysisSamples = []
    ploidyDict = {}
    genderDict = {}
    mergeQCfile = summaryDir + 'QCmetrics.txt'
    OUT = open(mergeQCfile, 'w')
    OUT.write('Name\tReads\tMAPD\tCS\tPloidy\tGender\tPASS\n')
    for i in sampleNames:
        IN = open(QCdir + i + '.qcTEMP.txt', 'r')
        data = IN.readline()
        OUT.write(data)
        data = data.rstrip().split('\t')
        if data[-1] == 'True':
            analysisSamples.append(i)
            ploidyDict[i] = float(data[4])
            genderDict[i] = data[-2]
        IN.close()
        os.remove(QCdir + i + '.qcTEMP.txt')
    OUT.close()
    os.rmdir(QCdir)

    # FUnC: CNV filtering #
    if args.nofilter:
        print('\nFURTHER CODE IS ONLY DEVELOPED FOR WHEN FUnC IS IMPLEMENTED, EXITING NOW\n\n\n')
        raise SystemExit
    argList = [(x, args.species, folderDict['Segments'], CNVdir,
                ploidyDict[x], genderDict[x]) for x in analysisSamples]
    common.daemon(funcfile.FUnCone, argList, 'remove unreliable CNV calls')

    # CNV analysis #
    argList = [(x, args.species, CNVdir, folderDict['LowessBinCounts'],
                CNplotDir, ChromPlotDir, ploidyDict[x], genderDict[x])
               for x in analysisSamples]
    summaryStats = common.daemon(analyzefiles.analyzeOne, argList, 'create summary files')

    cellStatsFile = summaryDir + 'CellStats.txt'
    chromAmpFile = summaryDir + 'ChromosomeAmplifiedPercent.txt'
    chromDelFile = summaryDir + 'ChromosomeDeletedPercent.txt'
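    # Hedged sketch (assumed shape): one summaryStats element, as consumed by
    # the writer below; every key is taken from its usage there.
    # exampleStats = {
    #     'chroms': ['chr1', 'chr2', '...', 'chrX'],     # reporting order
    #     'cellStats': {'delCount': 3, 'ampCount': 1,    # CNV counts
    #                   'delMB': 12.5, 'ampMB': 4.25},   # megabases altered
    #     'chromAmp': {'chr1': 0.0},                     # % amplified per chrom
    #     'chromDel': {'chr1': 1.2},                     # % deleted per chrom
    # }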
    # Write summary statistics files #
    with open(cellStatsFile, 'w') as CELL, open(chromAmpFile, 'w') as AMP, open(chromDelFile, 'w') as DEL:
        CELL.write('Sample\tDeletionNumber\tAmplificationNumber\tTotalCNVnumber\t'
                   'DeletedMB\tAmplifiedMB\tNetDNAalteredMB\n')
        chromHeader = 'Sample\t' + '\t'.join(summaryStats[0]['chroms']) + '\n'
        AMP.write(chromHeader)
        DEL.write(chromHeader)
        for i, j in enumerate(analysisSamples):
            CELL.write(j + '\t')
            cellOut = [
                summaryStats[i]['cellStats']['delCount'],
                summaryStats[i]['cellStats']['ampCount'],
                summaryStats[i]['cellStats']['delCount'] + summaryStats[i]['cellStats']['ampCount'],
                np.round(summaryStats[i]['cellStats']['delMB'], 3),
                np.round(summaryStats[i]['cellStats']['ampMB'], 3),
                np.round(summaryStats[i]['cellStats']['ampMB']
                         - summaryStats[i]['cellStats']['delMB'], 3)]
            CELL.write('\t'.join(map(str, cellOut)) + '\n')
            AMP.write(j + '\t')
            ampOut = [np.round(summaryStats[i]['chromAmp'][x], 3)
                      for x in summaryStats[0]['chroms']]
            AMP.write('\t'.join(map(str, ampOut)) + '\n')
            DEL.write(j + '\t')
            delOut = [np.round(summaryStats[i]['chromDel'][x], 3)
                      for x in summaryStats[0]['chroms']]
            DEL.write('\t'.join(map(str, delOut)) + '\n')

    print('\nCNV analysis complete\n\n\n')
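# --- Illustration (invented values): the summary tables written above. ---
# QCmetrics.txt : Name   Reads    MAPD  CS    Ploidy  Gender  PASS
#                 cellA  1523401  0.21  0.87  2.0     F       True
# CellStats.txt : Sample  DeletionNumber  AmplificationNumber  TotalCNVnumber  DeletedMB  AmplifiedMB  NetDNAalteredMB
#                 cellA   3               1                    4               12.5       4.25         -8.25
# NetDNAalteredMB = AmplifiedMB - DeletedMB, so deletion-dominated cells come
# out negative.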